#!/usr/bin/env python3
# -*- coding:utf-8 -*-
"""
  1. booster[默认是gbtree] 
  选择每次迭代的模型,有两种选择:gbtree基于树的模型、gbliner线性模型 
  2. silent[默认是0] 
  当这个参数值为1的时候,静默模式开启,不会输出任何信息。一般这个参数保持默认的0,这样可以帮我们更好的理解模型。 
  3. nthread[默认值为最大可能的线程数] 
  这个参数用来进行多线程控制,应当输入系统的核数,如果你希望使用cpu全部的核,就不要输入这个参数,算法会自动检测。
  4. objective[默认是reg:linear] 
  这个参数定义需要被最小化的损失函数。最常用的值有:binary:logistic二分类的逻辑回归,返回预测的概率非类别。multi:softmax使用softmax的多分类器,返回预测的类别。在这种情况下,你还要多设置一个参数:num_class类别数目。 
  5. eval_metric[默认值取决于objective参数的取之] 
  对于有效数据的度量方法。对于回归问题,默认值是rmse,对于分类问题,默认是error。典型值有:rmse均方根误差;mae平均绝对误差;logloss负对数似然函数值;error二分类错误率;merror多分类错误率;mlogloss多分类损失函数;auc曲线下面积。 
  6. seed[默认是0] 
随机数的种子,设置它可以复现随机数据的结果,也可以用于调整参数。
"""
from numpy import loadtxt
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, mean_squared_error
from xgboost import plot_importance
from matplotlib import pyplot
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
import lightgbm as lgb
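

# A minimal sketch, tied to the parameter notes in the docstring above, of how those general
# parameters map onto the sklearn-style XGBClassifier wrapper. The concrete values (n_jobs=4,
# random_state=7) are illustrative assumptions, not settings used elsewhere in this script, and
# passing eval_metric to the constructor assumes xgboost >= 1.6 (older releases take it in fit()).
def demo_general_params():
    model = XGBClassifier(
        booster='gbtree',             # gbtree (tree-based) or gblinear (linear booster)
        n_jobs=4,                     # thread count; the wrapper exposes nthread as n_jobs
        objective='binary:logistic',  # loss to minimize; returns probabilities for binary tasks
        eval_metric='logloss',        # metric used on the validation data
        random_state=7,               # random seed (seed in the native API)
    )
    return model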


def demo1():
    # xgboost ships sklearn-style wrappers, so a classifier can be built directly with XGBClassifier
    model = XGBClassifier(learning_rate=0.1)
    eval_set = [(X_test, y_test)]
    # Note: in recent xgboost releases (>= 1.6) early_stopping_rounds and eval_metric are
    # deprecated as fit() arguments and should be passed to the XGBClassifier constructor instead.
    model.fit(X_train, y_train, early_stopping_rounds=10, eval_metric="logloss", eval_set=eval_set, verbose=True)
    # model.fit(X_train, y_train)  # plain fit without early stopping
    # gradient boosting also exposes the importance of each feature of the trained model (plotted in demo2)
    y_pred = model.predict(X_test)
    predictions = [round(value) for value in y_pred]
    accuracy = accuracy_score(y_test, predictions)
    print("Accuracy: %.2f%%" % (accuracy * 100.0))


def demo2():
    model = XGBClassifier(learning_rate=0.1)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    predictions = [round(value) for value in y_pred]
    accuracy = accuracy_score(y_test, predictions)
    print("Accuracy: %.2f%%" % (accuracy * 100.0))
    plot_importance(model)
    pyplot.show()


def demo3():
    model = XGBClassifier()
    learning_rate = [0.0001, 0.001, 0.01, 0.1, 0.2, 0.3]
    param_grid = dict(learning_rate=learning_rate)
    kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=7)
    grid_search = GridSearchCV(model, param_grid, scoring="neg_log_loss", n_jobs=-1, cv=kfold)
    grid_result = grid_search.fit(X, Y)
    means = grid_result.cv_results_['mean_test_score']
    stds = grid_result.cv_results_['std_test_score']
    params = grid_result.cv_results_['params']
    for mean, stdev, param in zip(means, stds, params):
        print("%f (%f) with: %r" % (mean, stdev, param))


if __name__ == '__main__':
    dataset = loadtxt('../dataset/pima-indians-diabetes.csv', delimiter=",")
    X = dataset[:, 0:8]
    Y = dataset[:, 8]
    seed = 7
    test_size = 0.33
    X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=test_size, random_state=seed)
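    # The demo functions above read the split data from these module-level names, so they can
    # only be run from this point on; uncomment to try them:
    # demo1(); demo2(); demo3()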
    # LightGBM
    num_round = 20  # number of boosting rounds
    lgb_train = lgb.Dataset(X_train, y_train)  # wrapping the data in LightGBM's binary Dataset format makes loading faster
    lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train)  # validation data
    params = {
        'task': 'train',
        'boosting_type': 'gbdt',    # boosting type
        'objective': 'regression',  # objective function
        'metric': {'l2', 'auc'},    # evaluation metrics
        'num_leaves': 31,           # maximum number of leaves per tree
        'learning_rate': 0.05,      # learning rate
        'feature_fraction': 0.9,    # fraction of features sampled when building each tree
        'bagging_fraction': 0.8,    # fraction of samples used for bagging
        'bagging_freq': 5,          # perform bagging every k iterations
        'verbose': 1                # < 0: fatal only, = 0: errors (warnings), > 0: info
    }
    print('Start training...')
    # train on the parameter dict, the training Dataset and the validation Dataset
    # Note: LightGBM >= 4.0 configures early stopping via callbacks, e.g.
    # callbacks=[lgb.early_stopping(stopping_rounds=5)], instead of the early_stopping_rounds argument.
    gbm = lgb.train(params, lgb_train, num_boost_round=num_round, valid_sets=lgb_eval,
                    early_stopping_rounds=5)
    print('Start predicting...')
    # predict on the test set; when early stopping was triggered, best_iteration selects the best round found during training
    y_pred = gbm.predict(X_test, num_iteration=gbm.best_iteration)
    # evaluate the model: root mean squared error between the true labels and the predictions
    print('The rmse of prediction is:', mean_squared_error(y_test, y_pred) ** 0.5)
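
    # Optional sketch: persist the trained booster in LightGBM's text format and reload it.
    # The file name 'lgb_model.txt' is an arbitrary choice for illustration.
    gbm.save_model('lgb_model.txt')
    bst = lgb.Booster(model_file='lgb_model.txt')
    print('Reloaded model RMSE:', mean_squared_error(y_test, bst.predict(X_test)) ** 0.5)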