python数据分析之分类模型与回归模型-第七次笔记


1.分类模型

*1.1KNN 算法
*1.2朴素贝叶斯 算法
*1.3支持向量机SVM 算法
*1.4集成方法—随机森林算法
*1.5集成方法—Adaboost 算法
*1.6决策树

2.回归模型

*2.1线性回归
*2.2岭回归
*2.3Lasso回归
*2.4逻辑回归
*2.5人工神经网络
*2.6GBDT,回归树和提升树


提取数据

#提取训练集,验证集,测试集   比例为6:2:2
    from  sklearn.model_selection import train_test_split
    f_v = features.values
    f_names = features.columns.values
    l_v = label.values
    X_tt, X_validation, Y_tt, Y_validation = train_test_split(f_v, l_v, test_size=0.2)
    X_train, X_test, Y_train, Y_test = train_test_split(X_tt, Y_tt, test_size=0.25)

1.分类模型

1.1KNN 算法

#导入模块
from sklearn.neighbors import NearestNeighbors, KNeighborsClassifier
models.append(("KNN",KNeighborsClassifier(n_neighbors=3)))

1.2朴素贝叶斯 算法

from sklearn.naive_bayes import GaussianNB,BernoulliNB
    #朴素贝叶斯
    models.append(("GaussianNB",GaussianNB()))
    models.append(("BernoulliNB",BernoulliNB()))

1.3支持向量机SVM 算法

from sklearn.svm import SVC
    # SVM 支持向量机  C参数控制精度
    models.append(("SVM Classifier",SVC(C=1000)))

1.4集成方法—随机森林算法

from sklearn.ensemble import RandomForestClassifier
    #原始森林
  models.append(("OriginalRandomForest",RandomForestClassifier()))
    #随机森林
    models.append(("RandomForest",RandomForestClassifier(n_estimators=11,max_features=None)))

1.5集成方法—Adaboost 算法

from sklearn.ensemble import AdaBoostClassifier
    #Adaboost 集成分类方法;也可以显式指定参数,例如:base_estimator=SVC(),n_estimators=100,algorithm="SAMME"
    models.append(("Adaboost",AdaBoostClassifier(n_estimators=100)))

1.6决策树

from sklearn.tree import DecisionTreeClassifier,export_graphviz
    #min_impurity_split=0.1:最小不纯度阈值,是一种剪枝方法
    #决策树(Gini)不纯度
    models.append(("DecisionTreeGini",DecisionTreeClassifier()))
    #决策树
    models.append(("DecisionTreeEntropy",DecisionTreeClassifier(criterion="entropy")))

2.回归模型

2.1线性回归

#线性回归
    from sklearn.linear_model import LinearRegression,Ridge,Lasso
        #线性回归
    #regr=LinearRegression()

2.2岭回归

#岭回归
    regr=Ridge(alpha=1)

2.3Lasso回归

#Lasso
    regr=Lasso(alpha=0.001)

2.4逻辑回归

#逻辑回归是一种广义线性模型:在线性组合的输出上套用sigmoid函数,这里用它来做分类
    models.append(("LogisticRegression",LogisticRegression(C=1000,tol=1e-10,solver="sag",max_iter=10000)))

2.5人工神经网络

#人工神经网络  人工神经网络的一个容器
    from keras.models import Sequential
    #Dense 神经网络层(稠密层)Activation激活函数
    from keras.layers.core import Dense,Activation
    #SGD 随机梯度下降算法
    from keras.optimizers import SGD
    #建个容器
    mdl=Sequential()
    #建个输入层,50指下一层的神经元个数为50,input_dim表示输入的维度
    mdl.add(Dense(50,input_dim=len(f_v[0])))
    #加入激活函数
    mdl.add(Activation("sigmoid"))
    #输出层:2:有两个标注所以为2,
    mdl.add(Dense(2))
    mdl.add(Activation("softmax"))
    #学习率为0.05(注意:下面compile实际使用的是"adam",这个sgd对象并没有被用到)
    sgd=SGD(lr=0.05)
    #参数 loss指的是最优化的目标函数(损失函数);optimizer指优化器,可选"sgd"(随机梯度下降)或"adam"(Adam优化器)
    mdl.compile(loss="mean_squared_error",optimizer="adam")
    #nb_epoch=迭代的次数,batch_size随机梯度下降算法,每次选取的个数
    mdl.fit(X_train,np.array([[0,1] if i ==1 else [1,0] for i in Y_train]),nb_epoch=1000,batch_size=8999)
        xy_lst = [(X_train, Y_train), (X_validation, Y_validation), (X_test, Y_test)]
    import matplotlib.pyplot as plt
    from sklearn.metrics import roc_curve,auc,roc_auc_score
    f=plt.figure()

    for i in range(len(xy_lst)):
        X_part = xy_lst[i][0]
        Y_part = xy_lst[i][1]
        #predict_classes()输出分类标注
        #Y_pred = mdl.predict_classes(X_part)
        Y_pred = mdl.predict(X_part)
        print(Y_pred)
        Y_pred=np.array(Y_pred[:,1]).reshape((1,-1))[0]

        # print(i)
        # print("NN", "-ACC", accuracy_score(Y_part, Y_pred))
        # print("NN", "-REC", recall_score(Y_part, Y_pred))
        # print("NN", "-Fl", f1_score(Y_part, Y_pred))
        f.add_subplot(1,3,i+1)
        fpr,tpr,threshold=roc_curve(Y_part,Y_pred)
        plt.plot(fpr,tpr)
        print("NN","AUC",auc(fpr,tpr))
        print("NN","AUC_Score",roc_auc_score(Y_part,Y_pred))
    plt.show()

2.6GBDT,回归树和提升树

from sklearn.ensemble import GradientBoostingClassifier
    #GBDT,回归树和提升树  参数max_depth=6一般深度为6,n_estimators=树的数量
    models.append(("GBDT",GradientBoostingClassifier(max_depth=6,n_estimators=100)))

模型的评估

#准确率(accuracy)、召回率(recall)、F1值(F-score),用于评价模型的好坏。
    from sklearn.metrics import accuracy_score, recall_score, f1_score
    for clf_name ,clf in models:
        clf.fit(X_train,Y_train)
        xy_lst=[(X_train,Y_train),(X_validation,Y_validation),(X_test,Y_test)]
        for i in  range(len(xy_lst)):
            X_part=xy_lst[i][0]
            Y_part=xy_lst[i][1]
            Y_pred=clf.predict(X_part)
            print(i)
            print(clf_name,"-ACC",accuracy_score(Y_part,Y_pred))
            print(clf_name,"-REC",recall_score(Y_part,Y_pred))
            print(clf_name,"-Fl",f1_score(Y_part,Y_pred))

完整的程序:

#encoding utf-8
# time: 2018/08/08
# name: py粉
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler,StandardScaler
from sklearn.preprocessing import LabelEncoder,OneHotEncoder
from sklearn.preprocessing import Normalizer
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.decomposition import PCA
import os
import pydotplus
os.environ["PATH"]+=os.pathsep+"E:/Program/Graphviz/bin/"


#sl:satisfaction_level---False:MinMaxScaler;True:StandardScaler
#le:last_evaluation---False:MinMaxScaler;True:StandardScaler
#npr:number_project---False:MinMaxScaler;True:StandardScaler
#amh:average_monthly_hours---False:MinMaxScaler;True:StandardScaler
#tsc:time_spend_company---False:MinMaxScaler;True:StandardScaler
#wa:Work_accident---False:MinMaxScaler;True:StandardScaler
#pl5:promotion_last_5years---False:MinMaxScaler;True:StandardScaler
#dp:department---False:LabelEncoding;True:OneHotEncoding
#slr:salary---False:LabelEncoding;True:OneHotEncoding

def hr_preprocessing(sl=False,le=False,npr=False,amh=False,tsc=False,wa=False,pl5=False,dp=False,slr=False,lower_d=False,ld_n=1,
                     data_path=r"D:\Python\python'数据分析与建模实现\data\HR.csv"):
    """Load the HR CSV, drop unusable rows, and scale/encode every feature column.

    Args:
        sl, le, npr, amh, tsc, wa, pl5: Scaler switch for, respectively,
            ``satisfaction_level``, ``last_evaluation``, ``number_project``,
            ``average_monthly_hours``, ``time_spend_company``,
            ``Work_accident`` and ``promotion_last_5years``.
            False -> ``MinMaxScaler``; True -> ``StandardScaler``.
        dp, slr: Encoding switch for ``department`` and ``salary``.
            False -> integer codes (``LabelEncoder`` for department,
            ``map_salary`` for salary) followed by ``MinMaxScaler``;
            True -> one-hot columns via ``pd.get_dummies``.
        lower_d: When true, reduce the processed features with PCA.
        ld_n: Number of PCA components used when ``lower_d`` is true.
        data_path: Location of the CSV file. Defaults to the path that was
            previously hard-coded in the function body.

    Returns:
        A ``(features, label)`` tuple. ``label`` is the ``left`` column.
        ``features`` is a DataFrame, except when ``lower_d`` is true: then it
        is the array returned by ``PCA.fit_transform`` (no ``.columns``).
    """
    # The context manager closes the handle even if parsing fails; the
    # original left the file object open.
    with open(data_path) as f:
        df = pd.read_csv(f)

    # 1. Clean the data.
    # Columns: satisfaction_level, last_evaluation, number_project,
    # average_monthly_hours, time_spend_company, Work_accident,
    # left, promotion_last_5years, department, salary
    df = df.dropna(subset=["satisfaction_level", "last_evaluation"])
    # Combine both conditions into one mask instead of chaining two boolean
    # indexers: the second mask of the chain no longer matched the filtered
    # index and made pandas reindex it with a warning.
    # NOTE(review): "nme" looks like a placeholder salary value - confirm.
    keep = (df["satisfaction_level"] <= 1) & (df["salary"] != "nme")
    # Work on an explicit copy so the column assignments below never write
    # into a view of the unfiltered frame.
    df = df[keep].copy()

    # 2. Split off the label.
    label = df["left"]
    df = df.drop("left", axis=1)

    # 3. Feature selection: nothing to do, all remaining columns are kept.

    # 4. Feature processing: numeric columns.
    numeric_columns = ["satisfaction_level", "last_evaluation", "number_project",
                       "average_monthly_hours", "time_spend_company", "Work_accident",
                       "promotion_last_5years"]
    for column, standardize in zip(numeric_columns, [sl, le, npr, amh, tsc, wa, pl5]):
        scaler = StandardScaler() if standardize else MinMaxScaler()
        # Scalers need a 2-D input; ravel() turns the result back into a column.
        df[column] = scaler.fit_transform(df[column].values.reshape(-1, 1)).ravel()

    # Categorical columns, processed in the same order as before
    # (salary first) so the resulting column order is unchanged.
    for column, one_hot in (("salary", slr), ("department", dp)):
        if one_hot:
            # pandas provides one-hot encoding out of the box.
            df = pd.get_dummies(df, columns=[column])
            continue
        if column == "salary":
            df[column] = [map_salary(s) for s in df["salary"].values]
        else:
            df[column] = LabelEncoder().fit_transform(df[column])
        df[column] = MinMaxScaler().fit_transform(df[column].values.reshape(-1, 1)).ravel()

    if lower_d:
        return PCA(n_components=ld_n).fit_transform(df.values), label

    return df, label
# Ordinal codes used to turn the "salary" column into numbers.
d = {"low": 0, "medium": 1, "high": 2}


def map_salary(s):
    """Return the ordinal code of salary level *s*, or 0 when *s* is not a key of ``d``."""
    return d[s] if s in d else 0
def hr_modeling(features,label,run_nn=True,run_classifiers=False):
    """Split the data 6:2:2 and run the selected modelling experiments.

    The original body ended the neural-network part with an unconditional
    ``return``, which made the whole classifier benchmark unreachable. Both
    parts are now selectable; the defaults reproduce the previous behaviour
    (neural network only).

    Args:
        features: DataFrame of processed features (``.values`` is used).
        label: Series holding the binary target (``.values`` is used).
        run_nn: Train the Keras network and plot ROC curves for the splits.
        run_classifiers: Fit the scikit-learn classifiers and print
            accuracy / recall / F1 for every split.

    Returns:
        None. Results are printed and, for the network, plotted.
    """
    from sklearn.model_selection import train_test_split

    f_v = features.values
    l_v = label.values
    # 20% is held out for validation, then 25% of the remaining 80% (= 20%
    # of the total) becomes the test set: train/validation/test = 6:2:2.
    X_tt, X_validation, Y_tt, Y_validation = train_test_split(f_v, l_v, test_size=0.2)
    X_train, X_test, Y_train, Y_test = train_test_split(X_tt, Y_tt, test_size=0.25)
    xy_lst = [(X_train, Y_train), (X_validation, Y_validation), (X_test, Y_test)]

    if run_nn:
        _hr_nn_roc(xy_lst, f_v.shape[1])
    if run_classifiers:
        _hr_eval_classifiers(xy_lst)


def _hr_nn_roc(xy_lst, input_dim):
    """Train a one-hidden-layer network on ``xy_lst[0]`` and plot one ROC curve per split."""
    import matplotlib.pyplot as plt
    # Sequential is the container that stacks the layers.
    from keras.models import Sequential
    # Dense is a fully connected layer; Activation applies an activation function.
    from keras.layers import Dense, Activation
    from sklearn.metrics import roc_curve, auc, roc_auc_score

    X_train, Y_train = xy_lst[0]

    mdl = Sequential()
    # Hidden layer with 50 units; input_dim is the number of feature columns.
    mdl.add(Dense(50, input_dim=input_dim))
    mdl.add(Activation("sigmoid"))
    # Output layer: two units because the label has two classes.
    mdl.add(Dense(2))
    mdl.add(Activation("softmax"))
    # The original also built SGD(lr=0.05) but never passed it to compile();
    # the unused object was dropped. Pass an optimizer instance here instead
    # of "adam" to experiment with plain SGD.
    mdl.compile(loss="mean_squared_error", optimizer="adam")

    # One-hot targets: class 1 -> [0, 1], anything else -> [1, 0].
    one_hot = np.array([[0, 1] if y == 1 else [1, 0] for y in Y_train])
    # `epochs` replaces the deprecated Keras 1 spelling `nb_epoch`.
    # NOTE(review): batch_size=8999 is hard-coded; presumably it was chosen
    # to match the training-set size of the HR data - confirm.
    mdl.fit(X_train, one_hot, epochs=1000, batch_size=8999)

    fig = plt.figure()
    for idx, (X_part, Y_part) in enumerate(xy_lst):
        # predict() returns one row of class probabilities per sample.
        Y_pred = mdl.predict(X_part)
        print(Y_pred)
        # Column 1 is the score of class 1, which roc_curve expects.
        positive_score = Y_pred[:, 1]

        fig.add_subplot(1, 3, idx + 1)
        fpr, tpr, _ = roc_curve(Y_part, positive_score)
        plt.plot(fpr, tpr)
        print("NN","AUC",auc(fpr,tpr))
        print("NN","AUC_Score",roc_auc_score(Y_part,positive_score))
    plt.show()


def _hr_eval_classifiers(xy_lst):
    """Fit each benchmark classifier on ``xy_lst[0]`` and print ACC/REC/F1 for every split."""
    from sklearn.metrics import accuracy_score, recall_score, f1_score
    from sklearn.neighbors import KNeighborsClassifier
    from sklearn.naive_bayes import GaussianNB, BernoulliNB
    from sklearn.tree import DecisionTreeClassifier
    from sklearn.svm import SVC
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.ensemble import AdaBoostClassifier
    from sklearn.ensemble import GradientBoostingClassifier
    from sklearn.linear_model import LogisticRegression

    models = [
        ("KNN", KNeighborsClassifier(n_neighbors=3)),
        # Naive Bayes variants.
        ("GaussianNB", GaussianNB()),
        ("BernoulliNB", BernoulliNB()),
        # Decision trees: default (Gini) impurity and entropy.
        ("DecisionTreeGini", DecisionTreeClassifier()),
        ("DecisionTreeEntropy", DecisionTreeClassifier(criterion="entropy")),
        # Support vector machine with a large C.
        ("SVM Classifier", SVC(C=1000)),
        # Random forests: library defaults versus 11 trees using every feature.
        ("OriginalRandomForest", RandomForestClassifier()),
        ("RandomForest", RandomForestClassifier(n_estimators=11, max_features=None)),
        ("Adaboost", AdaBoostClassifier(n_estimators=100)),
        ("LogisticRegression", LogisticRegression(C=1000, tol=1e-10, solver="sag", max_iter=10000)),
        # Gradient-boosted trees: 100 trees of depth 6.
        ("GBDT", GradientBoostingClassifier(max_depth=6, n_estimators=100)),
    ]

    X_train, Y_train = xy_lst[0]
    for clf_name, clf in models:
        clf.fit(X_train, Y_train)
        # idx: 0 = train, 1 = validation, 2 = test.
        for idx, (X_part, Y_part) in enumerate(xy_lst):
            Y_pred = clf.predict(X_part)
            print(idx)
            print(clf_name,"-ACC",accuracy_score(Y_part,Y_pred))
            print(clf_name,"-REC",recall_score(Y_part,Y_pred))
            print(clf_name,"-Fl",f1_score(Y_part,Y_pred))
            # To draw a fitted decision tree, export it with
            # sklearn.tree.export_graphviz into an io.StringIO buffer and
            # render the DOT text with pydotplus (graph.write_pdf).

def regr_test(features,label):
    """Fit a ridge regression of *label* on *features* and print its quality.

    The model is fitted and evaluated on the same data. The inputs, the
    learned coefficients, and the MSE, MAE and R2 scores are printed.
    Nothing is returned.
    """
    print("X",features)
    print("Y",label)

    from sklearn.linear_model import LinearRegression,Ridge,Lasso

    x_values = features.values
    y_true = label.values

    # Ridge regression is the active estimator; LinearRegression() or
    # Lasso(alpha=0.001) can be substituted here to compare.
    regr = Ridge(alpha=1)
    regr.fit(x_values, y_true)
    y_hat = regr.predict(x_values)
    print("Coef:",regr.coef_)

    from sklearn.metrics import mean_squared_error,mean_absolute_error,r2_score

    for tag, metric in (("MSE:", mean_squared_error),
                        ("MAE:", mean_absolute_error),
                        ("R2:", r2_score)):
        print(tag, metric(y_true, y_hat))

def main():
    """Preprocess the HR data with default options and run the regression demo."""
    # Data cleaning and feature processing.
    features, _label = hr_preprocessing()

    # Regression demo: predict last_evaluation from two workload columns.
    predictors = features[["number_project","average_monthly_hours"]]
    target = features["last_evaluation"]
    regr_test(predictors, target)

    # Classification / ensemble experiments are currently switched off;
    # they would be started with hr_modeling(features, _label).


if __name__ == "__main__":
    main()