0 简介
1 混淆矩阵(Confusion Matrix)
#所有判断正确并确实为1的样本 / 所有被判断为1的样本 #对于没有class_weight,没有做样本平衡的灰色决策边界来说: (y[y == clf.predict(X)] == 1).sum()/(clf.predict(X) == 1).sum() #对于有class_weight,做了样本平衡的红色决策边界来说: (y[y == wclf.predict(X)] == 1).sum()/(wclf.predict(X) == 1).sum()
#所有predict为1的点 / 全部为1的点的比例 #对于没有class_weight,没有做样本平衡的灰色决策边界来说: (y[y == clf.predict(X)] == 1).sum()/(y == 1).sum() #对于有class_weight,做了样本平衡的红色决策边界来说: (y[y == wclf.predict(X)] == 1).sum()/(y == 1).sum()
#所有被正确预测为0的样本 / 所有的0样本 #对于没有class_weight,没有做样本平衡的灰色决策边界来说: (y[y == clf.predict(X)] == 0).sum()/(y == 0).sum() #对于有class_weight,做了样本平衡的红色决策边界来说: (y[y == wclf.predict(X)] == 0).sum()/(y == 0).sum()

2 ROC曲线以及其相关问题
1 自建数据集
class_1_ = 7 class_2_ = 4 centers_ = [[0.0, 0.0], [1,1]] clusters_std = [0.5, 1] X_, y_ = make_blobs(n_samples=[class_1_, class_2_], centers=centers_, cluster_std=clusters_std, random_state=0, shuffle=False) plt.scatter(X_[:, 0], X_[:, 1], c=y_, cmap="rainbow",s=30)
from sklearn.linear_model import LogisticRegression as LogiR clf_lo = LogiR().fit(X_,y_) prob = clf_lo.predict_proba(X_) #将样本和概率放到一个DataFrame中 import pandas as pd prob = pd.DataFrame(prob) prob.columns = ["0","1"] prob
#手动调节阈值,来改变我们的模型效果 for i in range(prob.shape[0]): if prob.loc[i,"1"] > 0.5: prob.loc[i,"pred"] = 1 else: prob.loc[i,"pred"] = 0 prob["y_true"] = y_ prob = prob.sort_values(by="1",ascending=False) prob
from sklearn.metrics import confusion_matrix as CM, precision_score as P, recall_score as R CM(prob.loc[:,"y_true"],prob.loc[:,"pred"],labels=[1,0]) #试试看手动计算Precision和Recall? P(prob.loc[:,"y_true"],prob.loc[:,"pred"],labels=[1,0]) R(prob.loc[:,"y_true"],prob.loc[:,"pred"],labels=[1,0])
for i in range(prob.shape[0]): if prob.loc[i,"1"] > 0.4: prob.loc[i,"pred"] = 1 else: prob.loc[i,"pred"] = 0 prob CM(prob.loc[:,"y_true"],prob.loc[:,"pred"],labels=[1,0]) P(prob.loc[:,"y_true"],prob.loc[:,"pred"],labels=[1,0]) R(prob.loc[:,"y_true"],prob.loc[:,"pred"],labels=[1,0]) #注意,降低或者升高阈值并不一定能够让模型的效果变好,一切都基于我们要追求怎样的模型效果

#使用最初的X和y,样本不均衡的这个模型 class_1 = 500 #类别1有500个样本 class_2 = 50 #类别2只有50个 centers = [[0.0, 0.0], [2.0, 2.0]] #设定两个类别的中心 clusters_std = [1.5, 0.5] #设定两个类别的方差,通常来说,样本量比较大的类别会更加松散 X, y = make_blobs(n_samples=[class_1, class_2], centers=centers, cluster_std=clusters_std, random_state=0, shuffle=False) #看看数据集长什么样 plt.scatter(X[:, 0], X[:, 1], c=y, cmap="rainbow",s=10) #其中红色点是少数类,紫色点是多数类 clf_proba = svm.SVC(kernel="linear",C=1.0,probability=True).fit(X,y) clf_proba.predict_proba(X) clf_proba.predict_proba(X).shape clf_proba.decision_function(X) clf_proba.decision_function(X).shape

#首先来看看如何从混淆矩阵中获取FPR和Recall cm = CM(prob.loc[:,"y_true"],prob.loc[:,"pred"],labels=[1,0]) cm #FPR cm[1,0]/cm[1,:].sum() #Recall cm[0,0]/cm[0,:].sum() #开始绘图 recall = [] FPR = [] probrange = np.linspace(clf_proba.predict_proba(X) [:,1].min(),clf_proba.predict_proba(X)[:,1].max(),num=50,endpoint=False) from sklearn.metrics import confusion_matrix as CM, recall_score as R import matplotlib.pyplot as plot for i in probrange: y_predict = [] for j in range(X.shape[0]): if clf_proba.predict_proba(X)[j,1] > i: y_predict.append(1) else: y_predict.append(0) cm = CM(y,y_predict,labels=[1,0]) recall.append(cm[0,0]/cm[0,:].sum()) FPR.append(cm[1,0]/cm[1,:].sum()) recall.sort() FPR.sort() plt.plot(FPR,recall,c="red") plt.plot(probrange+0.05,probrange+0.05,c="black",linestyle="--") plt.show()

from sklearn.metrics import roc_curve FPR, recall, thresholds = roc_curve(y,clf_proba.decision_function(X), pos_label=1) FPR recall thresholds #此时的threshold就不是一个概率值,而是距离值中的阈值了,所以它可以大于1,也可以为负
from sklearn.metrics import roc_auc_score as AUC area = AUC(y,clf_proba.decision_function(X))
plt.figure() plt.plot(FPR, recall, color='red', label='ROC curve (area = %0.2f)' % area) plt.plot([0, 1], [0, 1], color='black', linestyle='--') plt.xlim([-0.05, 1.05]) plt.ylim([-0.05, 1.05]) plt.xlabel('False Positive Rate') plt.ylabel('Recall') plt.title('Receiver operating characteristic example') plt.legend(loc="lower right") plt.show()

maxindex = (recall - FPR).tolist().index(max(recall - FPR)) thresholds[maxindex] #我们可以在图像上来看看这个点在哪里 plt.scatter(FPR[maxindex],recall[maxindex],c="black",s=30) #把上述代码放入这段代码中: plt.figure() plt.plot(FPR, recall, color='red', label='ROC curve (area = %0.2f)' % area) plt.plot([0, 1], [0, 1], color='black', linestyle='--') plt.scatter(FPR[maxindex],recall[maxindex],c="black",s=30) plt.xlim([-0.05, 1.05]) plt.ylim([-0.05, 1.05]) plt.xlabel('False Positive Rate') plt.ylabel('Recall') plt.title('Receiver operating characteristic example') plt.legend(loc="lower right") plt.show()