Learning feels great for a moment; keep learning and the great feeling never stops.
Hello everyone, I'm もうり, a tech newbie building things up from zero.
Let's hit the road! Let's hit the road!
Driving the car...
...and the driver has flipped the car over again.
One of the most important concepts in a decision tree is its depth.
That's right: decision trees overfit very easily.
Let's see what this so-called overfitting looks like on iris. Enough dry concepts, time to hit the road (a quick preview of the knob we'll be turning follows right below).
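Before the real ride, here is a minimal sketch of that knob, max_depth, which caps how deep a tree may grow. This is my own preview, assuming scikit-learn >= 0.21 for get_depth:

# max_depth is the parameter that caps tree depth; an unconstrained tree grows until its leaves are pure
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier

X, y = load_iris(return_X_y=True)
deep = DecisionTreeClassifier().fit(X, y)                # no depth cap
shallow = DecisionTreeClassifier(max_depth=3).fit(X, y)  # depth capped at 3
print(deep.get_depth(), shallow.get_depth())             # e.g. 5 and 3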
Environment
Jupyter Notebook
Import the packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.datasets import load_iris
import pydotplus
mpl.rcParams['font.sans-serif'] = ['SimHei']  # a Chinese-capable font, so the Chinese plot labels below render
mpl.rcParams['axes.unicode_minus'] = False    # keep minus signs rendering correctly with that font
iris_feature_E = 'sepal length', 'sepal width', 'petal length', 'petal width'
iris_feature = '花萼长度', '花萼宽度', '花瓣长度', '花瓣宽度'  # Chinese feature names used as axis labels
iris_class = 'Iris-setosa', 'Iris-versicolor', 'Iris-virginica'
# Load the data
x = pd.DataFrame(load_iris().data)
y = load_iris().target
A picture is two-dimensional, so only two features can go in.
# For visualization, keep only the first two feature columns
x = x[[0,1]]
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=1)
model = DecisionTreeClassifier(criterion='entropy')
model.fit(x_train, y_train)
y_train_pred = model.predict(x_train)
print('训练集正确率:', accuracy_score(y_train, y_train_pred))
y_test_hat = model.predict(x_test) # predictions on the test set
print('测试集正确率:', accuracy_score(y_test, y_test_hat))
Result:
训练集正确率: 0.9523809523809523
测试集正确率: 0.6222222222222222
Here's the decision tree itself, rendered as a picture:
dot_data = tree.export_graphviz(model, out_file=None, feature_names=iris_feature_E[0:2], class_names=iris_class,
                                filled=True, rounded=True, special_characters=True)
graph = pydotplus.graph_from_dot_data(dot_data)
graph.write_pdf('iris.pdf')
with open('iris.png', 'wb') as f:  # context manager closes the file even if writing fails
    f.write(graph.create_png())
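If installing Graphviz and pydotplus is a hassle, scikit-learn 0.21+ can draw the same tree with matplotlib alone via sklearn.tree.plot_tree. A minimal sketch, reusing the fitted model above:

# Alternative rendering without Graphviz: matplotlib-only tree plot (scikit-learn >= 0.21)
from sklearn.tree import plot_tree

plt.figure(figsize=(12, 8))
plot_tree(model, feature_names=iris_feature_E[0:2], class_names=iris_class,
          filled=True, rounded=True)
plt.show()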

Now sweep the depth step by step and see what happens to the so-called accuracy.
depth = np.arange(1, 15)
err_train_list = []
err_test_list = []
clf = DecisionTreeClassifier(criterion='entropy')
for d in depth:
    clf.set_params(max_depth=d)
    clf.fit(x_train, y_train)
    y_train_pred = clf.predict(x_train)
    err_train = 1 - accuracy_score(y_train, y_train_pred)
    err_train_list.append(err_train)
    y_test_pred = clf.predict(x_test)
    err_test = 1 - accuracy_score(y_test, y_test_pred)
    err_test_list.append(err_test)
    print(d, ' 测试集错误率: %.2f%%' % (100 * err_test))
plt.figure(facecolor='w')
plt.plot(depth, err_test_list, 'ro-', markeredgecolor='k', lw=2, label='测试集错误率')
plt.plot(depth, err_train_list, 'go-', markeredgecolor='k', lw=2, label='训练集错误率')
plt.xlabel('决策树深度', fontsize=13)
plt.ylabel('错误率', fontsize=13)
plt.legend(loc='lower left', fontsize=13)
plt.title('决策树深度与过拟合', fontsize=15)
plt.grid(True, ls=':', color='#606060')  # pass True positionally; the b= keyword was removed in newer matplotlib
plt.show()

This is overfitting in a nutshell: the deeper the tree, the more splits it makes; the training error stays respectable, but the test error falls apart.
How do we do better? The answer is the random forest: if one decision tree isn't enough, grow a few more. (Before that, a sketch of the usual single-tree fix follows.)
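An aside of mine, not part of the original run: the standard way to pick max_depth for a single tree is cross-validation. A minimal sketch with GridSearchCV, assuming the x_train/y_train from above:

# Pick max_depth by 5-fold cross-validation instead of eyeballing the error curve
from sklearn.model_selection import GridSearchCV

param_grid = {'max_depth': list(range(1, 15))}
search = GridSearchCV(DecisionTreeClassifier(criterion='entropy'),
                      param_grid, cv=5, scoring='accuracy')
search.fit(x_train, y_train)
print('best max_depth:', search.best_params_['max_depth'])
print('cross-validated accuracy: %.4f' % search.best_score_)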
Next, let's sample a dense grid over the feature plane to draw the classifier's decision regions as a picture.
N, M = 50, 50 # number of grid points along each axis
x1_min, x2_min = x.min()
x1_max, x2_max = x.max()
t1 = np.linspace(x1_min, x1_max, N)
t2 = np.linspace(x2_min, x2_max, M)
x1, x2 = np.meshgrid(t1, t2) # build the mesh of sample points
x_show = np.stack((x1.flat, x2.flat), axis=1) # grid points to classify
print(x_show.shape)
print('x_show = \n', x_show)
cm_light = mpl.colors.ListedColormap(['#A0FFA0', '#FFA0A0', '#A0A0FF'])
cm_dark = mpl.colors.ListedColormap(['g', 'r', 'b'])
y_show_hat = model.predict(x_show) # predict a class for every grid point
print(y_show_hat.shape)
print(y_show_hat)
y_show_hat = y_show_hat.reshape(x1.shape) # reshape back to the grid's shape
print(y_show_hat)
plt.figure(facecolor='w')
plt.pcolormesh(x1, x2, y_show_hat, cmap=cm_light) # paint the predicted decision regions
plt.scatter(x_test[0], x_test[1], c=y_test.ravel(), edgecolors='k', s=100, zorder=10, cmap=cm_dark, marker='*') # test points
plt.scatter(x[0], x[1], c=y.ravel(), edgecolors='k', s=20, cmap=cm_dark) # all data points
plt.xlabel(iris_feature[0], fontsize=13)
plt.ylabel(iris_feature[1], fontsize=13)
plt.xlim(x1_min, x1_max)
plt.ylim(x2_min, x2_max)
plt.grid(True, ls=':', color='#606060')  # positional True for newer matplotlib
plt.title('鸢尾花数据的决策树分类', fontsize=15)
plt.show()
The result looks like this:

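For contrast (my own aside, not a figure from the original post), refitting the same two-feature model with a small max_depth paints a much smoother boundary. A minimal sketch reusing the x_show grid and colormaps above:

# A depth-limited tree carves far coarser, smoother regions than the unconstrained one
model_shallow = DecisionTreeClassifier(criterion='entropy', max_depth=3)
model_shallow.fit(x_train, y_train)
y_shallow = model_shallow.predict(x_show).reshape(x1.shape)

plt.figure(facecolor='w')
plt.pcolormesh(x1, x2, y_shallow, cmap=cm_light)  # smoother decision regions
plt.scatter(x[0], x[1], c=y.ravel(), edgecolors='k', s=20, cmap=cm_dark)
plt.xlabel(iris_feature[0], fontsize=13)
plt.ylabel(iris_feature[1], fontsize=13)
plt.show()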
Now take it one level up: we only used 2 of the 4 features. Choosing 2 out of 4 gives 1 + 2 + 3 = C(4,2) = 6 combinations, so there are just 6 cases to try (they're hard-coded below, though the sketch right after this line shows how to generate them).
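A small aside: rather than hard-coding the six pairs, itertools.combinations can enumerate them:

# Generate every 2-out-of-4 feature combination programmatically
from itertools import combinations

feature_pairs = [list(p) for p in combinations(range(4), 2)]
print(feature_pairs)  # [[0, 1], [0, 2], [0, 3], [1, 2], [1, 3], [2, 3]]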
from sklearn.ensemble import RandomForestClassifier

x_prime = pd.DataFrame(load_iris().data)
y = load_iris().target
x_prime_train, x_prime_test, y_train, y_test = train_test_split(x_prime, y, test_size=0.3, random_state=0)
feature_pairs = [[0, 1], [0, 2], [0, 3], [1, 2], [1, 3], [2, 3]]
plt.figure(figsize=(8, 6))
for i, pair in enumerate(feature_pairs):
    # prepare the data for this feature pair
    x_train = x_prime_train[pair]
    x_test = x_prime_test[pair]
    # fit a random forest (not a single decision tree) on the pair
    model = RandomForestClassifier(n_estimators=100, criterion='entropy', max_depth=5, oob_score=True)
    model.fit(x_train, y_train)
    # plotting
    N, M = 500, 500 # number of grid points along each axis
    x1_min, x2_min = x_train.min()
    x1_max, x2_max = x_train.max()
    t1 = np.linspace(x1_min, x1_max, N)
    t2 = np.linspace(x2_min, x2_max, M)
    x1, x2 = np.meshgrid(t1, t2) # build the mesh of sample points
    x_show = np.stack((x1.flat, x2.flat), axis=1) # grid points to classify
    # predictions on the training and test sets
    y_train_pred = model.predict(x_train)
    acc_train = accuracy_score(y_train, y_train_pred)
    y_test_pred = model.predict(x_test)
    acc_test = accuracy_score(y_test, y_test_pred)
    print('特征:', iris_feature[pair[0]], ' + ', iris_feature[pair[1]])
    print('Score:', model.oob_score_)  # out-of-bag score
    print('\t训练集准确率: %.4f%%' % (100 * acc_train))
    print('\t测试集准确率: %.4f%%\n' % (100 * acc_test))
    cm_light = mpl.colors.ListedColormap(['#A0FFA0', '#FFA0A0', '#A0A0FF'])
    cm_dark = mpl.colors.ListedColormap(['g', 'r', 'b'])
    y_hat = model.predict(x_show)
    y_hat = y_hat.reshape(x1.shape)
    plt.subplot(2, 3, i + 1)
    plt.contour(x1, x2, y_hat, colors='k', levels=[0, 1], antialiased=True, linestyles='--', linewidths=1)
    plt.pcolormesh(x1, x2, y_hat, cmap=cm_light) # decision regions
    plt.scatter(x_train[pair[0]], x_train[pair[1]], c=y_train, s=20, edgecolors='k', cmap=cm_dark, label='训练集')
    plt.scatter(x_test[pair[0]], x_test[pair[1]], c=y_test, s=100, marker='*', edgecolors='k', cmap=cm_dark, label='测试集')
    plt.xlabel(iris_feature[pair[0]], fontsize=12)
    plt.ylabel(iris_feature[pair[1]], fontsize=12)
    plt.xlim(x1_min, x1_max)
    plt.ylim(x2_min, x2_max)
    plt.grid(True, ls=':', color='#606060')  # positional True for newer matplotlib
plt.suptitle('随机森林对鸢尾花数据两特征组合的分类结果', fontsize=15)
plt.tight_layout(pad=1, rect=(0, 0, 1, 0.95))  # rect = (left, bottom, right, top); pad is keyword-only in newer matplotlib
plt.show()
The results look like this:

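One last aside of my own: the pairs above cap each model at two features. Training the forest on all four (giving up the 2-D picture) is the natural next step. A minimal sketch reusing the x_prime splits above:

# Random forest on all 4 iris features; no 2-D decision plot, but all the information
model_full = RandomForestClassifier(n_estimators=100, criterion='entropy', max_depth=5, oob_score=True)
model_full.fit(x_prime_train, y_train)
print('OOB score:', model_full.oob_score_)
print('测试集准确率: %.4f%%' % (100 * accuracy_score(y_test, model_full.predict(x_prime_test))))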
Alright, that's all for today. To close, I have one line to leave you with:
