- 导入数据
- 导入pandas,并且重命名为pd。
- 数据导入
- 数据处理
- 建立模型
- 模型评估
import pandas as pd
titanic = pd.read_csv('titanic.txt')
泰坦尼克号 = pd.read_csv('titanic.txt')
import pandas as pd
from sklearn.feature_extraction import DictVectorizer
# train_test_split lives in sklearn.model_selection; the old
# sklearn.cross_validation module no longer exists in current scikit-learn.
from sklearn.model_selection import train_test_split

# Load the Titanic passenger table from the remote text file.
# NOTE(review): requires network access -- confirm the URL is still reachable.
titanic = pd.read_csv('http://biostat.mc.vanderbilt.edu/wiki/pub/Main/DataSets/titanic.txt')

# Features are passenger class, age and sex; the target is the 'survived' column.
# .copy() makes x an independent frame, so the imputation below modifies x
# itself instead of a temporary slice of `titanic`.
x = titanic[['pclass', 'age', 'sex']].copy()
y = titanic['survived']

# Fill missing ages with the mean age. Assigning the result back (rather than
# calling fillna(..., inplace=True) on a column selection) is reliable across
# pandas versions, including copy-on-write mode.
x['age'] = x['age'].fillna(x['age'].mean())

# Hold out 25% of the rows for testing; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.25, random_state=33
)

# Convert each row to a {column: value} dict and vectorize it into a dense
# array. The vectorizer is fitted on the training rows only and then reused
# for the test rows so both share the same feature columns.
# 'records' must be spelled out in full: pandas 2.x rejects the 'record' abbreviation.
vec = DictVectorizer(sparse=False)
x_train = vec.fit_transform(x_train.to_dict(orient='records'))
x_test = vec.transform(x_test.to_dict(orient='records'))
使用多种用于评价分类任务性能的指标,在测试数据集上对比单一决策树(DecisionTree)、随机森林分类器(RandomForestClassifier)以及梯度提升决策树(Gradient Tree Boosting)的性能差异。
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

# Each model is built with its default hyper-parameters, trained on the
# training split and then asked to predict labels for the test split.
# fit() returns the estimator itself, so construction and training are chained.

# Single decision tree.
dtc = DecisionTreeClassifier().fit(x_train, y_train)
dtc_y_pred = dtc.predict(x_test)

# Random forest.
rfc = RandomForestClassifier().fit(x_train, y_train)
rfc_y_pred = rfc.predict(x_test)

# Gradient tree boosting.
gbc = GradientBoostingClassifier().fit(x_train, y_train)
gbc_y_pred = gbc.predict(x_test)
# Import classification_report from sklearn.metrics.
from sklearn.metrics import classification_report

# For each model, print its accuracy on the test split followed by the
# per-class precision / recall / F1 report.
# classification_report expects (y_true, y_pred) in that order. Passing the
# predictions first would swap precision with recall and make the "support"
# column count predicted labels instead of true labels.
print('The accuracy of decision tree is', dtc.score(x_test, y_test))
print(classification_report(y_test, dtc_y_pred))

print('The accuracy of random forest classifier is', rfc.score(x_test, y_test))
print(classification_report(y_test, rfc_y_pred))

print('The accuracy of gradient tree boosting is', gbc.score(x_test, y_test))
print(classification_report(y_test, gbc_y_pred))
The accuracy of decision tree is 0.7811550151975684
precision recall f1-score support
0 0.91 0.78 0.84 236
1 0.58 0.80 0.67 93
avg / total 0.81 0.78 0.79 329
The accuracy of random forest classifier is 0.78419452887538
precision recall f1-score support
0 0.90 0.78 0.84 233
1 0.60 0.79 0.68 96
avg / total 0.81 0.78 0.79 329
The accuracy of gradient tree boosting is 0.790273556231003
precision recall f1-score support
0 0.92 0.78 0.84 239
1 0.58 0.82 0.68 90
avg / total 0.83 0.79 0.80 329