Simple applications of the KNN algorithm
The simplest KNN example
import pandas as pd
import numpy as np
df = pd.read_excel('./datasets/my_films.xlsx',engine='openpyxl')
feature = df[['Action Lens','Love Lens']]
target = df['target']
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=3) # n_neighbors is the k in KNN
knn.fit(feature,target) #X must be 2-D: one row per sample, one column per feature
knn.predict(feature)==np.array(target) #compare predictions on the training data with the true labels
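Once fitted, the model can classify an unseen sample. A minimal sketch; the feature values below are made up purely for illustration, and predict expects a 2-D input just like fit:
#Hypothetical new film: 10 action scenes, 2 love scenes (made-up values)
new_film = pd.DataFrame([[10,2]],columns=['Action Lens','Love Lens'])
print(knn.predict(new_film)) #predicted class label
print(knn.predict_proba(new_film)) #class probabilities from the 3 nearest neighbors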
Loading a dataset from scikit-learn
#Demonstrates loading one of scikit-learn's bundled datasets
from sklearn.neighbors import KNeighborsClassifier
from sklearn import datasets as dt
#Load the iris sample data
iris = dt.load_iris()
feature = iris.data #feature matrix
target = iris.target #label vector
print(type(feature),np.shape(feature)) # a 150-row, 4-column numpy array
print(type(target),np.shape(target)) # a 1-D numpy array
#Split the dataset
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(feature,target,test_size=0.2,random_state=2021)
#Train
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(x_train,y_train)
knn.score(x_test,y_test)
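For a classifier, score is plain accuracy. A small sketch recomputing it by hand (accuracy_score is a standard scikit-learn metric, added here only for illustration):
from sklearn.metrics import accuracy_score
y_pred = knn.predict(x_test)
print(accuracy_score(y_test,y_pred)) #same value as knn.score(x_test,y_test)
print(np.mean(y_pred == y_test)) #accuracy computed directly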
One-hot encoding a column and dropping the original
#Demonstrates one-hot encoding a column and dropping the original
#Load the source data
data = pd.read_csv('./datasets/adults.txt')[['age','education_num','occupation','hours_per_week','salary']]
feature = data[['age','education_num','occupation','hours_per_week']]
target = data.salary
#Feature engineering: one-hot encode the occupation column
one_hot_df = pd.get_dummies(feature['occupation'])
feature = pd.concat((feature,one_hot_df),axis=1).drop('occupation',axis=1)
#Split the dataset
x_train,x_test,y_train,y_test = train_test_split(feature,target,test_size=0.2,random_state=2021)
#Train
knn = KNeighborsClassifier(n_neighbors=30)
knn.fit(x_train,y_train)
knn.score(x_test,y_test)
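The same encoding can also live inside a pipeline, so the mapping is learned once and reapplied automatically at predict time. A sketch using OneHotEncoder and ColumnTransformer, which are standard scikit-learn tools not used in the original notes:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import make_pipeline
raw_feature = data[['age','education_num','occupation','hours_per_week']]
xr_train,xr_test,yr_train,yr_test = train_test_split(raw_feature,target,test_size=0.2,random_state=2021)
#One-hot encode 'occupation', pass the numeric columns through untouched
pre = ColumnTransformer([('occupation_onehot',OneHotEncoder(handle_unknown='ignore'),['occupation'])],remainder='passthrough')
pipe = make_pipeline(pre,KNeighborsClassifier(n_neighbors=30))
pipe.fit(xr_train,yr_train)
print(pipe.score(xr_test,yr_test))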
Feature scaling: normalization and standardization
from sklearn.preprocessing import MinMaxScaler
#Compare the model's score with and without feature scaling
data = pd.read_csv('./datasets/datingTestSet.txt',header=None,sep='\t')
feature = data[[0,1,2]]
target = data[3]
#Baseline: no scaling
x_train,x_test,y_train,y_test = train_test_split(feature,target,test_size=0.2)
knn = KNeighborsClassifier(n_neighbors=15)
knn.fit(x_train,y_train)
print("Normal: ",knn.score(x_test,y_test))
#Min-max normalization
mm = MinMaxScaler() #the key step
m_feature = mm.fit_transform(feature) #the key step; note: fitting the scaler on the full dataset before the split leaks test information, so in practice fit it on x_train only
x_train,x_test,y_train,y_test = train_test_split(m_feature,target,test_size=0.2)
knn = KNeighborsClassifier(n_neighbors=15)
knn.fit(x_train,y_train)
print("MinMaxScaler: ",knn.score(x_test,y_test))
Hyperparameter search loop
#Naive brute-force loop over the hyperparameter k
import matplotlib.pyplot as plt
data = pd.read_csv('./datasets/datingTestSet.txt',header=None,sep='\t')
feature = data[[0,1,2]]
target = data[3]
x_train,x_test,y_train,y_test = train_test_split(feature,target,test_size=0.2)
scores = []
for k in range(1,50):
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(x_train,y_train)
    scores.append(knn.score(x_test,y_test))
print(np.argmax(scores)+1) # scores[i] corresponds to k = i + 1, so add 1 to get the best k
plt.plot(range(1,50),scores)
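To see where k under- or over-fits, the same loop can track training accuracy alongside test accuracy; a quick sketch that is not in the original notes:
train_scores,test_scores = [],[]
for k in range(1,50):
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(x_train,y_train)
    train_scores.append(knn.score(x_train,y_train)) #accuracy on the data the model was fit on
    test_scores.append(knn.score(x_test,y_test)) #accuracy on held-out data
plt.plot(range(1,50),train_scores,label='train')
plt.plot(range(1,50),test_scores,label='test')
plt.xlabel('k')
plt.ylabel('accuracy')
plt.legend()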
Hyperparameter loop + k-fold cross-validation
#Naive hyperparameter loop + k-fold cross-validation
from sklearn.model_selection import cross_val_score
import matplotlib.pyplot as plt
data = pd.read_csv('./datasets/datingTestSet.txt',header=None,sep='\t')
feature = data[[0,1,2]]
target = data[3]
x_train,x_test,y_train,y_test = train_test_split(feature,target,test_size=0.2)
scores = []
for k in range(1,50):
    knn = KNeighborsClassifier(n_neighbors=k)
    scores.append(cross_val_score(knn,x_train,y_train,cv=8).mean())
print(np.argmax(scores)+1) # add 1 because scores[i] corresponds to k = i + 1
plt.plot(range(1,50),scores)
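GridSearchCV wraps this loop-plus-cross-validation pattern in a single call and refits the best model automatically; a sketch using the same data and cv=8 setup:
from sklearn.model_selection import GridSearchCV
params = {'n_neighbors': list(range(1,50))}
gs = GridSearchCV(KNeighborsClassifier(),params,cv=8)
gs.fit(x_train,y_train)
print(gs.best_params_,gs.best_score_) #best k and its mean cross-validation accuracy
print(gs.score(x_test,y_test)) #evaluate the refit best model on the held-out test set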