以前使用knn都是调用sklearn包里面的程序,这次自己尝试编写一下程序,如果有不足之处还望大家指点~
首先knn的原理其实很简单:先给模型训练数据,接着每来一条测试数据,就计算它与所有训练数据的距离,选出距离最小的k条(即k近邻;k最好取奇数,以减少票数相同、难以决策的情况),统计这k条数据中出现次数最多的类标,然后将该类标作为测试数据的预测类标。
废话不多说,直接上代码,注解都写得十分清楚了
# -*- coding: utf-8 -*-
"""
Created on Sun Apr 12 15:00:44 2020
@author: asus
"""
'''
knn实现
'''
import pandas as pd
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
# Absolute Windows path to iris.csv; it must point at an existing file
# because the CSV is read as soon as this module is executed.
path = r'F:\大三下\李艳老师数据挖掘实践\knn\iris.csv'
# Module-level DataFrame that the main section passes to train_test_train.
data = pd.read_csv(path)
#print(data.head())
# The raw data is sorted by class, so it has to be shuffled before it is
# split; the shuffle below is disabled because train_test_train does its own.
#data = data.sample(frac = 1).reset_index(drop=True)
#print(data)
def train_test_train(data, target_columns, test_size=0.25, random_state=None):
    """Shuffle ``data`` and split it into train/test feature and label arrays.

    Args:
        data: DataFrame holding the feature columns and the label column.
        target_columns: Name of the label column (the main section of this
            script passes ``'Name'``).
        test_size: Fraction of the rows that goes to the test part.
        random_state: Seed forwarded to ``DataFrame.sample`` so the shuffle
            can be reproduced.

    Returns:
        ``(X_train, X_test, y_train, y_test)`` as NumPy arrays. The first
        ``int(len(data) * (1 - test_size))`` shuffled rows form the training
        part and the remaining rows form the test part.
    """
    # Shuffle every row (frac=1) and renumber the index from zero.
    shuffled = data.sample(frac=1, random_state=random_state)
    shuffled = shuffled.reset_index(drop=True)

    # Row position where the training part ends and the test part begins.
    n_train = int(len(shuffled) * (1 - test_size))

    # Separate the label column from the remaining (feature) columns.
    labels = np.array(shuffled[target_columns])
    features = np.array(shuffled.drop(columns=target_columns))

    return (
        features[:n_train],
        features[n_train:],
        labels[:n_train],
        labels[n_train:],
    )
class KNNClassfier(object):
    """Brute-force k-nearest-neighbours classifier using Euclidean distance.

    ``fit`` only stores the training data. ``predict`` measures the distance
    from each test sample to every training sample and returns the label
    that occurs most often among the ``n_neighbors`` closest ones.
    """

    def __init__(self, n_neighbors=3):
        """Create an unfitted classifier.

        Args:
            n_neighbors: Number of nearest training samples that vote on the
                label of a test sample (default 3).
        """
        self.n_neighbors = n_neighbors
        self.X_train = None
        self.y_train = None

    def fit(self, X_train, y_train):
        """Store the training data.

        Args:
            X_train: Array-like of shape ``[n_samples, n_features]``.
            y_train: Array-like with one label per training sample; a column
                vector of shape ``[n_samples, 1]`` is flattened to 1-D.
        """
        self.X_train = np.asarray(X_train)
        self.y_train = np.asarray(y_train).ravel()

    def predict(self, X_test):
        """Predict a label for every row of ``X_test``.

        Args:
            X_test: Array-like of shape ``[n_samples, n_features]``.

        Returns:
            A list holding one predicted label per test sample.

        Raises:
            ValueError: If ``n_neighbors`` is smaller than 1 or larger than
                the number of stored training samples.
        """
        X_test = np.asarray(X_test)
        k = self.n_neighbors
        n_train = self.X_train.shape[0]
        if not 1 <= k <= n_train:
            raise ValueError(
                "n_neighbors must be between 1 and the number of training "
                "samples ({}), got {}".format(n_train, k)
            )

        y_pred = []
        for sample in X_test:
            # Euclidean distance from this sample to every training row.
            distances = np.linalg.norm(self.X_train - sample, axis=1)
            # A stable sort keeps equally distant rows in training order.
            nearest = np.argsort(distances, kind="stable")[:k]
            labels = list(self.y_train[nearest])
            # Majority vote; max() returns the first maximal element, so a
            # tie goes to the tied label whose neighbour is closest.
            y_pred.append(max(labels, key=labels.count))
        return y_pred

    def score(self, X_test, y_test):
        """Return the accuracy of the classifier on a labelled test set.

        Args:
            X_test: Array-like of shape ``[n_samples, n_features]``.
            y_test: Array-like holding the true label of every test sample.

        Returns:
            Fraction of test samples whose predicted label equals the true
            label.
        """
        pred = np.asarray(self.predict(X_test))
        # Compare this call's own predictions, not a variable from outside.
        return np.mean(pred == np.asarray(y_test).ravel())
if __name__ == '__main__':
    # Hold out 30% of the rows for testing; the fixed seed makes the split
    # reproducible.
    X_train, X_test, y_train, y_test = train_test_train(
        data,
        target_columns='Name',
        test_size=0.3,
        random_state=85,
    )

    # Hand-written 3-nearest-neighbour classifier. For a reference run,
    # sklearn's KNeighborsClassifier(n_neighbors=3) can be driven with the
    # same fit / predict / score calls.
    knn = KNNClassfier(n_neighbors=3)
    knn.fit(X_train, y_train)

    # Test-set predictions are kept in a module-level variable; accuracy can
    # also be computed by hand as np.mean(y_pred == y_test).
    y_pred = knn.predict(X_test)

    test_score = knn.score(X_test, y_test)
    print("Test set score: {:.2f}".format(test_score))
我还自己写了划分数据集的函数,也算是初步实现了knn,后期再改进。