import numpy as np
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsClassifier
加载样本数据
- matplotlib读取图片
- 将每一张图片转化为一组数组
# Read one sample bitmap from the digits/0 folder into an array.
bmp1 = plt.imread('digits/0/0_1.bmp')
# Inspect the array shape.
display(bmp1.shape)
# Render the image with matplotlib's default colormap.
plt.imshow(bmp1)
(28, 28)
<matplotlib.image.AxesImage at 0x5d80c50>
# Change the display colors: reload the same sample and render it in grayscale.
bmp1 = plt.imread('digits/0/0_1.bmp')
# Inspect the array shape.
display(bmp1.shape)
# Render the image using the 'gray' colormap.
plt.imshow(bmp1,cmap='gray')
(28, 28)
<matplotlib.image.AxesImage at 0x5dfdeb0>
尝试将两张图片转化为2行 28*28列
# Load a second sample so that two images can be stacked into one matrix.
bmp2 = plt.imread('digits/0/0_2.bmp')
# Flatten each 2-D image to 1-D (ravel(); reshape(-1) is equivalent) and
# stack the flattened images as the rows of a single array.
digits = np.array([image.ravel() for image in (bmp1, bmp2)])
digits.shape
(2, 784)
尝试将数组中某一张图片进行显示
# Reshape one flattened row back to 28x28 and show it in grayscale.
plt.imshow(digits[0].reshape(28,28),cmap='gray')
<matplotlib.image.AxesImage at 0x4ca1e90>
批量化读取所有的图片
- 样本数据:每一张图片展开后的一维数组
- 标签:图片文件所在的目录名(即数字 0-9)
# Build the full dataset: one flattened image per row plus its digit label.
data = []    # goal after conversion: shape (5000, 784)
target = []  # goal after conversion: shape (5000,)
for label in range(10):
    # Files are laid out as digits/<label>/<label>_<index>.bmp, index 1..500.
    for index in range(1, 501):
        bmp_filename = f'digits/{label}/{label}_{index}.bmp'
        bmp = plt.imread(bmp_filename)
        # Flatten the 2-D image into a 1-D feature vector.
        data.append(bmp.ravel())
        # The label is the digit that names the containing directory.
        target.append(label)
# Convert the Python lists to ndarrays.
data = np.array(data)
target = np.array(target)
display(data.shape, target.shape)
(5000, 784)
(5000,)
创建分类模型
- KNN 分类
- 邻近数量:5,7,9,11
- weights: uniform,distance # 权重
# Create a k-nearest-neighbours classifier that uses 7 neighbours.
knn = KNeighborsClassifier(n_neighbors=7)
# The bare string below has no effect on the model; it appears to be a copy of
# the KNeighborsClassifier constructor signature kept for reference.
"""
KNeighborsClassifier(
n_neighbors=5,
weights='uniform',
algorithm='auto',
leaf_size=30,
p=2,
metric='minkowski',
metric_params=None,
n_jobs=None,
**kwargs,
)
"""
# Hold out 20% of the samples for testing.
# NOTE(review): no random_state is passed, so the split (and every score
# computed from it) is presumably different on each run — confirm intended.
from sklearn.model_selection import train_test_split as split
X_train,X_test,y_train,y_test = split(data,target,test_size=0.2)
使用拆分出的训练集训练模型,并在测试集上评分
# Fit the classifier on the training split.
knn.fit(X_train,y_train)
KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
metric_params=None, n_jobs=None, n_neighbors=7, p=2,
weights='uniform')
# Score the fitted classifier on the held-out test split.
knn.score(X_test,y_test)
0.93
# Experiment with n_neighbors=5: update the parameter, refit, score again.
knn.set_params(n_neighbors=5)
knn.fit(X_train,y_train)
knn.score(X_test,y_test)
0.929
# Experiment with n_neighbors=9: update the parameter, refit, score again.
knn.set_params(n_neighbors=9)
knn.fit(X_train,y_train)
knn.score(X_test,y_test)
0.922
# Keep n_neighbors=9 but switch the weights option to 'distance'.
knn.set_params(n_neighbors=9,weights='distance')
knn.fit(X_train,y_train)
knn.score(X_test,y_test)
0.926
# Experiment with n_neighbors=11. Only n_neighbors is changed here, so
# weights keeps the 'distance' value set by the preceding set_params call.
knn.set_params(n_neighbors=11)
knn.fit(X_train,y_train)
knn.score(X_test,y_test)
0.917
使用全部样本重新训练
# Refit on the full dataset (training and test rows together).
# NOTE(review): this reuses whatever parameters the last experiment left on
# `knn` rather than an explicitly chosen configuration — confirm intended.
knn.fit(data,target)
KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
metric_params=None, n_jobs=None, n_neighbors=11, p=2,
weights='distance')
import os
加载测试样本
# Load one image from the separate test folder and inspect its shape.
test1 = plt.imread('digits/test/4.bmp')
test1.shape
(28, 28, 3)
# Average over the last axis to collapse the colour channels into one 2-D image.
plt.imshow(test1.mean(axis=-1))
<matplotlib.image.AxesImage at 0xada3b30>
# Same averaged image, rendered with the grayscale colormap.
plt.imshow(test1.mean(axis=-1),cmap='gray')
<matplotlib.image.AxesImage at 0xaddf710>
# Load every image in digits/test as a flattened grayscale row; the label is
# the file name without its extension (kept as a string).
test_data = []
test_target = []
# os.listdir() returns entries in arbitrary order, so sort for a stable order.
for filename in sorted(os.listdir('digits/test')):
    print(filename)
    bmp_file = os.path.join('digits/test', filename)
    bmp = plt.imread(bmp_file)
    # A colour image is 3-D (28, 28, channels). Average only the first three
    # (R, G, B) channels so that an alpha channel, if present, does not skew
    # the intensity; this reduces the image to one value per pixel.
    # An image that is already 2-D is used unchanged.
    gray = bmp[..., :3].mean(axis=-1) if bmp.ndim == 3 else bmp
    test_data.append(gray.ravel())
    # os.path.split() separates directory and file name;
    # os.path.splitext() separates the base name from the extension.
    label, ext_name = os.path.splitext(filename)
    test_target.append(label)
test_data = np.array(test_data)
test_target = np.array(test_target)
display(test_data.shape, test_target.shape)
4.bmp
5.bmp
6.bmp
7.bmp
8.bmp
9.bmp
(6, 784)
(6,)
# Show the collected labels (file-name stems, stored as strings).
test_target
array(['4', '5', '6', '7', '8', '9'], dtype='<U1')
# Shape of a single flattened test image.
test_data[0].shape
(784,)
# Inspect the raw test matrix (one flattened image per row).
test_data
array([[255., 255., 255., ..., 255., 255., 255.],
[255., 255., 255., ..., 255., 255., 255.],
[255., 255., 255., ..., 255., 255., 255.],
[255., 255., 255., ..., 255., 255., 255.],
[255., 255., 255., ..., 255., 255., 255.],
[255., 255., 255., ..., 255., 255., 255.]])
预测测试数据集的结果
# Predict a digit for every row of the test matrix and show the result.
y_ = knn.predict(test_data)
y_
array([4, 4, 5, 1, 0, 5])
可视化的方式显示预测结果
# Show every test image titled with its true label and the model's prediction.
plt.figure(figsize=(10,12))
# Three images per row. The row count is derived from the number of images
# (ceiling division, never fewer than the original two rows) so the grid does
# not overflow when digits/test holds more than six files.
n_cols = 3
n_rows = max(2, -(-len(test_data) // n_cols))
for i, test_bmp in enumerate(test_data):
    plt.subplot(n_rows, n_cols, i + 1)
    plt.imshow(test_bmp.reshape(28, 28), cmap='gray')
    plt.title(f'True:{test_target[i]} Pred:{y_[i]}', size=20)
    plt.axis('off')  # hide the axes
plt.show()
读取手写的数字
- filename: digits/test/4_2.bmp
# Load the hand-written test digit and inspect its shape.
test_4_2 = plt.imread('digits/test/4_2.bmp')
test_4_2.shape
(28, 28, 4)
# This image has 4 channels (shape (28, 28, 4); presumably R, G, B, A).
# Average only the first three channels: including the fourth in the mean
# would shift every pixel intensity and make the input inconsistent with the
# single-channel training images.
# predict() takes a 2-D (n_samples, n_features) array, hence reshape(1, -1).
y_2 = knn.predict(test_4_2[..., :3].mean(axis=-1).reshape(1, -1))
display(y_2)
array([1])
from pandas import Series
D:\yingyong\Anaconda3\lib\importlib\_bootstrap.py:219: RuntimeWarning: numpy.ufunc size changed, may indicate binary incompatibility. Expected 112 from C header, got 124 from PyObject
return f(*args, **kwds)
# List the distinct pixel values present in the hand-written image.
Series(test_4_2.ravel()).unique()
array([255, 0], dtype=uint64)
# For comparison, list the distinct pixel values of a training sample.
Series(bmp1.ravel()).unique()
array([255, 204, 96, 2, 205, 207, 17, 3, 18, 201, 28, 16, 22,
198, 249, 245, 195, 31, 53, 171, 133, 92, 159, 66, 88, 65,
141, 27, 208, 176, 0, 87, 76, 243, 180, 134, 234, 12, 217,
90, 47, 248, 77, 15, 184, 236, 227, 60, 192, 57, 59, 179,
9, 143, 107, 170, 25, 230, 120, 69, 32, 124, 30, 110, 82,
169, 93, 6, 109, 226, 199, 26, 40, 125, 56, 127, 114, 218],
dtype=uint64)
从 5000 张训练图片中随机抽取 50 张进行测试
# Index array 0..4999, one entry per row of the training data.
index = np.arange(5000)
index
array([ 0, 1, 2, ..., 4997, 4998, 4999])
# Demonstration only: the shuffled copy is displayed but not stored.
np.random.permutation(index)# randomly shuffle the indices
array([4660, 2679, 3913, ..., 525, 3990, 146])
# Draw 50 random row indices in [0, 5000) to test on, then check the shape of
# the selected rows.
test_index = np.random.randint(5000,size=50)
data[test_index].shape
(50, 784)
# Predict the sampled rows, then show their true labels for comparison.
y_ = knn.predict(data[test_index])
target[test_index]
array([3, 3, 7, 8, 4, 7, 4, 3, 7, 3, 6, 5, 9, 7, 1, 8, 8, 2, 1, 5, 6, 2,
5, 7, 9, 3, 0, 3, 9, 2, 6, 0, 0, 6, 0, 8, 6, 7, 6, 2, 1, 0, 4, 9,
9, 8, 5, 0, 4, 5])
# Show the predictions for the sampled rows.
y_
array([3, 3, 7, 8, 4, 7, 4, 3, 7, 3, 6, 5, 9, 7, 1, 8, 8, 2, 1, 5, 6, 2,
5, 7, 9, 3, 0, 3, 9, 2, 6, 0, 0, 6, 0, 8, 6, 7, 6, 2, 1, 0, 4, 9,
9, 8, 5, 0, 4, 5])
# Accuracy on the sample: count of labels equal to the prediction divided by
# the sample size.
# NOTE(review): `knn` was last fitted on all of `data`, so these rows were
# seen during training; this is not a held-out accuracy.
test_label = target[test_index]
test_label[test_label == y_].size / test_label.size
1.0
# Repeat the check with a fresh random sample of 50 rows from `data`.
# NOTE(review): `knn` was last fitted on all of `data`, so these rows were
# seen during training; this is not a held-out accuracy.
test_index = np.random.randint(5000,size=50)
y_ = knn.predict(data[test_index])
test_label = target[test_index]
test_label[test_label == y_].size / test_label.size
1.0
# Sample 50 rows of the held-out split, predict them and compute the fraction
# predicted correctly. The indices are drawn within X_test's row count, and
# the predictions and labels are taken from X_test / y_test so that they refer
# to the same images that the plotting code indexes with test_index.
# (The original predicted data[test_index] while plotting X_test[index].)
test_index = np.random.randint(len(X_test), size=50)
y_ = knn.predict(X_test[test_index])
test_label = y_test[test_index]
test_label[test_label == y_].size / test_label.size
1.0
# Draw the sampled test images in a 5x10 grid, each titled with the true
# label (T) and the prediction (P).
# NOTE: y_ is assumed to hold one prediction per entry of test_index, in the
# same order.
plt.figure(figsize=(30,20))
for i, index in enumerate(test_index, start=1):
    plt.subplot(5, 10, i)  # subplot positions are 1-based
    plt.imshow(X_test[index].reshape(28, 28), cmap='gray')
    plt.axis('off')
    plt.title(f'T:{y_test[index]} P:{y_[i-1]}', size=30)
plt.show()
通过mean()方式实现降维效果
# A small 3-D example array: two blocks, each with three rows and three columns.
a = np.array([
    [[3, 4, 5], [5, 6, 7], [3, 6, 9]],
    [[1, 2, 5], [3, 5, 7], [6, 8, 9]],
])
a.shape  # two blocks of three rows by three columns
(2, 3, 3)
# axis=0 averages the two 3x3 blocks element by element:
#   2   = (3+1)/2    3   = (4+2)/2    5 = (5+5)/2
#   4   = (5+3)/2    5.5 = (6+5)/2    7 = (7+7)/2
#   4.5 = (3+6)/2    7   = (6+8)/2    9 = (9+9)/2
a.mean(axis=0)
array([[2. , 3. , 5. ],
[4. , 5.5, 7. ],
[4.5, 7. , 9. ]])
# axis=1 averages over the three rows inside each block (one mean per column):
#   3.66666667 = (3+5+3)/3    5.33333333 = (4+6+6)/3    7 = (5+7+9)/3
#   3.33333333 = (1+3+6)/3    5          = (2+5+8)/3    7 = (5+7+9)/3
a.mean(axis=1)
array([[3.66666667, 5.33333333, 7. ],
[3.33333333, 5. , 7. ]])
# axis=-1 averages the three values inside each innermost row (divide by 3):
#   4          = (3+4+5)/3    6 = (5+6+7)/3    6          = (3+6+9)/3
#   2.66666667 = (1+2+5)/3    5 = (3+5+7)/3    7.66666667 = (6+8+9)/3
a.mean(axis=-1)
array([[4. , 6. , 6. ],
[2.66666667, 5. , 7.66666667]])