文章目录

一、前言

开发环境:Anaconda | Python 3.5 — PyCharm / Jupyter Notebook。

目前验证码识别的开源项目已经非常多了,这里列举一下 Keras 官方的示例项目:
https://keras.io/examples/vision/captcha_ocr/

项目的整个识别流程:

① 验证码清理并生成训练集样本
② 验证码特征提取
  -图像灰度化、二值化,降噪
  -对于传统机器学习方法,可以先对字符进行切割;若验证码中的字符无法切割,则只能使用深度学习的方法

③ 拟合识别模型
④ 识别模型测试

关于数据集

数据集来自:https://github.com/wzzzd/captcha_ml 。验证码由数字、大写字母和小写字母组合而成,即字符集为 [0-9A-Za-z]。文件名就是验证码的正确标签,示例如下:

python 基于机器学习—深度学习识别验证码_特征提取

二、特征提取

先将图像灰度化(可以先做高斯模糊降噪),再进行二值化,之后针对二值图再次降噪,去除零星噪点。
特征提取:其实图像本身矩阵就是特征,也可以使用HOG特征

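关于HOG的使用方法:https://www.programcreek.com/python/example/84776/cv2.HOGDescriptor (该链接给出的是 OpenCV 的 cv2.HOGDescriptor 示例;下面的代码调用的是 hog() 函数,其参数形式与 scikit-image 的 skimage.feature.hog 一致。)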

# Compute one HOG descriptor per segmented character image and stack the
# descriptors into a float64 array.
list_hog_fd = []
for feature in image_split_array:
    # Each entry is reshaped to 28x28 before the descriptor is computed.
    # The former `visualise=False` argument is intentionally omitted: False is
    # the default, and newer scikit-image releases no longer accept that
    # keyword (it was renamed to `visualize`), so passing it raises TypeError
    # there. NOTE(review): assumes `hog` is skimage.feature.hog - confirm.
    fd = hog(
        feature.reshape((28, 28)),
        orientations=9,
        pixels_per_cell=(14, 14),
        cells_per_block=(1, 1),
    )
    list_hog_fd.append(fd)
hog_features = np.array(list_hog_fd, 'float64')

三、模型训练

模型可以选用 SVM 中的 SVC,也可以选用随机森林(random forest)。下面的代码以随机森林为例:
def trainModel(data, label):
    """Cross-validate a random forest, fit it on all samples and persist it.

    Prints the mean score of a 10-fold cross-validation, then fits the
    classifier on the complete ``data``/``label`` set and dumps it to
    ``model_path`` with joblib.

    Args:
        data: Feature matrix handed to scikit-learn.
        label: Target labels matching ``data``.

    Returns:
        None.
    """
    print("fit model >>>>>>>>>>>>>>>>>>>>>>")

    # The original also carried two commented-out alternatives:
    # svm.SVC(decision_function_shape='ovo') with an 'rbf' or 'linear' kernel.
    forest_params = dict(
        n_estimators=100,
        max_depth=10,
        min_samples_split=10,
        random_state=0,
    )
    rf = RandomForestClassifier(**forest_params)

    # Mean accuracy over 10 folds, reported before the final fit.
    fold_scores = cross_val_score(rf, data, label, cv=10)
    print("rf: ", fold_scores.mean())

    # Final fit on everything, then save the model to disk.
    rf.fit(data, label)
    joblib.dump(rf, model_path)
    print("model save success!")

四、模型预测

图像预测时的预处理流程必须与制作训练集一样,否则准确度无法保证。

import os
from captcha_test.captcha_soc import image_process, image_feature, image_model, image_training
from sklearn.externals import joblib
from captcha_test.captcha_soc.config import *


def clean():
    """Load the test captchas and reduce each one to segmented characters.

    Pipeline: read the captchas from ``test_data_path`` -> convert to
    grayscale and remove background noise -> binarise and remove the
    remaining noise points -> split into characters.

    Returns:
        A ``(image_array, image_label)`` tuple. ``image_array`` holds one
        split result per captcha, i.e.
        ``[[im_1_1, im_1_2, ...], [im_2_1, im_2_2, ...], ...]``;
        ``image_label`` is the label list returned by
        ``image_process.read_captcha``.
    """
    raw_images, image_label = image_process.read_captcha(test_data_path)
    print("待测试的验证码数量:", len(raw_images))

    grayscale_images = image_process.image_transfer(raw_images)
    image_array = [
        image_process.image_split(image_process.get_clear_bin_image(gray))
        for gray in grayscale_images
    ]
    return image_array, image_label


def featrue_generate(image_array):
    """Build the per-captcha feature matrices used for prediction.

    Args:
        image_array: Iterable of captchas, each an iterable of segmented
            character images.

    Returns:
        A list with one entry per captcha. Each entry is the list of feature
        vectors produced by ``image_feature.feature_transfer`` for that
        captcha's characters. A captcha that yielded no characters gets
        ``image_character_num`` all-zero vectors of length
        ``image_width + image_height`` instead. An empty ``image_array``
        yields an empty list.
    """
    feature = []
    for image in image_array:
        feature_each_image = [
            image_feature.feature_transfer(im_meta) for im_meta in image
        ]
        if not feature_each_image:
            # Placeholder rows for a captcha with no characters. Each row is
            # built separately so the rows do not alias a single shared list.
            feature_each_image = [
                [0] * (image_width + image_height)
                for _ in range(int(image_character_num))
            ]
        feature.append(feature_each_image)

    print("预测数据的长度:", len(feature))
    # Only show a sample when there is one; indexing an empty list would raise.
    if feature:
        print("预测数据特征示例:", feature[0])
    return feature


def write_to_file(predict_list):
    """Write the predictions to ``output_path`` as a tab-separated table.

    A header row (``file_name``, ``result``) is written just before the first
    prediction, so an empty ``predict_list`` produces an empty file. Every
    following row pairs the i-th name from ``os.listdir(test_data_path)``
    with the i-th prediction.

    Args:
        predict_list: Predicted captcha strings.

    NOTE(review): the pairing relies on ``os.listdir`` returning the files in
    the same order the predictions were produced - confirm against the
    loader.
    """
    names = os.listdir(test_data_path)
    with open(output_path, 'w') as out:
        for idx, prediction in enumerate(predict_list):
            if idx == 0:
                out.write("file_name\tresult\n")
            out.write("".join((names[idx], '\t', prediction, '\n')))
    print("结果输出到文件:", output_path)

def main():
    """Run the prediction pipeline on the test captchas and report accuracy.

    Steps: clean and segment the captchas, build their features, load the
    model from ``model_path``, predict every captcha (the per-character
    predictions are joined into one string), print the mismatches and the
    overall accuracy, and write all predictions to the output file.
    """
    image_array, image_label = clean()
    feature = featrue_generate(image_array)
    predict_list = []
    acc = 0
    model = joblib.load(model_path)

    for num, line in enumerate(feature):
        predict = ''.join(model.predict(line))
        predict_list.append(predict)
        if predict == image_label[num]:
            acc += 1
        else:
            # NOTE(review): the source's indentation was lost; these prints
            # are assumed to belong to the mismatch branch - confirm.
            print("-----------------------")
            print("actual:",image_label[num])
            print("predict:", predict)

    # Guard against an empty test set, which would otherwise divide by zero.
    total = len(image_label)
    accuracy = acc / total if total else 0.0
    print("测试集预测acc:", accuracy)
    write_to_file(predict_list)

# Run the prediction pipeline only when this file is executed as a script.
if __name__ == "__main__":
    main()