4.0 Neural Network Backpropagation
Backpropagation
The BP algorithm consists of two phases: forward propagation of the signal and backward propagation of the error.
During forward propagation, an input sample enters at the input layer and is passed layer by layer through the hidden layers to the output layer. If the actual output of the output layer differs from the desired output (the teacher signal), the algorithm switches to error backpropagation; if the actual output matches the desired output, learning stops.
During backpropagation, the output error (the difference between the desired and actual output) is propagated back along the original path, through the hidden layers to the input layer. In this backward pass the error is apportioned to every unit in every layer, giving each unit an error signal that serves as the basis for updating its weights. The update itself is done by gradient descent: the weights and biases of every layer are adjusted repeatedly until the error signal is reduced to a minimum.
This continual adjustment of weights and biases is the network's learning and training process. Forward signal propagation and backward error propagation alternate, and the updates are repeated until a preset number of training iterations is reached or the output error falls below an acceptable level.
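As a minimal illustration (my own sketch, not part of the original text), one gradient-descent update of a weight matrix looks like this, assuming grad is the partial derivative of the error with respect to the weights obtained from the backward pass:

import numpy as np

def gradient_descent_step(weights, grad, learning_rate=0.1):
    # Move the weights a small step in the direction that decreases the error
    return weights - learning_rate * grad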
Gradient
Loss function
The cost function is non-convex, so the final result may be a local minimum rather than the global minimum; in practice it is usually still a reasonably good local minimum.
Python syntax
np.random.uniform()
numpy.random.uniform(low, high, size)
Purpose: draw random samples from a uniform distribution over [low, high). Note that the interval is closed on the left and open on the right, i.e. low is included and high is excluded.
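A quick check of the behaviour described above:

import numpy as np

# Draw 5 samples uniformly from [-0.5, 0.5); low is included, high is excluded
samples = np.random.uniform(-0.5, 0.5, 5)
print(samples.shape)                                      # (5,)
print((samples >= -0.5).all() and (samples < 0.5).all())  # True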
Python implementation
1 Loading the data
# Neural network for multi-class classification
# Example: handwritten digit recognition
# Data set: ex4data1.mat
# Initial parameters: ex4weights.mat
# Build the complete neural network
import numpy as np
import matplotlib.pyplot as plt
import scipy.io as sio
from scipy.optimize import minimize
path = 'ex4data1.mat'
data = sio.loadmat(path)
raw_X = data['X']
raw_Y = data['y']
X = np.insert(raw_X, 0, values=1, axis=1)
# (5000,401)
print(X.shape)
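Here np.insert with values=1 and axis=1 prepends a bias column of ones to every sample. A tiny sketch on a made-up 2x2 array (my own illustration) shows the effect:

import numpy as np

A = np.array([[1., 2.],
              [3., 4.]])
A_bias = np.insert(A, 0, values=1, axis=1)  # insert a column of ones before column 0
print(A_bias)
# [[1. 1. 2.]
#  [1. 3. 4.]]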
2 One-hot encoding y
# One-hot encode the labels in y
def one_hot_encoder(raw_y):
    result = []
    for i in raw_y:  # labels are 1-10
        y_temp = np.zeros(10)
        y_temp[i - 1] = 1  # set the index corresponding to label i
        result.append(y_temp)
    return np.array(result)

y = one_hot_encoder(raw_Y)
print(y, y.shape)
'''[[0. 0. 0. ... 0. 0. 1.]
[0. 0. 0. ... 0. 0. 1.]
[0. 0. 0. ... 0. 0. 1.]
...
[0. 0. 0. ... 0. 1. 0.]
[0. 0. 0. ... 0. 1. 0.]
[0. 0. 0. ... 0. 1. 0.]] (5000, 10)'''
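As an aside (my own alternative, not the code used below), the same encoding can be written without the Python loop by indexing into an identity matrix:

# Row (k-1) of np.eye(10) is exactly the one-hot vector for label k
def one_hot_encoder_vec(raw_y):
    return np.eye(10)[raw_y.flatten() - 1]

print(np.array_equal(one_hot_encoder_vec(raw_Y), y))  # True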
3 Serializing and deserializing the weight parameters
Serialization is the process of converting an object's state into a form that can be stored or transmitted. During serialization, an object writes its current state to temporary or persistent storage; later, the object can be recreated by reading (deserializing) that state back from storage. Here it simply means flattening the two weight matrices into a single one-dimensional vector, which is the form the optimizer expects.
# Initial weight parameters
theta = sio.loadmat('ex4weights.mat')
theta1, theta2 = theta['Theta1'], theta['Theta2']
# (25, 401) (10, 26)
print(theta1.shape, theta2.shape)

# Serialize the weight parameters
def serialize(a, b):
    return np.append(a.flatten(), b.flatten())  # flatten collapses each matrix to 1-D (row-major by default)

theta_serialize = serialize(theta1, theta2)
# (10285,) 10285 = 25*401 + 10*26
print(theta_serialize.shape)

# Deserialize the weight parameters
def deserialize(theta_serialize):
    theta1 = theta_serialize[:25 * 401].reshape(25, 401)  # slice out theta1 and restore its shape
    theta2 = theta_serialize[25 * 401:].reshape(10, 26)
    return theta1, theta2

theta1, theta2 = deserialize(theta_serialize)
# (25, 401) (10, 26)
print(theta1.shape, theta2.shape)
4 Forward propagation
# Forward propagation
# sigmoid activation function
def sigmoid(z):
    return 1 / (1 + np.exp(-z))

def feed_forward(theta_serialize, X):
    theta1, theta2 = deserialize(theta_serialize)
    a1 = X
    z2 = a1 @ theta1.T
    a2 = sigmoid(z2)
    a2 = np.insert(a2, 0, values=1, axis=1)  # add the bias unit to the hidden layer
    z3 = a2 @ theta2.T
    h = sigmoid(z3)
    return a1, z2, a2, z3, h
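As a quick sanity check (my own addition), the intermediate shapes for this 401-25-10 architecture should come out as follows:

a1, z2, a2, z3, h = feed_forward(theta_serialize, X)
print(a1.shape, z2.shape, a2.shape, z3.shape, h.shape)
# (5000, 401) (5000, 25) (5000, 26) (5000, 10) (5000, 10)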
5 Loss function
The cost function, like the gradient functions further down, must take theta as its first argument; this ordering is required by the TNC solver in scipy.optimize.minimize used later.
# Cost function (with and without regularization)
# Cost function without regularization
def cost(theta_serialize, X, y):
    a1, z2, a2, z3, h = feed_forward(theta_serialize, X)
    J = -np.sum(y * np.log(h) + (1 - y) * np.log(1 - h)) / len(X)
    return J

# 0.2876291651613189
print(cost(theta_serialize, X, y))
# Cost function with regularization
def reg_cost(theta_serialize, X, y, lamda):
    theta1, theta2 = deserialize(theta_serialize)  # use the passed-in parameters, not the globals
    sum1 = np.sum(np.power(theta1[:, 1:], 2))  # the first column (bias weights) is not regularized
    sum2 = np.sum(np.power(theta2[:, 1:], 2))
    reg = (sum1 + sum2) * lamda / (2 * len(X))
    return reg + cost(theta_serialize, X, y)

# 0.38376985909092365
print(reg_cost(theta_serialize, X, y, 1))
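For reference, the quantity reg_cost evaluates is the regularized cross-entropy cost (m = 5000 samples, K = 10 classes; the bias weights in the first column of each Theta matrix are excluded from the penalty):

J(\Theta) = -\frac{1}{m}\sum_{i=1}^{m}\sum_{k=1}^{K}\Big[y_k^{(i)}\log h_k^{(i)} + \bigl(1-y_k^{(i)}\bigr)\log\bigl(1-h_k^{(i)}\bigr)\Big] + \frac{\lambda}{2m}\Big(\sum_{j,\,k\ge 2}\bigl(\Theta^{(1)}_{jk}\bigr)^2 + \sum_{j,\,k\ge 2}\bigl(\Theta^{(2)}_{jk}\bigr)^2\Big)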
6 Backpropagation gradient
# Backpropagation
# Gradient without regularization
def sigmoid_gradient(z):
    return sigmoid(z) * (1 - sigmoid(z))

def gradient(theta_serialize, X, y):
    theta1, theta2 = deserialize(theta_serialize)
    a1, z2, a2, z3, h = feed_forward(theta_serialize, X)
    d3 = h - y
    d2 = d3 @ theta2[:, 1:] * sigmoid_gradient(z2)
    D2 = (d3.T @ a2) / len(X)
    D1 = (d2.T @ a1) / len(X)
    return serialize(D1, D2)

# Gradient with regularization
def reg_gradient(theta_serialize, X, y, lamda):
    D = gradient(theta_serialize, X, y)
    D1, D2 = deserialize(D)
    theta1, theta2 = deserialize(theta_serialize)
    D1[:, 1:] = D1[:, 1:] + theta1[:, 1:] * lamda / len(X)  # bias column is not regularized
    D2[:, 1:] = D2[:, 1:] + theta2[:, 1:] * lamda / len(X)
    return serialize(D1, D2)
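In equation form, the code above computes (⊙ denotes element-wise multiplication; the bias column of Theta2 is dropped when propagating the error back):

\delta^{(3)} = h - y,\qquad \delta^{(2)} = \delta^{(3)}\,\Theta^{(2)}_{:,\,2:}\odot\sigma'\!\bigl(z^{(2)}\bigr),\qquad D^{(2)} = \tfrac{1}{m}\bigl(\delta^{(3)}\bigr)^{T}a^{(2)},\qquad D^{(1)} = \tfrac{1}{m}\bigl(\delta^{(2)}\bigr)^{T}a^{(1)}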
7 Gradient checking
This block is adapted from someone else's code. As originally posted it did not run and raised "unhashable type: 'slice'"; the cause is that theta (the dict returned by sio.loadmat) was passed where the serialized parameter vector is expected. The version below therefore takes theta_serialize, and calls reg_cost rather than the undefined cost_reg.
# Numerical gradient (without regularization)
def num_Gradient(X, y, theta_serialize):
    numgrad = np.zeros(theta_serialize.shape)
    perturb = np.zeros(theta_serialize.shape)
    e = 1e-4
    # Note: this loops over all 10285 parameters and evaluates the cost twice per parameter,
    # so a full check is very slow; in practice it is often run on only a subset of parameters.
    for p in range(len(theta_serialize)):
        perturb[p] = e
        loss1 = cost(theta_serialize - perturb, X, y)
        loss2 = cost(theta_serialize + perturb, X, y)
        # Two-sided numerical estimate of the partial derivative
        numgrad[p] = (loss2 - loss1) / (2 * e)
        perturb[p] = 0
    return numgrad

predict_gradient = gradient(theta_serialize, X, y)   # (10285,)
num_gradient = num_Gradient(X, y, theta_serialize)   # (10285,)

# Numerical gradient (with regularization)
def num_Gradient_reg(X, y, theta_serialize, lamda=1):
    numgrad = np.zeros(theta_serialize.shape)
    perturb = np.zeros(theta_serialize.shape)
    e = 1e-4
    for p in range(len(theta_serialize)):
        perturb[p] = e
        loss1 = reg_cost(theta_serialize - perturb, X, y, lamda)
        loss2 = reg_cost(theta_serialize + perturb, X, y, lamda)
        # Two-sided numerical estimate of the partial derivative
        numgrad[p] = (loss2 - loss1) / (2 * e)
        perturb[p] = 0
    return numgrad

predict_gradient_reg = reg_gradient(theta_serialize, X, y, 1)   # (10285,)
num_gradient_reg = num_Gradient_reg(X, y, theta_serialize, 1)   # (10285,)
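To compare the two gradients numerically, a common check (my own addition; the 1e-9 threshold is just a rule of thumb) is the relative difference between them:

# A very small relative difference (around 1e-9 or less) indicates the backpropagation gradient is correct
diff = np.linalg.norm(num_gradient_reg - predict_gradient_reg) / np.linalg.norm(num_gradient_reg + predict_gradient_reg)
print(diff)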
8 Optimizing the network and making predictions
The TNC solver requires the parameters to be passed as a single one-dimensional array.
# Training the neural network
from scipy.optimize import minimize
def nn_training(X, y):
    # Random initialization breaks the weight symmetry: if all weights feeding the same neuron
    # started out equal, they would stay equal after every update. The weights are therefore
    # drawn uniformly from a small symmetric interval such as [-0.5, 0.5] (more generally [-epsilon, epsilon]).
    init_theta = np.random.uniform(-0.5, 0.5, 10285)
    res = minimize(fun=reg_cost, x0=init_theta, args=(X, y, lamda), method='TNC', jac=reg_gradient,
                   options={'maxiter': 300})
    return res
lamda = 10
res = nn_training(X, y)
raw_Y = data['y'].reshape(5000, )
_, _, _, _, h = feed_forward(res.x, X)
y_pred = np.argmax(h, axis=1) + 1
acc = np.mean(y_pred == raw_Y)
# 0.9398
print(acc)
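Beyond the overall accuracy, a quick per-class breakdown (my own addition; note that in this data set label 10 stands for the digit 0) shows which digits are recognized best:

for label in range(1, 11):
    mask = (raw_Y == label)
    print(label, np.mean(y_pred[mask] == label))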
# Visualizing the hidden layer
def plot_hidden_layer(theta):
    theta1, _ = deserialize(theta)
    hidden_layer = theta1[:, 1:]  # (25, 400): drop the bias weights
    fig, ax = plt.subplots(ncols=5, nrows=5, figsize=(8, 8), sharex=True, sharey=True)
    for r in range(5):
        for c in range(5):
            ax[r, c].imshow(hidden_layer[5 * r + c].reshape(20, 20).T, cmap='gray_r')
    plt.xticks([])
    plt.yticks([])
    plt.show()

plot_hidden_layer(res.x)
Complete code
# Neural network for multi-class classification
# Example: handwritten digit recognition
# Data set: ex4data1.mat
# Initial parameters: ex4weights.mat
# Build the complete neural network
import numpy as np
import matplotlib.pyplot as plt
import scipy.io as sio
from scipy.optimize import minimize

path = 'ex4data1.mat'
data = sio.loadmat(path)
raw_X = data['X']
raw_Y = data['y']
X = np.insert(raw_X, 0, values=1, axis=1)
# (5000,401)
print(X.shape)
# print(raw_Y.shape)  # (5000, 1)

# One-hot encode the labels in y
def one_hot_encoder(raw_y):
    result = []
    for i in raw_y:  # labels are 1-10
        y_temp = np.zeros(10)
        y_temp[i - 1] = 1  # set the index corresponding to label i
        result.append(y_temp)
    return np.array(result)

y = one_hot_encoder(raw_Y)
print(y, y.shape)
# Initial weight parameters
theta = sio.loadmat('ex4weights.mat')
theta1, theta2 = theta['Theta1'], theta['Theta2']
# (25, 401) (10, 26)
print(theta1.shape, theta2.shape)

# Serialize the weight parameters
def serialize(a, b):
    return np.append(a.flatten(), b.flatten())  # flatten collapses each matrix to 1-D (row-major by default)

theta_serialize = serialize(theta1, theta2)
# (10285,) 10285 = 25*401 + 10*26
print(theta_serialize.shape)

# Deserialize the weight parameters
def deserialize(theta_serialize):
    theta1 = theta_serialize[:25 * 401].reshape(25, 401)
    theta2 = theta_serialize[25 * 401:].reshape(10, 26)
    return theta1, theta2

theta1, theta2 = deserialize(theta_serialize)
# (25, 401) (10, 26)
print(theta1.shape, theta2.shape)
# Forward propagation
# sigmoid activation function
def sigmoid(z):
    return 1 / (1 + np.exp(-z))

def feed_forward(theta_serialize, X):
    theta1, theta2 = deserialize(theta_serialize)
    a1 = X
    z2 = a1 @ theta1.T
    a2 = sigmoid(z2)
    a2 = np.insert(a2, 0, values=1, axis=1)  # add the bias unit to the hidden layer
    z3 = a2 @ theta2.T
    h = sigmoid(z3)
    return a1, z2, a2, z3, h

# Cost function (with and without regularization)
# Cost function without regularization
def cost(theta_serialize, X, y):
    a1, z2, a2, z3, h = feed_forward(theta_serialize, X)
    J = -np.sum(y * np.log(h) + (1 - y) * np.log(1 - h)) / len(X)
    return J

# 0.2876291651613189
print(cost(theta_serialize, X, y))

# Cost function with regularization
def reg_cost(theta_serialize, X, y, lamda):
    theta1, theta2 = deserialize(theta_serialize)  # use the passed-in parameters, not the globals
    sum1 = np.sum(np.power(theta1[:, 1:], 2))  # the first column (bias weights) is not regularized
    sum2 = np.sum(np.power(theta2[:, 1:], 2))
    reg = (sum1 + sum2) * lamda / (2 * len(X))
    return reg + cost(theta_serialize, X, y)

# 0.38376985909092365
print(reg_cost(theta_serialize, X, y, 1))
# Backpropagation
# Gradient without regularization
def sigmoid_gradient(z):
    return sigmoid(z) * (1 - sigmoid(z))

def gradient(theta_serialize, X, y):
    theta1, theta2 = deserialize(theta_serialize)
    a1, z2, a2, z3, h = feed_forward(theta_serialize, X)
    d3 = h - y
    d2 = d3 @ theta2[:, 1:] * sigmoid_gradient(z2)
    D2 = (d3.T @ a2) / len(X)
    D1 = (d2.T @ a1) / len(X)
    return serialize(D1, D2)

# Gradient with regularization
def reg_gradient(theta_serialize, X, y, lamda):
    D = gradient(theta_serialize, X, y)
    D1, D2 = deserialize(D)
    theta1, theta2 = deserialize(theta_serialize)
    D1[:, 1:] = D1[:, 1:] + theta1[:, 1:] * lamda / len(X)  # bias column is not regularized
    D2[:, 1:] = D2[:, 1:] + theta2[:, 1:] * lamda / len(X)
    return serialize(D1, D2)
# Training the neural network
from scipy.optimize import minimize
def nn_training(X, y):
    # Random initialization breaks the symmetry between weights feeding the same neuron
    init_theta = np.random.uniform(-0.5, 0.5, 10285)
    res = minimize(fun=reg_cost, x0=init_theta, args=(X, y, lamda), method='TNC', jac=reg_gradient,
                   options={'maxiter': 300})
    return res
lamda = 10
res = nn_training(X, y)
raw_Y = data['y'].reshape(5000, )
_, _, _, _, h = feed_forward(res.x, X)
y_pred = np.argmax(h, axis=1) + 1
acc = np.mean(y_pred == raw_Y)
# 0.9398
print(acc)
# Visualizing the hidden layer
def plot_hidden_layer(theta):
    theta1, _ = deserialize(theta)
    hidden_layer = theta1[:, 1:]  # (25, 400): drop the bias weights
    fig, ax = plt.subplots(ncols=5, nrows=5, figsize=(8, 8), sharex=True, sharey=True)
    for r in range(5):
        for c in range(5):
            ax[r, c].imshow(hidden_layer[5 * r + c].reshape(20, 20).T, cmap='gray_r')
    plt.xticks([])
    plt.yticks([])
    plt.show()

plot_hidden_layer(res.x)