1. Background
Multi-layer neural networks have very strong learning capacity. When training data is plentiful this is not a problem, but for some tasks, computer vision for example, data is often scarce, and training then easily leads to overfitting. Informally, overfitting means the trained model has picked up idiosyncrasies of the training set that are not shared by real-world data in general. When this happens the training error is small, but the model generalizes poorly, so we need techniques to prevent overfitting. Two common approaches are: (1) use more training data, and (2) regularization. More data solves the problem directly, but for some tasks collecting more data is very expensive, and for others, such as early cancer detection, more data simply cannot be obtained; in those cases regularization is the more practical choice.
2. Introduction to L2 Regularization
Suppose the objective function used when training the neural network is the cross-entropy cost (written out below so that it matches the cost computed in the code in Section 4):
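J(W, b) = -\frac{1}{m}\sum_{i=1}^{m}\left[\, y^{(i)}\log a^{(i)} + \left(1-y^{(i)}\right)\log\left(1-a^{(i)}\right)\right]

where m is the number of training samples and a^{(i)} is the network output for sample i.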
L2 regularization adds a penalty term to this objective, namely the sum of the squared weights of every layer scaled by λ/2m (the (lambd/nTrain/2)*W_sum term in the implementation below):
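J_{reg}(W, b) = J(W, b) + \frac{\lambda}{2m}\sum_{l}\left\lVert W^{[l]}\right\rVert_F^2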
With the new objective, the parameter update only needs a small change when the gradient is computed: the gradient of the penalty term, (λ/m)W, is added to the original gradient before the update step:
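dW^{[l]} = dW^{[l]}_{\text{orig}} + \frac{\lambda}{m}W^{[l]}, \qquad W^{[l]} := W^{[l]} - \alpha\, dW^{[l]} = \left(1 - \frac{\alpha\lambda}{m}\right)W^{[l]} - \alpha\, dW^{[l]}_{\text{orig}}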
From the update above we can see that after adding the regularization term, every update first multiplies W by a factor smaller than 1 and then subtracts α times the gradient of the original, unregularized objective. Over many iterations this yields smaller W and therefore smaller pre-activations Z. With tanh or sigmoid activations, small Z falls in the nearly linear region of the function, and by the nature of neural networks, if the activation function behaves linearly the output of the forward pass is still just a linear combination of the inputs, no different from a perceptron. The resulting model therefore cannot express overly complex decision boundaries; in other words, its capacity is deliberately restricted.
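Here is a minimal numpy sketch of that equivalence; W, dW_orig, alpha, lambd and m are illustrative stand-ins rather than variables taken from the implementation in Section 4:

import numpy as np

# Illustrative values; in a real network dW_orig would come from backpropagation.
m, alpha, lambd = 200, 0.02, 0.7
W = np.random.randn(4, 3)
dW_orig = np.random.randn(4, 3)  # gradient of the unregularized loss w.r.t. W

# L2-regularized update: add (lambda/m) * W to the gradient ...
W_next_a = W - alpha * (dW_orig + (lambd / m) * W)
# ... which is the same as first shrinking W by (1 - alpha*lambda/m) and then taking the usual step.
W_next_b = (1 - alpha * lambd / m) * W - alpha * dW_orig

print(np.allclose(W_next_a, W_next_b))  # True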
3. Experimental Results
Below are the results for the model without regularization (lambd = 0):
Iterations:0---cost:0.6972652257211792
Iterations:2000---cost:0.21165381376399547
Iterations:4000---cost:0.1951425385136149
Iterations:6000---cost:0.1740360233286938
Iterations:8000---cost:0.15274737230486157
Iterations:10000---cost:0.133325170802212
Iterations:12000---cost:0.12762253797215065
Iterations:14000---cost:0.1240078424815226
Iterations:16000---cost:0.12780531249704044
Iterations:18000---cost:0.13313835960940024
Iterations:20000---cost:0.057367278096741206
Iterations:22000---cost:0.09418080799882553
Iterations:24000---cost:0.024817611835795575
Iterations:26000---cost:0.01912050781220849
Iterations:28000---cost:0.009494663286156817
Training accuracy: 1.0
Test accuracy: 0.935
Below are the results for the L2-regularized model (lambd = 0.7):
Iterations:0---cost:0.8759553784922353
Iterations:2000---cost:0.3653597750925469
Iterations:4000---cost:0.3330295441809763
Iterations:6000---cost:0.30973576048901186
Iterations:8000---cost:0.28918653965297925
Iterations:10000---cost:0.2746114296324272
Iterations:12000---cost:0.26338130156127554
Iterations:14000---cost:0.2546441027617421
Iterations:16000---cost:0.2477042933674601
Iterations:18000---cost:0.241885525892029
Iterations:20000---cost:0.23707870137611828
Iterations:22000---cost:0.23331474815492242
Iterations:24000---cost:0.23046153380456355
Iterations:26000---cost:0.2282299050072628
Iterations:28000---cost:0.22629647982079956
Training accuracy: 0.9478672985781991
Test accuracy: 0.955
Although the training accuracy drops, the test accuracy rises, and the complexity of the learned decision boundary is greatly reduced.
4. Python Implementation
import numpy as np
import matplotlib.pyplot as plt
from load_datasets import load_2D_dataset  # assumed to expose the 2D dataset loader used below
def relu(x):
    return np.maximum(x, 0)

def sigmoid(x):
    # numerically stable sigmoid, computed elementwise to avoid overflow in exp
    out = np.empty_like(x, dtype=float)
    pos = x >= 0
    out[pos] = 1.0 / (1.0 + np.exp(-x[pos]))
    out[~pos] = np.exp(x[~pos]) / (1.0 + np.exp(x[~pos]))
    return out
def plot_decision_boundary(W, b, X, Y):
    x_min, x_max = X[0, :].min(), X[0, :].max()  # plotting range
    y_min, y_max = X[1, :].min(), X[1, :].max()
    step = 0.01  # grid resolution
    xx, yy = np.meshgrid(np.arange(x_min, x_max, step), np.arange(y_min, y_max, step))  # build a grid
    plot_samples = np.array([xx.ravel(), yy.ravel()])
    A = plot_samples.copy()
    # forward pass over all grid points
    for l in range(1, len(W)):
        Z = np.dot(W[l], A) + b[l]
        if l == len(W) - 1:
            A = sigmoid(Z)
        else:
            A = relu(Z)
    A[A > 0.5] = 1
    A[A <= 0.5] = 0
    A = A.reshape(xx.shape)
    plt.contourf(xx, yy, A, cmap=plt.cm.Spectral)
    plt.xlabel('x1')
    plt.ylabel('x2')
    plt.scatter(X[0, :], X[1, :], c=Y[0, :])
    plt.show()
# Load the training and test data
train_x, train_y, test_x, test_y = load_2D_dataset()

# Basic hyperparameters
nTrain = train_x.shape[1]
nTest = test_x.shape[1]
Iterations = 30000                               # number of iterations
Layers = [train_x.shape[0], 50, 35, 20, 10, 1]   # network architecture
nLayers = len(Layers) - 1
alpha = 0.02                                     # learning rate
lambd = 0.7                                      # regularization coefficient (set to 0 for the unregularized run in Section 3)
# Initialize weights and biases
W = [[] for l in range(nLayers + 1)]
b = [[] for l in range(nLayers + 1)]
for l in range(1, nLayers + 1):
    W[l] = np.random.randn(Layers[l], Layers[l - 1]) / np.sqrt(Layers[l - 1])
    b[l] = np.zeros((Layers[l], 1))
    print(W[l].shape)
    print(b[l].shape)
dW = W.copy()
db = b.copy()

# Initialize the forward/backward caches
A = [[] for l in range(nLayers + 1)]
Z = [[] for l in range(nLayers + 1)]
for l in range(1, nLayers + 1):
    A[l] = np.zeros((Layers[l], nTrain))
    Z[l] = np.zeros((Layers[l], nTrain))
    print(A[l].shape)
    print(Z[l].shape)
dA = A.copy()
dZ = Z.copy()

A[0] = train_x
cost = []
# Training loop
for i in range(Iterations):
    # Forward propagation
    for l in range(1, nLayers + 1):
        Z[l] = np.dot(W[l], A[l - 1]) + b[l]
        if l == nLayers:
            A[l] = sigmoid(Z[l])   # sigmoid in the output layer
        else:
            A[l] = relu(Z[l])      # relu in the hidden layers
    dZ[nLayers] = (A[nLayers] - train_y) / nTrain
    # Backpropagation
    for l in np.arange(nLayers, 0, -1):
        dW[l] = np.dot(dZ[l], A[l - 1].T) + lambd / nTrain * W[l]   # gradient plus the L2 term
        db[l] = np.sum(dZ[l], axis=1, keepdims=True)
        if l > 1:
            dA[l - 1] = np.dot(W[l].T, dZ[l])
            dZ[l - 1] = dA[l - 1].copy()
            dZ[l - 1][Z[l - 1] < 0] = 0    # derivative of relu
    # Gradient descent update
    for l in range(1, nLayers + 1):
        W[l] = W[l] - alpha * dW[l]
        b[l] = b[l] - alpha * db[l]
    # Sum of squared weights over all layers (the L2 penalty term)
    W_sum = 0
    for l in range(1, nLayers + 1):
        W_sum += np.sum(W[l] ** 2)
    if i % 2000 == 0:
        # Add a small 0.0001 inside the log to avoid log(0)
        cur_cost = -np.sum(train_y * np.log(A[nLayers] + 0.0001) + (1 - train_y) * np.log(1 - A[nLayers] + 0.0001)) / nTrain + (lambd / nTrain / 2) * W_sum
        cost.append(cur_cost)
        print("Iterations:" + str(i) + "---cost:" + str(cur_cost))
# Training accuracy
train_err = np.sum(A[nLayers][train_y == 1] < 0.5) + np.sum(A[nLayers][train_y == 0] > 0.5)  # number of misclassified training samples
print("Training accuracy:" + str(1 - train_err / nTrain))

# Predictions on the test set
A_predict = test_x
for l in range(1, nLayers + 1):
    Z_predict = np.dot(W[l], A_predict) + b[l]
    if l == nLayers:
        A_predict = sigmoid(Z_predict)
    else:
        A_predict = relu(Z_predict)
test_err = np.sum(A_predict[test_y == 1] < 0.5) + np.sum(A_predict[test_y == 0] > 0.5)  # number of misclassified test samples
print("Test accuracy:" + str(1 - test_err / nTest))
plt.plot(cost)
plot_decision_boundary(W, b, train_x, train_y)