5.2 Binary Logistic Regression in Practice
5.2.1 A Simple Binary Logistic Regression Example
Let us first look at the data. It is a randomly generated data set for a two-class problem. We start by loading the data; the Python code is as follows.
"""
函数说明:读取数据
Parameters:
filename - 文件名
Returns:
xArr - x数据集
yArr - y数据集
"""
def loadDataSet(filename):
X = []
Y = []
with open(filename, 'rb') as f:
for idx, line in enumerate(f):
line = line.decode('utf-8').strip()
if not line:
continue
eles = line.split()
eles = list(map(float, eles))
if idx == 0:
numFea = len(eles)
#去掉每行的最后一个数据再放入X中
X.append(eles[:-1])
#获取每行的租后一个数据
Y.append([eles[-1]])
# 将X,Y列表转化成矩阵
xArr = np.array(X)
yArr = np.array(Y)
return xArr,yArr
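As a quick illustration (a minimal sketch; the sample rows in the comment are hypothetical, only the file name 'linear.txt' is taken from the script below), the loader can be called like this:

# Hypothetical usage: assume 'linear.txt' contains whitespace-separated rows
# such as "1.2  2.3  1", where the last column is the 0/1 class label.
xArr, yArr = loadDataSet('linear.txt')
print(xArr.shape, yArr.shape)  # (number of samples, number of features) and (number of samples, 1)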
Next, we define the sigmoid function:
$g(z)=\frac{1}{1+e^{-z}}$
The Python code is shown below.
"""
函数说明:定义sigmoid函数
Parameters:
z
Returns:
返回
"""
def sigmoid(z):
return 1.0 / (1.0 + np.exp(-z))
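A quick sanity check (purely illustrative) confirms the expected behaviour: sigmoid(0) is 0.5 and the function is applied element-wise to arrays.

print(sigmoid(0))                       # 0.5
print(sigmoid(np.array([-10, 0, 10])))  # approximately [4.5e-05, 0.5, 0.99995]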
Next we define the cost function:
$J(\theta)=-\frac{1}{m}\left[\sum_{i=1}^{m}\left(y^{(i)}\log(h_{\theta}(x^{(i)}))+(1-y^{(i)})\log(1-h_{\theta}(x^{(i)}))\right)\right]+\frac{\lambda}{2m}\sum_{j=1}^{n}\theta_j^2$
where $h_{\theta}(X)=g(\theta^{T}X)=\mathrm{sigmoid}(\theta^{T}X)$. The term $\sum_{i=1}^{m}\left(y^{(i)}\log(h_{\theta}(x^{(i)}))+(1-y^{(i)})\log(1-h_{\theta}(x^{(i)}))\right)$ is the $\sum cost$ part, which measures how well the model fits the training data, while $\lambda\sum_{j=1}^{n}\theta_j^{2}$ is the $L_2$ regularization term, which penalizes model complexity (the intercept $\theta_0$ is not regularized).
The Python code is as follows.
"""
函数说明:定义代价函数
Parameters:
theta, X, Y, theLambda
Returns:
返回
"""
def J(theta, X, Y, theLambda=0):
m, n = X.shape
h = sigmoid(np.dot(X, theta))
J = (-1.0/m)*(np.dot(np.log(h).T,Y)+np.dot(np.log(1-h).T,1-Y)) + (theLambda/(2.0*m))*np.sum(np.square(theta[1:]))
if np.isnan(J[0]):
return np.inf
# 其实J里面只有一个数值,需要取出该数值
return J.flatten()[0]
Next we define gradient descent to solve for the parameters. Two methods are used here: stochastic gradient descent and batch gradient descent. A common refinement of stochastic gradient descent (not implemented in the code below; a sketch follows the gradient code) is to adjust alpha on every iteration: alpha keeps shrinking as the iterations proceed but never reaches 0, because a constant term is kept in the schedule. This guarantees that new samples still have some influence after many iterations; if the problem being modelled changes over time, the constant term can be enlarged so that new data receive larger updates to the regression coefficients.
$\theta_j:=\theta_j-\alpha\frac{1}{m}\sum_{i=1}^{m}\left(h_{\theta}(x^{(i)})-y^{(i)}\right)x_j^{(i)}-\alpha\frac{\lambda}{m}\theta_j$
The Python code is shown below.
"""
函数说明:梯度下降求解参数
Parameters:
X, Y, options
Returns:
返回参数
"""
def gradient(X, Y, options):
'''
options.alpha 学习率
options.theLambda 正则参数λ
options.maxLoop 最大迭代轮次
options.epsilon 判断收敛的阈值
options.method
- 'sgd' 随机梯度下降
- 'bgd' 批量梯度下降
'''
m, n = X.shape
# 初始化模型参数,n个特征对应n个参数
theta = np.zeros((n,1))
cost = J(theta, X, Y) # 当前误差
costs = [cost]
thetas = [theta]
# Python 字典dict.get(key, default=None)函数返回指定键的值,如果值不在字典中返回默认值
alpha = options.get('alpha', 0.01)
epsilon = options.get('epsilon', 0.00001)
maxloop = options.get('maxloop', 1000)
theLambda = float(options.get('theLambda', 0)) # 后面有 theLambda/m 的计算,如果这里不转成float,后面这个就全是0
method = options.get('method', 'bgd')
# 定义随机梯度下降
def _sgd(theta):
count = 0
converged = False
while count < maxloop:
if converged :
break
# 随机梯度下降,每一个样本都更新
for i in range(m):
h =sigmoid(np.dot(X[i].reshape((1,n)), theta))
theta = theta - alpha*((1.0/m)*X[i].reshape((n,1))*(h-Y[i]) + (theLambda/m)*np.r_[[[0]], theta[1:]])
thetas.append(theta)
cost = J(theta, X, Y, theLambda)
costs.append(cost)
if abs(costs[-1] - costs[-2]) < epsilon:
converged = True
break
count += 1
return thetas, costs, count
# 定义批量梯度下降
def _bgd(theta):
count = 0
converged = False
while count < maxloop:
if converged :
break
h = sigmoid(np.dot(X, theta))
theta = theta - alpha*((1.0/m)*np.dot(X.T, (h-Y)) + (theLambda/m)*np.r_[[[0]],theta[1:]])
thetas.append(theta)
cost = J(theta, X, Y, theLambda)
costs.append(cost)
if abs(costs[-1] - costs[-2]) < epsilon:
converged = True
break
count += 1
return thetas, costs, count
methods = {'sgd': _sgd, 'bgd': _bgd}
return methods[method](theta)
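As noted above, the decaying-learning-rate refinement is not part of _sgd. The following is a minimal sketch of how such a schedule could look inside the sample loop, reusing the variables defined in gradient; the decay constants 4.0 and 0.01 are illustrative assumptions, not values from the original code.

# Hypothetical variant of the inner SGD loop with a decaying learning rate.
# alpha shrinks as the epoch index j and sample index i grow, but the constant
# 0.01 keeps it from ever reaching 0, so later samples still influence theta.
for j in range(maxloop):
    for i in range(m):
        alpha = 4.0 / (1.0 + j + i) + 0.01
        h = sigmoid(np.dot(X[i].reshape((1, n)), theta))
        theta = theta - alpha * (1.0 / m) * X[i].reshape((n, 1)) * (h - Y[i])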
Next we test the model and plot the results. The full script is shown below.
# -*- coding:utf-8 -*-
import numpy as np
import time
import matplotlib.pyplot as plt
import matplotlib
matplotlib.rcParams['font.sans-serif'] = ['SimHei']
matplotlib.rcParams['font.family'] = 'sans-serif'
matplotlib.rcParams['axes.unicode_minus'] = False
"""
函数说明:读取数据
Parameters:
filename - 文件名
Returns:
xArr - x数据集
yArr - y数据集
"""
def loadDataSet(filename):
X = []
Y = []
with open(filename, 'rb') as f:
for idx, line in enumerate(f):
line = line.decode('utf-8').strip()
if not line:
continue
eles = line.split()
eles = list(map(float, eles))
if idx == 0:
numFea = len(eles)
#去掉每行的最后一个数据再放入X中
X.append(eles[:-1])
#获取每行的租后一个数据
Y.append([eles[-1]])
# 将X,Y列表转化成矩阵
xArr = np.array(X)
yArr = np.array(Y)
return xArr,yArr
"""
函数说明:定义sigmoid函数
Parameters:
z
Returns:
返回
"""
def sigmoid(z):
return 1.0 / (1.0 + np.exp(-z))
"""
函数说明:定义代价函数
Parameters:
theta, X, Y, theLambda
Returns:
返回
"""
def J(theta, X, Y, theLambda=0):
m, n = X.shape
h = sigmoid(np.dot(X, theta))
J = (-1.0/m)*(np.dot(np.log(h).T,Y)+np.dot(np.log(1-h).T,1-Y)) + (theLambda/(2.0*m))*np.sum(np.square(theta[1:]))
if np.isnan(J[0]):
return np.inf
# 其实J里面只有一个数值,需要取出该数值
return J.flatten()[0]
"""
函数说明:梯度下降求解参数
Parameters:
X, Y, options
Returns:
返回参数
"""
def gradient(X, Y, options):
'''
options.alpha 学习率
options.theLambda 正则参数λ
options.maxLoop 最大迭代轮次
options.epsilon 判断收敛的阈值
options.method
- 'sgd' 随机梯度下降
- 'bgd' 批量梯度下降
'''
m, n = X.shape
# 初始化模型参数,n个特征对应n个参数
theta = np.zeros((n,1))
cost = J(theta, X, Y) # 当前误差
costs = [cost]
thetas = [theta]
# Python 字典dict.get(key, default=None)函数返回指定键的值,如果值不在字典中返回默认值
alpha = options.get('alpha', 0.01)
epsilon = options.get('epsilon', 0.00001)
maxloop = options.get('maxloop', 1000)
theLambda = float(options.get('theLambda', 0)) # 后面有 theLambda/m 的计算,如果这里不转成float,后面这个就全是0
method = options.get('method', 'bgd')
# 定义随机梯度下降
def _sgd(theta):
count = 0
converged = False
while count < maxloop:
if converged :
break
# 随机梯度下降,每一个样本都更新
for i in range(m):
h =sigmoid(np.dot(X[i].reshape((1,n)), theta))
theta = theta - alpha*((1.0/m)*X[i].reshape((n,1))*(h-Y[i]) + (theLambda/m)*np.r_[[[0]], theta[1:]])
thetas.append(theta)
cost = J(theta, X, Y, theLambda)
costs.append(cost)
if abs(costs[-1] - costs[-2]) < epsilon:
converged = True
break
count += 1
return thetas, costs, count
# 定义批量梯度下降
def _bgd(theta):
count = 0
converged = False
while count < maxloop:
if converged :
break
h = sigmoid(np.dot(X, theta))
theta = theta - alpha*((1.0/m)*np.dot(X.T, (h-Y)) + (theLambda/m)*np.r_[[[0]],theta[1:]])
thetas.append(theta)
cost = J(theta, X, Y, theLambda)
costs.append(cost)
if abs(costs[-1] - costs[-2]) < epsilon:
converged = True
break
count += 1
return thetas, costs, count
methods = {'sgd': _sgd, 'bgd': _bgd}
return methods[method](theta)
"""
函数说明:绘制决策边界
Parameters:
X, thetas
Returns:
无
"""
def plotBoundary(X,Y,thetas):
# 绘制决策边界
plt.figure(figsize=(6,4))
for i in range(len(X)):
x = X[i]
if Y[i] == 1:
plt.scatter(x[1], x[2], marker='*', color='blue', s=50)
else:
plt.scatter(x[1], x[2], marker='o', color='green', s=50)
hSpots = np.linspace(X[:,1].min(), X[:,1].max(), 100)
theta0, theta1, theta2 = thetas[-1]
vSpots = -(theta0+theta1*hSpots)/theta2
plt.plot(hSpots, vSpots, color='red', linewidth=.5)
plt.xlabel(r'$x_1$')
plt.ylabel(r'$x_2$')
"""
函数说明:绘制代价函数图像
Parameters:
costs
Returns:
返回参数
"""
def plotCost(costs):
# 绘制误差曲线
plt.figure(figsize=(8,4))
plt.plot(range(len(costs)), costs)
plt.xlabel(u'迭代次数')
plt.ylabel(u'代价J')
"""
函数说明:绘制参数曲线
Parameters:
thetass - 参数
Returns:
返回参数
"""
def plotthetas(thetas):
# 绘制参数theta变化
thetasFig, ax = plt.subplots(len(thetas[0]))
thetas = np.asarray(thetas)
for idx, sp in enumerate(ax):
thetaList = thetas[:, idx]
sp.plot(range(len(thetaList)), thetaList)
sp.set_xlabel('Number of iteration')
sp.set_ylabel(r'$\theta_%d$'%idx)
if __name__ == '__main__':
    ## Step 1: load data...
    print("Step 1: load data...")
    originX, Y = loadDataSet('linear.txt')
    m, n = originX.shape
    # prepend a column of ones for the intercept term
    X = np.concatenate((np.ones((m,1)), originX), axis=1)
    #print(X)
    #print(Y)
    ## Step 2: training bgd...
    print("Step 2: training bgd...")
    # batch gradient descent
    # hyper-parameters
    options = {
        'alpha': 0.05,
        'epsilon': 0.00000001,
        'maxloop': 100000,
        'method': 'bgd'  # or 'sgd'
    }
    # train the model
    start = time.time()
    bgd_thetas, bgd_costs, bgd_iterationCount = gradient(X, Y, options)
    print("bgd_thetas:")
    print(bgd_thetas[-1])
    end = time.time()
    bgd_time = end - start
    print("bgd_time:" + str(bgd_time))
    ## Step 3: show boundary bgd...
    print("Step 3: show boundary bgd...")
    plotBoundary(X, Y, bgd_thetas)
    ## Step 4: show costs bgd...
    print("Step 4: show costs bgd...")
    plotCost(bgd_costs)
    ## Step 5: show thetas bgd...
    print("Step 5: show thetas bgd...")
    plotthetas(bgd_thetas)
    print("======================================================")
    ## Step 2: training sgd...
    print("Step 2: training sgd...")
    # stochastic gradient descent
    options = {
        'alpha': 0.5,
        'epsilon': 0.00000001,
        'maxloop': 100000,
        'method': 'sgd'
    }
    # train the model
    start = time.time()
    sgd_thetas, sgd_costs, sgd_iterationCount = gradient(X, Y, options)
    print("sgd_thetas:")
    print(sgd_thetas[-1])
    end = time.time()
    sgd_time = end - start
    print("sgd_time:" + str(sgd_time))
    ## Step 3: show boundary sgd...
    print("Step 3: show boundary sgd...")
    plotBoundary(X, Y, sgd_thetas)
    ## Step 4: show costs sgd...
    print("Step 4: show costs sgd...")
    plotCost(sgd_costs)
    ## Step 5: show thetas sgd...
    print("Step 5: show thetas sgd...")
    plotthetas(sgd_thetas)
    # display all the figures
    plt.show()
The results are as follows.
[For the complete code, see attachment 1.Logistic_Regression_Classify\Logistic_Regression_Classify_v1
Logistic_Regression_Classify_v1.0.py]
The gradient-ascent update rule is:
$\theta_j:=\theta_j+\alpha\frac{1}{m}\sum_{i=1}^{m}\left(y^{(i)}-h_{\theta}(x^{(i)})\right)x_j^{(i)}-\alpha\frac{\lambda}{m}\theta_j$
[For the complete code, see attachment 1.Logistic_Regression_Classify\Logistic_Regression_Classify_v1
Logistic_Regression_Classify_v1.1.py]
[Note] Readers are encouraged to compare gradient descent and gradient ascent on their own; the two give the same result. A sketch of the ascent-based batch update follows.
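As a minimal sketch (this is not the code from the attached v1.1 file, whose contents are not shown here), the only change needed in the batch update of _bgd is the sign convention: subtracting the gradient of the cost with the (h - Y) residual is equivalent to adding the gradient of the log-likelihood with the (Y - h) residual.

# Batch gradient ASCENT step (hypothetical sketch): equivalent to the descent
# step above, because (h - Y) becomes (Y - h) and the update is added instead of subtracted.
h = sigmoid(np.dot(X, theta))
theta = theta + alpha * ((1.0 / m) * np.dot(X.T, (Y - h)) - (theLambda / m) * np.r_[[[0]], theta[1:]])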
5.2.2 Binary Logistic Regression with the sklearn Library
The data are the same as in the previous example; the only difference is that sklearn is used to implement the logistic regression.
The code is as follows.
# -*- coding:utf-8 -*-
import numpy as np
import time
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import matplotlib
matplotlib.rcParams['font.sans-serif'] = ['SimHei']
matplotlib.rcParams['font.family'] = 'sans-serif'
matplotlib.rcParams['axes.unicode_minus'] = False
"""
函数说明:读取数据
Parameters:
filename - 文件名
Returns:
xArr - x数据集
yArr - y数据集
"""
def loadDataSet(filename):
X = []
Y = []
with open(filename, 'rb') as f:
for idx, line in enumerate(f):
line = line.decode('utf-8').strip()
if not line:
continue
eles = line.split()
eles = list(map(float, eles))
if idx == 0:
numFea = len(eles)
#去掉每行的最后一个数据再放入X中
X.append(eles[:-1])
#获取每行的租后一个数据
Y.append([eles[-1]])
# 将X,Y列表转化成矩阵
xArr = np.array(X)
yArr = np.array(Y)
return xArr,yArr
"""
函数说明:绘制决策边界
Parameters:
X, thetas
Returns:
无
"""
def plotBoundary(X,Y,thetas):
# 绘制决策边界
plt.figure(figsize=(6,4))
for i in range(len(X)):
x = X[i]
if Y[i] == 1:
plt.scatter(x[1], x[2], marker='*', color='blue', s=50)
else:
plt.scatter(x[1], x[2], marker='o', color='green', s=50)
xx = np.arange(-3.0, 3.0, 0.1)
theta0, theta1, theta2 = thetas
yy = theta0 - (theta1*xx) / theta2
plt.plot(xx, yy, color='red', linewidth = 2)
plt.xlabel(r'$x_1$')
plt.ylabel(r'$x_2$')
"""
函数说明:绘制预测值与真实值
Parameters:
X_test_std,Y_test,y_predict
Returns:
无
"""
def plotfit(X_test_std,Y_test,y_predict):
#画图对预测值和实际值进行比较
p_x = range(len(X_test_std))
plt.figure(figsize=(6,4),facecolor="w")
plt.ylim(0,6)
plt.plot(p_x,Y_test,"ro",markersize=8,zorder=3,label=u"真实值")
plt.plot(p_x,y_predict,"go",markersize=14,zorder=2,label=u"预测值,$R^2$=%.3f" %lr.score(X_test,Y_test))
plt.legend(loc="upper left")
plt.xlabel(u"编号",fontsize=12)
plt.ylabel(u"类型",fontsize=12)
plt.title(u"Logistic算法对数据进行分类",fontsize=12)
#plt.savefig("Logistic算法对数据进行分类.png")
plt.show()
if __name__ == '__main__':
    ## Step 1: load data...
    print("Step 1: load data...")
    originX, Y = loadDataSet('data.txt')
    m, n = originX.shape
    # prepend a column of ones for the intercept term
    X = np.concatenate((np.ones((m,1)), originX), axis=1)
    #print(X)
    #print(Y)
    ## Step 2: split data...
    print("Step 2: split data...")
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=0)
    ## Step 3: standard data...
    print("Step 3: standard data...")
    # standardization: fit computes the mean and standard deviation of the training set
    sc = StandardScaler()
    sc.fit(X_train)
    X_train_std = sc.transform(X_train)
    X_test_std = sc.transform(X_test)
    start = time.time()
    ## Step 4: init LogisticRegression...
    print("Step 4: init LogisticRegression...")
    lr = LogisticRegression(C=1000.0, random_state=0)
    ## Step 5: training...
    print("Step 5: training ...")
    # ravel() flattens Y to a 1-D array; otherwise sklearn issues a warning
    lr.fit(X_train_std, Y_train.ravel())
    end = time.time()
    sk_time = end - start
    print('time:', sk_time)
    ## Step 6: show thetas...
    print("Step 6: show thetas...")
    # model performance on the training set
    r = lr.score(X_train_std, Y_train)
    print("accuracy:", r)
    print("coefficients:", lr.coef_)
    print("intercept:", lr.intercept_)
    w0 = np.array(lr.intercept_)
    # skip the coefficient of the all-ones column; keep the weights of x1 and x2
    w1 = np.array(lr.coef_.T[1:3])
    # stack the intercept and the two feature weights into one parameter vector
    thetas = np.vstack((w0, w1))
    print('thetas:')
    print(thetas)
    ## Step 7: show sigmoid p...
    print("Step 7: show sigmoid p...")
    # predicted probabilities, i.e. the sigmoid output p for each class
    predict_p = lr.predict_proba(X_test_std)
    print(predict_p)
    ## Step 8: show boundary...
    print("Step 8: show boundary...")
    plotBoundary(X, Y, thetas)
    ## Step 9: predict ...
    print("Step 9: predict...")
    y_predict = lr.predict(X_test_std)
    print(y_predict)
    ## Step 10: show plotfit ...
    print("Step 10: show plotfit...")
    # compare the predictions with the true labels
    plotfit(X_test_std, Y_test, y_predict)
[For the complete code, see attachment 1.Logistic_Regression_Classify\Logistic_Regression_Classify-sklearn_v2
Logistic_Regression_Classify-sklearn_v2.0.py]
The results are as follows.
To wrap up, the general workflow of logistic regression is:
(1) Collect the data: any method may be used.
(2) Prepare the data: because numerical calculations are involved, the data must be numeric; a structured data format is best.
(3) Analyze the data: any method may be used.
(4) Train the algorithm: most of the time is spent here; the goal of training is to find the optimal regression coefficients for classification.
(5) Test the algorithm: once training is finished, classification is fast.
(6) Use the algorithm: first convert the input data into structured numeric values; then apply the trained regression coefficients to perform a simple regression computation that decides which class each sample belongs to; after that, further analysis can be carried out on the predicted classes (see the sketch after this list).
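Step (6) amounts to applying the sigmoid to a weighted sum of the features and thresholding at 0.5. The following is a minimal sketch; the function name classify and the 0.5 threshold are illustrative choices, not taken from the code above.

def classify(x, theta):
    # x: feature vector with a leading 1 for the intercept term
    # theta: trained coefficient column vector, e.g. bgd_thetas[-1]
    p = sigmoid(np.dot(x, theta))            # predicted probability of class 1
    return 1 if float(np.squeeze(p)) > 0.5 else 0

# Hypothetical usage with the parameters trained earlier:
# print(classify(X[0], bgd_thetas[-1]))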