《人工智能》机器学习 - 第5章线性回归（二一元线性回归实践）

原创

BruceOxl 2022-04-21 15:12:04 ©著作权

文章标签 回归机器学习人工智能 python 数据 文章分类 代码人生

©著作权归作者所有：来自51CTO博客作者BruceOxl的原创作品，请联系作者获取转载授权，否则将追究法律责任

5.3一元线性回归实践

5.3.1一元线性回归简单实例

在前文我们引入了预测房价的例子，最后的模型是形如，那么笔者就先实现这个最简单的算法，使用的是最小二乘法进行求解计算，前文已经推导了。

《人工智能》机器学习 - 第5章线性回归（二一元线性回归实践）_数据

为了便于编码，使用上述式子。代码如下。

# -*- coding: utf-8 -*-
import numpy as np
import matplotlib.pyplot as plt
import matplotlib

matplotlib.rcParams['font.sans-serif'] = ['SimHei']
matplotlib.rcParams['font.family'] = 'sans-serif'
matplotlib.rcParams['axes.unicode_minus'] = False # 用来正常显示负号

"""
函数说明:求解参数

Parameters:
    x,y
Returns:
    ws - 系数
"""
def fitSLR(x,y):
    #数据集长度
    n=len(x)
    dinominator = 0
    numerator=0
    
    for i in range(0,n):
        numerator += (x[i]-np.mean(x))*(y[i]-np.mean(y))
        dinominator += (x[i]-np.mean(x))**2
        
    #print("numerator:"+str(numerator))
    #print("dinominator:"+str(dinominator))
    
    b1 = numerator/float(dinominator)
    b0 = np.mean(y)/float(np.mean(x))
    
    ws = np.mat([[b0], [b1]])
    #返回系数
    return ws 


"""
函数说明:求解参数(矩阵的方式)
Parameters:
    xMat - x数据集
    yMat - y数据集
Returns:
    ws - 回归系数
"""
def MatfitSLR(xArr, yArr):
    #转化为MAT矩阵
    xMat = np.mat(xArr); yMat = np.mat(yArr)
        
    xTx = xMat.T * xMat     #根据文中推导的公示计算回归系数
    if np.linalg.det(xTx) == 0.0:
        print("矩阵为奇异矩阵,不能求逆")
        return
    ws = xTx.I * (xMat.T*yMat)

    print(ws)
    
    return ws

"""
函数说明:预测数据

Parameters:
    x,b0,b1
Returns:
    预测值
"""
# y= b0+x*b1
def prefict(x, ws1):
    return ws[0] + x*ws[1]

"""
函数说明:绘制回归曲线和数据点
Parameters
    xArr, yArr,ws
Returns:
    无
"""
def plotRegression(xArr, yArr,ws):
    #创建xMat矩阵
    xMat = np.mat(xArr)                                                    
    #创建yMat矩阵
    yMat = np.mat(yArr)                                                    
    #深拷贝xMat矩阵
    xCopy = xMat.copy()                                                    
    
    xCopy.sort(0)  #排序
                
    #计算对应的y值                                       
    yHat = xCopy * ws                                                     
   
    fig = plt.figure()
    
    #添加subplot
    ax = fig.add_subplot(111)                                            
    
    #绘制回归曲线
    ax.plot(xCopy[:, 1], yHat, c = 'red')       
    
    ax.scatter(xMat[:,1].flatten().A[0], yMat.flatten().A[0], s = 20, c = 'blue',alpha = .5)               
    #绘制样本点
    plt.title('DataSet')     #绘制title
    plt.xlabel('X')
    plt.ylabel('Y')
    plt.show()

#测试
if __name__ == '__main__':
    ## Step 1: load data...
    print("Step 1: load data...")
    x = [1,3,2,1,3]
    y = [14,24,18,17,27]
    
    xx = np.mat(x).T    
    m, n = xx.shape
    # 将第一列为1的矩阵，与原X相连X.shape
    X = np.concatenate((np.ones((m,1)), xx), axis=1)   
    #print(X) 
    
    #创建xMat矩阵
    xMat = np.mat(X)                                                    
    #创建yMat矩阵
    yMat = np.mat(y).T
    #print(xMat)
    #print(yMat)
    
    ## Step 2: training...
    print("Step 2: training...")
    ws = fitSLR(x, y)
    #矩阵求解参数
    #ws = MatfitSLR(xMat,yMat)
    
    ## Step 3: testing...
    print("Step 3: testing...")
    y_predict = prefict(6, ws)
    
    ## Step 4: show the plot...
    print("Step 4: show the plot...")
    plotRegression(X, y ,ws)
    
    ## Step 5: show the result...
    print("Step 5: show the result...")
    print("y_predict:"+str(y_predict))

结果如下所示。

《人工智能》机器学习 - 第5章线性回归（二一元线性回归实践）_机器学习_02

【完整代码参考附件1.LR_simple\LR_simple.py】

这个例子比较简单，在代码中笔者已经在关键处注释了，笔者也不在细说了，下面笔者将使用梯度下降来实现。

5.3.2一元线性回归实例详解

首先我们需要加载数据集，在加载数据集之前，先看看数据集的内容吧。

《人工智能》机器学习 - 第5章线性回归（二一元线性回归实践）_人工智能_03

第一列都为1.0，即x0。第二列为x1，即x轴数据。第三列为x2，即y轴数据。首先绘制下数据，看下数据分布。编写代码如下。

# -*- coding: utf-8 -*-
import numpy as np
import matplotlib.pyplot as plt
import matplotlib

matplotlib.rcParams['font.sans-serif'] = ['SimHei']
matplotlib.rcParams['font.family'] = 'sans-serif'
matplotlib.rcParams['axes.unicode_minus'] = False # 用来正常显示负号

"""
函数说明:读取数据

Parameters:
    filename - 文件名
Returns:
    xArr - x数据集
    yArr - y数据集
"""
def loadDataSet(filename):
    X = []
    Y = []
    with open(filename, 'rb') as f:
        for idx, line in enumerate(f):
            line = line.decode('utf-8').strip()
            if not line:
                continue
                
            eles = line.split()
            if idx == 0:
                numFeature = len(eles)
            # 将数据转换成float型
            eles = list(map(float, eles)) 
            
            # 除最后一列都是feature，append(list)
            X.append(eles[:-1])   
            # 最后一列是实际值,同上
            Y.append([eles[-1]])    
     
    # 将X,Y列表转化成矩阵
    xArr = np.array(X)
    yArr = np.array(Y)
    
    return xArr,yArr

"""
函数说明:绘制数据集

Parameters:
    无
Returns:
    无
"""
def plotDataSet():
    #加载数据集
    xArr, yArr = loadDataSet('data.txt')                                    
   
    n = len(xArr)   #数据个数
    xcord = []; ycord = []      #样本点
    
    for i in range(n):                                                  
        xcord.append(xArr[i][1]); ycord.append(yArr[i]) #样本点

    fig = plt.figure()
    ax = fig.add_subplot(111)   #添加subplot
    #绘制样本点
    ax.scatter(xcord, ycord, s = 20, c = 'blue',alpha = .5) 
            
    plt.title('DataSet')        #绘制title
    plt.xlabel('X')
    plt.ylabel('Y')
    plt.show()
#测试
if __name__ == '__main__':
    plotDataSet()

结果如下所示。

《人工智能》机器学习 - 第5章线性回归（二一元线性回归实践）_python_04

图5

通过可视化数据，我们可以看到数据的分布情况。接下来，以下定义cost function（代价函数）：

J ( θ 0 , θ 1 ) = 1 2 m ∑ i = 1 m ( h θ ( x ( i ) ) − y ( i ) ) 2 J(\theta_0, \theta_1) = \frac{1}{2m} \sum_{i=1}^{m}(h_{\theta}(x^{(i)})-y^{(i)})^2 J(θ0,θ1)=2m1∑i=1m(hθ(x(i))−y(i))2

而

( h θ ( x ( i ) ) − y ( i ) ) 2 = ( h θ ( x ( i ) ) − y ( i ) ) T ⋅ ( h θ ( x ( i ) ) − y ( i ) ) (h_{\theta}(x^{(i)})-y^{(i)})^2 = (h_{\theta}(x^{(i)})-y^{(i)})^T \cdot (h_{\theta}(x^{(i)})-y^{(i)}) (hθ(x(i))−y(i))2=(hθ(x(i))−y(i))T⋅(hθ(x(i))−y(i))

即

[ ( y ^ ( 1 ) − y ( 1 ) ) ( y ^ ( 2 ) − y ( 2 ) ) ⋯ ( y ^ ( m ) − y ( m ) ) ] ⋅ [ ( y ^ ( 1 ) − y ( 1 ) ) ( y ^ ( 2 ) − y ( 2 ) ) ⋮ ( y ^ ( m ) − y ( m ) ) ] {\begin{bmatrix} (\hat{y}^{(1)}-y^{(1)}) & (\hat{y}^{(2)}-y^{(2)}) & \cdots & (\hat{y}^{(m)}-y^{(m)}) \\ \end{bmatrix}} \cdot {\begin{bmatrix} (\hat{y}^{(1)}-y^{(1)}) \\ (\hat{y}^{(2)}-y^{(2)}) \\ \vdots \\ (\hat{y}^{(m)}-y^{(m)}) \\ \end{bmatrix}} [(y^(1)−y(1))(y^(2)−y(2))⋯(y^(m)−y(m))]⋅⎣⎢⎢⎢⎡(y^(1)−y(1))(y^(2)−y(2))⋮(y^(m)−y(m))⎦⎥⎥⎥⎤

Python代码如下。

"""
函数说明:代价函数

Parameters:
    theta, X, Y
Returns:
    
"""
def J(theta, X, Y):
    '''定义代价函数'''
    m = len(X)
    return np.sum(np.dot((predict(theta,X)-Y).T, (predict(theta,X)-Y))/(2 * m))

接下来定义梯度下降（BGD）公式：

θ 0 : = θ 0 − α 1 m ∑ i = 1 m ( h θ ( x ( i ) ) − y ( i ) ) \theta_0 := \theta_0 - \alpha \frac{1}{m} \sum_{i=1}^{m}(h_\theta(x^{(i)})-y^{(i)}) θ0:=θ0−αm1∑i=1m(hθ(x(i))−y(i))

θ 1 : = θ 1 − α 1 m ∑ i = 1 m ( h θ ( x ( i ) ) − y ( i ) ) ⋅ x ( i ) \theta_1 := \theta_1 - \alpha \frac{1}{m} \sum_{i=1}^{m}(h_\theta(x^{(i)})-y^{(i)}) \cdot x^{(i)} θ1:=θ1−αm1∑i=1m(hθ(x(i))−y(i))⋅x(i)

Python代码如下所示。

# -*- coding: utf-8 -*-
import numpy as np

"""
函数说明:读取数据

Parameters:
    filename - 文件名
Returns:
    xArr - x数据集
    yArr - y数据集
"""
def loadDataSet(filename):
    X = []
    Y = []
    with open(filename, 'rb') as f:
        for idx, line in enumerate(f):
            line = line.decode('utf-8').strip()
            if not line:
                continue
                
            eles = line.split()
            if idx == 0:
                numFeature = len(eles)
            # 将数据转换成float型
            eles = list(map(float, eles)) 
            
            # 除最后一列都是feature，append(list)
            X.append(eles[:-1])   
            # 最后一列是实际值,同上
            Y.append([eles[-1]])    
     
    # 将X,Y列表转化成矩阵
    xArr = np.array(X)
    yArr = np.array(Y)
    
    return xArr,yArr

"""
函数说明:代价函数

Parameters:
    theta, X, Y
Returns:
    
"""
def J(theta, X, Y):
    '''定义代价函数'''
    m = len(X)
    return np.sum(np.dot((predict(theta,X)-Y).T, (predict(theta,X)-Y))/(2 * m))

"""
函数说明:梯度下降求解参数

Parameters:
    alpha, maxloop, epsilon, X, Y
Returns:
    theta, costs, thetas
    
"""
def bgd(alpha, maxloop, epsilon, X, Y):
    '''定义梯度下降公式，其中alpha为学习率控制步长，maxloop为最大迭代次数，epsilon为阈值控制迭代（判断收敛）'''
    m, n = X.shape # m为样本数，n为特征数，在这里为2

    # 初始化参数为零
    theta = np.zeros((2,1))
    
    count = 0 # 记录迭代次数
    converged = False # 是否收敛标志
    cost = np.inf # 初始化代价为无穷大
    costs = [] # 记录每一次迭代的代价值
    thetas = {0:[theta[0,0]], 1:[theta[1,0]]} # 记录每一轮theta的更新
    
    while count<= maxloop:
        if converged:
            break
        # 更新theta
        count = count + 1
        
        # 单独计算
        #theta0 = theta[0,0] - alpha / m * (predict(theta, X) - Y).sum()
        #theta1 = theta[1,0] - alpha / m * (np.dot(X[:,1][:,np.newaxis].T,(h(theta, X) - Y))).sum()   # 重点注意一下    
        # 同步更新
        #theta[0,0] = theta0
        #theta[1,0] = theta1
        #thetas[0].append(theta0)
        #thetas[1].append(theta1)
        
        # 一起计算
        theta = theta - alpha / (1.0 * m) * np.dot(X.T, (predict(theta, X)-Y))
        # X.T : n*m , h(theta, Y) : m*1 , np.dot(X.T, (predict(theta, X)- Y)) : n*1
        # 同步更新
        thetas[0].append(theta[0])
        thetas[1].append(theta[1])        
        
        # 更新当前cost
        cost = J(theta, X, Y)
        costs.append(cost)
        
        # 如果收敛，则不再迭代
        if cost<epsilon:
            converged = True
    return theta, costs, thetas 


#测试
if __name__ == '__main__':
    X, Y = loadDataSet('data.txt')
    alpha = 0.02 # 学习率
    maxloop = 1500 # 最大迭代次数
    epsilon = 0.01 # 收敛判断条件

    resault = bgd(alpha, maxloop, epsilon, X, Y)
    theta, costs, thetas = resault  # 最优参数保存在theta中，costs保存每次迭代的代价值，thetas保存每次迭代更新的theta值
    print(theta)

结果如下所示。

《人工智能》机器学习 - 第5章线性回归（二一元线性回归实践）_数据_05

在前文我们根据上文中推导的回归系数计算方法，同样可以求出回归系数向量。接下来就是定义模型函数进行预测：

h θ ( x ) = θ 0 + θ 1 x h_\theta(x) = \theta_0 + \theta_1 x hθ(x)=θ0+θ1x

其中 h θ ( x ) h_\theta(x) hθ(x)为模型函数，用来预测； θ 0 \theta_0 θ0 , θ 1 \theta_1 θ1 为模型参数， θ 0 + θ 1 x \theta_0 + \theta_1 x θ0+θ1x为实际特征值；可以看做是 θ 0 × 1 + θ 1 × x ( i ) , i = 1 , 2 , . . . , m \theta_0\times1 + \theta_1\times x^{(i)} , i = 1,2,...,m θ0×1+θ1×x(i),i=1,2,...,m ，表示共有 m m m个样本

即：

[ 1 x ( 1 ) 1 x ( 2 ) ⋮ ⋮ 1 x ( m ) ] ⋅ [ θ 0 θ 1 ] = [ y ( 1 ) y ( 2 ) ⋮ y ( m ) ] {\begin{bmatrix} 1 & x^{(1)} \\ 1 & x^{(2)} \\ \vdots & \vdots \\ 1 & x^{(m)} \\ \end{bmatrix}}\cdot {\begin{bmatrix} \theta_0 \\ \theta_1 \\ \end{bmatrix}} = {\begin{bmatrix} y^{(1)} \\ y^{(2)} \\ \vdots \\ y^{(m)} \\ \end{bmatrix}} ⎣⎢⎢⎢⎡11⋮1x(1)x(2)⋮x(m)⎦⎥⎥⎥⎤⋅[θ0θ1]=⎣⎢⎢⎢⎡y(1)y(2)⋮y(m)⎦⎥⎥⎥⎤

Python代码如下。

"""
函数说明:定义模型函数

Parameters:
    theta, X
Returns:
    返回预测后的结果
"""
def predict(theta, X):
    return np.dot(X, theta)  # 此时的X为处理后的X

根据回归系数向量绘制回归曲线。

"""
函数说明:绘制回归曲线和数据点
Parameters
    xArr, yArr, ws
Returns:
    无
"""
def plotRegression(xArr, yArr, ws):
    
    xMat = np.mat(xArr)    #创建xMat矩阵
    yMat = np.mat(yArr)    #创建yMat矩阵
    xCopy = xMat.copy()          #深拷贝xMat矩阵
    xCopy.sort(0)      #排序
    yHat = xCopy * ws      #计算对应的y值
    fig = plt.figure()
    ax = fig.add_subplot(111)    #添加subplot
    ax.plot(xCopy[:, 1], yHat, c = 'red')   #绘制回归曲线
    ax.scatter(xMat[:,1].flatten().A[0], yMat.flatten().A[0], s = 20, c = 'blue',alpha = .5) #绘制样本点
    plt.title('DataSet')     #绘制title
    plt.xlabel('X')
    plt.show()

结果如下所示。

《人工智能》机器学习 - 第5章线性回归（二一元线性回归实践）_python_06

图2

绘制代价曲线。

"""
函数说明:绘制代价曲线
Parameters
    X, Y, costs ,theta
Returns:
    无
"""
def plotloss(X, Y, costs, theta):
    
    # 以下为训练集的预测值
    XCopy = X.copy()
    XCopy.sort(0)  # axis=0 表示列内排序
    #print(XCopy[:,1].shape, yHat.shape, theta.shape)
    
    # 找到代价值的最大值、最小值，便于控制y轴范围
    print(np.array(costs).max(),np.array(costs).min())
    
    plt.xlim(-1,1600) # maxloop为1500
    plt.ylim(4,20)  
    plt.xlabel(u'迭代次数')
    plt.ylabel(u'代价函数J')
    plt.plot(range(len(costs)), costs)

结果如下。

《人工智能》机器学习 - 第5章线性回归（二一元线性回归实践）_机器学习_07

图3 绘制梯度下降过程，代码如下。

"""
函数说明:绘制梯度下降过程
Parameters
    X, Y, theta
Returns:
    无
"""
def plotbgd(X, Y, theta):
    
    #查看theta0的范围
    print(np.array(thetas[0]).min(), np.array(thetas[0]).max()) 
    
    #查看theta1的范围
    print(np.array(thetas[1]).min(), np.array(thetas[1]).max())  
    
    # 准备网格数据，以备画梯度下降过程图
    # matplotlib
    size = 100
    theta0Vals = np.linspace(-10,10, size)
    theta1Vals = np.linspace(-2, 4, size)
    JVals = np.zeros((size, size))   # 按照theta0Vals与theta1Vals 将JVals初始化为0
    for i in range(size):
        for j in range(size):
            col = np.matrix([[theta0Vals[i]], [theta1Vals[j]]])
            JVals[i,j] = J(col, X, Y)
    
    theta0Vals, theta1Vals = np.meshgrid(theta0Vals, theta1Vals)
    JVals = JVals.T
    
    # 绘制3D代价函数图形
    contourSurf = plt.figure()
    ax = contourSurf.gca(projection='3d')
    
    ax.plot_surface(theta0Vals, theta1Vals, JVals,  rstride=2, cstride=2, alpha=0.3,
                    cmap=matplotlib.cm.rainbow, linewidth=0, antialiased=False)
    ax.plot(theta[0], theta[1], 'rx')
    ax.set_xlabel(r'$\theta_0$')
    ax.set_ylabel(r'$\theta_1$')
    ax.set_zlabel(r'$J(\theta)$')

结果如下所示。

《人工智能》机器学习 - 第5章线性回归（二一元线性回归实践）_回归_08

图4

绘制代价函数等高线图，python代码如下。

"""
函数说明:绘制代价函数等高线图
Parameters
    X, Y
Returns:
    无
"""
def plotinline(X, Y):
    
    # 准备网格数据，以备画梯度下降过程图
    size = 100
    theta0Vals = np.linspace(-10,10, size)
    theta1Vals = np.linspace(-2, 4, size)
    JVals = np.zeros((size, size))   # 按照theta0Vals与theta1Vals 将JVals初始化为0
    for i in range(size):
        for j in range(size):
            col = np.matrix([[theta0Vals[i]], [theta1Vals[j]]])
            JVals[i,j] = J(col, X, Y)
    
    theta0Vals, theta1Vals = np.meshgrid(theta0Vals, theta1Vals)
    JVals = JVals.T
    
    # 绘制代价函数等高线图
    #matplotlib inline
    plt.figure(figsize=(12,6))
    CS = plt.contour(theta0Vals, theta1Vals, JVals, np.logspace(-2,3,30), alpha=.75)
    plt.clabel(CS, inline=1, fontsize=10)
    
    # 绘制最优解
    plt.plot(theta[0,0], theta[1,0], 'rx', markersize=10, linewidth=3)
    
    # 绘制梯度下降过程
    plt.plot(thetas[0], thetas[1], 'rx', markersize=3, linewidth=1) # 每一次theta取值
    plt.plot(thetas[0], thetas[1], 'r-',markersize=3, linewidth=1) # 用线连起来

结果如下所示。

《人工智能》机器学习 - 第5章线性回归（二一元线性回归实践）_人工智能_09

图5

以上是对回归数据的检验查看，那么如何判断拟合曲线的拟合效果的如何呢？当然，我们可以根据自己的经验进行观察，除此之外，来比较预测值和真实值的相关性。编写代码如下。

"""
函数说明:计算相关系数
Parameters
    X, Y
Returns:
    相关系数
"""
def computeCorrelation(X, Y):
    xBar = np.mean(X)
    yBar = np.mean(Y)
    SSR = 0
    varX = 0
    varY = 0
    for i in range(0 , len(X)):
        diffXXBar = X[i] - xBar
        diffYYBar = Y[i] - yBar
        SSR += (diffXXBar * diffYYBar)
        varX +=  diffXXBar**2
        varY += diffYYBar**2
    
    SST = math.sqrt(varX * varY)
    return SSR / SST

【注】相关性计算可以使用numpy的corrcoef方法。

结果如下。

《人工智能》机器学习 - 第5章线性回归（二一元线性回归实践）_人工智能_10

【完整代码参考附件 2. LR\LR_v1.0\LR_v1.0.py】

最佳拟合直线方法将数据视为直线进行建模，具有十分不错的表现。但是图6的数据当中好像拟合得不够理想，我们可以根据数据来局部调整预测，下面就会介绍这种方法。

 局部加权线性回归

线性回归的一个问题是有可能出现欠拟合现象，因为它求的是具有最小均方误差的无偏估计。显而易见，如果模型欠拟合将不能取得最好的预测效果。所以有些方法允许在估计中引入一些偏差，从而降低预测的均方误差。

其中的一个方法是局部加权线性回归（Locally Weighted Linear Regression， LWLR）。在该算法中，我们给待预测点附近的每个点赋予一定的权重；然后在这个子集上基于最小均方差来进行普通的回归。与kNN一样，这种算法每次预测均需要事先选取出对应的数据子集。该算法解出回归系数 θ \theta θ的形式如下：

θ ^ = ( X T W X ) T X T W Y \hat{\theta}=(X^TWX)^TX^TWY θ^=(XTWX)TXTWY

其中 W W W是一个矩阵，用来给每个数据点赋予权重。

LWLR使用“核”（与支持向量机中的核类似）来对附近的点赋予更高的权重。核的类型可以自由选择，最常用的核就是高斯核，高斯核对应的权重如下：

《人工智能》机器学习 - 第5章线性回归（二一元线性回归实践）_人工智能_11

这样我们就可以根据上述公式，编写局部加权线性回归，我们通过改变k的值，可以调节回归效果，编写代码如下：

"""
函数说明:绘制多条局部加权回归曲线
Parameters:
    无
Returns:
    无
"""
def plotlwlrRegression(xArr, yArr):

    yHat_1 = lwlrTest(xArr, xArr, yArr, 1.0)                            #根据局部加权线性回归计算yHat
    yHat_2 = lwlrTest(xArr, xArr, yArr, 0.01)                           #根据局部加权线性回归计算yHat
    yHat_3 = lwlrTest(xArr, xArr, yArr, 0.003)                          #根据局部加权线性回归计算yHat
    xMat = np.mat(xArr)                                                 #创建xMat矩阵
    yMat = np.mat(yArr)                                                 #创建yMat矩阵
    srtInd = xMat[:, 1].argsort(0)                                      #排序，返回索引值
    xSort = xMat[srtInd][:,0,:]
    fig, axs = plt.subplots(nrows=3, ncols=1,sharex=False, sharey=False, figsize=(10,8))                                        

    axs[0].plot(xSort[:, 1], yHat_1[srtInd], c = 'red')                     #绘制回归曲线
    axs[1].plot(xSort[:, 1], yHat_2[srtInd], c = 'red')                     #绘制回归曲线
    axs[2].plot(xSort[:, 1], yHat_3[srtInd], c = 'red')                     #绘制回归曲线
    axs[0].scatter(xMat[:,1].flatten().A[0], yMat.flatten().A[0], s = 20, c = 'blue', alpha = .5)               #绘制样本点
    axs[1].scatter(xMat[:,1].flatten().A[0], yMat.flatten().A[0], s = 20, c = 'blue', alpha = .5)               #绘制样本点
    axs[2].scatter(xMat[:,1].flatten().A[0], yMat.flatten().A[0], s = 20, c = 'blue', alpha = .5)               #绘制样本点

    #设置标题,x轴label,y轴label
    axs0_title_text = axs[0].set_title(u'局部加权回归曲线,k=1.0')
    axs1_title_text = axs[1].set_title(u'局部加权回归曲线,k=0.01')
    axs2_title_text = axs[2].set_title(u'局部加权回归曲线,k=0.003')

    plt.setp(axs0_title_text, size=8, weight='bold', color='red')  
    plt.setp(axs1_title_text, size=8, weight='bold', color='red')  
    plt.setp(axs2_title_text, size=8, weight='bold', color='red')  

    plt.xlabel('X')
    plt.show()

"""
函数说明:使用局部加权线性回归计算回归系数w
Parameters:
    testPoint - 测试样本点
    xArr - x数据集
    yArr - y数据集
    k - 高斯核的k,自定义参数
Returns:
    ws - 回归系数
"""
def lwlr(testPoint, xArr, yArr, k = 1.0):

    xMat = np.mat(xArr); yMat = np.mat(yArr)
    m = np.shape(xMat)[0]
    weights = np.mat(np.eye((m)))   #创建权重对角矩阵
    for j in range(m):       #遍历数据集计算每个样本的权重
        diffMat = testPoint - xMat[j, :]                                
        weights[j, j] = np.exp(diffMat * diffMat.T/(-2.0 * k**2))
    xTx = xMat.T * (weights * xMat)                                     
    if np.linalg.det(xTx) == 0.0:
        print("矩阵为奇异矩阵,不能求逆")
        return
    ws = xTx.I * (xMat.T * (weights * yMat))    #计算回归系数
    return testPoint * ws

"""
函数说明:局部加权线性回归测试
Parameters:
    testArr - 测试数据集
    xArr - x数据集
    yArr - y数据集
    k - 高斯核的k,自定义参数
Returns:
    ws - 回归系数
"""
def lwlrTest(testArr, xArr, yArr, k=1.0):  
    m = np.shape(testArr)[0]        #计算测试数据集大小
    yHat = np.zeros(m)  
    
    for i in range(m):          #对每个样本点进行预测
        yHat[i] = lwlr(testArr[i],xArr,yArr,k)
    return yHat

结果如下所示。

《人工智能》机器学习 - 第5章线性回归（二一元线性回归实践）_机器学习_12

图6

可以看到，当 k k k越小，拟合效果越好。但是当 k k k过小，会出现过拟合的情况，例如k等于0.003的时候。

【完整代码参考附件2.LR\LR_v1.1\LR_v1.1.py】

 调用sklearn库实现线性回归

代码如下。

# -*- coding: utf-8 -*-
import numpy as np
import matplotlib.pyplot as plt
import matplotlib
import math
from sklearn import linear_model

matplotlib.rcParams['font.sans-serif'] = ['SimHei']
matplotlib.rcParams['font.family'] = 'sans-serif'
matplotlib.rcParams['axes.unicode_minus'] = False # 用来正常显示负号

"""
函数说明:读取数据

Parameters:
    filename - 文件名
Returns:
    xArr - x数据集
    yArr - y数据集
"""
def loadDataSet(filename):
    X = []
    Y = []
    with open(filename, 'rb') as f:
        for idx, line in enumerate(f):
            line = line.decode('utf-8').strip()
            if not line:
                continue
                
            eles = line.split()
            if idx == 0:
                numFeature = len(eles)
            # 将数据转换成float型
            eles = list(map(float, eles)) 
            
            # 除最后一列都是feature，append(list)
            X.append(eles[:-1])   
            # 最后一列是实际值,同上
            Y.append([eles[-1]])    
     
    # 将X,Y列表转化成矩阵
    xArr = np.array(X)
    yArr = np.array(Y)
    
    return xArr,yArr

"""
函数说明:绘制数据集

Parameters:
    无
Returns:
    无
"""
def plotDataSet(xArr, yArr):
    n = len(xArr)   #数据个数
    xcord = []; ycord = []      #样本点
    
    for i in range(n):                                                  
        xcord.append(xArr[i][1]); ycord.append(yArr[i]) #样本点

    fig = plt.figure()
    ax = fig.add_subplot(111)   #添加subplot
    #绘制样本点
    ax.scatter(xcord, ycord, s = 20, c = 'blue',alpha = .5) 
            
    plt.title('DataSet')        #绘制title
    plt.xlabel('X')
    plt.ylabel('Y')
    plt.show()

"""
函数说明:定义模型函数

Parameters:
    theta, X
Returns:
    返回预测后的结果
"""
def predict(theta, X):
    return np.dot(X, theta)  # 此时的X为处理后的X

"""
函数说明:绘制回归曲线和数据点
Parameters
    X, Y, theta
Returns:
    无
"""
def plotRegression(X, Y, theta):
    
    xMat = np.mat(X)    #创建xMat矩阵
    yMat = np.mat(Y)    #创建yMat矩阵
    xCopy = xMat.copy()          #深拷贝xMat矩阵
    xCopy.sort(0)      #排序
    yHat = xCopy * theta      #计算对应的y值
    
    fig = plt.figure()
    ax = fig.add_subplot(111)    #添加subplot
    ax.plot(xCopy[:, 1], yHat, c = 'red')   #绘制回归曲线
    ax.scatter(xMat[:,1].flatten().A[0], yMat.flatten().A[0], s = 20, c = 'blue',alpha = .5) #绘制样本点
    
    plt.title('DataSet')     #绘制title
    plt.xlabel('X')
    plt.ylabel('Y')
    plt.show()

"""
函数说明:绘制回归曲线和数据点
Parameters
    X, Y, theta
Returns:
    无
"""
def plotRegression_new(X, Y, theta):
    
    # 以下为训练集的预测值
    XCopy = X.copy()
    XCopy.sort(0)  # axis=0 表示列内排序
    yHat = predict(theta, XCopy)

    #print(XCopy[:,1].shape, yHat.shape, theta.shape)
    
    # 绘制回归直线
    plt.title('DataSet')        #绘制title
    plt.xlabel(u'x')
    plt.ylabel(u'y')
    #绘制回归线
    plt.plot(XCopy[:,1], yHat,color='r')
    #绘制样本点
    plt.scatter(X[:,1].flatten(), Y.T.flatten())
    plt.show()

"""
函数说明:计算相关系数
Parameters
    X, Y
Returns:
    相关系数
"""
def computeCorrelation(X, Y):
    xBar = np.mean(X)
    yBar = np.mean(Y)
    SSR = 0
    varX = 0
    varY = 0
    for i in range(0 , len(X)):
        diffXXBar = X[i] - xBar
        diffYYBar = Y[i] - yBar
        SSR += (diffXXBar * diffYYBar)
        varX +=  diffXXBar**2
        varY += diffYYBar**2
    
    SST = math.sqrt(varX * varY)
    return SSR / SST

#测试
if __name__ == '__main__':
    ## Step 1: load data...
    print("Step 1: load data...")
    X, Y = loadDataSet('data.txt')
    #print(X.shape)
    #print(Y.shape)
  
    ## Step 2: show plot...
    print("Step 2: show plot...")
    plotDataSet(X, Y)
    
    ## Step 3: Create linear regression object...
    print("Step 3: Create linear regression object...")
    #regr = linear_model.LinearRegression()
    
    #岭回归
    regr = linear_model.Ridge(alpha=0.5, copy_X=True, fit_intercept=True, max_iter=None,
      normalize=False, random_state=None, solver='auto', tol=0.001)
    
    ## Step 4: training...
    print("Step 4: training...")
    regr.fit(X, Y)
    
    ## Step 5: testing...
    print("Step 5: testing...")
    X_predict = [1, 18.945]
    # 到此，参数学习出来了，模型也就定下来了，若要预测新的实例，进行以下即可
    Y_predict = regr.predict([X_predict])
    
    ## Step 6: show the result...
    print("Step 6: show the result...")
    #定义向量 w = (w_1,..., w_p) 作为 coef_ ，定义 w_0 作为 intercept_ 。
    print("coef_:"+str(regr.coef_))
    print("intercept_:"+str(regr.intercept_)) 
    print("Y_predict:"+str(Y_predict))
    
    w0 = np.array(regr.intercept_)
    w1 = np.array(regr.coef_.T[1:2])
    
    #合并为一个矩阵
    theta = np.array([w0,w1])
    print(theta)
    
    ## Step 7: show Regression plot...
    print("Step 7: show Regression plot...")
    #plotRegression(X, Y, theta)
    plotRegression_new(X, Y, theta)
    
    ## Step 8: math Pearson...
    print("Step 8: math Pearson...")
    xMat = np.mat(X)                                                    #创建xMat矩阵
    yMat = np.mat(Y)                                                 #创建yMat矩阵
    yHat = predict(theta, xMat)
    
    Pearson = computeCorrelation(yHat, yMat)
    print(Pearson)