java 加权线性回归加权线性回归方程

转载

数据分析家 2024-02-05 12:16:14

文章标签 java 加权线性回归拟合线性回归数据 文章分类 Java 后端开发

算法特征:
回归曲线上的每一点均对应一个独立的线性方程, 该线性方程由一组经过加权后的残差决定. 残差来源于待拟合数据点与拟合超平面在相空间的距离, 权重依赖于待拟合数据点与拟合数据点在参数空间的距离.
算法推导:
待拟合方程:
\begin{equation}\label{eq_1}
h_{\theta}(x) = x^T\theta
\end{equation}
最小二乘法:
\begin{equation}\label{eq_2}
min\ \frac{1}{2}(X^T\theta-\bar{Y})^TW(X^T\theta-\bar{Y})
\end{equation}
其中, $X=[x^1, x^2, ..., x^n]$是由待拟合数据之输入值所组成的矩阵, 每根分量均为一个列矢量, 将该列矢量的最后一个元素置1以满足线性拟合之常数项需求. $\bar{Y}$由待拟合数据之标准输出值组成, 是一个列矢量. $W$由待拟合数据之权重组成, 是一个对角矩阵, 此处取第$i$个对角元为$exp(-(x_i - x)^2 / (2\tau^2))$. $\theta$为待求解的优化变量, 是一个列矢量.

上诉优化问题(\ref{eq_2})为无约束凸优化问题, 存在如下近似解析解:
\begin{equation}\label{eq_3}
\theta = (XWX^T + \varepsilon I)^{-1}XW\bar{Y}
\end{equation}
代码实现:

1 # 局部加权线性回归
  2 # 交叉验证计算泛化误差最小点
  3 
  4 
  5 import numpy
  6 from matplotlib import pyplot as plt
  7 
  8 
  9 # 待拟合不含噪声之目标函数
 10 def oriFunc(x):
 11     y = numpy.exp(-x) * numpy.sin(10*x)
 12     return y
 13 # 待拟合包含噪声之目标函数
 14 def traFunc(x, sigma=0.03):
 15     y = oriFunc(x) + numpy.random.normal(0, sigma, numpy.array(x).size)
 16     return y
 17 
 18     
 19 # 局部加权线性回归之实现
 20 class LW(object):
 21     
 22     def __init__(self, xBound=(0, 3), number=500, tauBound=(0.001, 100), epsilon=1.e-3):
 23         self.__xBound = xBound               # 采样边界
 24         self.__number = number               # 采样数目
 25         self.__tauBound = tauBound           # tau之搜索边界
 26         self.__epsilon = epsilon             # tau之搜索精度
 27         
 28         
 29     def get_data(self):
 30         '''
 31         根据目标函数生成待拟合数据
 32         '''
 33         X = numpy.linspace(*self.__xBound, self.__number)
 34         oriY_ = oriFunc(X)                   # 不含误差之响应
 35         traY_ = traFunc(X)                   # 包含误差之响应
 36         
 37         self.X = numpy.vstack((X.reshape((1, -1)), numpy.ones((1, X.shape[0]))))
 38         self.oriY_ = oriY_.reshape((-1, 1))
 39         self.traY_ = traY_.reshape((-1, 1))
 40         
 41         return self.X, self.oriY_, self.traY_
 42         
 43         
 44     def lw_fitting(self, tau=None):
 45         if not hasattr(self, "X"):
 46             self.get_data()
 47         if tau is None:
 48             if hasattr(self, "bestTau"):
 49                 tau = self.bestTau
 50             else:
 51                 tau = self.get_tau()
 52         
 53         xList, yList = list(), list()
 54         for val in numpy.linspace(*self.__xBound, self.__number * 5):
 55             x = numpy.array([[val], [1]])
 56             theta = self.__fitting(x, self.X, self.traY_, tau)
 57             y = numpy.matmul(theta.T, x)
 58             xList.append(x[0, 0])
 59             yList.append(y[0, 0])
 60             
 61         resiList = list()                                              # 残差计算
 62         for idx in range(self.__number):
 63             x = self.X[:, idx:idx+1]
 64             theta = self.__fitting(x, self.X, self.traY_, tau)
 65             y = numpy.matmul(theta.T, x)
 66             resiList.append(self.traY_[idx, 0] - y[0, 0])
 67             
 68         return xList, yList, self.X[0, :].tolist(), resiList
 69         
 70         
 71     def show(self):
 72         '''
 73         绘图展示整体拟合情况
 74         '''
 75         xList, yList, sampleXList, sampleResiList = self.lw_fitting()
 76         y2List = oriFunc(numpy.array(xList))
 77         fig = plt.figure(figsize=(8, 14))
 78         ax1 = plt.subplot(2, 1, 1)
 79         ax2 = plt.subplot(2, 1, 2)
 80         
 81         ax1.scatter(self.X[0, :], self.traY_[:, 0], c="green", alpha=0.7, label="samples with noise")
 82         ax1.plot(xList, y2List, c="red", lw=4, alpha=0.7, label="standard curve")
 83         ax1.plot(xList, yList, c="black", lw=2, linestyle="--", label="fitting curve")
 84         ax1.set(xlabel="$x$", ylabel="$y$")
 85         ax1.legend()
 86         
 87         ax2.scatter(sampleXList, sampleResiList, c="blue", s=10)
 88         ax2.set(xlabel="$x$", ylabel="$\epsilon$", title="residual distribution")
 89         
 90         plt.show()
 91         plt.close()
 92         fig.tight_layout()
 93         fig.savefig("lw.png", dpi=300)
 94         
 95         
 96     def __fitting(self, x, X, Y_, tau, epsilon=1.e-9):
 97         tmpX = X[0:1, :]
 98         tmpW = (-(tmpX - x[0, 0]) ** 2 / tau ** 2 / 2).reshape(-1)
 99         W = numpy.diag(numpy.exp(tmpW))
100         
101         item1 = numpy.matmul(numpy.matmul(X, W), X.T)
102         item2 = numpy.linalg.inv(item1 + epsilon * numpy.identity(item1.shape[0]))
103         item3 = numpy.matmul(numpy.matmul(X, W), Y_)
104         
105         theta = numpy.matmul(item2, item3)
106         
107         return theta
108 
109         
110     def get_tau(self):
111         '''
112         交叉验证返回最优tau
113         采用黄金分割法计算最优tau
114         '''
115         if not hasattr(self, "X"):
116             self.get_data()
117         
118         lowerBound, upperBound = self.__tauBound
119         lowerTau = self.__calc_lowerTau(lowerBound, upperBound)
120         upperTau = self.__calc_upperTau(lowerBound, upperBound)
121         lowerErr = self.__calc_generalErr(self.X, self.traY_, lowerTau)
122         upperErr = self.__calc_generalErr(self.X, self.traY_, upperTau)
123         
124         while (upperTau - lowerTau) > self.__epsilon:
125             if lowerErr > upperErr:
126                 lowerBound = lowerTau
127                 lowerTau = upperTau
128                 lowerErr = upperErr
129                 upperTau = self.__calc_upperTau(lowerBound, upperBound)
130                 upperErr = self.__calc_generalErr(self.X, self.traY_, upperTau)
131             else:
132                 upperBound = upperTau
133                 upperTau = lowerTau
134                 upperErr = lowerErr
135                 lowerTau = self.__calc_lowerTau(lowerBound, upperBound)
136                 lowerErr = self.__calc_generalErr(self.X, self.traY_, lowerTau)
137                 
138         self.bestTau = (upperTau + lowerTau) / 2
139         return self.bestTau
140         
141         
142     def __calc_generalErr(self, X, Y_, tau):
143         generalErr = 0
144         
145         for idx in range(X.shape[1]):
146             tmpx = X[:, idx:idx+1]
147             tmpy_ = Y_[idx:idx+1, :]
148             tmpX = numpy.hstack((X[:, 0:idx], X[:, idx+1:]))
149             tmpY_ = numpy.vstack((Y_[0:idx, :], Y_[idx+1:, :]))
150             
151             theta = self.__fitting(tmpx, tmpX, tmpY_, tau)
152             tmpy = numpy.matmul(theta.T, tmpx)
153             generalErr += (tmpy_[0, 0] - tmpy[0, 0]) ** 2
154 
155         return generalErr
156         
157         
158     def __calc_lowerTau(self, lowerBound, upperBound):
159         delta = upperBound - lowerBound
160         lowerTau = upperBound - delta * 0.618
161         return lowerTau
162         
163         
164     def __calc_upperTau(self, lowerBound, upperBound):
165         delta = upperBound - lowerBound
166         upperTau = lowerBound + delta * 0.618
167         return upperTau
168         
169         
170         
171         
172 if __name__ == '__main__':
173     obj = LW()
174     obj.show()

View Code

笔者所用示例函数为:
\begin{equation}\label{eq_4}
f(x) = e^{-x}sin(10x)
\end{equation}

结果展示:
使用建议:
1. 局部加权线性回归主要作为一种光滑化的手段存在, 其对非样本覆盖区间的预测能力较低.
2. 在不明确具体拟合公式的前提下, 可采用局部加权线性回归获取一条可以接受的拟合曲线.
3. 在选定合适的加权方式后, 可以计算指定点处相对可靠的0阶及1阶函数信息.