Linear Regression

Contents
I. Simple Linear Regression
II. Least Squares
III. Implementing Simple Linear Regression
IV. Vectorization
V. Metrics for Evaluating Linear Regression: MSE, RMSE, MAE
    1. Mean Squared Error (MSE)
    2. Root Mean Squared Error (RMSE)
    3. Mean Absolute Error (MAE)
VI. The Best Metric for Linear Regression: R Squared
VII. Multiple Linear Regression and the Normal Equation
I. Simple Linear Regression
Simple linear regression handles data with a single feature: it assumes the label y is roughly a linear function of the feature x and looks for the line $\hat{y} = ax + b$ that best fits the observed points $(x^{(i)}, y^{(i)})$.
II. Least Squares
"Best fit" is made precise by least squares: choose a and b to minimize the sum of squared residuals
$$J(a, b) = \sum_{i=1}^{m}\left(y^{(i)} - ax^{(i)} - b\right)^2$$
Taking the partial derivative with respect to b and setting it to zero:
$$\frac{\partial J}{\partial b} = -2\sum_{i=1}^{m}\left(y^{(i)} - ax^{(i)} - b\right) = 0 \;\Rightarrow\; b = \bar{y} - a\bar{x}$$
Taking the partial derivative with respect to a and substituting the expression for b:
$$\frac{\partial J}{\partial a} = -2\sum_{i=1}^{m}\left(y^{(i)} - ax^{(i)} - b\right)x^{(i)} = 0 \;\Rightarrow\; a = \frac{\sum_{i=1}^{m}\left(x^{(i)} - \bar{x}\right)\left(y^{(i)} - \bar{y}\right)}{\sum_{i=1}^{m}\left(x^{(i)} - \bar{x}\right)^{2}}$$
Vectorizing, the two sums in a are just dot products of centered vectors:
$$a = \frac{(x - \bar{x}) \cdot (y - \bar{y})}{(x - \bar{x}) \cdot (x - \bar{x})}$$
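A quick numeric check that the vectorized form computes the same slope as the summation form (a minimal sketch on synthetic data; all names here are illustrative):

import numpy as np

rng = np.random.default_rng(0)
x = rng.random(100)
y = 3.0 * x + rng.normal(size=100)
x_mean, y_mean = x.mean(), y.mean()
# summation form
a_loop = sum((x_i - x_mean) * (y_i - y_mean) for x_i, y_i in zip(x, y)) \
       / sum((x_i - x_mean) ** 2 for x_i in x)
# dot-product form
a_vec = (x - x_mean).dot(y - y_mean) / (x - x_mean).dot(x - x_mean)
print(np.isclose(a_loop, a_vec))  # True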
III. Implementing Simple Linear Regression
First, prepare some simple data:
import numpy as np
import matplotlib.pyplot as plt
x = np.array([1., 2., 3., 4., 5.])
y = np.array([1., 3., 2., 3., 5.])
plt.scatter(x, y)
plt.axis([0, 6, 0, 6])
plt.show()
x_mean = np.mean(x)
y_mean = np.mean(y)
up = 0
down = 0
for x_i, y_i in zip(x, y):
    up += (x_i - x_mean) * (y_i - y_mean)
    down += (x_i - x_mean) ** 2
a = up/down
b = y_mean - a * x_mean
With a and b solved, we have the equation of the line, $\hat{y} = ax + b$.
y_hat = a * x + b
plt.scatter(x, y)
plt.plot(x, y_hat, 'r')
plt.axis([0, 6, 0, 6])
plt.show()
# now predict for a new input
x_new = 6
y_predict = a * x_new + b
Next, rewrite this using object-oriented programming:
import numpy as np

class SimpleLinearRegression1(object):
    """Simple linear regression, fit with an explicit for loop."""

    def __init__(self):
        self.a_ = None
        self.b_ = None

    def fit(self, x_train, y_train):
        assert x_train.ndim == 1, \
            "Simple linear regression can only solve single feature training data"
        assert len(x_train) == len(y_train), \
            "the size of x_train must be equal to the size of y_train"
        x_mean = np.mean(x_train)
        y_mean = np.mean(y_train)
        up = 0.0
        down = 0.0
        for x_i, y_i in zip(x_train, y_train):
            up += (x_i - x_mean) * (y_i - y_mean)
            down += (x_i - x_mean) ** 2
        self.a_ = up / down
        self.b_ = y_mean - self.a_ * x_mean
        return self

    def predict(self, x_predict):
        """Handle a whole vector of inputs at once."""
        assert x_predict.ndim == 1, \
            "Simple linear regression can only solve single feature data"
        assert self.a_ is not None and self.b_ is not None, \
            "must fit before predict!"
        return [self._predict(x) for x in x_predict]

    def _predict(self, x_single):
        """Predict for a single input."""
        return self.a_ * x_single + self.b_

    def __repr__(self):
        return "SimpleLinearRegression1()"
if __name__ == '__main__':
    x = np.array([1., 2., 3., 4., 5.])
    y = np.array([1., 3., 2., 3., 5.])
    x_new = 6
    reg1 = SimpleLinearRegression1()
    reg1.fit(x, y)
    y_predict = reg1.predict(np.array([x_new]))
    print(y_predict)  # [5.2]
    print(reg1.a_)    # 0.8
    print(reg1.b_)    # 0.39999999999999947
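As a cross-check, sklearn's estimator should recover the same slope and intercept; the import alias avoids clashing with the LinearRegression class we define later, and the reshape is needed because sklearn expects a 2-D feature matrix:

from sklearn.linear_model import LinearRegression as SKLinearRegression
import numpy as np

x = np.array([1., 2., 3., 4., 5.])
y = np.array([1., 3., 2., 3., 5.])
sk_reg = SKLinearRegression()
sk_reg.fit(x.reshape(-1, 1), y)            # reshape the 1-D vector into a column
print(sk_reg.coef_[0], sk_reg.intercept_)  # ~0.8 and ~0.4, matching reg1.a_ and reg1.b_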
IV. Vectorization
The sums in fit are really dot products, so the for loop can be replaced by vector operations, which is far faster:
import numpy as np

class SimpleLinearRegression:

    def __init__(self):
        """Initialize the Simple Linear Regression model."""
        self.a_ = None
        self.b_ = None

    def _r2_score(self, y_true, y_predict):
        """Compute the R Square between y_true and y_predict."""
        mse = np.sum((y_true - y_predict) ** 2) / len(y_true)
        return 1 - mse / np.var(y_true)

    def fit(self, x_train, y_train):
        """Train the model on the training set x_train, y_train."""
        assert x_train.ndim == 1, \
            "Simple Linear Regressor can only solve single feature training data."
        assert len(x_train) == len(y_train), \
            "the size of x_train must be equal to the size of y_train"
        x_mean = np.mean(x_train)
        y_mean = np.mean(y_train)
        # numerator and denominator are each a single dot product
        self.a_ = (x_train - x_mean).dot(y_train - y_mean) / (x_train - x_mean).dot(x_train - x_mean)
        self.b_ = y_mean - self.a_ * x_mean
        return self

    def predict(self, x_predict):
        """Given a vector x_predict, return a vector of predictions."""
        assert x_predict.ndim == 1, \
            "Simple Linear Regressor can only solve single feature training data."
        assert self.a_ is not None and self.b_ is not None, \
            "must fit before predict!"
        return np.array([self._predict(x) for x in x_predict])

    def _predict(self, x_single):
        """Given a single value x, return its prediction."""
        return self.a_ * x_single + self.b_

    def score(self, x_test, y_test):
        """Evaluate the model on the test set x_test, y_test."""
        y_predict = self.predict(x_test)
        return self._r2_score(y_test, y_predict)

    def __repr__(self):
        return "SimpleLinearRegression()"
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets

boston = datasets.load_boston()
X = boston.data[:, 5]  # use only the number-of-rooms feature (RM)
y = boston.target
plt.scatter(X, y)
plt.show()  # the flat row at the very top is the capped maximum value
X = X[y < 50.0]
y = y[y < 50.0]
plt.scatter(X, y)
plt.show()  # the capped points have been removed

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=666)
reg = SimpleLinearRegression()
reg.fit(X_train, y_train)
plt.scatter(X_train, y_train)
plt.plot(X_train, reg.predict(X_train), color="r")
plt.show()
y_predict = reg.predict(X_test)
V. Metrics for Evaluating Linear Regression: MSE, RMSE, MAE

1. Mean Squared Error (MSE)
$$MSE = \frac{1}{m}\sum_{i=1}^{m}\left(y^{(i)} - \hat{y}^{(i)}\right)^{2}$$

2. Root Mean Squared Error (RMSE)
$$RMSE = \sqrt{MSE} = \sqrt{\frac{1}{m}\sum_{i=1}^{m}\left(y^{(i)} - \hat{y}^{(i)}\right)^{2}}$$
This way, RMSE has the same units as y.

3. Mean Absolute Error (MAE)
$$MAE = \frac{1}{m}\sum_{i=1}^{m}\left|y^{(i)} - \hat{y}^{(i)}\right|$$
We did not choose this as the objective function in the first place because the absolute value is not differentiable everywhere, which makes it a poor loss function; as an evaluation metric, however, it works quite well.

Next, let's run simple linear regression on the Boston housing prices and evaluate the model with the three metrics above.
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets
boston = datasets.load_boston()
print(boston.DESCR)
boston.feature_names
x = boston.data[:, 5]  # use only the number-of-rooms feature (RM)
y = boston.target      # Boston housing prices
plt.scatter(x, y)
plt.show()
The scatter plot reveals a row of outliers along the top. This is likely because the survey capped its values: anything above 50 was recorded as 50, so prices at the cap must be handled explicitly. This is why data preprocessing is so important.
x = x[y < np.max(y)]
y = y[y < np.max(y)]
plt.scatter(x, y)
plt.show()
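The metrics we are about to write need predictions to score, so split, fit, and predict as before (SimpleLinearRegression is the vectorized class from section IV):

from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=666)
reg = SimpleLinearRegression()
reg.fit(x_train, y_train)
y_predict = reg.predict(x_test)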
Let's wrap the three metrics as functions:
import numpy as np
from math import sqrt

def mean_square_error(y_true, y_predict):
    assert len(y_true) == len(y_predict), \
        "the size of y_true must be equal to the size of y_predict"
    return np.sum((y_predict - y_true) ** 2) / len(y_true)

def root_mean_square_error(y_true, y_predict):
    return sqrt(mean_square_error(y_true, y_predict))

def mean_absolute_error(y_true, y_predict):
    assert len(y_true) == len(y_predict), \
        "the size of y_true must be equal to the size of y_predict"
    return np.sum(np.absolute(y_predict - y_true)) / len(y_true)
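With y_test and y_predict from the split above, the hand-written metrics can be evaluated directly (exact values depend on the split):

print(mean_square_error(y_test, y_predict))       # MSE, in squared units of y
print(root_mean_square_error(y_test, y_predict))  # RMSE, same units as y
print(mean_absolute_error(y_test, y_predict))     # MAE, same units as y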
In fact, sklearn already provides these metrics, so why write them ourselves? Because it is not enough to just call libraries; you should also know how the underlying code is implemented. Let's check whether sklearn's functions agree with our own.
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
mean_absolute_error(y_test, y_predict)
mean_squared_error(y_test, y_predict)
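Older sklearn releases do not ship a standalone RMSE function; taking the square root of its MSE should match our root_mean_square_error:

from math import sqrt
sqrt(mean_squared_error(y_test, y_predict))  # RMSE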
VI. The Best Metric for Linear Regression: R Squared
MSE, RMSE, and MAE all carry the units of y, which makes models on different problems hard to compare. R Squared fixes this by comparing our model against the baseline that always predicts the mean:
$$R^{2} = 1 - \frac{\sum_{i}\left(\hat{y}^{(i)} - y^{(i)}\right)^{2}}{\sum_{i}\left(\bar{y} - y^{(i)}\right)^{2}} = 1 - \frac{MSE(\hat{y}, y)}{Var(y)}$$
# R Squared, computed from MSE and the variance of y
1 - mean_squared_error(y_test, y_predict) / np.var(y_test)
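sklearn also exposes this metric directly as r2_score, which should agree with the expression above:

from sklearn.metrics import r2_score
r2_score(y_test, y_predict)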
VII. Multiple Linear Regression and the Normal Equation
With multiple features, stack the samples into a matrix and prepend a column of ones, $X_b = \begin{bmatrix}1 & X\end{bmatrix}$. Minimizing $\sum_{i}\left(y^{(i)} - X_b^{(i)}\theta\right)^{2}$ has the closed-form normal equation solution
$$\hat{\theta} = \left(X_b^{T} X_b\right)^{-1} X_b^{T} y$$
where $\theta_0$ is the intercept and $\theta_1, \ldots, \theta_n$ are the coefficients.
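Before wrapping the normal equation in a class, a minimal sketch on synthetic data (all names here are illustrative; np.linalg.lstsq serves as an independent, numerically more stable reference):

import numpy as np

rng = np.random.default_rng(42)
X_demo = rng.random((100, 3))
X_b = np.hstack([np.ones((100, 1)), X_demo])  # prepend the column of ones
true_theta = np.array([4.0, 3.0, -2.0, 0.5])  # intercept followed by coefficients
y_demo = X_b.dot(true_theta) + rng.normal(scale=0.1, size=100)

theta = np.linalg.inv(X_b.T.dot(X_b)).dot(X_b.T).dot(y_demo)
print(theta)                                         # close to true_theta
print(np.linalg.lstsq(X_b, y_demo, rcond=None)[0])   # same solution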
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets
boston = datasets.load_boston()
X = boston.data
y = boston.target
X = X[y < 50.0]
y = y[y < 50.0]
import numpy as np

class LinearRegression:

    def __init__(self):
        """Initialize the Linear Regression model."""
        self.coef_ = None
        self.intercept_ = None
        self._theta = None

    def _mean_squared_error(self, y_true, y_predict):
        """Compute the MSE between y_true and y_predict."""
        assert len(y_true) == len(y_predict), \
            "the size of y_true must be equal to the size of y_predict"
        return np.sum((y_true - y_predict) ** 2) / len(y_true)

    def _r2_score(self, y_true, y_predict):
        """Compute the R Square between y_true and y_predict."""
        return 1 - self._mean_squared_error(y_true, y_predict) / np.var(y_true)

    def fit_normal(self, X_train, y_train):
        """Train the model on X_train, y_train via the normal equation."""
        assert X_train.shape[0] == y_train.shape[0], \
            "the size of X_train must be equal to the size of y_train"
        X_b = np.hstack([np.ones((len(X_train), 1)), X_train])
        self._theta = np.linalg.inv(X_b.T.dot(X_b)).dot(X_b.T).dot(y_train)
        self.intercept_ = self._theta[0]
        self.coef_ = self._theta[1:]
        return self

    def fit_gd(self, X_train, y_train, eta=0.01, n_iters=1e4):
        """Train the model on X_train, y_train via batch gradient descent."""
        assert X_train.shape[0] == y_train.shape[0], \
            "the size of X_train must be equal to the size of y_train"

        def J(theta, X_b, y):
            try:
                return np.sum((y - X_b.dot(theta)) ** 2) / len(y)
            except:
                # guard against overflow when eta is too large
                return float('inf')

        def dJ(theta, X_b, y):
            # non-vectorized version, kept for reference:
            # res = np.empty(len(theta))
            # res[0] = np.sum(X_b.dot(theta) - y)
            # for i in range(1, len(theta)):
            #     res[i] = (X_b.dot(theta) - y).dot(X_b[:, i])
            # return res * 2 / len(X_b)
            return X_b.T.dot(X_b.dot(theta) - y) * 2. / len(X_b)

        def gradient_descent(X_b, y, initial_theta, eta, n_iters=1e4, epsilon=1e-8):
            theta = initial_theta
            cur_iter = 0
            while cur_iter < n_iters:
                gradient = dJ(theta, X_b, y)
                last_theta = theta
                theta = theta - eta * gradient
                if abs(J(theta, X_b, y) - J(last_theta, X_b, y)) < epsilon:
                    break
                cur_iter += 1
            return theta

        X_b = np.hstack([np.ones((len(X_train), 1)), X_train])
        initial_theta = np.zeros(X_b.shape[1])
        self._theta = gradient_descent(X_b, y_train, initial_theta, eta, n_iters)
        self.intercept_ = self._theta[0]
        self.coef_ = self._theta[1:]
        return self

    def fit_sgd(self, X_train, y_train, n_iters=5, t0=5, t1=50):
        """Train the model on X_train, y_train via stochastic gradient descent;
        here n_iters is the number of passes over the whole training set."""
        assert X_train.shape[0] == y_train.shape[0], \
            "the size of X_train must be equal to the size of y_train"
        assert n_iters >= 1

        def dJ_sgd(theta, X_b_i, y_i):
            # gradient estimated from a single sample
            return X_b_i * (X_b_i.dot(theta) - y_i) * 2.

        def sgd(X_b, y, initial_theta, n_iters, t0=5, t1=50):
            def learning_rate(t):
                # decaying learning rate, simulated-annealing style
                return t0 / (t + t1)

            theta = initial_theta
            m = len(X_b)
            for cur_iter in range(n_iters):
                # shuffle so every pass visits the samples in a new order
                indexes = np.random.permutation(m)
                X_b_new = X_b[indexes]
                y_new = y[indexes]
                for i in range(m):
                    gradient = dJ_sgd(theta, X_b_new[i], y_new[i])
                    theta = theta - learning_rate(cur_iter * m + i) * gradient
            return theta

        X_b = np.hstack([np.ones((len(X_train), 1)), X_train])
        initial_theta = np.random.randn(X_b.shape[1])
        self._theta = sgd(X_b, y_train, initial_theta, n_iters, t0, t1)
        self.intercept_ = self._theta[0]
        self.coef_ = self._theta[1:]
        return self

    def predict(self, X_predict):
        """Given a matrix X_predict, return a vector of predictions."""
        assert self.intercept_ is not None and self.coef_ is not None, \
            "must fit before predict!"
        assert X_predict.shape[1] == len(self.coef_), \
            "the feature number of X_predict must be equal to X_train"
        X_b = np.hstack([np.ones((len(X_predict), 1)), X_predict])
        return X_b.dot(self._theta)

    def score(self, X_test, y_test):
        """Evaluate the model on the test set X_test and y_test."""
        y_predict = self.predict(X_test)
        return self._r2_score(y_test, y_predict)

    def __repr__(self):
        return "LinearRegression()"
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=666)
reg = LinearRegression()
reg.fit_normal(X_train, y_train)
reg.score(X_test, y_test)
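The same model can also be fit with fit_gd or fit_sgd. On the unscaled Boston features, batch gradient descent with the default eta easily overflows, so a safer sketch standardizes the features first (StandardScaler is standard sklearn; the n_iters value here is just a guess):

from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
scaler.fit(X_train)
X_train_std = scaler.transform(X_train)
X_test_std = scaler.transform(X_test)

reg_gd = LinearRegression()
reg_gd.fit_gd(X_train_std, y_train)
print(reg_gd.score(X_test_std, y_test))

reg_sgd = LinearRegression()
reg_sgd.fit_sgd(X_train_std, y_train, n_iters=50)
print(reg_sgd.score(X_test_std, y_test))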