python读取mysql实现一元和多元的线性拟合
一元线性方程拟合
# -*- coding:utf-8 -*-
# __author__ = "LQ"
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from DataAnalysis.TeachingModel.dbc import dbcConnect
def main(examDf,colsName):
# 绘制散点图,examDf.jt为X轴,examDf.hk为Y轴
plt.scatter(examDf[colsName[0]], examDf[colsName[1]], color='darkgreen', label="Exam Data")
# 添加图的标签(x轴,y轴)
plt.xlabel(colsName[0]) # 设置X轴标签
plt.ylabel(colsName[1]) # 设置Y轴标签
plt.show() # 显示图像
rDf = examDf.corr() # 查看数据间的相关系数
print("相关系数:", rDf)
# 拆分训练集和测试集(train_test_split是存在与sklearn中的函数)
X_train, X_test, Y_train, Y_test = train_test_split(examDf[colsName[0]], examDf[colsName[1]], train_size=0.8)
# train为训练数据,test为测试数据,examDf为源数据,train_size 规定了训练数据的占比
print("自变量---源数据:", examDf[colsName[0]].shape, "; 训练集:", X_train.shape, "; 测试集:", X_test.shape)
print("因变量---源数据:", examDf[colsName[1]].shape, "; 训练集:", Y_train.shape, "; 测试集:", Y_test.shape)
# 调用线性规划包
model = LinearRegression()
# 在这里加一段
X_train = X_train.values.reshape(-1, 1)
X_test = X_test.values.reshape(-1, 1)
# 线性回归训练
model.fit(X_train, Y_train) # 调用线性回归包
a = model.intercept_ # 截距
b = model.coef_ # 回归系数
# 训练数据的预测值
y_train_pred = model.predict(X_train)
# 绘制最佳拟合线:标签用的是训练数据的预测值y_train_pred
plt.plot(X_train, y_train_pred, color='blue', linewidth=2, label="best line")
# 测试数据散点图
plt.scatter(X_train, Y_train, color='darkgreen', label="train data")
plt.scatter(X_test, Y_test, color='red', label="test data")
# 添加图标标签
plt.legend(loc=2) # 图标位于左上角,即第2象限,类似的,1为右上角,3为左下角,4为右下角
plt.xlabel(colsName[0]) # 添加 X 轴名称
plt.ylabel(colsName[1]) # 添加 Y 轴名称
plt.show() # 显示图像
print("拟合参数:截距", a, ",回归系数:", b)
print("最佳拟合线: Y = ", round(a, 2), "+", round(b[0], 2), "* X") # 显示线性方程,并限制参数的小数位为两位
if __name__ == '__main__':
sql = '''SELECT
total_log_num,
getscore
FROM
t_study_behavior_bak
'''
df = dbcConnect.selectDf(sql)
colsName=['total_log_num','getscore']
main(df,colsName)
多元线性方程拟合
# -*- coding:utf-8 -*-
# __author__ = "LQ"
import pymysql
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from pandas import DataFrame, Series
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from DataAnalysis.TeachingModel.dbc import dbcConnect
from sklearn import preprocessing
def main(d,yName):
# 检验数据
print(d.describe()) # 数据描述,会显示最值,平均数等信息,可以简单判断数据中是否有异常值
print(d[d.isnull() == True].count()) # 检验缺失值,若输出为0,说明该列没有缺失值
# 输出相关系数,判断是否值得做线性回归模型
print(d.corr()) # 0-0.3弱相关;0.3-0.6中相关;0.6-1强相关;
# 拆分训练集和测试集
X_train, X_test, Y_train, Y_test = train_test_split(d.ix[:, :5], d.getscore, train_size=0.8)
# new_examDf.ix[:,:2]取了数据中的前两列为自变量,此处与单变量的不同
print("自变量---源数据:", d.ix[:, :5].shape, "; 训练集:", X_train.shape, "; 测试集:", X_test.shape)
print("因变量---源数据:", d[yName].shape, "; 训练集:", Y_train.shape, "; 测试集:", Y_test.shape)
# 调用线性规划包
model = LinearRegression()
model.fit(X_train, Y_train) # 线性回归训练
a = model.intercept_ # 截距
b = model.coef_ # 回归系数
print("拟合参数:截距", a, ",回归系数:", b)
for b0 in b:
print(b0)
# 显示线性方程,并限制参数的小数位为两位
print("最佳拟合线: Y = ", round(a, 2), "+", round(b[0], 2), "* X1", "+", round(b[1], 2), "* X2", "+", round(b[2], 2), "* X3", "+", round(b[3], 2), "* X4", "+", round(b[4], 2), "* X5")
Y_pred = model.predict(X_test) # 对测试集数据,用predict函数预测
plt.plot(range(len(Y_pred)), Y_pred, 'red', linewidth=2.5, label="predict data")
plt.plot(range(len(Y_test)), Y_test, 'green', label="test data")
plt.legend(loc=2)
plt.show() # 显示预测值与测试值曲线
# 均方差
# 查看残差平方的均值(mean square error,MSE)
print("Mean squared error: %.2f"
% mean_squared_error(Y_test, Y_pred))
# Explained variance score: 1 is perfect prediction
# R2 决定系数(拟合优度)
# 模型越好:r2→1
# 模型越差:r2→0
print('Variance score: %.2f' % r2_score(Y_test, Y_pred))
# Plot outputs
plt.scatter(range(len(Y_pred)), Y_test, color='black')
plt.plot(range(len(Y_test)), Y_pred, color='blue', linewidth=3)
plt.xticks(())
plt.yticks(())
plt.show()
if __name__ == '__main__':
sql = '''SELECT
study_document_num,
actual_brain_num,
actual_discuss_num,
actual_quiz_num,
actual_homework_num,
getscore
FROM
t_study_behavior_rate
'''
conn=dbcConnect.dbcconnect()
cursor = conn.cursor()
cursor.execute(sql)
# 获取剩余结果所有数据
results = cursor.fetchall()
# 获取列名
cols = [i[0] for i in cursor.description]
# sql内表转换pandas的DF
df = pd.DataFrame(np.array(results), columns=cols).astype(float)
conn.commit()
cursor.close()
# 归一化处理
min_max_scaler = preprocessing.MinMaxScaler()
df_minMax = min_max_scaler.fit_transform(df)
examDf = pd.DataFrame(df_minMax, columns=cols).astype(float)
yName = 'getscore'
main(examDf,yName)