# Import required libraries
import logging
import copy
import warnings

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_percentage_error, mean_squared_error
# import lightgbm as lgb  # optional: swap in LightGBM instead of the random forest below

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# Make matplotlib render Chinese axis labels and minus signs correctly
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False

warnings.filterwarnings('ignore')
def get_MAPE(real_data, predict_data):
    """Return an accuracy-style score: 1 - MAPE, rounded to 2 decimals."""
    real_data = real_data.flatten()
    predict_data = predict_data.flatten()
    # Drop points whose true value is zero so the percentage error is well defined
    mask = real_data != 0
    new_real = real_data[mask]
    new_pred = predict_data[mask]
    # MAPE = np.mean(np.abs(new_real - new_pred) / new_real)
    MAPE = mean_absolute_percentage_error(new_real, new_pred)
    return round(1 - abs(MAPE), 2)
def get_MAE(real_data, predict_data):
    """Return the mean absolute error over points whose true value is non-zero."""
    real_data = real_data.flatten()
    predict_data = predict_data.flatten()
    mask = real_data != 0
    new_real = real_data[mask]
    new_pred = predict_data[mask]
    MAE = np.sum(np.abs(new_real - new_pred)) / len(new_real)
    return round(MAE, 2)
def get_RMSE(real_data, predict_data):
    """Return 1 - NRMSE, i.e. the root-mean-square error normalised by the peak true value."""
    real_data = real_data.flatten()
    predict_data = predict_data.flatten()
    max_real = np.max(real_data)
    RMSE = np.sqrt(np.mean((real_data - predict_data) ** 2)) / max_real
    return 1 - round(RMSE, 2)
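# A quick, hypothetical sanity check of the three helpers above (values made up purely for
# illustration; note that get_MAPE and get_RMSE return accuracy-style scores, i.e. 1 minus the error):
# _y_true = np.array([100.0, 120.0, 80.0])
# _y_pred = np.array([110.0, 115.0, 85.0])
# print(get_MAPE(_y_true, _y_pred), get_MAE(_y_true, _y_pred), get_RMSE(_y_true, _y_pred))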
url = f'./.csv'
def get_train_test_data(url, feature):
    data = pd.read_csv(url)  # the dataset is assumed to live in a CSV file with 'time' and 'value' columns
    data.set_index('time', inplace=True)
    data.index = pd.to_datetime(data.index)
    while data['value'].isnull().any():
        # TODO: proper missing-value handling; for now, fill each gap with the value at the same time one day earlier
        df_15_min_shift_1 = data['value'].shift(1, freq='D')
        data['value'].fillna(df_15_min_shift_1, inplace=True)
    df = data.copy()
    df['time'] = df.index
    df['hour'] = df['time'].dt.hour
    df['period'] = df['time'].dt.hour * 4 + df['time'].dt.minute // 15  # index of the 15-minute slot within the day (0-95)
    df['day_of_week'] = df['time'].dt.weekday
    df['shift_value'] = df['value'].shift(96)  # same slot on the previous day (96 x 15 min = 1 day)
    df['rolling_H_mean'] = df['shift_value'].rolling(window='60T').mean()
    # Daily max/min of the lagged series, placed at midnight and filled forward across the day
    df['value_max'] = df['shift_value'].resample('D').max()
    df['value_min'] = df['shift_value'].resample('D').min()
    df['value_min'].fillna(method='ffill', inplace=True)
    df['value_max'].fillna(method='ffill', inplace=True)
    df['diff_4'] = df['shift_value'].diff(4)
    df['diff_4'].fillna(method='bfill', inplace=True)
    df_train_test = df[96:]  # drop the first day, whose lag features are NaN
    df_train_test.to_csv('./re.csv')
    df_train = df_train_test[:'2024-03-31']
    df_test = df_train_test['2024-04-01':]
    X_train = df_train[feature]
    Y_train = df_train['value']
    X_test = df_test[feature]
    Y_test = df_test['value']
    return X_train, Y_train, X_test, Y_test
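# The loader above assumes (my reading of the code, not stated explicitly by the author) a 15-minute
# load series shaped roughly like this hypothetical sample, with a 'time' column and a 'value' column:
#     time,value
#     2024-01-01 00:00:00,123.4
#     2024-01-01 00:15:00,118.9
#     ...
# Data through 2024-03-31 is used for training; everything from 2024-04-01 onward is held out for testing.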
# df_train_test[columns_to_normalize] = scaler.fit_transform(df_train_test[columns_to_normalize])
from sklearn.model_selection import GridSearchCV
def get_model():
    # Hand-tuned hyperparameters (see the GridSearchCV sketch below for how they might be searched)
    best_n_es = 450
    best_max_depth = 4
    best_sample_split = 5
    best_min_samples_leaf = 4
    # Create the RandomForestRegressor instance
    rf = RandomForestRegressor(n_estimators=best_n_es,
                               max_depth=best_max_depth,
                               min_samples_split=best_sample_split,
                               min_samples_leaf=best_min_samples_leaf,
                               random_state=42)
    return rf
# Alternative: create a LightGBM model instead
# lgbm_model = lgb.LGBMRegressor(objective='regression', metric='l2', learning_rate=0.05, n_estimators=100)
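# GridSearchCV is imported above but never used; below is a minimal sketch of how the hand-tuned values
# in get_model() could be searched for (the parameter grid is my own illustration, not the author's):
# param_grid = {
#     'n_estimators': [300, 450, 600],
#     'max_depth': [3, 4, 6],
#     'min_samples_split': [2, 5, 10],
#     'min_samples_leaf': [1, 4],
# }
# search = GridSearchCV(RandomForestRegressor(random_state=42), param_grid,
#                       scoring='neg_mean_absolute_error', cv=3, n_jobs=-1)
# search.fit(X_train, Y_train)
# print(search.best_params_)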
def test(lgbm_model, X_test, Y_test):
    mape_list = []
    mae_list = []
    rmse_list = []
    time_list = []
    # Number of subplot rows and columns (used only by the commented-out plotting code)
    rows, cols = 5, 7
    # Create a figure and a grid of subplots
    # fig, axs = plt.subplots(rows, cols, figsize=(20, 15))
    for i in range(len(Y_test) // 96):  # score one test day (96 quarter-hour points) at a time
        time_list.append(str(X_test.index[i * 96])[:10])
        label = X_test[i * 96:(i + 1) * 96]
        y = Y_test[i * 96:(i + 1) * 96]
        # print(label)
        # print(y)
        pred = lgbm_model.predict(label)
        df_pred = pd.DataFrame(pred, index=y.index)
        # Aggregate prediction and truth to hourly resolution before scoring
        df_pred = df_pred.resample('60T').mean()
        y = y.resample('60T').mean()
        mape = get_MAPE(real_data=np.array(y), predict_data=np.array(df_pred))
        # print(mape)
        if np.isnan(mape):
            print("NaN score")
            # print(str(X_test.index[i * 96])[:10])
        mape_list.append(mape)
        mae = get_MAE(real_data=np.array(y), predict_data=np.array(df_pred))
        mae_list.append(mae)
        rmse = get_RMSE(real_data=np.array(y), predict_data=np.array(df_pred))
        rmse_list.append(rmse)
        # Plotting (disabled): one subplot per day
        # row = i // cols
        # col = i % cols
        # axs[row, col].plot(df_pred, label='predicted')
        # axs[row, col].plot(y, label='actual')
        # axs[row, col].set_title(str(X_test.index[i * 96])[:10] + ' MAPE:' + str(mape) + ' MAE:' + str(mae) + ' 1-rmse:' + str(rmse))
        # plt.grid(True)
        # plt.legend(loc='right')
        # plt.gcf().autofmt_xdate()
        # plt.tight_layout()
        # plt.title('April results')
        # plt.show()
        # print(df_pred)
        # print(i)
        # Plotting (disabled): one standalone figure per day
        # plt.plot(df_pred, label='predicted' + ' MAPE:' + str(mape) + ' 1-rmse:' + str(1 - rmse))
        # plt.plot(y, label='actual')
        # plt.legend(loc='right')
        # plt.title(str(X_test.index[i * 96])[:10] + ' MAPE:' + str(mape) + ' MAE:' + str(mae) + ' rmse:' + str(rmse))
        # plt.grid(True)
        # plt.gcf().autofmt_xdate()
        # plt.show()
        # plt.close()
    mape_df = pd.DataFrame(mape_list, columns=['mape'], index=time_list)
    mae_df = pd.DataFrame(mae_list, columns=['mae'], index=time_list)
    rmse_df = pd.DataFrame(rmse_list, columns=['rmse'], index=time_list)
    mape_df = mape_df.set_index(pd.to_datetime(time_list))
    mae_df = mae_df.set_index(pd.to_datetime(time_list))
    rmse_df = rmse_df.set_index(pd.to_datetime(time_list))
    rest = pd.concat([mape_df, mae_df, rmse_df], axis=1)
    # rest.to_csv('./结果mape.csv')
    return mape_df, mae_df, rmse_df
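# test() returns one row per test day in each of the three DataFrames, indexed by date, so downstream
# code can slice them by month (as validation() does below with '2024-04', '2024-05' and '2024-06').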
def train_test(lgbm_model, X_train, Y_train):
    # Fit the model (a RandomForestRegressor from get_model(), despite the lgbm_model name)
    lgbm_model.fit(X_train, Y_train)
    return lgbm_model
def show_importance(model, feat_list=None):
    """
    Log and return the feature importances, sorted in descending order.
    """
    if feat_list is None:
        feat_list = model.feature_names_in_
    impo_list = model.feature_importances_
    pair_list = [(fe, round(im, 2)) for fe, im in zip(feat_list, impo_list)]
    pair_list = sorted(pair_list, key=lambda x: x[1], reverse=True)
    logger.info(f'feature importance: {pair_list}')
    return pair_list
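# Note: feature_names_in_ and feature_importances_ are standard scikit-learn attributes; they exist only
# after fit(), and feature_names_in_ only when the estimator was fitted on a pandas DataFrame (scikit-learn
# >= 1.0), which is the case here. Hypothetical call and output shape, for illustration only:
# show_importance(lgbm_model)  # -> [('shift_value', 0.62), ('period', 0.21), ...]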
def validation(url, feature_list):
    X_train, Y_train, X_test, Y_test = get_train_test_data(url, feature_list)
    lgbm_model = get_model()
    lgbm_model = train_test(lgbm_model, X_train, Y_train)
    print(feature_list)
    mape_df, mae_df, rmse_df = test(lgbm_model, X_test, Y_test)
    mape_df = mape_df[mape_df >= 0.01]  # mask out days whose 1-MAPE score is below 0.01 (those entries become NaN)
    # mae_df = mae_df[mape_df[mape_df >= 0.01].index]
    # rmse_df = rmse_df[mape_df[mape_df >= 0.01].index]
    print('-----------2024-04-----------')
    print("1-MAPE:\t", np.mean(mape_df.loc['2024-04']))
    print("MAE:\t", np.mean(mae_df.loc['2024-04']))
    print("RMSE:\t", np.mean(rmse_df.loc['2024-04']))
    print('-----------2024-05-----------')
    print("1-MAPE:\t", np.mean(mape_df.loc['2024-05']))
    print("MAE:\t", np.mean(mae_df.loc['2024-05']))
    print("RMSE:\t", np.mean(rmse_df.loc['2024-05']))
    print('-----------2024-06-----------')
    print("1-MAPE:\t", np.mean(mape_df.loc['2024-06']))
    print("MAE:\t", np.mean(mae_df.loc['2024-06']))
    print("RMSE:\t", np.mean(rmse_df.loc['2024-06']))
    importance_info = show_importance(lgbm_model)
    print(importance_info)
    return np.mean(mape_df)
if __name__ == '__main__':
    url = './周娟_AllFeature.csv'
    Necessary_Feature = [
        'period',
        'shift_value',
        # 'temperature_2m',
    ]
    Time_Feature = [
        'h_d',
        'day_of_week',
        'shortwave_radiation',
        'surface_pressure',
        'wind_speed_10m',
        'relative_humidity_2m'
    ]
    feature = [
        'period',
        'shift_value',
        'diff_4',
        'rolling_H_mean',
        'value_min',
        # 'value_max',
        'relative_humidity_2m',  # humidity
        'wind_speed_10m',        # wind speed
        'temperature_2m',        # temperature
        'shortwave_radiation'    # shortwave radiation
    ]
    best_features = copy.deepcopy(Necessary_Feature)
    best_mape = validation(url, feature)
    # Greedy forward feature selection (disabled); note that Load_Feature and Weather_Feature are not
    # defined above, so only the Time_Feature loop would run as written.
    # for f1i in Time_Feature:
    #     temp_features = copy.deepcopy(best_features)
    #     temp_features.append(f1i)
    #     temp_mape = validation(url, temp_features)
    #     if float(temp_mape) > float(best_mape):
    #         print('Best 1-MAPE so far:\t', best_mape)
    #         print('Current 1-MAPE:\t', temp_mape)
    #         best_features = copy.deepcopy(temp_features)
    #         best_mape = temp_mape
    #         print('Found a better feature: {}'.format(f1i))
    #     else:
    #         temp_features.pop()
    # for f2i in Load_Feature:
    #     temp_features = copy.deepcopy(best_features)
    #     temp_features.append(f2i)
    #     temp_mape = validation(url, temp_features)
    #     if float(temp_mape) > float(best_mape):
    #         print('Best 1-MAPE so far:\t', best_mape)
    #         print('Current 1-MAPE:\t', temp_mape)
    #         best_features = copy.deepcopy(temp_features)
    #         print('Found a better feature: {}'.format(f2i))
    #     else:
    #         temp_features.pop()
    #
    # for f3i in Weather_Feature:
    #     temp_features = copy.deepcopy(best_features)
    #     temp_features.append(f3i)
    #     temp_mape = validation(url, temp_features)
    #     if float(temp_mape) > float(best_mape):
    #         print('Best 1-MAPE so far:\t', best_mape)
    #         print('Current 1-MAPE:\t', temp_mape)
    #         best_features = copy.deepcopy(temp_features)
    #         print('Found a better feature: {}'.format(f3i))
    #     else:
    #         temp_features.pop()
    # print('Best feature set:', best_features)