文章目录
- 先来看一下需求
- 直接上代码,需要的自行修改
- 首先需要获取断开的时间,以及要往前插的时间
- 将日期转为数字,便于插值
- 对断开的数据进行三次样条插值,对其余的往前取均值插
- 插值完成
- 全部代码
先来看一下需求
有一个时间序列数据,例如如下:
# Sample daily series; 2021-05-14 and 2021-05-15 are absent between the last two dates.
x = ['2021-05-10', '2021-05-11', '2021-05-12', '2021-05-13', '2021-05-16']
y = [3.4783, 1.25, 1.1111, 1.1111, 2.26]
# Draw the observations as markers, then connect them with a line.
plt.scatter(x, y)
plt.plot(x, y)
plt.show()
需求:对时间序列进行插值——日期中间断开(缺失)的部分用插值补齐;如果数据点不足 10 个,再从第一个日期往前补点。
例如上图结果可以是:
['2021-05-07',
'2021-05-08',
'2021-05-09',
'2021-05-10',
'2021-05-11',
'2021-05-12',
'2021-05-13',
'2021-05-14',
'2021-05-15',
'2021-05-16']
直接上代码,需要的自行修改
import numpy as np
from scipy import interpolate
import matplotlib.pyplot as plt
import time, datetime
from datetime import datetime, date, timedelta
import math
# Sample daily series used by the rest of the script; the dates jump from
# 2021-05-13 to 2021-05-16, so two days are missing.
x = ['2021-05-10', '2021-05-11', '2021-05-12', '2021-05-13', '2021-05-16']
y = [3.4783, 1.25, 1.1111, 1.1111, 2.26]
# Show the raw observations (markers plus connecting line) before any filling.
plt.scatter(x, y)
plt.plot(x, y)
plt.show()
首先需要获取断开的时间,以及要往前插的时间
# Plan for this step:
#   - interpolate the series up to 10 points
#   - interpolate the dates missing in the middle of the series
#   - turn every date into a number, remembering which date each number stands for
#   - fit the interpolator on numbers and evaluate it on numbers
#   - dates added in front of the series get a mean value instead
# Result of this step: the new date lists (insert_x, mean_x).
currdate = '2021-5-18'  # assumed "current" date
lxs = len(x)
insert_x = []  # dates missing between two neighbouring observations
mean_x = []    # dates to add in front of the first observation
isx = 0
flag = 10
if 0 < lxs < 10:
    # Walk over neighbouring observations and list every calendar day between them.
    for i in range(lxs - 1):
        t1 = int(time.mktime(time.strptime(x[i], "%Y-%m-%d")))
        t2 = int(time.mktime(time.strptime(x[i + 1], "%Y-%m-%d")))
        differ = (datetime.fromtimestamp(t2) - datetime.fromtimestamp(t1)).days
        # `> 1` instead of `!= 1`: equal or out-of-order neighbours give a
        # difference of 0 or less, which would otherwise never reach 1.
        while differ > 1:
            differ -= 1
            tmp = (datetime.fromtimestamp(t2) + timedelta(days=-differ)).strftime("%Y-%m-%d")
            insert_x.append(tmp)
    isx = len(insert_x)
    tos = isx + lxs
    # When the gaps alone add fewer than floor(lxs / 2 + 1) points, put that
    # many extra days in front of the first observation (their values are
    # filled later with means of the leading observations).
    if tos < math.floor(lxs / 2 + 1) + lxs:
        flag = math.floor(lxs / 2 + 1)
        diffs = flag
        timx0 = int(time.mktime(time.strptime(x[0], "%Y-%m-%d")))
        while diffs != 0:
            tmp = (datetime.fromtimestamp(timx0) + timedelta(days=-diffs)).strftime("%Y-%m-%d")
            mean_x.append(tmp)
            diffs -= 1
print(insert_x)
print(mean_x)
['2021-05-14', '2021-05-15']
['2021-05-07', '2021-05-08', '2021-05-09']
将日期转为数字,便于插值
# One timeline holding observed, gap and prepended dates, sorted as strings
# (zero-padded YYYY-MM-DD strings sort in calendar order).
newxlist = sorted(x + insert_x + mean_x)
print(newxlist)
['2021-05-07', '2021-05-08', '2021-05-09', '2021-05-10', '2021-05-11', '2021-05-12', '2021-05-13', '2021-05-14', '2021-05-15', '2021-05-16']
xdict = {}      # date -> 1-based position on the merged timeline
resx_dict = {}  # date -> value; starts with the observed points only
x_list = []     # positions of the observed dates
x_i_list = []   # positions of the dates missing inside the series
x_m_list = []   # positions of the dates added in front of the series
j = 0           # index of the next observed value in y
for position, day in enumerate(newxlist, start=1):
    xdict[day] = position
    if day in x:
        x_list.append(position)
        resx_dict[day] = y[j]
        j += 1
    elif day in insert_x:
        x_i_list.append(position)
    elif day in mean_x:
        x_m_list.append(position)
print(xdict)
print(x_list)
print(x_i_list)
print(x_m_list)
print(resx_dict)
{'2021-05-07': 1, '2021-05-08': 2, '2021-05-09': 3, '2021-05-10': 4, '2021-05-11': 5, '2021-05-12': 6, '2021-05-13': 7, '2021-05-14': 8, '2021-05-15': 9, '2021-05-16': 10}
[4, 5, 6, 7, 10]
[8, 9]
[1, 2, 3]
{'2021-05-10': 3.4783, '2021-05-11': 1.25, '2021-05-12': 1.1111, '2021-05-13': 1.1111, '2021-05-16': 2.26}
对断开的数据进行三次样条插值,对其余的往前取均值插
# Reverse lookup (position -> date) so each computed value can be stored
# under its date without scanning xdict for every point.
date_at = {position: day for day, position in xdict.items()}

# Dates missing inside the series: spline through the observed points.
if x_i_list:
    # kind='cubic' needs at least 4 observations; use linear below that.
    # The interpolator is only built when there is something to interpolate.
    Flinear = interpolate.interp1d(
        x_list, y, kind='cubic' if len(x_list) >= 4 else 'linear'
    )
    ynew = np.array(Flinear(x_i_list)).tolist()
    # Round to 4 decimals and take the absolute value of each estimate.
    ynew = [abs(round(xi, 4)) for xi in ynew]
    for position, estimate in zip(x_i_list, ynew):
        resx_dict[date_at[position]] = estimate

# Dates added in front of the series: the date at position p gets the mean of
# the first p + 1 observed values.
for position in x_m_list:
    window = y[:position + 1]
    # Divide by the number of values actually summed; the slice is shorter
    # than position + 1 when there are fewer observations than that.
    resx_dict[date_at[position]] = round(sum(window) / len(window), 4)

# Order the result by date.
resx_dict = dict(sorted(resx_dict.items(), key=lambda item: item[0]))
print(resx_dict)
{'2021-05-10': 3.4783, '2021-05-11': 1.25, '2021-05-12': 1.1111, '2021-05-13': 1.1111, '2021-05-16': 2.26, '2021-05-14': 1.1014, '2021-05-15': 1.3837, '2021-05-07': 2.3641, '2021-05-08': 1.9465, '2021-05-09': 1.7376}
插值完成
# Split the date -> value mapping into parallel lists (same order as the dict).
resx_list = list(resx_dict.keys())
resy_list = list(resx_dict.values())
# Plot the filled series: markers first, then the connecting line.
plt.scatter(resx_list, resy_list)
plt.plot(resx_list, resy_list)
plt.show()
全部代码
# 插值
def interpolation_value(data_gp):
    """Fill the gaps of a short daily series and pad it at the front.

    The input is read from the ``date_time`` column (strings in
    ``%Y-%m-%d`` format) and the ``ecpm_tomorrow`` column. Rows are ordered
    by date before processing; if a date occurs more than once, the last
    row for that date is used.

    When there are fewer than 10 distinct dates:

    * every calendar day missing between two neighbouring dates is filled
      from a spline through the observed points (cubic when at least 4
      points are available, linear otherwise); each estimate is rounded to
      4 decimals and its absolute value is stored;
    * if fewer than ``n // 2 + 1`` days were missing (``n`` = number of
      observed dates), that many days are added in front of the first
      date. The added day at 1-based position ``p`` of the final timeline
      gets the mean of the first ``p + 1`` observed values, rounded to
      4 decimals.

    With 10 or more dates the observations are returned sorted by date and
    otherwise unchanged.

    :param data_gp: DataFrame for one group, with ``date_time`` and
        ``ecpm_tomorrow`` columns
    :return: new DataFrame with ``date_time`` and ``ecpm_tomorrow`` columns,
        ordered by date; empty when ``data_gp`` has no rows
    :raises ValueError: if a ``date_time`` value does not match ``%Y-%m-%d``
    """
    # Imported here because the import section visible in this file does not
    # bring pandas into scope.
    import pandas as pd

    date_format = "%Y-%m-%d"
    raw_dates = data_gp['date_time'].values.tolist()
    raw_values = data_gp['ecpm_tomorrow'].values.tolist()
    if not raw_dates:
        return pd.DataFrame({'date_time': [], 'ecpm_tomorrow': []})

    # Key everything by the parsed date so ordering and day arithmetic do not
    # depend on how the input rows were ordered or written.
    labels = {}  # parsed date -> original string, reused in the output
    known = {}   # parsed date -> observed value
    for text, value in zip(raw_dates, raw_values):
        day = datetime.strptime(text, date_format)
        labels[day] = text
        known[day] = value
    days = sorted(known)
    y = [known[day] for day in days]
    n_known = len(days)

    gap_days = []   # days missing between two neighbouring observations
    lead_days = []  # days added in front of the first observation
    if n_known < 10:
        for earlier, later in zip(days, days[1:]):
            missing = (later - earlier).days - 1
            gap_days.extend(
                earlier + timedelta(days=step) for step in range(1, missing + 1)
            )
        lead_count = n_known // 2 + 1
        if len(gap_days) < lead_count:
            lead_days = [
                days[0] - timedelta(days=back) for back in range(lead_count, 0, -1)
            ]

    # Number the merged timeline 1..N; the interpolator works on these numbers.
    timeline = sorted(days + gap_days + lead_days)
    position = {day: index for index, day in enumerate(timeline, start=1)}
    result = dict(known)

    if gap_days:
        # kind='cubic' needs at least 4 points; a gap implies at least 2, which
        # is enough for linear interpolation.
        spline = interpolate.interp1d(
            [position[day] for day in days],
            y,
            kind='cubic' if n_known >= 4 else 'linear',
        )
        estimates = np.asarray(spline([position[day] for day in gap_days])).tolist()
        for day, estimate in zip(gap_days, estimates):
            result[day] = abs(round(estimate, 4))

    for day in lead_days:
        window = y[:position[day] + 1]
        # Divide by the number of values actually summed: the slice is shorter
        # than position + 1 when there are fewer observations than that.
        result[day] = round(sum(window) / len(window), 4)

    ordered = sorted(result)
    return pd.DataFrame({
        'date_time': [labels.get(day, day.strftime(date_format)) for day in ordered],
        'ecpm_tomorrow': [result[day] for day in ordered],
    })
有关 scipy interpolate 插值方法的 demo,可以参考如下两篇文章