文章目录

  • 先来看一下需求
  • 直接上代码,需要的自行修改
  • 首先需要获取断开的时间,以及要往前插的时间
  • 将日期转为数字,便于插值
  • 对断开的数据进行三次样条插值,对其余的往前取均值插
  • 插值完成
  • 全部代码

先来看一下需求

有一个时间序列数据,例如如下:

# Sample daily series: no entries exist for 2021-05-14 and 2021-05-15.
x = ['2021-05-10', '2021-05-11', '2021-05-12', '2021-05-13', '2021-05-16']
y = [3.4783, 1.25, 1.1111, 1.1111, 2.26]

# Draw the raw observations as markers joined by a line.
plt.scatter(x, y)
plt.plot(x, y)
plt.show()

(图:原始数据的散点图与折线图,2021-05-13 与 2021-05-16 之间缺少两天)

需求:对时间序列进行插值——中间断开(缺失)的日期用插值补齐;如果补齐后仍不足 10 个点,就在序列最前面继续往前补点。

例如上图结果可以是:

['2021-05-07',
 '2021-05-08',
 '2021-05-09',
 '2021-05-10',
 '2021-05-11',
 '2021-05-12',
 '2021-05-13',
 '2021-05-14',
 '2021-05-15',
 '2021-05-16']

直接上代码,需要的自行修改

import numpy as np
from scipy import interpolate
import matplotlib.pyplot as plt
import time, datetime
from datetime import datetime, date, timedelta
import math

# Sample daily series: no entries exist for 2021-05-14 and 2021-05-15.
x = ['2021-05-10', '2021-05-11', '2021-05-12', '2021-05-13', '2021-05-16']
y = [3.4783, 1.25, 1.1111, 1.1111, 2.26]

# Draw the raw observations as markers joined by a line.
plt.scatter(x, y)
plt.plot(x, y)
plt.show()

首先需要获取断开的时间,以及要往前插的时间

# Goal of this step (only applied when the series has fewer than 10 points):
#   - list the calendar days missing inside the series (interpolated later);
#   - list the days to add in front of the series (filled with means later).
# The dates are mapped to integers in a later step; those integers are used
# both to fit and to evaluate the interpolator.
currdate = '2021-5-18'  # assumed "current" date; not used by this step
lxs = len(x)

insert_x = []  # days missing between two consecutive observations
mean_x = []    # days placed before the first observation
isx = 0
flag = 10
if lxs < 10:
    # Compare each observation with the next one and record every skipped day.
    for i in range(len(x) - 1):
        day_a = datetime.strptime(x[i], "%Y-%m-%d")
        day_b = datetime.strptime(x[i + 1], "%Y-%m-%d")
        differ = (day_b - day_a).days
        # `> 1` instead of `!= 1`: a repeated or out-of-order date yields
        # differ <= 0, which can never count down to 1 and would loop forever.
        while differ > 1:
            differ -= 1
            insert_x.append((day_b - timedelta(days=differ)).strftime("%Y-%m-%d"))
    isx = len(insert_x)
    tos = isx + lxs

    # When fewer than floor(lxs / 2 + 1) days were missing, add that many days
    # in front of the series. They are later filled with means of the leading
    # observations (first added day: mean of the first 2, second: first 3, ...).
    # The `lxs` guard avoids reading x[0] from an empty series.
    if lxs and tos < math.floor(lxs / 2 + 1) + lxs:
        flag = math.floor(lxs / 2 + 1)
        diffs = flag
        first_day = datetime.strptime(x[0], "%Y-%m-%d")
        while diffs != 0:
            mean_x.append((first_day - timedelta(days=diffs)).strftime("%Y-%m-%d"))
            diffs -= 1

print(insert_x)
print(mean_x)
['2021-05-14', '2021-05-15']
['2021-05-07', '2021-05-08', '2021-05-09']

将日期转为数字,便于插值

# Combine observed, missing and prepended days into one list in ascending
# string order (which is date order for zero-padded "YYYY-MM-DD" strings).
newxlist = sorted(x + insert_x + mean_x)
print(newxlist)
['2021-05-07', '2021-05-08', '2021-05-09', '2021-05-10', '2021-05-11', '2021-05-12', '2021-05-13', '2021-05-14', '2021-05-15', '2021-05-16']
# Number every date 1..N in list order and sort those positions into three
# groups: observed, missing inside the series, and prepended.
xdict = {}       # date string -> 1-based position in newxlist
resx_dict = {}   # date string -> value; only observed values at this point
x_list, x_i_list, x_m_list = [], [], []
j = 0            # index of the next unused entry of y
for i, day in enumerate(newxlist):
    position = i + 1
    xdict[day] = position
    if day in x:
        x_list.append(position)
        resx_dict[day] = y[j]
        j += 1
    elif day in insert_x:
        x_i_list.append(position)
    elif day in mean_x:
        x_m_list.append(position)

for shown in (xdict, x_list, x_i_list, x_m_list):
    print(shown)

print(resx_dict)
{'2021-05-07': 1, '2021-05-08': 2, '2021-05-09': 3, '2021-05-10': 4, '2021-05-11': 5, '2021-05-12': 6, '2021-05-13': 7, '2021-05-14': 8, '2021-05-15': 9, '2021-05-16': 10}
[4, 5, 6, 7, 10]
[8, 9]
[1, 2, 3]
{'2021-05-10': 3.4783, '2021-05-11': 1.25, '2021-05-12': 1.1111, '2021-05-13': 1.1111, '2021-05-16': 2.26}

对断开的数据进行三次样条插值,对其余的往前取均值插

# Inverse of xdict (position -> date string), built once instead of scanning
# the whole dict for every position.
pos_to_day = {pos: day for day, pos in xdict.items()}

# Spline fill for the days missing inside the series.
# The interpolator is built only when it is needed: a cubic spline needs at
# least 4 observations, so building it unconditionally fails on short series
# that have nothing to interpolate. With only 2 or 3 observations the order
# is lowered to linear / quadratic.
Flinear = None
if x_i_list:
    spline_kind = {2: 'linear', 3: 'quadratic'}.get(len(x_list), 'cubic')
    Flinear = interpolate.interp1d(x_list, y, kind=spline_kind)
    ynew = np.array(Flinear(x_i_list)).tolist()
    # Round to 4 decimals and keep the magnitude, so no negative value is stored.
    ynew = [abs(round(xi, 4)) for xi in ynew]
    for pos, val in zip(x_i_list, ynew):
        resx_dict[pos_to_day[pos]] = val

# Running-mean fill for the prepended days: the day at position p receives the
# mean of the first p + 1 observed values.
for pos in x_m_list:
    window = y[:pos + 1]
    # Divide by the real window length: the slice stops at len(y), so dividing
    # by pos + 1 would understate the mean when the series is very short.
    resx_dict[pos_to_day[pos]] = round(sum(window) / len(window), 4)

# Re-order the mapping by date string before printing.
resx_dict = dict(sorted(resx_dict.items(), key=lambda item: item[0]))
print(resx_dict)
{'2021-05-07': 2.3641, '2021-05-08': 1.9465, '2021-05-09': 1.7376, '2021-05-10': 3.4783, '2021-05-11': 1.25, '2021-05-12': 1.1111, '2021-05-13': 1.1111, '2021-05-14': 1.1014, '2021-05-15': 1.3837, '2021-05-16': 2.26}

插值完成

# Split the date -> value mapping into two parallel lists for plotting.
resx_list = list(resx_dict.keys())
resy_list = list(resx_dict.values())

# Markers plus connecting line for the completed series.
plt.scatter(resx_list, resy_list)
plt.plot(resx_list, resy_list)
plt.show()

(图:插值完成后的散点图与折线图)

全部代码

# Interpolation
def interpolation_value(data_gp):
    """Fill the gaps of a short daily series and extend it backwards.

    Dates are read from the ``date_time`` column and values from the
    ``ecpm_tomorrow`` column.

    A group with 10 or more rows is returned as is, ordered by ``date_time``
    (a repeated date keeps its last value); its dates are not parsed.

    A group with fewer than 10 rows is processed as follows (its dates must
    be ``"%Y-%m-%d"`` strings; rows may come in any order and a repeated
    date keeps its last value):

    * every calendar day missing between two observations is estimated with
      a spline through the observations (cubic; quadratic or linear when
      only 3 or 2 observations exist), rounded to 4 decimals, and stored as
      an absolute value;
    * with ``n`` observations, if fewer than ``n // 2 + 1`` days were
      missing, ``n // 2 + 1`` days are added before the first observation.
      The k-th added day (earliest first) receives the mean of the first
      ``k + 1`` observed values (all of them if fewer exist), rounded to
      4 decimals.

    :param data_gp: DataFrame holding the rows of one id group
    :return: DataFrame with columns ``date_time`` and ``ecpm_tomorrow``,
        ordered by date; empty (same columns) for an empty group
    :raises ValueError: if a short group holds a date string that does not
        match ``"%Y-%m-%d"``
    """
    # Local import: the import block of this file does not bring `pd` into scope.
    import pandas as pd

    date_format = "%Y-%m-%d"
    short_series_limit = 10  # only groups with fewer rows than this are filled

    x = data_gp['date_time'].values.tolist()
    y = data_gp['ecpm_tomorrow'].values.tolist()

    if len(x) >= short_series_limit:
        # Long enough: nothing is added. Pair each date with its own value
        # before ordering, so unsorted rows keep the right values.
        paired = dict(zip(x, y))
        ordered_labels = sorted(paired)
        return pd.DataFrame({
            'date_time': ordered_labels,
            'ecpm_tomorrow': [paired[label] for label in ordered_labels],
        })

    # Key the observations by parsed date so they can be ordered
    # chronologically whatever the row order was.
    observed = {}
    labels = {}  # parsed date -> date string as given in the input
    for label, value in zip(x, y):
        day = datetime.strptime(label, date_format)
        observed[day] = value
        labels[day] = label
    days = sorted(observed)
    values = [observed[day] for day in days]
    count = len(days)

    # Days missing between consecutive observations. Using range() instead of
    # counting down to exactly 1 means a zero-day difference cannot loop forever.
    gap_days = []
    for earlier, later in zip(days, days[1:]):
        missing = (later - earlier).days - 1
        gap_days.extend(
            earlier + timedelta(days=step) for step in range(1, missing + 1)
        )

    # Days to add before the first observation, earliest first.
    pad_days = []
    pad = count // 2 + 1
    if count and len(gap_days) < pad:
        pad_days = [days[0] - timedelta(days=back) for back in range(pad, 0, -1)]

    filled = dict(observed)

    # Spline fill. The interpolator is built only when there is a gap, and its
    # order is lowered when there are too few observations for a cubic spline.
    if gap_days:
        origin = days[0]
        known_pos = [(day - origin).days for day in days]
        query_pos = [(day - origin).days for day in gap_days]
        kind = {2: 'linear', 3: 'quadratic'}.get(count, 'cubic')
        spline = interpolate.interp1d(known_pos, values, kind=kind)
        estimates = np.array(spline(query_pos)).tolist()
        for day, estimate in zip(gap_days, estimates):
            # Same post-processing as before: 4 decimals, magnitude only.
            filled[day] = abs(round(estimate, 4))

    # Running-mean fill. The slice stops at the end of `values`, so the mean
    # is taken over the observations that really exist.
    for rank, day in enumerate(pad_days, start=1):
        window = values[:rank + 1]
        filled[day] = round(sum(window) / len(window), 4)

    ordered_days = sorted(filled)
    return pd.DataFrame({
        'date_time': [
            labels[day] if day in labels else day.strftime(date_format)
            for day in ordered_days
        ],
        'ecpm_tomorrow': [filled[day] for day in ordered_days],
    })

有关 scipy.interpolate 各种插值方法的 demo,可以参考如下两篇文章