设计一张日期维度表
表名称:DIM_PUB_DATE
具体属性值:
字段名称 | 字段类型 | 字段描述 | 示例 |
ds | string | Id(主键) | 20210624 |
d_date | string | 日期 | 2021-06-24 |
d_datetime | string | 日期(包含时分秒) | 2021-06-24 00:00:00 |
y_ds | string | 去年id | 20200624(减12月) |
y_date | string | 去年日期 | 2020-06-24 |
y_datetime | string | 去年日期(包含时分秒) | 2020-06-24 00:00:00 |
d_year | string | 年份 | 2021 |
d_yearstartday | string | 年初日 | 2021-01-01 |
y_year | string | 去年份 | 2020 |
y_yearstartday | string | 去年初日 | 2020-01-01 |
d_month | string | 月份 | 2021-06 |
d_monthstartday | string | 月初日 | 2021-06-01 |
d_monthendday | string | 月尾日 | 2021-06-30 |
y_month | string | 去年月份 | 2020-06 |
y_monthstartday | string | 去年月初日 | 2020-06-01 |
y_monthendday | string | 去年月尾日 | 2020-06-30 |
d_quarter | string | 季度 | Q2 |
d_yearquarter | string | 季度(带年份) | 2021-2 |
week | string | 星期(数字表示) | 1,2,3,4,5,6,7:1代表星期一 |
d_week | string | 星期几 | 星期一 |
d_dayofyear | string | 当年第几天 | D100 |
d_dayofmonth | string | 当月第几天 | D24 |
d_dayofweek | string | 当周第几天 | D4 |
d_weekofmonth | string | 当月第几周 | W4 |
d_weekofyear | string | 当年第几周 | W24 |
is_weekend | string | 是否周末 | 1:是,2:否 |
day_type | string | 日历天类型 | 0:节假日,1:工作日,2:休息日 (调休也属于工作日) |
hol_name | string | 节假日名称 | 元旦,调休,春节,清明节,劳动节,端午节,中秋节,国庆节 |
生成方法:
使用python生成一张维度表,考虑到通用性,可以先写到Excel或者CSV里面,然后自定义写入到hive或者odps里面去。
Py3语法:
# -*- coding:utf-8 -*-
import csv
import json
from datetime import *
import requests
from dateutil.relativedelta import *
import calendar
import pandas as pd
from odps import ODPS
def getDate(ddate, holiday_list):
# 日期
d_date = datetime.strptime(ddate, "%Y-%m-%d").date()
# id(主键)
ds = str(d_date).replace('-', '')
# 日期时间(包含时分秒)
d_datetime = datetime(d_date.year,d_date.month,d_date.day)
# 去年今日
y_date = d_date + relativedelta(months=-12)
# 去年ds
y_ds = str(y_date).replace('-', '')
# 去年日期
y_datetime = datetime(y_date.year, y_date.month, y_date.day)
# 年份
d_year = str(d_date.year)
# 年初日
d_yearstartday = d_year+'-01-01'
# 年份
y_year = str(y_date.year)
# 去年初日
y_yearstartday = y_year + '-01-01'
# 月份
d_month = d_year+'-'+str(d_date).split('-')[1]
# 月初日
d_monthstartday = d_month + '-01'
# 月尾日
d_monthendday = d_month + '-' + str(calendar.monthrange(int(d_year), int(d_date.month))[1])
# 去年月份
y_month = y_year+'-'+str(y_date).split('-')[1]
# 去年月初日
y_monthstartday = y_month + '-01'
# 去年月尾日
y_monthendday = y_month + '-' + str(calendar.monthrange(int(y_year), int(y_date.month))[1])
# 季度
d_quarter = "Q"+str((int(d_date.month)-1) // 3 + 1)
# 星期几
week = d_date.strftime('%u')
# 星期
d_week = ''
if week == '1':
d_week = "星期一"
elif week == '2':
d_week = "星期二"
elif week == '3':
d_week = "星期三"
elif week == '4':
d_week = "星期四"
elif week == '5':
d_week = "星期五"
elif week == '6':
d_week = "星期六"
else:
d_week = "星期日"
# 季度带年份
d_yearquarter = y_year+'-'+str((int(d_date.month)-1) // 3 + 1)
# 当年第几天
d_dayofyear = d_date.strftime('%j')
# 当月第几天
d_dayofmonth = d_date.strftime('%d')
# 当周第几天:星期一为第一天 (值从1到7,星期一为1)
d_dayofweek = d_date.strftime('%u')
# 当月第几周:第一天只要不是星期一,都算一周
d_weekofmonth = (int(d_date.strftime('%W'))-int(date(d_date.year,d_date.month,1).strftime('%W')))+1
# 当年第几周:把星期一做为第一天(值从0到53)
d_weekofyear = int(d_date.strftime('%W'))+1
# 是否周末
is_weekend = ''
# 日期类型:0:节假日,1:工作日,2:休息日
day_type = ''
# 具体节假日名称(包含调休)
hol_name = ''
if d_dayofweek in ('6', '7'):
is_weekend = '1'
day_type = '2'
else:
is_weekend = '0'
day_type = '1'
# 法定节假日:0,工作日:1,休息日:2
for hol_json in holiday_list:
if str(d_date) == json.loads(hol_json)['date']:
# 是否节假日
if json.loads(hol_json)['flag'] != "调休":
day_type = '0'
else:
day_type = '1'
# 具体节假日(包含调休)
hol_name = json.loads(hol_json)['flag']
# 把数据添加到列表中
date_list = [ds, str(d_date), str(d_datetime), y_ds, str(y_date), str(y_datetime),
d_year, d_yearstartday, y_year, y_yearstartday,
d_month, d_monthstartday, d_monthendday, y_month,
y_monthstartday, y_monthendday, d_quarter, d_yearquarter, week,
d_week, 'D'+str(d_dayofyear), 'D'+str(d_dayofmonth), 'D'+str(d_dayofweek),
'W'+str(d_weekofmonth), 'W'+str(d_weekofyear), is_weekend, day_type, hol_name]
return date_list
# 获取节假日,转为需要的格式
def get_holiday_dict(start_year, y_cnt):
y_list = [start_year]
for i in range(1, y_cnt+1):
y_list.append(start_year+i)
hol_list = []
dict_res = {}
for year in y_list:
headers = {'User-Agent':"Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36"}
response = requests.get('https://timor.tech/api/holiday/year/'+str(year)+'/', headers=headers)
res = response.json()['holiday'].items()
for i in res:
# 日期
dict_res['date'] = i[1]['date']
# 标识节假日(详细)
dict_res['name'] = i[1]['name']
# 薪资倍数,3表示是3倍工资
dict_res['wage'] = i[1]['wage']
dict_res['holiday'] = i[1]['holiday']
# 标识节假日
if ("初" in str(i[1]['name'])) or ("除夕" in str(i[1]['name'])):
dict_res['flag'] = '春节'
elif "调休" in str(i[1]['name']):
dict_res['flag'] = '调休'
else:
dict_res['flag'] = i[1]['name']
hol_list.append(str(json.dumps(dict_res, ensure_ascii=False)))
print("节假日数据已经获取完毕!!!")
return hol_list
#获取生成指定日期范围的数据
def get_ste_day(time_start, time_end, holiday_list):
date_l = [datetime.strftime(x, '%Y-%m-%d') for x in list(pd.date_range(start=time_start, end=time_end))]
res_list = []
# 遍历日期,生成时间数据
for ddate in date_l:
date_list = getDate(ddate, holiday_list)
res_list.append(date_list)
# 返回结果数据
return res_list
# 把数据写入csv里面
def write_csv(res_list):
# 1. 创建文件对象
f = open('E:\\dim_pub_date.txt', 'w', encoding='utf-8', newline="")
# 2. 基于文件对象构建 csv写入对象
csv_writer = csv.writer(f)
# 3. 构建列表头
csv_writer.writerow(["ds", "d_date", "d_datetime", "y_ds", "y_date", "y_datetime",
"d_year", "d_yearstartday", "y_year", "y_yearstartday",
"d_month", "d_monthstartday", "d_monthendday", "y_month",
"y_monthstartday", "y_monthendday", "d_quarter", "d_yearquarter", "week",
"d_week", "d_dayofyear", "d_dayofmonth", "d_dayofweek",
"d_weekofmonth", "d_weekofyear", "is_weekend", "day_type", "hol_name"])
# 4. 写入csv文件内容
for ls in res_list:
if len(ls) != 0:
csv_writer.writerow(ls)
# 5. 关闭文件
print("数据写入csv完成!!!")
f.close()
# 把数据写入到odps里面
def write_odps(res_list):
# 获取odps环境
o = ODPS('xxx', 'xxx', 'DILI_FRESH',
endpoint='http://service.cn-beijing.maxcompute.aliyun.com/api')
# 写表
t = o.get_table("dim_pub_date")
with t.open_writer(partition="year=2018", create_partition=True) as writer:
writer.write(res_list)
print("数据写入odps完成!!!")
# 把数据写入到hive里面
def write_hive(res_list):
pass
if __name__ == '__main__':
# 初始时间和结束时间
time_start = date(2018, 1, 1)
time_end = date(2018, 12, 31)
# 节假日年份
y_cnt = time_end.year-time_start.year
# 获取节假日信息
holiday_list = get_holiday_dict(time_start.year, y_cnt)
# 生成指定的时间范围数据
res_list = get_ste_day(time_start, time_end, holiday_list)
# 把数据写入到csv文件中
write_csv(res_list)
# 把数据写入到odps中
write_odps(res_list)
# 把数据写入到hive中
write_hive(res_list)