一、项目介绍
1.1 项目需求
基于第一部分业务背景,为满足业务自主分析提高运营决策时效,现使用Python、数据仓库实现数据自动更新式可视化面板。
1.2 项目操作流程
二、数据处理
2.1 代码处理分析
2.1.1 聚合生成dw_order_by_day表
从mysql数据库中获取ods_sales_orders订单明细表并按日期分组聚合,得到每日销量和每日销售额。读取日期维度表,与上表合并,得到dw_order_by_day每日环比表。 关键代码如下(部分):
#导入相关模块
import pandas as pd
import pymysql
import random
from sqlalchemy import create_engine
pymysql.install_as_MySQLdb()
import datetime
import warnings
warnings.filterwarnings("ignore")
# 连接adventure_ods_new库
adventure_ods = create_engine('mysql://用户名:密码@106.12.180.221:3306/数据库?charset=gbk')
# 连接datafrog05_adventure_new库
adventure_dw = create_engine('mysql://用户名:密码@106.15.121.232:3306/数据库名?charset=gbk')
"""利用空列表及循环生成对应随机值,与销量订单聚合表合并形成sum_amount_order_goal(销量订单聚合目标表)"""
sum_amount_goal_list = []
sum_order_goal_list = []
# 转为list类型,遍历每个日期
create_date_list = list(sum_amount_order['create_date'])
for i in create_date_list:
# 生成一个在[0.85,1.1]随机数
a = random.uniform(0.85, 1.1)
b = random.uniform(0.85, 1.1)
# 对应日期下生成总金额(sum_amount)*a的列
amount_goal = list(sum_amount_order[sum_amount_order['create_date'] == i]['sum_amount'])[0] * a
# 对应日期下生成总订单数(sum_order)*b的列
order_goal = list(sum_amount_order[sum_amount_order['create_date'] == i]['sum_order'])[0] * b
# 将生成的目标值加入空列表
sum_amount_goal_list.append(amount_goal)
sum_order_goal_list.append(order_goal)
# 合并sum_amount_order表与刚生成的目标值列,形成sum_amount_order_goal表
sum_amount_order_goal = pd.concat([sum_amount_order, pd.DataFrame(
{'sum_amount_goal': sum_amount_goal_list, 'sum_order_goal':
sum_order_goal_list})], axis=1)
sum_amount_order_goal.head()
"""存储新的dw_order_by_day """
#增添环比值amount_diff
# pct_change()表示当前元素与先前元素的相差百分比,默认竖向,例:前面元素x,当前元素y,公式 result = (y-x)/x
dw_order_by_day['amount_diff'] = dw_order_by_day['sum_amount'].pct_change().fillna(0)
# 追加数据至dw_order_by_day
dw_order_by_day.to_sql('dw_order_by_day_jrh', con=adventure_dw,
if_exists='replace', index=False)
2.1.2 聚合生成dw_order_diff表
上一步中得到dw_order_by_day每日环比表,将当前时间维度和去年同期维度下相同字段进行聚合,在计算销售额,销售量,和客单价的同比,关键代码如下(部分):
"""求取各阶段的总金额"""
"""当天"""
# 当天的总金额
today_amount = dw_order_by_day[dw_order_by_day['is_today'] == 1]['sum_amount'].sum()
# 去年同期的日期维度
before_year_today = list(dw_order_by_day[dw_order_by_day['is_today'] == 1]
['create_date'] + datetime.timedelta(days=-365))
# 去年同期总金额
before_year_today_amount = dw_order_by_day[dw_order_by_day['create_date'].isin(before_year_today)]['sum_amount'].sum()
"""昨天"""
# 昨天的总金额
yesterday_amount = dw_order_by_day[dw_order_by_day['is_yesterday'] == 1]['sum_amount'].sum()
# 去年同期的日期维度
before_year_yesterday = list(dw_order_by_day[dw_order_by_day['is_yesterday'] == 1]
['create_date'] + datetime.timedelta(days=-365))
# 去年同期总金额
before_year_yesterday_amount = dw_order_by_day[dw_order_by_day['create_date'].isin(
before_year_yesterday)]['sum_amount'].sum()
"""当前月份"""
# 当月的总金额
month_amount = dw_order_by_day[dw_order_by_day['is_current_month'] == 1]['sum_amount'].sum()
# 去年同期的日期维度
before_year_month = list(dw_order_by_day[dw_order_by_day['is_current_month'] == 1]
['create_date'] + datetime.timedelta(days=-365))
# 去年同期总金额
before_year_month_amount = dw_order_by_day[dw_order_by_day['create_date'].isin(
before_year_month)]['sum_amount'].sum()
"""当前季度"""
# 当前季度的总金额
quarter_amount = dw_order_by_day[dw_order_by_day['is_current_quarter'] == 1]['sum_amount'].sum()
# 去年同期的日期维度
before_year_quarter = list(dw_order_by_day[dw_order_by_day['is_current_quarter'] == 1]
['create_date'] + datetime.timedelta(days=-365))
# 去年同期总金额
before_year_quarter_amount = dw_order_by_day[dw_order_by_day['create_date'].isin(
before_year_quarter)]['sum_amount'].sum()
"""当前年份"""
# 当年的总金额
year_amount = dw_order_by_day[dw_order_by_day['is_current_year'] == 1]['sum_amount'].sum()
# 去年同期的日期维度
before_year_year = list(dw_order_by_day[dw_order_by_day['is_current_year'] == 1]
['create_date'] + datetime.timedelta(days=-365))
# 去年同期总金额
before_year_year_amount = dw_order_by_day[dw_order_by_day['create_date'].isin(
before_year_year)]['sum_amount'].sum()
"""求取各阶段的总金额、订单数的同期对比数据"""
# 做符号简称,横向提取数据方便
amount_dict = {'today_diff': [today_amount / before_year_today_amount - 1,
today_order / before_year_today_order - 1,
(today_amount / today_order) / (before_year_today_amount /
before_year_today_order) - 1],
'yesterday_diff': [yesterday_amount / before_year_yesterday_amount - 1,
yesterday_order / before_year_yesterday_order - 1,
(yesterday_amount / yesterday_order) / (before_year_yesterday_amount /
before_year_yesterday_order) - 1],
'month_diff': [month_amount / before_year_month_amount - 1,
month_order / before_year_month_order - 1,
(month_amount / month_order) / (before_year_month_amount /
before_year_month_order) - 1],
'quarter_diff': [quarter_amount / before_year_quarter_amount - 1,
quarter_order / before_year_quarter_order - 1,
(quarter_amount / quarter_order) / (before_year_quarter_amount /
before_year_quarter_order) - 1],
'year_diff': [year_amount / before_year_year_amount - 1,
year_order / before_year_year_order - 1,
(year_amount / year_order) / (before_year_year_amount /
before_year_year_order) - 1],
'flag': ['amount', 'order', 'avg']}
dw_amount_diff = pd.DataFrame(amount_dict)
dw_amount_diff
"""存储新的dw_amount_diff表 """
dw_amount_diff.to_sql('dw_amount_diff_jrh', con=adventure_dw, if_exists='replace', index=False)
2.1.3 聚合生成update_sum_data表
将ods_sales_orders订单明细表和ods_customer连接,按照订单日期/产品名/产品子类/产品类别/所在区域/所在省份/所在城市分组聚合,再与日期维度表连接,得到sum_customer_order 关键代码如下(部分):
"""进行数据的聚合"""
sales_customer_order=pd.merge(ods_sales_orders,ods_customer,on='customer_key',how='left')
# 提取订单主键/订单日期/客户编号/产品名/产品子类/产品类别/产品单价/所在区域/所在省份/所在城市
sales_customer_order=sales_customer_order.groupby(['create_date','english_product_name','cpzl_zw',"cplb_zw",
'chinese_territory','chinese_province','chinese_city'],as_index=False).
agg({'sales_order_key':pd.Series.nunique,'customer_key':pd.Series.nunique,
'unit_price':'sum'}).rename(columns={'sales_order_key':'order_num','customer_key':'customer_num','unit_price':'sum_amount',
'english_product_name':'product_name'})
# 转化订单日期为字符型格式
sales_customer_order['create_date'] = sales_customer_order['create_date'].apply(lambda x: x.strftime('%Y-%m-%d'))
# 获取当日日期维度
dw_customer_order = pd.merge(sales_customer_order, dim_date_df, on='create_date', how='inner')
dw_customer_order.head()
"""step5:进行数据的存储"""
dw_customer_order.to_sql('dw_customer_order_jrh', con=adventure_dw,
if_exists='replace', index=False)
2.2 在MySQL追加索引
在mysql追加索引优化查询速度,类似于让搜索结果不需要遍历整个表,而是根据前缀索引一步步搜寻。
create index index_date on dw_order_by_day_hjf(create_date(8));
select * from dw_order_by_day_hjf where create_date='2019-02-11';
EXPLAIN select * from dw_order_by_day_hjf where create_date='2019-02-11';
2.3 在Linux上做定时部署
在服务器上部署代码,让其每日自动更新,实现方法分三步:
1.定时执行命令:import schedule
2.能够跳出命令窗口输入 python xxx.py 文件使其运行:import os 的os.system()
3.将这个文件挂在后台,等时间一到,自动执行:利用Linux系统自带的&挂后台
import schedule
import time
import datetime
import os
import requests
def job1():
"""
dw_order_by_day 每日环比表
"""
print('Job1:每天6:00执行一次')
print('Job1-startTime:%s' % (datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')))
os.system(
"/home/anaconda3/bin/python3 /home/frog005/adventure/dw_order_by_day_jrh.py >> /home/frog005/adventure_jrh/jrh_logs/dw_order_by_day_schedule.log 2>&1 &")
time.sleep(20)
print('Job1-endTime:%s' % (datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')))
print('------------------------------------------------------------------------')
def job2():
"""
dw_order_diff 同比数据表
"""
print('Job2:每天6:20执行一次')
print('Job2-startTime:%s' % (datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')))
os.system(
"/home/anaconda3/bin/python3 /home/frog005/adventure_jrh/dw_order_diff_jrh.py >> /home/frog005/adventure_jrh/jrh_logs/dw_order_diff_schedule.log 2>&1 &")
time.sleep(20)
print('Job3-endTime:%s' % (datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')))
print('------------------------------------------------------------------------')
def job3():
"""
update_sum_data 生成时间地区产品聚合表
"""
print('Job3:每天6:40执行一次')
print('Job3-startTime:%s' % (datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')))
os.system(
"/home/anaconda3/bin/python3 /home/frog005/adventure_jrh/update_sum_data_jrh.py >> /home/frog005/adventure_jrh/jrh_logs/update_sum_data_schedule.log 2>&1 &")
time.sleep(20)
print('Job4-endTime:%s' % (datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')))
print('------------------------------------------------------------------------')
if __name__ == '__main__':
schedule.every().day.at('06:00').do(job1)
schedule.every().day.at('06:20').do(job2)
schedule.every().day.at('06:40').do(job3)
while True:
schedule.run_pending()
time.sleep(10)
print("wait", datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'))
将schedule程序挂在后台执行,并输出日志:
nohup python3 schedule_job_jrh.py > schedule_job_jrh.log 2>&1 &
查看该schedule程序运行情况(验证是否顺利挂在后台)
ps aux| grep schedule_job_jrh.py
三、可视化看板搭建
power bi可视化链接
3.1 整体销售情况
3.2 最近21天销售趋势图
3.3 区域销售详情表