业务数据保存在mysql中,定期用Sqoop导入到HDFS的ODS层,DWD层的业务数据进行简单的数据清洗并降维(退化维度)

需求1:求GMV成交总额

  • 思路:在ADS层建每日GMV总和表ads_gmv_sum_day
drop table if exists ads_gmv_sum_day;
create table ads_gmv_sum_day( 
    `dt` string COMMENT '统计日期',
    `gmv_count`  bigint COMMENT '当日gmv订单个数',
    `gmv_amount`  decimal(16,2) COMMENT '当日gmv订单总金额',
    `gmv_payment`  decimal(16,2) COMMENT '当日支付金额' 
) COMMENT '每日活跃用户数量'
row format delimited  fields terminated by '\t' 
location '/warehouse/gmall/ads/ads_gmv_sum_day/';
  • 从用户行为宽表导入数据到每日GMV表
insert into table ads_gmv_sum_day 
select 
    '2019-02-10' dt ,
    sum(order_count)  gmv_count ,
    sum(order_amount) gmv_amount ,
    sum(payment_amount) payment_amount 
from dws_user_action 
where dt ='2019-02-10'
group by dt;

需求2:转化率

在统计分析指标中,经常会提及转化率,但实际上转化率的定义,各个行业各个公司有各自的口径。具体的转化率是什么,取决于你的转化目标。
最常见的口径:指实际下单的用户在单日总活跃用户中的比例。即单日消费用户数/单日日活数量。其他比如:
新访问用户转化率=单日新访问设备数/日活数新注册用户转化率=单日新注册用户数/日活数新付费用户转化率=单日新付费用户数/日活数

ADS层之新增用户占日活跃用户比率

  • 建表ads_user_convert_day
drop table if exists ads_user_convert_day;
create   table ads_user_convert_day( 
    `dt` string COMMENT '统计日期',
    `uv_m_count`  bigint COMMENT '当日活跃设备',
    `new_m_count`  bigint COMMENT '当日新增设备',
    `new_m_ratio`   decimal(10,2) COMMENT '当日新增占日活的比率'
) COMMENT '每日活跃用户数量'
row format delimited  fields terminated by '\t' 
location '/warehouse/gmall/ads/ads_user_convert_day/';
  • 数据导入
insert into table ads_user_convert_day 
select
  '2019-02-10',
  sum(uc.dc) sum_dc,
  sum(uc.nmc) sum_nmc,
  cast(sum(uc.nmc)/sum(uc.dc)*100 as decimal(10,2)) new_m_ratio
from(
  select 
        day_count dc,
        0 nmc
  from ads_uv_count
  where  dt='2019-02-10'
  union all
  select  
        0 dc,
        new_mid_count nmc
  from ads_new_mid_count  
  where create_date='2019-02-10'
) uc;

需求3:ADS层之用户行为漏斗分析

 
业务数据仓库指标分析_数据仓库
漏斗分析
  • 建表ads_user_action_convert_day
drop table if exists ads_user_action_convert_day;
create table ads_user_action_convert_day(
   `dt` string COMMENT '统计日期',
    `total_visitor_m_count`  bigint COMMENT '总访问人数',
    `order_u_count` bigint     COMMENT '下单人数', 
    `visitor2order_convert_ratio`  decimal(10,2) COMMENT '访问到下单转化率', 
    `payment_u_count` bigint     COMMENT '支付人数',
    `order2payment_convert_ratio` decimal(10,2) COMMENT '下单到支付的转化率' 
) COMMENT '每日用户行为转化率统计'
row format delimited fields terminated by '\t'
location '/warehouse/gmall/ads/ads_user_convert_day/' ;
  • 数据导入
insert into table ads_user_action_convert_day
select 
    '2019-02-10',
    uv.day_count,
    ua.order_count,
    cast(ua.order_count/uv.day_count*100 as  decimal(10,2)) visitor2order_convert_ratio, 
    ua.payment_count,
    cast(ua.payment_count/ua.order_count*100 as  decimal(10,2)) order2payment_convert_ratio
from  
(
    select 
        sum(if(order_count>0,1,0)) order_count,
        sum(if(payment_count>0,1,0)) payment_count
    from dws_user_action  
    where  dt='2019-02-10' 
)ua, ads_uv_count  uv 
where  uv.dt='2019-02-10' ;

需求4:品牌复购率

  • 具体要求实现:以月为单位统计,购买两次以上商品的用户


     
    业务数据仓库指标分析_数据仓库_02
    品牌复购率实现
  • 创建DWS层用户购买商品明细表(宽表)
drop table if exists  dws_sale_detail_daycount;
create external table  dws_sale_detail_daycount
(   user_id   string  comment '用户 id',
    sku_id    string comment '商品 Id',
    user_gender  string comment '用户性别',
    user_age string  comment '用户年龄',
    user_level string comment '用户等级',
    order_price decimal(10,2) comment '订单价格',
    sku_name string   comment '商品名称',
    sku_tm_id string   comment '品牌id',
    sku_category3_id string comment '商品三级品类id',
    sku_category2_id string comment '商品二级品类id',
    sku_category1_id string comment '商品一级品类id',
    sku_category3_name string comment '商品三级品类名称',
    sku_category2_name string comment '商品二级品类名称',
    sku_category1_name string comment '商品一级品类名称',
    spu_id  string comment '商品 spu',
    sku_num  int comment '购买个数',
    order_count string comment '当日下单单数',
    order_amount string comment '当日下单金额'
) COMMENT '用户购买商品明细表'
PARTITIONED BY ( `dt` string)
stored as  parquet 
location '/warehouse/gmall/dws/dws_user_sale_detail_daycount/'
tblproperties ("parquet.compression"="snappy");
  • 导入数据
with
tmp_detail as
(
    select 
        user_id,
        sku_id, 
        sum(sku_num) sku_num ,--当日某个用户下单某一个SKU商品总数量(个人注释)
        count(*) order_count , --当日某个用户下单某一个SKU商品次数(个人注释)
        sum(od.order_price*sku_num)  order_amount --总金额
    from ods_order_detail od
    where od.dt='2019-02-10' and user_id is not null
    group by user_id, sku_id
)  
insert overwrite table  dws_sale_detail_daycount partition(dt='2019-02-10')
select 
    tmp_detail.user_id,--用户id
    tmp_detail.sku_id,--商品id
    u.gender, --用户性别
    months_between('2019-02-10', u.birthday)/12  age, --用户年龄
    u.user_level,--用户等级
    price,--商品价格
    sku_name,--商品名字
    tm_id,--品牌id
    category3_id ,  
    category2_id ,  
    category1_id ,  
    category3_name ,  
    category2_name ,  
    category1_name ,  
    spu_id,
    tmp_detail.sku_num,
    tmp_detail.order_count,
    tmp_detail.order_amount 
from tmp_detail 
left join dwd_user_info u on u.id=tmp_detail.user_id  and u.dt='2019-02-10'
left join dwd_sku_info s on tmp_detail.sku_id =s.id  and s.dt='2019-02-10';
  • ADS层品牌复购率分析报表
  • 品牌复购率分析报表建表语句
drop  table ads_sale_tm_category1_stat_mn;
create  table ads_sale_tm_category1_stat_mn
(   
    tm_id string comment '品牌id ' ,
    category1_id string comment '1级品类id ',
    category1_name string comment '1级品类名称 ',
    buycount   bigint comment  '购买人数',
    buy_twice_last bigint  comment '两次以上购买人数',
    buy_twice_last_ratio decimal(10,2)  comment  '单次复购率', 
    buy_3times_last   bigint comment   '三次以上购买人数',
    buy_3times_last_ratio decimal(10,2)  comment  '多次复购率' ,
    stat_mn string comment '统计月份',
    stat_date string comment '统计日期' 
)   COMMENT '复购率统计'
row format delimited  fields terminated by '\t' 
location '/warehouse/gmall/ads/ads_sale_tm_category1_stat_mn/';

ADS层品牌复购率报表数据导入

insert into table ads_sale_tm_category1_stat_mn
select   
    mn.sku_tm_id,
    mn.sku_category1_id,
    mn.sku_category1_name,
    sum(if(mn.order_count>=1,1,0)) buycount,
    sum(if(mn.order_count>=2,1,0)) buyTwiceLast,
    sum(if(mn.order_count>=2,1,0))/sum( if(mn.order_count>=1,1,0)) buyTwiceLastRatio,
    sum(if(mn.order_count>3,1,0))  buy3timeLast  ,
    sum(if(mn.order_count>=3,1,0))/sum( if(mn.order_count>=1,1,0)) buy3timeLastRatio ,
    date_format('2019-02-10' ,'yyyy-MM') stat_mn,
    '2019-02-10' stat_date
from 
(     
    select od.sku_tm_id, 
        od.sku_category1_id,
        od.sku_category1_name,  
        user_id , 
        sum(order_count) order_count
    from  dws_sale_detail_daycount  od 
    where 
        date_format(dt,'yyyy-MM')<=date_format('2019-02-10' ,'yyyy-MM')
    group by 
        od.sku_tm_id, od.sku_category1_id, user_id, od.sku_category1_name
) mn
group by mn.sku_tm_id, mn.sku_category1_id, mn.sku_category1_name;
  • 最后将品牌复购率结果输出到MySQL
    1.在MySQL中创建ads_sale_tm_category1_stat_mn表
create  table ads_sale_tm_category1_stat_mn
(   
    tm_id varchar(200) comment '品牌id ' ,
    category1_id varchar(200) comment '1级品类id ',
    category1_name varchar(200) comment '1级品类名称 ',
    buycount   varchar(200) comment  '购买人数',
    buy_twice_last varchar(200) comment '两次以上购买人数',
    buy_twice_last_ratio varchar(200) comment  '单次复购率', 
    buy_3times_last   varchar(200) comment   '三次以上购买人数',
    buy_3times_last_ratio varchar(200)  comment  '多次复购率' ,
    stat_mn varchar(200) comment '统计月份',
    stat_date varchar(200) comment '统计日期' 
)

2.编写Sqoop导出脚本

#!/bin/bash

db_name=gmall

export_data() {
/opt/module/sqoop/bin/sqoop export \
--connect "jdbc:mysql://hadoop102:3306/${db_name}?useUnicode=true&characterEncoding=utf-8"  \
--username root \
--password 000000 \
--table $1 \
--num-mappers 1 \
--export-dir /warehouse/$db_name/ads/$1 \
--input-fields-terminated-by "\t"  \
--update-key "tm_id,category1_id,stat_mn,stat_date" \
--update-mode allowinsert \
#sqoop避免空值
--input-null-string '\\N'    \
--input-null-non-string '\\N'  
}

case $1 in
  "ads_sale_tm_category1_stat_mn")
     export_data "ads_sale_tm_category1_stat_mn"
;;
   "all")
     export_data "ads_sale_tm_category1_stat_mn"
;;
esac

关于导出update还是insert的问题
--update-mode
参数 :
updateonly 只更新,无法插入新数据
allowinsert 允许新增
--update-key
允许更新的情况下,指定哪些字段匹配视为同一条数据,进行更新而不增加。多个字段用逗号分隔。