Hivesql10题
- 1.找出所有科目成绩都大于某一学科平均成绩的学生
- 2.统计出每个用户的累积访问次数
- 3.有50W个京东店铺,每个顾客访客访问任何一个店铺的任何一个商品时都会产生一条访问日志,访问日志存储的表名为Visit,访客的用户id为user_id,被访问的店铺名称为shop,请统计:
- 4.已知一个表STG.ORDER,有如下字段:Date,Order_id,User_id,amount。请给出sql进行统计:数据样例:2017-01-01,10029028,1000003251,33.57。(表名统一用order_tab)
- 1)给出 2017年每个月的订单数、用户数、总成交金额。
- 2)给出2017年11月的新客数(指在11月才有第一笔订单)
- 5.有日志如下,请写出代码求得所有用户和活跃用户的总数及平均年龄。(活跃用户指连续两天都有访问记录的用户)日期 用户 年龄
- 6.请用sql写出所有用户中在今年10月份第一次购买商品的金额,表名ordertable,字段(购买用户:userid,金额:money,购买时间:paymenttime(格式:2017-10-01),订单id:orderid)
- 7.求11月9号下午14点(14-15点),访问api/user/login接口的top10的ip地址
1.找出所有科目成绩都大于某一学科平均成绩的学生
数据集如下
uid subject_id score
1001 01 90
1001 02 90
1001 03 90
1002 01 85
1002 02 85
1002 03 70
1003 01 70
1003 02 70
1003 03 85
select
uid
from
(
select
uid
,if(score>avg_score,0,1) flag
from
(
select
uid
,subject_id
,avg(score) over(partition by subject_id) avg_score
from
score) t1) t2
where
group by
uid
having
sum(flag)= 0;
2.统计出每个用户的累积访问次数
用户访问数据
userId visitDate visitCount
u01 2017/1/21 5
u02 2017/1/23 6
u03 2017/1/22 8
u04 2017/1/20 3
u01 2017/1/23 6
u01 2017/2/21 8
u02 2017/1/23 6
u01 2017/2/22 4
最后结果如下所示
用户id 月份 小计 累积
u01 2017-01 11 11
u01 2017-02 12 23
u02 2017-01 12 12
u03 2017-01 8 8
u04 2017-01 3 3
select
userId,
mn,
mn_count,
sum(mn_count) over(partition by userId order by mn)
from
( select
userId,
mn,
sum(visitCount) mn_count
from
(select
userId,
date_format(regexp_replace(visitDate,'/','-'),'yyyy-MM') mn,
visitCount
from
action)t1
group by userId,mn)t2;
3.有50W个京东店铺,每个顾客访客访问任何一个店铺的任何一个商品时都会产生一条访问日志,访问日志存储的表名为Visit,访客的用户id为user_id,被访问的店铺名称为shop,请统计:
1)每个店铺的UV(访客数)
select shop
,count(distinct user_id)
from visit
group by shop;
2)每个店铺访问次数top3的访客信息,输出店铺名称、访客id、访问次数。
select shop
,user_id
,ct
from (select shop
,user_id
,ct
,rank() over(partition by shop order by ct) rk
from
(select shop
,user_id
,count(*) ct
from visit
group by shop,user_id) t1
) t2
where rk<=3;
4.已知一个表STG.ORDER,有如下字段:Date,Order_id,User_id,amount。请给出sql进行统计:数据样例:2017-01-01,10029028,1000003251,33.57。(表名统一用order_tab)
1)给出 2017年每个月的订单数、用户数、总成交金额。
select
date_format(dt,'yyyy-MM'),
count(order_id),
count(distinct user_id),
sum(amount)
from
order_tab
where
date_format(dt,'yyyy')='2017'
group by
date_format(dt,'yyyy-MM');
2)给出2017年11月的新客数(指在11月才有第一笔订单)
select
count(user_id)
from
order_tab
group by
user_id
having
date_format(min(dt),'yyyy-MM')='2017-11';
5.有日志如下,请写出代码求得所有用户和活跃用户的总数及平均年龄。(活跃用户指连续两天都有访问记录的用户)日期 用户 年龄
数据如下所示,表名user_age
dt user_id age
2019-02-11,test_1,23
2019-02-11,test_2,19
2019-02-11,test_3,39
2019-02-11,test_1,23
2019-02-11,test_3,39
2019-02-11,test_1,23
2019-02-12,test_2,19
2019-02-13,test_1,23
2019-02-15,test_2,19
2019-02-16,test_2,19
-- 求得所有用户和活跃用户的总数及平均年龄。
-- (活跃用户指连续两天都有访问记录的用户)
-- 1.按日期以及用户分组,按日期排序并给出排名
select
dt,
user_id,
min(age) age,
rank() over(partition by user_id order by dt) rk
from
user_age
group by
dt,
user_id
-- 2.计算日期和排名的差值
select
user_id,
age,
date_sub(dt, rk) flag
from
(
select
dt,
user_id,
min(age) age,
rank() over(partition by user_id order by dt) rk
from
user_age
group by
dt,
user_id) t1
-- 3.过滤出差值大于2的,即为连续两天活跃的用户
select
user_id,
min(age) age
from
(
select
user_id,
age,
date_sub(dt, rk) flag
from
(
select
dt,
user_id,
min(age) age,
rank() over(partition by user_id order by dt) rk
from
user_age
group by
dt,
user_id) t1
)t2
group by
user_id,
flag
having
count(*)>= 2
-- 4.对数据进行去重处理
select
user_id,
min(age) age
from
(select
user_id,
min(age) age
from
(
select
user_id,
age,
date_sub(dt, rk) flag
from
(
select
dt,
user_id,
min(age) age,
rank() over(partition by user_id order by dt) rk
from
user_age
group by
dt,
user_id) t1
)t2
group by
user_id,
flag
having
count(*)>= 2)t3
group by
user_id
-- 5.计算活跃用户的人数以及平均年龄
select
count(*) ct,
cast(sum(age)/ count(*) as decimal(10, 2))
from
(
select
user_id,
min(age) age
from
(
select
user_id,
min(age) age
from
(
select
user_id,
age,
date_sub(dt, rk) flag
from
(
select
dt,
user_id,
min(age) age,
rank() over(partition by user_id order by dt) rk
from
user_age
group by
dt,
user_id) t1
)t2
group by
user_id,
flag
having
count(*)>= 2)t3
group by
user_id)t4;
-- 6.对全量数据进行去重(按用户去重)
select
user_id,
min(age) age
from
user_age
group by
user_id;
-- 7.计算所有用户的数量以及平均年龄
select
count(*) user_count,
cast((sum(age)/ count(*)) as decimal(10, 1))
from
(
select
user_id,
min(age) age
from
user_age
group by
user_id)t5;
-- 8.把4和7的数据进行union
select
0 user_total_count,
0 user_total_avg_age,
count(*) twice_count,
cast(sum(age)/ count(*) as decimal(10, 2)) twice_count_avg_age
from
(
select
user_id,
min(age) age
from
(
select
user_id,
min(age) age
from
(
select
user_id,
age,
date_sub(dt, rk) flag
from
(
select
dt,
user_id,
min(age) age,
rank() over(partition by user_id order by dt) rk
from
user_age
group by
dt,
user_id
) t1
) t2
group by
user_id,
flag
having
count(*)>= 2)t3
group by
user_id
) t4
union all
select
count(*) user_total_count,
cast((sum(age)/ count(*)) as decimal(10, 1)),
0 twice_count,
0 twice_count_avg_age
from
(
select
user_id,
min(age) age
from
user_age
group by
user_id
) t5
-- 9.最终整合
select
sum(user_total_count),
sum(user_total_avg_age),
sum(twice_count),
sum(twice_count_avg_age)
from
(
select
0 user_total_count,
0 user_total_avg_age,
count(*) twice_count,
cast(sum(age)/ count(*) as decimal(10, 2)) twice_count_avg_age
from
(
select
user_id,
min(age) age
from
(
select
user_id,
min(age) age
from
(
select
user_id,
age,
date_sub(dt, rk) flag
from
(
select
dt,
user_id,
min(age) age,
rank() over(partition by user_id order by dt) rk
from
user_age
group by
dt,
user_id
) t1
) t2
group by
user_id,
flag
having count(*)>= 2
) t3
group by
user_id
) t4
union all
select
count(*) user_total_count,
cast((sum(age)/ count(*)) as decimal(10, 1)),
0 twice_count,
0 twice_count_avg_age
from
(
select
user_id,
min(age) age
from
user_age
group by
user_id
) t5
) t6;
6.请用sql写出所有用户中在今年10月份第一次购买商品的金额,表名ordertable,字段(购买用户:userid,金额:money,购买时间:paymenttime(格式:2017-10-01),订单id:orderid)
-- 1.
select
userid,
min(paymenttime) paymenttime
from ordertable
where date_format(paymenttime,'yyyy-MM')='2017-10'
group by userid
-- 2.
select
t1.userid,
t1.paymenttime,
ot.money
from (select
userid,
min(paymenttime) paymenttime
from ordertable
where date_format(paymenttime,'yyyy-MM')='2017-10'
group by userid) t1
join ordertable ot
on t1.userid=ot.userid and t1.paymenttime=ot.paymenttime;
7.求11月9号下午14点(14-15点),访问api/user/login接口的top10的ip地址
有一个线上服务器访问日志格式如下
时间 --------------------- 接口 ------------ ip地址
2016-11-09 11:22:05 /api/user/login 110.23.5.33
2016-11-09 11:23:10 /api/user/detail 57.3.2.16
…
2016-11-09 23:59:40 /api/user/login 200.6.5.166
求11月9号下午14点(14-15点),访问api/user/login接口的top10的ip地址
还在完善中…(有时间就弄完…)