These answers are my own; if you see things differently, discussion is welcome.
1. Write SQL to compute, for each user and each month, the maximum single-month visit count up to that month and the cumulative visit total up to that month.
Data:
A,2015-01,5
A,2015-01,15
B,2015-01,5
A,2015-01,8
B,2015-01,25
A,2015-01,5
A,2015-02,4
A,2015-02,6
B,2015-02,10
B,2015-02,5
A,2015-03,16
A,2015-03,22
B,2015-03,23
B,2015-03,10
B,2015-03,1
Create table:
create table if not exists visits(
userid string,
month string,
visits int
)
row format delimited fields terminated by ','
;
load data local inpath '/root/hivedata/visits.txt' overwrite into table visits;
Query:
select
userid,
month,
max(visits) over(distribute by userid sort by month) as max_visits,
sum(visits) over(distribute by userid sort by month) as total_visits,
visits
from
(select
userid,
month,
sum(visits) visits
from visits
group by userid,month) as t
;
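A sketch of the same query with the window frame spelled out, assuming the same visits table; with an ordered window the default frame is already "up to the current row", so this variant only makes the running-max/running-sum semantics explicit (the aliases are illustrative):
select
 userid,
 month,
 visits,
 max(visits) over(partition by userid order by month
                  rows between unbounded preceding and current row) as max_visits,
 sum(visits) over(partition by userid order by month
                  rows between unbounded preceding and current row) as total_visits
from
(select
  userid,
  month,
  sum(visits) visits
 from visits
 group by userid, month) t
;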
2. For each channel, compute the number of views and the cumulative watch time.
Data:
video table
uid channel min
1 1 23
2 1 12
3 1 12
4 1 32
5 1 342
6 2 13
7 2 34
8 2 13
9 2 134
Create table:
create table if not exists video(
uid int,
channel string,
min int
)
row format delimited fields terminated by ' ' ;
load data local inpath '/root/hivedata/video.txt' into table video;
Query:
select
channel,
count(channel) as watch_cnt,
sum(min) as total_min
from video
group by channel
;
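If "number of views" is meant as distinct viewers rather than raw plays (an assumption; in the sample data each uid appears only once, so the two coincide), a sketch would be:
select
 channel,
 count(*) as play_cnt,              -- one row per play
 count(distinct uid) as viewer_cnt, -- unique viewers
 sum(min) as total_min
from video
group by channel
;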
3. Write SQL to count the total number of users who logged in for 7 consecutive days.
Data:
login2 table
uid dt login_status (1 = successful login, 0 = failure)
1 2019-07-11 1
1 2019-07-12 1
1 2019-07-13 1
1 2019-07-14 1
1 2019-07-15 1
1 2019-07-16 1
1 2019-07-17 1
1 2019-07-18 1
2 2019-07-11 1
2 2019-07-12 1
2 2019-07-13 0
2 2019-07-14 1
2 2019-07-15 1
2 2019-07-16 0
2 2019-07-17 1
2 2019-07-18 0
3 2019-07-11 1
3 2019-07-12 1
3 2019-07-13 1
3 2019-07-14 1
3 2019-07-15 1
3 2019-07-16 1
3 2019-07-17 1
3 2019-07-18 1
Create table:
create table if not exists login2(
uid int,
dt string,
login_status int
)
row format delimited fields terminated by ' ' ;
load data local inpath '/root/hivedata/login.txt' into table login2;
Query:
select
count(distinct t3.uid)
from
(select
uid
from
(select
t1.uid,
date_sub(t1.dt,t1.rm) as dt
from
(select
uid,
dt,
row_number() over(distribute by uid sort by dt) as rm
from login2
where login_status=1) t1) t2
group by t2.uid,t2.dt
having count(t2.uid) >= 7) t3
;
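An equivalent sketch of the same consecutive-days trick, assuming the login2 table above: the group key can also be built from datediff against any fixed anchor date, which keeps it a plain integer instead of a date (grp and cn are illustrative aliases):
select
 count(distinct uid) as cn
from
(select
  uid,
  grp
 from
 (select
   uid,
   -- rows in an unbroken run of days share the same (day number - row number) value
   datediff(dt, '1970-01-01') - row_number() over(partition by uid order by dt) as grp
  from login2
  where login_status = 1) t1
 group by uid, grp
 having count(*) >= 7) t2
;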
4. Write SQL to get the top three in each class (ties share the same rank), and for those top three also compute the successive score gaps in rank order:
Data:
stu table
stu_no class score
1 1901 90
2 1901 90
3 1901 83
4 1901 60
5 1902 66
6 1902 23
7 1902 99
8 1902 67
9 1902 87
Create table:
create table if not exists stu(
stu_no int,
class string,
score int
)
row format delimited fields terminated by ' '
;
load data local inpath '/root/hivedata/stu.txt' into table stu;
Query:
select
t1.class,
t1.stu_no,
t1.score,
t1.rn,
t1.score - lag(t1.score) over(distribute by class sort by score desc) diff
from
(select
class,
rank() over(distribute by class sort by score desc) as rn,
stu_no,
score
from
stu
) t1
where t1.rn <= 3
;
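rank() skips positions after ties (90, 90, 83 ranks as 1, 1, 3), so the third distinct score can fall outside rn <= 3. If "top three" is read as the top three distinct scores, dense_rank() is the usual substitute; the sketch below also flips the subtraction so the gap to the rank above comes out positive:
select
 t1.class,
 t1.stu_no,
 t1.score,
 t1.rn,
 lag(t1.score) over(partition by t1.class order by t1.score desc) - t1.score as diff
from
(select
  class,
  dense_rank() over(partition by class order by score desc) as rn,
  stu_no,
  score
 from stu) t1
where t1.rn <= 3
;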
5. For each store, compute the sales for each month and the cumulative sales up to that month.
Data:
store,month,amount
a,01,150
a,01,200
b,01,1000
b,01,800
c,01,250
c,01,220
b,01,6000
a,02,2000
a,02,3000
b,02,1000
b,02,1500
c,02,350
c,02,280
a,03,350
a,03,250
Create table:
create table if not exists store(
sname string,
month string,
money int
)
row format delimited fields terminated by ','
;
load data local inpath '/root/hivedata/store.txt' into table store;
Query:
select
t1.sname,
t1.month,
t1.money,
sum(t1.money) over(distribute by t1.sname sort by month) as total_money
from
(select
sname,
month,
sum(money) money
from store
group by sname,month
) t1
;
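Assuming a Hive version that accepts window functions over aggregate expressions, the subquery can be folded into a single level; a sketch (aliases are illustrative):
select
 sname,
 month,
 sum(money) as month_money,                                            -- sales of that month
 sum(sum(money)) over(partition by sname order by month) as total_money -- running total per store
from store
group by sname, month
;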
6. Analyze user behavior and find each user's first action in the table.
Data:
uid,time,action
1,time1,read
3,time2,comment
1,time3,share
2,time4,like
1,time5,write
2,time6,like
3,time7,write
2,time8,read
Create table:
create table if not exists user_action_log(
uid string,
time string,
action string
)
row format delimited fields terminated by ','
;
load data local inpath '/root/hivedata/user_action_log.txt' into table user_action_log;
Query:
select
t1.uid,
t1.action
from
user_action_log t1
join
(select
uid,
min(time) time
from user_action_log
group by uid
) t2
on t1.uid=t2.uid
and t1.time=t2.time
;
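A window-function sketch that avoids the self-join, assuming (uid, time) is unique per user as in the sample data:
select
 uid,
 action
from
(select
  uid,
  action,
  row_number() over(distribute by uid sort by time) as rn  -- 1 = earliest action per user
 from user_action_log) t1
where rn = 1
;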
7. Pivot orders and order types from rows to columns.
Data:
order_id order_type order_time
111 N 10:00
111 A 10:05
111 B 10:10
Create table:
create table if not exists myorder(
order_id string,
order_type string,
order_time string
)
row format delimited fields terminated by '\t'
;
load data local inpath '/root/hivedata/order.txt' into table myorder;
Query:
select
*
from
(select
order_id,
order_type order_type1,
lead(order_type) over(distribute by order_id sort by order_time) order_type2,
order_time order_time1,
lead(order_time) over(distribute by order_id sort by order_time) order_time2
from myorder) t1
where t1.order_type2 is not null
;
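Another common reading of the row-to-column requirement is one output row per order with a column per type; a sketch using conditional aggregation, assuming the type values are limited to N/A/B as in the sample (column names are illustrative):
select
 order_id,
 max(case when order_type = 'N' then order_time end) as time_n,
 max(case when order_type = 'A' then order_time end) as time_a,
 max(case when order_type = 'B' then order_time end) as time_b
from myorder
group by order_id
;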
8. An app's daily access data is stored in the table access_log,
which contains a date field ds, a user-type field user_type, a user account user_id, and an access time log_time. Use Hive HQL to implement the following:
PV (page views): each refresh by a user counts once.
UV (unique visitors): the same client is counted only once within 00:00-24:00.
(1) Overall daily UV and PV?
(2) Daily UV and PV per user type?
(3) Earliest and latest access time per user type per day?
(4) The 10 users with the most visits per user type per day?
Data:
2019-09-01 a u001 01:12
2019-09-01 a u001 01:13
2019-09-01 a u001 01:46
2019-09-01 b u002 04:56
2019-09-01 b u003 13:12
2019-09-02 a u001 12:12
2019-09-02 c u004 11:34
2019-09-02 a u005 14:12
2019-09-02 c u006 16:18
2019-09-02 a u007 05:10
2019-09-02 c u008 07:12
2019-09-02 a u009 09:06
2019-09-02 b u023 10:12
2019-09-02 a u045 18:47
2019-09-03 a u023 12:15
2019-09-04 b u054 06:12
2019-09-04 c u057 09:35
2019-09-04 c u056 00:57
2019-09-05 a u068 15:12
2019-09-06 b u053 11:25
2019-09-08 a u001 09:34
Create table:
create table if not exists access_log(
ds string,
user_type string,
user_id string,
log_time string
)
row format delimited fields terminated by '\t'
;
load data local inpath '/root/hivedata/access_log.txt' into table access_log;
(1) Overall daily UV and PV?
select
ds,
count(distinct user_id) uv,
count(*) pv
from access_log
group by ds
;
(2) Daily UV and PV per user type?
select
ds,
user_type,
count(distinct user_id) uv,
count(*) pv
from access_log
group by ds,user_type
;
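Parts (1) and (2) can also be produced in one pass with grouping sets, assuming a Hive version that supports them; rows where user_type is NULL are the per-day overall totals. A sketch:
select
 ds,
 user_type,
 count(distinct user_id) as uv,
 count(*) as pv
from access_log
group by ds, user_type
grouping sets ((ds), (ds, user_type))
;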
(3) Earliest and latest access time per user type per day?
select
ds,
user_type,
min(log_time) as first_visit,
max(log_time) as last_visit
from access_log
group by ds,user_type
;
(4) The 10 users with the most visits per user type per day?
select
ds,
user_type,
user_id,
cnt,
rn
from
(select
ds,
user_type,
user_id,
cnt,
rank() over(distribute by ds,user_type sort by cnt desc) rn
from
(select
ds,
user_type,
user_id,
count(*) cnt
from access_log
group by ds,user_type,user_id) t1) t2
where rn <= 10
;
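rank() keeps ties, so a (ds, user_type) group can return more than 10 rows when counts tie; if exactly ten rows per group are wanted, row_number() is the usual swap. A sketch:
select
 ds,
 user_type,
 user_id,
 cnt,
 rn
from
(select
  ds,
  user_type,
  user_id,
  cnt,
  row_number() over(distribute by ds,user_type sort by cnt desc) as rn
 from
 (select
   ds,
   user_type,
   user_id,
   count(*) cnt
  from access_log
  group by ds, user_type, user_id) t1) t2
where rn <= 10
;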
9. Maximum number of consecutive login days per user.
Data:
login table
uid,date
1,2019-08-01
1,2019-08-02
1,2019-08-03
2,2019-08-01
2,2019-08-02
3,2019-08-01
3,2019-08-03
4,2019-07-28
4,2019-07-29
4,2019-08-01
4,2019-08-02
4,2019-08-03
Create table:
create table if not exists login(
uid int,
udate string
)
row format delimited fields terminated by ','
;
load data local inpath '/root/hivedata/login2.txt' into table login;
Query:
select
uid,
max(cn) as max_days
from
(select
uid,
count(*) cn
from
(select
uid,
date_sub(udate,row_number() over(distribute by uid sort by udate)) udate
from login) t1
group by uid,udate) t2
group by uid
;
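The row_number trick assumes at most one row per user per day (true for this sample); if duplicate login records on the same day were possible, deduplicating first keeps a streak from being split. A sketch (grp is an illustrative alias):
select
 uid,
 max(cn) as max_days
from
(select
  uid,
  count(*) cn
 from
 (select
   uid,
   date_sub(udate, row_number() over(distribute by uid sort by udate)) as grp
  from (select distinct uid, udate from login) d) t1  -- one row per user per day
 group by uid, grp) t2
group by uid
;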
10. Use Hive HQL to get the top score for each sex, and related queries.
id sex chinese_s math_s
0 0 70 50
1 0 90 70
2 1 80 90
(1) Top Chinese score for each sex (0 = male, 1 = female)
(2) Boys with a Chinese score above 80, and girls with a math score above 70
Create table:
create table if not exists score_s(
uid int,
usex int,
chinese_s int,
math_s int
)
row format delimited fields terminated by ' '
;
load data local inpath '/root/hivedata/score_s.txt' into table score_s;
(1) Top Chinese score for each sex (0 = male, 1 = female)
Query:
select
uid,
usex,
chinese_s
from
(select
uid,
usex,
chinese_s,
rank() over(distribute by usex sort by chinese_s desc) rn
from score_s) t1
where t1.rn=1
;
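rank() returns every student tied for first; if exactly one row per sex is wanted regardless of ties, row_number() is the usual swap. A sketch:
select
 uid,
 usex,
 chinese_s
from
(select
  uid,
  usex,
  chinese_s,
  row_number() over(distribute by usex sort by chinese_s desc) as rn
 from score_s) t1
where rn = 1
;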
(2) Boys with a Chinese score above 80, and girls with a math score above 70
Query:
select
*
from
score_s
where (usex=0 and chinese_s>80)
or (usex=1 and math_s>70)
;
11. Use Hive HQL to compute the maximum number of consecutive access days.
For each user, find the maximum number of consecutive login days within a month.
Data:
log_time,uid
2018-10-01 18:00:00,123
2018-10-02 18:00:00,123
2018-10-02 19:00:00,456
2018-10-04 18:00:00,123
2018-10-04 18:00:00,456
2018-10-05 18:00:00,123
2018-10-06 18:00:00,123
Create table:
create table if not exists login3(
log_time string,
uid string
)
row format delimited fields terminated by ','
;
load data local inpath '/root/hivedata/login3.txt' into table login3;
Query:
select
uid,
max(cnt) as max_days
from
(select
uid,
count(*) cnt
from
(select
uid,
mon,
dt-dense_rank() over(distribute by uid,mon sort by dt) dt
from
(select
uid,
month(log_time) mon,
day(log_time) dt
from
login3) t1
) t2
group by uid,mon,t2.dt) t3
group by t3.uid
;
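As in question 9, this assumes at most one login row per user per day; if a user could log in several times a day, deduplicating first prevents inflated streak counts. A sketch (grp is an illustrative alias):
select
 uid,
 max(cnt) as max_days
from
(select
  uid,
  mon,
  count(*) cnt
 from
 (select
   uid,
   mon,
   dt - dense_rank() over(distribute by uid, mon sort by dt) as grp
  from (select distinct uid, month(log_time) mon, day(log_time) dt from login3) t1) t2
 group by uid, mon, grp) t3
group by uid
;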