计算 TopN
1. 0040 筛选每个班级各科成绩前两名学生成绩
数据:
grade_id,user_id,subject_name,score
1,a,语文,89
2,b,数学,39
4,e,语文,79
3,c,英语,29
12,d,数学,19
11,d,数学,49
1,a,数学,90
1,a,英语,69
1,r,英语,39
1,t,英语,19
create database if not exists exercise;
use exercise;
drop table if exists exercise0400;
create table exercise0400(grade_id int,user_id string, subject_name string, score string) row format delimited fields terminated by ",";
load data local inpath "/root/hivedata/stu.txt" into table exercise0400;
select * from(select *,row_number() over(partition by grade_id,subject_name order by score desc) as rn from exercise0400) t1 where rn <=2;
1.1 表中的成绩列转换为 map 格式 (未完成)
a:{数学:90,英语69}
b:{数学:39}
2. 0401 求出每种爱好中,年龄最大的两个人(爱好,年龄,姓名)
数据:
id,name,age,favors
id,姓名,年龄,爱好
1,huangxiaoming,45,a-c-d-f
2,huangzitao,36,b-c-d-e
3,huanglei,41,c-d-e
4,liushishi,22,a-d-e
5,liudehua,39,e-f-d
6,liuyifei,35,a-d-e
其中需要注意的是:每一条记录中的爱好有多个值,以"-"分隔
需求:
求出每种爱好中,年龄最大的两个人(爱好,年龄,姓名)
注意思考一个问题:如果某个爱好中的第二大年龄有多个相同的怎么办?
解题:
第一步:建表导入数据相关准备:
create database if not exists exercise;
use exercise;
drop table if exists exercise0401;
create table exercise0401(id int, name string, age int, favors string) row format delimited fields terminated by ",";
load data local inpath "/root/hivedata/exces.txt" into table exercise0401;
select * from exercise0401;
desc exercise0401;
topn 查询
思路:
select a.id,a.name,a.age, fv.favor as favor from exercise0401 a lateral view explode(split(favors, "-")) fv as favor;
row_number() over (partition by b.favor order by b.age desc)
通过 row_number() 窗口函数 求出每一列值的 rank 用于求topN
partition by b.favor order by b.age desc
根据 favor 进行分组计算出 top2 年龄的学员信息
where c.rank <= 2;
select c.* from
(select b.*,
row_number() over (partition by b.favor order by b.age desc) as rank
from
(select a.id,a.name,a.age, fv.favor as favor from exercise0401 a lateral view explode(split(favors, "-")) fv as favor) b) c
where c.rank <= 2;
两张user表整合 根据登陆时间求 start_time,end_time(未完成)
u1 表:
user id time
用户名 用户id 用户登录时间
a,1,2019-1-1
b,2,2019-1-4
c,1,2019-4-2
u2 表:
user id time
用户名 用户id 用户登录时间
c,1,2019-4-11
b,2,2019-1-14
c,1,2019-4-22
综合练习
0050 商城数据
数据:
商店名称,月份,电子类销售额,服装类销售额,食品类销售额
shop,mth, dz, fz, sp
a,2019-01,3000,5000,2000
a,2019-02,6000,4000,3000
a,2019-03,3000,4000,2000
a,2019-04,3500,5500,4000
a,2019-05,3800,5200,6000
a,2019-06,3900,3000,4000
a,2019-07,2000,2500,2800
a,2019-08,1000,4000,2600
b,2019-01,3000,5000,2000
b,2019-02,6000,4000,3000
b,2019-03,3000,4000,2000
b,2019-04,3500,5500,4000
b,2019-05,3800,5200,6000
b,2019-06,3600,3000,4000
b,2019-07,2000,2500,2800
b,2019-08,1000,4000,2600
c,2019-01,3000,5000,2000
c,2019-02,1000,4000,3000
c,2019-03,3000,4000,2000
c,2019-04,3500,5500,4000
c,2019-05,3800,5200,6000
c,2019-06,3900,3000,4000
c,2019-07,2000,2500,2800
c,2019-08,1000,4000,2600
建表
CREATE TABLE exercise0050(
shop STRING,
mth STRING,
dz double,
fz double,
sp double
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY ',';
load data local inpath '/root/hivedata/sale.txt' into table exercise0050;
1. 求每个店铺的各品类全年销售额
sSELECT
shop,
sum(dz) as dz_ze,
sum(fz) as fz_ze,
sum(sp) as sp_ze
FROM DEMO_SALE
GROUP BY shop;
2. 求每个店铺的各品类各季度销售额
SELECT
shop,jd,sum(dz),sum(fz),sum(sp),max(mth)
FROM
(
SELECT
shop,
case
when mth between '2019-01' and '2019-03' then '1季度'
when mth between '2019-04' and '2019-06' then '2季度'
when mth between '2019-07' and '2019-09' then '3季度'
else '4季度'
end as jd,
mth,
dz,
fz,
sp
FROM DEMO_SALE
) tmp
GROUP BY shop,jd;
3. 求每个店铺的每月销售总额
SELECT
shop,mth,(dz+fz+sp) as ze
FROM DEMO_SALE;
4. 求每个店铺的年销售总额
SELECT
shop as `店铺`,sum(dz+fz+sp) as `全年总额`
FROM DEMO_SALE
GROUP BY shop;
5. 求每个店铺每个月份中,销售额最大的品类的销售额,及其月份
SELECT
shop,mth,
case
when dz>=fz and dz>=sp then dz
when fz>=dz and fz>=sp then fz
when sp>=dz and sp>=fz then sp
end
FROM DEMO_SALE;
或
SELECT
shop,mth,greatest(dz,fz,sp) as zde
FROM DEMO_SALE;
注:greatest(col_a,col_b,…) 比较n个column的大小返回最大值,过滤掉null
,如果某个 column 中是 string,而其他是 int/double/float 等时,返回null
。
6. 求每个店铺每个月份中,销售额最大的品类,及其销售额,及其月份
SELECT
shop,mth,
case
when dz>=fz and dz>=sp then '电子'
when fz>=dz and fz>=sp then '服装'
when sp>=dz and sp>=fz then '食品'
end as `最大品类`,
greatest(dz,fz,sp) as `最大额`
FROM DEMO_SALE
;
7. 求"电子类"商品累计月销售额
店铺,月份,月额,累计到月额
a,2019-01,3000,3000
a,2019-02,6000,9000
…
SELECT
shop,
mth,
dz,
sum(dz) over(partition by shop order by mth rows between unbounded preceding and current row)
-- rows between x preceding and current row
-- rows between unbounded preceding and current row
-- rows between unbounded preceding and unbounded following
-- rows between x preceding and y following
FROM DEMO_SALE;
7.1 累计到全年总额
店铺,月份,月额,累计到月额 ,全年总额
a,2019-01,3000,3000, …
a,2019-02,6000,9000, …
SELECT
shop,
mth,
dz,
sum(dz) over(partition by shop order by mth rows between unbounded preceding and current row) as `月累计额`,
sum(dz) over(partition by shop order by mth rows between unbounded preceding and unbounded following) as `年总额`
FROM DEMO_SALE;
8.求出各品类月累计销售额
报表结果如下:
±------±---------±--------±-------------------±---------±--------------±---------±--------------±------------|
| shop | mth | dz | 电子月累计 | fz | 服装月累计 | sp | 食品月累计 |
±------±---------±--------±-------------------±---------±--------------±---------±--------------±------------|
| a | 2019-01 | 3000.0 | 3000.0 | 5000.0 | 5000.0 | 2000.0 | 2000.0 |
| a | 2019-02 | 6000.0 | 9000.0 | 4000.0 | 9000.0 | 3000.0 | 5000.0 |
| a | 2019-03 | 3000.0 | 12000.0 | 4000.0 | 13000.0 | 2000.0 | 7000.0 |
| a | 2019-04 | 3500.0 | 15500.0 | 5500.0 | 18500.0 | 4000.0 | 11000.0 |
| a | 2019-05 | 3800.0 | 19300.0 | 5200.0 | 23700.0 | 6000.0 | 17000.0 |
SELECT
shop,
mth,
dz,
sum(dz) over(partition by shop order by mth rows between unbounded preceding and current row) as `电子月累计`,
fz,
sum(fz) over(partition by shop order by mth rows between unbounded preceding and current row) as `服装月累计`,
sp,
sum(sp) over(partition by shop order by mth rows between unbounded preceding and current row) as `食品月累计`
FROM DEMO_SALE;
9.求哪些店铺电子类产品的销售有过连续3月增长
店铺
a
c
…
步骤1:找出有增长的月份,并标记连续的行号
WITH tmp as (
select
shop,mth,row_number() over(partition by shop order by mth) as rn
from
(
SELECT
shop,mth,dz,lead(dz,1,null) over(partition by shop order by mth) as next_dz
FROM DEMO_SALE
) o1
where dz<next_dz
)
结果:
shop|mth |rn|
----|-------|--|
a |2019-01| 1|
a |2019-03| 2|
a |2019-04| 3|
a |2019-05| 4|
b |2019-01| 1|
b |2019-03| 2|
b |2019-04| 3|
c |2019-02| 1|
c |2019-03| 2|
c |2019-04| 3|
c |2019-05| 4|
步骤2:求月份和行号的差值
步骤3:寻找 shop + 差值组合的个数,留下个数 >=3 的店铺
select
shop
from
(
select
shop,
count(1) as cnts
from tmp
group by shop,cast(substr(mth,6) as int)-rn
)
where cnts>=3
group by shop;
a |2019-04| 3|
a |2019-05| 4|
b |2019-01| 1|
b |2019-03| 2|
b |2019-04| 3|
c |2019-02| 1|
c |2019-03| 2|
c |2019-04| 3|
c |2019-05| 4|
步骤2:求月份和行号的差值
步骤3:寻找 shop + 差值组合的个数,留下个数 >=3 的店铺
select
shop
from
(
select
shop,
count(1) as cnts
from tmp
group by shop,cast(substr(mth,6) as int)-rn
)
where cnts>=3
group by shop;