计算 TopN

1. 0040 筛选每个班级各科成绩前两名学生成绩

数据:

grade_id,user_id,subject_name,score

1,a,语文,89
2,b,数学,39
4,e,语文,79
3,c,英语,29
12,d,数学,19
11,d,数学,49
1,a,数学,90
1,a,英语,69
1,r,英语,39
1,t,英语,19
create database if not exists exercise;
use exercise;
drop table if exists exercise0400;
create table exercise0400(grade_id int,user_id string, subject_name string, score string) row format delimited fields terminated by ",";
load data local inpath "/root/hivedata/stu.txt" into table exercise0400;

hive如何用出生日期计算年龄 hive sql 计算年龄_数据

select * from(select *,row_number() over(partition by grade_id,subject_name order by score desc) as rn from exercise0400) t1 where rn <=2;

hive如何用出生日期计算年龄 hive sql 计算年龄_数据仓库_02

1.1 表中的成绩列转换为 map 格式 (未完成)

a:{数学:90,英语69}

b:{数学:39}

2. 0401 求出每种爱好中,年龄最大的两个人(爱好,年龄,姓名)

数据:

id,name,age,favors
id,姓名,年龄,爱好

1,huangxiaoming,45,a-c-d-f
2,huangzitao,36,b-c-d-e
3,huanglei,41,c-d-e
4,liushishi,22,a-d-e
5,liudehua,39,e-f-d
6,liuyifei,35,a-d-e

其中需要注意的是:每一条记录中的爱好有多个值,以"-"分隔

需求:
求出每种爱好中,年龄最大的两个人(爱好,年龄,姓名)
注意思考一个问题:如果某个爱好中的第二大年龄有多个相同的怎么办?

解题:
第一步:建表导入数据相关准备:

create database if not exists exercise;
use exercise;
drop table if exists exercise0401;
create table exercise0401(id int, name string, age int, favors string) row format delimited fields terminated by ",";
load data local inpath "/root/hivedata/exces.txt" into table exercise0401;

select * from exercise0401;
desc exercise0401;

hive如何用出生日期计算年龄 hive sql 计算年龄_数据仓库_03

topn 查询

思路:

select a.id,a.name,a.age, fv.favor as favor from exercise0401 a lateral view explode(split(favors, "-")) fv as favor;

hive如何用出生日期计算年龄 hive sql 计算年龄_数据_04

row_number() over (partition by b.favor order by b.age desc)
通过 row_number() 窗口函数 求出每一列值的 rank 用于求topN
partition by b.favor order by b.age desc
根据 favor 进行分组计算出 top2 年龄的学员信息
where c.rank <= 2;
select c.* from 
(select b.*, 
row_number() over (partition by b.favor order by b.age desc) as rank 
from 
(select a.id,a.name,a.age, fv.favor as favor from exercise0401 a lateral view explode(split(favors, "-")) fv as favor) b) c 
where c.rank <= 2;

两张user表整合 根据登陆时间求 start_time,end_time(未完成)

u1 表:

user id time

用户名 用户id 用户登录时间

a,1,2019-1-1
b,2,2019-1-4
c,1,2019-4-2

u2 表:

user id time

用户名 用户id 用户登录时间

c,1,2019-4-11
b,2,2019-1-14
c,1,2019-4-22

综合练习

0050 商城数据

数据:

商店名称,月份,电子类销售额,服装类销售额,食品类销售额
shop,mth, dz, fz, sp

a,2019-01,3000,5000,2000
a,2019-02,6000,4000,3000
a,2019-03,3000,4000,2000
a,2019-04,3500,5500,4000
a,2019-05,3800,5200,6000
a,2019-06,3900,3000,4000
a,2019-07,2000,2500,2800
a,2019-08,1000,4000,2600
b,2019-01,3000,5000,2000
b,2019-02,6000,4000,3000
b,2019-03,3000,4000,2000
b,2019-04,3500,5500,4000
b,2019-05,3800,5200,6000
b,2019-06,3600,3000,4000
b,2019-07,2000,2500,2800
b,2019-08,1000,4000,2600
c,2019-01,3000,5000,2000
c,2019-02,1000,4000,3000
c,2019-03,3000,4000,2000
c,2019-04,3500,5500,4000
c,2019-05,3800,5200,6000
c,2019-06,3900,3000,4000
c,2019-07,2000,2500,2800
c,2019-08,1000,4000,2600
建表
CREATE TABLE exercise0050(
shop    STRING,
mth     STRING,
dz      double,
fz      double,
sp      double
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY ',';
load data local inpath '/root/hivedata/sale.txt' into table exercise0050;
1. 求每个店铺的各品类全年销售额
sSELECT
shop,
sum(dz) as dz_ze,
sum(fz) as fz_ze,
sum(sp) as sp_ze
FROM DEMO_SALE
GROUP BY shop;
2. 求每个店铺的各品类各季度销售额
SELECT
shop,jd,sum(dz),sum(fz),sum(sp),max(mth)

FROM 
(
   SELECT
     shop,
     case 
      when  mth between '2019-01' and '2019-03'  then  '1季度'
      when  mth between '2019-04' and '2019-06'  then  '2季度'
      when  mth between '2019-07' and '2019-09'  then  '3季度'
      else  '4季度'
     end  as jd,
     mth,
     dz,
     fz,
     sp
   FROM DEMO_SALE
) tmp
GROUP BY shop,jd;
3. 求每个店铺的每月销售总额
SELECT
shop,mth,(dz+fz+sp) as ze
FROM DEMO_SALE;
4. 求每个店铺的年销售总额
SELECT
shop as `店铺`,sum(dz+fz+sp) as `全年总额`
FROM DEMO_SALE
GROUP BY shop;
5. 求每个店铺每个月份中,销售额最大的品类的销售额,及其月份
SELECT
shop,mth,
case 
  when dz>=fz and dz>=sp  then dz
  when fz>=dz and fz>=sp  then fz
  when sp>=dz and sp>=fz  then sp
end

FROM DEMO_SALE;

SELECT
shop,mth,greatest(dz,fz,sp) as zde
FROM DEMO_SALE;

注:greatest(col_a,col_b,…) 比较n个column的大小返回最大值,过滤掉null,如果某个 column 中是 string,而其他是 int/double/float 等时,返回null

6. 求每个店铺每个月份中,销售额最大的品类,及其销售额,及其月份
SELECT
shop,mth,
case 
  when dz>=fz and dz>=sp then '电子'
  when fz>=dz and fz>=sp then '服装'
  when sp>=dz and sp>=fz then '食品'
end as `最大品类`,
greatest(dz,fz,sp) as `最大额`
FROM DEMO_SALE
;
7. 求"电子类"商品累计月销售额

店铺,月份,月额,累计到月额
a,2019-01,3000,3000
a,2019-02,6000,9000

SELECT
shop,
mth,
dz,
sum(dz)  over(partition by shop order by mth rows between unbounded preceding and current row)
-- rows between x preceding and current row
-- rows between unbounded preceding and current row
-- rows between unbounded preceding and unbounded following
-- rows between x preceding and y following
FROM DEMO_SALE;
7.1 累计到全年总额

店铺,月份,月额,累计到月额 ,全年总额
a,2019-01,3000,3000, …
a,2019-02,6000,9000, …

SELECT
shop,
mth,
dz,
sum(dz) over(partition by shop order by mth rows between unbounded preceding and current row) as `月累计额`,
sum(dz) over(partition by shop order by mth rows between unbounded preceding and unbounded following) as `年总额`
FROM DEMO_SALE;
8.求出各品类月累计销售额

报表结果如下:

±------±---------±--------±-------------------±---------±--------------±---------±--------------±------------|
| shop | mth | dz | 电子月累计 | fz | 服装月累计 | sp | 食品月累计 |
±------±---------±--------±-------------------±---------±--------------±---------±--------------±------------|
| a | 2019-01 | 3000.0 | 3000.0 | 5000.0 | 5000.0 | 2000.0 | 2000.0 |
| a | 2019-02 | 6000.0 | 9000.0 | 4000.0 | 9000.0 | 3000.0 | 5000.0 |
| a | 2019-03 | 3000.0 | 12000.0 | 4000.0 | 13000.0 | 2000.0 | 7000.0 |
| a | 2019-04 | 3500.0 | 15500.0 | 5500.0 | 18500.0 | 4000.0 | 11000.0 |
| a | 2019-05 | 3800.0 | 19300.0 | 5200.0 | 23700.0 | 6000.0 | 17000.0 |

SELECT
shop,
mth,
dz,
sum(dz) over(partition by shop order by mth rows between unbounded preceding and current row) as `电子月累计`,
fz,
sum(fz) over(partition by shop order by mth rows between unbounded preceding and current row) as `服装月累计`,
sp,
sum(sp) over(partition by shop order by mth rows between unbounded preceding and current row) as `食品月累计`

FROM DEMO_SALE;
9.求哪些店铺电子类产品的销售有过连续3月增长

店铺

a

c

步骤1:找出有增长的月份,并标记连续的行号
WITH tmp as (
 select
 shop,mth,row_number() over(partition by shop order by mth) as rn
 from 
 (
  SELECT
  shop,mth,dz,lead(dz,1,null) over(partition by shop order by mth) as next_dz
  FROM DEMO_SALE
 ) o1
 where dz<next_dz
)

结果:

shop|mth    |rn|
----|-------|--|
a   |2019-01| 1|
a   |2019-03| 2|
a   |2019-04| 3|
a   |2019-05| 4|
b   |2019-01| 1|
b   |2019-03| 2|
b   |2019-04| 3|
c   |2019-02| 1|
c   |2019-03| 2|
c   |2019-04| 3|
c   |2019-05| 4|
步骤2:求月份和行号的差值
步骤3:寻找 shop + 差值组合的个数,留下个数 >=3 的店铺
select
shop
from 
(
  select
  shop,
  count(1) as cnts
  from tmp
  group by shop,cast(substr(mth,6) as int)-rn
)
where cnts>=3
group by shop;
a   |2019-04| 3|
a   |2019-05| 4|
b   |2019-01| 1|
b   |2019-03| 2|
b   |2019-04| 3|
c   |2019-02| 1|
c   |2019-03| 2|
c   |2019-04| 3|
c   |2019-05| 4|
步骤2:求月份和行号的差值
步骤3:寻找 shop + 差值组合的个数,留下个数 >=3 的店铺
select
shop
from 
(
  select
  shop,
  count(1) as cnts
  from tmp
  group by shop,cast(substr(mth,6) as int)-rn
)
where cnts>=3
group by shop;