文章目录
- Hive实战
- 1、观察数据建表
- 1.1 建立外表
- 题目:
- 1、统计视频观看数Top10
- 2、统计视频类别热度Top10(类别热度:类别下的总视频数)
- 3、统计出视频观看数最高的20个视频的所属类别以及类别包含Top20视频的个数
- 4、统计视频观看数Top50所关联视频的所属类别排序
- 5、统计每个类别中的视频热度(视频观看数)Top10,以Music为例
- 6、统计每个类别视频观看数Top10
- 7、统计上传视频最多的用户Top10以及他们上传的视频观看次数在前20的视频
Hive实战
数据
- 视频表
字段 | 备注 | 详细描述 |
videoId | 视频唯一id(String) | 11位字符串 |
uploader | 视频上传者(String) | 上传视频的用户名String |
age | 视频年龄(int) | 视频在平台上的整数天 |
category | 视频类别(Array) | 上传视频指定的视频分类 |
length | 视频长度(Int) | 整形数字标识的视频长度 |
views | 观看次数(Int) | 视频被浏览的次数 |
rate | 视频评分(Double) | 满分5分 |
Ratings | 流量(Int) | 视频的流量,整型数字 |
conments | 评论数(Int) | 一个视频的整数评论数 |
relatedId | 相关视频id(Array) | 相关视频的id,最多20个 |
- 用户
字段 | 备注 | 字段类型 |
uploader | 上传者用户名 | string |
videos | 上传视频数 | int |
friends | 朋友数量 | int |
1、观察数据建表
用户表:
字段分隔符为制表符
视频表:
字段分隔符为制表符
集合元素间分隔符为&
1.1 建立外表
1.创建普通格式表
create external table user_t(
uploader string,
videos int,
friends int
)
row format delimited fields terminated by '\t'
collection items terminated by '&'
lines terminated by '\n'
stored as textfile
location '/shizan/user';
create external table video_t(
videoId string,
uploader string,
age int,
category array<string>,
length int,
views int,
rate float,
ratings int,
comments int,
relatedId array<string>)
row format delimited fields terminated by "\t"
collection items terminated by "&"
stored as textfile
location '/shizhan/video';
2.导入数据
load data inpath '/shizhan/user' into table user_t;
load data inpath '/shizhan/video' into table video_t;
3.创建orc格式的表
create table video_orc(
videoId string,
uploader string,
age int,
category array<string>,
length int,
views int,
rate float,
ratings int,
comments int,
relatedId array<string>)
stored as orc
tblproperties("orc.compress"="SNAPPY");
create table user_orc(
uploader string,
videos int,
friends int)
row format delimited
fields terminated by "\t"
stored as orc
tblproperties("orc.compress"="SNAPPY");
4.将外表查询插入至内表
insert into table user_orc select * from user_t;
insert into table video_orc select * from video_t;
题目:
1、统计视频观看数Top10
select
videoid,
views
from
video_orc
order by
views desc
limit 10;
2、统计视频类别热度Top10(类别热度:类别下的总视频数)
分析:
类别为数组
先将类别侧写
再统计视频个数
再排序
select
EXPLODE(category)
from
video_orc
limit
10
;
----
select
videoid,
categary_name
from
video_orc
lateral view EXPLODE(category) temp as categary_name
limit
20;
---
select
categary_name,
count(videoid) as c_id
from
video_orc
lateral view EXPLODE(category) temp as categary_name
group by
categary_name
order by
c_id desc
limit
10;
---
+----------------+---------+
| categary_name | c_id |
+----------------+---------+
| Music | 179049 |
| Entertainment | 127674 |
| Comedy | 87818 |
| Animation | 73293 |
| Film | 73293 |
| Sports | 67329 |
| Games | 59817 |
| Gadgets | 59817 |
| People | 48890 |
| Blogs | 48890 |
+----------------+---------+
3、统计出视频观看数最高的20个视频的所属类别以及类别包含Top20视频的个数
分析:
- 频观看数最高的20个视频的所属类别
---
select
videoid,
category_name,
count(videoid)over(partition by category_name)
from
(
select
videoid,
category_name
from
(
select
videoid,
category,
views
from
video_orc
order by
views desc
limit 20
) t1
lateral view explode(category) temp as category_name
) t2
---
+--------------+----------------+-----------------+
| videoid | category_name | count_window_0 |
+--------------+----------------+-----------------+
| D2kJZOfq7zk | Blogs | 2 |
| -_CSo1gOd48 | Blogs | 2 |
| MNxwAU_xAMk | Comedy | 6 |
| 0XxI-hvPRRA | Comedy | 6 |
| dMH0bHeiRNg | Comedy | 6 |
| _BuRwH59oAo | Comedy | 6 |
| 49IDp76kjPw | Comedy | 6 |
| 5P6UU6m3cqk | Comedy | 6 |
| RUCZJVJ_M8o | Entertainment | 6 |
| ixsZy2425eY | Entertainment | 6 |
| lsO6D1rwrKc | Entertainment | 6 |
| vr3x_RRJdd4 | Entertainment | 6 |
| RB-wUgnyGv0 | Entertainment | 6 |
| 1dmVU08zVpA | Entertainment | 6 |
| 8bbTtPL1jRs | Music | 5 |
| QjA5faZF1A8 | Music | 5 |
| UMf40daefsI | Music | 5 |
| pv5zWaTEVkI | Music | 5 |
| tYnn51C3X_w | Music | 5 |
| -_CSo1gOd48 | People | 2 |
| D2kJZOfq7zk | People | 2 |
| aRNzWyD7C9o | UNA | 1 |
+--------------+----------------+-----------------+
---
select
category_name,
count(videoid)
from
(
select
videoid,
category_name
from
(
select
videoid,
category,
views
from
video_orc
order by
views desc
limit 20
) t1
lateral view explode(category) temp as category_name
) t2
group by
category_name
---
+----------------+------+
| category_name | _c1 |
+----------------+------+
| Blogs | 2 |
| Comedy | 6 |
| Entertainment | 6 |
| Music | 5 |
| People | 2 |
| UNA | 1 |
+----------------+------+
4、统计视频观看数Top50所关联视频的所属类别排序
--观看数前50
select
videoid,
relatedid
from
video_orc
order by
views desc
limit 50
--
select
videoid,
id
from
(
select
videoid,
relatedid,
views
from
video_orc
order by
views desc
limit 50
) t1
lateral view explode(relatedid) temp as id
--
select
t2.id,
video_orc.category
from
(
select
videoid,
id
from
(
select
videoid,
relatedid,
views
from
video_orc
order by
views desc
limit 50
) t1
lateral view explode(relatedid) temp as id
) t2
join video_orc on t2.id=video_orc.videoid
--
select
t3.id,
category_name
from
(
select
t2.id,
video_orc.category
from
(
select
videoid,
id
from
(
select
videoid,
relatedid,
views
from
video_orc
order by
views desc
limit 50
) t1
lateral view explode(relatedid) temp as id
) t2
join video_orc on t2.id=video_orc.videoid
) t3
lateral view explode(t3.category) tmp as category_name
---
select
t4.category_name,
count(t4.id) c
from
(
select
t3.id,
category_name
from
(
select
t2.id,
video_orc.category
from
(
select
videoid,
id
from
(
select
videoid,
relatedid,
views
from
video_orc
order by
views desc
limit 50
) t1
lateral view explode(relatedid) temp as id
) t2
join video_orc on t2.id=video_orc.videoid
) t3
lateral view explode(t3.category) tmp as category_name
)t4
group by
t4.category_name
order by
c desc;
--
select
t5.category_name,
t5.c,
rank()over(order by t5.c desc)
from
(
select
t4.category_name,
count(t4.id) c
from
(
select
t3.id,
category_name
from
(
select
t2.id,
video_orc.category
from
(
select
videoid,
id
from
(
select
videoid,
relatedid,
views
from
video_orc
order by
views desc
limit 50
) t1
lateral view explode(relatedid) temp as id
) t2
join video_orc on t2.id=video_orc.videoid
) t3
lateral view explode(t3.category) tmp as category_name
)t4
group by
t4.category_name
order by
c desc
) t5
5、统计每个类别中的视频热度(视频观看数)Top10,以Music为例
--music
select
videoid,
views,
category_name
from
video_orc
lateral view explode(category) temp as category_name
--
select
videoid,
category_name,
views
from
video_orc
lateral view explode(category) temp as category_name
where
category_name='Music'
order by
views desc
limit 10
;
6、统计每个类别视频观看数Top10
select
videoid,
views,
category_name,
rk
from
(
select
videoid,
views,
category_name,
rank()over(partition by category_name order by views desc) rk
from
video_orc
lateral view explode(category) temp as category_name
) t1
where
t1.rk<=10
7、统计上传视频最多的用户Top10以及他们上传的视频观看次数在前20的视频
约定1:统计上传视频最多的用户Top10和这10名用户上传的视频观看次数在前20的视频
select
uploader,
videos
from
user_orc
order by
videos desc
limit 10;
----
select
v.uploader,
v.videoid,
v.views
from
video_orc as v
join
(
select
uploader,
videos
from
user_orc
order by
videos desc
limit 10
) t1
on t1.uploader=v.uploader
order by
v.views desc
limit 20
+----------------+--------------+----------+
| v.uploader | v.videoid | v.views |
+----------------+--------------+----------+
| expertvillage | -IxHBW0YpZw | 39059 |
| expertvillage | BU-fT5XI_8I | 29975 |
| expertvillage | ADOcaBYbMl0 | 26270 |
| expertvillage | yAqsULIDJFE | 25511 |
| expertvillage | vcm-t0TJXNg | 25366 |
| expertvillage | 0KYGFawp14c | 24659 |
| expertvillage | j4DpuPvMLF4 | 22593 |
| expertvillage | Msu4lZb2oeQ | 18822 |
| expertvillage | ZHZVj44rpjE | 16304 |
| expertvillage | foATQY3wovI | 13576 |
| expertvillage | -UnQ8rcBOQs | 13450 |
| expertvillage | crtNd46CDks | 11639 |
| expertvillage | D1leA0JKHhE | 11553 |
| expertvillage | NJu2oG1Wm98 | 11452 |
| expertvillage | CapbXdyv4j4 | 10915 |
| expertvillage | epr5erraEp4 | 10817 |
| expertvillage | IyQoDgaLM7U | 10597 |
| expertvillage | tbZibBnusLQ | 10402 |
| expertvillage | _GnCHodc7mk | 9422 |
| expertvillage | hvEYlSlRitU | 7123 |
+----------------+--------------+----------+
约定2:取Top10中每个人上传的视频的观看次数前20
select
uploader,
videos
from
user_orc
order by
videos desc
limit 10;
---
select
v.uploader,
v.videoid,
v.views,
rank()over(partition by v.uploader)
from
video_orc as v
join
(
select
uploader,
videos
from
user_orc
order by
videos desc
limit 10
) t1
on t1.uploader=v.uploader
---
select
*
from
(
select
v.uploader,
v.videoid,
v.views,
rank()over(partition by v.uploader order by v.views) rk
from
video_orc as v
join
(
select
uploader,
videos
from
user_orc
order by
videos desc
limit 10
) t1
on t1.uploader=v.uploader
) t2
where
t2.rk<=20
约定3:Top10用户上传的所有视频,有哪些视频是在视频观看次数前20的视频。
---
select
videoid,
uploader,
views
from
video_orc
order by
views desc
limit 20
---
select
uploader,
videos
from
user_orc
order by
videos desc
limit 10
----
select
*
from
(
select
videoid,
uploader,
views
from
video_orc
order by
views desc
limit 20
) t1
join
(
select
uploader,
videos
from
user_orc
order by
videos desc
limit 10
) t2
on t1.uploader=t2.uploader