文章目录
- Hive 中的 json_tuple 和 parse_url_tuple 函数
- json_tuple
- parse_url_tuple
- 开窗函数的使用
- hiveserver2(HS2) 和 beeline
- 复杂数据类型
- 分区表
Hive 中的 json_tuple 和 parse_url_tuple 函数
json_tuple
- json_tuple 函数用来解析 json 串
# 示例json串
{"movie":"2022","rate":"5","time":"956716207","userid":"6040"}
# 使用函数解析
select json_tuple('{"movie":"2022","rate":"5","time":"956716207","userid":"6040"}',"movie", "rate", "time", "userid")
as (movieid, rate, time, userid);
# 输出
movieid rate time userid
2022 5 956716207 6040
parse_url_tuple
- parse_url_tuple 函数用来解析 URL
# 示例URL
http://www.aaa.com/bbb/cc.html?sessionid=123456&a=b&c=d
# 使用函数解析
select parse_url_tuple("http://www.aaa.com/bbb/cc.html?sessionid=123456&a=b&c=d", "HOST", "PATH", "QUERY", "QUERY:sessionid", "QUERY:a", "QUERY:c")
as (HOST, PATH, QUERY, SESSIONID, A, C);
# 输出
host path query sessionid a c
www.aaa.com /bbb/cc.html sessionid=123456&a=b&c=d 123456 b d
开窗函数的使用
# 数据样例
hive_rownumber.id hive_rownumber.age hive_rownumber.name hive_rownumber.gender
1 18 zhangsan M
2 19 lisi M
3 22 wangwu F
4 16 zhaoliu F
5 30 tianqi M
6 26 wangba F
# 根据性别类型,统计年龄最大的两条数据(分组 TOPN)
select id, age, name, gender, r
from
(
select id, age, name, gender,
ROW_NUMBER() over(PARTITION BY gender order by age desc) r
from hive_rownumber
) t
where t.r <= 2;
# 输出
id age name gender r
6 26 wangba F 1
3 22 wangwu F 2
5 30 tianqi M 1
2 19 lisi M 2
hiveserver2(HS2) 和 beeline
- beeline是hive官方推荐的一个命令行客户端,需要先启动hiveserver2(HS2)服务
# 进入hive的bin目录下,如果配置了环境变量可以直接执行
cd ${HIVE_HOME}/bin
# 启动 hiveserver2
# 可以使用 nohup 后台运行该服务
nohup ./hiveserver2 &
# 启动beeline
# -u 指定数据库地址,默认端口 10000
# -n 指定当前用户名
./beeline -u jdbc:hive2://hadoop001:10000/default -n hadoop
复杂数据类型
- arrays:数组,一个数组中装的元素类型是一样的
# 示例数据,字段间分隔符为\t
zhangsan beijing,shanghai,tianjin,hangzhou
lisi changchun,chengdu,wuhan,beijing
# 创建表,注意如何指定数组类型及其分隔符
create table hive_array(
name string,
work_locations array<string>
) row format
delimited fields terminated by '\t'
collection items terminated by ',';
# 查询数据
hive_array.name hive_array.work_locations
zhangsan ["beijing","shanghai","tianjin","hangzhou"]
lisi ["changchun","chengdu","wuhan","beijing"]
# 可以在 SELECT 后查询的
# 查询某个 index 的数据,index 从 0 开始
work_locations[index]
# 查询数组大小
size(work_locations)
# 可以在 WHERE 后筛选的
# 数组是否包含某个元素
array_contains(work_locations, "hangzhou");
- maps:kv对,key的类型是一样的
# 示例数据
1,zhangsan,father:xiaoming#mother:xiaohuang#brother:xiaoxu,28
2,lisi,father:mayun#mother:huangyi#brother:guanyu,22
3,wangwu,father:wangjianlin#mother:ruhua#sister:jingtian,29
4,mayun,father:mayongzhen#mother:angelbaby,26
# 创建表,注意如何指定 map 类型及其分隔符
create table hive_map(
id int,
name string,
members map<string, string>,
age int
) row format
delimited fields terminated by ','
collection items terminated by '#'
map keys terminated by ':';
# 查询数据
hive_map.id hive_map.name hive_map.members hive_map.age
1 zhangsan {"father":"xiaoming","mother":"xiaohuang","brother":"xiaoxu"} 28
2 lisi {"father":"mayun","mother":"huangyi","brother":"guanyu"} 22
3 wangwu {"father":"wangjianlin","mother":"ruhua","sister":"jingtian"} 29
4 mayun {"father":"mayongzhen","mother":"angelbaby"} 26
# 可以在 SELECT 后查询的
# 查询某个 key 的数据
members["father"]
# 返回 key 的数组
map_keys(members)
# 返回 value 的数组
map_values(members)
# 返回 map 中的 kv 对的个数
size(members)
- structs:可以存放任意类型
# 示例数据
192.168.1.1#zhangsan:40
192.168.1.2#lisi:50
192.168.1.3#wangwu:60
192.168.1.4#zhaoliu:70
# 创建表
create table hive_struct(
ip string,
userinfo struct<name:string, age:int>)
row format
delimited fields terminated by '#'
collection items terminated by ':';
# 查询数据
hive_struct.ip hive_struct.userinfo
192.168.1.1 {"name":"zhangsan","age":40}
192.168.1.2 {"name":"lisi","age":50}
192.168.1.3 {"name":"wangwu","age":60}
192.168.1.4 {"name":"zhaoliu","age":70}
# 可以在 SELECT 后查询的
# 直接.即可
userinfo.name
分区表
# 示例数据
10703007267488 2014-05-01 06:01:12.334+01
10101043505096 2014-05-01 07:28:12.342+01
10103043509747 2014-05-01 07:50:12.33+01
10103043501575 2014-05-01 09:27:12.33+01
10104043514061 2014-05-01 09:03:12.324+01
# 指定分区字段
create table order_partition(
order_no string,
order_time string
)
partitioned by (event_month string)
row format delimited fields terminated by '\t';
# load 数据,指定分区
load data local inpath 'xxx' into table order_partition partition (event_month='2014-05');
# 查看数据
# 真正表的字段是不包括分区字段的,分区字段只是HDFS上文件夹的名称
order_partition.order_no order_partition.order_time order_partition.event_month
10703007267488 2014-05-01 06:01:12.334+01 2014-05
10101043505096 2014-05-01 07:28:12.342+01 2014-05
10103043509747 2014-05-01 07:50:12.33+01 2014-05
10103043501575 2014-05-01 09:27:12.33+01 2014-05
10104043514061 2014-05-01 09:03:12.324+01 2014-05
# 查看 HDFS 上的存储
# 存储在 /user/hive/warehouse/order_partition/event_month=2014-05 目录下
# WHERE 子查询可以跟上 event_month=‘xxx’ 查询指定分区的数据
# 生产上的流程一般是数据经过清洗后存放在 HDFS 的目录上,然后将目录中的数据加载到分区表中
# 一级分区
# 将文件放到 HDFS 的对应目录
hdfs dfs -mkdir /user/hive/warehouse/order_partition/event_month=2014-06
hdfs dfs -put xxx /user/hive/warehouse/order_partition/event_month=2014-06
# 加载所有分区
msck repair table order_partition
# 多级分区,注意分区字段多了一个 step
create table order_multi_partition(
order_no string,
order_time string
)
partitioned by (event_month string, step string)
row format delimited fields terminated by '\t';
# 指定 step
load data local inpath 'xxx' into table order_partition partition (event_month='2014-05', step=1);
# HDFS 上的存储位置为 /user/hive/warehouse/order_multi_partition/event_month=2014-05/step=1
# 查询数据
order_multi_partition.order_no order_multi_partition.order_time order_multi_partition.event_month order_multi_partition.step
10703007267488 2014-05-01 06:01:12.334+01 2014-05 1
10101043505096 2014-05-01 07:28:12.342+01 2014-05 1
10103043509747 2014-05-01 07:50:12.33+01 2014-05 1
10103043501575 2014-05-01 09:27:12.33+01 2014-05 1
10104043514061 2014-05-01 09:03:12.324+01 2014-05 1
# 将同一部门的人分到同一个分区
# 创建表
create table emp_partition(
empno int,
ename string,
job string,
mgr int,
hiredate string,
sal double,
comm double
) partitioned by (deptno int)
row format
delimited fields terminated by '\t';
# 静态分区:手动指定每个分区然后加载数据,如果有很多分区,这样一个个的加载非常麻烦
insert into table emp_partition partition(deptno=10)
select empno, ename, job, mgr, hiredate, sal, comm from emp
where deptno=10;
# 动态分区:自动按分区分到不同分区
# 需要设置 set hive.exec.dynamic.partition.mode=nonstrict,默认为strict
# 需要将分区字段写到 SELECT 后的最后一个字段
insert into table emp_dynamic_partition partition(deptno)
select empno, ename, job, mgr, hiredate, sal, comm, deptno from emp;