一、执行计划

  • explain,查看基本信息
  • explain dependency,查看依赖信息
  • explain authorization,查看权限信息

1.explain 用法

-- explain + 执行语句
例如:
explain
select 
	s2.s_id ,avg(s2.s_score) as avg_score
from edu.student s
join edu.score s2 on s.s_id  = s2.s_id 
where s.s_class  = 3
group by s2.s_id 
having avg(s2.s_score) >= 60
order by avg_score desc
limit 3;

-- MySQL: from -> on -> join -> where -> group by -> having -> window func -> select -> distinct -> union -> order by -> limit

-- Hive:from -> where -> on -> join -> group by -> 聚合函数 -> having -> select -> order by -> limit
STAGE DEPENDENCIES:
   Stage-6 is a root stage
   Stage-2 depends on stages: Stage-6
   Stage-3 depends on stages: Stage-2
   Stage-0 depends on stages: Stage-3STAGE PLANS:
   Stage: Stage-6
     Map Reduce Local Work
       Alias -> Map Local Tables:
         $hdt$_0:s -- student表
           Fetch Operator
             limit: -1 -- 不限制行数, 不是limit 3
       Alias -> Map Local Operator Tree:
         $hdt$_0:s 
           TableScan -- 表扫描
             alias: s -- student表
             Statistics: Num rows: 58 Data size: 470 Basic stats: COMPLETE Column stats: NONE
             Filter Operator -- where筛选条件
               predicate: ((s_class = 3) and s_id is not null) (type: boolean)
               Statistics: Num rows: 29 Data size: 235 Basic stats: COMPLETE Column stats: NONE
               Select Operator -- on所需字段: s.s_id
                 expressions: s_id (type: int)
                 outputColumnNames: _col0
                 Statistics: Num rows: 29 Data size: 235 Basic stats: COMPLETE Column stats: NONE
                 HashTable Sink Operator
                   keys:
                     0 _col0 (type: int)
                     1 _col0 (type: int)  Stage: Stage-2
     Map Reduce
       Map Operator Tree:
           TableScan -- 表扫描
             alias: sc -- score表
             Statistics: Num rows: 66 Data size: 531 Basic stats: COMPLETE Column stats: NONE
             Filter Operator
               -- on筛选条件
               predicate: s_id is not null (type: boolean)
               Statistics: Num rows: 66 Data size: 531 Basic stats: COMPLETE Column stats: NONE
               Select Operator
                   -- on所需字段: sc.s_id
                   -- group by + avg所需字段: sc.s_id, sc.score
                 expressions: s_id (type: int), s_score (type: int)
                 outputColumnNames: _col0, _col1
                 Statistics: Num rows: 66 Data size: 531 Basic stats: COMPLETE Column stats: NONE
                 Map Join Operator
                   condition map:
                           -- inner join内连接操作
                        Inner Join 0 to 1
                   keys:
                     0 _col0 (type: int)
                     1 _col0 (type: int)
                   outputColumnNames: _col2, _col3
                   Statistics: Num rows: 72 Data size: 584 Basic stats: COMPLETE Column stats: NONE
                   Group By Operator -- group by 分组操作
                     aggregations: avg(_col3) -- avg聚合操作
                     keys: _col2 (type: int)
                     mode: hash
                     outputColumnNames: _col0, _col1
                     Statistics: Num rows: 72 Data size: 584 Basic stats: COMPLETE Column stats: NONE
                     Reduce Output Operator
                       key expressions: _col0 (type: int)
                       sort order: +  -- group by 升序操作, 非order by 降序
                       Map-reduce partition columns: _col0 (type: int)
                       Statistics: Num rows: 72 Data size: 584 Basic stats: COMPLETE Column stats: NONE
                       value expressions: _col1 (type: struct<count:bigint,sum:double,input:int>)
       Local Work:
         Map Reduce Local Work
       Reduce Operator Tree:
         Group By Operator
           aggregations: avg(VALUE._col0)
           keys: KEY._col0 (type: int)
           mode: mergepartial
           outputColumnNames: _col0, _col1
           Statistics: Num rows: 36 Data size: 292 Basic stats: COMPLETE Column stats: NONE
           Filter Operator -- having筛选操作
             predicate: (_col1 > 60.0) (type: boolean)
             Statistics: Num rows: 12 Data size: 97 Basic stats: COMPLETE Column stats: NONE
             File Output Operator
               compressed: false  -- 不压缩
               table:
                   input format: org.apache.hadoop.mapred.SequenceFileInputFormat
                   output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
                   serde: org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe  Stage: Stage-3
     Map Reduce
       Map Operator Tree:
           TableScan
             Reduce Output Operator
               key expressions: _col1 (type: double)
               sort order: -  -- order by 降序操作
               Statistics: Num rows: 12 Data size: 97 Basic stats: COMPLETE Column stats: NONE
               TopN Hash Memory Usage: 0.1
               value expressions: _col0 (type: int)
       Reduce Operator Tree:
         Select Operator
           expressions: VALUE._col0 (type: int), KEY.reducesinkkey0 (type: double)
           outputColumnNames: _col0, _col1
           Statistics: Num rows: 12 Data size: 97 Basic stats: COMPLETE Column stats: NONE
           Limit -- limit操作
             Number of rows: 3
             Statistics: Num rows: 3 Data size: 24 Basic stats: COMPLETE Column stats: NONE
             File Output Operator
               compressed: false
               Statistics: Num rows: 3 Data size: 24 Basic stats: COMPLETE Column stats: NONE
               table:
                   input format: org.apache.hadoop.mapred.SequenceFileInputFormat
                   output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
                   serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe  Stage: Stage-0
     Fetch Operator
       limit: 3
       Processor Tree:
         ListSink

2.explain 场景

案例一:join会过滤null吗?

-- join会过滤null

案例二:group by会排序吗?

-- group by 会排序且只能升序

案例三:哪条SQL执行效率高?

-- 以下两条SQL语句输出结果一致,但哪条执行效率高呢?
-- 以下两段SQL语句除了表别名和字段别名不一致, 其他执行计划完全一致
select s.s_id, s.s_name, sc.c_id, sc.s_score 
from student s
join score sc on sc.s_id=s.s_id 
where s.s_class=3;

select s.s_id, s.s_name, sc.c_id, sc.s_score 
from (select * from student where s_class=3) s
join score sc on sc.s_id=s.s_id;

两条语句的执行计划只有表别名与字段别名有区别,所以以上两段sql语句运行效率相同

3.explain dependency 用法

用于描述数据来源,输出json格式,包含以下两部分:

  • input_partitions:数据来源的表分区
  • input_tables:数据来源的表
explain dependency select * from edu.student
{"input_tables":[{"tablename":"edu@student","tabletype":"MANAGED_TABLE"}],"input_partitions":[]}
explain dependency select * from sm.sm_order_detail
{"input_tables":[{"tablename":"sm@sm_order_detail","tabletype":"MANAGED_TABLE"}],"input_partitions":[{"partitionName":"sm@sm_order_detail@p_year=2017/p_month=01"},{"partitionName":"sm@sm_order_detail@p_year=2017/p_month=02"},{"partitionName":"sm@sm_order_detail@p_year=2017/p_month=03"},{"partitionName":"sm@sm_order_detail@p_year=2017/p_month=04"},{"partitionName":"sm@sm_order_detail@p_year=2017/p_month=05"},{"partitionName":"sm@sm_order_detail@p_year=2017/p_month=06"},{"partitionName":"sm@sm_order_detail@p_year=2017/p_month=07"},{"partitionName":"sm@sm_order_detail@p_year=2017/p_month=08"},{"partitionName":"sm@sm_order_detail@p_year=2017/p_month=09"},{"partitionName":"sm@sm_order_detail@p_year=2017/p_month=10"},{"partitionName":"sm@sm_order_detail@p_year=2017/p_month=11"},{"partitionName":"sm@sm_order_detail@p_year=2017/p_month=12"},{"partitionName":"sm@sm_order_detail@p_year=2018/p_month=01"},{"partitionName":"sm@sm_order_detail@p_year=2018/p_month=02"},{"partitionName":"sm@sm_order_detail@p_year=2018/p_month=03"},{"partitionName":"sm@sm_order_detail@p_year=2018/p_month=04"},{"partitionName":"sm@sm_order_detail@p_year=2018/p_month=05"},{"partitionName":"sm@sm_order_detail@p_year=2018/p_month=06"},{"partitionName":"sm@sm_order_detail@p_year=2018/p_month=07"},{"partitionName":"sm@sm_order_detail@p_year=2018/p_month=08"},{"partitionName":"sm@sm_order_detail@p_year=2018/p_month=09"},{"partitionName":"sm@sm_order_detail@p_year=2018/p_month=10"},{"partitionName":"sm@sm_order_detail@p_year=2018/p_month=11"},{"partitionName":"sm@sm_order_detail@p_year=2018/p_month=12"},{"partitionName":"sm@sm_order_detail@p_year=2019/p_month=01"},{"partitionName":"sm@sm_order_detail@p_year=2019/p_month=02"},{"partitionName":"sm@sm_order_detail@p_year=2019/p_month=03"},{"partitionName":"sm@sm_order_detail@p_year=2019/p_month=04"},{"partitionName":"sm@sm_order_detail@p_year=2019/p_month=05"},{"partitionName":"sm@sm_order_detail@p_year=2019/p_month=06"},{"partitionName":"sm@sm_order_detail@p_year=2019/p_month=07"},{"partitionName":"sm@sm_order_detail@p_year=2019/p_month=08"},{"partitionName":"sm@sm_order_detail@p_year=2019/p_month=09"},{"partitionName":"sm@sm_order_detail@p_year=2019/p_month=10"},{"partitionName":"sm@sm_order_detail@p_year=2019/p_month=11"},{"partitionName":"sm@sm_order_detail@p_year=2019/p_month=12"},{"partitionName":"sm@sm_order_detail@p_year=2020/p_month=01"},{"partitionName":"sm@sm_order_detail@p_year=2020/p_month=02"},{"partitionName":"sm@sm_order_detail@p_year=2020/p_month=03"},{"partitionName":"sm@sm_order_detail@p_year=2020/p_month=04"},{"partitionName":"sm@sm_order_detail@p_year=2020/p_month=05"},{"partitionName":"sm@sm_order_detail@p_year=2020/p_month=06"},{"partitionName":"sm@sm_order_detail@p_year=2020/p_month=07"},{"partitionName":"sm@sm_order_detail@p_year=2020/p_month=08"},{"partitionName":"sm@sm_order_detail@p_year=2020/p_month=09"},{"partitionName":"sm@sm_order_detail@p_year=2020/p_month=10"},{"partitionName":"sm@sm_order_detail@p_year=2020/p_month=11"},{"partitionName":"sm@sm_order_detail@p_year=2020/p_month=12"},{"partitionName":"sm@sm_order_detail@p_year=2021/p_month=01"}]}

4.explain dependency 场景

案例一:识别代码是否等价?

代码1

explain dependency
select * 
from sm.sm_order_info soi 
join sm.sm_region_info sri on sri.region_id=soi.region_id and soi.p_year='2020'


{"input_tables":[{"tablename":"sm@sm_order_info","tabletype":"MANAGED_TABLE"},{"tablename":"sm@sm_region_info","tabletype":"MANAGED_TABLE"}],"input_partitions":[{"partitionName":"sm@sm_order_info@p_year=2020/p_month=01"},{"partitionName":"sm@sm_order_info@p_year=2020/p_month=02"},{"partitionName":"sm@sm_order_info@p_year=2020/p_month=03"},{"partitionName":"sm@sm_order_info@p_year=2020/p_month=04"},{"partitionName":"sm@sm_order_info@p_year=2020/p_month=05"},{"partitionName":"sm@sm_order_info@p_year=2020/p_month=06"},{"partitionName":"sm@sm_order_info@p_year=2020/p_month=07"},{"partitionName":"sm@sm_order_info@p_year=2020/p_month=08"},{"partitionName":"sm@sm_order_info@p_year=2020/p_month=09"},{"partitionName":"sm@sm_order_info@p_year=2020/p_month=10"},{"partitionName":"sm@sm_order_info@p_year=2020/p_month=11"},{"partitionName":"sm@sm_order_info@p_year=2020/p_month=12"}]}

代码2

explain dependency
select * 
from sm.sm_order_info soi 
join sm.sm_region_info sri on sri.region_id=soi.region_id
where soi.p_year='2020'

{"input_tables":[{"tablename":"sm@sm_order_info","tabletype":"MANAGED_TABLE"},{"tablename":"sm@sm_region_info","tabletype":"MANAGED_TABLE"}],"input_partitions":[{"partitionName":"sm@sm_order_info@p_year=2020/p_month=01"},{"partitionName":"sm@sm_order_info@p_year=2020/p_month=02"},{"partitionName":"sm@sm_order_info@p_year=2020/p_month=03"},{"partitionName":"sm@sm_order_info@p_year=2020/p_month=04"},{"partitionName":"sm@sm_order_info@p_year=2020/p_month=05"},{"partitionName":"sm@sm_order_info@p_year=2020/p_month=06"},{"partitionName":"sm@sm_order_info@p_year=2020/p_month=07"},{"partitionName":"sm@sm_order_info@p_year=2020/p_month=08"},{"partitionName":"sm@sm_order_info@p_year=2020/p_month=09"},{"partitionName":"sm@sm_order_info@p_year=2020/p_month=10"},{"partitionName":"sm@sm_order_info@p_year=2020/p_month=11"},{"partitionName":"sm@sm_order_info@p_year=2020/p_month=12"}]}

5.explain authorization 用法

explain authorization
select s_id, s_name from edu.student


INPUTS: 
  edu@student
OUTPUTS: 
  hdfs://localhost:8020/tmp/hive/root/c7dcee71-89d7-457e-ad6f-16d1419b032a/hive_2022-10-13_20-20-21_296_5224225313108621685-3/-mr-10001
CURRENT_USER: 
  root
OPERATION: 
  QUERY

二、Hive性能调优

1.SQL语句优化

1.union all

-- 创建数据库
create database test;
-- 创建分区表,最高分插入到max分区,最低分插入到min分区
create table test.course_max_min (
	c_id int comment '课程id',
    s_score int comment '分数'
) comment '每个课程的最高分和最低分'
partitioned by (tp string comment '分数类型');

-- 开启动态分区,分两次运行
set hive.exec.dynamic.partition=true;
set hive.exec.dynamic.partition.mode=nonstrict;

-- 优化前, 同一张表分组两次(运行前开启动态分区)
-- Total jobs = 5
insert into table test.course_max_min partition(tp)
select 
sc.c_id, max(sc.s_score) as s_score, 'max' as tp
from edu.score sc
group by sc.c_id
union all 
select 
sc.c_id, min(sc.s_score) as s_score, 'min' as tp
from edu.score sc
group by sc.c_id;
-- 优化后, 同一张表分组一次(运行前开启动态分区)
-- Total jobs = 1
from edu.score sc
insert into table test.course_max_min partition(tp)
select 
sc.c_id, max(sc.s_score) as s_score, 'max' as tp
group by sc.c_id
insert into table test.course_max_min partition(tp)
select 
sc.c_id, max(sc.s_score) as s_score, 'min' as tp
group by sc.c_id;

 2.distinct

-- 优化前
select count(1)
from(
  select s_id
  from edu.student
  group by s_id
) tb;

-- 优化后
select count(distinct s_id)
from edu.student;

3.left semi join

  • 通过退货表的订单编号筛选订单详情表的数据
-- 通过退货表的订单编号筛选订单详情表的数据
-- 优化前, in子查询
select * 
from sm.sm_order_detail_np a
where a.order_id in (
	select b.order_id 
	from sm.sm_return_info b);

-- 优化前, exists子查询
select * 
from sm.sm_order_detail_np a
where exists(
	select b.*
	from sm.sm_return_info b
	where a.order_id = b.order_id)
	
-- 优化后, left semi join左半连接(执行效率更佳)
-- select子句只能选择左表的字段
-- on子句的过滤条件只能是等于号
-- 右表的过滤条件只能在on子句设置
select a.* 
from sm.sm_order_detail_np a 
left semi join sm.sm_return_info b on b.order_id = a.order_id;

 2.并行执行优化

-- 启动并行执行
set hive.exec.parallel=true;
-- 设置并行执行的线程数量
set hive.exec.parallel.thread.number=16;