Contents
- DDL Operations
  - Database Operations
  - Table Operations
    - Creating Tables
    - Modifying Tables
    - Other Commands
- DML Operations
  - Loading File Data into Tables
  - Inserting Query Results into Tables
  - Writing Results to the File System
DDL Operations
Database Operations
1. View databases
0: jdbc:hive2://CentOS:10000> show databases;
+----------------+--+
| database_name |
+----------------+--+
| default |
| test |
+----------------+--+
2 rows selected (0.441 seconds)
2. Switch databases
0: jdbc:hive2://CentOS:10000> use test;
No rows affected (0.03 seconds)
3. Create a database
-- Syntax
CREATE (DATABASE|SCHEMA) [IF NOT EXISTS] database_name --DATABASE and SCHEMA are equivalent
[COMMENT database_comment] --database comment
[LOCATION hdfs_path] --location on HDFS
[WITH DBPROPERTIES (property_name=property_value, ...)]; --additional properties
# Example
0: jdbc:hive2://CentOS:10000> CREATE DATABASE IF NOT EXISTS test COMMENT 'database for test' LOCATION 'hdfs:///userdatabase/test.db' WITH DBPROPERTIES ('creator'='jiangzz');
No rows affected (0.186 seconds)
4. View database details
-- Syntax
DESC DATABASE [EXTENDED] db_name; --EXTENDED controls whether the additional DBPROPERTIES are shown
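For instance, against the test database created above (a minimal illustration; output omitted):
0: jdbc:hive2://CentOS:10000> DESC DATABASE EXTENDED test;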
5. Drop a database
-- Syntax
DROP (DATABASE|SCHEMA) [IF EXISTS] database_name [RESTRICT|CASCADE];
0: jdbc:hive2://CentOS:10000> drop schema Dora cascade;
No rows affected (0.653 seconds)
The default behavior is RESTRICT: if the database still contains tables, the drop fails. To drop a database together with its tables, use CASCADE.
6. View the current database
0: jdbc:hive2://CentOS:10000> select current_database();
+------------+--+
| _c0 |
+------------+--+
| hive_test |
+------------+--+
1 row selected (0.041 seconds)
Table Operations
Creating Tables
- Managed tables: internal tables are also known as MANAGED_TABLE. By default they are stored under /user/hive/warehouse, though a different path can be given with LOCATION. When the table is dropped, both its data and its metadata are deleted. (A sample data file matching the delimiters is sketched after the DDL below.)
create table if not exists t_user(
id int,
name string,
sex boolean,
age int,
salary double,
hobbies array<string>,
card map<string,string>,
address struct<country:string,city:string>
)
row format delimited
fields terminated by ','
collection items terminated by '|'
map keys terminated by '>'
lines terminated by '\n'
stored as textfile;
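For reference, a text line matching the delimiters above might look like the following; the sample row and the file path /root/t_user are hypothetical, not taken from the original:
1,zhangsan,true,28,15000.0,TV|Game,001>insurance|002>credit,china|beijing
-- assumes the hypothetical file /root/t_user exists on the HiveServer2 host
0: jdbc:hive2://CentOS:10000> load data local inpath '/root/t_user' overwrite into table t_user;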
- External tables: known as EXTERNAL_TABLE. You can specify the directory yourself with LOCATION when creating the table. When the table is dropped, only the metadata is deleted; the table data is left in place. (A sample log line matching the regex is sketched after the DDL below.)
create external table if not exists t_access(
ip string,
app varchar(32),
service string,
last_time timestamp
)
ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.RegexSerDe'
WITH SERDEPROPERTIES (
"input.regex"="^(.*)\\s(.*)\\s(.*)\\s(.*\\s.*)"
)
LOCATION 'hdfs:///hive/t_access';
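The regex captures four whitespace-separated groups that map to ip, app, service, and last_time (the last group keeps the space inside the timestamp). A hypothetical log line that would parse into those columns:
192.168.0.1 chrome login 2019-12-20 10:30:00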
- Partitioned tables: a Hive table maps to a directory on HDFS, and by default a query scans the entire table, which is expensive in both time and resources. Partitioning creates subdirectories under the table directory, and data is stored in those subdirectories per partition. If the query's WHERE clause contains a partition condition, only that partition is read instead of the whole table directory, so a sensible partition design can greatly improve query speed. In Hive a partitioned table is created with the PARTITIONED BY clause; a table may have one or more partition columns, and a separate data directory is created for each distinct combination of partition-column values. (A partition-pruning query is sketched after the load examples below.)
CREATE EXTERNAL TABLE t_employee(
id INT,
name STRING,
job STRING,
manager INT,
hiredate TIMESTAMP,
salary DECIMAL(7,2)
)
PARTITIONED BY (deptno INT)
ROW FORMAT DELIMITED FIELDS TERMINATED BY "\t"
LOCATION '/hive/t_employee';
7369 SMITH CLERK 7902 1980-12-17 00:00:00 800.00
7499 ALLEN SALESMAN 7698 1981-02-20 00:00:00 1600.00 300.00
7521 WARD SALESMAN 7698 1981-02-22 00:00:00 1250.00 500.00
7566 JONES MANAGER 7839 1981-04-02 00:00:00 2975.00
7654 MARTIN SALESMAN 7698 1981-09-28 00:00:00 1250.00 1400.00
7698 BLAKE MANAGER 7839 1981-05-01 00:00:00 2850.00
7782 CLARK MANAGER 7839 1981-06-09 00:00:00 2450.00
7788 SCOTT ANALYST 7566 1987-04-19 00:00:00 1500.00
7839 KING PRESIDENT 1981-11-17 00:00:00 5000.00
7844 TURNER SALESMAN 7698 1981-09-08 00:00:00 1500.00 0.00
7876 ADAMS CLERK 7788 1987-05-23 00:00:00 1100.00
7900 JAMES CLERK 7698 1981-12-03 00:00:00 950.00
7902 FORD ANALYST 7566 1981-12-03 00:00:00 3000.00
7934 MILLER CLERK 7782 1982-01-23 00:00:00 1300.00
0: jdbc:hive2://CentOS:10000> load data local inpath '/root/t_employee' overwrite into table t_employee partition(deptno='10');
INFO : Loading data to table test.t_employee partition (deptno=10) from file:/root/t_employee
INFO : Partition test.t_employee{deptno=10} stats: [numFiles=1, numRows=0, totalSize=747, rawDataSize=0]
No rows affected (0.539 seconds)
0: jdbc:hive2://CentOS:10000> load data local inpath '/root/t_employee' overwrite into table t_employee partition(deptno='20');
INFO : Loading data to table test.t_employee partition (deptno=20) from file:/root/t_employee
INFO : Partition test.t_employee{deptno=20} stats: [numFiles=1, numRows=0, totalSize=747, rawDataSize=0]
No rows affected (0.419 seconds)
0: jdbc:hive2://CentOS:10000>
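With the partitions loaded, a query that filters on the partition column only reads that partition's directory (a minimal illustration of partition pruning, not from the original session):
0: jdbc:hive2://CentOS:10000> select name,job,salary from t_employee where deptno=10;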
- Bucketed tables: partitioning gives coarse-grained isolation by splitting files into per-partition directories, whereas bucketing hashes a column value to decide which bucket each row belongs to and then sorts the data within each bucket. (A sampling query that exploits buckets is sketched after the INSERT below.)
CREATE EXTERNAL TABLE t_employee_bucket(
id INT,
name STRING,
job STRING,
manager INT,
hiredate TIMESTAMP,
salary DECIMAL(7,2),
deptno INT)
CLUSTERED BY(job) SORTED BY(salary ASC) INTO 4 BUCKETS
ROW FORMAT DELIMITED FIELDS TERMINATED BY "\t"
LOCATION '/hive/employee_bucket';
0: jdbc:hive2://CentOS:10000> set hive.enforce.bucketing = true;
0: jdbc:hive2://CentOS:10000> INSERT INTO TABLE t_employee_bucket SELECT * FROM t_employee;
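One practical payoff of bucketing is sampling: TABLESAMPLE can read a single bucket instead of the whole table. A sketch (not from the original):
0: jdbc:hive2://CentOS:10000> SELECT * FROM t_employee_bucket TABLESAMPLE(BUCKET 1 OUT OF 4 ON job);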
- Temporary tables: visible only to the current session. Their data is stored in the user's scratch directory and is removed when the session ends. If a temporary table has the same name as a permanent table, any reference to that name resolves to the temporary table, not the permanent one. Temporary tables also have two restrictions: partition columns are not supported, and indexes cannot be created.
CREATE TEMPORARY TABLE if not exists emp_temp(
id INT,
name STRING,
job STRING,
manager INT,
hiredate TIMESTAMP,
salary DECIMAL(7,2),
deptno INT
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY "\t"
LOCATION '/hive/emp_temp';
- CTAS (CREATE TABLE ... AS SELECT): the new table's schema is derived from the query's result columns and the query output is copied into it; the source table's own definition (partitioning, bucketing, etc.) is not copied.
0: jdbc:hive2://CentOS:10000> create TEMPORARY TABLE t_employee_copy1 as select * from t_employee where deptno=10;
- Copying a table definition (LIKE): copies only the table structure, not the data.
0: jdbc:hive2://CentOS:10000> CREATE TEMPORARY EXTERNAL TABLE t_empoyee_copy2 LIKE t_employee_bucket location '/hive/t_empoyee_copy2';
No rows affected (0.128 seconds)
0: jdbc:hive2://CentOS:10000> desc t_empoyee_copy2;
+-----------+---------------+----------+--+
| col_name | data_type | comment |
+-----------+---------------+----------+--+
| id | int | |
| name | string | |
| job | string | |
| manager | int | |
| hiredate | timestamp | |
| salary | decimal(7,2) | |
| deptno | int | |
+-----------+---------------+----------+--+
7 rows selected (0.038 seconds)
Modifying Tables
- Rename a table
0: jdbc:hive2://CentOS:10000> ALTER TABLE t_user RENAME TO t_u;
- Modify columns (change type, reorder, add)
0: jdbc:hive2://CentOS:10000> ALTER TABLE t_employee CHANGE id eid INT;--rename a column & change its type
0: jdbc:hive2://CentOS:10000> ALTER TABLE t_employee CHANGE id id decimal(7,2) AFTER name;--change column order
0: jdbc:hive2://CentOS:10000> ALTER TABLE t_employee ADD COLUMNS (address STRING);
- Truncate a table
0: jdbc:hive2://CentOS:10000> truncate table t_employee partition(deptno=10);
- Drop a table
0: jdbc:hive2://CentOS:10000> drop table t_employee PURGE;
PURGE means the data is deleted outright rather than moved to the trash; without PURGE the data only goes to the trash when HDFS trash is enabled.
Other Commands
Describe
- Describe a database
-- Syntax
DESCRIBE|DESC DATABASE [EXTENDED] db_name;
0: jdbc:hive2://CentOS:10000> desc database hive_test;
+------------+--------------------+------------------------------------------------------+-------------+-------------+-------------+--+
| db_name | comment | location | owner_name | owner_type | parameters |
+------------+--------------------+------------------------------------------------------+-------------+-------------+-------------+--+
| hive_test | database for test | hdfs://CentOS:9000/user/hive/warehouse/hive_test.db | root | USER | |
+------------+--------------------+------------------------------------------------------+-------------+-------------+-------------+--+
1 row selected (0.039 seconds)
- Describe a table
-- Syntax
DESCRIBE|DESC [EXTENDED|FORMATTED] table_name
0: jdbc:hive2://CentOS:10000> desc t_user;
+-----------+-------------------------------------+----------+--+
| col_name | data_type | comment |
+-----------+-------------------------------------+----------+--+
| id | int | |
| name | string | |
| sex | boolean | |
| age | int | |
| salary | double | |
| hobbies | array<string> | |
| card | map<string,string> | |
| address | struct<country:string,city:string> | |
+-----------+-------------------------------------+----------+--+
8 rows selected (0.06 seconds)
Show
- List databases
-- Syntax
SHOW (DATABASES|SCHEMAS) [LIKE 'identifier_with_wildcards'];
0: jdbc:hive2://CentOS:10000> show schemas like '*';
+----------------+--+
| database_name |
+----------------+--+
| default |
| hive_test |
| test |
+----------------+--+
3 rows selected (0.03 seconds)
- List tables
-- Syntax
SHOW TABLES [IN database_name] ['identifier_with_wildcards'];
# Show all tables
0: jdbc:hive2://CentOS:10000> show tables;
+--------------------+--+
| tab_name |
+--------------------+--+
| t_access |
| t_employ_copy |
| t_employee_bucket |
| t_employee_copy1 |
| t_user |
+--------------------+--+
5 rows selected (0.054 seconds)
# Show all tables in the test database
0: jdbc:hive2://CentOS:10000> show tables in test;
+-------------+--+
| tab_name |
+-------------+--+
| t_access |
| t_employee |
| t_product |
| t_student |
| t_user |
+-------------+--+
5 rows selected (0.043 seconds)
# Show all tables in the test database whose names start with t_
0: jdbc:hive2://CentOS:10000> show tables in test like 't_*';
+-------------+--+
| tab_name |
+-------------+--+
| t_access |
| t_employee |
| t_product |
| t_student |
| t_user |
+-------------+--+
5 rows selected (0.04 seconds)
- Show partitions
0: jdbc:hive2://CentOS:10000> show partitions t_employee;
+------------+--+
| partition |
+------------+--+
| deptno=10 |
+------------+--+
1 row selected (0.065 seconds)
- Show the CREATE TABLE statement
0: jdbc:hive2://CentOS:10000> show create table t_employee;
+-----------------------------------------------------------------+--+
| createtab_stmt |
+-----------------------------------------------------------------+--+
| CREATE EXTERNAL TABLE `t_employee`( |
| `id` int, |
| `name` string, |
| `job` string, |
| `manager` int, |
| `hiredate` timestamp, |
| `salary` decimal(7,2)) |
| PARTITIONED BY ( |
| `deptno` int) |
| ROW FORMAT DELIMITED |
| FIELDS TERMINATED BY '\t' |
| STORED AS INPUTFORMAT |
| 'org.apache.hadoop.mapred.TextInputFormat' |
| OUTPUTFORMAT |
| 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat' |
| LOCATION |
| 'hdfs://CentOS:9000/hive/t_employee' |
| TBLPROPERTIES ( |
| 'transient_lastDdlTime'='1576961129') |
+-----------------------------------------------------------------+--+
19 rows selected (0.117 seconds)
For more details, see: https://cwiki.apache.org/confluence/display/Hive/LanguageManual+DDL
DML Operations
Loading File Data into Tables
-- Syntax
LOAD DATA [LOCAL] INPATH 'filepath' [OVERWRITE] INTO TABLE tablename [PARTITION (partcol1=val1, partcol2=val2 ...)]
0: jdbc:hive2://CentOS:10000> load data local inpath '/root/t_employee' overwrite into table t_employee partition(deptno='10');
Inserting Query Results into Tables
- Insert query results into a table
INSERT OVERWRITE TABLE tablename1 [PARTITION (partcol1=val1, partcol2=val2 ...) [IF NOT EXISTS]]
select_statement1 FROM from_statement; --overwrite
INSERT INTO TABLE tablename1 [PARTITION (partcol1=val1, partcol2=val2 ...)]
select_statement1 FROM from_statement;--append
0: jdbc:hive2://CentOS:10000> add jar /usr/apache-hive-1.2.2-bin/hcatalog/share/hcatalog/hive-hcatalog-core-1.2.2.jar;
CREATE EXTERNAL TABLE t_emp_json(
empno INT,
ename STRING,
job STRING,
mgr INT,
hiredate TIMESTAMP,
sal DECIMAL(7,2),
comm DECIMAL(7,2),
deptno INT)
ROW FORMAT SERDE 'org.apache.hive.hcatalog.data.JsonSerDe';
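The add jar above registers the JsonSerDe shipped with hive-hcatalog-core. The original does not show how t_emp_json was populated; one way would be a file with one JSON object per line loaded via LOAD DATA (the JSON line and the path /root/t_emp_json below are hypothetical):
{"empno":7369,"ename":"SMITH","job":"CLERK","mgr":7902,"hiredate":"1980-12-17 00:00:00","sal":800.00,"comm":null,"deptno":20}
0: jdbc:hive2://CentOS:10000> load data local inpath '/root/t_emp_json' into table t_emp_json;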
0: jdbc:hive2://CentOS:10000> select empno,ename,job,mgr,hiredate,sal,comm,deptno from t_emp_json;
+--------+---------+------------+-------+------------------------+-------+-------+---------+--+
| empno | ename | job | mgr | hiredate | sal | comm | deptno |
+--------+---------+------------+-------+------------------------+-------+-------+---------+--+
| 7369 | SMITH | CLERK | 7902 | 1980-12-17 00:00:00.0 | 800 | NULL | 20 |
| 7499 | ALLEN | SALESMAN | 7698 | 1981-02-20 00:00:00.0 | 1600 | 300 | 30 |
| 7521 | WARD | SALESMAN | 7698 | 1981-02-22 00:00:00.0 | 1250 | 500 | 30 |
| 7566 | JONES | MANAGER | 7839 | 1981-04-02 00:00:00.0 | 2975 | NULL | 20 |
| 7654 | MARTIN | SALESMAN | 7698 | 1981-09-28 00:00:00.0 | 1250 | 1400 | 30 |
| 7698 | BLAKE | MANAGER | 7839 | 1981-05-01 00:00:00.0 | 2850 | NULL | 30 |
| 7782 | CLARK | MANAGER | 7839 | 1981-06-09 00:00:00.0 | 2450 | NULL | 10 |
| 7788 | SCOTT | ANALYST | 7566 | 1987-04-19 00:00:00.0 | 1500 | NULL | 20 |
| 7839 | KING | PRESIDENT | NULL | 1981-11-17 00:00:00.0 | 5000 | NULL | 10 |
| 7844 | TURNER | SALESMAN | 7698 | 1981-09-08 00:00:00.0 | 1500 | 0 | 30 |
| 7876 | ADAMS | CLERK | 7788 | 1987-05-23 00:00:00.0 | 1100 | NULL | 20 |
| 7900 | JAMES | CLERK | 7698 | 1981-12-03 00:00:00.0 | 950 | NULL | 30 |
| 7902 | FORD | ANALYST | 7566 | 1981-12-03 00:00:00.0 | 3000 | NULL | 20 |
| 7934 | MILLER | CLERK | 7782 | 1982-01-23 00:00:00.0 | 1300 | NULL | 10 |
+--------+---------+------------+-------+------------------------+-------+-------+---------+--+
CREATE TABLE t_employee(
empno INT,
ename STRING,
job STRING,
mgr INT,
hiredate TIMESTAMP,
sal DECIMAL(7,2),
comm DECIMAL(7,2)
)
PARTITIONED BY (deptno INT)
ROW FORMAT DELIMITED FIELDS TERMINATED BY "\t";
0: jdbc:hive2://CentOS:10000> insert overwrite table t_employee partition(deptno=10) select empno,ename,job,mgr,hiredate,sal,comm from t_emp_json where deptno=10;--overwrite
0: jdbc:hive2://CentOS:10000> insert into table t_employee partition(deptno=20) select empno,ename,job,mgr,hiredate,sal,comm from t_emp_json where deptno=20;--append
- Insert query results into multiple tables
-- Syntax
FROM from_statement
INSERT OVERWRITE TABLE tablename1
[PARTITION (partcol1=val1, partcol2=val2 ...) [IF NOT EXISTS]] select_statement1
[INSERT OVERWRITE TABLE tablename2 [PARTITION ... [IF NOT EXISTS]] select_statement2]
[INSERT INTO TABLE tablename2 [PARTITION ...] select_statement2] ...;
from t_emp_json
INSERT OVERWRITE TABLE t_employee partition(deptno=10) select empno,ename,job,mgr,hiredate,sal,comm where deptno=10
INSERT OVERWRITE TABLE t_employee partition(deptno=20) select empno,ename,job,mgr,hiredate,sal,comm where deptno=20
INSERT OVERWRITE TABLE t_employee partition(deptno=30) select empno,ename,job,mgr,hiredate,sal,comm where deptno=30;
- Insert into dynamic partitions
0: jdbc:hive2://CentOS:10000> set hive.exec.dynamic.partition.mode=nonstrict;
No rows affected (0.004 seconds)
0: jdbc:hive2://CentOS:10000> INSERT OVERWRITE TABLE t_employee PARTITION (deptno) SELECT empno,ename,job,mgr,hiredate,sal,comm,deptno FROM t_emp_json;
0: jdbc:hive2://CentOS:10000> show partitions t_employee;
+------------+--+
| partition |
+------------+--+
| deptno=10 |
| deptno=20 |
| deptno=30 |
+------------+--+
3 rows selected (0.064 seconds)
Writing Results to the File System
-- Syntax
INSERT OVERWRITE [LOCAL] DIRECTORY directory1
[ROW FORMAT row_format] [STORED AS file_format]
SELECT ... FROM ...
0: jdbc:hive2://CentOS:10000> INSERT OVERWRITE DIRECTORY '/employee' ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' STORED AS TEXTFILE select * from t_employee;
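The LOCAL keyword in the syntax writes to the local file system of the HiveServer2 host instead of HDFS. A sketch with a hypothetical local path:
0: jdbc:hive2://CentOS:10000> INSERT OVERWRITE LOCAL DIRECTORY '/tmp/employee' ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' STORED AS TEXTFILE select * from t_employee;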
For more details, see: https://cwiki.apache.org/confluence/display/Hive/LanguageManual+DML