两种表结构
- managed table:托管表。删除表时,数据也删除。
- external table:外部表。删除表时,数据不删除。
基本命令
//创建表 外部表 t2
$hive>CREATE external TABLE IF NOT EXISTS t2(id int,name string,age int) COMMENT 'xx' ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' STORED AS TEXTFILE ;
//查看表数据
$hive>desc t2 ;
//查看表的详细信息
$hive>desc formatted t2 ;
//加载数据到hive表
$hive>load data local inpath '/home/centosmin0/wc.txt' into table t2 ; //local上传文件
$hive>load data inpath '/user/centosmin0/wc.txt' [overwrite] into table t2 ; //移动文件
//复制表
mysql>create table tt as select * from users ; //携带数据和表结构
mysql>create table tt like users ; //不带数据,只有表结构
hive>create table tt as select * from users ;
hive>create table tt like users ;
//count()查询要转成mr
$hive>select count(*) from t2 ;
$hive>select id,name from t2 ;
//插入,聚合操作需要转成mr,需要开启yarn模块
$hive>select * from t2 order by id desc ; //MR
//启用/禁用表
$hive>ALTER TABLE t2 ENABLE NO_DROP; //不允许删除
$hive>ALTER TABLE t2 DISABLE NO_DROP; //允许删除
//创建分区表,优化手段之一,从目录的层面控制搜索数据的范围。
//创建分区表.
$hive>CREATE TABLE t3(id int,name string,age int) PARTITIONED BY (Year INT, Month INT) ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' ;
//显式表的分区信息
$hive>SHOW PARTITIONS t3;
//添加分区,创建目录
$hive>alter table t3 add partition (year=2014, month=12);
//删除分区
hive>ALTER TABLE t3 DROP IF EXISTS PARTITION (year=2014, month=11);
//分区结构
hive>/user/hive/warehouse/mydb2.db/t3/year=2014/month=11
hive>/user/hive/warehouse/mydb2.db/t3/year=2014/month=12
//加载数据到分区表
hive>load data local inpath '/home/centosmin0/wc.txt' into table t3 partition(year=2014,month=11);
//设置分区模式:严格模式和非严格模式
//strict-严格模式,插入时至少指定一个静态分区,nonstrict-非严格模式-可以不指定静态分区。
$hive> set hive.exec.dynamic.partition.mode=nonstrict //默认非严格模式
$hive> set hive.exec.dynamic.partition.mode=strict //严格模式
//创建桶表
$hive>CREATE TABLE t4(id int,name string,age int) CLUSTERED BY (id) INTO 3 BUCKETS ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' ;
//加载数据不会进行分桶操作
$hive>load data local inpath '/home/centosmin0/wc.txt' into table t4 ;
//查询t3表数据插入到t4中。
$hive>insert into t4 select id,name,age from t3 ;
//桶表的数量如何设置?
//评估数据量,保证每个桶的数据量block的2倍大小。hadoop默认块大小 128M
//eg: 1G的数据量分为 4 个桶
//连接查询
$hive>CREATE TABLE customers(id int,name string,age int) ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' ;
$hive>CREATE TABLE orders(id int,orderno string,price float,cid int) ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' ;
//加载数据到表
//内连接查询
hive>select a.*,b.* from customers a , orders b where a.id = b.cid ;
//左外
hive>select a.*,b.* from customers a left outer join orders b on a.id = b.cid ;
hive>select a.*,b.* from customers a right outer join orders b on a.id = b.cid ;
hive>select a.*,b.* from customers a full outer join orders b on a.id = b.cid ;
//export: 导出 表结构 + 数据
$hive>EXPORT TABLE customers TO '/user/centos/tmp.txt';
//order全排序
$hive>select * from orders order by id asc ;
//sort,map端排序,本地有序
$hive>select * from orders sort by id asc ;
//使用hive实现WordCount
//split函数:使用正则进行分割
//explode函数:将一行的单词炸裂为几行
$hive> select t.word,count(*) c from ((select explode(split(line, ' ')) as word from doc) as t) group by t.word order by c desc limit 2 ;
//将WordCount的数据放入新表 stats
$hive> create table stats(word string, c int) select t.word,count(*) c from ((select explode(split(line, ' ')) as word from doc) as t) group by t.word order by c desc limit 2 ;
分区表结构
桶表结构:mr作业
hive的事物操作
hive 也如同MySQL一样支持事物操作,也具有ACID(Atomicity, Consistency, Isolation, and Durability)
如下是在使用hive的事物操作时需要配置的参数:
$hive>SET hive.support.concurrency = true;
$hive>SET hive.enforce.bucketing = true;
$hive>SET hive.exec.dynamic.partition.mode = nonstrict;
$hive>SET hive.txn.manager = org.apache.hadoop.hive.ql.lockmgr.DbTxnManager;
$hive>SET hive.compactor.initiator.on = true;
$hive>SET hive.compactor.worker.threads = 1;
创建可以进行事物操作的表:
CREATE TABLE tx(id int,name string,age int) CLUSTERED BY (id) INTO 3 BUCKETS ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' stored as orc TBLPROPERTIES ('transactional'='true');
hive性能调忧
//explain命令:可以查看hive的查询计划
使用explain查看查询计划
hive>explain [extended] select count(*) from customers ;
hive>explain select t.name , count(*) from (select a.name ,b.id,b.orderno from customers a ,orders b where a.id = b.cid) t group by t.name ;
//设置limit优化测,避免全部查询.
hive>set hive.limit.optimize.enable=true
//开启本地模式
$hive>set mapred.job.tracker=local;
//自动本地模式,用于测试,速度比较快
$hive>set hive.exec.mode.local.auto=true
//并行执行:同时执行不存在依赖关系的阶段。
$hive>set hive.exec.parallel=true
//严格模式
//1.分区表必须指定分区进行查询
//2.order by时必须使用limit子句。
//3.不允许笛卡尔积.
$hive>set hive.mapred.mode=strict
//设置MR的数量
//设置reduce处理的字节数。
hive> set hive.exec.reducers.bytes.per.reducer=750000000;
//JVM重用
//-1没有限制,使用大量小文件。
$hive>set mapreduce.job.jvm.numtasks=1