两种表结构
  • managed table:托管表。删除表时,数据也删除。
  • external table:外部表。删除表时,数据不删除。
基本命令
//创建表 外部表 t2
$hive>CREATE external TABLE IF NOT EXISTS t2(id int,name string,age int) COMMENT 'xx' ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' STORED AS TEXTFILE ; 

//查看表数据
$hive>desc t2 ;
//查看表的详细信息
$hive>desc formatted t2 ;

//加载数据到hive表
$hive>load data local inpath '/home/centosmin0/wc.txt' into table t2 ;	    //local上传文件
$hive>load data inpath '/user/centosmin0/wc.txt' [overwrite] into table t2 ;	//移动文件

//复制表
mysql>create table tt as select * from users ;		//携带数据和表结构
mysql>create table tt like users ;			        //不带数据,只有表结构

hive>create table tt as select * from users ;	
hive>create table tt like users ;	


//count()查询要转成mr
$hive>select count(*) from t2 ;
$hive>select id,name from t2 ;


//插入,聚合操作需要转成mr,需要开启yarn模块
$hive>select * from t2 order by id desc ;				//MR
	
//启用/禁用表
$hive>ALTER TABLE t2 ENABLE NO_DROP;	//不允许删除
$hive>ALTER TABLE t2 DISABLE NO_DROP;	//允许删除


//创建分区表,优化手段之一,从目录的层面控制搜索数据的范围。
//创建分区表.
$hive>CREATE TABLE t3(id int,name string,age int) PARTITIONED BY (Year INT, Month INT) ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' ;

//显式表的分区信息
$hive>SHOW PARTITIONS t3;

//添加分区,创建目录
$hive>alter table t3 add partition (year=2014, month=12);

//删除分区
hive>ALTER TABLE t3 DROP IF EXISTS PARTITION (year=2014, month=11);

//分区结构
hive>/user/hive/warehouse/mydb2.db/t3/year=2014/month=11
hive>/user/hive/warehouse/mydb2.db/t3/year=2014/month=12


//加载数据到分区表
hive>load data local inpath '/home/centosmin0/wc.txt' into table t3 partition(year=2014,month=11);

//设置分区模式:严格模式和非严格模式
//strict-严格模式,插入时至少指定一个静态分区,nonstrict-非严格模式-可以不指定静态分区。
$hive> set hive.exec.dynamic.partition.mode=nonstrict //默认非严格模式
$hive> set hive.exec.dynamic.partition.mode=strict    //严格模式

//创建桶表
$hive>CREATE TABLE t4(id int,name string,age int) CLUSTERED BY (id) INTO 3 BUCKETS ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' ;

//加载数据不会进行分桶操作
$hive>load data local inpath '/home/centosmin0/wc.txt' into table t4 ;

//查询t3表数据插入到t4中。
$hive>insert into t4 select id,name,age from t3 ;

//桶表的数量如何设置?
//评估数据量,保证每个桶的数据量block的2倍大小。hadoop默认块大小 128M
//eg: 1G的数据量分为 4 个桶


//连接查询
$hive>CREATE TABLE customers(id int,name string,age int) ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' ;
$hive>CREATE TABLE orders(id int,orderno string,price float,cid int) ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' ;

//加载数据到表
//内连接查询
hive>select a.*,b.* from customers a , orders b where a.id = b.cid ;
//左外
hive>select a.*,b.* from customers a left outer join orders b on a.id = b.cid ;
hive>select a.*,b.* from customers a right outer join orders b on a.id = b.cid ;
hive>select a.*,b.* from customers a full outer join orders b on a.id = b.cid ;



//export: 导出 表结构 + 数据
$hive>EXPORT TABLE customers TO '/user/centos/tmp.txt';

//order全排序
$hive>select * from orders order by id asc ;


//sort,map端排序,本地有序
$hive>select * from orders sort by id asc ;


//使用hive实现WordCount
//split函数:使用正则进行分割
//explode函数:将一行的单词炸裂为几行
$hive> select t.word,count(*) c from ((select explode(split(line, ' ')) as word from doc) as t) group by t.word order by c desc limit 2 ;

//将WordCount的数据放入新表 stats
$hive> create table stats(word string, c int) select t.word,count(*) c from ((select explode(split(line, ' ')) as word from doc) as t) group by t.word order by c desc limit 2 ;

分区表结构

桶表结构:mr作业

hive的事物操作

hive 也如同MySQL一样支持事物操作,也具有ACID(Atomicity, Consistency, Isolation, and Durability)

如下是在使用hive的事物操作时需要配置的参数:

$hive>SET hive.support.concurrency = true;				
$hive>SET hive.enforce.bucketing = true;					
$hive>SET hive.exec.dynamic.partition.mode = nonstrict;	
$hive>SET hive.txn.manager = org.apache.hadoop.hive.ql.lockmgr.DbTxnManager;
$hive>SET hive.compactor.initiator.on = true;
$hive>SET hive.compactor.worker.threads = 1;

创建可以进行事物操作的表:

CREATE TABLE tx(id int,name string,age int) CLUSTERED BY (id) INTO 3 BUCKETS ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' stored as orc TBLPROPERTIES ('transactional'='true');
hive性能调忧
//explain命令:可以查看hive的查询计划
使用explain查看查询计划
hive>explain [extended] select count(*) from customers ;
hive>explain select t.name , count(*) from (select a.name ,b.id,b.orderno from customers a ,orders b where a.id = b.cid) t group by t.name ;

//设置limit优化测,避免全部查询.
hive>set hive.limit.optimize.enable=true

//开启本地模式
$hive>set mapred.job.tracker=local;	
//自动本地模式,用于测试,速度比较快
$hive>set hive.exec.mode.local.auto=true	


//并行执行:同时执行不存在依赖关系的阶段。
$hive>set hive.exec.parallel=true			

//严格模式
//1.分区表必须指定分区进行查询
//2.order by时必须使用limit子句。
//3.不允许笛卡尔积.
$hive>set hive.mapred.mode=strict		


//设置MR的数量
//设置reduce处理的字节数。
hive> set hive.exec.reducers.bytes.per.reducer=750000000;	

//JVM重用
//-1没有限制,使用大量小文件。
$hive>set mapreduce.job.jvm.numtasks=1