hive数据库是hdfs上的文件夹,表也是文件夹,表里的数据是文件

hive建表
 create table t_student(id string,name string,age int,classNo string)
 row format delimited
 fields terminated by ‘,’;创建外部表
 create external table t_a(id string,name string)
 row format delimited
 fields terminated by ‘,’
 location ‘/…/…’


外部表和内部表的区别,drop时内部表的hdfs数据一同删除,外部表的hdfs上的数据不删除

create table t_b(id string,name string)
 row format delimited
 fields terminated by ‘,’create table t_a(id string,name string)
 row format delimited
 fields terminated by ‘,’–加载数据从hdfs上加载
 load data inpath ‘/dataB/b.txt’ into table t_a;
 –加载数据从本地上加载
 load data local inpath ‘/root/bb.txt’ into table t_b;t_a
 ±--------±----------±-+
 | t_a.id | t_a.name |
 ±--------±----------±-+
 | a | 1 |
 | b | 2 |
 | c | 4 |
 | d | 5 |
 ±--------±----------±-+t_b
 ±--------±----------±-+
 | t_b.id | t_b.name |
 ±--------±----------±-+
 | a | xx |
 | b | yy |
 | d | zz |
 | e | pp |
 ±--------±----------±-+–笛卡尔积
 select a.,b.
 from t_a inner join t_b±--------±----------±--------±----------±-+
 | t_a.id | t_a.name | t_b.id | t_b.name |
 ±--------±----------±--------±----------±-+
 | a | 1 | a | xx |
 | b | 2 | a | xx |
 | c | 4 | a | xx |
 | d | 5 | a | xx |
 | a | 1 | b | yy |
 | b | 2 | b | yy |
 | c | 4 | b | yy |
 | d | 5 | b | yy |
 | a | 1 | d | zz |
 | b | 2 | d | zz |
 | c | 4 | d | zz |
 | d | 5 | d | zz |
 | a | 1 | e | pp |
 | b | 2 | e | pp |
 | c | 4 | e | pp |
 | d | 5 | e | pp |
 ±--------±----------±--------±----------±-+–内连接
 select * from t_a join t_b where t_a.id = t_b.id;
 select * from t_a join t_b on t_a.id = t_b.id;–左外连接
 select
 a.,b.
 from t_a a left join t_b bselect
 a.,b.
 from t_a a left join t_b b on a.id = b.id±------±--------±------±--------±-+
 | a.id | a.name | b.id | b.name |
 ±------±--------±------±--------±-+
 | a | 1 | a | xx |
 | b | 2 | b | yy |
 | d | 5 | d | zz |
 | NULL | NULL | e | pp |
 ±------±--------±------±--------±-+–右外连接
 select
 a.,b.
 from t_a a right outer join t_b bselect
 a.,b.
 from t_a a right outer join t_b b on a.id = b.id
 ±------±--------±------±--------±-+
 | a.id | a.name | b.id | b.name |
 ±------±--------±------±--------±-+
 | a | 1 | a | xx |
 | b | 2 | b | yy |
 | d | 5 | d | zz |
 | NULL | NULL | e | pp |
 ±------±--------±------±--------±-+–全外连接
 select
 a.,b.
 from t_a a full outer join t_b b on a.id = b.id±------±--------±------±--------±-+
 | a.id | a.name | b.id | b.name |
 ±------±--------±------±--------±-+
 | a | 1 | a | xx |
 | b | 2 | b | yy |
 | c | 4 | NULL | NULL |
 | d | 5 | d | zz |
 | NULL | NULL | e | pp |
 ±------±--------±------±--------±-+–左半连接 相当于指定条件的内连接,但是只显示左边的表数据
 select
 a.*
 from t_a a left semi join t_b b on a.id = b.id±------±--------±-+
 | a.id | a.name |
 ±------±--------±-+
 | a | 1 |
 | b | 2 |
 | d | 5 |
 ±------±--------±-+

–例如有这些数据

vi access.log.0804
 192.168.33.3,http://www.sina.com/stu,2017-08-04 15:30:20
 192.168.33.3,http://www.sina.com/teach,2017-08-04 15:35:20
 192.168.33.4,http://www.sina.com/stu,2017-08-04 15:30:20
 192.168.33.4,http://www.sina.com/job,2017-08-04 16:30:20
 192.168.33.5,http://www.sina.com/job,2017-08-04 15:40:20vi access.log.0805
 192.168.33.3,http://www.sina.com/stu,2017-08-05 15:30:20
 192.168.44.3,http://www.sina.com/teach,2017-08-05 15:35:20
 192.168.33.44,http://www.sina.com/stu,2017-08-05 15:30:20
 192.168.33.46,http://www.sina.com/job,2017-08-05 16:30:20
 192.168.33.55,http://www.sina.com/job,2017-08-05 15:40:20vi access.log.0806
 192.168.133.3,http://www.sina.com/register,2017-08-06 15:30:20
 192.168.111.3,http://www.sina.com/register,2017-08-06 15:35:20
 192.168.34.44,http://www.sina.com/pay,2017-08-06 15:30:20
 192.168.33.46,http://www.sina.com/excersize,2017-08-06 16:30:20
 192.168.33.55,http://www.sina.com/job,2017-08-06 15:40:20
 192.168.33.46,http://www.sina.com/excersize,2017-08-06 16:30:20
 192.168.33.25,http://www.sina.com/job,2017-08-06 15:40:20
 192.168.33.36,http://www.sina.com/excersize,2017-08-06 16:30:20
 192.168.33.55,http://www.sina.com/job,2017-08-06 15:40:20
–建分区表
 create table t_access(ip string,url string,access_time string)
 partitioned by (day string)
 row format delimited
 fields terminated by ‘,’–导入分区表数据
 load data local inpath ‘/root/access.log.0806’ into table t_access partition(day=‘2017-08-06’);–查看当前分区情况
 show partitions t_access;–求每条url访问的次数
 select
 url,count(1)
 from t_access
 group by url;–求每条url访问的ip中,最大的ip是哪个
 select
 url,max(ip)
 from t_access
 group by url;–求每个ip访问同一个页面的记录中,最晚的一条
 select
 ip,url,max(access_time)
 from t_access
 group by ip,url–求PV
 select
 url,count(1)
 from t_access
 group by url±-------------------------------±-----±-+
 | url | _c1 |
 ±-------------------------------±-----±-+
 | http://www.sina.com/excersize | 3 |
 | http://www.sina.com/job | 7 |
 | http://www.sina.com/pay | 1 |
 | http://www.sina.com/register | 2 |
 | http://www.sina.com/stu | 4 |
 | http://www.sina.com/teach | 2 |
 ±-------------------------------±-----±-+–求UV
 select
 url,count(distinct(ip))
 from t_access
 group by url±-------------------------------±----±-+
 | url | c1 |
 ±-------------------------------±----±-+
 | http://www.sina.com/excersize | 2 |
 | http://www.sina.com/job | 5 |
 | http://www.sina.com/pay | 1 |
 | http://www.sina.com/register | 2 |
 | http://www.sina.com/stu | 3 |
 | http://www.sina.com/teach | 2 |
 ±-------------------------------±----±-+–求每条url访问的ip中,最大的ip是哪个
 –PV
 –UV
 –用mapreduce实现–求8月6号pv,uv
 select
 count(1),url
 from t_access
 where day = ‘2017-08-06’
 group by url–求8月4号以后,每天访问http://www.sina.com/job 的次数,以及访问者中ip最大的
 select
 count(1),max(ip),day
 from t_access
 where day > ‘2017-08-04’
 and url = ‘http://www.sina.com/job’
 group by dayselect
 count(1),max(ip),day
 from t_access
 where url = ‘http://www.sina.com/job’
 group by day having day > ‘2017-08-04’–求8月4号以后,每天每个页面访问总次数,以及页面最大的ip,并且上述查询结果中访问次数大于2的
 select tmp.* from
 (select
 url,day,count(1) cnts,max(ip) ip
 from t_access
 group by day,url having day > ‘2017-08-04’) tmp
 where tmp.cnts > 2–每天,pv排序
 select
 url,day,count(1) cnts
 from t_access
 group by day,url
 order by cnts desc–hive里的order by是全局排序
 –sort by是在map里进行排序select
 tmp.*
 from
 (select
 url,day,count(1) cnts
 from t_access
 group by day,url) tmp
 distribute by tmp.day sort by tmp.cnts desc–有如下数据
 ±------------±--------------±-+
 | t_sale.mid | t_sale.money |
 ±------------±--------------±-+
 | AA | 15.0 |
 | AA | 20.0 |
 | BB | 22.0 |
 | CC | 44.0 |
 ±------------±--------------±-+–distribute by先把数据分发到各个reduce中,然后sort by在各个reduce中进行局部排序
 select
 mid,money
 from t_sale
 distribute by mid sort by money desc±-----±-------±-+
 | mid | money |
 ±-----±-------±-+
 | AA | 15.0 |
 | AA | 20.0 |
 | BB | 22.0 |
 | CC | 44.0 |
 ±-----±-------±-+–cluster by mid 等于 distribute by mid sort by mid
 –cluster by后面不能跟desc,asc,默认的只能升序–order by 是全排序,所有的数据会发送给一个reduceTask进行处理,在数据量大的时候,reduce就会超负荷
 select
 mid,money
 from t_sale
 order by money desc–设置最大的reduce启动个数
 set hive.exec.reducers.max=10;
 –设置reduce的启动个数
 set mapreduce.job.reduce=3–hive数据类型
 –数字类型
 tinyint 1byte -128到127
 smallint 2byte -32768到32767 char
 int
 bigint
 float
 double–日期类型
 timestamp
 date–字符串类型
 string
 varchar
 char–混杂类型
 boolean
 binary 二进制–复合类型
 –数组流浪地球,吴京:吴孟达:张飞:赵云,2019-09-11
 我和我的祖国,葛优:黄渤:宋佳:吴京,2019-10-01create table t_movie(movie_name string,actors array,show_time date)
 row format delimited fields terminated by ‘,’
 collection items terminated by ‘:’;array_contains
select
 *
 from t_movie
 where array_contains(actors,‘张飞’)size
select
 movie_name,actors,show_time,size(actors) num
 from t_movie

========================================

map类型
 1,zhangsan,father:xiaoming#mother:chentianxing#brother:shaoshuai,28
 2,xiaoming,father:xiaoyun#mother:xiaohong#sister:xiaoyue#wife:chentianxing,30create table t_user(id int,name string,family map<string,string>,jage int)
 row format delimited fields terminated by ‘,’
 collection items terminated by ‘#’
 map keys terminated by ‘:’;map_keys
 select
 id,name,map_keys(family) as relation,jage
 from t_user;map_values
 select
 id,name,map_values(family) as relation,jage
 from t_user;–家庭成员数量
 select
 id,name,size(family),jage
 from t_user–有兄弟的人
 select
 *
 from t_user
 where array_contains(map_keys(family),‘brother’)

========================================

1,zhangsan,18?beijing
 2,lisi,22?shanghaicreate table t_people(id int,name string,info structage:int,sex:string,addr:string)
 row format delimited fields terminated by ‘,’
 collection items terminated by ‘:’;select
 id,name,info.age
 from t_people;

========================================

192,168,33,66,zhangsan,male
 192,168,33,77,lisi,male
 192,168,43,101,wangwu,femalecreate table t_people_ip(ip_seg1 string,ip_seg2 string,ip_seg3 string,ip_seg4 string,name string,sex string)
 row format delimited fields terminated by ‘,’;–字符串拼接函数
 concat
 select concat(‘abc’,‘def’)concat_ws
 select concat_ws(’.’,ip_seg1,ip_seg2,ip_seg3,ip_seg4),name,sex from t_people_ip

===================================================================

–求字符串长度
 length
 select lenth(‘jsdfijsdkfjkdsfjkdf’);

========================================

select to_date(‘2019-09-11 16:55:11’);
–把字符串转换成unix时间戳
 select unix_timestamp(‘2019-09-11 11:55:11’,‘yyyy-MM-dd HH:mm:ss’);–把unix时间戳转换成字符串
 select from_unixtime(unix_timestamp(),‘yyyy-MM-dd HH:mm:ss’);

========================================

–数学运算函数
 –四舍五入
 select round(5.4)
 –四舍五入保留2位小数
 select round(5.1345,2)
 –向上取整
 select ceil(5.3)
 –向下取整
 select floor(5.3)
 –取绝对值
 select abs(-5.2)
 –取最大值
 select greatest(3,4,5,6,7)
 –取最小值
 select least(3,4,5,6,7)

==================================================

1,19,a,male
 2,19,b,male
 3,22,c,female
 4,16,d,female
 5,30,e,male
 6,26,f,female±--------------±---------------±----------------±---------------±-+
 | t_student.id | t_student.age | t_student.name | t_student.sex |
 ±--------------±---------------±----------------±---------------±-+
 | 1 | 19 | a | male |
 | 2 | 19 | b | male |
 | 5 | 30 | e | male || 3 | 22 | c | female |
 | 4 | 16 | d | female |
 | 6 | 26 | f | female |±--------------±---------------±----------------±---------------±-+
row_number() over()
select
 id,age,name,sex,row_number() over(partition by sex order by age desc) rk
 from t_studentselect
 tmp.*
 from
 (select
 id,age,name,sex,row_number() over(partition by sex order by age desc) rk
 from t_student) tmp
 where tmp.rk = 1select
 id,age,name,sex,rank() over(partition by sex order by age desc) rk
 from t_studentselect
 id,age,name,sex,dense_rank() over(partition by sex order by age desc) rk
 from t_student

========================================

A,2015-01,5
 A,2015-01,15
 B,2015-01,5
 A,2015-01,8
 B,2015-01,25
 A,2015-01,5
 C,2015-01,10
 C,2015-01,20
 A,2015-02,4
 A,2015-02,6
 C,2015-02,30
 C,2015-02,10
 B,2015-02,10
 B,2015-02,5
 A,2015-03,14
 A,2015-03,6
 B,2015-03,20
 B,2015-03,25
 C,2015-03,10
 C,2015-03,20–求每个用户每个月的销售额和到当月位置的累计销售额
 create table t_saller(name string,month string,amount int)
 row format delimited fields terminated by ‘,’create table t_accumulate
 as
 select name,month,sum(amount) samount
 from t_saller
 group by name,month±-------------------±--------------------±----------------------±-+
 | t_accumulate.name | t_accumulate.month | t_accumulate.samount |
 ±-------------------±--------------------±----------------------±-+
 | A | 2015-01 | 33 |
 | A | 2015-02 | 10 |
 | A | 2015-03 | 20 |
 | B | 2015-01 | 30 |
 | B | 2015-02 | 15 |
 | B | 2015-03 | 45 |
 | C | 2015-01 | 30 |
 | C | 2015-02 | 40 |
 | C | 2015-03 | 30 |
 ±-------------------±--------------------±----------------------±-+
 –最前面一行
 select
 name,month,samount,sum(samount) over(partition by name order by month rows between unbounded preceding and current row) accumlateAmount
 from
 t_accumulate| A | 2015-01 | 33 |
 | A | 2015-02 | 10 |
 | A | 2015-03 | 20 |
 | A | 2015-04 | 30 |
 | A | 2015-05 | 15 |
 | A | 2015-06 | 45 |
 | A | 2015-07 | 30 |
 | A | 2015-08 | 40 |
 | A | 2015-09 | 30 |select
 name,month,samount,sum(samount) over(partition by name order by month rows between 2 preceding and 1 following ) accumlateAmount
 from
 t_accumulatepreceding |
 当前行 | 窗口长度
 following |min() over() ,max() over() , avg() over()

========================================

1,zhangsan,化学:物理:数学:语文
 2,lisi,化学:数学:生物:生理:卫生
 3,wangwu,化学:语文:英语:体育:生物create table t_stu_subject(id int,name string,subjects array)
 row format delimited fields terminated by ‘,’
 collection items terminated by ‘:’explode()
select explode(subjects) from t_stu_subject;
select distinct tmp.subs from (select explode(subjects) subs from t_stu_subject) tmp

========================================

lateral view连接函数
select
 id,name,sub
 from
 t_stu_subject lateral view explode(subjects) tmp as sub±----±----------±-----±-+
 | id | name | sub |
 ±----±----------±-----±-+
 | 1 | zhangsan | 化学 |
 | 1 | zhangsan | 物理 |
 | 1 | zhangsan | 数学 |
 | 1 | zhangsan | 语文 |
 | 2 | lisi | 化学 |
 | 2 | lisi | 数学 |
 | 2 | lisi | 生物 |
 | 2 | lisi | 生理 |
 | 2 | lisi | 卫生 |
 | 3 | wangwu | 化学 |
 | 3 | wangwu | 语文 |
 | 3 | wangwu | 英语 |
 | 3 | wangwu | 体育 |
 | 3 | wangwu | 生物 |
 ±----±----------±-----±-+

========================================

wordcount
words
create table words(line string)
line
 hello world hi tom and jack
 hello chentianxing qiaoyuan and shaoshuai
 hello hello hi tom and shaoshuai
 chentianxing love saoshuai
 hello love what is love how lovesplit()
select
 tmp.word,count(1) cnts
 from
 (select
 explode(split(line,’ ')) word
 from words) tmp
 group by tmp.word order by cnts desc

========================================

–炸map
 select
 id,name,key,value
 from
 t_user
 lateral view explode(family) tmp as key,value

========================================

–有web系统,每天产生下列的日志文件
2017-09-15号的数据:
 192.168.33.6,hunter,2017-09-15 10:30:20,/a
 192.168.33.7,hunter,2017-09-15 10:30:26,/b
 192.168.33.6,jack,2017-09-15 10:30:27,/a
 192.168.33.8,tom,2017-09-15 10:30:28,/b
 192.168.33.9,rose,2017-09-15 10:30:30,/b
 192.168.33.10,julia,2017-09-15 10:30:40,/c2017-09-16号的数据:
 192.168.33.16,hunter,2017-09-16 10:30:20,/a
 192.168.33.18,jerry,2017-09-16 10:30:30,/b
 192.168.33.26,jack,2017-09-16 10:30:40,/a
 192.168.33.18,polo,2017-09-16 10:30:50,/b
 192.168.33.39,nissan,2017-09-16 10:30:53,/b
 192.168.33.39,nissan,2017-09-16 10:30:55,/a
 192.168.33.39,nissan,2017-09-16 10:30:58,/c
 192.168.33.20,ford,2017-09-16 10:30:54,/c2017-09-17号的数据:
 192.168.33.46,hunter,2017-09-17 10:30:21,/a
 192.168.43.18,jerry,2017-09-17 10:30:22,/b
 192.168.43.26,tom,2017-09-17 10:30:23,/a
 192.168.53.18,bmw,2017-09-17 10:30:24,/b
 192.168.63.39,benz,2017-09-17 10:30:25,/b
 192.168.33.25,haval,2017-09-17 10:30:30,/c
 192.168.33.10,julia,2017-09-17 10:30:40,/c–统计日活跃用户
–统计每日新增用户
–统计历史用户
–创建日志表
 create table t_web_log(ip string,uname string,access_time string,url string)
 partitioned by (day string)
 row format delimited fields terminated by ‘,’;–创建日活跃用户表
 create table t_user_active_day
 like
 t_web_log;–统计日活跃用户
 insert into table t_user_active_day partition(day=‘2017-09-15’)
 select tmp.ip,tmp.uname,tmp.access_time,tmp.url
 from
 (select
 ip,uname,access_time,url,row_number() over(partition by uname order by access_time) rn
 from
 t_web_log where day=‘2017-09-15’) tmp
 where rn <2–创建历史用户表
 create table t_user_history(uname string)–创建日新增用户表
 create table t_user_new_day like t_user_active_day;–统计日新增用户
 insert into table t_user_new_day partition(day=‘2017-09-15’)
 select
 tua.ip,tua.uname,tua.access_time,tua.url
 from t_user_active_day tua
 left join t_user_history tuh on tua.uname = tuh.uname
 where tua.day = ‘2017-09-15’ and tuh.uname IS NULL–记录历史用户
 insert into table t_user_history
 select
 uname
 from t_user_new_day
 where day=‘2017-09-15’insert into table t_user_active_day partition(day=‘2017-09-16’)
 select tmp.ip,tmp.uname,tmp.access_time,tmp.url
 from
 (select
 ip,uname,access_time,url,row_number() over(partition by uname order by access_time) rn
 from
 t_web_log where day=‘2017-09-16’) tmp
 where rn <2insert into table t_user_new_day partition(day=‘2017-09-16’)
 select
 tua.ip,tua.uname,tua.access_time,tua.url
 from t_user_active_day tua
 left join t_user_history tuh on tua.uname = tuh.uname
 where tua.day = ‘2017-09-16’ and tuh.uname IS NULLinsert into table t_user_history
 select
 uname
 from t_user_new_day
 where day=‘2017-09-16’–桶表
 –创建桶表,按id分成三个桶
 create table t1(id int,name string,age int)
 clustered by(id) into 3 buckets
 row format delimited fields terminated by ‘,’;–桶表插入数据的方式
 create table t1_1(id int,name string,age int)
 row format delimited fields terminated by ‘,’;set hive.enforce.bucketing=true;
 set mapreduce.job.reduces=3;insert into t1 select * from t1_1;

========================================
hive自定义函数

package com.bwie.hive.udf;
import com.alibaba.fastjson.JSONObject;
 import org.apache.hadoop.hive.ql.exec.UDF;public class JsonUDF extends UDF {
public String evaluate(String json,String colum){
    JSONObject jsonObject = JSONObject.parseObject(json);
    String value = jsonObject.getString(colum);
    return value;
}
}

打包上传

add jar /root/hivetest-1.0-SNAPSHOT.jar;
 create temporary function getjson as ‘com.bwie.hive.udf.JsonUDF’;select
 getjson(json,‘movie’) as movie,getjson(json,‘rate’) as rate,from_unixtime(CAST(getjson(json,‘timeStamp’) as BIGINT),‘yyyy-MM-dd HH:mm:ss’) as showtime,getjson(json,‘uid’) as uid
 from t_ex_rat limit 10;

–创建永久函数
方法一、
在hive-site.xml里添加jar包

hive.aux.jars.path
 file:///usr/local/apache-hive-1.2.2-bin/lib/hivetest-1.0-SNAPSHOT.jar



疑似jar包要放到lib下才能生效

create function getjson AS ‘com.bwie.hive.udf.JsonUDF’

方法二、

create function getjson as ‘com.bwie.hive.udf.JsonUDF’ using jar ‘hdfs://hdp1703/lib/hivetest-1.0-SNAPSHOT.jar’
drop function getjson

========================================

create table t_employee(id int,name string,age int)
 partitioned by(state string,city string)
 row format delimited fields terminated by ‘,’create table t_employee_orgin(id int,name string,age int,state string,city string)
 row format delimited fields terminated by ‘,’;load data local inpath ‘/root/employ.txt’ into table t_employee_orgin;

在严格模式下,多个分区一定有一个分区是固定的

insert into t_employee
 partition(state=‘china’,city)
 select id,name,age,city
 from t_employee_orgin
 where state=‘china’

–非严格模式

set hive.exec.dynamic.partition.mode=nostrict;
 insert into t_employee
 partition(state,city)
 select id,name,age,state,city
 from t_employee_orgin

========================================

select to_date(‘2019-09-20 17:30:00’)
unix_timestamp(‘2019/09/20 17:30:00’,‘yyyy/MM/dd HH:mm:ss’)
select from_unixtime(unix_timestamp(‘2019/09/20 17:30:00’,‘yyyy/MM/dd HH:mm:ss’),‘yyyy-MM-dd HH:mm:ss’)