Data warehouse
OLAP //Online Analytical Processing.
//large data volumes, low concurrency, high latency.
hive //built on hadoop mr; efficient for large batch workloads.
sql //SQL-like query language.
Database
mysql,
OLTP //Online Transaction Processing; ACID.
//transaction concurrency anomalies: dirty read | non-repeatable read | phantom read
//isolation levels: read uncommitted | read committed | repeatable read | serializable
//low latency.
hive
The name means "beehive".
A data warehouse.
Data warehouse software for reading, writing, and managing large datasets residing in distributed storage, using SQL.
Accessed via the command line or via a driver.
Originally developed by facebook.
Processes structured data.
[what hive is NOT]
a relational database
OLTP
real-time queries or row-level updates.
[characteristics]
stores the schema in a database (the metastore); the data it processes lives on hdfs.
OLAP
provides a SQL dialect, called HiveQL / HQL.
scalable and extensible.
hive concepts
1.user interface
shell
web ui
2.metastore
the metadata database.
stores information about databases, tables, columns, types, etc. (see the query sketch after this list)
kept in a relational database.
3.HiveQL process engine
query processing engine; replaces hand-written MR programs.
4.execution engine
processes the query and produces the results via MR.
5.hdfs | hbase
where the data is stored.
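Because the metastore is plain tables in a relational database, it can be inspected directly with SQL. A minimal sketch against the mysql metastore configured later in this section (DBS and TBLS are tables of the hive metastore schema; hive6 is the database name used in hive-site.xml below):
//run in the mysql client
mysql>use hive6;
//databases registered in hive
mysql>select DB_ID, NAME, DB_LOCATION_URI from DBS;
//tables and their type (MANAGED_TABLE | EXTERNAL_TABLE)
mysql>select TBL_ID, DB_ID, TBL_NAME, TBL_TYPE from TBLS;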
interaction flow between hive and hadoop
client submits a query to the hive driver (select id,name,age from customers ;)
-->compiled by the compiler
-->metastore consulted for metadata
-->execution engine
-->hadoop mr
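The plan the compiler generates can be inspected without running the query, via hive's EXPLAIN statement:
//show the compiled plan (the MR stages) for a query
$hive>explain select id,name,age from customers ;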
hive installation
1.download
2.untar
$>tar -xzvf apache-hive-2.1.0-bin.tar.gz -C /soft/
$>ln -s apache-hive-2.1.0-bin hive
3.environment variables
$>sudo nano /etc/profile
...
HIVE_HOME=/soft/hive
PATH=$PATH:$HIVE_HOME/bin
4.source /etc/profile
Configure hive to keep its metadata in mysql instead of the default derby database.
1.copy the mysql driver into hive's lib directory.
$>cp mysql-connector-java-5.1.17.jar /soft/hive/lib
2.create the hive configuration file
[/soft/hive/conf/hive-site.xml]
<configuration>
	<property>
		<name>javax.jdo.option.ConnectionPassword</name>
		<value>root</value>
	</property>
	<property>
		<name>javax.jdo.option.ConnectionUserName</name>
		<value>root</value>
	</property>
	<property>
		<name>javax.jdo.option.ConnectionURL</name>
		<value>jdbc:mysql://192.168.231.1:3306/hive6</value>
	</property>
	<property>
		<name>javax.jdo.option.ConnectionDriverName</name>
		<value>com.mysql.jdbc.Driver</value>
	</property>
	<property>
		<name>hive.server2.authentication</name>
		<value>NONE</value>
	</property>
	<property>
		<name>hive.server2.enable.doAs</name>
		<value>false</value>
	</property>
</configuration>
initialize the hive metadata schema in mysql
$>bin/schematool -dbType mysql -initSchema
hive components
database //an hdfs directory, /user/hive/warehouse/${dbname}.db
table //an hdfs directory, /user/hive/warehouse/${dbname}.db/${table}
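The layout is easy to verify, since the hive shell can run hdfs commands directly (a sketch; mydb is the database created in the next step):
$hive>dfs -ls /user/hive/warehouse/ ;
$hive>dfs -ls /user/hive/warehouse/mydb.db ;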
entering the shell
//enter the hive command line
$>hive
//hive operations
$hive>create database mydb ; //create a database
$hive>use mydb ;
$hive>create table customers(id int,name string , age int) ; //create a table
$hive>show databases ;
$hive>show tables ;
$hive>desc customers ;
client/server access to hive through the hiveserver2 service
1.start hiveserver2
$>hive/bin/hiveserver2 &
2.check whether hiveserver2 is up
$>netstat -anop | grep 10000
3.start the beeline command line
$>hive/bin/beeline
$beeline>!help
$beeline>!connect jdbc:hive2://localhost:10000/mydb
$beeline>show databases ;
access hiveserver2 remotely over jdbc to work with the hive data warehouse
1.create a module and add the dependencies
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>
    <groupId>com.it18zhang</groupId>
    <artifactId>my-hive-day01</artifactId>
    <version>1.0-SNAPSHOT</version>
    <dependencies>
        <dependency>
            <groupId>junit</groupId>
            <artifactId>junit</artifactId>
            <version>4.11</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hive</groupId>
            <artifactId>hive-jdbc</artifactId>
            <version>2.1.0</version>
        </dependency>
    </dependencies>
</project>
2.write the test code
package com.hive.test;

import org.junit.Test;

import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.ResultSet;
import java.sql.Statement;

/**
 * Test hive over jdbc.
 */
public class TestHive {
    @Test
    public void getConn() throws Exception {
        // register the hive jdbc driver
        String driver = "org.apache.hive.jdbc.HiveDriver";
        Class.forName(driver);
        // connect to hiveserver2 on host s100, default port 10000, database mydb
        Connection conn = DriverManager.getConnection("jdbc:hive2://s100:10000/mydb");
        Statement st = conn.createStatement();
        ResultSet rs = st.executeQuery("select id,name,age from customers order by id desc");
        while (rs.next()) {
            int id = rs.getInt(1);
            String name = rs.getString(2);
            int age = rs.getInt(3);
            System.out.println(id + "," + name + "," + age);
        }
        rs.close();
        st.close();
        conn.close();
    }
}
3.run it and check the web ui to watch the mr job.
hive data types
1.primitive types
TINYINT //byte, 1 byte
SMALLINT //short, 2 bytes
INT //int, 4 bytes
BIGINT //long, 8 bytes
FLOAT //float, 4 bytes
DOUBLE //double, 8 bytes
DECIMAL //decimal, precision and scale, e.g. decimal(10,3)
BINARY //binary
BOOLEAN //TRUE | FALSE
STRING //string
CHAR //fixed length, <= 255
VARCHAR //variable length, <= 65535
DATE //date, '2013-01-01'
TIMESTAMP //timestamp, '2013-01-01 12:00:01.345'
2.complex types
ARRAY //array, ['apple','orange','mango']
MAP //map, {1:"apple",2:"orange"}
STRUCT //struct, {1,"apple"}
NAMED STRUCT //named struct, {"apple":"gala","weightkg":1}
UNION //union, {2:["apple","orange"]}
3.prepare the data
Michael|Montreal,Toronto|Male,30|DB:80|Product:Developer Lead
Will|Montreal|Male,35|Perl:85|Product:Lead,Test:Lead
Shelley|New York|Female,27|Python:80|Test:Lead,COE:Architect
Lucy|Vancouver|Female,57|Sales:89,HR:94|Sales:Lead
4.create a table with complex types
CREATE TABLE employee
(
name string,
arr ARRAY<string>,
struc STRUCT<sex:string,age:int>,
map1 MAP<string,int>,
map2 MAP<string,ARRAY<string>>
)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY '|'
COLLECTION ITEMS TERMINATED BY ','
MAP KEYS TERMINATED BY ':'
LINES TERMINATED BY '\n'
STORED AS TEXTFILE;
5.load local data into hive
$hive>load data local inpath '/home/centos/employee.txt' into table employee ;
//query all rows
$hive>select * from employee ;
//rename the table
$hive>alter table employee rename to emp;
$hive>select name,arr from emp ;
//query a specific array element
$hive>select name,arr[0] from emp ;
//query a struct field
$hive>select name,struc.sex from emp ;
//query a map value by key
$hive>select name,map1["DB"] from emp ;
//query map2 by key, then index into the array value
$hive>select map2["Product"][0] from emp ;
6.word count in hive
6.1)create the table
CREATE TABLE docs
(
line string
)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ',' ;
6.2)load data into docs
load data local inpath '/home/centos/1.txt' into table docs ;
6.3)word count
//count occurrences of each word
select t.word,count(*) from (select explode(split(line," ")) word from docs) t group by t.word ;
//top-N query, descending
select t.word,count(*) cnt from (select explode(split(line," ")) word from docs) t group by t.word order by cnt desc limit 3 ;
full create-table syntax
CREATE TABLE employee
(
name string,
arr ARRAY<string>,
struc STRUCT<sex:string,age:int>,
map1 MAP<string,int>,
map2 MAP<string,ARRAY<string>>
)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY '|' //default \001
COLLECTION ITEMS TERMINATED BY ',' //default \002
MAP KEYS TERMINATED BY ':' //default \003
LINES TERMINATED BY '\n' //line terminator
STORED AS TEXTFILE; //file format, TEXTFILE is the default
explode
UDTF, a table-generating function.
can be applied to an array or a map.
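For example, on the emp table from the previous section: explode emits one row per array element; to keep other columns alongside, combine it with lateral view:
//one row per element of arr
$hive>select explode(arr) from emp ;
//keep the name column by rejoining each element via lateral view
$hive>select name, city from emp lateral view explode(arr) t as city ;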
type conversion functions
cast('124' as int); //convert a string to an integer
concat('12','12','23','35') //string concatenation function
DDL
create database if not exists xx ;
//create a database with a location, properties, and a comment.
CREATE DATABASE IF NOT EXISTS myhivebook
COMMENT 'hive database demo'
LOCATION '/hdfs/directory'
WITH DBPROPERTIES ('creator'='dayongd','date'='2015-01-01') ;
//show database info
desc database myhivebook ;
//drop a database
drop database myhivebook ;
load data
//1.load from the local filesystem: a copy
load data local inpath '/x/x/x/x/1.txt' into table docs ;
//2.load from hdfs: a move
load data inpath '/x/x/x/x/1.txt' into table docs ;
managed (internal) and external tables
1.managed table
the default table type; dropping the table deletes both the data and the table definition.
2.external table
dropping the table deletes only the table definition; the data remains. (see the sketch below)
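A minimal sketch of an external table (the location '/data/custs' is an assumed hdfs path):
CREATE EXTERNAL TABLE custs_ext
(
id int,
name string,
age int
)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ','
LOCATION '/data/custs';
//drop table custs_ext removes only the metadata; the files under /data/custs stay.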
CTAS: create table as select
//carries the data over
create table emp2 as select name from emp ;
create table with like: copies only the table structure, no data
create table emp3 like emp ;
truncate
quickly empties a table.
truncate table emp2 ;
partitioned tables: a partition is a directory.
//create a partitioned table
CREATE TABLE custs
(
id int,
name string ,
age int
)
PARTITIONED BY (prov string, city string)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ','
LINES TERMINATED BY '\n'
STORED AS TEXTFILE;
//add partitions
alter table custs add PARTITION (prov='hebei', city='baoding') PARTITION (prov='hebei', city='shijiazhuang');
//show partitions
SHOW PARTITIONS custs;
//drop a partition
alter table custs drop partition (prov='hebei',city='shijiazhuang') ;
//load data into a specific partition
load data local inpath '/home/centos/cust.txt' into table custs partition(prov='hebei',city='baoding') ;
//query by partition
select * from custs where city = 'baoding' ;
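Besides static partitions, hive also supports dynamic partition inserts, where the partition values come from the query itself. A sketch, assuming a staging table custs_raw that carries prov and city as ordinary columns:
//allow dynamic partitions; nonstrict mode requires no static partition column
set hive.exec.dynamic.partition=true ;
set hive.exec.dynamic.partition.mode=nonstrict ;
insert into table custs partition(prov,city)
select id,name,age,prov,city from custs_raw ;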
bucketed tables: a bucket is a file.
//create a bucketed table
CREATE TABLE buck
(
id int,
name string ,
age int
)
CLUSTERED BY (id) INTO 3 BUCKETS
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ','
LINES TERMINATED BY '\n'
STORED AS TEXTFILE;
//how to choose the number of buckets
avoid buckets whose data volume is too large or too small; roughly twice the block size per bucket is a common rule of thumb.
//set the number of reduce tasks and force bucketing (see the insert sketch below)
set mapred.reduce.tasks = 2;
set hive.enforce.bucketing = true;
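Note that load data does not hash rows into buckets; populate a bucketed table with insert ... select so that rows are distributed by the clustered-by column (reusing the custs table from the previous section):
insert overwrite table buck
select id,name,age from custs ;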
joins
//create the orders table.
CREATE TABLE orders
(
id int,
orderno string ,
price float,
cid int
)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ','
LINES TERMINATED BY '\n'
STORED AS TEXTFILE;
//Cartesian product
select a.*,b.* from custs a , orders b ;
//inner join
select a.*,b.* from custs a inner join orders b on a.id = b.cid ;
//left outer join
select a.*,b.* from custs a left outer join orders b on a.id = b.cid ;
//semi-join style: only the left table's columns (hive also offers LEFT SEMI JOIN)
select a.* from custs a left outer join orders b on a.id = b.cid ;
//right outer join
select a.*,b.* from custs a right outer join orders b on a.id = b.cid ;
//full outer join
select a.*,b.* from custs a full outer join orders b on a.id = b.cid ;
map-side joins
//join hint: /*+ MAPJOIN(alias) */
SELECT /*+ MAPJOIN(o) */ c.* FROM custs c CROSS JOIN orders o WHERE c.id <> o.cid;
//enable automatic conversion to a map-side join
set hive.auto.convert.join=true ;
SELECT c.* FROM custs c CROSS JOIN orders o WHERE c.id <> o.cid;
union queries
//union deduplicates by default; use union all to keep duplicates
select id,name from custs union select id,orderno from orders ;
//deduplicate
select distinct cid from orders ;