I. Background
HBase is a distributed, column-oriented, open-source NoSQL database. Unlike traditional relational databases, it performs remarkably well at large data volumes. Our project has recently been exploring a migration to HBase, so I first put together some data on HBase ingest efficiency.
There are three common ways to load data into HBase; given the project's actual situation, I used two of them (the Java API and MapReduce) and compared the results.
II. Test Environment
Three hosts: one master (192.168.13.74) and two slaves (192.168.13.75, 192.168.13.76)
Hadoop: Hadoop 2.6.0-cdh5.4.0
HBase: HBase 1.0.0-cdh5.4.0
III. Loading Data via the Java API
1. Create a new Java test project.
2. Import the required HBase client JARs and their Hadoop dependencies.
3. Create a test class and initialize the connection through the HBase API:
public static Configuration configuration;
private static Admin admin = null;
private static Random random = null; // used to generate random row keys
private static Connection connection = null;

// Connection is heavyweight and thread-safe: create one per process and share it.
// Table and Admin instances are lightweight and should be obtained per use.
static {
    try {
        configuration = HBaseConfiguration.create();
        configuration.set("hbase.zookeeper.quorum", "192.168.13.74");
        configuration.set("hbase.zookeeper.property.clientPort", "2181");
        connection = ConnectionFactory.createConnection(configuration);
        admin = connection.getAdmin();
        random = new Random();
    } catch (IOException e) {
        e.printStackTrace();
    }
}
4. CRUD operations
/**
 * Create a table, dropping it first if it already exists.
 *
 * @param tableName
 */
public static void createTable(String tableName) {
    System.out.println("start create table ......");
    TableName tn = TableName.valueOf(tableName);
    try {
        if (admin.tableExists(tn)) {
            admin.disableTable(tn);
            admin.deleteTable(tn);
            System.out.println(tableName + " already exists, deleting it first...");
        }
        HTableDescriptor hTableDescriptor = new HTableDescriptor(tn);
        hTableDescriptor.addFamily(new HColumnDescriptor("column1"));
        hTableDescriptor.addFamily(new HColumnDescriptor("column2"));
        hTableDescriptor.addFamily(new HColumnDescriptor("column3"));
        admin.createTable(hTableDescriptor);
    } catch (IOException e) {
        e.printStackTrace();
    }
    System.out.println("end create table ......");
}
/**
 * Insert a single row.
 *
 * @param tableName
 */
public static void insertData(String tableName) {
    Table table = null;
    TableName tn = TableName.valueOf(tableName);
    try {
        table = connection.getTable(tn);
        // One Put represents one row; a second row needs a new Put. Each row has a
        // unique rowkey, which is the value passed to the Put constructor.
        Put put = new Put(String.valueOf(random.nextLong()).getBytes());
        put.addColumn("column1".getBytes(), null, "ddd".getBytes()); // first column of this row
        put.addColumn("column2".getBytes(), null, "bbb".getBytes()); // second column of this row
        put.addColumn("column3".getBytes(), null, "ccc".getBytes()); // third column of this row
        table.put(put);
    } catch (IOException e) {
        e.printStackTrace();
    } finally {
        try {
            if (table != null)
                table.close();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}
/**
 * Drop a table.
 *
 * @param tableName
 */
public static void dropTable(String tableName) {
    try {
        TableName tn = TableName.valueOf(tableName);
        admin.disableTable(tn);
        admin.deleteTable(tn);
    } catch (IOException e) {
        e.printStackTrace();
    }
}
/**
 * Delete a single row by rowkey.
 *
 * @param tablename
 * @param rowkey
 */
public static void deleteRow(String tablename, String rowkey) {
    Table table = null;
    TableName tn = TableName.valueOf(tablename);
    try {
        table = connection.getTable(tn);
        List<Delete> list = new ArrayList<Delete>();
        Delete d1 = new Delete(rowkey.getBytes());
        list.add(d1);
        table.delete(list);
        System.out.println("row deleted successfully!");
    } catch (IOException e) {
        e.printStackTrace();
    } finally {
        try {
            if (table != null)
                table.close();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}
/**
 * Delete by combined conditions.
 *
 * @param tablename
 * @param rowkey
 */
public static void deleteByCondition(String tablename, String rowkey) {
    // So far I have not found an effective API that deletes by non-rowkey
    // conditions, nor a single-call API that clears all data in a table.
}
/**
 * Scan and print all rows.
 *
 * @param tableName
 */
public static void QueryAll(String tableName) {
    Table table = null;
    TableName tn = TableName.valueOf(tableName);
    try {
        table = connection.getTable(tn);
        ResultScanner rs = table.getScanner(new Scan());
        for (Result r : rs) {
            System.out.println("rowkey: " + new String(r.getRow()));
            for (Cell cell : r.rawCells()) {
                // Copy family/value out with CellUtil; getFamilyArray()/getValueArray()
                // return the whole backing array, not just the family or value bytes.
                System.out.println("family: " + Bytes.toString(CellUtil.cloneFamily(cell))
                        + " ==== value: " + Bytes.toString(CellUtil.cloneValue(cell)));
            }
        }
        rs.close();
    } catch (IOException e) {
        e.printStackTrace();
    } finally {
        try {
            if (table != null)
                table.close();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}
/**
 * Single-condition query: look up exactly one row by rowkey.
 *
 * @param tableName
 */
public static void QueryByCondition1(String tableName) {
    Table table = null;
    TableName tn = TableName.valueOf(tableName);
    try {
        table = connection.getTable(tn);
        Get get = new Get("112233bbbcccc".getBytes()); // query by rowkey
        Result r = table.get(get);
        System.out.println("rowkey: " + new String(r.getRow()));
        for (Cell cell : r.rawCells()) {
            System.out.println("family: " + Bytes.toString(CellUtil.cloneFamily(cell))
                    + " ==== value: " + Bytes.toString(CellUtil.cloneValue(cell)));
        }
    } catch (IOException e) {
        e.printStackTrace();
    } finally {
        try {
            if (table != null)
                table.close();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}
/**
 * Single-condition query returning multiple rows.
 *
 * @param tableName
 */
public static void QueryByCondition2(String tableName) {
    Table table = null;
    TableName tn = TableName.valueOf(tableName);
    try {
        table = connection.getTable(tn);
        // Match rows where the (qualifier-less) cell in family column1 equals "ddd"
        Filter filter = new SingleColumnValueFilter(
                Bytes.toBytes("column1"), null, CompareOp.EQUAL,
                Bytes.toBytes("ddd"));
        Scan s = new Scan();
        s.setFilter(filter);
        ResultScanner rs = table.getScanner(s);
        for (Result r : rs) {
            System.out.println("rowkey: " + new String(r.getRow()));
            for (Cell cell : r.rawCells()) {
                System.out.println("family: " + Bytes.toString(CellUtil.cloneFamily(cell))
                        + " ==== value: " + Bytes.toString(CellUtil.cloneValue(cell)));
            }
        }
        rs.close();
    } catch (Exception e) {
        e.printStackTrace();
    } finally {
        try {
            if (table != null)
                table.close();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}
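One caveat with SingleColumnValueFilter: by default, rows that do not contain the referenced column at all also pass the filter. If only genuine matches are wanted, setFilterIfMissing should be enabled; a small addition to the filter above:

SingleColumnValueFilter filter = new SingleColumnValueFilter(
        Bytes.toBytes("column1"), null, CompareOp.EQUAL,
        Bytes.toBytes("ddd"));
filter.setFilterIfMissing(true); // drop rows that lack column1 entirely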
/**
 * Combined-condition query: all three filters must match (AND semantics).
 *
 * @param tableName
 */
public static void QueryByCondition3(String tableName) {
    Table table = null;
    TableName tn = TableName.valueOf(tableName);
    try {
        table = connection.getTable(tn);
        List<Filter> filters = new ArrayList<Filter>();
        filters.add(new SingleColumnValueFilter(
                Bytes.toBytes("column1"), null, CompareOp.EQUAL, Bytes.toBytes("aaa")));
        filters.add(new SingleColumnValueFilter(
                Bytes.toBytes("column2"), null, CompareOp.EQUAL, Bytes.toBytes("bbb")));
        filters.add(new SingleColumnValueFilter(
                Bytes.toBytes("column3"), null, CompareOp.EQUAL, Bytes.toBytes("ccc")));
        FilterList filterList1 = new FilterList(filters); // default operator is MUST_PASS_ALL
        Scan scan = new Scan();
        scan.setFilter(filterList1);
        ResultScanner rs = table.getScanner(scan);
        for (Result r : rs) {
            System.out.println("rowkey: " + new String(r.getRow()));
            for (Cell cell : r.rawCells()) {
                System.out.println("family: " + Bytes.toString(CellUtil.cloneFamily(cell))
                        + " ==== value: " + Bytes.toString(CellUtil.cloneValue(cell)));
            }
        }
        rs.close();
    } catch (Exception e) {
        e.printStackTrace();
    } finally {
        try {
            if (table != null)
                table.close();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}
As noted above, there is currently no API that directly supports deleting by combined (non-rowkey) conditions.
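A common workaround is to scan with the desired filter, collect the matching rowkeys, and batch the Deletes. A minimal sketch, reusing the shared connection from above (the method name deleteByFilter is my own, and error handling is elided for brevity):

public static void deleteByFilter(String tableName, Filter filter) throws IOException {
    Table table = connection.getTable(TableName.valueOf(tableName));
    try {
        Scan scan = new Scan();
        scan.setFilter(filter);
        ResultScanner rs = table.getScanner(scan);
        List<Delete> deletes = new ArrayList<Delete>();
        try {
            for (Result r : rs) {
                deletes.add(new Delete(r.getRow())); // delete each matching row in full
            }
        } finally {
            rs.close();
        }
        table.delete(deletes); // batched delete of all matched rows
    } finally {
        table.close();
    }
}

Note that this is not atomic: rows inserted or modified between the scan and the delete may be missed.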
5. The code above is single-threaded; with Thread we can run concurrent insert operations:
public static class ImportThread extends Thread {
    @Override
    public void run() {
        try {
            InsertProcess("test");
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}
/*
 * Insert routine used by each worker thread: builds a batch of Puts and
 * writes them with a single table.put(List) call.
 */
public static void InsertProcess(String tableName) throws IOException {
    Table table = null;
    TableName tn = TableName.valueOf(tableName);
    int count = 15000;
    long start = System.currentTimeMillis();
    try {
        table = connection.getTable(tn);
        List<Put> list = new ArrayList<Put>();
        Put put = null;
        for (int i = 0; i < count; i++) {
            // One Put per row; the rowkey is the value passed to the Put constructor.
            put = new Put(String.valueOf(random.nextLong()).getBytes());
            put.addColumn("column1".getBytes(), null, "ddd".getBytes()); // first column
            put.addColumn("column2".getBytes(), null, "bbb".getBytes()); // second column
            put.addColumn("column3".getBytes(), null, "ccc".getBytes()); // third column
            list.add(put);
        }
        table.put(list);
        long stop = System.currentTimeMillis();
        System.out.println("Thread " + Thread.currentThread().getId() + " inserted "
                + count + " rows in " + (stop - start) * 1.0 / 1000 + "s");
    } catch (IOException e) {
        e.printStackTrace();
    } finally {
        try {
            if (table != null)
                table.close();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}
/*
 * Multi-thread insert test.
 */
public static void MultThreadInsert() throws InterruptedException {
    System.out.println("--------- MultThreadInsert test started ----------");
    long start = System.currentTimeMillis();
    int threadNumber = 5;
    Thread[] threads = new Thread[threadNumber];
    for (int i = 0; i < threads.length; i++) {
        threads[i] = new ImportThread();
        threads[i].start();
    }
    for (int j = 0; j < threads.length; j++) {
        threads[j].join();
    }
    long stop = System.currentTimeMillis();
    // 15000 rows per thread, matching the count in InsertProcess
    System.out.println("MultThreadInsert: " + threadNumber * 15000 + " rows in "
            + (stop - start) * 1.0 / 1000 + "s");
    System.out.println("--------- MultThreadInsert test finished ----------");
}
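For completeness, a minimal driver that wires the pieces together might look like the sketch below (the table name "test" matches the one hard-coded in ImportThread):

public static void main(String[] args) throws Exception {
    createTable("test");    // (re)create the target table with the three families
    MultThreadInsert();     // run the concurrent insert benchmark
    // QueryAll("test");    // optionally spot-check the inserted rows
    connection.close();     // release ZooKeeper and RPC resources
}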
6. Using the code above, we can run tests for different combinations of data volume and concurrency; the results are summarized below.
The tests show that with the Java API, single-threaded insert throughput ranged from about 2,000 to 7,000 rows/s, while with concurrent threads it peaked at about 10,900 rows/s, slightly better than single-node MySQL ingest. For small data volumes, however, HBase was slower than MySQL, and throughput fluctuated considerably.
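Part of that fluctuation comes from the client side: table.put(List<Put>) accumulates the entire batch in memory before submitting it. HBase 1.0 also offers BufferedMutator, which buffers writes and flushes automatically as the buffer fills. A sketch of the insert loop rewritten with it (the method name is my own, and the 4 MB buffer size is an illustrative value, not a measured optimum):

public static void insertWithBufferedMutator(String tableName, int count) throws IOException {
    BufferedMutatorParams params = new BufferedMutatorParams(TableName.valueOf(tableName))
            .writeBufferSize(4 * 1024 * 1024); // 4 MB client-side write buffer
    BufferedMutator mutator = connection.getBufferedMutator(params);
    try {
        for (int i = 0; i < count; i++) {
            Put put = new Put(String.valueOf(random.nextLong()).getBytes());
            put.addColumn("column1".getBytes(), null, "ddd".getBytes());
            mutator.mutate(put); // buffered; flushed automatically when the buffer fills
        }
    } finally {
        mutator.close(); // flushes any remaining buffered mutations
    }
}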
Note: when accessing HDFS remotely from Eclipse on Windows, you must add hostname-to-IP mappings for every host in the cluster to the local hosts file; otherwise the hosts cannot be resolved when communicating with the cluster:
192.168.13.74 traceMaster
192.168.13.75 traceSlave1
192.168.13.76 traceSlave2
In the next section, we will try loading data with MapReduce and see whether throughput can be pushed further.