1. Set up the connection (see the earlier article: Java API Operations on Hadoop in HA Mode)
import org.apache.hadoop.conf.Configuration;

static String ClusterName = "nsstargate";
private static final String HADOOP_URL = "hdfs://" + ClusterName;
public static Configuration conf;

static {
    conf = new Configuration();
    conf.set("fs.defaultFS", HADOOP_URL);
    // HA nameservice and its two NameNodes
    conf.set("dfs.nameservices", ClusterName);
    conf.set("dfs.ha.namenodes." + ClusterName, "nn1,nn2");
    conf.set("dfs.namenode.rpc-address." + ClusterName + ".nn1", "172.16.50.24:8020");
    conf.set("dfs.namenode.rpc-address." + ClusterName + ".nn2", "172.16.50.21:8020");
    // client-side failover between the two NameNodes
    conf.set("dfs.client.failover.proxy.provider." + ClusterName,
            "org.apache.hadoop.hdfs.server.namenode.ha.ConfiguredFailoverProxyProvider");
    conf.set("fs.hdfs.impl", "org.apache.hadoop.hdfs.DistributedFileSystem");
}
Note: if you only create Configuration conf = new Configuration(); without setting the HDFS connection properties, the file will be written to the local disk instead (this requires a locally configured Hadoop environment).
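If the cluster's client configuration files are available, the hard-coded properties above can be loaded from them instead. A minimal sketch, assuming the configs are deployed under /etc/hadoop/conf (the path is an assumption; adjust it to your environment):

Configuration conf = new Configuration();
// assumed locations of the cluster client configs
conf.addResource(new Path("/etc/hadoop/conf/core-site.xml"));
conf.addResource(new Path("/etc/hadoop/conf/hdfs-site.xml"));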
2. Define the ORC file schema
TypeDescription schema = TypeDescription.createStruct()
        .addField("field1", TypeDescription.createLong())
        .addField("field2", TypeDescription.createDouble())
        .addField("field3", TypeDescription.createBoolean())
        .addField("field4", TypeDescription.createTimestamp())
        .addField("field5", TypeDescription.createString());
3. Write the ORC file to HDFS
String fileName = "/user/test/test_orc_file_datatype.orc";
Path path = new Path(fileName);
FileSystem fs;
try {
    fs = path.getFileSystem(conf);
    // remove any previous output so the writer can create a fresh file
    if (fs.exists(path)) {
        fs.delete(path, true);
    }
} catch (Exception e) {
    throw new RuntimeException(e);
}
Writer writer = OrcFile.createWriter(path,
        OrcFile.writerOptions(conf)
                .setSchema(schema)
                .stripeSize(67108864)   // 64 MB stripes
                .bufferSize(131072)     // 128 KB compression buffer
                .blockSize(134217728)   // 128 MB HDFS blocks
                .compress(CompressionKind.ZLIB)
                .version(OrcFile.Version.V_0_12));
// rows to write
Object[][] contents = new Object[][]{
        {1L, 1.1, false, "2016-10-21 14:56:25", "abcd"},
        {2L, 1.2, true, "2016-10-22 14:56:25", "中文"}
};
VectorizedRowBatch batch = schema.createRowBatch();
for (Object[] content : contents) {
    int rowCount = batch.size++;
    ((LongColumnVector) batch.cols[0]).vector[rowCount] = (long) content[0];
    ((DoubleColumnVector) batch.cols[1]).vector[rowCount] = (double) content[1];
    // booleans are stored in a LongColumnVector: true -> 1, false -> 0
    ((LongColumnVector) batch.cols[2]).vector[rowCount] = ((boolean) content[2]) ? 1 : 0;
    // set() fills both the millisecond and nanosecond components;
    // writing only time[rowCount] would leave nanos[] uninitialized
    ((TimestampColumnVector) batch.cols[3])
            .set(rowCount, Timestamp.valueOf((String) content[3]));
    ((BytesColumnVector) batch.cols[4])
            .setVal(rowCount, content[4].toString().getBytes(StandardCharsets.UTF_8));
    // batch full
    if (batch.size == batch.getMaxSize()) {
        writer.addRowBatch(batch);
        batch.reset();
    }
}
// flush any remaining rows
if (batch.size > 0) {
    writer.addRowBatch(batch);
}
writer.close();
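To sanity-check the result, the file can be read back through the same core ORC API (Reader, RecordReader and the vectorized batch); a minimal sketch that prints only the first column:

Reader reader = OrcFile.createReader(path, OrcFile.readerOptions(conf));
RecordReader rows = reader.rows();
VectorizedRowBatch readBatch = reader.getSchema().createRowBatch();
while (rows.nextBatch(readBatch)) {
    LongColumnVector field1 = (LongColumnVector) readBatch.cols[0];
    for (int r = 0; r < readBatch.size; r++) {
        System.out.println("field1 = " + field1.vector[r]);
    }
}
rows.close();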
4. Create the Hive table and load the ORC file
create table testtype (field1 bigint, f2 double, f3 boolean, field4 timestamp, f5 string) stored as orc;
load data inpath '/user/test/test_orc_file_datatype.orc' overwrite into table testtype;
Things to watch when creating the file and loading it into the Hive table:
When a field is boolean, declare it as boolean in the ORC schema, write its values as long (true as 1, false as 0), and declare the Hive column as boolean. The Hive column names do not need to match the schema field names, but the column order must be the same.
See also: Using Core Java