读取 ORC 文件
/**
 * Reads the ORC file at {@code /tmp/Orc.orc} and prints each row batch to standard output.
 *
 * @throws IOException if the file cannot be opened or read
 */
@Test
public void readOrc() throws IOException {
    Configuration conf = new Configuration();
    Reader reader = OrcFile.createReader(new Path("/tmp/Orc.orc"),
            OrcFile.readerOptions(conf));
    RecordReader rows = reader.rows();
    try {
        // One batch object is reused; nextBatch is called on it until it returns false.
        VectorizedRowBatch batch = reader.getSchema().createRowBatch();
        while (rows.nextBatch(batch)) {
            System.out.println(batch.toString());
        }
    } finally {
        // Close the record reader even when reading or printing fails.
        rows.close();
    }
}
写 ORC 文件——单行
/**
 * Writes a single row ({@code x = 2}, {@code y = 6}) to {@code /tmp/Orc.orc} using the schema
 * {@code struct<x:int,y:int>}.
 *
 * <p>The writer is created with {@code overwrite(true)} so the test can be re-run, and can run
 * after other tests that write the same path, without failing on an existing file.
 *
 * @throws IOException if the file cannot be created or written
 */
@Test
public void writeLine3() throws IOException {
    Configuration conf = new Configuration();
    TypeDescription schema = TypeDescription.fromString("struct<x:int,y:int>");
    Writer writer = OrcFile.createWriter(new Path("/tmp/Orc.orc"),
            OrcFile.writerOptions(conf)
                    .setSchema(schema)
                    .overwrite(true));
    try {
        VectorizedRowBatch batch = schema.createRowBatch();
        LongColumnVector x = (LongColumnVector) batch.cols[0];
        LongColumnVector y = (LongColumnVector) batch.cols[1];
        // Claim the next free row slot in the batch, then fill both columns for it.
        int row = batch.size++;
        x.vector[row] = 2;
        y.vector[row] = 2 * 3;
        // Exactly one row was added above, so the batch is always non-empty here.
        writer.addRowBatch(batch);
    } finally {
        // Close the writer even when adding the batch fails.
        writer.close();
    }
}
写 ORC 文件——多行
/**
 * Writes several comma-separated text lines to {@code /tmp/Orc.orc}, one ORC row per line and
 * one column per comma-separated field.
 *
 * <p>Each field is stored according to the concrete type of the column vector that the schema
 * produced for that position, so changing the schema string changes how the text is parsed.
 * The current row count of the batch is printed before each row is added.
 *
 * @throws IOException if the file cannot be created or written
 * @throws IllegalArgumentException if a line does not have exactly one field per schema column
 */
@Test
public void writeLine2() throws IOException {
    String[] lines = new String[]{"1,a,aa", "2,b,bb", "3,c,cc", "4,d,dd", "1,a,aa", "2,b,bb", "3,c,cc", "4,d,dd", "1,a,aa", "2,b,bb", "3,c,cc", "4,d,dd", "1,a,aa", "2,b,bb", "3,c,cc", "4,d,dd"};
    Configuration conf = new Configuration();
    // To exercise the LongColumnVector path instead, use numeric lines such as "1,2,3" together
    // with the schema "struct<field1:int,field2:int,field3:int>".
    TypeDescription schema = TypeDescription.fromString("struct<field1:String,field2:String,field3:String>");
    Writer writer = OrcFile.createWriter(new Path("/tmp/Orc.orc"),
            OrcFile.writerOptions(conf)
                    .setSchema(schema).overwrite(true));
    try {
        VectorizedRowBatch batch = schema.createRowBatch();
        for (String line : lines) {
            String[] columns = line.split(",");
            // Reject malformed lines before claiming a row slot, so the batch never holds a
            // partially populated row.
            if (columns.length != batch.numCols) {
                throw new IllegalArgumentException("Expected " + batch.numCols
                        + " columns but found " + columns.length + " in line: " + line);
            }
            System.out.println(batch.size);
            int row = batch.size++;
            for (int i = 0; i < columns.length; i++) {
                writeOrcCell(batch.cols[i], row, columns[i]);
            }
            // Flush only after every column of the row has been set. Flushing from inside the
            // column loop would write the last row with just its first column populated.
            if (batch.size == batch.getMaxSize()) {
                writer.addRowBatch(batch);
                batch.reset();
            }
        }
        // Write whatever is left in the final, partially filled batch.
        if (batch.size != 0) {
            writer.addRowBatch(batch);
            batch.reset();
        }
    } finally {
        // Close the writer even when a row fails to parse or a batch fails to write.
        writer.close();
    }
}

/**
 * Parses {@code text} and stores it at position {@code row} of {@code vector}, choosing the
 * conversion from the vector's runtime type.
 *
 * @param vector the column vector to write into
 * @param row the row index within the current batch
 * @param text the textual field value to convert and store
 * @throws UnsupportedOperationException if the vector type is not one of the handled types
 */
private static void writeOrcCell(ColumnVector vector, int row, String text) {
    if (vector instanceof BytesColumnVector) {
        // Encode once, with an explicit charset rather than the platform default.
        byte[] bytes = text.getBytes(java.nio.charset.StandardCharsets.UTF_8);
        ((BytesColumnVector) vector).setVal(row, bytes, 0, bytes.length);
    } else if (vector instanceof Decimal64ColumnVector) {
        // Checked before LongColumnVector so a decimal column is never handled as a plain long.
        ((Decimal64ColumnVector) vector).set(row, HiveDecimal.create(text));
    } else if (vector instanceof LongColumnVector) {
        ((LongColumnVector) vector).vector[row] = Long.parseLong(text);
    } else if (vector instanceof DecimalColumnVector) {
        ((DecimalColumnVector) vector).set(row, HiveDecimal.create(text));
    } else if (vector instanceof DoubleColumnVector) {
        ((DoubleColumnVector) vector).vector[row] = Double.parseDouble(text);
    } else if (vector instanceof TimestampColumnVector) {
        ((TimestampColumnVector) vector).set(row, java.sql.Timestamp.valueOf(text));
    } else {
        // Fail loudly instead of silently leaving the cell unset.
        throw new UnsupportedOperationException(
                "Unsupported column vector type: " + vector.getClass().getName());
    }
}
引用的类(import 语句)
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.ColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
import org.apache.orc.*;
import org.junit.Test;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;