This article converts CSV data to the Parquet format; the data types involved are String, timestamp, double, and boolean.

Timestamps are stored as INT64 (the TIMESTAMP_MILLIS logical type, i.e. milliseconds since the Unix epoch).

Parquet metadata


Parquet supports the following physical types:


  • BOOLEAN: 1 bit boolean
  • INT32: 32 bit signed ints
  • INT64: 64 bit signed ints
  • INT96: 96 bit signed ints
  • FLOAT: IEEE 32-bit floating point values
  • DOUBLE: IEEE 64-bit floating point values
  • BYTE_ARRAY: arbitrarily long byte arrays.
    See the official documentation for details: https://parquet.apache.org/documentation/latest/
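
These physical types are combined with logical-type annotations when a schema is declared: UTF8 on BYTE_ARRAY (binary) for strings, and TIMESTAMP_MILLIS on INT64 for millisecond timestamps, exactly as the schema in the code further below does. A minimal sketch of declaring and parsing such a schema with MessageTypeParser (the field names here are purely illustrative):

import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.MessageTypeParser;

public class SchemaDemo {
    public static void main(String[] args) {
        //binary (UTF8) holds strings; int64 (TIMESTAMP_MILLIS) holds epoch
        //milliseconds; double and boolean map directly to DOUBLE and BOOLEAN.
        MessageType schema = MessageTypeParser.parseMessageType(
                "message demo {" +
                " optional binary name (UTF8);" +
                " optional int64 event_time (TIMESTAMP_MILLIS);" +
                " optional double amount;" +
                " optional boolean active;" +
                "}");
        System.out.println(schema);
    }
}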

The Maven dependencies involved are as follows:

pom.xml

<!-- https://mvnrepository.com/artifact/org.apache.parquet/parquet-hadoop -->
<dependency>
    <groupId>org.apache.parquet</groupId>
    <artifactId>parquet-hadoop</artifactId>
    <version>1.9.0</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.apache.parquet/parquet-format -->
<dependency>
    <groupId>org.apache.parquet</groupId>
    <artifactId>parquet-format</artifactId>
    <version>2.3.1</version>
</dependency>
<dependency>
    <groupId>org.apache.parquet</groupId>
    <artifactId>parquet-encoding</artifactId>
    <version>1.9.0</version>
</dependency>
<dependency>
    <groupId>org.apache.parquet</groupId>
    <artifactId>parquet-common</artifactId>
    <version>1.9.0</version>
</dependency>
<dependency>
    <groupId>org.apache.parquet</groupId>
    <artifactId>parquet-column</artifactId>
    <version>1.9.0</version>
</dependency>
<dependency>
    <groupId>org.apache.parquet</groupId>
    <artifactId>parquet-avro</artifactId>
    <version>1.9.0</version>
</dependency>
<dependency>
    <groupId>org.testng</groupId>
    <artifactId>testng</artifactId>
    <!-- pin a concrete version rather than the deprecated RELEASE meta-version -->
    <version>6.14.3</version>
    <scope>compile</scope>
</dependency>
<!-- https://mvnrepository.com/artifact/org.apache.hadoop/hadoop-common -->
<dependency>
    <groupId>org.apache.hadoop</groupId>
    <artifactId>hadoop-common</artifactId>
    <version>3.2.0</version>
</dependency>
<!-- https://mvnrepository.com/artifact/net.sourceforge.javacsv/javacsv -->
<dependency>
    <groupId>net.sourceforge.javacsv</groupId>
    <artifactId>javacsv</artifactId>
    <version>2.0</version>
</dependency>
<dependency>
    <groupId>org.apache.flink</groupId>
    <artifactId>flink-java</artifactId>
    <version>1.14.0</version>
</dependency>
<dependency>
    <groupId>org.jodd</groupId>
    <artifactId>jodd-core</artifactId>
    <version>5.1.5</version>
</dependency>
<dependency>
    <groupId>joda-time</groupId>
    <artifactId>joda-time</artifactId>
    <version>2.10.5</version>
</dependency>

The code is as follows:

TestParqueWriter_2.java

import com.csvreader.CsvReader;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.column.ParquetProperties;
import org.apache.parquet.example.data.Group;
import org.apache.parquet.example.data.simple.SimpleGroupFactory;
import org.apache.parquet.format.converter.ParquetMetadataConverter;
import org.apache.parquet.hadoop.ParquetFileReader;
import org.apache.parquet.hadoop.ParquetFileWriter;
import org.apache.parquet.hadoop.ParquetReader;
import org.apache.parquet.hadoop.ParquetWriter;
import org.apache.parquet.hadoop.example.ExampleParquetWriter;
import org.apache.parquet.hadoop.example.GroupReadSupport;
import org.apache.parquet.hadoop.metadata.CompressionCodecName;
import org.apache.parquet.hadoop.metadata.ParquetMetadata;
import org.apache.parquet.io.api.Binary;
import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.MessageTypeParser;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.nio.charset.Charset;
import java.text.DateFormat;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.Date;
import static com.csv2Parquet.readCSV.gethead;

public class TestParqueWriter_2 {

    //Use the current timestamp as the output file name.
    public static Path file = new Path(System.currentTimeMillis() + ".parquet");
    private static Logger logger = LoggerFactory
            .getLogger(TestParqueWriter_2.class);
    //Design a schema whose field types match the actual data; the program
    //generates the Parquet data from the MessageType parsed below.
    private static String schemaStr = "message schema " +
            "{repeated binary item_id (UTF8);" +
            "optional int64 bill_billing_period_start_date (TIMESTAMP_MILLIS);" +
            "repeated double cost;" +
            "repeated binary year (UTF8);" +
            "repeated binary month (UTF8);}";
    static MessageType schema = MessageTypeParser.parseMessageType(schemaStr);

    //Description: print the MessageType.
    public static void testParseSchema() {
        System.out.println(schema.toString());
    }

    //Description: read the schema from an existing Parquet file's footer.
    public static void testGetSchema() throws Exception {
        Configuration configuration = new Configuration();
        Path parquetFilePath = new Path("input.parquet");
        ParquetMetadata readFooter = ParquetFileReader.readFooter(configuration,
                parquetFilePath, ParquetMetadataConverter.NO_FILTER);
        MessageType schema = readFooter.getFileMetaData().getSchema();
        System.out.println(schema.toString());
    }

    //Automatically read the CSV header and rows and write them to the Parquet file.
    private static void testParquetWriter() throws IOException {
        //The format must match the date strings in the input data.
        DateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
        String csvFile = "output.csv";
        String pattern_3 = "cost";
        //Open the CSV file.
        CsvReader csvReader = new CsvReader(csvFile, ',', Charset.forName("UTF-8"));
        ExampleParquetWriter.Builder builder = ExampleParquetWriter
                .builder(file).withWriteMode(ParquetFileWriter.Mode.CREATE)
                .withWriterVersion(ParquetProperties.WriterVersion.PARQUET_2_0)
                .withCompressionCodec(CompressionCodecName.SNAPPY)
                //.withConf(configuration)
                .withType(schema);
        ParquetWriter<Group> writer = builder.build();
        String[] csvhead = gethead("input.csv");
        SimpleGroupFactory groupFactory = new SimpleGroupFactory(schema);
        csvReader.readHeaders();
        while (csvReader.readRecord()) {
            String[] str = csvReader.getValues();
            Group group = groupFactory.newGroup();
            String newName;
            for (int j = 0; j < csvhead.length; j++) {
                System.out.println(csvhead[j] + ":" + str[j]);
                String[] dirDivided = csvhead[j].split("_");
                newName = dirDivided[dirDivided.length - 1];
                //The last word of the field name marks a double column.
                if (newName.equals(pattern_3)) {
                    if (!str[j].isEmpty())
                        group.add(csvhead[j], Double.parseDouble(str[j]));
                    else group.add(csvhead[j], Double.NaN);
                }
                //The field name marks a boolean column.
                else if (csvhead[j].equals("workingsupport")) {
                    if (!str[j].isEmpty())
                        group.add(csvhead[j], Boolean.parseBoolean(str[j]));
                    //An empty cell defaults to false.
                    else group.add(csvhead[j], false);
                }
                //The last word of the field name marks a timestamp column.
                else if (newName.equals("date")) {
                    Date date = new Date();
                    Date date_1 = new Date();
                    try {
                        //Parse the string into a Date; the format must match the date string.
                        date = sdf.parse(str[j]);
                        //Add eight hours to the time.
                        date_1 = new Date(date.getTime() + 8 * 60 * 60 * 1000);
                    } catch (ParseException e) {
                        e.printStackTrace();
                    }
                    //Store the timestamp as epoch milliseconds (INT64).
                    group.add(csvhead[j], date_1.getTime());
                }
                //Everything else is written as a UTF8 string.
                else {
                    if (!str[j].isEmpty())
                        group.add(csvhead[j], str[j]);
                    else group.add(csvhead[j], Binary.EMPTY);
                }
            }
            writer.write(group);
        }
        writer.close();
    }
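
    //Note: SimpleDateFormat above parses in the JVM's default time zone and then
    //shifts the result by a hard-coded eight hours. A java.time-based sketch that
    //makes the offset explicit instead (an assumption: the CSV timestamps are UTC
    //wall-clock strings and the +8h shift is meant to move them to UTC+8 before
    //storing epoch milliseconds in the INT64 column):
    private static long toEpochMillisUtcPlus8(String text) {
        java.time.format.DateTimeFormatter fmt =
                java.time.format.DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss");
        return java.time.LocalDateTime.parse(text, fmt)
                .toInstant(java.time.ZoneOffset.UTC)  //treat the parsed wall clock as UTC
                .plus(java.time.Duration.ofHours(8))  //the same +8h adjustment as above
                .toEpochMilli();
    }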

    //Description: test reading a Parquet file.
    private static void testParquetReader() throws IOException {
        Path file = new Path("output.parquet");
        ParquetReader.Builder<Group> builder = ParquetReader.builder(new GroupReadSupport(), file);
        ParquetReader<Group> reader = builder.build();
        // SimpleGroup group = (SimpleGroup) reader.read();
        // System.out.println("schema:" + group.getType().toString());
        // System.out.println(group.get(""));
        // System.out.println("identity_line_item_id:" + group.getString(1, 0));
        Group line;
        while ((line = reader.read()) != null) {
            //The getters must match the schema: the timestamp column is INT64
            //and cost is DOUBLE.
            System.out.println(line.getLong("bill_billing_period_start_date", 0));
            System.out.println(line.getDouble("cost", 0));
        }
    }

    public static void main(String[] args) throws Exception {
        testGetSchema();
        // testParseSchema();
        testParquetWriter();
        // testParquetReader();
    }
}
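
The code above statically imports gethead from com.csv2Parquet.readCSV, which the article does not show. A minimal sketch of what that helper might look like, assuming it simply returns the CSV header row as a String[] (the class and method names follow the import statement; the body is an assumption):

readCSV.java

package com.csv2Parquet;

import com.csvreader.CsvReader;
import java.io.IOException;
import java.nio.charset.Charset;

public class readCSV {
    //Read only the header row of a CSV file and return the column names.
    public static String[] gethead(String csvFile) throws IOException {
        CsvReader reader = new CsvReader(csvFile, ',', Charset.forName("UTF-8"));
        try {
            reader.readHeaders();       //consume the first line as the header
            return reader.getHeaders(); //column names in file order
        } finally {
            reader.close();
        }
    }
}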

The data involves private information and is not shown here. If you have questions you would like to discuss, feel free to leave a comment or send a private message.