场景

数据实时产生并写入 Kafka,由 Spark(Structured Streaming)实时消费 Kafka 中的数据,经过转换后写入 Hudi 表。

实现

package com.zhen.hudi.streaming

import com.zhen.hudi.didi.SparkUtils
import org.apache.hudi.DataSourceWriteOptions.{PARTITIONPATH_FIELD, PRECOMBINE_FIELD, RECORDKEY_FIELD}
import org.apache.hudi.config.HoodieWriteConfig.TBL_NAME
import org.apache.spark.sql.functions._
import org.apache.spark.sql.streaming.OutputMode
import org.apache.spark.sql.{DataFrame, Dataset, Row, SaveMode, SparkSession}

/**
 * Structured Streaming demo: consumes order messages from Kafka, applies a small
 * ETL step, and writes every micro-batch into a Hudi table.
 *
 * @Author FengZhen
 * @Date 3/3/22 10:16 PM
 */
object HudiStructureDemo {

  /**
   * Opens a streaming DataFrame over a single Kafka topic.
   *
   * @param spark            Spark session used to create the stream
   * @param topicName        Kafka topic to subscribe to
   * @param bootstrapServers Kafka broker list passed as `kafka.bootstrap.servers`;
   *                         defaults to the address this demo was written against
   * @return the Kafka source DataFrame exactly as returned by `load()`
   */
  def readFromKafka(
      spark: SparkSession,
      topicName: String,
      bootstrapServers: String = "localhost:9092"
  ): DataFrame =
    spark.readStream
      .format("kafka")
      .option("kafka.bootstrap.servers", bootstrapServers)
      .option("subscribe", topicName)
      // Where to start consuming.
      .option("startingOffsets", "latest")
      // Process at most 100,000 offsets per trigger.
      .option("maxOffsetsPerTrigger", 100000)
      // Do not fail the query when data loss is detected.
      .option("failOnDataLoss", "false")
      .load()

  /**
   * Flattens raw Kafka records into the columns stored in the Hudi table.
   *
   * The Kafka key becomes `order_id`; the value is treated as a JSON document from
   * which `user_id`, `order_time`, `ip`, `order_money` and `order_status` are
   * extracted as strings. `ts` and `day` are derived from `order_time`.
   *
   * Rows whose `order_id` is null or empty are dropped: `order_id` is configured as
   * the Hudi record key in `saveToHudi`, and Hudi does not accept a null or empty
   * record key, so a single keyless Kafka message would otherwise fail the whole
   * micro-batch and be replayed from the checkpoint on every restart.
   *
   * @param streamDF DataFrame with the Kafka source columns
   *                 (`key`, `value`, `topic`, `partition`, `offset`, `timestamp`)
   * @return the transformed DataFrame, restricted to rows with a usable `order_id`
   */
  def process(streamDF: DataFrame): DataFrame = {
    streamDF
      // Keep the Kafka metadata columns and decode key/value as strings.
      .selectExpr(
        "CAST(key AS STRING) AS order_id",
        "CAST(value AS STRING) AS message",
        "topic", "partition", "offset", "timestamp"
      )
      // Extract the individual fields from the JSON message.
      .withColumn("user_id", get_json_object(col("message"), "$.userId"))
      .withColumn("order_time", get_json_object(col("message"), "$.orderTime"))
      .withColumn("ip", get_json_object(col("message"), "$.ip"))
      .withColumn("order_money", get_json_object(col("message"), "$.orderMoney"))
      .withColumn("order_status", get_json_object(col("message"), "$.orderStatus"))
      // The raw message is no longer needed once its fields are extracted.
      .drop(col("message"))
      // Parse order_time into a timestamp column; it is used as the Hudi precombine field.
      // NOTE(review): a missing or differently formatted order_time yields a null ts/day;
      // confirm how such rows should be handled by the Hudi write.
      .withColumn("ts", to_timestamp(col("order_time"), "yyyy-MM-dd HH:mm:ss.SSS"))
      // Partition value: the first 10 characters of order_time (yyyy-MM-dd).
      // Spark SQL substring positions are 1-based.
      .withColumn("day", substring(col("order_time"), 1, 10))
      // Drop rows that cannot be keyed (see scaladoc above).
      .filter(col("order_id").isNotNull && col("order_id") =!= "")
  }

  /**
   * Starts a streaming query that appends every micro-batch of `streamDF` to a
   * Hudi table through the batch `hudi` data source.
   *
   * The started query is not returned; callers wait on it through
   * `spark.streams`.
   *
   * @param streamDF           streaming DataFrame containing at least the
   *                           `order_id`, `ts` and `day` columns
   * @param tableName          Hudi table name
   * @param tablePath          base path the Hudi table is written to
   * @param checkpointLocation checkpoint directory of the streaming query
   */
  def saveToHudi(
      streamDF: DataFrame,
      tableName: String = "tbl_hudi_order",
      tablePath: String = "/hudi-warehouse/tbl_hudi_order",
      checkpointLocation: String = "/datas/hudi-spark/struct-ckpt-1001"
  ): Unit = {
    import org.apache.hudi.DataSourceWriteOptions._
    import org.apache.hudi.config.HoodieWriteConfig._
    import org.apache.hudi.keygen.constant.KeyGeneratorOptions._

    // Hudi write options shared by every micro-batch.
    val hudiOptions: Map[String, String] = Map(
      "hoodie.insert.shuffle.parallelism" -> "2",
      "hoodie.upsert.shuffle.parallelism" -> "2",
      // Record key.
      RECORDKEY_FIELD.key() -> "order_id",
      // Precombine field.
      PRECOMBINE_FIELD.key() -> "ts",
      // Partition column.
      PARTITIONPATH_FIELD.key() -> "day",
      // Use Hive-style partition directory names.
      HIVE_STYLE_PARTITIONING_ENABLE.key() -> "true",
      // Table name.
      TBL_NAME.key() -> tableName,
      // Merge-on-read table type.
      TABLE_TYPE.key() -> "MERGE_ON_READ"
    )

    // Explicitly typed as a Scala function so that the call below resolves to the
    // Scala overload of foreachBatch rather than being ambiguous with the Java
    // VoidFunction2 overload.
    val writeBatch: (Dataset[Row], Long) => Unit = (batchDF, batchId) => {
      println(s"=============== BatchId: ${batchId} start =============== ")
      batchDF.write
        .mode(SaveMode.Append)
        .format("hudi")
        .options(hudiOptions)
        .save(tablePath)
    }

    streamDF.writeStream
      .outputMode(OutputMode.Append())
      .queryName("query-hudi-streaming")
      .foreachBatch(writeBatch)
      .option("checkpointLocation", checkpointLocation)
      .start()
  }

  /**
   * Entry point: wires Kafka source, transformation and Hudi sink together and
   * blocks until a streaming query terminates.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    // 1. Build the SparkSession.
    val spark: SparkSession = SparkUtils.createSparkSession(this.getClass)

    // 2. Consume data from Kafka.
    val kafkaStreamDF: DataFrame = readFromKafka(spark, "order-topic")

    // 3. Extract fields and convert types.
    val streamDF: DataFrame = process(kafkaStreamDF)

    // 4. Write to the Hudi table (MERGE_ON_READ).
    saveToHudi(streamDF)

    // 5. Report the running queries, then wait for termination.
    spark.streams.active.foreach(query => println(s"Query: ${query.name} is Running"))
    spark.streams.awaitAnyTermination()
  }

}