1、Reading and Writing Hudi from spark-shell
1.1、Launching spark-shell
// spark-shell for spark 3
spark-shell \
--packages org.apache.hudi:hudi-spark3-bundle_2.12:0.10.0,org.apache.spark:spark-avro_2.12:3.1.2 \
--conf 'spark.serializer=org.apache.spark.serializer.KryoSerializer'
// spark-shell for spark 2 with scala 2.12
spark-shell \
--packages org.apache.hudi:hudi-spark-bundle_2.12:0.10.0,org.apache.spark:spark-avro_2.12:2.4.4 \
--conf 'spark.serializer=org.apache.spark.serializer.KryoSerializer'
// spark-shell for spark 2 with scala 2.11
spark-shell \
--packages org.apache.hudi:hudi-spark-bundle_2.11:0.10.0,org.apache.spark:spark-avro_2.11:2.4.4 \
--conf 'spark.serializer=org.apache.spark.serializer.KryoSerializer'
1.2、Set Table Name
Set the table name, base path, and data generator:
// spark-shell
import org.apache.hudi.QuickstartUtils._
import scala.collection.JavaConversions._
import org.apache.spark.sql.SaveMode._
import org.apache.hudi.DataSourceReadOptions._
import org.apache.hudi.DataSourceWriteOptions._
import org.apache.hudi.config.HoodieWriteConfig._
val tableName = "hudi_trips_cow"
val basePath = "file:///tmp/hudi_trips_cow"
val dataGen = new DataGenerator
1.3、Insert Data
// spark-shell
val inserts = convertToStringList(dataGen.generateInserts(10))
val df = spark.read.json(spark.sparkContext.parallelize(inserts, 2))
df.write.format("hudi").
options(getQuickstartWriteConfigs).
option(PRECOMBINE_FIELD_OPT_KEY, "ts").
option(RECORDKEY_FIELD_OPT_KEY, "uuid").
option(PARTITIONPATH_FIELD_OPT_KEY, "partitionpath").
option(TABLE_NAME, tableName).
mode(Overwrite).
save(basePath)
1.4、Query Data
// spark-shell
val tripsSnapshotDF = spark.
read.
format("hudi").
load(basePath)
// load(basePath) uses the "/partitionKey=partitionValue" folder structure for Spark auto partition discovery
tripsSnapshotDF.createOrReplaceTempView("hudi_trips_snapshot")
spark.sql("select fare, begin_lon, begin_lat, ts from hudi_trips_snapshot where fare > 20.0").show()
spark.sql("select _hoodie_commit_time, _hoodie_record_key, _hoodie_partition_path, rider, driver, fare from hudi_trips_snapshot").show()
1.5、Update Data
// spark-shell
val updates = convertToStringList(dataGen.generateUpdates(10))
val df = spark.read.json(spark.sparkContext.parallelize(updates, 2))
df.write.format("hudi").
options(getQuickstartWriteConfigs).
option(PRECOMBINE_FIELD_OPT_KEY, "ts").
option(RECORDKEY_FIELD_OPT_KEY, "uuid").
option(PARTITIONPATH_FIELD_OPT_KEY, "partitionpath").
option(TABLE_NAME, tableName).
mode(Append).
save(basePath)
1.6、Incremental Query
// spark-shell
// reload data
spark.
read.
format("hudi").
load(basePath).
createOrReplaceTempView("hudi_trips_snapshot")
val commits = spark.sql("select distinct(_hoodie_commit_time) as commitTime from hudi_trips_snapshot order by commitTime").map(k => k.getString(0)).take(50)
val beginTime = commits(commits.length - 2) // commit time we are interested in
// incrementally query data
val tripsIncrementalDF = spark.read.format("hudi").
option(QUERY_TYPE_OPT_KEY, QUERY_TYPE_INCREMENTAL_OPT_VAL).
option(BEGIN_INSTANTTIME_OPT_KEY, beginTime).
load(basePath)
tripsIncrementalDF.createOrReplaceTempView("hudi_trips_incremental")
spark.sql("select `_hoodie_commit_time`, fare, begin_lon, begin_lat, ts from hudi_trips_incremental where fare > 20.0").show()
1.7、Point-in-Time Query
// spark-shell
val beginTime = "000" // Represents all commits > this time.
val endTime = commits(commits.length - 2) // commit time we are interested in
//incrementally query data
val tripsPointInTimeDF = spark.read.format("hudi").
option(QUERY_TYPE_OPT_KEY, QUERY_TYPE_INCREMENTAL_OPT_VAL).
option(BEGIN_INSTANTTIME_OPT_KEY, beginTime).
option(END_INSTANTTIME_OPT_KEY, endTime).
load(basePath)
tripsPointInTimeDF.createOrReplaceTempView("hudi_trips_point_in_time")
spark.sql("select `_hoodie_commit_time`, fare, begin_lon, begin_lat, ts from hudi_trips_point_in_time where fare > 20.0").show()
1.8、Delete Data
// spark-shell
// fetch total records count
spark.sql("select uuid, partitionpath from hudi_trips_snapshot").count()
// fetch two records to be deleted
val ds = spark.sql("select uuid, partitionpath from hudi_trips_snapshot").limit(2)
// issue deletes
val deletes = dataGen.generateDeletes(ds.collectAsList())
val df = spark.read.json(spark.sparkContext.parallelize(deletes, 2))
df.write.format("hudi").
options(getQuickstartWriteConfigs).
option(OPERATION_OPT_KEY,"delete").
option(PRECOMBINE_FIELD_OPT_KEY, "ts").
option(RECORDKEY_FIELD_OPT_KEY, "uuid").
option(PARTITIONPATH_FIELD_OPT_KEY, "partitionpath").
option(TABLE_NAME, tableName).
mode(Append).
save(basePath)
// run the same read query as above.
val roAfterDeleteViewDF = spark.
read.
format("hudi").
load(basePath)
roAfterDeleteViewDF.createOrReplaceTempView("hudi_trips_snapshot")
// fetch should return (total - 2) records
spark.sql("select uuid, partitionpath from hudi_trips_snapshot").count()
1.9、Insert Overwrite
// spark-shell
spark.
read.format("hudi").
load(basePath).
select("uuid","partitionpath").
sort("partitionpath","uuid").
show(100, false)
val inserts = convertToStringList(dataGen.generateInserts(10))
val df = spark.
read.json(spark.sparkContext.parallelize(inserts, 2)).
filter("partitionpath = 'americas/united_states/san_francisco'")
df.write.format("hudi").
options(getQuickstartWriteConfigs).
option(OPERATION.key(),"insert_overwrite").
option(PRECOMBINE_FIELD.key(), "ts").
option(RECORDKEY_FIELD.key(), "uuid").
option(PARTITIONPATH_FIELD.key(), "partitionpath").
option(TBL_NAME.key(), tableName).
mode(Append).
save(basePath)
// Should have different keys now for San Francisco alone, from query before.
spark.
read.format("hudi").
load(basePath).
select("uuid","partitionpath").
sort("partitionpath","uuid").
show(100, false)
2、Writing Data with Spark
This section mainly covers how to ingest new changes from external sources (or even other Hudi tables) using the DeltaStreamer tool, and how to speed up large Spark jobs via upserts using the Hudi datasource. Such tables can then be queried with a variety of query engines.
2.1、Write Operations
The DeltaStreamer tool supports three different write operations (a sketch of selecting the operation follows this list):
- Upsert: the default operation, where input records are first tagged as inserts or updates by looking them up in the index.
- Insert: similar to upsert, but it skips the index lookup entirely, so it can be much faster than upsert; it is only suitable for use cases that tolerate duplicates.
- Bulk_Insert: both upsert and insert keep input records in memory to speed up computation, which makes bootstrapping an initial table cumbersome. Bulk insert instead implements a sort-based data-writing algorithm that scales well to initial loads of hundreds of terabytes.
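As a minimal sketch (reusing the quickstart df, tableName, and basePath from section 1), the operation is chosen through the hoodie.datasource.write.operation option; here bulk_insert is selected for an initial load, with insert and upsert (the default) as the alternative values:
// spark-shell: explicitly choosing the write operation (sketch; reuses df, tableName and basePath from section 1)
df.write.format("hudi").
options(getQuickstartWriteConfigs).
option(OPERATION_OPT_KEY, "bulk_insert"). // or "insert" / "upsert" (the default)
option(PRECOMBINE_FIELD_OPT_KEY, "ts").
option(RECORDKEY_FIELD_OPT_KEY, "uuid").
option(PARTITIONPATH_FIELD_OPT_KEY, "partitionpath").
option(TABLE_NAME, tableName).
mode(Overwrite).
save(basePath)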
2.2、Delta Streamer
The Hudi DeltaStreamer utility provides ways to ingest data from different sources (DFS or Kafka) and has the following capabilities:
- Exactly-once consumption of new events from Kafka, and incremental imports from the output of Sqoop or HiveIncrementalPuller, or from files in a DFS folder
- Support for manually provided Avro schemas or schemas managed in the Confluent schema registry
- Support for incoming data in JSON, Avro, or custom record types
- Built-in checkpointing, rollback, and recovery
2.3、Writing Data with the Spark DataFrame API
1、Add the Hudi dependencies
<dependencies>
<dependency>
<groupId>org.apache.hudi</groupId>
<artifactId>hudi-client</artifactId>
<scope>provided</scope>
<version>0.8.0</version>
<type>pom</type>
</dependency>
<dependency>
<groupId>org.apache.hudi</groupId>
<artifactId>hudi-spark-bundle_2.12</artifactId>
<scope>provided</scope>
<version>0.8.0</version>
</dependency>
<dependency>
<groupId>org.apache.hudi</groupId>
<artifactId>hudi-hadoop-mr-bundle</artifactId>
<exclusions>
<exclusion>
<groupId>com.fasterxml.jackson.core</groupId>
<artifactId>jackson-databind</artifactId>
</exclusion>
<exclusion>
<groupId>com.fasterxml.jackson.core</groupId>
<artifactId>jackson-annotations</artifactId>
</exclusion>
</exclusions>
<version>0.8.0</version>
<scope>provided</scope>
</dependency>
</dependencies>
2、Make the hudi-hadoop-mr-bundle jar available to Hive; the Hive service then needs to be restarted.
3、Example code (Spark reads data and writes it to Hive and Hudi)
package com.hudi
import org.apache.hudi.DataSourceWriteOptions
import org.apache.hudi.config.HoodieIndexConfig
import org.apache.hudi.hive.MultiPartKeysValueExtractor
import org.apache.hudi.index.HoodieIndex
import org.apache.spark.SparkConf
import org.apache.spark.sql.{SaveMode, SparkSession}
/**
* @Author: Yuan Liu
* @Description:
* @Date: Created in 22:44 2021/12/21
*
* Good Good Study Day Day Up
*/
object TableOperator {
def main(args: Array[String]): Unit = {
val sparkConf = new SparkConf().setAppName("test_operator").setMaster("local[*]")
.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
val sparkSession = SparkSession.builder().config(sparkConf).enableHiveSupport().getOrCreate()
insertData(sparkSession);
}
def insertData(sparkSession: SparkSession) = {
import org.apache.spark.sql.functions._
val commitTime = System.currentTimeMillis().toString // generate a commit timestamp
val resultDF = sparkSession.read.json("/user/atguigu/ods/member.log")
.withColumn("ts", lit(commitTime)) // add a ts timestamp column
.withColumn("hudipartition", concat_ws("/", col("dt"), col("dn"))) // partition column combined from two fields
Class.forName("org.apache.hive.jdbc.HiveDriver") // load the Hive JDBC driver used by Hive sync
resultDF.write.format("hudi")
.option(DataSourceWriteOptions.TABLE_TYPE_OPT_KEY, DataSourceWriteOptions.MOR_TABLE_TYPE_OPT_VAL) // table type: MERGE_ON_READ or COPY_ON_WRITE
.option(DataSourceWriteOptions.RECORDKEY_FIELD_OPT_KEY, "uid") // record key (primary key)
.option(DataSourceWriteOptions.PRECOMBINE_FIELD_OPT_KEY, "ts") // precombine field used to pick the latest record
.option(DataSourceWriteOptions.PARTITIONPATH_FIELD_OPT_KEY, "hudipartition") // Hudi partition column
.option("hoodie.table.name", "member") // Hudi table name
.option(DataSourceWriteOptions.HIVE_URL_OPT_KEY, "jdbc:hive2://hadoop101:10000") // HiveServer2 JDBC URL
.option(DataSourceWriteOptions.HIVE_DATABASE_OPT_KEY, "default") // Hive database to sync to
.option(DataSourceWriteOptions.HIVE_TABLE_OPT_KEY, "member") // Hive table to sync to
.option(DataSourceWriteOptions.HIVE_PARTITION_FIELDS_OPT_KEY, "dt,dn") // partition columns of the synced Hive table
.option(DataSourceWriteOptions.HIVE_PARTITION_EXTRACTOR_CLASS_OPT_KEY, classOf[MultiPartKeysValueExtractor].getName) // partition value extractor, splits the partition path on /
.option(DataSourceWriteOptions.HIVE_SYNC_ENABLED_OPT_KEY, "true") // register and sync the dataset to Hive
.option(HoodieIndexConfig.BLOOM_INDEX_UPDATE_PARTITION_PATH, "true") // move records to the new partition path when their partition value changes
.option(HoodieIndexConfig.INDEX_TYPE_PROP, HoodieIndex.IndexType.GLOBAL_BLOOM.name()) // index type: HBASE, INMEMORY, BLOOM or GLOBAL_BLOOM; GLOBAL_BLOOM is required so records can still be found after a partition change
.option("hoodie.insert.shuffle.parallelism", "12")
.option("hoodie.upsert.shuffle.parallelism", "12")
.mode(SaveMode.Append)
.save("/user/my/hudi/hivetest")
}
/**
* Update data; Hudi supports record-level updates
*/
def updateData(sparkSession: SparkSession) = {
//read only 20 records and modify them, changing the fullname value of these 20 records
import org.apache.spark.sql.functions._
val commitTime = System.currentTimeMillis().toString // generate a commit timestamp
val resultDF = sparkSession.read.json("/user/my/ods/member.log")
.withColumn("ts", lit(commitTime)) // add a ts timestamp column
.withColumn("hudipartition", concat_ws("/", col("dt"), col("dn")))
.where("uid >=0 and uid <20")
.withColumn("fullname", lit("进行修改")) // overwrite fullname (the literal means "modified")
//after modifying the data, write it back; Hudi matches records on the DataSourceWriteOptions.RECORDKEY_FIELD_OPT_KEY record key to decide what to update
//the save mode must be Append for the write to act as an update
resultDF.write.format("hudi")
.option(DataSourceWriteOptions.TABLE_TYPE_OPT_KEY, DataSourceWriteOptions.MOR_TABLE_TYPE_OPT_VAL) // table type: MERGE_ON_READ or COPY_ON_WRITE
.option(DataSourceWriteOptions.RECORDKEY_FIELD_OPT_KEY, "uid") // record key (primary key)
.option(DataSourceWriteOptions.PRECOMBINE_FIELD_OPT_KEY, "ts") // precombine field used to pick the latest record
.option(DataSourceWriteOptions.PARTITIONPATH_FIELD_OPT_KEY, "hudipartition") // Hudi partition column
.option("hoodie.table.name", "member") // Hudi table name
.option(DataSourceWriteOptions.HIVE_URL_OPT_KEY, "jdbc:hive2://hadoop101:10000") // HiveServer2 JDBC URL
.option(DataSourceWriteOptions.HIVE_DATABASE_OPT_KEY, "default") // Hive database to sync to
.option(DataSourceWriteOptions.HIVE_TABLE_OPT_KEY, "member") // Hive table to sync to
.option(DataSourceWriteOptions.HIVE_PARTITION_FIELDS_OPT_KEY, "dt,dn") // partition columns of the synced Hive table
.option(DataSourceWriteOptions.HIVE_PARTITION_EXTRACTOR_CLASS_OPT_KEY, classOf[MultiPartKeysValueExtractor].getName) // partition value extractor, splits the partition path on /
.option(DataSourceWriteOptions.HIVE_SYNC_ENABLED_OPT_KEY, "true") // register and sync the dataset to Hive
.option(HoodieIndexConfig.BLOOM_INDEX_UPDATE_PARTITION_PATH, "true") // move records to the new partition path when their partition value changes
.option(HoodieIndexConfig.INDEX_TYPE_PROP, HoodieIndex.IndexType.GLOBAL_BLOOM.name()) // index type: HBASE, INMEMORY, BLOOM or GLOBAL_BLOOM; GLOBAL_BLOOM is required so records can still be found after a partition change
.option("hoodie.insert.shuffle.parallelism", "12")
.option("hoodie.upsert.shuffle.parallelism", "12")
.mode(SaveMode.Append)
.save("/user/my/hudi/hivetest")
}
}
}
Note on DataSourceWriteOptions.TABLE_TYPE_OPT_KEY:
DataSourceWriteOptions.MOR_TABLE_TYPE_OPT_VAL: merge on read; two tables are produced in Hive.
DataSourceWriteOptions.COW_TABLE_TYPE_OPT_VAL: copy on write; only one table is produced in Hive.
- If the table type is copy on write, the corresponding Hive table is created with the HoodieParquetInputFormat input format and supports snapshot queries and incremental queries.
- If the table type is merge on read, two Hive tables are produced, ending in _ro and _rt respectively. The _rt table supports snapshot queries and incremental queries: querying it returns a merged view of the base column data and the incremental log data, so updated data is visible immediately. The _ro table reads only the base column data and does not read the incremental logs. The _rt table uses the HoodieParquetRealtimeInputFormat input format, while the _ro table uses HoodieParquetInputFormat.
2.4、Comparing the Two Table Types
2.4.1、Query Engines and Query Types Supported by the Two Tables
- Snapshot query: queries the state of the table as of the latest commit. For a copy-on-write table, simply querying the table directly shows the updated data.
- For a merge-on-read table, the table ending in _rt is the latest snapshot view; when querying it with Spark SQL, the spark.sql.hive.convertMetastoreParquet parameter must be set to false (see the sketch after this list). Querying the table ending in _ro is a read-optimized query: it reads only the base column data and does not show updated records.
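A minimal sketch of comparing the two views with Spark SQL, assuming the MOR example from section 2.3 synced a Hive table named member (so Hive exposes member_ro and member_rt) and that Hive support is enabled in the Spark session:
// spark-shell: real-time vs read-optimized view of a MOR table (sketch; table and column names assume the earlier example)
spark.sql("set spark.sql.hive.convertMetastoreParquet=false") // let Hudi's input format merge base and log files
spark.sql("select uid, fullname from default.member_rt where uid >= 0 and uid < 20").show() // merged view, shows updates
spark.sql("select uid, fullname from default.member_ro where uid >= 0 and uid < 20").show() // base column data only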
2.4.2、Snapshot Query and Incremental Query
1、Snapshot query (the Spark default; the query type can also be left unspecified)
spark
.read
.format("hudi")
.option(DataSourceReadOptions.QUERY_TYPE_OPT_KEY,DataSourceReadOptions.QUERY_TYPE_SNAPSHOT_OPT_VAL)
.load("/user/my/hudi/hivetest/*/*")
.where("uid>=0 and uid<20")
.show
2、Incremental query
Hudi supports incremental queries based on commit timestamps: set the query type to incremental and pass in a timestamp, and Spark will return only the data committed after that timestamp.
spark.read.format("hudi")
.option(DataSourceReadOptions.QUERY_TYPE_OPT_KEY, DataSourceReadOptions.QUERY_TYPE_INCREMENTAL_OPT_VAL)
.option(DataSourceReadOptions.BEGIN_INSTANTTIME_OPT_KEY, "20210805165813")
.load("/user/my/hudi/hivetest/").show(40)
DataSourceReadOptions.BEGIN_INSTANTTIME_OPT_KEY is the begin timestamp; there is a corresponding DataSourceReadOptions.END_INSTANTTIME_OPT_KEY end timestamp, so an incremental query can also be restricted to a time range (the query returns commits strictly greater than the begin timestamp).
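A minimal sketch of a range-bounded incremental query over the same path, assuming the two instant values below are placeholders taken from the table's actual commit timeline:
spark.read.format("hudi")
.option(DataSourceReadOptions.QUERY_TYPE_OPT_KEY, DataSourceReadOptions.QUERY_TYPE_INCREMENTAL_OPT_VAL)
.option(DataSourceReadOptions.BEGIN_INSTANTTIME_OPT_KEY, "20210805165813") // lower bound (exclusive), placeholder instant
.option(DataSourceReadOptions.END_INSTANTTIME_OPT_KEY, "20210806000000") // upper bound, placeholder instant
.load("/user/my/hudi/hivetest/").show(40)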
2.5、Writing to Hudi in Real Time with Structured Streaming
Example code:
import java.util.concurrent.TimeUnit
import com.alibaba.fastjson.{JSON, JSONObject} // JSON parsing (matches the JSON.parseObject / getString calls below)
import org.apache.hudi.DataSourceWriteOptions
import org.apache.hudi.config.HoodieIndexConfig
import org.apache.hudi.hive.MultiPartKeysValueExtractor
import org.apache.hudi.index.HoodieIndex
import org.apache.spark.SparkConf
import org.apache.spark.sql.{Dataset, SaveMode, SparkSession}
import org.apache.spark.sql.streaming.Trigger
object StructuredStreamingTest {
def main(args: Array[String]): Unit = {
val sparkConf = new SparkConf()
.setAppName("test-app")
.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
.set("spark.sql.shuffle.partitions", "12")
// .setMaster("local[*]")
val sparkSession = SparkSession.builder().config(sparkConf).getOrCreate()
val df = sparkSession.readStream.format("kafka")
.option("kafka.bootstrap.servers", "hadoop101:9092,hadoop102:9092,hadoop103:9092")
.option("subscribe", "test2")
.option("startingOffsets", "earliest")
.option("maxOffsetsPerTrigger", "20000")
.load()
import sparkSession.implicits._
val query = df.selectExpr("cast (value as string)").as[String]
.map(item => {
val jsonObj: JSONObject = JSON.parseObject(item)
val userid = jsonObj.getString("userid").toLong;
val username = jsonObj.getString("username")
val age = jsonObj.getString("age").toInt
val partition = jsonObj.getString("partition").toInt
val ts = System.currentTimeMillis()
new Model(userid, username, age, partition, ts)
}).writeStream.foreachBatch { (batchDF: Dataset[Model], batchid: Long) =>
batchDF.write.format("hudi")
.option(DataSourceWriteOptions.TABLE_TYPE_OPT_KEY, DataSourceWriteOptions.MOR_TABLE_TYPE_OPT_VAL) // table type: MERGE_ON_READ or COPY_ON_WRITE
.option(DataSourceWriteOptions.RECORDKEY_FIELD_OPT_KEY, "userid") // record key (primary key)
.option(DataSourceWriteOptions.PRECOMBINE_FIELD_OPT_KEY, "ts") // precombine field used to pick the latest record
.option(DataSourceWriteOptions.PARTITIONPATH_FIELD_OPT_KEY, "partition") // Hudi partition column
.option("hoodie.table.name", "kafak_test_tabel") // Hudi table name
.option(DataSourceWriteOptions.HIVE_URL_OPT_KEY, "jdbc:hive2://hadoop101:10000") // HiveServer2 JDBC URL
.option(DataSourceWriteOptions.HIVE_DATABASE_OPT_KEY, "default") // Hive database to sync to
.option(DataSourceWriteOptions.HIVE_TABLE_OPT_KEY, "kafak_test_tabel") // Hive table to sync to
.option(DataSourceWriteOptions.HIVE_PARTITION_FIELDS_OPT_KEY, "partition") // partition column of the synced Hive table
.option(DataSourceWriteOptions.HIVE_PARTITION_EXTRACTOR_CLASS_OPT_KEY, classOf[MultiPartKeysValueExtractor].getName) // partition value extractor, splits the partition path on /
.option(DataSourceWriteOptions.HIVE_SYNC_ENABLED_OPT_KEY, "true") // register and sync the dataset to Hive
.option(HoodieIndexConfig.BLOOM_INDEX_UPDATE_PARTITION_PATH, "true") // move records to the new partition path when their partition value changes
.option(HoodieIndexConfig.INDEX_TYPE_PROP, HoodieIndex.IndexType.GLOBAL_BLOOM.name()) // index type: HBASE, INMEMORY, BLOOM or GLOBAL_BLOOM; GLOBAL_BLOOM is required so records can still be found after a partition change
.option("hoodie.insert.shuffle.parallelism", "12")
.option("hoodie.upsert.shuffle.parallelism", "12")
.mode(SaveMode.Append)
.save("/ss/hudi/test-table")
}.option("checkpointLocation", "/ss/checkpoint")
.trigger(Trigger.ProcessingTime(5, TimeUnit.MINUTES))
.start()
query.awaitTermination()
}
}
case class Model(userid: Long, username: String, age: Int, partition: Int, ts: Long)
3、Concurrency Control
3.1、Concurrency Control Supported by Hudi
- MVCC: for table operations such as compaction, cleaning, and commits, Hudi uses multi-version concurrency control to provide snapshot isolation between these table operations and queries. With the MVCC model, Hudi supports running any number of such operation jobs concurrently and guarantees that no conflicts occur. This is Hudi's default model.
- OPTIMISTIC CONCURRENCY: for write operations (upsert, insert, etc.), optimistic concurrency control lets multiple writers write to the same table. Hudi supports file-level optimistic consistency: for any two commits (writes) to the same table, both are allowed to succeed as long as they do not write to overlapping files that are being changed. This feature is experimental and requires Zookeeper or the HiveMetastore to acquire locks.
3.2、Using Concurrent Writes
(1) To enable optimistic concurrent writes, set the following properties:
hoodie.write.concurrency.mode=optimistic_concurrency_control
hoodie.cleaner.policy.failed.writes=LAZY
hoodie.write.lock.provider=<lock-provider-classname>
(2) Hudi's lock service supports two providers, Zookeeper or the HiveMetastore (choose either one). The Zookeeper-related parameters are:
hoodie.write.lock.provider=org.apache.hudi.client.transaction.lock.ZookeeperBasedLockProvider
hoodie.write.lock.zookeeper.url
hoodie.write.lock.zookeeper.port
hoodie.write.lock.zookeeper.lock_key
hoodie.write.lock.zookeeper.base_path
(3) The HiveMetastore-related parameters are as follows (a sketch using them appears after the Zookeeper example below):
hoodie.write.lock.provider=org.apache.hudi.hive.HiveMetastoreBasedLockProvider
hoodie.write.lock.hivemetastore.database
hoodie.write.lock.hivemetastore.table
Example:
.option("hoodie.cleaner.policy.failed.writes", "LAZY")
.option("hoodie.write.concurrency.mode", "optimistic_concurrency_control")
.option("hoodie.write.lock.provider","org.apache.hudi.client.transaction.lock.ZookeeperBasedLockProvider")
.option("hoodie.write.lock.zookeeper.url", "hadoop101,hadoop102,hadoop103")
.option("hoodie.write.lock.zookeeper.port", "2181")
.option("hoodie.write.lock.zookeeper.lock_key", "test_table")
.option("hoodie.write.lock.zookeeper.base_path", "/multiwriter_test")