Table of Contents
- 0 What schema evolution means
- 1 Test code
- 2 Results
- 3 Fixing the insert of old-schema data
0 What schema evolution means
What does a Hudi table contain after it receives writes that add a new field? The test below shows how the table schema evolves and what happens to the existing rows.
1 Test code
dept = [("Finance",10), \
("Marketing",20), \
("Sales",30), \
("IT",40) \
]
deptColumns = ["dept_name","dept_id"]
write_df = spark.createDataFrame(data=dept, schema = deptColumns)
write_df.printSchema()
write_df.show(truncate=False)
save_path='/user/testSchema/'
hudi_options = dict()
hudi_options['hoodie.table.name'] = 'schema'
hudi_options['hoodie.datasource.write.recordkey.field'] ='dept_id'
hudi_options['hoodie.datasource.write.table.name']='schema'
hudi_options['hoodie.datasource.write.operation']='upsert'
hudi_options['hoodie.datasource.write.precombine.field']='dept_name'
hudi_options['hoodie.datasource.write.partitionpath.field']=''
write_df.write.format("hudi").options(**hudi_options).mode("append").save(save_path)
print('Original data:')
df = spark.read.format("hudi").load('/user/testSchema/*/*')
df.show()
dept2 = [("Finance", 50, 'test1'), ("Marketing",60, 'test2')]
deptColumns2 = ["dept_name","dept_id", "new_field"]
write_df2 = spark.createDataFrame(data=dept2, schema = deptColumns2)
write_df2.printSchema()
write_df2.write.format("hudi").options(**hudi_options).mode("append").save(save_path)
print('Data after adding the new field:')
df2 = spark.read.format("hudi").load('/user/testSchema/*/*')
df2.show()
# The write below, which re-inserts data in the old schema, fails because the new field is missing
dept3 = [("Teacher",80)]
deptColumns3 = ["dept_name","dept_id"]
write_df3 = spark.createDataFrame(data=dept3, schema = deptColumns3)
write_df3.printSchema()
write_df3.show(truncate=False)
write_df3.write.format("hudi").options(**hudi_options).mode("append").save(save_path)
# Error message: ERROR queue.BoundedInMemoryExecutor: error producing records
# org.apache.parquet.io.InvalidRecordException: Parquet/Avro schema mismatch: Avro field 'new_field' not found
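Before retrying the write, you can check the mismatch up front by comparing the columns of the DataFrame you are about to write with the columns already in the table. This is only a diagnostic sketch using the variables defined above (the Hudi metadata columns are excluded from the comparison):
hudi_meta_cols = {'_hoodie_commit_time', '_hoodie_commit_seqno', '_hoodie_record_key',
                  '_hoodie_partition_path', '_hoodie_file_name'}
table_df = spark.read.format("hudi").load('/user/testSchema/*/*')
table_cols = set(table_df.columns) - hudi_meta_cols
missing_cols = table_cols - set(write_df3.columns)
# Expected to print {'new_field'}, the same field the Avro error complains about.
print('Columns missing from the incoming DataFrame:', missing_cols)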
2 Results
>>> df = spark.read.format("hudi").load('/user/testSchema/*/*')
21/11/16 17:35:50 WARN hudi.DefaultSource: Loading Base File Only View.
>>> df.show()
+-------------------+--------------------+------------------+----------------------+--------------------+---------+-------+
|_hoodie_commit_time|_hoodie_commit_seqno|_hoodie_record_key|_hoodie_partition_path| _hoodie_file_name|dept_name|dept_id|
+-------------------+--------------------+------------------+----------------------+--------------------+---------+-------+
| 20211116172847| 20211116172847_0_1| 40| default|031435a7-fca3-437...| IT| 40|
| 20211116172847| 20211116172847_0_2| 10| default|031435a7-fca3-437...| Finance| 10|
| 20211116172847| 20211116172847_0_3| 30| default|031435a7-fca3-437...| Sales| 30|
| 20211116172847| 20211116172847_0_4| 20| default|031435a7-fca3-437...|Marketing| 20|
+-------------------+--------------------+------------------+----------------------+--------------------+---------+-------+
>>> dept2 = [("Finance", 50, 'test1'), ("Marketing",60, 'test2')]
>>> deptColumns = ["dept_name","dept_id", "new_field"]
>>> write_df2 = spark.createDataFrame(data=dept, schema = deptColumns)
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
File "/software/spark-3.1.2-bin-hadoop2.7/python/pyspark/sql/session.py", line 675, in createDataFrame
return self._create_dataframe(data, schema, samplingRatio, verifySchema)
File "/software/spark-3.1.2-bin-hadoop2.7/python/pyspark/sql/session.py", line 700, in _create_dataframe
rdd, schema = self._createFromLocal(map(prepare, data), schema)
File "/software/spark-3.1.2-bin-hadoop2.7/python/pyspark/sql/session.py", line 517, in _createFromLocal
struct.fields[i].name = name
IndexError: list index out of range
>>> deptColumns2 = ["dept_name","dept_id", "new_field"]
>>> write_df2 = spark.createDataFrame(data=dept2, schema = deptColumns2)
>>> write_df2.printSchema()
root
|-- dept_name: string (nullable = true)
|-- dept_id: long (nullable = true)
|-- new_field: string (nullable = true)
>>> write_df2.write.format("hudi").options(**hudi_options).mode("append").save(save_path)
>>> df2 = spark.read.format("hudi").load('/user/testSchema/*/*')
21/11/16 17:37:42 WARN hudi.DefaultSource: Loading Base File Only View.
>>> df2.show()
+-------------------+--------------------+------------------+----------------------+--------------------+---------+-------+---------+
|_hoodie_commit_time|_hoodie_commit_seqno|_hoodie_record_key|_hoodie_partition_path| _hoodie_file_name|dept_name|dept_id|new_field|
+-------------------+--------------------+------------------+----------------------+--------------------+---------+-------+---------+
| 20211116172847| 20211116172847_0_1| 40| default|031435a7-fca3-437...| IT| 40| null|
| 20211116172847| 20211116172847_0_2| 10| default|031435a7-fca3-437...| Finance| 10| null|
| 20211116172847| 20211116172847_0_3| 30| default|031435a7-fca3-437...| Sales| 30| null|
| 20211116172847| 20211116172847_0_4| 20| default|031435a7-fca3-437...|Marketing| 20| null|
| 20211116173638| 20211116173638_0_5| 60| default|031435a7-fca3-437...|Marketing| 60| test2|
| 20211116173638| 20211116173638_0_6| 50| default|031435a7-fca3-437...| Finance| 50| test1|
+-------------------+--------------------+------------------+----------------------+--------------------+---------+-------+---------+
3 Fixing the insert of old-schema data
The sample code below is a simple fix for inserting the original, old-schema data into the evolved table. The idea: compare the incoming data's schema with the table's schema, backfill the columns missing from the old data with nulls, then insert.
save_path='/user/testSchema/'
hudi_options = dict()
hudi_options['hoodie.table.name'] = 'schema'
hudi_options['hoodie.datasource.write.recordkey.field'] ='dept_id'
hudi_options['hoodie.datasource.write.table.name']='schema'
hudi_options['hoodie.datasource.write.operation']='upsert'
hudi_options['hoodie.datasource.write.precombine.field']='dept_name'
hudi_options['hoodie.datasource.write.partitionpath.field']=''
import pyspark.sql.functions as F
import json
dept = [("Teacher",80)]
deptColumns = ["dept_name","dept_id"]
write_df = spark.createDataFrame(data=dept, schema = deptColumns)
write_df.printSchema()
write_df.show(truncate=False)
new_df_json = json.loads(write_df.schema.json())['fields']
new_df_schema_field = []
for d in new_df_json:
    new_df_schema_field.append(d['name'])
df = spark.read.format("hudi").load('/user/testSchema/*/*')
columns_to_drop =['_hoodie_commit_time', '_hoodie_commit_seqno', '_hoodie_record_key', '_hoodie_partition_path', '_hoodie_file_name']
original_df_json = json.loads(df.drop(*columns_to_drop).schema.json())['fields']
for d in original_df_json:
    if d['name'] not in new_df_schema_field:
        # Column exists in the table but not in the new DataFrame: add it as a typed null.
        write_df = write_df.withColumn(d['name'], F.lit(None).astype(d['type']))
write_df.write.format("hudi").options(**hudi_options).mode("append").save(save_path)
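The same idea can be wrapped in a small reusable helper, so that any DataFrame is aligned with the current table schema before the upsert. This is only a sketch under the assumptions used above (the table lives at save_path and the Hudi metadata columns are ignored); align_with_table is a hypothetical name, not part of Hudi's API:
import pyspark.sql.functions as F

HUDI_META_COLS = ['_hoodie_commit_time', '_hoodie_commit_seqno', '_hoodie_record_key',
                  '_hoodie_partition_path', '_hoodie_file_name']

def align_with_table(spark, df, table_path):
    # Read the current table schema and backfill any missing columns as typed nulls.
    table_df = spark.read.format("hudi").load(table_path + '*/*').drop(*HUDI_META_COLS)
    for field in table_df.schema.fields:
        if field.name not in df.columns:
            df = df.withColumn(field.name, F.lit(None).cast(field.dataType))
    return df

# Usage (same write options as above):
# aligned_df = align_with_table(spark, write_df, save_path)
# aligned_df.write.format("hudi").options(**hudi_options).mode("append").save(save_path)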
The code below is meant to run before every write to the Hudi table: it records the table schema to a file on HDFS and backfills any columns that are missing from the DataFrame about to be written:
## Backfill the schema
columns_to_drop =['_hoodie_commit_time', '_hoodie_commit_seqno', '_hoodie_record_key', '_hoodie_partition_path', '_hoodie_file_name']
hdfs_schema_file_path = self._write_hudi_file_base_path + "/table_schema.txt"
import json
import pyspark.sql.functions as F
new_df_json = json.loads(write_df.drop(*columns_to_drop).schema.json())['fields']
new_df_schema = dict()
for d in new_df_json:
new_df_schema[d['name']] = d['type']
write_new_df_schema = new_df_schema
# Backfill missing columns
if self.__judgeHdfsFileExist(hdfs_schema_file_path):
    # Read back the previously recorded schema (stored as a dict literal string).
    original_df_schema = eval(self._spark.read.text(hdfs_schema_file_path).first()[0])
    for k, v in original_df_schema.items():
        if k not in new_df_schema:
            # The table has this column but the DataFrame about to be written does not.
            write_df = write_df.withColumn(k, F.lit(None).astype(v))
            write_new_df_schema[k] = v
# Record the schema
write_new_df_schema_str = str(write_new_df_schema)
df = self._spark.createDataFrame([{'table_schema': write_new_df_schema_str}])
df.write.format("text").option("header", "false").mode("overwrite") \
    .save(hdfs_schema_file_path)
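A small variation on the snippet above, if you would rather not call eval on the stored schema: write the dict out as JSON and read it back with json.loads. This is only a sketch in the same class context (same hdfs_schema_file_path and __judgeHdfsFileExist as above):
import json

# Record the schema as JSON instead of a Python dict repr.
schema_json = json.dumps(write_new_df_schema)
self._spark.createDataFrame([{'table_schema': schema_json}]) \
    .write.format("text").mode("overwrite").save(hdfs_schema_file_path)

# Restore it later without eval.
if self.__judgeHdfsFileExist(hdfs_schema_file_path):
    original_df_schema = json.loads(self._spark.read.text(hdfs_schema_file_path).first()[0])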
Helper function:
def __judgeHdfsFileExist(self, hdfs_file_path):
    # Check whether a path exists on HDFS through the Hadoop FileSystem API.
    sc = self._spark.sparkContext
    fs = sc._jvm.org.apache.hadoop.fs.FileSystem.get(sc._jsc.hadoopConfiguration())
    return fs.exists(sc._jvm.org.apache.hadoop.fs.Path(hdfs_file_path))