1. 背景
对于实时更新的维表,以什么组件来处理作为FlinkSQL的source维表?HBase?Kafka?或mysql?哪一种方案能得到正确结果?
且需要考虑到事实表和维表关联的时候,是否需要和维表的历史版本关联?还是只关联维表的最新版本?
下文以只关联维表的最新版本为目标进行测试。
2. 实践过程
2.1 采用upsert-kafka作为维表
(1) kafka生产者代码
// 创建消息
DateTimeFormatter dtf = DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss.nnnnnnnnn");
for (int i = 2; i < 8; i++) {
JSONObject json1 = new JSONObject();
json1.put("key", i+"");
//json.put("update_time", dtf.format(LocalDateTime.now()));
JSONObject json = new JSONObject();
json.put("id", i+"");
json.put("name", "name444"+i);
ProducerRecord<String, String> record = new ProducerRecord<String, String>(
"flinksqldim",
json1.toJSONString(),
json.toJSONString()
);
}
(2) FlinkSQL主体代码
// 创建执行环境
//EnvironmentSettings settings = EnvironmentSettings.inStreamingMode();
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
EnvironmentSettings settings = EnvironmentSettings.newInstance()
.useBlinkPlanner()
.inStreamingMode()
.build();
TableEnvironment tableEnv = StreamTableEnvironment.create(env, settings);
// 把kafka中的topic映射成一个输入临时表
tableEnv.executeSql(
"CREATE TABLE sensor_source(" +
" id STRING, " +
" name STRING, " +
" o_time TIMESTAMP(3), " +
" WATERMARK FOR o_time AS o_time " +
" ) WITH (" +
" 'connector' = 'kafka'," +
" 'topic' = 'flinksqldemo'," +
" 'properties.bootstrap.servers' = 'ip:port'," +
" 'properties.group.id' = 'flinksqlCount'," +
" 'scan.startup.mode' = 'earliest-offset'," +
" 'format' = 'json')"
);
// 把kafka中数据 映射成输入维表 - 实时变更的维表
tableEnv.executeSql(
"CREATE TABLE dim_source (" +
" id STRING," +
" name STRING," +
" update_time TIMESTAMP(3) METADATA FROM 'timestamp' VIRTUAL, " +
" WATERMARK FOR update_time AS update_time, " +
" PRIMARY KEY (id) NOT ENFORCED" +
") WITH (" +
" 'connector' = 'upsert-kafka'," +
" 'topic' = 'flinksqldim'," +
" 'properties.bootstrap.servers' = 'ip:port'," +
" 'properties.group.id' = 'flinksqlDim'," +
" 'key.format' = 'json'," +
" 'value.format' = 'json')"
);
// 把Mysql中的表映射为一个输出临时表
String mysql_sql = "CREATE TABLE mysql_sink (" +
" name STRING," +
" cnt BIGINT," +
" PRIMARY KEY (name) NOT ENFORCED" +
") WITH (" +
" 'connector' = 'jdbc'," +
" 'url' = 'jdbc:mysql://ip:port/kafka?serverTimezone=UTC'," +
" 'table-name' = 'count_info'," +
" 'username' = 'xxx'," +
" 'password' = 'xxx'" +
")";
tableEnv.executeSql(mysql_sql);
// 插入数据
TableResult tableResult = tableEnv.executeSql(
"INSERT INTO mysql_sink " +
"SELECT b.name, count(*) as cnt " +
"FROM sensor_source as a " +
"INNER JOIN dim_source as b " +
"on a.id = b.id " +
"where a.id > 3 " +
"group by b.name "
// "order by name "
);
System.out.println(tableResult.getJobClient().get().getJobStatus());
2.2 采用mysql作为维表
mysql维表的更新(新增 或 修改)只对新到来的流生效。
主体代码
// 创建执行环境,可web ui查看
Configuration conf = new Configuration();
StreamExecutionEnvironment env = StreamExecutionEnvironment.createLocalEnvironmentWithWebUI(conf);
EnvironmentSettings settings = EnvironmentSettings.newInstance()
.useBlinkPlanner()
.inStreamingMode()
.build();
TableEnvironment tableEnv = StreamTableEnvironment.create(env, settings);
// 把kafka中的topic映射成一个输入临时表,需要proctime
tableEnv.executeSql(
"CREATE TABLE sensor_source( " +
" id STRING, " +
" name STRING," +
" proctime AS PROCTIME() " +
" ) WITH ( " +
" 'connector' = 'kafka'," +
" 'topic' = 'flinksqldemo'," +
" 'properties.bootstrap.servers' = 'ip:port'," +
" 'properties.group.id' = 'flinksqlCount'," +
" 'scan.startup.mode' = 'earliest-offset'," +
" 'format' = 'json')"
);
// 把mysql中表映射成输入维表
tableEnv.executeSql(
"CREATE TABLE mysql_source (" +
" id STRING," +
" name STRING," +
" PRIMARY KEY (id) NOT ENFORCED" +
") WITH (" +
" 'connector' = 'jdbc'," +
" 'url' = 'jdbc:mysql://ip:port/kafka?serverTimezone=UTC'," +
" 'table-name' = 'test_info'," +
" 'username' = 'xxx'," +
" 'password' = 'xxx'," +
" 'lookup.max-retries' = '3'," +
" 'lookup.cache.max-rows' = '1000'," +
" 'lookup.cache.ttl' = '60s' " +
")"
);
// 把Mysql中的表映射为一个输出临时表
String mysql_sql = "CREATE TABLE mysql_sink (" +
" name STRING," +
" cnt BIGINT," +
" PRIMARY KEY (name) NOT ENFORCED" +
") WITH (" +
" 'connector' = 'jdbc'," +
" 'url' = 'jdbc:mysql://ip:port/kafka?serverTimezone=UTC'," +
" 'table-name' = 'count_info'," +
" 'username' = 'xxx'," +
" 'password' = 'xxx'" +
")";
tableEnv.executeSql(mysql_sql);
// 插入数据 FOR SYSTEM_TIME AS OF a.proctime
TableResult tableResult = tableEnv.executeSql(
"INSERT INTO mysql_sink " +
"SELECT b.name, count(*) as cnt " +
"FROM sensor_source as a " +
"INNER JOIN mysql_source FOR SYSTEM_TIME AS OF a.proctime as b " +
"on a.id = b.id " +
"where a.id > 3 " +
"group by b.name "
);
System.out.println(tableResult.getJobClient().get().getJobStatus());
3. 试错
3.1 使用Regular Joins 常规join
kafka生产者代码
for (int i = 1; i < 10; i++) {
//json.put("update_time", dtf.format(LocalDateTime.now()));
JSONObject json = new JSONObject();
json.put("id", i+"");
json.put("name", "name555"+i);
ProducerRecord<Integer, String> record = new ProducerRecord<Integer, String>(
"flinksqldim2",
i,
json.toJSONString()
);
// 发送消息
Future<RecordMetadata> future = producer.send(record);
FlinkSQL处理代码
// 创建执行环境
//EnvironmentSettings settings = EnvironmentSettings.inStreamingMode();
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
EnvironmentSettings settings = EnvironmentSettings.newInstance()
.useBlinkPlanner()
.inStreamingMode()
.build();
TableEnvironment tableEnv = StreamTableEnvironment.create(env, settings);
// 把kafka中的topic映射成一个输入临时表
tableEnv.executeSql(
"CREATE TABLE sensor_source(" +
"id STRING, " +
"name STRING, " +
"o_time TIMESTAMP(3), " +
" WATERMARK FOR o_time AS o_time " +
") WITH (" +
" 'connector' = 'kafka'," +
" 'topic' = 'flinksqldemo'," +
" 'properties.bootstrap.servers' = 'ip:port'," +
" 'properties.group.id' = 'flinksqlCount'," +
" 'scan.startup.mode' = 'earliest-offset'," +
" 'format' = 'json')"
);
// 把kafka中数据 映射成输入维表 - 实时变更的维表, 非compacted topic
tableEnv.executeSql(
"CREATE TABLE dim_source ( " +
" id STRING, " +
" name STRING, " +
" update_time TIMESTAMP(3) METADATA FROM 'timestamp' VIRTUAL, " +
" WATERMARK FOR update_time AS update_time " +
") WITH (" +
" 'connector' = 'kafka'," +
" 'topic' = 'flinksqldim2'," +
" 'properties.bootstrap.servers' = 'ip:port'," +
" 'properties.group.id' = 'flinksqlDim'," +
" 'scan.startup.mode' = 'earliest-offset'," +
" 'format' = 'json')"
);
// 把Mysql中的表映射为一个输出临时表
String mysql_sql = "CREATE TABLE mysql_sink (" +
" name STRING," +
" cnt BIGINT," +
" PRIMARY KEY (name) NOT ENFORCED" +
") WITH (" +
" 'connector' = 'jdbc'," +
" 'url' = 'jdbc:mysql://ip:port/kafka?serverTimezone=UTC'," +
" 'table-name' = 'count_info'," +
" 'username' = 'xxx'," +
" 'password' = 'xxx'" +
")";
tableEnv.executeSql(mysql_sql);
// 插入数据
TableResult tableResult = tableEnv.executeSql(
"INSERT INTO mysql_sink " +
"SELECT b.name, count(*) as cnt " +
"FROM sensor_source a " +
"JOIN dim_source b " +
"on a.id = b.id " +
"where a.id > 3 " +
"group by b.name "
);
System.out.println(tableResult.getJobClient().get().getJobStatus());
维表流更新了几次数据后,结果表count_info中数据错乱
3.2 mysql表作为维表
tableEnv.executeSql(
"CREATE TEMPORARY TABLE mysql_source (" +
" id STRING," +
" name STRING," +
" update_time TIMESTAMP(3)," +
" PRIMARY KEY (id) NOT ENFORCED" +
") WITH (" +
" 'connector' = 'jdbc'," +
" 'url' = 'jdbc:mysql://ip:port/kafka?serverTimezone=UTC'," +
" 'table-name' = 'user_info'," +
" 'username' = 'xxx'," +
" 'password' = 'xxx'" +
")"
);
报错如下:
Event-Time Temporal Table Join requires both primary key and row time attribute in versioned table, but no row time attribute can be found.
可见,mysql直接作为维表不能做temporal join , 那如果加上CDC呢?是不是就可以了?更新中。。。
3.3 使用HBase表作为维表
FlinkSQL主体代码
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
EnvironmentSettings settings = EnvironmentSettings.newInstance()
.useBlinkPlanner()
.inStreamingMode()
.build();
TableEnvironment tableEnv = StreamTableEnvironment.create(env, settings);
// 把kafka中的topic映射成一个输入临时表
tableEnv.executeSql(
"CREATE TABLE sensor_source( " +
" id STRING, " +
" name STRING, " +
" o_time TIMESTAMP(3), " +
" WATERMARK FOR o_time AS o_time " +
" ) WITH (" +
" 'connector' = 'kafka'," +
" 'topic' = 'flinksqldemo'," +
" 'properties.bootstrap.servers' = 'ip:port'," +
" 'properties.group.id' = 'flinksqlCount'," +
" 'scan.startup.mode' = 'earliest-offset'," +
" 'format' = 'json')"
);
// 把hbase中表映射成输入维表
tableEnv.executeSql(
"CREATE TABLE hbase_source (" +
" rowkey STRING," +
" INFO ROW<name STRING>," +
" PRIMARY KEY (rowkey) NOT ENFORCED" +
") WITH (" +
" 'connector' = 'hbase-1.4'," +
" 'zookeeper.quorum' = 'node1:2181'," +
" 'table-name' = 'TEST_INFO'" +
")"
);
// 把Mysql中的表映射为一个输出临时表
String mysql_sql = "CREATE TABLE mysql_sink (" +
" name STRING," +
" cnt BIGINT," +
" PRIMARY KEY (name) NOT ENFORCED" +
") WITH (" +
" 'connector' = 'jdbc'," +
" 'url' = 'jdbc:mysql://ip:port/kafka?serverTimezone=UTC'," +
" 'table-name' = 'count_info'," +
" 'username' = 'xxx'," +
" 'password' = 'xxx'" +
")";
tableEnv.executeSql(mysql_sql);
// 插入数据
TableResult tableResult = tableEnv.executeSql(
"INSERT INTO mysql_sink " +
"SELECT INFO.name, count(*) as cnt " +
"FROM sensor_source o " +
"JOIN hbase_source c " +
"on o.id = c.rowkey " +
"where o.id > 3 " +
"group by INFO.name "
);
System.out.println(tableResult.getJobClient().get().getJobStatus());
遇到问题
# (1)未找到类
ClassNotFoundException: org.apache.hadoop.util.PlatformName
# 添加hadoop-auth
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-auth</artifactId>
<version>2.5.1</version>
</dependency>
# (2)未找到表
HBase Table expects at least one region in scan, please check the HBase table status in HBase cluster
# 因为表是用phoenix创建的,表名为大写,而代码用了小写,所以找不到
# (3)classloader问题
在./conf/flink-conf.yaml文件添加以下配置
classloader.check-leaked-classloader: false