The previous post was based on Flink 1.10, and that version could not keep joining the Kafka stream against the latest Hive data once the Hive table had been updated.
This post implements the "dynamic" join with Flink 1.12.2, in Java. The dependencies and code for this tiny demo follow.
Dependencies:
<dependencies>
    <dependency>
        <groupId>org.apache.flink</groupId>
        <artifactId>flink-clients_2.12</artifactId>
        <version>1.12.2</version>
    </dependency>
    <dependency>
        <groupId>org.apache.flink</groupId>
        <artifactId>flink-streaming-java_2.12</artifactId>
        <version>1.12.2</version>
    </dependency>
    <dependency>
        <groupId>org.apache.flink</groupId>
        <artifactId>flink-connector-kafka_2.12</artifactId>
        <version>1.12.2</version>
    </dependency>
    <dependency>
        <groupId>org.apache.flink</groupId>
        <artifactId>flink-table-api-scala_2.12</artifactId>
        <version>1.12.2</version>
    </dependency>
    <dependency>
        <groupId>org.apache.flink</groupId>
        <artifactId>flink-table-api-java-bridge_2.12</artifactId>
        <version>1.12.2</version>
    </dependency>
    <dependency>
        <groupId>org.apache.flink</groupId>
        <artifactId>flink-table-planner_2.12</artifactId>
        <version>1.12.2</version>
    </dependency>
    <dependency>
        <groupId>org.apache.flink</groupId>
        <artifactId>flink-table-planner-blink_2.12</artifactId>
        <version>1.12.2</version>
    </dependency>
    <dependency>
        <groupId>org.apache.flink</groupId>
        <artifactId>flink-table-api-scala-bridge_2.12</artifactId>
        <version>1.12.2</version>
    </dependency>
    <dependency>
        <groupId>org.apache.flink</groupId>
        <artifactId>flink-connector-hive_2.12</artifactId>
        <version>1.12.2</version>
    </dependency>
    <dependency>
        <groupId>org.apache.hive</groupId>
        <artifactId>hive-exec</artifactId>
        <version>3.1.0</version>
    </dependency>
    <dependency>
        <groupId>org.apache.flink</groupId>
        <artifactId>flink-csv</artifactId>
        <version>1.12.2</version>
    </dependency>
    <dependency>
        <groupId>org.apache.flink</groupId>
        <artifactId>flink-cep-scala_2.12</artifactId>
        <version>1.12.2</version>
    </dependency>
    <dependency>
        <groupId>org.apache.flink</groupId>
        <artifactId>flink-table-common</artifactId>
        <version>1.12.2</version>
    </dependency>
    <!-- java.lang.NoClassDefFoundError: org/apache/hadoop/mapred/JobConf -->
    <dependency>
        <groupId>org.apache.hadoop</groupId>
        <artifactId>hadoop-mapreduce-client-core</artifactId>
        <version>2.7.2</version>
    </dependency>
    <dependency>
        <groupId>org.apache.hadoop</groupId>
        <artifactId>hadoop-common</artifactId>
        <version>2.7.2</version>
    </dependency>
    <dependency>
        <groupId>org.apache.hadoop</groupId>
        <artifactId>hadoop-mapreduce-client-common</artifactId>
        <version>2.7.2</version>
    </dependency>
    <dependency>
        <groupId>org.apache.hadoop</groupId>
        <artifactId>hadoop-mapreduce-client-jobclient</artifactId>
        <version>2.7.2</version>
    </dependency>
</dependencies>
Code:
package StreamBatch_fh;

import org.apache.flink.api.java.utils.ParameterTool;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.table.api.EnvironmentSettings;
import org.apache.flink.table.api.SqlDialect;
import org.apache.flink.table.api.TableResult;
import org.apache.flink.table.api.Table;
import org.apache.flink.table.api.bridge.java.StreamTableEnvironment;
import org.apache.flink.table.catalog.hive.HiveCatalog;
import org.apache.flink.types.Row;

/**
 * FLINK 1.12.2
 * HIVE 3.1.2
 * KAFKA 2.4
 * Target: join the Kafka stream with Hive and sink the result to another topic.
 */
public class FlinkHiveMain {
    public static void main(String[] args) {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        ParameterTool para = ParameterTool.fromArgs(args);
        String host = para.get("host");
        Integer port = para.getInt("port");
        String topic = para.get("topic");
        String hivedir = para.get("hivedir");

        // Create the EnvironmentSettings and the StreamTableEnvironment.
        EnvironmentSettings envSet = EnvironmentSettings.newInstance().useBlinkPlanner().inStreamingMode().build();
        StreamTableEnvironment streamTableEnvironment = StreamTableEnvironment.create(env, envSet);
        // Allow dynamic table options (/*+ OPTIONS(...) */ hints) in SQL.
        streamTableEnvironment.getConfig().getConfiguration().setString("table.dynamic-table-options.enabled", "true");

        /*
         * Create the Kafka source table.
         * Note: since 1.11, tables are no longer created with tableEnv.connect(...).
         * Note: once created, both the Kafka table and the Hive table show up as new tables on the Hive side.
         */
        String name = "myhive";
        String defaultDatabase = "flink01";
        // String hiveConfDir = "D:\\ocspx_20210616\\flink0810\\flink12hive\\src\\main\\resources";
        String hiveConfDir = hivedir;
        HiveCatalog hive = new HiveCatalog(name, defaultDatabase, hiveConfDir);
        // Register the catalog.
        streamTableEnvironment.registerCatalog("myhive", hive);
        // Use the catalog.
        streamTableEnvironment.useCatalog("myhive");

        String dropoldkafkatable = "DROP table if exists UserScores";
        streamTableEnvironment.executeSql(dropoldkafkatable);
        String createKafkaTable =
                "CREATE TABLE UserScores (name1 STRING, scoure DOUBLE, zoneCode STRING, proctime as PROCTIME())\n" +
                "WITH (\n" +
                "  'connector' = 'kafka',\n" +
                "  'topic' = 'test_in1',\n" +
                "  'properties.bootstrap.servers' = '10.1.236.92:6667',\n" +
                "  'properties.group.id' = 'testGroup',\n" +
                "  'format' = 'csv',\n" +
                //"  'scan.startup.timestamp-millis' = '1605147648000',\n" +
                //"  'csv.field-delimiter' = '\t',\n" +
                "  'scan.startup.mode' = 'latest-offset'\n" +
                ")";
        // Execute the DDL above to create the Kafka table.
        TableResult tableResult = streamTableEnvironment.executeSql(createKafkaTable);

        Table table1 = streamTableEnvironment.sqlQuery("select * from UserScores");
        // Convert the table to a stream and print it.
        DataStream<Row> gg = streamTableEnvironment.toAppendStream(table1, Row.class);
        gg.print("kafka source data");
        /*
         * Bring in Hive.
         * hive-conf-dir: xxx  # the directory containing hive-site.xml
         */
        // Switch the SQL dialect to Hive.
        streamTableEnvironment.getConfig().setSqlDialect(SqlDialect.HIVE);
        // ?? table.exec.hive.fallback-mapred-reader=true

        /*
         * First way to get a Hive dimension table: reuse an existing Hive table as the table
         * that gets joined with Kafka. (20211122)
         */
        // Table table2 = streamTableEnvironment.sqlQuery(
        //         "select * from dis_users_1118a /*+ OPTIONS('streaming-source.enable'='false','streaming-source.partition.include' = 'all','streaming-source.monitor-interval'='2 min','lookup.join.cache.ttl'='2 min') */");
        // String hive1 = "select * from dis_users_1118a /*+ OPTIONS('streaming-source.enable'='false','streaming-source.partition.include' = 'all','streaming-source.monitor-interval'='2 min','lookup.join.cache.ttl'='2 min') */";
        // In the where condition below, it seems the two tables must not share any column names.
        // Table tableJoin = table1.join(table2).where("name1 = name");
        // streamTableEnvironment.toAppendStream(table2, Row.class).print("current hive data");
        // streamTableEnvironment.toAppendStream(tableJoin, Row.class).print("joined");
        // DataStream<Row> gg2 = streamTableEnvironment.toAppendStream(table2, Row.class);
        // gg2.print("users table created in hive");

        /*
         * Second way: CREATE a Hive table that does not exist yet, then join it with Kafka.
         */
        String dropoldHivetable = "DROP table if exists dimension_table";
        streamTableEnvironment.executeSql(dropoldHivetable);

        /*
         * A time column such as "update_time TIMESTAMP(9)" was dropped here; keep it or not depending on
         * the scenario, it does not affect the dimension-table join either way.
         * 'lookup.join.cache.ttl' is the important knob: every time it expires, the whole Hive table is
         * re-cached into the slots, so with a large table this value needs care, or switch to the
         * "latest partition" refresh mode instead.
         * The dimension_table below is not an exact copy of the official example (no real page-view counts
         * etc.); it only exists to test the join, so the values are made up. -_-
         */
        String hive_create =
                "CREATE TABLE dimension_table (\n" +
                "  product_id STRING,\n" +
                "  user_name STRING,\n" +
                "  unit_price DECIMAL(10, 4),\n" +
                "  pv_count BIGINT,\n" +
                "  like_count BIGINT,\n" +
                "  comment_count BIGINT,\n" +
                "  update_user STRING\n" +
                ") TBLPROPERTIES (\n" +
                "  'streaming-source.enable' = 'false',\n" +
                "  'streaming-source.partition.include' = 'all',\n" +
                "  'lookup.join.cache.ttl' = '1 min'\n" +
                ")";
        streamTableEnvironment.executeSql(hive_create);

        /*
         * Sample insert; it can be skipped. The assumption is that the Hive table is dropped and recreated
         * on every run and then seeded with one initial row.
         */
        String insert = "INSERT INTO dimension_table values('Bill','Bill',9.22,20211122,1122,2021,'hh')";
        streamTableEnvironment.executeSql(insert);
        /*
         * A regular JOIN cannot pick up data that was updated in Hive and join it with the Kafka records.
         */
        // String join = "select * from UserScores join dimension_table ON UserScores.name1 = dimension_table.product_name";
        // streamTableEnvironment.sqlQuery(join).printSchema();

        /*
         * Temporal JOIN.
         */
        String join2 = "select * from UserScores join dimension_table FOR SYSTEM_TIME AS OF UserScores.proctime ON UserScores.name1 = dimension_table.user_name";
        Table t = streamTableEnvironment.sqlQuery(join2);
        streamTableEnvironment.toAppendStream(t, Row.class).print("join result");

        try {
            env.execute("hive test01");
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}
/**
 * Note: both the Kafka stream table and the Hive dimension table end up as tables under the corresponding Hive catalog.
 */
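That note is easy to verify. A minimal sketch, not part of the original job, that lists what ended up in the catalog; the class name is made up, and the catalog name, database, and hive-conf directory simply reuse the values from the code above:

import org.apache.flink.table.api.EnvironmentSettings;
import org.apache.flink.table.api.TableEnvironment;
import org.apache.flink.table.catalog.hive.HiveCatalog;

public class ListCatalogTables {
    public static void main(String[] args) {
        EnvironmentSettings settings =
                EnvironmentSettings.newInstance().useBlinkPlanner().inStreamingMode().build();
        TableEnvironment tEnv = TableEnvironment.create(settings);
        // Same catalog name, database, and hive-site.xml directory as FlinkHiveMain.
        tEnv.registerCatalog("myhive",
                new HiveCatalog("myhive", "flink01", "/data/hh/app_jar/hive_join/resources"));
        tEnv.useCatalog("myhive");
        // Should list UserScores and dimension_table once the job has created them.
        tEnv.executeSql("SHOW TABLES").print();
    }
}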
Run results:
After packaging, submit the job to YARN (the resources directory shipped with -yt is the one containing hive-site.xml, the same path that is passed as --hivedir):
/usr/hdp/3.1.0.0-78/flink/bin/flink run -c StreamBatch_fh.FlinkHiveMain -m yarn-cluster -yt /data/hh/app_jar/hive_join/resources /data/hh/app_jar/hive_join/flink12hive-1.0-SNAPSHOT.jar --port 6667 --host 10.***.92 --topic test_in1 --hivedir /data/hh/app_jar/hive_join/resources
After the job is submitted, insert one more row into the Hive dimension table; together with the row the code inserts by default, there are now two rows.
Then send a Kafka record whose name matches a row that exists in Hive:
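For reference, that test record can also be pushed with a few lines of Java (a hedged sketch: the class name and the record values are placeholders; the broker is the one from the Kafka DDL, and the CSV layout follows the UserScores schema name1, scoure, zoneCode):

import java.util.Properties;
import org.apache.kafka.clients.producer.KafkaProducer;
import org.apache.kafka.clients.producer.ProducerRecord;

public class SendTestRecord {
    public static void main(String[] args) {
        Properties props = new Properties();
        props.put("bootstrap.servers", "10.1.236.92:6667"); // same broker as in the Kafka DDL
        props.put("key.serializer", "org.apache.kafka.common.serialization.StringSerializer");
        props.put("value.serializer", "org.apache.kafka.common.serialization.StringSerializer");
        try (KafkaProducer<String, String> producer = new KafkaProducer<>(props)) {
            // name1=Bill, scoure=99.5, zoneCode=0755; "Bill" matches the user_name seeded into dimension_table.
            producer.send(new ProducerRecord<>("test_in1", "Bill,99.5,0755"));
            producer.flush();
        }
    }
}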
The join result is printed with the matching dimension columns attached.
Because a Hive update only becomes visible after the cache TTL expires, the newly joined dimension data does not appear immediately.
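If re-caching the whole table on every TTL expiry is too heavy, the "latest partition" refresh mode mentioned in the code comments is the alternative. A hedged sketch of what that join could look like inside the same job, assuming dis_users_1118a is a partitioned Hive table with a name column; the option values are illustrative, table.dynamic-table-options.enabled must be true (it is set above), and the SQL dialect may need adjusting:

        // Hedged sketch: each monitor interval, only the newest partition of dis_users_1118a
        // is loaded and used for the lookup, instead of caching the whole table.
        String joinLatestPartition =
                "SELECT * FROM UserScores AS u JOIN dis_users_1118a " +
                "/*+ OPTIONS('streaming-source.enable'='true'," +
                "'streaming-source.partition.include'='latest'," +
                "'streaming-source.monitor-interval'='10 min') */ " +
                "FOR SYSTEM_TIME AS OF u.proctime AS d " +
                "ON u.name1 = d.name";
        streamTableEnvironment.toAppendStream(
                streamTableEnvironment.sqlQuery(joinLatestPartition), Row.class).print("latest-partition join");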