目录

  • 🍑一、背景
  • 🍑二、具体配置及代码
  • 🍊2.1、源码
  • 🍊2.2、hive 配置
  • 🍓2.2.1、hive-env.sh
  • 🍓2.2.2、hive-site.xml
  • 🍑三、测试环境、验证
  • 🍊3.1、通过beeline创建库及表
  • 🍊3.2、查看数据库
  • 参考


🍑一、背景


公司开发了一套元数据管理系统,市面上是有开源元数据管理项目的,比如前端时间研究到的apache atlas,感觉还是比较复杂的,如果想复用其某一块代码,感觉会比较耗时,不如自己写呢,本文就是通过hive hook 监控元数据,将消息发送到Kafka,Kafka将消息发送到mysql数据库(待商议,是主动调用Kafka API还是Kafka主动推)


🍑二、具体配置及代码


🍊2.1、源码


已上传Git,点击下载编译后的jar包为hive-collec-demo-1.0.0.dev.jar

  • HiveHook
package com.renxiaozhao.collect.hive;

import java.util.HashSet;
import java.util.Set;

import org.apache.hadoop.hive.metastore.api.Database;
import org.apache.hadoop.hive.ql.QueryPlan;
import org.apache.hadoop.hive.ql.hooks.Entity;
import org.apache.hadoop.hive.ql.hooks.ExecuteWithHookContext;
import org.apache.hadoop.hive.ql.hooks.HookContext;
import org.apache.hadoop.hive.ql.hooks.HookContext.HookType;
import org.apache.hadoop.hive.ql.hooks.ReadEntity;
import org.apache.hadoop.hive.ql.hooks.WriteEntity;
import org.apache.hadoop.hive.ql.plan.HiveOperation;
import org.codehaus.jackson.map.ObjectMapper;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.renxiaozhao.collect.util.HiveKafkaUtil;


public class HiveHook implements ExecuteWithHookContext {

    private static final Logger LOGGER = LoggerFactory.getLogger(HiveHook.class);

    // 存储Hive的SQL操作类型

    private static final HashSet<String> OPERATION_NAMES = new HashSet<>();

    // HiveOperation是一个枚举类,封装了Hive的SQL操作类型

    // 监控SQL操作类型

    static {

        // 建表

        OPERATION_NAMES.add(HiveOperation.CREATETABLE.getOperationName());

        // 修改数据库属性

        OPERATION_NAMES.add(HiveOperation.ALTERDATABASE.getOperationName());

        // 修改数据库属主

        OPERATION_NAMES.add(HiveOperation.ALTERDATABASE_OWNER.getOperationName());

        // 修改表属性,添加列

        OPERATION_NAMES.add(HiveOperation.ALTERTABLE_ADDCOLS.getOperationName());

        // 修改表属性,表存储路径

        OPERATION_NAMES.add(HiveOperation.ALTERTABLE_LOCATION.getOperationName());

        // 修改表属性

        OPERATION_NAMES.add(HiveOperation.ALTERTABLE_PROPERTIES.getOperationName());

        // 表重命名

        OPERATION_NAMES.add(HiveOperation.ALTERTABLE_RENAME.getOperationName());

        // 列重命名

        OPERATION_NAMES.add(HiveOperation.ALTERTABLE_RENAMECOL.getOperationName());

        // 更新列,先删除当前的列,然后加入新的列

        OPERATION_NAMES.add(HiveOperation.ALTERTABLE_REPLACECOLS.getOperationName());

        // 创建数据库

        OPERATION_NAMES.add(HiveOperation.CREATEDATABASE.getOperationName());

        // 删除数据库

        OPERATION_NAMES.add(HiveOperation.DROPDATABASE.getOperationName());

        // 删除表

        OPERATION_NAMES.add(HiveOperation.DROPTABLE.getOperationName());

    }

    @Override

    public void run(HookContext hookContext) throws Exception {

        assert (hookContext.getHookType() == HookType.POST_EXEC_HOOK);

        // 执行计划

        QueryPlan plan = hookContext.getQueryPlan();

        // 操作名称

        String operationName = plan.getOperationName();

        LOGGER.info("执行的SQL语句: " + plan.getQueryString());

        LOGGER.info("操作名称: " + operationName);

        if (OPERATION_NAMES.contains(operationName) && !plan.isExplain()) {

            LOGGER.info("监控SQL操作");

            Set<ReadEntity> inputs = hookContext.getInputs();

            Set<WriteEntity> outputs = hookContext.getOutputs();

            for (Entity entity : inputs) {

                LOGGER.info("Hook metadata输入值: " + toJson(entity));

            }
            for (Entity entity : outputs) {
                LOGGER.info("发送Hook metadata到Kafka,输出值: " + toJson(entity));
                HiveKafkaUtil.kafkaProducer("HIVE_HOOK", toJson(entity));

            }

        } else {
            LOGGER.info("不在监控范围,忽略该hook!");

        }

    }

    private static String toJson(Entity entity) throws Exception {

        ObjectMapper mapper = new ObjectMapper();

        //  entity的类型

        // 主要包括:

        // DATABASE, TABLE, PARTITION, DUMMYPARTITION, DFS_DIR, LOCAL_DIR, FUNCTION

        switch (entity.getType()) {

            case DATABASE:

                Database db = entity.getDatabase();

                return mapper.writeValueAsString(db);

            case TABLE:

                return mapper.writeValueAsString(entity.getTable().getTTable());

        }

        return null;

    }

}
  • HiveKafkaUtil
package com.renxiaozhao.collect.util;

import java.sql.SQLException;
import java.time.Duration;
import java.util.Arrays;
import java.util.Properties;

import org.apache.kafka.clients.consumer.ConsumerRecord;
import org.apache.kafka.clients.consumer.ConsumerRecords;
import org.apache.kafka.clients.consumer.KafkaConsumer;
import org.apache.kafka.clients.producer.Callback;
import org.apache.kafka.clients.producer.KafkaProducer;
import org.apache.kafka.clients.producer.ProducerConfig;
import org.apache.kafka.clients.producer.ProducerRecord;
import org.apache.kafka.clients.producer.RecordMetadata;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

public class HiveKafkaUtil {
    private static final Logger LOGGER = LoggerFactory.getLogger(HiveKafkaUtil.class);
    
	public static void main(String[] args) {
	    kafkaProducer("HIVE_HOOK", "hello2");
	}
	
	public static void kafkaProducer(String topicName,String value) {
	    Properties properties = new Properties();

        properties.put(ProducerConfig.BOOTSTRAP_SERVERS_CONFIG, "192.168.38.10:9092");
        properties.put(ProducerConfig.KEY_SERIALIZER_CLASS_CONFIG, "org.apache.kafka.common.serialization.StringSerializer");
        properties.put(ProducerConfig.VALUE_SERIALIZER_CLASS_CONFIG, "org.apache.kafka.common.serialization.StringSerializer");
        properties.put(ProducerConfig.ACKS_CONFIG, "all");
        properties.put(ProducerConfig.BUFFER_MEMORY_CONFIG, 33554432);
        properties.put(ProducerConfig.RETRIES_CONFIG, 0);
        properties.put(ProducerConfig.RETRY_BACKOFF_MS_CONFIG, 300);
        properties.put(ProducerConfig.BATCH_SIZE_CONFIG, 16384);

        // KafkaProducer 是线程安全的,可以多个线程使用用一个 KafkaProducer
        KafkaProducer<String, String> kafkaProducer = new KafkaProducer<>(properties);
        ProducerRecord<String, String> record = new ProducerRecord<>(topicName, value);
        kafkaProducer.send(record, new Callback() {
            @Override
            public void onCompletion(RecordMetadata metadata, Exception e) {
                if (e != null) {
                    LOGGER.error(String.format("发送数据到kafka发生异常: %s",e));
                    return;
                }
                LOGGER.info("发送数据到kafka成功, topic: " + metadata.topic() + " offset: " + metadata.offset() + " partition: "
                        + metadata.partition());
                LOGGER.info("从kafka获取数据topic数据: " + metadata.topic());
                kafkaConsumer(metadata.topic());
            }
        });
        kafkaProducer.close();
	}
	
	public static void kafkaConsumer(String topicName) {
        Properties props = new Properties();
        // 必须指定
        props.put("bootstrap.servers", "192.168.38.10:9092");
        // 必须指定
        props.put("group.id", "atlas");
        // 必须指定
        props.put("key.deserializer", "org.apache.kafka.common.serialization.StringDeserializer");
        // 必须指定
        props.put("value.deserializer", "org.apache.kafka.common.serialization.StringDeserializer");
        // 从最早的消息开始读取
        props.put("auto.offset.reset", "earliest");
        props.put("enable.auto.commit", "true");
        KafkaConsumer<String, String> consumer = new KafkaConsumer<>(props);
        consumer.subscribe(Arrays.asList(topicName));
        try {
            ConsumerRecords<String, String> records = consumer.poll(Duration.ofSeconds(2));
            for (ConsumerRecord<String, String> record : records) {
                LOGGER.info("KafkaConsumer获取"+topicName+"信息value=" +record.value());
                if(!DBMySqlUtil.testColletHiveMeta(record.value())) {
                    LOGGER.info("KafkaConsumer获取"+topicName+"信息,插入mysql库失败");
                }
            }
        } catch (ClassNotFoundException e) {
            LOGGER.error(String.format("kafkaConsumer调用DbUtil报错: %s", e));
        } catch (SQLException e) {
            LOGGER.error(String.format("kafkaConsumer调用DbUtil报错: %s", e));
        } finally {
            consumer.close();
        }
    }

}
  • DBMySqlUtil
package com.renxiaozhao.collect.util;

import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.sql.SQLException;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**.
 * JDBC连接工具类
 * @author admin
 *
 */
public class DBMySqlUtil {
    private static final Logger LOGGER = LoggerFactory.getLogger(DBMySqlUtil.class);
    
//    public static Connection getConn(String url, String user, String password) throws ClassNotFoundException {
    public static Connection getConn() throws ClassNotFoundException {
        Connection conn = null;
        try {
            Class.forName("com.mysql.cj.jdbc.Driver");
            conn = DriverManager.getConnection("jdbc:mysql://192.168.38.10:3306/dolphinscheduler?characterEncoding=utf8&useSSL=false&serverTimezone=Asia/Shanghai&rewriteBatchedStatements=true",
                    "ds_user","dolphinscheduler");// 连接数据库
        } catch (SQLException e) {
            LOGGER.error("DBMySqlConfig.getConn()异常---->", e);
        }
        return conn;
    }

    public static boolean testColletHiveMeta(String value) throws SQLException, ClassNotFoundException {
        Connection con = getConn();
        String sql = "insert into test_hive(meta_value) values('"+value+"')";
        PreparedStatement pst = con.prepareStatement(sql);
        return pst.execute(sql);
    }
    public void closeConn(Connection conn) {
        try {
            if (conn != null) {
                conn.close();
                conn = null;
            }
        } catch (SQLException e) {
            LOGGER.error("DBMySqlConfig.closeConn()异常---->", e);
        }
    }

    public PreparedStatement getPStmt(Connection conn, String sql) {
        PreparedStatement pstmt = null;
        try {
            pstmt = conn.prepareStatement(sql);
        } catch (SQLException e) {
            LOGGER.error("DBMySqlConfig.getPStmt()异常---->", e);
        }
        return pstmt;
    }

    public void closePStmt(PreparedStatement stmt) {
        try {
            if (stmt != null) {
                stmt.close();
                stmt = null;
            }
        } catch (SQLException e) {
            LOGGER.error("DBMySqlConfig.closePStmt()异常---->", e);
        }

    }

    public void closeRs(ResultSet rs) {
        try {
            if (rs != null) {
                rs.close();
                rs = null;
            }
        } catch (SQLException e) {
            LOGGER.error("DBMySqlConfig.closeRs()异常---->", e);
        }
    }

    public ResultSet executeQuery(Connection conn, String sql) {
        ResultSet rs = null;
        try {
            rs = conn.createStatement().executeQuery(sql);
        } catch (SQLException e) {
            LOGGER.error("DBMySqlConfig.executeQuery()异常---->", e);
        }
        return rs;
    }
}

🍊2.2、hive 配置


🍓2.2.1、hive-env.sh


增加HIVE_AUX_JARS_PATH变量,引入2.1编译后的jar包:

export HIVE_AUX_JARS_PATH=/root/collect
  • 目前只需要下面两个jar包

🍓2.2.2、hive-site.xml


增加hook配置:

<property>
    <name>hive.exec.post.hooks</name>
    <value>com.renxiaozhao.collect.hive.HiveHook</value>
</property>

kafka实时表写到hive离线表 kafka导入hive_大数据

🍑三、测试环境、验证


确保hadoop、Kafka、hive已成功部署

🍊3.1、通过beeline创建库及表


[root@host1 bin]# ./beeline -u jdbc:hive2://192.168.38.10:10000 -n root
Connecting to jdbc:hive2://192.168.38.10:10000
Connected to: Apache Hive (version 3.1.3)
Driver: Hive JDBC (version 3.1.3)
Transaction isolation: TRANSACTION_REPEATABLE_READ
Beeline version 3.1.3 by Apache Hive
0: jdbc:hive2://192.168.38.10:10000> CREATE database hive_test_db;
No rows affected (0.668 seconds)
0: jdbc:hive2://192.168.38.10:10000> CREATE  TABLE  hive_meta_test(id int,name string);
No rows affected (3.497 seconds)
0: jdbc:hive2://192.168.38.10:10000> CREATE  TABLE  hive_meta_testB as select * from hive_meta_test;
No rows affected (25.366 seconds)
0: jdbc:hive2://192.168.38.10:10000> 
0: jdbc:hive2://192.168.38.10:10000>

kafka实时表写到hive离线表 kafka导入hive_大数据_02

🍊3.2、查看数据库


kafka实时表写到hive离线表 kafka导入hive_大数据_03


三条语句四条记录?看最后两条的创建时间是一样的

详细数据:

{
	"name": "hive_test_db",
	"description": null,
	"locationUri": "hdfs://192.168.38.10:8020/user/hive/warehouse/hive_test_db.db",
	"parameters": {},
	"privileges": null,
	"ownerName": "root",
	"ownerType": "USER",
	"catalogName": "hive",
	"setCatalogName": true,
	"setParameters": true,
	"setPrivileges": false,
	"setOwnerType": true,
	"parametersSize": 0,
	"setName": true,
	"setDescription": false,
	"setLocationUri": true,
	"setOwnerName": true
}
{
	"name": "hive_test_db",
	"description": null,
	"locationUri": null,
	"parameters": null,
	"privileges": null,
	"ownerName": null,
	"ownerType": null,
	"catalogName": null,
	"setCatalogName": false,
	"setParameters": false,
	"setPrivileges": false,
	"setOwnerType": false,
	"parametersSize": 0,
	"setName": true,
	"setDescription": false,
	"setLocationUri": false,
	"setOwnerName": false
}
{
	"name": "default",
	"description": "Default Hive database",
	"locationUri": "hdfs://192.168.38.10:8020/user/hive/warehouse",
	"parameters": {},
	"privileges": null,
	"ownerName": "public",
	"ownerType": "ROLE",
	"catalogName": "hive",
	"setCatalogName": true,
	"setParameters": true,
	"setPrivileges": false,
	"setOwnerType": true,
	"parametersSize": 0,
	"setName": true,
	"setDescription": true,
	"setLocationUri": true,
	"setOwnerName": true
}
{
	"tableName": "hive_meta_test",
	"dbName": "default",
	"owner": "root",
	"createTime": 1656849289,
	"lastAccessTime": 0,
	"retention": 0,
	"sd": {
		"cols": [],
		"location": null,
		"inputFormat": "org.apache.hadoop.mapred.SequenceFileInputFormat",
		"outputFormat": "org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat",
		"compressed": false,
		"numBuckets": -1,
		"serdeInfo": {
			"name": null,
			"serializationLib": "org.apache.hadoop.hive.serde2.MetadataTypedColumnsetSerDe",
			"parameters": {
				"serialization.format": "1"
			},
			"description": null,
			"serializerClass": null,
			"deserializerClass": null,
			"serdeType": null,
			"setParameters": true,
			"parametersSize": 1,
			"setName": false,
			"setDescription": false,
			"setSerializationLib": true,
			"setSerializerClass": false,
			"setDeserializerClass": false,
			"setSerdeType": false
		},
		"bucketCols": [],
		"sortCols": [],
		"parameters": {},
		"skewedInfo": {
			"skewedColNames": [],
			"skewedColValues": [],
			"skewedColValueLocationMaps": {},
			"setSkewedColNames": true,
			"setSkewedColValues": true,
			"setSkewedColValueLocationMaps": true,
			"skewedColNamesSize": 0,
			"skewedColNamesIterator": [],
			"skewedColValuesSize": 0,
			"skewedColValuesIterator": [],
			"skewedColValueLocationMapsSize": 0
		},
		"storedAsSubDirectories": false,
		"colsSize": 0,
		"setParameters": true,
		"parametersSize": 0,
		"setLocation": false,
		"setInputFormat": true,
		"setCols": true,
		"colsIterator": [],
		"setSkewedInfo": true,
		"bucketColsSize": 0,
		"bucketColsIterator": [],
		"sortColsSize": 0,
		"sortColsIterator": [],
		"setOutputFormat": true,
		"setSerdeInfo": true,
		"setBucketCols": true,
		"setSortCols": true,
		"setCompressed": false,
		"setNumBuckets": true,
		"setStoredAsSubDirectories": false
	},
	"partitionKeys": [],
	"parameters": {},
	"viewOriginalText": null,
	"viewExpandedText": null,
	"tableType": "MANAGED_TABLE",
	"privileges": null,
	"temporary": false,
	"rewriteEnabled": false,
	"creationMetadata": null,
	"catName": null,
	"ownerType": "USER",
	"partitionKeysSize": 0,
	"setCatName": false,
	"setParameters": true,
	"setTableName": true,
	"setDbName": true,
	"setOwner": true,
	"setViewOriginalText": false,
	"setViewExpandedText": false,
	"setTableType": true,
	"setPrivileges": false,
	"setCreationMetadata": false,
	"setOwnerType": true,
	"setCreateTime": true,
	"setLastAccessTime": false,
	"setRetention": false,
	"partitionKeysIterator": [],
	"parametersSize": 0,
	"setTemporary": false,
	"setRewriteEnabled": false,
	"setPartitionKeys": true,
	"setSd": true
}
  • 整明白了,创建数据库,一条记录,创建表两条记录(数据库一条+表一条)
  • 指定数据库创建表(同样两条数据)

参考

hive hook应用元数据管理集成部署及测试