1. Environment Dependencies

  1. Message queue: Kafka (standalone or cluster; ZooKeeper must be installed)
  2. Redis
  3. Hadoop 3.2-3, Flink 1.12 (cluster or standalone environment)
  4. Python 3.8, PyFlink 1.12
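
Before building anything, it can help to confirm that Kafka and Redis are reachable from the machine that will run the job. A minimal sketch, assuming the kafka-python and redis client packages used later in this article, with placeholder addresses:

from kafka import KafkaProducer
from redis import Redis

# Placeholder addresses -- replace with your own
producer = KafkaProducer(bootstrap_servers="your_kafka_ip:9092")
print("Kafka reachable:", producer.bootstrap_connected())
print("Redis reachable:", Redis(host="your_redis_ip", port=6379, password="your_password", db=0).ping())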

2. Project Overview

(Figure: PyFlink job submission for deep learning – overall project architecture)


In one sentence: the data to be predicted is sent to a Kafka topic, the offline-trained neural network model is loaded from Redis, and after feature engineering and prediction the results are written back to Kafka for downstream applications to consume.
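
For orientation, a minimal sketch of the record shapes flowing through the pipeline (the field names match the Kafka table definitions in section 3; the actual content of dataflow depends on your data):

# Record read from the source topic (JSON): a single string field holding the raw dataflow
input_record = {"dataflow": "<json-encoded SOME/IP dataflow>"}
# Record written back to the sink topic (JSON): the predicted class label (0 or 1)
output_record = {"label": 1}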

3. Project Setup and Deployment

(1) Building the PyFlink runtime environment

  1. Download the Miniconda3 Linux 64-bit .sh installer from the official conda site (renamed to miniconda.sh below for convenience); choose the Miniconda build that matches the Python version you need
  2. Make the installer executable
chmod +x miniconda.sh
  3. Create the Python environment directory
# venv is the name of the directory that will be created
./miniconda.sh -b -p venv
  4. Activate the environment
source venv/bin/activate ""
  5. Install the required packages with pip install
# the site-packages path must match the Python version of your Miniconda build (python3.7 here)
pip install apache-flink==1.12.0 --target venv/lib/python3.7/site-packages
  6. Once the packages are installed you can develop in this environment; when development is finished, run the command below to exit it
conda deactivate
  7. After exiting the environment, delete the package cache conda used during installation
rm -rf venv/pkgs
  8. Package the runtime environment with zip
zip -r venv.zip venv

This venv.zip can be shipped to the Flink cluster to provide the runtime environment for the PyFlink job described below.
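
As an alternative to passing the archive on the flink run command line (step 5 of the next subsection), the packaged environment can also be attached from within the job code itself, right after the table environment is created. A minimal sketch, assuming the dependency-management calls of the PyFlink 1.12 Table API:

# Inside model_predict(), after t_env has been created:
# attach the packaged conda environment and point the Python workers at its interpreter
t_env.add_python_archive("venv.zip")
t_env.get_config().set_python_executable("venv.zip/venv/bin/python3")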

(2) Project deployment

  1. Near-line inference PyFlink code that consumes data from Kafka
import os

from pyflink.datastream import StreamExecutionEnvironment, TimeCharacteristic
from pyflink.table import DataTypes, EnvironmentSettings, StreamTableEnvironment
from pyflink.table.udf import udf

################################ Kafka configuration ################################
def model_predict():
    kafka_servers = "your_kafka_ip:9092"
    kafka_zookeeper_servers = "your_zookeeper_ip:2181"
    source_topic = "your_source_topic"
    sink_topic = "your_sink_topic"
    kafka_consumer_group_id = "test_1"

    env = StreamExecutionEnvironment.get_execution_environment()
    env.set_stream_time_characteristic(TimeCharacteristic.EventTime)
    env_settings = EnvironmentSettings.Builder().use_blink_planner().build()
    t_env = StreamTableEnvironment.create(stream_execution_environment=env, environment_settings=env_settings)
    t_env.get_config().get_configuration().set_boolean("python.fn-execution.memory.managed", True)

    # Load the Kafka connector jars
    jars = []
    lib_jar_path = "/home/pyflink/venv/javalib/"
    for file in os.listdir(lib_jar_path):
        if file.endswith(".jar"):
            jars.append(lib_jar_path + file)
    str_jars = ';'.join(['file://' + jar for jar in jars])
    print(str_jars)
    t_env.get_config().get_configuration().set_string("pipeline.jars", str_jars)

    # Register the UDF; IDSModel() (defined in step 4 below) implements the near-line inference logic
    model = udf(IDSModel(), input_types=DataTypes.STRING(), result_type=DataTypes.INT())

    source_ddl=f"""
     CREATE TABLE some_ip_data(
               dataflow VARCHAR --SOME/IP JSON dataflow
              ) WITH (
              'connector.type' = 'kafka',
              'connector.version' = 'universal',
              'connector.topic' = '{source_topic}',
              'connector.properties.bootstrap.servers' = '{kafka_servers}',
              'connector.properties.zookeeper.connect' = '{kafka_zookeeper_servers}',
              'connector.properties.group.id' = '{kafka_consumer_group_id}',
              'connector.startup-mode' = 'latest-offset',
              'format.type' = 'json'
            )
    """

    sink_ddl=f"""
    CREATE TABLE es_sink(
               label INT --0/1 class label
              ) WITH (
              'connector.type' = 'kafka',
              'connector.version' = 'universal',
              'connector.topic' = '{sink_topic}',
              'connector.properties.bootstrap.servers' = '{kafka_servers}',
              'connector.properties.zookeeper.connect' = '{kafka_zookeeper_servers}',
              'connector.properties.group.id' = '{kafka_consumer_group_id}',
              'connector.startup-mode' = 'latest-offset',
              'format.type' = 'json'
            )
    """
    t_env.execute_sql(source_ddl)
    t_env.execute_sql(sink_ddl)
    t_env.create_temporary_system_function('IDS_predict',model)

    query="""
    select IDS_predict(dataflow)
    from some_ip_data
    """
    t_env.sql_query(query).execute_insert("es_sink")
    t_env.execute("realtime_ids")
if __name__ == '__main__':
    model_predict()

Required jars: download the Kafka connector jar matching your Flink version (see https://nightlies.apache.org/flink/flink-docs-release-1.12/dev/table/connectors/kafka.html) and place it in the directory referenced by lib_jar_path above.

  2. Simulate a producer writing data to Kafka (a consumer sketch for checking the sink topic follows this snippet)
import time
from json import dumps
from kafka import KafkaProducer

def write_data():
    source_topic = "your_kafka_source_topic"
    kafka_servers = "your_kafka_ip:9092"
    producer = KafkaProducer(bootstrap_servers=kafka_servers,
                             value_serializer=lambda x: dumps(x).encode('utf-8'))
    # Send a record every 5 seconds
    while True:
        cur_data = {"dataflow": "this is a string " + time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())}
        producer.send(source_topic, cur_data)
        time.sleep(5)

if __name__ == '__main__':
    write_data()
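
To verify end to end that predictions are arriving on the sink topic, a small consumer sketch can be used (again with kafka-python; topic and broker addresses are placeholders):

from json import loads
from kafka import KafkaConsumer

def read_predictions():
    consumer = KafkaConsumer("your_sink_topic",
                             bootstrap_servers="your_kafka_ip:9092",
                             auto_offset_reset="latest",
                             value_deserializer=lambda x: loads(x.decode("utf-8")))
    for message in consumer:
        print(message.value)  # e.g. {"label": 0}

if __name__ == '__main__':
    read_predictions()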
  3. Write the offline-trained model to Redis
import pickle as cPickle
from redis import Redis
from tensorflow.keras.models import load_model
from tensorflow.keras.models import model_from_json

model = load_model("xxx.h5")
model_json = model.to_json()

weights_list = model.get_weights()
## Serialize the model weights
weights = cPickle.dumps(weights_list)
## Store the model structure and weights in Redis
redis_conn = Redis(host='your_redis_ip', port=6379, password='your_password', db=0)
redis_conn.set("model", model_json)
redis_conn.set("weights", weights)
################################ Read the model back from Redis #################################
model_from_redis = model_from_json(redis_conn.get("model"))
weight_from_redis = cPickle.loads(redis_conn.get("weights"))
model_from_redis.set_weights(weight_from_redis)
model_from_redis.summary()
  4. The near-line inference UDF
import logging

import numpy as np
from pyflink.table.udf import ScalarFunction

class IDSModel(ScalarFunction):
    def __init__(self):
        logging.info("IDSModel __init__")
        self.model = None
        self.redis_params = dict(host='your_redis_ip', password='your_redis_password', port=6379, db=0)
        self.metric_predict_acc = 0  # prediction accuracy (evaluated over the last 10 samples)
        self.metric_distribution_y = None  # distribution of the label y
        self.metric_total_10_sec = None  # number of samples seen in the last 10 seconds
        self.metric_right_10_sec = None  # number of correct predictions in the last 10 seconds
    def open(self, function_context):
        """
        Access the metric system and register metrics, so the job can be
        monitored in real time from the web UI (localhost:8081).
        :param function_context:
        :return:
        """
        logging.info("IDSModel open")
        if self.model:
            logging.info("Model already loaded.")
            self.model.summary()
        else:
            logging.info("Model is empty, loading it...")
            self.model = self.load_model()
        # Access the metric system; the metric group is named online_ids so it is easy to find in the web UI
        # Metric Group + Metric Name uniquely identify a metric
        metric_group = function_context.get_metric_group().add_group("online_ids")
        self.metric_counter = metric_group.counter('sample_count')  # number of samples processed
        metric_group.gauge("prediction_acc", lambda: int(self.metric_predict_acc * 100))
        self.metric_distribution_y = metric_group.distribution("metric_distribution_y")

        # Meters report average throughput; mark_event(n: int) updates the event count.
        # Track the number of samples and correct predictions over the last 10 seconds
        self.metric_total_10_sec = metric_group.meter("total_10_sec", time_span_in_seconds=10)
        self.metric_right_10_sec = metric_group.meter("right_10_sec", time_span_in_seconds=10)
    def eval(self, data):
        """
        Run the model prediction.
        :param data: the raw dataflow string from Kafka
        :return: the predicted class label
        """
        import logging
        logging.info("IDSModel eval")
        ##### Check that the model is loaded #####
        try:
            logging.info(type(self.model))
            if self.model:
                logging.info("Model loaded successfully")
        except Exception as e:
            logging.info("Error while loading the model")
            logging.info(e)
        ##### Prediction #####
        # NOTE: the feature-engineering step that turns `data` into `processed_data`
        # is omitted in the original article; plug in your own preprocessing here.
        y_pred = self.model.predict(processed_data)
        y_label = np.argmax(y_pred)
        return int(y_label)
    def load_model(self):
        """
        Load the model. If a model exists in Redis it is loaded from there,
        otherwise None is returned.
        :return:
        """
        import redis
        import pickle
        import logging
        from tensorflow.keras.models import model_from_json
        logging.info('IDSModel load_model')
        redis_conn = redis.StrictRedis(**self.redis_params)
        logging.info(redis_conn.get("test"))
        model = None
        try:
            # Fetch the model structure and weights from Redis,
            # then restore the weights with pickle.loads
            model = model_from_json(redis_conn.get("model"))
            logging.info('Loading the weights...')
            model.set_weights(pickle.loads(redis_conn.get("weights")))
        except TypeError:
            logging.error('No model with the given name in Redis; train and save a model to Redis first')
        return model
  5. Submit the job to the cluster
bin/flink run --target yarn-per-job \
-pyarch venv.zip \
-pyexec venv.zip/venv/bin/python3 \
-pyclientexec venv.zip/venv/bin/python3 \
-py model_predict.py