1. Environment dependencies
- Message queue: Kafka (standalone or cluster; requires ZooKeeper)
- Redis
- Hadoop 3.2-3.3, Flink 1.12 (cluster or standalone)
- Python 3.8, PyFlink 1.12
2. Project overview
In one sentence: the data to be scored is sent to a Kafka topic; the job loads the offline-trained neural network model from Redis, runs feature engineering and prediction, and writes the results back to Kafka for downstream applications to consume.
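For concreteness, given the table definitions used later in this guide, a message on the source topic and the corresponding prediction written to the sink topic look like this (values are illustrative):
Source topic message: {"dataflow": "<raw SOME/IP record as a string>"}
Sink topic message: {"label": 0}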
3. Project setup and deployment
(1) Setting up the PyFlink runtime environment
- Download the Miniconda3 Linux 64-bit .sh installer from the official conda site (renamed to miniconda.sh below for convenience); choose the Miniconda installer that matches the Python version you need
- Make the installer executable
chmod +x miniconda.sh
- Create the Python environment directory
# venv is the name of the directory that will be created
./miniconda.sh -b -p venv
- Activate the environment
source venv/bin/activate ""
- Install the required packages with pip install (adjust python3.7 in the target path below to the Python version bundled with your Miniconda)
pip install apache-flink==1.12.0 --target venv/lib/python3.7/site-packages
- Once the packages are installed, the environment is ready for development. When development is finished, exit the environment with the command below
conda deactivate
- After exiting the environment, delete the directory conda uses to cache downloaded packages
rm -rf venv/pkgs
- Package the environment with zip
zip -r venv.zip venv
This venv.zip can be submitted to the Flink cluster to provide the runtime environment for the PyFlink jobs below.
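Before submitting jobs, it is worth verifying that the interpreter inside the environment can import pyflink (a quick sanity check against the venv directory created above):
venv/bin/python -c "import pyflink; print('ok')"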
(2) Project deployment
- Near-line inference PyFlink code that consumes Kafka data
################################ Kafka configuration ################################
import os

from pyflink.datastream import StreamExecutionEnvironment, TimeCharacteristic
from pyflink.table import StreamTableEnvironment, EnvironmentSettings, DataTypes
from pyflink.table.udf import udf

def model_predict():
    kafka_servers = "your_kafka_ip:9092"
    kafka_zookeeper_servers = "your_zookeeper_ip:2181"
    source_topic = "your_source_topic"
    sink_topic = "your_sink_topic"
    kafka_consumer_group_id = "test_1"

    env = StreamExecutionEnvironment.get_execution_environment()
    env.set_stream_time_characteristic(TimeCharacteristic.EventTime)
    env_settings = EnvironmentSettings.Builder().use_blink_planner().build()
    t_env = StreamTableEnvironment.create(stream_execution_environment=env, environment_settings=env_settings)
    t_env.get_config().get_configuration().set_boolean("python.fn-execution.memory.managed", True)

    # Load the Kafka connector jars
    jars = []
    lib_jar_path = "/home/pyflink/venv/javalib/"
    for file in os.listdir(lib_jar_path):
        if file.endswith(".jar"):
            jars.append(lib_jar_path + file)
    str_jars = ';'.join(['file://' + jar for jar in jars])
    print(str_jars)
    t_env.get_config().get_configuration().set_string("pipeline.jars", str_jars)

    # Register the UDF; IDSModel (defined below) implements the machine-learning
    # near-line inference logic
    model = udf(IDSModel(), input_types=DataTypes.STRING(), result_type=DataTypes.INT())

    source_ddl = f"""
    CREATE TABLE some_ip_data(
        dataflow VARCHAR -- SOME/IP JSON dataflow
    ) WITH (
        'connector.type' = 'kafka',
        'connector.version' = 'universal',
        'connector.topic' = '{source_topic}',
        'connector.properties.bootstrap.servers' = '{kafka_servers}',
        'connector.properties.zookeeper.connect' = '{kafka_zookeeper_servers}',
        'connector.properties.group.id' = '{kafka_consumer_group_id}',
        'connector.startup-mode' = 'latest-offset',
        'format.type' = 'json'
    )
    """

    sink_ddl = f"""
    CREATE TABLE es_sink(
        label INT -- 0/1 class label
    ) WITH (
        'connector.type' = 'kafka',
        'connector.version' = 'universal',
        'connector.topic' = '{sink_topic}',
        'connector.properties.bootstrap.servers' = '{kafka_servers}',
        'connector.properties.zookeeper.connect' = '{kafka_zookeeper_servers}',
        'connector.properties.group.id' = '{kafka_consumer_group_id}',
        'connector.startup-mode' = 'latest-offset',
        'format.type' = 'json'
    )
    """

    t_env.execute_sql(source_ddl)
    t_env.execute_sql(sink_ddl)
    t_env.create_temporary_system_function('IDS_predict', model)

    # Name the job; execute_insert below submits it, and wait() blocks on it
    t_env.get_config().get_configuration().set_string("pipeline.name", "realtime_ids")
    query = """
    select IDS_predict(dataflow) as label
    from some_ip_data
    """
    t_env.sql_query(query).execute_insert("es_sink").wait()

if __name__ == '__main__':
    model_predict()
Dependency JARs: download the Kafka connector JAR matching your Flink version (e.g. flink-sql-connector-kafka_2.11-1.12.0.jar for Flink 1.12.0 with Scala 2.11) and place it in the lib_jar_path directory configured above: https://nightlies.apache.org/flink/flink-docs-release-1.12/dev/table/connectors/kafka.html
- Simulated producer that writes data to Kafka
import time
from json import dumps
from kafka import KafkaProducer

def write_data():
    source_topic = "your_kafka_source_topic"
    kafka_servers = "your_kafka_ip:9092"
    producer = KafkaProducer(bootstrap_servers=kafka_servers,
                             value_serializer=lambda x: dumps(x).encode('utf-8'))
    # Send one sample record every 5 seconds
    while True:
        cur_data = {"dataflow": "this is a sample string " + time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())}
        producer.send(source_topic, cur_data)
        time.sleep(5)

if __name__ == '__main__':
    write_data()
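To confirm end to end that predictions actually reach the sink topic, a small consumer along the same lines can be used (a sketch using kafka-python; your_sink_topic and your_kafka_ip are placeholders as before):
from kafka import KafkaConsumer

def read_predictions():
    sink_topic = "your_sink_topic"
    kafka_servers = "your_kafka_ip:9092"
    consumer = KafkaConsumer(sink_topic,
                             bootstrap_servers=kafka_servers,
                             auto_offset_reset='latest')
    # Each message value is the JSON emitted by the es_sink table, e.g. {"label": 0}
    for message in consumer:
        print(message.value.decode('utf-8'))

if __name__ == '__main__':
    read_predictions()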
- Writing the offline-trained model to Redis
import pickle
from tensorflow.keras.models import load_model
from tensorflow.keras.models import model_from_json
from redis import Redis

model = load_model("xxx.h5")
model_json = model.to_json()
weights_list = model.get_weights()
## Serialize the model weights
weights = pickle.dumps(weights_list)
## Store the model in Redis
redis_conn = Redis(host='your_redis_ip', port=6379, password='your_password', db=0)
redis_conn.set("model", model_json)
redis_conn.set("weights", weights)
################################ Read the model back from Redis #################################
model_from_redis = model_from_json(redis_conn.get("model").decode('utf-8'))
weight_from_redis = pickle.loads(redis_conn.get("weights"))
model_from_redis.set_weights(weight_from_redis)
model_from_redis.summary()
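As a quick consistency check (a sketch that assumes the model's input_shape is fully defined, with no None dimensions beyond the batch axis), the reloaded model should reproduce the original model's predictions:
import numpy as np
## The reloaded model should produce the same outputs as the original
x = np.random.random((1,) + model.input_shape[1:])
assert np.allclose(model.predict(x), model_from_redis.predict(x))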
- Near-line inference UDF
import logging
import numpy as np
from pyflink.table.udf import ScalarFunction

class IDSModel(ScalarFunction):

    def __init__(self):
        logging.info("Model __init__")
        self.model = None
        self.redis_params = dict(host='your_redis_ip', password='your_redis_password', port=6379, db=0)
        self.metric_predict_acc = 0  # prediction accuracy (evaluated over the last 10 samples)
        self.metric_distribution_y = None  # distribution of the label y
        self.metric_total_10_sec = None  # number of samples processed in the last 10 seconds
        self.metric_right_10_sec = None  # number of correct predictions in the last 10 seconds

    def open(self, function_context):
        """
        Access the metric system and register metrics, so the job can be monitored
        in real time in the web UI (localhost:8081).
        :param function_context:
        :return:
        """
        logging.info("Model open")
        if self.model:
            logging.info("Model already loaded.")
            self.model.summary()
        else:
            logging.info("Model is empty, reloading...")
            self.model = self.load_model()
        # Register a Metric Group named online_ids so it is easy to find in the web UI.
        # Metric Group + Metric Name uniquely identifies a metric.
        metric_group = function_context.get_metric_group().add_group("online_ids")
        self.metric_counter = metric_group.counter('sample_count')  # number of samples processed
        metric_group.gauge("prediction_acc", lambda: int(self.metric_predict_acc * 100))
        self.metric_distribution_y = metric_group.distribution("metric_distribution_y")
        # Meters report average throughput; the event count is updated via mark_event(n: int).
        # Track the number of samples and correct predictions over the last 10 seconds.
        self.metric_total_10_sec = metric_group.meter("total_10_sec", time_span_in_seconds=10)
        self.metric_right_10_sec = metric_group.meter("right_10_sec", time_span_in_seconds=10)

    def eval(self, data):
        """
        Model prediction.
        :param data: the raw dataflow string from Kafka
        :return: the predicted class label
        """
        import logging
        logging.info("Model eval")
        ##### Check that the model loaded successfully #####
        try:
            logging.info(type(self.model))
            if self.model:
                logging.info("Model loaded successfully")
        except Exception as e:
            logging.info("Error while loading the model")
            logging.info(e)
        ##### Model prediction #####
        # processed_data is the result of applying the feature-engineering step
        # to `data` (implementation omitted in the original)
        y_pred = self.model.predict(processed_data)
        y_label = np.argmax(y_pred)
        return int(y_label)

    def load_model(self):
        """
        Load the model. If a model exists in Redis, load it from there;
        otherwise return None.
        :return:
        """
        import redis
        import pickle
        import logging
        from tensorflow.keras.models import model_from_json
        logging.info('Model load_model!')
        redis_conn = redis.StrictRedis(**self.redis_params)
        logging.info(redis_conn.get("test"))  # optional connectivity check
        model = None
        try:
            # Fetch the model architecture and weights from Redis,
            # deserializing the weights with pickle.loads
            model = model_from_json(redis_conn.get("model").decode('utf-8'))
            logging.info('Loading weights...')
            model.set_weights(pickle.loads(redis_conn.get("weights")))
        except TypeError:
            logging.error('No model with the given key in Redis; train a model and save it to Redis first')
        return model
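Before wiring the UDF into Flink, its Redis loading path can be exercised locally (a sketch; it assumes Redis is reachable with the parameters above and that the model/weights keys were written in the previous step):
if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO)
    ids = IDSModel()
    loaded = ids.load_model()
    if loaded is not None:
        loaded.summary()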
- Submit the job to the cluster
bin/flink run --target yarn-per-job \
-pyarch venv.zip \
-pyexec venv.zip/venv/bin/python3 \
-pyclientexec venv.zip/venv/bin/python3 \
-py model_predict.py