from pyflink.datastream import StreamExecutionEnvironment, CheckpointingMode
from pyflink.table import StreamTableEnvironment, EnvironmentSettings
'''Window TVF(表值函数table-valued function)
本案例包含三种窗口聚合方式:tumbling window aggregation;hopping window aggregation;cumulative window aggregation
注意,所有cdc模式source表都不支持窗口聚合
'''
settings = EnvironmentSettings.new_instance().in_streaming_mode().use_blink_planner().build()
env = StreamExecutionEnvironment.get_execution_environment()
env.set_parallelism(2)
env.enable_checkpointing(3600000)
env.get_checkpoint_config().set_checkpointing_mode(CheckpointingMode.EXACTLY_ONCE)
env.get_checkpoint_config().set_checkpoint_timeout(600000)
t_env = StreamTableEnvironment.create(stream_execution_environment=env, environment_settings=settings)
t_env.get_config().get_configuration().set_string("taskmanager.memory.task.off-heap.size", '100m')
t_env.get_config().set_python_executable("python3")
'''
create table order_main_source(
order_id STRING NOT NULL,
order_name STRING,
order_type STRING,
order_time STRING,
rowtime as TO_TIMESTAMP(order_time),
WATERMARK FOR rowtime AS rowtime - INTERVAL '3' MINUTES, --时间窗口watermark,订单创建时间-3分钟
PRIMARY KEY (order_id) NOT ENFORCED
);
desc order_main_source;
SELECT * FROM order_main_source;
orderdetail.csv数据内容:
d001,o001,jacket001,1,200,2022-07-20 10:25:19
d002,o001,shoe001,1,305,2022-07-20 10:25:21
d003,o002,tshirt001,2,158,2022-07-21 10:26:01
d004,o003,skirt001,1,88,2022-07-21 18:26:32
d005,o003,skirt002,1,62,2022-07-21 18:27:02
d006,o004,shoe001,1,305,2022-07-22 12:27:33
ordermain.csv数据内容:
o001,ordername1,online,2022-07-20 10:26:19,0
o002,ordername2,offline,2022-07-21 10:26:51,0
o003,ordername3,online,2022-07-21 18:27:32,0
o004,ordername4,offline,2022-07-22 12:28:33,0
'''
# 订单主表
source_ddl_main = """
create table order_main_source(
order_id STRING NOT NULL, --订单号
order_name STRING, --订单名
order_type STRING, --订单类型
order_time TIMESTAMP(3), --订单时间
is_del STRING, --删除标记
WATERMARK FOR order_time AS order_time - INTERVAL '3' MINUTES, --时间窗口watermark,订单创建时间-3分钟
PRIMARY KEY (order_id) NOT ENFORCED
) WITH (
'connector' = 'filesystem',
'path' = 'file:///srv/flinkcdc/source_tables/order_main',
'format' = 'csv'
)
"""
# 订单明细表
source_ddl_detail = """
create table order_detail_source(
detail_id STRING NOT NULL, ---订单明细id
order_id STRING, ---订单ID
product_id STRING, ---商品ID
num INT, ---商品数量
amt DECIMAL(10, 2), ---商品总价
order_time TIMESTAMP(3), ---订单时间
WATERMARK FOR order_time AS order_time - INTERVAL '3' MINUTE, --时间窗口watermark,订单创建时间-3分钟
PRIMARY KEY (detail_id) NOT ENFORCED
) WITH (
'connector' = 'filesystem',
'path' = 'file:///srv/flinkcdc/source_tables/order_detail',
'format' = 'csv'
)
"""
'''TUMBLING window:滚动窗口
TUMBLE函数根据时间属性列为关系的每一行分配一个窗口
TUMBLE函数有三个必需的参数: TUMBLE(TABLE data, DESCRIPTOR(timecol), size)
@data:表名,该表须有时间属性列
@timecol:指示哪个时间属性列映射到该滚动窗口
@size:滚动窗口的大小
效果:按序,每个窗口固定大小,不重叠
window_start和window_end是常规时间戳,无法在后续程序中作为时间属性列使用,故可用window_time(TVF生成的时间属性列,为window_end-1毫秒)
作为时间属性进行后续的操作(如级联窗⼝聚合和窗⼝TopN)
'''
# 每个窗口的累计金额
tumble_opt = """
SELECT window_start, window_end, SUM(amt) as total_amt
FROM TABLE(
TUMBLE(TABLE order_detail_source, DESCRIPTOR(order_time), INTERVAL '30' MINUTES))
GROUP BY window_start,window_end
"""
'''结果:
window_start window_end total_amt
2022-07-20 10:00:00.000 2022-07-20 10:30:00.000 505.00
2022-07-21 10:00:00.000 2022-07-21 10:30:00.000 158.00
2022-07-21 18:00:00.000 2022-07-21 18:30:00.000 150.00
2022-07-22 12:00:00.000 2022-07-22 12:30:00.000 305.00
'''
# 根据业务字段聚合的每个窗口累计金额
tumble_opt1 = """
SELECT window_start, window_end, product_id,SUM(amt) as total_amt
FROM TABLE(
TUMBLE(TABLE order_detail_source, DESCRIPTOR(order_time), INTERVAL '30' MINUTES))
GROUP BY product_id,window_start, window_end
"""
'''结果:
window_start window_end product_id total_amt
2022-07-20 10:00:00.000 2022-07-20 10:30:00.000 jacket001 200.00
2022-07-20 10:00:00.000 2022-07-20 10:30:00.000 shoe001 305.00
2022-07-21 10:00:00.000 2022-07-21 10:30:00.000 tshirt001 158.00
2022-07-21 18:00:00.000 2022-07-21 18:30:00.000 skirt001 88.00
2022-07-21 18:00:00.000 2022-07-21 18:30:00.000 skirt002 62.00
2022-07-22 12:00:00.000 2022-07-22 12:30:00.000 shoe001 305.00
'''
# 含window_time的效果
tumble_opt2 = """
SELECT window_start,window_end,window_time, SUM(amt) as total_amt
FROM TABLE(
TUMBLE(TABLE order_detail_source, DESCRIPTOR(order_time), INTERVAL '30' MINUTES))
GROUP BY window_start,window_end,window_time
"""
'''结果:
window_start window_end window_time total_amt
2022-07-20 10:00:00.000 2022-07-20 10:30:00.000 2022-07-20 10:29:59.999 505.00
2022-07-21 10:00:00.000 2022-07-21 10:30:00.000 2022-07-21 10:29:59.999 158.00
2022-07-21 18:00:00.000 2022-07-21 18:30:00.000 2022-07-21 18:29:59.999 150.00
2022-07-22 12:00:00.000 2022-07-22 12:30:00.000 2022-07-22 12:29:59.999 305.00
'''
# 多维聚合效果--所有维度组合均覆盖的情况
tumble_opt3 = """
SELECT window_start,window_end,window_time,order_id,product_id,SUM(amt) as total_amt
FROM TABLE(
TUMBLE(TABLE order_detail_source, DESCRIPTOR(order_time), INTERVAL '30' MINUTES))
GROUP BY window_start,window_end,window_time,GROUPING SETS ((order_id,product_id),(order_id),(product_id),());
"""
# 多维聚合效果--普通聚合情况
tumble_opt31 = """
SELECT window_start,window_end,window_time,order_id,product_id,SUM(amt) as total_amt
FROM TABLE(
TUMBLE(TABLE order_detail_source, DESCRIPTOR(order_time), INTERVAL '30' MINUTES))
GROUP BY window_start,window_end,window_time,order_id,product_id
"""
'''
HOPPING window:滑动窗口
HOP函数分配的窗口覆盖大小间隔内的行,并根据时间属性列移动每个窗口,窗口大小必须为间隔的整数倍
HOP(TABLE data, DESCRIPTOR(timecol), slide, size [, offset ])
@data:表名,该表须有时间属性列
@timecol:指示哪个时间属性列映射到该滚动窗口
@slide:指定顺序hopping 窗口开始之间的持续时间
@size:hopping 窗口宽度的持续时间,size为slide的整数倍
'''
# 窗口大小30min,滑动大小10min的情况
hop_opt = """
SELECT window_start, window_end, SUM(amt) as total_amt
FROM TABLE(
HOP(TABLE order_detail_source, DESCRIPTOR(order_time), INTERVAL '10' MINUTES, INTERVAL '30' MINUTES))
GROUP BY window_start, window_end;
"""
'''结果:
window_start window_end total_amt
2022-07-20 10:00:00.000 2022-07-20 10:30:00.000 505.00
2022-07-20 10:10:00.000 2022-07-20 10:40:00.000 505.00
2022-07-20 10:20:00.000 2022-07-20 10:50:00.000 505.00
2022-07-21 10:00:00.000 2022-07-21 10:30:00.000 158.00
2022-07-21 10:10:00.000 2022-07-21 10:40:00.000 158.00
2022-07-21 10:20:00.000 2022-07-21 10:50:00.000 158.00
2022-07-21 18:00:00.000 2022-07-21 18:30:00.000 150.00
2022-07-21 18:10:00.000 2022-07-21 18:40:00.000 150.00
2022-07-21 18:20:00.000 2022-07-21 18:50:00.000 150.00
2022-07-22 12:00:00.000 2022-07-22 12:30:00.000 305.00
2022-07-22 12:10:00.000 2022-07-22 12:40:00.000 305.00
2022-07-22 12:20:00.000 2022-07-22 12:50:00.000 305.00
'''
# 含window_time情况、及按不同维度聚合的写法类似tumble例子中的写法
'''cumulative window:累计窗口[应用较多]
场景:有一个1小时步长和1天最大大小的累积窗口,将得到每天的窗口:
[00:00,01:00),[00:00,02:00),[00:00,03:00),…,[00:00,24:00)
针对每个窗口进行累计数据的计算
CUMULATE(TABLE data, DESCRIPTOR(timecol), step, size)
@data:表名,该表须有时间属性列
@timecol:指示哪个时间属性列映射到该滚动窗口
@step:每个步长持续的时间
@size:累计窗口最大宽度持续时间,须为步长时间的整数倍
'''
cum_opt = """
SELECT window_start, window_end, SUM(amt) as total_amt
FROM TABLE(
CUMULATE(TABLE order_detail_source, DESCRIPTOR(order_time), INTERVAL '1' HOURS, INTERVAL '24' HOURS))
GROUP BY window_start, window_end;
"""
'''结果:
window_start window_end total_amt
2022-07-20 00:00:00.000 2022-07-20 11:00:00.000 505.00
2022-07-20 00:00:00.000 2022-07-20 12:00:00.000 505.00
2022-07-20 00:00:00.000 2022-07-20 13:00:00.000 505.00
2022-07-20 00:00:00.000 2022-07-20 14:00:00.000 505.00
2022-07-20 00:00:00.000 2022-07-20 15:00:00.000 505.00
2022-07-20 00:00:00.000 2022-07-20 16:00:00.000 505.00
2022-07-20 00:00:00.000 2022-07-20 17:00:00.000 505.00
2022-07-20 00:00:00.000 2022-07-20 18:00:00.000 505.00
2022-07-20 00:00:00.000 2022-07-20 19:00:00.000 505.00
2022-07-20 00:00:00.000 2022-07-20 20:00:00.000 505.00
2022-07-20 00:00:00.000 2022-07-20 21:00:00.000 505.00
2022-07-20 00:00:00.000 2022-07-20 22:00:00.000 505.00
2022-07-20 00:00:00.000 2022-07-20 23:00:00.000 505.00
2022-07-20 00:00:00.000 2022-07-21 00:00:00.000 505.00
2022-07-21 00:00:00.000 2022-07-21 11:00:00.000 158.00
2022-07-21 00:00:00.000 2022-07-21 12:00:00.000 158.00
2022-07-21 00:00:00.000 2022-07-21 13:00:00.000 158.00
2022-07-21 00:00:00.000 2022-07-21 14:00:00.000 158.00
2022-07-21 00:00:00.000 2022-07-21 15:00:00.000 158.00
2022-07-21 00:00:00.000 2022-07-21 16:00:00.000 158.00
2022-07-21 00:00:00.000 2022-07-21 17:00:00.000 158.00
2022-07-21 00:00:00.000 2022-07-21 18:00:00.000 158.00
2022-07-21 00:00:00.000 2022-07-21 19:00:00.000 308.00
2022-07-21 00:00:00.000 2022-07-21 20:00:00.000 308.00
2022-07-21 00:00:00.000 2022-07-21 21:00:00.000 308.00
2022-07-21 00:00:00.000 2022-07-21 22:00:00.000 308.00
2022-07-21 00:00:00.000 2022-07-21 23:00:00.000 308.00
2022-07-21 00:00:00.000 2022-07-22 00:00:00.000 308.00
2022-07-22 00:00:00.000 2022-07-22 13:00:00.000 305.00
2022-07-22 00:00:00.000 2022-07-22 14:00:00.000 305.00
2022-07-22 00:00:00.000 2022-07-22 15:00:00.000 305.00
'''
# 含window_time情况、及按不同维度聚合的写法类似tumble例子中的写法
flink LISTAGG顺序 flink last value
转载本文章为转载内容,我们尊重原作者对文章享有的著作权。如有内容错误或侵权问题,欢迎原作者联系我们进行内容更正或删除文章。
提问和评论都可以,用心的回复会被更多人看到
评论
发布评论
相关文章
-
python 更改另一个进程变量
大家好我叫乌图米,我会在这里陆续跟大家分享一些有的没的数码体验、软件技巧、系统知识,欢迎大家留言与我交流~如果你喜欢文章的内容,可以在文末点个赞 ,你的支持就是我最大的动力 !这篇文章记录一下 macOS 环境变量的相关内容。啥是环境变量,它从哪儿加载,如何查看和设置环境变量,我们需要注意哪些环境变量?一起来看看。0 啥是环境变量操作系统中的环境变量「environment vari
python 更改另一个进程变量 node环境变量配置 node配置环境变量 shell 脚本调用环境变量失败