Flink Core Programming
1、Environment
When a Flink job is submitted for execution, it first has to establish a connection with the Flink framework, that is, obtain the current Flink execution environment. Only with this environment information can tasks be scheduled to the different TaskManagers. Obtaining this environment object is quite straightforward.
Batch execution environment
ExecutionEnvironment benv = ExecutionEnvironment.getExecutionEnvironment();
Stream execution environment
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
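getExecutionEnvironment() returns a local environment when the program runs in the IDE and a cluster environment when the job is submitted to a cluster. If you need to create an environment explicitly, a minimal sketch is shown below (the host, port and jar path are placeholders):
// Local environment running in the current JVM, with an explicit parallelism
StreamExecutionEnvironment localEnv = StreamExecutionEnvironment.createLocalEnvironment(2);
// Remote environment that submits the job to a running cluster (placeholder host/port/jar)
StreamExecutionEnvironment remoteEnv = StreamExecutionEnvironment.createRemoteEnvironment("hadoop102", 8081, "path/to/your-job.jar");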
2、Source
Flink can obtain data from many different origins and hand it to the framework for processing. We call these origins data sources.
Code example:
// Define a bean class: a water level sensor that receives air-gap (water level) readings
// id: sensor id
// ts: timestamp
// vc: water level (air gap)
public class WaterSensor {
private String id;
private Long ts;
private Integer vc;
public WaterSensor() {
}
public WaterSensor(String id, Long ts, Integer vc) {
this.id = id;
this.ts = ts;
this.vc = vc;
}
public String getId() {
return id;
}
public void setId(String id) {
this.id = id;
}
public Long getTs() {
return ts;
}
public void setTs(Long ts) {
this.ts = ts;
}
public Integer getVc() {
return vc;
}
public void setVc(Integer vc) {
this.vc = vc;
}
@Override
public String toString() {
return "WaterSensor{" +
"id='" + id + '\'' +
", ts=" + ts +
", vc=" + vc +
'}';
}
}
1) Reading data from a collection
In many cases data can be staged in memory in a suitable data structure and then used as a data source. A collection is the most common choice for that data structure.
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import java.util.Arrays;
public class Flink02_Source_Collection {
public static void main(String[] args) throws Exception {
//创建执行环境
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
//指定并行度
env.setParallelism(1);
//从集合中读取数据
DataStreamSource<WaterSensor> collectionDs = env.fromCollection(Arrays.asList(
new WaterSensor("ws_001", 1577844001L, 45),
new WaterSensor("ws_002", 1577844015L, 45),
new WaterSensor("ws_003", 1577844020L, 45)
));
//打印
collectionDs.print();
//执行
env.execute();
}
}
2) Reading data from a file
More commonly, data is obtained from a storage medium; a typical case is using log files as the data source.
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
public class Flink03_Source_File {
public static void main(String[] args) throws Exception {
//创建执行环境
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment().setParallelism(1);
//从文件中读取数据
DataStreamSource<String> fileDs = env.readTextFile("input/sensor-data.log");
//打印
fileDs.print();
//执行
env.execute();
}
}
3) Reading data from Kafka
Kafka, as a message transport queue, is a distributed, high-throughput, easily scalable topic-based publish/subscribe messaging system. In today's enterprise development, Kafka and Flink have become the first choice for building real-time data processing systems.
① Add the Kafka connector dependency
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-connector-kafka-0.11_${scala.binary.version}</artifactId>
<version>${flink.version}</version>
</dependency>
② Reference implementation
import org.apache.flink.api.common.serialization.SimpleStringSchema;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.connectors.kafka.FlinkKafkaConsumer011;
import java.util.Properties;
public class Flink04_Source_Kafka {
public static void main(String[] args) throws Exception {
//创建执行环境
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment().setParallelism(1);
//Read data from Kafka
Properties properties = new Properties();
//Kafka cluster address
properties.setProperty("bootstrap.servers", "hadoop102:9092");
//Consumer group
properties.setProperty("group.id","consumer-group");
//Deserializers
properties.setProperty("key.deserializer", "org.apache.kafka.common.serialization.StringDeserializer");
properties.setProperty("value.deserializer", "org.apache.kafka.common.serialization.StringDeserializer");
//Offset reset strategy
properties.setProperty("auto.offset.reset", "latest");
DataStreamSource<String> kafkaDs = env.addSource(new FlinkKafkaConsumer011<String>(
"sensor",
new SimpleStringSchema(),
properties));
kafkaDs.print("kafka source");
env.execute();
}
}
4) Custom data source
In most cases the sources above are sufficient, but special situations inevitably come up, so Flink also provides a way to define your own data source.
import com.atguigu.bean.WaterSensor;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.source.SourceFunction;
import java.util.Random;
public class Flink05_Source_MySource {
public static void main(String[] args) throws Exception {
// 0.创建执行环境
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
// TODO 1.Source:从自定义数据源读取
DataStreamSource<WaterSensor> inputDS = env.addSource(new MySourceFunction());
inputDS.print();
env.execute();
}
/**
* Custom data source
* 1. Implement SourceFunction and specify the output type
* 2. Override two methods:
* run(): emit the data
* cancel(): stop producing data
*/
public static class MySourceFunction implements SourceFunction<WaterSensor> {
// Flag that controls whether data keeps being produced
private boolean flag = true;
@Override
public void run(SourceContext<WaterSensor> ctx) throws Exception {
Random random = new Random();
while (flag) {
ctx.collect(
new WaterSensor(
"sensor_" + random.nextInt(3),
System.currentTimeMillis(),
random.nextInt(10) + 40
)
);
Thread.sleep(2000L);
}
}
@Override
public void cancel() {
this.flag = false;
}
}
}
3、Transform
1)map
Ø Mapping: transforms each element of the stream into a new element, consuming one element and producing one element
Ø Parameter: a lambda expression or a MapFunction implementation
Ø Returns: DataStream
//lambda
SingleOutputStreamOperator<WaterSensor> sensorDS = inputDS
.map((MapFunction<String, WaterSensor>) value -> {
String[] datas = value.split(",");
return new WaterSensor(datas[0], Long.valueOf(datas[1]), Integer.valueOf(datas[2]));
});
import com.atguigu.bean.WaterSensor;
import org.apache.flink.api.common.functions.MapFunction;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.source.SourceFunction;
import java.util.Random;
public class Flink06_Transform_Map {
public static void main(String[] args) throws Exception {
// 0.创建执行环境
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
env.setParallelism(1);
// 1.从文件读取数据
DataStreamSource<String> inputDS = env.readTextFile("input/sensor-data.log");
// 2.Transform: Map转换成实体对象
SingleOutputStreamOperator<WaterSensor> sensorDS = inputDS.map(new MyMapFunction());
// 3.打印
sensorDS.print();
env.execute();
}
/**
* Implement the MapFunction interface and define the generic types (input, output)
* Override the map method
*/
public static class MyMapFunction implements MapFunction<String, WaterSensor> {
@Override
public WaterSensor map(String value) throws Exception {
String[] datas = value.split(",");
return new WaterSensor(datas[0], Long.valueOf(datas[1]), Integer.valueOf(datas[2]));
}
}
}
Every Flink function class has a Rich version. It differs from the regular function in that it can access the runtime context and has lifecycle methods, so it can implement more complex behavior. In other words, it offers more, and richer, functionality. For example: RichMapFunction.
import com.atguigu.bean.WaterSensor;
import org.apache.flink.api.common.functions.MapFunction;
import org.apache.flink.api.common.functions.RichMapFunction;
import org.apache.flink.api.common.functions.RuntimeContext;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
public class Flink07_Transform_RichMapFunction {
public static void main(String[] args) throws Exception {
// 0.创建执行环境
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
env.setParallelism(1);
// 1.从文件读取数据
DataStreamSource<String> inputDS = env.readTextFile("input/sensor-data.log");
// DataStreamSource<String> inputDS = env.socketTextStream("localhost",9999);
// 2.Transform: Map转换成实体对象
SingleOutputStreamOperator<WaterSensor> sensorDS = inputDS.map(new MyRichMapFunction());
// 3.打印
sensorDS.print();
env.execute();
}
/**
* Extend RichMapFunction and specify the input and output types
* Provides the open() and close() lifecycle methods
* Can obtain the runtime context object => access to state, task information and other environment information
*/
public static class MyRichMapFunction extends RichMapFunction<String, WaterSensor> {
@Override
public WaterSensor map(String value) throws Exception {
String[] datas = value.split(",");
return new WaterSensor(getRuntimeContext().getTaskName() + datas[0], Long.valueOf(datas[1]), Integer.valueOf(datas[2]));
}
@Override
public void open(Configuration parameters) throws Exception {
System.out.println("open...");
}
@Override
public void close() throws Exception {
System.out.println("close...");
}
}
}
Rich Function has a lifecycle. The typical lifecycle methods are:
open(): the initialization method of a rich function, called before an operator such as map or filter processes any data
close(): the last method called in the lifecycle, used for cleanup work
getRuntimeContext(): provides information from the function's RuntimeContext, such as the parallelism of the function, the task name, and state
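As a further illustration (a sketch, not code from the original text), a RichFilterFunction that uses these lifecycle hooks and the runtime context could look like this:
import org.apache.flink.api.common.functions.RichFilterFunction;
import org.apache.flink.configuration.Configuration;
public class MyRichFilterFunction extends RichFilterFunction<String> {
    @Override
    public void open(Configuration parameters) throws Exception {
        // Called once per parallel subtask before any element is processed; open connections / initialize state here
        System.out.println("open on subtask " + getRuntimeContext().getIndexOfThisSubtask());
    }
    @Override
    public boolean filter(String value) throws Exception {
        // Keep only non-empty lines
        return value != null && !value.isEmpty();
    }
    @Override
    public void close() throws Exception {
        // Called once at the end of the lifecycle; do cleanup work here
        System.out.println("close on subtask " + getRuntimeContext().getIndexOfThisSubtask());
    }
}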
2)flatMap
Ø Flat mapping: breaks a composite element of the stream into individual elements, consuming one element and producing zero or more elements
Ø Parameter: a lambda expression or a FlatMapFunction implementation
Ø Returns: DataStream
import com.atguigu.bean.WaterSensor;
import org.apache.flink.api.common.functions.FlatMapFunction;
import org.apache.flink.api.common.functions.MapFunction;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.util.Collector;
import java.util.Arrays;
import java.util.List;
public class Flink08_Transform_FlatMap {
public static void main(String[] args) throws Exception {
// 0.创建执行环境
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
env.setParallelism(1);
// 1.从文件读取数据
DataStreamSource<List<Integer>> inputDS = env.fromCollection(
Arrays.asList(
Arrays.asList(1, 2, 3, 4),
Arrays.asList(5, 6, 7, 8)
)
);
// 2.Transform: FlatMap转换成实体对象
inputDS
.flatMap(new FlatMapFunction<List<Integer>, Integer>() {
@Override
//The incoming elements are the two lists List(1, 2, 3, 4) and List(5, 6, 7, 8)
public void flatMap(List<Integer> value, Collector<Integer> out) throws Exception {
//Iterate over each incoming List
for (Integer number : value) {
out.collect(number + 10);
}
}
})
.print();// 11 12 13 14 15 16 17 18
env.execute();
}
}
3)filter
Ø Filtering: keeps the elements for which the given predicate returns true and discards those for which it returns false
Ø Parameter: a lambda expression or a FilterFunction implementation
Ø Returns: DataStream
import org.apache.flink.api.common.functions.FilterFunction;
import org.apache.flink.api.common.functions.FlatMapFunction;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.util.Collector;
import java.util.Arrays;
import java.util.List;
/**
* TODO
*
* @author cjp
* @version 1.0
* @date 2020/9/16 15:29
*/
public class Flink09_Transform_Filter {
public static void main(String[] args) throws Exception {
// 0.创建执行环境
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
env.setParallelism(1);
// 1.从文件读取数据
DataStreamSource<Integer> inputDS = env.fromCollection(
Arrays.asList(1, 2, 3, 4, 5, 6, 7, 8)
);
// TODO Transform: filter => 为 true保留,为 false丢弃
inputDS
.filter(new MyFilterFunction())
.print(); // 2,4,6,8
env.execute();
}
public static class MyFilterFunction implements FilterFunction<Integer> {
@Override
public boolean filter(Integer value) throws Exception {
return value % 2 == 0;
}
}
}
4) keyBy
Spark has a groupBy operator that groups data according to a given rule. Flink has similar functionality: keyBy, which partitions the data by the specified key.
Ø Partitioning: sends elements to different partitions according to the hash code of the specified key; elements with the same key go to the same partition (a partition here is one of the parallel instances of the downstream operator). keyBy() partitions by hash, as illustrated in the sketch after this list.
Ø Parameter: a lambda expression / KeySelector, a POJO field name, or a tuple index; arrays cannot be used as keys
Ø Returns: KeyedStream
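The claim above that keyBy() partitions by hash can be made concrete: Flink murmur-hashes the key's hashCode into a key group, and the key group is then mapped to one of the parallel subtasks. A small sketch using Flink's own utility class (KeyGroupRangeAssignment comes from flink-runtime, which is available transitively; the max parallelism 128 and parallelism 4 below are just example values):
import org.apache.flink.runtime.state.KeyGroupRangeAssignment;
public class KeyByPartitionSketch {
    public static void main(String[] args) {
        int maxParallelism = 128; // a common default max parallelism
        int parallelism = 4;      // example operator parallelism
        String key = "sensor_1";
        // key -> murmurHash(key.hashCode()) % maxParallelism -> key group
        int keyGroup = KeyGroupRangeAssignment.assignToKeyGroup(key, maxParallelism);
        // key group -> keyGroup * parallelism / maxParallelism -> subtask index
        int subtask = KeyGroupRangeAssignment.assignKeyToParallelOperator(key, maxParallelism, parallelism);
        System.out.println("key group = " + keyGroup + ", subtask index = " + subtask);
    }
}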
package com.atguigu.chapter05;
import com.atguigu.bean.WaterSensor;
import org.apache.flink.api.common.functions.FilterFunction;
import org.apache.flink.api.java.functions.KeySelector;
import org.apache.flink.api.java.tuple.Tuple;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.datastream.KeyedStream;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import java.util.Arrays;
public class Flink10_Transform_KeyBy {
public static void main(String[] args) throws Exception {
// 0.创建执行环境
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
env.setParallelism(1);
// 1.从文件读取数据
DataStreamSource<String> inputDS = env.readTextFile("input/sensor-data.log");
// 2.Transform: Map转换成实体对象
SingleOutputStreamOperator<WaterSensor> sensorDS = inputDS.map(new Flink06_Transform_Map.MyMapFunction());
// TODO keyBy: grouping
// With a position index or a field name the key type cannot be determined, so a Tuple is returned, which is awkward to work with later
// By specifying the key explicitly (a KeySelector implementation or a lambda) the key keeps its concrete type
// Grouping is logical: each element is tagged with the group it belongs to; the parallelism is not changed
// sensorDS.keyBy(0).print();
// KeyedStream<WaterSensor, Tuple> sensorKSByFieldName = sensorDS.keyBy("id");
KeyedStream<WaterSensor, String> sensorKSByKeySelector = sensorDS.keyBy(new MyKeySelector());
// KeyedStream<WaterSensor, String> waterSensorStringKeyedStream = sensorDS.keyBy(r -> r.getId());
env.execute();
}
public static class MyKeySelector implements KeySelector<WaterSensor, String> {
@Override
public String getKey(WaterSensor value) throws Exception {
return value.getId();
}
}
}
5)shuffle
Ø Shuffle: randomly redistributes the data to the downstream partitions
Ø Parameter: none
Ø Returns: DataStream
package com.atguigu.chapter05;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
public class Flink11_Transform_Shuffle {
public static void main(String[] args) throws Exception {
// 0.创建执行环境
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
env.setParallelism(2);
// 1.从文件读取数据
DataStreamSource<String> inputDS = env.readTextFile("input/sensor-data.log");
inputDS.print("input");
DataStream<String> resultDS = inputDS.shuffle();
resultDS.print("shuffle");
env.execute();
}
}
6) split
In some situations we need to split a stream into two or more streams according to certain characteristics, tagging the different streams so they can later be selected from the stream.
import com.atguigu.bean.WaterSensor;
import org.apache.flink.api.common.functions.MapFunction;
import org.apache.flink.streaming.api.collector.selector.OutputSelector;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.datastream.SplitStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import java.util.Arrays;
public class Flink12_Transform_Split {
public static void main(String[] args) throws Exception {
// 0.创建执行环境
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
env.setParallelism(1);
// 1.从文件读取数据
SingleOutputStreamOperator<WaterSensor> sensorDS = env
.readTextFile("input/sensor-data.log")
.map(new MapFunction<String, WaterSensor>() {
@Override
public WaterSensor map(String value) throws Exception {
String[] datas = value.split(",");
return new WaterSensor(datas[0], Long.valueOf(datas[1]), Integer.valueOf(datas[2]));
}
});
// TODO Split: water level below 50 is normal, [50,80) is warn, 80 and above is alarm
// split does not actually separate the stream; it only tags the elements
SplitStream<WaterSensor> splitSS = sensorDS.split(new OutputSelector<WaterSensor>() {
@Override
public Iterable<String> select(WaterSensor value) {
if (value.getVc() < 50) {
return Arrays.asList("normal");
} else if (value.getVc() < 80) {
return Arrays.asList("warn");
} else {
return Arrays.asList("alarm");
}
}
}
);
env.execute();
}
}
7) select
After a stream has been split, how do we pull the differently tagged elements out of it? That is what the select operator is for.
import com.atguigu.bean.WaterSensor;
import org.apache.flink.api.common.functions.MapFunction;
import org.apache.flink.streaming.api.collector.selector.OutputSelector;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.datastream.SplitStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import java.util.Arrays;
public class Flink13_Transform_Select {
public static void main(String[] args) throws Exception {
// 0.创建执行环境
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
env.setParallelism(1);
// 1.从文件读取数据
SingleOutputStreamOperator<WaterSensor> sensorDS = env
.readTextFile("input/sensor-data.log")
.map(new MapFunction<String, WaterSensor>() {
@Override
public WaterSensor map(String value) throws Exception {
String[] datas = value.split(",");
return new WaterSensor(datas[0], Long.valueOf(datas[1]), Integer.valueOf(datas[2]));
}
});
// TODO Split: water level below 50 is normal, [50,80) is warn, 80 and above is alarm
// split does not actually separate the stream
SplitStream<WaterSensor> splitSS = sensorDS.split(new OutputSelector<WaterSensor>() {
@Override
public Iterable<String> select(WaterSensor value) {
if (value.getVc() < 50) {
return Arrays.asList("normal","happy","hi");
} else if (value.getVc() < 80) {
return Arrays.asList("warn","happy");
} else {
return Arrays.asList("alarm");
}
}
}
);
//TODO select
// Get the corresponding stream by the tag names assigned earlier
// A stream can be given several names; when selecting, any one of them is enough
splitSS.select("normal").print("normal");
// splitSS.select("hi").print("normal");
splitSS.select("happy").print("warn");
// splitSS.select("warn").print("warn");
// splitSS.select("alarm").print("alarm");
env.execute();
}
}
8) connect
In some situations we need to connect two streams from different sources in order to match their data, for example order payments and third-party transaction records: the data comes from different sources, and only after connecting the streams and reconciling the order payment against the third-party transaction can the payment be considered truly complete.
Flink's connect operator connects two data streams while keeping their types. After being connected, the two streams are merely placed in the same stream; internally each keeps its own data and form unchanged, and the two streams remain independent of each other.
import com.atguigu.bean.WaterSensor;
import org.apache.flink.api.common.functions.MapFunction;
import org.apache.flink.streaming.api.collector.selector.OutputSelector;
import org.apache.flink.streaming.api.datastream.ConnectedStreams;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.datastream.SplitStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.co.CoMapFunction;
import java.util.Arrays;
public class Flink14_Transform_Connect {
public static void main(String[] args) throws Exception {
// 0.创建执行环境
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
env.setParallelism(1);
// 1.从文件读取数据
SingleOutputStreamOperator<WaterSensor> sensorDS = env
.readTextFile("input/sensor-data.log")
.map(new MapFunction<String, WaterSensor>() {
@Override
public WaterSensor map(String value) throws Exception {
String[] datas = value.split(",");
return new WaterSensor(datas[0], Long.valueOf(datas[1]), Integer.valueOf(datas[2]));
}
});
// 再获取一条流
DataStreamSource<Integer> numDS = env.fromCollection(Arrays.asList(1, 2, 3, 4));
// TODO connect the two streams with connect
// The two streams may have different data types
// Only two streams can be connected
// When processing, the two streams are still handled separately
ConnectedStreams<WaterSensor, Integer> sensorNumCS = sensorDS.connect(numDS);
// 调用其他算子
SingleOutputStreamOperator<Object> resultDS = sensorNumCS.map(
new CoMapFunction<WaterSensor, Integer, Object>() {
@Override
public String map1(WaterSensor value) throws Exception {
return value.toString();
}
@Override
public Integer map2(Integer value) throws Exception {
return value + 10;
}
});
resultDS.print();
env.execute();
}
}
9) union
union combines two or more DataStreams into a new DataStream that contains the elements of all of them.
import com.atguigu.bean.WaterSensor;
import org.apache.flink.api.common.functions.MapFunction;
import org.apache.flink.streaming.api.datastream.ConnectedStreams;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.co.CoMapFunction;
import java.util.Arrays;
public class Flink15_Transform_Union {
public static void main(String[] args) throws Exception {
// 0.创建执行环境
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
env.setParallelism(1);
// 获取流
DataStreamSource<Integer> numDS = env.fromCollection(Arrays.asList(1, 2, 3, 4));
DataStreamSource<Integer> numDS1 = env.fromCollection(Arrays.asList(11, 12, 13, 14));
DataStreamSource<Integer> numDS2 = env.fromCollection(Arrays.asList(21, 22, 23, 24));
//TODO union the streams
// The streams must have the same data type
// More than two streams can be unioned
DataStream<Integer> unionDS = numDS
.union(numDS1)
.union(numDS2);
unionDS
.map(new MapFunction<Integer, Integer>() {
@Override
public Integer map(Integer value) throws Exception {
return value * 10;
}
})
.print("union");
env.execute();
}
}
4、Operator
1) Rolling aggregation operators
These operators aggregate each keyed substream of a KeyedStream. After the aggregation, the results are merged back into a single stream, so the result is always a DataStream.
import com.atguigu.bean.WaterSensor;
import org.apache.flink.api.common.functions.MapFunction;
import org.apache.flink.api.java.functions.KeySelector;
import org.apache.flink.api.java.tuple.Tuple3;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.datastream.KeyedStream;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
public class Flink16_Transform_RollingAgg {
public static void main(String[] args) throws Exception {
// 0.创建执行环境
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
env.setParallelism(1);
// 1.从文件读取数据
DataStreamSource<String> inputDS = env
// .readTextFile("input/sensor-data.log");
.socketTextStream("localhost",9999 );
// 2.Transform: Map转换成实体对象
SingleOutputStreamOperator<WaterSensor> sensorDS = inputDS.map((MapFunction<String, WaterSensor>) value -> {
String[] datas = value.split(",");
return new WaterSensor(datas[0], Long.valueOf(datas[1]), Integer.valueOf(datas[2]));
});
// 3.按照 id 分组
KeyedStream<Tuple3<String, Long, Integer>, String> sensorKS = sensorDS
.map(new MapFunction<WaterSensor, Tuple3<String, Long, Integer>>() {
@Override
public Tuple3<String, Long, Integer> map(WaterSensor value) throws Exception {
return new Tuple3<>(value.getId(), value.getTs(), value.getVc());
}
})
.keyBy( r -> r.f0);
// TODO rolling aggregation operators: aggregate each arriving element and emit an updated result each time
// sensorKS.sum(2).print("sum");
sensorKS.max(2).print("max");
// sensorKS.min(2).print("min");
env.execute();
}
public static class MyKeySelector implements KeySelector<WaterSensor, String> {
@Override
public String getKey(WaterSensor value) throws Exception {
return value.getId();
}
}
}
2) reduce
An aggregation over a keyed stream that combines the current element with the previous aggregation result to produce a new value. The returned stream contains the result of every aggregation step, not just the final one.
import com.atguigu.bean.WaterSensor;
import org.apache.flink.api.common.functions.MapFunction;
import org.apache.flink.api.common.functions.ReduceFunction;
import org.apache.flink.api.java.functions.KeySelector;
import org.apache.flink.api.java.tuple.Tuple3;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.datastream.KeyedStream;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
public class Flink17_Transform_Reduce {
public static void main(String[] args) throws Exception {
// 0.创建执行环境
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
env.setParallelism(1);
// 1.从文件读取数据
DataStreamSource<String> inputDS = env
// .readTextFile("input/sensor-data.log");
.socketTextStream("localhost", 9999);
// 2.Transform: Map转换成实体对象
SingleOutputStreamOperator<WaterSensor> sensorDS = inputDS.map(new Flink06_Transform_Map.MyMapFunction());
// 3.按照 id 分组
KeyedStream<Tuple3<String, Long, Integer>, String> sensorKS = sensorDS
.map(new MapFunction<WaterSensor, Tuple3<String, Long, Integer>>() {
@Override
public Tuple3<String, Long, Integer> map(WaterSensor value) throws Exception {
return new Tuple3<>(value.getId(), value.getTs(), value.getVc());
}
})
.keyBy(r -> r.f0);
// TODO Reduce
// 1. The input types must match, and the output type must match them as well
// 2. The first element of a key does not go through reduce
// 3. The intermediate state is kept for us
sensorKS
.reduce(
new ReduceFunction<Tuple3<String, Long, Integer>>() {
@Override
public Tuple3<String, Long, Integer> reduce(Tuple3<String, Long, Integer> value1, Tuple3<String, Long, Integer> value2) throws Exception {
System.out.println(value1.toString() + " <-> " + value2.toString());
return Tuple3.of("aaa", 123L, value1.f2 + value2.f2);
}
}
)
.print("reduce");
env.execute();
}
public static class MyKeySelector implements KeySelector<WaterSensor, String> {
@Override
public String getKey(WaterSensor value) throws Exception {
return value.getId();
}
}
}
3) process
After a stream has been keyed with keyBy, if you want access to environment information during processing, you can use the process operator and implement the logic yourself.
import com.atguigu.bean.WaterSensor;
import org.apache.flink.api.common.functions.MapFunction;
import org.apache.flink.api.common.functions.ReduceFunction;
import org.apache.flink.api.java.functions.KeySelector;
import org.apache.flink.api.java.tuple.Tuple3;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.datastream.KeyedStream;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.KeyedProcessFunction;
import org.apache.flink.util.Collector;
public class Flink18_Transform_Process {
public static void main(String[] args) throws Exception {
// 0.创建执行环境
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
env.setParallelism(1);
// 1.从文件读取数据
DataStreamSource<String> inputDS = env
.readTextFile("input/sensor-data.log");
// .socketTextStream("localhost", 9999);
// 2.Transform: Map转换成实体对象
SingleOutputStreamOperator<WaterSensor> sensorDS = inputDS.map((MapFunction<String, WaterSensor>) value -> {
String[] datas = value.split(",");
return new WaterSensor(datas[0], Long.valueOf(datas[1]), Integer.valueOf(datas[2]));
});
// 3.按照 id 分组
KeyedStream<Tuple3<String, Long, Integer>, String> sensorKS = sensorDS
.map(new MapFunction<WaterSensor, Tuple3<String, Long, Integer>>() {
@Override
public Tuple3<String, Long, Integer> map(WaterSensor value) throws Exception {
return new Tuple3<>(value.getId(), value.getTs(), value.getVc());
}
})
.keyBy(r -> r.f0);
// TODO Process
// Gives access to some environment information
sensorKS.process(
new KeyedProcessFunction<String, Tuple3<String, Long, Integer>, String>() {
/**
* Processes the data: called once per element
* @param value one element
* @param ctx the context
* @param out the collector
* @throws Exception
*/
@Override
public void processElement(Tuple3<String, Long, Integer> value, Context ctx, Collector<String> out) throws Exception {
out.collect("当前key=" + ctx.getCurrentKey() + "当前时间=" + ctx.timestamp() + ",数据=" + value);
}
}
)
.print("process");
env.execute();
}
public static class MyKeySelector implements KeySelector<WaterSensor, String> {
@Override
public String getKey(WaterSensor value) throws Exception {
return value.getId();
}
}
}
5、Sink
Sink literally means "to sink". In Flink, a sink can be understood as storing data or, more broadly, as the output operation that sends processed data to a designated external system. The print method we have been using all along is in fact a kind of sink.
1)Kafka Sink
import com.atguigu.bean.WaterSensor;
import org.apache.flink.api.common.functions.MapFunction;
import org.apache.flink.api.common.serialization.SimpleStringSchema;
import org.apache.flink.api.java.functions.KeySelector;
import org.apache.flink.api.java.tuple.Tuple3;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.datastream.KeyedStream;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.KeyedProcessFunction;
import org.apache.flink.streaming.connectors.kafka.FlinkKafkaProducer011;
import org.apache.flink.util.Collector;
public class Sink_Kafka {
public static void main(String[] args) throws Exception {
// 0.创建执行环境
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
env.setParallelism(1);
// 1.从文件读取数据
DataStreamSource<String> inputDS = env
.readTextFile("input/sensor-data.log");
// .socketTextStream("localhost", 9999);
//TODO sink the data to Kafka
// addSink is called on the DataStream => note, not on env
inputDS.addSink(
new FlinkKafkaProducer011<String>(
"hadoop102:9092",
"sensor0421",
new SimpleStringSchema())
);
env.execute();
}
}
2)Redis Sink
We can send the processed data to a Redis cache.
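Before running the example below, the Redis connector dependency has to be added to the pom. The coordinates shown here are an assumption based on the commonly used Bahir-based connector; adjust the artifact and version to your setup:
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-connector-redis_2.11</artifactId>
<version>1.1.5</version>
</dependency>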
import com.atguigu.bean.WaterSensor;
import org.apache.flink.api.common.serialization.SimpleStringSchema;
import org.apache.flink.api.java.functions.KeySelector;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.connectors.kafka.FlinkKafkaProducer011;
import org.apache.flink.streaming.connectors.redis.RedisSink;
import org.apache.flink.streaming.connectors.redis.common.config.FlinkJedisPoolConfig;
import org.apache.flink.streaming.connectors.redis.common.mapper.RedisCommand;
import org.apache.flink.streaming.connectors.redis.common.mapper.RedisCommandDescription;
import org.apache.flink.streaming.connectors.redis.common.mapper.RedisMapper;
public class Flink20_Sink_Redis {
public static void main(String[] args) throws Exception {
// 0.创建执行环境
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
env.setParallelism(1);
// 1.从文件读取数据
DataStreamSource<String> inputDS = env
.readTextFile("input/sensor-data.log");
// .socketTextStream("localhost", 9999);
//TODO sink the data to Redis
FlinkJedisPoolConfig jedisConfig = new FlinkJedisPoolConfig.Builder()
.setHost("hadoop102")
.setPort(6379)
.build();
inputDS.addSink(
new RedisSink<String>(
jedisConfig,
new RedisMapper<String>() {
// The Redis command; the key here is the outermost Redis key
@Override
public RedisCommandDescription getCommandDescription() {
return new RedisCommandDescription(RedisCommand.HSET,"sensor0421");
}
// Hash type: this specifies the field (key inside the hash)
@Override
public String getKeyFromData(String data) {
String[] datas = data.split(",");
return datas[1];
}
// Hash type: this specifies the value inside the hash
@Override
public String getValueFromData(String data) {
String[] datas = data.split(",");
return datas[2];
}
}
)
);
env.execute();
}
}
3)ElasticSearch Sink
We can send the processed data to an Elasticsearch cluster.
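As with Kafka, the Elasticsearch sink needs its own connector dependency in the pom; assuming Elasticsearch 6.x (which matches the elasticsearch6 package used below):
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-connector-elasticsearch6_${scala.binary.version}</artifactId>
<version>${flink.version}</version>
</dependency>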
import com.atguigu.bean.WaterSensor;
import org.apache.flink.api.common.functions.RuntimeContext;
import org.apache.flink.api.java.functions.KeySelector;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.connectors.elasticsearch.ElasticsearchSinkFunction;
import org.apache.flink.streaming.connectors.elasticsearch.RequestIndexer;
import org.apache.flink.streaming.connectors.elasticsearch6.ElasticsearchSink;
import org.apache.flink.streaming.connectors.redis.RedisSink;
import org.apache.flink.streaming.connectors.redis.common.config.FlinkJedisPoolConfig;
import org.apache.flink.streaming.connectors.redis.common.mapper.RedisCommand;
import org.apache.flink.streaming.connectors.redis.common.mapper.RedisCommandDescription;
import org.apache.flink.streaming.connectors.redis.common.mapper.RedisMapper;
import org.apache.http.HttpHost;
import org.elasticsearch.action.index.IndexRequest;
import org.elasticsearch.client.Requests;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
public class Flink21_Sink_ES {
public static void main(String[] args) throws Exception {
// 0.创建执行环境
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
env.setParallelism(1);
// 1.从文件读取数据
DataStreamSource<String> inputDS = env
.readTextFile("input/sensor-data.log");
// .socketTextStream("localhost", 9999);
//TODO sink the data to Elasticsearch
List<HttpHost> httpHosts = new ArrayList<>();
httpHosts.add(new HttpHost("hadoop102",9200));
httpHosts.add(new HttpHost("hadoop103",9200));
httpHosts.add(new HttpHost("hadoop104",9200));
ElasticsearchSink<String> esSink = new ElasticsearchSink.Builder<String>(
httpHosts,
new ElasticsearchSinkFunction<String>() {
@Override
public void process(String element, RuntimeContext ctx, RequestIndexer indexer) {
// Put the data into a Map
Map<String, String> dataMap = new HashMap<>();
dataMap.put("data", element);
// Create the IndexRequest => specify the index, type and source
IndexRequest indexRequest = Requests.indexRequest("sensor0421").type("reading").source(dataMap);
// Add it to the RequestIndexer
indexer.add(indexRequest);
}
}
)
.build();
inputDS.addSink(esSink);
env.execute();
}
}
4) Custom Sink
If Flink does not ship a connector we can use directly, how do we write the data to our own storage system? No problem: Flink lets you define a custom sink, so you decide how the data is stored.
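The custom MySQL sink below uses plain JDBC, so the MySQL driver has to be on the classpath; a typical dependency (the version is an example, match it to your MySQL server):
<dependency>
<groupId>mysql</groupId>
<artifactId>mysql-connector-java</artifactId>
<version>5.1.49</version>
</dependency>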
import com.atguigu.bean.WaterSensor;
import org.apache.flink.api.common.functions.RuntimeContext;
import org.apache.flink.api.java.functions.KeySelector;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.sink.RichSinkFunction;
import org.apache.flink.streaming.api.functions.sink.SinkFunction;
import org.apache.flink.streaming.connectors.elasticsearch.ElasticsearchSinkFunction;
import org.apache.flink.streaming.connectors.elasticsearch.RequestIndexer;
import org.apache.flink.streaming.connectors.elasticsearch6.ElasticsearchSink;
import org.apache.http.HttpHost;
import org.elasticsearch.action.index.IndexRequest;
import org.elasticsearch.client.Requests;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.PreparedStatement;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
public class Flink22_Sink_MySQL {
public static void main(String[] args) throws Exception {
// 0.创建执行环境
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
env.setParallelism(1);
// 1.从文件读取数据
DataStreamSource<String> inputDS = env
.readTextFile("input/sensor-data.log");
// .socketTextStream("localhost", 9999);
//TODO sink the data to MySQL with a custom sink
inputDS.addSink(
new RichSinkFunction<String>() {
private Connection conn = null;
private PreparedStatement pstmt = null;
@Override
public void open(Configuration parameters) throws Exception {
conn = DriverManager.getConnection("jdbc:mysql://hadoop102:3306/test", "root", "000000");
pstmt = conn.prepareStatement("INSERT INTO sensor VALUES (?,?,?)");
}
@Override
public void close() throws Exception {
pstmt.close();
conn.close();
}
@Override
public void invoke(String value, Context context) throws Exception {
String[] datas = value.split(",");
pstmt.setString(1, datas[0]);
pstmt.setLong(2, Long.valueOf(datas[1]));
pstmt.setInt(3, Integer.valueOf(datas[2]));
pstmt.execute();
}
}
);
env.execute();
}
}
6、Hands-on Cases
1) Website traffic statistics based on tracking-log data
//Read the log data and convert it into a JavaBean for easier handling
public class UserBehavior {
private Long userId; //user id
private Long itemId; //item id
private Integer categoryId; //item category id
private String behavior; //behavior type
private Long timestamp; //timestamp
public Long getUserId() {
return userId;
}
public void setUserId(Long userId) {
this.userId = userId;
}
public Long getItemId() {
return itemId;
}
public void setItemId(Long itemId) {
this.itemId = itemId;
}
public Integer getCategoryId() {
return categoryId;
}
public void setCategoryId(Integer categoryId) {
this.categoryId = categoryId;
}
public String getBehavior() {
return behavior;
}
public void setBehavior(String behavior) {
this.behavior = behavior;
}
public Long getTimestamp() {
return timestamp;
}
public void setTimestamp(Long timestamp) {
this.timestamp = timestamp;
}
public UserBehavior(Long userId, Long itemId, Integer categoryId, String behavior, Long timestamp) {
this.userId = userId;
this.itemId = itemId;
this.categoryId = categoryId;
this.behavior = behavior;
this.timestamp = timestamp;
}
@Override
public String toString() {
return "UserBehavior{" +
"userId=" + userId +
", itemId=" + itemId +
", categoryId=" + categoryId +
", behavior='" + behavior + '\'' +
", timestamp=" + timestamp +
'}';
}
}
Ø Total page views (PV)
The simplest metric for measuring website traffic is the page view count (Page View, PV). Every time a user opens a page, one PV is recorded; opening the same page repeatedly accumulates more views. In general PV is proportional to the number of visitors, but it does not directly determine the real number of visitors: a single visitor can produce a very high PV simply by refreshing the page. Next we use the Flink operators learned earlier to implement the PV statistics.
Following the WordCount approach, implement the PV statistics.
import bean.UserBehavior;
import org.apache.flink.api.common.functions.FilterFunction;
import org.apache.flink.api.common.functions.MapFunction;
import org.apache.flink.api.java.tuple.Tuple;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.datastream.KeyedStream;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
public class Case_PV {
public static void main(String[] args) throws Exception {
//创建执行环境
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment().setParallelism(1);
//读取数据
DataStreamSource<String> inputDs = env.readTextFile("input/UserBehavior.csv");
//将读到的数据转换成bean对象
SingleOutputStreamOperator<UserBehavior> userBehaviorDs = inputDs.map(new MapFunction<String, UserBehavior>() {
@Override
public UserBehavior map(String value) throws Exception {
String[] datas = value.split(",");
return new UserBehavior(
Long.valueOf(datas[0]),
Long.valueOf(datas[1]),
Integer.valueOf(datas[2]),
datas[3],
Long.valueOf(datas[4])
);
}
});
//参考wordcount思路实现PV的统计
//处理数据
//1、过滤出pv行为
SingleOutputStreamOperator<UserBehavior> userBehaviorDsfilter = userBehaviorDs.filter(new FilterFunction<UserBehavior>() {
@Override
public boolean filter(UserBehavior value) throws Exception {
return "pv".equals(value.getBehavior());
}
});
//2、转换数据结构
SingleOutputStreamOperator<Tuple2<String, Integer>> pvAndTuple2 = userBehaviorDsfilter.map(new MapFunction<UserBehavior, Tuple2<String, Integer>>() {
@Override
public Tuple2<String, Integer> map(UserBehavior value) throws Exception {
return Tuple2.of("pv", 1);
}
});
//3、按照第一个位置的元素进行分组
KeyedStream<Tuple2<String, Integer>, Tuple> pvAndOneKs = pvAndTuple2.keyBy(0);
//4、求和
SingleOutputStreamOperator<Tuple2<String, Integer>> pvDs = pvAndOneKs.sum(1);
//打印
pvDs.print("pv");
env.execute();
}
}
Implementation using the process operator
import bean.UserBehavior;
import org.apache.flink.api.common.functions.MapFunction;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.datastream.KeyedStream;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.KeyedProcessFunction;
import org.apache.flink.util.Collector;
public class Case_PV2 {
public static void main(String[] args) throws Exception {
//创建执行环境
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment().setParallelism(1);
//获取数据
DataStreamSource<String> inputDs = env.readTextFile("input/UserBehavior.csv");
//转换数据结构
//
SingleOutputStreamOperator<UserBehavior> userBeavirDs = inputDs.map((MapFunction<String, UserBehavior>) value -> {
String[] datas = value.split(",");
return new UserBehavior(
Long.valueOf(datas[0]),
Long.valueOf(datas[1]),
Integer.valueOf(datas[2]),
datas[3],
Long.valueOf(datas[4])
);
});
//过滤数据
SingleOutputStreamOperator<UserBehavior> userBeavirfilter = userBeavirDs.filter(data -> "pv".equals(data.getBehavior()));
//按照维度进行分组
KeyedStream<UserBehavior, String> userBehaviorKs = userBeavirfilter.keyBy(data -> data.getBehavior());
//求和
SingleOutputStreamOperator<Long> resultDS = userBehaviorKs.process(new KeyedProcessFunction<String, UserBehavior, Long>() {
//Counter that keeps the number of records seen so far
private Long pvcount = 0L;
/**
* Called once per element
* @param value
* @param ctx
* @param out
* @throws Exception
*/
@Override
public void processElement(UserBehavior value, Context ctx, Collector<Long> out) throws Exception {
pvcount++;
out.collect(pvcount);
}
});
resultDS.print("pv");
env.execute();
}
}
Implementation using flatMap
import com.atguigu.bean.UserBehavior;
import org.apache.flink.api.common.functions.FlatMapFunction;
import org.apache.flink.api.common.functions.MapFunction;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.streaming.api.datastream.KeyedStream;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.KeyedProcessFunction;
import org.apache.flink.util.Collector;
/**
* TODO
*
* @author cjp
* @version 1.0
* @date 2020/9/16 15:29
*/
public class Flink25_Case_PVByFlatmap {
public static void main(String[] args) throws Exception {
// 0.创建执行环境
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
env.setParallelism(1);
// 1.从文件读取数据、转换成 bean对象
env
.readTextFile("input/UserBehavior.csv")
.flatMap(new FlatMapFunction<String, Tuple2<String, Integer>>() {
@Override
public void flatMap(String value, Collector<Tuple2<String, Integer>> out) throws Exception {
String[] datas = value.split(",");
if ("pv".equals(datas[3])) {
out.collect(Tuple2.of("pv", 1));
}
}
})
.keyBy(0)
.sum(1)
.print("pv by flatmap");
env.execute();
}
}
Ø Unique visitors (UV)
In the previous case we counted every browsing action of every user, so the same user's visits were counted repeatedly. In practice we often also want to know how many distinct users visited the site, so another important traffic metric is the number of unique visitors (Unique Visitor, UV).
import com.atguigu.bean.UserBehavior;
import org.apache.flink.api.common.functions.MapFunction;
import org.apache.flink.api.java.tuple.Tuple;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.streaming.api.datastream.KeyedStream;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.KeyedProcessFunction;
import org.apache.flink.util.Collector;
import java.util.HashSet;
import java.util.Set;
public class Flink26_Case_UV {
public static void main(String[] args) throws Exception {
// 0.创建执行环境
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
env.setParallelism(1);
// 1.从文件读取数据、转换成 bean对象
SingleOutputStreamOperator<UserBehavior> userBehaviorDS = env
.readTextFile("input/UserBehavior.csv")
.map(new MapFunction<String, UserBehavior>() {
@Override
public UserBehavior map(String value) throws Exception {
String[] datas = value.split(",");
return new UserBehavior(
Long.valueOf(datas[0]),
Long.valueOf(datas[1]),
Integer.valueOf(datas[2]),
datas[3],
Long.valueOf(datas[4])
);
}
});
// TODO UV statistics: deduplicate by userId, then count
// => store the userIds in a Set
// => the number of elements in the Set is the UV value
// 2. Process the data
// 2.1 Filter out the pv behavior => UV is deduplicated PV, so the behavior is still pv
SingleOutputStreamOperator<UserBehavior> userBehaviorFilter = userBehaviorDS.filter(data -> "pv".equals(data.getBehavior()));
// 2.2 Convert to a tuple ("uv", userId)
// => the first field is "uv" so we can group, and grouping lets us call sum, process, etc.
// => the second field is userId so it can be stored in the Set
// => here we only need the userId; the item, category and so on are not needed
SingleOutputStreamOperator<Tuple2<String, Long>> uvTuple2 = userBehaviorFilter.map(new MapFunction<UserBehavior, Tuple2<String, Long>>() {
@Override
public Tuple2<String, Long> map(UserBehavior value) throws Exception {
return Tuple2.of("uv", value.getUserId());
}
});
// 2.3 按照 uv 分组
KeyedStream<Tuple2<String, Long>, String> uvKS = uvTuple2.keyBy(data -> data.f0);
// 2.4 使用 process 处理
SingleOutputStreamOperator<Integer> uvDS = uvKS.process(
new KeyedProcessFunction<String, Tuple2<String, Long>, Integer>() {
// 定义一个Set,用来去重并存放 userId
private Set<Long> uvSet = new HashSet<>();
/**
* 来一条数据处理一条
* @param value
* @param ctx
* @param out
* @throws Exception
*/
@Override
public void processElement(Tuple2<String, Long> value, Context ctx, Collector<Integer> out) throws Exception {
// 来一条数据,就把 userId存到 Set中
uvSet.add(value.f1);
// 通过采集器,往下游发送 uv值 => Set中元素的个数,就是 UV值
out.collect(uvSet.size());
}
}
);
uvDS.print("uv");
env.execute();
}
}
2) Marketing metrics analysis
With the spread of smartphones, more and more users of e-commerce sites come from mobile devices. Compared with logging in through a traditional browser, mobile apps have become the preferred way for users to access e-commerce sites. E-commerce companies generally promote their app through various channels, and the statistics from those channels (for example, clicks on ad links on different sites, app download counts) become important marketing metrics.
public class MarketingUserBehavior {
private Long userId;//user id
private String behavior;//behavior: download, install, update, uninstall
private String channel;//channel: Xiaomi, Huawei, OPPO, VIVO
private Long timestamp;//timestamp
public Long getUserId() {
return userId;
}
public void setUserId(Long userId) {
this.userId = userId;
}
public String getBehavior() {
return behavior;
}
public void setBehavior(String behavior) {
this.behavior = behavior;
}
public String getChannel() {
return channel;
}
public void setChannel(String channel) {
this.channel = channel;
}
public Long getTimestamp() {
return timestamp;
}
public void setTimestamp(Long timestamp) {
this.timestamp = timestamp;
}
public MarketingUserBehavior() {
}
public MarketingUserBehavior(Long userId, String behavior, String channel, Long timestamp) {
this.userId = userId;
this.behavior = behavior;
this.channel = channel;
this.timestamp = timestamp;
}
@Override
public String toString() {
return "MarketingUserBehavior{" +
"userId=" + userId +
", behavior='" + behavior + '\'' +
", channel='" + channel + '\'' +
", timestamp=" + timestamp +
'}';
}
}
import bean.MarketingUserBehavior;
import org.apache.flink.api.common.functions.MapFunction;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.datastream.KeyedStream;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.source.SourceFunction;
import java.util.Arrays;
import java.util.List;
import java.util.Random;
public class APPMarketingAnalysis {
public static void main(String[] args) throws Exception {
//创建执行环境
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment().setParallelism(1);
//获取数据
DataStreamSource<MarketingUserBehavior> inputDs = env.addSource(new MysourceFunction());
//Process the data
//Group by the statistics dimension: channel and behavior
SingleOutputStreamOperator<Tuple2<String, Integer>> chanalAndBehaviorTuple2 = inputDs.map(new MapFunction<MarketingUserBehavior, Tuple2<String, Integer>>() {
@Override
public Tuple2<String, Integer> map(MarketingUserBehavior value) throws Exception {
return Tuple2.of(value.getChannel() + "_" + value.getBehavior(), 1);
}
});
KeyedStream<Tuple2<String, Integer>, String> chanalAndBehaviorKs = chanalAndBehaviorTuple2.keyBy(data -> data.f0);
SingleOutputStreamOperator<Tuple2<String, Integer>> resultDs = chanalAndBehaviorKs.sum(1);
resultDs.print();
env.execute();
}
//自定义数据源
public static class MysourceFunction implements SourceFunction<MarketingUserBehavior> {
private boolean flag = true;
private List<String> behaviorList = Arrays.asList("DOWNLOAD","INSTALL","UPDATE","UNINSTALL");
private List<String> channelList = Arrays.asList("Huawei","XioaMi","OPPO","VIVO");
@Override
public void run(SourceContext<MarketingUserBehavior> ctx) throws Exception {
Random random = new Random();
while(flag){
ctx.collect(
new MarketingUserBehavior(
Long.valueOf(random.nextInt(10)),
behaviorList.get(random.nextInt(behaviorList.size())),
channelList.get(random.nextInt(channelList.size())),
System.currentTimeMillis()
)
);
Thread.sleep(1000L);
}
}
@Override
public void cancel() {
flag = false;
}
}
}
import com.atguigu.bean.MarketingUserBehavior;
import org.apache.flink.api.common.functions.MapFunction;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.datastream.KeyedStream;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.source.SourceFunction;
import java.util.Arrays;
import java.util.List;
import java.util.Random;
public class Flink28_Case_APPMarketingAnalysisWithoutChannel {
public static void main(String[] args) throws Exception {
// 0 执行环境
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
env.setParallelism(1);
// 1.读取数据、转换成 bean对象
DataStreamSource<MarketingUserBehavior> appDS = env.addSource(new AppSource());
// 2. Process the data: statistics per behavior (regardless of channel)
// 2.1 Group by the statistics dimension: behavior
// 2.1.1 Convert to a (behavior, 1) tuple
SingleOutputStreamOperator<Tuple2<String, Integer>> channelAndBehaviorTuple2 = appDS.map(new MapFunction<MarketingUserBehavior, Tuple2<String, Integer>>() {
@Override
public Tuple2<String, Integer> map(MarketingUserBehavior value) throws Exception {
return Tuple2.of(value.getBehavior(), 1);
}
});
// 2.1.2 按照 行为 分组
KeyedStream<Tuple2<String, Integer>, String> channelAndBehaviorKS = channelAndBehaviorTuple2.keyBy(data -> data.f0);
// 2.2 求和
SingleOutputStreamOperator<Tuple2<String, Integer>> resultDS = channelAndBehaviorKS.sum(1);
// 3. 输出:打印
resultDS.print("app marketing analysis by behavior");
env.execute();
}
public static class AppSource implements SourceFunction<MarketingUserBehavior> {
private boolean flag = true;
private List<String> behaviorList = Arrays.asList("DOWNLOAD", "INSTALL", "UPDATE", "UNINSTALL");
private List<String> channelList = Arrays.asList("XIAOMI", "HUAWEI", "OPPO", "VIVO");
@Override
public void run(SourceContext<MarketingUserBehavior> ctx) throws Exception {
while (flag) {
Random random = new Random();
ctx.collect(
new MarketingUserBehavior(
Long.valueOf(random.nextInt(10)),
behaviorList.get(random.nextInt(behaviorList.size())),
channelList.get(random.nextInt(channelList.size())),
System.currentTimeMillis()
)
);
Thread.sleep(1000L);
}
}
@Override
public void cancel() {
flag = false;
}
}
}
3) Page advertisement analysis
Among the marketing metrics of an e-commerce site, besides promoting its own app, advertisement placement on pages (both for its own products and for other sites) also matters, so advertisement-related statistics are another important marketing metric.
For advertisements, the simplest and most important statistic is the page ad click count. Sites often use ad click counts to set pricing strategies and adjust promotion methods, and can also use them to learn about user preferences. More concretely, we can break the data down by the users' geographic location and derive the ad preferences of users in different provinces, which helps with precisely targeted ad placement.
public class AdClickLog {
private Long userId;//user id
private Long adId;//ad id
private String province;//province
private String city;//city
private Long timestamp;//timestamp
public AdClickLog() {
}
public AdClickLog(Long userId, Long adId, String province, String city, Long timestamp) {
this.userId = userId;
this.adId = adId;
this.province = province;
this.city = city;
this.timestamp = timestamp;
}
public Long getUserId() {
return userId;
}
public void setUserId(Long userId) {
this.userId = userId;
}
public Long getAdId() {
return adId;
}
public void setAdId(Long adId) {
this.adId = adId;
}
public String getProvince() {
return province;
}
public void setProvince(String province) {
this.province = province;
}
public String getCity() {
return city;
}
public void setCity(String city) {
this.city = city;
}
public Long getTimestamp() {
return timestamp;
}
public void setTimestamp(Long timestamp) {
this.timestamp = timestamp;
}
@Override
public String toString() {
return "AdClickLog{" +
"userId=" + userId +
", adId=" + adId +
", province='" + province + '\'' +
", city='" + city + '\'' +
", timestamp=" + timestamp +
'}';
}
}
import bean.AdClickLog;
import org.apache.flink.api.common.functions.MapFunction;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
public class Case_AdClicAnalysis {
public static void main(String[] args) throws Exception {
//执行环境
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment().setParallelism(1);
//Read the data and convert it into a bean object
SingleOutputStreamOperator<AdClickLog> adClickDs =
env
.readTextFile("backup_AdClickLog.csv")
.map(new MapFunction<String, AdClickLog>() {
@Override
public AdClickLog map(String value) throws Exception {
String[] datas = value.split(",");
return new AdClickLog(
Long.valueOf(datas[0]),
Long.valueOf(datas[1]),
datas[2],
datas[3],
Long.valueOf(datas[4])
);
}
});
//Process the data: real-time click counts per province and per ad
//Group by the statistics dimension: province and ad
SingleOutputStreamOperator<Tuple2<String, Integer>> resultDs = adClickDs
.map(new MapFunction<AdClickLog, Tuple2<String, Integer>>() {
@Override
public Tuple2<String, Integer> map(AdClickLog value) throws Exception {
return Tuple2.of(value.getProvince() + "_" + value.getAdId(), 1);
}
})
.keyBy(data -> data.f0)
.sum(1);
resultDs.print("ad");
env.execute();
}
}
4) Real-time order payment monitoring
On an e-commerce site, order payment, as the step directly tied to revenue, is a crucial part of the business flow. To keep the business flow correct, and also to increase users' willingness to pay, sites usually set a payment timeout: orders not paid within a certain period are cancelled. In addition, we should verify that the user's payment is correct, which can be done by reconciling against the transaction data of the third-party payment platform in real time.
Ø Matching order and transaction events from two streams
For an order payment event, the user finishing the payment is not the end of the story: we still have to confirm that the money arrived in the platform's account. That information usually comes from different logs, so we read two streams and merge them for processing.
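The OrderEvent and TxEvent beans used below are not listed in this section. The sketch below is consistent with how the code parses OrderLog.csv and ReceiptLog.csv and with the getters it calls (getOrderId(), getTxId()); the remaining field names are assumptions:
// Sketch of the two beans (each class would live in its own file under com.atguigu.bean)
public class OrderEvent {
    private Long orderId;      // order id
    private String eventType;  // event type, e.g. create / pay (field name assumed)
    private String txId;       // transaction id used for matching
    private Long eventTime;    // timestamp (field name assumed)
    public OrderEvent() { }
    public OrderEvent(Long orderId, String eventType, String txId, Long eventTime) {
        this.orderId = orderId;
        this.eventType = eventType;
        this.txId = txId;
        this.eventTime = eventTime;
    }
    public Long getOrderId() { return orderId; }
    public String getTxId() { return txId; }
    // remaining getters/setters omitted
}
public class TxEvent {
    private String txId;       // transaction id used for matching
    private String payChannel; // payment channel (field name assumed)
    private Long eventTime;    // timestamp (field name assumed)
    public TxEvent() { }
    public TxEvent(String txId, String payChannel, Long eventTime) {
        this.txId = txId;
        this.payChannel = payChannel;
        this.eventTime = eventTime;
    }
    public String getTxId() { return txId; }
    // remaining getters/setters omitted
}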
import com.atguigu.bean.AdClickLog;
import com.atguigu.bean.OrderEvent;
import com.atguigu.bean.TxEvent;
import org.apache.flink.api.common.functions.MapFunction;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.streaming.api.datastream.ConnectedStreams;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.co.CoProcessFunction;
import org.apache.flink.util.Collector;
import java.util.HashMap;
import java.util.Map;
public class Flink30_Case_OrderTxDetect {
public static void main(String[] args) throws Exception {
// 0 执行环境
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
env.setParallelism(2);
// 1.读取数据,转成bean对象
SingleOutputStreamOperator<OrderEvent> orderDS = env
.readTextFile("input/OrderLog.csv")
.map(new MapFunction<String, OrderEvent>() {
@Override
public OrderEvent map(String value) throws Exception {
String[] datas = value.split(",");
return new OrderEvent(
Long.valueOf(datas[0]),
datas[1],
datas[2],
Long.valueOf(datas[3])
);
}
});
SingleOutputStreamOperator<TxEvent> txDS = env
.readTextFile("input/ReceiptLog.csv")
.map(new MapFunction<String, TxEvent>() {
@Override
public TxEvent map(String value) throws Exception {
String[] datas = value.split(",");
return new TxEvent(
datas[0],
datas[1],
Long.valueOf(datas[2])
);
}
});
// 2. Process the data: real-time reconciliation monitoring
// connect the two streams and match on txId; a match means the reconciliation succeeded
// For the same order, it is not deterministic whether the business-system or the transaction-system record arrives first
// TODO when two streams are connected, keyBy is usually applied so that the records to be matched end up together
// You can keyBy first and then connect, or connect first and then keyBy
ConnectedStreams<OrderEvent, TxEvent> orderTxCS = (orderDS.keyBy(order -> order.getTxId()))
.connect(txDS.keyBy(tx -> tx.getTxId()));
// Key by txId so that records with the same txId end up together
// ConnectedStreams<OrderEvent, TxEvent> orderTxKS = orderTxCS.keyBy(
// order -> order.getTxId(),
// tx -> tx.getTxId());
// 使用process进行处理
SingleOutputStreamOperator<String> resultDS = orderTxCS.process(
new CoProcessFunction<OrderEvent, TxEvent, String>() {
// Holds the transaction-system records
private Map<String, TxEvent> txMap = new HashMap<>();
// Holds the business-system (order) records
private Map<String, OrderEvent> orderMap = new HashMap<>();
/**
* Handles the business-system (order) records, one element at a time
* @param value
* @param ctx
* @param out
* @throws Exception
*/
@Override
public void processElement1(OrderEvent value, Context ctx, Collector<String> out) throws Exception {
// Entering this method means the incoming record is a business-system record
// Has the matching transaction record arrived yet?
// Look up the saved transaction record by txId => if it is not null, the transaction record has already arrived and we have a match
TxEvent txEvent = txMap.get(value.getTxId());
if (txEvent == null) {
// 1. The transaction record has not arrived => wait, and save this record temporarily
orderMap.put(value.getTxId(), value);
} else {
// 2. The transaction record has arrived => reconciliation succeeded
out.collect("Order " + value.getOrderId() + " reconciled successfully");
// Reconciliation succeeded, remove the saved transaction record
txMap.remove(value.getTxId());
}
}
/**
* Handles the transaction-system records, one element at a time
* @param value
* @param ctx
* @param out
* @throws Exception
*/
@Override
public void processElement2(TxEvent value, Context ctx, Collector<String> out) throws Exception {
// Entering this method means the incoming record is a transaction-system record
// Has the matching business (order) record arrived yet?
OrderEvent orderEvent = orderMap.get(value.getTxId());
if (orderEvent == null) {
// 1. The business record has not arrived => save this record temporarily
txMap.put(value.getTxId(), value);
} else {
// 2. The business record has arrived => reconciliation succeeded
out.collect("Order " + orderEvent.getOrderId() + " reconciled successfully");
// Reconciliation succeeded, remove the saved business record
orderMap.remove(value.getTxId());
}
}
}
);
resultDS.print();
env.execute();
}
}