1. Basic data source
import org.apache.flink.api.common.functions.MapFunction;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import java.util.ArrayList;
import java.util.List;

public class BasicDataStream {
    public static void main(String[] args) throws Exception {
        // Get the Flink streaming execution environment
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        // Prepare some test data
        List<Integer> data = new ArrayList<Integer>();
        data.add(100);
        data.add(101);
        data.add(102);
        data.add(103);
        data.add(104);
        data.add(105);
        data.add(106);
        data.add(107);
        data.add(108);
        data.add(109);
        data.add(110);
        DataStream<Integer> source = env.fromCollection(data).setParallelism(1);
        source.map(new MapFunction<Integer, String>() {
            public String map(Integer integer) throws Exception {
                return "Processed value: " + (integer + 1);
            }
        }).setParallelism(1).print().setParallelism(1);
        env.execute("BasicDataStream");
    }
}
In this small test we set the parallelism of both the map operator and the print sink to 1 so that the whole pipeline runs as a single task and the output comes out in order; with several parallel subtasks the execution order differs between them and the printed lines become interleaved and hard to read.
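A variant not used in the code above is to set a default parallelism of 1 once on the environment, so that every operator runs as a single task unless it is overridden per operator; a minimal sketch:

// Alternative: one default parallelism for the whole job instead of per-operator setParallelism(1) calls
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
env.setParallelism(1);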
2. Non-parallel data source
The data source
import org.apache.flink.streaming.api.functions.source.SourceFunction;

public class MyNoParalleSource implements SourceFunction<Integer> {
    // volatile so that cancel(), called from another thread, is visible to run()
    private volatile boolean isRunning = true;
    private Integer counter = 0;

    // Produces the data: emit one number per second
    public void run(SourceContext<Integer> sourceContext) throws Exception {
        while (isRunning) {
            // Emit the current value
            sourceContext.collect(counter);
            counter++;
            Thread.sleep(1000);
        }
    }

    // Called when the job is cancelled; stops the source from producing more data
    public void cancel() {
        isRunning = false;
    }
}
Running the job
import org.apache.flink.api.common.functions.MapFunction;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;

public class MyNoParalleSourceTestDemo {
    public static void main(String[] args) throws Exception {
        // Get the streaming execution environment
        StreamExecutionEnvironment senv = StreamExecutionEnvironment.getExecutionEnvironment();
        // Add the custom source
        DataStream<Integer> source = senv.addSource(new MyNoParalleSource());
        source.map(new MapFunction<Integer, String>() {
            public String map(Integer integer) throws Exception {
                return "Received from the source: " + integer;
            }
        }).print();
        // Run the job
        senv.execute("MyNoParalleSourceTestDemo");
    }
}
However, this source cannot run with a parallelism greater than 1; if we set one when adding the source, the job fails:
DataStream<Integer> source=senv.addSource(new MyNoParalleSource()).setParallelism(2);
With that setting, the following error is thrown:
Exception in thread "main" java.lang.IllegalArgumentException: Source: 1 is not a parallel source
at org.apache.flink.streaming.api.datastream.DataStreamSource.setParallelism(DataStreamSource.java:55)
at com.kanxyz.stream.MyNoParalleSourceTestDemo.main(MyNoParalleSourceTestDemo.java:12)
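Explicitly setting the parallelism to 1, by contrast, should still be accepted, because only a value greater than 1 conflicts with a non-parallel source; a minimal sketch:

// Allowed: parallelism 1 matches what a non-parallel source supports
DataStream<Integer> source = senv.addSource(new MyNoParalleSource()).setParallelism(1);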
The exception says the source is not a parallel source, so several parallel subtasks cannot produce data for it. Next we write a source that can produce data in parallel.
3. Parallel data source
The parallel source
import org.apache.flink.streaming.api.functions.source.ParallelSourceFunction;

public class MyParalleSource implements ParallelSourceFunction<Integer> {
    // volatile so that cancel(), called from another thread, is visible to run()
    private volatile boolean isRunning = true;
    private Integer counter = 0;

    // Each parallel instance runs this loop and emits one number per second
    public void run(SourceContext<Integer> sourceContext) throws Exception {
        while (isRunning) {
            sourceContext.collect(counter);
            counter++;
            Thread.sleep(1000);
        }
    }

    public void cancel() {
        isRunning = false;
    }
}
The job using the parallel source
import org.apache.flink.api.common.functions.MapFunction;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;

public class MyParalleSourceTestDemo {
    public static void main(String[] args) throws Exception {
        // Get the streaming execution environment
        StreamExecutionEnvironment senv = StreamExecutionEnvironment.getExecutionEnvironment();
        // Add the custom source with a parallelism of 2
        DataStream<Integer> source = senv.addSource(new MyParalleSource()).setParallelism(2);
        source.map(new MapFunction<Integer, String>() {
            public String map(Integer integer) throws Exception {
                return "Received from the parallel source: " + integer;
            }
        }).print().setParallelism(1);
        // Start the computation
        senv.execute("MyParalleSourceTestDemo");
    }
}
Now look at the printed output: because the source runs with a parallelism of 2, two parallel source instances each run their own counter, so every number shows up twice.
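If each parallel instance should emit data that can be told apart, one option (a sketch, not part of the original example) is to extend RichParallelSourceFunction and tag every record with the subtask index:

import org.apache.flink.streaming.api.functions.source.RichParallelSourceFunction;

public class MyRichParalleSource extends RichParallelSourceFunction<String> {
    private volatile boolean isRunning = true;
    private int counter = 0;

    public void run(SourceContext<String> sourceContext) throws Exception {
        // Index of this parallel instance, e.g. 0 or 1 when the parallelism is 2
        int subtask = getRuntimeContext().getIndexOfThisSubtask();
        while (isRunning) {
            sourceContext.collect("subtask-" + subtask + " produced: " + counter);
            counter++;
            Thread.sleep(1000);
        }
    }

    public void cancel() {
        isRunning = false;
    }
}

Adding this source with setParallelism(2) would then show which of the two subtasks produced each record.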
4. Kafka as a data source: Flink consumes data from Kafka
First we need to add one more dependency:
<dependency>
    <groupId>org.apache.flink</groupId>
    <artifactId>flink-connector-kafka_2.11</artifactId>
    <version>1.7.2</version>
</dependency>
The Flink job
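A minimal sketch of a streaming job that consumes the topic is shown first; the class name KafkaSourceDemo and the consumer group id are made up for illustration, while the topic mytopic1 and the broker addresses match the producer command at the end of this section:

import java.util.Properties;
import org.apache.flink.api.common.serialization.SimpleStringSchema;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.connectors.kafka.FlinkKafkaConsumer;

public class KafkaSourceDemo {
    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment senv = StreamExecutionEnvironment.getExecutionEnvironment();
        // Kafka connection settings; the group id here is only an example value
        Properties props = new Properties();
        props.setProperty("bootstrap.servers", "192.168.112.111:9092,192.168.112.111:9093");
        props.setProperty("group.id", "flink-demo-group");
        // Consume the topic as plain strings
        DataStream<String> source = senv.addSource(
                new FlinkKafkaConsumer<String>("mytopic1", new SimpleStringSchema(), props));
        source.print();
        senv.execute("KafkaSourceDemo");
    }
}

Every line typed into the console producer started below should then appear in this job's output.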
import org.apache.flink.api.common.functions.JoinFunction;
import org.apache.flink.api.java.DataSet;
import org.apache.flink.api.java.ExecutionEnvironment;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.api.java.tuple.Tuple3;
import java.util.ArrayList;
import java.util.List;

public class FlinkDemo2 {
    public static void main(String[] args) throws Exception {
        // Get the Flink batch execution environment
        ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
        // Prepare data set 1: (id, name)
        List<Tuple2<Integer, String>> data1 = new ArrayList<Tuple2<Integer, String>>();
        data1.add(new Tuple2<>(1, "Tom"));
        data1.add(new Tuple2<>(2, "Lily"));
        data1.add(new Tuple2<>(3, "HanMeimei"));
        data1.add(new Tuple2<>(4, "Json"));
        // Prepare data set 2: (id, city)
        List<Tuple2<Integer, String>> data2 = new ArrayList<Tuple2<Integer, String>>();
        data2.add(new Tuple2<>(1, "Beijing"));
        data2.add(new Tuple2<>(2, "Shanghai"));
        data2.add(new Tuple2<>(3, "Hangzhou"));
        data2.add(new Tuple2<>(4, "Chongqin"));
        // Build Flink DataSets from the two lists
        DataSet<Tuple2<Integer, String>> table1 = env.fromCollection(data1);
        DataSet<Tuple2<Integer, String>> table2 = env.fromCollection(data2);
        // Join the first field of table1 with the first field of table2
        table1.join(table2).where(0).equalTo(0)
                .with(new JoinFunction<Tuple2<Integer, String>, Tuple2<Integer, String>, Tuple3<Integer, String, String>>() {
                    public Tuple3<Integer, String, String> join(Tuple2<Integer, String> left, Tuple2<Integer, String> right) throws Exception {
                        return new Tuple3<>(left.f0, left.f1, right.f1);
                    }
                }).print();
        System.out.println("**************** cross (Cartesian product) ***********************");
        table1.cross(table2).print();
        // print() on a DataSet triggers execution immediately, so no extra env.execute() call is needed
        System.out.println("**************** left outer join ***********************");
        table1.leftOuterJoin(table2).where(0).equalTo(0)
                .with(new JoinFunction<Tuple2<Integer, String>, Tuple2<Integer, String>, Tuple3<Integer, String, String>>() {
                    public Tuple3<Integer, String, String> join(Tuple2<Integer, String> left, Tuple2<Integer, String> right) throws Exception {
                        // In a left outer join the right side may be null when there is no match
                        if (right == null) {
                            return new Tuple3<>(left.f0, left.f1, "null");
                        }
                        return new Tuple3<>(left.f0, left.f1, right.f1);
                    }
                }).print();
    }
}
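With the data above, the inner join should print the four matched tuples (1,Tom,Beijing), (2,Lily,Shanghai), (3,HanMeimei,Hangzhou) and (4,Json,Chongqin), possibly in a different order; the cross prints all 4 x 4 = 16 combinations, and here the left outer join gives the same four rows as the inner join because every id has a match.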
Then we start Kafka; before starting Kafka, start ZooKeeper first.
./zkServer.sh start  // start zookeeper
./kafka-server-start.sh ../config/server0.properties  // start broker0
./kafka-server-start.sh ../config/server1.properties  // start broker1
// then open a Kafka console producer client
./kafka-console-producer.sh --broker-list 192.168.112.111:9092,192.168.112.111:9093 --topic mytopic1
// then run the Flink job directly