Flink 简单上手:Maven 项目依赖与 WordCount 案例

创建一个maven项目

maven依赖为

   <dependencies>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-java</artifactId>
            <version>1.10.1</version>
        </dependency>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-streaming-java_2.12</artifactId>
            <version>1.10.1</version>
        </dependency>
    </dependencies>

批处理WordCount

Java代码实现:

package com.test;

import org.apache.flink.api.common.functions.FlatMapFunction;
import org.apache.flink.api.java.DataSet;
import org.apache.flink.api.java.ExecutionEnvironment;
import org.apache.flink.api.java.operators.AggregateOperator;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.util.Collector;

public class wc {
    public static void main(String[] args) throws Exception {
        // 创建执行环境
        ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

        // 从文件中读取数据
        String inputPath = "D:\\IDE\\flinkTest\\src\\main\\resources\\wc.txt";
        DataSet<String> inputDataSet = env.readTextFile(inputPath);

        // 对数据集进行处理
        AggregateOperator<Tuple2<String, Integer>> sum = inputDataSet.flatMap(new myFlatMapper())
                .groupBy(0)  // 将元祖的第一个位置的字段分组
                .sum(1);// 将元祖的第二个位置求和
        sum.print();
    }
    // 自定义类实现接口
    public static class myFlatMapper implements FlatMapFunction<String, Tuple2<String ,Integer>>{
        public void flatMap(String value, Collector<Tuple2<String, Integer>> collector) throws Exception {
            // 按空格分词
            String[] words = value.split(" ");
            // 遍历所有的word,包成二元组输出
            for (String word : words) {
                collector.collect(new Tuple2<String, Integer>(word, 1));
            }
        }
    }

}

注:请事先在 resources 目录下准备好要统计的文本文件(wc.txt)。

流处理WordCount

Java代码实现

package com.test.wc;

import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;

// Streaming WordCount using Flink's DataStream API.
// Reuses the tokenizer (wc.myFlatMapper) from the batch job.
public class StreamWC {
    public static void main(String[] args) throws Exception {
        // Set up the streaming execution environment.
        StreamExecutionEnvironment environment = StreamExecutionEnvironment.getExecutionEnvironment();

        // Read the input text file as a stream of lines.
        String sourcePath = "D:\\IDE\\flinkTest\\src\\main\\resources\\wc.txt";
        DataStreamSource<String> lines = environment.readTextFile(sourcePath);

        // Split each line into (word, 1) pairs, key by word, and keep a running count.
        DataStream<Tuple2<String, Integer>> wordCounts =
                lines.flatMap(new wc.myFlatMapper())
                        .keyBy(0)
                        .sum(1);
        wordCounts.print();

        // Nothing runs until execute() is called; this submits the job.
        environment.execute();
    }
}

结果打印(因为 Flink 是有状态的流式计算,它会为每条输入输出一次当前的累计结果;每行前面的数字 1-8 是处理该条数据的并行子任务编号,并行度默认等于本机 CPU 核数)

3> (hello,1)
5> (fine,1)
6> (how,1)
5> (you,1)
5> (you,2)
3> (thank,1)
4> (me,1)
3> (hello,2)
4> (are,1)
3> (hello,3)
5> (you,3)
3> (hello,4)
3> (hello,5)
3> (python,1)
1> (scala,1)
8> (and,1)
2> (java,1)
7> (flink,1)