I. WordCount
package com.shujia.flink.core
import org.apache.flink.streaming.api.scala._
object Demo1WordCount {
def main(args: Array[String]): Unit = {
/**
 * Create the Flink execution environment
 */
val env: StreamExecutionEnvironment = StreamExecutionEnvironment.getExecutionEnvironment
/**
 * Flink execution mode
 * RuntimeExecutionMode.BATCH: batch mode; only works on bounded streams and emits the final result once
 * RuntimeExecutionMode.STREAMING: streaming mode; works on both bounded and unbounded streams and emits continuously updated results
 * (a batch-mode sketch follows this example)
 */
// env.setRuntimeMode(RuntimeExecutionMode.BATCH)
/**
 * Set the parallelism of the Flink job
 * Defaults to the number of CPU cores on the machine
 */
// env.setParallelism(2)
/**
 * Timeout for flushing buffered data from upstream to downstream
 * Defaults to 200 milliseconds
 */
// env.setBufferTimeout(200)
/**
 * Read data from a socket
 */
val socketDS: DataStream[String] = env.socketTextStream("master", 8888)
/**
 * Process the data
 */
val resultDS: DataStream[(String, Int)] = socketDS
.flatMap(_.split(","))
.map((_, 1))
.keyBy(_._1)
.sum(1)
resultDS.print()
env.execute()
}
}
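To make the two modes concrete, here is a minimal, self-contained sketch of the same word count run in BATCH mode on a bounded collection source (the object name Demo1WordCountBatch is made up for illustration). In STREAMING mode the job prints a running count such as (java,1), (java,2), ...; in BATCH mode it prints only the final count per word. Note that setRuntimeMode needs the import org.apache.flink.api.common.RuntimeExecutionMode.

import org.apache.flink.api.common.RuntimeExecutionMode
import org.apache.flink.streaming.api.scala._

// Hypothetical demo object, not part of the original project
object Demo1WordCountBatch {
  def main(args: Array[String]): Unit = {
    val env: StreamExecutionEnvironment = StreamExecutionEnvironment.getExecutionEnvironment
    // BATCH mode: only valid for bounded sources; emits the final result once
    env.setRuntimeMode(RuntimeExecutionMode.BATCH)
    env.fromCollection(List("java,spark", "java,hadoop", "java"))
      .flatMap(_.split(","))
      .map((_, 1))
      .keyBy(_._1)
      .sum(1)
      .print()
    env.execute()
  }
}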
II. Source
1. Reading data from a List collection
/**
 * Source built from a local collection - bounded stream
 *
 * When the source is bounded, the Flink job finishes once all the data has been processed.
 */
val linesDS: DataStream[String] = env.fromCollection(List("java,spark", "java,hadoop", "java"))
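For quick local tests, env.fromElements("java,spark", "java,hadoop", "java") builds the same bounded stream without constructing a List first.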
2. Reading data from a local file
/**
 * Source built from a file --- bounded stream
 */
val studentDS: DataStream[String] = env.readTextFile("data/students.txt")
3. Reading data from a socket
/**
 * Source built from a socket -- unbounded stream
 *
 * An unbounded stream can only run in streaming mode, not in batch mode.
 */
val linesDS: DataStream[String] = env.socketTextStream("master", 8888)
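For local testing, the socket is usually fed with netcat: run nc -lk 8888 on the master host before starting the job, and each line typed into netcat becomes one record of the stream.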
4. Custom Source
package com.shujia.flink.source
import org.apache.flink.streaming.api.functions.source.SourceFunction
import org.apache.flink.streaming.api.scala._
import java.sql.{Connection, DriverManager, PreparedStatement, ResultSet}
object Demo4MysqlSource {
def main(args: Array[String]): Unit = {
// Create the Flink environment
val env: StreamExecutionEnvironment = StreamExecutionEnvironment.getExecutionEnvironment
// Read data from MySQL using the custom source
val mysqlDS: DataStream[String] = env.addSource(new MysqlSource())
mysqlDS
.map(stu => (stu.split("\t")(4), 1))
.keyBy(_._1)
.sum(1)
.print()
env.execute()
}
/**
 * Custom source: implement the SourceFunction interface
 */
class MysqlSource extends SourceFunction[String] {
/**
 * run: reads data from the external system; invoked only once
 *
 * @param ctx : context object used to emit data downstream
 */
override def run(ctx: SourceFunction.SourceContext[String]): Unit = {
/**
 * Read the MySQL data via JDBC and emit each row downstream
 */
Class.forName("com.mysql.jdbc.Driver")
// Open the connection
val con: Connection = DriverManager.getConnection("jdbc:mysql://master:3306/bigdata", "root", "123456")
// Prepare the query SQL
val stat: PreparedStatement = con.prepareStatement("select * from students")
// Execute the query
val resultSet: ResultSet = stat.executeQuery()
// Parse the result set
while (resultSet.next()) {
// Fetch each column by name
val id: Long = resultSet.getLong("id")
val name: String = resultSet.getString("name")
val age: Long = resultSet.getLong("age")
val gender: String = resultSet.getString("gender")
val clazz: String = resultSet.getString("clazz")
// Emit each row downstream
ctx.collect(s"$id\t$name\t$age\t$gender\t$clazz")
}
// Close the connection
stat.close()
con.close()
}
// Called when the job is cancelled; typically used to release resources
override def cancel(): Unit = {}
}
}
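The empty cancel() is acceptable here because run() makes a single bounded pass over the table and returns. For a source that keeps polling in a loop, the usual pattern is to guard the loop with a volatile flag that cancel() flips; a sketch (the class name and poll interval are illustrative):

// Cooperative cancellation for a looping source; the polling body is elided
class PollingSource extends SourceFunction[String] {
  @volatile private var isRunning = true

  override def run(ctx: SourceFunction.SourceContext[String]): Unit = {
    while (isRunning) {
      // ... query the external system and ctx.collect(...) each row ...
      Thread.sleep(5000) // illustrative poll interval
    }
  }

  // Flink calls cancel() when the job is cancelled; flipping the flag ends the loop
  override def cancel(): Unit = {
    isRunning = false
  }
}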
III. Sink
1. Writing data to files
package com.shujia.flink.sink
import org.apache.flink.api.common.RuntimeExecutionMode
import org.apache.flink.api.common.serialization.SimpleStringEncoder
import org.apache.flink.configuration.MemorySize
import org.apache.flink.connector.file.sink.FileSink
import org.apache.flink.core.fs.Path
import org.apache.flink.streaming.api.functions.sink.filesystem.rollingpolicies.DefaultRollingPolicy
import org.apache.flink.streaming.api.scala._
import java.time.Duration
object Demo1FileSink {
def main(args: Array[String]): Unit = {
// Create the Flink environment
val env: StreamExecutionEnvironment = StreamExecutionEnvironment.getExecutionEnvironment
env.setRuntimeMode(RuntimeExecutionMode.BATCH)
// Read the data
val studentDS: DataStream[String] = env.readTextFile("data/students.txt")
// Count the number of students per class
val kvDS: DataStream[(String, Int)] = studentDS.map(stu => (stu.split(",")(4), 1))
val countDS: DataStream[(String, Int)] = kvDS.keyBy(_._1).sum(1)
// Save the result to files
// Legacy API
// countDS.writeAsText("data/flink/clazz_num")
// New API
val sink: FileSink[(String, Int)] = FileSink
.forRowFormat(new Path("data/flink/clazz_num"), new SimpleStringEncoder[(String, Int)]("UTF-8"))
.withRollingPolicy(
DefaultRollingPolicy.builder()
// Roll over after the part file has been open for at least this long
.withRolloverInterval(Duration.ofSeconds(10))
// Roll over after this long without new data
.withInactivityInterval(Duration.ofSeconds(10))
// Roll over when the part file reaches this size
.withMaxPartSize(MemorySize.ofMebiBytes(1))
.build())
.build()
// Attach the file sink
countDS.sinkTo(sink)
env.execute()
}
}
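One caveat: in STREAMING mode the row-format FileSink only finalizes part files when a checkpoint completes, so a streaming job using this sink should enable checkpointing (the 5-second interval below is just an example):

// Enable checkpointing (interval in milliseconds) so FileSink can commit part files
env.enableCheckpointing(5000)

In BATCH mode, as used here, the files are finalized when the job finishes.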
2. Custom Sink
package com.shujia.flink.sink
import org.apache.flink.configuration.Configuration
import org.apache.flink.streaming.api.functions.sink.{RichSinkFunction, SinkFunction}
import org.apache.flink.streaming.api.scala._
import java.sql.{Connection, DriverManager, PreparedStatement}
object Demo4MysqlSink {
def main(args: Array[String]): Unit = {
/**
 * Create the Flink execution environment
 */
val env: StreamExecutionEnvironment = StreamExecutionEnvironment.getExecutionEnvironment
// Read the data
val linesDS: DataStream[String] = env.readTextFile("data/students.txt")
// Process the data: count the number of students per gender
val resultDS: DataStream[(String, Int)] = linesDS.map(line => (line.split(",")(3), 1)).keyBy(_._1).sum(1)
// Save the result to MySQL
resultDS.addSink(new RichSinkFunction[(String, Int)] {
var con: Connection = _
var stat: PreparedStatement = _
/**
 * Anonymous inner class
 * Overridden methods: open, close, invoke
 * open: runs before invoke, once per task
 * typically used to initialize the database connection
 */
override def open(parameters: Configuration): Unit = {
// Register the driver
Class.forName("com.mysql.jdbc.Driver")
// Open the connection
con = DriverManager.getConnection("jdbc:mysql://master:3306/bigdata?useUnicode=true&characterEncoding=utf8&useSSL=false", "root", "123456")
// Prepare the insert SQL
// replace: inserts the row if it does not exist, otherwise overwrites it
stat = con.prepareStatement("replace into gender values(?,?)")
}
/**
 * Called when the job shuts down; typically used to release resources
 */
override def close(): Unit = {
stat.close()
con.close()
}
/**
 * invoke: called once per record
 * Uses JDBC to write the record into MySQL
 *
 * @param value   : one record
 * @param context : context object
 */
override def invoke(value: (String, Int), context: SinkFunction.Context): Unit = {
stat.setString(1, value._1)
stat.setInt(2, value._2)
// Execute the insert
stat.execute()
}
})
env.execute()
}
}
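Instead of hand-rolling the sink, the flink-connector-jdbc dependency offers JdbcSink, which handles connection setup and batched writes. A sketch of an equivalent sink, assuming the same table and credentials as above (the batch size of 100 is illustrative):

import org.apache.flink.connector.jdbc.{JdbcConnectionOptions, JdbcExecutionOptions, JdbcSink, JdbcStatementBuilder}
import java.sql.PreparedStatement

resultDS.addSink(JdbcSink.sink(
  "replace into gender values(?, ?)",
  // Fills the PreparedStatement for each record
  new JdbcStatementBuilder[(String, Int)] {
    override def accept(ps: PreparedStatement, value: (String, Int)): Unit = {
      ps.setString(1, value._1)
      ps.setInt(2, value._2)
    }
  },
  // Flush writes in batches of up to 100 records
  JdbcExecutionOptions.builder().withBatchSize(100).build(),
  new JdbcConnectionOptions.JdbcConnectionOptionsBuilder()
    .withUrl("jdbc:mysql://master:3306/bigdata?useUnicode=true&characterEncoding=utf8&useSSL=false")
    .withDriverName("com.mysql.jdbc.Driver")
    .withUsername("root")
    .withPassword("123456")
    .build()
))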
IV. From MySQL to MySQL
package com.shujia.flink.homework
import org.apache.flink.configuration.Configuration
import org.apache.flink.streaming.api.functions.sink.{RichSinkFunction, SinkFunction}
import org.apache.flink.streaming.api.functions.source.SourceFunction
import org.apache.flink.streaming.api.scala._
import java.sql.{Connection, DriverManager, PreparedStatement, ResultSet}
object Mysql2Mysql {
def main(args: Array[String]): Unit = {
/**
 * Create the Flink environment
 */
val env: StreamExecutionEnvironment = StreamExecutionEnvironment.getExecutionEnvironment
/**
 * Read data from MySQL
 */
val mysqlDS: DataStream[String] = env.addSource(new SourceFunction[String] {
override def run(sourceContext: SourceFunction.SourceContext[String]): Unit = {
// Register the driver
Class.forName("com.mysql.jdbc.Driver")
// Open the connection
val con: Connection = DriverManager.getConnection("jdbc:mysql://master:3306/student?useUnicode=true&characterEncoding=utf8&useSSL=false", "root", "123456")
// Prepare the query SQL
val stat: PreparedStatement = con.prepareStatement("select * from student")
// Execute the query
val resultSet: ResultSet = stat.executeQuery()
// Parse the result set
while (resultSet.next()) {
val id: Int = resultSet.getInt("id")
val name: String = resultSet.getString("name")
val age: Int = resultSet.getInt("age")
val gender: String = resultSet.getString("gender")
val clazz: String = resultSet.getString("clazz")
// Emit each row downstream
sourceContext.collect(s"$id\t$name\t$age\t$gender\t$clazz")
}
// Close the connection
stat.close()
con.close()
}
override def cancel(): Unit = {}
})
/**
 * Process the data: count the number of students per class
 */
val clazzDS: DataStream[(String, Int)] = mysqlDS
.map(_.split("\t")(4))
.map((_, 1))
.keyBy(_._1)
.sum(1)
// clazzDS.print()
/**
 * Save the result back to MySQL
 */
clazzDS.addSink(new RichSinkFunction[(String, Int)] {
var con: Connection = _
var stat: PreparedStatement = _
override def open(parameters: Configuration): Unit = {
// Load the driver
Class.forName("com.mysql.jdbc.Driver")
// Open the connection
con = DriverManager.getConnection("jdbc:mysql://master:3306/student?useUnicode=true&characterEncoding=UTF-8", "root", "123456")
// Prepare the insert SQL
stat = con.prepareStatement("replace into clazz_count(clazz,num) values(?,?)")
}
override def close(): Unit = {
stat.close()
con.close()
}
override def invoke(value: (String, Int), context: SinkFunction.Context): Unit = {
stat.setString(1, value._1)
stat.setInt(2, value._2)
// Execute the insert
stat.execute()
}
})
env.execute()
}
}
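Because run() issues a single SELECT and then returns, this is a bounded job: it reads the table once, writes the per-class counts, and finishes. Keeping the results continuously in sync with the source table would require a change-capturing source instead, for example the MySQL CDC connector from the flink-cdc-connectors project.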