Counting word frequencies in HDFS files with Spark Streaming
Demo02_HDFSWordCount
package cn.kgc.spark.Streaming

import org.apache.spark.SparkConf
import org.apache.spark.streaming.dstream.DStream
import org.apache.spark.streaming.{Seconds, StreamingContext}

/**
 * Created by wangchunhui on 2021/1/19 11:40
 * Step 1: run this program
 * Step 2: copy some text files into the monitored HDFS directory
 */
object Demo02_HDFSWordCount {
  def main(args: Array[String]): Unit = {
    // Boilerplate: ① create SparkConf, ② create StreamingContext; the second argument is the batch interval
    val conf: SparkConf = new SparkConf().setAppName(this.getClass.getName).setMaster("local[4]")
    val ssc = new StreamingContext(conf, Seconds(5))
    // 1. Load the data source; the single argument is the directory to monitor
    val lines: DStream[String] = ssc.textFileStream("hdfs://singleNode:9000/data/streaming-input")
    // 2. Process the data (WordCount)
    val result: DStream[(String, Int)] = lines.flatMap(_.split("\\s+")).map((_, 1)).reduceByKey(_ + _)
    // 3. Write out the result (print)
    result.print()
    // 4. Start the application
    ssc.start()
    ssc.awaitTermination()
  }
}
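Note that textFileStream only picks up files that appear in the monitored directory after the job starts, and a file should appear atomically (written elsewhere first and then moved in), otherwise a half-written file may be read. Files can be added with hdfs dfs -put; the sketch below is a hypothetical helper (the class name and staging path are made up for illustration) that does the same thing from Scala with the Hadoop FileSystem API:

package cn.kgc.spark.Streaming

import java.net.URI
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}

// Hypothetical helper, not part of the original demos: writes a file to a temporary
// HDFS location and then renames it into the watched directory, so textFileStream
// sees it appear atomically as a new file.
object FeedStreamingInput {
  def main(args: Array[String]): Unit = {
    val fs = FileSystem.get(new URI("hdfs://singleNode:9000"), new Configuration())
    val tmp = new Path("/data/streaming-tmp/words.txt")   // assumed staging path
    val dst = new Path("/data/streaming-input/words.txt") // directory monitored by Demo02
    val out = fs.create(tmp, true)
    out.write("hello spark hello streaming\n".getBytes("UTF-8"))
    out.close()
    fs.rename(tmp, dst) // HDFS rename is atomic, so the file becomes visible all at once
    fs.close()
  }
}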
Requirement: compute the cumulative word counts seen so far (stateful word count)
Demo03_StatefulWordCount
package cn.kgc.spark.Streaming

import org.apache.spark.SparkConf
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.dstream.{DStream, ReceiverInputDStream}
import org.apache.spark.streaming.{Seconds, StreamingContext}

/**
 * Created by wangchunhui on 2021/1/19 11:40
 */
object Demo03_StatefulWordCount {
  def main(args: Array[String]): Unit = {
    // Boilerplate: ① create SparkConf, ② create StreamingContext; the second argument is the batch interval
    val conf: SparkConf = new SparkConf().setAppName(this.getClass.getName).setMaster("local[4]")
    val ssc = new StreamingContext(conf, Seconds(5))
    // Set a checkpoint directory, used to persist the running state
    ssc.checkpoint("file:///E:\\ideaProjects\\SparkLearn\\data\\spark\\checkpoint")
    // 1. Load the socket source; three arguments: ① hostname or IP, ② port, ③ storage level
    val lines: ReceiverInputDStream[String] = ssc.socketTextStream("singleNode", 9999, StorageLevel.MEMORY_AND_DISK_SER_2)
    // 2. Process the data (WordCount)
    // updateStateByKey is a stateful operator that accumulates counts across batches;
    // it takes a function that merges the current batch with the previous state
    val result: DStream[(String, Int)] = lines.flatMap(_.split("\\s+"))
      .map((_, 1))
      //.updateStateByKey((x, y) => Some(x.sum + y.getOrElse(0)))
      .updateStateByKey(updateFunction)
    // 3. Write out the result (print)
    result.print()
    // 4. Start the application
    ssc.start()
    ssc.awaitTermination()
  }

  // State update function
  // Arguments: ① the values of the current batch for a key, ② the previous state for that key
  // Returns: the updated state
  def updateFunction(currentValues: Seq[Int], preValues: Option[Int]): Option[Int] = {
    val curr: Int = currentValues.sum
    val pre: Int = preValues.getOrElse(0)
    Some(curr + pre)
  }
}
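Since Spark 1.6 the same running total can also be kept with mapWithState, which only invokes the mapping function for keys that actually appear in the current batch and is usually more efficient than updateStateByKey when the key space is large. A minimal sketch, with the object name made up and the host, port, and checkpoint path taken from Demo03:

package cn.kgc.spark.Streaming

import org.apache.spark.SparkConf
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.{Seconds, State, StateSpec, StreamingContext}

// Hypothetical variant of Demo03 that uses mapWithState instead of updateStateByKey
object Demo03b_MapWithStateWordCount {
  def main(args: Array[String]): Unit = {
    val conf: SparkConf = new SparkConf().setAppName(this.getClass.getName).setMaster("local[4]")
    val ssc = new StreamingContext(conf, Seconds(5))
    // mapWithState also needs a checkpoint directory to store state
    ssc.checkpoint("file:///E:\\ideaProjects\\SparkLearn\\data\\spark\\checkpoint")

    val lines = ssc.socketTextStream("singleNode", 9999, StorageLevel.MEMORY_AND_DISK_SER_2)

    // The mapping function receives the key, the new value from the current batch (if any),
    // and the stored state; it updates the state and emits the new running total.
    val spec = StateSpec.function((word: String, one: Option[Int], state: State[Int]) => {
      val sum = one.getOrElse(0) + state.getOption.getOrElse(0)
      state.update(sum)
      (word, sum)
    })

    val result = lines.flatMap(_.split("\\s+")).map((_, 1)).mapWithState(spec)
    result.print()

    ssc.start()
    ssc.awaitTermination()
  }
}

Unlike updateStateByKey, the output of mapWithState only contains the keys seen in the current batch; the full accumulated state can be obtained with stateSnapshots() if needed.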
Integrating Spark Streaming with Spark SQL
Requirement: implement WordCount with Spark Streaming + Spark SQL
Demo04_NetWorkSQLWordCount
package cn.kgc.spark.Streaming

import org.apache.spark.SparkConf
import org.apache.spark.sql.{DataFrame, SparkSession}
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.dstream.{DStream, ReceiverInputDStream}
import org.apache.spark.streaming.{Seconds, StreamingContext}

/**
 * Created by wangchunhui on 2021/1/19 11:40
 */
object Demo04_NetWorkSQLWordCount {
  def main(args: Array[String]): Unit = {
    // Boilerplate: ① create SparkConf, ② create SparkSession (for Spark SQL),
    // ③ create StreamingContext from its SparkContext; the second argument is the batch interval
    val conf: SparkConf = new SparkConf().setAppName(this.getClass.getName).setMaster("local[4]")
    val spark: SparkSession = SparkSession.builder().config(conf).getOrCreate()
    import spark.implicits._
    val ssc = new StreamingContext(spark.sparkContext, Seconds(5))
    // 1. Load the socket source; three arguments: ① hostname or IP, ② port, ③ storage level
    val lines: ReceiverInputDStream[String] = ssc.socketTextStream("singleNode", 9999, StorageLevel.MEMORY_AND_DISK_SER_2)
    // 2. Process the data (WordCount) with Spark SQL inside foreachRDD
    val words: DStream[String] = lines.flatMap(_.split("\\s+"))
    words.foreachRDD(rdd => {
      if (rdd.count() != 0) {
        val df: DataFrame = rdd.map(x => Word(x)).toDF()
        df.createOrReplaceTempView("tb_word")
        spark.sql("select word, count(1) from tb_word group by word").show()
      }
    })
    // 3. Start the application
    ssc.start()
    ssc.awaitTermination()
  }

  // Case class used to convert the RDD into a DataFrame
  case class Word(word: String)
}
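The SQL statement inside foreachRDD can equally be expressed with the DataFrame API. The fragment below is a drop-in replacement for the temp-view and spark.sql lines above (it assumes the same imports and the Word case class from Demo04); it also uses rdd.isEmpty() instead of rdd.count() != 0, which avoids a full count just to test for emptiness:

    words.foreachRDD(rdd => {
      if (!rdd.isEmpty()) {
        val df: DataFrame = rdd.map(x => Word(x)).toDF()
        // Same aggregation as "select word, count(1) from tb_word group by word"
        df.groupBy("word").count().show()
      }
    })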
Integrating Spark Streaming with Flume
(1) The two integration modes:
push: start the Spark program first, then start the Flume agent; Flume pushes events through an Avro sink to a receiver inside the Spark application.
pull: start the Flume agent first, then start the Spark program; Spark polls events from a Flume SparkSink (see the sketch after the push example below).
(2) Code implementation
Demo05_FlumePushWordCount
package cn.kgc.spark.Streaming

import org.apache.spark.SparkConf
import org.apache.spark.streaming.dstream.{DStream, ReceiverInputDStream}
import org.apache.spark.streaming.flume.{FlumeUtils, SparkFlumeEvent}
import org.apache.spark.streaming.{Seconds, StreamingContext}

/**
 * Created by wangchunhui on 2021/1/19 11:40
 * Step 1: write the Flume agent configuration with an Avro sink
 * Step 2: add the spark-streaming-flume dependency to the pom
 * Step 3: write the Spark program, using FlumeUtils.createStream as the data source
 * Step 4: package it as a jar, upload it to the cluster, and submit with spark-submit
 * Step 5: start the Flume agent
 */
object Demo05_FlumePushWordCount {
  def main(args: Array[String]): Unit = {
    // Boilerplate: ① create SparkConf, ② create StreamingContext; the second argument is the batch interval
    val conf: SparkConf = new SparkConf().setAppName(this.getClass.getName).setMaster("local[4]")
    val ssc = new StreamingContext(conf, Seconds(5))
    // 1. Load the data source: Flume pushes Avro events to this host and port
    val flumeStream: ReceiverInputDStream[SparkFlumeEvent] = FlumeUtils.createStream(ssc, "singleNode", 9999)
    val lines: DStream[String] = flumeStream.map(x => new String(x.event.getBody.array()).trim)
    // 2. Process the data (WordCount)
    val result: DStream[(String, Int)] = lines.flatMap(_.split(" ")).map((_, 1)).reduceByKey(_ + _)
    // 3. Write out the result (print)
    result.print()
    // 4. Start the application
    ssc.start()
    ssc.awaitTermination()
  }
}
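For the pull mode mentioned above, the Flume agent writes events to an org.apache.spark.streaming.flume.sink.SparkSink (the spark-streaming-flume-sink jar must be on the Flume classpath), and the Spark program then polls that sink with FlumeUtils.createPollingStream. A minimal sketch, with the object name made up and the host and port assumed to match the Flume sink configuration; start it after the Flume agent is up:

package cn.kgc.spark.Streaming

import org.apache.spark.SparkConf
import org.apache.spark.streaming.dstream.DStream
import org.apache.spark.streaming.flume.FlumeUtils
import org.apache.spark.streaming.{Seconds, StreamingContext}

// Hypothetical pull-based counterpart of Demo05: Spark polls events from a Flume SparkSink
object Demo06_FlumePullWordCount {
  def main(args: Array[String]): Unit = {
    val conf: SparkConf = new SparkConf().setAppName(this.getClass.getName).setMaster("local[4]")
    val ssc = new StreamingContext(conf, Seconds(5))
    // Connect to the host/port where the Flume SparkSink is listening
    val flumeStream = FlumeUtils.createPollingStream(ssc, "singleNode", 9999)
    val lines: DStream[String] = flumeStream.map(x => new String(x.event.getBody.array()).trim)

    val result: DStream[(String, Int)] = lines.flatMap(_.split(" ")).map((_, 1)).reduceByKey(_ + _)
    result.print()

    ssc.start()
    ssc.awaitTermination()
  }
}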