To implement a UDAF, define a custom class (here an anonymous class) that extends the UserDefinedAggregateFunction class.
UDAF execution flow (original diagram not reproduced): initialize sets up a buffer for each group in each partition; update runs on the map side for every input row; merge combines the map-side partial buffers on the reduce side; evaluate produces the final value for the group.

package sparkSql

import org.apache.spark.sql.{Row, SparkSession}
import org.apache.spark.sql.expressions.{MutableAggregationBuffer, UserDefinedAggregateFunction}
import org.apache.spark.sql.types.{DataType, DataTypes, StructField, StructType}

/**
 * @Author yqq
 * @Date 2021/12/14 14:34
 * @Version 1.0
 * Any column that appears in the SELECT list together with an aggregate function
 * must also appear in the GROUP BY clause.
 */
object SparkSQLUDAF {
  def main(args: Array[String]): Unit = {
    val session = SparkSession.builder().master("local").appName("test02").getOrCreate()
    session.sparkContext.setLogLevel("Error")
    val list = List[String]("kobe", "james", "kobe", "durant", "kobe", "kobe", "james", "james", "durant")
    import session.implicits._
    val frame = list.toDF("name")
    frame.createTempView("t")

    /**
     * Register the custom aggregate function (UDAF)
     */
    session.udf.register("nameCount", new UserDefinedAggregateFunction {
      // Type of the argument(s) passed when the UDAF is called
      override def inputSchema: StructType = StructType(List[StructField](
        StructField("name", DataTypes.StringType)
      ))
      // Type of the intermediate buffer that is updated during the aggregation
      override def bufferSchema: StructType = StructType(List[StructField](
        StructField("count", DataTypes.IntegerType)
      ))
      // Type of the final value returned by the function
      override def dataType: DataType = DataTypes.IntegerType
      // Repeated runs over the same input produce the same result
      override def deterministic: Boolean = true
      // Initializes the buffer for every group in every partition, on both the map and reduce sides
      override def initialize(buffer: MutableAggregationBuffer): Unit = buffer.update(0, 0)
      // Runs on the map side for every row of every group within a partition
      override def update(buffer: MutableAggregationBuffer, input: Row): Unit = buffer.update(0, buffer.getInt(0) + 1)
      // Runs on the reduce side, merging the map-side partial buffers for each group
      override def merge(buffer1: MutableAggregationBuffer, buffer2: Row): Unit = buffer1.update(0, buffer1.getInt(0) + buffer2.getInt(0))
      // Value returned once the aggregation is finished
      override def evaluate(buffer: Row): Any = buffer.getInt(0)
    })
    session.sql(
      """
        |select name,nameCount(name) as totalCount from t group by name
        |""".stripMargin).show()
  }
}
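Running this locally, the counts follow directly from the input list (kobe appears 4 times, james 3 times, durant 2 times), so the show() output should look roughly like the following, though the row order produced by group by is not guaranteed:

+------+----------+
|  name|totalCount|
+------+----------+
|  kobe|         4|
| james|         3|
|durant|         2|
+------+----------+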

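Note: starting with Spark 3.0, UserDefinedAggregateFunction is deprecated in favor of the typed Aggregator API registered through functions.udaf. A minimal sketch of the same counting logic on that API, assuming Spark 3.x (the object name NameCountAgg is just an illustrative choice), might look like this:

import org.apache.spark.sql.{Encoder, Encoders}
import org.apache.spark.sql.expressions.Aggregator
import org.apache.spark.sql.functions

// Counts how many input values each group contains (illustrative name: NameCountAgg)
object NameCountAgg extends Aggregator[String, Int, Int] {
  override def zero: Int = 0                                         // initial buffer value
  override def reduce(buffer: Int, name: String): Int = buffer + 1   // per-row update
  override def merge(b1: Int, b2: Int): Int = b1 + b2                // combine partial buffers
  override def finish(reduction: Int): Int = reduction               // final result
  override def bufferEncoder: Encoder[Int] = Encoders.scalaInt
  override def outputEncoder: Encoder[Int] = Encoders.scalaInt
}

// Registration then replaces the anonymous class above
// (requires an implicit Encoder[String], e.g. via import session.implicits._):
// session.udf.register("nameCount", functions.udaf(NameCountAgg))

The SQL query and the rest of the program stay the same; only the registration changes.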