Spark ---> Transformation Operators (Big Data)
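  • The snippets below are assumed to live in one JUnit test class, with each method annotated with @Test (some of the later snippets show the annotation). The imports and class skeleton are not part of the original notes; this is a minimal sketch of that boilerplate, with the class name and the smokeTest method chosen only for illustration.
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}
import org.junit.Test

//surrounding skeleton assumed by the per-operator methods below
class TransformationOperatorsTest {
  @Test
  def smokeTest(): Unit = {
    val conf = new SparkConf().setMaster("local").setAppName("spark_context")
    val sc = new SparkContext(conf)
    assert(sc.parallelize(Seq(1, 2, 3)).count() == 3)
    sc.stop()
  }
}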

flatMap: split and transform data (one-to-many)
  def flatMapTest(): Unit = {
    //1. Create SparkConf
    val conf = new SparkConf().setMaster("local").setAppName("spark_context")
    //2. Create SparkContext
    val sc = new SparkContext(conf)
    //Create the RDD
    val rdd1 = sc.parallelize(Seq("1,2,3,4"))
    //Run the flatMap operation
    val rdd2 = rdd1.flatMap(item => (item.split(",")))
    rdd2.collect().foreach(println)
  }
  • Result

1
2
3
4

map: transform (one-to-one)
  def mapTest(): Unit = {
    //1. Create SparkConf
    val conf = new SparkConf().setMaster("local").setAppName("spark_context")
    //2. Create SparkContext
    val sc = new SparkContext(conf)
    //1. Create the RDD
    val rdd1 = sc.parallelize(Seq(1, 2, 3))
    //2. Run the map operation
    val rdd2: RDD[Int] = rdd1.map(item => (item * 10))
    //3. Collect the result
    rdd2.collect().foreach(println)
  }
  • Result

10
20
30

filter: filtering operator, keeps only the elements for which the predicate returns true
def filterTest(): Unit = {
  //1. Create SparkConf
  val conf = new SparkConf().setMaster("local").setAppName("spark_context")
  //2. Create SparkContext
  val sc = new SparkContext(conf)
  sc.parallelize(Seq(1, 2, 3, 4, 5, 6, 7, 8, 9, 10))
    .filter(item=>item%2==0)
    .collect()
    .foreach(item=>println(item))
}
  • Result

2
4
6
8
10

sort: sorting operators (sortBy / sortByKey)
/**
  * sortBy can sort by any part of the element
  * sortByKey can only sort by the key
  */
def sortByTest(): Unit ={
  //1. Create SparkConf
  val conf = new SparkConf().setMaster("local[6]").setAppName("spark_context")
  //2. Create SparkContext
  val sc = new SparkContext(conf)
  val rdd2 = sc.parallelize(Seq(("a", 89), ("c", 90), ("b", 87)))
  val rdd1 = sc.parallelize(Seq(2, 4, 1, 5, 7, 3))
  //sort rdd1 in ascending order
  rdd1.sortBy(item => item, ascending = true).collect().foreach(println)
  //sort rdd2 by the second field, in descending order
  rdd2.sortBy(item => item._2, ascending = false).collect().foreach(println)
  //sort rdd2 by key
  rdd2.sortByKey().collect().foreach(println)
}
mapPartitions
  • Compared with map, it transforms a whole partition's data at a time, whereas map transforms one record at a time (a per-partition setup sketch follows the two examples below).
def mapPartitionsTest01(): Unit = {
  //1. Create SparkConf
  val conf = new SparkConf().setMaster("local").setAppName("spark_context")
  //2. Create SparkContext
  val sc = new SparkContext(conf)
  //1. Generate the data
  sc.parallelize(Seq(1, 2, 3, 4, 5, 6))
    .mapPartitions(iter => {
      //materialize the iterator first: calling foreach directly would exhaust it,
      //and the iterator returned to Spark would then be empty
      val items = iter.toList
      items.foreach(item => println(item))
      items.iterator
    })
    .collect()
}
def mapPartitionsTest02(): Unit = {
  //1. Create SparkConf
  val conf = new SparkConf().setMaster("local").setAppName("spark_context")
  //2. Create SparkContext
  val sc = new SparkContext(conf)
  //1. Generate the data
  sc.parallelize(Seq(1, 2, 3, 4, 5, 6), 2)
    .mapPartitions(iter => {
      iter.map(item => item * 10)
    })
    .collect()
    .foreach(item => println(item))
}
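  • A common reason to prefer mapPartitions is that per-partition setup (opening a connection, building a parser, and so on) runs once per partition instead of once per record. A minimal sketch of that pattern; the "expensive" setup here is just building a label string, purely for illustration.
def mapPartitionsSetupSketch(): Unit = {
  val conf = new SparkConf().setMaster("local").setAppName("spark_context")
  val sc = new SparkContext(conf)
  sc.parallelize(Seq(1, 2, 3, 4, 5, 6), 2)
    .mapPartitions(iter => {
      //stand-in for expensive per-partition setup, executed once per partition
      val label = s"setup-${System.nanoTime()}"
      iter.map(item => s"$label: value=${item * 10}")
    })
    .collect()
    .foreach(println)
}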
mapPartitionsWithIndex
  • The same as mapPartitions, except the function also receives the partition index.
def mapPartitionsWithIndexTest(): Unit = {
  //1. Create SparkConf
  val conf = new SparkConf().setMaster("local").setAppName("spark_context")
  //2. Create SparkContext
  val sc = new SparkContext(conf)
  sc.parallelize(Seq(1, 2, 3, 4, 5, 6), 2)
    .mapPartitionsWithIndex((index, iter) => {
      println("index:" + index)
      //materialize the iterator so foreach does not exhaust it,
      //otherwise collect() would receive empty partitions
      val items = iter.toList
      items.foreach(item => println(item))
      items.iterator
    })
    .collect()
    .foreach(item => println(item))
}
sample: take a random sample (a transformation)
  • Used to shrink a dataset while keeping it as representative as possible (a reproducible, seeded variant is sketched after the example below).
def sampleTest(): Unit ={
  //1. Create SparkConf
  val conf = new SparkConf().setMaster("local").setAppName("spark_context")
  //2. Create SparkContext
  val sc = new SparkContext(conf)
  val rdd1 = sc.parallelize(Seq(1, 2, 3, 4, 5, 6, 7, 8, 9, 10))
  //false means sampling without replacement; 0.6 is the fraction of the data to keep
  val rdd2 = rdd1.sample(false,0.6)
  val result = rdd2.collect()
  result.foreach(item=>println(item))
}
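  • If the sample has to be reproducible, sample also takes a seed as its third argument. A small sketch; the seed value is arbitrary.
def sampleWithSeedTest(): Unit = {
  val conf = new SparkConf().setMaster("local").setAppName("spark_context")
  val sc = new SparkContext(conf)
  val rdd1 = sc.parallelize(Seq(1, 2, 3, 4, 5, 6, 7, 8, 9, 10))
  //the same seed yields the same sample on every run
  rdd1.sample(withReplacement = false, fraction = 0.6, seed = 42L)
    .collect()
    .foreach(println)
}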
mapValues
  • Only transforms the value of each pair; the key is left unchanged.
def mapValueTest(): Unit ={
  //1. Create SparkConf
  val conf = new SparkConf().setMaster("local").setAppName("spark_context")
  //2. Create SparkContext
  val sc = new SparkContext(conf)
  sc.parallelize(Seq(("a",1),("b",2),("c",3),("d",4),("e",5),("f",6)))
    .mapValues(item=>item*10)
    .collect()
    .foreach(item=>println(item))
}
union
  • Concatenates two RDDs; duplicates are kept, so this is not a strict set union (see the sketch after the example).
def unionTest(): Unit = {
  //1. Create SparkConf
  val conf = new SparkConf().setMaster("local").setAppName("spark_context")
  //2. Create SparkContext
  val sc = new SparkContext(conf)
  val rdd1 = sc.parallelize(Seq(1, 2, 3))
  val rdd2 = sc.parallelize(Seq(3, 4, 5))
  rdd1.union(rdd2)
    .collect()
    .foreach(println)
}
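  • Because union keeps duplicates, the element 3 appears twice when the example above runs; chaining distinct() gives a set-style union. A short sketch:
def unionDistinctTest(): Unit = {
  val conf = new SparkConf().setMaster("local").setAppName("spark_context")
  val sc = new SparkContext(conf)
  val rdd1 = sc.parallelize(Seq(1, 2, 3))
  val rdd2 = sc.parallelize(Seq(3, 4, 5))
  //distinct() removes the duplicated 3
  rdd1.union(rdd2)
    .distinct()
    .collect()
    .foreach(println)
}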
intersection
  • Takes the intersection of two RDDs, i.e. the elements they have in common.
def intersectionTest(): Unit ={
  //1. Create SparkConf
  val conf = new SparkConf().setMaster("local").setAppName("spark_context")
  //2. Create SparkContext
  val sc = new SparkContext(conf)
  val rdd1 = sc.parallelize(Seq(1, 2, 3, 4))
  val rdd2 = sc.parallelize(Seq(3, 4, 5, 6))
  rdd1.intersection(rdd2)
    .collect()
    .foreach(println)
}
subtract
  • Set difference: rdd1.subtract(rdd2) returns the elements that are in rdd1 but not in rdd2.
def subtractTest(): Unit ={
  //1. Create SparkConf
  val conf = new SparkConf().setMaster("local").setAppName("spark_context")
  //2. Create SparkContext
  val sc = new SparkContext(conf)
  val rdd1 = sc.parallelize(Seq(1, 2, 3, 4))
  val rdd2 = sc.parallelize(Seq(3, 4, 5, 6))
  rdd1.subtract(rdd2)
    .collect()
    .foreach(println)
}
reduceByKey
  • Reduces the values of records that share the same key (first group by key, then apply the given function to the values).
def reduceByKeyTest(): Unit = {
  //1. Create SparkConf
  val conf = new SparkConf().setMaster("local").setAppName("spark_context")
  //2. Create SparkContext
  val sc = new SparkContext(conf)
  //Create the RDD
  val rdd1 = sc.parallelize(Seq("hello world", "spark world", "spark hello"))
  //Process the data
  val rdd2 = rdd1.flatMap(item => (item.split(" ")))
    .map(item => (item, 1))
    .reduceByKey((curr, agg) => curr + agg)
  //Collect the result
  val result = rdd2.collect()
  result.foreach(item => println(item))
  //Stop the SparkContext
  sc.stop()
}
  • Result

(spark,2)
(hello,2)
(world,2)

groupByKey
  • Groups the values by key (compared with reduceByKey in the sketch after the example).
def groupByKeyTest(): Unit = {
  //1. Create SparkConf
  val conf = new SparkConf().setMaster("local").setAppName("spark_context")
  //2. Create SparkContext
  val sc = new SparkContext(conf)
  sc.parallelize(Seq(("a", 1), ("b", 2), ("c", 3), ("c", 4), ("a", 5), ("f", 6)))
    .groupByKey()
    .collect()
    .foreach(item=>println(item))
}
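  • groupByKey followed by a per-key reduction produces the same result as reduceByKey, but groupByKey shuffles every value across the network while reduceByKey pre-aggregates within each partition, so reduceByKey is usually preferred for plain aggregations. A sketch of the comparison, reusing the pairs from the example above:
def groupByKeyVsReduceByKeyTest(): Unit = {
  val conf = new SparkConf().setMaster("local").setAppName("spark_context")
  val sc = new SparkContext(conf)
  val pairs = sc.parallelize(Seq(("a", 1), ("b", 2), ("c", 3), ("c", 4), ("a", 5), ("f", 6)))
  //groupByKey: all values are shuffled, then summed on the reduce side
  pairs.groupByKey().mapValues(_.sum).collect().foreach(println)
  //reduceByKey: values are combined within each partition before the shuffle
  pairs.reduceByKey(_ + _).collect().foreach(println)
}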
combineByKey
  • The operator that groupByKey and reduceByKey are built on.
def combineByKeyTest(): Unit = {
  //1. Create SparkConf
  val conf = new SparkConf().setMaster("local").setAppName("spark_context")
  //2. Create SparkContext
  val sc = new SparkContext(conf)
  val rdd1: RDD[(String, Double)] = sc.parallelize(Seq(("a", 89.0), ("a", 90.0), ("b", 89.0), ("a", 86.0), ("b", 99.0)))
  val result = rdd1.combineByKey(
    //createCombiner: initial transformation, applied only to the first value of a key within a partition
    createCombiner = (curr: Double) => (curr, 1),
    //mergeValue: aggregate within a partition
    mergeValue = (curr: (Double, Int), next: Double) => (curr._1 + next, curr._2 + 1),
    //mergeCombiners: merge the per-partition results into the final result
    mergeCombiners = (curr: (Double, Int), agg: (Double, Int)) => (curr._1 + agg._1, curr._2 + agg._2)
  )
  //e.g. key "a" becomes (89.0 + 90.0 + 86.0, 3); dividing the sum by the count gives the average
  val r = result.map(item => (item._1, item._2._1 / item._2._2))
  r.collect().foreach(println)
}
foldByKey
  • Like reduceByKey, but with an initial (zero) value; note that the zero value is applied once per key in each partition.
  def foldByKeyTest(): Unit = {
    //1. Create SparkConf
    val conf = new SparkConf().setMaster("local").setAppName("spark_context")
    //2. Create SparkContext
    val sc = new SparkContext(conf)
    val rdd1 = sc.parallelize(Seq(("a", 89), ("a", 90), ("b", 87)))
    rdd1.foldByKey(10)((curr, agg) => curr + agg)
      .collect()
      .foreach(println(_))
  }
aggregateByKey
  • A more general form of foldByKey: the within-partition and cross-partition functions can differ.
/**
  * aggregateByKey(zeroValue)(seqOp, combOp)
  * zeroValue: the initial value
  * seqOp: applied to each element within a partition, starting from the initial value
  * combOp: aggregates the results produced by seqOp across partitions
  * Suited to operations that first transform each value and then aggregate
  */
@Test
def aggregateByKey(): Unit = {
  //1. Create SparkConf
  val conf = new SparkConf().setMaster("local[6]").setAppName("spark_context")
  //2. Create SparkContext
  val sc = new SparkContext(conf)
  val rdd1 = sc.parallelize(Seq(("手机", 10.0), ("手机", 15.0), ("电脑", 20.0)))
  rdd1.aggregateByKey(0.8)((zeroValue, item) => item * zeroValue, (curr, agg) => curr + agg)
    .collect()
    .foreach(println(_))
}
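  • One caveat about the example above: the seqOp multiplies whatever is already in the accumulator, so it only means "apply a 0.8 discount to every price" while each partition holds at most one value per key (which happens to be true here with local[6] and three elements). A sketch of a variant that does not depend on the partitioning, assuming the goal is "discount each price, then sum per key":
def aggregateByKeyDiscountTest(): Unit = {
  val conf = new SparkConf().setMaster("local[6]").setAppName("spark_context")
  val sc = new SparkContext(conf)
  val rdd1 = sc.parallelize(Seq(("手机", 10.0), ("手机", 15.0), ("电脑", 20.0)))
  //start each key at 0.0 and add the discounted price of every element,
  //then sum the per-partition subtotals
  rdd1.aggregateByKey(0.0)((acc, price) => acc + price * 0.8, (curr, agg) => curr + agg)
    .collect()
    .foreach(println)
}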
join
  • Joins two pair RDDs by key.
/**
  * join matches records by key
  * a.join(b)
  * for every key that appears in both a and b, all combinations of the matching values are produced (a Cartesian product of that key's values)
  */
@Test
def joinTest(): Unit = {
  //1. Create SparkConf
  val conf = new SparkConf().setMaster("local[6]").setAppName("spark_context")
  //2. Create SparkContext
  val sc = new SparkContext(conf)
  val rdd1 = sc.parallelize(Seq(("a", 1), ("a", 2), ("b", 1)))
  val rdd2 = sc.parallelize(Seq(("a", 10), ("a", 11), ("a", 12)))
  rdd1.join(rdd2)
    .collect()
    .foreach(println(_))
}
repartition、coalesce
  • Change the number of partitions.
/**
  * repartition: repartition to any number of partitions (always shuffles)
  *
  * coalesce: reduce the number of partitions; without a shuffle it can only decrease the partition count (see the sketch after the example)
  */
@Test
def partitionsTest(): Unit = {
  //1. Create SparkConf
  val conf = new SparkConf().setMaster("local[6]").setAppName("spark_context")
  //2. Create SparkContext
  val sc = new SparkContext(conf)
  val rdd1 = sc.parallelize(Seq(2, 4, 1, 5, 7, 3), 2)
  //repartition: the new partition count may be larger or smaller than before
  println(rdd1.repartition(4).partitions.size)
  val rdd2 = sc.parallelize(Seq(("a", 89), ("c", 90), ("b", 87)), 3)
  //coalesce (without shuffle) can only reduce the partition count
  println(rdd2.coalesce(1).partitions.size)
}
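  • coalesce only refuses to grow the partition count in its default no-shuffle mode; passing shuffle = true makes it behave like repartition and allows increasing the count as well. A short sketch:
def coalesceWithShuffleTest(): Unit = {
  val conf = new SparkConf().setMaster("local[6]").setAppName("spark_context")
  val sc = new SparkContext(conf)
  val rdd = sc.parallelize(Seq(("a", 89), ("c", 90), ("b", 87)), 3)
  //without shuffle the count stays at 3; with shuffle it grows to 5
  println(rdd.coalesce(5).partitions.size)
  println(rdd.coalesce(5, shuffle = true).partitions.size)
}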