For a long time I was puzzled about where the shuffle read and write are actually implemented. I had assumed both happen inside the RDD transformations themselves, but tracing reduceByKey only turns up the construction of a ShuffledRDD, and only the read side shows up, inside ShuffledRDD's compute method. So where does the write happen?
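
For context, reduceByKey itself is just a thin wrapper; in the 1.x code base it delegates straight to combineByKey, roughly like the sketch below (reconstructed from memory, so treat it as a sketch rather than a verbatim quote). The interesting logic therefore lives in combineByKey, shown right after.

// PairRDDFunctions (sketch): reduceByKey forwards to combineByKey,
// using the reduce function as both mergeValue and mergeCombiners.
def reduceByKey(partitioner: Partitioner, func: (V, V) => V): RDD[(K, V)] = {
  combineByKey[V]((v: V) => v, func, func, partitioner)
}

def reduceByKey(func: (V, V) => V): RDD[(K, V)] = {
  reduceByKey(defaultPartitioner(self), func)
}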



// PairRDDFunctions.scala (excerpt)

def combineByKey[C](createCombiner: V => C,
    mergeValue: (C, V) => C,
    mergeCombiners: (C, C) => C,
    partitioner: Partitioner,
    mapSideCombine: Boolean = true,
    serializer: Serializer = null): RDD[(K, C)] = {

  val aggregator = new Aggregator[K, V, C](createCombiner, mergeValue, mergeCombiners)
  if (self.partitioner == Some(partitioner)) {
    // Most RDDs have no partitioner, so this branch is rarely taken; when it is, the data is
    // already laid out correctly and one pass merging values by key within each partition suffices.
    self.mapPartitionsWithContext((context, iter) => {
      new InterruptibleIterator(context, aggregator.combineValuesByKey(iter, context))
    }, preservesPartitioning = true)
  } else if (mapSideCombine) {
    // The default path: combine on the map side first, then shuffle the partial results.
    val combined = self.mapPartitionsWithContext((context, iter) => {
      aggregator.combineValuesByKey(iter, context)
    }, preservesPartitioning = true)
    val partitioned = new ShuffledRDD[K, C, (K, C)](combined, partitioner)
      .setSerializer(serializer)
    partitioned.mapPartitionsWithContext((context, iter) => {
      new InterruptibleIterator(context, aggregator.combineCombinersByKey(iter, context))
    }, preservesPartitioning = true)
  } else {
    // No map-side combine: shuffle the raw values and combine them on the reduce side.
    val values = new ShuffledRDD[K, V, (K, V)](self, partitioner).setSerializer(serializer)
    values.mapPartitionsWithContext((context, iter) => {
      new InterruptibleIterator(context, aggregator.combineValuesByKey(iter, context))
    }, preservesPartitioning = true)
  }
}
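
To make the branching concrete, here is a minimal, hypothetical driver program (the object and file names are mine, not from the original code): reduceByKey supplies identity as createCombiner and the reduce function as both mergeValue and mergeCombiners, so it goes through the mapSideCombine branch above.

// Hypothetical local example.
import org.apache.spark.{SparkConf, SparkContext}

object CombineDemo {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("combine-demo").setMaster("local[2]"))
    val pairs = sc.parallelize(Seq(("a", 1), ("a", 2), ("b", 3)))
    // Map side: partial sums per partition; reduce side: combineCombinersByKey merges them.
    val summed = pairs.reduceByKey(_ + _)
    println(summed.collect().mkString(", "))   // e.g. (a,3), (b,3)
    sc.stop()
  }
}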



Looking at ShuffledRDD's compute method shows how the data produced by the previous stage is fetched.




// ShuffledRDD.scala

package org.apache.spark.rdd

import org.apache.spark._
import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.serializer.Serializer

private[spark] class ShuffledRDDPartition(val idx: Int) extends Partition {
  override val index: Int = idx
  override def hashCode(): Int = idx
}

/**
 * :: DeveloperApi ::
 * The resulting RDD from a shuffle (e.g. repartitioning of data).
 * @param prev the parent RDD.
 * @param part the partitioner used to partition the RDD
 * @tparam K the key class.
 * @tparam V the value class.
 * @tparam C the combiner class.
 */
// TODO: Make this return RDD[Product2[K, C]] or have some way to configure mutable pairs
@DeveloperApi
class ShuffledRDD[K, V, C](
    @transient var prev: RDD[_ <: Product2[K, V]],
    part: Partitioner)
  extends RDD[(K, C)](prev.context, Nil) {

  private var serializer: Option[Serializer] = None

  private var keyOrdering: Option[Ordering[K]] = None

  private var aggregator: Option[Aggregator[K, V, C]] = None

  private var mapSideCombine: Boolean = false

  /** Set a serializer for this RDD's shuffle, or null to use the default (spark.serializer) */
  def setSerializer(serializer: Serializer): ShuffledRDD[K, V, C] = {
    this.serializer = Option(serializer)
    this
  }

  /** Set key ordering for RDD's shuffle. */
  def setKeyOrdering(keyOrdering: Ordering[K]): ShuffledRDD[K, V, C] = {
    this.keyOrdering = Option(keyOrdering)
    this
  }

  /** Set aggregator for RDD's shuffle. */
  def setAggregator(aggregator: Aggregator[K, V, C]): ShuffledRDD[K, V, C] = {
    this.aggregator = Option(aggregator)
    this
  }

  /** Set mapSideCombine flag for RDD's shuffle. */
  def setMapSideCombine(mapSideCombine: Boolean): ShuffledRDD[K, V, C] = {
    this.mapSideCombine = mapSideCombine
    this
  }

  override def getDependencies: Seq[Dependency[_]] = {
    List(new ShuffleDependency(prev, part, serializer, keyOrdering, aggregator, mapSideCombine))
  }

  override val partitioner = Some(part)

  override def getPartitions: Array[Partition] = {
    Array.tabulate[Partition](part.numPartitions)(i => new ShuffledRDDPartition(i))
  }

  override def compute(split: Partition, context: TaskContext): Iterator[(K, C)] = {
    val dep = dependencies.head.asInstanceOf[ShuffleDependency[K, V, C]]
    // Ask the ShuffleManager for a reader covering exactly this reduce partition, then read.
    SparkEnv.get.shuffleManager.getReader(dep.shuffleHandle, split.index, split.index + 1, context)
      .read()
      .asInstanceOf[Iterator[(K, C)]]
  }

  override def clearDependencies() {
    super.clearDependencies()
    prev = null
  }
}
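
As a side note, you do not need reduceByKey to end up with a ShuffledRDD; as far as I recall, partitionBy builds one directly, which makes it easy to see the shuffle boundary in isolation. A small hypothetical spark-shell-style snippet:

// Hypothetical snippet: partitionBy creates a ShuffledRDD whose compute() does the shuffle read.
import org.apache.spark.{HashPartitioner, SparkConf, SparkContext}

val sc = new SparkContext(new SparkConf().setAppName("partition-demo").setMaster("local[2]"))
val pairs = sc.parallelize(Seq(("a", 1), ("b", 2), ("a", 3)))
val repartitioned = pairs.partitionBy(new HashPartitioner(4))   // builds a ShuffledRDD
println(repartitioned.toDebugString)                            // lineage should show a ShuffledRDD
sc.stop()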



Then I thought of ShuffleMapTask; the name alone looks promising, so I opened the code. It turns out to be very simple: it bluntly writes the results to disk through the ShuffleManager.




// ShuffleMapTask.scala

package org.apache.spark.scheduler

import java.nio.ByteBuffer

import scala.language.existentials

import org.apache.spark._
import org.apache.spark.broadcast.Broadcast
import org.apache.spark.rdd.RDD
import org.apache.spark.shuffle.ShuffleWriter

/**
 * A ShuffleMapTask divides the elements of an RDD into multiple buckets (based on a partitioner
 * specified in the ShuffleDependency).
 *
 * See [[org.apache.spark.scheduler.Task]] for more information.
 *
 * @param stageId id of the stage this task belongs to
 * @param taskBinary broadcast version of the RDD and the ShuffleDependency. Once deserialized,
 *                   the type should be (RDD[_], ShuffleDependency[_, _, _]).
 * @param partition partition of the RDD this task is associated with
 * @param locs preferred task execution locations for locality scheduling
 */
private[spark] class ShuffleMapTask(
    stageId: Int,
    taskBinary: Broadcast[Array[Byte]],
    partition: Partition,
    @transient private var locs: Seq[TaskLocation])
  extends Task[MapStatus](stageId, partition.index) with Logging {

  /** A constructor used only in test suites. This does not require passing in an RDD. */
  def this(partitionId: Int) {
    this(0, null, new Partition { override def index: Int = 0 }, null)
  }

  @transient private val preferredLocs: Seq[TaskLocation] = {
    if (locs == null) Nil else locs.toSet.toSeq
  }

  override def runTask(context: TaskContext): MapStatus = {
    // Deserialize the RDD using the broadcast variable.
    val deserializeStartTime = System.currentTimeMillis()
    val ser = SparkEnv.get.closureSerializer.newInstance()
    val (rdd, dep) = ser.deserialize[(RDD[_], ShuffleDependency[_, _, _])](
      ByteBuffer.wrap(taskBinary.value), Thread.currentThread.getContextClassLoader)
    _executorDeserializeTime = System.currentTimeMillis() - deserializeStartTime

    metrics = Some(context.taskMetrics)
    var writer: ShuffleWriter[Any, Any] = null
    try {
      // Get a writer from the ShuffleManager and dump this partition's output to disk.
      val manager = SparkEnv.get.shuffleManager
      writer = manager.getWriter[Any, Any](dep.shuffleHandle, partitionId, context)
      writer.write(rdd.iterator(partition, context).asInstanceOf[Iterator[_ <: Product2[Any, Any]]])
      return writer.stop(success = true).get
    } catch {
      case e: Exception =>
        try {
          if (writer != null) {
            writer.stop(success = false)
          }
        } catch {
          case e: Exception =>
            log.debug("Could not stop writer", e)
        }
        throw e
    }
  }

  override def preferredLocations: Seq[TaskLocation] = preferredLocs

  override def toString: String = "ShuffleMapTask(%d, %d)".format(stageId, partitionId)
}
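
The writer that runTask obtains from the ShuffleManager hides all the file and bucket handling. Roughly, in the 1.x code base the write-side interface looks like the sketch below (reconstructed from memory, so check the exact source of your version):

// shuffle/ShuffleWriter.scala (sketch): the writer a map task obtains from the ShuffleManager.
private[spark] trait ShuffleWriter[K, V] {
  /** Write a bunch of records to this task's output. */
  def write(records: Iterator[_ <: Product2[K, V]]): Unit

  /** Close this writer, passing along whether the map completed. */
  def stop(success: Boolean): Option[MapStatus]
}

ShuffleMapTask only ever talks to this interface, which is why it does not care whether a hash-based or sort-based shuffle implementation is configured.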



According to the stage-splitting mechanism, as soon as a ShuffleDependency appears, the tasks in front of it are wrapped into ShuffleMapTasks. Inside the ShuffleMapTask, the output of the previous stage is partitioned and written out to disk, so the type of that stage's final RDD no longer matters; shuffle write is decoupled from the RDD logic.
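
A quick, hypothetical word-count job makes the boundary visible: everything before the ShuffleDependency runs as ShuffleMapTasks, and everything after it runs as result tasks that start from a ShuffledRDD's shuffle read.

// Hypothetical job with a single shuffle boundary, i.e. two stages. File name is made up.
import org.apache.spark.{SparkConf, SparkContext}

object StageDemo {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("stage-demo").setMaster("local[2]"))
    val counts = sc.textFile("input.txt")      // Stage 0: runs as ShuffleMapTasks,
      .flatMap(_.split(" "))                   //          shuffle write happens at the end
      .map((_, 1))
      .reduceByKey(_ + _)                      // ShuffleDependency: stage boundary
      .collect()                               // Stage 1: starts with a shuffle read
    println(counts.length)
    sc.stop()
  }
}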

For what happens next inside the ShuffleManager, see https://github.com/JerryLead/SparkInternals/blob/master/markdown/4-shuffleDetails.md
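
For reference, my understanding of the ShuffleManager contract that ties the two sides together is roughly the following (a from-memory, abbreviated sketch of the 1.x trait, not verbatim source): ShuffleMapTask calls getWriter, and ShuffledRDD.compute calls getReader.

// shuffle/ShuffleManager.scala (sketch, abbreviated).
private[spark] trait ShuffleManager {
  /** Called on the driver when a ShuffleDependency is created, returning a handle for the shuffle. */
  def registerShuffle[K, V, C](
      shuffleId: Int,
      numMaps: Int,
      dependency: ShuffleDependency[K, V, C]): ShuffleHandle

  /** Called by ShuffleMapTask on executors: the writer used for the shuffle write. */
  def getWriter[K, V](handle: ShuffleHandle, mapId: Int, context: TaskContext): ShuffleWriter[K, V]

  /** Called by ShuffledRDD.compute on executors: reads reduce partitions [startPartition, endPartition). */
  def getReader[K, C](
      handle: ShuffleHandle,
      startPartition: Int,
      endPartition: Int,
      context: TaskContext): ShuffleReader[K, C]
}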


