For a long time I was puzzled about where the shuffle read and write are actually implemented; I had assumed both happen inside the RDD transformations. Tracing reduceByKey, however, only leads to the construction of a ShuffledRDD, and the read path sits inside ShuffledRDD's compute method. So where does the write happen?
// PairRDDFunctions.scala
def combineByKey[C](createCombiner: V => C,
    mergeValue: (C, V) => C,
    mergeCombiners: (C, C) => C,
    partitioner: Partitioner,
    mapSideCombine: Boolean = true,
    serializer: Serializer = null): RDD[(K, C)] = {

  val aggregator = new Aggregator[K, V, C](createCombiner, mergeValue, mergeCombiners)
  if (self.partitioner == Some(partitioner)) {
    // An ordinary RDD has partitioner == None, so this branch rarely applies; even when it does,
    // a single pass merging values by key within each partition is enough.
    self.mapPartitionsWithContext((context, iter) => {
      new InterruptibleIterator(context, aggregator.combineValuesByKey(iter, context))
    }, preservesPartitioning = true)
  } else if (mapSideCombine) {
    // Default path: combine on the map side first, shuffle the combiners, then merge them.
    val combined = self.mapPartitionsWithContext((context, iter) => {
      aggregator.combineValuesByKey(iter, context)
    }, preservesPartitioning = true)
    val partitioned = new ShuffledRDD[K, C, (K, C)](combined, partitioner)
      .setSerializer(serializer)
    partitioned.mapPartitionsWithContext((context, iter) => {
      new InterruptibleIterator(context, aggregator.combineCombinersByKey(iter, context))
    }, preservesPartitioning = true)
  } else {
    // No map-side combine: shuffle the raw values and combine them on the reduce side.
    val values = new ShuffledRDD[K, V, (K, V)](self, partitioner).setSerializer(serializer)
    values.mapPartitionsWithContext((context, iter) => {
      new InterruptibleIterator(context, aggregator.combineValuesByKey(iter, context))
    }, preservesPartitioning = true)
  }
}
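As a quick driver-side check, here is a minimal sketch (the application name, master setting, and variable names are made up for illustration): calling reduceByKey, which delegates to the combineByKey above, only builds the shuffled lineage; nothing is written until an action forces a ShuffleMapTask to run.

// Minimal driver-side sketch (hypothetical app; assumes Spark is on the classpath).
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.SparkContext._   // pair-RDD functions for older Spark versions

object ShuffleLazinessSketch {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setMaster("local[2]").setAppName("shuffle-sketch"))
    val pairs = sc.parallelize(Seq(("a", 1), ("b", 1), ("a", 2)), numSlices = 2)

    // Nothing is shuffled yet: reduceByKey -> combineByKey only wires a ShuffledRDD into the DAG.
    val counts = pairs.reduceByKey(_ + _)
    println(counts.toDebugString)   // the lineage contains a ShuffledRDD (possibly wrapped, depending on version)

    // Only now do ShuffleMapTasks run and write map output through the ShuffleManager.
    counts.collect().foreach(println)
    sc.stop()
  }
}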
Looking at the compute method below shows how the data produced by the previous stage is fetched.
// ShuffledRDD.scala
package org.apache.spark.rdd

import org.apache.spark._
import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.serializer.Serializer

private[spark] class ShuffledRDDPartition(val idx: Int) extends Partition {
  override val index: Int = idx
  override def hashCode(): Int = idx
}

/**
 * :: DeveloperApi ::
 * The resulting RDD from a shuffle (e.g. repartitioning of data).
 * @param prev the parent RDD.
 * @param part the partitioner used to partition the RDD
 * @tparam K the key class.
 * @tparam V the value class.
 * @tparam C the combiner class.
 */
// TODO: Make this return RDD[Product2[K, C]] or have some way to configure mutable pairs
@DeveloperApi
class ShuffledRDD[K, V, C](
    @transient var prev: RDD[_ <: Product2[K, V]],
    part: Partitioner)
  extends RDD[(K, C)](prev.context, Nil) {

  private var serializer: Option[Serializer] = None

  private var keyOrdering: Option[Ordering[K]] = None

  private var aggregator: Option[Aggregator[K, V, C]] = None

  private var mapSideCombine: Boolean = false

  /** Set a serializer for this RDD's shuffle, or null to use the default (spark.serializer) */
  def setSerializer(serializer: Serializer): ShuffledRDD[K, V, C] = {
    this.serializer = Option(serializer)
    this
  }

  /** Set key ordering for RDD's shuffle. */
  def setKeyOrdering(keyOrdering: Ordering[K]): ShuffledRDD[K, V, C] = {
    this.keyOrdering = Option(keyOrdering)
    this
  }

  /** Set aggregator for RDD's shuffle. */
  def setAggregator(aggregator: Aggregator[K, V, C]): ShuffledRDD[K, V, C] = {
    this.aggregator = Option(aggregator)
    this
  }

  /** Set mapSideCombine flag for RDD's shuffle. */
  def setMapSideCombine(mapSideCombine: Boolean): ShuffledRDD[K, V, C] = {
    this.mapSideCombine = mapSideCombine
    this
  }

  // The only dependency is a ShuffleDependency on the parent RDD.
  override def getDependencies: Seq[Dependency[_]] = {
    List(new ShuffleDependency(prev, part, serializer, keyOrdering, aggregator, mapSideCombine))
  }

  override val partitioner = Some(part)

  override def getPartitions: Array[Partition] = {
    Array.tabulate[Partition](part.numPartitions)(i => new ShuffledRDDPartition(i))
  }

  // Shuffle read: fetch this reduce partition's data through the ShuffleManager's reader.
  override def compute(split: Partition, context: TaskContext): Iterator[(K, C)] = {
    val dep = dependencies.head.asInstanceOf[ShuffleDependency[K, V, C]]
    SparkEnv.get.shuffleManager.getReader(dep.shuffleHandle, split.index, split.index + 1, context)
      .read()
      .asInstanceOf[Iterator[(K, C)]]
  }

  override def clearDependencies() {
    super.clearDependencies()
    prev = null
  }
}
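The wiring done by getDependencies and the partitioner field can be observed interactively. A small sketch (variable names made up, e.g. pasted into spark-shell, which already provides `sc`): `dependencies` on a shuffled RDD returns the ShuffleDependency built above, and `partitioner` is the `part` passed to the constructor.

// Sketch: inspecting what getDependencies and partitioner expose (assumes an existing `sc`).
import org.apache.spark.{HashPartitioner, ShuffleDependency}
import org.apache.spark.SparkContext._

val shuffled = sc.parallelize(Seq(("a", 1), ("b", 2), ("a", 3)), 2)
  .partitionBy(new HashPartitioner(4))          // partitionBy also produces a ShuffledRDD

println(shuffled.partitioner)                    // Some(<the HashPartitioner above>)
shuffled.dependencies.foreach {
  case dep: ShuffleDependency[_, _, _] =>
    // The same object compute() casts to ShuffleDependency[K, V, C] before
    // asking SparkEnv.get.shuffleManager for a reader of this partition range.
    println(s"shuffle dependency, shuffleId = ${dep.shuffleId}")
  case other =>
    println(s"narrow dependency: $other")
}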
Then I remembered ShuffleMapTask; the name alone is telling. Its code turns out to be quite simple: it bluntly writes the results to disk through the ShuffleManager.
// ShuffleMapTask.scala
package org.apache.spark.scheduler

import java.nio.ByteBuffer

import scala.language.existentials

import org.apache.spark._
import org.apache.spark.broadcast.Broadcast
import org.apache.spark.rdd.RDD
import org.apache.spark.shuffle.ShuffleWriter

/**
 * A ShuffleMapTask divides the elements of an RDD into multiple buckets (based on a partitioner
 * specified in the ShuffleDependency).
 *
 * See [[org.apache.spark.scheduler.Task]] for more information.
 *
 * @param stageId id of the stage this task belongs to
 * @param taskBinary broadcast version of the RDD and the ShuffleDependency. Once deserialized,
 *                   the type should be (RDD[_], ShuffleDependency[_, _, _]).
 * @param partition partition of the RDD this task is associated with
 * @param locs preferred task execution locations for locality scheduling
 */
private[spark] class ShuffleMapTask(
    stageId: Int,
    taskBinary: Broadcast[Array[Byte]],
    partition: Partition,
    @transient private var locs: Seq[TaskLocation])
  extends Task[MapStatus](stageId, partition.index) with Logging {

  /** A constructor used only in test suites. This does not require passing in an RDD. */
  def this(partitionId: Int) {
    this(0, null, new Partition { override def index: Int = 0 }, null)
  }

  @transient private val preferredLocs: Seq[TaskLocation] = {
    if (locs == null) Nil else locs.toSet.toSeq
  }

  override def runTask(context: TaskContext): MapStatus = {
    // Deserialize the RDD using the broadcast variable.
    val deserializeStartTime = System.currentTimeMillis()
    val ser = SparkEnv.get.closureSerializer.newInstance()
    val (rdd, dep) = ser.deserialize[(RDD[_], ShuffleDependency[_, _, _])](
      ByteBuffer.wrap(taskBinary.value), Thread.currentThread.getContextClassLoader)
    _executorDeserializeTime = System.currentTimeMillis() - deserializeStartTime

    metrics = Some(context.taskMetrics)
    var writer: ShuffleWriter[Any, Any] = null
    try {
      // Shuffle write: evaluate this partition of the parent RDD and hand the records
      // to the ShuffleManager's writer, which persists them as map output.
      val manager = SparkEnv.get.shuffleManager
      writer = manager.getWriter[Any, Any](dep.shuffleHandle, partitionId, context)
      writer.write(rdd.iterator(partition, context).asInstanceOf[Iterator[_ <: Product2[Any, Any]]])
      return writer.stop(success = true).get
    } catch {
      case e: Exception =>
        try {
          if (writer != null) {
            writer.stop(success = false)
          }
        } catch {
          case e: Exception =>
            log.debug("Could not stop writer", e)
        }
        throw e
    }
  }

  override def preferredLocations: Seq[TaskLocation] = preferredLocs

  override def toString: String = "ShuffleMapTask(%d, %d)".format(stageId, partitionId)
}
Given how stages are split, whenever a ShuffleDependency appears, the tasks of the upstream stage are wrapped as ShuffleMapTasks. The ShuffleMapTask partitions the upstream stage's output and writes it to disk, so the type of that stage's final RDD no longer matters: shuffle write is decoupled from the RDD logic.
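To make that decoupling concrete, here is a toy model in plain Scala (no Spark APIs, all names hypothetical, not Spark's implementation): the map side only needs a partitioning rule to split its output into per-reducer buckets, and the reduce side only needs to know which bucket to read; neither side cares what transformation produced or will consume the records.

// Toy model of the write/read split (hypothetical names; not Spark's code).
object ToyShuffle {
  // "Map side": split one upstream partition's output into numReducers buckets.
  // In Spark this is what the ShuffleWriter does, persisting each bucket via the ShuffleManager.
  def writeMapOutput[K, V](records: Iterator[(K, V)], numReducers: Int): Map[Int, Seq[(K, V)]] =
    records.toSeq.groupBy { case (k, _) => math.abs(k.hashCode) % numReducers }

  // "Reduce side": a reducer only asks every map output for its own bucket.
  // In Spark this is ShuffledRDD.compute asking the shuffle reader for its partition.
  def readReducerInput[K, V](mapOutputs: Seq[Map[Int, Seq[(K, V)]]], reducerId: Int): Iterator[(K, V)] =
    mapOutputs.iterator.flatMap(_.getOrElse(reducerId, Seq.empty))
}

object ToyShuffleDemo extends App {
  val map0 = ToyShuffle.writeMapOutput(Iterator("a" -> 1, "b" -> 1), numReducers = 2)
  val map1 = ToyShuffle.writeMapOutput(Iterator("a" -> 2, "c" -> 3), numReducers = 2)
  // All ("a", _) pairs land in the same bucket, regardless of which map task produced them.
  (0 until 2).foreach { r =>
    println(s"reducer $r -> " + ToyShuffle.readReducerInput(Seq(map0, map1), reducerId = r).toList)
  }
}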
For what the ShuffleManager does after that, see https://github.com/JerryLead/SparkInternals/blob/master/markdown/4-shuffleDetails.md