Spark read/write operations explained in detail, using WordCount as the example:
1. Code:
System.setProperty("hadoop.home.dir","C:\\hadoop")
val sparkConf = new SparkConf().setMaster("local").setAppName("wordCount")
sparkConf.set("spark.network.timeout","600")
sparkConf.set("spark.executor.heartbeatInterval","500")
val sc = new SparkContext(sparkConf)
val rdd:RDD[String] = sc.textFile("data/wc.txt",2) //MapPartitionsRDD[1] <- HadoopRDD[0]
val rdd2 = rdd.flatMap{line => line.split(" ")} //MapPartitionsRDD[2]
val rdd3 = rdd2.map{word => word -> 1} //MapPartitionsRDD[3]
val rdd4 = rdd3.reduceByKey(_ + _) //ShuffledRDD
println(rdd4.toDebugString)
//Trigger job execution
rdd4.foreach(println)
2. Contents of the input file:
apache spark kafka spark
scala spark kafka
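For reference, a quick sanity check of what rdd4 should contain for this input (the print order from foreach is not deterministic):
rdd4.collect().sortBy(_._1)
// Array((apache,1), (kafka,2), (scala,1), (spark,3))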
3. Overall flow diagram
4. Shuffle write: source code walkthrough
First, look at the parameters passed when a MapPartitionsRDD object is created:
new MapPartitionsRDD[U, T](this, (context, pid, iter) => iter.map(cleanF))
(context, pid, iter) => iter.map(cleanF) is the f() function that compute later executes.
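For context, this constructor call comes from the narrow transformations themselves; a simplified sketch of RDD.map and RDD.flatMap as they appear in Spark 2.4 (withScope omitted):
def map[U: ClassTag](f: T => U): RDD[U] = {
  val cleanF = sc.clean(f)
  new MapPartitionsRDD[U, T](this, (context, pid, iter) => iter.map(cleanF))
}
def flatMap[U: ClassTag](f: T => TraversableOnce[U]): RDD[U] = {
  val cleanF = sc.clean(f)
  new MapPartitionsRDD[U, T](this, (context, pid, iter) => iter.flatMap(cleanF))
}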
Source code walkthrough:
1) The task starts and ShuffleMapTask.runTask is executed
override def runTask(context: TaskContext): MapStatus = {
// Deserialize the RDD using the broadcast variable.
val threadMXBean = ManagementFactory.getThreadMXBean
val deserializeStartTime = System.currentTimeMillis()
val deserializeStartCpuTime = if (threadMXBean.isCurrentThreadCpuTimeSupported) {
threadMXBean.getCurrentThreadCpuTime
} else 0L
val ser = SparkEnv.get.closureSerializer.newInstance()
//Deserialize MapPartitionsRDD[3] and the ShuffleDependency
val (rdd, dep) = ser.deserialize[(RDD[_], ShuffleDependency[_, _, _])](
ByteBuffer.wrap(taskBinary.value), Thread.currentThread.getContextClassLoader)
_executorDeserializeTime = System.currentTimeMillis() - deserializeStartTime
_executorDeserializeCpuTime = if (threadMXBean.isCurrentThreadCpuTimeSupported) {
threadMXBean.getCurrentThreadCpuTime - deserializeStartCpuTime
} else 0L
var writer: ShuffleWriter[Any, Any] = null
try {
//Get the ShuffleManager; it is chosen by the spark.shuffle.manager parameter and defaults to SortShuffleManager.
val manager = SparkEnv.get.shuffleManager
//Get the writer; the concrete type depends on dep.shuffleHandle, and here a SortShuffleWriter is chosen
writer = manager.getWriter[Any, Any](dep.shuffleHandle, partitionId, context)
//Before write runs, rdd.iterator is evaluated first
writer.write(rdd.iterator(partition, context).asInstanceOf[Iterator[_ <: Product2[Any, Any]]])
writer.stop(success = true).get
} catch {
case e: Exception =>
try {
if (writer != null) {
writer.stop(success = false)
}
} catch {
case e: Exception =>
log.debug("Could not stop writer", e)
}
throw e
}
}
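For reference, an abridged sketch of how SortShuffleManager.getWriter maps the handle to a writer in Spark 2.4 (constructor arguments of the other writers elided); our reduceByKey dependency carries an aggregator, so it registers a plain BaseShuffleHandle and therefore gets a SortShuffleWriter:
override def getWriter[K, V](handle: ShuffleHandle, mapId: Int, context: TaskContext): ShuffleWriter[K, V] = {
  handle match {
    case unsafeShuffleHandle: SerializedShuffleHandle[K @unchecked, V @unchecked] =>
      new UnsafeShuffleWriter(/* ... */)
    case bypassMergeSortHandle: BypassMergeSortShuffleHandle[K @unchecked, V @unchecked] =>
      new BypassMergeSortShuffleWriter(/* ... */)
    case other: BaseShuffleHandle[K @unchecked, V @unchecked, _] =>
      new SortShuffleWriter(shuffleBlockResolver, other, mapId, context)
  }
}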
2) The RDD methods are called to transform the data, forming a single execution pipeline
//The iterator method of the RDD class
final def iterator(split: Partition, context: TaskContext): Iterator[T] = {
if (storageLevel != StorageLevel.NONE) {
getOrCompute(split, context)
} else {
computeOrReadCheckpoint(split, context) //called when no cache (storage level) has been set
}
}
3) Call MapPartitionsRDD[3].computeOrReadCheckpoint
//RDD method: computes an RDD partition by calling the RDD's compute method, or, if the RDD has been checkpointed, reads it from the checkpoint.
private[spark] def computeOrReadCheckpoint(split: Partition, context: TaskContext): Iterator[T] =
{
if (isCheckpointedAndMaterialized) {
firstParent[T].iterator(split, context)
} else {
compute(split, context) //MapPartitionsRDD[3]'s compute method
}
}
4) Call MapPartitionsRDD[3]'s compute method
override def compute(split: Partition, context: TaskContext): Iterator[U] =
//First, firstParent[T].iterator(split, context) is evaluated;
// firstParent is the parent RDD of MapPartitionsRDD[3], i.e. MapPartitionsRDD[2]
//so MapPartitionsRDD[2].iterator is called
f(context, split.index, firstParent[T].iterator(split, context))
5) Call MapPartitionsRDD[2].iterator
6) Call MapPartitionsRDD[2].computeOrReadCheckpoint
7) Call MapPartitionsRDD[2]'s compute method
override def compute(split: Partition, context: TaskContext): Iterator[U] =
//First, firstParent[T].iterator(split, context) is evaluated;
// firstParent is the parent RDD of MapPartitionsRDD[2], i.e. MapPartitionsRDD[1]
//so MapPartitionsRDD[1].iterator is called
f(context, split.index, firstParent[T].iterator(split, context))
8) Call MapPartitionsRDD[1].iterator
9) Call MapPartitionsRDD[1].computeOrReadCheckpoint
10) Call MapPartitionsRDD[1]'s compute method
override def compute(split: Partition, context: TaskContext): Iterator[U] =
//First, firstParent[T].iterator(split, context) is evaluated;
// firstParent is the parent RDD of MapPartitionsRDD[1], i.e. HadoopRDD[0]
//so HadoopRDD[0].iterator is called first
f(context, split.index, firstParent[T].iterator(split, context))
11) Call HadoopRDD[0].iterator
12) Call HadoopRDD[0].computeOrReadCheckpoint
13) Call HadoopRDD[0]'s compute method, which builds a NextIterator over the split and wraps it in a new InterruptibleIterator[(K, V)](context, iter) object:
override def compute(theSplit: Partition, context: TaskContext): InterruptibleIterator[(K, V)] = {
val iter = new NextIterator[(K, V)] {
private val split = theSplit.asInstanceOf[HadoopPartition]
//Logs the partition's file path and byte range, e.g.: Input split: file:/D:/github/easyspark-source-2.4/data/wc.txt:0+21
logInfo("Input split: " + split.inputSplit)
private val jobConf = getJobConf()
private val inputMetrics = context.taskMetrics().inputMetrics
private val existingBytesRead = inputMetrics.bytesRead
// Sets InputFileBlockHolder for the file block's information
split.inputSplit.value match {
case fs: FileSplit =>
InputFileBlockHolder.set(fs.getPath.toString, fs.getStart, fs.getLength)
case _ =>
InputFileBlockHolder.unset()
}
// Find a function that will return the FileSystem bytes read by this thread. Do this before
// creating RecordReader, because RecordReader's constructor might read some bytes
private val getBytesReadCallback: Option[() => Long] = split.inputSplit.value match {
case _: FileSplit | _: CombineFileSplit =>
Some(SparkHadoopUtil.get.getFSBytesReadOnThreadCallback())
case _ => None
}
// We get our input bytes from thread-local Hadoop FileSystem statistics.
// If we do a coalesce, however, we are likely to compute multiple partitions in the same
// task and in the same thread, in which case we need to avoid override values written by
// previous partitions (SPARK-13071).
private def updateBytesRead(): Unit = {
getBytesReadCallback.foreach { getBytesRead =>
inputMetrics.setBytesRead(existingBytesRead + getBytesRead())
}
}
private var reader: RecordReader[K, V] = null
//inputFormat is TextInputFormat here
private val inputFormat = getInputFormat(jobConf)
HadoopRDD.addLocalConfiguration(
new SimpleDateFormat("yyyyMMddHHmmss", Locale.US).format(createTime),
context.stageId, theSplit.index, context.attemptNumber, jobConf)
//Get the LineRecordReader, which reads the contents of this partition's split.
reader =
try {
inputFormat.getRecordReader(split.inputSplit.value, jobConf, Reporter.NULL)
} catch {
case e: FileNotFoundException if ignoreMissingFiles =>
logWarning(s"Skipped missing file: ${split.inputSplit}", e)
finished = true
null
// Throw FileNotFoundException even if `ignoreCorruptFiles` is true
case e: FileNotFoundException if !ignoreMissingFiles => throw e
case e: IOException if ignoreCorruptFiles =>
logWarning(s"Skipped the rest content in the corrupted file: ${split.inputSplit}", e)
finished = true
null
}
// Register an on-task-completion callback to close the input stream.
context.addTaskCompletionListener[Unit] { context =>
// Update the bytes read before closing is to make sure lingering bytesRead statistics in
// this thread get correctly added.
updateBytesRead()
closeIfNeeded()
}
private val key: K = if (reader == null) null.asInstanceOf[K] else reader.createKey()
private val value: V = if (reader == null) null.asInstanceOf[V] else reader.createValue()
override def getNext(): (K, V) = {
try {
finished = !reader.next(key, value)
} catch {
case e: FileNotFoundException if ignoreMissingFiles =>
logWarning(s"Skipped missing file: ${split.inputSplit}", e)
finished = true
// Throw FileNotFoundException even if `ignoreCorruptFiles` is true
case e: FileNotFoundException if !ignoreMissingFiles => throw e
case e: IOException if ignoreCorruptFiles =>
logWarning(s"Skipped the rest content in the corrupted file: ${split.inputSplit}", e)
finished = true
}
if (!finished) {
inputMetrics.incRecordsRead(1)
}
if (inputMetrics.recordsRead % SparkHadoopUtil.UPDATE_INPUT_METRICS_INTERVAL_RECORDS == 0) {
updateBytesRead()
}
(key, value)
}
override def close(): Unit = {
if (reader != null) {
InputFileBlockHolder.unset()
try {
reader.close()
} catch {
case e: Exception =>
if (!ShutdownHookManager.inShutdown()) {
logWarning("Exception in RecordReader.close()", e)
}
} finally {
reader = null
}
if (getBytesReadCallback.isDefined) {
updateBytesRead()
} else if (split.inputSplit.value.isInstanceOf[FileSplit] ||
split.inputSplit.value.isInstanceOf[CombineFileSplit]) {
// If we can't get the bytes read from the FS stats, fall back to the split size,
// which may be inaccurate.
try {
inputMetrics.incBytesRead(split.inputSplit.value.getLength)
} catch {
case e: java.io.IOException =>
logWarning("Unable to get input size to set InputMetrics for task", e)
}
}
}
}
}
new InterruptibleIterator[(K, V)](context, iter)
}
14) On the way back up the chain, the f function in MapPartitionsRDD[1]'s compute runs first: it extracts the text lines;
then the f function in MapPartitionsRDD[2]'s compute runs: it splits each line on spaces;
then the f function in MapPartitionsRDD[3]'s compute runs: it turns each word into a (word, 1) tuple.
Finally control returns to SortShuffleWriter.write.
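In other words, nothing is materialized between the RDDs; what reaches the writer is one lazily composed iterator. A minimal sketch of the equivalent pipeline for this partition (hadoopIter is a stand-in for the (byte offset, line) records that HadoopRDD yields; the offset value is illustrative):
val hadoopIter: Iterator[(Long, String)] = Iterator(0L -> "apache spark kafka spark")
val pipeline: Iterator[(String, Int)] =
  hadoopIter
    .map { case (_, line) => line }   // MapPartitionsRDD[1]: keep only the line text
    .flatMap(_.split(" "))            // MapPartitionsRDD[2]: split on spaces
    .map(word => word -> 1)           // MapPartitionsRDD[3]: (word, 1) tuples
// pipeline.toList == List(("apache",1), ("spark",1), ("kafka",1), ("spark",1))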
15) Execute SortShuffleWriter.write():
Its records argument is the iterator containing [(apache,1), (spark,1), (kafka,1), (spark,1)].
override def write(records: Iterator[Product2[K, V]]): Unit = {
//Create the ExternalSorter; reduceByKey, which we use here, combines on the map side
sorter = if (dep.mapSideCombine) {
new ExternalSorter[K, V, C](
context, dep.aggregator, Some(dep.partitioner), dep.keyOrdering, dep.serializer)
} else {
// In this case we pass neither an aggregator nor an ordering to the sorter, because we don't
// care whether the keys get sorted in each partition; that will be done on the reduce side
// if the operation being run is sortByKey.
new ExternalSorter[K, V, V](
context, aggregator = None, Some(dep.partitioner), ordering = None, dep.serializer)
}
//Run the insertAll method.
sorter.insertAll(records)
// Don't bother including the time to open the merged output file in the shuffle write time,
// because it just opens a single file, so is typically too fast to measure accurately
// (see SPARK-3570).
val output = shuffleBlockResolver.getDataFile(dep.shuffleId, mapId)
val tmp = Utils.tempFileWith(output)
try {
val blockId = ShuffleBlockId(dep.shuffleId, mapId, IndexShuffleBlockResolver.NOOP_REDUCE_ID)
val partitionLengths = sorter.writePartitionedFile(blockId, tmp)
shuffleBlockResolver.writeIndexFileAndCommit(dep.shuffleId, mapId, partitionLengths, tmp)
mapStatus = MapStatus(blockManager.shuffleServerId, partitionLengths)
} finally {
if (tmp.exists() && !tmp.delete()) {
logError(s"Error while deleting temp file ${tmp.getAbsolutePath}")
}
}
}
The ExternalSorter class:
Role: partitions, aggregates, and sorts the data from the iterator.
Objects it holds:
A PartitionedAppendOnlyMap (a subclass of AppendOnlyMap). This map buffers the data before any spill; its key is (partition ID, K).
Its default capacity is 64 and it stores data in a flat Array of length 64*2; each key and its value occupy adjacent array slots.
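A minimal sketch of that flat key/value layout (illustrative only; the real AppendOnlyMap also probes, grows, and handles a null key):
// capacity 64 -> backing array of 2 * 64 slots; slot 2*pos holds the key, slot 2*pos+1 its value
val capacity = 64
val data = new Array[AnyRef](2 * capacity)
def put(pos: Int, key: (Int, String), value: Int): Unit = {
  data(2 * pos) = key               // e.g. (0, "apache")
  data(2 * pos + 1) = Int.box(value) // e.g. 1, stored right next to its key
}
put(0, (0, "apache"), 1) // key in slot 0, its value in slot 1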
16) Execute ExternalSorter.insertAll
insertAll stores the records into the PartitionedAppendOnlyMap: it walks the iterator and puts every record into the map.
def insertAll(records: Iterator[Product2[K, V]]): Unit = {
// TODO: stop combining if we find that the reduction factor isn't high
val shouldCombine = aggregator.isDefined
if (shouldCombine) { //a combine is used, so this branch runs
// Combine values in-memory first using our AppendOnlyMap
val mergeValue = aggregator.get.mergeValue
val createCombiner = aggregator.get.createCombiner
var kv: Product2[K, V] = null
//The update function updates the value: if the key already exists, mergeValue merges the old and new values
val update = (hadValue: Boolean, oldValue: C) => {
if (hadValue) mergeValue(oldValue, kv._2) else createCombiner(kv._2)
}
while (records.hasNext) { //iterate over the records
addElementsRead() //count the processed elements; the counter restarts after a spill
kv = records.next() // kv=(apache,1)
//first the HashPartitioner computes the key's partition (step 17), then changeValue is called (step 18)
map.changeValue((getPartition(kv._1), kv._1), update) //apache falls in partition 0; arguments: ((0,apache), update)
maybeSpillCollection(usingMap = true)
}
} else {
// Stick values into our buffer
while (records.hasNext) {
addElementsRead()
val kv = records.next()
buffer.insert(getPartition(kv._1), kv._1, kv._2.asInstanceOf[C])
maybeSpillCollection(usingMap = false)
}
}
}
17) getPartition returns the partition id for a key
private def getPartition(key: K): Int = {
if (shouldPartition) partitioner.get.getPartition(key) else 0
}
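Here partitioner is the HashPartitioner created for reduceByKey via the default partitioner (2 reduce partitions in this run, matching the two output segments seen later). For reference, its getPartition in Spark 2.4 is essentially:
def getPartition(key: Any): Int = key match {
  case null => 0
  case _ => Utils.nonNegativeMod(key.hashCode, numPartitions) // e.g. "apache".hashCode mod 2
}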
18) Call PartitionedAppendOnlyMap.changeValue()
//Stores the data; each key and its value are kept in adjacent array slots.
def changeValue(key: K, updateFunc: (Boolean, V) => V): V = { //key=(0,apache)
assert(!destroyed, destructionMessage)
val k = key.asInstanceOf[AnyRef]
if (k.eq(null)) {
if (!haveNullValue) {
incrementSize()
}
nullValue = updateFunc(haveNullValue, nullValue)
haveNullValue = true
return nullValue
}
var pos = rehash(k.hashCode) & mask //compute the slot where this key belongs
var i = 1
while (true) {
val curKey = data(2 * pos) //check whether a key already occupies this slot
if (curKey.eq(null)) {//slot empty: the key is not present yet
val newValue = updateFunc(false, null.asInstanceOf[V])
data(2 * pos) = k //store the key; the value goes into the adjacent slot
data(2 * pos + 1) = newValue.asInstanceOf[AnyRef]
incrementSize()
return newValue
} else if (k.eq(curKey) || k.equals(curKey)) { //key already present
val newValue = updateFunc(true, data(2 * pos + 1).asInstanceOf[V])
data(2 * pos + 1) = newValue.asInstanceOf[AnyRef] //update the value
return newValue
} else {
val delta = i
pos = (pos + delta) & mask
i += 1
}
}
null.asInstanceOf[V] // Never reached but needed to keep compiler happy
}
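To make the effect of these update calls concrete, here is a hypothetical re-creation of the map-side combine for our four records, using a plain mutable.HashMap in place of PartitionedAppendOnlyMap (for the sketch we assume all four words hash to partition 0, as the comment above does for apache):
import scala.collection.mutable
val records = Iterator(("apache", 1), ("spark", 1), ("kafka", 1), ("spark", 1))
val combined = mutable.HashMap.empty[(Int, String), Int]
records.foreach { case (word, count) =>
  val key = (0, word)                                 // (partition id, key)
  combined(key) = combined.getOrElse(key, 0) + count  // the role played by createCombiner / mergeValue
}
// combined: Map((0,apache) -> 1, (0,spark) -> 2, (0,kafka) -> 1)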
19) Then check whether the collection needs to be spilled to disk:
private def maybeSpillCollection(usingMap: Boolean): Unit = { //usingMap = true
var estimatedSize = 0L
if (usingMap) {
estimatedSize = map.estimateSize()
if (maybeSpill(map, estimatedSize)) { //call maybeSpill; with our tiny data set no spill happens
map = new PartitionedAppendOnlyMap[K, C]
}
} else {
estimatedSize = buffer.estimateSize()
if (maybeSpill(buffer, estimatedSize)) {
buffer = new PartitionedPairBuffer[K, C]
}
}
if (estimatedSize > _peakMemoryUsedBytes) {
_peakMemoryUsedBytes = estimatedSize
}
}
20) The spill-to-disk method. With our small data set, no spill occurs.
If needed, spills the current in-memory collection to disk; it first tries to acquire more memory before spilling.
protected def maybeSpill(collection: C, currentMemory: Long): Boolean = {
var shouldSpill = false
if (elementsRead % 32 == 0 && currentMemory >= myMemoryThreshold) {
// Claim up to double our current memory from the shuffle memory pool
val amountToRequest = 2 * currentMemory - myMemoryThreshold
val granted = acquireMemory(amountToRequest)
myMemoryThreshold += granted
// If we were granted too little memory to grow further (either tryToAcquire returned 0,
// or we already had more memory than myMemoryThreshold), spill the current collection
shouldSpill = currentMemory >= myMemoryThreshold
}
shouldSpill = shouldSpill || _elementsRead > numElementsForceSpillThreshold
// Actually spill
if (shouldSpill) {
_spillCount += 1
logSpillage(currentMemory)
spill(collection)
_elementsRead = 0
_memoryBytesSpilled += currentMemory
releaseMemory()
}
shouldSpill
}
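A worked example of that threshold logic (assuming the default spark.shuffle.spill.initialMemoryThreshold of 5 MB; the numbers are illustrative):
// elementsRead % 32 == 0 and currentMemory = 6 MB >= myMemoryThreshold = 5 MB, so we try to grow:
//   amountToRequest = 2 * 6 MB - 5 MB = 7 MB
//   if the pool grants the full 7 MB -> myMemoryThreshold = 12 MB, currentMemory < threshold -> no spill
//   if the pool grants 0 bytes       -> myMemoryThreshold stays 5 MB, currentMemory >= threshold -> spill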
21) Steps 17) through 20) repeat for every record until all data sits in the ExternalSorter's map; insertAll then returns and SortShuffleWriter.write continues, as follows:
/** Write a bunch of records to this task's output */
override def write(records: Iterator[Product2[K, V]]): Unit = {
sorter = if (dep.mapSideCombine) {
new ExternalSorter[K, V, C](
context, dep.aggregator, Some(dep.partitioner), dep.keyOrdering, dep.serializer)
} else {
// In this case we pass neither an aggregator nor an ordering to the sorter, because we don't
// care whether the keys get sorted in each partition; that will be done on the reduce side
// if the operation being run is sortByKey.
new ExternalSorter[K, V, V](
context, aggregator = None, Some(dep.partitioner), ordering = None, dep.serializer)
}
sorter.insertAll(records) //insertAll has finished; execution continues below
// Don't bother including the time to open the merged output file in the shuffle write time,
// because it just opens a single file, so is typically too fast to measure accurately
// (see SPARK-3570).
//Next method to run; it returns the data File, e.g.: C:\Users\tend\AppData\Local\Temp\blockmgr-3d9bb635-2b93-488d-bc3f-febc75f9468a\0c\shuffle_0_0_0.data
val output = shuffleBlockResolver.getDataFile(dep.shuffleId, mapId) //(0,0)
val tmp = Utils.tempFileWith(output) //output plus a UUID gives a temp file, e.g. C:\Users\tend\AppData\Local\Temp\blockmgr-3d9bb635-2b93-488d-bc3f-febc75f9468a\0c\shuffle_0_0_0.data.4ec0ce74-d4d3-4eae-afdc-343e0b778998
try {
val blockId = ShuffleBlockId(dep.shuffleId, mapId, IndexShuffleBlockResolver.NOOP_REDUCE_ID)
val partitionLengths = sorter.writePartitionedFile(blockId, tmp)
shuffleBlockResolver.writeIndexFileAndCommit(dep.shuffleId, mapId, partitionLengths, tmp)
mapStatus = MapStatus(blockManager.shuffleServerId, partitionLengths)
} finally {
if (tmp.exists() && !tmp.delete()) {
logError(s"Error while deleting temp file ${tmp.getAbsolutePath}")
}
}
}
22) Call IndexShuffleBlockResolver.getDataFile
def getDataFile(shuffleId: Int, mapId: Int): File = {
//Create a ShuffleDataBlockId; its name is shuffle_0_0_0.data
//then call blockManager.diskBlockManager.getFile
blockManager.diskBlockManager.getFile(ShuffleDataBlockId(shuffleId, mapId, NOOP_REDUCE_ID))
}
IndexShuffleBlockResolver:
Creates and maintains the mapping between logical shuffle blocks and their physical file locations.
The shuffle blocks produced by one map task are stored in a single consolidated data file.
The offsets of the blocks within that data file are stored in a separate index file.
The file names are derived from the ShuffleBlockId of the shuffle data with the reduce ID set to 0, adding a ".data" suffix for the data file and an ".index" suffix for the index file.
23) DiskBlockManager.getFile decides which local directory and sub-directory the file lands in:
def getFile(blockId: BlockId): File = getFile(blockId.name)
def getFile(filename: String): File = {
// Figure out which local directory it hashes to, and which subdirectory in that
val hash = Utils.nonNegativeHash(filename)
val dirId = hash % localDirs.length
val subDirId = (hash / localDirs.length) % subDirsPerLocalDir
// Create the subdirectory if it doesn't already exist
val subDir = subDirs(dirId).synchronized {
val old = subDirs(dirId)(subDirId)
if (old != null) {
old
} else {
val newDir = new File(localDirs(dirId), "%02x".format(subDirId))
if (!newDir.exists() && !newDir.mkdir()) {
throw new IOException(s"Failed to create local dir in $newDir.")
}
subDirs(dirId)(subDirId) = newDir
newDir
}
}
new File(subDir, filename)
}
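This hashing is what produced the 0c sub-directory visible in the temp path from step 21): the sub-directory name is just the hex-formatted subDirId (the concrete value 12 is inferred from that path, not computed here):
println("%02x".format(12)) // "0c" -- matching the ...\blockmgr-...\0c\shuffle_0_0_0.data path above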
24) The method returns and SortShuffleWriter.write continues
override def write(records: Iterator[Product2[K, V]]): Unit = {
sorter = if (dep.mapSideCombine) {
new ExternalSorter[K, V, C](
context, dep.aggregator, Some(dep.partitioner), dep.keyOrdering, dep.serializer)
} else {
// In this case we pass neither an aggregator nor an ordering to the sorter, because we don't
// care whether the keys get sorted in each partition; that will be done on the reduce side
// if the operation being run is sortByKey.
new ExternalSorter[K, V, V](
context, aggregator = None, Some(dep.partitioner), ordering = None, dep.serializer)
}
sorter.insertAll(records)
// Don't bother including the time to open the merged output file in the shuffle write time,
// because it just opens a single file, so is typically too fast to measure accurately
// (see SPARK-3570).
val output = shuffleBlockResolver.getDataFile(dep.shuffleId, mapId) //already executed above
val tmp = Utils.tempFileWith(output) //output plus a UUID gives a temp file, e.g. C:\Users\tend\AppData\Local\Temp\blockmgr-3d9bb635-2b93-488d-bc3f-febc75f9468a\0c\shuffle_0_0_0.data.4ec0ce74-d4d3-4eae-afdc-343e0b778998
try {
//blockId = shuffle_0_0_0
val blockId = ShuffleBlockId(dep.shuffleId, mapId, IndexShuffleBlockResolver.NOOP_REDUCE_ID)
//25) sorter.writePartitionedFile sorts everything in the sorter and writes it to disk
val partitionLengths = sorter.writePartitionedFile(blockId, tmp)
shuffleBlockResolver.writeIndexFileAndCommit(dep.shuffleId, mapId, partitionLengths, tmp)
mapStatus = MapStatus(blockManager.shuffleServerId, partitionLengths)
} finally {
if (tmp.exists() && !tmp.delete()) {
logError(s"Error while deleting temp file ${tmp.getAbsolutePath}")
}
}
}
25) The sorter.writePartitionedFile method:
def writePartitionedFile(
blockId: BlockId,
outputFile: File): Array[Long] = {
// Track location of each range in the output file
val lengths = new Array[Long](numPartitions)
//returns the writer, a DiskBlockObjectWriter
val writer = blockManager.getDiskWriter(blockId, outputFile, serInstance, fileBufferSize,
context.taskMetrics().shuffleWriteMetrics)
if (spills.isEmpty) {
// Case where we only have in-memory data
val collection = if (aggregator.isDefined) map else buffer
//sort the sorter's data, first by partition id and then by key; see step 26)
val it = collection.destructiveSortedWritablePartitionedIterator(comparator)
while (it.hasNext) {
val partitionId = it.nextPartition()
while (it.hasNext && it.nextPartition() == partitionId) {
it.writeNext(writer)
}
val segment = writer.commitAndGet()
lengths(partitionId) = segment.length
}
} else {
// We must perform merge-sort; get an iterator by partition and write everything directly.
for ((id, elements) <- this.partitionedIterator) {
if (elements.hasNext) {
for (elem <- elements) {
writer.write(elem._1, elem._2)
}
val segment = writer.commitAndGet()
lengths(id) = segment.length
}
}
}
writer.close()
context.taskMetrics().incMemoryBytesSpilled(memoryBytesSpilled)
context.taskMetrics().incDiskBytesSpilled(diskBytesSpilled)
context.taskMetrics().incPeakExecutionMemory(peakMemoryUsedBytes)
lengths
}
26) Call collection.destructiveSortedWritablePartitionedIterator(comparator); collection here is the PartitionedAppendOnlyMap.
Purpose of the method: iterate the data and write the elements out instead of returning them, in the order of partition ID and then the given comparator.
The three methods below, on PartitionedAppendOnlyMap, perform the sort, using TimSort.
Storage layout of data in PartitionedAppendOnlyMap: a flat array in which each key and its value sit in two adjacent slots.
def destructiveSortedWritablePartitionedIterator(keyComparator: Option[Comparator[K]])
: WritablePartitionedIterator = {
val it = partitionedDestructiveSortedIterator(keyComparator)
new WritablePartitionedIterator {
private[this] var cur = if (it.hasNext) it.next() else null
def writeNext(writer: DiskBlockObjectWriter): Unit = {
writer.write(cur._1._2, cur._2)
cur = if (it.hasNext) it.next() else null
}
def hasNext(): Boolean = cur != null
def nextPartition(): Int = cur._1._1
}
}
def partitionedDestructiveSortedIterator(keyComparator: Option[Comparator[K]])
: Iterator[((Int, K), V)] = {
val comparator = keyComparator.map(partitionKeyComparator).getOrElse(partitionComparator)
destructiveSortedIterator(comparator)
}
def destructiveSortedIterator(keyComparator: Comparator[K]): Iterator[(K, V)] = {
destroyed = true
// Pack KV pairs into the front of the underlying array
var keyIndex, newIndex = 0 //compact the key-value pairs to new index positions
while (keyIndex < capacity) {
if (data(2 * keyIndex) != null) {
data(2 * newIndex) = data(2 * keyIndex)
data(2 * newIndex + 1) = data(2 * keyIndex + 1)
newIndex += 1
}
keyIndex += 1
}
assert(curSize == newIndex + (if (haveNullValue) 1 else 0))
//call sort to order the compacted pairs
new Sorter(new KVArraySortDataFormat[K, AnyRef]).sort(data, 0, newIndex, keyComparator)
new Iterator[(K, V)] { //return an iterator over the compacted, sorted data
var i = 0
var nullValueReady = haveNullValue
def hasNext: Boolean = (i < newIndex || nullValueReady)
def next(): (K, V) = {
if (nullValueReady) {
nullValueReady = false
(null.asInstanceOf[K], nullValue)
} else {
val item = (data(2 * i).asInstanceOf[K], data(2 * i + 1).asInstanceOf[V])
i += 1
item
}
}
}
}
//sort using TimSort
def sort(a: Buffer, lo: Int, hi: Int, c: Comparator[_ >: K]): Unit = {
timSort.sort(a, lo, hi, c)
}
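To illustrate the resulting order: records are compared by partition ID first, then by key. A minimal, hypothetical sketch with this task's combined records (which reduce partition each key hashes to is assumed for illustration, since it depends on hashCode):
val pairs = Seq(((1, "kafka"), 1), ((0, "spark"), 2), ((0, "apache"), 1))
pairs.sortBy { case ((pid, key), _) => (pid, key) }
// -> ((0,apache),1), ((0,spark),2), ((1,kafka),1)   (partition id first, then key)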
27) With the sorted iterator in hand, sorter.writePartitionedFile continues and writes the partitioned data to the file.
In this run it returns lengths = [61, 51], the byte length of each partition.
def writePartitionedFile(
blockId: BlockId,
outputFile: File): Array[Long] = {
// Track location of each range in the output file
val lengths = new Array[Long](numPartitions)
//returns the writer, a DiskBlockObjectWriter
val writer = blockManager.getDiskWriter(blockId, outputFile, serInstance, fileBufferSize,
context.taskMetrics().shuffleWriteMetrics)
if (spills.isEmpty) {
// Case where we only have in-memory data
val collection = if (aggregator.isDefined) map else buffer
val it = collection.destructiveSortedWritablePartitionedIterator(comparator)
//start writing, one partition at a time
while (it.hasNext) {
val partitionId = it.nextPartition() //peek at the partition id
while (it.hasNext && it.nextPartition() == partitionId) {
it.writeNext(writer) //write records one by one
}
val segment = writer.commitAndGet() //one partition finished: flush to disk; all partitions share the same file
lengths(partitionId) = segment.length //record the byte length of each partition
}
} else {
// We must perform merge-sort; get an iterator by partition and write everything directly.
for ((id, elements) <- this.partitionedIterator) {
if (elements.hasNext) {
for (elem <- elements) {
writer.write(elem._1, elem._2)
}
val segment = writer.commitAndGet()
lengths(id) = segment.length
}
}
}
writer.close()
context.taskMetrics().incMemoryBytesSpilled(memoryBytesSpilled)
context.taskMetrics().incDiskBytesSpilled(diskBytesSpilled)
context.taskMetrics().incPeakExecutionMemory(peakMemoryUsedBytes)
lengths
}
Each time a partition finishes it is flushed to disk; all partitions go into the same file. The two FileSegments for this run are:
(name=shuffle_0_0_0.data.c251762b-bcb7-4dae-ba3c-35caa5983119, offset=0, length=61)
(name=shuffle_0_0_0.data.c251762b-bcb7-4dae-ba3c-35caa5983119, offset=61, length=51)
def commitAndGet(): FileSegment = {
if (streamOpen) {
// NOTE: Because Kryo doesn't flush the underlying stream we explicitly flush both the
// serializer stream and the lower level stream.
objOut.flush()
bs.flush()
objOut.close()
streamOpen = false
if (syncWrites) {
// Force outstanding writes to disk and track how long it takes
val start = System.nanoTime()
fos.getFD.sync()
writeMetrics.incWriteTime(System.nanoTime() - start)
}
val pos = channel.position()
val fileSegment = new FileSegment(file, committedPosition, pos - committedPosition)
committedPosition = pos
// In certain compression codecs, more bytes are written after streams are closed
writeMetrics.incBytesWritten(committedPosition - reportedPosition)
reportedPosition = committedPosition
numRecordsWritten = 0
fileSegment
} else {
new FileSegment(file, committedPosition, 0)
}
}
28) After sorter.writePartitionedFile(blockId, tmp) completes, SortShuffleWriter.write moves on to the
shuffleBlockResolver.writeIndexFileAndCommit(dep.shuffleId, mapId, partitionLengths, tmp) step, which generates the index file:
override def write(records: Iterator[Product2[K, V]]): Unit = {
...
//blockId = shuffle_0_0_0
val blockId = ShuffleBlockId(dep.shuffleId, mapId, IndexShuffleBlockResolver.NOOP_REDUCE_ID)
//25) sorter.writePartitionedFile sorted all data and wrote it to disk, returning partitionLengths = [61, 51]
val partitionLengths = sorter.writePartitionedFile(blockId, tmp)
//generate the index file: shuffle_0_0_0.index
shuffleBlockResolver.writeIndexFileAndCommit(dep.shuffleId, mapId, partitionLengths, tmp)
//build the MapStatus, which contains the block location (ip, port); it is eventually returned to the driver
//BlockManagerId(driver, 172.25.126.56, 2931, None)
mapStatus = MapStatus(blockManager.shuffleServerId, partitionLengths) //memory and objects are cleaned up afterwards
} finally {
if (tmp.exists() && !tmp.delete()) {
logError(s"Error while deleting temp file ${tmp.getAbsolutePath}")
}
}
}
//Write the index file
def writeIndexFileAndCommit(
shuffleId: Int,
mapId: Int,
lengths: Array[Long],
dataTmp: File): Unit = {
val indexFile = getIndexFile(shuffleId, mapId)
val indexTmp = Utils.tempFileWith(indexFile)
try {
val dataFile = getDataFile(shuffleId, mapId)
// There is only one IndexShuffleBlockResolver per executor, this synchronization make sure
// the following check and rename are atomic.
synchronized {
val existingLengths = checkIndexAndDataFile(indexFile, dataFile, lengths.length)
if (existingLengths != null) {
// Another attempt for the same task has already written our map outputs successfully,
// so just use the existing partition lengths and delete our temporary map outputs.
System.arraycopy(existingLengths, 0, lengths, 0, lengths.length)
if (dataTmp != null && dataTmp.exists()) {
dataTmp.delete()
}
} else {
// This is the first successful attempt in writing the map outputs for this task,
// so override any existing index and data files with the ones we wrote.
val out = new DataOutputStream(new BufferedOutputStream(new FileOutputStream(indexTmp)))
Utils.tryWithSafeFinally {
// We take in lengths of each block, need to convert it to offsets.
var offset = 0L
out.writeLong(offset)
for (length <- lengths) {
offset += length
out.writeLong(offset)
}
} {
out.close()
}
if (indexFile.exists()) {
indexFile.delete()
}
if (dataFile.exists()) {
dataFile.delete()
}
if (!indexTmp.renameTo(indexFile)) {
throw new IOException("fail to rename file " + indexTmp + " to " + indexFile)
}
if (dataTmp != null && dataTmp.exists() && !dataTmp.renameTo(dataFile)) {
throw new IOException("fail to rename file " + dataTmp + " to " + dataFile)
}
}
}
} finally {
if (indexTmp.exists() && !indexTmp.delete()) {
logError(s"Failed to delete temporary index file at ${indexTmp.getAbsolutePath}")
}
}
}
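Given our partition lengths of [61, 51], the loop above writes the cumulative offsets 0, 61, 112 into shuffle_0_0_0.index; a reducer for partition i later reads the data-file bytes [offset(i), offset(i+1)). A minimal sketch of that conversion:
val lengths = Array(61L, 51L)              // partitionLengths from this run
val offsets = lengths.scanLeft(0L)(_ + _)  // Array(0, 61, 112) -- what the index file contains
// partition 0 -> data bytes [0, 61), partition 1 -> data bytes [61, 112)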
29) At this point SortShuffleWriter.write has finished and control returns to the runTask method from step 1);
writer.stop(success = true).get is then called and the MapStatus is returned:
override def runTask(context: TaskContext): MapStatus = {
// Deserialize the RDD using the broadcast variable.
val threadMXBean = ManagementFactory.getThreadMXBean
val deserializeStartTime = System.currentTimeMillis()
val deserializeStartCpuTime = if (threadMXBean.isCurrentThreadCpuTimeSupported) {
threadMXBean.getCurrentThreadCpuTime
} else 0L
val ser = SparkEnv.get.closureSerializer.newInstance()
//Deserialize MapPartitionsRDD[3] and the ShuffleDependency
val (rdd, dep) = ser.deserialize[(RDD[_], ShuffleDependency[_, _, _])](
ByteBuffer.wrap(taskBinary.value), Thread.currentThread.getContextClassLoader)
_executorDeserializeTime = System.currentTimeMillis() - deserializeStartTime
_executorDeserializeCpuTime = if (threadMXBean.isCurrentThreadCpuTimeSupported) {
threadMXBean.getCurrentThreadCpuTime - deserializeStartCpuTime
} else 0L
var writer: ShuffleWriter[Any, Any] = null
try {
//Get the ShuffleManager; it is chosen by the spark.shuffle.manager parameter and defaults to SortShuffleManager.
val manager = SparkEnv.get.shuffleManager
//Get the writer; the concrete type depends on dep.shuffleHandle, and here a SortShuffleWriter is chosen
writer = manager.getWriter[Any, Any](dep.shuffleHandle, partitionId, context)
//Before write runs, rdd.iterator is evaluated first
writer.write(rdd.iterator(partition, context).asInstanceOf[Iterator[_ <: Product2[Any, Any]]])
writer.stop(success = true).get
} catch {
case e: Exception =>
try {
if (writer != null) {
writer.stop(success = false)
}
} catch {
case e: Exception =>
log.debug("Could not stop writer", e)
}
throw e
}
}
Summary:
Records are stored as ((partitionId, key), value), sorted, and written to the shuffle data file (shuffle_0_0_0.data in this run), and a partition index file (shuffle_0_0_0.index) records each partition's read offset.
Finally the file location, BlockManagerId(driver, 172.25.126.56, 2931, None), is sent to the driver.
5. Shuffle read
Summary:
Each reducer reads the data at the specified offsets from the map output files, aggregates the data of its partition, and spills to disk if the aggregated data grows too large.
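A hypothetical, simplified sketch of how a reducer locates its slice of the map output using the offsets stored in shuffle_0_0_0.index (the real read path goes through BlockStoreShuffleReader and ShuffleBlockFetcherIterator, usually over the network):
import java.io.{File, RandomAccessFile}
def readPartitionBytes(indexFile: File, dataFile: File, reduceId: Int): Array[Byte] = {
  val index = new RandomAccessFile(indexFile, "r")
  val (start, end) =
    try {
      index.seek(reduceId * 8L)                 // offsets are Longs: [0, 61, 112] in our example
      (index.readLong(), index.readLong())
    } finally index.close()
  val data = new RandomAccessFile(dataFile, "r")
  try {
    data.seek(start)
    val bytes = new Array[Byte]((end - start).toInt)
    data.readFully(bytes)
    bytes                                       // still serialized records; deserialized and aggregated next
  } finally data.close()
}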
6. Overall summary:
Shuffle write:
Records are stored as ((partitionId, key), value), sorted, and written to the shuffle data file (shuffle_0_0_0.data in this run), and a partition index file (shuffle_0_0_0.index) records each partition's read offset.
Finally the file location, BlockManagerId(driver, 172.25.126.56, 2931, None), is sent to the driver.
Shuffle read: each reducer reads the data at the specified offsets from the map output files, aggregates the data of its partition, and spills to disk if the aggregated data grows too large.