Source Code Analysis: HDFS Replica Placement Policy
Key Classes
- DistributedFileSystem create()
- DataStreamer
- BlockManager chooseTarget4NewBlock()
- BlockPlacementPolicyDefault chooseTarget()
- BlockPlacementPolicyDefault chooseTargetInOrder()
Hadoop version: 3.4.0-SNAPSHOT
Analysis Points
- A FileSystem instance obtained through FileSystem.newInstance() offers no way to set a favored node list; you need the create() overload on DistributedFileSystem (see the sketch after this list).
- HDFS block placement splits into two cases:
2.1 The client has specified preferred DataNode addresses (favored nodes): DataNodes are picked from the favored list first, and any replicas still missing fall through to the logic of 2.2.
2.2 The client has specified no preferred DataNodes.
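To make case 2.1 concrete, here is a minimal client-side sketch of supplying favored nodes. The create() overload that accepts an InetSocketAddress[] of favored nodes is declared on DistributedFileSystem rather than on the generic FileSystem API, which is why the downcast is needed; the cluster address, host names, and ports below are placeholders.

// Client-side sketch: write a file with a favored-node hint (placeholder hosts)
import java.net.InetSocketAddress;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.permission.FsPermission;
import org.apache.hadoop.hdfs.DistributedFileSystem;

public class FavoredNodesExample {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    conf.set("fs.defaultFS", "hdfs://namenode:8020"); // placeholder address

    // The favored-node create() overload only exists on DistributedFileSystem.
    DistributedFileSystem dfs = (DistributedFileSystem) FileSystem.get(conf);

    // DataNodes the NameNode should consider first (placeholder hosts;
    // 9866 is the default DataNode data-transfer port in Hadoop 3).
    InetSocketAddress[] favoredNodes = {
        new InetSocketAddress("dn1.example.com", 9866),
        new InetSocketAddress("dn2.example.com", 9866)
    };

    try (FSDataOutputStream out = dfs.create(
        new Path("/tmp/favored-demo"),
        FsPermission.getFileDefault(),
        true,                 // overwrite
        4096,                 // buffer size
        (short) 3,            // replication
        128 * 1024 * 1024L,   // block size
        null,                 // progress callback
        favoredNodes)) {
      out.writeBytes("hello");
    }
  }
}

Note that favored nodes are hints, not guarantees: as the policy code below shows, each favored node must still pass the suitability checks before it is used.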
// DataStreamer's run() method
if (stage == BlockConstructionStage.PIPELINE_SETUP_CREATE) {
  LOG.debug("Allocating new block: {}", this);
  // Build the pipeline's DataNode list here
  setPipeline(nextBlockOutputStream());
  initDataStreaming();
}
// The nextBlockOutputStream() method
protected LocatedBlock nextBlockOutputStream() throws IOException {
  LocatedBlock lb;
  DatanodeInfo[] nodes;
  StorageType[] nextStorageTypes;
  String[] nextStorageIDs;
  int count = dfsClient.getConf().getNumBlockWriteRetry();
  boolean success;
  final ExtendedBlock oldBlock = block.getCurrentBlock();
  do {
    errorState.resetInternalError();
    lastException.clear();

    DatanodeInfo[] excluded = getExcludedNodes();
    // Ask the NameNode to allocate the next block and its target DataNodes
    lb = locateFollowingBlock(
        excluded.length > 0 ? excluded : null, oldBlock);
    block.setCurrentBlock(lb.getBlock());
    block.setNumBytes(0);
    bytesSent = 0;
    accessToken = lb.getBlockToken();
    nodes = lb.getLocations();
    nextStorageTypes = lb.getStorageTypes();
    nextStorageIDs = lb.getStorageIDs();

    // Connect to first DataNode in the list.
    // Set up the write pipeline
    success = createBlockOutputStream(nodes, nextStorageTypes, nextStorageIDs,
        0L, false);

    if (!success) {
      LOG.warn("Abandoning " + block);
      dfsClient.namenode.abandonBlock(block.getCurrentBlock(),
          stat.getFileId(), src, dfsClient.clientName);
      block.setCurrentBlock(null);
      final DatanodeInfo badNode = nodes[errorState.getBadNodeIndex()];
      LOG.warn("Excluding datanode " + badNode);
      excludedNodes.put(badNode, badNode);
    }
  } while (!success && --count >= 0);

  if (!success) {
    throw new IOException("Unable to create new block.");
  }
  return lb;
}
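Stripped of HDFS types, the loop above reduces to a retry-and-exclude pattern. The following is a hypothetical, self-contained distillation; the interfaces and names are invented for illustration and are not Hadoop classes.

import java.util.HashSet;
import java.util.Set;

// Distilled control flow of nextBlockOutputStream(): request a pipeline,
// try to open it, and on failure exclude the bad node and retry until the
// retry budget is exhausted.
public class AllocateRetrySketch {
  interface Allocator {
    String[] allocate(Set<String> excluded); // stands in for locateFollowingBlock()
  }
  interface Opener {
    int open(String[] nodes); // stands in for createBlockOutputStream();
                              // returns index of the bad node, or -1 on success
  }

  static String[] allocateWithRetry(Allocator allocator, Opener opener,
      int retries) {
    Set<String> excluded = new HashSet<>();
    for (int attempt = 0; attempt <= retries; attempt++) {
      String[] nodes = allocator.allocate(excluded);
      int badIndex = opener.open(nodes);
      if (badIndex < 0) {
        return nodes; // pipeline established
      }
      excluded.add(nodes[badIndex]); // mirrors excludedNodes.put(badNode, badNode)
    }
    throw new IllegalStateException("Unable to create new block.");
  }
}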
// DataStreamer's locateFollowingBlock(): note that the client's favoredNodes
// list is passed through to the NameNode here
private LocatedBlock locateFollowingBlock(DatanodeInfo[] excluded,
    ExtendedBlock oldBlock) throws IOException {
  return DFSOutputStream.addBlock(excluded, dfsClient, src, oldBlock,
      stat.getFileId(), favoredNodes, addBlockFlags);
}
// BlockPlacementPolicyDefault: suitable DataNodes are taken from the favored
// list first; any replicas still needed fall through to the generic logic in
// chooseTargetInOrder()
protected DatanodeStorageInfo chooseLocalOrFavoredStorage(
    Node localOrFavoredNode, boolean isFavoredNode, Set<Node> excludedNodes,
    long blocksize, int maxNodesPerRack, List<DatanodeStorageInfo> results,
    boolean avoidStaleNodes, EnumMap<StorageType, Integer> storageTypes)
    throws NotEnoughReplicasException {
  // if no local machine, randomly choose one node
  if (localOrFavoredNode == null) {
    return chooseRandom(NodeBase.ROOT, excludedNodes, blocksize,
        maxNodesPerRack, results, avoidStaleNodes, storageTypes);
  }
  if ((preferLocalNode || isFavoredNode)
      && localOrFavoredNode instanceof DatanodeDescriptor
      && clusterMap.contains(localOrFavoredNode)) {
    DatanodeDescriptor localDatanode =
        (DatanodeDescriptor) localOrFavoredNode;
    // otherwise try local machine first
    if (excludedNodes.add(localOrFavoredNode) // was not in the excluded list
        && isGoodDatanode(localDatanode, maxNodesPerRack, false,
            results, avoidStaleNodes)) {
      for (Iterator<Map.Entry<StorageType, Integer>> iter = storageTypes
          .entrySet().iterator(); iter.hasNext(); ) {
        Map.Entry<StorageType, Integer> entry = iter.next();
        DatanodeStorageInfo localStorage = chooseStorage4Block(
            localDatanode, blocksize, results, entry.getKey());
        if (localStorage != null) {
          // add node and related nodes to excludedNode
          addToExcludedNodes(localDatanode, excludedNodes);
          int num = entry.getValue();
          if (num == 1) {
            iter.remove();
          } else {
            entry.setValue(num - 1);
          }
          return localStorage;
        }
      }
    }
  }
  return null;
}
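Before looking at the generic path, here is roughly how the favored list is consumed by the caller of chooseLocalOrFavoredStorage(). This is a hypothetical paraphrase with simplified types (plain strings instead of DatanodeDescriptor), not Hadoop's own code: each favored node is vetted individually, and whatever is still missing is handed to the regular selection path.

import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

// Hypothetical two-phase selection: favored nodes first, generic path second.
public class FavoredSelectionSketch {
  interface Vetter {
    boolean isGood(String node); // stands in for isGoodDatanode() + chooseStorage4Block()
  }

  static List<String> choose(List<String> favoredNodes, List<String> allNodes,
      int replication, Vetter vetter) {
    List<String> results = new ArrayList<>();
    Set<String> excluded = new HashSet<>();
    // Phase 1: take as many vetted favored nodes as possible.
    for (String node : favoredNodes) {
      if (results.size() == replication) {
        break;
      }
      if (excluded.add(node) && vetter.isGood(node)) {
        results.add(node);
      }
    }
    // Phase 2: fill the remainder from the rest of the cluster
    // (in real HDFS this is the chooseTargetInOrder() path shown below).
    for (String node : allNodes) {
      if (results.size() == replication) {
        break;
      }
      if (excluded.add(node) && vetter.isGood(node)) {
        results.add(node);
      }
    }
    return results;
  }
}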
// After several layers of calls this bottoms out in
// BlockPlacementPolicyDefault's chooseTargetInOrder() method
protected Node chooseTargetInOrder(int numOfReplicas,
    Node writer,
    final Set<Node> excludedNodes,
    final long blocksize,
    final int maxNodesPerRack,
    final List<DatanodeStorageInfo> results,
    final boolean avoidStaleNodes,
    final boolean newBlock,
    EnumMap<StorageType, Integer> storageTypes)
    throws NotEnoughReplicasException {
  final int numOfResults = results.size();
  if (numOfResults == 0) {
    // 1. First replica: prefer the local node. If the client is not itself a
    // DataNode, a node is picked at random; otherwise the client's own node
    // becomes the first placement.
    DatanodeStorageInfo storageInfo = chooseLocalStorage(writer,
        excludedNodes, blocksize, maxNodesPerRack, results, avoidStaleNodes,
        storageTypes, true);

    writer = (storageInfo != null) ? storageInfo.getDatanodeDescriptor()
        : null;

    if (--numOfReplicas == 0) {
      return writer;
    }
  }
  final DatanodeDescriptor dn0 = results.get(0).getDatanodeDescriptor();
  // 2. Second replica: pick a node on a rack different from the first
  // placement; that node is also chosen at random.
  if (numOfResults <= 1) {
    chooseRemoteRack(1, dn0, excludedNodes, blocksize, maxNodesPerRack,
        results, avoidStaleNodes, storageTypes);
    if (--numOfReplicas == 0) {
      return writer;
    }
  }
  // 3. Third replica: pick a node on the same rack as the second placement.
  if (numOfResults <= 2) {
    final DatanodeDescriptor dn1 = results.get(1).getDatanodeDescriptor();
    if (clusterMap.isOnSameRack(dn0, dn1)) {
      chooseRemoteRack(1, dn0, excludedNodes, blocksize, maxNodesPerRack,
          results, avoidStaleNodes, storageTypes);
    } else if (newBlock) {
      chooseLocalRack(dn1, excludedNodes, blocksize, maxNodesPerRack,
          results, avoidStaleNodes, storageTypes);
    } else {
      chooseLocalRack(writer, excludedNodes, blocksize, maxNodesPerRack,
          results, avoidStaleNodes, storageTypes);
    }
    if (--numOfReplicas == 0) {
      return writer;
    }
  }
  // 4. Remaining replicas: the policy only special-cases the first three
  // placements; any further replicas are placed on random nodes.
  chooseRandom(numOfReplicas, NodeBase.ROOT, excludedNodes, blocksize,
      maxNodesPerRack, results, avoidStaleNodes, storageTypes);
  return writer;
}
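As a worked example of the steps above, assume a hypothetical topology where the writer runs on DataNode d1 in rack /r1 and the replication factor is 3. Step 1 places the first replica on d1 itself (the local node); step 2 places the second replica on a random node d2 in a different rack, say /r2; step 3 places the third replica on another node d3 in /r2, the same rack as d2. If a fourth replica were requested, it would land on a random node anywhere in the cluster. The net result is the classic two-racks-three-replicas layout: it survives the loss of an entire rack while keeping two of the three replicas rack-local to each other.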
Summary
HDFS's placement policy allows the client to influence placement: the client can supply DataNode addresses for the NameNode to consider first. Supplying this list is no guarantee of selection, though; whether a favored node is actually used still depends on its own condition (it must pass the isGoodDatanode()-style suitability checks).
If no favored DataNode list is provided, placements are chosen by the following procedure:
- First replica: the local node. If the client is not itself a DataNode, a node is picked at random; otherwise the client's own node becomes the first placement.
- Second replica: a node on a rack different from the first placement, also chosen at random.
- Third replica: a node on the same rack as the second placement.
- Remaining replicas: only the first three placements are special-cased; any further replicas are placed on random nodes.
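Finally, the policy analyzed here is itself pluggable: the NameNode instantiates whatever BlockPlacementPolicy subclass the dfs.block.replicator.classname key names, with BlockPlacementPolicyDefault as the default. Below is a minimal sketch of setting the key programmatically; in practice it normally lives in the NameNode's hdfs-site.xml.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hdfs.server.blockmanagement.BlockPlacementPolicy;
import org.apache.hadoop.hdfs.server.blockmanagement.BlockPlacementPolicyDefault;

public class PlacementPolicyConfig {
  public static void main(String[] args) {
    Configuration conf = new Configuration();
    // Explicitly selecting the default policy; a custom subclass of
    // BlockPlacementPolicy could be named here instead.
    conf.setClass("dfs.block.replicator.classname",
        BlockPlacementPolicyDefault.class, BlockPlacementPolicy.class);
    System.out.println(conf.get("dfs.block.replicator.classname"));
  }
}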