Source Code Analysis: HDFS Replica Placement Policy
Key Classes
- DistributedFileSystem create()
- DataStreamer
- BlockManager chooseTarget4NewBlock()
- BlockPlacementPolicyDefault chooseTarget()
- BlockPlacementPolicyDefault chooseTargetInOrder()
Hadoop version: 3.4.0-SNAPSHOT
Analysis Points
- A FileSystem instance obtained through FileSystem.newInstance() offers no way to set a favored node list; you need the create() overload on DistributedFileSystem (see the sketch after this list).
- HDFS block placement splits into two cases:
2.1 The client has specified preferred DataNode addresses (favored nodes): DataNodes are picked from the favored list first, and any replicas still missing fall through to the logic of 2.2.
2.2 The client has specified no preferred DataNodes.
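To make case 2.1 concrete, here is a minimal client-side sketch of supplying favored nodes. The create() overload that accepts an InetSocketAddress[] of favored nodes is declared on DistributedFileSystem rather than on the generic FileSystem API, which is why the downcast is needed; the cluster address, host names, and ports below are placeholders.

// Client-side sketch: write a file with a favored-node hint (placeholder hosts)
import java.net.InetSocketAddress;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.permission.FsPermission;
import org.apache.hadoop.hdfs.DistributedFileSystem;

public class FavoredNodesExample {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    conf.set("fs.defaultFS", "hdfs://namenode:8020"); // placeholder address

    // The favored-node create() overload only exists on DistributedFileSystem.
    DistributedFileSystem dfs = (DistributedFileSystem) FileSystem.get(conf);

    // DataNodes the NameNode should consider first (placeholder hosts;
    // 9866 is the default DataNode data-transfer port in Hadoop 3).
    InetSocketAddress[] favoredNodes = {
        new InetSocketAddress("dn1.example.com", 9866),
        new InetSocketAddress("dn2.example.com", 9866)
    };

    try (FSDataOutputStream out = dfs.create(
        new Path("/tmp/favored-demo"),
        FsPermission.getFileDefault(),
        true,                 // overwrite
        4096,                 // buffer size
        (short) 3,            // replication
        128 * 1024 * 1024L,   // block size
        null,                 // progress callback
        favoredNodes)) {
      out.writeBytes("hello");
    }
  }
}

Note that favored nodes are hints, not guarantees: as the policy code below shows, each favored node must still pass the suitability checks before it is used.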
// DataStreamer's run() method
if (stage == BlockConstructionStage.PIPELINE_SETUP_CREATE) {
  LOG.debug("Allocating new block: {}", this);
  // Build the pipeline's DataNode list here
  setPipeline(nextBlockOutputStream());
  initDataStreaming();
}
// The nextBlockOutputStream() method
protected LocatedBlock nextBlockOutputStream() throws IOException {
  LocatedBlock lb;
  DatanodeInfo[] nodes;
  StorageType[] nextStorageTypes;
  String[] nextStorageIDs;
  int count = dfsClient.getConf().getNumBlockWriteRetry();
  boolean success;
  final ExtendedBlock oldBlock = block.getCurrentBlock();
  do {
    errorState.resetInternalError();
    lastException.clear();

    DatanodeInfo[] excluded = getExcludedNodes();
    // Ask the NameNode to allocate the next block and its target DataNodes
    lb = locateFollowingBlock(
        excluded.length > 0 ? excluded : null, oldBlock);
    block.setCurrentBlock(lb.getBlock());
    block.setNumBytes(0);
    bytesSent = 0;
    accessToken = lb.getBlockToken();
    nodes = lb.getLocations();
    nextStorageTypes = lb.getStorageTypes();
    nextStorageIDs = lb.getStorageIDs();

    // Connect to first DataNode in the list.
    // Set up the write pipeline
    success = createBlockOutputStream(nodes, nextStorageTypes, nextStorageIDs,
        0L, false);

    if (!success) {
      LOG.warn("Abandoning " + block);
      dfsClient.namenode.abandonBlock(block.getCurrentBlock(),
          stat.getFileId(), src, dfsClient.clientName);
      block.setCurrentBlock(null);
      final DatanodeInfo badNode = nodes[errorState.getBadNodeIndex()];
      LOG.warn("Excluding datanode " + badNode);
      excludedNodes.put(badNode, badNode);
    }
  } while (!success && --count >= 0);

  if (!success) {
    throw new IOException("Unable to create new block.");
  }
  return lb;
}
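Stripped of HDFS types, the loop above reduces to a retry-and-exclude pattern. The following is a hypothetical, self-contained distillation; the interfaces and names are invented for illustration and are not Hadoop classes.

import java.util.HashSet;
import java.util.Set;

// Distilled control flow of nextBlockOutputStream(): request a pipeline,
// try to open it, and on failure exclude the bad node and retry until the
// retry budget is exhausted.
public class AllocateRetrySketch {
  interface Allocator {
    String[] allocate(Set<String> excluded); // stands in for locateFollowingBlock()
  }
  interface Opener {
    int open(String[] nodes); // stands in for createBlockOutputStream();
                              // returns index of the bad node, or -1 on success
  }

  static String[] allocateWithRetry(Allocator allocator, Opener opener,
      int retries) {
    Set<String> excluded = new HashSet<>();
    for (int attempt = 0; attempt <= retries; attempt++) {
      String[] nodes = allocator.allocate(excluded);
      int badIndex = opener.open(nodes);
      if (badIndex < 0) {
        return nodes; // pipeline established
      }
      excluded.add(nodes[badIndex]); // mirrors excludedNodes.put(badNode, badNode)
    }
    throw new IllegalStateException("Unable to create new block.");
  }
}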
// DataStreamer's locateFollowingBlock(): note that the client's favoredNodes
// list is passed through to the NameNode here
private LocatedBlock locateFollowingBlock(DatanodeInfo[] excluded,
    ExtendedBlock oldBlock) throws IOException {
  return DFSOutputStream.addBlock(excluded, dfsClient, src, oldBlock,
      stat.getFileId(), favoredNodes, addBlockFlags);
}
// BlockPlacementPolicyDefault: suitable DataNodes are taken from the favored
// list first; any replicas still needed fall through to the generic logic in
// chooseTargetInOrder()
protected DatanodeStorageInfo chooseLocalOrFavoredStorage(
    Node localOrFavoredNode, boolean isFavoredNode, Set<Node> excludedNodes,
    long blocksize, int maxNodesPerRack, List<DatanodeStorageInfo> results,
    boolean avoidStaleNodes, EnumMap<StorageType, Integer> storageTypes)
    throws NotEnoughReplicasException {
  // if no local machine, randomly choose one node
  if (localOrFavoredNode == null) {
    return chooseRandom(NodeBase.ROOT, excludedNodes, blocksize,
        maxNodesPerRack, results, avoidStaleNodes, storageTypes);
  }
  if ((preferLocalNode || isFavoredNode)
      && localOrFavoredNode instanceof DatanodeDescriptor
      && clusterMap.contains(localOrFavoredNode)) {
    DatanodeDescriptor localDatanode =
        (DatanodeDescriptor) localOrFavoredNode;
    // otherwise try local machine first
    if (excludedNodes.add(localOrFavoredNode) // was not in the excluded list
        && isGoodDatanode(localDatanode, maxNodesPerRack, false,
            results, avoidStaleNodes)) {
      for (Iterator<Map.Entry<StorageType, Integer>> iter = storageTypes
          .entrySet().iterator(); iter.hasNext(); ) {
        Map.Entry<StorageType, Integer> entry = iter.next();
        DatanodeStorageInfo localStorage = chooseStorage4Block(
            localDatanode, blocksize, results, entry.getKey());
        if (localStorage != null) {
          // add node and related nodes to excludedNode
          addToExcludedNodes(localDatanode, excludedNodes);
          int num = entry.getValue();
          if (num == 1) {
            iter.remove();
          } else {
            entry.setValue(num - 1);
          }
          return localStorage;
        }
      }
    }
  }
  return null;
}
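Before looking at the generic path, here is roughly how the favored list is consumed by the caller of chooseLocalOrFavoredStorage(). This is a hypothetical paraphrase with simplified types (plain strings instead of DatanodeDescriptor), not Hadoop's own code: each favored node is vetted individually, and whatever is still missing is handed to the regular selection path.

import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

// Hypothetical two-phase selection: favored nodes first, generic path second.
public class FavoredSelectionSketch {
  interface Vetter {
    boolean isGood(String node); // stands in for isGoodDatanode() + chooseStorage4Block()
  }

  static List<String> choose(List<String> favoredNodes, List<String> allNodes,
      int replication, Vetter vetter) {
    List<String> results = new ArrayList<>();
    Set<String> excluded = new HashSet<>();
    // Phase 1: take as many vetted favored nodes as possible.
    for (String node : favoredNodes) {
      if (results.size() == replication) {
        break;
      }
      if (excluded.add(node) && vetter.isGood(node)) {
        results.add(node);
      }
    }
    // Phase 2: fill the remainder from the rest of the cluster
    // (in real HDFS this is the chooseTargetInOrder() path shown below).
    for (String node : allNodes) {
      if (results.size() == replication) {
        break;
      }
      if (excluded.add(node) && vetter.isGood(node)) {
        results.add(node);
      }
    }
    return results;
  }
}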
// After several layers of calls this bottoms out in
// BlockPlacementPolicyDefault's chooseTargetInOrder() method
protected Node chooseTargetInOrder(int numOfReplicas,
    Node writer,
    final Set<Node> excludedNodes,
    final long blocksize,
    final int maxNodesPerRack,
    final List<DatanodeStorageInfo> results,
    final boolean avoidStaleNodes,
    final boolean newBlock,
    EnumMap<StorageType, Integer> storageTypes)
    throws NotEnoughReplicasException {
  final int numOfResults = results.size();
  if (numOfResults == 0) {
    // 1. First replica: prefer the local node. If the client is not itself a
    // DataNode, a node is picked at random; otherwise the client's own node
    // becomes the first placement.
    DatanodeStorageInfo storageInfo = chooseLocalStorage(writer,
        excludedNodes, blocksize, maxNodesPerRack, results, avoidStaleNodes,
        storageTypes, true);

    writer = (storageInfo != null) ? storageInfo.getDatanodeDescriptor()
        : null;

    if (--numOfReplicas == 0) {
      return writer;
    }
  }
  final DatanodeDescriptor dn0 = results.get(0).getDatanodeDescriptor();
  // 2. Second replica: pick a node on a rack different from the first
  // placement; that node is also chosen at random.
  if (numOfResults <= 1) {
    chooseRemoteRack(1, dn0, excludedNodes, blocksize, maxNodesPerRack,
        results, avoidStaleNodes, storageTypes);
    if (--numOfReplicas == 0) {
      return writer;
    }
  }
  // 3. Third replica: pick a node on the same rack as the second placement.
  if (numOfResults <= 2) {
    final DatanodeDescriptor dn1 = results.get(1).getDatanodeDescriptor();
    if (clusterMap.isOnSameRack(dn0, dn1)) {
      chooseRemoteRack(1, dn0, excludedNodes, blocksize, maxNodesPerRack,
          results, avoidStaleNodes, storageTypes);
    } else if (newBlock) {
      chooseLocalRack(dn1, excludedNodes, blocksize, maxNodesPerRack,
          results, avoidStaleNodes, storageTypes);
    } else {
      chooseLocalRack(writer, excludedNodes, blocksize, maxNodesPerRack,
          results, avoidStaleNodes, storageTypes);
    }
    if (--numOfReplicas == 0) {
      return writer;
    }
  }
  // 4. Remaining replicas: the policy only special-cases the first three
  // placements; any further replicas are placed on random nodes.
  chooseRandom(numOfReplicas, NodeBase.ROOT, excludedNodes, blocksize,
      maxNodesPerRack, results, avoidStaleNodes, storageTypes);
  return writer;
}
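As a worked example of the steps above, assume a hypothetical topology where the writer runs on DataNode d1 in rack /r1 and the replication factor is 3. Step 1 places the first replica on d1 itself (the local node); step 2 places the second replica on a random node d2 in a different rack, say /r2; step 3 places the third replica on another node d3 in /r2, the same rack as d2. If a fourth replica were requested, it would land on a random node anywhere in the cluster. The net result is the classic two-racks-three-replicas layout: it survives the loss of an entire rack while keeping two of the three replicas rack-local to each other.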
Summary
HDFS's placement policy allows the client to influence placement: the client can supply DataNode addresses for the NameNode to consider first. Supplying this list is no guarantee of selection, though; whether a favored node is actually used still depends on its own condition (it must pass the isGoodDatanode()-style suitability checks).
If no favored DataNode list is provided, placements are chosen by the following procedure:
- First replica: the local node. If the client is not itself a DataNode, a node is picked at random; otherwise the client's own node becomes the first placement.
- Second replica: a node on a rack different from the first placement, also chosen at random.
- Third replica: a node on the same rack as the second placement.
- Remaining replicas: only the first three placements are special-cased; any further replicas are placed on random nodes.
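Finally, the policy analyzed here is itself pluggable: the NameNode instantiates whatever BlockPlacementPolicy subclass the dfs.block.replicator.classname key names, with BlockPlacementPolicyDefault as the default. Below is a minimal sketch of setting the key programmatically; in practice it normally lives in the NameNode's hdfs-site.xml.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hdfs.server.blockmanagement.BlockPlacementPolicy;
import org.apache.hadoop.hdfs.server.blockmanagement.BlockPlacementPolicyDefault;

public class PlacementPolicyConfig {
  public static void main(String[] args) {
    Configuration conf = new Configuration();
    // Explicitly selecting the default policy; a custom subclass of
    // BlockPlacementPolicy could be named here instead.
    conf.setClass("dfs.block.replicator.classname",
        BlockPlacementPolicyDefault.class, BlockPlacementPolicy.class);
    System.out.println(conf.get("dfs.block.replicator.classname"));
  }
}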