HDFS源码之DataNode 启动流程
1. start-dfs.sh
"$HADOOP_PREFIX/sbin/hadoop-daemons.sh"
--config "$HADOOP_CONF_DIR" \
--hostnames "$NAMENODES" \
--script "$bin/hdfs" start namenode $nameStartOpt
2. hadoop-daemons.sh
"$bin/slaves.sh" --config $HADOOP_CONF_DIR cd "$HADOOP_PREFIX" \; "$bin/hadoop-daemon.sh" --config $HADOOP_CONF_DIR "$@"
3. hadoop-daemon.sh
case $command in
namenode|secondarynamenode|datanode|journalnode|dfs|dfsadmin|fsck|balancer|zkfc)
if [ -z "$HADOOP_HDFS_HOME" ]; then
hdfsScript="$HADOOP_PREFIX"/bin/hdfs
else
hdfsScript="$HADOOP_HDFS_HOME"/bin/hdfs
fi
nohup nice -n $HADOOP_NICENESS $hdfsScript --config $HADOOP_CONF_DIR $command "$@" > "$log" 2>&1 < /dev/null &
;;
(*)
nohup nice -n $HADOOP_NICENESS $hadoopScript --config $HADOOP_CONF_DIR $command "$@" > "$log" 2>&1 < /dev/null &
;;
4. hdfs
CLASS='org.apache.hadoop.hdfs.server.datanode.DataNode'
HADOOP_OPTS="$HADOOP_OPTS $HADOOP_DATANODE_OPTS"
5. DataNode
//Entry point: main() is invoked first when the DataNode process starts
public static void main(String args[]) {
if (DFSUtil.parseHelpArgument(args, DataNode.USAGE, System.out, true)) {
System.exit(0);
}
//Delegate to secureMain, which builds the DataNode and waits on it
secureMain(args, null);
}
//org.apache.hadoop.hdfs.server.datanode.DataNode#secureMain
public static void secureMain(String args[], SecureResources resources) {
int errorCode = 0;
try {
StringUtils.startupShutdownMessage(DataNode.class, args, LOG);
//构建DataNode对象
DataNode datanode = createDataNode(args, null, resources);
if (datanode != null) {
// 与namenode一样,将其加入守护线程
datanode.join();
} else {
errorCode = 1;
//.....
}
}
6. createDataNode
//org.apache.hadoop.hdfs.server.datanode.DataNode#createDataNode()
//Creates the DataNode instance and then starts its daemon services.
public static DataNode createDataNode(String args[], Configuration conf,
SecureResources resources) throws IOException {
//Instantiate the DataNode object from args/config
DataNode dn = instantiateDataNode(args, conf, resources);
if (dn != null) {
//Start the daemon services (see runDatanodeDaemon: block-pool actors,
//data transfer servers, IPC server)
dn.runDatanodeDaemon();
}
return dn;
}
6.1 instantiateDataNode
//org.apache.hadoop.hdfs.server.datanode.DataNode#instantiateDataNode
//Builds a DataNode from configuration, creating a fresh HdfsConfiguration
//when none is supplied.
public static DataNode instantiateDataNode(String args [], Configuration conf,
SecureResources resources) throws IOException {
if (conf == null)
conf = new HdfsConfiguration();
//......
//Read the storage directories from dfs.datanode.data.dir in the configuration
Collection<StorageLocation> dataLocations = getStorageLocations(conf);
return makeInstance(dataLocations, conf, resources);
}
6.2 makeInstance
//org.apache.hadoop.hdfs.server.datanode.DataNode#makeInstance
/**
* Create a DataNode instance after making sure at least one usable data
* directory exists.
* @param dataDirs List of directories, where the new DataNode instance should
* keep its files.
* @param conf Configuration instance to use.
* @param resources Secure resources needed to run under Kerberos
* @return DataNode instance for given list of data dirs and conf, or null if
* no directory from this directory list can be created.
* @throws IOException
*/
static DataNode makeInstance(Collection<StorageLocation> dataDirs,
Configuration conf, SecureResources resources) throws IOException {
//Get the local file system
LocalFileSystem localFS = FileSystem.getLocal(conf);
//Build the permission object used when checking the data directories
FsPermission permission = new FsPermission(
conf.get(DFS_DATANODE_DATA_DIR_PERMISSION_KEY,
DFS_DATANODE_DATA_DIR_PERMISSION_DEFAULT));
DataNodeDiskChecker dataNodeDiskChecker =
new DataNodeDiskChecker(permission);
//Check each configured directory: verify permissions and that it is readable
List<StorageLocation> locations =
checkStorageLocations(dataDirs, localFS, dataNodeDiskChecker);
DefaultMetricsSystem.initialize("DataNode");
//Fail (assert) if no directory passed the checks
assert locations.size() > 0 : "number of data directories should be > 0";
return new DataNode(conf, locations, resources);
}
6.1.1 DataNode 构造方法
//org.apache.hadoop.hdfs.server.datanode.DataNode#DataNode
startDataNode(conf, dataDirs, resources); //大概在428行
//org.apache.hadoop.hdfs.server.datanode.DataNode#startDataNode
//Core initialization: storage, xceiver/info/IPC servers, JVM pause monitor,
//and the BlockPoolManager that connects this DataNode to its namenodes.
void startDataNode(Configuration conf,
List<StorageLocation> dataDirs,
SecureResources resources
) throws IOException {
//.....
//Data storage information file.
storage = new DataStorage();
// global DN settings
registerMXBean();
initDataXceiver(conf);
startInfoServer(conf);
//Monitor for long JVM pauses (e.g. GC stalls)
pauseMonitor = new JvmPauseMonitor(conf);
pauseMonitor.start();
// BlockPoolTokenSecretManager is required to create ipc server.
this.blockPoolTokenSecretManager = new BlockPoolTokenSecretManager();
//....
// Initialize the IpcServer (RPC); it is started later in
// DataNode#runDatanodeDaemon()
initIpcServer(conf);
//...
// Initialize the two-level nameservice -> namenode structure
blockPoolManager = new BlockPoolManager(this);
//Key step: build/refresh BPOfferServices from the configured namenodes
blockPoolManager.refreshNamenodes(conf);
//....
}
6.1.1.1 doRefreshNamenodes
//org.apache.hadoop.hdfs.server.datanode.BlockPoolManager#doRefreshNamenodes
//Reconciles the tracked nameservices with the configured address map:
//refresh existing entries, add new ones, compute removals, then start all.
private void doRefreshNamenodes(
Map<String, Map<String, InetSocketAddress>> addrMap) throws IOException {
assert Thread.holdsLock(refreshNamenodesLock);
Set<String> toRefresh = Sets.newLinkedHashSet();
Set<String> toAdd = Sets.newLinkedHashSet();
Set<String> toRemove;
synchronized (this) {
// Step 1. For each configured nameservice, decide whether we already
// track it (refresh) or it is new (add)
for (String nameserviceId : addrMap.keySet()) {
if (bpByNameserviceId.containsKey(nameserviceId)) {
toRefresh.add(nameserviceId);
} else {
toAdd.add(nameserviceId);
}
}
// Step 2. Any nameservice we currently track that is no longer
// configured must be removed
toRemove = Sets.newHashSet(Sets.difference(
bpByNameserviceId.keySet(), addrMap.keySet()));
assert toRefresh.size() + toAdd.size() ==
addrMap.size() :
"toAdd: " + Joiner.on(",").useForNull("<default>").join(toAdd) +
" toRemove: " + Joiner.on(",").useForNull("<default>").join(toRemove) +
" toRefresh: " + Joiner.on(",").useForNull("<default>").join(toRefresh);
// Step 3. Create a BPOfferService for each new namespace (including one
// BPServiceActor per namenode)
if (!toAdd.isEmpty()) {
LOG.info("Starting BPOfferServices for nameservices: " +
Joiner.on(",").useForNull("<default>").join(toAdd));
for (String nsToAdd : toAdd) {
ArrayList<InetSocketAddress> addrs =
Lists.newArrayList(addrMap.get(nsToAdd).values());
//BPOfferService: one block-pool instance on this datanode; handles
//heartbeats with the active namenode
BPOfferService bpos = createBPOS(addrs);
bpByNameserviceId.put(nsToAdd, bpos);
offerServices.add(bpos);
}
}
//Start every BPOfferService (and thus every BPServiceActor thread)
startAll();
}
6.1.1.2 createBPOS
//org.apache.hadoop.hdfs.server.datanode.BlockPoolManager#createBPOS
//Factory method: builds a BPOfferService for the given namenode addresses
protected BPOfferService createBPOS(List<InetSocketAddress> nnAddrs) {
return new BPOfferService(nnAddrs, dn);
}
//org.apache.hadoop.hdfs.server.datanode.BPOfferService#BPOfferService
//Constructor: creates one BPServiceActor per namenode address, so this
//BPOfferService tracks every namenode of one namespace via bpServices.
BPOfferService(List<InetSocketAddress> nnAddrs, DataNode dn) {
Preconditions.checkArgument(!nnAddrs.isEmpty(),
"Must pass at least one NN.");
this.dn = dn;
for (InetSocketAddress addr : nnAddrs) {
//Each namenode address is bound to its own BPServiceActor;
//bpServices holds the actors for all namenodes of this namespace.
this.bpServices.add(new BPServiceActor(addr, this));
//About BPServiceActor: a thread per active or standby namenode to perform:
//Pre-registration handshake with namenode
//Registration with namenode
//Send periodic heartbeats to the namenode
//Handle commands received from the namenode
}
}
6.1.1.3 startAll
//org.apache.hadoop.hdfs.server.datanode.BlockPoolManager#startAll
//(excerpt) start every registered BPOfferService
for (BPOfferService bpos : offerServices) {
bpos.start();
}
//org.apache.hadoop.hdfs.server.datanode.BPServiceActor#start
//Starts the actor's daemon thread unless it is already running
void start() {
if ((bpThread != null) && (bpThread.isAlive())) {
//Thread is started already
return;
}
bpThread = new Thread(this, formatThreadName());
bpThread.setDaemon(true); // needed for JUnit testing
bpThread.start();
}
//直接看run()
6.1.2.1 BPServiceActor
//org.apache.hadoop.hdfs.server.datanode.BPServiceActor#run
//Thread body: first handshake with the namenode (retrying on IOException),
//then loop offering service (heartbeats, block reports, command handling).
@Override
public void run() {
LOG.info(this + " starting to offer service");
try {
while (true) {
// init stuff
try {
//Handshake with the namenode
connectToNNAndHandshake();
break;
} catch (IOException ioe) {
//Most handshake failures are retried, unless a non-IOException was
//thrown or the datanode is shutting down
runningState = RunningState.INIT_FAILED;
//......
}
}
}
while (shouldRun()) {
try {
// The services BPServiceActor provides
offerService();
} catch (Exception ex) {
//Keep offering service (heartbeats, block reports, ...) regardless of
//the exception thrown, until the datanode shuts down
LOG.error("Exception in BPOfferService for " + this, ex);
sleepAndLogInterrupts(5000, "offering service");
}
}
//.......
}
}
//主要看connectToNNAndHandshake
6.1.2.2 connectToNNAndHandshake
//org.apache.hadoop.hdfs.server.datanode.BPServiceActor#connectToNNAndHandshake
//Two-phase handshake: fetch namespace info, verify/init the block pool,
//then register with the namenode.
private void connectToNNAndHandshake() throws IOException {
// get NN proxy
bpNamenode = dn.connectToNN(nnAddr);
// First phase of the handshake with NN - get the namespace info.
NamespaceInfo nsInfo = retrieveNamespaceInfo();
// Verify the namespace info and initialize this datanode's BlockPool
bpos.verifyAndSetNamespaceInfo(nsInfo);
// Second phase of the handshake with the NN.
register(nsInfo);
}
//主要看verifyAndSetNamespaceInfo
6.1.2.3 verifyAndSetNamespaceInfo
//org.apache.hadoop.hdfs.server.datanode.BPOfferService#verifyAndSetNamespaceInfo
//On first contact with the namespace, records nsInfo and initializes the block
//pool; on later contacts, verifies blockpool/namespace/cluster IDs match.
void verifyAndSetNamespaceInfo(NamespaceInfo nsInfo) throws IOException {
writeLock();
try {
if (this.bpNSInfo == null) {
this.bpNSInfo = nsInfo;
boolean success = false;
// First connection to a namenode of this namespace: initialize the
// blockpool
try {
// Initialize the blockpool, one per BPOfferService
dn.initBlockPool(this);
success = true;
} finally {
if (!success) {
// If one BPServiceActor thread failed, another BPServiceActor of
// the same BPOfferService can retry (bpNSInfo is cleared)
this.bpNSInfo = null;
}
}
} else {
checkNSEquality(bpNSInfo.getBlockPoolID(), nsInfo.getBlockPoolID(),
"Blockpool ID");
checkNSEquality(bpNSInfo.getNamespaceID(), nsInfo.getNamespaceID(),
"Namespace ID");
checkNSEquality(bpNSInfo.getClusterID(), nsInfo.getClusterID(),
"Cluster ID");
}
} finally {
writeUnlock();
}
}
//我们来看一下initBlockPool
//org.apache.hadoop.hdfs.server.datanode.DataNode#initBlockPool
6.1.2.4 initBlockPool
/**
* One Block Pool has successfully connected to its Namenode; the block pool
* must now be initialized in local storage, and the cluster ID checked for
* consistency.
* If this is the first block pool to register, the datanode-level storage is
* initialized as well.
*
* @param bpos Block pool offer service
* @throws IOException if the NN is inconsistent with the local storage.
*/
void initBlockPool(BPOfferService bpos) throws IOException {
//Get the namespace info obtained during the handshake
NamespaceInfo nsInfo = bpos.getNamespaceInfo();
//...
setClusterId(nsInfo.clusterID, nsInfo.getBlockPoolID());
// Register the new block pool with the BP manager.
blockPoolManager.addBlockPool(bpos);
// In the case that this is the first block pool to connect, initialize
// the dataset, block scanners, etc.
initStorage(nsInfo);
//Exclude failed disks before initializing the block pool, to avoid a
//startup failure
checkDiskError();
//Add the blockpool to FsDatasetImpl and continue initializing its storage
data.addBlockPool(nsInfo.getBlockPoolID(), conf);
blockScanner.enableBlockPoolId(bpos.getBlockPoolId());
initDirectoryScanner(conf);
}
6.1.2.5 initStorage
//org.apache.hadoop.hdfs.server.datanode.DataNode#initStorage
//Initializes DataStorage and the FsDataset implementation (the datanode holds
//exactly one of each); both steps are guarded by the DataNode lock.
private void initStorage(final NamespaceInfo nsInfo) throws IOException {
final FsDatasetSpi.Factory<? extends FsDatasetSpi<?>> factory
= FsDatasetSpi.Factory.getFactory(conf);
if (!factory.isSimulated()) {
...
// Build the arguments
// Initialize DataStorage (each datanode holds exactly one). This may
// trigger a DataStorage-level state transition, so lock the DataNode.
synchronized (this) {
//read storage info, lock data dirs and transition fs state if necessary
storage.recoverTransitionRead(this, bpid, nsInfo, dataDirs, startOpt);
}
final StorageInfo bpStorage = storage.getBPStorage(bpid);
LOG.info("Setting up storage: nsid=" + bpStorage.getNamespaceID()
+ ";bpid=" + bpid + ";lv=" + storage.getLayoutVersion()
+ ";nsInfo=" + nsInfo + ";dnuuid=" + storage.getDatanodeUuid());
}
...// checks
// Initialize FsDatasetImpl (as above, each datanode holds exactly one)
synchronized(this) {
if (data == null) {
data = factory.newInstance(this, storage, conf);
}
}
}
6.1.2.6 recoverTransitionRead
//Delegates to addStorageLocations; fails fast when no configured directory loads.
void recoverTransitionRead(DataNode datanode, NamespaceInfo nsInfo,
Collection<StorageLocation> dataDirs, StartupOption startOpt) throws IOException {
//...
if (addStorageLocations(datanode, nsInfo, dataDirs, startOpt).isEmpty()) {
throw new IOException("All specified directories are failed to load.");
}
}
//org.apache.hadoop.hdfs.server.datanode.DataStorage#addStorageLocations
//Adds the list of volumes to be managed by DataStorage. If a volume is empty,
//format it; otherwise recover it from a previous transition as needed.
synchronized List<StorageLocation> addStorageLocations(DataNode datanode,
NamespaceInfo nsInfo, Collection<StorageLocation> dataDirs,
StartupOption startOpt) throws IOException {
final String bpid = nsInfo.getBlockPoolID();
List<StorageLocation> successVolumes = Lists.newArrayList();
for (StorageLocation dataDir : dataDirs) {
File root = dataDir.getFile();
if (!containsStorageDir(root)) {
try {
// It first ensures the datanode level format is completed.
StorageDirectory sd = loadStorageDirectory(
datanode, nsInfo, root, startOpt);
//Track the successfully loaded storage directory
addStorageDir(sd);
} catch (IOException e) {
LOG.warn(e);
continue;
}
} else {
LOG.info("Storage directory " + dataDir + " has already been used.");
}
List<File> bpDataDirs = new ArrayList<File>();
bpDataDirs.add(BlockPoolSliceStorage.getBpRoot(bpid, new File(root,
STORAGE_DIR_CURRENT)));
try {
// Check and create the blockpool directory under each
// ${dfs.datanode.data.dir}/current
makeBlockPoolDataDir(bpDataDirs, null);
// Create the BlockPoolSliceStorage and put it into the map
// DataStorage#bpStorageMap: `Map<bpid, BlockPoolSliceStorage>`
BlockPoolSliceStorage bpStorage = this.bpStorageMap.get(bpid);
if (bpStorage == null) {
bpStorage = new BlockPoolSliceStorage(
nsInfo.getNamespaceID(), bpid, nsInfo.getCTime(),
nsInfo.getClusterID());
}
bpStorage.recoverTransitionRead(datanode, nsInfo, bpDataDirs, startOpt);
addBlockPoolStorage(bpid, bpStorage);
} catch (IOException e) {
LOG.warn("Failed to add storage for block pool: " + bpid + " : "
+ e.getMessage());
continue;
}
successVolumes.add(dataDir);
}
return successVolumes;
}
5.2 runDatanodeDaemon
//org.apache.hadoop.hdfs.server.datanode.DataNode#runDatanodeDaemon
//Starts all daemon services: block-pool actors, the data transfer servers,
//the IPC server, and any configured plugins.
public void runDatanodeDaemon() throws IOException {
blockPoolManager.startAll();
// start dataXceiveServer
dataXceiverServer.start();
if (localDataXceiverServer != null) {
localDataXceiverServer.start();
}
ipcServer.start();
startPlugins(conf);
}
5.2.1 dataXceiverServer.start
//org.apache.hadoop.hdfs.server.datanode.DataXceiverServer#DataXceiverServer
//About DataXceiverServer: the server used to receive/send data blocks; it
//listens for block-transfer requests from clients and other datanodes.
public void run() {
Peer peer = null;
//Loop for as long as the datanode is running
while (datanode.shouldRun && !datanode.shutdownForUpgrade) {
try {
//Blocking accept on the server socket
peer = peerServer.accept();
// Make sure the xceiver count is not exceeded
int curXceiverCount = datanode.getXceiverCount();
//Spawn a DataXceiver daemon thread to handle this connection
new Daemon(datanode.threadGroup,
DataXceiver.create(peer, datanode, this))
.start();
}//......
}
// Close the server to stop reception of more requests.
try {
peerServer.close();
closed = true;
//...
closeAllPeers();
}
5.2.2 DataXceiver
//org.apache.hadoop.hdfs.server.datanode.DataXceiver#run
/**
* Read/write data from/to the DataXceiverServer.
*/
@Override
public void run() {
int opsProcessed = 0;
Op op = null;
try {
dataXceiverServer.addPeer(peer, Thread.currentThread(), this);
peer.setWriteTimeout(datanode.getDnConf().socketWriteTimeout);
InputStream input = socketIn;
try {
IOStreamPair saslStreams = datanode.saslServer.receive(peer, socketOut,
socketIn, datanode.getXferAddress().getPort(),
datanode.getDatanodeId());
input = new BufferedInputStream(saslStreams.in,
smallBufferSize);
socketOut = saslStreams.out;
} catch (InvalidMagicNumberException imne) {
//....
}
super.initialize(new DataInputStream(input));
// We process requests in a loop, and stay around for a short timeout.
// This optimistic behaviour allows the other end to reuse connections.
// Setting keepalive timeout to 0 disable this behavior.
do {
updateCurrentThreadName("Waiting for operation #" + (opsProcessed + 1));
try {
if (opsProcessed != 0) {
assert dnConf.socketKeepaliveTimeout > 0;
peer.setReadTimeout(dnConf.socketKeepaliveTimeout);
} else {
peer.setReadTimeout(dnConf.socketTimeout);
}
op = readOp();
} catch (InterruptedIOException ignored) {
// Time out while we wait for client rpc
}
// restore normal timeout
if (opsProcessed != 0) {
peer.setReadTimeout(dnConf.socketTimeout);
}
opStartTime = monotonicNow();
processOp(op);
++opsProcessed;
} while ((peer != null) &&
(!peer.isClosed() && dnConf.socketKeepaliveTimeout > 0));
} catch (Throwable t) {
//....
} finally {
//...
}