HDFS源码之DataNode 启动流程

1. start-dfs.sh

# start-dfs.sh — datanodes stanza: hadoop-daemons.sh fans out over the hosts
# in the slaves file and starts a DataNode on every worker node.
# (The namenode stanza is analogous but adds --hostnames "$NAMENODES".)
"$HADOOP_PREFIX/sbin/hadoop-daemons.sh" \
  --config "$HADOOP_CONF_DIR" \
  --script "$bin/hdfs" start datanode $dataStartOpt

2. hadoop-daemons.sh

# hadoop-daemons.sh: slaves.sh ssh-es into every host listed in the slaves
# file, cd-s into $HADOOP_PREFIX, and runs hadoop-daemon.sh there with the
# same arguments ("$@" forwards e.g. `start datanode`).
"$bin/slaves.sh" --config $HADOOP_CONF_DIR cd "$HADOOP_PREFIX" \; "$bin/hadoop-daemon.sh" --config $HADOOP_CONF_DIR "$@"

3. hadoop-daemon.sh

case $command in
      # HDFS commands are dispatched to the hdfs launcher script
      namenode|secondarynamenode|datanode|journalnode|dfs|dfsadmin|fsck|balancer|zkfc)
        if [ -z "$HADOOP_HDFS_HOME" ]; then
          hdfsScript="$HADOOP_PREFIX"/bin/hdfs
        else
          hdfsScript="$HADOOP_HDFS_HOME"/bin/hdfs
        fi
        # Daemonize: nohup + nice, redirect stdout/stderr to the log file,
        # detach stdin, and background the process.
        nohup nice -n $HADOOP_NICENESS $hdfsScript --config $HADOOP_CONF_DIR $command "$@" > "$log" 2>&1 < /dev/null &
      ;;
      (*)
        # All other commands go through the generic hadoop launcher
        nohup nice -n $HADOOP_NICENESS $hadoopScript --config $HADOOP_CONF_DIR $command "$@" > "$log" 2>&1 < /dev/null &
      ;;

4. hdfs

# hdfs launcher: for the `datanode` command, run the DataNode main class and
# append the datanode-specific JVM options.
CLASS='org.apache.hadoop.hdfs.server.datanode.DataNode'
  HADOOP_OPTS="$HADOOP_OPTS $HADOOP_DATANODE_OPTS"

5. DataNode

// Entry point: the shell scripts above end up invoking this main method.
public static void main(String args[]) {
    if (DFSUtil.parseHelpArgument(args, DataNode.USAGE, System.out, true)) {
      System.exit(0);
    }
	// Delegate to secureMain (resources == null in the non-secure case)
    secureMain(args, null);
}

//org.apache.hadoop.hdfs.server.datanode.DataNode#secureMain
// Builds the DataNode and blocks until it shuts down.
// NOTE: the catch/finally portions are elided in this excerpt, hence the
// unbalanced braces.
public static void secureMain(String args[], SecureResources resources) {
  int errorCode = 0;
  try {
    StringUtils.startupShutdownMessage(DataNode.class, args, LOG);
    // Construct the DataNode instance
    DataNode datanode = createDataNode(args, null, resources);
    if (datanode != null) {
      // As with the NameNode, join the daemon thread and wait for shutdown
      datanode.join();
    } else {
      errorCode = 1;
  //.....
  }
}

6. createDataNode

//org.apache.hadoop.hdfs.server.datanode.DataNode#createDataNode()
public static DataNode createDataNode(String args[], Configuration conf,
    SecureResources resources) throws IOException {
  // Instantiate the DataNode object (config parsing + storage checks)
  DataNode dn = instantiateDataNode(args, conf, resources);
  if (dn != null) {
    // Start its services (block pools, DataXceiver server, IPC server)
    dn.runDatanodeDaemon();
  }
  return dn;
}
6.1 instantiateDataNode
//org.apache.hadoop.hdfs.server.datanode.DataNode#instantiateDataNode
public static DataNode instantiateDataNode(String args [], Configuration conf,
    SecureResources resources) throws IOException {
  if (conf == null)
    conf = new HdfsConfiguration();
  //......
  // Read the storage directories from dfs.datanode.data.dir in the config
  Collection<StorageLocation> dataLocations = getStorageLocations(conf);
  return makeInstance(dataLocations, conf, resources);
}
6.2 makeInstance
//org.apache.hadoop.hdfs.server.datanode.DataNode#makeInstance
/**
   * Creates a DataNode instance, after ensuring that at least one of the
   * configured data directories is usable.
   * @param dataDirs List of directories, where the new DataNode instance should
   * keep its files.
   * @param conf Configuration instance to use.
   * @param resources Secure resources needed to run under Kerberos
   * @return DataNode instance for given list of data dirs and conf, or null if
   * no directory from this directory list can be created.
   * @throws IOException
   */
  static DataNode makeInstance(Collection<StorageLocation> dataDirs,
      Configuration conf, SecureResources resources) throws IOException {
    // Local file system, used to probe the data directories
    LocalFileSystem localFS = FileSystem.getLocal(conf);
    // Permissions expected on each data dir (dfs.datanode.data.dir.perm)
    FsPermission permission = new FsPermission(
        conf.get(DFS_DATANODE_DATA_DIR_PERMISSION_KEY,
                 DFS_DATANODE_DATA_DIR_PERMISSION_DEFAULT));
    DataNodeDiskChecker dataNodeDiskChecker =
        new DataNodeDiskChecker(permission);
    // Check the configured directories: verify permissions and readability
    List<StorageLocation> locations =
        checkStorageLocations(dataDirs, localFS, dataNodeDiskChecker);
    DefaultMetricsSystem.initialize("DataNode");
	// Fail if no directory survived the checks
    assert locations.size() > 0 : "number of data directories should be > 0";
    return new DataNode(conf, locations, resources);
  }
6.1.1 DataNode 构造方法
//org.apache.hadoop.hdfs.server.datanode.DataNode#DataNode
startDataNode(conf, dataDirs, resources); // called from the constructor, around line 428 upstream

//org.apache.hadoop.hdfs.server.datanode.DataNode#startDataNode
void startDataNode(Configuration conf, 
                 List<StorageLocation> dataDirs,
                 SecureResources resources
                 ) throws IOException {

//.....
//Data storage information file.
storage = new DataStorage();

// global DN settings
registerMXBean();
initDataXceiver(conf);
startInfoServer(conf);
// JVM pause (e.g. GC stop-the-world) monitoring
pauseMonitor = new JvmPauseMonitor(conf);
pauseMonitor.start();

// BlockPoolTokenSecretManager is required to create ipc server.
this.blockPoolTokenSecretManager = new BlockPoolTokenSecretManager();
//....
// Initialize the IPC (RPC) server; it is started later in DataNode#runDatanodeDaemon()
initIpcServer(conf);
//...
// Initialize the two-level namespace(nameservice) -> namenode structure
blockPoolManager = new BlockPoolManager(this);
// Key step: discover and connect to the configured namenodes
blockPoolManager.refreshNamenodes(conf);
//....
}
6.1.1.1 doRefreshNamenodes
//org.apache.hadoop.hdfs.server.datanode.BlockPoolManager#doRefreshNamenodes
// NOTE: the method's closing brace is elided in this excerpt.
private void doRefreshNamenodes(
  Map<String, Map<String, InetSocketAddress>> addrMap) throws IOException {
assert Thread.holdsLock(refreshNamenodesLock);

Set<String> toRefresh = Sets.newLinkedHashSet();
Set<String> toAdd = Sets.newLinkedHashSet();
Set<String> toRemove;

synchronized (this) {
  // Step 1. For every configured nameservice, decide whether it is already
  // known (refresh) or new (add).
  for (String nameserviceId : addrMap.keySet()) {
    if (bpByNameserviceId.containsKey(nameserviceId)) {
      toRefresh.add(nameserviceId);
    } else {
      toAdd.add(nameserviceId);
    }
  }
  
  // Step 2. Nameservices we currently track, but which are no longer
  // configured, must be removed.
  toRemove = Sets.newHashSet(Sets.difference(
      bpByNameserviceId.keySet(), addrMap.keySet()));
  
  assert toRefresh.size() + toAdd.size() ==
    addrMap.size() :
      "toAdd: " + Joiner.on(",").useForNull("<default>").join(toAdd) +
      "  toRemove: " + Joiner.on(",").useForNull("<default>").join(toRemove) +
      "  toRefresh: " + Joiner.on(",").useForNull("<default>").join(toRefresh);

  
  // Step 3. Create a BPOfferService for every new namespace (including one
  // BPServiceActor per namenode).
  if (!toAdd.isEmpty()) {
    LOG.info("Starting BPOfferServices for nameservices: " +
        Joiner.on(",").useForNull("<default>").join(toAdd));
  
    for (String nsToAdd : toAdd) {
      ArrayList<InetSocketAddress> addrs =
        Lists.newArrayList(addrMap.get(nsToAdd).values());
      // BPOfferService: the per-block-pool instance on this datanode; it
      // mainly handles heartbeats with the active namenode.
      BPOfferService bpos = createBPOS(addrs);
      bpByNameserviceId.put(nsToAdd, bpos);
      offerServices.add(bpos);
    }
  }
  // Start all BPOfferServices
  startAll();
}
6.1.1.2 createBPOS
//org.apache.hadoop.hdfs.server.datanode.BlockPoolManager#createBPOS
protected BPOfferService createBPOS(List<InetSocketAddress> nnAddrs) {
    return new BPOfferService(nnAddrs, dn);
}

//org.apache.hadoop.hdfs.server.datanode.BPOfferService#BPOfferService
BPOfferService(List<InetSocketAddress> nnAddrs, DataNode dn) {
  Preconditions.checkArgument(!nnAddrs.isEmpty(),
      "Must pass at least one NN.");
  this.dn = dn;

  for (InetSocketAddress addr : nnAddrs) {
    // Each namenode address gets its own BPServiceActor;
    // BPOfferService tracks, via bpServices, the BPServiceActor of every
    // namenode in the same namespace.
    this.bpServices.add(new BPServiceActor(addr, this));
      
    // BPServiceActor: a thread per active or standby namenode to perform:
    //  - pre-registration handshake with the namenode
    //  - registration with the namenode
    //  - periodic heartbeats to the namenode
    //  - handling of commands received from the namenode
  }
}
6.1.1.3 startAll
//org.apache.hadoop.hdfs.server.datanode.BlockPoolManager#startAll
// (fragment of startAll) start every registered BPOfferService
for (BPOfferService bpos : offerServices) {
   bpos.start();
}

//org.apache.hadoop.hdfs.server.datanode.BPServiceActor#start
void start() {
  if ((bpThread != null) && (bpThread.isAlive())) {
    //Thread is started already
    return;
  }
  bpThread = new Thread(this, formatThreadName());
  bpThread.setDaemon(true); // needed for JUnit testing
  bpThread.start();
}
// The interesting work happens in run()
6.1.2.1 BPServiceActor
//org.apache.hadoop.hdfs.server.datanode.BPServiceActor#run
// NOTE: exception handlers are elided in this excerpt, hence the unbalanced
// braces.

@Override
public void run() {
  LOG.info(this + " starting to offer service");

  try {
    while (true) {
      // init stuff
      try {
        // Handshake with the namenode
        connectToNNAndHandshake();
        break;
      } catch (IOException ioe) {
        // Most handshake failures are retried, unless a non-IOException was
        // thrown or the datanode is shutting down
        runningState = RunningState.INIT_FAILED;
        //......
        }
      }
    }

    while (shouldRun()) {
      try {
        // The actual service loop of BPServiceActor
        offerService();
      } catch (Exception ex) {
        // Whatever is thrown, keep offering service (heartbeats, block
        // reports, ...) until the datanode shuts down
        LOG.error("Exception in BPOfferService for " + this, ex);
        sleepAndLogInterrupts(5000, "offering service");
      }
    }
  //.......
  }
}

//主要看connectToNNAndHandshake
6.1.2.2 connectToNNAndHandshake
//org.apache.hadoop.hdfs.server.datanode.BPServiceActor#connectToNNAndHandshake
private void connectToNNAndHandshake() throws IOException {
    // get NN proxy
    bpNamenode = dn.connectToNN(nnAddr);

    // First phase of the handshake with NN - get the namespace info.
    NamespaceInfo nsInfo = retrieveNamespaceInfo();
    
    // Verify the namespace info and initialize this datanode's block pool
    bpos.verifyAndSetNamespaceInfo(nsInfo);
    
    // Second phase of the handshake with the NN.
    register(nsInfo);
}
//主要看verifyAndSetNamespaceInfo
6.1.2.3 verifyAndSetNamespaceInfo
//org.apache.hadoop.hdfs.server.datanode.BPOfferService#verifyAndSetNamespaceInfo
void verifyAndSetNamespaceInfo(NamespaceInfo nsInfo) throws IOException {
writeLock();
try {
  if (this.bpNSInfo == null) {
    this.bpNSInfo = nsInfo;
    boolean success = false;

    // First connection to a namenode of this namespace: initialize the
    // block pool
    try {
      // Block pools are initialized per BPOfferService
      dn.initBlockPool(this);
      success = true;
    } finally {
      if (!success) {
        // If one BPServiceActor thread failed, another BPServiceActor of
        // the same BPOfferService can retry the initialization
        this.bpNSInfo = null;
      }
    }
  } else {
    checkNSEquality(bpNSInfo.getBlockPoolID(), nsInfo.getBlockPoolID(),
        "Blockpool ID");
    checkNSEquality(bpNSInfo.getNamespaceID(), nsInfo.getNamespaceID(),
        "Namespace ID");
    checkNSEquality(bpNSInfo.getClusterID(), nsInfo.getClusterID(),
        "Cluster ID");
  }
} finally {
  writeUnlock();
}
}
//我们来看一下initBlockPool
//org.apache.hadoop.hdfs.server.datanode.DataNode#initBlockPool
6.1.2.4 initBlockPool
/**
 * Called after a block pool has successfully connected to its namenode.
 * The block pool must be initialized in local storage, and the cluster ID
 * is checked for consistency.
 * If this is the first block pool to register, the datanode-level storage
 * is initialized as well.
 *
 * @param bpos Block pool offer service
 * @throws IOException if the NN is inconsistent with the local storage.
 */
void initBlockPool(BPOfferService bpos) throws IOException {
  // Namespace info obtained during the handshake
  NamespaceInfo nsInfo = bpos.getNamespaceInfo();
  //...
  setClusterId(nsInfo.clusterID, nsInfo.getBlockPoolID());

  // Register the new block pool with the BP manager.
  blockPoolManager.addBlockPool(bpos);
  
  // In the case that this is the first block pool to connect, initialize
  // the dataset, block scanners, etc.
  initStorage(nsInfo);


  // Exclude failed disks before initializing the block pool, to avoid
  // startup failures.
  checkDiskError();

  // Add the block pool to FsDatasetImpl and keep initializing the storage
  data.addBlockPool(nsInfo.getBlockPoolID(), conf);
  blockScanner.enableBlockPoolId(bpos.getBlockPoolId());
  initDirectoryScanner(conf);
}
6.1.2.5 initStorage
//org.apache.hadoop.hdfs.server.datanode.DataNode#initStorage
private void initStorage(final NamespaceInfo nsInfo) throws IOException {
  final FsDatasetSpi.Factory<? extends FsDatasetSpi<?>> factory
      = FsDatasetSpi.Factory.getFactory(conf);
  
  if (!factory.isSimulated()) {
    ...
    // build arguments (elided)
    // Initialize DataStorage (each datanode holds exactly one). This may
    // trigger a DataStorage-level state transition, so lock the DataNode.
    synchronized (this) {
       //read storage info, lock data dirs and transition fs state if necessary
      storage.recoverTransitionRead(this, bpid, nsInfo, dataDirs, startOpt);
    }
    final StorageInfo bpStorage = storage.getBPStorage(bpid);
    LOG.info("Setting up storage: nsid=" + bpStorage.getNamespaceID()
        + ";bpid=" + bpid + ";lv=" + storage.getLayoutVersion()
        + ";nsInfo=" + nsInfo + ";dnuuid=" + storage.getDatanodeUuid());
  }

  ...// checks elided

  // Initialize FsDatasetImpl (as above, each datanode holds exactly one)
  synchronized(this)  {
    if (data == null) {
      data = factory.newInstance(this, storage, conf);
    }
  }
}
6.1.2.6 recoverTransitionRead
// org.apache.hadoop.hdfs.server.datanode.DataStorage#recoverTransitionRead
void recoverTransitionRead(DataNode datanode, NamespaceInfo nsInfo,
    Collection<StorageLocation> dataDirs, StartupOption startOpt) throws IOException {
  
  //...
  	
  // Fail fast if not a single configured storage directory could be loaded
  if (addStorageLocations(datanode, nsInfo, dataDirs, startOpt).isEmpty()) {
    throw new IOException("All specified directories are failed to load.");
  }
}

//org.apache.hadoop.hdfs.server.datanode.DataStorage#addStorageLocations
// Add the list of volumes to be managed by DataStorage. If a volume is
// empty, format it; otherwise recover it from a previous transition as needed.
synchronized List<StorageLocation> addStorageLocations(DataNode datanode,
      NamespaceInfo nsInfo, Collection<StorageLocation> dataDirs,
      StartupOption startOpt) throws IOException {
    final String bpid = nsInfo.getBlockPoolID();
    List<StorageLocation> successVolumes = Lists.newArrayList();
    for (StorageLocation dataDir : dataDirs) {
      File root = dataDir.getFile();
      if (!containsStorageDir(root)) {
        try {
          // It first ensures the datanode level format is completed.
          StorageDirectory sd = loadStorageDirectory(
              datanode, nsInfo, root, startOpt);
          // Track the directory at the datanode level
          addStorageDir(sd);
        } catch (IOException e) {
          LOG.warn(e);
          continue;
        }
      } else {
        LOG.info("Storage directory " + dataDir + " has already been used.");
      }

      List<File> bpDataDirs = new ArrayList<File>();
      bpDataDirs.add(BlockPoolSliceStorage.getBpRoot(bpid, new File(root,
              STORAGE_DIR_CURRENT)));
      try {
        // Check and create the blockpool directory under each
        // ${dfs.datanode.data.dir}/current
        makeBlockPoolDataDir(bpDataDirs, null);
        // Create the BlockPoolSliceStorage and put it into the map
        // DataStorage#bpStorageMap: Map<bpid, BlockPoolSliceStorage>
        BlockPoolSliceStorage bpStorage = this.bpStorageMap.get(bpid);
        if (bpStorage == null) {
          bpStorage = new BlockPoolSliceStorage(
              nsInfo.getNamespaceID(), bpid, nsInfo.getCTime(),
              nsInfo.getClusterID());
        }
		
        bpStorage.recoverTransitionRead(datanode, nsInfo, bpDataDirs, startOpt);
        addBlockPoolStorage(bpid, bpStorage);
      } catch (IOException e) {
        LOG.warn("Failed to add storage for block pool: " + bpid + " : "
            + e.getMessage());
        continue;
      }
      successVolumes.add(dataDir);
    }
    return successVolumes;
  }
5.2 runDatanodeDaemon
//org.apache.hadoop.hdfs.server.datanode.DataNode#runDatanodeDaemon
// Starts the services that were initialized in startDataNode.
public void runDatanodeDaemon() throws IOException {
  // Start all BPOfferService threads (handshake/heartbeat with namenodes)
  blockPoolManager.startAll();

  // start dataXceiveServer
  dataXceiverServer.start();
  if (localDataXceiverServer != null) {
    localDataXceiverServer.start();
  }
  // RPC server created earlier by initIpcServer(conf)
  ipcServer.start();
  startPlugins(conf);
}
5.2.1 dataXceiverServer.start
//org.apache.hadoop.hdfs.server.datanode.DataXceiverServer#DataXceiverServer
// DataXceiverServer: server used to receive/send data blocks; it listens for
// block-transfer requests from clients (and other datanodes).
// NOTE: exception handlers are elided in this excerpt, hence the unbalanced
// braces.
public void run() {
  Peer peer = null;
  // As long as the datanode is running
  while (datanode.shouldRun && !datanode.shutdownForUpgrade) {
    try {
      // Blocking socket accept
      peer = peerServer.accept();

      // Make sure the xceiver count is not exceeded
      int curXceiverCount = datanode.getXceiverCount();
      // Hand the connection to a DataXceiver daemon thread
      new Daemon(datanode.threadGroup,
          DataXceiver.create(peer, datanode, this))
          .start();
    }//......
  }

  // Close the server to stop reception of more requests.
  try {
    peerServer.close();
    closed = true;
  //...
  closeAllPeers();
}
5.2.2 DataXceiver
//org.apache.hadoop.hdfs.server.datanode.DataXceiver#run
/**
 * Read/write data from/to the DataXceiverServer.
 */
@Override
public void run() {
  int opsProcessed = 0;
  Op op = null;

  try {
    dataXceiverServer.addPeer(peer, Thread.currentThread(), this);
    peer.setWriteTimeout(datanode.getDnConf().socketWriteTimeout);
    InputStream input = socketIn;
    try {
      // Negotiate SASL, wrapping the raw socket streams when required
      IOStreamPair saslStreams = datanode.saslServer.receive(peer, socketOut,
        socketIn, datanode.getXferAddress().getPort(),
        datanode.getDatanodeId());
      input = new BufferedInputStream(saslStreams.in,
          smallBufferSize);
      socketOut = saslStreams.out;
    } catch (InvalidMagicNumberException imne) {
      //....
    }
    
    super.initialize(new DataInputStream(input));
    
    // We process requests in a loop, and stay around for a short timeout.
    // This optimistic behaviour allows the other end to reuse connections.
    // Setting keepalive timeout to 0 disable this behavior.
    do {
      updateCurrentThreadName("Waiting for operation #" + (opsProcessed + 1));

      try {
        if (opsProcessed != 0) {
          assert dnConf.socketKeepaliveTimeout > 0;
          peer.setReadTimeout(dnConf.socketKeepaliveTimeout);
        } else {
          peer.setReadTimeout(dnConf.socketTimeout);
        }
        op = readOp();
      } catch (InterruptedIOException ignored) {
        // Time out while we wait for client rpc
      }

      // restore normal timeout
      if (opsProcessed != 0) {
        peer.setReadTimeout(dnConf.socketTimeout);
      }

      opStartTime = monotonicNow();
      processOp(op);
      ++opsProcessed;
    } while ((peer != null) &&
        (!peer.isClosed() && dnConf.socketKeepaliveTimeout > 0));
  } catch (Throwable t) {
    //....
  } finally {
    //...
}