Flink Yarn Per Job - RM启动SlotManager_客户端


ResourceManager

public final void onStart() throws Exception {
try {
startResourceManagerServices();
} catch (Throwable t) {
final ResourceManagerException exception = new ResourceManagerException(String.format("Could not start the ResourceManager %s", getAddress()), t);
onFatalError(exception);
throw exception;
}
}

private void startResourceManagerServices() throws Exception {
try {
leaderElectionService = highAvailabilityServices.getResourceManagerLeaderElectionService();

// 创建了Yarn的RM和NM的客户端,初始化并启动
initialize();

// 通过选举服务,启动ResourceManager
leaderElectionService.start(this);

jobLeaderIdService.start(new JobLeaderIdActionsImpl());

registerTaskExecutorMetrics();
} catch (Exception e) {
handleStartResourceManagerServicesException(e);
}
}

创建了Yarn的RM和NM的客户端,初始化并启动

通过选举服务,启动ResourceManager


创建了Yarn的RM和NM的客户端


ActiveResourceManager

@Override
protected void initialize() throws ResourceManagerException {
try {
resourceManagerDriver.initialize(
this,
new GatewayMainThreadExecutor(),
ioExecutor);
} catch (Exception e) {
throw new ResourceManagerException("Cannot initialize resource provider.", e);
}
}


AbstractResourceManagerDriver

@Override
public final void initialize(
ResourceEventHandler<WorkerType> resourceEventHandler,
ScheduledExecutor mainThreadExecutor,
Executor ioExecutor) throws Exception {
this.resourceEventHandler = Preconditions.checkNotNull(resourceEventHandler);
this.mainThreadExecutor = Preconditions.checkNotNull(mainThreadExecutor);
this.ioExecutor = Preconditions.checkNotNull(ioExecutor);
// 下追
initializeInternal();
}

protected abstract void initializeInternal() throws Exception;


YarnResourceManagerDriver

@Override
protected void initializeInternal() throws Exception {
final YarnContainerEventHandler yarnContainerEventHandler = new YarnContainerEventHandler();
try {
// 创建Yarn的ResourceManager的客户端,并且初始化和启动
resourceManagerClient = yarnResourceManagerClientFactory.createResourceManagerClient(
yarnHeartbeatIntervalMillis,
yarnContainerEventHandler);
resourceManagerClient.init(yarnConfig);
resourceManagerClient.start();

final RegisterApplicationMasterResponse registerApplicationMasterResponse = registerApplicationMaster();
getContainersFromPreviousAttempts(registerApplicationMasterResponse);
taskExecutorProcessSpecContainerResourcePriorityAdapter =
new TaskExecutorProcessSpecContainerResourcePriorityAdapter(
registerApplicationMasterResponse.getMaximumResourceCapability(),
ExternalResourceUtils.getExternalResources(flinkConfig, YarnConfigOptions.EXTERNAL_RESOURCE_YARN_CONFIG_KEY_SUFFIX));
} catch (Exception e) {
throw new ResourceManagerException("Could not start resource manager client.", e);
}

// 创建yarn的 NodeManager的客户端,并且初始化和启动
nodeManagerClient = yarnNodeManagerClientFactory.createNodeManagerClient(yarnContainerEventHandler);
nodeManagerClient.init(yarnConfig);
nodeManagerClient.start();
}

创建Yarn的ResourceManager的客户端,并且初始化和启动

创建yarn的 NodeManager的客户端,并且初始化和启动


启动SlotManager


StandaloneLeaderElectionService

@Override
public void start(LeaderContender newContender) throws Exception {
if (contender != null) {
// Service was already started
throw new IllegalArgumentException("Leader election service cannot be started multiple times.");
}

contender = Preconditions.checkNotNull(newContender);

// directly grant leadership to the given contender
contender.grantLeadership(HighAvailabilityServices.DEFAULT_LEADER_ID);
}


ResourceManager

public void grantLeadership(final UUID newLeaderSessionID) {
final CompletableFuture<Boolean> acceptLeadershipFuture = clearStateFuture
// 下追
.thenComposeAsync((ignored) -> tryAcceptLeadership(newLeaderSessionID), getUnfencedMainThreadExecutor());

... ...
}

private CompletableFuture<Boolean> tryAcceptLeadership(final UUID newLeaderSessionID) {
if (leaderElectionService.hasLeadership(newLeaderSessionID)) {
... ...
startServicesOnLeadership();

return prepareLeadershipAsync().thenApply(ignored -> true);
} else {
return CompletableFuture.completedFuture(false);
}
}


private void startServicesOnLeadership() {

  //  启动心跳服务:TaskManager、JobMaster

  startHeartbeatServices();

  //  启动slotManager

  slotManager.start(getFencingToken(), getMainThreadExecutor(), new ResourceActionsImpl());

  onLeadership();

}

启动心跳服务:TaskManager、JobMaster

启动slotManager

private void startHeartbeatServices() {
taskManagerHeartbeatManager = heartbeatServices.createHeartbeatManagerSender(
resourceId,
new TaskManagerHeartbeatListener(),
getMainThreadExecutor(),
log);

jobManagerHeartbeatManager = heartbeatServices.createHeartbeatManagerSender(
resourceId,
new JobManagerHeartbeatListener(),
getMainThreadExecutor(),
log);
}


作为资源的老大,肯定要跟task小弟和job去通信


SlotManagerImpl

public void start(ResourceManagerId newResourceManagerId, Executor newMainThreadExecutor, ResourceActions newResourceActions) {
LOG.info("Starting the SlotManager.");

this.resourceManagerId = Preconditions.checkNotNull(newResourceManagerId);
mainThreadExecutor = Preconditions.checkNotNull(newMainThreadExecutor);
resourceActions = Preconditions.checkNotNull(newResourceActions);

started = true;

taskManagerTimeoutsAndRedundancyCheck = scheduledExecutor.scheduleWithFixedDelay(
() -> mainThreadExecutor.execute(
// 检查超时和多余的TaskManager
() -> checkTaskManagerTimeoutsAndRedundancy()),
0L,
taskManagerTimeout.toMilliseconds(),
TimeUnit.MILLISECONDS);

slotRequestTimeoutCheck = scheduledExecutor.scheduleWithFixedDelay(
() -> mainThreadExecutor.execute(
() -> checkSlotRequestTimeouts()),
0L,
slotRequestTimeout.toMilliseconds(),
TimeUnit.MILLISECONDS);

registerSlotManagerMetrics();
}

检查超时和多余的TaskManager


void checkTaskManagerTimeoutsAndRedundancy() {
if (!taskManagerRegistrations.isEmpty()) {
long currentTime = System.currentTimeMillis();

ArrayList<TaskManagerRegistration> timedOutTaskManagers = new ArrayList<>(taskManagerRegistrations.size());

// first retrieve the timed out TaskManagers
for (TaskManagerRegistration taskManagerRegistration : taskManagerRegistrations.values()) {
if (currentTime - taskManagerRegistration.getIdleSince() >= taskManagerTimeout.toMilliseconds()) {
// we collect the instance ids first in order to avoid concurrent modifications by the
// ResourceActions.releaseResource call
timedOutTaskManagers.add(taskManagerRegistration);
}
}

int slotsDiff = redundantTaskManagerNum * numSlotsPerWorker - freeSlots.size();
if (freeSlots.size() == slots.size()) {
// No need to keep redundant taskManagers if no job is running.
// 如果没有job在运行,释放taskmanager
releaseTaskExecutors(timedOutTaskManagers, timedOutTaskManagers.size());
} else if (slotsDiff > 0) {
// Keep enough redundant taskManagers from time to time.
// 保证随时有足够额taskmanager
int requiredTaskManagers = MathUtils.divideRoundUp(slotsDiff, numSlotsPerWorker);
allocateRedundantTaskManagers(requiredTaskManagers);
} else {
// second we trigger the release resource callback which can decide upon the resource release
int maxReleaseNum = (-slotsDiff) / numSlotsPerWorker;
releaseTaskExecutors(timedOutTaskManagers, Math.min(maxReleaseNum, timedOutTaskManagers.size()));
}
}
}


Flink Yarn Per Job - RM启动SlotManager_初始化_02