每个消费者都会通过HeartbeatTask任务定时向GroupCoordinator发送HeartbeatRequest,告知GroupCoordinator自己正常在线。
HeartbeatRequest首先由KafkaApis.handleHeartbeatRequest方法进行处理,它负责验证权限、定义回调函数,并将请求委托给GroupCoordinator处理。
/**
 * Entry point for a heartbeat request.
 *
 * Checks that the session may Read the consumer group named in the request.
 * If so, the heartbeat is delegated to the coordinator together with a
 * callback that sends the response; otherwise a GROUP_AUTHORIZATION_FAILED
 * response is sent directly.
 *
 * @param request the incoming request whose body is a HeartbeatRequest
 */
def handleHeartbeatRequest(request: RequestChannel.Request): Unit = {
  val heartbeatRequest = request.body.asInstanceOf[HeartbeatRequest]
  val responseHeader = new ResponseHeader(request.header.correlationId)

  // Callback given to the coordinator: wraps the error code in a
  // HeartbeatResponse and hands it to requestChannel.sendResponse.
  def sendResponseCallback(errorCode: Short): Unit = {
    val response = new HeartbeatResponse(errorCode)
    trace("Sending heartbeat response %s for correlation id %d to client %s."
      .format(response, request.header.correlationId, request.header.clientId))
    requestChannel.sendResponse(new RequestChannel.Response(request, new ResponseSend(request.connectionId, responseHeader, response)))
  }

  if (authorize(request.session, Read, new Resource(Group, heartbeatRequest.groupId))) {
    // Authorized: let the group coordinator handle the heartbeat.
    coordinator.handleHeartbeat(
      heartbeatRequest.groupId(),
      heartbeatRequest.memberId(),
      heartbeatRequest.groupGenerationId(),
      sendResponseCallback)
  } else {
    // Not authorized for this group: answer immediately without involving the coordinator.
    val heartbeatResponse = new HeartbeatResponse(Errors.GROUP_AUTHORIZATION_FAILED.code)
    requestChannel.sendResponse(new Response(request, new ResponseSend(request.connectionId, responseHeader, heartbeatResponse)))
  }
}
GroupCoordinator.handleHeartbeat首先进行一系列的检测,保证groupMetadataManager处于可用状态且是对应消费者组的管理者。之后检测Consumer Group状态、memberId、generationId是否合法。最后调用completeAndScheduleNextHeartbeatExpiration方法。
/**
 * Validates a heartbeat and reports the outcome through `responseCallback`.
 *
 * Checks are applied in this order, and the first failing one decides the error code:
 *  1. coordinator not active                 -> GROUP_COORDINATOR_NOT_AVAILABLE
 *  2. this coordinator does not own the group -> NOT_COORDINATOR_FOR_GROUP
 *  3. group still loading                     -> NONE (answered blindly)
 *  4. no metadata for the group               -> UNKNOWN_MEMBER_ID
 *  5. under the group lock: Dead -> UNKNOWN_MEMBER_ID, not Stable -> REBALANCE_IN_PROGRESS,
 *     unknown member -> UNKNOWN_MEMBER_ID, generation mismatch -> ILLEGAL_GENERATION
 * If every check passes, the member's heartbeat expiration is rescheduled and NONE is returned.
 *
 * @param groupId          consumer group id
 * @param memberId         id of the member sending the heartbeat
 * @param generationId     group generation the member believes it belongs to
 * @param responseCallback invoked exactly once with the resulting error code
 */
def handleHeartbeat(groupId: String,
                    memberId: String,
                    generationId: Int,
                    responseCallback: Short => Unit): Unit = {
  if (!isActive.get) {
    responseCallback(Errors.GROUP_COORDINATOR_NOT_AVAILABLE.code)
  } else if (!isCoordinatorForGroup(groupId)) {
    // This coordinator does not manage the given consumer group.
    responseCallback(Errors.NOT_COORDINATOR_FOR_GROUP.code)
  } else if (isCoordinatorLoadingInProgress(groupId)) {
    // The group is still loading, so just respond blindly.
    responseCallback(Errors.NONE.code)
  } else {
    Option(groupManager.getGroup(groupId)) match {
      case None =>
        // No metadata exists for this group.
        responseCallback(Errors.UNKNOWN_MEMBER_ID.code)

      case Some(group) =>
        group synchronized {
          val errorCode: Short =
            if (group.is(Dead)) {
              // If the group is marked as dead, some other thread has just removed it from
              // the coordinator metadata; the group has likely migrated to another coordinator
              // OR is in a transient unstable phase. Let the member retry joining without
              // the specified member id.
              Errors.UNKNOWN_MEMBER_ID.code
            } else if (!group.is(Stable)) {
              Errors.REBALANCE_IN_PROGRESS.code
            } else if (!group.has(memberId)) {
              Errors.UNKNOWN_MEMBER_ID.code
            } else if (generationId != group.generationId) {
              Errors.ILLEGAL_GENERATION.code
            } else {
              // All checks passed: record the heartbeat and schedule the next expiration.
              completeAndScheduleNextHeartbeatExpiration(group, group.get(memberId))
              Errors.NONE.code
            }
          // The callback is invoked while still holding the group lock.
          responseCallback(errorCode)
        }
    }
  }
}
在completeAndScheduleNextHeartbeatExpiration中,首先更新收到该member心跳的时间戳,然后尝试完成相应的DelayedHeartbeat,最后创建新的DelayedHeartbeat对象放入heartbeatPurgatory中,等待下次心跳到来或者DelayedHeartbeat超时。
/**
 * Records a heartbeat for `member` and schedules the next expiration.
 *
 * Steps, in order: stamp the member with the current time, try to complete the
 * delayed heartbeat watched under the member's key, then register a new
 * DelayedHeartbeat whose deadline is the new timestamp plus the member's session timeout.
 *
 * @param group  the group the member belongs to
 * @param member the member whose heartbeat was just received
 */
private def completeAndScheduleNextHeartbeatExpiration(group: GroupMetadata, member: MemberMetadata): Unit = {
  // Complete the current heartbeat expectation: update the timestamp first so the
  // pending delayed heartbeat sees it when it is checked below.
  member.latestHeartbeat = time.milliseconds()

  // Key under which this member's delayed heartbeats are watched.
  val key = MemberKey(member.groupId, member.memberId)
  heartbeatPurgatory.checkAndComplete(key)

  // Reschedule the next heartbeat expiration deadline.
  val nextDeadline = member.latestHeartbeat + member.sessionTimeoutMs
  heartbeatPurgatory.tryCompleteElseWatch(
    new DelayedHeartbeat(this, group, member, nextDeadline, member.sessionTimeoutMs),
    Seq(key))
}
下面我们看看DelayedHeartbeat的实现,主要字段有:
/**
 * Delayed operation representing the expectation of a member's next heartbeat.
 * Every method simply forwards to the corresponding method on the GroupCoordinator.
 *
 * @param coordinator       the GroupCoordinator that all calls are forwarded to
 * @param group             metadata of the group the member belongs to
 * @param member            metadata of the member being watched
 * @param heartbeatDeadline timestamp at which this delayed heartbeat expires
 * @param sessionTimeout    delay passed to the DelayedOperation constructor; per the original
 *                          author's note this is the timeout the consumer set in its
 *                          JoinGroupRequest, within the range allowed by GroupConfig
 *                          (not verifiable from this snippet)
 */
private[coordinator] class DelayedHeartbeat(coordinator: GroupCoordinator,
                                            group: GroupMetadata,
                                            member: MemberMetadata,
                                            heartbeatDeadline: Long,
                                            sessionTimeout: Long)
  extends DelayedOperation(sessionTimeout) {

  override def tryComplete(): Boolean = {
    coordinator.tryCompleteHeartbeat(group, member, heartbeatDeadline, forceComplete)
  }

  override def onExpiration(): Unit = {
    coordinator.onExpireHeartbeat(group, member, heartbeatDeadline)
  }

  override def onComplete(): Unit = {
    coordinator.onCompleteHeartbeat()
  }
}
tryCompleteHeartbeat会检测下列四个条件,如果满足其中任意一个,就认为DelayedHeartbeat符合执行条件:
/**
 * Attempts to complete a delayed heartbeat while holding the group lock.
 *
 * The operation is force-completed when the member should be kept alive
 * (see shouldKeepMemberAlive) or when the member is leaving the group.
 *
 * @param group             group whose monitor guards the check
 * @param member            member the delayed heartbeat belongs to
 * @param heartbeatDeadline deadline of the delayed heartbeat being checked
 * @param forceComplete     completes the delayed operation; only called when a condition holds
 * @return the result of `forceComplete()` if a condition holds, otherwise false
 */
def tryCompleteHeartbeat(group: GroupMetadata, member: MemberMetadata, heartbeatDeadline: Long, forceComplete: () => Boolean): Boolean = {
  group synchronized {
    val completable = shouldKeepMemberAlive(member, heartbeatDeadline) || member.isLeaving
    // && short-circuits, so forceComplete() runs only when the operation is completable.
    completable && forceComplete()
  }
}
/**
 * Decides whether a member is still considered alive with respect to a given
 * heartbeat deadline. Any one of the following is sufficient:
 *  - awaitingJoinCallback is set, i.e. the member is waiting for a JoinGroupResponse;
 *  - awaitingSyncCallback is set, i.e. the member is waiting for a SyncGroupResponse;
 *  - the latest heartbeat plus the session timeout is later than the deadline, which
 *    means a heartbeat arrived after the delayed heartbeat for that deadline was scheduled.
 *
 * @param member            the member to examine
 * @param heartbeatDeadline the deadline of the delayed heartbeat being evaluated
 */
private def shouldKeepMemberAlive(member: MemberMetadata, heartbeatDeadline: Long): Boolean = {
  if (member.awaitingJoinCallback != null) true
  else if (member.awaitingSyncCallback != null) true
  else heartbeatDeadline < member.latestHeartbeat + member.sessionTimeoutMs
}
/** Whether this coordinator manages `groupId`, as reported by groupManager.isGroupLocal. */
private def isCoordinatorForGroup(groupId: String): Boolean = {
  groupManager.isGroupLocal(groupId)
}
/** Whether `groupId` is still being loaded, as reported by groupManager.isGroupLoading. */
private def isCoordinatorLoadingInProgress(groupId: String): Boolean = {
  groupManager.isGroupLoading(groupId)
}
}
onCompleteHeartbeat是空实现,所以DelayedHeartbeat执行之后仅会从heartbeatPurgatory中删除,并不会执行其他操作。
DelayedHeartbeat到期时会调用GroupCoordinator.onExpireHeartbeat方法。该方法会再次检测member是否已下线,如果确认下线,就将其对应的member从GroupMetadata中删除,并按照当前GroupMetadata所处的状态进行分类处理。
/**
 * Called when a delayed heartbeat expires. Under the group lock, re-checks
 * whether the member should still be kept alive; if not, the member is
 * treated as failed.
 *
 * @param group             group the member belongs to
 * @param member            member whose delayed heartbeat expired
 * @param heartbeatDeadline deadline of the expired delayed heartbeat
 */
def onExpireHeartbeat(group: GroupMetadata, member: MemberMetadata, heartbeatDeadline: Long): Unit = {
  group synchronized {
    // Re-check liveness under the lock before declaring the member failed.
    val memberExpired = !shouldKeepMemberAlive(member, heartbeatDeadline)
    if (memberExpired) {
      onMemberFailure(group, member)
    }
  }
}
/**
 * Handles a failed member: removes it from the group metadata and then reacts
 * according to the group's current state.
 *
 * @param group  group the failed member belonged to
 * @param member the member considered failed
 */
private def onMemberFailure(group: GroupMetadata, member: MemberMetadata): Unit = {
  trace("Member %s in group %s has failed".format(member.memberId, group.groupId))

  // Drop the member from the group before examining the state.
  group.remove(member.memberId)

  group.currentState match {
    case PreparingRebalance =>
      // With one member fewer, the pending delayed join may now be completable; try it.
      joinPurgatory.checkAndComplete(GroupKey(group.groupId))

    case Stable | AwaitingSync =>
      // The previous partition assignment may no longer be valid; consider a rebalance.
      maybePrepareRebalance(group)

    case Dead =>
      // Nothing to do for a dead group.
  }
}