Kafka Shell Lag
kafka 版本:2.1.0
前言
在生产环境中,比如你正在使用group kafka-lag
消费某topic内的数据。目前你没有搭建对应的监控系统,你如何去查看对应partition 的堆积信息呢?很多人都会去使用这个命令:
# 正常使用
kafka-consumer-groups --bootstrap-server master:9092 --describe --group default
# 系统存在kerberos认证使用
kafka-consumer-groups --bootstrap-server master:9092 --describe --group default --command-config /home/xiahu/client.properties
client.properties
# GSSAPI(Kerberos) 必须走 SASL 通道,security.protocol 需为 SASL_PLAINTEXT(或 SASL_SSL)
security.protocol=SASL_PLAINTEXT
sasl.mechanism=GSSAPI
sasl.kerberos.service.name=kafka
没错,今天我们就来研究一下这个命令,先从kafka-consumer-groups
启动脚本看起
1. kafka-consumer-groups.sh
# 该脚本只是简单的去调用了另外一个脚本,并将参数传递过去
exec $(dirname $0)/kafka-run-class.sh kafka.admin.ConsumerGroupCommand "$@"
2. kafka-run-class.sh
# 这个脚本内的内容太多了,其他的我也没看,但是你所需要明白的是:
# 在命令行执行: kafka-consumer-groups --bootstrap-server master:9092 --describe --group default
# 最终调用:kafka.admin.ConsumerGroupCommand --bootstrap-server master:9092 --describe --group default
# 所以主要看源码:kafka.admin.ConsumerGroupCommand 这个类
# Excerpt from kafka-run-class.sh: launch the JVM with the accumulated
# heap / GC / JMX / log4j options and the assembled CLASSPATH.
# NOTE(review): upstream's daemon branch runs `nohup $JAVA ... &`; this
# excerpt shows `echo`, which only prints the command — verify against the
# real script for your Kafka version.
if [ "x$DAEMON_MODE" = "xtrue" ]; then
echo $JAVA $KAFKA_HEAP_OPTS $KAFKA_JVM_PERFORMANCE_OPTS $KAFKA_GC_LOG_OPTS $KAFKA_JMX_OPTS $KAFKA_LOG4J_OPTS -cp $CLASSPATH $KAFKA_OPTS "$@"
else
# Foreground mode: replace the current shell process with the JVM.
exec $JAVA $KAFKA_HEAP_OPTS $KAFKA_JVM_PERFORMANCE_OPTS $KAFKA_GC_LOG_OPTS $KAFKA_JMX_OPTS $KAFKA_LOG4J_OPTS -cp $CLASSPATH $KAFKA_OPTS "$@"
fi
3. ConsumerGroupCommand
// Entry point of kafka.admin.ConsumerGroupCommand (excerpt, truncated with `...`):
// validates arguments, then dispatches to exactly one action
// (--list / --describe / --delete / --reset-offsets).
def main(args: Array[String]) {
val opts = new ConsumerGroupCommandOptions(args)
if (args.length == 0)
CommandLineUtils.printUsageAndDie(opts.parser, "List all consumer groups, describe a consumer group, delete consumer group info, or reset consumer group offsets.")
// Exactly one of the four action flags must be present.
val actions = Seq(opts.listOpt, opts.describeOpt, opts.deleteOpt, opts.resetOffsetsOpt).count(opts.options.has)
if (actions != 1)
CommandLineUtils.printUsageAndDie(opts.parser, "Command must include exactly one action: --list, --describe, --delete, --reset-offsets")
// Validate argument combinations.
opts.checkArgs()
// Build the ConsumerGroupService from the parsed ConsumerGroupCommandOptions.
val consumerGroupService = new ConsumerGroupService(opts)
try {
if (opts.options.has(opts.listOpt))
consumerGroupService.listGroups().foreach(println(_))
else if (opts.options.has(opts.describeOpt))
// --describe is the path that produces the lag report studied in this article.
consumerGroupService.describeGroup()
else if (opts.options.has(opts.deleteOpt))
...
}
4. describeGroup()
// Dispatches --describe to one of three sub-views: offsets (default),
// members, or state.
def describeGroup(): Unit = {
// Read the group id and sub-option flags from the parsed command line.
val group = opts.options.valuesOf(opts.groupOpt).asScala.head
val membersOptPresent = opts.options.has(opts.membersOpt)
val stateOptPresent = opts.options.has(opts.stateOpt)
val offsetsOptPresent = opts.options.has(opts.offsetsOpt)
val subActions = Seq(membersOptPresent, offsetsOptPresent, stateOptPresent).count(_ == true)
if (subActions == 0 || offsetsOptPresent) {
// The lag report comes from this branch: collectGroupOffsets() gathers
// (state, per-partition rows) and printOffsets() renders them.
val offsets = collectGroupOffsets()
printOffsets(group, offsets._1, offsets._2)
} else if (membersOptPresent) {
val members = collectGroupMembers(opts.options.has(opts.verboseOpt))
printMembers(group, members._1, members._2, opts.options.has(opts.verboseOpt))
} else
printState(group, collectGroupState())
}
5. collectGroupOffsets()
// Builds the lag rows for one group: returns (group state, per-partition
// assignment rows). Rows for partitions with a live consumer come first,
// then rows for partitions that only have a committed offset.
def collectGroupOffsets(): (Option[String], Option[Seq[PartitionAssignmentState]]) = {
val groupId = opts.options.valueOf(opts.groupOpt)
// Ask the AdminClient for the group's description.
// ConsumerGroupDescription: "A detailed description of a single consumer
// group in the cluster" — members, coordinator, state, etc.
val consumerGroup = adminClient.describeConsumerGroups(
List(groupId).asJava,
withTimeoutMs(new DescribeConsumerGroupsOptions())
).describedGroups.get(groupId).get
val state = consumerGroup.state
// Fetch the group's committed offsets: Map[TopicPartition, OffsetAndMetadata].
// TopicPartition wraps (topic, partition); OffsetAndMetadata carries the
// group's current committed offset (plus metadata) for that partition.
// The key fact: (topic, partition, groupId) maps to exactly one currentOffset.
val committedOffsets = getCommittedOffsets(groupId).asScala.toMap
var assignedTopicPartitions = ListBuffer[TopicPartition]()
// For every member with a non-empty assignment (largest assignments first),
// pair each assigned TopicPartition with its committed offset and build a row.
val rowsWithConsumer = consumerGroup.members.asScala.filter(!_.assignment.topicPartitions.isEmpty).toSeq
.sortWith(_.assignment.topicPartitions.size > _.assignment.topicPartitions.size).flatMap { consumerSummary =>
val topicPartitions = consumerSummary.assignment.topicPartitions.asScala
assignedTopicPartitions = assignedTopicPartitions ++ topicPartitions
val partitionOffsets = consumerSummary.assignment.topicPartitions.asScala
.map { topicPartition =>
topicPartition -> committedOffsets.get(topicPartition).map(_.offset)
}.toMap
// The interesting work (end offsets + lag) happens in collectConsumerAssignment.
collectConsumerAssignment(groupId, Option(consumerGroup.coordinator), topicPartitions.toList,
partitionOffsets, Some(s"${consumerSummary.consumerId}"), Some(s"${consumerSummary.host}"),
Some(s"${consumerSummary.clientId}"))
}
// Committed offsets with no live consumer: emit rows with "-" for the
// consumer/host/client columns, skipping partitions already reported above.
val rowsWithoutConsumer = committedOffsets.filterKeys(!assignedTopicPartitions.contains(_)).flatMap {
case (topicPartition, offset) =>
collectConsumerAssignment(
groupId,
Option(consumerGroup.coordinator),
Seq(topicPartition),
Map(topicPartition -> Some(offset.offset)),
Some(MISSING_COLUMN_VALUE),
Some(MISSING_COLUMN_VALUE),
Some(MISSING_COLUMN_VALUE))
}
(Some(state.toString), Some(rowsWithConsumer ++ rowsWithoutConsumer))
}
6. collectConsumerAssignment
//该方法返回一个PartitionAssignmentState数据
private def collectConsumerAssignment(group: String,
coordinator: Option[Node],
topicPartitions: Seq[TopicPartition],
getPartitionOffset: TopicPartition => Option[Long],
consumerIdOpt: Option[String],
hostOpt: Option[String],
clientIdOpt: Option[String]):
Array[PartitionAssignmentState] = {
// 一般情况下,topicPartitions为空
if (topicPartitions.isEmpty) {
Array[PartitionAssignmentState](
PartitionAssignmentState(group, coordinator, None, None, None, getLag(None, None), consumerIdOpt, hostOpt, clientIdOpt, None)
)
}
else
// 主要看这个方法
describePartitions(group, coordinator, topicPartitions.sortBy(_.partition), getPartitionOffset, consumerIdOpt, hostOpt, clientIdOpt)
}
7. describePartitions
// Resolves the log-end offset of each partition and combines it with the
// committed offset to produce one PartitionAssignmentState (with lag) per row.
private def describePartitions(group: String,
coordinator: Option[Node],
topicPartitions: Seq[TopicPartition],
getPartitionOffset: TopicPartition => Option[Long],
consumerIdOpt: Option[String],
hostOpt: Option[String],
clientIdOpt: Option[String]): Array[PartitionAssignmentState] = {
// Builds one row: lag = logEndOffset - committedOffset (None if either is missing).
def getDescribePartitionResult(topicPartition: TopicPartition, logEndOffsetOpt: Option[Long]): PartitionAssignmentState = {
val offset = getPartitionOffset(topicPartition)
PartitionAssignmentState(group, coordinator, Option(topicPartition.topic), Option(topicPartition.partition), offset,
getLag(offset, logEndOffsetOpt), consumerIdOpt, hostOpt, clientIdOpt, logEndOffsetOpt)
}
// getLogEndOffsets:
// 1. builds a KafkaConsumer from --bootstrap-server and the group id;
// 2. calls KafkaConsumer.endOffsets(topicPartitions) to get each partition's
//    latest offset;
// 3. each end offset is then combined (above) with the committed offset
//    fetched earlier, yielding the lag that gets printed.
getLogEndOffsets(topicPartitions).map {
case (topicPartition, LogOffsetResult.LogOffset(offset)) => getDescribePartitionResult(topicPartition, Some(offset))
case (topicPartition, _) => getDescribePartitionResult(topicPartition, None)
}.toArray
}
说明
在kafka内,有以下几个概念
- broker
- topic
- partition
- group
- offset
分别说明:
1. broker
broker可以理解为一台安装kafka的机器,多个broker构成kafka集群,如果只有一个broker,那么这个kafka服务是单机的
2. topic
topic 翻译过来为主题. 一个kafka集群下有多个topic
3. partition
partition翻译为分区,hive里面就有分区的概念,与hive的分区类似,一个topic 内有多个partition
4. groupId
结合实际说明:
目前,我有 topic: kafka_lag ,该topic有两个partition,目前往topic内生产10000条数据,按照默认的分区测试,partition 0,partition 1 分别有 5000 条数据.
除此之外,我有两个group:kafka-consumer-lag-1,kafka-consumer-lag-2
首先:我使用 kafka-consumer-lag-1 去消费topic内的数据,假如 partition 0,1 分别消费 2000 条,则offset 如下:
groupId | topic | partition | currentOffset | lag | endOffset |
kafka-consumer-lag-1 | kafka_lag | 0 | 2000 | 3000 | 5000 |
kafka-consumer-lag-1 | kafka_lag | 1 | 2000 | 3000 | 5000 |
kafka-consumer-lag-2 | kafka_lag | 0 | 0 | 5000 | 5000 |
kafka-consumer-lag-2 | kafka_lag | 1 | 0 | 5000 | 5000 |
然后,我用 kafka-consumer-lag-2 去消费topic内的数据,partition 0,1 分区消费4000 ,则offset如下:
groupId | topic | partition | currentOffset | lag | endOffset |
kafka-consumer-lag-1 | kafka_lag | 0 | 2000 | 3000 | 5000 |
kafka-consumer-lag-1 | kafka_lag | 1 | 2000 | 3000 | 5000 |
kafka-consumer-lag-2 | kafka_lag | 0 | 4000 | 1000 | 5000 |
kafka-consumer-lag-2 | kafka_lag | 1 | 4000 | 1000 | 5000 |
总结
由上面的数据展示可知:
topic + partition 对应唯一的endOffset
topic + partition + group 对应唯一的currentOffset
其实 kafka 提供的 kafka-consumer-groups 命令就是使用的这个原理
- 构造AdminClient,使用AdminClient 的listConsumerGroupOffsets() 根据groupid 获取每一个 topic + partition + groupId 对应的唯一的currentOffset
- 实例化KafkaConsumer对象,根据topic + partiton 组成的TopicPartition 对象集合,获取 topic + partition 对应的唯一的endOffset
- 通过一系列计算(endOffset - currentOffset),获取到了groupID 对应的Lag ,最终打印呈现
由于kafka 源码是使用scala写的,没了解过scala的人看起来会比较困难,我用java重新给逻辑实现了一遍,代码如下:
package com.clb.lag;
import org.apache.kafka.clients.admin.AdminClient;
import org.apache.kafka.clients.admin.ConsumerGroupDescription;
import org.apache.kafka.clients.admin.MemberDescription;
import org.apache.kafka.clients.consumer.ConsumerConfig;
import org.apache.kafka.clients.consumer.KafkaConsumer;
import org.apache.kafka.clients.consumer.OffsetAndMetadata;
import org.apache.kafka.common.ConsumerGroupState;
import org.apache.kafka.common.KafkaFuture;
import org.apache.kafka.common.TopicPartition;
import org.apache.kafka.common.serialization.StringDeserializer;
import java.util.*;
import java.util.function.Consumer;
/**
* @author Xiahu
* @create 2021/1/11
*/
/**
 * Java re-implementation of the lag-collection logic of
 * {@code kafka.admin.ConsumerGroupCommand}:
 * 1. AdminClient.listConsumerGroupOffsets -> committed offset per (topic, partition, group);
 * 2. KafkaConsumer.endOffsets -> log-end offset per (topic, partition);
 * 3. lag = logEndOffset - committedOffset.
 */
public class KafkaOffsetTool {
    /** Placeholder for the consumer/host/client columns of partitions with no live member. */
    private static final String MISSING_COLUMN_VALUE = "-";

    private final AdminClient adminClient;
    private KafkaConsumer<String, String> consumer;

    /**
     * Creates the AdminClient used to query group metadata and committed offsets.
     */
    public KafkaOffsetTool() {
        Properties properties = new Properties();
        properties.put("bootstrap.servers", "node2:9092");
        // Kerberos-secured clusters: enable and adapt this block.
        // (GSSAPI needs a SASL channel, hence SASL_PLAINTEXT rather than PLAINTEXT.)
        if (false) {
            properties.put("security.protocol", "SASL_PLAINTEXT");
            properties.put("sasl.mechanism", "GSSAPI");
            properties.put("sasl.kerberos.service.name", "kafka");
        }
        this.adminClient = AdminClient.create(properties);
    }

    /**
     * Collects one row per partition the group touches, mirroring
     * {@code ConsumerGroupCommand.collectGroupOffsets()}: first the partitions
     * assigned to live members (largest assignments first), then the partitions
     * that only have a committed offset, reported with "-" columns.
     *
     * @param group consumer group id
     * @return rows with group/coordinator/topic/partition/consumer info and the
     *         committed offset; lag and logEndOffset are filled in by {@link #getLag}
     * @throws Exception if the AdminClient futures fail
     */
    public List<PartitionOffsetState> collectGroupOffsets(String group) throws Exception {
        List<PartitionOffsetState> result = new ArrayList<>();
        Map<String, KafkaFuture<ConsumerGroupDescription>> describedGroups =
                adminClient.describeConsumerGroups(Collections.singletonList(group)).describedGroups();
        ConsumerGroupDescription consumerGroup = describedGroups.get(group).get();
        Map<TopicPartition, OffsetAndMetadata> committedOffsets = getCommitsOffsets(group);
        String coordinator = consumerGroup.coordinator().toString();

        // Members with a non-empty assignment, largest assignments first
        // (matches the Scala sortWith(_.size > _.size)).
        List<MemberDescription> members = new ArrayList<>();
        for (MemberDescription member : consumerGroup.members()) {
            if (!member.assignment().topicPartitions().isEmpty()) {
                members.add(member);
            }
        }
        // Proper Comparator (returns 0 on ties) — the original comparator never
        // returned 0 and could trip "Comparison method violates its general contract".
        members.sort(Comparator.comparingInt(
                (MemberDescription m) -> m.assignment().topicPartitions().size()).reversed());

        // Partitions already reported for a live member; used below to avoid
        // duplicating them in the "no consumer" section.
        Set<TopicPartition> assignedTopicPartitions = new HashSet<>();

        for (MemberDescription member : members) {
            for (TopicPartition tp : member.assignment().topicPartitions()) {
                assignedTopicPartitions.add(tp);
                OffsetAndMetadata committed = committedOffsets.get(tp);
                PartitionOffsetState state = new PartitionOffsetState();
                state.setGroup(group);
                state.setCoordinator(coordinator);
                state.setHost(member.host());
                state.setClientId(member.clientId());
                state.setConsumerId(member.consumerId());
                state.setTopic(tp.topic());
                state.setPartition(tp.partition());
                // An assigned partition may have no committed offset yet; keep null
                // (the Scala original models this as None).
                state.setOffset(committed == null ? null : committed.offset());
                result.add(state);
            }
        }

        // Committed offsets with no live consumer. Skipping already-assigned
        // partitions fixes the original bug of emitting every partition twice.
        for (Map.Entry<TopicPartition, OffsetAndMetadata> entry : committedOffsets.entrySet()) {
            TopicPartition tp = entry.getKey();
            if (assignedTopicPartitions.contains(tp)) {
                continue;
            }
            PartitionOffsetState state = new PartitionOffsetState();
            state.setGroup(group);
            state.setCoordinator(coordinator);
            state.setHost(MISSING_COLUMN_VALUE);
            state.setClientId(MISSING_COLUMN_VALUE);
            state.setConsumerId(MISSING_COLUMN_VALUE);
            state.setTopic(tp.topic());
            state.setPartition(tp.partition());
            state.setOffset(entry.getValue().offset());
            result.add(state);
        }
        return result;
    }

    /** Fetches the group's committed offsets: (topic, partition) -> OffsetAndMetadata. */
    private Map<TopicPartition, OffsetAndMetadata> getCommitsOffsets(String groupId) throws Exception {
        return adminClient.listConsumerGroupOffsets(groupId).partitionsToOffsetAndMetadata().get();
    }

    /**
     * Fills in logEndOffset and lag (= logEndOffset - committedOffset) for every
     * row, using a single {@code endOffsets()} call and a map lookup per row
     * (the original nested loops were O(n*m)). Rows whose committed offset is
     * unknown keep a null lag, mirroring the Scala {@code getLag(None, _)}.
     *
     * @param partitionOffsetStateList rows from {@link #collectGroupOffsets}
     * @param groupId group id used when lazily creating the consumer
     * @return the same list, mutated in place
     */
    public List<PartitionOffsetState> getLag(List<PartitionOffsetState> partitionOffsetStateList, String groupId) {
        getConsumer(new Properties(), groupId);
        List<TopicPartition> topicPartitions = new ArrayList<>();
        for (PartitionOffsetState row : partitionOffsetStateList) {
            topicPartitions.add(new TopicPartition(row.getTopic(), row.getPartition()));
        }
        Map<TopicPartition, Long> endOffsets = consumer.endOffsets(topicPartitions);
        for (PartitionOffsetState row : partitionOffsetStateList) {
            Long end = endOffsets.get(new TopicPartition(row.getTopic(), row.getPartition()));
            if (end == null) {
                continue; // broker returned no end offset for this partition
            }
            row.setLogEndOffset(end);
            if (row.getOffset() != null) {
                row.setLag(end - row.getOffset());
            }
        }
        return partitionOffsetStateList;
    }

    /** Lazily creates the shared KafkaConsumer on first use. */
    private KafkaConsumer<String, String> getConsumer(Properties prop, String groupId) {
        if (consumer == null) {
            createConsumer(prop, groupId);
        }
        return consumer;
    }

    /**
     * Builds the KafkaConsumer used only for {@code endOffsets()}; auto-commit is
     * disabled so this tool never moves the group's offsets.
     */
    public void createConsumer(Properties prop, String groupId) {
        // Kerberos-secured clusters: supply krb5/JAAS paths in prop and enable this block.
        if (false) {
            System.setProperty("java.security.krb5.conf", prop.getProperty("kerberos.krb5.conf"));
            System.setProperty("java.security.auth.login.config", prop.getProperty("kerberos.login.config"));
            prop.put("security.protocol", "SASL_PLAINTEXT");
            prop.put("sasl.mechanism", "GSSAPI");
            prop.put("sasl.kerberos.service.name", "kafka");
        }
        String deserializer = StringDeserializer.class.getName();
        prop.put(ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG, "node1:9092");
        prop.put(ConsumerConfig.GROUP_ID_CONFIG, groupId);
        prop.put(ConsumerConfig.ENABLE_AUTO_COMMIT_CONFIG, "false");
        prop.put(ConsumerConfig.SESSION_TIMEOUT_MS_CONFIG, "30000");
        prop.put(ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG, deserializer);
        prop.put(ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG, deserializer);
        consumer = new KafkaConsumer<>(prop);
    }

    public static void main(String[] args) throws Exception {
        KafkaOffsetTool kafkaOffsetTool = new KafkaOffsetTool();
        List<PartitionOffsetState> partitionOffsetStates = kafkaOffsetTool.collectGroupOffsets("kafka-lag");
        partitionOffsetStates = kafkaOffsetTool.getLag(partitionOffsetStates, "kafka-lag");
        System.out.println(partitionOffsetStates);
    }
}
PartitionOffsetState
package com.clb.lag;
import lombok.Data;
/**
* @author Xiahu
* @create 2021/1/11
*/
// One row of the lag report: identifies (group, topic, partition), the consumer
// currently holding the partition (or "-"), and the offset/lag numbers.
// Lombok @Data generates getters, setters, equals/hashCode and toString.
@Data
public class PartitionOffsetState {
// Consumer group id this row belongs to.
private String group;
// Group coordinator node, rendered via Node.toString().
private String coordinator;
private String topic;
private int partition;
// Committed offset of (group, topic, partition); null when the group has
// no committed offset for an assigned partition.
private Long offset;
// logEndOffset - offset; null until getLag() fills it in (or offset is null).
private Long lag;
// Member identity columns; "-" when no live consumer owns the partition.
private String consumerId;
private String host;
private String clientId;
// Latest offset of the partition, from KafkaConsumer.endOffsets(); null until getLag().
private Long logEndOffset;
}