Reference: 《深入理解Kafka核心设计和实践原理》 (Understanding Kafka's Core Design and Practice in Depth)
8. Data Retention Policy
Each partition physically corresponds to a directory, which stores all the message log files and index files for that partition.
1. Time-based retention: log.retention.hours=168 (one week)
2. Size-based retention: log.retention.bytes=1073741824 (1 GB)
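For reference, both policies are set on the broker in server.properties, and a segment is eligible for deletion as soon as either limit is exceeded. A minimal sketch (values are illustrative, not a recommendation):
# server.properties (broker side) – illustrative values
log.retention.hours=168          # delete segments older than one week
log.retention.bytes=1073741824   # or once a partition log exceeds 1 GB
log.retention.check.interval.ms=300000   # how often the retention check runs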
9. Kafka Consumption
Kafka provides a high-level consumer API and a low-level consumer API.
Features of the high-level consumer API: there is no need to manage offsets yourself (the framework tracks them), and no need to manage partitions, replicas, and so on.
If a consumer disconnects, it automatically resumes from the last offset recorded in ZooKeeper (by default the offset in ZooKeeper is updated once per minute). To keep offset tracking independent for the same topic, we normally use different consumer groups, because each group records its own offsets and does not affect the offsets of other groups.
Features of the low-level consumer API: we can manage the offset values ourselves.
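As a minimal sketch of managing offsets yourself, the snippet below disables auto-commit and commits manually with the newer Java client (shown purely to illustrate the idea, not the old SimpleConsumer API; broker, group, and topic names are placeholders, and process() is a hypothetical handler):
// Sketch: manual offset commits (new Java consumer, not the old SimpleConsumer)
Properties props = new Properties();
props.put("bootstrap.servers", "spark1:9092");
props.put("group.id", "manual-offset-demo");
props.put("enable.auto.commit", "false");           // turn off auto commit
props.put("key.deserializer", "org.apache.kafka.common.serialization.StringDeserializer");
props.put("value.deserializer", "org.apache.kafka.common.serialization.StringDeserializer");
KafkaConsumer<String, String> consumer = new KafkaConsumer<>(props);
consumer.subscribe(Arrays.asList("first"));
while (true) {
    ConsumerRecords<String, String> records = consumer.poll(100);
    for (ConsumerRecord<String, String> record : records) {
        process(record);                             // hypothetical processing step
    }
    consumer.commitSync();                           // commit only after processing succeeds
}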
Consumer group:
Consumers consume as members of a consumer group: several consumers usually form a group and consume a topic together. Within one group, each partition can be consumed by only one consumer at a time, but multiple groups can consume the same partition simultaneously.
This lets consumers scale horizontally and read large volumes of messages in parallel. If one consumer fails, the remaining members of the consumer group automatically rebalance and take over the partitions that the failed consumer was reading (a way to observe this is sketched below).
As shown in the figure, consumer 1 can consume two partitions of the topic.
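As a hedged sketch of how a group member can observe such rebalances with the Java client, a ConsumerRebalanceListener can be passed to subscribe(); the consumer variable, group, and topic names are assumed from the examples above:
// Sketch: watching partition assignment changes during a group rebalance.
// Requires: org.apache.kafka.clients.consumer.ConsumerRebalanceListener,
//           org.apache.kafka.common.TopicPartition, java.util.Collection
consumer.subscribe(Arrays.asList("second"), new ConsumerRebalanceListener() {
    @Override
    public void onPartitionsRevoked(Collection<TopicPartition> partitions) {
        // called before a rebalance takes partitions away from this consumer
        System.out.println("Revoked: " + partitions);
    }
    @Override
    public void onPartitionsAssigned(Collection<TopicPartition> partitions) {
        // called after the rebalance assigns (possibly new) partitions to this consumer
        System.out.println("Assigned: " + partitions);
    }
});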
Consumption process:
Consumers use a pull model, actively fetching data from the broker. With a push model, the broker would decide the send rate, but the consumers in a Kafka cluster consume at different speeds, so pushing could easily overwhelm slower consumers, typically showing up as denial of service or network congestion. The pull model adapts to each consumer's capacity and avoids this problem.
Pulling not only lets a consumer control its own consumption rate, it also lets it choose how to consume, in batches or record by record, depending on the scenario (some of the relevant settings are sketched below).
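As a hedged sketch (values are illustrative), a few standard consumer settings that influence how much data each pull returns:
// Sketch: tuning how much data each poll() returns (illustrative values)
props.put("fetch.min.bytes", "1024");    // broker waits until at least 1 KB is available...
props.put("fetch.max.wait.ms", "500");   // ...or until 500 ms have passed
props.put("max.poll.records", "100");    // cap the number of records returned per poll()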
10. Kafka Examples
package com.paojiaojiang.consumer;

import org.apache.kafka.clients.consumer.ConsumerRecord;
import org.apache.kafka.clients.consumer.ConsumerRecords;
import org.apache.kafka.clients.consumer.KafkaConsumer;

import java.util.Arrays;
import java.util.Properties;

/**
 * @Author: jja
 * @Description: High-level Kafka consumer example
 * @Date: 2019/3/2 9:09
 */
public class CustomerConsumer {

    public static void main(String[] args) {
        // All configuration
        Properties props = new Properties();
        // Kafka cluster
        props.put("bootstrap.servers", "spark1:9092");
        // Consumer group id
        props.put("group.id", "test");
        // Automatically commit offsets
        props.put("enable.auto.commit", "true");
        // Auto-commit interval
        props.put("auto.commit.interval.ms", "1000");
        // Deserializers
        props.put("key.deserializer", "org.apache.kafka.common.serialization.StringDeserializer");
        props.put("value.deserializer", "org.apache.kafka.common.serialization.StringDeserializer");
        // Create the consumer
        KafkaConsumer<String, String> consumer = new KafkaConsumer<>(props);
        // Subscribe to topics (note: topic "third" does not exist)
        consumer.subscribe(Arrays.asList("second", "first", "third"));
        while (true) {
            // Pull data
            ConsumerRecords<String, String> datas = consumer.poll(100);
            for (ConsumerRecord<String, String> data : datas) {
                System.out.println(data.topic() + " "
                        + data.partition() + " "
                        + data.value());
            }
        }
    }
}
package com.paojiaojiang.producer;

import org.apache.kafka.clients.producer.*;

import java.util.Properties;

/**
 * @Author: jja
 * @Description: Kafka producer example
 * @Date: 2019/3/1 0:01
 */
public class CustomerProducer {

    public static void main(String[] args) {
        // Number of messages to send
        int num = 10;
        // Configuration
        Properties props = new Properties();
        // Kafka cluster
        props.put("bootstrap.servers", "spark1:9092");
        // Acknowledgement level
        props.put(ProducerConfig.ACKS_CONFIG, "all");
        // Number of retries
        props.put("retries", 0);
        // Batch size
        props.put("batch.size", 16384);
        // Linger time before a batch is sent
        props.put("linger.ms", 1);
        // Buffer memory
        props.put("buffer.memory", 33554432);
        // Serializers
        props.put("key.serializer", "org.apache.kafka.common.serialization.StringSerializer");
        props.put("value.serializer", "org.apache.kafka.common.serialization.StringSerializer");
        // Custom partitioner
        props.put("partitioner.class", "com.paojiaojiang.producer.CustomerPartitioner");

        Producer<String, String> producer = new KafkaProducer<>(props);
        for (int i = 0; i < num; i++) {
            producer.send(new ProducerRecord<>("second", Integer.toString(i), Integer.toString(i)), new Callback() {
                @Override
                public void onCompletion(RecordMetadata metadata, Exception exception) {
                    if (exception == null) {
                        System.out.println(metadata.offset() + "-" + metadata.partition());
                    } else {
                        System.out.println("Send failed");
                    }
                }
            });
        }
        producer.close();
    }
}
Consumer example from the official documentation:
package com.paojiaojiang.consumer;

import org.apache.kafka.clients.consumer.ConsumerRecord;
import org.apache.kafka.clients.consumer.ConsumerRecords;
import org.apache.kafka.clients.consumer.KafkaConsumer;

import java.util.Arrays;
import java.util.Properties;

/**
 * @Author: jja
 * @Description: Consumer example based on the official documentation
 * @Date: 2019/3/19 23:45
 */
public class ClassicalConsumer {

    public static void main(String[] args) {
        Properties props = new Properties();
        // Kafka broker addresses; not every broker needs to be listed
        props.put("bootstrap.servers", "hadoop102:9092");
        // Specify the consumer group
        props.put("group.id", "test");
        // Whether to auto-commit offsets
        props.put("enable.auto.commit", "true");
        // Auto-commit interval
        props.put("auto.commit.interval.ms", "1000");
        // Key deserializer class
        props.put("key.deserializer", "org.apache.kafka.common.serialization.StringDeserializer");
        // Value deserializer class
        props.put("value.deserializer", "org.apache.kafka.common.serialization.StringDeserializer");
        // Create the consumer
        KafkaConsumer<String, String> consumer = new KafkaConsumer<>(props);
        // Topics to subscribe to; several topics can be subscribed at once
        consumer.subscribe(Arrays.asList("first", "second", "third"));
        while (true) {
            // Poll for data with a 100 ms timeout
            ConsumerRecords<String, String> records = consumer.poll(100);
            for (ConsumerRecord<String, String> record : records) {
                System.out.printf("offset = %d, key = %s, value = %s%n", record.offset(), record.key(), record.value());
            }
        }
    }
}
11. Kafka Producer
First, create a topic named fourth:
[root@spark kafka]# bin/kafka-topics.sh --zookeeper spark:2181,spark1:2181,spark2:2181 --create --replication-factor 3 --partitions 3 --topic fourth
List the current topics:
[root@spark2 kafka]# bin/kafka-topics.sh --zookeeper spark1:2181 --list
first
fisrt
fourth // the newly created topic
second
third
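To double-check the partition and replica layout of the new topic, the standard describe command can be used (host name follows the examples above):
[root@spark2 kafka]# bin/kafka-topics.sh --zookeeper spark1:2181 --describe --topic fourth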
package com.paojiaojiang.producer;

import kafka.producer.KeyedMessage;
import kafka.javaapi.producer.Producer;
import kafka.producer.ProducerConfig;

import java.util.ArrayList;
import java.util.List;
import java.util.Properties;

/**
 * @Author: jja
 * @Description: Producer example using the old Scala client API
 * @Date: 2019/3/20 0:35
 */
public class KafkaProducer implements Runnable {

    private Producer<String, String> producer;
    private ProducerConfig config = null;

    public KafkaProducer() {
        Properties props = new Properties();
        props.put("zookeeper.connect", "spark:2181,spark1:2181,spark2:2181");
        // props.put("zookeeper.connect", "localhost:2181");
        // Serializer class; defaults to kafka.serializer.DefaultEncoder, i.e. byte[]
        props.put("serializer.class", "kafka.serializer.StringEncoder");
        // Synchronous or asynchronous sending. Async improves throughput, but messages
        // that have not yet been sent can be lost if the producer dies.
        props.put("producer.type", "sync");
        // Compression codec: 0 = none (default), 1 = gzip, 2 = snappy. Compressed messages
        // carry a header identifying the codec, so decompression on the consumer side is transparent.
        props.put("compression.codec", "1");
        // Broker list used to fetch metadata; not every broker needs to be listed
        props.put("metadata.broker.list", "spark:9092,spark1:9092,spark2:9092");
        config = new ProducerConfig(props);
    }

    @Override
    public void run() {
        producer = new Producer<String, String>(config);
        for (int i = 1; i <= 3; i++) { // send to the 3 partitions
            List<KeyedMessage<String, String>> messageList = new ArrayList<>();
            for (int j = 0; j < 10; j++) { // 10 messages per partition
                // KeyedMessage(topic, key, message)
                messageList.add(new KeyedMessage<>("fourth", "partition[" + i + "]", "message[The " + i + " message]"));
            }
            producer.send(messageList);
        }
    }

    public static void main(String[] args) {
        Thread t = new Thread(new KafkaProducer());
        t.start();
    }
}
12. Multi-threaded Kafka Consumption
package com.paojiaojiang.consumer;

import kafka.consumer.ConsumerConfig;
import kafka.consumer.KafkaStream;
import kafka.javaapi.consumer.ConsumerConnector;

import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;

/**
 * @Author: jja
 * @Description: Multi-threaded consumer using the old high-level consumer API
 * @Date: 2019/3/19 23:52
 */
public class MultiThreadConsumer implements Runnable {

    private ConsumerConfig consumerConfig;
    private static String TOPIC = "fourth";
    Properties props;
    // Three reader threads, one per partition, since the topic has three partitions
    final static int NUM_THREAD = 3;

    public MultiThreadConsumer() {
        props = new Properties();
        // The old high-level consumer connects through ZooKeeper, not the brokers
        props.put("zookeeper.connect", "spark:2181,spark1:2181,spark2:2181");
        props.put("zookeeper.connectiontimeout.ms", "30000");
        props.put("group.id", "paojiaojiang");
        props.put("zookeeper.session.timeout.ms", "400");
        props.put("zookeeper.sync.time.ms", "200");
        props.put("auto.commit.interval.ms", "1000");
        props.put("auto.offset.reset", "smallest");
        consumerConfig = new ConsumerConfig(props);
    }

    @Override
    public void run() {
        // Ask for NUM_THREAD streams for the topic
        Map<String, Integer> topicCountMap = new HashMap<>();
        topicCountMap.put(TOPIC, NUM_THREAD);
        ConsumerConnector consumer = kafka.consumer.Consumer.createJavaConsumerConnector(consumerConfig);
        Map<String, List<KafkaStream<byte[], byte[]>>> consumerMap = consumer.createMessageStreams(topicCountMap);
        List<KafkaStream<byte[], byte[]>> streams = consumerMap.get(TOPIC);
        // One worker thread per stream
        ExecutorService executorService = Executors.newFixedThreadPool(NUM_THREAD);
        for (final KafkaStream stream : streams) {
            executorService.submit(new KafkaConsumerThread(stream));
        }
    }

    public static void main(String[] args) {
        System.out.println(TOPIC);
        Thread t = new Thread(new MultiThreadConsumer());
        t.start();
    }
}
package com.paojiaojiang.consumer;

import kafka.consumer.ConsumerIterator;
import kafka.consumer.KafkaStream;
import kafka.message.MessageAndMetadata;

/**
 * @Author: jja
 * @Description: Worker thread that drains one KafkaStream
 * @Date: 2019/3/19 23:54
 */
public class KafkaConsumerThread implements Runnable {

    private KafkaStream<byte[], byte[]> stream;

    public KafkaConsumerThread(KafkaStream<byte[], byte[]> stream) {
        this.stream = stream;
    }

    @Override
    public void run() {
        ConsumerIterator<byte[], byte[]> it = stream.iterator();
        // hasNext() blocks until the next message arrives on this stream
        while (it.hasNext()) {
            MessageAndMetadata<byte[], byte[]> mam = it.next();
            System.out.println(Thread.currentThread().getName() + ": partition[" + mam.partition() + "],"
                    + "offset[" + mam.offset() + "], " + new String(mam.message()));
        }
    }
}
The output is as follows:
pool-2-thread-3: partition[0],offset[0], message[The 2 message]
pool-2-thread-2: partition[2],offset[0], message[The 1 message]
pool-2-thread-1: partition[1],offset[0], message[The 3 message]
pool-2-thread-3: partition[0],offset[1], message[The 2 message]
pool-2-thread-2: partition[2],offset[1], message[The 1 message]
pool-2-thread-1: partition[1],offset[1], message[The 3 message]
pool-2-thread-3: partition[0],offset[2], message[The 2 message]
pool-2-thread-2: partition[2],offset[2], message[The 1 message]
pool-2-thread-1: partition[1],offset[2], message[The 3 message]
pool-2-thread-3: partition[0],offset[3], message[The 2 message]
pool-2-thread-2: partition[2],offset[3], message[The 1 message]
pool-2-thread-1: partition[1],offset[3], message[The 3 message]
pool-2-thread-3: partition[0],offset[4], message[The 2 message]
pool-2-thread-1: partition[1],offset[4], message[The 3 message]
pool-2-thread-3: partition[0],offset[5], message[The 2 message]
pool-2-thread-2: partition[2],offset[4], message[The 1 message]
pool-2-thread-1: partition[1],offset[5], message[The 3 message]
pool-2-thread-3: partition[0],offset[6], message[The 2 message]
pool-2-thread-2: partition[2],offset[5], message[The 1 message]
pool-2-thread-3: partition[0],offset[7], message[The 2 message]
pool-2-thread-1: partition[1],offset[6], message[The 3 message]
pool-2-thread-2: partition[2],offset[6], message[The 1 message]
pool-2-thread-1: partition[1],offset[7], message[The 3 message]
pool-2-thread-3: partition[0],offset[8], message[The 2 message]
pool-2-thread-1: partition[1],offset[8], message[The 3 message]
pool-2-thread-2: partition[2],offset[7], message[The 1 message]
pool-2-thread-3: partition[0],offset[9], message[The 2 message]
pool-2-thread-1: partition[1],offset[9], message[The 3 message]
pool-2-thread-2: partition[2],offset[8], message[The 1 message]
pool-2-thread-2: partition[2],offset[9], message[The 1 message]
As the output shows, offsets within each partition are consumed in order, but the offsets across partitions are not: Kafka only guarantees ordering within a single partition, not across the whole topic.
Note: keep the number of consumer threads consistent with num.partitions in Kafka's server.properties. For example:
############################# Log Basics #############################
# A comma separated list of directories under which to store log files
log.dirs=/opt/module/kafka/logs
# The default number of log partitions per topic. More partitions allow greater
# parallelism for consumption, but this will also result in more files across
# the brokers.
num.partitions=3
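As an aside, the example above uses the old high-level consumer API. With the newer org.apache.kafka.clients consumer used in section 10, KafkaConsumer is not thread-safe, so the usual pattern is one consumer instance per thread, all sharing the same group.id. A minimal sketch, assuming the same props as in section 10 (bootstrap.servers, group.id, deserializers) and illustrative names:
// Sketch: one KafkaConsumer per thread with the newer Java client
ExecutorService pool = Executors.newFixedThreadPool(3);   // match num.partitions
for (int i = 0; i < 3; i++) {
    pool.submit(() -> {
        // each thread owns its own consumer instance
        KafkaConsumer<String, String> consumer = new KafkaConsumer<>(props);
        consumer.subscribe(Arrays.asList("fourth"));
        while (true) {
            ConsumerRecords<String, String> records = consumer.poll(100);
            for (ConsumerRecord<String, String> record : records) {
                System.out.println(Thread.currentThread().getName() + ": partition[" + record.partition()
                        + "], offset[" + record.offset() + "], " + record.value());
            }
        }
    });
}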