Notes on Kafka
- Kafka follows the publish/subscribe model with consumer pull: consumers actively fetch data from the brokers, and messages are retained according to the configured retention policy rather than being deleted as soon as they are consumed.
- Benefits of a message queue
- Decoupling
- Redundancy
- Scalability
- Flexibility and peak-load handling
- Recoverability
- Ordering guarantees
- Buffering
- Asynchronous communication
- Kafka is a distributed message queue. Messages are organized by topic; the party that sends messages is called a producer and the party that consumes them is called a consumer. A Kafka cluster consists of multiple Kafka instances, each of which is called a broker.
- Both the Kafka cluster and the consumers rely on a ZooKeeper cluster to store metadata and keep the system available.
- Deploying a Kafka cluster
- Download the Kafka package from the official website
- Modify the configuration file
- Edit server.properties
# Unique within the cluster
broker.id=0
# Defaults to false
delete.topic.enable=true
# Directory for Kafka's logs; the message data is stored here as well
log.dirs=/usr/hadoop/kafka/kafka_2.11-2.1.1/logs
# ZooKeeper connection address; separate the nodes of a ZooKeeper cluster with commas
zookeeper.connect=hadoop:2181
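For a multi-broker cluster, every node gets its own server.properties with a unique broker.id and the full ZooKeeper connection string. A sketch for a hypothetical second node (the host names hadoop2 and hadoop3 are assumptions):
# server.properties on an assumed second node
broker.id=1
delete.topic.enable=true
log.dirs=/usr/hadoop/kafka/kafka_2.11-2.1.1/logs
zookeeper.connect=hadoop:2181,hadoop2:2181,hadoop3:2181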
- Start Kafka: bin/kafka-server-start.sh config/server.properties
- Testing
# Create a topic ("Created topic "first"." indicates success)
bin/kafka-topics.sh --create --zookeeper hadoop:2181 \
--partitions 2 --replication-factor 1 --topic first
# List the current topics
bin/kafka-topics.sh --list --zookeeper hadoop:2181
# Console producer: type the messages to send
bin/kafka-console-producer.sh --broker-list hadoop:9092 --topic first
# Console consumer: consume the messages
bin/kafka-console-consumer.sh --bootstrap-server hadoop:9092 --topic first
# Describe a topic
bin/kafka-topics.sh --zookeeper hadoop:2181 --describe --topic first
# Delete a topic
bin/kafka-topics.sh --delete --zookeeper hadoop:2181 --topic first
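To re-read messages that were produced before the consumer was started, the console consumer accepts a --from-beginning flag (same topic and broker as above):
bin/kafka-console-consumer.sh --bootstrap-server hadoop:9092 --topic first --from-beginning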
- Using Kafka's high-level API
- Creating a producer
package com.kafka.producer;
import org.apache.kafka.clients.producer.*;
import java.util.Properties;
/**
 * Producer based on the high-level (new) producer API
 */
public class CustomerProducer {
    public static void main(String[] args) {
        // Producer configuration (see ProducerConfig)
        Properties properties = new Properties();
        // Kafka broker address
        properties.put("bootstrap.servers", "hadoop:9092");
        // Acknowledgement level
        properties.put(ProducerConfig.ACKS_CONFIG, "all");
        // Number of retries
        properties.put("retries", 0);
        // Batch size
        properties.put("batch.size", 16384);
        // Linger time before a batch is sent
        properties.put("linger.ms", 1);
        // Buffer memory
        properties.put("buffer.memory", 33554432);
        // Key/value serializers
        properties.put("key.serializer",
                "org.apache.kafka.common.serialization.StringSerializer");
        properties.put("value.serializer",
                "org.apache.kafka.common.serialization.StringSerializer");
        // Custom partitioner
        properties.put(ProducerConfig.PARTITIONER_CLASS_CONFIG, CustomerPartition.class);
        // Create the producer
        KafkaProducer<String, String> producer = new KafkaProducer<>(properties);
        // Send messages in a loop, reporting the result through a callback
        for (int i = 0; i < 10; i++) {
            producer.send(new ProducerRecord<String, String>("first", String.valueOf(i)),
                    (metadata, exception) -> {
                        if (exception != null) {
                            System.out.println("send failed");
                        } else {
                            System.out.println(metadata.partition() + "---" +
                                    metadata.offset());
                        }
                    });
        }
        producer.close();
    }
}
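The loop above sends asynchronously. A minimal sketch of a synchronous send, assuming the same producer instance and placed before producer.close(); get() blocks until the broker acknowledges the record and throws checked exceptions, so java.util.concurrent.ExecutionException must also be imported:
try {
    // Blocking send: wait for the broker's acknowledgement of this single record
    RecordMetadata metadata = producer.send(
            new ProducerRecord<>("first", "sync-message")).get();
    System.out.println(metadata.partition() + "---" + metadata.offset());
} catch (InterruptedException | ExecutionException e) {
    e.printStackTrace();
}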
- Custom partitioner
package com.kafka.producer;
import org.apache.kafka.clients.producer.Partitioner;
import org.apache.kafka.common.Cluster;
import java.util.Map;
/**
 * Custom partitioner: every record is routed to partition 0
 */
public class CustomerPartition implements Partitioner {
    @Override
    public int partition(String topic, Object key, byte[] keyBytes,
                         Object value, byte[] valueBytes, Cluster cluster) {
        return 0;
    }
    @Override
    public void close() {
    }
    @Override
    public void configure(Map<String, ?> configs) {
    }
}
- Creating a consumer with the high-level API
package com.kafka.producer;
import org.apache.kafka.clients.consumer.ConsumerConfig;
import org.apache.kafka.clients.consumer.ConsumerRecord;
import org.apache.kafka.clients.consumer.ConsumerRecords;
import org.apache.kafka.clients.consumer.KafkaConsumer;
import java.util.Arrays;
import java.util.Properties;
public class CustomerConsumer {
    public static void main(String[] args) {
        // Consumer configuration (see ConsumerConfig)
        Properties properties = new Properties();
        // Kafka broker address
        properties.put("bootstrap.servers", "hadoop:9092");
        // Consumer group id
        properties.put("group.id", "test");
        // Commit offsets automatically
        properties.put("enable.auto.commit", "true");
        // Auto-commit interval
        properties.put(ConsumerConfig.AUTO_COMMIT_INTERVAL_MS_CONFIG, "10000");
        // Key/value deserializers
        properties.put(ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG,
                "org.apache.kafka.common.serialization.StringDeserializer");
        properties.put(ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG,
                "org.apache.kafka.common.serialization.StringDeserializer");
        // Create the consumer
        KafkaConsumer<String, String> consumer = new KafkaConsumer<>(properties);
        // Subscribe to the topics
        consumer.subscribe(Arrays.asList("first", "second", "third"));
        while (true) {
            ConsumerRecords<String, String> consumerRecords = consumer.poll(100);
            for (ConsumerRecord<String, String> consumerRecord : consumerRecords) {
                System.out.println(consumerRecord.topic() +
                        "-----" + consumerRecord.partition() +
                        "-------" + consumerRecord.value());
            }
        }
    }
}
- Two ways to re-consume data with the high-level consumer API
- Switch the consumer group
//Set group.id to a group name that has not consumed the topic yet
properties.put("group.id", "test");
//In addition, reset the offset to the earliest available one
properties.put(ConsumerConfig.AUTO_OFFSET_RESET_CONFIG, "earliest");
- Assign the partition and seek to the desired offset (a full sketch follows below)
consumer.assign(Collections.singletonList(new TopicPartition("first", 0)));
consumer.seek(new TopicPartition("first", 0), 2);
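A minimal, self-contained sketch of the assign + seek approach; the class name SeekConsumer and the concrete topic/partition/offset values are illustrative assumptions, the configuration mirrors the consumer above:
package com.kafka.producer;
import org.apache.kafka.clients.consumer.*;
import org.apache.kafka.common.TopicPartition;
import java.util.Collections;
import java.util.Properties;
public class SeekConsumer {
    public static void main(String[] args) {
        Properties properties = new Properties();
        properties.put("bootstrap.servers", "hadoop:9092");
        properties.put("group.id", "test");
        properties.put(ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG,
                "org.apache.kafka.common.serialization.StringDeserializer");
        properties.put(ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG,
                "org.apache.kafka.common.serialization.StringDeserializer");
        KafkaConsumer<String, String> consumer = new KafkaConsumer<>(properties);
        // assign() replaces subscribe(): no group rebalancing, the partition is chosen explicitly
        TopicPartition tp = new TopicPartition("first", 0);
        consumer.assign(Collections.singletonList(tp));
        // Jump to offset 2 and read everything from there
        consumer.seek(tp, 2);
        while (true) {
            ConsumerRecords<String, String> records = consumer.poll(100);
            for (ConsumerRecord<String, String> record : records) {
                System.out.println(record.offset() + "-----" + record.value());
            }
        }
    }
}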
- Creating a consumer with the low-level API
package com.kafka.consumer;
import kafka.api.FetchRequest;
import kafka.api.FetchRequestBuilder;
import kafka.cluster.BrokerEndPoint;
import kafka.javaapi.*;
import kafka.javaapi.consumer.SimpleConsumer;
import kafka.javaapi.message.ByteBufferMessageSet;
import kafka.message.MessageAndOffset;
import java.nio.ByteBuffer;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
/**
 * Fetch data for a given topic, partition and offset
 */
public class LowerConsumer {
    public static void main(String[] args) {
        // Parameters
        List<String> brokers = new ArrayList<>(); // Kafka cluster
        brokers.add("hadoop");
        // Port
        int port = 9092;
        // Topic
        String topic = "first";
        // Partition
        int partition = 0;
        // Offset
        long offset = 2L;
        LowerConsumer lowerConsumer = new LowerConsumer();
        lowerConsumer.getData(brokers, port, topic, partition, offset);
    }
    // Find the leader of the partition
    private BrokerEndPoint findLeader(List<String> brokers,
                                      int port, String topic, int partition) {
        for (String broker : brokers) {
            // Consumer used only to look up the leader
            SimpleConsumer leader = new SimpleConsumer(broker, port, 1000,
                    1024, "getLeader");
            TopicMetadataRequest topicMetadataRequest =
                    new TopicMetadataRequest(Collections.singletonList(topic));
            // Fetch the topic metadata
            TopicMetadataResponse metadataResponse = leader.send(topicMetadataRequest);
            List<TopicMetadata> topicsMetadata = metadataResponse.topicsMetadata();
            for (TopicMetadata topicMetadatum : topicsMetadata) {
                List<PartitionMetadata> partitionsMetadata =
                        topicMetadatum.partitionsMetadata();
                for (PartitionMetadata partitionMetadatum : partitionsMetadata) {
                    if (partition == partitionMetadatum.partitionId()) {
                        return partitionMetadatum.leader();
                    }
                }
            }
        }
        return null;
    }
    // Fetch the data
    private void getData(List<String> brokers, int port,
                         String topic, int partition, long offset) {
        BrokerEndPoint leader = findLeader(brokers, port, topic, partition);
        if (leader == null) {
            return;
        }
        String leaderHost = leader.host();
        SimpleConsumer getData = new SimpleConsumer(leaderHost, port,
                1000, 1024, "getData");
        // Build the fetch request
        FetchRequest fetchRequest =
                new FetchRequestBuilder().addFetch(topic, partition, offset, 100).build();
        // Send the request and receive the response
        FetchResponse fetchResponse = getData.fetch(fetchRequest);
        // Parse the response
        ByteBufferMessageSet messageAndOffsets = fetchResponse.messageSet(topic, partition);
        for (MessageAndOffset messageAndOffset : messageAndOffsets) {
            long offset1 = messageAndOffset.offset();
            ByteBuffer payload = messageAndOffset.message().payload();
            byte[] bytes = new byte[payload.limit()];
            payload.get(bytes);
            System.out.println(offset1 + "--------" + new String(bytes));
        }
    }
}
- Interceptors
//Count interceptor: counts successful and failed sends
package com.kafka.intercetor;
import org.apache.kafka.clients.producer.ProducerInterceptor;
import org.apache.kafka.clients.producer.ProducerRecord;
import org.apache.kafka.clients.producer.RecordMetadata;
import java.util.Map;
public class CountInterceptor implements ProducerInterceptor<String, String> {
    private int successCount = 0;
    private int errorCount = 0;
    @Override
    public ProducerRecord<String, String> onSend(ProducerRecord<String, String> record) {
        return record;
    }
    @Override
    public void onAcknowledgement(RecordMetadata metadata, Exception exception) {
        if (exception == null)
            successCount++;
        else
            errorCount++;
    }
    @Override
    public void close() {
        System.out.println("Sent " + successCount + " records successfully");
        System.out.println("Failed to send " + errorCount + " records");
    }
    @Override
    public void configure(Map<String, ?> configs) {
    }
}
//Time interceptor: prepends a timestamp to every message value
package com.kafka.intercetor;
import org.apache.kafka.clients.producer.ProducerInterceptor;
import org.apache.kafka.clients.producer.ProducerRecord;
import org.apache.kafka.clients.producer.RecordMetadata;
import java.util.Map;
/**
 * Interceptor that adds the current timestamp in front of the value
 */
public class TimeInterceptor implements ProducerInterceptor<String, String> {
    @Override
    public ProducerRecord<String, String> onSend(ProducerRecord<String, String> record) {
        return new ProducerRecord<>(record.topic(), record.key(),
                System.currentTimeMillis() + "," + record.value());
    }
    @Override
    public void onAcknowledgement(RecordMetadata metadata, Exception exception) {
    }
    @Override
    public void close() {
    }
    @Override
    public void configure(Map<String, ?> configs) {
    }
}
//Register the interceptors on the producer
List<String> list = new ArrayList<>();
list.add("com.kafka.intercetor.TimeInterceptor");
list.add("com.kafka.intercetor.CountInterceptor");
properties.put(ProducerConfig.INTERCEPTOR_CLASSES_CONFIG, list);
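The interceptors run in the order they appear in the list, so TimeInterceptor is applied before CountInterceptor. As a small sketch using the same classes, a list-type config can also be supplied as a comma-separated string:
//Equivalent configuration as a comma-separated string of fully-qualified class names
properties.put(ProducerConfig.INTERCEPTOR_CLASSES_CONFIG,
        "com.kafka.intercetor.TimeInterceptor,com.kafka.intercetor.CountInterceptor");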
- Stream processing with Kafka Streams
package com.kafka.stream;
import org.apache.kafka.streams.KafkaStreams;
import org.apache.kafka.streams.processor.TopologyBuilder;
import java.util.Properties;
public class KafkaStream {
    public static void main(String[] args) {
        // Create the topology builder
        TopologyBuilder builder = new TopologyBuilder();
        // Streams configuration
        Properties properties = new Properties();
        properties.put("bootstrap.servers", "hadoop:9092");
        properties.put("application.id", "kafkaStream");
        // Build the topology: read from "first", clean the records, write to "second"
        // (writing back to the source topic would re-process the output forever)
        builder.addSource("SOURCE", "first")
                .addProcessor("PROCESS", () -> new LogProcessor(), "SOURCE")
                .addSink("SINK", "second", "PROCESS");
        KafkaStreams kafkaStreams = new KafkaStreams(builder, properties);
        kafkaStreams.start();
    }
}
//Processor implementation
package com.kafka.stream;
import org.apache.kafka.streams.processor.Processor;
import org.apache.kafka.streams.processor.ProcessorContext;
public class LogProcessor implements Processor<byte[], byte[]> {
    private ProcessorContext context;
    @Override
    public void init(ProcessorContext processorContext) {
        context = processorContext;
    }
    @Override
    public void process(byte[] key, byte[] value) {
        // Take one record and strip the ">>>" marker from its value
        String line = new String(value);
        line = line.replaceAll(">>>", "");
        value = line.getBytes();
        context.forward(key, value);
    }
    @Override
    public void punctuate(long timestamp) {
    }
    @Override
    public void close() {
    }
}
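TopologyBuilder and the punctuate(long) callback belong to the old low-level Processor API, which is deprecated in newer Kafka clients. A minimal sketch of the same cleanup with the Streams DSL (StreamsBuilder); the class name KafkaStreamDsl is an assumption, the topics and broker address follow the example above:
package com.kafka.stream;
import org.apache.kafka.common.serialization.Serdes;
import org.apache.kafka.streams.KafkaStreams;
import org.apache.kafka.streams.StreamsBuilder;
import org.apache.kafka.streams.StreamsConfig;
import org.apache.kafka.streams.kstream.Consumed;
import org.apache.kafka.streams.kstream.Produced;
import java.util.Properties;
public class KafkaStreamDsl {
    public static void main(String[] args) {
        Properties properties = new Properties();
        properties.put(StreamsConfig.BOOTSTRAP_SERVERS_CONFIG, "hadoop:9092");
        properties.put(StreamsConfig.APPLICATION_ID_CONFIG, "kafkaStreamDsl");
        StreamsBuilder builder = new StreamsBuilder();
        // Read from "first", strip the ">>>" marker, write the result to "second"
        builder.stream("first", Consumed.with(Serdes.String(), Serdes.String()))
                .mapValues(value -> value.replaceAll(">>>", ""))
                .to("second", Produced.with(Serdes.String(), Serdes.String()));
        KafkaStreams streams = new KafkaStreams(builder.build(), properties);
        streams.start();
    }
}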
- Integrating Kafka with Flume
- Configuration file
# Name the components of this agent
agent1.sources = source1
agent1.sinks = sink1
agent1.channels = channel1
# Describe and configure the source component source1
agent1.sources.source1.type = exec
agent1.sources.source1.command = tail -f /home/hadoop/catalina.out
# Configure an interceptor
agent1.sources.source1.interceptors = i1
agent1.sources.source1.interceptors.i1.type = host
agent1.sources.source1.interceptors.i1.hostHeader = hostname
# Describe and configure the sink component
agent1.sinks.sink1.type = org.apache.flume.sink.kafka.KafkaSink
agent1.sinks.sink1.kafka.bootstrap.servers = hadoop:9092
agent1.sinks.sink1.kafka.topic = first
agent1.sinks.sink1.flumeBatchSize = 10
agent1.sinks.sink1.kafka.producer.acks = 1
agent1.sinks.sink1.kafka.producer.linger.ms = 1
# Describe and configure the channel component; a memory channel is used here
agent1.channels.channel1.type = memory
agent1.channels.channel1.keep-alive = 120
agent1.channels.channel1.capacity = 500000
agent1.channels.channel1.transactionCapacity = 600
# Wire source, channel and sink together
agent1.sources.source1.channels = channel1
agent1.sinks.sink1.channel = channel1
- Start command
bin/flume-ng agent -c conf -f agentconf/kafka-loger.properties -n agent1 \
-Dflume.root.logger=INFO,console
- Start Kafka and create a topic
- Start a consumer to verify the data (see the commands below)
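A quick way to verify the pipeline, reusing the commands from the testing section above (same topic name and host):
# Create the target topic if it does not exist yet
bin/kafka-topics.sh --create --zookeeper hadoop:2181 \
--partitions 2 --replication-factor 1 --topic first
# Watch the events that Flume writes into Kafka
bin/kafka-console-consumer.sh --bootstrap-server hadoop:9092 --topic first --from-beginning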