Notes on Kafka

  1. Kafka follows the publish/subscribe model: consumers actively pull data from the brokers, and messages are retained according to the retention configuration rather than being deleted as soon as they are consumed (a point-to-point queue, by contrast, is one-to-one and removes each message once it has been consumed).
  2. Benefits of a message queue
    1. Decoupling
    2. Redundancy
    3. Scalability
    4. Flexibility and peak-load handling
    5. Recoverability
    6. Ordering guarantees
    7. Buffering
    8. Asynchronous communication
  3. Kafka is a distributed message queue. Messages are organized by topic; the party that sends messages is called a producer and the party that reads them is called a consumer. A Kafka cluster consists of multiple Kafka instances, each of which is called a broker.
  4. Both the Kafka cluster and the consumers rely on a ZooKeeper ensemble to store metadata and keep the system available.
  5. Deploying a Kafka cluster
    1. Download the Kafka release from the official website
    2. Edit the configuration file
  • Edit server.properties (a sketch of a second broker's configuration follows the snippet below)
broker.id=0 //this value must be unique within the cluster
  delete.topic.enable=true //defaults to false
  //directory for Kafka's logs -- the message data is stored here as well
  log.dirs=/usr/hadoop/kafka/kafka_2.11-2.1.1/logs
  zookeeper.connect=hadoop:2181 //ZooKeeper connection string; list the nodes of the ensemble separated by commas
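  For a multi-broker cluster, each node gets its own server.properties with a unique broker.id and the full ZooKeeper connect string. A minimal sketch for a second node (the hostnames hadoop2 and hadoop3 are placeholders for this example, not hosts from the original setup):
broker.id=1 //must differ from every other broker in the cluster
  delete.topic.enable=true
  log.dirs=/usr/hadoop/kafka/kafka_2.11-2.1.1/logs
  zookeeper.connect=hadoop:2181,hadoop2:2181,hadoop3:2181 //comma-separated ZooKeeper ensemble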
    3. Start Kafka: bin/kafka-server-start.sh config/server.properties
    4. Test
bin/kafka-topics.sh --create --zookeeper hadoop:2181 \
  --partitions 2 --replication-factor 1 --topic first
 Created topic "first".   //this output means the topic was created successfully
 bin/kafka-topics.sh --list --zookeeper hadoop:2181 //list the current topics
 //console producer: type the messages you want to send
 bin/kafka-console-producer.sh --broker-list hadoop:9092 --topic first
 bin/kafka-console-consumer.sh --bootstrap-server hadoop:9092 --topic first //consume the data
 bin/kafka-topics.sh --zookeeper hadoop:2181 --describe --topic first //show the topic details
 bin/kafka-topics.sh --delete --zookeeper hadoop:2181 --topic first //delete the topic
  6. Using the Kafka high-level API
  • Create a producer
package com.kafka.producer;
  
  import org.apache.kafka.clients.producer.*;
  
  import java.util.Properties;
  
  /**
   * Message producer built on the Kafka producer API
   */
  public class CustomerProducer {
      public static void main(String[] args) {
          // Producer configuration (see ProducerConfig for the keys)
          Properties properties = new Properties();
          // Kafka broker address
          properties.put("bootstrap.servers", "hadoop:9092");
          // Acknowledgement level
          properties.put(ProducerConfig.ACKS_CONFIG, "all");
          // Number of retries
          properties.put("retries", 0);
          // Batch size in bytes
          properties.put("batch.size", 16384);
          // Linger time before a batch is sent
          properties.put("linger.ms", 1);
          // Buffer memory in bytes
          properties.put("buffer.memory", 33554432);
          // Serializer classes
          properties.put("key.serializer",
          "org.apache.kafka.common.serialization.StringSerializer");
          properties.put("value.serializer",
          "org.apache.kafka.common.serialization.StringSerializer");
          // Custom partitioner (defined below)
          properties.put(ProducerConfig.PARTITIONER_CLASS_CONFIG, CustomerPartition.class);
          // Create the producer
          KafkaProducer<String, String> producer = new KafkaProducer<>(properties);
          // Send records in a loop, with a callback per record
          for (int i = 0; i < 10; i++) {
              producer.send(new ProducerRecord<String, String>("first", String.valueOf(i)),
                      (metadata, exception) -> {
                          if (exception != null) {
                              System.out.println("send failed");
                          } else {
                              System.out.println(metadata.partition() + "---" +
                              metadata.offset());
                          }
                      });
          }
          producer.close();
      }
  }
  • Custom partitioner
package com.kafka.producer;
  
  import org.apache.kafka.clients.producer.Partitioner;
  import org.apache.kafka.common.Cluster;
  
  import java.util.Map;
  
  /**
   * Custom partitioner: routes every record to partition 0
   */
  public class CustomerPartition implements Partitioner {
  
      public int partition(String topic, Object key, byte[] keyBytes,
  		 Object value, byte[] valueBytes, Cluster cluster) {
          return 0;
      }
  
      public void close() {
  
      }
  
      public void configure(Map<String, ?> configs) {
  
      }
  }
  • Consumer using the high-level API
package com.kafka.producer;
   
   import org.apache.kafka.clients.consumer.ConsumerConfig;
   import org.apache.kafka.clients.consumer.ConsumerRecord;
   import org.apache.kafka.clients.consumer.ConsumerRecords;
   import org.apache.kafka.clients.consumer.KafkaConsumer;
   
   import java.util.Arrays;
   import java.util.Properties;
   
   public class CustomerConsumer {
       public static void main(String[] args) {
           // Consumer configuration (see ConsumerConfig for the keys)
           Properties properties = new Properties();
           // Kafka broker address
           properties.put("bootstrap.servers", "hadoop:9092");
           // Consumer group id
           properties.put("group.id", "test");
           // Commit offsets automatically
           properties.put("enable.auto.commit", "true");
           // Auto-commit interval in milliseconds
           properties.put(ConsumerConfig.AUTO_COMMIT_INTERVAL_MS_CONFIG, "10000");
           // Deserializer classes
           properties.put(ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG,
                   "org.apache.kafka.common.serialization.StringDeserializer");
           properties.put(ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG,
                   "org.apache.kafka.common.serialization.StringDeserializer");
           // Create the consumer
           KafkaConsumer<String, String> consumer = new KafkaConsumer<>(properties);
           // Subscribe to the topics to consume
           consumer.subscribe(Arrays.asList("first", "second", "third"));
           while (true) {
               ConsumerRecords<String, String> consumerRecords = consumer.poll(100);
               for (ConsumerRecord<String, String> consumerRecord : consumerRecords) {
                   System.out.println(consumerRecord.topic() +
                           "-----" + consumerRecord.partition() +
                           "-------" + consumerRecord.value());
               }
           }
       }
   }
  • Two ways to re-consume messages with the high-level consumer API
  • Switch to a new consumer group
// Set this to a group name that has not consumed the topic yet
  // Consumer group id
  properties.put("group.id", "test");
  // Also required: start from the earliest offset when the group has no committed offset
  properties.put(ConsumerConfig.AUTO_OFFSET_RESET_CONFIG, "earliest");
  • Reset the offset explicitly with assign/seek (see the sketch below)
consumer.assign(Collections.singletonList(new TopicPartition("first", 0)));
  consumer.seek(new TopicPartition("first", 0), 2);
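  A minimal, self-contained sketch of the assign/seek approach. The class name SeekConsumer is made up for this example; the broker, topic, partition, and offset are the same ones used above:
package com.kafka.consumer;

  import org.apache.kafka.clients.consumer.ConsumerRecord;
  import org.apache.kafka.clients.consumer.ConsumerRecords;
  import org.apache.kafka.clients.consumer.KafkaConsumer;
  import org.apache.kafka.common.TopicPartition;

  import java.util.Collections;
  import java.util.Properties;

  public class SeekConsumer {
      public static void main(String[] args) {
          Properties properties = new Properties();
          properties.put("bootstrap.servers", "hadoop:9092");
          properties.put("group.id", "test");
          properties.put("enable.auto.commit", "true");
          properties.put("key.deserializer",
                  "org.apache.kafka.common.serialization.StringDeserializer");
          properties.put("value.deserializer",
                  "org.apache.kafka.common.serialization.StringDeserializer");

          KafkaConsumer<String, String> consumer = new KafkaConsumer<>(properties);
          TopicPartition partition = new TopicPartition("first", 0);
          // assign() instead of subscribe(): seek() only works on partitions the consumer owns
          consumer.assign(Collections.singletonList(partition));
          // jump back to offset 2 so records from that offset onward are read again
          consumer.seek(partition, 2);
          while (true) {
              ConsumerRecords<String, String> records = consumer.poll(100);
              for (ConsumerRecord<String, String> record : records) {
                  System.out.println(record.partition() + "---" + record.offset()
                          + "---" + record.value());
              }
          }
      }
  }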
  • Consumer using the low-level API (SimpleConsumer)
package com.kafka.consumer;
  
  import kafka.api.FetchRequest;
  import kafka.api.FetchRequestBuilder;
  import kafka.cluster.BrokerEndPoint;
  import kafka.javaapi.*;
  import kafka.javaapi.consumer.SimpleConsumer;
  import kafka.javaapi.message.ByteBufferMessageSet;
  import kafka.message.MessageAndOffset;
  
  import java.nio.ByteBuffer;
  import java.util.ArrayList;
  import java.util.Collections;
  import java.util.List;
  
  /**
   * Fetch data for a given topic, partition, and offset
   */
  public class LowerConsumer {
      public static void main(String[] args) {
          // Parameters
          List<String> brokers = new ArrayList<>(); // Kafka brokers
          brokers.add("hadoop");
          // Port
          int port = 9092;
          // Topic
          String topic = "first";
          // Partition
          int partition = 0;
          // Offset to start fetching from
          long offset = 2L;
          LowerConsumer lowerConsumer = new LowerConsumer();
          lowerConsumer.getData(brokers, port, topic, partition, offset);
  
  
      }
  
      // Find the leader broker for the partition
      private BrokerEndPoint findLeader(List<String> brokers,
      int port, String topic, int partition) {
          for (String broker : brokers) {
              // Consumer used only to look up the leader
              SimpleConsumer leader = new SimpleConsumer(broker, port, 1000,
                      1024, "getLeader");
              TopicMetadataRequest topicMetadataRequest =
              new TopicMetadataRequest(Collections.singletonList(topic));
              // Topic metadata response
              TopicMetadataResponse metadataResponse = leader.send(topicMetadataRequest);
              List<TopicMetadata> topicsMetadata = metadataResponse.topicsMetadata();
              for (TopicMetadata topicMetadatum : topicsMetadata) {
                  List<PartitionMetadata> partitionsMetadata = 
                  topicMetadatum.partitionsMetadata();
                  for (PartitionMetadata partitionMetadatum : partitionsMetadata) {
                      if (partition==partitionMetadatum.partitionId()) {
                          return partitionMetadatum.leader();
                      }
                  }
              }
          }
          return null;
      }
  
      // Fetch data from the partition leader
      private void getData(List<String> brokers, int port,
       String topic, int partition, long offset) {
          BrokerEndPoint leader = findLeader(brokers, port, topic, partition);
          if (leader == null) {
              return;
          }
          String leaderHost = leader.host();
          SimpleConsumer getData = new SimpleConsumer(leaderHost, port,
                  1000, 1024, "getData");
          // Build the fetch request
          FetchRequest fetchRequest =
                  new FetchRequestBuilder().addFetch(topic, partition, offset, 100).build();
          // Fetch response
          FetchResponse fetchResponse = getData.fetch(fetchRequest);
          // Parse the response
          ByteBufferMessageSet messageAndOffsets = fetchResponse.messageSet(topic, partition);
          for (MessageAndOffset messageAndOffset : messageAndOffsets) {
              long offset1 = messageAndOffset.offset();
              ByteBuffer payload = messageAndOffset.message().payload();
              byte[] bytes=new byte[payload.limit()];
              payload.get(bytes);
              System.out.println(offset1+"--------"+new String(bytes));
          }
      }
  }
  • Interceptors
// Count interceptor: counts successful and failed sends
  package com.kafka.intercetor;
  
  import org.apache.kafka.clients.producer.ProducerInterceptor;
  import org.apache.kafka.clients.producer.ProducerRecord;
  import org.apache.kafka.clients.producer.RecordMetadata;
  
  import java.util.Map;
  
  public class CountInterceptor implements ProducerInterceptor<String,String> {
      private int successCount=0;
      private int errorCount=0;
      @Override
      public ProducerRecord<String, String> onSend(ProducerRecord<String, String> record) {
          return record;
      }
  
      @Override
      public void onAcknowledgement(RecordMetadata metadata, Exception exception) {
          if (exception == null)
              successCount++;
          else
              errorCount++;
      }
  
      @Override
      public void close() {
          System.out.println("发送成功"+successCount+"条数据");
          System.out.println("发送成功"+errorCount+"条数据");
      }
      @Override
      public void configure(Map<String, ?> configs) {
      }
  }

  // Time interceptor
  package com.kafka.intercetor;
  
  import org.apache.kafka.clients.producer.ProducerInterceptor;
  import org.apache.kafka.clients.producer.ProducerRecord;
  import org.apache.kafka.clients.producer.RecordMetadata;
  
  import java.util.Map;
  
  /**
   * Time interceptor: prepends a timestamp to each record's value
   */
  public class TimeInterceptor implements ProducerInterceptor<String,String> {
  
      @Override
      public ProducerRecord<String, String> onSend(ProducerRecord<String, String> record) {
          return new ProducerRecord<>(record.topic(), record.key(),
          System.currentTimeMillis() + "," + record.value());
      }
  
      @Override
      public void onAcknowledgement(RecordMetadata metadata, Exception exception) {
  
      }
  
      @Override
      public void close() {
  
      }
  
      @Override
      public void configure(Map<String, ?> configs) {
  
      }
  }
  // Register the interceptor chain on the producer (interceptors run in the order listed: TimeInterceptor first, then CountInterceptor)
  List<String> list = new ArrayList<>();
  list.add("com.kafka.intercetor.TimeInterceptor");
  list.add("com.kafka.intercetor.CountInterceptor");
  properties.put(ProducerConfig.INTERCEPTOR_CLASSES_CONFIG,list);
  • Stream processing with Kafka Streams
package com.kafka.stream;
  
  import org.apache.kafka.streams.KafkaStreams;
  import org.apache.kafka.streams.processor.TopologyBuilder;
  
  import java.util.Properties;
  
  public class KafkaStream {
      public static void main(String[] args) {
        // Create the topology builder
        TopologyBuilder builder = new TopologyBuilder();
        // Streams configuration
        Properties properties = new Properties();
        properties.put("bootstrap.servers", "hadoop:9092");
        properties.put("application.id", "kafkaStream");
        // Build the topology: read from "first", process, and write the result to "second"
        // (writing back to the source topic would feed the processor its own output)
        builder.addSource("SOURCE", "first")
                .addProcessor("PROCESS", () -> new LogProcessor(), "SOURCE")
                .addSink("SINK", "second", "PROCESS");
          KafkaStreams kafkaStreams = new KafkaStreams(builder, properties);
          kafkaStreams.start();
      }
  }

  // Processor implementation: strips the ">>>" markers from each record value
  package com.kafka.stream;
  
  import org.apache.kafka.streams.processor.Processor;
  import org.apache.kafka.streams.processor.ProcessorContext;
  
  public class LogProcessor implements Processor<byte[],byte[]> {
      private ProcessorContext context;
      @Override
      public void init(ProcessorContext processorContext) {
          context=processorContext;
  
      }
      @Override
      public void process(byte[] bytes, byte[] bytes2) {
          // Convert the record value to a string, strip the ">>>" markers, and forward it downstream
          String line = new String(bytes2);
          line=line.replaceAll(">>>","");
          bytes2 = line.getBytes();
          context.forward(bytes,bytes2);
      }
      @Override
      public void punctuate(long l) {
  
      }
      @Override
      public void close() {
  
      }
  }
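  To exercise the topology, the console tools from the test section above can be reused: produce lines containing ">>>" to topic first and watch the sink topic (assumed to be second, matching the topology sketched above) for the cleaned output:
bin/kafka-console-producer.sh --broker-list hadoop:9092 --topic first
  >hello>>>kafka
  bin/kafka-console-consumer.sh --bootstrap-server hadoop:9092 --topic second
  hellokafka   //expected output once the ">>>" markers are stripped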
  • Integrating Kafka with Flume
  • Flume agent configuration file
#Name the components of this agent
  agent1.sources = source1
  agent1.sinks = sink1
  agent1.channels = channel1
  # Describe and configure the source component
  agent1.sources.source1.type = exec
  agent1.sources.source1.command=tail -f /home/hadoop/catalina.out
  agent1.sources.source1.channels=channel1
  #Configure the interceptor
  agent1.sources.source1.interceptors=i1
  agent1.sources.source1.interceptors.i1.type=host
  agent1.sources.source1.interceptors.i1.hostHeader=hostname
  # Describe and configure the Kafka sink component
  agent1.sinks.sink1.type =org.apache.flume.sink.kafka.KafkaSink
  agent1.sinks.sink1.kafka.bootstrap.servers=hadoop:9092
  agent1.sinks.sink1.kafka.topic=first
  agent1.sinks.sink1.flumeBatchSize=10
  agent1.sinks.sink1.kafka.producer.acks=1
  agent1.sinks.sink1.kafka.producer.linger.ms=1						
  # Describe and configure the channel component; a memory channel is used here
  agent1.channels.channel1.type = memory
  agent1.channels.channel1.keep-alive=120
  agent1.channels.channel1.capacity=500000
  agent1.channels.channel1.transactionCapacity=600			
  #Bind the source and the sink to the channel
  agent1.sources.source1.channels =channel1
  agent1.sinks.sink1.channel =channel1
  • Start command
bin/flume-ng agent -c conf -f agentconf/kafka-loger.properties -n agent1 \
  -Dflume.root.logger=INFO,console
  • Start Kafka and create the topic that the Kafka sink writes to
  • Start a console consumer to verify the data (both commands are shown below)
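  For these last two steps, the commands from the test section above can be reused, assuming the topic name first and the broker at hadoop:9092:
bin/kafka-topics.sh --create --zookeeper hadoop:2181 \
  --partitions 2 --replication-factor 1 --topic first
  bin/kafka-console-consumer.sh --bootstrap-server hadoop:9092 --topic first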