package com.ccse.hadoop.outputformat;
import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.Iterator;
import java.util.StringTokenizer;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.RecordWriter;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hadoop.mapred.TextOutputFormat;
import org.apache.hadoop.mapred.lib.MultipleOutputFormat;
import org.apache.hadoop.util.Progressable;
/**
 * Writes word-count results to multiple output files, one per key,
 * via a custom MultipleOutputFormat (old mapred API).
 * @author woshiccna
 */
public class MyMultipleOutputFormatApp {
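// HDFS paths used by the job; chaoren1:9000 is the author's namenode, adjust to match your cluster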
public final static String INPUT_PATH = "hdfs://chaoren1:9000/mapinput";
public final static String OUTPUT_PATH = "hdfs://chaoren1:9000/mapoutput";
public static void main(String[] args) throws IOException, URISyntaxException {
JobConf conf = new JobConf(MyMultipleOutputFormatApp.class);
conf.setJobName("wordcount");
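// Delete any previous output directory so the job can be re-run cleanly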
Configuration config = new Configuration();
FileSystem fileSystem = FileSystem.get(new URI(OUTPUT_PATH), config);
fileSystem.delete(new Path(OUTPUT_PATH), true);
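// Map side: split each line into tokens and emit (word, 1)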
conf.setMapperClass(MyMapper.class);
conf.setInputFormat(TextInputFormat.class);
FileInputFormat.setInputPaths(conf, new Path(INPUT_PATH));
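// Reduce side: sum the counts; the custom output format then splits results across files by key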
conf.setReducerClass(MyReducer.class);
conf.setOutputKeyClass(Text.class);
conf.setOutputValueClass(LongWritable.class);
conf.setOutputFormat(MySelfMultipleOutputFormat.class);
FileOutputFormat.setOutputPath(conf, new Path(OUTPUT_PATH));
JobClient.runJob(conf);
}
public static class MyMapper extends MapReduceBase
implements Mapper<LongWritable, Text, Text, LongWritable> {
private Text word = new Text();                       // reusable output key
private LongWritable writable = new LongWritable(1);  // each occurrence counts as 1
@Override
public void map(LongWritable key, Text value,
OutputCollector<Text, LongWritable> output, Reporter reporter)
throws IOException {
if (value != null) {
String line = value.toString();
StringTokenizer tokenizer = new StringTokenizer(line);
while (tokenizer.hasMoreTokens()) {
word.set(tokenizer.nextToken());
output.collect(word, writable);
}
}
}
}
public static class MyReducer extends MapReduceBase implements
Reducer<Text, LongWritable, Text, LongWritable> {
@Override
public void reduce(Text key, Iterator<LongWritable> values,
OutputCollector<Text, LongWritable> output, Reporter reporter)
throws IOException {
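// Sum all the 1s emitted for this word by the mappers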
long sum = 0;
while (values.hasNext()) {
LongWritable value = values.next();
sum += value.get();
}
output.collect(key, new LongWritable(sum));
}
}
public static class MySelfMultipleOutputFormat
extends MultipleOutputFormat<Text, LongWritable> {
@Override
protected RecordWriter<Text, LongWritable> getBaseRecordWriter(
FileSystem fs, JobConf job, String name, Progressable progress)
throws IOException {
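// Records are still written by a plain TextOutputFormat; MultipleOutputFormat
// only decides which file each record lands in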
final TextOutputFormat<Text, LongWritable> format
= new TextOutputFormat<Text, LongWritable>();
return format.getRecordWriter(fs, job, name, progress);
}
@Override
protected String generateFileNameForKeyValue(Text key,
LongWritable value, String name) {
// Generate the output file name from the key (and, optionally, the value)
/*if (key != null) {
return key.toString();
} else {
return "hello";
}*/
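// Keys starting with "hello" share one file named "hello";
// every other key gets its own file named after the key itself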
if (key.toString().startsWith("hello")) {
return "hello";
} else {
return key.toString();
}
}
}
}
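Running the job leaves several files in the output directory instead of a single part-00000. As a rough illustration (assuming an input file with the two lines "hello world" and "hello hadoop"), the output directory would contain:

mapoutput/hadoop   contains: hadoop  1
mapoutput/hello    contains: hello   2
mapoutput/world    contains: world   1

(TextOutputFormat separates key and value with a tab.) Every key starting with "hello" is funneled into the single file named hello; every other key gets its own file named after the key.

Since generateFileNameForKeyValue also receives the value, file names can depend on it as well. A minimal sketch of a value-based variant (the threshold and file names here are hypothetical, not from the original post):

@Override
protected String generateFileNameForKeyValue(Text key, LongWritable value, String name) {
    // Hypothetical split: words seen 10 or more times go to "frequent", the rest to "rare"
    return value.get() >= 10 ? "frequent" : "rare";
}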