mapreduce讲解

转载

mob64ca1405a060 2024-12-22 06:57:35

文章标签 mapreduce讲解大数据开发工具 java hadoop 文章分类 架构后端开发

MapReduce编程规范：

（1）用户编写的程序分成三个部分：Mapper，Reducer，Driver(提交运行mr程序的客户端)

（2）Mapper的输入数据是KV对的形式（KV的类型可自定义）

（3）Mapper的输出数据是KV对的形式（KV的类型可自定义）

（4）Mapper中的业务逻辑写在map()方法中

（5）map()方法（maptask进程）对每一个<K,V>调用一次

（6）Reducer的输入数据类型对应Mapper的输出数据类型，也是KV

（7）Reducer的业务逻辑写在reduce()方法中

（8）Reducetask进程对每一组相同k的<k,v>组调用一次reduce()方法

（9）用户自定义的Mapper和Reducer都要继承各自的父类

（10）整个程序需要一个Driver来进行提交，提交的是一个描述了各种必要信息的job对象

WordCount:在给定的文本文件中统计输出每一个单词出现的总次数

1 package com.ahu.bigdata.mr;
  2 
  3 import java.io.IOException;
  4 
  5 import org.apache.hadoop.conf.Configuration;
  6 import org.apache.hadoop.fs.Path;
  7 import org.apache.hadoop.io.IntWritable;
  8 import org.apache.hadoop.io.LongWritable;
  9 import org.apache.hadoop.io.Text;
 10 import org.apache.hadoop.mapreduce.Job;
 11 import org.apache.hadoop.mapreduce.Mapper;
 12 import org.apache.hadoop.mapreduce.Reducer;
 13 import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
 14 import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
 15 
 16 /**
 17  * 在给定的文本文件中统计出每一个单词出现的总次数
 18  * 
 19  * @author ahu_lichang
 20  * 
 21  */
 22 public class WordCountDriver {
 23     /**
 24      * 自定义mapper类
 25      * 
 26      * @author ahu_lichang
 27      * 
 28      */
 29     static class WordCountMapper extends
 30             Mapper<LongWritable, Text, Text, IntWritable> {
 31         // map方法的生命周期：框架每传一行数据就被调用一次
 32         // key:这一行的起始点在文件中的偏移量
 33         // value:这一行的内容
 34         @Override
 35         protected void map(LongWritable key, Text value, Context context)
 36                 throws IOException, InterruptedException {
 37             // 拿到一行数据转换为String
 38             String line = value.toString();
 39             // 将这一行切分出各个单词
 40             String[] words = line.split("\t");
 41             // 遍历数组，输出<单词，1>
 42             for (String word : words) {
 43                 context.write(new Text(word), new IntWritable(1));
 44             }
 45         }
 46     }
 47 
 48     /**
 49      * 自定义一个reducer类
 50      * 
 51      * @author ahu_lichang
 52      * 
 53      */
 54     static class WordCountReducer extends
 55             Reducer<Text, IntWritable, Text, IntWritable> {
 56         // reduce生命周期：框架每传递进来一个KV组，reduce方法就被调用一次
 57         @Override
 58         protected void reduce(Text key, Iterable<IntWritable> values,
 59                 Context context) throws IOException, InterruptedException {
 60             // 定义一个计数器
 61             int count = 0;
 62             // 遍历这一组KV的所有V,累加到count中
 63             for (IntWritable value : values) {
 64                 count += value.get();
 65             }
 66             context.write(key, new IntWritable(count));
 67         }
 68     }
 69 
 70     private static final String INPUT_PATH = "hdfs://hadoop1:9000/data.txt";
 71     private static final String OUT_PATH = "hdfs://hadoop1:9000/wcoutput";
 72 
 73     // WordCountDriver是一个主类，用来描述job并提交job
 74     // 相当于一个yarn集群的客户端
 75     // 需要在此封装我们的mr程序的相关运行参数，指定jar包
 76     // 最后提交给yarn
 77     public static void main(String[] args) throws Exception {
 78         if (args == null || args.length == 0) {
 79             args = new String[2];
 80             args[0] = INPUT_PATH;
 81             args[1] = OUT_PATH;
 82         }
 83         // 把业务逻辑相关的信息（哪个是mapper,哪个是reducer，要处理的数据在哪里，输出的结果放在哪里...）描述成一个job对象
 84         // 把这个描述好的job提交给集群去运行
 85         Configuration conf = new Configuration();
 86         Job job = Job.getInstance(conf);
 87 
 88         // 指定这个job所在的jar包
 89         // job.setJar("/usr/local/wordcount.jar");
 90         job.setJarByClass(WordCountDriver.class);
 91 
 92         job.setMapperClass(WordCountMapper.class);
 93         job.setReducerClass(WordCountReducer.class);
 94 
 95         // 设置业务逻辑Mapper类的输出key和value的数据类型
 96         job.setMapOutputKeyClass(Text.class);
 97         job.setMapOutputValueClass(IntWritable.class);
 98 
 99         // 设置业务逻辑Reducer类的输出key和value的数据类型
100         job.setOutputKeyClass(Text.class);
101         job.setOutputValueClass(IntWritable.class);
102 
103         // 指定job的输入原始文件所在目录
104         //FileInputFormat.setInputPaths(job, new Path(INPUT_PATH));
105         FileInputFormat.setInputPaths(job, new Path(args[0]));
106         // 指定job的输出结果所在目录
107         //FileOutputFormat.setOutputPath(job, new Path(OUT_PATH));
108         FileOutputFormat.setOutputPath(job, new Path(args[1]));
109 
110         // 将job中配置的相关参数，以及job所用的java类所在的jar包，提交给yarn集群去运行
111         /* job.submit(); */
112         boolean res = job.waitForCompletion(true);
113         System.exit(res ? 0 : 1);
114 
115     }
116 
117 }

1、先在eclipse工程中创建一个用户类库hadoop264jar，将hadoop安装目录中的share文件夹中的common、hdfs、MapReduce、yarn中的jar包全部添加进去。

2、书写代码：创建一个mapper自定义类，再创建一个reducer自定义类，最后创建一个描述job并提交job的主类。

3、运行方式有两种：

　　　　　　（1）直接在eclipse中运行。但是会出现个权限拒绝错误，那是因为没有身份标识造成的。这里再介绍第二种身份标识方式：（第一种身份标识方式，见《HDFS详解》）

mapreduce讲解_开发工具