【MapReduce】:为了高效计算大数据集中的有价值的数据

1.简介:
   MapReduce是一个计算软件框架、可以在集群上并行处理数据集。
   进行作业时:分两个阶段
   Map阶段:
       map函数
   Reduce阶段:
       reduce函数

   计算和存储是同一个节点【不是绝对的】
2、MapReduce对数据的处理:都是以键值对<key,value>的形式
   
   (input) <k1, v1> -> map -> <k2, v2>  -> reduce -> <k3, v3> (output)
    
    注意:MapReduce的key和value必须序列化:Writable 
            Key必须实现WritableComparable,进行排序
     
3、编写MapReduce程序
   (1)创建java项目,导入jar包
   (2)编写map函数
   (3)编写reduce函数
   (4)编写driver类[main方法]
    
4、打包mapreduce程序
   
   右键选择export-->java->jar file-->选择你要导出的项目下的哪个包,[lib文件夹不要选择]
                  -->next-->设置 mainClass-->选择生成路径-->wc.jar
5、上传到某一个Linux节点上
6、运行mapreduce程序
   ]$ hadoop jar wc.jar  [input path]  [output path]

     注意:output path的目录应该指定一个新目录,运行程序时,会自动创建这个路径
     
     准备工作:将n个文本文件 上传到 input path内
     input path:

    
    

    
    
    单词统计:

hello world  welcome to china   <0,"hello world  welcome to china">   -> <"hello",1> <"world",1> 
     hello kitty                     <29,"hello kitty">  ---> <"hello",1> <"kitty",1>
     ni hao                <40,"ni hao">
     i like china                    <46,"i like china">

   
   进入reduce函数时:输入数据 <key,value>

<"hello",[1,1]>
    <"world",[1]>   reduce函数处理后:<"hello",2>  <"world",1> /**
  * Mapper类型的泛型:Mapper<KEYIN, VALUEIN, KEYOUT, VALUEOUT>
  *   单词统计案例: KEYIN:  行偏移量LongWritable
  *                  VALUEING: 一行数据Text
  *                  KEYOUT:   单词 Text
  *                  VALUEOUT: 单词的数据IntWritable
  *    
  *              
  *  hello world  welcome to china    <0,"hello world  welcome to china"> -------->  <"hello",1> <"world",1> 
  *   hello kitty                     <29,"hello kitty">                  --------> <"hello",1> <"kitty",1>            
  */
 public class MyMapper extends Mapper<LongWritable, Text, Text, IntWritable> {

     /** Reused output key holding the current token (avoids per-record allocation). */
     private final Text currentWord = new Text();
     /** Constant count of 1 emitted once per token. */
     private final IntWritable one = new IntWritable(1);

     /**
      * Splits one input line into space-delimited tokens and emits a
      * (token, 1) pair for each token found.
      *
      * @param key     byte offset of this line within the input split
      * @param value   the line of text to tokenize
      * @param context sink receiving the (word, 1) output pairs
      */
     @Override
     protected void map(LongWritable key, Text value, Context context)
             throws IOException, InterruptedException {
         // Tokenize on the single-space delimiter, same as the original.
         StringTokenizer tokens = new StringTokenizer(value.toString(), " ");
         while (tokens.hasMoreTokens()) {
             currentWord.set(tokens.nextToken());
             context.write(currentWord, one);
         }
     }
 }
 /**
  * Reducer<KEYIN,VALUEIN,KEYOUT,VALUEOUT>
  * 
  * 
  *  KEYIN:Text           单词作为key         map阶段的输出数据key类型
  *  VALUEIN:IntWritable  单词的数量value:  map阶段的输出数据value类型
  *  KEYOUT:Text          单词作为key
  *  VALUEOUT:IntWritable 单词最终统计数量
  *  
  *   在接收map的输出数据后,进入reduce方法前,做了一次处理:将同一个key的value值放到一起组成新的键值对
  */
 public class MyReducer extends Reducer<Text, IntWritable, Text, IntWritable> {

     /** Reused output value holding the total count for the current word. */
     private final IntWritable total = new IntWritable();

     /**
      * Sums every partial count collected for one word and writes the
      * aggregated (word, totalCount) pair to the output.
      *
      * @param key     the word being aggregated
      * @param value   all 1-counts the mappers emitted for this word
      * @param context sink receiving the (word, total) result
      */
     @Override
     protected void reduce(Text key, Iterable<IntWritable> value,
             Context context) throws IOException, InterruptedException {
         int sum = 0;
         // Accumulate every partial count belonging to this key.
         for (IntWritable count : value) {
             sum += count.get();
         }
         total.set(sum);
         // Final (word, total) pair goes to the output file system.
         context.write(key, total);
     }
 }
 public class MyWorldCount {
     /**
      * Configures and submits the word-count job.
      *
      * <p>Usage: {@code hadoop jar wc.jar <input path> <output path>}.
      * The output path must name a directory that does not yet exist;
      * the framework creates it when the job runs.
      *
      * @param args args[0] = input directory, args[1] = output directory
      * @throws Exception if job configuration or execution fails
      */
     public static void main(String[] args) throws Exception {
         // Cluster/site configuration picked up from the classpath.
         Configuration conf = new Configuration();
         Job job = Job.getInstance(conf, "word count");
         // Jar containing this driver (and the map/reduce classes).
         job.setJarByClass(MyWorldCount.class);
         // Map and reduce implementations for this job.
         job.setMapperClass(MyMapper.class);
         job.setReducerClass(MyReducer.class);
         // Output key/value types produced by the reduce phase.
         job.setOutputKeyClass(Text.class);
         job.setOutputValueClass(IntWritable.class);
         // Input files are read from args[0].
         FileInputFormat.addInputPath(job, new Path(args[0]));
         // FIX: output must go to args[1]; the original passed args[0],
         // reusing the input path, which makes the job fail because the
         // output directory already exists.
         FileOutputFormat.setOutputPath(job, new Path(args[1]));
         // Exit 0 on success, 1 on failure.
         System.exit(job.waitForCompletion(true) ? 0 : 1);
     }
 }
 /**
  * 根据全球气象温度数据:统计每年的最高气温
  * 原始数据:
  * 0029029070999991901010106004+64333+023450FM-12+000599999V0202701N015919999999N0000001N9-00781+99999102001ADDGF108991999999999999999999
    0029029070999991901010113004+64333+023450FM-12+000599999V0202901N008219999999N0000001N9-00721+99999102001ADDGF104991999999999999999999
    0029029070999991901010120004+64333+023450FM-12+000599999V0209991C000019999999N0000001N9-00941+99999102001ADDGF108991999999999999999999
  *      
  *      KEYIN:   LongWritable
  *      VALUEIN: Text
  *      KEYOUT:  Text
  *      VALUEOUT: IntWritable
  *             map阶段
  *     k1 v1 
  *     0  0029029070999991901010106004+64333+023450FM-12+000599999V0202701N015919999999N0000001N9-00781+99999102001ADDGF108991999999999999999999
  *     133 0029029070999991901010113004+64333+023450FM-12+000599999V0202901N008219999999N0000001N9-00721+99999102001ADDGF104991999999999999999999
  *     266 0029029070999991901010120004+64333+023450FM-12+000599999V0209991C000019999999N0000001N9-00941+99999102001ADDGF108991999999999999999999
  * 
   *         经过map函数  k2 v2
   *           ("1901",-78)
   *           ("1901",-72)
   *           ("1901",-94)
  * 
  * map函数的编写:
  */
 public class MyMapper extends Mapper<LongWritable, Text, Text, IntWritable> {

     /**
      * Parses one fixed-width NCDC weather record: the year lives in
      * columns 15-19 and the air temperature in columns 87-92 (with an
      * optional leading '+'). Emits a (year, temperature) pair only when
      * the reading is present (not 9999) and its quality code (column 92)
      * is one of 0, 1, 4, 5 or 9.
      *
      * @param key     byte offset of the record within the input split
      * @param value   one raw weather record line
      * @param context sink receiving the (year, temperature) pairs
      */
     @Override
     protected void map(LongWritable key, Text value, Context context)
             throws IOException, InterruptedException {
         String record = value.toString();
         // Year field: characters [15, 19).
         String year = record.substring(15, 19);
         // Skip a leading '+' so parseInt accepts the field; '-' stays.
         int start = (record.charAt(87) == '+') ? 88 : 87;
         int airTemperature = Integer.parseInt(record.substring(start, 92));
         // Single-character quality code immediately after the temperature.
         String qualityCode = record.substring(92, 93);
         // 9999 marks a missing reading; only trusted quality codes pass.
         if (airTemperature != 9999 && qualityCode.matches("[01459]")) {
             context.write(new Text(year), new IntWritable(airTemperature));
         }
     }
 }
 /**
  * Reduce阶段:
   *   接收的数据       ("1901",-78)
   *            ("1901",-72)
   *            ("1901",-94)
  * 
  * KEYIN:Text
  * VALUEIN: IntWritable
  * KEYOUT: Text
  * VALUEOUT:IntWritable
  * 
  *         reduce函数接收的数据:("1901",[-78,-72,-94]) 
  *         reduce函数输出的数据:("1901",maxTemperature)
  * 
  * @author Michael
  *
  */
 public class MyReducer extends Reducer<Text, IntWritable, Text, IntWritable> {

     /**
      * Scans every temperature recorded for one year and writes the
      * (year, maximumTemperature) pair to the output.
      *
      * @param year    the year being aggregated
      * @param temps   all temperatures the mappers emitted for that year
      * @param context sink receiving the (year, max) result
      */
     @Override
     protected void reduce(Text year, Iterable<IntWritable> temps, Context context)
             throws IOException, InterruptedException {
         // Start below any possible reading so the first value always wins.
         int highest = Integer.MIN_VALUE;
         for (IntWritable reading : temps) {
             if (reading.get() > highest) {
                 highest = reading.get();
             }
         }
         // After the loop, highest holds the year's maximum temperature.
         context.write(year, new IntWritable(highest));
     }
 }
 public class MyMaxTemperature {
     /**
      * Configures and submits the max-temperature job.
      *
      * @param args args[0] = input directory, args[1] = output directory
      *             (the output directory must not already exist)
      * @throws Exception if job configuration or execution fails
      */
     public static void main(String[] args) throws Exception {
         // FIX: the original declared "throws IOException, Exception,
         // InterruptedException" — Exception already subsumes the others.
         // Cluster/site configuration picked up from the classpath.
         Configuration conf = new Configuration();
         Job job = Job.getInstance(conf, "max temperature");
         // Jar containing this driver class.
         job.setJarByClass(MyMaxTemperature.class);
         // Map and reduce implementations for this job.
         job.setMapperClass(MyMapper.class);
         job.setReducerClass(MyReducer.class);
         // Output key/value types produced by the reduce phase.
         job.setOutputKeyClass(Text.class);
         job.setOutputValueClass(IntWritable.class);

         // args[0] = input path, args[1] = output path (must not exist yet).
         FileInputFormat.addInputPath(job, new Path(args[0]));
         FileOutputFormat.setOutputPath(job, new Path(args[1]));

         // Exit 0 on success, 1 on failure.
         System.exit(job.waitForCompletion(true) ? 0 : 1);
     }
 }