【MapReduce】:为了高效计算大数据集中的有价值的数据
1.简介:
MapReduce是一个计算软件框架、可以在集群上并行处理数据集。
进行作业时:分两个阶段
Map阶段:
map函数
Reduce阶段:
reduce函数
计算和存储是同一个节点【不是绝对的】
2、MapReduce对数据的处理:都是以键值对<key,value>的形式
(input) <k1, v1> -> map -> <k2, v2> -> reduce -> <k3, v3> (output)
注意:MapReduce的key和value必须可序列化:实现Writable接口
Key必须实现WritableComparable,进行排序
3、编写MapReduce程序
(1)创建java项目,导入jar包
(2)编写map函数
(3)编写reduce函数
(4)编写driver类[main方法]
4、打包mapreduce程序
右键选择export-->java->jar file-->选择你要导出的项目下的哪个包,[lib文件夹不要选择]
-->next-->设置 mainClass-->选择生成路径-->wc.jar
5、上传到某一个Linux节点上
6、运行mapreduce程序
]$ hadoop jar wc.jar [input path] [output path]
注意:output path的目录应该指定一个新目录,运行程序时,会自动创建这个路径
准备工作:将n个文本文件 上传到 input path内
input path:
单词统计:
hello world welcome to china <0,"hello world welcome to china"> -> <"hello",1> <"world","1">
hello kitty <29,"hello kitty"> ---> <"hello",1> <"kitty",1>
ni hao <40,"ni hao">
i like china <46,"i like china">
进入reduce函数时:输入数据 <key,value>
<"hello",[1,1]>
<"world",[1]> reduce函数处理后:<"hello",2> <"world",1> /**
* Mapper类型的泛型:Mapper<KEYIN, VALUEIN, KEYOUT, VALUEOUT>
* 单词统计案例: KEYIN: 行偏移量LongWritable
* VALUEIN: 一行数据Text
* KEYOUT: 单词 Text
* VALUEOUT: 单词的数据IntWritable
*
*
* hello world welcome to china <0,"hello world welcome to china"> --------> <"hello",1> <"world","1">
* hello kitty <29,"hello kitty"> --------> <"hello",1> <"kitty",1>
*/
/**
 * Word-count mapper: splits each input line on single spaces and emits a
 * {@code <word, 1>} pair for every token found.
 */
public class MyMapper extends Mapper<LongWritable, Text, Text, IntWritable>{
    // Reusable output objects so we do not allocate per record.
    private final IntWritable one = new IntWritable(1);
    private Text word = new Text();

    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        // Tokenize the line; StringTokenizer skips empty tokens between delimiters.
        StringTokenizer tokens = new StringTokenizer(value.toString(), " ");
        while (tokens.hasMoreTokens()) {
            // Wrap the next token in the reusable Text key and emit <word, 1>.
            word.set(tokens.nextToken());
            context.write(word, one);
        }
    }
}
/**
* Reducer<KEYIN,VALUEIN,KEYOUT,VALUEOUT>
*
*
* KEYIN:Text 单词作为key map阶段的输出数据key类型
* VALUEIN:IntWritable 单词的数量value: map阶段的输出数据value类型
* KEYOUT:Text 单词作为key
* VALUEOUT:IntWritable 单词最终统计数量
*
* 在接收map的输出数据后,进入reduce方法前,做了一次处理:将同一个key的value值放到一起组成新的键值对
*/
/**
 * Word-count reducer: adds up the 1s collected for each word and emits
 * {@code <word, totalCount>}.
 */
public class MyReducer extends Reducer<Text,IntWritable,Text,IntWritable> {
    // Reusable output value object.
    private IntWritable result = new IntWritable();

    @Override
    protected void reduce(Text key, Iterable<IntWritable> value,
            Context context) throws IOException, InterruptedException {
        // Sum every partial count that the shuffle grouped under this word.
        int total = 0;
        for (IntWritable count : value) {
            total += count.get();
        }
        // Publish the final <word, total> pair to the job output.
        result.set(total);
        context.write(key, result);
    }
}
public class MyWorldCount {
/**
 * Driver for the word-count job: wires the mapper/reducer classes, the
 * output key/value types and the input/output paths, then submits the job
 * and blocks until it finishes.
 *
 * @param args args[0] = input path, args[1] = output path (must not exist yet;
 *             Hadoop creates it and fails if it is already there)
 * @throws Exception if job configuration or execution fails
 */
public static void main(String[] args) throws Exception {
    if (args.length < 2) {
        System.err.println("Usage: MyWorldCount <input path> <output path>");
        System.exit(2);
    }
    // Load the Hadoop configuration (core-site.xml, etc.).
    Configuration conf = new Configuration();
    // Create a named job backed by that configuration.
    Job job = Job.getInstance(conf, "word count");
    // The class containing main(); lets Hadoop locate the jar to ship to the cluster.
    job.setJarByClass(MyWorldCount.class);
    // Map and reduce implementations.
    job.setMapperClass(MyMapper.class);
    job.setReducerClass(MyReducer.class);
    // Output key/value types of the reduce phase.
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);
    // Input files to read.
    FileInputFormat.addInputPath(job, new Path(args[0]));
    // BUG FIX: the output path must be args[1]; the original passed args[0]
    // (the input path), so the job would abort because the directory exists.
    FileOutputFormat.setOutputPath(job, new Path(args[1]));
    // Exit 0 on success, 1 on failure.
    System.exit(job.waitForCompletion(true) ? 0 : 1);
}
}
/**
* 根据全球气象温度数据:统计每年的最高气温
* 原始数据:
* 0029029070999991901010106004+64333+023450FM-12+000599999V0202701N015919999999N0000001N9-00781+99999102001ADDGF108991999999999999999999
0029029070999991901010113004+64333+023450FM-12+000599999V0202901N008219999999N0000001N9-00721+99999102001ADDGF104991999999999999999999
0029029070999991901010120004+64333+023450FM-12+000599999V0209991C000019999999N0000001N9-00941+99999102001ADDGF108991999999999999999999
*
* KEYIN: LongWritable
* VALUEIN: Text
* KEYOUT: Text
* VALUEOUT: IntWritable
* map阶段
* k1 v1
* 0 0029029070999991901010106004+64333+023450FM-12+000599999V0202701N015919999999N0000001N9-00781+99999102001ADDGF108991999999999999999999
* 133 0029029070999991901010113004+64333+023450FM-12+000599999V0202901N008219999999N0000001N9-00721+99999102001ADDGF104991999999999999999999
* 266 0029029070999991901010120004+64333+023450FM-12+000599999V0209991C000019999999N0000001N9-00941+99999102001ADDGF108991999999999999999999
*
* 经过map函数 k2 v2
* ("1901",-78)
* ("1901",-72)
* ("1901",-94)
*
* map函数的编写:
*/
/**
 * NCDC weather mapper: extracts the year and the air temperature from each
 * fixed-width record and emits {@code <year, temperature>} for readings
 * that are present and carry an acceptable quality code.
 */
public class MyMapper extends Mapper<LongWritable,Text,Text,IntWritable>{
    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        String record = value.toString();
        // Columns 15-18 hold the four-digit observation year.
        String year = record.substring(15, 19);
        // Columns 87-91 hold the signed temperature. A leading '+' is dropped
        // so Integer.parseInt accepts the digits; a leading '-' is kept.
        int temperature = (record.charAt(87) == '+')
                ? Integer.parseInt(record.substring(88, 92))
                : Integer.parseInt(record.substring(87, 92));
        // Column 92 is the quality code for the reading.
        String quality = record.substring(92, 93);
        // 9999 marks a missing reading; only quality codes 0/1/4/5/9 are trusted.
        if (temperature != 9999 && quality.matches("[01459]")) {
            context.write(new Text(year), new IntWritable(temperature));
        }
    }
}
/**
* Reduce阶段:
* 接收的数据 ("1901",-78)
* ("1901",-72)
* ("1901",-94)
*
* KEYIN:Text
* VALUEIN: IntWritable
* KEYOUT: Text
* VALUEOUT:IntWritable
*
* reduce函数接收的数据:("1901",[-78,-72,-94])
* reduce函数输出的数据:("1901",maxTemperature)
*
* @author Michael
*
*/
/**
 * Max-temperature reducer: scans every reading the shuffle grouped under a
 * year and emits {@code <year, highest temperature>}.
 */
public class MyReducer extends Reducer<Text,IntWritable,Text,IntWritable>{
    @Override
    protected void reduce(Text year, Iterable<IntWritable> temps, Context context)
            throws IOException, InterruptedException {
        // Start below any possible reading so the first value always wins.
        int highest = Integer.MIN_VALUE;
        for (IntWritable reading : temps) {
            if (reading.get() > highest) {
                highest = reading.get();
            }
        }
        // Emit the yearly maximum.
        context.write(year, new IntWritable(highest));
    }
}
public class MyMaxTemperature {
/**
 * Driver for the max-temperature job: configures mapper, reducer, output
 * types and paths, submits the job and waits for completion.
 *
 * @param args args[0] = input path, args[1] = output path (must not exist yet)
 * @throws Exception if job configuration or execution fails
 */
public static void main(String[] args) throws Exception {
    // FIX: the original declared "throws IOException, Exception,
    // InterruptedException" — both extra types are subclasses of Exception.
    if (args.length < 2) {
        System.err.println("Usage: MyMaxTemperature <input path> <output path>");
        System.exit(2);
    }
    // Build a job from the cluster configuration.
    Configuration conf = new Configuration();
    Job job = Job.getInstance(conf, "max temperature");
    // The class containing main(); lets Hadoop locate the jar to ship.
    job.setJarByClass(MyMaxTemperature.class);
    // Map and reduce implementations.
    job.setMapperClass(MyMapper.class);
    job.setReducerClass(MyReducer.class);
    // Output key/value types of the reduce phase.
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);
    // Input and output locations.
    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));
    // Exit 0 on success, 1 on failure.
    System.exit(job.waitForCompletion(true) ? 0 : 1);
}
}