需求
找出每个月温度最高的两天
数据集
1949-10-01 14:21:02 34c
1949-10-01 19:21:02 38c
1949-10-02 14:01:02 36c
1950-01-01 11:21:02 32c
1950-10-01 12:21:02 37c
1951-12-01 12:21:02 23c
1950-10-02 12:21:02 41c
1950-10-03 12:21:02 27c
1951-07-01 12:21:02 45c
1951-07-02 12:21:02 46c
1951-07-03 12:21:03 47c
案例分析
在MR中,原语是“相同”key的键值对为一组,调用一次reduce方法,方法内迭代这组数据计算。
找出每个月气温最高的两天
二次排序,分组比较器和排序比较器不一样
map:key LongWritable
value Text
日期+气温 将value中的指标拿出来组成一个新的key
分组的时候,需要将新的key拆开来比较
需要重写排序比较器/分组比较器
步骤
- 自定义数据类型Weather
包含时间
包含温度
- 自定义排序比较规则
- 自定义分组比较
年月相同被视为相同的key
那么reduce迭代时,相同年月的记录有可能是同一天的,reduce中需要判断是否同一天
- 数据量很大
全量数据可以按月份切分,每个分片最少包含一个完整月份的数据,再分别进行判断
这种业务场景可以设置多个reduce
通过实现partition
具体实现
Weather
/**
 * Composite MapReduce key describing one temperature reading: the calendar
 * date (year, month, day) plus the temperature measured on that date.
 *
 * <p>Natural ordering is year ascending, then month ascending, then
 * temperature descending. The day does not take part in the ordering.
 */
public class Weather implements WritableComparable<Weather> {
    private Integer year;
    private Integer month;
    private Integer day;
    private Double temperature;

    /**
     * Orders by year ascending, month ascending, then temperature descending.
     *
     * @param o the reading to compare against
     * @return a negative, zero or positive value as this reading sorts
     *         before, equal to, or after {@code o}
     */
    @Override
    public int compareTo(Weather o) {
        System.out.println("********Weather********compareTo*****************");
        int result = this.year.compareTo(o.getYear());
        if (result == 0) {
            result = this.month.compareTo(o.getMonth());
            if (result == 0) {
                // Operands are swapped on purpose so hotter readings sort first.
                // The result must be assigned: previously it was discarded, so
                // readings of the same month always compared as equal.
                result = o.getTemperature().compareTo(this.temperature);
            }
        }
        return result;
    }

    /**
     * Comparator registered as the default for {@link Weather} keys.
     *
     * <p>It is used when Weather is the key and the job does not specify one
     * through {@code job.setSortComparatorClass(...)} (compare with Text as the
     * key in the wordcount example). It delegates to
     * {@link Weather#compareTo(Weather)}: year and month ascending,
     * temperature descending.
     */
    public static class Comparator extends WritableComparator {
        public Comparator() {
            // true: let the parent create key instances for deserialised comparison.
            super(Weather.class, true);
        }

        @Override
        public int compare(WritableComparable a, WritableComparable b) {
            System.out.println("======Weather=====subClass=====Comparator==========================");
            Weather wa = (Weather) a;
            Weather wb = (Weather) b;
            return wa.compareTo(wb);
        }
    }

    static {
        // Register this comparator as the default one for Weather keys.
        WritableComparator.define(Weather.class, new Comparator());
    }

    /**
     * Serialises the fields in the order year, month, day, temperature.
     *
     * @param dataOutput sink the fields are written to
     * @throws IOException if writing to {@code dataOutput} fails
     */
    @Override
    public void write(DataOutput dataOutput) throws IOException {
        dataOutput.writeInt(year);
        dataOutput.writeInt(month);
        dataOutput.writeInt(day);
        dataOutput.writeDouble(temperature);
    }

    /**
     * Deserialises the fields in the same order {@link #write(DataOutput)}
     * emits them.
     *
     * @param dataInput source the fields are read from
     * @throws IOException if reading from {@code dataInput} fails
     */
    @Override
    public void readFields(DataInput dataInput) throws IOException {
        setYear(dataInput.readInt());
        setMonth(dataInput.readInt());
        setDay(dataInput.readInt());
        setTemperature(dataInput.readDouble());
    }

    public Integer getYear() {
        return year;
    }

    public void setYear(Integer year) {
        this.year = year;
    }

    public Integer getMonth() {
        return month;
    }

    public void setMonth(Integer month) {
        this.month = month;
    }

    public Integer getDay() {
        return day;
    }

    public void setDay(Integer day) {
        this.day = day;
    }

    public Double getTemperature() {
        return temperature;
    }

    public void setTemperature(Double temperature) {
        this.temperature = temperature;
    }
}
WeatherGroupingComparator
WeatherMain
/**
 * Driver that configures and submits the "two hottest days per month" job.
 *
 * <p>Expects exactly two arguments: the input path and the output path.
 * An existing output path is deleted recursively before the job starts.
 */
public class WeatherMain {
    /**
     * Builds the job, runs it to completion and exits with status 0 on
     * success or 1 on failure (or on wrong usage).
     *
     * @param args {@code <input> <output>}
     * @throws IOException            if the file system or job submission fails
     * @throws ClassNotFoundException if a job class cannot be resolved
     * @throws InterruptedException   if waiting for the job is interrupted
     */
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        if (args == null || args.length != 2) {
            System.out.println("Usage:yarn jar myweather.jar com.bupt.weather <input> <output>");
            System.exit(1);
        }

        Configuration configuration = new Configuration(true);
        // NOTE(review): forces the local runner although the usage text
        // mentions "yarn jar" - confirm this is intended outside of testing.
        configuration.set("mapreduce.framework.name", "local");

        Job job = Job.getInstance(configuration);
        job.setJobName("统计每个月温度最高的两天");
        job.setJarByClass(WeatherMain.class);

        // Map side: Weather key, original input line (e.g. "1949-10-01 14:21:02 34c") as value.
        job.setMapperClass(WeatherMapper.class);
        job.setMapOutputKeyClass(Weather.class);
        job.setMapOutputValueClass(Text.class);

        // Reduce side.
        job.setReducerClass(WeatherReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(NullWritable.class);

        // Input and output paths; a stale output directory is removed first.
        FileInputFormat.addInputPath(job, new Path(args[0]));
        Path outputPath = new Path(args[1]);
        if (outputPath.getFileSystem(configuration).exists(outputPath)) {
            outputPath.getFileSystem(configuration).delete(outputPath, true);
        }
        FileOutputFormat.setOutputPath(job, outputPath);

        // Partitioning and the number of reduce tasks.
        job.setNumReduceTasks(2);
        job.setPartitionerClass(WeatherPatitioner.class);

        // An explicit sort comparator replaces the comparator registered by the key class itself.
        job.setSortComparatorClass(WeatherSortComparator.class);
        // Grouping comparator deciding which keys share one reduce() call.
        job.setGroupingComparatorClass(WeatherGroupingComparator.class);

        // Propagate the job outcome; previously the result was ignored and
        // the process exited with 0 even when the job failed.
        boolean succeeded = job.waitForCompletion(true);
        System.exit(succeeded ? 0 : 1);
    }
}
WeatherMapper
/**
 * Turns each input line into a {@link Weather} key and emits it together with
 * the untouched input line as the value.
 *
 * <p>A line is split on a tab character: the first field is a timestamp in
 * {@code yyyy-MM-dd HH:mm:ss} layout, the second is the temperature followed
 * by a one-character unit suffix (e.g. {@code 34c}). Lines that do not match
 * are reported on stderr and skipped.
 */
public class WeatherMapper extends Mapper<LongWritable, Text, Weather, Text> {
    // Reused for every record instead of being rebuilt per call. This relies on
    // the same single-threaded use of the mapper instance that the shared
    // Weather key already depends on.
    private final Weather weather = new Weather();
    private final SimpleDateFormat timestampFormat = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
    private final Calendar calendar = Calendar.getInstance();

    /**
     * Parses one line and writes {@code (Weather, original line)}.
     *
     * @param key     input key supplied by the framework (unused)
     * @param value   raw input line, e.g. {@code 1949-10-01 14:21:02<TAB>34c}
     * @param context sink for the emitted pair
     * @throws java.io.IOException  if writing the pair fails
     * @throws InterruptedException if writing the pair is interrupted
     */
    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws java.io.IOException, InterruptedException {
        String line = value.toString().trim();
        String[] datas = line.split("\t");

        // Guard against lines without a temperature field; indexing datas[1]
        // unconditionally used to abort the whole task on such input.
        if (datas.length < 2 || datas[1].isEmpty()) {
            System.err.println("Skipping malformed weather record: " + line);
            return;
        }

        try {
            // Strip the trailing unit character: "34c" -> "34" -> 34.0
            String reading = datas[1].substring(0, datas[1].length() - 1);
            weather.setTemperature(Double.parseDouble(reading));

            Date date = timestampFormat.parse(datas[0]);
            calendar.setTime(date);
            weather.setYear(calendar.get(Calendar.YEAR));
            // Calendar.MONTH is zero-based, so add 1 to get the calendar month.
            weather.setMonth(calendar.get(Calendar.MONTH) + 1);
            weather.setDay(calendar.get(Calendar.DAY_OF_MONTH));
        } catch (java.text.ParseException | NumberFormatException e) {
            // Only parsing problems are treated as "bad record, skip it".
            System.err.println("Skipping unparsable weather record: " + line + " (" + e + ")");
            return;
        }

        // Kept outside the try block so that write failures propagate to the
        // framework instead of being swallowed together with parse errors.
        context.write(weather, value);
    }
}
WeatherPatitioner
WeatherReducer
/**
 * Emits at most two input lines per reduce group: the first value of the
 * group, and the first later value whose key carries a different day.
 *
 * <p>NOTE(review): this presumably relies on the values arriving hottest
 * first and on the framework refreshing {@code key} while the values are
 * iterated; neither the sort nor the grouping comparator is visible here -
 * confirm against those classes.
 */
public class WeatherReducer extends Reducer<Weather, Text, Text, NullWritable> {
    /**
     * Writes the first record of the group and the next record from another day.
     *
     * @param key     group key; its day is re-read on every iteration
     * @param values  original input lines belonging to the group
     * @param context sink for the selected lines
     * @throws IOException          if writing a line fails
     * @throws InterruptedException if writing a line is interrupted
     */
    @Override
    protected void reduce(Weather key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
        // -1 marks "no record emitted yet".
        int firstEmittedDay = -1;

        for (Text record : values) {
            if (firstEmittedDay == -1) {
                // First record of the group: always emitted, and its day remembered.
                context.write(record, NullWritable.get());
                firstEmittedDay = key.getDay();
                continue;
            }

            if (firstEmittedDay == key.getDay()) {
                // Another reading from the day already emitted - ignore it.
                continue;
            }

            // First reading from a different day: emit it and stop.
            context.write(record, NullWritable.get());
            return;
        }
    }
}
WeatherSortComparator
配置传参
测试结果