In the movie-ratings case, the requirement is to find the top-N highest-rated records for each movie. The straightforward solution is: emit the movie ID as the key and the movie record as the value to the Reduce side; the Reduce side then collects all records of one movie into a list, sorts the list by rating in descending order, and outputs the first N records. This works, but it buffers an entire movie's records in reducer memory.
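For contrast, here is a minimal sketch of that naive reducer. It assumes the mapper emits the movie ID as a Text key and a MovieWritable value bean with the same fields and accessors as the MovieWritableComparable class shown later (the accessor names on MovieWritable are assumptions here):

static class NaiveTopNReducer extends Reducer<Text, MovieWritable, MovieWritable, NullWritable> {
    @Override
    protected void reduce(Text key, Iterable<MovieWritable> values, Context context)
            throws IOException, InterruptedException {
        List<MovieWritable> list = new ArrayList<>();
        for (MovieWritable mw : values) {
            // Hadoop reuses the value object across iterations, so copy before buffering
            MovieWritable copy = new MovieWritable();
            copy.setMovie(mw.getMovie());
            copy.setRate(mw.getRate());
            copy.setTimeStamp(mw.getTimeStamp());
            copy.setUid(mw.getUid());
            list.add(copy);
        }
        // Sort this movie's records by rating, descending, then emit the first N
        list.sort((a, b) -> Double.compare(b.getRate(), a.getRate()));
        for (int i = 0; i < Math.min(3, list.size()); i++) {
            context.write(list.get(i), NullWritable.get());
        }
    }
}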
The efficient top-N approach works as follows:
We place the whole movie bean in the Mapper's output-key position. Internally, MapReduce sorts, partitions, and groups by key by default, so putting the bean in the key position requires the following three things (each implemented in the sections below):
- The custom class must be serializable and sortable, i.e. implement the WritableComparable interface.
- A custom partitioner that partitions on a chosen property of the bean, here the movie ID, guaranteeing that all records of one movie are processed by the same ReduceTask.
- A custom grouping comparator, guaranteeing that all records of one movie land in the same iterator for the reduce-side aggregation.
1 Custom POJO class
Sort rule: records of the same movie ID are ordered by rating in descending order.
package com._51doit.pojo;

import org.apache.hadoop.io.WritableComparable;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

/**
 * Author: 多易教育-行哥
 * Date: 2020/7/13
 * Description: This class is placed in the MapTask's output-key position,
 * so it must support both serialization and sorting.
 */
public class MovieWritableComparable implements WritableComparable<MovieWritableComparable> {

    private String movie;
    private double rate;
    private long timeStamp;
    private int uid;

    public String getMovie() {
        return movie;
    }

    public void setMovie(String movie) {
        this.movie = movie;
    }

    public double getRate() {
        return rate;
    }

    public void setRate(double rate) {
        this.rate = rate;
    }

    public long getTimeStamp() {
        return timeStamp;
    }

    public void setTimeStamp(long timeStamp) {
        this.timeStamp = timeStamp;
    }

    public int getUid() {
        return uid;
    }

    public void setUid(int uid) {
        this.uid = uid;
    }

    @Override
    public String toString() {
        return "MovieWritableComparable{" +
                "movie='" + movie + '\'' +
                ", rate=" + rate +
                ", timeStamp=" + timeStamp +
                ", uid=" + uid +
                '}';
    }

    /**
     * Sort rule: order by movie ID first; within the same movie,
     * order by rating in descending order.
     *
     * @param o the key to compare against
     * @return comparison result
     */
    @Override
    public int compareTo(MovieWritableComparable o) {
        int cmp = this.movie.compareTo(o.getMovie());
        // Same movie: higher rating sorts first (descending)
        return cmp == 0 ? Double.compare(o.getRate(), this.rate) : cmp;
    }

    /**
     * Write method: defines the serialization format.
     * The field order here must match readFields().
     *
     * @param dataOutput
     * @throws IOException
     */
    @Override
    public void write(DataOutput dataOutput) throws IOException {
        dataOutput.writeUTF(movie);
        dataOutput.writeDouble(rate);
        dataOutput.writeLong(timeStamp);
        dataOutput.writeInt(uid);
    }

    /**
     * Read method: deserialization, reading the fields back
     * in the same order they were written.
     *
     * @param dataInput
     * @throws IOException
     */
    @Override
    public void readFields(DataInput dataInput) throws IOException {
        this.movie = dataInput.readUTF();
        this.rate = dataInput.readDouble();
        this.timeStamp = dataInput.readLong();
        this.uid = dataInput.readInt();
    }
}
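To make the sort rule concrete, here is a small hypothetical standalone check (not part of the job) that sorts a few keys in memory; the IDs and ratings are made-up sample values:

import java.util.ArrayList;
import java.util.Collections;
import java.util.List;

public class SortRuleDemo {
    public static void main(String[] args) {
        List<MovieWritableComparable> keys = new ArrayList<>();
        keys.add(make("m1", 4.0));
        keys.add(make("m2", 5.0));
        keys.add(make("m1", 9.5));
        Collections.sort(keys);
        // Expected order: (m1, 9.5), (m1, 4.0), (m2, 5.0)
        // -> ascending by movie ID, descending by rating within one movie
        keys.forEach(k -> System.out.println(k.getMovie() + " " + k.getRate()));
    }

    private static MovieWritableComparable make(String movie, double rate) {
        MovieWritableComparable k = new MovieWritableComparable();
        k.setMovie(movie);
        k.setRate(rate);
        return k;
    }
}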
2 Custom partitioner
Partition on the movie ID stored in the movie bean.
package com._51doit.mr.high_top;

import com._51doit.pojo.MovieWritableComparable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Partitioner;

/**
 * Author: 多易教育-行哥
 * Date: 2020/7/13
 * Description: Partitions on the movie ID so that all records of the
 * same movie are sent to the same ReduceTask.
 */
public class MyPartitioner extends Partitioner<MovieWritableComparable, NullWritable> {
    @Override
    public int getPartition(MovieWritableComparable movieWritableComparable, NullWritable nullWritable, int numPartitions) {
        // Mask off the sign bit so a negative hashCode cannot yield a negative partition index
        return (movieWritableComparable.getMovie().hashCode() & Integer.MAX_VALUE) % numPartitions;
    }
}
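The `& Integer.MAX_VALUE` mask is easy to overlook. String.hashCode() can be negative, and in Java a negative value modulo numPartitions is still negative, which is an invalid partition index. A tiny illustrative check (the string here is just a well-known example of a negative hashCode):

public class PartitionMaskDemo {
    public static void main(String[] args) {
        int numPartitions = 3;
        // Any movie ID whose hashCode happens to be negative triggers the problem
        int h = "polygenelubricants".hashCode();                      // a famously negative hashCode
        System.out.println(h % numPartitions);                        // prints a negative index
        System.out.println((h & Integer.MAX_VALUE) % numPartitions);  // always in [0, numPartitions)
    }
}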
3 Custom grouping comparator
package com._51doit.mr.high_top;

import com._51doit.pojo.MovieWritableComparable;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;

/**
 * Author: 多易教育-行哥
 * Date: 2020/7/13
 * Description: Groups reduce input by movie ID only, so that all records
 * of one movie arrive in a single reduce() call.
 */
public class MyGroupComparetor extends WritableComparator {

    public MyGroupComparetor() {
        // true -> let WritableComparator create key instances for deserialization;
        // without this call, the casts below would receive null
        super(MovieWritableComparable.class, true);
    }

    @Override
    public int compare(WritableComparable a, WritableComparable b) {
        MovieWritableComparable m1 = (MovieWritableComparable) a; // null if super(keyClass, true) was not called
        MovieWritableComparable m2 = (MovieWritableComparable) b; // null if super(keyClass, true) was not called
        return m1.getMovie().compareTo(m2.getMovie());            // -> NullPointerException in that case
    }
}
Note: in a custom grouping comparator, our no-argument constructor must explicitly call the parent constructor super(MovieWritableComparable.class, true) to guarantee that instances of MovieWritableComparable get created.
Methods are invoked on an object, and creating that object runs our class's no-argument constructor, which by default only makes an implicit call to the parent's no-argument constructor. If you debug that version, you will see that the keyClass field inside WritableComparator is still null, so the framework cannot deserialize the incoming keys and compare() fails with a NullPointerException.
That is why we call the two-argument parent constructor explicitly in our own constructor.
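A simplified sketch of what that parent constructor does, paraphrased from Hadoop's WritableComparator source (not the exact code):

// protected WritableComparator(Class<? extends WritableComparable> keyClass, boolean createInstances)
protected WritableComparator(Class<? extends WritableComparable> keyClass, boolean createInstances) {
    this.keyClass = keyClass;
    if (createInstances) {
        key1 = newKey();                 // instance used to deserialize the first raw key
        key2 = newKey();                 // instance used to deserialize the second raw key
        buffer = new DataInputBuffer();  // scratch buffer for readFields()
    }
    // With only the implicit no-arg super(), none of these are initialized,
    // so comparing incoming raw keys dereferences null -> NullPointerException
}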
4 Job implementation
package com._51doit.mr.high_top;

import com._51doit.pojo.MovieWritableComparable;
import com.alibaba.fastjson.JSON;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.log4j.Level;
import org.apache.log4j.Logger;

import java.io.IOException;

/**
 * Author: 多易教育-行哥
 * Date: 2020/7/13
 * Description: Top-N job: the mapper emits the whole bean as the key,
 * the framework sorts/partitions/groups it, and the reducer just takes
 * the first N records of each group.
 */
public class TopN {

    static class TopNMapper extends Mapper<LongWritable, Text, MovieWritableComparable, NullWritable> {
        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            try {
                String line = value.toString();
                MovieWritableComparable mw = JSON.parseObject(line, MovieWritableComparable.class);
                context.write(mw, NullWritable.get());
            } catch (Exception e) {
                // Skip malformed JSON lines instead of failing the task
            }
        }
    }
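    // A possible refinement (not in the original code): instead of silently
    // swallowing parse failures, count them with a Hadoop counter so bad
    // input shows up in the job's counter output, e.g. inside the catch block:
    //     context.getCounter("TopN", "MALFORMED_LINES").increment(1L);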
    static class TopNReducer extends Reducer<MovieWritableComparable, NullWritable, MovieWritableComparable, NullWritable> {
        /**
         * One group == one movie, and the keys inside the group arrive
         * already sorted by rating in descending order, so the first
         * three records are the top 3.
         *
         * @param key
         * @param values
         * @param context
         * @throws IOException
         * @throws InterruptedException
         */
        @Override
        protected void reduce(MovieWritableComparable key, Iterable<NullWritable> values, Context context) throws IOException, InterruptedException {
            int count = 0;
            for (NullWritable value : values) {
                // As the iterator advances, the framework deserializes the next
                // record of the group into `key`, so writing `key` on every
                // iteration emits a different rating record
                context.write(key, NullWritable.get());
                count++;
                if (count == 3) {
                    return; // top 3 emitted; skip the rest of this movie's records
                }
            }
        }
    }
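    // The "3" above is hard-coded. A hypothetical way to make N configurable
    // (not in the original code) is to pass it through the Configuration:
    //     in main():      conf.setInt("topn.n", 3);
    //     in the reducer: int n = context.getConfiguration().getInt("topn.n", 3);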
    public static void main(String[] args) throws Exception {
        Logger.getLogger("org").setLevel(Level.INFO);
        Configuration conf = new Configuration();
        // The second argument is the job name
        Job job = Job.getInstance(conf, TopN.class.getSimpleName());
        job.setMapperClass(TopNMapper.class);
        job.setReducerClass(TopNReducer.class);
        // Output types of the map phase
        job.setMapOutputKeyClass(MovieWritableComparable.class);
        job.setMapOutputValueClass(NullWritable.class);
        // Output types of the final result
        job.setOutputKeyClass(MovieWritableComparable.class);
        job.setOutputValueClass(NullWritable.class);
        // Custom partitioner: same movie -> same ReduceTask
        job.setPartitionerClass(MyPartitioner.class);
        // Custom grouping comparator: same movie -> same reduce() call
        job.setGroupingComparatorClass(MyGroupComparetor.class);
        // job.setNumReduceTasks(2); // run 2 reduce tasks
        // Input and output paths
        FileInputFormat.setInputPaths(job, new Path("D:\\data\\movie\\input"));
        FileOutputFormat.setOutputPath(job, new Path("D:\\data\\movie\\res3"));
        job.waitForCompletion(true);
    }
}
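One practical note when re-running the job locally: FileOutputFormat requires that the output directory does not already exist, otherwise the job fails immediately with a FileAlreadyExistsException. A convenience sketch (not in the original code, and it requires an extra import of org.apache.hadoop.fs.FileSystem) that deletes a stale output directory in main() before submitting:

// Hypothetical pre-cleanup, placed before FileOutputFormat.setOutputPath(...)
Path out = new Path("D:\\data\\movie\\res3");
FileSystem fs = FileSystem.get(conf);
if (fs.exists(out)) {
    fs.delete(out, true); // recursively delete the previous run's output
}
FileOutputFormat.setOutputPath(job, out);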