MapReduce Comprehensive Exercise
Data
The information table (columns: hobby, job, id)
游戏 大数据 1
null Java 3
学习 null 4
逛街 全栈 2
The student table (columns: id, name, sex; 男 = male, 女 = female)
1 张三 女
4 李四 男
3 王五 男
1 赵六 女
Requirements
- Use a map-side join (MapJoin) to merge the two tables.
- Encapsulate the data from both tables in a single JavaBean object.
- Build the object on the map side; on the reduce side, count how many of the object's attributes are null and output that count as the value, with the bean's toString() as the output key (a sample output line is sketched after this list).
- Use two partitions, partitioned by sex.
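To make the third requirement concrete: with the sample data above and the bean's toString() defined below, one reducer output line would look roughly like the following. This record joins student id 4 with the information row whose job is null, so the count is 1, and it lands in the partition for male students; the exact ordering within a partition depends on the compareTo implementation below.

JavaBean{id=4, name='李四', sex='男', hobby='学习', job='null'}	1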
Write a JavaBean class to store the data and implement requirement 2
import org.apache.hadoop.io.WritableComparable;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

public class JavaBean implements WritableComparable<JavaBean> {
    // The five fields that hold the joined record
    private int id;
    private String name;
    private String sex;
    private String hobby;
    private String job;

    // Getters and setters
    public int getId() {
        return id;
    }

    public void setId(int id) {
        this.id = id;
    }

    public String getName() {
        return name;
    }

    public void setName(String name) {
        this.name = name;
    }

    public String getSex() {
        return sex;
    }

    public void setSex(String sex) {
        this.sex = sex;
    }

    public String getHobby() {
        return hobby;
    }

    public void setHobby(String hobby) {
        this.hobby = hobby;
    }

    public String getJob() {
        return job;
    }

    public void setJob(String job) {
        this.job = job;
    }

    // Override toString(); its result is used as the output key in the final files
    @Override
    public String toString() {
        return "JavaBean{" +
                "id=" + id +
                ", name='" + name + '\'' +
                ", sex='" + sex + '\'' +
                ", hobby='" + hobby + '\'' +
                ", job='" + job + '\'' +
                '}';
    }

    // Convenience setter so the map stage can fill in all fields with a single call
    public void set(int id, String name, String sex, String hobby, String job) {
        this.id = id;
        this.name = name;
        this.sex = sex;
        this.hobby = hobby;
        this.job = job;
    }

    // Sort order: compare by id first, then by name; records with the same id and name
    // count as the same key. (Using o minus this sorts ids in descending order.)
    @Override
    public int compareTo(JavaBean o) {
        if (o.id - this.id == 0) {
            return o.name.compareTo(this.name);
        }
        return o.id - this.id;
    }

    // Hadoop serialization: write the fields out
    @Override
    public void write(DataOutput dataOutput) throws IOException {
        dataOutput.writeInt(id);
        dataOutput.writeUTF(name);
        dataOutput.writeUTF(sex);
        dataOutput.writeUTF(hobby);
        dataOutput.writeUTF(job);
    }

    // Hadoop deserialization: fields must be read in exactly the order they were written,
    // otherwise the data comes back garbled
    @Override
    public void readFields(DataInput dataInput) throws IOException {
        id = dataInput.readInt();
        name = dataInput.readUTF();
        sex = dataInput.readUTF();
        hobby = dataInput.readUTF();
        job = dataInput.readUTF();
    }
}
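Because readFields must consume fields in exactly the order write produced them, a quick local round-trip check can be handy. The sketch below is not part of the MapReduce job; the class name SerializationCheck is only illustrative.

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.IOException;

public class SerializationCheck {
    public static void main(String[] args) throws IOException {
        JavaBean original = new JavaBean();
        original.set(1, "张三", "女", "游戏", "大数据");

        // Serialize with the same write() the framework would call
        ByteArrayOutputStream bytes = new ByteArrayOutputStream();
        original.write(new DataOutputStream(bytes));

        // Deserialize into a fresh bean and compare
        JavaBean copy = new JavaBean();
        copy.readFields(new DataInputStream(new ByteArrayInputStream(bytes.toByteArray())));
        System.out.println(copy);                            // should print the same fields as original
        System.out.println(original.compareTo(copy) == 0);   // true if id and name round-tripped intact
    }
}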
Write the Mapper class to implement requirement 1
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URI;
import java.util.ArrayList;
import java.util.List;

public class MapTest extends Mapper<LongWritable, Text, JavaBean, NullWritable> {
    List<String> list = new ArrayList<>();
    JavaBean k = new JavaBean();

    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
        // Get the path of the file that was loaded into the distributed cache
        URI[] urls = context.getCacheFiles();
        String path = urls[0].getPath();
        // Read the cached information file
        BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(new FileInputStream(path)));
        String line;
        while ((line = bufferedReader.readLine()) != null) {
            // Keep every line of the cached file in the list
            list.add(line);
        }
        bufferedReader.close();
    }

    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // Split the student record
        String[] student_line = value.toString().split("\t");
        // Compare the ids in the information and student files and join the rows whose ids match
        for (int i = 0; i < list.size(); i++) {
            String[] informations = list.get(i).split("\t");
            if (informations[2].equals(student_line[0])) {
                k.set(Integer.parseInt(student_line[0]), student_line[1], student_line[2], informations[0], informations[1]);
            }
        }
        // Write out the joined record
        context.write(k, NullWritable.get());
    }
}
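The linear scan over list works fine for a table this small, but a common map-join variant loads the cached table into a HashMap keyed by id, so each student record is joined with a single lookup and records without a match are skipped. A sketch of that variant (the class name MapJoinHashMapTest is illustrative, and it assumes the same tab-separated file formats):

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.HashMap;
import java.util.Map;

public class MapJoinHashMapTest extends Mapper<LongWritable, Text, JavaBean, NullWritable> {
    private final Map<String, String[]> infoById = new HashMap<>();
    private final JavaBean k = new JavaBean();

    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
        String path = context.getCacheFiles()[0].getPath();
        try (BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(path)))) {
            String line;
            while ((line = reader.readLine()) != null) {
                String[] info = line.split("\t");   // {hobby, job, id}
                infoById.put(info[2], info);        // key the cached row by id
            }
        }
    }

    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        String[] student = value.toString().split("\t");  // {id, name, sex}
        String[] info = infoById.get(student[0]);          // single lookup instead of a scan
        if (info != null) {
            k.set(Integer.parseInt(student[0]), student[1], student[2], info[0], info[1]);
            context.write(k, NullWritable.get());          // only emit records that actually joined
        }
    }
}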
Write a custom partitioner class to implement requirement 4
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Partitioner;

public class PartitionTest extends Partitioner<JavaBean, NullWritable> {
    @Override
    public int getPartition(JavaBean javaBean, NullWritable nullWritable, int i) {
        // Two partitions; partition numbers start at 0
        int partition_num = 1;
        // Records whose sex is "男" (male) go to the first partition
        if ("男".equals(javaBean.getSex())) {
            return 0;
        }
        return partition_num;
    }
}
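One thing to keep in mind: when more than one reduce task is configured, the value returned by getPartition must be smaller than the number of reduce tasks, otherwise the task fails with an "Illegal partition" error. A slightly more defensive variant (illustrative only; the class name SafePartitionTest is not part of the exercise) falls back to a single partition when fewer than two reduce tasks are set:

import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Partitioner;

public class SafePartitionTest extends Partitioner<JavaBean, NullWritable> {
    @Override
    public int getPartition(JavaBean javaBean, NullWritable nullWritable, int numPartitions) {
        if (numPartitions < 2) {
            return 0;                                    // only one reducer: everything goes to it
        }
        return "男".equals(javaBean.getSex()) ? 0 : 1;   // male -> partition 0, otherwise partition 1
    }
}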
The Reducer class, implementing requirement 3
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

public class ReduceTest extends Reducer<JavaBean, NullWritable, JavaBean, IntWritable> {
    int count = 0;
    IntWritable v = new IntWritable();

    @Override
    protected void reduce(JavaBean key, Iterable<NullWritable> values, Context context) throws IOException, InterruptedException {
        // Iterate over the key/value pairs that share the same key
        for (NullWritable value : values) {
            // Add one for each of hobby and job that is null
            if ("null".equals(key.getHobby())) {
                count++;
            }
            if ("null".equals(key.getJob())) {
                count++;
            }
            v.set(count);
            context.write(key, v);
            // Reset count to 0 so it does not accumulate across records
            count = 0;
        }
    }
}
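The reducer above writes one line per value in the group and resets count each time, so a duplicated student record would be written more than once. If the intent of requirement 3 is one output line per distinct bean, a variant like the following would do that (the class name ReduceGroupTest is illustrative):

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

public class ReduceGroupTest extends Reducer<JavaBean, NullWritable, JavaBean, IntWritable> {
    private final IntWritable v = new IntWritable();

    @Override
    protected void reduce(JavaBean key, Iterable<NullWritable> values, Context context) throws IOException, InterruptedException {
        int count = 0;                          // local variable, so no manual reset is needed
        if ("null".equals(key.getHobby())) {
            count++;                            // hobby missing in the information table
        }
        if ("null".equals(key.getJob())) {
            count++;                            // job missing in the information table
        }
        v.set(count);
        context.write(key, v);                  // exactly one output line per key group
    }
}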
The Driver class
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.net.URI;

public class DriverTest {
    public static void main(String[] args) throws Exception {
        Job job = Job.getInstance(new Configuration());
        // Tell the job which classes are the Driver, Mapper, and Reducer
        job.setJarByClass(DriverTest.class);
        job.setMapperClass(MapTest.class);
        job.setReducerClass(ReduceTest.class);
        // Output key/value types of the map stage
        job.setMapOutputKeyClass(JavaBean.class);
        job.setMapOutputValueClass(NullWritable.class);
        // Output key/value types of the reduce stage: the key is the bean, the value is the null count
        job.setOutputKeyClass(JavaBean.class);
        job.setOutputValueClass(IntWritable.class);
        // Use the custom partitioner
        job.setPartitionerClass(PartitionTest.class);
        // The number of reduce tasks must be at least the number of partitions
        job.setNumReduceTasks(2);
        // Path of the table to cache; the file:// scheme is required and refers to the local file system
        job.addCacheFile(new URI("file:///D:/MP/MPTest01/input/information.txt"));
        // Input and output paths
        FileInputFormat.setInputPaths(job, "D:\\MP\\MPTest01\\input\\student.txt");
        FileOutputFormat.setOutputPath(job, new Path("D:\\MP\\MPTest01\\output"));
        boolean b = job.waitForCompletion(true);
        System.exit(b ? 0 : 1);
    }
}
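FileOutputFormat refuses to start the job if the output directory already exists, so re-running this locally fails until D:\MP\MPTest01\output is removed. An optional convenience is to delete a stale output directory inside main(), before the existing setOutputPath call, with org.apache.hadoop.fs.FileSystem added to the imports; a minimal sketch:

// Optional: remove a stale output directory before submitting the job
Path output = new Path("D:\\MP\\MPTest01\\output");
FileSystem fs = FileSystem.get(job.getConfiguration());
if (fs.exists(output)) {
    fs.delete(output, true);   // true = delete recursively
}
FileOutputFormat.setOutputPath(job, output);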