Data and Requirements

Data

information table (tab-separated columns: hobby, job, id)

游戏 大数据 1
null Java 3
学习 null 4
逛街 全栈 2

student table (tab-separated columns: id, name, sex)

1 张三 女
4 李四 男
3 王五 男
1 赵六 女

Requirements

  • Use a map-side join (MapJoin) to merge the two tables (see the sketch right after this list)
  • Encapsulate the data from both tables into a single JavaBean object
  • Build the object on the map side; on the reduce side, count how many of the object's fields are "null" and emit that count as the value, with the Bean's toString() as the key
  • Use two partitions, split by sex
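
The gist of requirement 1 (MapJoin): the small information table is shipped to every map task through the distributed cache and loaded into memory in setup(), while the larger student table is the regular job input, so the join happens entirely on the map side. The two calls that wire this up appear in the Driver below; shown here as a minimal sketch with the paths used in this example:

// small table: registered in the distributed cache and read back in Mapper.setup()
job.addCacheFile(new URI("file:///D:/MP/MPTest01/input/information.txt"));
// large table: normal job input, split across map tasks
FileInputFormat.setInputPaths(job, "D:\\MP\\MPTest01\\input\\student.txt");
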
Code Implementation

Write a JavaBean class to hold the joined data (requirement 2)

import org.apache.hadoop.io.WritableComparable;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

public class JavaBean implements WritableComparable<JavaBean> {
    //the five fields of the joined record
    private int id;
    private String name;
    private String sex;
    private String hobby;
    private String job;
    //getters and setters for the fields
    public int getId() {
        return id;
    }

    public void setId(int id) {
        this.id = id;
    }

    public String getName() {
        return name;
    }

    public void setName(String name) {
        this.name = name;
    }

    public String getSex() {
        return sex;
    }

    public void setSex(String sex) {
        this.sex = sex;
    }

    public String getHobby() {
        return hobby;
    }

    public void setHobby(String hobby) {
        this.hobby = hobby;
    }

    public String getJob() {
        return job;
    }

    public void setJob(String job) {
        this.job = job;
    }

    //override toString(); this string is what appears as the output key
    @Override
    public String toString() {
        return "JavaBean{" +
                "id=" + id +
                ", name='" + name + '\'' +
                ", sex='" + sex + '\'' +
                ", hobby='" + hobby + '\'' +
                ", job='" + job + '\'' +
                '}';
    }
    
    //convenience setter so the map phase can fill all fields in one call
    public void set(int id, String name, String sex, String hobby, String job) {
        this.id = id;
        this.name = name;
        this.sex = sex;
        this.hobby = hobby;
        this.job = job;
    }

    //sort order: compare by id first, then by name (both descending); records with the same id and name count as the same key
    @Override
    public int compareTo(JavaBean o) {
        if (o.id-this.id==0){
            return o.name.compareTo(this.name);
        }
        return o.id-this.id;
    }

    //Hadoop serialization: write the fields out
    @Override
    public void write(DataOutput dataOutput) throws IOException {
        dataOutput.writeInt(id);
        dataOutput.writeUTF(name);
        dataOutput.writeUTF(sex);
        dataOutput.writeUTF(hobby);
        dataOutput.writeUTF(job);
    }

    //Hadoop deserialization: read the fields in the same order they were written, otherwise the data will be garbled
    @Override
    public void readFields(DataInput dataInput) throws IOException {
        id = dataInput.readInt();
        name = dataInput.readUTF();
        sex = dataInput.readUTF();
        hobby= dataInput.readUTF();
        job = dataInput.readUTF();
    }
}
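
Because readFields must read the fields in exactly the order write wrote them, a quick local round trip is a cheap way to catch mistakes before running the job. The sketch below is purely illustrative and not part of the job; the class name JavaBeanRoundTrip and the sample values are made up:

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.IOException;

public class JavaBeanRoundTrip {
    public static void main(String[] args) throws IOException {
        JavaBean original = new JavaBean();
        original.set(1, "张三", "女", "游戏", "大数据");

        //serialize with write()
        ByteArrayOutputStream buffer = new ByteArrayOutputStream();
        original.write(new DataOutputStream(buffer));

        //deserialize with readFields() and print the copy
        JavaBean copy = new JavaBean();
        copy.readFields(new DataInputStream(new ByteArrayInputStream(buffer.toByteArray())));
        System.out.println(copy);  //should print the same fields as original.toString()
    }
}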

Write the Mapper (requirement 1)

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URI;
import java.util.ArrayList;
import java.util.List;

public class MapTest extends Mapper<LongWritable, Text,JavaBean, NullWritable> {
    List<String> list = new ArrayList<>();
    JavaBean k = new JavaBean();
    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
        //get the path of the file that was placed in the distributed cache
        URI[] urls = context.getCacheFiles();
        String path = urls[0].getPath();
        //read the cached information file and keep every line in memory
        BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(new FileInputStream(path)));
        String line;
        while ((line = bufferedReader.readLine()) != null) {
            //each line of the small table goes into the in-memory list
            list.add(line);
        }
        bufferedReader.close();
    }

    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        //split the student line on tabs
        String[] student_line = value.toString().split("\t");
        //compare each cached information row's id with the student's id and join the matching row
        for (int i = 0; i < list.size(); i++) {
            String[] informations = list.get(i).split("\t");
            if (informations[2].equals(student_line[0])) {
                k.set(Integer.parseInt(student_line[0]), student_line[1], student_line[2], informations[0], informations[1]);
            }
            }
        }
        //emit the joined record (note: if no id matched, k is emitted with stale or default values)
        context.write(k,NullWritable.get());
    }
}
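
To make the join concrete: for the student line 1 / 张三 / 女 (tab-separated in the file), setup() has already cached the information line 游戏 / 大数据 / 1. The ids match, so the mapper emits the key

JavaBean{id=1, name='张三', sex='女', hobby='游戏', job='大数据'}

with a NullWritable value.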

Write the custom Partitioner (requirement 4)

import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Partitioner;

public class PartitionTest extends Partitioner<JavaBean, NullWritable> {
    @Override
    public int getPartition(JavaBean javaBean, NullWritable nullWritable, int i) {
        //two partitions; partition numbers start from 0
        int partition_num = 1;
        //records whose sex is 男 (male) go to the first partition, everything else to the second
        if ("男".equals(javaBean.getSex())) {
            return 0;
        }
        return partition_num;
    }
}

Write the Reducer (requirement 3)

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

public class ReduceTest extends Reducer<JavaBean, NullWritable,JavaBean, IntWritable> {
    int count = 0;
    IntWritable v = new IntWritable();
    @Override
    protected void reduce(JavaBean key, Iterable<NullWritable> values, Context context) throws IOException, InterruptedException {
        //iterate over the values that share the same key
        for (NullWritable value:values){
            //add one for each of hobby/job whose value is the string "null"
            if ("null".equals(key.getHobby())){
                count++;
            }
            if ("null".equals(key.getJob())){
                count++;
            }
            v.set(count);
            context.write(key,v);
            //reset count so it does not carry over to the next record
            count=0;
        }
    }
}

Driver class

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.net.URI;


public class DriverTest {
    public static void main(String[] args) throws Exception {
        Job job = Job.getInstance(new Configuration());

        //set the Driver, Mapper and Reducer classes
        job.setJarByClass(DriverTest.class);
        job.setMapperClass(MapTest.class);
        job.setReducerClass(ReduceTest.class);

        //map output key/value types
        job.setMapOutputKeyClass(JavaBean.class);
        job.setMapOutputValueClass(NullWritable.class);

        //reduce (final) output key/value types
        job.setOutputKeyClass(JavaBean.class);
        job.setOutputValueClass(IntWritable.class);

        //use the custom partitioner
        job.setPartitionerClass(PartitionTest.class);
        //number of reduce tasks; it must be at least the number of partitions defined above
        job.setNumReduceTasks(2);
        //path of the table to cache; the file:// scheme is required and points at the local file system
        job.addCacheFile(new URI("file:///D:/MP/MPTest01/input/information.txt"));

        //input and output paths
        FileInputFormat.setInputPaths(job,"D:\\MP\\MPTest01\\input\\student.txt");
        FileOutputFormat.setOutputPath(job,new Path("D:\\MP\\MPTest01\\output"));

        boolean b = job.waitForCompletion(true);
        System.exit(b?0:1);
    }
}
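
For reference, running the Driver against the sample data should leave two files in the output directory, roughly like the sketch below (the descending order comes from compareTo in JavaBean, and the key/value separator is the default tab of the text output format). Note that D:\MP\MPTest01\output must not exist before the run, otherwise the job will refuse to start.

part-r-00000 (sex = 男):
JavaBean{id=4, name='李四', sex='男', hobby='学习', job='null'}    1
JavaBean{id=3, name='王五', sex='男', hobby='null', job='Java'}    1

part-r-00001 (sex = 女):
JavaBean{id=1, name='赵六', sex='女', hobby='游戏', job='大数据'}    0
JavaBean{id=1, name='张三', sex='女', hobby='游戏', job='大数据'}    0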