While working through Hadoop big-data exercises, I came across custom partitioning: by extending the Partitioner class, the map output can be divided by key across different reducers for aggregation. By default there is only one partition; defining your own Partitioner lets you route keys to several partitions, which is quick and convenient. Note that a job with a custom partitioner must be submitted with hadoop jar. The example below shows how to use one.
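For reference, org.apache.hadoop.mapreduce.Partitioner is an abstract class with a single method that maps each map-output key/value pair to a reducer index (shown here simplified):

public abstract class Partitioner<KEY, VALUE>
{
    // must return a value in the range [0, numPartitions)
    public abstract int getPartition(KEY key, VALUE value, int numPartitions);
}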
Suppose we have a sort.txt file in which each line holds a length and a width. The records should be sorted by area, so a custom sort is needed as well; after that, the custom partitioner sends squares and non-square rectangles to different reducers for aggregation.
First, the sort.txt file, with the two fields separated by a tab:
1 1
9 9
4 5
7 8
Next, the custom key type, which also defines the custom sort:
class RectangleWritable implements WritableComparable<RectangleWritable> // the custom key type
{
    int length, width;

    public RectangleWritable()
    {
        super();
    }

    public RectangleWritable(int length, int width)
    {
        super();
        this.length = length;
        this.width = width;
    }

    public int getLength()
    {
        return length;
    }

    public int getWidth()
    {
        return width;
    }

    public void setLength(int length)
    {
        this.length = length;
    }

    public void setWidth(int width)
    {
        this.width = width;
    }

    public int compareTo(RectangleWritable o) // the custom sort rule:
    { // order rectangles by their area, i.e. length * width
        if (this.getLength() * this.getWidth() > o.getLength() * o.getWidth())
            return 1;
        else if (this.getLength() * this.getWidth() < o.getLength() * o.getWidth())
            return -1;
        else
            return 0;
    }

    public void write(DataOutput out) throws IOException // serialization: length and width must be
    { // written and read back in the same order
        out.writeInt(length);
        out.writeInt(width);
    }

    public void readFields(DataInput in) throws IOException // deserialization
    {
        this.length = in.readInt();
        this.width = in.readInt();
    }
}
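A quick way to sanity-check the write/readFields pair is a round trip through a byte buffer. This small test class is a sketch, not part of the job, and assumes it sits in the same package as RectangleWritable:

import java.io.*;

public class RoundTripCheck
{
    public static void main(String[] args) throws IOException
    {
        ByteArrayOutputStream buffer = new ByteArrayOutputStream();
        new RectangleWritable(4, 5).write(new DataOutputStream(buffer)); // serialize

        RectangleWritable copy = new RectangleWritable();
        copy.readFields(new DataInputStream(
                new ByteArrayInputStream(buffer.toByteArray()))); // deserialize

        System.out.println(copy.getLength() + " x " + copy.getWidth()); // prints "4 x 5"
    }
}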
Then the map code:
public static class MyMapper extends Mapper<LongWritable, Text, RectangleWritable, NullWritable>
{
    public void map(LongWritable k1, Text v1, Context context) throws IOException, InterruptedException
    {
        String[] splits = v1.toString().split("\t");
        RectangleWritable k2 = new RectangleWritable(Integer.parseInt(splits[0]), Integer.parseInt(splits[1]));
        context.write(k2, NullWritable.get());
    }
}
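This map assumes every line holds exactly two tab-separated integers. If the input might contain blank or malformed lines, a guarded variant (a sketch, not the original code) could simply skip them:

public void map(LongWritable k1, Text v1, Context context) throws IOException, InterruptedException
{
    String[] splits = v1.toString().split("\t");
    if (splits.length < 2)
        return; // skip blank or malformed lines
    context.write(new RectangleWritable(Integer.parseInt(splits[0].trim()),
            Integer.parseInt(splits[1].trim())), NullWritable.get());
}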
Once the map phase is done, the custom partitioner routes squares to the part-r-00000 output file and non-square rectangles to part-r-00001, i.e. it creates two separate partitions:
class MyPartitioner extends Partitioner<RectangleWritable, NullWritable>
{
    public int getPartition(RectangleWritable k2, NullWritable v2,
                            int numReduceTasks) // the first two arguments are the key and the value
    {
        if (k2.getLength() == k2.getWidth()) // squares are aggregated by reduce task 0
            return 0;
        else // non-square rectangles by reduce task 1
            return 1;
    }
}
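One caveat: getPartition must return an index in the range [0, numReduceTasks). If this job were ever run with a single reducer, the hard-coded return 1 would be out of range, so a defensive variant (a sketch, not in the original) can take the modulus:

public int getPartition(RectangleWritable k2, NullWritable v2, int numReduceTasks)
{
    int partition = (k2.getLength() == k2.getWidth()) ? 0 : 1;
    return partition % numReduceTasks; // stays valid even when numReduceTasks == 1
}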
Next, the reducer, which simply writes each key's length and width. One subtlety: because compareTo compares by area, two different shapes with the same area (say 4x5 and 2x10) would count as the same key and collapse into a single reduce call, so only one of them would be written out; the sample data happens to avoid this.
public static class MyReducer extends Reducer<RectangleWritable, NullWritable, IntWritable, IntWritable>
{
    public void reduce(RectangleWritable k2, Iterable<NullWritable> v2s, Context context) throws IOException, InterruptedException
    {
        context.write(new IntWritable(k2.getLength()), new IntWritable(k2.getWidth()));
    }
}
Finally, the driver code, where the custom partitioner is wired into the job:
public static void main(String[] args) throws Exception
{
    //String uri = args[0];
    Configuration conf = new Configuration();
    /*FileSystem fs = FileSystem.get(new URI(uri), conf);
    if(fs.exists(new Path(args[1]))) // if the output path already exists, delete it recursively
        fs.delete(new Path(args[1]), true);*/
    Job job = Job.getInstance(conf, "RectangleSort");
    job.setJarByClass(RectangleSort.class);
    job.setMapperClass(RectangleSort.MyMapper.class);
    job.setMapOutputKeyClass(RectangleWritable.class);
    job.setMapOutputValueClass(NullWritable.class);
    job.setReducerClass(RectangleSort.MyReducer.class);
    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(IntWritable.class);
    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));
    job.setPartitionerClass(MyPartitioner.class); // plug in the custom partitioner for the map output
    job.setNumReduceTasks(2); // two reduce tasks, one per partition
    boolean b = job.waitForCompletion(true);
    if (!b)
    {
        System.err.println("failed");
    }
    else
        System.out.println("finished!");
}
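The commented-out block above hints at deleting an existing output directory before the job starts (the job fails if the output path already exists). Enabled, it could look like this sketch, which needs org.apache.hadoop.fs.FileSystem imported:

FileSystem fs = FileSystem.get(conf);
Path outPath = new Path(args[1]);
if (fs.exists(outPath))
    fs.delete(outPath, true); // recursively remove the old output directory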
Note that a job with a custom partitioner must be submitted with hadoop jar.
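For example (the jar name and HDFS paths here are illustrative):

hadoop jar rectanglesort.jar com.sunwangdong.hadoop.RectangleSort /data/rectangle/sort.txt /data/rectangle/out

After the job finishes, the output directory contains: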
sunwangdongMacBook-Pro:sbin sunwangdong$ hdfs dfs -ls /data/rectangle/out
17/07/15 14:34:10 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Found 3 items
-rw-r--r-- 1 sunwangdong supergroup 0 2017-07-15 14:03 /data/rectangle/out/_SUCCESS
-rw-r--r-- 1 sunwangdong supergroup 8 2017-07-15 14:03 /data/rectangle/out/part-r-00000
-rw-r--r-- 1 sunwangdong supergroup 8 2017-07-15 14:03 /data/rectangle/out/part-r-00001
Two partition files, part-r-00000 and part-r-00001, have been produced, holding the squares and the non-square rectangles respectively.
sunwangdongMacBook-Pro:sbin sunwangdong$ hdfs dfs -cat /data/rectangle/out/p*0
17/07/15 14:35:12 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
1 1
9 9
part-r-00000 holds the squares, while part-r-00001 holds the non-square rectangles; within each file the rows appear in ascending area order, courtesy of the custom compareTo:
sunwangdongMacBook-Pro:sbin sunwangdong$ hdfs dfs -cat /data/rectangle/out/p*1
17/07/15 14:35:56 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
4 5
7 8
The complete code:
package com.sunwangdong.hadoop;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.*;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Partitioner;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

/**
 * Created by sunwangdong on 2017/7/2.
 */
public class RectangleSort
{
    public static class MyMapper extends Mapper<LongWritable, Text, RectangleWritable, NullWritable>
    {
        public void map(LongWritable k1, Text v1, Context context) throws IOException, InterruptedException
        {
            String[] splits = v1.toString().split("\t");
            RectangleWritable k2 = new RectangleWritable(Integer.parseInt(splits[0]), Integer.parseInt(splits[1]));
            context.write(k2, NullWritable.get());
        }
    }

    public static class MyReducer extends Reducer<RectangleWritable, NullWritable, IntWritable, IntWritable>
    {
        public void reduce(RectangleWritable k2, Iterable<NullWritable> v2s, Context context) throws IOException, InterruptedException
        {
            context.write(new IntWritable(k2.getLength()), new IntWritable(k2.getWidth()));
        }
    }

    public static void main(String[] args) throws Exception
    {
        //String uri = args[0];
        Configuration conf = new Configuration();
        /*FileSystem fs = FileSystem.get(new URI(uri), conf);
        if(fs.exists(new Path(args[1]))) // if the output path already exists, delete it recursively
            fs.delete(new Path(args[1]), true);*/
        Job job = Job.getInstance(conf, "RectangleSort");
        job.setJarByClass(RectangleSort.class);
        job.setMapperClass(RectangleSort.MyMapper.class);
        job.setMapOutputKeyClass(RectangleWritable.class);
        job.setMapOutputValueClass(NullWritable.class);
        job.setReducerClass(RectangleSort.MyReducer.class);
        job.setOutputKeyClass(IntWritable.class);
        job.setOutputValueClass(IntWritable.class);
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        job.setPartitionerClass(MyPartitioner.class); // plug in the custom partitioner for the map output
        job.setNumReduceTasks(2); // two reduce tasks, one per partition
        boolean b = job.waitForCompletion(true);
        if (!b)
        {
            System.err.println("failed");
        }
        else
            System.out.println("finished!");
    }
}
class RectangleWritable implements WritableComparable<RectangleWritable>
{
    int length, width;

    public RectangleWritable()
    {
        super();
    }

    public RectangleWritable(int length, int width)
    {
        super();
        this.length = length;
        this.width = width;
    }

    public int getLength()
    {
        return length;
    }

    public int getWidth()
    {
        return width;
    }

    public void setLength(int length)
    {
        this.length = length;
    }

    public void setWidth(int width)
    {
        this.width = width;
    }

    public int compareTo(RectangleWritable o)
    { // order rectangles by their area, i.e. length * width
        if (this.getLength() * this.getWidth() > o.getLength() * o.getWidth())
            return 1;
        else if (this.getLength() * this.getWidth() < o.getLength() * o.getWidth())
            return -1;
        else
            return 0;
    }

    public void write(DataOutput out) throws IOException // serialization: length and width must be
    { // written and read back in the same order
        out.writeInt(length);
        out.writeInt(width);
    }

    public void readFields(DataInput in) throws IOException // deserialization
    {
        this.length = in.readInt();
        this.width = in.readInt();
    }
}
class MyPartitioner extends Partitioner<RectangleWritable, NullWritable>
{
    public int getPartition(RectangleWritable k2, NullWritable v2,
                            int numReduceTasks) // the first two arguments are the key and the value
    {
        if (k2.getLength() == k2.getWidth()) // squares are aggregated by reduce task 0
            return 0;
        else // non-square rectangles by reduce task 1
            return 1;
    }
}