While working through Hadoop big-data exercises, I came across custom partitioning: by extending the Partitioner class, the map output can be divided by key across different reducers for aggregation. By default there is only one partition; defining your own Partitioner lets you route keys to several partitions, which is quick and convenient. Note that a job with a custom partitioner must be submitted with hadoop jar. The example below shows how to use one.
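For reference, org.apache.hadoop.mapreduce.Partitioner is an abstract class with a single method that maps each map-output key/value pair to a reducer index (shown here simplified):

public abstract class Partitioner<KEY, VALUE>
{
    // must return a value in the range [0, numPartitions)
    public abstract int getPartition(KEY key, VALUE value, int numPartitions);
}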
Suppose we have a sort.txt file in which each line holds a length and a width. The records should be sorted by area, so a custom sort is needed as well; after that, the custom partitioner sends squares and non-square rectangles to different reducers for aggregation.
First, the sort.txt file, with the two fields separated by a tab:
1 1
9 9
4 5
7 8
Next, the custom key type, which also defines the custom sort:
class RectangleWritable implements WritableComparable<RectangleWritable> // the custom key type
{
    int length, width;

    public RectangleWritable()
    {
        super();
    }

    public RectangleWritable(int length, int width)
    {
        super();
        this.length = length;
        this.width = width;
    }

    public int getLength()
    {
        return length;
    }

    public int getWidth()
    {
        return width;
    }

    public void setLength(int length)
    {
        this.length = length;
    }

    public void setWidth(int width)
    {
        this.width = width;
    }

    public int compareTo(RectangleWritable o) // the custom sort rule:
    { // order rectangles by their area, i.e. length * width
        if (this.getLength() * this.getWidth() > o.getLength() * o.getWidth())
            return 1;
        else if (this.getLength() * this.getWidth() < o.getLength() * o.getWidth())
            return -1;
        else
            return 0;
    }

    public void write(DataOutput out) throws IOException // serialization: length and width must be
    { // written and read back in the same order
        out.writeInt(length);
        out.writeInt(width);
    }

    public void readFields(DataInput in) throws IOException // deserialization
    {
        this.length = in.readInt();
        this.width = in.readInt();
    }
}
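A quick way to sanity-check the write/readFields pair is a round trip through a byte buffer. This small test class is a sketch, not part of the job, and assumes it sits in the same package as RectangleWritable:

import java.io.*;

public class RoundTripCheck
{
    public static void main(String[] args) throws IOException
    {
        ByteArrayOutputStream buffer = new ByteArrayOutputStream();
        new RectangleWritable(4, 5).write(new DataOutputStream(buffer)); // serialize

        RectangleWritable copy = new RectangleWritable();
        copy.readFields(new DataInputStream(
                new ByteArrayInputStream(buffer.toByteArray()))); // deserialize

        System.out.println(copy.getLength() + " x " + copy.getWidth()); // prints "4 x 5"
    }
}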
Then the map code:
public static class MyMapper extends Mapper<LongWritable, Text, RectangleWritable, NullWritable>
{
    public void map(LongWritable k1, Text v1, Context context) throws IOException, InterruptedException
    {
        String[] splits = v1.toString().split("\t");
        RectangleWritable k2 = new RectangleWritable(Integer.parseInt(splits[0]), Integer.parseInt(splits[1]));
        context.write(k2, NullWritable.get());
    }
}
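This map assumes every line holds exactly two tab-separated integers. If the input might contain blank or malformed lines, a guarded variant (a sketch, not the original code) could simply skip them:

public void map(LongWritable k1, Text v1, Context context) throws IOException, InterruptedException
{
    String[] splits = v1.toString().split("\t");
    if (splits.length < 2)
        return; // skip blank or malformed lines
    context.write(new RectangleWritable(Integer.parseInt(splits[0].trim()),
            Integer.parseInt(splits[1].trim())), NullWritable.get());
}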
Once the map phase is done, the custom partitioner routes squares to the part-r-00000 output file and non-square rectangles to part-r-00001, i.e. it creates two separate partitions:
class MyPartitioner extends Partitioner<RectangleWritable, NullWritable>
{
    public int getPartition(RectangleWritable k2, NullWritable v2,
                            int numReduceTasks) // the first two arguments are the key and the value
    {
        if (k2.getLength() == k2.getWidth()) // squares are aggregated by reduce task 0
            return 0;
        else // non-square rectangles by reduce task 1
            return 1;
    }
}
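One caveat: getPartition must return an index in the range [0, numReduceTasks). If this job were ever run with a single reducer, the hard-coded return 1 would be out of range, so a defensive variant (a sketch, not in the original) can take the modulus:

public int getPartition(RectangleWritable k2, NullWritable v2, int numReduceTasks)
{
    int partition = (k2.getLength() == k2.getWidth()) ? 0 : 1;
    return partition % numReduceTasks; // stays valid even when numReduceTasks == 1
}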
Next, the reducer, which simply writes each key's length and width. One subtlety: because compareTo compares by area, two different shapes with the same area (say 4x5 and 2x10) would count as the same key and collapse into a single reduce call, so only one of them would be written out; the sample data happens to avoid this.
public static class MyReducer extends Reducer<RectangleWritable, NullWritable, IntWritable, IntWritable>
{
    public void reduce(RectangleWritable k2, Iterable<NullWritable> v2s, Context context) throws IOException, InterruptedException
    {
        context.write(new IntWritable(k2.getLength()), new IntWritable(k2.getWidth()));
    }
}
Finally, the driver code, where the custom partitioner is wired into the job:
public static void main(String[] args) throws Exception
{
    //String uri = args[0];
    Configuration conf = new Configuration();
    /*FileSystem fs = FileSystem.get(new URI(uri), conf);
    if(fs.exists(new Path(args[1]))) // if the output path already exists, delete it recursively
        fs.delete(new Path(args[1]), true);*/
    Job job = Job.getInstance(conf, "RectangleSort");
    job.setJarByClass(RectangleSort.class);
    job.setMapperClass(RectangleSort.MyMapper.class);
    job.setMapOutputKeyClass(RectangleWritable.class);
    job.setMapOutputValueClass(NullWritable.class);
    job.setReducerClass(RectangleSort.MyReducer.class);
    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(IntWritable.class);
    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));
    job.setPartitionerClass(MyPartitioner.class); // plug in the custom partitioner for the map output
    job.setNumReduceTasks(2); // two reduce tasks, one per partition
    boolean b = job.waitForCompletion(true);
    if (!b)
    {
        System.err.println("failed");
    }
    else
        System.out.println("finished!");
}
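The commented-out block above hints at deleting an existing output directory before the job starts (the job fails if the output path already exists). Enabled, it could look like this sketch, which needs org.apache.hadoop.fs.FileSystem imported:

FileSystem fs = FileSystem.get(conf);
Path outPath = new Path(args[1]);
if (fs.exists(outPath))
    fs.delete(outPath, true); // recursively remove the old output directory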
Note that a job with a custom partitioner must be submitted with hadoop jar.
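For example (the jar name and HDFS paths here are illustrative):

hadoop jar rectanglesort.jar com.sunwangdong.hadoop.RectangleSort /data/rectangle/sort.txt /data/rectangle/out

After the job finishes, the output directory contains: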
sunwangdongMacBook-Pro:sbin sunwangdong$ hdfs dfs -ls /data/rectangle/out
17/07/15 14:34:10 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Found 3 items
-rw-r--r-- 1 sunwangdong supergroup 0 2017-07-15 14:03 /data/rectangle/out/_SUCCESS
-rw-r--r-- 1 sunwangdong supergroup 8 2017-07-15 14:03 /data/rectangle/out/part-r-00000
-rw-r--r-- 1 sunwangdong supergroup 8 2017-07-15 14:03 /data/rectangle/out/part-r-00001
Two partition files, part-r-00000 and part-r-00001, have been produced, holding the squares and the non-square rectangles respectively.
sunwangdongMacBook-Pro:sbin sunwangdong$ hdfs dfs -cat /data/rectangle/out/p*0
17/07/15 14:35:12 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
1 1
9 9
part-r-00000 holds the squares, while part-r-00001 holds the non-square rectangles; within each file the rows appear in ascending area order, courtesy of the custom compareTo:
sunwangdongMacBook-Pro:sbin sunwangdong$ hdfs dfs -cat /data/rectangle/out/p*1
17/07/15 14:35:56 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
4 5
7 8
The complete code:
package com.sunwangdong.hadoop;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.*;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Partitioner;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

/**
 * Created by sunwangdong on 2017/7/2.
 */
public class RectangleSort
{
    public static class MyMapper extends Mapper<LongWritable, Text, RectangleWritable, NullWritable>
    {
        public void map(LongWritable k1, Text v1, Context context) throws IOException, InterruptedException
        {
            String[] splits = v1.toString().split("\t");
            RectangleWritable k2 = new RectangleWritable(Integer.parseInt(splits[0]), Integer.parseInt(splits[1]));
            context.write(k2, NullWritable.get());
        }
    }

    public static class MyReducer extends Reducer<RectangleWritable, NullWritable, IntWritable, IntWritable>
    {
        public void reduce(RectangleWritable k2, Iterable<NullWritable> v2s, Context context) throws IOException, InterruptedException
        {
            context.write(new IntWritable(k2.getLength()), new IntWritable(k2.getWidth()));
        }
    }

    public static void main(String[] args) throws Exception
    {
        //String uri = args[0];
        Configuration conf = new Configuration();
        /*FileSystem fs = FileSystem.get(new URI(uri), conf);
        if(fs.exists(new Path(args[1]))) // if the output path already exists, delete it recursively
            fs.delete(new Path(args[1]), true);*/
        Job job = Job.getInstance(conf, "RectangleSort");
        job.setJarByClass(RectangleSort.class);
        job.setMapperClass(RectangleSort.MyMapper.class);
        job.setMapOutputKeyClass(RectangleWritable.class);
        job.setMapOutputValueClass(NullWritable.class);
        job.setReducerClass(RectangleSort.MyReducer.class);
        job.setOutputKeyClass(IntWritable.class);
        job.setOutputValueClass(IntWritable.class);
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        job.setPartitionerClass(MyPartitioner.class); // plug in the custom partitioner for the map output
        job.setNumReduceTasks(2); // two reduce tasks, one per partition
        boolean b = job.waitForCompletion(true);
        if (!b)
        {
            System.err.println("failed");
        }
        else
            System.out.println("finished!");
    }
}
class RectangleWritable implements WritableComparable<RectangleWritable>
{
    int length, width;

    public RectangleWritable()
    {
        super();
    }

    public RectangleWritable(int length, int width)
    {
        super();
        this.length = length;
        this.width = width;
    }

    public int getLength()
    {
        return length;
    }

    public int getWidth()
    {
        return width;
    }

    public void setLength(int length)
    {
        this.length = length;
    }

    public void setWidth(int width)
    {
        this.width = width;
    }

    public int compareTo(RectangleWritable o)
    { // order rectangles by their area, i.e. length * width
        if (this.getLength() * this.getWidth() > o.getLength() * o.getWidth())
            return 1;
        else if (this.getLength() * this.getWidth() < o.getLength() * o.getWidth())
            return -1;
        else
            return 0;
    }

    public void write(DataOutput out) throws IOException // serialization: length and width must be
    { // written and read back in the same order
        out.writeInt(length);
        out.writeInt(width);
    }

    public void readFields(DataInput in) throws IOException // deserialization
    {
        this.length = in.readInt();
        this.width = in.readInt();
    }
}
class MyPartitioner extends Partitioner<RectangleWritable, NullWritable>
{
    public int getPartition(RectangleWritable k2, NullWritable v2,
                            int numReduceTasks) // the first two arguments are the key and the value
    {
        if (k2.getLength() == k2.getWidth()) // squares are aggregated by reduce task 0
            return 0;
        else // non-square rectangles by reduce task 1
            return 1;
    }
}