Here we use a movie-rating example to illustrate how to join two tables in MapReduce.
Sample data used in the example:
users.dat (UserID::Gender::Age::Occupation::Zip-code):
1::F::1::10::48067
2::M::56::16::70072
3::M::25::15::55117
4::M::45::7::02460
5::M::25::20::55455
6::F::50::9::55117
7::M::35::1::06810
8::M::25::12::11413
9::M::25::17::61614
10::F::35::1::95370
11::F::25::1::04093
12::M::25::12::32793
13::M::45::1::93304
14::M::35::0::60126
15::M::25::7::22903
16::F::35::0::20670
17::M::50::1::95350
18::F::18::3::95825
19::M::1::10::48073
20::M::25::14::55113
ratings.dat (UserID::MovieID::Rating::Timestamp):
1::1193::5::978300760
1::661::3::978302109
1::914::3::978301968
1::3408::4::978300275
1::2355::5::978824291
1::1197::3::978302268
1::1287::5::978302039
1::2804::5::978300719
1::594::4::978302268
1::919::4::978301368
1::595::5::978824268
1::938::4::978301752
1::2398::4::978302281
1::2918::4::978302124
1::1035::5::978301753
1::2791::4::978302188
1::2687::3::978824268
1::2018::4::978301777
1::3105::5::978301713
1::2797::4::978302039
Requirement: join the records of the two tables on the user id.
Using a reduce-side join (reduce join)
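In a reduce-side join, the mapper tags each record with the table it came from and emits the join key (the user id) as the map output key; the framework then groups both tables' records for the same user into a single reduce() call, where they are combined. As a rough illustration based on the sample data above, the mapper below emits intermediate pairs such as
("1", "UF\t1\t10\t48067")       from users.dat
("1", "R1193\t5\t978300760")    from ratings.dat
and the reducer pairs every "U" value with every "R" value of a key.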
package com.hadoop.mapreduce.join;
/**
* Reduce-side join: join users (the small table) with ratings (the large table)
*/
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
public class ReduceJoin {
static class MyMapper extends Mapper<LongWritable, Text, Text, Text> {
String fileName = "";
Text mk = new Text();
Text mv = new Text();
// We need the input file's name before map() runs, so we override setup()
@Override
protected void setup(Mapper<LongWritable, Text, Text, Text>.Context context)
throws IOException, InterruptedException {
// Get this mapper's input split and extract the file name from it
FileSplit fileSplit = (FileSplit) context.getInputSplit();
fileName = fileSplit.getPath().getName();
}
@Override
protected void map(LongWritable key, Text value, Mapper<LongWritable, Text, Text, Text>.Context context)
throws IOException, InterruptedException {
String[] splits = value.toString().split("::");
// If the file name is users.dat, the record comes from the users table; tag it with "U"
if ("users.dat".equals(fileName)) {
mk.set(splits[0]);
mv.set("U" + splits[1] + "\t" + splits[2] + "\t" + splits[3] + "\t" + splits[4]);
} else {
// Otherwise it comes from ratings.dat; tag it with "R"
mk.set(splits[0]);
mv.set("R" + splits[1] + "\t" + splits[2] + "\t" + splits[3]);
}
context.write(mk, mv);
}
}
static class MyReducer extends Reducer<Text, Text, Text, Text> {
Text rv = new Text();
@Override
protected void reduce(Text key, Iterable<Text> values, Reducer<Text, Text, Text, Text>.Context context)
throws IOException, InterruptedException {
// Two lists to hold this key's records from the two tables
List<String> ulist = new ArrayList<String>();
List<String> rlist = new ArrayList<String>();
for (Text v : values) {
String str = v.toString();
// Values starting with "U" go into ulist (user records)
if (str.startsWith("U"))
// Drop the leading tag character and keep the remaining fields
ulist.add(str.substring(1));
else
rlist.add(str.substring(1));
}
// Cross product: pair every user record with every rating record for this key
for (String u : ulist) {
for (String r : rlist) {
String res = u + "\t" + r;
rv.set(res);
context.write(key, rv);
}
}
}
}
public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
Configuration conf = new Configuration();
Job job = Job.getInstance(conf);
job.setJarByClass(ReduceJoin.class);
job.setMapperClass(MyMapper.class);
job.setReducerClass(MyReducer.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(Text.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(Text.class);
// There are two input tables, so two input paths are needed
Path inpath1 = new Path("F:\\test\\rating\\ratings.dat");
Path inpath2 = new Path("F:\\test\\user\\users.dat");
// Use FileInputFormat.setInputPaths to add both paths; the mapper tells the tables apart by file name
FileInputFormat.setInputPaths(job, inpath1, inpath2);
Path outpath = new Path("F:\\test\\testout\\");
FileSystem fs = FileSystem.get(conf);
if (fs.exists(outpath)) {
fs.delete(outpath, true);
}
FileOutputFormat.setOutputPath(job, outpath);
job.waitForCompletion(true);
}
}
Drawback: it cannot handle data skew. Every record with the same join key is sent to one reducer, so a hot key (a user with a very large number of ratings) overloads that single reduce task; a salted-key variant is sketched after the sample output below.
Partial output:
1 F 1 10 48067 1907 4 978824330
1 F 1 10 48067 783 4 978824291
1 F 1 10 48067 1836 5 978300172
1 F 1 10 48067 1022 5 978300055
1 F 1 10 48067 2762 4 978302091
1 F 1 10 48067 150 5 978301777
1 F 1 10 48067 1 5 978824268
1 F 1 10 48067 1961 5 978301590
1 F 1 10 48067 1962 4 978301753
1 F 1 10 48067 2692 4 978301570
1 F 1 10 48067 260 4 978300760
1 F 1 10 48067 1028 5 978301777
1 F 1 10 48067 1029 5 978302205
1 F 1 10 48067 1207 4 978300719
1 F 1 10 48067 2028 5 978301619
1 F 1 10 48067 531 4 978302149
1 F 1 10 48067 3114 4 978302174
1 F 1 10 48067 608 4 978301398
1 F 1 10 48067 1246 4 978302091
1 F 1 10 48067 1193 5 978300760
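The original example does not address this. For reference only, below is a minimal sketch (assuming the same imports as the ReduceJoin class above) of one common mitigation: salting the join key so that a hot user id is spread across several reducers. The class name SaltedJoinMapper, the bucket count N, and the "#" salt separator are illustrative choices, not part of the original code. Each small-table record is replicated into every salt bucket, while each rating is scattered into a random bucket; the MyReducer above can then be reused essentially unchanged (optionally stripping the salt from the output key).
static class SaltedJoinMapper extends Mapper<LongWritable, Text, Text, Text> {
    // number of salt buckets; an illustrative value
    private static final int N = 4;
    private final java.util.Random rand = new java.util.Random();
    private String fileName = "";
    private final Text mk = new Text();
    private final Text mv = new Text();
    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
        fileName = ((FileSplit) context.getInputSplit()).getPath().getName();
    }
    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        String[] splits = value.toString().split("::");
        if ("users.dat".equals(fileName)) {
            // replicate every small-table record into each salt bucket
            for (int i = 0; i < N; i++) {
                mk.set(splits[0] + "#" + i);
                mv.set("U" + splits[1] + "\t" + splits[2] + "\t" + splits[3] + "\t" + splits[4]);
                context.write(mk, mv);
            }
        } else {
            // scatter big-table records randomly across the salt buckets
            mk.set(splits[0] + "#" + rand.nextInt(N));
            mv.set("R" + splits[1] + "\t" + splits[2] + "\t" + splits[3]);
            context.write(mk, mv);
        }
    }
}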
Using a map-side join (map join, recommended)
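In a map-side join, the small users table is shipped to every map task through the distributed cache and loaded into an in-memory HashMap in setup(); each ratings record is then joined in map() by a simple lookup, so no shuffle or reduce phase is needed. With the sample users.dat above, the map built in setup() looks roughly like {"1" -> "F\t1\t10\t48067", "2" -> "M\t56\t16\t70072", ...}.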
package com.hadoop.mapreduce.join;
/**
* Map-side join: join users (the small table) with ratings (the large table)
*/
import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.HashMap;
import java.util.Map;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
public class MapJoin {
static class MyMapper extends Mapper<LongWritable, Text, Text, Text> {
Map<String, String> map = new HashMap<String, String>();
Text mk = new Text();
Text mv = new Text();
// The small table can be processed directly in the mapper's setup() phase
@Override
protected void setup(Mapper<LongWritable, Text, Text, Text>.Context context)
throws IOException, InterruptedException {
// Get the files shipped to this task through the distributed cache
Path[] files = context.getLocalCacheFiles();
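// Note: getLocalCacheFiles() is deprecated in newer Hadoop releases; reading the file
// through the symlink created in the task's working directory is a possible alternative (not used here)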
// We only ship one cache file, so the first entry is the users table
String localFile = files[0].toString();
// Read the cached users file line by line with a buffered reader
BufferedReader br = new BufferedReader(new FileReader(localFile));
String lines = null;
while ((lines = br.readLine()) != null) {
String[] splits = lines.split("::");
map.put(splits[0], splits[1] + "\t" + splits[2] + "\t" + splits[3] + "\t" + splits[4]);
}
// No context.write here; the user records only need to be kept in the in-memory map
br.close();
}
@Override
protected void map(LongWritable key, Text value, Mapper<LongWritable, Text, Text, Text>.Context context)
throws IOException, InterruptedException {
String[] splits = value.toString().split("::");
// The first field (the user id) is the join key
String joinKey = splits[0];
// If the in-memory map contains this key, the rating has a matching user record
if (map.containsKey(joinKey)) {
// Emit the join key
mk.set(joinKey);
// map.get(joinKey) returns the user fields cached in setup(); append the remaining rating fields to them
mv.set(map.get(joinKey) + "\t" + splits[1] + "\t" + splits[2] + "\t" + splits[3]);
context.write(mk, mv);
}
}
}
public static void main(String[] args)
throws IOException, ClassNotFoundException, InterruptedException, URISyntaxException {
Configuration conf = new Configuration();
Job job = Job.getInstance(conf);
job.setJarByClass(MapJoin.class);
job.setMapperClass(MyMapper.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(Text.class);
/*
* Explicitly set the number of reduce tasks to 0 (the default is 1).
* Even though no reducer is written, the job would otherwise still run the
* default reducer, which does nothing useful and only slows the job down.
*/
job.setNumReduceTasks(0);
// Pass the path of the cache file, i.e. the users table (args[0]); note this only works when the job is packaged as a jar and run on the cluster
job.addCacheFile(new URI(args[0]));
Path inpath = new Path(args[1]);
FileInputFormat.setInputPaths(job, inpath);
Path outpath = new Path(args[2]);
FileSystem fs = FileSystem.get(conf);
if (fs.exists(outpath)) {
fs.delete(outpath, true);
}
FileOutputFormat.setOutputPath(job, outpath);
job.waitForCompletion(true);
}
}
Package the program into a jar and run it; the three arguments are the users.dat path (the cache file), the ratings input path, and the output path:
hadoop jar mapjoin.jar com.hadoop.mapreduce.join.MapJoin /users/users.dat /ratings /mapjoin_out01
Advantage: the join happens entirely on the map side, so there is no shuffle of join keys and data skew is avoided.
Drawback: not suitable for joining two large tables, because the cached table must be small enough to fit in each map task's memory.
Partial output:
1 F 1 10 48067 1193 5 978300760
1 F 1 10 48067 661 3 978302109
1 F 1 10 48067 914 3 978301968
1 F 1 10 48067 3408 4 978300275
1 F 1 10 48067 2355 5 978824291
1 F 1 10 48067 1197 3 978302268
1 F 1 10 48067 1287 5 978302039
1 F 1 10 48067 2804 5 978300719
1 F 1 10 48067 594 4 978302268
1 F 1 10 48067 919 4 978301368
1 F 1 10 48067 595 5 978824268
1 F 1 10 48067 938 4 978301752
1 F 1 10 48067 2398 4 978302281
1 F 1 10 48067 2918 4 978302124
1 F 1 10 48067 1035 5 978301753
1 F 1 10 48067 2791 4 978302188
1 F 1 10 48067 2687 3 978824268
1 F 1 10 48067 2018 4 978301777
1 F 1 10 48067 3105 5 978301713
1 F 1 10 48067 2797 4 978302039