Here we use a movie-rating example to illustrate how to join two tables in MapReduce.
Sample data used in the example:
users.dat (UserID::Gender::Age::Occupation::Zip-code):
1::F::1::10::48067
2::M::56::16::70072
3::M::25::15::55117
4::M::45::7::02460
5::M::25::20::55455
6::F::50::9::55117
7::M::35::1::06810
8::M::25::12::11413
9::M::25::17::61614
10::F::35::1::95370
11::F::25::1::04093
12::M::25::12::32793
13::M::45::1::93304
14::M::35::0::60126
15::M::25::7::22903
16::F::35::0::20670
17::M::50::1::95350
18::F::18::3::95825
19::M::1::10::48073
20::M::25::14::55113
ratings.dat (UserID::MovieID::Rating::Timestamp):
1::1193::5::978300760
1::661::3::978302109
1::914::3::978301968
1::3408::4::978300275
1::2355::5::978824291
1::1197::3::978302268
1::1287::5::978302039
1::2804::5::978300719
1::594::4::978302268
1::919::4::978301368
1::595::5::978824268
1::938::4::978301752
1::2398::4::978302281
1::2918::4::978302124
1::1035::5::978301753
1::2791::4::978302188
1::2687::3::978824268
1::2018::4::978301777
1::3105::5::978301713
1::2797::4::978302039
Requirement: join the records of the two tables on the user id.
Using a reduce-side join (reduce join)
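In a reduce-side join, the mapper tags each record with the table it came from and emits the join key (the user id) as the map output key; the framework then groups both tables' records for the same user into a single reduce() call, where they are combined. As a rough illustration based on the sample data above, the mapper below emits intermediate pairs such as
("1", "UF\t1\t10\t48067")       from users.dat
("1", "R1193\t5\t978300760")    from ratings.dat
and the reducer pairs every "U" value with every "R" value of a key.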
package com.hadoop.mapreduce.join;
/**
* Reduce-side join: join users (the small table) with ratings (the large table)
*/
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
public class ReduceJoin {
static class MyMapper extends Mapper<LongWritable, Text, Text, Text> {
String fileName = "";
Text mk = new Text();
Text mv = new Text();
// We need the input file's name before map() runs, so we override setup()
@Override
protected void setup(Mapper<LongWritable, Text, Text, Text>.Context context)
throws IOException, InterruptedException {
// Get this mapper's input split and extract the file name from it
FileSplit fileSplit = (FileSplit) context.getInputSplit();
fileName = fileSplit.getPath().getName();
}
@Override
protected void map(LongWritable key, Text value, Mapper<LongWritable, Text, Text, Text>.Context context)
throws IOException, InterruptedException {
String[] splits = value.toString().split("::");
// If the file name is users.dat, the record comes from the users table; tag it with "U"
if ("users.dat".equals(fileName)) {
mk.set(splits[0]);
mv.set("U" + splits[1] + "\t" + splits[2] + "\t" + splits[3] + "\t" + splits[4]);
} else {
// Otherwise it comes from ratings.dat; tag it with "R"
mk.set(splits[0]);
mv.set("R" + splits[1] + "\t" + splits[2] + "\t" + splits[3]);
}
context.write(mk, mv);
}
}
static class MyReducer extends Reducer<Text, Text, Text, Text> {
Text rv = new Text();
@Override
protected void reduce(Text key, Iterable<Text> values, Reducer<Text, Text, Text, Text>.Context context)
throws IOException, InterruptedException {
// Two lists to hold this key's records from the two tables
List<String> ulist = new ArrayList<String>();
List<String> rlist = new ArrayList<String>();
for (Text v : values) {
String str = v.toString();
// Values starting with "U" go into ulist (user records)
if (str.startsWith("U"))
// Drop the leading tag character and keep the remaining fields
ulist.add(str.substring(1));
else
rlist.add(str.substring(1));
}
// Cross product: pair every user record with every rating record for this key
for (String u : ulist) {
for (String r : rlist) {
String res = u + "\t" + r;
rv.set(res);
context.write(key, rv);
}
}
}
}
public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
Configuration conf = new Configuration();
Job job = Job.getInstance(conf);
job.setJarByClass(ReduceJoin.class);
job.setMapperClass(MyMapper.class);
job.setReducerClass(MyReducer.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(Text.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(Text.class);
// There are two input tables, so two input paths are needed
Path inpath1 = new Path("F:\\test\\rating\\ratings.dat");
Path inpath2 = new Path("F:\\test\\user\\users.dat");
// Use FileInputFormat.setInputPaths to add both paths; the mapper tells the tables apart by file name
FileInputFormat.setInputPaths(job, inpath1, inpath2);
Path outpath = new Path("F:\\test\\testout\\");
FileSystem fs = FileSystem.get(conf);
if (fs.exists(outpath)) {
fs.delete(outpath, true);
}
FileOutputFormat.setOutputPath(job, outpath);
job.waitForCompletion(true);
}
}
Drawback: it cannot handle data skew. Every record with the same join key is sent to one reducer, so a hot key (a user with a very large number of ratings) overloads that single reduce task; a salted-key variant is sketched after the sample output below.
Partial output:
1 F 1 10 48067 1907 4 978824330
1 F 1 10 48067 783 4 978824291
1 F 1 10 48067 1836 5 978300172
1 F 1 10 48067 1022 5 978300055
1 F 1 10 48067 2762 4 978302091
1 F 1 10 48067 150 5 978301777
1 F 1 10 48067 1 5 978824268
1 F 1 10 48067 1961 5 978301590
1 F 1 10 48067 1962 4 978301753
1 F 1 10 48067 2692 4 978301570
1 F 1 10 48067 260 4 978300760
1 F 1 10 48067 1028 5 978301777
1 F 1 10 48067 1029 5 978302205
1 F 1 10 48067 1207 4 978300719
1 F 1 10 48067 2028 5 978301619
1 F 1 10 48067 531 4 978302149
1 F 1 10 48067 3114 4 978302174
1 F 1 10 48067 608 4 978301398
1 F 1 10 48067 1246 4 978302091
1 F 1 10 48067 1193 5 978300760
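The original example does not address this. For reference only, below is a minimal sketch (assuming the same imports as the ReduceJoin class above) of one common mitigation: salting the join key so that a hot user id is spread across several reducers. The class name SaltedJoinMapper, the bucket count N, and the "#" salt separator are illustrative choices, not part of the original code. Each small-table record is replicated into every salt bucket, while each rating is scattered into a random bucket; the MyReducer above can then be reused essentially unchanged (optionally stripping the salt from the output key).
static class SaltedJoinMapper extends Mapper<LongWritable, Text, Text, Text> {
    // number of salt buckets; an illustrative value
    private static final int N = 4;
    private final java.util.Random rand = new java.util.Random();
    private String fileName = "";
    private final Text mk = new Text();
    private final Text mv = new Text();
    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
        fileName = ((FileSplit) context.getInputSplit()).getPath().getName();
    }
    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        String[] splits = value.toString().split("::");
        if ("users.dat".equals(fileName)) {
            // replicate every small-table record into each salt bucket
            for (int i = 0; i < N; i++) {
                mk.set(splits[0] + "#" + i);
                mv.set("U" + splits[1] + "\t" + splits[2] + "\t" + splits[3] + "\t" + splits[4]);
                context.write(mk, mv);
            }
        } else {
            // scatter big-table records randomly across the salt buckets
            mk.set(splits[0] + "#" + rand.nextInt(N));
            mv.set("R" + splits[1] + "\t" + splits[2] + "\t" + splits[3]);
            context.write(mk, mv);
        }
    }
}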
Using a map-side join (map join, recommended)
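In a map-side join, the small users table is shipped to every map task through the distributed cache and loaded into an in-memory HashMap in setup(); each ratings record is then joined in map() by a simple lookup, so no shuffle or reduce phase is needed. With the sample users.dat above, the map built in setup() looks roughly like {"1" -> "F\t1\t10\t48067", "2" -> "M\t56\t16\t70072", ...}.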
package com.hadoop.mapreduce.join;
/**
* Map-side join: join users (the small table) with ratings (the large table)
*/
import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.HashMap;
import java.util.Map;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
public class MapJoin {
static class MyMapper extends Mapper<LongWritable, Text, Text, Text> {
Map<String, String> map = new HashMap<String, String>();
Text mk = new Text();
Text mv = new Text();
// The small table can be processed directly in the mapper's setup() phase
@Override
protected void setup(Mapper<LongWritable, Text, Text, Text>.Context context)
throws IOException, InterruptedException {
// Get the files shipped to this task through the distributed cache
Path[] files = context.getLocalCacheFiles();
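// Note: getLocalCacheFiles() is deprecated in newer Hadoop releases; reading the file
// through the symlink created in the task's working directory is a possible alternative (not used here)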
// We only ship one cache file, so the first entry is the users table
String localFile = files[0].toString();
// Read the cached users file line by line with a buffered reader
BufferedReader br = new BufferedReader(new FileReader(localFile));
String lines = null;
while ((lines = br.readLine()) != null) {
String[] splits = lines.split("::");
map.put(splits[0], splits[1] + "\t" + splits[2] + "\t" + splits[3] + "\t" + splits[4]);
}
// No context.write here; the user records only need to be kept in the in-memory map
br.close();
}
@Override
protected void map(LongWritable key, Text value, Mapper<LongWritable, Text, Text, Text>.Context context)
throws IOException, InterruptedException {
String[] splits = value.toString().split("::");
// The first field (the user id) is the join key
String joinKey = splits[0];
// If the in-memory map contains this key, the rating has a matching user record
if (map.containsKey(joinKey)) {
// Emit the join key
mk.set(joinKey);
// map.get(joinKey) returns the user fields cached in setup(); append the remaining rating fields to them
mv.set(map.get(joinKey) + "\t" + splits[1] + "\t" + splits[2] + "\t" + splits[3]);
context.write(mk, mv);
}
}
}
public static void main(String[] args)
throws IOException, ClassNotFoundException, InterruptedException, URISyntaxException {
Configuration conf = new Configuration();
Job job = Job.getInstance(conf);
job.setJarByClass(MapJoin.class);
job.setMapperClass(MyMapper.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(Text.class);
/*
* Explicitly set the number of reduce tasks to 0 (the default is 1).
* Even though no reducer is written, the job would otherwise still run the
* default reducer, which does nothing useful and only slows the job down.
*/
job.setNumReduceTasks(0);
// Pass the path of the cache file, i.e. the users table (args[0]); note this only works when the job is packaged as a jar and run on the cluster
job.addCacheFile(new URI(args[0]));
Path inpath = new Path(args[1]);
FileInputFormat.setInputPaths(job, inpath);
Path outpath = new Path(args[2]);
FileSystem fs = FileSystem.get(conf);
if (fs.exists(outpath)) {
fs.delete(outpath, true);
}
FileOutputFormat.setOutputPath(job, outpath);
job.waitForCompletion(true);
}
}
Package the program into a jar and run it; the three arguments are the users.dat path (the cache file), the ratings input path, and the output path:
hadoop jar mapjoin.jar com.hadoop.mapreduce.join.MapJoin /users/users.dat /ratings /mapjoin_out01
Advantage: the join happens entirely on the map side, so there is no shuffle of join keys and data skew is avoided.
Drawback: not suitable for joining two large tables, because the cached table must be small enough to fit in each map task's memory.
Partial output:
1 F 1 10 48067 1193 5 978300760
1 F 1 10 48067 661 3 978302109
1 F 1 10 48067 914 3 978301968
1 F 1 10 48067 3408 4 978300275
1 F 1 10 48067 2355 5 978824291
1 F 1 10 48067 1197 3 978302268
1 F 1 10 48067 1287 5 978302039
1 F 1 10 48067 2804 5 978300719
1 F 1 10 48067 594 4 978302268
1 F 1 10 48067 919 4 978301368
1 F 1 10 48067 595 5 978824268
1 F 1 10 48067 938 4 978301752
1 F 1 10 48067 2398 4 978302281
1 F 1 10 48067 2918 4 978302124
1 F 1 10 48067 1035 5 978301753
1 F 1 10 48067 2791 4 978302188
1 F 1 10 48067 2687 3 978824268
1 F 1 10 48067 2018 4 978301777
1 F 1 10 48067 3105 5 978301713
1 F 1 10 48067 2797 4 978302039