有大量数据存储在 MongoDB 中,希望使用 MapReduce(MR)以分布式的方式把数据读取出来并进行统计,然后再将结果写回 MongoDB。(对于其它 MR 支持不友好的数据库,也可以按同样的思路自行定义输入/输出格式。)

现有如下数据:

db.students.insert({name:"bingbing",age:16,sex:"F"})
db.students.insert({name:"zhiyin",sex:"M"})
db.students.insert({name:"kaige",age:16})
db.students.insert({name:"yejie",age:16,sex:"F"})
db.students.insert({name:"boduo",age:18})
db.students.insert({name:"cunshang",age:15,sex:"F"})
db.students.insert({name:"dubian",age:18,sex:"F"})

统计结果(每个年龄对应的人数)存储到 hadoop 库下的 stu_res 集合中:

{"age":15,"counter":1}
{"age":16,"counter":3}
{"age":18,"counter":2}

注意点:
1. 这里使用的是本地(Windows 上)的 MongoDB 数据库。
2. 本地需要已经安装好 Hadoop。
3. 本地的 MongoDB 数据库没有设置账户和密码。
4. 代码中被注释掉的部分,是我尝试连接远程虚拟机(Linux)上的 MongoDB 时使用的(该数据库设置了账户和密码),
但运行时提示连接超时。我感觉连接数据库的那部分代码本身没有问题,可能是代码中的其他地方或者配置文件的问题。如果有人知道原因,麻烦告知一下。

共有这么几个类

mongodb容量计算 mongodb数据统计_MR与MongoDB

import org.apache.hadoop.io.Writable;

import com.mongodb.DBCollection;
import com.mongodb.DBObject;

/**
 * Contract for custom value types that can be exchanged with MongoDB.
 *
 * <p>It extends Hadoop's {@link Writable}, so implementations must also
 * support the regular {@code DataInput}/{@code DataOutput} serialization.
 */
public interface MongoDBWritable extends Writable {

	/**
	 * Writes this object's data into the given MongoDB collection.
	 *
	 * @param dbCollection the collection that receives the data
	 */
	void write(DBCollection dbCollection);

	/**
	 * Populates this object's fields from a document read from MongoDB.
	 *
	 * @param dbObject the document to read the field values from
	 */
	void readFields(DBObject dbObject);
}
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import com.mongodb.BasicDBObjectBuilder;
import com.mongodb.DBCollection;
import com.mongodb.DBObject;

/**
 * 自定义数据类型
 *
 */
public class StudentsMongoDBWritable implements MongoDBWritable{

	public String name;
	public Integer age;
	public String sex = "";
	public Integer counter = 1;
	

	/**
	 * 注意处理空的值 不过只处理了age 让sex默认为"" 不然报空指针错误
	 */
	public void write(DataOutput out) throws IOException {
		out.writeUTF(name);
		if(age == null){
			out.writeBoolean(false);
		} else {
			out.writeBoolean(true);
			out.writeInt(age);
		}
		out.writeUTF(sex);
		out.writeInt(counter);
	}

	public void readFields(DataInput in) throws IOException {
		this.name = in.readUTF();
		if(in.readBoolean()){
			this.age = in.readInt();
		} else {
			this.age = 0;
		}
		this.sex = in.readUTF();
		this.counter = in.readInt();
				
	}

	/**
	 * 写数据
	 */
	public void write(DBCollection dbCollection) {
		DBObject dbObject = BasicDBObjectBuilder.start().add("age", this.age).add("counter",this.counter).get();
		//将dbobject插入
		dbCollection.insert(dbObject);
	}

	/**
	 * 读数据
	 */
	public void readFields(DBObject dbObject) {
		this.name = dbObject.get("name").toString();
		if(dbObject.get("age") != null){
			this.age = Double.valueOf(dbObject.get("age").toString()).intValue();
		} else {
			this.age = 0;
		}
	}

	public String getName() {
		return name;
	}

	public void setName(String name) {
		this.name = name;
	}

	public Integer getAge() {
		return age;
	}


	public void setAge(Integer age) {
		this.age = age;
	}


	public String getSex() {
		return sex;
	}


	public void setSex(String sex) {
		this.sex = sex;
	}

	public Integer getCounter() {
		return counter;
	}


	public void setCounter(Integer counter) {
		this.counter = counter;
	}

}
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

import com.mongodb.*;
import org.apache.hadoop.conf.Configurable;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapreduce.InputFormat;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.db.DBInputFormat;
import org.apache.hadoop.util.ReflectionUtils;

/**
 * Custom Hadoop input format that reads the documents of a MongoDB
 * collection.
 *
 * <p>The collection is cut into splits by record position; each split is
 * read by a {@link MongoDBRecordReader} through a skip/limit cursor. The
 * value class is taken from the configuration key
 * {@code mapred.mongo.split.value.class}.
 *
 * <p>The connection targets an unauthenticated local MongoDB. For a server
 * that requires credentials a newer driver is needed, e.g.:
 * <pre>
 * MongoCredential credential = MongoCredential.createCredential("root", "hadoop", "root".toCharArray());
 * MongoClient mongoClient = new MongoClient(new ServerAddress("192.168.216.111", 27017), Arrays.asList(credential));
 * DB db = mongoClient.getDB("hadoop");
 * </pre>
 */

public class MongoDBInputFormat<V extends MongoDBWritable> extends InputFormat<LongWritable, V>{

	/** Host of the MongoDB server to read from. */
	private static final String MONGO_HOST = "127.0.0.1";
	/** Database that holds the input collection. */
	private static final String MONGO_DATABASE = "hadoop";
	/** Input collection. */
	private static final String MONGO_COLLECTION = "students";
	/** Number of records per split. */
	private static final long RECORDS_PER_SPLIT = 2;

	/**
	 * Split describing a half-open range {@code [start, end)} of record
	 * positions in the input collection.
	 */
	public static class MongoDBInputSplit extends InputSplit implements Writable{

		private long start = 0; // position of the first record in the split
		private long end = 0; // position just past the last record in the split

		/** No-arg constructor used when the split is deserialized. */
		public MongoDBInputSplit(){

		}

		/**
		 * @param start position of the first record (inclusive)
		 * @param end position just past the last record (exclusive)
		 */
		public MongoDBInputSplit(long start, long end) {
			super();
			this.start = start;
			this.end = end;
		}

		public void write(DataOutput out) throws IOException {
			out.writeLong(start);
			out.writeLong(end);
		}

		public void readFields(DataInput in) throws IOException {
			this.start = in.readLong();
			this.end = in.readLong();
		}

		/** Returns the number of records covered by this split. */
		@Override
		public long getLength() throws IOException, InterruptedException {
			return end - start;
		}

		/** Returns no locations: the split carries no data-locality hint. */
		@Override
		public String[] getLocations() throws IOException, InterruptedException {
			return new String[0];
		}
	}

	/**
	 * Counts the records of the input collection and cuts them into splits.
	 *
	 * @param context the job context (not consulted)
	 * @return the splits; empty when the collection is empty
	 * @throws IOException if the MongoDB host cannot be resolved
	 */
	@Override
	public List<InputSplit> getSplits(JobContext context) throws IOException,
			InterruptedException {
		long count;
		DB db = Mongo.connect(new DBAddress(MONGO_HOST, MONGO_DATABASE));
		try {
			count = db.getCollection(MONGO_COLLECTION).count();
		} finally {
			// Mongo.connect opened a connection only for this count; release it.
			db.getMongo().close();
		}
		return buildSplits(count, RECORDS_PER_SPLIT);
	}

	/**
	 * Builds the split list for {@code count} records.
	 *
	 * <p>Every split holds {@code chunk} records except the last one, which
	 * also takes the remainder (for example 7 records with a chunk of 2 give
	 * 0-2, 2-4, 4-7). A non-empty collection smaller than one chunk still
	 * yields one split, so its records are not dropped.
	 */
	private static List<InputSplit> buildSplits(long count, long chunk) {
		List<InputSplit> splits = new ArrayList<InputSplit>();
		if (count <= 0) {
			return splits;
		}
		long splitCount = Math.max(1, count / chunk);
		for (long i = 0; i < splitCount; i++) {
			long start = i * chunk;
			boolean last = (i + 1 == splitCount);
			long end = last ? count : start + chunk;
			splits.add(new MongoDBInputSplit(start, end));
		}
		return splits;
	}

	/**
	 * No-op value type used as the default when no value class is configured.
	 */
	public static class NULLMongoDBWritable implements MongoDBWritable{

		public void write(DataOutput out) throws IOException {
		}

		public void readFields(DataInput in) throws IOException {
		}

		public void write(DBCollection dbCollection) {
		}

		public void readFields(DBObject dbObject) {
		}
	}

	/**
	 * Record reader that iterates over the records of one
	 * {@link MongoDBInputSplit}.
	 *
	 * <p>The key is the position of the record in the collection and the
	 * value is a single reused instance of the configured value class.
	 */
	public static class MongoDBRecordReader<V extends MongoDBWritable> extends RecordReader<LongWritable, V>{
		// split processed by this reader
		private MongoDBInputSplit split;
		// connection opened by this reader; null when the cursor was supplied from outside
		private Mongo mongo;
		// cursor over the records of the split, opened lazily
		private DBCursor dbcursor;
		// number of records returned so far
		private int index;
		private LongWritable key;
		private V value;

		public MongoDBRecordReader(){

		}

		/**
		 * Creates a reader and initializes it immediately.
		 */
		public MongoDBRecordReader(InputSplit split,TaskAttemptContext context) throws IOException, InterruptedException{
			super();
			initialize(split,context);
		}


		public MongoDBRecordReader(MongoDBInputSplit split, DBCursor dbcursor,
				int index, LongWritable key, V value) {
			super();
			this.split = split;
			this.dbcursor = dbcursor;
			this.index = index;
			this.key = key;
			this.value = value;
		}

		/**
		 * Stores the split and instantiates the key and the value object.
		 *
		 * <p>The value class comes from {@code mapred.mongo.split.value.class}
		 * and defaults to {@link NULLMongoDBWritable}.
		 */
		@Override
		@SuppressWarnings("unchecked") // the configured class is expected to be a V
		public void initialize(InputSplit split, TaskAttemptContext context)
				throws IOException, InterruptedException {
			this.split = (MongoDBInputSplit) split;
			this.key = new LongWritable();
			Configuration conf = context.getConfiguration();
			Class<?> valueClass = conf.getClass("mapred.mongo.split.value.class", NULLMongoDBWritable.class);
			this.value = (V) ReflectionUtils.newInstance(valueClass, conf);
		}

		/**
		 * Advances to the next record of the split.
		 *
		 * @return {@code true} if a record was read, {@code false} when the
		 *         split is exhausted
		 */
		@Override
		public boolean nextKeyValue() throws IOException, InterruptedException {
			if (this.dbcursor == null) {
				long length = this.split.getLength();
				if (length <= 0) {
					// MongoDB treats limit(0) as "no limit", so an empty split
					// must not open a cursor at all.
					return false;
				}
				DB db = Mongo.connect(new DBAddress(MONGO_HOST, MONGO_DATABASE));
				this.mongo = db.getMongo();
				DBCollection dbCollection = db.getCollection(MONGO_COLLECTION);
				// Sort on _id so that skip/limit pages are stable across the
				// independent cursors of the different splits.
				this.dbcursor = dbCollection.find()
						.sort(new BasicDBObject("_id", 1))
						.skip((int) this.split.start)
						.limit((int) length);
			}
			boolean hasNext = this.dbcursor.hasNext();
			if (hasNext) {
				DBObject dbObject = this.dbcursor.next();
				this.key.set(this.split.start + index);
				index++;
				this.value.readFields(dbObject);
			}
			return hasNext;
		}

		@Override
		public LongWritable getCurrentKey() throws IOException,
				InterruptedException {
			return this.key;
		}

		@Override
		public V getCurrentValue() throws IOException, InterruptedException {
			return this.value;
		}

		/**
		 * Returns the fraction of the split's records returned so far.
		 */
		@Override
		public float getProgress() throws IOException, InterruptedException {
			if (this.split == null) {
				return 0;
			}
			long length = this.split.getLength();
			if (length <= 0) {
				return 1.0f;
			}
			return Math.min(1.0f, index / (float) length);
		}

		/**
		 * Closes the cursor and the connection opened by this reader; safe to
		 * call when no record was ever requested.
		 */
		@Override
		public void close() throws IOException {
			try {
				if (dbcursor != null) {
					dbcursor.close();
					dbcursor = null;
				}
			} finally {
				if (mongo != null) {
					mongo.close();
					mongo = null;
				}
			}
		}
	}


	/**
	 * Creates the record reader for the given split.
	 */
	@Override
	public RecordReader<LongWritable, V> createRecordReader(InputSplit split,
			TaskAttemptContext context) throws IOException,
			InterruptedException {
		return new MongoDBRecordReader<V>(split, context);
	}

}
import java.io.IOException;
import java.net.UnknownHostException;
import java.util.Arrays;

import com.mongodb.*;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.OutputCommitter;
import org.apache.hadoop.mapreduce.OutputFormat;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.db.DBOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter;

/**
 * Custom Hadoop output format that writes reduce output values into the
 * {@code stu_res} collection of the local {@code hadoop} MongoDB database.
 *
 * <p>The connection targets an unauthenticated local MongoDB. For a server
 * that requires credentials a newer driver is needed, e.g.:
 * <pre>
 * MongoCredential credential = MongoCredential.createCredential("root", "hadoop", "root".toCharArray());
 * MongoClient mongoClient = new MongoClient(new ServerAddress("192.168.216.111", 27017), Arrays.asList(credential));
 * DB db = mongoClient.getDB("hadoop");
 * </pre>
 */
public class MongoDBOutPutFormat<V extends MongoDBWritable> extends OutputFormat<NullWritable, V>{

	/**
	 * Record writer that delegates each value's persistence to
	 * {@link MongoDBWritable#write(DBCollection)}.
	 */
	public static class MongoDBRecordWriter<V extends MongoDBWritable> extends RecordWriter<NullWritable, V>{
		public DBCollection dbCollection = null;
		// connection opened by this writer; released in close()
		private Mongo mongo;

		/** Creates a writer without a target collection. */
		public MongoDBRecordWriter() {
		}

		/**
		 * Connects to the local MongoDB and selects the result collection.
		 *
		 * @param context the task context (not consulted)
		 * @throws UnknownHostException if the MongoDB host cannot be resolved
		 */
		public MongoDBRecordWriter(TaskAttemptContext context) throws UnknownHostException {
			DB db = Mongo.connect(new DBAddress("127.0.0.1", "hadoop"));
			this.mongo = db.getMongo();
			this.dbCollection = db.getCollection("stu_res");
		}

		/**
		 * Writes one value by calling its own MongoDB write method.
		 *
		 * @throws IOException if this writer has no target collection
		 */
		@Override
		public void write(NullWritable key, V value) throws IOException,
				InterruptedException {
			if (this.dbCollection == null) {
				throw new IOException("MongoDB collection is not initialized");
			}
			value.write(this.dbCollection);
		}

		/**
		 * Releases the connection opened by the constructor, if any.
		 */
		@Override
		public void close(TaskAttemptContext context) throws IOException,
				InterruptedException {
			if (mongo != null) {
				mongo.close();
				mongo = null;
			}
		}

	}


	/**
	 * Creates the record writer for a task attempt.
	 */
	@Override
	public RecordWriter<NullWritable, V> getRecordWriter(
			TaskAttemptContext context) throws IOException,
			InterruptedException {
		// The diamond operator requires JDK 1.7 or later.
		return new MongoDBRecordWriter<>(context);
	}

	/**
	 * Performs no validation of the output specification.
	 */
	@Override
	public void checkOutputSpecs(JobContext context) throws IOException,
			InterruptedException {
		// nothing to check
	}

	/**
	 * Returns a committer built with a {@code null} output path, because
	 * this format produces no output files.
	 */
	@Override
	public OutputCommitter getOutputCommitter(TaskAttemptContext context)
			throws IOException, InterruptedException {
		return new FileOutputCommitter(null, context);
	}
}
import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
/**
 * MapReduce job that counts the students of each age. Input and output are
 * handled by {@code MongoDBInputFormat} and {@code MongoDBOutPutFormat}.
 */
public class MongoDBMRDemo {

	/**
	 * Emits every student keyed by age; students without an age are
	 * reported on standard output and skipped.
	 */
	public static class MyMapper extends Mapper<LongWritable, StudentsMongoDBWritable, IntWritable, StudentsMongoDBWritable>{

		// Reused output key: context.write serializes it before the next call.
		private final IntWritable ageKey = new IntWritable();

		@Override
		protected void map(LongWritable key,StudentsMongoDBWritable value,Context context)
				throws IOException, InterruptedException {
			Integer age = value.getAge();
			if (age == null) {
				System.out.println("invalid value:"+value.getName());
				return;
			}
			ageKey.set(age);
			context.write(ageKey, value);
		}
	}

	/**
	 * Adds up the counters of all students sharing an age and emits one
	 * result object per age.
	 */
	public static class MyReducer extends Reducer<IntWritable, StudentsMongoDBWritable, NullWritable, StudentsMongoDBWritable>{

		@Override
		protected void reduce(IntWritable key,Iterable<StudentsMongoDBWritable> value,Context context)
				throws IOException, InterruptedException {
			int counter = 0;
			for (StudentsMongoDBWritable student : value) {
				// Summing the carried counters (1 per mapped record) instead of
				// counting values keeps the total right for pre-aggregated input.
				Integer partial = student.getCounter();
				counter += (partial == null) ? 1 : partial;
			}
			// build the result object for this age
			StudentsMongoDBWritable person = new StudentsMongoDBWritable();
			person.setAge(key.get());
			person.setCounter(counter);
			context.write(NullWritable.get(), person);
		}

	}

	/**
	 * Configures and runs the job; the process exits with 0 on success and
	 * 1 on failure.
	 */
	public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
		Configuration conf = new Configuration();
		// To run against a remote HA HDFS cluster, set HADOOP_USER_NAME and the
		// fs.defaultFS / dfs.nameservices / dfs.ha.namenodes.* /
		// dfs.namenode.rpc-address.* / dfs.client.failover.proxy.provider.*
		// properties on conf here.

		// value class instantiated by the record reader
		conf.setClass("mapred.mongo.split.value.class", StudentsMongoDBWritable.class, MongoDBWritable.class);
		Job job = Job.getInstance(conf, "myInputFormat/OutputFormat");
		job.setJarByClass(MongoDBMRDemo.class);

		job.setMapperClass(MyMapper.class);
		job.setMapOutputKeyClass(IntWritable.class);
		job.setMapOutputValueClass(StudentsMongoDBWritable.class);

		job.setReducerClass(MyReducer.class);
		job.setOutputKeyClass(NullWritable.class);
		job.setOutputValueClass(StudentsMongoDBWritable.class);

		// custom input and output formats
		job.setInputFormatClass(MongoDBInputFormat.class);
		job.setOutputFormatClass(MongoDBOutPutFormat.class);

		// submit the job and wait for it to finish
		boolean succeeded = job.waitForCompletion(true);
		System.exit(succeeded ? 0 : 1);
	}

}