There is a large amount of data stored in MongoDB, and the goal is to use MapReduce to read it out in a distributed way, aggregate it, and then store the results back into MongoDB. (The same approach works for any other database that MapReduce does not support well: you can define your own input/output formats.)
Given the following data:
db.students.insert({name:"bingbing",age:16,sex:"F"})
db.students.insert({name:"zhiyin",sex:"M"})
db.students.insert({name:"kaige",age:16})
db.students.insert({name:"yejie",age:16,sex:"F"})
db.students.insert({name:"boduo",age:18})
db.students.insert({name:"cunshang",age:15,sex:"F"})
db.students.insert({name:"dubian",age:18,sex:"F"})
The results should be stored in the stu_res collection of the hadoop database:
{"age":15,"counter":1}
{"age":16,"counter":3}
{"age":18,"counter":2}
Notes: 1. The MongoDB instance is local, i.e., running on Windows.
2. Hadoop is installed locally.
3. The local MongoDB has no username/password.
4. The commented-out parts of the code are my attempt to connect to a remote MongoDB on a Linux VM (that one does have a username/password),
but the connection times out. The connection code itself looks fine, so the problem is probably elsewhere in the code or in a configuration file. If you know the cause, please let me know.
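Before digging into the MR code, it may help to rule out the two most common causes of exactly this timeout: mongod on the VM being bound to 127.0.0.1 (check bind_ip / net.bindIp in mongod.conf and restart mongod after changing it), and the VM's firewall blocking port 27017. Below is a minimal connectivity check against the 3.x driver used in the commented-out code; the class name MongoConnCheck and the timeout values are my own additions:

import java.util.Collections;
import org.bson.Document;
import com.mongodb.MongoClient;
import com.mongodb.MongoClientOptions;
import com.mongodb.MongoCredential;
import com.mongodb.ServerAddress;

public class MongoConnCheck {
    public static void main(String[] args) {
        // Fail fast instead of hanging for the driver's default 30 seconds.
        MongoClientOptions opts = MongoClientOptions.builder()
                .serverSelectionTimeout(3000)
                .connectTimeout(3000)
                .build();
        MongoCredential cred = MongoCredential.createCredential(
                "root", "hadoop", "root".toCharArray());
        MongoClient client = new MongoClient(
                new ServerAddress("192.168.216.111", 27017),
                Collections.singletonList(cred), opts);
        // Forces a real round trip; throws MongoTimeoutException if unreachable.
        System.out.println(client.getDatabase("hadoop").runCommand(new Document("ping", 1)));
        client.close();
    }
}

If this also times out, the problem is server or network configuration rather than anything in the MapReduce code.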
The code consists of the following classes.
import org.apache.hadoop.io.Writable;
import com.mongodb.DBCollection;
import com.mongodb.DBObject;
/**
* Contract for a custom data type that can be read from and written to MongoDB.
*/
public interface MongoDBWritable extends Writable {
/**
* Write this object to MongoDB.
*/
public void write(DBCollection dbCollection);
/**
* Populate this object from a MongoDB document.
*/
public void readFields(DBObject dbObject);
}
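For reference, this interface mirrors Hadoop's JDBC counterpart, org.apache.hadoop.mapreduce.lib.db.DBWritable, with DBCollection/DBObject standing in for PreparedStatement/ResultSet:

// Hadoop's built-in analogue for relational databases:
// public interface DBWritable {
//     void write(PreparedStatement statement) throws SQLException;
//     void readFields(ResultSet resultSet) throws SQLException;
// }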
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import com.mongodb.BasicDBObjectBuilder;
import com.mongodb.DBCollection;
import com.mongodb.DBObject;
/**
* Custom data type representing a student record.
*/
public class StudentsMongoDBWritable implements MongoDBWritable{
public String name;
public Integer age;
public String sex = "";
public Integer counter = 1;
/**
* Note: null values must be handled. Only age is handled explicitly;
* sex defaults to "" so that writeUTF does not throw a NullPointerException.
*/
public void write(DataOutput out) throws IOException {
out.writeUTF(name);
if(age == null){
out.writeBoolean(false);
} else {
out.writeBoolean(true);
out.writeInt(age);
}
out.writeUTF(sex);
out.writeInt(counter);
}
public void readFields(DataInput in) throws IOException {
this.name = in.readUTF();
if(in.readBoolean()){
this.age = in.readInt();
} else {
//keep age null so that records without an age can be filtered out in the mapper
this.age = null;
}
this.sex = in.readUTF();
this.counter = in.readInt();
}
/**
* Write this record to MongoDB.
*/
public void write(DBCollection dbCollection) {
DBObject dbObject = BasicDBObjectBuilder.start().add("age", this.age).add("counter",this.counter).get();
//insert the document
dbCollection.insert(dbObject);
}
/**
* Populate this record from a MongoDB document.
*/
public void readFields(DBObject dbObject) {
this.name = dbObject.get("name").toString();
if(dbObject.get("age") != null){
this.age = Double.valueOf(dbObject.get("age").toString()).intValue();
} else {
//keep age null so the mapper can skip records without an age
//(otherwise "zhiyin" would be counted under age 0, which contradicts the expected output)
this.age = null;
}
}
public String getName() {
return name;
}
public void setName(String name) {
this.name = name;
}
public Integer getAge() {
return age;
}
public void setAge(Integer age) {
this.age = age;
}
public String getSex() {
return sex;
}
public void setSex(String sex) {
this.sex = sex;
}
public Integer getCounter() {
return counter;
}
public void setCounter(Integer counter) {
this.counter = counter;
}
}
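A quick way to sanity-check this Writable implementation is a round trip through DataOutputStream/DataInputStream, which is exactly how Hadoop moves values between map and reduce. A standalone sketch (the class name WritableRoundTripTest is my own):

import java.io.*;

public class WritableRoundTripTest {
    public static void main(String[] args) throws IOException {
        StudentsMongoDBWritable before = new StudentsMongoDBWritable();
        before.setName("bingbing");
        before.setAge(16);
        // Serialize into a byte buffer the way the framework would.
        ByteArrayOutputStream buf = new ByteArrayOutputStream();
        before.write(new DataOutputStream(buf));
        // Deserialize into a fresh instance and inspect the fields.
        StudentsMongoDBWritable after = new StudentsMongoDBWritable();
        after.readFields(new DataInputStream(new ByteArrayInputStream(buf.toByteArray())));
        // Expected output: bingbing 16 1
        System.out.println(after.getName() + " " + after.getAge() + " " + after.getCounter());
    }
}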
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import com.mongodb.*;
import org.apache.hadoop.conf.Configurable;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapreduce.InputFormat;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.db.DBInputFormat;
import org.apache.hadoop.util.ReflectionUtils;
/**
* Custom InputFormat for MongoDB.
*/
public class MongoDBInputFormat<V extends MongoDBWritable> extends InputFormat<LongWritable, V>{
/**
* Custom input split describing a [start, end) range of documents.
*/
public static class MongoDBInputSplit extends InputSplit implements Writable{
private long start = 0; //start offset of this split
private long end = 0; //end offset of this split
public MongoDBInputSplit(){
}
public MongoDBInputSplit(long start, long end) {
super();
this.start = start;
this.end = end;
}
public void write(DataOutput out) throws IOException {
out.writeLong(start);
out.writeLong(end);
}
public void readFields(DataInput in) throws IOException {
this.start = in.readLong();
this.end = in.readLong();
}
@Override
public long getLength() throws IOException, InterruptedException {
return end - start;
}
@Override
public String[] getLocations() throws IOException, InterruptedException {
return new String[0];
}
}
/**
* Compute the splits (partition the MongoDB collection into ranges).
*/
@Override
public List<InputSplit> getSplits(JobContext context) throws IOException,
InterruptedException {
//Credentialed connection; needs a newer driver (3.6.4 works, though these methods are deprecated there; 2.9.1 does not support it)
// MongoCredential credential=MongoCredential.createCredential("root", "hadoop", "root".toCharArray());
// MongoClient mongoClient = new MongoClient(new ServerAddress("192.168.216.111", 27017), Arrays.asList(credential));
// System.out.println("connected");
// DB db = mongoClient.getDB("hadoop");
//get the MongoDB connection
// DB db = Mongo.connect(new DBAddress("192.168.37.111", "hadoop"));
DB db = Mongo.connect(new DBAddress("127.0.0.1", "hadoop"));
//get the collection
DBCollection dbCollection = db.getCollection("students");
//split size: how many documents per split
long chunk = 2;
//total number of documents in the collection
long count = dbCollection.count();
//number of splits (here 7 docs / 2 per split = 3)
long chunksize = (count / chunk);
//list holding the splits
List<InputSplit> li = new ArrayList<InputSplit>();
//build the splits; the last split absorbs the remainder when count is not evenly divisible
for (int i = 0; i < chunksize; i++) {
/*
* 0-2
* 2-4
* 4-7
*/
MongoDBInputSplit is = null;
if(i+1 == chunksize){
is = new MongoDBInputSplit(i*chunk, count); //last split runs to the end of the collection
li.add(is);
} else {
is = new MongoDBInputSplit(i*chunk, i*chunk + chunk);
li.add(is);
}
}
return li;
}
/**
* Null implementation, used as the default value class.
*/
public static class NULLMongoDBWritable implements MongoDBWritable{
public void write(DataOutput out) throws IOException {
}
public void readFields(DataInput in) throws IOException {
}
public void write(DBCollection dbCollection) {
}
public void readFields(DBObject dbObject) {
}
}
/**
* Custom record reader for a split.
*/
public static class MongoDBRecordReader<V extends MongoDBWritable> extends RecordReader<LongWritable, V>{
//the split being read
private MongoDBInputSplit split;
//result set (cursor)
private DBCursor dbcursor;
//index within the split
private int index;
private LongWritable key;
private V value;
public MongoDBRecordReader(){
}
public MongoDBRecordReader(InputSplit split,TaskAttemptContext context) throws IOException, InterruptedException{
super();
initialize(split,context);
}
public MongoDBRecordReader(MongoDBInputSplit split, DBCursor dbcursor,
int index, LongWritable key, V value) {
super();
this.split = split;
this.dbcursor = dbcursor;
this.index = index;
this.key = key;
this.value = value;
}
/**
* Initialization.
*/
@Override
public void initialize(InputSplit split, TaskAttemptContext context)
throws IOException, InterruptedException {
//initialize the split
this.split = (MongoDBInputSplit) split;
//initialize the key
key = new LongWritable();
//look up the configured value class
Configuration conf = context.getConfiguration();
Class<?> classz = conf.getClass("mapred.mongo.split.value.class", NULLMongoDBWritable.class);
//instantiate the value via reflection
value = (V) ReflectionUtils.newInstance(classz, conf);
}
/**
* Advance to the next key/value pair.
*/
@Override
public boolean nextKeyValue() throws IOException, InterruptedException {
//lazily open the cursor on the first call
if(this.dbcursor == null){
//Credentialed connection; needs a newer driver (3.6.4 works, though these methods are deprecated there; 2.9.1 does not support it)
// MongoCredential credential=MongoCredential.createCredential("root", "hadoop", "root".toCharArray());
// MongoClient mongoClient = new MongoClient(new ServerAddress("192.168.216.111", 27017), Arrays.asList(credential));
// System.out.println("connected");
// DB db = mongoClient.getDB("hadoop");
// 2.9.1 driver version
//get the connection
// DB db = Mongo.connect(new DBAddress("192.168.216.111", "hadoop"));
DB db = Mongo.connect(new DBAddress("127.0.0.1", "hadoop"));
//get the collection
DBCollection dbCollection = db.getCollection("students");
//open the cursor over this split's range
dbcursor = dbCollection.find().skip((int)this.split.start).limit((int)this.split.getLength());
}
//advance the cursor
boolean hasNext = this.dbcursor.hasNext();
if(hasNext){
//fetch the next document from the cursor
DBObject dbObject = this.dbcursor.next();
//key: absolute position of the record
this.key.set(this.split.start+index);
index ++;
//value: populate from the document
this.value.readFields(dbObject);
}
return hasNext;
}
@Override
public LongWritable getCurrentKey() throws IOException,
InterruptedException {
return this.key;
}
@Override
public V getCurrentValue() throws IOException, InterruptedException {
return this.value;
}
/**
* Progress of the reader.
*/
@Override
public float getProgress() throws IOException, InterruptedException {
//report how far through this split we are, instead of a constant 0
return (split == null || split.getLength() == 0) ? 0f : Math.min(1.0f, index / (float) split.getLength());
}
/**
* Close resources opened earlier.
*/
@Override
public void close() throws IOException {
//guard against a cursor that was never opened
if (dbcursor != null) {
dbcursor.close();
}
}
}
@Override
public RecordReader<LongWritable, V> createRecordReader(InputSplit split,
TaskAttemptContext context) throws IOException,
InterruptedException {
//create the record reader
return new MongoDBRecordReader<V>(split, context);
}
}
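One caveat in this InputFormat: skip()/limit() without an explicit sort relies on the collection's natural order, which MongoDB does not guarantee to be stable, so in principle two splits could overlap or miss documents. A defensive tweak (a sketch; not verified against the 2.9.1 driver) is to sort by _id when opening the cursor in nextKeyValue():

// Stable, index-backed order shared by all splits.
dbcursor = dbCollection.find()
        .sort(new BasicDBObject("_id", 1))
        .skip((int) this.split.start)
        .limit((int) this.split.getLength());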
import java.io.IOException;
import java.net.UnknownHostException;
import java.util.Arrays;
import com.mongodb.*;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.OutputCommitter;
import org.apache.hadoop.mapreduce.OutputFormat;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.db.DBOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter;
/**
* Custom OutputFormat for MongoDB.
*/
public class MongoDBOutPutFormat<V extends MongoDBWritable> extends OutputFormat<NullWritable, V>{
// DBOutputFormat
public static class MongoDBRecordWriter<V extends MongoDBWritable> extends RecordWriter<NullWritable, V>{
public DBCollection dbCollection = null;
public MongoDBRecordWriter() {
}
public MongoDBRecordWriter(TaskAttemptContext context) throws UnknownHostException {
//Credentialed connection; needs a newer driver (3.6.4 works, though these methods are deprecated there; 2.9.1 does not support it)
// MongoCredential credential=MongoCredential.createCredential("root", "hadoop", "root".toCharArray());
// MongoClient mongoClient = new MongoClient(new ServerAddress("192.168.216.111", 27017), Arrays.asList(credential));
// System.out.println("connected");
// DB db = mongoClient.getDB("hadoop");
//get the MongoDB connection
// DB db = Mongo.connect(new DBAddress("192.168.37.111", "hadoop"));
DB db = Mongo.connect(new DBAddress("127.0.0.1", "hadoop"));
dbCollection = db.getCollection("stu_res");
}
@Override
public void write(NullWritable key, V value) throws IOException,
InterruptedException {
/**
* Delegate to value.write(); i.e., MongoDBWritable.write(DBCollection) performs the insert.
*/
value.write(this.dbCollection);
}
@Override
public void close(TaskAttemptContext context) throws IOException,
InterruptedException {
//do nothing
}
}
@Override
public RecordWriter<NullWritable, V> getRecordWriter(
TaskAttemptContext context) throws IOException,
InterruptedException {
//the diamond operator requires JDK 1.7+
return new MongoDBRecordWriter<>(context);
}
@Override
public void checkOutputSpecs(JobContext context) throws IOException,
InterruptedException {
//do nothing
}
/**
* Committer for the output.
*/
@Override
public OutputCommitter getOutputCommitter(TaskAttemptContext context)
throws IOException, InterruptedException {
/**
* There is no output file path, so passing null is acceptable here.
*/
return new FileOutputCommitter(null, context);
}
}
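If FileOutputCommitter with a null path misbehaves on your Hadoop version, an alternative is to return a no-op committer, since this job writes straight to MongoDB and has nothing to commit. A sketch (the class name NoOpOutputCommitter is my own):

import java.io.IOException;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.OutputCommitter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;

public class NoOpOutputCommitter extends OutputCommitter {
    @Override public void setupJob(JobContext context) throws IOException { }
    @Override public void setupTask(TaskAttemptContext context) throws IOException { }
    @Override public boolean needsTaskCommit(TaskAttemptContext context) throws IOException { return false; }
    @Override public void commitTask(TaskAttemptContext context) throws IOException { }
    @Override public void abortTask(TaskAttemptContext context) throws IOException { }
}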
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
public class MongoDBMRDemo {
/**
* Custom mapper: emit (age, record) for every record that has an age.
*/
public static class MyMapper extends Mapper<LongWritable, StudentsMongoDBWritable, IntWritable, StudentsMongoDBWritable>{
@Override
protected void map(LongWritable key,StudentsMongoDBWritable value,Context context)
throws IOException, InterruptedException {
if(value.age != null){
context.write(new IntWritable(value.age), value);
} else {
System.out.println("invalid value:"+value.getName());
//System.exit(0);
return;
}
}
}
/**
* Custom reducer: count the records for each age.
*/
public static class MyReducer extends Reducer<IntWritable, StudentsMongoDBWritable, NullWritable, StudentsMongoDBWritable>{
@Override
protected void reduce(IntWritable key,Iterable<StudentsMongoDBWritable> value,Context context)
throws IOException, InterruptedException {
int counter = 0;
for (StudentsMongoDBWritable p : value) {
counter += 1;
}
//build the result record
StudentsMongoDBWritable person = new StudentsMongoDBWritable();
person.setAge(key.get());
person.setCounter(counter);
context.write(NullWritable.get(), person);
}
}
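Note that the reducer counts one per record and ignores the counter field each value carries. That is correct as written, but it would silently break if a combiner were ever added, because a combined value then represents more than one record. The combiner-safe variant sums the carried counters instead; since every map output starts with counter = 1, the result is identical when no combiner is set:

// Combiner-safe counting inside reduce():
for (StudentsMongoDBWritable p : value) {
    counter += p.getCounter();
}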
/**
* Driver.
*/
public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
// System.setProperty("HADOOP_USER_NAME","root");
Configuration conf = new Configuration();
//config key consumed by MongoDBRecordReader.initialize(), set via conf.setClass below
// conf.set("fs.defaultFS", "hdfs://qf");
// conf.set("dfs.nameservices", "qf");
// conf.set("dfs.ha.namenodes.qf", "nn1, nn2");
// conf.set("dfs.namenode.rpc-address.qf.nn1", "hadoop01:8020");
// conf.set("dfs.namenode.rpc-address.qf.nn2", "hadoop02:8020");
// conf.set("dfs.client.failover.proxy.provider.qf", "org.apache.hadoop.hdfs.server.namenode.ha.ConfiguredFailoverProxyProvider");
conf.setClass("mapred.mongo.split.value.class", StudentsMongoDBWritable.class, MongoDBWritable.class);
Job job = Job.getInstance(conf, "myInputFormat/OutputFormat");
job.setJarByClass(MongoDBMRDemo.class);
job.setMapperClass(MyMapper.class);
job.setMapOutputKeyClass(IntWritable.class);
job.setMapOutputValueClass(StudentsMongoDBWritable.class);
job.setReducerClass(MyReducer.class);
job.setOutputKeyClass(NullWritable.class);
job.setOutputValueClass(StudentsMongoDBWritable.class);
//set the custom input/output formats
job.setInputFormatClass(MongoDBInputFormat.class);
job.setOutputFormatClass(MongoDBOutPutFormat.class);
//submit the job and exit with its status
int isok = job.waitForCompletion(true)?0:1;
System.exit(isok);
}
}
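Once the job completes, the result can be verified from the mongo shell (document order may differ); it should match the expected data listed at the top:

> use hadoop
> db.stu_res.find({}, {_id:0})
{ "age" : 15, "counter" : 1 }
{ "age" : 16, "counter" : 3 }
{ "age" : 18, "counter" : 2 }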