MongoDB provides two replication mechanisms: the old master/slave mode and the newer replica-set (replset) mode. This article analyzes the old master/slave mechanism; replica sets are covered in the next article. In master/slave mode, one machine acts as the master and the rest as slaves: each slave reads the master's operation log and replays those operations locally, converging to the same state as the master. The roles are fixed when the servers are started and cannot be switched dynamically, so this mode only provides data backup. By default a slave accepts neither reads nor writes; if reads are needed, call rs.slaveOk(), after which every query sent to the slave carries the QueryOption_SlaveOk flag, indicating that reading from a slave is acceptable.
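On the client side, the same flag can be passed per query. A minimal sketch using the legacy C++ driver (the host address and namespace are placeholders):

#include <iostream>
#include <string>
#include "mongo/client/dbclient.h" // legacy C++ driver

using namespace mongo;

// Read from a slave directly. Without QueryOption_SlaveOk the slave
// rejects the query, since by default it serves no reads.
void queryFromSlave() {
    DBClientConnection conn;
    std::string err;
    if ( !conn.connect("slave-host:27017", err) ) // placeholder address
        return;
    std::auto_ptr<DBClientCursor> cur = conn.query(
        "test.foo", Query(), 0, 0, 0, QueryOption_SlaveOk);
    while ( cur->more() )
        std::cout << cur->next().toString() << std::endl;
}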
The flow of master/slave mode: the master records every operation in local.oplog.$main. This is a capped collection, fixed in size, configurable with --oplogSize (in MB). By default it is 50MB on 32-bit systems; on 64-bit systems it is at least 990MB and at most 5% of the free space on the disk holding the database.
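A capped collection behaves like a fixed-size ring buffer: once it is full, each new entry silently overwrites the oldest one. A self-contained toy model of that behavior (an analogy only, not MongoDB's storage code):

#include <deque>
#include <iostream>

// Toy model of a capped collection: once the capacity is reached, each
// insert evicts the oldest entry, just as new writes to local.oplog.$main
// overwrite the oldest operation records.
class CappedLog {
    std::deque<int> entries_; // stand-in for oplog documents
    size_t cap_;
public:
    explicit CappedLog(size_t cap) : cap_(cap) {}
    void insert(int opId) {
        if (entries_.size() == cap_)
            entries_.pop_front(); // the oldest operation is lost for good
        entries_.push_back(opId);
    }
    int oldest() const { return entries_.front(); }
};

int main() {
    CappedLog oplog(3);
    for (int i = 0; i < 5; ++i)
        oplog.insert(i);
    std::cout << oplog.oldest() << std::endl; // prints 2: ops 0 and 1 were overwritten
    return 0;
}

This is exactly why a slave that falls too far behind must resync: the entries it still needs may already have been overwritten.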
A slave first copies the database contents wholesale from the master; after that it only reads operation records from the master's local.oplog.$main collection and replays them. If the slave's sync position is older than the oldest timestamp still present in local.oplog.$main, then operation records the slave never got to replay have already been overwritten by newer ones, and the slave has no choice but to copy the entire database from the master again. The collections this article touches are:
local.sources: records the address of the master a slave syncs from.
local.oplog.$main: the master's operation log (the equivalent of MySQL's binlog).
Now to the code. The master is started with --master (a slave, correspondingly, with --slave --source <host>); the entry point is startReplication in repl.cpp. Code irrelevant to the analysis has been removed.
void startReplication() {
    oldRepl(); // set the function pointer used to record the oplog
    {
        Lock::GlobalWrite lk;
        replLocalAuth(); // grant the local _repl account write access to the local database
    }
    if ( replSettings.slave ) { // the slave thread reads local.oplog.$main and replays it
        boost::thread repl_thread(replSlaveThread);
    }
    if ( replSettings.master ) {
        replSettings.master = true;
        createOplog(); // create the local.oplog.$main collection if it does not exist yet
        boost::thread t(replMasterThread); // this thread does very little
    }
    while( replSettings.fastsync ) // don't allow writes until we've set up from log
        sleepmillis( 50 );
}
Next, how the master records its operation log. Entries come in several types:
i: insert.
u: update.
c: database command.
d: delete.
n: no-op; purely a heartbeat telling slaves that the master is still running.
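Concretely, an entry is a BSON document holding the fields written by _logOpOld below. A sketch of an insert entry built with the legacy C++ driver's BSON API (all values here are made up):

#include "mongo/client/dbclient.h" // legacy C++ driver, for the BSON builders

using namespace mongo;

// Build a document shaped like an insert entry in local.oplog.$main:
//   { ts: <OpTime>, op: "i", ns: "test.foo", o: { _id: 1, x: 1 } }
// An update entry would additionally carry the query object under "o2".
BSONObj exampleInsertEntry() {
    BSONObjBuilder b;
    b.appendTimestamp("ts", 0);                  // OpTime; 0 is a placeholder
    b.append("op", "i");                         // operation type: insert
    b.append("ns", "test.foo");                  // target namespace
    b.append("o", BSON("_id" << 1 << "x" << 1)); // the inserted document
    return b.obj();
}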
Now logOp itself:
void logOp(const char *opstr, const char *ns, const BSONObj& obj, BSONObj *patt, bool *b, bool fromMigrate) {
    if ( replSettings.master ) { // on a master, record the operation in local.oplog.$main
        _logOp(opstr, ns, 0, obj, patt, b, fromMigrate);
    }
    logOpForSharding( opstr , ns , obj , patt );
}
_logOp is a function pointer that oldRepl() set to _logOpOld during initialization.
static void _logOpOld(const char *opstr, const char *ns, const char *logNS, const BSONObj& obj, BSONObj *o2, bool *bb, bool fromMigrate ) {
    Lock::DBWrite lk("local");
    static BufBuilder bufbuilder(8*1024); // todo there is likely a mutex on this constructor
    mutex::scoped_lock lk2(OpTime::m);
    const OpTime ts = OpTime::now(lk2);
    Client::Context context("",0,false);
    /* we jump through a bunch of hoops here to avoid copying the obj buffer twice --
       instead we do a single copy to the destination position in the memory mapped file.
    */
    bufbuilder.reset();
    BSONObjBuilder b(bufbuilder);
    b.appendTimestamp("ts", ts.asDate()); // timestamp of this entry, used for syncing
    b.append("op", opstr);
    b.append("ns", ns);
    if (fromMigrate)
        b.appendBool("fromMigrate", true);
    if ( bb )
        b.appendBool("b", *bb);
    if ( o2 ) // present only for updates: the query object
        b.append("o2", *o2);
    BSONObj partial = b.done(); // partial is everything except the o:... part.
    int po_sz = partial.objsize();
    int len = po_sz + obj.objsize() + 1 + 2 /*o:*/;
    Record *r; // space in the oplog is allocated here
    if( logNS == 0 ) {
        logNS = "local.oplog.$main";
        if ( localOplogMainDetails == 0 ) {
            Client::Context ctx( logNS , dbpath, false);
            localDB = ctx.db();
            verify( localDB );
            localOplogMainDetails = nsdetails(logNS);
            verify( localOplogMainDetails );
        }
        Client::Context ctx( logNS , localDB, false );
        r = theDataFileMgr.fast_oplog_insert(localOplogMainDetails, logNS, len);
    }
    else {
        Client::Context ctx( logNS, dbpath, false );
        verify( nsdetails( logNS ) );
        // first we allocate the space, then we fill it below.
        r = theDataFileMgr.fast_oplog_insert( nsdetails( logNS ), logNS, len);
    }
    append_O_Obj(r->data(), partial, obj); // the actual write of the entry
    context.getClient()->setLastOp( ts ); // remember the timestamp of the last logged op
}
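The length arithmetic above deserves a note: partial is the whole entry minus the o field, and append_O_Obj overwrites partial's trailing EOO byte with an embedded-object element named o, then re-terminates the outer document. The type byte reuses the overwritten EOO, the name "o" plus its NUL add 2 bytes, and the new EOO adds 1, giving len = po_sz + obj.objsize() + 1 + 2. A self-contained sketch of that layout (a simplification: the real code writes directly into the memory-mapped file; BSON length prefixes are little-endian, which this sketch assumes of the host):

#include <cstring>

// Sketch of the append_O_Obj layout (simplified, assuming both inputs are
// valid BSON buffers). The final document is:
//   [int32 totalLen][partial's fields][0x03 'o' '\0'][obj bytes][0x00 EOO]
void appendOObj(char* dst, const char* partial, int partialLen,
                const char* obj, int objLen) {
    int totalLen = partialLen + objLen + 1 + 2;
    memcpy(dst, partial, partialLen); // copy partial, its EOO included
    char* p = dst + partialLen - 1;   // position of partial's EOO byte
    *p++ = 0x03;                      // BSON type: embedded object
    *p++ = 'o';                       // field name "o"...
    *p++ = 0;                         //   ...NUL-terminated
    memcpy(p, obj, objLen);           // the operation's document
    p += objLen;
    *p = 0;                           // new EOO for the outer document
    memcpy(dst, &totalLen, 4);        // patch the outer length prefix
}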
Now for the slave's side of the synchronization, which boils down to loading the sync sources, reading the operation log, and replaying it. The slave's sync entry point is replSlaveThread, which internally calls replMain to do the actual work.
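For reference, replSlaveThread is only a thin wrapper; its rough shape is the following sketch (simplified, not the verbatim source):

// Rough shape of replSlaveThread: loop forever, running one sync pass at
// a time, and back off after an error so a failed pass never kills the thread.
void replSlaveThread() {
    sleepsecs(1);
    Client::initThread("replslave");
    while ( 1 ) {
        try {
            replMain(); // one full pass: load sources, pull the oplog, apply it
        }
        catch ( ... ) {
            sleepsecs(60); // back off after an unexpected error, then retry
        }
    }
}

The analysis below starts directly from replMain.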
void replMain() {
    ReplSource::SourceVector sources;
    while ( 1 ) {
        int s = 0;
        {
            Lock::GlobalWrite lk;
            if ( replAllDead ) { // syncing hit a fatal error; a resync drops the databases and copies them again
                // throttledForceResyncDead can throw
                if ( !replSettings.autoresync || !ReplSource::throttledForceResyncDead( "auto" ) ) {
                    break;
                }
            }
            verify( syncing == 0 ); // i.e., there is only one sync thread running. we will want to change/fix this.
            syncing++;
        }
        try {
            int nApplied = 0;
            s = _replMain(sources, nApplied);
            if( s == 1 ) {
                if( nApplied == 0 ) s = 2;
                else if( nApplied > 100 ) {
                    // sleep very little - just enough that we aren't truly hammering master
                    sleepmillis(75);
                    s = 0;
                }
            }
        }
        catch (...) {
            out() << "caught exception in _replMain" << endl;
            s = 4;
        }
        {
            Lock::GlobalWrite lk;
            verify( syncing == 1 );
            syncing--;
        }
        if( relinquishSyncingSome ) {
            relinquishSyncingSome = 0;
            s = 1; // sleep before going back in to syncing=1
        }
        if ( s ) {
            stringstream ss;
            ss << "repl: sleep " << s << " sec before next pass";
            string msg = ss.str();
            if ( ! cmdLine.quiet )
                log() << msg << endl;
            ReplInfo r(msg.c_str());
            sleepsecs(s);
        }
    }
}
replMain->_replMain
int _replMain(ReplSource::SourceVector& sources, int& nApplied) {
    {
        Lock::GlobalWrite lk;
        ReplSource::loadAll(sources); // load the sources to sync from
        replSettings.fastsync = false; // only need this param for initial reset
    }
    int sleepAdvice = 1;
    for ( ReplSource::SourceVector::iterator i = sources.begin(); i != sources.end(); i++ ) {
        ReplSource *s = i->get();
        int res = -1;
        try {
            res = s->sync(nApplied); // pull the operation log from this master and replay it
            bool moreToSync = s->haveMoreDbsToSync();
            if( res < 0 ) {
                sleepAdvice = 3;
            }
            else if( moreToSync ) {
                sleepAdvice = 0;
            }
            else if ( s->sleepAdvice() ) {
                sleepAdvice = s->sleepAdvice();
            }
            else
                sleepAdvice = res;
        }
        catch ( ... ) { } // exception handlers elided here for brevity
        if ( res < 0 )
            s->oplogReader.resetConnection();
    }
    return sleepAdvice;
}
replMain->_replMain->loadAll
void ReplSource::loadAll(SourceVector &v) {
    Client::Context ctx("local.sources");
    SourceVector old = v;
    v.clear();
    if ( !cmdLine.source.empty() ) {
        // --source <host> specified.
        // check that no items are in sources other than that
        // add if missing
        shared_ptr<Cursor> c = findTableScan("local.sources", BSONObj());
        int n = 0;
        while ( c->ok() ) { n++; c->advance(); } // count existing entries (validation elided)
        if ( n == 0 ) { // no sync source recorded in local.sources yet, so add one
            // source missing. add.
            ReplSource s;
            s.hostName = cmdLine.source;
            s.only = cmdLine.only;
            s.save(); // persist the source into the local.sources collection
        }
    }
    // scan local.sources and register every entry as a source to sync from
    shared_ptr<Cursor> c = findTableScan("local.sources", BSONObj());
    while ( c->ok() ) {
        ReplSource tmp(c->current());
        if ( tmp.syncedTo.isNull() ) {
            DBDirectClient c; // fetch the current sync position from the local local.oplog.$main
            if ( c.exists( "local.oplog.$main" ) ) { // scan backwards for the newest entry whose op is not "n" and take its timestamp
                BSONObj op = c.findOne( "local.oplog.$main", QUERY( "op" << NE << "n" ).sort( BSON( "$natural" << -1 ) ) );
                if ( !op.isEmpty() ) {
                    tmp.syncedTo = op[ "ts" ].date();
                }
            }
        }
        addSourceToList(v, tmp, old); // register this sync source
        c->advance();
    }
}
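For reference, a document in local.sources has roughly this shape (a sketch with the legacy driver's builder; the field names follow ReplSource::save, the values are made up):

#include "mongo/client/dbclient.h" // legacy C++ driver, for the BSON builders

using namespace mongo;

// Illustrative shape of one local.sources document (values made up):
//   { host: "master-host:27017", source: "main", syncedTo: Timestamp(...) }
// An "only" field is present as well when --only was given at startup.
BSONObj exampleSourceDoc() {
    BSONObjBuilder b;
    b.append("host", "master-host:27017"); // the master to sync from
    b.append("source", "main");            // maps to local.oplog.$<source> on the master
    b.appendTimestamp("syncedTo", 0);      // last applied oplog position (placeholder)
    return b.obj();
}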
replMain->_replMain->sync
int ReplSource::sync(int& nApplied) {
    if ( !oplogReader.connect(hostName) ) { // connect to the master and authenticate
        log(4) << "repl: can't connect to sync source" << endl;
        return -1;
    }
    return sync_pullOpLog(nApplied); // pull the operation log and replay it
}
replMain->_replMain->sync->sync_pullOpLog
int ReplSource::sync_pullOpLog(int& nApplied) {
    int okResultCode = 1;
    string ns = string("local.oplog.$") + sourceName();
    bool tailing = true;
    oplogReader.tailCheck();
    bool initial = syncedTo.isNull();
    if ( !oplogReader.haveCursor() || initial ) { // first sync against this source
        if ( initial ) {
            // Important to grab last oplog timestamp before listing databases.
            syncToTailOfRemoteLog(); // read the newest useful entry in local.oplog.$main; it determines where this sync starts
            BSONObj info;
            bool ok = oplogReader.conn()->runCommand( "admin", BSON( "listDatabases" << 1 ), info );
            BSONObjIterator i( info.getField( "databases" ).embeddedObject() );
            while( i.moreWithEOO() ) { // queue every non-empty database except local;
                BSONElement e = i.next(); // if --only was given, queue only that database
                string name = e.embeddedObject().getField( "name" ).valuestr();
                if ( !e.embeddedObject().getBoolField( "empty" ) ) {
                    if ( name != "local" ) {
                        if ( only.empty() || only == name ) {
                            addDbNextPass.insert( name );
                        }
                    }
                }
            }
            Lock::GlobalWrite lk;
            save();
        }
        // Initialize the cursor. The query condition is ts >= syncedTo; on a
        // slave's very first pass through here, syncedTo is the timestamp of
        // the newest entry on the master.
        BSONObjBuilder q;
        q.appendDate("$gte", syncedTo.asDate());
        BSONObjBuilder query;
        query.append("ts", q.done());
        if ( !only.empty() ) {
            // note we may here skip a LOT of data table scanning, a lot of work for the master.
            // maybe append "\\." here?
            query.appendRegex("ns", string("^") + pcrecpp::RE::QuoteMeta( only ));
        }
        BSONObj queryObj = query.done();
        // e.g. queryObj = { ts: { $gte: syncedTo } }
        oplogReader.tailingQuery(ns.c_str(), queryObj);
        tailing = false;
    }
    else {
        log(2) << "repl: tailing=true\n";
    }
    { // show any deferred database creates from a previous pass
        set<string>::iterator i = addDbNextPass.begin();
        if ( i != addDbNextPass.end() ) { // databases queued for cloning are handled here, one per pass
            BSONObjBuilder b;
            b.append("ns", *i + '.');
            b.append("op", "db");
            BSONObj op = b.done();
            sync_pullOpLog_applyOperation(op, false);
        }
    }
    OpTime nextOpTime;
    {
        BSONObj op = oplogReader.next();
        BSONElement ts = op.getField("ts");
        nextOpTime = OpTime( ts.date() );
        if( tailing ) {
            oplogReader.putBack( op ); // op will be processed in the loop below
            nextOpTime = OpTime(); // will reread the op below
        }
    }
    { // apply operations
        int n = 0;
        time_t saveLast = time(0);
        while ( 1 ) {
            bool moreInitialSyncsPending = !addDbNextPass.empty() && n; // we need "&& n" to assure we actually process at least one op to get a sync point recorded in the first place.
            if ( moreInitialSyncsPending || !oplogReader.more() ) { // databases are still queued for cloning; just record the sync timestamp and stop this pass
                Lock::GlobalWrite lk;
                if( oplogReader.awaitCapable() && tailing )
                    okResultCode = 0; // don't sleep
                syncedTo = nextOpTime;
                save(); // note how far we are synced up to now
                nApplied = n;
                break;
            }
            BSONObj op = oplogReader.next();
            unsigned b = replApplyBatchSize;
            bool justOne = b == 1;
            scoped_ptr<Lock::GlobalWrite> lk( justOne ? 0 : new Lock::GlobalWrite() );
            while( 1 ) {
                BSONElement ts = op.getField("ts");
                OpTime last = nextOpTime;
                nextOpTime = OpTime( ts.date() );
                // the configured slave delay has not elapsed yet for this op,
                // so stop syncing for now and pick it up on a later pass
                if ( replSettings.slavedelay && ( unsigned( time( 0 ) ) < nextOpTime.getSecs() + replSettings.slavedelay ) ) {
                    oplogReader.putBack( op );
                    _sleepAdviceTime = nextOpTime.getSecs() + replSettings.slavedelay + 1;
                    Lock::GlobalWrite lk;
                    if ( n > 0 ) {
                        syncedTo = last;
                        save();
                    }
                    return okResultCode;
                }
                sync_pullOpLog_applyOperation(op, !justOne); // the actual replay of this log entry
                n++;
                if( --b == 0 )
                    break;
                // if to here, we are doing multiple applications in a single write lock acquisition
                if( !oplogReader.moreInCurrentBatch() ) {
                    // break if no more in batch so we release lock while reading from the master
                    break;
                }
                op = oplogReader.next();
                getDur().commitIfNeeded();
            }
        }
    }
    return okResultCode;
}
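tailingQuery above issues an ordinary query on the oplog with a tailable cursor, so the cursor stays open and yields new entries as the master writes them. A client-side equivalent with the legacy C++ driver (connection and timestamp are placeholders; QueryOption_AwaitData is what awaitCapable refers to):

#include <memory>
#include "mongo/client/dbclient.h" // legacy C++ driver

using namespace mongo;

// Follow a master's oplog the way a slave does: query everything at or
// after the last synced timestamp with a tailable cursor, which stays
// open and yields new entries as the master writes them.
void tailOplog(DBClientConnection& conn, unsigned long long syncedToMillis) {
    Query q = QUERY( "ts" << GTE << Date_t(syncedToMillis) );
    std::auto_ptr<DBClientCursor> c = conn.query(
        "local.oplog.$main", q, 0, 0, 0,
        QueryOption_CursorTailable | QueryOption_AwaitData);
    while ( c->more() ) { // with AwaitData, more() blocks briefly for new data
        BSONObj op = c->next();
        // apply op locally here, then persist op["ts"] as the new sync point
    }
}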
replMain->_replMain->sync->sync_pullOpLog->sync_pullOpLog_applyOperation
void ReplSource::sync_pullOpLog_applyOperation(BSONObj& op, bool alreadyLocked) {
    if( op.getStringField("op")[0] == 'n' ) // heartbeat no-op: nothing to apply
        return;
    char clientName[MaxDatabaseNameLen];
    const char *ns = op.getStringField("ns");
    nsToDatabase(ns, clientName);
    if ( !only.empty() && only != clientName ) // --only was given at slave startup: sync just that one database
        return;
    // pre-touch the data this op will modify, paging it into memory
    if( cmdLine.pretouch && !alreadyLocked/*doesn't make sense if in write lock already*/ ) {
        if( cmdLine.pretouch > 1 ) {
            /* note: this is bad - should be put in ReplSource. but this is first test... */
            static int countdown;
            if( countdown > 0 ) {
                countdown--; // was pretouched on a prev pass
            }
            else {
                const int m = 4;
                if( tp.get() == 0 ) {
                    int nthr = min(8, cmdLine.pretouch);
                    nthr = max(nthr, 1);
                    tp.reset( new ThreadPool(nthr) );
                }
                vector<BSONObj> v;
                oplogReader.peek(v, cmdLine.pretouch);
                unsigned a = 0;
                while( 1 ) {
                    if( a >= v.size() ) break;
                    unsigned b = a + m - 1; // v[a..b]
                    if( b >= v.size() ) b = v.size() - 1;
                    tp->schedule(pretouchN, v, a, b);
                    a += m;
                }
                // we do one too...
                pretouchOperation(op);
                tp->join();
                countdown = v.size();
            }
        }
        else {
            pretouchOperation(op);
        }
    }
    scoped_ptr<Lock::GlobalWrite> lk( alreadyLocked ? 0 : new Lock::GlobalWrite() );
    // if a database queued for cloning clashes with a local database of the same name, drop the local one
    if ( !handleDuplicateDbName( op, ns, clientName ) ) {
        return;
    }
    Client::Context ctx( ns );
    ctx.getClient()->curop()->reset();
    bool empty = ctx.db()->isEmpty();
    bool incompleteClone = incompleteCloneDbs.count( clientName ) != 0;
    // always apply admin commands
    // this is a bit hacky -- the semantics of replication/commands aren't well specified
    if ( strcmp( clientName, "admin" ) == 0 && *op.getStringField( "op" ) == 'c' ) {
        applyOperation( op ); // admin commands are executed directly
        return;
    }
    // the database does not exist locally (on the slave) yet: clone it from the master
    if ( ctx.justCreated() || empty || incompleteClone ) {
        // we must add to incomplete list now that setClient has been called
        incompleteCloneDbs.insert( clientName );
        if ( nClonedThisPass ) { // already cloning one database this pass; defer this one to the next pass
            /* we only clone one database per pass, even if a lot need done. This helps us
               avoid overflowing the master's transaction log by doing too much work before going
               back to read more transactions. (Imagine a scenario of slave startup where we try to
               clone 100 databases in one pass.)
            */
            addDbNextPass.insert( clientName );
        }
        else {
            save();
            Client::Context ctx(ns);
            nClonedThisPass++;
            resync(ctx.db()->name); // full clone of the database, collection by collection -- potentially very slow
            addDbNextPass.erase(clientName);
            incompleteCloneDbs.erase( clientName );
        }
        save();
    }
    else {
        applyOperation( op ); // replay the insert/update/delete locally; the flow is simple, so it is not analyzed further
        addDbNextPass.erase( clientName );
    }
}
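applyOperation itself is essentially a dispatch on the op field. A simplified sketch of that dispatch (the real function also handles upserts, index builds, and other edge cases):

#include "mongo/client/dbclient.h" // legacy driver types, for BSONObj

using namespace mongo;

// Simplified sketch of the dispatch inside applyOperation: each entry
// type maps onto the equivalent local write.
void applyOperationSketch(const BSONObj& op) {
    const char *opType = op.getStringField("op");
    BSONObj o = op.getObjectField("o"); // the operation's document payload
    switch ( opType[0] ) {
    case 'i': /* insert o into op["ns"] */                            break;
    case 'u': /* update docs matching op["o2"] with the modifier o */ break;
    case 'd': /* delete docs matching o */                            break;
    case 'c': /* run o as a database command */                       break;
    case 'n': /* heartbeat no-op: nothing to apply */                 break;
    }
}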
This concludes the analysis of master/slave mode. The points that deserve the most attention are the database cloning step and the bookkeeping of the current sync timestamp. Each pass queries the oplog from the last synced timestamp up to the newest one, and the first record returned must carry exactly the last synced timestamp; if it does not, the master has performed so many operations that local.oplog.$main has already dropped the old entries, and the only remedy is to copy the entire database again.
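A minimal sketch of that check (simplified; assumes firstOp is the first document returned by the { ts: { $gte: syncedTo } } query):

#include "mongo/client/dbclient.h" // legacy driver types

using namespace mongo;

// Sketch of the staleness check (simplified): the first entry returned by
// the { ts: { $gte: syncedTo } } query must be the very op we last applied.
// Anything newer means the capped oplog has rolled over past our position.
bool mustResync(const BSONObj& firstOp, unsigned long long syncedToMillis) {
    Date_t firstTs = firstOp["ts"].date();
    return firstTs.millis != syncedToMillis; // gap in the log: full resync needed
}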
Author: yhjj0108 (Yang Hao)