redis主从复制解析

  • 建立主从结构关系
  • 建立主从的网络关系
  • 发送ping命令
  • 权限认证
  • 发送端口号和ip
  • 发送PSYNC命令
  • 主节点接收psync命令并处理
  • 从节点接收主节点的回复并开始同步
  • 将数据载入从库中
  • 命令传播
  • 部分重同步的实现
  • 1.定时任务
  • 2.复制积压缓冲区

建立主从结构关系

redis主从复制的前提是给master节点创建相应的slave,方法有以下几种:

  • 直接修改需要增加的从节点的redis.conf配置,增加slaveof ip port
  • 连接从节点的客户端,执行命令slaveof ip port
  • slave节点启动时直接指定主节点,启动时加参数 --slaveof ip port

建立主从关系源码分析

  1. 首先判断redis的模式,如果redis的模式是cluster模式,直接退出;cluster模式下有自己的复制方式,不再赘述。
  2. 判断客户端输入参数,如果包含“no one”,并且当前节点有主节点,则断开和主节点的连接(将server中的参数masterhost置为空),并且将当前节点的所有slave节点都断开连接并且释放server.slaves,将当前节点变为一个独立的节点。
  3. 如果客户端的请求是请求复制的,那么将当前节点连接到输入的ip和port,并且将当前节点的所有slave释放,将当前节点的复制状态置为REPL_STATE_CONNECT。
/* SLAVEOF <host> <port> | SLAVEOF NO ONE
 *
 * Turns this instance into a replica of <host>:<port>, or (with the special
 * arguments "NO" "ONE") back into a stand-alone master.
 *
 * c: the client that issued the command; argv[1] is the host (or "no") and
 *    argv[2] is the port (or "one").
 *
 * Every path ends with a reply to the client: an error in cluster mode or on
 * an invalid port, "+OK Already connected..." when nothing has to change,
 * and the shared "+OK" reply otherwise. */
void slaveofCommand(client *c) {
    /* The command is refused outright when cluster mode is enabled. */
    if (server.cluster_enabled) {
        addReplyError(c,"SLAVEOF not allowed in cluster mode.");
        return;
    }

    if (!strcasecmp(c->argv[1]->ptr,"no") &&
        !strcasecmp(c->argv[2]->ptr,"one")) {
        /* "NO ONE": only act if we currently have a master configured. */
        if (server.masterhost) {
            /* NOTE(review): replicationUnsetMaster() is defined elsewhere;
             * it is expected to clear server.masterhost and drop the
             * replication state -- confirm against replication.c. */
            replicationUnsetMaster();
            sds client = catClientInfoString(sdsempty(),c);
            serverLog(LL_NOTICE,"MASTER MODE enabled (user request from '%s')",
                client);
            sdsfree(client);
        }
    } else {
        long port;

        /* Parse the port; on failure the helper has already replied. */
        if ((getLongFromObjectOrReply(c, c->argv[2], &port, NULL) != C_OK))
            return;

        /* Already replicating from exactly this host:port: nothing to do. */
        if (server.masterhost && !strcasecmp(server.masterhost,c->argv[1]->ptr)
            && server.masterport == port) {
            serverLog(LL_NOTICE,"SLAVE OF would result into synchronization with the master we are already connected with. No operation performed.");
            addReplySds(c,sdsnew("+OK Already connected to specified master\r\n"));
            return;
        }

        /* Point this instance at the requested master. The log line below
         * reads the new address back from server.masterhost/masterport. */
        replicationSetMaster(c->argv[1]->ptr, port);
        sds client = catClientInfoString(sdsempty(),c);
        serverLog(LL_NOTICE,"SLAVE OF %s:%d enabled (user request from '%s')",
            server.masterhost, server.masterport, client);
        sdsfree(client);
    }
    /* Acknowledge the command; without this the client never gets a reply
     * on the two successful paths above. */
    addReply(c,shared.ok);
}

建立主从的网络关系

如果当前节点已经和需要复制的节点建立结构关系,那么接下来就要建立网络关系,即,网络连接是否可用。从这里开始的所有复制的代码都是在函数void replicationCron(void)中,这个函数是在server启动时就执行的一个定时任务,每一秒钟执行一次。
以下是此函数中建立网络关系的部分:

// Fragment of replicationCron(): when the replication state is
// REPL_STATE_CONNECT (a master has been configured but no connection has
// been attempted yet), try to open the network connection to the master.
    if (server.repl_state == REPL_STATE_CONNECT) {
        serverLog(LL_NOTICE,"Connecting to MASTER %s:%d",
            server.masterhost, server.masterport);
        // Only log success; connectWithMaster() logs its own failures.
        if (connectWithMaster() == C_OK) {
            serverLog(LL_NOTICE,"MASTER <-> SLAVE sync started");
        }
    }
    /* Open a non-blocking TCP connection to server.masterhost:masterport and
     * register syncWithMaster as the handler for both readable and writable
     * events on the resulting socket.
     *
     * Returns C_OK when the socket was created and the event registered (the
     * replication state then becomes REPL_STATE_CONNECTING), C_ERR otherwise. */
    int connectWithMaster(void) {
    int sock = anetTcpNonBlockBestEffortBindConnect(NULL,
                   server.masterhost,
                   server.masterport,
                   NET_FIRST_BIND_ADDR);

    if (sock == -1) {
        serverLog(LL_WARNING,"Unable to connect to MASTER: %s",
            strerror(errno));
        return C_ERR;
    }

    /* syncWithMaster drives the rest of the handshake once the socket
     * becomes readable or writable. */
    int registered = aeCreateFileEvent(server.el, sock,
                                       AE_READABLE|AE_WRITABLE,
                                       syncWithMaster, NULL);
    if (registered == AE_ERR) {
        close(sock);
        serverLog(LL_WARNING,"Can't create readable event for SYNC");
        return C_ERR;
    }

    /* Remember the socket and mark the handshake as in progress. */
    server.repl_state = REPL_STATE_CONNECTING;
    server.repl_transfer_s = sock;
    server.repl_transfer_lastio = server.unixtime;
    return C_OK;
}

发送ping命令

当主从的网络关系建立之后,从节点注册异步读事件和写事件的同时,在syncWithMaster函数中向主节点发送ping,等待回复pong。以下代码均为syncWithMaster函数中的主要片段。上一个步骤创建读写事件时传入了此函数,因此此函数就是读写事件的处理函数。

// Fragment of syncWithMaster(): PING / PONG exchange.
// Step 1: the connect event fired while in REPL_STATE_CONNECTING -- send PING.
if (server.repl_state == REPL_STATE_CONNECTING) {
        serverLog(LL_NOTICE,"Non blocking connect for SYNC fired the event.");
        // Stop watching the socket for writability; from here on only the
        // readable event (the reply to PING) is needed.
        aeDeleteFileEvent(server.el,fd,AE_WRITABLE);
        // Record that we are now waiting for the reply to PING.
        server.repl_state = REPL_STATE_RECEIVE_PONG;
        /* Send the PING, don't check for errors at all, we have the timeout
         * that will take care about this. */
        err = sendSynchronousCommand(SYNC_CMD_WRITE,fd,"PING",NULL);
        if (err) goto write_error;
        return;
   }
   // Step 2: waiting for the reply to PING.
    if (server.repl_state == REPL_STATE_RECEIVE_PONG) {
    // Read the master's reply from fd.
        err = sendSynchronousCommand(SYNC_CMD_READ,fd,NULL);
        /*
         * Only two kinds of reply let the handshake continue: a positive
         * reply (only the leading '+' is checked), or an authentication
         * related error ("-NOAUTH" / "-ERR operation not permitted").
         * Anything else aborts the handshake.
         */
        if (err[0] != '+' &&
            strncmp(err,"-NOAUTH",7) != 0 &&
            strncmp(err,"-ERR operation not permitted",28) != 0)
        {
            serverLog(LL_WARNING,"Error reply to PING from master: '%s'",err);
            sdsfree(err);
            goto error;
        } else {
            serverLog(LL_NOTICE,
                "Master replied to PING, replication can continue...");
        }
        sdsfree(err);
        // Next step: send the AUTH command to the master.
        server.repl_state = REPL_STATE_SEND_AUTH;
    }

发送ping:

  • 之前从节点已经注册了读事件和写事件,首先取消监听的写事件并等待pong回复
  • 将当前的复制状态置为等待pong回复。
  • 发送ping命令。

权限认证

这个步骤是紧接着发送ping命令的,在同一个函数中处理。权限认证不再多作解释,权限认证完成之后,会将状态置为REPL_STATE_SEND_PORT(发送端口号给主节点)。

/* Fragment of syncWithMaster(): AUTH with the master if required. */
    if (server.repl_state == REPL_STATE_SEND_AUTH) {
        if (server.masterauth) {
            // A password is configured: send AUTH and wait for the reply on
            // the next readable event.
            err = sendSynchronousCommand(SYNC_CMD_WRITE,fd,"AUTH",server.masterauth,NULL);
            if (err) goto write_error;
            server.repl_state = REPL_STATE_RECEIVE_AUTH;
            return;
        } else {
            // No authentication needed: move straight on to sending the
            // listening port to the master.
            server.repl_state = REPL_STATE_SEND_PORT;
        }
    }

    /* Receive AUTH reply. */
    if (server.repl_state == REPL_STATE_RECEIVE_AUTH) {
        err = sendSynchronousCommand(SYNC_CMD_READ,fd,NULL);
        // Any error reply ('-' prefix) to AUTH aborts the handshake.
        if (err[0] == '-') {
            serverLog(LL_WARNING,"Unable to AUTH to MASTER: %s",err);
            sdsfree(err);
            goto error;
        }
        sdsfree(err);
        // Authenticated: next step is sending the listening port to the master.
        server.repl_state = REPL_STATE_SEND_PORT;
    }

发送端口号和ip

这操作和上面是一样的,代码在相同的函数内,如下:

// Fragment of syncWithMaster(): announce this replica's listening port.
if (server.repl_state == REPL_STATE_SEND_PORT) {
        // Port to announce: slave_announce_port when it is non-zero,
        // otherwise the port this server listens on.
        sds port = sdsfromlonglong(server.slave_announce_port ?
            server.slave_announce_port : server.port);
        err = sendSynchronousCommand(SYNC_CMD_WRITE,fd,"REPLCONF",
                "listening-port",port, NULL);
        sdsfree(port);
        if (err) goto write_error;
        // err is NULL on this path, so this call releases nothing.
        sdsfree(err);
        // Now wait for the reply to REPLCONF listening-port.
        server.repl_state = REPL_STATE_RECEIVE_PORT;
        return;
    }

    /* Receive REPLCONF listening-port reply. */
    if (server.repl_state == REPL_STATE_RECEIVE_PORT) {
        err = sendSynchronousCommand(SYNC_CMD_READ,fd,NULL);
        /* Ignore the error if any, not all the Redis versions support
         * REPLCONF listening-port. */
        if (err[0] == '-') {
            serverLog(LL_NOTICE,"(Non critical) Master does not understand "
                                "REPLCONF listening-port: %s", err);
        }
        sdsfree(err);
        // The next state is REPL_STATE_SEND_IP; the handling of that state
        // is not part of this excerpt.
        server.repl_state = REPL_STATE_SEND_IP;
    }

发送PSYNC命令

前面的工作准备好了以后,接下来就开始进行复制操作啦~

// Fragment of syncWithMaster(): send the PSYNC command.
if (server.repl_state == REPL_STATE_SEND_PSYNC) {
        // slaveTryPartialResynchronization() with read_reply == 0 runs its
        // "writing half", i.e. it sends PSYNC to the master.
        if (slaveTryPartialResynchronization(fd,0) == PSYNC_WRITE_ERROR) {
            err = sdsnew("Write error sending the PSYNC command.");
            goto write_error;
        }
        // PSYNC was sent; wait for the master's reply on the next readable event.
        server.repl_state = REPL_STATE_RECEIVE_PSYNC;
        return;
    }

slaveTryPartialResynchronization函数有两种状态:一种是发送psync并等待回复的状态(写状态),另一种是读取主节点对psync的回复的状态(读状态)。对于发送同步命令来说,需要的是写状态,以下代码片段为写状态的片段:

/* Writing half of slaveTryPartialResynchronization(): build and send the
 * PSYNC command. Returns PSYNC_WAIT_REPLY once the command was written, or
 * PSYNC_WRITE_ERROR if it could not be sent. */
    if (!read_reply) {
        /* Initially set master_initial_offset to -1 to mark the current
         * master run_id and offset as not valid. Later if we'll be able to do
         * a FULL resync using the PSYNC command we'll set the offset at the
         * right value, so that this information will be propagated to the
         * client structure representing the master into server.master. */
        
        server.master_initial_offset = -1;
    /*
     * With a cached master we ask to continue from its replication ID at
     * offset reploff+1 (a partial resynchronization attempt). Without one
     * we send "PSYNC ? -1"; the log line below states that a partial
     * resynchronization is not possible in that case.
     */
        if (server.cached_master) {
            psync_replid = server.cached_master->replid;
            snprintf(psync_offset,sizeof(psync_offset),"%lld", server.cached_master->reploff+1);
            serverLog(LL_NOTICE,"Trying a partial resynchronization (request %s:%s).", psync_replid, psync_offset);
        } else {
            serverLog(LL_NOTICE,"Partial resynchronization not possible (no cached master)");
            psync_replid = "?";
            /* Copies "-1" including its terminating NUL (3 bytes). */
            memcpy(psync_offset,"-1",3);
        }

        /* Send PSYNC <replid> <offset> to the master; the replid/offset pair
         * chosen above is what distinguishes the two cases. */
        reply = sendSynchronousCommand(SYNC_CMD_WRITE,fd,"PSYNC",psync_replid,psync_offset,NULL);
        if (reply != NULL) {
            serverLog(LL_WARNING,"Unable to send PSYNC to master: %s",reply);
            sdsfree(reply);
            /* The write failed: stop watching fd for readable events. */
            aeDeleteFileEvent(server.el,fd,AE_READABLE);
            return PSYNC_WRITE_ERROR;
        }
        return PSYNC_WAIT_REPLY;
    }

主节点接收psync命令并处理

上一步我们的从节点发送了psync命令,接下来master收到了psync的命令会调用相关的函数来处理,处理的函数就是syncCommand函数,如下为处理过程:

  1. 尝试部分重同步
/* Fragment of syncCommand(): the command name decides whether a partial
 * resynchronization is attempted at all. */
if (!strcasecmp(c->argv[0]->ptr,"psync")) {
        /* PSYNC: try a partial resynchronization first. On C_OK the helper
         * has handled the replica and nothing else is needed here. */
        if (masterTryPartialResynchronization(c) == C_OK) {
            server.stat_sync_partial_ok++;
            return; /* No full resync needed, return. */
        } else {
            char *master_replid = c->argv[1]->ptr;
            /* A replid starting with '?' is not counted as a failed
             * partial resync attempt. */
            if (master_replid[0] != '?') server.stat_sync_partial_err++;
        }
    } else {
        /* Any other command name reaching this code: flag the client as
         * CLIENT_PRE_PSYNC. */
        c->flags |= CLIENT_PRE_PSYNC;
    }
  1. 如果部分重同步失败,进行全量同步
/* Full resynchronization. Fragment of syncCommand(), reached when the
 * partial resynchronization above was not performed. */
    server.stat_sync_full++;

    /* Setup the slave as one waiting for BGSAVE to start. The following code
     * paths will change the state if we handle the slave differently. */
    c->replstate = SLAVE_STATE_WAIT_BGSAVE_START;
    if (server.repl_disable_tcp_nodelay)
        anetDisableTcpNoDelay(NULL, c->fd); /* Non critical if it fails. */
    c->repldbfd = -1;
    /* Mark the client as a replica and add it to the server.slaves list. */
    c->flags |= CLIENT_SLAVE;
    listAddNodeTail(server.slaves,c);

    /* Create the replication backlog if needed. */
    if (listLength(server.slaves) == 1 && server.repl_backlog == NULL) {
        /* When we create the backlog from scratch, we always use a new
         * replication ID and clear the ID2, since there is no valid
         * past history. */
        changeReplicationId();
        clearReplicationId2();
        createReplicationBacklog();
    }
  1. 已经有BGSAVE的进程往磁盘中写
/* Fragment of syncCommand(), case 1: an RDB child is already running and
 * is writing to disk. */
if (server.rdb_child_pid != -1 &&
        server.rdb_child_type == RDB_CHILD_TYPE_DISK)
    {
        /* Ok a background save is in progress. Let's check if it is a good
         * one for replication, i.e. if there is another slave that is
         * registering differences since the server forked to save. */
        client *slave;
        listNode *ln;
        listIter li;

        /* Look for a replica already in SLAVE_STATE_WAIT_BGSAVE_END; ln is
         * NULL after the loop when none was found. */
        listRewind(server.slaves,&li);
        while((ln = listNext(&li))) {
            slave = ln->value;
            if (slave->replstate == SLAVE_STATE_WAIT_BGSAVE_END) break;
        }
        /* To attach this slave, we check that it has at least all the
         * capabilities of the slave that triggered the current BGSAVE. */
        if (ln && ((c->slave_capa & slave->slave_capa) == slave->slave_capa)) {
            /* Copy the output buffer of the replica found above into the
             * new replica's output buffer. */
            copyClientOutputBuffer(c,slave);
            /* Set the new replica up for a full resync starting at the same
             * initial offset as the replica it was attached to. */
            replicationSetupSlaveForFullResync(c,slave->psync_initial_offset);
            serverLog(LL_NOTICE,"Waiting for end of BGSAVE for SYNC");
        } else {
            serverLog(LL_NOTICE,"Can't attach the slave to the current BGSAVE. Waiting for next BGSAVE for SYNC");
        }
    }
  1. 已经有BGSAVE的进程往socket中写
/* Fragment of syncCommand(), case 2: an RDB child is already running but
 * its target is a socket. */
else if (server.rdb_child_pid != -1 &&
               server.rdb_child_type == RDB_CHILD_TYPE_SOCKET)
    {
        /* The running child writes directly to sockets, so this replica
         * cannot be attached to it; it has to wait for the next BGSAVE. */
        serverLog(LL_NOTICE,"Current BGSAVE has socket target. Waiting for next BGSAVE for SYNC");
    }
  1. 还没有BGSAVE的进程
/* Fragment of syncCommand(), case 3: no RDB child is running. */
else {
        /* Diskless sync is enabled and the replica advertises
         * SLAVE_CAPA_EOF: nothing is started here, only a notice is logged
         * when a delay is configured. */
        if (server.repl_diskless_sync && (c->slave_capa & SLAVE_CAPA_EOF)) {
            if (server.repl_diskless_sync_delay)
                serverLog(LL_NOTICE,"Delay next BGSAVE for diskless SYNC");
        } else { /* Diskless sync is not used for this replica. */
            /* Start a BGSAVE for replication right away, unless an AOF
             * rewrite child is running. */
            if (server.aof_child_pid == -1) {
                /* Start producing the RDB file for this replica. */
                startBgsaveForReplication(c->slave_capa);
            } else {
                serverLog(LL_NOTICE,
                    "No BGSAVE in progress, but an AOF rewrite is active. "
                    "BGSAVE for replication delayed");
            }
        }
    }

从节点接收主节点的回复并开始同步

执行完上一步,主节点处理完成同步的命令。并且可能向从节点发送三种命令:

  1. +CONTINUE:从节点执行部分重同步
  2. +FULLRESYNC:从节点执行全量同步
  3. -ERR:错误处理
    这些是怎么处理的呢?仍然通过syncWithMaster函数中的slaveTryPartialResynchronization方法进行处理,之前在发送psync命令阶段说过,此方法分为两个状态,一个是读状态,一个是写状态,这次从节点会调用读阶段的部分。以下代码为具体的处理逻辑:
// Reading half of slaveTryPartialResynchronization(): read the master's
// reply to PSYNC from fd.
reply = sendSynchronousCommand(SYNC_CMD_READ,fd,NULL);
    if (sdslen(reply) == 0) {
        /* The master may send empty newlines after it receives PSYNC
         * and before to reply, just to keep the connection alive. */
        sdsfree(reply);
        return PSYNC_WAIT_REPLY;
    }
    // A real reply arrived: stop watching fd for readable events.
    aeDeleteFileEvent(server.el,fd,AE_READABLE);
    // "+FULLRESYNC <replid> <offset>": the master asks for a full resync.
    if (!strncmp(reply,"+FULLRESYNC",11)) {
        char *replid = NULL, *offset = NULL;
        // Split the reply at its spaces: replid points just after the first
        // space, offset just after the second one.
        replid = strchr(reply,' ');
        if (replid) {
            replid++;
            offset = strchr(replid,' ');
            if (offset) offset++;
        }
        if (!replid || !offset || (offset-replid-1) != CONFIG_RUN_ID_SIZE) {
            // Malformed reply: log it and blank the stored master replid.
            serverLog(LL_WARNING,
                "Master replied with wrong +FULLRESYNC syntax.");
            memset(server.master_replid,0,CONFIG_RUN_ID_SIZE+1);
        } else {
            // Well-formed reply: remember the master's replication ID and
            // the initial offset it announced.
            memcpy(server.master_replid, replid, offset-replid-1);
            server.master_replid[CONFIG_RUN_ID_SIZE] = '\0';
            server.master_initial_offset = strtoll(offset,NULL,10);
            serverLog(LL_NOTICE,"Full resync from master: %s:%lld",
                server.master_replid,
                server.master_initial_offset);
        }
        // A full resync is going to happen, so the cached master is discarded.
        replicationDiscardCachedMaster();
        sdsfree(reply);
        return PSYNC_FULLRESYNC;
    }
// "+CONTINUE [<new replid>]": the master accepted the partial resync.
    if (!strncmp(reply,"+CONTINUE",9)) {
        serverLog(LL_NOTICE,
            "Successful partial resynchronization with master.");
        // An optional replication ID may follow "+CONTINUE ". start points
        // where it would begin; end scans forward to the end of the line.
        char *start = reply+10;
        char *end = reply+9;
        while(end[0] != '\r' && end[0] != '\n' && end[0] != '\0') end++;
        // Only a token of exactly CONFIG_RUN_ID_SIZE bytes is treated as an ID.
        if (end-start == CONFIG_RUN_ID_SIZE) {
            char new[CONFIG_RUN_ID_SIZE+1];
            memcpy(new,start,CONFIG_RUN_ID_SIZE);
            new[CONFIG_RUN_ID_SIZE] = '\0';

            if (strcmp(new,server.cached_master->replid)) {
                /* Master ID changed. */
                serverLog(LL_WARNING,"Master replication ID changed to %s",new);

                /* Set the old ID as our ID2, up to the current offset+1. */
                memcpy(server.replid2,server.cached_master->replid,
                    sizeof(server.replid2));
                server.second_replid_offset = server.master_repl_offset+1;

                /* Update the cached master ID and our own primary ID to the
                 * new one. */
                memcpy(server.replid,new,sizeof(server.replid));
                memcpy(server.cached_master->replid,new,sizeof(server.replid));
                // Disconnect all of our own replicas.
                disconnectSlaves();
            }
        }
        sdsfree(reply);
        // NOTE(review): as the name suggests, this presumably re-activates
        // the cached master as the current master on socket fd (the opposite
        // of caching it) -- confirm against replication.c.
        replicationResurrectCachedMaster(fd);

        /* If this instance was restarted and we read the metadata to
         * PSYNC from the persistence file, our replication backlog could
         * be still not initialized. Create it. */
        if (server.repl_backlog == NULL) createReplicationBacklog();
        return PSYNC_CONTINUE;
    }

回到syncWithMaster函数中,如果此时slaveTryPartialResynchronization函数执行后返回的结果是要执行全量同步,那么需要将要同步的从节点和它的子节点断开连接(因为此时要同步的从节点有可能是其他节点的主节点),要求它的子节点重新同步数据。

// Fragment of syncWithMaster(): prepare to receive the RDB payload.
disconnectSlaves(); // Disconnect this instance's own replicas.
  while(maxtries--) { // Bounded number of attempts.
  // Create a temporary file that will receive the incoming data. O_EXCL
  // makes open() fail if the name already exists; in that case wait a
  // second and retry.
        snprintf(tmpfile,256,
            "temp-%d.%ld.rdb",(int)server.unixtime,(long int)getpid());
        dfd = open(tmpfile,O_CREAT|O_WRONLY|O_EXCL,0644);
        if (dfd != -1) break;
        sleep(1);
    }
    if (dfd == -1) {
        serverLog(LL_WARNING,"Opening the temp file needed for MASTER <-> SLAVE synchronization: %s",strerror(errno));
        goto error;
    }
 // Register readSyncBulkPayload as the handler for readable events on fd.
 // NOTE(review): presumably it is invoked as the master streams the RDB
 // payload -- the handler itself is not part of this excerpt.
    if (aeCreateFileEvent(server.el,fd, AE_READABLE,readSyncBulkPayload,NULL)
            == AE_ERR)
    {
        serverLog(LL_WARNING,
            "Can't create readable event for SYNC: %s (fd=%d)",
            strerror(errno),fd);
        goto error;
    }

    // Enter the transfer state and reset the per-transfer bookkeeping.
    server.repl_state = REPL_STATE_TRANSFER;
    server.repl_transfer_size = -1;
    server.repl_transfer_read = 0;
    server.repl_transfer_last_fsync_off = 0;
    server.repl_transfer_fd = dfd;
    server.repl_transfer_lastio = server.unixtime;
    server.repl_transfer_tmpfile = zstrdup(tmpfile);


• 1
• 2
• 3
• 4
• 5
• 6
• 7
• 8
• 9
• 10
• 11
• 12
• 13
• 14
• 15
• 16
• 17
• 18
• 19
• 20
• 21
• 22
• 23
• 24
• 25
• 26
• 27
• 28
• 29
• 30

将数据载入从库中

到现在为止,从节点和主节点已经全部准备就绪,主节点正在监听写事件的发生。写事件什么时候发生呢?也就是主节点什么时候发送rdb文件呢?在serverCron的循环中主节点发送rdb文件,调用sendBulkToSlave()将rdb文件发送到输出缓冲区中,此时将触发从节点的读事件,从节点会调用readSyncBulkPayload()函数来将缓冲区中内容载入数据库中完成操作。

命令传播

此时主从节点还没有完全达成一致:rdb期间主节点接收的命令尚未同步到从节点,从节点会在下一次心跳时将当前的复制偏移量发送给主节点,再进行部分同步。这之后主从才算完全同步。
但是如果在主节点中再写入命令的时候,不能达到同步,所以需要将主节点重新写入的命令同步到从节点,这个过程叫命令传播。
传播时会调用server.c中的propagate()函数,主要是里面的调用replicationFeedSlaves函数来完成命令传播,可以看一下他具体做了什么事情

/* Propagate one command to the replication backlog and to every replica in
 * 'slaves'.
 *
 * slaves: list of replica clients to feed.
 * dictid: id of the database the command applies to; a SELECT is emitted
 *         first whenever it differs from server.slaveseldb.
 * argv/argc: the command and its arguments.
 *
 * Nothing is done when this instance has a master configured
 * (server.masterhost != NULL), or when there is neither a backlog nor any
 * replica. */
void replicationFeedSlaves(list *slaves, int dictid, robj **argv, int argc) {
    listNode *ln;
    listIter li;
    int j, len;
    char llstr[LONG_STR_SIZE];
    if (server.masterhost != NULL) return;
    /* The backlog exists to serve partial resynchronizations; with no
     * backlog and no replicas there is nobody to feed. */
    if (server.repl_backlog == NULL && listLength(slaves) == 0) return;
    /* Having replicas without a backlog is treated as an invariant
     * violation. */
    serverAssert(!(listLength(slaves) != 0 && server.repl_backlog == NULL));

    /* Emit a SELECT command first if the target database changed. */
    if (server.slaveseldb != dictid) {
        robj *selectcmd;
        /* Use a pre-built shared SELECT object for small database ids,
         * otherwise build the command on the fly. */
        if (dictid >= 0 && dictid < PROTO_SHARED_SELECT_CMDS) {
            selectcmd = shared.select[dictid];
        } else {
            int dictid_len;

            dictid_len = ll2string(llstr,sizeof(llstr),dictid);
            selectcmd = createObject(OBJ_STRING,
                sdscatprintf(sdsempty(),
                "*2\r\n$6\r\nSELECT\r\n$%d\r\n%s\r\n",
                dictid_len, llstr));
        }

        /* Append the SELECT command to the backlog. */
        if (server.repl_backlog) feedReplicationBacklogWithObject(selectcmd);
        /* Queue the SELECT command on each replica. */
        listRewind(slaves,&li);
        while((ln = listNext(&li))) {
            client *slave = ln->value;
            if (slave->replstate == SLAVE_STATE_WAIT_BGSAVE_START) continue;
            addReply(slave,selectcmd);
        }

        /* Only the object created above is released; shared objects are
         * left alone. */
        if (dictid < 0 || dictid >= PROTO_SHARED_SELECT_CMDS)
            decrRefCount(selectcmd);
    }
    server.slaveseldb = dictid;
    /* Append the command itself to the backlog. */
    if (server.repl_backlog) {
        char aux[LONG_STR_SIZE+3];

        /* Add the multi bulk reply length. */
        aux[0] = '*';
        len = ll2string(aux+1,sizeof(aux)-1,argc);
        aux[len+1] = '\r';
        aux[len+2] = '\n';
        feedReplicationBacklog(aux,len+3);

        /* For each argument: "$<len>\r\n", the payload, then "\r\n". */
        for (j = 0; j < argc; j++) {
            long objlen = stringObjectLen(argv[j]);
            aux[0] = '$';
            len = ll2string(aux+1,sizeof(aux)-1,objlen);
            aux[len+1] = '\r';
            aux[len+2] = '\n';
            feedReplicationBacklog(aux,len+3);
            feedReplicationBacklogWithObject(argv[j]);
            /* Reuse the "\r\n" already stored at the end of aux as the
             * trailing terminator of the bulk payload. */
            feedReplicationBacklog(aux+len+1,2);
        }
    }

    /* Queue the command on each replica. */
    listRewind(slaves,&li);
    while((ln = listNext(&li))) {
        client *slave = ln->value;

        /* Don't feed slaves that are still waiting for BGSAVE to start */
        if (slave->replstate == SLAVE_STATE_WAIT_BGSAVE_START) continue;

        /* Feed slaves that are waiting for the initial SYNC (so these commands
         * are queued in the output buffer until the initial SYNC completes),
         * or are already in sync with the master. */

        /* Add the multi bulk length. */
        addReplyMultiBulkLen(slave,argc);

        /* Finally any additional argument that was not stored inside the
         * static buffer if any (from j to argc). */
        for (j = 0; j < argc; j++)
            addReplyBulk(slave,argv[j]);
    }
}

部分重同步的实现

如果从节点和主节点由于网络等原因断开了连接,那会怎样呢?如果再进行一次以上的全量同步,那会是很糟糕的事情。所以我们这时候进行部分重同步。

1.定时任务

所有的复制操作都是基于void replicationCron(void)定时任务来实现的,任务每一秒钟执行一次,每次执行把从节点当前的复制偏移量发送给主节点。

redis集群日志 redis日志详解_redis集群日志


主节点收到复制偏移量主要做两件事情:

  1. 当主节点返回的是进行部分重同步时,找到偏移量,复制到当前位置
  2. 当完成rdb进行socket传输bulk的时候,当多个slave都需要同步的时候可以等到每一个子节点的偏移量一起向他们发送文件, 这样避免多次rdb

2.复制积压缓冲区

上面说了,当主节点收到同步命令的时候使用void syncCommand(client *c)函数进行同步操作,会先使用masterTryPartialResynchronization函数尝试部分重同步:

redis集群日志 redis日志详解_redis_02


具体重同步的函数如下:

int masterTryPartialResynchronization(client *c) {
    long long psync_offset, psync_len;
    char *master_replid = c->argv[1]->ptr;
    char buf[128];
    int buflen;

    /* Parse the replication offset asked by the slave. Go to full sync
     * on parse error: this should never happen but we try to handle
     * it in a robust way compared to aborting. */
    if (getLongLongFromObjectOrReply(c,c->argv[2],&psync_offset,NULL) !=
       C_OK) goto need_full_resync;

    /* Is the replication ID of this master the same advertised by the wannabe
     * slave via PSYNC? If the replication ID changed this master has a
     * different replication history, and there is no way to continue.
     *
     * Note that there are two potentially valid replication IDs: the ID1
     * and the ID2. The ID2 however is only valid up to a specific offset. */
    if (strcasecmp(master_replid, server.replid) &&
        (strcasecmp(master_replid, server.replid2) ||
         psync_offset > server.second_replid_offset))
    {
        /* Run id "?" is used by slaves that want to force a full resync. */
        if (master_replid[0] != '?') {
            if (strcasecmp(master_replid, server.replid) &&
                strcasecmp(master_replid, server.replid2))
            {
                serverLog(LL_NOTICE,"Partial resynchronization not accepted: "
                    "Replication ID mismatch (Slave asked for '%s', my "
                    "replication IDs are '%s' and '%s')",
                    master_replid, server.replid, server.replid2);
            } else {
                serverLog(LL_NOTICE,"Partial resynchronization not accepted: "
                    "Requested offset for second ID was %lld, but I can reply "
                    "up to %lld", psync_offset, server.second_replid_offset);
            }
        } else {
            serverLog(LL_NOTICE,"Full resync requested by slave %s",
                replicationGetSlaveName(c));
        }
        goto need_full_resync;
    }

    /* We still have the data our slave is asking for? */
    if (!server.repl_backlog ||
        psync_offset < server.repl_backlog_off ||
        psync_offset > (server.repl_backlog_off + server.repl_backlog_histlen))
    {
        serverLog(LL_NOTICE,
            "Unable to partial resync with slave %s for lack of backlog (Slave request was: %lld).", replicationGetSlaveName(c), psync_offset);
        if (psync_offset > server.master_repl_offset) {
            serverLog(LL_WARNING,
                "Warning: slave %s tried to PSYNC with an offset that is greater than the master replication offset.", replicationGetSlaveName(c));
        }
        goto need_full_resync;
    }

    /* If we reached this point, we are able to perform a partial resync:
     * 1) Set client state to make it a slave.
     * 2) Inform the client we can continue with +CONTINUE
     * 3) Send the backlog data (from the offset to the end) to the slave. */
    c->flags |= CLIENT_SLAVE;
    c->replstate = SLAVE_STATE_ONLINE;
    c->repl_ack_time = server.unixtime;
    c->repl_put_online_on_ack = 0;
    listAddNodeTail(server.slaves,c);
    /* We can't use the connection buffers since they are used to accumulate
     * new commands at this stage. But we are sure the socket send buffer is
     * empty so this write will never fail actually. */
    if (c->slave_capa & SLAVE_CAPA_PSYNC2) {
        buflen = snprintf(buf,sizeof(buf),"+CONTINUE %s\r\n", server.replid);
    } else {
        buflen = snprintf(buf,sizeof(buf),"+CONTINUE\r\n");
    }
    if (write(c->fd,buf,buflen) != buflen) {
        freeClientAsync(c);
        return C_OK;
    }
    psync_len = addReplyReplicationBacklog(c,psync_offset);
    serverLog(LL_NOTICE,
        "Partial resynchronization request from %s accepted. Sending %lld bytes of backlog starting from offset %lld.",
            replicationGetSlaveName(c),
            psync_len, psync_offset);
    /* Note that we don't need to set the selected DB at server.slaveseldb
     * to -1 to force the master to emit SELECT, since the slave already
     * has this state from the previous connection with the master. */

    refreshGoodSlavesCount();
    return C_OK; /* The caller can return, no full resync needed. */