XLogRecordAssemble把已注册的数据和缓冲区合成为一条WAL记录,并组织成XLogRecData链,后续可以调用XLogInsertRecord写入日志。Assemble a WAL record from the registered data and buffers into an XLogRecData chain, ready for insertion with XLogInsertRecord(). 除了xl_prev,其他记录头域都会被填入;此时计算出的CRC还没有覆盖记录头部。The record header fields are filled in, except for the xl_prev field. The calculated CRC does not include the record header yet. 如果存在已注册的buffer,并且不是所有buffer都做了full-page image,则*fpw_lsn会被设置为这些未做full-page image的页中最低的LSN。If there are any registered buffers, and a full-page image was not taken of all of them, *fpw_lsn is set to the lowest LSN among such pages. 这表示合成出的记录只有在RedoRecPtr和doPageWrites的值是最新的前提下才适合插入。This signals that the assembled record is only good for insertion on the assumption that the RedoRecPtr and doPageWrites values were up-to-date.
代码执行到这个函数的时候,所有的数据已经完成注册,此时WAL记录的数据存储在:①mainrdata_head;②每一个已注册buffer的rdata_head;③每一个已注册buffer的page字段。这个函数的作用就是把①②③中的数据组合到一起。

处理日志数据链头部

hdr_rdt用于在构造记录时保存记录头部,hdr_scratch会在初始化时使用palloc分配好空间。

/*
 * One element of the chain of data fragments that is assembled into a WAL
 * record. Each element points at 'len' bytes starting at 'data'; elements
 * are linked through 'next', and the last element has next == NULL.
 *
 * The typedef is placed before its first use so that the declarations
 * below are valid C when read top to bottom.
 */
typedef struct XLogRecData {
    struct XLogRecData *next; /* next struct in chain, or NULL */
    char *data;               /* start of rmgr data to include */
    uint32 len;               /* length of rmgr data to include */
} XLogRecData;

/* These are used to hold the record header while constructing a record.
 * 'hdr_scratch' is not a plain variable, but is palloc'd at initialization,
 * because we want it to be MAXALIGNed and padding bytes zeroed.
 * For simplicity, it's allocated large enough to hold the headers for any
 * WAL record. */
static XLogRecData hdr_rdt;      /* head element of the assembled chain */
static char *hdr_scratch = NULL; /* header buffer; NULL until allocated */

PostgreSQL数据库WAL——日志合成XLogRecordAssemble_数据库


PostgreSQL数据库WAL——日志合成XLogRecordAssemble_数据库_02

/*
 * XLogRecordAssemble
 *
 * Build the XLogRecData chain for one WAL record out of the data that has
 * already been registered: the per-block page images and per-block data
 * of every in-use entry in registered_buffers[], followed by the main
 * data chain (mainrdata_head .. mainrdata_last).
 *
 * All headers are serialized into hdr_scratch in this order: the
 * fixed-size XLogRecord header, then one header group per registered
 * block, then the optional replication-origin entry, then the main-data
 * length entry. hdr_rdt describes that buffer and is the first element of
 * the returned chain.
 *
 * Parameters:
 *   rmid          - resource manager id; stored in xl_rmid and used to
 *                   index wal_consistency_checking[].
 *   info          - info bits stored in xl_info; XLR_CHECK_CONSISTENCY is
 *                   OR'ed in when wal_consistency_checking[rmid] is set.
 *   RedoRecPtr    - a block whose page LSN is <= this value gets a
 *                   full-page image (unless flags or doPageWrites decide
 *                   otherwise).
 *   doPageWrites  - when false, no LSN-based full-page images are taken.
 *   fpw_lsn       - output: reset to InvalidXLogRecPtr, then set to the
 *                   lowest page LSN among blocks for which the LSN
 *                   comparison decided that no full-page image is needed.
 *
 * Returns &hdr_rdt. Every field of the record header is filled in, but
 * xl_prev is only set to InvalidXLogRecPtr as a placeholder, and xl_crc
 * covers only the data after the fixed-size header (the header itself is
 * not yet included in the CRC).
 */
static XLogRecData *XLogRecordAssemble(RmgrId rmid, uint8 info, XLogRecPtr RedoRecPtr, bool doPageWrites, XLogRecPtr *fpw_lsn) {
XLogRecData *rdt;
uint32 total_len = 0;
int block_id;
pg_crc32c rdata_crc;
registered_buffer *prev_regbuf = NULL;
XLogRecData *rdt_datas_last; // tail of the rdata chain built so far
XLogRecord *rechdr;
char *scratch = hdr_scratch; // write cursor into the header scratch buffer

/* Note: this function can be called multiple times for the same record. All the modifications we do to the rdata chains below must handle that. */
// Reserve space for the fixed-size header; its fields are filled in at the end.
/* The record begins with the fixed-size header */
rechdr = (XLogRecord *) scratch;
scratch += SizeOfXLogRecord;
// hdr_rdt is the head of the chain and points at the scratch buffer;
// rdt_datas_last starts out pointing at it as the current tail.
hdr_rdt.next = NULL;
rdt_datas_last = &hdr_rdt;
hdr_rdt.data = hdr_scratch;

/* Enforce consistency checks for this record if user is looking for it.
* Do this at the beginning of this routine to give the possibility
* for callers of XLogInsert() to pass XLR_CHECK_CONSISTENCY directly for
* a record. */
// wal_consistency_checking[] is indexed by resource manager; when set, the
// record carries the information needed to verify page consistency at replay.
if (wal_consistency_checking[rmid]) info |= XLR_CHECK_CONSISTENCY;

处理XLogRegisterBuffer注册的Block

逐个处理XLogRegisterBuffer函数注册的各个Block

/* Make an rdata chain containing all the data portions of all block
* references. This includes the data for full-page images. Also append
* the headers for the block references in the scratch buffer. */
*fpw_lsn = InvalidXLogRecPtr;
for (block_id = 0; block_id < max_registered_block_id; block_id++) {
registered_buffer *regbuf = &registered_buffers[block_id];
bool needs_backup; // true if a full-page write (FPW) must be taken
bool needs_data;
XLogRecordBlockHeader bkpb; // header written for every block reference
XLogRecordBlockImageHeader bimg; // extra header, written only with a page image
XLogRecordBlockCompressHeader cbimg = {0}; // extra header for a compressed image that has a hole
bool samerel; // true if this block's relation equals the previous block's
bool is_compressed = false; // true once the page image has been compressed
bool include_image; // true if a page image goes into the record
if (!regbuf->in_use) continue; // in_use is set when the block is registered via XLogRegisterBuffer

/* Determine if this block needs to be backed up */
// Precedence: REGBUF_FORCE_IMAGE, then REGBUF_NO_IMAGE, then doPageWrites,
// and only then the comparison of the page LSN with RedoRecPtr.
if (regbuf->flags & REGBUF_FORCE_IMAGE) needs_backup = true;
else if (regbuf->flags & REGBUF_NO_IMAGE) needs_backup = false;
else if (!doPageWrites) needs_backup = false;
else {
/* We assume page LSN is first data on *every* page that can be passed to XLogInsert, whether it has the standard page layout or not. */
XLogRecPtr page_lsn = PageGetLSN(regbuf->page);
needs_backup = (page_lsn <= RedoRecPtr);
if (!needs_backup) {
// Track the lowest LSN among pages that were skipped by this comparison.
if (*fpw_lsn == InvalidXLogRecPtr || page_lsn < *fpw_lsn) *fpw_lsn = page_lsn;
}
}

/* Determine if the buffer data needs to be included */
if (regbuf->rdata_len == 0) needs_data = false; // nothing was registered for this block
else if ((regbuf->flags & REGBUF_KEEP_DATA) != 0) needs_data = true; // caller explicitly asked to keep the data
else needs_data = !needs_backup; // otherwise keep the data only when no FPW is taken

// Fill in the XLogRecordBlockHeader
bkpb.id = block_id;
bkpb.fork_flags = regbuf->forkno;
bkpb.data_length = 0;
if ((regbuf->flags & REGBUF_WILL_INIT) == REGBUF_WILL_INIT) bkpb.fork_flags |= BKPBLOCK_WILL_INIT;

/* If needs_backup is true or WAL checking is enabled for current resource manager, log a full-page write for the current block. */
// An image is included either as a real FPW or solely for the consistency check.
include_image = needs_backup || (info & XLR_CHECK_CONSISTENCY) != 0;
if (include_image) {
Page page = regbuf->page;
uint16 compressed_len = 0;
// A standard page has an unused "hole" between pd_lower and pd_upper;
// leaving it out of the image saves WAL space.
/* The page needs to be backed up, so calculate its hole length and offset. */
if (regbuf->flags & REGBUF_STANDARD) { // standard layout: record the hole's offset and length, if any
/* Assume we can omit data between pd_lower and pd_upper */
uint16 lower = ((PageHeader) page)->pd_lower;
uint16 upper = ((PageHeader) page)->pd_upper;
if (lower >= SizeOfPageHeaderData &&upper > lower &&upper <= BLCKSZ){
bimg.hole_offset = lower;
cbimg.hole_length = upper - lower;
}else{ /* No "hole" to remove */
bimg.hole_offset = 0;
cbimg.hole_length = 0;
}
} else {
/* Not a standard page header, don't try to eliminate "hole" */
bimg.hole_offset = 0;
cbimg.hole_length = 0;
}
// With wal_compression enabled, the page image stored in the record is compressed.
/* Try to compress a block image if wal_compression is enabled */
if (wal_compression)
is_compressed = XLogCompressBackupBlock(page, bimg.hole_offset, cbimg.hole_length, regbuf->compressed_page, &compressed_len);

/* Fill in the remaining fields in the XLogRecordBlockHeader struct */
bkpb.fork_flags |= BKPBLOCK_HAS_IMAGE;

// The XLogRegister* functions are not called at this stage, so the
// bkp_rdatas[] slots embedded in registered_buffer serve as chain elements.
/* Construct XLogRecData entries for the page content. */
rdt_datas_last->next = &regbuf->bkp_rdatas[0];
rdt_datas_last = rdt_datas_last->next;
bimg.bimg_info = (cbimg.hole_length == 0) ? 0 : BKPIMAGE_HAS_HOLE;

/* If WAL consistency checking is enabled for the resource manager
* of this WAL record, a full-page image is included in the record
* for the block modified. During redo, the full-page is replayed
* only if BKPIMAGE_APPLY is set. */
if (needs_backup)
bimg.bimg_info |= BKPIMAGE_APPLY;

if (is_compressed) { // case 1: compressed image; the hole offset/length were already passed to the compressor
bimg.length = compressed_len;
bimg.bimg_info |= BKPIMAGE_IS_COMPRESSED;
rdt_datas_last->data = regbuf->compressed_page;
rdt_datas_last->len = compressed_len;
}

PostgreSQL数据库WAL——日志合成XLogRecordAssemble_postgresql_03

else {
bimg.length = BLCKSZ - cbimg.hole_length;
if (cbimg.hole_length == 0){ // case 2: uncompressed, no hole: reference the whole page
rdt_datas_last->data = page;
rdt_datas_last->len = BLCKSZ;
}else{ /* must skip the hole */ // case 3: uncompressed with a hole: two chain slots, one on each side of the hole
rdt_datas_last->data = page;
rdt_datas_last->len = bimg.hole_offset;
rdt_datas_last->next = &regbuf->bkp_rdatas[1];
rdt_datas_last = rdt_datas_last->next;
rdt_datas_last->data = page + (bimg.hole_offset + cbimg.hole_length);
rdt_datas_last->len = BLCKSZ - (bimg.hole_offset + cbimg.hole_length);
}
}
total_len += bimg.length;
}

PostgreSQL数据库WAL——日志合成XLogRecordAssemble_sed_04


PostgreSQL数据库WAL——日志合成XLogRecordAssemble_big data_05

// Append the data registered for this buffer (via XLogRegisterBufData) to
// the chain. This is normally the alternative to an FPW: an FPW stores the
// whole page, so the per-block data is usually left out. REGBUF_KEEP_DATA
// overrides that (the original author cites logical decoding as a case
// that needs the data).
if (needs_data) {
/* Link the caller-supplied rdata chain for this buffer to the overall list. */
bkpb.fork_flags |= BKPBLOCK_HAS_DATA;
bkpb.data_length = regbuf->rdata_len; total_len += regbuf->rdata_len;
rdt_datas_last->next = regbuf->rdata_head; rdt_datas_last = regbuf->rdata_tail;
}
// If this block belongs to the same relation as the previously processed
// block, its RelFileNode can be left out of the header. Flag that here;
// the flag is acted on when the headers are copied below.
if (prev_regbuf && RelFileNodeEquals(regbuf->rnode, prev_regbuf->rnode)) {
samerel = true; bkpb.fork_flags |= BKPBLOCK_SAME_REL;
}
else
samerel = false;
prev_regbuf = regbuf;

/* Ok, copy the header to the scratch buffer: serialize this block's headers into hdr_scratch */
memcpy(scratch, &bkpb, SizeOfXLogRecordBlockHeader); // 1. the XLogRecordBlockHeader
scratch += SizeOfXLogRecordBlockHeader;
if (include_image) {
memcpy(scratch, &bimg, SizeOfXLogRecordBlockImageHeader); // 2. the XLogRecordBlockImageHeader
scratch += SizeOfXLogRecordBlockImageHeader;
if (cbimg.hole_length != 0 && is_compressed) {
memcpy(scratch, &cbimg, SizeOfXLogRecordBlockCompressHeader); // 3. the XLogRecordBlockCompressHeader
scratch += SizeOfXLogRecordBlockCompressHeader;
}
}
if (!samerel) { // the RelFileNode is written only when it differs from the previous block's
memcpy(scratch, &regbuf->rnode, sizeof(RelFileNode));
scratch += sizeof(RelFileNode);
}
memcpy(scratch, &regbuf->block, sizeof(BlockNumber));
scratch += sizeof(BlockNumber);
}

/* followed by the record's origin, if any */
if ((curinsert_flags & XLOG_INCLUDE_ORIGIN) && replorigin_session_origin != InvalidRepOriginId) {
*(scratch++) = (char) XLR_BLOCK_ID_ORIGIN;
memcpy(scratch, &replorigin_session_origin, sizeof(replorigin_session_origin));
scratch += sizeof(replorigin_session_origin);
}
// Only the length of the main data goes into the header buffer (1 byte if
// it is <= 255, otherwise a uint32); the main data itself is linked onto
// the chain that starts at hdr_rdt.
/* followed by main data, if any */
if (mainrdata_len > 0){
if (mainrdata_len > 255){
*(scratch++) = (char) XLR_BLOCK_ID_DATA_LONG;
memcpy(scratch, &mainrdata_len, sizeof(uint32));
scratch += sizeof(uint32);
}
else{
*(scratch++) = (char) XLR_BLOCK_ID_DATA_SHORT;
*(scratch++) = (uint8) mainrdata_len;
}
rdt_datas_last->next = mainrdata_head;
rdt_datas_last = mainrdata_last;
total_len += mainrdata_len;
}
// Terminate the chain explicitly; this function may run more than once for
// the same record, so a stale next pointer must not survive.
rdt_datas_last->next = NULL;
// Number of header bytes actually written to the scratch buffer.
hdr_rdt.len = (scratch - hdr_scratch);
total_len += hdr_rdt.len;

/* Calculate CRC of the data
* Note that the record header isn't added into the CRC initially since we
* don't know the prev-link yet. Thus, the CRC will represent the CRC of
* the whole record in the order: rdata, then backup blocks, then record
* header. */
INIT_CRC32C(rdata_crc);
// First the scratch bytes that follow the fixed-size header, then every
// data fragment on the chain after hdr_rdt.
COMP_CRC32C(rdata_crc, hdr_scratch + SizeOfXLogRecord, hdr_rdt.len - SizeOfXLogRecord);
for (rdt = hdr_rdt.next; rdt != NULL; rdt = rdt->next) COMP_CRC32C(rdata_crc, rdt->data, rdt->len);

/* Fill in the fields in the record header. Prev-link is filled in later,
* once we know where in the WAL the record will be inserted. The CRC does
* not include the record header yet. */
rechdr->xl_xid = GetCurrentTransactionIdIfAny();
rechdr->xl_tot_len = total_len;
rechdr->xl_info = info;
rechdr->xl_rmid = rmid;
rechdr->xl_prev = InvalidXLogRecPtr;
rechdr->xl_crc = rdata_crc;
return &hdr_rdt;
}

PostgreSQL数据库WAL——日志合成XLogRecordAssemble_big data_06


PostgreSQL数据库WAL——日志合成XLogRecordAssemble_数据_07

PostgreSQL数据库WAL——日志合成XLogRecordAssemble_sed_08


PostgreSQL数据库WAL——日志合成XLogRecordAssemble_数据库_09

PostgreSQL数据库WAL——日志合成XLogRecordAssemble_数据_10

PostgreSQL数据库WAL——日志合成XLogRecordAssemble_数据库_11