共计 11661 个字符,预计需要花费 30 分钟才能阅读完成。
这篇文章主要讲解了“PostgreSQL checkpoint 中用于刷一个脏 page 的函数是什么”,文中的讲解内容简单清晰,易于学习与理解,下面请大家跟着丸趣 TV 小编的思路慢慢深入,一起来研究和学习“PostgreSQL checkpoint 中用于刷一个脏 page 的函数是什么”吧!
一、数据结构
宏定义
checkpoints request flag bits, 检查点请求标记位定义.
/*
 * OR-able request flag bits for checkpoints. The cause bits are used only
 * for logging purposes. Note: the flags must be defined so that it's
 * sensible to OR together request flags arising from different requestors.
 *
 * Each flag occupies its own bit, so any combination can be carried in a
 * single integer flags word.
 */
/* These directly affect the behavior of CreateCheckPoint and subsidiaries */
#define CHECKPOINT_IS_SHUTDOWN 0x0001 /* Checkpoint is for shutdown */
#define CHECKPOINT_END_OF_RECOVERY 0x0002 /* Like shutdown checkpoint, but
 * issued at end of WAL recovery */
#define CHECKPOINT_IMMEDIATE 0x0004 /* Do it without delays */
#define CHECKPOINT_FORCE 0x0008 /* Force even if no activity */
#define CHECKPOINT_FLUSH_ALL 0x0010 /* Flush all pages, including those
 * belonging to unlogged tables */
/* These are important to RequestCheckpoint */
#define CHECKPOINT_WAIT 0x0020 /* Wait for completion */
#define CHECKPOINT_REQUESTED 0x0040 /* Checkpoint request has been made */
/* These indicate the cause of a checkpoint request */
#define CHECKPOINT_CAUSE_XLOG 0x0080 /* XLOG consumption */
#define CHECKPOINT_CAUSE_TIME 0x0100 /* Elapsed time */
二、源码解读
SyncOneBuffer, 在 syncing 期间处理一个 buffer, 其主要处理逻辑如下:
1. 获取 buffer 描述符
2. 锁定 buffer
3. 根据 buffer 状态和输入参数执行相关判断 / 处理
4. 钉住脏页, 上共享锁, 调用 FlushBuffer 刷盘
5. 解锁 / 解钉和其他收尾工作
/*
 * SyncOneBuffer -- process a single buffer during syncing.
 *
 * If skip_recently_used is true, we don't write currently-pinned buffers, nor
 * buffers marked recently used, as these are not replacement candidates.
 *
 * Returns a bitmask containing the following flag bits:
 *  BUF_WRITTEN: we wrote the buffer.
 *  BUF_REUSABLE: buffer is available for replacement, ie, it has
 *      pin count 0 and usage count 0.
 *
 * (BUF_WRITTEN could be set in error if FlushBuffer finds the buffer clean
 * after locking it, but we don't care all that much.)
 *
 * Note: caller must have done ResourceOwnerEnlargeBuffers.
 */
static int
SyncOneBuffer(int buf_id, bool skip_recently_used, WritebackContext *wb_context)
{
    BufferDesc *bufHdr = GetBufferDescriptor(buf_id);
    int         result = 0;
    uint32      buf_state;
    BufferTag   tag;

    ReservePrivateRefCountEntry();

    /*
     * Check whether buffer needs writing.
     *
     * We can make this check without taking the buffer content lock so long
     * as we mark pages dirty in access methods *before* logging changes with
     * XLogInsert(): if someone marks the buffer dirty just after our check we
     * don't worry because our checkpoint.redo points before log record for
     * upcoming changes and so we are not required to write such dirty buffer.
     */
    buf_state = LockBufHdr(bufHdr);

    if (BUF_STATE_GET_REFCOUNT(buf_state) == 0 &&
        BUF_STATE_GET_USAGECOUNT(buf_state) == 0)
    {
        /* Unpinned and not recently used: a replacement candidate */
        result |= BUF_REUSABLE;
    }
    else if (skip_recently_used)
    {
        /* Caller told us not to write recently-used buffers */
        UnlockBufHdr(bufHdr, buf_state);
        return result;
    }

    if (!(buf_state & BM_VALID) || !(buf_state & BM_DIRTY))
    {
        /* Buffer is invalid or not dirty: it's clean, so nothing to do */
        UnlockBufHdr(bufHdr, buf_state);
        return result;
    }

    /*
     * Pin it, share-lock it, write it. (FlushBuffer will do nothing if the
     * buffer is clean by the time we've locked it.)
     */
    PinBuffer_Locked(bufHdr);
    LWLockAcquire(BufferDescriptorGetContentLock(bufHdr), LW_SHARED);

    /*
     * FlushBuffer takes an smgr reference for the buffer's relation as its
     * second parameter if the caller has one; we don't, so pass NULL.
     */
    FlushBuffer(bufHdr, NULL);

    LWLockRelease(BufferDescriptorGetContentLock(bufHdr));

    /* Copy the tag while we still hold the pin; it is used after unpinning */
    tag = bufHdr->tag;

    UnpinBuffer(bufHdr, true);

    ScheduleBufferTagForWriteback(wb_context, tag);

    return result | BUF_WRITTEN;
}
FlushBuffer
FlushBuffer 函数物理上把共享缓存刷盘, 主要实现函数还是 smgrwrite(storage manager write).
/*
 * FlushBuffer
 *      Physically write out a shared buffer.
 *
 * NOTE: this actually just passes the buffer contents to the kernel; the
 * real write to disk won't happen until the kernel feels like it. This
 * is okay from our point of view since we can redo the changes from WAL.
 * However, we will need to force the changes to disk via fsync before
 * we can checkpoint WAL.
 *
 * The caller must hold a pin on the buffer and have share-locked the
 * buffer contents. (Note: a share-lock does not prevent updates of
 * hint bits in the buffer, so the page could change while the write
 * is in progress, but we assume that that will not invalidate the data
 * written.)
 *
 * If the caller has an smgr reference for the buffer's relation, pass it
 * as the second parameter. If not, pass NULL.
 */
static void
FlushBuffer(BufferDesc *buf, SMgrRelation reln)
{
    XLogRecPtr  recptr;
    ErrorContextCallback errcallback;
    instr_time  io_start,
                io_time;
    Block       bufBlock;
    char       *bufToWrite;
    uint32      buf_state;

    /*
     * Acquire the buffer's io_in_progress lock. If StartBufferIO returns
     * false, then someone else flushed the buffer before we could, so we need
     * not do anything.
     */
    if (!StartBufferIO(buf, false))
        return;

    /* Setup error traceback support for ereport() */
    errcallback.callback = shared_buffer_write_error_callback;
    errcallback.arg = (void *) buf;
    errcallback.previous = error_context_stack;
    error_context_stack = &errcallback;

    /* Find smgr relation for buffer */
    if (reln == NULL)
        reln = smgropen(buf->tag.rnode, InvalidBackendId);

    TRACE_POSTGRESQL_BUFFER_FLUSH_START(buf->tag.forkNum,
                                        buf->tag.blockNum,
                                        reln->smgr_rnode.node.spcNode,
                                        reln->smgr_rnode.node.dbNode,
                                        reln->smgr_rnode.node.relNode);

    buf_state = LockBufHdr(buf);

    /*
     * Run PageGetLSN while holding header lock, since we don't have the
     * buffer locked exclusively in all cases.
     */
    recptr = BufferGetLSN(buf);

    /* To check if block content changes while flushing. - vadim 01/17/97 */
    buf_state &= ~BM_JUST_DIRTIED;
    UnlockBufHdr(buf, buf_state);

    /*
     * Force XLOG flush up to buffer's LSN. This implements the basic WAL
     * rule that log updates must hit disk before any of the data-file changes
     * they describe do.
     *
     * However, this rule does not apply to unlogged relations, which will be
     * lost after a crash anyway. Most unlogged relation pages do not bear
     * LSNs since we never emit WAL records for them, and therefore flushing
     * up through the buffer LSN would be useless, but harmless. However,
     * GiST indexes use LSNs internally to track page-splits, and therefore
     * unlogged GiST pages bear fake LSNs generated by
     * GetFakeLSNForUnloggedRel. It is unlikely but possible that the fake
     * LSN counter could advance past the WAL insertion point; and if it did
     * happen, attempting to flush WAL through that location would fail, with
     * disastrous system-wide consequences. To make sure that can't happen,
     * skip the flush if the buffer isn't permanent.
     */
    if (buf_state & BM_PERMANENT)
        XLogFlush(recptr);

    /*
     * Now it's safe to write buffer to disk. Note that no one else should
     * have been able to write it while we were busy with log flushing because
     * we have the io_in_progress lock.
     */
    bufBlock = BufHdrGetBlock(buf);

    /*
     * Update page checksum if desired. Since we have only shared lock on the
     * buffer, other processes might be updating hint bits in it, so we must
     * copy the page to private storage if we do checksumming.
     */
    bufToWrite = PageSetChecksumCopy((Page) bufBlock, buf->tag.blockNum);

    if (track_io_timing)
        INSTR_TIME_SET_CURRENT(io_start);

    /*
     * bufToWrite is either the shared buffer or a copy, as appropriate.
     */
    smgrwrite(reln,
              buf->tag.forkNum,
              buf->tag.blockNum,
              bufToWrite,
              false);

    if (track_io_timing)
    {
        /* Elapsed write time = now - io_start, accumulated into the stats */
        INSTR_TIME_SET_CURRENT(io_time);
        INSTR_TIME_SUBTRACT(io_time, io_start);
        pgstat_count_buffer_write_time(INSTR_TIME_GET_MICROSEC(io_time));
        INSTR_TIME_ADD(pgBufferUsage.blk_write_time, io_time);
    }

    pgBufferUsage.shared_blks_written++;

    /*
     * Mark the buffer as clean (unless BM_JUST_DIRTIED has become set) and
     * end the io_in_progress state.
     */
    TerminateBufferIO(buf, true, 0);

    TRACE_POSTGRESQL_BUFFER_FLUSH_DONE(buf->tag.forkNum,
                                       buf->tag.blockNum,
                                       reln->smgr_rnode.node.spcNode,
                                       reln->smgr_rnode.node.dbNode,
                                       reln->smgr_rnode.node.relNode);

    /* Pop the error context stack */
    error_context_stack = errcallback.previous;
}
三、跟踪分析
测试脚本
testdb=# update t_wal_ckpt set c2 = 'C4#'||substr(c2,4,40);
UPDATE 1
testdb=# checkpoint;
跟踪分析
(gdb) handle SIGINT print nostop pass
SIGINT is used by the debugger.
Are you sure you want to change it? (y or n) y
Signal Stop Print Pass to program Description
SIGINT No Yes Yes Interrupt
(gdb) b SyncOneBuffer
Breakpoint 1 at 0x8a7167: file bufmgr.c, line 2357.
(gdb) c
Continuing.
Program received signal SIGINT, Interrupt.
Breakpoint 1, SyncOneBuffer (buf_id=0, skip_recently_used=false, wb_context=0x7fff27f5ae00) at bufmgr.c:2357
2357 BufferDesc *bufHdr = GetBufferDescriptor(buf_id);
(gdb) n
2358 int result = 0;
(gdb) p *bufHdr
$1 = {tag = {rnode = {spcNode = 1663, dbNode = 16384, relNode = 221290}, forkNum = MAIN_FORKNUM, blockNum = 0}, buf_id = 0,
state = {value = 3548905472}, wait_backend_pid = 0, freeNext = -2, content_lock = {tranche = 53, state = { value = 536870912}, waiters = {head = 2147483647, tail = 2147483647}}}
(gdb) n
2362 ReservePrivateRefCountEntry();
(gdb)
2373 buf_state = LockBufHdr(bufHdr);
(gdb)
2375 if (BUF_STATE_GET_REFCOUNT(buf_state) == 0 &&
(gdb)
2376 BUF_STATE_GET_USAGECOUNT(buf_state) == 0)
(gdb)
2375 if (BUF_STATE_GET_REFCOUNT(buf_state) == 0 &&
(gdb)
2380 else if (skip_recently_used)
(gdb)
2387 if (!(buf_state & BM_VALID) || !(buf_state & BM_DIRTY))
(gdb)
2398 PinBuffer_Locked(bufHdr);
(gdb) p buf_state
$2 = 3553099776
(gdb) n
2399 LWLockAcquire(BufferDescriptorGetContentLock(bufHdr), LW_SHARED);
(gdb)
2401 FlushBuffer(bufHdr, NULL);
(gdb) step
FlushBuffer (buf=0x7fedc4a68300, reln=0x0) at bufmgr.c:2687
2687 if (!StartBufferIO(buf, false))
(gdb) n
2691 errcallback.callback = shared_buffer_write_error_callback;
(gdb)
2692 errcallback.arg = (void *) buf;
(gdb)
2693 errcallback.previous = error_context_stack;
(gdb)
2694 error_context_stack = &errcallback;
(gdb)
2697 if (reln == NULL)
(gdb)
2698 reln = smgropen(buf->tag.rnode, InvalidBackendId);
(gdb)
2700 TRACE_POSTGRESQL_BUFFER_FLUSH_START(buf->tag.forkNum,
(gdb)
2706 buf_state = LockBufHdr(buf);
(gdb)
2712 recptr = BufferGetLSN(buf);
(gdb)
2715 buf_state &= ~BM_JUST_DIRTIED;
(gdb) p recptr
$3 = 16953421760
(gdb) n
2716 UnlockBufHdr(buf, buf_state);
(gdb)
2735 if (buf_state & BM_PERMANENT)
(gdb)
2736 XLogFlush(recptr);
(gdb)
2743 bufBlock = BufHdrGetBlock(buf);
(gdb)
2750 bufToWrite = PageSetChecksumCopy((Page) bufBlock, buf->tag.blockNum);
(gdb) p bufBlock
$4 = (Block) 0x7fedc4e68300
(gdb) n
2752 if (track_io_timing)
(gdb)
2758 smgrwrite(reln,
(gdb)
2764 if (track_io_timing)
(gdb)
2772 pgBufferUsage.shared_blks_written++;
(gdb)
2778 TerminateBufferIO(buf, true, 0);
(gdb)
2780 TRACE_POSTGRESQL_BUFFER_FLUSH_DONE(buf->tag.forkNum,
(gdb)
2787 error_context_stack = errcallback.previous;
(gdb)
2788 }
(gdb)
SyncOneBuffer (buf_id=0, skip_recently_used=false, wb_context=0x7fff27f5ae00) at bufmgr.c:2403
2403 LWLockRelease(BufferDescriptorGetContentLock(bufHdr));
(gdb)
2405 tag = bufHdr->tag;
(gdb)
2407 UnpinBuffer(bufHdr, true);
(gdb)
2409 ScheduleBufferTagForWriteback(wb_context, tag);
(gdb)
2411 return result | BUF_WRITTEN;
(gdb)
2412 }
(gdb)
感谢各位的阅读,以上就是“PostgreSQL checkpoint 中用于刷一个脏 page 的函数是什么”的内容了,经过本文的学习后,相信大家对 PostgreSQL checkpoint 中用于刷一个脏 page 的函数是什么这一问题有了更深刻的体会,具体使用情况还需要大家实践验证。这里是丸趣 TV,丸趣 TV 小编将为大家推送更多相关知识点的文章,欢迎关注!