PostgreSQL中Old Master节点分析

133次阅读

共计 10054 个字符，预计需要花费 26 分钟才能阅读完成。

本篇内容介绍了“PostgreSQL 中 Old Master 节点分析”的有关知识，在实际案例的操作过程中，不少人都会遇到这样的困境，接下来就让丸趣 TV 小编带领大家学习一下如何处理这些情况吧！希望大家仔细阅读，能够学有所成！

基于 streaming replication 搭建的 PostgreSQL HA 环境, 如出现网络访问 / 硬件故障等原因导致 Standby 节点升级为 Master 节点, 但 Old Master 节点数据库并未损坏, 在排除故障后 Old Master 节点可以通过 pg_rewind 工具而不需要通过备份的方式成为 New Master 节点的 Standby 节点.
在执行命令 pg_rewind 时, 到底做了什么?

零、原理

在 PostgreSQL HA 环境中,Standby 节点升级为 Master 节点后, 时间线会切换为新的时间线, 比如从 1 变为 2. 而 Old Master 节点的时间线仍然为原来的时间线, 比如仍为 1, 那么使用 pg_rewind 工具,Old Master 节点如何从 New Master 节点读取相关的数据成为新的 Standby 节点?
简单来说, 有以下几步:
1. 确定 New Master 和 Old Master 数据一致性的 Checkpoint 位置. 在该位置上,New Master 和 Old Master 数据完全一致. 这可以通过读取 New Master 节点的时间线历史文件获得, 该文件位于 $PGDATA/pg_wal/ 目录下, 文件名称为 XX.history
2.Old Master 节点根据上一步获取的 Checkpoint 读取本机日志文件 WAL Record, 获取在此 Checkpoint 之后出现变化的 Block, 并以链表的方式存储 Block 编号等信息
3. 根据第 2 步获取的 Block 信息从 New Master 节点拷贝相应的 Block, 替换 Old Master 节点相应的 Block
4. 拷贝 New Master 节点上除数据文件外的所有其他文件, 包括配置文件等 (如果拷贝数据文件, 与备份方式搭建区别不大)
5.Old Master 启动数据库, 应用从 Checkpoint 开始后的 WAL Record.

在执行主备切换后,New Master 节点的时间线切换为 n + 1, 通过 pg_rewind 可使 Old Master 在分叉点开始与 New Master 同步, 成为 New Standby 节点.

一、数据结构

XLogRecPtr
64bit 的 WAL Record 寻址空间地址.

/*
 * Pointer to a location in the XLOG. These pointers are 64 bits wide,
 * because we don't want them ever to overflow.
 */
typedef uint64 XLogRecPtr;

TimeLineID
时间线 ID

/* Identifier of a timeline. */
typedef uint32 TimeLineID;

二、源码解读

pg_rewind 的源码较为简单, 详细请参考注释.

int
main(int argc, char **argv)
 static struct option long_options[] = { { help , no_argument, NULL,  ?},
 {target-pgdata , required_argument, NULL,  D},
 {source-pgdata , required_argument, NULL, 1},
 {source-server , required_argument, NULL, 2},
 {version , no_argument, NULL,  V},
 {dry-run , no_argument, NULL,  n},
 {no-sync , no_argument, NULL,  N},
 {progress , no_argument, NULL,  P},
 {debug , no_argument, NULL, 3},
 {NULL, 0, NULL, 0}
 };// 命令选项
 int option_index;// 选项编号
 int c;// 字符 ASCII 码
 XLogRecPtr divergerec;// 分支点
 int lastcommontliIndex;
 XLogRecPtr chkptrec;//checkpoint Record 位置
 TimeLineID chkpttli;// 时间线
 XLogRecPtr chkptredo;checkpoint REDO 位置
 size_t size;
 char *buffer;// 缓冲区
 bool rewind_needed;// 是否需要 rewind
 XLogRecPtr endrec;// 结束点
 TimeLineID endtli;// 结束时间线
 ControlFileData ControlFile_new;// 新的控制文件
 set_pglocale_pgservice(argv[0], PG_TEXTDOMAIN(pg_rewind));
 progname = get_progname(argv[0]);
 /* Process command-line arguments */
 // 处理命令行参数
 if (argc   1)
 { if (strcmp(argv[1],  --help ) == 0 || strcmp(argv[1],  -? ) == 0)
 { usage(progname);
 exit(0);
 }
 if (strcmp(argv[1],  --version ) == 0 || strcmp(argv[1],  -V ) == 0)
 { puts( pg_rewind (PostgreSQL)   PG_VERSION);
 exit(0);
 }
 }
 while ((c = getopt_long(argc, argv,  D:nNP , long_options,  option_index)) != -1)
 { switch (c)
 {
 case  ? :
 fprintf(stderr, _( Try \ %s --help\  for more information.\n), progname);
 exit(1);
 case  P :
 showprogress = true;
 break;
 case  n :
 dry_run = true;
 break;
 case  N :
 do_sync = false;
 break;
 case 3:
 debug = true;
 break;
 case  D : /* -D or --target-pgdata */
 datadir_target = pg_strdup(optarg);
 break;
 case 1: /* --source-pgdata */
 datadir_source = pg_strdup(optarg);
 break;
 case 2: /* --source-server */
 connstr_source = pg_strdup(optarg);
 break;
 }
 }
 if (datadir_source == NULL   connstr_source == NULL)
 { fprintf(stderr, _( %s: no source specified (--source-pgdata or --source-server)\n ), progname);
 fprintf(stderr, _( Try \ %s --help\  for more information.\n), progname);
 exit(1);
 }
 if (datadir_source != NULL   connstr_source != NULL)
 { fprintf(stderr, _( %s: only one of --source-pgdata or --source-server can be specified\n), progname);
 fprintf(stderr, _( Try \ %s --help\  for more information.\n), progname);
 exit(1);
 }
 if (datadir_target == NULL)
 { fprintf(stderr, _( %s: no target data directory specified (--target-pgdata)\n ), progname);
 fprintf(stderr, _( Try \ %s --help\  for more information.\n), progname);
 exit(1);
 }
 if (optind   argc)
 { fprintf(stderr, _( %s: too many command-line arguments (first is \ %s\)\n ),
 progname, argv[optind]);
 fprintf(stderr, _( Try \ %s --help\  for more information.\n), progname);
 exit(1);
 }
 /*
 * Don t allow pg_rewind to be run as root, to avoid overwriting the
 * ownership of files in the data directory. We need only check for root
 * -- any other user won t have sufficient permissions to modify files in
 * the data directory.
 *  不需要以 root 用户运行 pg_rewind, 避免覆盖数据目录中的文件 owner.
 *  只需要检查 root 用户, 其他用户没有足够的权限更新数据目录中的文件.
 */
#ifndef WIN32
 if (geteuid() == 0)
 {
 //root 用户
 fprintf(stderr, _( cannot be executed by \ root\ \n));
 fprintf(stderr, _( You must run %s as the PostgreSQL superuser.\n),
 progname);
 exit(1);
 }
#endif
 get_restricted_token(progname);
 /* Set mask based on PGDATA permissions */
 // 根据 PGDATA 的权限设置权限 mask
 if (!GetDataDirectoryCreatePerm(datadir_target))
 { fprintf(stderr, _( %s: could not read permissions of directory \ %s\ : %s\n),
 progname, datadir_target, strerror(errno));
 exit(1);
 }
 umask(pg_mode_mask);
 /* Connect to remote server */
 // 连接到远程服务器
 if (connstr_source)
 libpqConnect(connstr_source);
 /*
 * Ok, we have all the options and we re ready to start. Read in all the
 * information we need from both clusters.
 *  现在, 我们有了相关的执行运行, 准备开始运行.
 *  从两个 db clusters 中读取所有需要的信息.
 */
 // 读取目标控制文件
 buffer = slurpFile(datadir_target,  global/pg_control ,  size);
 digestControlFile(ControlFile_target, buffer, size);
 pg_free(buffer);
 // 读取源控制文件
 buffer = fetchFile(global/pg_control ,  size);
 digestControlFile(ControlFile_source, buffer, size);
 pg_free(buffer);
 sanityChecks();
 /*
 * If both clusters are already on the same timeline, there s nothing to
 * do.
 *  如果两个 clusters 已经是同一个时间线, 没有什么好做的了, 报错.
 */
 if (ControlFile_target.checkPointCopy.ThisTimeLineID == ControlFile_source.checkPointCopy.ThisTimeLineID)
 { printf(_( source and target cluster are on the same timeline\n));
 rewind_needed = false;
 }
 else
 {
 // 找到分叉点
 findCommonAncestorTimeline(divergerec,  lastcommontliIndex);
 printf(_( servers diverged at WAL location %X/%X on timeline %u\n),
 (uint32) (divergerec   32), (uint32) divergerec,
 targetHistory[lastcommontliIndex].tli);
 /*
 * Check for the possibility that the target is in fact a direct
 * ancestor of the source. In that case, there is no divergent history
 * in the target that needs rewinding.
 *  检查目标是源的直接祖先的可能性.
 *  在这种情况下, 在需要调整的目标中就没有不同的历史.
 */
 if (ControlFile_target.checkPoint  = divergerec)
 {
 // 如果目标的 checkpoint    分叉点, 则需要 rewind
 rewind_needed = true;
 }
 else
 {
 // 目标的 checkpoint  =  分叉点
 XLogRecPtr chkptendrec;
 /* Read the checkpoint record on the target to see where it ends. */
 // 读取目标的 checkpoint 记录, 检查在哪结束?
 chkptendrec = readOneRecord(datadir_target,
 ControlFile_target.checkPoint,
 targetNentries - 1);
 /*
 * If the histories diverged exactly at the end of the shutdown
 * checkpoint record on the target, there are no WAL records in
 * the target that don t belong in the source s history, and no
 * rewind is needed.
 *  如果正好在 shutdown checkpoint Record 处出现分叉,
 *  那么在目标 cluster 中没有 WAL Record 属于源 cluster 历史,
 *  不需要进行 rewind 操作, 否则需要 rewind.
 */
 if (chkptendrec == divergerec)
 rewind_needed = false;
 else
 rewind_needed = true;
 }
 }
 if (!rewind_needed)
 {
 // 不需要 rewind, 退出
 printf(_( no rewind required\n));
 exit(0);
 }
 // 找到目标 cluster 最后的 checkpoint 点
 findLastCheckpoint(datadir_target, divergerec,
 lastcommontliIndex,
  chkptrec,  chkpttli,  chkptredo);
 printf(_( rewinding from last common checkpoint at %X/%X on timeline %u\n),
 (uint32) (chkptrec   32), (uint32) chkptrec,
 chkpttli);
 /*
 * Build the filemap, by comparing the source and target data directories.
 *  通过对比源和目标数据目录构建 filemap
 */
 // 创建 filemap
 filemap_create();
 pg_log(PG_PROGRESS,  reading source file list\n 
 fetchSourceFileList();
 pg_log(PG_PROGRESS,  reading target file list\n 
 traverse_datadir(datadir_target,  process_target_file);
 /*
 * Read the target WAL from last checkpoint before the point of fork, to
 * extract all the pages that were modified on the target cluster after
 * the fork. We can stop reading after reaching the final shutdown record.
 * XXX: If we supported rewinding a server that was not shut down cleanly,
 * we would need to replay until the end of WAL here.
 *  从在分叉点之前的最后一个 checkpoint 开始读取目标 WAL Record,
 *  提取目标 cluster 上在分叉后所有被修改的 pages.
 *  在到达最后一个 shutdown record 时停止读取.
 * XXX:  如果我们支持非正常关闭的数据库 rewind, 需要在这里重放 WAL Record 到 WAL 的末尾.
 */
 // 构造 filemap
 pg_log(PG_PROGRESS,  reading WAL in target\n 
 extractPageMap(datadir_target, chkptrec, lastcommontliIndex,
 ControlFile_target.checkPoint);
 filemap_finalize();
 if (showprogress)
 calculate_totals();
 /* this is too verbose even for verbose mode */
 // 如为 debug 模式, 则打印 filemap
 if (debug)
 print_filemap();
 /*
 * Ok, we re ready to start copying things over.
 *  现在可以开始拷贝了.
 */
 if (showprogress)
 { pg_log(PG_PROGRESS,  need to copy %lu MB (total source directory size is %lu MB)\n ,
 (unsigned long) (filemap- fetch_size / (1024 * 1024)),
 (unsigned long) (filemap- total_size / (1024 * 1024)));
 fetch_size = filemap- fetch_size;
 fetch_done = 0;
 }
 /*
 * This is the point of no return. Once we start copying things, we have
 * modified the target directory and there is no turning back!
 *  到了这里, 已无回头路可走了.
 *  一旦开始拷贝, 就必须更新目标路径, 无法回头!
 */
 //
 executeFileMap();
 progress_report(true);
 // 创建 backup_label 文件并更新控制文件
 pg_log(PG_PROGRESS,  \ncreating backup label and updating control file\n 
 createBackupLabel(chkptredo, chkpttli, chkptrec);
 /*
 * Update control file of target. Make it ready to perform archive
 * recovery when restarting.
 *  更新目标控制文件. 在重启时可执行归档恢复.
 *
 * minRecoveryPoint is set to the current WAL insert location in the
 * source server. Like in an online backup, it s important that we recover
 * all the WAL that was generated while we copied the files over.
 * minRecoveryPoint 设置为目标服务器上当前 WAL 插入的位置.
 *  与在线 backup 类似, 在拷贝和覆盖文件时根据所有生成的 WAL 日志进行恢复是很重要的.
 */
 // 更新控制文件
 memcpy(ControlFile_new,  ControlFile_source, sizeof(ControlFileData));
 if (connstr_source)
 {
 // 获取源 WAL 插入的位置
 endrec = libpqGetCurrentXlogInsertLocation();
 // 获取时间线
 endtli = ControlFile_source.checkPointCopy.ThisTimeLineID;
 }
 else
 {
 endrec = ControlFile_source.checkPoint;
 endtli = ControlFile_source.checkPointCopy.ThisTimeLineID;
 }
 // 更新控制文件
 ControlFile_new.minRecoveryPoint = endrec;
 ControlFile_new.minRecoveryPointTLI = endtli;
 ControlFile_new.state = DB_IN_ARCHIVE_RECOVERY;
 update_controlfile(datadir_target, progname,  ControlFile_new, do_sync);
 pg_log(PG_PROGRESS,  syncing target data directory\n 
 // 同步数据目录 (除数据文件之外)
 syncTargetDirectory();
 printf(_( Done!\n));
 return 0;
}

“PostgreSQL 中 Old Master 节点分析”的内容就介绍到这里了，感谢大家的阅读。如果想了解更多行业相关的知识可以关注丸趣 TV 网站，丸趣 TV 小编将为大家输出更多高质量的实用文章！

正文完