1 /*-------------------------------------------------------------------------
4 * Synchronizes a PostgreSQL data directory to a new timeline
6 * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
8 *-------------------------------------------------------------------------
10 #include "postgres_fe.h"
17 #include "pg_rewind.h"
22 #include "access/timeline.h"
23 #include "access/xlog_internal.h"
24 #include "catalog/catversion.h"
25 #include "catalog/pg_control.h"
26 #include "common/controldata_utils.h"
27 #include "common/file_perm.h"
28 #include "common/file_utils.h"
29 #include "common/restricted_token.h"
30 #include "getopt_long.h"
31 #include "storage/bufpage.h"
/* Prototypes of file-local (static) helpers whose bodies appear later in this file. */
33 static void usage(const char *progname);
35 static void createBackupLabel(XLogRecPtr startpoint, TimeLineID starttli,
36 XLogRecPtr checkpointloc);
/* NOTE(review): the continuation line of the next prototype is not part of this listing. */
38 static void digestControlFile(ControlFileData *ControlFile, char *source,
40 static void syncTargetDirectory(void);
41 static void sanityChecks(void);
42 static void findCommonAncestorTimeline(XLogRecPtr *recptr, int *tliIndex);
/* In-memory copies of the control data of each cluster; filled in through digestControlFile(). */
44 static ControlFileData ControlFile_target;
45 static ControlFileData ControlFile_source;
50 /* Configuration options */
/* Set in main() from -D/--target-pgdata, --source-pgdata and --source-server respectively. */
51 char *datadir_target = NULL;
52 char *datadir_source = NULL;
53 char *connstr_source = NULL;
55 static bool debug = false;
56 bool showprogress = false;
/* Timeline history of the target; assigned in findCommonAncestorTimeline(). */
61 TimeLineHistoryEntry *targetHistory;
64 /* Progress counters */
/*
 * Print the command-line help text to stdout.
 *
 * progname: program name substituted into the first two help lines.
 *
 * All text goes through _() so that it can be translated.
 */
70 usage(const char *progname)
72 printf(_("%s resynchronizes a PostgreSQL cluster with another copy of the cluster.\n\n"), progname);
73 printf(_("Usage:\n %s [OPTION]...\n\n"), progname);
74 printf(_("Options:\n"));
75 printf(_(" -D, --target-pgdata=DIRECTORY existing data directory to modify\n"));
76 printf(_(" --source-pgdata=DIRECTORY source data directory to synchronize with\n"));
77 printf(_(" --source-server=CONNSTR source server to synchronize with\n"));
78 printf(_(" -n, --dry-run stop before modifying anything\n"));
79 printf(_(" -N, --no-sync do not wait for changes to be written\n"
80 " safely to disk\n"));
81 printf(_(" -P, --progress write progress messages\n"));
82 printf(_(" --debug write a lot of debug messages\n"));
83 printf(_(" -V, --version output version information, then exit\n"));
84 printf(_(" -?, --help show this help, then exit\n"));
85 printf(_("\nReport bugs to <pgsql-bugs@lists.postgresql.org>.\n"));
/*
 * Entry point of pg_rewind.
 *
 * Visible steps: parse and validate the command line, read the control
 * files of both clusters, decide whether the target needs rewinding
 * (rewind_needed), build the file map and page map, then write a backup
 * label and an updated control file and sync the target data directory.
 *
 * NOTE(review): the embedded line numbers skip, so braces, several
 * declarations and some branch conditions of this function are absent from
 * this listing.  The comments added here describe only visible statements.
 */
90 main(int argc, char **argv)
/* Long options; the values 1, 2 and 3 mark options that have no single-letter form. */
92 static struct option long_options[] = {
93 {"help", no_argument, NULL, '?'},
94 {"target-pgdata", required_argument, NULL, 'D'},
95 {"source-pgdata", required_argument, NULL, 1},
96 {"source-server", required_argument, NULL, 2},
97 {"version", no_argument, NULL, 'V'},
98 {"dry-run", no_argument, NULL, 'n'},
99 {"no-sync", no_argument, NULL, 'N'},
100 {"progress", no_argument, NULL, 'P'},
101 {"debug", no_argument, NULL, 3},
106 XLogRecPtr divergerec;
107 int lastcommontliIndex;
110 XLogRecPtr chkptredo;
116 ControlFileData ControlFile_new;
/* Logging, locale and program name are all derived from argv[0]. */
118 pg_logging_init(argv[0]);
119 set_pglocale_pgservice(argv[0], PG_TEXTDOMAIN("pg_rewind"));
120 progname = get_progname(argv[0]);
122 /* Process command-line arguments */
/* --help/-? and --version/-V are checked against argv[1] before option parsing starts. */
125 if (strcmp(argv[1], "--help") == 0 || strcmp(argv[1], "-?") == 0)
130 if (strcmp(argv[1], "--version") == 0 || strcmp(argv[1], "-V") == 0)
132 puts("pg_rewind (PostgreSQL) " PG_VERSION);
137 while ((c = getopt_long(argc, argv, "D:nNP", long_options, &option_index)) != -1)
142 fprintf(stderr, _("Try \"%s --help\" for more information.\n"), progname);
159 pg_logging_set_level(PG_LOG_DEBUG);
162 case 'D': /* -D or --target-pgdata */
163 datadir_target = pg_strdup(optarg);
166 case 1: /* --source-pgdata */
167 datadir_source = pg_strdup(optarg);
169 case 2: /* --source-server */
170 connstr_source = pg_strdup(optarg);
/* Exactly one kind of source (directory or connection string) and a target directory are required. */
175 if (datadir_source == NULL && connstr_source == NULL)
177 pg_log_error("no source specified (--source-pgdata or --source-server)");
178 fprintf(stderr, _("Try \"%s --help\" for more information.\n"), progname);
182 if (datadir_source != NULL && connstr_source != NULL)
184 pg_log_error("only one of --source-pgdata or --source-server can be specified");
185 fprintf(stderr, _("Try \"%s --help\" for more information.\n"), progname);
189 if (datadir_target == NULL)
191 pg_log_error("no target data directory specified (--target-pgdata)");
192 fprintf(stderr, _("Try \"%s --help\" for more information.\n"), progname);
198 pg_log_error("too many command-line arguments (first is \"%s\")",
200 fprintf(stderr, _("Try \"%s --help\" for more information.\n"), progname);
205 * Don't allow pg_rewind to be run as root, to avoid overwriting the
206 * ownership of files in the data directory. We need only check for root
207 * -- any other user won't have sufficient permissions to modify files in
208 * the data directory.
213 pg_log_error("cannot be executed by \"root\"");
214 fprintf(stderr, _("You must run %s as the PostgreSQL superuser.\n"),
220 get_restricted_token();
222 /* Set mask based on PGDATA permissions */
223 if (!GetDataDirectoryCreatePerm(datadir_target))
225 pg_log_error("could not read permissions of directory \"%s\": %m",
232 /* Connect to remote server */
/* NOTE(review): the condition guarding this call is not visible in this listing. */
234 libpqConnect(connstr_source);
237 * Ok, we have all the options and we're ready to start. Read in all the
238 * information we need from both clusters.
/* The target control file is read from its data directory; the source one is obtained through fetchFile(). */
240 buffer = slurpFile(datadir_target, "global/pg_control", &size);
241 digestControlFile(&ControlFile_target, buffer, size);
244 buffer = fetchFile("global/pg_control", &size);
245 digestControlFile(&ControlFile_source, buffer, size);
251 * If both clusters are already on the same timeline, there's nothing to
254 if (ControlFile_target.checkPointCopy.ThisTimeLineID == ControlFile_source.checkPointCopy.ThisTimeLineID)
256 pg_log_info("source and target cluster are on the same timeline");
257 rewind_needed = false;
/* Different timelines: locate the divergence point and report it as two 32-bit halves. */
261 findCommonAncestorTimeline(&divergerec, &lastcommontliIndex);
262 pg_log_info("servers diverged at WAL location %X/%X on timeline %u",
263 (uint32) (divergerec >> 32), (uint32) divergerec,
264 targetHistory[lastcommontliIndex].tli);
267 * Check for the possibility that the target is in fact a direct
268 * ancestor of the source. In that case, there is no divergent history
269 * in the target that needs rewinding.
271 if (ControlFile_target.checkPoint >= divergerec)
273 rewind_needed = true;
277 XLogRecPtr chkptendrec;
279 /* Read the checkpoint record on the target to see where it ends. */
280 chkptendrec = readOneRecord(datadir_target,
281 ControlFile_target.checkPoint,
285 * If the histories diverged exactly at the end of the shutdown
286 * checkpoint record on the target, there are no WAL records in
287 * the target that don't belong in the source's history, and no
290 if (chkptendrec == divergerec)
291 rewind_needed = false;
293 rewind_needed = true;
299 pg_log_info("no rewind required");
/* Outputs: location, timeline and redo pointer of the checkpoint to rewind from. */
303 findLastCheckpoint(datadir_target, divergerec,
305 &chkptrec, &chkpttli, &chkptredo);
306 pg_log_info("rewinding from last common checkpoint at %X/%X on timeline %u",
307 (uint32) (chkptrec >> 32), (uint32) chkptrec,
311 * Build the filemap, by comparing the source and target data directories.
315 pg_log_info("reading source file list");
316 fetchSourceFileList();
318 pg_log_info("reading target file list");
319 traverse_datadir(datadir_target, &process_target_file);
322 * Read the target WAL from last checkpoint before the point of fork, to
323 * extract all the pages that were modified on the target cluster after
324 * the fork. We can stop reading after reaching the final shutdown record.
325 * XXX: If we supported rewinding a server that was not shut down cleanly,
326 * we would need to replay until the end of WAL here.
329 pg_log_info("reading WAL in target");
330 extractPageMap(datadir_target, chkptrec, lastcommontliIndex,
331 ControlFile_target.checkPoint);
337 /* this is too verbose even for verbose mode */
342 * Ok, we're ready to start copying things over.
/* Sizes are reported in whole megabytes (integer division by 1024 * 1024). */
346 pg_log_info("need to copy %lu MB (total source directory size is %lu MB)",
347 (unsigned long) (filemap->fetch_size / (1024 * 1024)),
348 (unsigned long) (filemap->total_size / (1024 * 1024)));
350 fetch_size = filemap->fetch_size;
355 * This is the point of no return. Once we start copying things, we have
356 * modified the target directory and there is no turning back!
/* force = true bypasses the once-per-second throttle in progress_report(). */
361 progress_report(true);
365 pg_log_info("creating backup label and updating control file");
366 createBackupLabel(chkptredo, chkpttli, chkptrec);
369 * Update control file of target. Make it ready to perform archive
370 * recovery when restarting.
372 * minRecoveryPoint is set to the current WAL insert location in the
373 * source server. Like in an online backup, it's important that we recover
374 * all the WAL that was generated while we copied the files over.
/* Start from a copy of the source control data, then override the recovery fields below. */
376 memcpy(&ControlFile_new, &ControlFile_source, sizeof(ControlFileData));
/* NOTE(review): the condition choosing between the two endrec/endtli assignments is not visible in this listing. */
380 endrec = libpqGetCurrentXlogInsertLocation();
381 endtli = ControlFile_source.checkPointCopy.ThisTimeLineID;
385 endrec = ControlFile_source.checkPoint;
386 endtli = ControlFile_source.checkPointCopy.ThisTimeLineID;
388 ControlFile_new.minRecoveryPoint = endrec;
389 ControlFile_new.minRecoveryPointTLI = endtli;
390 ControlFile_new.state = DB_IN_ARCHIVE_RECOVERY;
391 update_controlfile(datadir_target, &ControlFile_new, do_sync);
394 pg_log_info("syncing target data directory");
395 syncTargetDirectory();
397 pg_log_info("Done!");
/*
 * NOTE(review): presumably the body of sanityChecks(); the signature line is
 * not part of this listing -- confirm.
 *
 * Every check below compares fields of ControlFile_target and/or
 * ControlFile_source and aborts through pg_fatal() on failure, so both
 * control files have to be loaded before these statements run.
 */
405 /* TODO Check that there's no backup_label in either cluster */
407 /* Check system_id match */
408 if (ControlFile_target.system_identifier != ControlFile_source.system_identifier)
409 pg_fatal("source and target clusters are from different systems");
/* Both clusters must match the control-file and catalog versions this binary was built with. */
412 if (ControlFile_target.pg_control_version != PG_CONTROL_VERSION ||
413 ControlFile_source.pg_control_version != PG_CONTROL_VERSION ||
414 ControlFile_target.catalog_version_no != CATALOG_VERSION_NO ||
415 ControlFile_source.catalog_version_no != CATALOG_VERSION_NO)
417 pg_fatal("clusters are not compatible with this version of pg_rewind");
421 * Target cluster need to use checksums or hint bit wal-logging, this to
422 * prevent from data corruption that could occur because of hint bits.
/* Fails only when neither data checksums nor wal_log_hints is enabled on the target. */
424 if (ControlFile_target.data_checksum_version != PG_DATA_CHECKSUM_VERSION &&
425 !ControlFile_target.wal_log_hints)
427 pg_fatal("target server needs to use either data checksums or \"wal_log_hints = on\"");
431 * Target cluster better not be running. This doesn't guard against
432 * someone starting the cluster concurrently. Also, this is probably more
433 * strict than necessary; it's OK if the target node was not shut down
434 * cleanly, as long as it isn't running at the moment.
436 if (ControlFile_target.state != DB_SHUTDOWNED &&
437 ControlFile_target.state != DB_SHUTDOWNED_IN_RECOVERY)
438 pg_fatal("target server must be shut down cleanly");
441 * When the source is a data directory, also require that the source
442 * server is shut down. There isn't any very strong reason for this
443 * limitation, but better safe than sorry.
/* The state check on the source is skipped when datadir_source is NULL. */
445 if (datadir_source &&
446 ControlFile_source.state != DB_SHUTDOWNED &&
447 ControlFile_source.state != DB_SHUTDOWNED_IN_RECOVERY)
448 pg_fatal("source data directory must be shut down cleanly");
/*
 * force: when true, the report is printed even if one was already printed
 *        for the current value of "now".
 *
 * Output goes to stderr.  NOTE(review): some declarations and the
 * assignment of "now" are not part of this listing.
 */
452 * Print a progress report based on the fetch_size and fetch_done variables.
454 * Progress report is written at maximum once per second, unless the
455 * force parameter is set to true.
458 progress_report(bool force)
/* Static, so the time of the previous report survives between calls. */
460 static pg_time_t last_progress_report = 0;
462 char fetch_done_str[32];
463 char fetch_size_str[32];
470 if (now == last_progress_report && !force)
471 return; /* Max once per second */
473 last_progress_report = now;
/* The conditional avoids a division by zero when fetch_size is 0. */
474 percent = fetch_size ? (int) ((fetch_done) * 100 / fetch_size) : 0;
477 * Avoid overflowing past 100% or the full size. This may make the total
478 * size number change as we approach the end of the backup (the estimate
479 * will always be wrong if WAL is included), but that's better than having
480 * the done column be bigger than the total.
484 if (fetch_done > fetch_size)
485 fetch_size = fetch_done;
488 * Separate step to keep platform-dependent format code out of
489 * translatable strings. And we only test for INT64_FORMAT availability
490 * in snprintf, not fprintf.
492 snprintf(fetch_done_str, sizeof(fetch_done_str), INT64_FORMAT,
494 snprintf(fetch_size_str, sizeof(fetch_size_str), INT64_FORMAT,
/* The done figure is padded to the width of the total via the %*s conversion. */
497 fprintf(stderr, _("%*s/%s kB (%d%%) copied"),
498 (int) strlen(fetch_size_str), fetch_done_str, fetch_size_str,
/* A carriage return is written when stderr is a terminal; presumably the newline below is the else branch -- confirm. */
500 if (isatty(fileno(stderr)))
501 fprintf(stderr, "\r");
503 fprintf(stderr, "\n");
507 * Find minimum from two WAL locations assuming InvalidXLogRecPtr means
508 * infinity as src/include/access/timeline.h states. This routine should
509 * be used only when comparing WAL locations related to history files.
/*
 * a, b: the two WAL locations to compare.
 *
 * NOTE(review): the return statements are not part of this listing.  Given
 * the "invalid means infinity" rule stated above, each branch presumably
 * returns the other argument, and the smaller one when both are valid --
 * confirm.
 */
512 MinXLogRecPtr(XLogRecPtr a, XLogRecPtr b)
514 if (XLogRecPtrIsInvalid(a))
516 else if (XLogRecPtrIsInvalid(b))
523 * Retrieve the timeline history for the given control file, which must be
524 * either ControlFile_source or ControlFile_target.
/*
 * controlFile: address of ControlFile_source or ControlFile_target; it
 *              selects where the history file is read from (fetchFile() for
 *              the source, slurpFile() on datadir_target for the target).
 * nentries:    output, passed on to rewind_parseTimeLineHistory() to receive
 *              the number of entries in the returned history.
 *
 * Returns the history array for the timeline recorded in the control file.
 *
 * NOTE(review): the embedded line numbers skip, so braces, some declarations
 * and some branch keywords of this function are absent from this listing.
 */
526 static TimeLineHistoryEntry *
527 getTimelineHistory(ControlFileData *controlFile, int *nentries)
529 TimeLineHistoryEntry *history;
532 tli = controlFile->checkPointCopy.ThisTimeLineID;
535 * Timeline 1 does not have a history file, so there is nothing to read;
536 * fake a single entry with infinite start and end positions instead.
540 history = (TimeLineHistoryEntry *) pg_malloc(sizeof(TimeLineHistoryEntry));
542 history->begin = history->end = InvalidXLogRecPtr;
547 char path[MAXPGPATH];
550 TLHistoryFilePath(path, tli);
552 /* Get history file from appropriate source */
553 if (controlFile == &ControlFile_source)
554 histfile = fetchFile(path, NULL);
555 else if (controlFile == &ControlFile_target)
556 histfile = slurpFile(datadir_target, path, NULL);
/* No trailing newline in the message, matching every other pg_fatal() call in this file. */
558 pg_fatal("invalid control file");
560 history = rewind_parseTimeLineHistory(histfile, tli, nentries);
568 if (controlFile == &ControlFile_source)
569 pg_log_debug("Source timeline history:");
570 else if (controlFile == &ControlFile_target)
571 pg_log_debug("Target timeline history:");
576 * Print the timeline history that was just read (source or target).
/*
 * Iterate over *nentries, the count belonging to this history.  The global
 * targetNentries is only correct for the target, and it has not been set
 * yet when findCommonAncestorTimeline() fetches the source history first.
 */
578 for (i = 0; i < *nentries; i++)
580 TimeLineHistoryEntry *entry;
/* Timeline IDs are printed with %u, as in the divergence message in main(). */
583 pg_log_debug("%u: %X/%X - %X/%X", entry->tli,
584 (uint32) (entry->begin >> 32), (uint32) (entry->begin),
585 (uint32) (entry->end >> 32), (uint32) (entry->end));
593 * Determine the TLI of the last common timeline in the timeline history of the
594 * two clusters. targetHistory is filled with target timeline history and
595 * targetNentries is number of items in targetHistory. *tliIndex is set to the
596 * index of last common timeline in targetHistory array, and *recptr is set to
597 * the position where the timeline history diverged (ie. the first WAL record
598 * that's not the same in both clusters).
600 * Control files of both clusters must be read into ControlFile_target/source
601 * before calling this routine.
/*
 * NOTE(review): the embedded line numbers skip, so declarations, braces and
 * the statements between the loop and the final pg_fatal() are only partly
 * visible in this listing.
 */
604 findCommonAncestorTimeline(XLogRecPtr *recptr, int *tliIndex)
607 TimeLineHistoryEntry *sourceHistory;
611 /* Retrieve timelines for both source and target */
/* The source history is fetched before the target one, i.e. before targetNentries is filled in. */
612 sourceHistory = getTimelineHistory(&ControlFile_source, &sourceNentries);
613 targetHistory = getTimelineHistory(&ControlFile_target, &targetNentries);
616 * Trace the history forward, until we hit the timeline diverge. It may
617 * still be possible that the source and target nodes used the same
618 * timeline number in their history but with different start position
619 * depending on the history files that each node has fetched in previous
620 * recovery processes. Hence check the start position of the new timeline
621 * as well and move down by one extra timeline entry if they do not match.
/* Only the entries present in both histories can be compared. */
623 n = Min(sourceNentries, targetNentries);
624 for (i = 0; i < n; i++)
626 if (sourceHistory[i].tli != targetHistory[i].tli ||
627 sourceHistory[i].begin != targetHistory[i].begin)
/* The smaller of the two end positions is used; MinXLogRecPtr() is documented to treat an invalid end as infinity. */
634 *recptr = MinXLogRecPtr(sourceHistory[i].end, targetHistory[i].end);
/* Only the source history is released here; targetHistory is a global that main() reads afterwards. */
637 pg_free(sourceHistory);
/* NOTE(review): the condition under which this point is reached is not visible in this listing. */
642 pg_fatal("could not find common ancestor of the source and target cluster's timelines");
648 * Create a backup_label file that forces recovery to begin at the last common checkpoint.
/*
 * Format a backup label into a local buffer and write it to the target as
 * "backup_label" through open_target_file() and write_target_range().
 *
 * startpoint:    printed as START WAL LOCATION; also determines the WAL
 *                segment whose file name is embedded in that line.
 * starttli:      timeline used when building that WAL file name.
 * checkpointloc: printed as CHECKPOINT LOCATION.
 *
 * NOTE(review): the embedded line numbers skip, so some declarations and
 * part of the snprintf() argument list are absent from this listing.
 */
652 createBackupLabel(XLogRecPtr startpoint, TimeLineID starttli, XLogRecPtr checkpointloc)
654 XLogSegNo startsegno;
657 char xlogfilename[MAXFNAMELEN];
/* Map the start location to its segment number, then to that segment's file name. */
662 XLByteToSeg(startpoint, startsegno, WalSegSz);
663 XLogFileName(xlogfilename, starttli, startsegno, WalSegSz);
666 * Construct backup label file
668 stamp_time = time(NULL);
/* NOTE(review): localtime() can return NULL; tmp is passed to strftime() without a check. */
669 tmp = localtime(&stamp_time);
670 strftime(strfbuf, sizeof(strfbuf), "%Y-%m-%d %H:%M:%S %Z", tmp);
672 len = snprintf(buf, sizeof(buf),
673 "START WAL LOCATION: %X/%X (file %s)\n"
674 "CHECKPOINT LOCATION: %X/%X\n"
675 "BACKUP METHOD: pg_rewind\n"
676 "BACKUP FROM: standby\n"
678 /* omit LABEL: line */
679 (uint32) (startpoint >> 32), (uint32) startpoint, xlogfilename,
680 (uint32) (checkpointloc >> 32), (uint32) checkpointloc,
/* snprintf() returns the length it needed, so a value >= sizeof(buf) means the label was truncated. */
682 if (len >= sizeof(buf))
683 pg_fatal("backup label buffer too small"); /* shouldn't happen */
685 /* TODO: move old file out of the way, if any. */
686 open_target_file("backup_label", true); /* BACKUP_LABEL_FILE */
687 write_target_range(buf, 0, len);
692 * Check CRC of control file
/*
 * ControlFile: control data to verify.
 *
 * The CRC is computed over the bytes that precede the crc member
 * (offsetof(ControlFileData, crc)) and compared with the stored value; a
 * mismatch aborts through pg_fatal().
 *
 * NOTE(review): the declaration and the initialization/finalization of
 * "crc" are not part of this listing.
 */
695 checkControlFile(ControlFileData *ControlFile)
701 COMP_CRC32C(crc, (char *) ControlFile, offsetof(ControlFileData, crc));
704 /* And simply compare it */
705 if (!EQ_CRC32C(crc, ControlFile->crc))
706 pg_fatal("unexpected control file CRC");
710 * Verify control file contents in the buffer src, and copy it to *ControlFile.
/*
 * ControlFile: destination structure.
 * src:         raw bytes of a pg_control file.
 * size:        length of src; must equal PG_CONTROL_FILE_SIZE.
 *
 * Side effect: the global WalSegSz is overwritten with the xlog_seg_size of
 * the file being digested, so after several calls it reflects the most
 * recently digested control file.
 */
713 digestControlFile(ControlFileData *ControlFile, char *src, size_t size)
715 if (size != PG_CONTROL_FILE_SIZE)
716 pg_fatal("unexpected control file size %d, expected %d",
717 (int) size, PG_CONTROL_FILE_SIZE);
/* Only the leading sizeof(ControlFileData) bytes of the buffer are copied. */
719 memcpy(ControlFile, src, sizeof(ControlFileData));
721 /* set and validate WalSegSz */
722 WalSegSz = ControlFile->xlog_seg_size;
724 if (!IsValidWalSegSize(WalSegSz))
/* NOTE(review): the trailing ngettext()/pg_fatal() arguments are not part of this listing. */
725 pg_fatal(ngettext("WAL segment size must be a power of two between 1 MB and 1 GB, but the control file specifies %d byte",
726 "WAL segment size must be a power of two between 1 MB and 1 GB, but the control file specifies %d bytes",
730 /* Additional checks on control file */
/* The CRC is verified on the copy, after the size and segment-size checks. */
731 checkControlFile(ControlFile);
735 * Sync target data directory to ensure that modifications are safely on disk.
737 * We do this once, for the whole data directory, for performance reasons. At
738 * the end of pg_rewind's run, the kernel is likely to already have flushed
739 * most dirty buffers to disk. Additionally fsync_pgdata uses a two-pass
740 * approach (only initiating writeback in the first pass), which often reduces
741 * the overall amount of IO noticeably.
744 syncTargetDirectory(void)
/* NOTE(review): the statement controlled by this test is not part of this listing; presumably an early return when syncing is disabled or in dry-run mode -- confirm. */
746 if (!do_sync || dry_run)
749 fsync_pgdata(datadir_target, PG_VERSION_NUM);