]> granicus.if.org Git - postgresql/commitdiff
Use pg_rewind when target timeline was switched
authorTeodor Sigaev <teodor@sigaev.ru>
Tue, 1 Dec 2015 15:56:44 +0000 (18:56 +0300)
committerTeodor Sigaev <teodor@sigaev.ru>
Tue, 1 Dec 2015 15:56:44 +0000 (18:56 +0300)
Allow pg_rewind to work when the target timeline was switched. Now
a user can return a promoted standby to the old master.

Target timeline history becomes a global variable. Index
in target timeline history is used in function interfaces instead of
specifying TLI directly. Thus, SimpleXLogPageRead() can easily start
reading XLOGs from next timeline when current timeline ends.

Author: Alexander Korotkov
Review: Michael Paquier

doc/src/sgml/ref/pg_rewind.sgml
src/bin/pg_rewind/Makefile
src/bin/pg_rewind/parsexlog.c
src/bin/pg_rewind/pg_rewind.c
src/bin/pg_rewind/pg_rewind.h

index ef0cc280d14bc455614b7a3b0623a122ddfab8f4..d5fdaeae92380c2a6ed80968927dab8b628bbb5e 100644 (file)
@@ -61,13 +61,17 @@ PostgreSQL documentation
    <application>pg_rewind</> examines the timeline histories of the source
    and target clusters to determine the point where they diverged, and
    expects to find WAL in the target cluster's <filename>pg_xlog</> directory
-   reaching all the way back to the point of divergence. In the typical
-   failover scenario where the target cluster was shut down soon after the
-   divergence, that is not a problem, but if the target cluster had run for a
-   long time after the divergence, the old WAL files might not be present
-   anymore. In that case, they can be manually copied from the WAL archive to
-   the <filename>pg_xlog</> directory. Fetching missing files from a WAL
-   archive automatically is currently not supported.
+   reaching all the way back to the point of divergence. The point of divergence
+   can be found on the target timeline, the source timeline, or their common
+   ancestor. In the typical failover scenario where the target cluster was
+   shut down soon after the divergence, that is not a problem, but if the
+   target cluster had run for a long time after the divergence, the old WAL
+   files might not be present anymore. In that case, they can be manually
+   copied from the WAL archive to the <filename>pg_xlog</> directory. Fetching
+   missing files from a WAL archive automatically is currently not supported.
+   Besides, <application>pg_rewind</> use cases are not limited to failover.
+   For instance, a standby server could be promoted, run some writes and
+   then be returned to being a standby.
   </para>
 
   <para>
index 92b5d20afa707bc2dfe1137ac0cb8aee536b427b..48dc7702513512101d480c245b80fc04ab79aadd 100644 (file)
@@ -8,7 +8,7 @@
 #
 #-------------------------------------------------------------------------
 
-PGFILEDESC = "pg_rewind - repurpose an old master server as standby"
+PGFILEDESC = "pg_rewind - synchronize a data directory with another one forked from"
 PGAPPICON = win32
 
 subdir = src/bin/pg_rewind
index 2081cf8bd3342ffa4a7c62c3dd40bd7a07723ba0..d69eafbcf3da41789c9f3c2646780cd846e0b80e 100644 (file)
@@ -45,7 +45,7 @@ static char xlogfpath[MAXPGPATH];
 typedef struct XLogPageReadPrivate
 {
        const char *datadir;
-       TimeLineID      tli;
+       int                     tliIndex;
 } XLogPageReadPrivate;
 
 static int SimpleXLogPageRead(XLogReaderState *xlogreader,
@@ -55,11 +55,11 @@ static int SimpleXLogPageRead(XLogReaderState *xlogreader,
 
 /*
  * Read WAL from the datadir/pg_xlog, starting from 'startpoint' on timeline
- * 'tli', until 'endpoint'. Make note of the data blocks touched by the WAL
- * records, and return them in a page map.
+ * index 'tliIndex' in target timeline history, until 'endpoint'. Make note of
+ * the data blocks touched by the WAL records, and return them in a page map.
  */
 void
-extractPageMap(const char *datadir, XLogRecPtr startpoint, TimeLineID tli,
+extractPageMap(const char *datadir, XLogRecPtr startpoint, int tliIndex,
                           XLogRecPtr endpoint)
 {
        XLogRecord *record;
@@ -68,7 +68,7 @@ extractPageMap(const char *datadir, XLogRecPtr startpoint, TimeLineID tli,
        XLogPageReadPrivate private;
 
        private.datadir = datadir;
-       private.tli = tli;
+       private.tliIndex = tliIndex;
        xlogreader = XLogReaderAllocate(&SimpleXLogPageRead, &private);
        if (xlogreader == NULL)
                pg_fatal("out of memory\n");
@@ -112,7 +112,7 @@ extractPageMap(const char *datadir, XLogRecPtr startpoint, TimeLineID tli,
  * doing anything with the record itself.
  */
 XLogRecPtr
-readOneRecord(const char *datadir, XLogRecPtr ptr, TimeLineID tli)
+readOneRecord(const char *datadir, XLogRecPtr ptr, int tliIndex)
 {
        XLogRecord *record;
        XLogReaderState *xlogreader;
@@ -121,7 +121,7 @@ readOneRecord(const char *datadir, XLogRecPtr ptr, TimeLineID tli)
        XLogRecPtr      endptr;
 
        private.datadir = datadir;
-       private.tli = tli;
+       private.tliIndex = tliIndex;
        xlogreader = XLogReaderAllocate(&SimpleXLogPageRead, &private);
        if (xlogreader == NULL)
                pg_fatal("out of memory\n");
@@ -152,7 +152,7 @@ readOneRecord(const char *datadir, XLogRecPtr ptr, TimeLineID tli)
  * Find the previous checkpoint preceding given WAL position.
  */
 void
-findLastCheckpoint(const char *datadir, XLogRecPtr forkptr, TimeLineID tli,
+findLastCheckpoint(const char *datadir, XLogRecPtr forkptr, int tliIndex,
                                   XLogRecPtr *lastchkptrec, TimeLineID *lastchkpttli,
                                   XLogRecPtr *lastchkptredo)
 {
@@ -173,7 +173,7 @@ findLastCheckpoint(const char *datadir, XLogRecPtr forkptr, TimeLineID tli,
                forkptr += (forkptr % XLogSegSize == 0) ? SizeOfXLogLongPHD : SizeOfXLogShortPHD;
 
        private.datadir = datadir;
-       private.tli = tli;
+       private.tliIndex = tliIndex;
        xlogreader = XLogReaderAllocate(&SimpleXLogPageRead, &private);
        if (xlogreader == NULL)
                pg_fatal("out of memory\n");
@@ -236,9 +236,11 @@ SimpleXLogPageRead(XLogReaderState *xlogreader, XLogRecPtr targetPagePtr,
 {
        XLogPageReadPrivate *private = (XLogPageReadPrivate *) xlogreader->private_data;
        uint32          targetPageOff;
-       XLogSegNo targetSegNo PG_USED_FOR_ASSERTS_ONLY;
+       XLogRecPtr      targetSegEnd;
+       XLogSegNo       targetSegNo;
 
        XLByteToSeg(targetPagePtr, targetSegNo);
+       XLogSegNoOffsetToRecPtr(targetSegNo + 1, 0, targetSegEnd);
        targetPageOff = targetPagePtr % XLogSegSize;
 
        /*
@@ -257,7 +259,20 @@ SimpleXLogPageRead(XLogReaderState *xlogreader, XLogRecPtr targetPagePtr,
        {
                char            xlogfname[MAXFNAMELEN];
 
-               XLogFileName(xlogfname, private->tli, xlogreadsegno);
+               /*
+                * Since incomplete segments are copied into next timelines, switch to
+                * the timeline holding the required segment. Assuming this scan can be
+                * done both forward and backward, consider also switching timeline
+                * accordingly.
+                */
+               while (private->tliIndex < targetNentries - 1 &&
+                               targetHistory[private->tliIndex].end < targetSegEnd)
+                       private->tliIndex++;
+               while (private->tliIndex > 0 &&
+                               targetHistory[private->tliIndex].begin >= targetSegEnd)
+                       private->tliIndex--;
+
+               XLogFileName(xlogfname, targetHistory[private->tliIndex].tli, xlogreadsegno);
 
                snprintf(xlogfpath, MAXPGPATH, "%s/" XLOGDIR "/%s", private->datadir, xlogfname);
 
@@ -293,7 +308,7 @@ SimpleXLogPageRead(XLogReaderState *xlogreader, XLogRecPtr targetPagePtr,
 
        Assert(targetSegNo == xlogreadsegno);
 
-       *pageTLI = private->tli;
+       *pageTLI = targetHistory[private->tliIndex].tli;
        return XLOG_BLCKSZ;
 }
 
index a2d9ca36aa1f06d459b84f07e271c8ad6fb61fea..1ab82f07cb471c0eb687d45432c39e4490de18ee 100644 (file)
@@ -1,7 +1,7 @@
 /*-------------------------------------------------------------------------
  *
  * pg_rewind.c
- *       Synchronizes an old master server to a new timeline
+ *       Synchronizes a PostgreSQL data directory to a new timeline
  *
  * Portions Copyright (c) 1996-2015, PostgreSQL Global Development Group
  *
@@ -37,7 +37,7 @@ static void digestControlFile(ControlFileData *ControlFile, char *source,
                                  size_t size);
 static void updateControlFile(ControlFileData *ControlFile);
 static void sanityChecks(void);
-static void findCommonAncestorTimeline(XLogRecPtr *recptr, TimeLineID *tli);
+static void findCommonAncestorTimeline(XLogRecPtr *recptr, int *tliIndex);
 
 static ControlFileData ControlFile_target;
 static ControlFileData ControlFile_source;
@@ -53,6 +53,10 @@ bool         debug = false;
 bool           showprogress = false;
 bool           dry_run = false;
 
+/* Target history */
+TimeLineHistoryEntry *targetHistory;
+int targetNentries;
+
 static void
 usage(const char *progname)
 {
@@ -88,7 +92,7 @@ main(int argc, char **argv)
        int                     option_index;
        int                     c;
        XLogRecPtr      divergerec;
-       TimeLineID      lastcommontli;
+       int                     lastcommontliIndex;
        XLogRecPtr      chkptrec;
        TimeLineID      chkpttli;
        XLogRecPtr      chkptredo;
@@ -214,9 +218,10 @@ main(int argc, char **argv)
        if (ControlFile_target.checkPointCopy.ThisTimeLineID == ControlFile_source.checkPointCopy.ThisTimeLineID)
                pg_fatal("source and target cluster are on the same timeline\n");
 
-       findCommonAncestorTimeline(&divergerec, &lastcommontli);
+       findCommonAncestorTimeline(&divergerec, &lastcommontliIndex);
        printf(_("servers diverged at WAL position %X/%X on timeline %u\n"),
-                  (uint32) (divergerec >> 32), (uint32) divergerec, lastcommontli);
+                  (uint32) (divergerec >> 32), (uint32) divergerec,
+                  targetHistory[lastcommontliIndex].tli);
 
        /*
         * Check for the possibility that the target is in fact a direct ancestor
@@ -234,7 +239,7 @@ main(int argc, char **argv)
                /* Read the checkpoint record on the target to see where it ends. */
                chkptendrec = readOneRecord(datadir_target,
                                                                        ControlFile_target.checkPoint,
-                                                  ControlFile_target.checkPointCopy.ThisTimeLineID);
+                                                                       targetNentries - 1);
 
                /*
                 * If the histories diverged exactly at the end of the shutdown
@@ -254,7 +259,8 @@ main(int argc, char **argv)
                exit(0);
        }
 
-       findLastCheckpoint(datadir_target, divergerec, lastcommontli,
+       findLastCheckpoint(datadir_target, divergerec,
+                                          lastcommontliIndex,
                                           &chkptrec, &chkpttli, &chkptredo);
        printf(_("rewinding from last common checkpoint at %X/%X on timeline %u\n"),
                   (uint32) (chkptrec >> 32), (uint32) chkptrec,
@@ -277,7 +283,7 @@ main(int argc, char **argv)
         * we would need to replay until the end of WAL here.
         */
        pg_log(PG_PROGRESS, "reading WAL in target\n");
-       extractPageMap(datadir_target, chkptrec, lastcommontli,
+       extractPageMap(datadir_target, chkptrec, lastcommontliIndex,
                                   ControlFile_target.checkPoint);
        filemap_finalize();
 
@@ -374,10 +380,11 @@ sanityChecks(void)
        /*
         * Target cluster better not be running. This doesn't guard against
         * someone starting the cluster concurrently. Also, this is probably more
-        * strict than necessary; it's OK if the master was not shut down cleanly,
-        * as long as it isn't running at the moment.
+        * strict than necessary; it's OK if the target node was not shut down
+        * cleanly, as long as it isn't running at the moment.
         */
-       if (ControlFile_target.state != DB_SHUTDOWNED)
+       if (ControlFile_target.state != DB_SHUTDOWNED &&
+               ControlFile_target.state != DB_SHUTDOWNED_IN_RECOVERY)
                pg_fatal("target server must be shut down cleanly\n");
 
        /*
@@ -385,75 +392,149 @@ sanityChecks(void)
         * server is shut down. There isn't any very strong reason for this
         * limitation, but better safe than sorry.
         */
-       if (datadir_source && ControlFile_source.state != DB_SHUTDOWNED)
+       if (datadir_source &&
+               ControlFile_source.state != DB_SHUTDOWNED &&
+               ControlFile_source.state != DB_SHUTDOWNED_IN_RECOVERY)
                pg_fatal("source data directory must be shut down cleanly\n");
 }
 
 /*
- * Determine the TLI of the last common timeline in the histories of the two
- * clusters. *tli is set to the last common timeline, and *recptr is set to
- * the position where the histories diverged (ie. the first WAL record that's
- * not the same in both clusters).
- *
- * Control files of both clusters must be read into ControlFile_target/source
- * before calling this.
+ * Find minimum from two XLOG positions assuming InvalidXLogRecPtr means
+ * infinity as src/include/access/timeline.h states. This routine should
+ * be used only when comparing XLOG positions related to history files.
  */
-static void
-findCommonAncestorTimeline(XLogRecPtr *recptr, TimeLineID *tli)
+static XLogRecPtr
+MinXLogRecPtr(XLogRecPtr a, XLogRecPtr b)
 {
-       TimeLineID      targettli;
-       TimeLineHistoryEntry *sourceHistory;
-       int                     nentries;
-       int                     i;
-       TimeLineID      sourcetli;
+       if (XLogRecPtrIsInvalid(a))
+               return b;
+       else if (XLogRecPtrIsInvalid(b))
+               return a;
+       else
+               return Min(a, b);
+}
+
+/*
+ * Retrieve the timeline history for the given control file, which must be
+ * that of either the source or the target.
+ */
+static TimeLineHistoryEntry *
+getTimelineHistory(ControlFileData *controlFile, int *nentries)
+{
+       TimeLineHistoryEntry   *history;
+       TimeLineID                              tli;
 
-       targettli = ControlFile_target.checkPointCopy.ThisTimeLineID;
-       sourcetli = ControlFile_source.checkPointCopy.ThisTimeLineID;
+       tli = controlFile->checkPointCopy.ThisTimeLineID;
 
-       /* Timeline 1 does not have a history file, so no need to check */
-       if (sourcetli == 1)
+       /*
+        * Timeline 1 does not have a history file, so there is no need to read
+        * one; instead, fake an entry with infinite start and end positions.
+        */
+       if (tli == 1)
        {
-               sourceHistory = (TimeLineHistoryEntry *) pg_malloc(sizeof(TimeLineHistoryEntry));
-               sourceHistory->tli = sourcetli;
-               sourceHistory->begin = sourceHistory->end = InvalidXLogRecPtr;
-               nentries = 1;
+               history = (TimeLineHistoryEntry *) pg_malloc(sizeof(TimeLineHistoryEntry));
+               history->tli = tli;
+               history->begin = history->end = InvalidXLogRecPtr;
+               *nentries = 1;
        }
        else
        {
                char            path[MAXPGPATH];
                char       *histfile;
 
-               TLHistoryFilePath(path, sourcetli);
-               histfile = fetchFile(path, NULL);
+               TLHistoryFilePath(path, tli);
+
+               /* Get history file from appropriate source */
+               if (controlFile == &ControlFile_source)
+                       histfile = fetchFile(path, NULL);
+               else if (controlFile == &ControlFile_target)
+                       histfile = slurpFile(datadir_target, path, NULL);
+               else
+                       pg_fatal("Invalid control file");
 
-               sourceHistory = rewind_parseTimeLineHistory(histfile,
-                                                       ControlFile_source.checkPointCopy.ThisTimeLineID,
-                                                                                                       &nentries);
+               history = rewind_parseTimeLineHistory(histfile, tli, nentries);
                pg_free(histfile);
        }
 
-       /*
-        * Trace the history backwards, until we hit the target timeline.
-        *
-        * TODO: This assumes that there are no timeline switches on the target
-        * cluster after the fork.
-        */
-       for (i = nentries - 1; i >= 0; i--)
+       if (debug)
        {
-               TimeLineHistoryEntry *entry = &sourceHistory[i];
+               int             i;
+
+               if (controlFile == &ControlFile_source)
+                       printf("Source timeline history:\n");
+               else if (controlFile == &ControlFile_target)
+                       printf("Target timeline history:\n");
+               else
+                       Assert(false);
 
-               if (entry->tli == targettli)
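+               /*
+                * Print the timeline history that was just retrieved (source or
+                * target, per the heading printed above).
+                *
+                * NOTE(review): the loop below is bounded by targetNentries rather
+                * than *nentries; when this is called for the source history the
+                * two counts may differ -- confirm the intended bound.
+                */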
+               for (i = 0; i < targetNentries; i++)
                {
-                       /* found it */
-                       *recptr = entry->end;
-                       *tli = entry->tli;
+                       TimeLineHistoryEntry *entry;
 
-                       pg_free(sourceHistory);
-                       return;
+                       entry = &history[i];
+                       printf("%d: %X/%X - %X/%X\n", entry->tli,
+                               (uint32) (entry->begin >> 32), (uint32) (entry->begin),
+                               (uint32) (entry->end >> 32), (uint32) (entry->end));
                }
        }
 
-       pg_fatal("could not find common ancestor of the source and target cluster's timelines\n");
+       return history;
+}
+
+/*
+ * Determine the TLI of the last common timeline in the timeline history of the
+ * two clusters. targetHistory is filled with target timeline history and
+ * targetNentries is number of items in targetHistory. *tliIndex is set to the
+ * index of last common timeline in targetHistory array, and *recptr is set to
+ * the position where the timeline history diverged (ie. the first WAL record
+ * that's not the same in both clusters).
+ *
+ * Control files of both clusters must be read into ControlFile_target/source
+ * before calling this routine.
+ */
+static void
+findCommonAncestorTimeline(XLogRecPtr *recptr, int *tliIndex)
+{
+       TimeLineHistoryEntry *sourceHistory;
+       int                     sourceNentries;
+       int                     i, n;
+
+       /* Retrieve timelines for both source and target */
+       sourceHistory = getTimelineHistory(&ControlFile_source, &sourceNentries);
+       targetHistory = getTimelineHistory(&ControlFile_target, &targetNentries);
+
+       /*
+        * Trace the history forward until we hit the point of divergence. It may
+        * still be possible that the source and target nodes used the same
+        * timeline number in their history but with different start position
+        * depending on the history files that each node has fetched in previous
+        * recovery processes. Hence check the start position of the new timeline
+        * as well and move down by one extra timeline entry if they do not match.
+        */
+       n = Min(sourceNentries, targetNentries);
+       for (i = 0; i < n; i++)
+       {
+               if (sourceHistory[i].tli != targetHistory[i].tli ||
+                       sourceHistory[i].begin != targetHistory[i].begin)
+                       break;
+       }
+
+       if (i > 0)
+       {
+               i--;
+               *recptr = MinXLogRecPtr(sourceHistory[i].end, targetHistory[i].end);
+               *tliIndex = i;
+
+               pg_free(sourceHistory);
+               return;
+       }
+       else
+       {
+               pg_fatal("could not find common ancestor of the source and target cluster's timelines\n");
+       }
 }
 
 
index e281369e39fe18c3725cbf5ce6f6333c281469d7..4826ddee6d121deea993a02d705854da1d52de76 100644 (file)
@@ -27,15 +27,19 @@ extern bool debug;
 extern bool showprogress;
 extern bool dry_run;
 
+/* Target history */
+extern TimeLineHistoryEntry *targetHistory;
+extern int targetNentries;
+
 /* in parsexlog.c */
 extern void extractPageMap(const char *datadir, XLogRecPtr startpoint,
-                          TimeLineID tli, XLogRecPtr endpoint);
+                          int tliIndex, XLogRecPtr endpoint);
 extern void findLastCheckpoint(const char *datadir, XLogRecPtr searchptr,
-                                  TimeLineID tli,
+                                  int tliIndex,
                                   XLogRecPtr *lastchkptrec, TimeLineID *lastchkpttli,
                                   XLogRecPtr *lastchkptredo);
 extern XLogRecPtr readOneRecord(const char *datadir, XLogRecPtr ptr,
-                         TimeLineID tli);
+                         int tliIndex);
 
 /* in timeline.c */
 extern TimeLineHistoryEntry *rewind_parseTimeLineHistory(char *buffer,