granicus.if.org Git - postgresql/commitdiff
Consistency check should compare last record replayed, not last record read.
author Heikki Linnakangas <heikki.linnakangas@iki.fi>
Tue, 11 Dec 2012 13:57:24 +0000 (15:57 +0200)
committer Heikki Linnakangas <heikki.linnakangas@iki.fi>
Tue, 11 Dec 2012 16:55:38 +0000 (18:55 +0200)
EndRecPtr points to the end of the last record that we've read, which has not
necessarily been replayed yet. CheckRecoveryConsistency should compare
minRecoveryPoint with the end of the last replayed record instead. This caused
recovery to conclude too early that it had reached consistency.

Now that we do the check in CheckRecoveryConsistency correctly, we have to
move the call of that function to after redoing a record. The current place,
after reading a record but before replaying it, is wrong. In particular, if
there are no more records after the one ending at minRecoveryPoint, we don't
enter hot standby until one extra record is generated and read by the
standby, and CheckRecoveryConsistency is called. These two bugs conspired
to make the code appear to work correctly, except for the small window
between reading the last record that reaches minRecoveryPoint, and
replaying it.

In passing, rename recoveryLastRecPtr, which is the last record
replayed, to lastReplayedEndRecPtr. This makes it slightly less confusing
with replayEndRecPtr, which is the last record read that we're about to
replay.

Original report from Kyotaro HORIGUCHI, further diagnosis by Fujii Masao.
Backpatch to 9.0, where Hot Standby subtly changed the test from
"minRecoveryPoint < EndRecPtr" to "minRecoveryPoint <= EndRecPtr". The
former works because where the test is performed, we have always read one
more record than we've replayed.

src/backend/access/transam/xlog.c

index 616571bb2c4c7ec46c3ae98f3021bf5161e500f1..9aecad60209e2a540e69c154daa0f18742d07c8f 100644 (file)
@@ -402,10 +402,14 @@ typedef struct XLogCtlData
        XLogRecPtr      lastCheckPointRecPtr;
        CheckPoint      lastCheckPoint;
 
-       /* end+1 of the last record replayed (or being replayed) */
+       /*
+        * lastReplayedEndRecPtr points to end+1 of the last record successfully
+        * replayed. When we're currently replaying a record, ie. in a redo
+        * function, replayEndRecPtr points to the end+1 of the record being
+        * replayed, otherwise it's equal to lastReplayedEndRecPtr.
+        */
+       XLogRecPtr      lastReplayedEndRecPtr;
        XLogRecPtr      replayEndRecPtr;
-       /* end+1 of the last record replayed */
-       XLogRecPtr      recoveryLastRecPtr;
        /* timestamp of last COMMIT/ABORT record replayed (or being replayed) */
        TimestampTz recoveryLastXTime;
 
@@ -6161,7 +6165,7 @@ StartupXLOG(void)
                }
 
                /*
-                * Initialize shared replayEndRecPtr, recoveryLastRecPtr, and
+                * Initialize shared replayEndRecPtr, lastReplayedEndRecPtr, and
                 * recoveryLastXTime.
                 *
                 * This is slightly confusing if we're starting from an online
@@ -6174,7 +6178,7 @@ StartupXLOG(void)
                 */
                SpinLockAcquire(&xlogctl->info_lck);
                xlogctl->replayEndRecPtr = ReadRecPtr;
-               xlogctl->recoveryLastRecPtr = EndRecPtr;
+               xlogctl->lastReplayedEndRecPtr = EndRecPtr;
                xlogctl->recoveryLastXTime = 0;
                SpinLockRelease(&xlogctl->info_lck);
 
@@ -6263,9 +6267,6 @@ StartupXLOG(void)
                                /* Handle interrupt signals of startup process */
                                HandleStartupProcInterrupts();
 
-                               /* Allow read-only connections if we're consistent now */
-                               CheckRecoveryConsistency();
-
                                /*
                                 * Have we reached our recovery target?
                                 */
@@ -6313,15 +6314,18 @@ StartupXLOG(void)
                                error_context_stack = errcontext.previous;
 
                                /*
-                                * Update shared recoveryLastRecPtr after this record has been
-                                * replayed.
+                                * Update lastReplayedEndRecPtr after this record has been
+                                * successfully replayed.
                                 */
                                SpinLockAcquire(&xlogctl->info_lck);
-                               xlogctl->recoveryLastRecPtr = EndRecPtr;
+                               xlogctl->lastReplayedEndRecPtr = EndRecPtr;
                                SpinLockRelease(&xlogctl->info_lck);
 
                                LastRec = ReadRecPtr;
 
+                               /* Allow read-only connections if we're consistent now */
+                               CheckRecoveryConsistency();
+
                                record = ReadRecord(NULL, LOG, false);
                        } while (record != NULL && recoveryContinue);
 
@@ -6661,13 +6665,14 @@ CheckRecoveryConsistency(void)
         * Have we passed our safe starting point?
         */
        if (!reachedConsistency &&
-               XLByteLE(minRecoveryPoint, EndRecPtr) &&
+               XLByteLE(minRecoveryPoint, XLogCtl->lastReplayedEndRecPtr) &&
                XLogRecPtrIsInvalid(ControlFile->backupStartPoint))
        {
                reachedConsistency = true;
                ereport(LOG,
                                (errmsg("consistent recovery state reached at %X/%X",
-                                               EndRecPtr.xlogid, EndRecPtr.xrecoff)));
+                                               XLogCtl->lastReplayedEndRecPtr.xlogid,
+                                               XLogCtl->lastReplayedEndRecPtr.xrecoff)));
        }
 
        /*
@@ -8967,7 +8972,7 @@ pg_last_xlog_replay_location(PG_FUNCTION_ARGS)
        char            location[MAXFNAMELEN];
 
        SpinLockAcquire(&xlogctl->info_lck);
-       recptr = xlogctl->recoveryLastRecPtr;
+       recptr = xlogctl->lastReplayedEndRecPtr;
        SpinLockRelease(&xlogctl->info_lck);
 
        if (recptr.xlogid == 0 && recptr.xrecoff == 0)