Prevent references to invalid relation pages after fresh promotion

author Michael Paquier <michael@paquier.xyz>

Thu, 5 Jul 2018 01:46:43 +0000 (10:46 +0900)

committer Michael Paquier <michael@paquier.xyz>

Thu, 5 Jul 2018 01:46:43 +0000 (10:46 +0900)
author Michael Paquier <michael@paquier.xyz>
Thu, 5 Jul 2018 01:46:43 +0000 (10:46 +0900)
committer Michael Paquier <michael@paquier.xyz>
Thu, 5 Jul 2018 01:46:43 +0000 (10:46 +0900)
diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c

index dcfef365916a4c85f7f41e18d2ee5f35351fc080..d6b5b05425d9a9433ac1d3630b668379a7a065b9 100644 (file)
--- a/src/backend/access/transam/xlog.c
+++ b/src/backend/access/transam/xlog.c
@@ -821,8 +821,14 @@ static XLogSource XLogReceiptSource = 0;   /* XLOG_FROM_* code */
  static XLogRecPtr ReadRecPtr;  /* start of last record read */
  static XLogRecPtr EndRecPtr;   /* end+1 of last record read */
  
-static XLogRecPtr minRecoveryPoint; /* local copy of
-                                                                        * ControlFile->minRecoveryPoint */
+/*
+ * Local copies of equivalent fields in the control file.  When running
+ * crash recovery, minRecoveryPoint is set to InvalidXLogRecPtr as we
+ * expect to replay all the WAL available, and updateMinRecoveryPoint is
+ * switched to false to prevent any updates while replaying records.
+ * Those values are kept consistent as long as crash recovery runs.
+ */
+static XLogRecPtr minRecoveryPoint;
  static TimeLineID minRecoveryPointTLI;
  static bool updateMinRecoveryPoint = true;
  
@@ -2711,20 +2717,26 @@ UpdateMinRecoveryPoint(XLogRecPtr lsn, bool force)
         if (!updateMinRecoveryPoint || (!force && lsn <= minRecoveryPoint))
                 return;
  
+       /*
+        * An invalid minRecoveryPoint means that we need to recover all the WAL,
+        * i.e., we're doing crash recovery.  We never modify the control file's
+        * value in that case, so we can short-circuit future checks here too. The
+        * local values of minRecoveryPoint and minRecoveryPointTLI should not be
+        * updated until crash recovery finishes.
+        */
+       if (XLogRecPtrIsInvalid(minRecoveryPoint))
+       {
+               updateMinRecoveryPoint = false;
+               return;
+       }
+
         LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
  
         /* update local copy */
         minRecoveryPoint = ControlFile->minRecoveryPoint;
         minRecoveryPointTLI = ControlFile->minRecoveryPointTLI;
  
-       /*
-        * An invalid minRecoveryPoint means that we need to recover all the WAL,
-        * i.e., we're doing crash recovery.  We never modify the control file's
-        * value in that case, so we can short-circuit future checks here too.
-        */
-       if (minRecoveryPoint == 0)
-               updateMinRecoveryPoint = false;
-       else if (force || minRecoveryPoint < lsn)
+       if (force || minRecoveryPoint < lsn)
         {
                 XLogRecPtr      newMinRecoveryPoint;
                 TimeLineID      newMinRecoveryPointTLI;
@@ -3110,7 +3122,16 @@ XLogNeedsFlush(XLogRecPtr record)
          */
         if (RecoveryInProgress())
         {
-               /* Quick exit if already known updated */
+               /*
+                * An invalid minRecoveryPoint means that we need to recover all the
+                * WAL, i.e., we're doing crash recovery.  We never modify the control
+                * file's value in that case, so we can short-circuit future checks
+                * here too.
+                */
+               if (XLogRecPtrIsInvalid(minRecoveryPoint))
+                       updateMinRecoveryPoint = false;
+
+               /* Quick exit if already known to be updated or cannot be updated */
                 if (record <= minRecoveryPoint || !updateMinRecoveryPoint)
                         return false;
  
@@ -3124,20 +3145,8 @@ XLogNeedsFlush(XLogRecPtr record)
                 minRecoveryPointTLI = ControlFile->minRecoveryPointTLI;
                 LWLockRelease(ControlFileLock);
  
-               /*
-                * An invalid minRecoveryPoint means that we need to recover all the
-                * WAL, i.e., we're doing crash recovery.  We never modify the control
-                * file's value in that case, so we can short-circuit future checks
-                * here too.
-                */
-               if (minRecoveryPoint == 0)
-                       updateMinRecoveryPoint = false;
-
                 /* check again */
-               if (record <= minRecoveryPoint || !updateMinRecoveryPoint)
-                       return false;
-               else
-                       return true;
+               return record > minRecoveryPoint;
         }
  
         /* Quick exit if already known flushed */
@@ -4269,6 +4278,12 @@ ReadRecord(XLogReaderState *xlogreader, XLogRecPtr RecPtr, int emode,
                                 minRecoveryPoint = ControlFile->minRecoveryPoint;
                                 minRecoveryPointTLI = ControlFile->minRecoveryPointTLI;
  
+                               /*
+                                * The startup process can update its local copy of
+                                * minRecoveryPoint from this point.
+                                */
+                               updateMinRecoveryPoint = true;
+
                                 UpdateControlFile();
                                 LWLockRelease(ControlFileLock);
  
@@ -6892,9 +6907,26 @@ StartupXLOG(void)
                 /* No need to hold ControlFileLock yet, we aren't up far enough */
                 UpdateControlFile();
  
-               /* initialize our local copy of minRecoveryPoint */
-               minRecoveryPoint = ControlFile->minRecoveryPoint;
-               minRecoveryPointTLI = ControlFile->minRecoveryPointTLI;
+               /*
+                * Initialize our local copy of minRecoveryPoint.  When doing crash
+                * recovery we want to replay up to the end of WAL.  In particular,
+                * for a promoted standby, the minRecoveryPoint value in the control
+                * file is only updated after the first checkpoint.  However,
+                * if the instance crashes before the first post-recovery checkpoint
+                * is completed then recovery will use a stale location causing the
+                * startup process to think that there are still invalid page
+                * references when checking for data consistency.
+                */
+               if (InArchiveRecovery)
+               {
+                       minRecoveryPoint = ControlFile->minRecoveryPoint;
+                       minRecoveryPointTLI = ControlFile->minRecoveryPointTLI;
+               }
+               else
+               {
+                       minRecoveryPoint = InvalidXLogRecPtr;
+                       minRecoveryPointTLI = 0;
+               }
  
                 /*
                  * Reset pgstat data, because it may be invalid after recovery.
@@ -7861,6 +7893,8 @@ CheckRecoveryConsistency(void)
         if (XLogRecPtrIsInvalid(minRecoveryPoint))
                 return;
  
+       Assert(InArchiveRecovery);
+
         /*
          * assume that we are called in the startup process, and hence don't need
          * a lock to read lastReplayedEndRecPtr
@@ -9949,11 +9983,16 @@ xlog_redo(XLogReaderState *record)
                  * Update minRecoveryPoint to ensure that if recovery is aborted, we
                  * recover back up to this point before allowing hot standby again.
                  * This is important if the max_* settings are decreased, to ensure
-                * you don't run queries against the WAL preceding the change.
+                * you don't run queries against the WAL preceding the change. The
+                * local copies cannot be updated as long as crash recovery is
+                * happening and we expect all the WAL to be replayed.
                  */
-               minRecoveryPoint = ControlFile->minRecoveryPoint;
-               minRecoveryPointTLI = ControlFile->minRecoveryPointTLI;
-               if (minRecoveryPoint != 0 && minRecoveryPoint < lsn)
+               if (InArchiveRecovery)
+               {
+                       minRecoveryPoint = ControlFile->minRecoveryPoint;
+                       minRecoveryPointTLI = ControlFile->minRecoveryPointTLI;
+               }
+               if (minRecoveryPoint != InvalidXLogRecPtr && minRecoveryPoint < lsn)
                 {
                         ControlFile->minRecoveryPoint = lsn;
                         ControlFile->minRecoveryPointTLI = ThisTimeLineID;
diff --git a/src/test/recovery/t/015_promotion_pages.pl b/src/test/recovery/t/015_promotion_pages.pl

new file mode 100644 (file)

index 0000000..48f941b
--- /dev/null
+++ b/src/test/recovery/t/015_promotion_pages.pl
@@ -0,0 +1,87 @@
+# Test for promotion handling with WAL records generated post-promotion
+# before the first checkpoint is generated.  This test case checks for
+# invalid page references at replay based on the minimum consistent
+# recovery point defined.
+use strict;
+use warnings;
+use PostgresNode;
+use TestLib;
+use Test::More tests => 1;
+
+# Initialize primary node
+my $alpha = get_new_node('alpha');
+$alpha->init(allows_streaming => 1);
+# Setting wal_log_hints to off is important to get invalid page
+# references.
+$alpha->append_conf("postgresql.conf", <<EOF);
+wal_log_hints = off
+EOF
+
+# Start the primary
+$alpha->start;
+
+# setup/start a standby
+$alpha->backup('bkp');
+my $bravo = get_new_node('bravo');
+$bravo->init_from_backup($alpha, 'bkp', has_streaming => 1);
+$bravo->append_conf('postgresql.conf', <<EOF);
+checkpoint_timeout=1h
+checkpoint_completion_target=0.9
+EOF
+$bravo->start;
+
+# Dummy table for the upcoming tests.
+$alpha->safe_psql('postgres', 'create table test1 (a int)');
+$alpha->safe_psql('postgres', 'insert into test1 select generate_series(1, 10000)');
+
+# take a checkpoint
+$alpha->safe_psql('postgres', 'checkpoint');
+
+# The following vacuum will set visibility map bits and create
+# problematic WAL records.
+$alpha->safe_psql('postgres', 'vacuum verbose test1');
+# Wait for last record to have been replayed on the standby.
+$alpha->wait_for_catchup($bravo, 'replay',
+                                                $alpha->lsn('insert'));
+
+# Now force a checkpoint on the standby. This seems unnecessary but for "some"
+# reason, the previous checkpoint on the primary does not reflect on the standby
+# and without an explicit checkpoint, it may start redo recovery from a much
+# older point, which includes even create table and initial page additions.
+$bravo->safe_psql('postgres', 'checkpoint');
+
+# Now just use a dummy table and run some operations to move minRecoveryPoint
+# beyond the previous vacuum.
+$alpha->safe_psql('postgres', 'create table test2 (a int, b text)');
+$alpha->safe_psql('postgres', 'insert into test2 select generate_series(1,10000), md5(random()::text)');
+$alpha->safe_psql('postgres', 'truncate test2');
+
+# Wait again for all records to be replayed.
+$alpha->wait_for_catchup($bravo, 'replay',
+                                                $alpha->lsn('insert'));
+
+# Do the promotion, which reinitializes minRecoveryPoint in the control
+# file so that WAL is replayed up to the end.
+$bravo->promote;
+
+# Truncate the table on the promoted standby, vacuum and extend it
+# again to create new page references.  The first post-recovery checkpoint
+# has not happened yet.
+$bravo->safe_psql('postgres', 'truncate test1');
+$bravo->safe_psql('postgres', 'vacuum verbose test1');
+$bravo->safe_psql('postgres', 'insert into test1 select generate_series(1,1000)');
+
+# Now crash-stop the promoted standby and restart.  This makes sure that
+# replay does not see invalid page references because of an invalid
+# minimum consistent recovery point.
+$bravo->stop('immediate');
+$bravo->start;
+
+# Check state of the table after full crash recovery.  All its data should
+# be here.
+my $psql_out;
+$bravo->psql(
+       'postgres',
+       "SELECT count(*) FROM test1",
+       stdout => \$psql_out);
+is($psql_out, '1000', "Check that table state is correct");
author	Michael Paquier <michael@paquier.xyz>
	Thu, 5 Jul 2018 01:46:43 +0000 (10:46 +0900)
committer	Michael Paquier <michael@paquier.xyz>
	Thu, 5 Jul 2018 01:46:43 +0000 (10:46 +0900)
src/backend/access/transam/xlog.c		patch \| blob \| history
src/test/recovery/t/015_promotion_pages.pl	[new file with mode: 0644]	patch \| blob