IndexBulkDeleteCallback callback;
void *callback_state;
BTCycleId cycleid;
- BlockNumber lastBlockVacuumed; /* last blkno reached by Vacuum scan */
- BlockNumber lastUsedPage; /* blkno of last non-recyclable page */
+ BlockNumber lastBlockVacuumed; /* highest blkno actually vacuumed */
+ BlockNumber lastBlockLocked; /* highest blkno we've cleanup-locked */
BlockNumber totFreePages; /* true total # of free pages */
MemoryContext pagedelcontext;
} BTVacState;
vstate.callback_state = callback_state;
vstate.cycleid = cycleid;
vstate.lastBlockVacuumed = BTREE_METAPAGE; /* metapage is block 0, i.e. no leaf page vacuumed yet */
- vstate.lastUsedPage = BTREE_METAPAGE;
+ vstate.lastBlockLocked = BTREE_METAPAGE;
vstate.totFreePages = 0;
/* Create a temporary memory context to run _bt_pagedel in */
}
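The hunk above swaps lastUsedPage for lastBlockLocked, leaving two related but distinct trackers: lastBlockVacuumed is the highest leaf block for which an XLOG_BTREE_VACUUM record has been issued, while lastBlockLocked is the highest leaf block we have cleanup-locked, whether or not anything was deleted there. The standalone toy program below (not PostgreSQL code; the 8-page index and the hasDeletable array are invented for illustration) simulates one scan to show how the two counters diverge, which is exactly the gap the dummy-record logic later in btvacuumscan must close.

#include <stdbool.h>
#include <stdio.h>

typedef unsigned int BlockNumber;

int
main(void)
{
	BlockNumber lastBlockVacuumed = 0;	/* BTREE_METAPAGE is block 0 */
	BlockNumber lastBlockLocked = 0;
	/* which leaf pages of a toy 8-page index have deletable items */
	bool		hasDeletable[8] = {false, true, false, true, false,
								   false, true, false};
	BlockNumber blkno;

	for (blkno = 1; blkno < 8; blkno++)
	{
		/* btvacuumpage takes a cleanup lock on every leaf page visited */
		if (blkno > lastBlockLocked)
			lastBlockLocked = blkno;
		/* an XLOG_BTREE_VACUUM record is issued only if items are deleted */
		if (hasDeletable[blkno] && blkno > lastBlockVacuumed)
			lastBlockVacuumed = blkno;
	}

	/* locked (7) > vacuumed (6): the dummy WAL record would be needed */
	printf("vacuumed=%u locked=%u need_dummy=%s\n",
		   lastBlockVacuumed, lastBlockLocked,
		   lastBlockLocked > lastBlockVacuumed ? "yes" : "no");
	return 0;
}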
/*
- * InHotStandby we need to scan right up to the end of the index for
- * correct locking, so we may need to write a WAL record for the final
- * block in the index if it was not vacuumed. It's possible that VACUUMing
- * has actually removed zeroed pages at the end of the index so we need to
- * take care to issue the record for last actual block and not for the
- * last block that was scanned. Ignore empty indexes.
+ * If the WAL is replayed in hot standby, the replay process needs to get
+ * cleanup locks on all index leaf pages, just as we've been doing here.
+ * However, we won't issue any WAL records about pages that have no items
+ * to be deleted. For pages in between those we've vacuumed, the replay code
+ * will take locks under the direction of the lastBlockVacuumed fields in
+ * the XLOG_BTREE_VACUUM WAL records. To cover pages after the last one
+ * we vacuum, we need to issue a dummy XLOG_BTREE_VACUUM WAL record
+ * against the last leaf page in the index, if that one wasn't vacuumed.
*/
if (XLogStandbyInfoActive() &&
- num_pages > 1 && vstate.lastBlockVacuumed < (num_pages - 1))
+ vstate.lastBlockVacuumed < vstate.lastBlockLocked)
{
Buffer buf;
/*
- * We can't use _bt_getbuf() here because it always applies
- * _bt_checkpage(), which will barf on an all-zero page. We want to
- * recycle all-zero pages, not fail. Also, we want to use a
- * nondefault buffer access strategy.
+ * The page should be valid, but we can't use _bt_getbuf() because we
+ * want to use a nondefault buffer access strategy. Since we aren't
+ * going to delete any items, getting the cleanup lock again is probably
+ * overkill, but for consistency do that anyway.
*/
- buf = ReadBufferExtended(rel, MAIN_FORKNUM, num_pages - 1, RBM_NORMAL,
- info->strategy);
+ buf = ReadBufferExtended(rel, MAIN_FORKNUM, vstate.lastBlockLocked,
+ RBM_NORMAL, info->strategy);
LockBufferForCleanup(buf);
+ _bt_checkpage(rel, buf);
_bt_delitems_vacuum(rel, buf, NULL, 0, vstate.lastBlockVacuumed);
_bt_relbuf(rel, buf);
}
}
}
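The dummy record works because of the coverage invariant described in the comment above: an XLOG_BTREE_VACUUM record carrying (lastBlockVacuumed, block) makes replay cleanup-lock every page strictly between the two (via the sweep loop in btree_xlog_vacuum) and the target page itself (via the record's own redo). The standalone sketch below is toy code, not PostgreSQL code; the three-record sequence, ending in the dummy record (6, 7), is invented. It checks that the union of those ranges covers every leaf page of a small index.

#include <stdbool.h>
#include <stdio.h>

typedef unsigned int BlockNumber;

typedef struct
{
	BlockNumber lastBlockVacuumed;	/* value carried in the WAL record */
	BlockNumber block;			/* leaf page the record targets */
} VacuumRecord;

int
main(void)
{
	/* a plausible record sequence, ending with the dummy record (6, 7) */
	VacuumRecord recs[] = {{0, 3}, {3, 6}, {6, 7}};
	bool		locked[8] = {false};
	BlockNumber b;
	int			i;

	for (i = 0; i < 3; i++)
	{
		/* the sweep loop locks the strictly-in-between pages ... */
		for (b = recs[i].lastBlockVacuumed + 1; b < recs[i].block; b++)
			locked[b] = true;
		/* ... and the record's own redo locks the target page itself */
		locked[recs[i].block] = true;
	}

	for (b = 1; b < 8; b++)
		printf("page %u: %s\n", b, locked[b] ? "locked" : "MISSED");
	return 0;
}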
- /* If the page is in use, update lastUsedPage */
- if (!_bt_page_recyclable(page) && vstate->lastUsedPage < blkno)
- vstate->lastUsedPage = blkno;
-
/* Page is valid, see what to do with it */
if (_bt_page_recyclable(page))
{
LockBuffer(buf, BUFFER_LOCK_UNLOCK);
LockBufferForCleanup(buf);
+ /*
+ * Remember highest leaf page number we've taken cleanup lock on; see
+ * notes in btvacuumscan.
+ */
+ if (blkno > vstate->lastBlockLocked)
+ vstate->lastBlockLocked = blkno;
+
/*
* Check whether we need to recurse back to earlier pages. What we
* are concerned about is a page split that happened since we started
*/
if (ndeletable > 0)
{
- BlockNumber lastBlockVacuumed = BufferGetBlockNumber(buf);
-
+ /*
+ * Notice that the issued XLOG_BTREE_VACUUM WAL record includes an
+ * instruction to the replay code to get a cleanup lock on all pages
+ * between the previous lastBlockVacuumed and this page. This
+ * ensures that WAL replay locks all leaf pages at some point.
+ *
+ * Since we can visit leaf pages out-of-order when recursing,
+ * replay might end up locking such pages an extra time, but it
+ * doesn't seem worth the amount of bookkeeping it'd take to avoid
+ * that.
+ */
_bt_delitems_vacuum(rel, buf, deletable, ndeletable,
vstate->lastBlockVacuumed);
/*
- * Keep track of the block number of the lastBlockVacuumed, so we
- * can scan those blocks as well during WAL replay. This then
- * provides concurrency protection and allows btrees to be used
- * while in recovery.
+ * Remember highest leaf page number we've issued an
+ * XLOG_BTREE_VACUUM WAL record for.
*/
- if (lastBlockVacuumed > vstate->lastBlockVacuumed)
- vstate->lastBlockVacuumed = lastBlockVacuumed;
+ if (blkno > vstate->lastBlockVacuumed)
+ vstate->lastBlockVacuumed = blkno;
stats->tuples_removed += ndeletable;
/* must recompute maxoff */
BTPageOpaque opaque;
/*
- * If queries might be active then we need to ensure every block is
+ * If queries might be active then we need to ensure every leaf page is
* unpinned between the lastBlockVacuumed and the current block, if there
- * are any. This ensures that every block in the index is touched during
- * VACUUM as required to ensure scans work correctly.
+ * are any. This prevents replay of the VACUUM from reaching the stage of
+ * removing heap tuples while there could still be indexscans "in flight"
+ * to those particular tuples (see nbtree/README).
+ *
+ * It might be worth checking if there are actually any backends running;
+ * if not, we could just skip this.
+ *
+ * Since VACUUM can visit leaf pages out-of-order, it might issue records
+ * with lastBlockVacuumed >= block; that's not an error, it just means
+ * there's nothing to do now.
+ *
+ * Note: since we touch all pages in the range, we will lock non-leaf
+ * pages, and also any empty (all-zero) pages that may be in the index. It
+ * doesn't seem worth the complexity to avoid that. But it's important
+ * that HotStandbyActiveInReplay() will not return true if the database
+ * isn't yet consistent; so we need not fear reading still-corrupt blocks
+ * here during crash recovery.
*/
- if (standbyState == STANDBY_SNAPSHOT_READY &&
- (xlrec->lastBlockVacuumed + 1) != xlrec->block)
+ if (HotStandbyActiveInReplay())
{
- BlockNumber blkno = xlrec->lastBlockVacuumed + 1;
+ BlockNumber blkno;
- for (; blkno < xlrec->block; blkno++)
+ for (blkno = xlrec->lastBlockVacuumed + 1; blkno < xlrec->block; blkno++)
{
/*
+ * We use RBM_NORMAL_NO_LOG mode because it's not an error
+ * condition to see all-zero pages. The original btvacuumpage
+ * scan would have skipped over all-zero pages, noting them in FSM
+ * but not bothering to initialize them just yet; so we mustn't
+ * throw an error here. (We could skip acquiring the cleanup lock
+ * if PageIsNew, but it's probably not worth the cycles to test.)
+ *
* XXX we don't actually need to read the block, we just need to
* confirm it is unpinned. If we had a special call into the
* buffer manager we could optimise this so that if the block is
* not in shared_buffers we confirm it as unpinned.
- *
- * Another simple optimization would be to check if there's any
- * backends running; if not, we could just skip this.
*/
- buffer = XLogReadBufferExtended(xlrec->node, MAIN_FORKNUM, blkno, RBM_NORMAL);
+ buffer = XLogReadBufferExtended(xlrec->node, MAIN_FORKNUM, blkno,
+ RBM_NORMAL_NO_LOG);
if (BufferIsValid(buffer))
{
LockBufferForCleanup(buffer);
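As the comment notes, VACUUM's out-of-order leaf visits mean a record can arrive with lastBlockVacuumed >= block, in which case the sweep loop runs zero iterations. The toy standalone program below (not PostgreSQL code) exercises just the loop bounds to make that degenerate case concrete.

#include <stdio.h>

typedef unsigned int BlockNumber;

static void
replay_sweep(BlockNumber lastBlockVacuumed, BlockNumber block)
{
	BlockNumber blkno;
	int			n = 0;

	for (blkno = lastBlockVacuumed + 1; blkno < block; blkno++)
		n++;					/* stand-in for the pin check on one page */
	printf("record(lastBlockVacuumed=%u, block=%u): swept %d page(s)\n",
		   lastBlockVacuumed, block, n);
}

int
main(void)
{
	replay_sweep(0, 4);			/* forward progress: sweeps pages 1..3 */
	replay_sweep(4, 2);			/* out-of-order revisit: sweeps nothing */
	return 0;
}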
* true. Postmaster knows this by way of signal, not via shared memory.
*
* Unlike testing standbyState, this works in any process that's connected to
- * shared memory.
+ * shared memory. (And note that standbyState alone doesn't tell the truth
+ * anyway.)
*/
bool
HotStandbyActive(void)
}
}
+/*
+ * Like HotStandbyActive(), but to be used only in WAL replay code,
+ * where we don't need to ask any other process what the state is.
+ */
+bool
+HotStandbyActiveInReplay(void)
+{
+ Assert(AmStartupProcess());
+ return LocalHotStandbyActive;
+}
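HotStandbyActiveInReplay() can read the process-local flag directly because it runs only in the startup process, the process performing the replay. HotStandbyActive(), by contrast, must consult shared memory from an arbitrary backend, presumably caching the result once it is seen true, since hot standby does not get switched off again. The sketch below is a toy reconstruction of that caching idiom, with invented names (sharedActive, sharedLock) and a pthread mutex standing in for whatever synchronization the real code uses; it is not the actual xlog.c implementation.

#include <stdbool.h>
#include <stdio.h>
#include <pthread.h>

static bool localActive = false;	/* per-process cache */
static bool sharedActive = false;	/* stand-in for shared-memory state */
static pthread_mutex_t sharedLock = PTHREAD_MUTEX_INITIALIZER;

/* toy analogue of HotStandbyActive(): consult shared state until true */
static bool
toy_HotStandbyActive(void)
{
	if (!localActive)
	{
		pthread_mutex_lock(&sharedLock);
		localActive = sharedActive;
		pthread_mutex_unlock(&sharedLock);
	}
	return localActive;
}

/* toy analogue of HotStandbyActiveInReplay(): the local flag suffices */
static bool
toy_HotStandbyActiveInReplay(void)
{
	return localActive;
}

int
main(void)
{
	sharedActive = true;		/* the "startup process" enables hot standby */
	printf("%d %d\n", toy_HotStandbyActive(),
		   toy_HotStandbyActiveInReplay());
	return 0;
}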
+
/*
* Is this process allowed to insert new WAL records?
*
*
* In RBM_ZERO and RBM_ZERO_ON_ERROR modes, if the page doesn't exist, the
* relation is extended with all-zeroes pages up to the given block number.
+ *
+ * In RBM_NORMAL_NO_LOG mode, we return InvalidBuffer if the page doesn't
+ * exist, and we don't check for all-zeroes. Thus, no log entry is made
+ * to imply that the page should be dropped or truncated later.
*/
Buffer
XLogReadBufferExtended(RelFileNode rnode, ForkNumber forknum,
log_invalid_page(rnode, forknum, blkno, false);
return InvalidBuffer;
}
+ if (mode == RBM_NORMAL_NO_LOG)
+ return InvalidBuffer;
/* OK to extend the file */
/* we do this in recovery only - no rel-extension lock needed */
Assert(InRecovery);
* Assume when this function is called, that reln has been opened already.
*
* In RBM_NORMAL mode, the page is read from disk, and the page header is
- * validated. An error is thrown if the page header is not valid.
+ * validated. An error is thrown if the page header is not valid. (But
+ * note that an all-zero page is considered "valid"; see PageIsVerified().)
*
* RBM_ZERO_ON_ERROR is like the normal mode, but if the page header is not
* valid, the page is zeroed instead of throwing an error. This is intended
* current physical EOF; that is likely to cause problems in md.c when
* the page is modified and written out. P_NEW is OK, though.
*
+ * RBM_NORMAL_NO_LOG mode is treated the same as RBM_NORMAL here.
+ *
* If strategy is not NULL, a nondefault buffer access strategy is used.
* See buffer/README for details.
*/
extern bool RecoveryInProgress(void);
extern bool HotStandbyActive(void);
+extern bool HotStandbyActiveInReplay(void);
extern bool XLogInsertAllowed(void);
extern void GetXLogReceiptTime(TimestampTz *rtime, bool *fromStream);
extern XLogRecPtr GetXLogReplayRecPtr(TimeLineID *replayTLI);
RBM_NORMAL, /* Normal read */
RBM_ZERO, /* Don't read from disk, caller will
* initialize */
- RBM_ZERO_ON_ERROR /* Read, but return an all-zeros page on error */
+ RBM_ZERO_ON_ERROR, /* Read, but return an all-zeros page on error */
+ RBM_NORMAL_NO_LOG /* Don't log page as invalid during WAL
+ * replay; otherwise same as RBM_NORMAL */
} ReadBufferMode;
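Taken together, the comments above say the modes differ mainly in how XLogReadBufferExtended treats a read past the relation's current EOF. The standalone toy below (with a local copy of the enum; not PostgreSQL code) encodes that summary: RBM_NORMAL logs the page as invalid and returns InvalidBuffer, RBM_NORMAL_NO_LOG returns InvalidBuffer silently, and the RBM_ZERO variants extend the relation with all-zero pages.

#include <stdio.h>

/* local copy of the enum for this toy; see the real one above */
typedef enum
{
	RBM_NORMAL,					/* normal read */
	RBM_ZERO,					/* don't read; caller will initialize */
	RBM_ZERO_ON_ERROR,			/* read, zero the page on header error */
	RBM_NORMAL_NO_LOG			/* like RBM_NORMAL, but never log invalid */
} ReadBufferMode;

/* what XLogReadBufferExtended does when blkno is past the current EOF */
static const char *
past_eof_behavior(ReadBufferMode mode)
{
	switch (mode)
	{
		case RBM_NORMAL:
			return "log_invalid_page(), return InvalidBuffer";
		case RBM_NORMAL_NO_LOG:
			return "return InvalidBuffer without logging";
		case RBM_ZERO:
		case RBM_ZERO_ON_ERROR:
			return "extend relation with all-zero pages";
	}
	return "unreachable";
}

int
main(void)
{
	printf("RBM_NORMAL_NO_LOG past EOF: %s\n",
		   past_eof_behavior(RBM_NORMAL_NO_LOG));
	return 0;
}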
/* in globals.c ... this duplicates miscadmin.h */