]> granicus.if.org Git - postgresql/commitdiff
Fix race condition between hot standby and restoring a full-page image.
authorHeikki Linnakangas <heikki.linnakangas@iki.fi>
Thu, 13 Nov 2014 17:47:44 +0000 (19:47 +0200)
committerHeikki Linnakangas <heikki.linnakangas@iki.fi>
Thu, 13 Nov 2014 18:01:09 +0000 (20:01 +0200)
There was a window in RestoreBackupBlock where a page would be zeroed out,
but not yet locked. If a backend pinned and locked the page in that window,
it saw the zeroed page instead of the old page or new page contents, which
could lead to missing rows in a result set, or errors.

To fix, replace RBM_ZERO with RBM_ZERO_AND_LOCK, which atomically pins,
zeroes, and locks the page, if it's not in the buffer cache already.

In stable branches, the old RBM_ZERO constant is renamed to RBM_DO_NOT_USE,
to avoid breaking any 3rd party extensions that might use RBM_ZERO. More
importantly, this avoids renumbering the other enum values, which would
cause even bigger confusion in extensions that use ReadBufferExtended, but
haven't been recompiled.

Backpatch to all supported versions; this has been racy since hot standby
was introduced.

src/backend/access/hash/hashpage.c
src/backend/access/heap/heapam.c
src/backend/access/transam/xlog.c
src/backend/access/transam/xlogutils.c
src/backend/storage/buffer/bufmgr.c
src/include/storage/bufmgr.h

index 62e7755634bb54d7352a9f2aee82c36e923183ef..0cbe813b04a31928028df8719a9f0c9b445cabe0 100644 (file)
@@ -158,9 +158,8 @@ _hash_getinitbuf(Relation rel, BlockNumber blkno)
        if (blkno == P_NEW)
                elog(ERROR, "hash AM does not use P_NEW");
 
-       buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_ZERO, NULL);
-
-       LockBuffer(buf, HASH_WRITE);
+       buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_ZERO_AND_LOCK,
+                                                        NULL);
 
        /* ref count and lock type are correct */
 
@@ -201,11 +200,13 @@ _hash_getnewbuf(Relation rel, BlockNumber blkno, ForkNumber forkNum)
                if (BufferGetBlockNumber(buf) != blkno)
                        elog(ERROR, "unexpected hash relation size: %u, should be %u",
                                 BufferGetBlockNumber(buf), blkno);
+               LockBuffer(buf, HASH_WRITE);
        }
        else
-               buf = ReadBufferExtended(rel, forkNum, blkno, RBM_ZERO, NULL);
-
-       LockBuffer(buf, HASH_WRITE);
+       {
+               buf = ReadBufferExtended(rel, forkNum, blkno, RBM_ZERO_AND_LOCK,
+                                                                NULL);
+       }
 
        /* ref count and lock type are correct */
 
index 0105825e14a02b643ce7055b77d035c5eeefad71..6645732f74c2f51974d3e2d12e2f0dc120048e32 100644 (file)
@@ -4367,9 +4367,8 @@ heap_xlog_newpage(XLogRecPtr lsn, XLogRecord *record)
         * not do anything that assumes we are touching a heap.
         */
        buffer = XLogReadBufferExtended(xlrec->node, xlrec->forknum, xlrec->blkno,
-                                                                       RBM_ZERO);
+                                                                       RBM_ZERO_AND_LOCK);
        Assert(BufferIsValid(buffer));
-       LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
        page = (Page) BufferGetPage(buffer);
 
        Assert(record->xl_len == SizeOfHeapNewpage + BLCKSZ);
index dafd7f679195b2028e53a29b4ed4642e5ea45564..45028d2828829d6548825e28defa935478d821e6 100644 (file)
@@ -3620,12 +3620,8 @@ RestoreBackupBlock(XLogRecPtr lsn, XLogRecord *record, int block_index,
                {
                        /* Found it, apply the update */
                        buffer = XLogReadBufferExtended(bkpb.node, bkpb.fork, bkpb.block,
-                                                                                       RBM_ZERO);
+                                                                                       get_cleanup_lock ? RBM_ZERO_AND_CLEANUP_LOCK : RBM_ZERO_AND_LOCK);
                        Assert(BufferIsValid(buffer));
-                       if (get_cleanup_lock)
-                               LockBufferForCleanup(buffer);
-                       else
-                               LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
 
                        page = (Page) BufferGetPage(buffer);
 
index 3593a4e384da956774a4960743b4f4ba5aab5fa1..546e3ded2508888e29b302bd7e79ef5855602bb2 100644 (file)
@@ -234,7 +234,8 @@ XLogCheckInvalidPages(void)
  * The returned buffer is exclusively-locked.
  *
  * For historical reasons, instead of a ReadBufferMode argument, this only
- * supports RBM_ZERO (init == true) and RBM_NORMAL (init == false) modes.
+ * supports RBM_ZERO_AND_LOCK (init == true) and RBM_NORMAL (init == false)
+ * modes.
  */
 Buffer
 XLogReadBuffer(RelFileNode rnode, BlockNumber blkno, bool init)
@@ -242,8 +243,8 @@ XLogReadBuffer(RelFileNode rnode, BlockNumber blkno, bool init)
        Buffer          buf;
 
        buf = XLogReadBufferExtended(rnode, MAIN_FORKNUM, blkno,
-                                                                init ? RBM_ZERO : RBM_NORMAL);
-       if (BufferIsValid(buf))
+                                                                init ? RBM_ZERO_AND_LOCK : RBM_NORMAL);
+       if (BufferIsValid(buf) && !init)
                LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
 
        return buf;
@@ -262,8 +263,8 @@ XLogReadBuffer(RelFileNode rnode, BlockNumber blkno, bool init)
  * dropped or truncated. If we don't see evidence of that later in the WAL
  * sequence, we'll complain at the end of WAL replay.)
  *
- * In RBM_ZERO and RBM_ZERO_ON_ERROR modes, if the page doesn't exist, the
- * relation is extended with all-zeroes pages up to the given block number.
+ * In RBM_ZERO_* modes, if the page doesn't exist, the relation is extended
+ * with all-zeroes pages up to the given block number.
  *
  * In RBM_NORMAL_NO_LOG mode, we return InvalidBuffer if the page doesn't
  * exist, and we don't check for all-zeroes.  Thus, no log entry is made
@@ -317,7 +318,11 @@ XLogReadBufferExtended(RelFileNode rnode, ForkNumber forknum,
                do
                {
                        if (buffer != InvalidBuffer)
+                       {
+                               if (mode == RBM_ZERO_AND_LOCK || mode == RBM_ZERO_AND_CLEANUP_LOCK)
+                                       LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
                                ReleaseBuffer(buffer);
+                       }
                        buffer = ReadBufferWithoutRelcache(rnode, forknum,
                                                                                           P_NEW, mode, NULL);
                }
@@ -325,6 +330,8 @@ XLogReadBufferExtended(RelFileNode rnode, ForkNumber forknum,
                /* Handle the corner case that P_NEW returns non-consecutive pages */
                if (BufferGetBlockNumber(buffer) != blkno)
                {
+                       if (mode == RBM_ZERO_AND_LOCK || mode == RBM_ZERO_AND_CLEANUP_LOCK)
+                               LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
                        ReleaseBuffer(buffer);
                        buffer = ReadBufferWithoutRelcache(rnode, forknum, blkno,
                                                                                           mode, NULL);
index cd7108b156da2ca30f236fa91c33427474684c7d..b57d87d7ac68e46c50827fe263aff6ccd0d12bc2 100644 (file)
@@ -208,14 +208,19 @@ ReadBuffer(Relation reln, BlockNumber blockNum)
  * valid, the page is zeroed instead of throwing an error. This is intended
  * for non-critical data, where the caller is prepared to repair errors.
  *
- * In RBM_ZERO mode, if the page isn't in buffer cache already, it's filled
- * with zeros instead of reading it from disk.  Useful when the caller is
- * going to fill the page from scratch, since this saves I/O and avoids
+ * In RBM_ZERO_AND_LOCK mode, if the page isn't in buffer cache already, it's
+ * filled with zeros instead of reading it from disk.  Useful when the caller
+ * is going to fill the page from scratch, since this saves I/O and avoids
  * unnecessary failure if the page-on-disk has corrupt page headers.
+ * The page is returned locked to ensure that the caller has a chance to
+ * initialize the page before it's made visible to others.
  * Caution: do not use this mode to read a page that is beyond the relation's
  * current physical EOF; that is likely to cause problems in md.c when
  * the page is modified and written out. P_NEW is OK, though.
  *
+ * RBM_ZERO_AND_CLEANUP_LOCK is the same as RBM_ZERO_AND_LOCK, but acquires
+ * a cleanup-strength lock on the page.
+ *
  * RBM_NORMAL_NO_LOG mode is treated the same as RBM_NORMAL here.
  *
  * If strategy is not NULL, a nondefault buffer access strategy is used.
@@ -356,6 +361,18 @@ ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
                                                                                          isExtend,
                                                                                          found);
 
+                       /*
+                        * In RBM_ZERO_AND_LOCK mode, the caller expects the buffer to
+                        * be already locked on return.
+                        */
+                       if (!isLocalBuf)
+                       {
+                               if (mode == RBM_ZERO_AND_LOCK)
+                                       LWLockAcquire(bufHdr->content_lock, LW_EXCLUSIVE);
+                               else if (mode == RBM_ZERO_AND_CLEANUP_LOCK)
+                                       LockBufferForCleanup(BufferDescriptorGetBuffer(bufHdr));
+                       }
+
                        return BufferDescriptorGetBuffer(bufHdr);
                }
 
@@ -436,8 +453,11 @@ ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
                 * Read in the page, unless the caller intends to overwrite it and
                 * just wants us to allocate a buffer.
                 */
-               if (mode == RBM_ZERO)
+               if (mode == RBM_ZERO_AND_LOCK || mode == RBM_ZERO_AND_CLEANUP_LOCK ||
+                       mode == RBM_DO_NOT_USE)
+               {
                        MemSet((char *) bufBlock, 0, BLCKSZ);
+               }
                else
                {
                        smgrread(smgr, forkNum, blockNum, (char *) bufBlock);
@@ -464,6 +484,19 @@ ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
                }
        }
 
+       /*
+        * In RBM_ZERO_AND_LOCK mode, grab the buffer content lock before marking
+        * the page as valid, to make sure that no other backend sees the zeroed
+        * page before the caller has had a chance to initialize it.
+        *
+        * Since no-one else can be looking at the page contents yet, there is no
+        * difference between an exclusive lock and a cleanup-strength lock.
+        * (Note that we cannot use LockBuffer() of LockBufferForCleanup() here,
+        * because they assert that the buffer is already valid.)
+        */
+       if (mode == RBM_ZERO_AND_LOCK || mode == RBM_ZERO_AND_CLEANUP_LOCK)
+               LWLockAcquire(bufHdr->content_lock, LW_EXCLUSIVE);
+
        if (isLocalBuf)
        {
                /* Only need to adjust flags */
index 15d9101edf618eca8f9fa7b7aad493a4a18915b7..eb85d6f208d1e40035ff641babff5670fd43dd91 100644 (file)
@@ -36,11 +36,16 @@ typedef enum BufferAccessStrategyType
 typedef enum
 {
        RBM_NORMAL,                                     /* Normal read */
-       RBM_ZERO,                                       /* Don't read from disk, caller will
-                                                                * initialize */
+       RBM_DO_NOT_USE,                         /* This used to be RBM_ZERO. Only kept for
+                                                                * binary compatibility with 3rd party
+                                                                * extensions. */
        RBM_ZERO_ON_ERROR,                      /* Read, but return an all-zeros page on error */
-       RBM_NORMAL_NO_LOG                       /* Don't log page as invalid during WAL
+       RBM_NORMAL_NO_LOG,                      /* Don't log page as invalid during WAL
                                                                 * replay; otherwise same as RBM_NORMAL */
+       RBM_ZERO_AND_LOCK,                      /* Don't read from disk, caller will
+                                                                * initialize. Also locks the page. */
+       RBM_ZERO_AND_CLEANUP_LOCK       /* Like RBM_ZERO_AND_LOCK, but locks the page
+                                                                * in "cleanup" mode */
 } ReadBufferMode;
 
 /* in globals.c ... this duplicates miscadmin.h */