From 81c45081960f39351c38cd53554bb3788af54023 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Thu, 13 Nov 2014 19:47:44 +0200 Subject: [PATCH] Fix race condition between hot standby and restoring a full-page image. There was a window in RestoreBackupBlock where a page would be zeroed out, but not yet locked. If a backend pinned and locked the page in that window, it saw the zeroed page instead of the old page or new page contents, which could lead to missing rows in a result set, or errors. To fix, replace RBM_ZERO with RBM_ZERO_AND_LOCK, which atomically pins, zeroes, and locks the page, if it's not in the buffer cache already. In stable branches, the old RBM_ZERO constant is renamed to RBM_DO_NOT_USE, to avoid breaking any 3rd party extensions that might use RBM_ZERO. More importantly, this avoids renumbering the other enum values, which would cause even bigger confusion in extensions that use ReadBufferExtended, but haven't been recompiled. Backpatch to all supported versions; this has been racy since hot standby was introduced. --- src/backend/access/hash/hashpage.c | 13 ++++---- src/backend/access/heap/heapam.c | 6 ++-- src/backend/access/transam/xlogutils.c | 44 ++++++++++++++++---------- src/backend/storage/buffer/bufmgr.c | 41 +++++++++++++++++++++--- src/include/storage/bufmgr.h | 6 ++-- 5 files changed, 78 insertions(+), 32 deletions(-) diff --git a/src/backend/access/hash/hashpage.c b/src/backend/access/hash/hashpage.c index 9e4a2e0434..0aaa15c389 100644 --- a/src/backend/access/hash/hashpage.c +++ b/src/backend/access/hash/hashpage.c @@ -155,9 +155,8 @@ _hash_getinitbuf(Relation rel, BlockNumber blkno) if (blkno == P_NEW) elog(ERROR, "hash AM does not use P_NEW"); - buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_ZERO, NULL); - - LockBuffer(buf, HASH_WRITE); + buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_ZERO_AND_LOCK, + NULL); /* ref count and lock type are correct */ @@ -198,11 +197,13 @@ _hash_getnewbuf(Relation rel, BlockNumber blkno, ForkNumber forkNum) if (BufferGetBlockNumber(buf) != blkno) elog(ERROR, "unexpected hash relation size: %u, should be %u", BufferGetBlockNumber(buf), blkno); + LockBuffer(buf, HASH_WRITE); } else - buf = ReadBufferExtended(rel, forkNum, blkno, RBM_ZERO, NULL); - - LockBuffer(buf, HASH_WRITE); + { + buf = ReadBufferExtended(rel, forkNum, blkno, RBM_ZERO_AND_LOCK, + NULL); + } /* ref count and lock type are correct */ diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c index 43098f4442..1763b70631 100644 --- a/src/backend/access/heap/heapam.c +++ b/src/backend/access/heap/heapam.c @@ -7556,7 +7556,7 @@ heap_xlog_insert(XLogRecPtr lsn, XLogRecord *record) { XLogReadBufferForRedoExtended(lsn, record, 0, target_node, MAIN_FORKNUM, blkno, - RBM_ZERO, false, &buffer); + RBM_ZERO_AND_LOCK, false, &buffer); page = BufferGetPage(buffer); PageInit(page, BufferGetPageSize(buffer), 0); action = BLK_NEEDS_REDO; @@ -7683,7 +7683,7 @@ heap_xlog_multi_insert(XLogRecPtr lsn, XLogRecord *record) { XLogReadBufferForRedoExtended(lsn, record, 0, rnode, MAIN_FORKNUM, blkno, - RBM_ZERO, false, &buffer); + RBM_ZERO_AND_LOCK, false, &buffer); page = BufferGetPage(buffer); PageInit(page, BufferGetPageSize(buffer), 0); action = BLK_NEEDS_REDO; @@ -7876,7 +7876,7 @@ heap_xlog_update(XLogRecPtr lsn, XLogRecord *record, bool hot_update) { XLogReadBufferForRedoExtended(lsn, record, 1, rnode, MAIN_FORKNUM, newblk, - RBM_ZERO, false, &nbuffer); + RBM_ZERO_AND_LOCK, false, &nbuffer); page = (Page) BufferGetPage(nbuffer); PageInit(page, BufferGetPageSize(nbuffer), 0); newaction = BLK_NEEDS_REDO; diff --git a/src/backend/access/transam/xlogutils.c b/src/backend/access/transam/xlogutils.c index da0e45bfcc..cf04081c19 100644 --- a/src/backend/access/transam/xlogutils.c +++ b/src/backend/access/transam/xlogutils.c @@ -287,9 +287,13 @@ XLogReadBufferForRedo(XLogRecPtr lsn, XLogRecord *record, int block_index, * XLogReadBufferForRedoExtended * Like XLogReadBufferForRedo, but with extra options. * - * If mode is RBM_ZERO or RBM_ZERO_ON_ERROR, if the page doesn't exist, the - * relation is extended with all-zeroes pages up to the referenced block - * number. In RBM_ZERO mode, the return value is always BLK_NEEDS_REDO. + * In RBM_ZERO_* modes, if the page doesn't exist, the relation is extended + * with all-zeroes pages up to the referenced block number. In + * RBM_ZERO_AND_LOCK and RBM_ZERO_AND_CLEANUP_LOCK modes, the return value + * is always BLK_NEEDS_REDO. + * + * (The RBM_ZERO_AND_CLEANUP_LOCK mode is redundant with the get_cleanup_lock + * parameter. Do not use an inconsistent combination!) * * If 'get_cleanup_lock' is true, a "cleanup lock" is acquired on the buffer * using LockBufferForCleanup(), instead of a regular exclusive lock. @@ -312,10 +316,13 @@ XLogReadBufferForRedoExtended(XLogRecPtr lsn, XLogRecord *record, *buf = XLogReadBufferExtended(rnode, forkno, blkno, mode); if (BufferIsValid(*buf)) { - if (get_cleanup_lock) - LockBufferForCleanup(*buf); - else - LockBuffer(*buf, BUFFER_LOCK_EXCLUSIVE); + if (mode != RBM_ZERO_AND_LOCK && mode != RBM_ZERO_AND_CLEANUP_LOCK) + { + if (get_cleanup_lock) + LockBufferForCleanup(*buf); + else + LockBuffer(*buf, BUFFER_LOCK_EXCLUSIVE); + } if (lsn <= PageGetLSN(BufferGetPage(*buf))) return BLK_DONE; else @@ -341,7 +348,8 @@ XLogReadBufferForRedoExtended(XLogRecPtr lsn, XLogRecord *record, * The returned buffer is exclusively-locked. * * For historical reasons, instead of a ReadBufferMode argument, this only - * supports RBM_ZERO (init == true) and RBM_NORMAL (init == false) modes. + * supports RBM_ZERO_AND_LOCK (init == true) and RBM_NORMAL (init == false) + * modes. */ Buffer XLogReadBuffer(RelFileNode rnode, BlockNumber blkno, bool init) @@ -349,8 +357,8 @@ XLogReadBuffer(RelFileNode rnode, BlockNumber blkno, bool init) Buffer buf; buf = XLogReadBufferExtended(rnode, MAIN_FORKNUM, blkno, - init ? RBM_ZERO : RBM_NORMAL); - if (BufferIsValid(buf)) + init ? RBM_ZERO_AND_LOCK : RBM_NORMAL); + if (BufferIsValid(buf) && !init) LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE); return buf; @@ -369,8 +377,8 @@ XLogReadBuffer(RelFileNode rnode, BlockNumber blkno, bool init) * dropped or truncated. If we don't see evidence of that later in the WAL * sequence, we'll complain at the end of WAL replay.) * - * In RBM_ZERO and RBM_ZERO_ON_ERROR modes, if the page doesn't exist, the - * relation is extended with all-zeroes pages up to the given block number. + * In RBM_ZERO_* modes, if the page doesn't exist, the relation is extended + * with all-zeroes pages up to the given block number. * * In RBM_NORMAL_NO_LOG mode, we return InvalidBuffer if the page doesn't * exist, and we don't check for all-zeroes. Thus, no log entry is made @@ -424,7 +432,11 @@ XLogReadBufferExtended(RelFileNode rnode, ForkNumber forknum, do { if (buffer != InvalidBuffer) + { + if (mode == RBM_ZERO_AND_LOCK || mode == RBM_ZERO_AND_CLEANUP_LOCK) + LockBuffer(buffer, BUFFER_LOCK_UNLOCK); ReleaseBuffer(buffer); + } buffer = ReadBufferWithoutRelcache(rnode, forknum, P_NEW, mode, NULL); } @@ -432,6 +444,8 @@ XLogReadBufferExtended(RelFileNode rnode, ForkNumber forknum, /* Handle the corner case that P_NEW returns non-consecutive pages */ if (BufferGetBlockNumber(buffer) != blkno) { + if (mode == RBM_ZERO_AND_LOCK || mode == RBM_ZERO_AND_CLEANUP_LOCK) + LockBuffer(buffer, BUFFER_LOCK_UNLOCK); ReleaseBuffer(buffer); buffer = ReadBufferWithoutRelcache(rnode, forknum, blkno, mode, NULL); @@ -537,12 +551,8 @@ RestoreBackupBlockContents(XLogRecPtr lsn, BkpBlock bkpb, char *blk, Page page; buffer = XLogReadBufferExtended(bkpb.node, bkpb.fork, bkpb.block, - RBM_ZERO); + get_cleanup_lock ? RBM_ZERO_AND_CLEANUP_LOCK : RBM_ZERO_AND_LOCK); Assert(BufferIsValid(buffer)); - if (get_cleanup_lock) - LockBufferForCleanup(buffer); - else - LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE); page = (Page) BufferGetPage(buffer); diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c index cbfe05e2b5..9b62aecd91 100644 --- a/src/backend/storage/buffer/bufmgr.c +++ b/src/backend/storage/buffer/bufmgr.c @@ -499,14 +499,19 @@ ReadBuffer(Relation reln, BlockNumber blockNum) * valid, the page is zeroed instead of throwing an error. This is intended * for non-critical data, where the caller is prepared to repair errors. * - * In RBM_ZERO mode, if the page isn't in buffer cache already, it's filled - * with zeros instead of reading it from disk. Useful when the caller is - * going to fill the page from scratch, since this saves I/O and avoids + * In RBM_ZERO_AND_LOCK mode, if the page isn't in buffer cache already, it's + * filled with zeros instead of reading it from disk. Useful when the caller + * is going to fill the page from scratch, since this saves I/O and avoids * unnecessary failure if the page-on-disk has corrupt page headers. + * The page is returned locked to ensure that the caller has a chance to + * initialize the page before it's made visible to others. * Caution: do not use this mode to read a page that is beyond the relation's * current physical EOF; that is likely to cause problems in md.c when * the page is modified and written out. P_NEW is OK, though. * + * RBM_ZERO_AND_CLEANUP_LOCK is the same as RBM_ZERO_AND_LOCK, but acquires + * a cleanup-strength lock on the page. + * * RBM_NORMAL_NO_LOG mode is treated the same as RBM_NORMAL here. * * If strategy is not NULL, a nondefault buffer access strategy is used. @@ -648,6 +653,18 @@ ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum, isExtend, found); + /* + * In RBM_ZERO_AND_LOCK mode the caller expects the page to + * be locked on return. + */ + if (!isLocalBuf) + { + if (mode == RBM_ZERO_AND_LOCK) + LWLockAcquire(bufHdr->content_lock, LW_EXCLUSIVE); + else if (mode == RBM_ZERO_AND_CLEANUP_LOCK) + LockBufferForCleanup(BufferDescriptorGetBuffer(bufHdr)); + } + return BufferDescriptorGetBuffer(bufHdr); } @@ -729,7 +746,7 @@ ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum, * Read in the page, unless the caller intends to overwrite it and * just wants us to allocate a buffer. */ - if (mode == RBM_ZERO) + if (mode == RBM_ZERO_AND_LOCK || mode == RBM_ZERO_AND_CLEANUP_LOCK) MemSet((char *) bufBlock, 0, BLCKSZ); else { @@ -771,6 +788,22 @@ ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum, } } + /* + * In RBM_ZERO_AND_LOCK mode, grab the buffer content lock before marking + * the page as valid, to make sure that no other backend sees the zeroed + * page before the caller has had a chance to initialize it. + * + * Since no-one else can be looking at the page contents yet, there is no + * difference between an exclusive lock and a cleanup-strength lock. + * (Note that we cannot use LockBuffer() of LockBufferForCleanup() here, + * because they assert that the buffer is already valid.) + */ + if ((mode == RBM_ZERO_AND_LOCK || mode == RBM_ZERO_AND_CLEANUP_LOCK) && + !isLocalBuf) + { + LWLockAcquire(bufHdr->content_lock, LW_EXCLUSIVE); + } + if (isLocalBuf) { /* Only need to adjust flags */ diff --git a/src/include/storage/bufmgr.h b/src/include/storage/bufmgr.h index 42d9120ed3..b9af6ff6d1 100644 --- a/src/include/storage/bufmgr.h +++ b/src/include/storage/bufmgr.h @@ -36,8 +36,10 @@ typedef enum BufferAccessStrategyType typedef enum { RBM_NORMAL, /* Normal read */ - RBM_ZERO, /* Don't read from disk, caller will - * initialize */ + RBM_ZERO_AND_LOCK, /* Don't read from disk, caller will + * initialize. Also locks the page. */ + RBM_ZERO_AND_CLEANUP_LOCK, /* Like RBM_ZERO_AND_LOCK, but locks the page + * in "cleanup" mode */ RBM_ZERO_ON_ERROR, /* Read, but return an all-zeros page on error */ RBM_NORMAL_NO_LOG /* Don't log page as invalid during WAL * replay; otherwise same as RBM_NORMAL */ -- 2.40.0