IndexBulkDeleteCallback callback;
void *callback_state;
BTCycleId cycleid;
- BlockNumber lastBlockVacuumed; /* last blkno reached by Vacuum scan */
- BlockNumber lastUsedPage; /* blkno of last non-recyclable page */
+ BlockNumber lastBlockVacuumed; /* highest blkno actually vacuumed */
+ BlockNumber lastBlockLocked; /* highest blkno we've cleanup-locked */
BlockNumber totFreePages; /* true total # of free pages */
MemoryContext pagedelcontext;
} BTVacState;
vstate.callback_state = callback_state;
vstate.cycleid = cycleid;
vstate.lastBlockVacuumed = BTREE_METAPAGE; /* metapage is block 0, i.e. no leaf page vacuumed yet */
- vstate.lastUsedPage = BTREE_METAPAGE;
+ vstate.lastBlockLocked = BTREE_METAPAGE;
vstate.totFreePages = 0;
/* Create a temporary memory context to run _bt_pagedel in */
}
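The hunk above swaps lastUsedPage for lastBlockLocked, leaving two related but distinct trackers: lastBlockVacuumed is the highest leaf block for which an XLOG_BTREE_VACUUM record has been issued, while lastBlockLocked is the highest leaf block we have cleanup-locked, whether or not anything was deleted there. The standalone toy program below (not PostgreSQL code; the 8-page index and the hasDeletable array are invented for illustration) simulates one scan to show how the two counters diverge, which is exactly the gap the dummy-record logic later in btvacuumscan must close.

#include <stdbool.h>
#include <stdio.h>

typedef unsigned int BlockNumber;

int
main(void)
{
	BlockNumber lastBlockVacuumed = 0;	/* BTREE_METAPAGE is block 0 */
	BlockNumber lastBlockLocked = 0;
	/* which leaf pages of a toy 8-page index have deletable items */
	bool		hasDeletable[8] = {false, true, false, true, false,
								   false, true, false};
	BlockNumber blkno;

	for (blkno = 1; blkno < 8; blkno++)
	{
		/* btvacuumpage takes a cleanup lock on every leaf page visited */
		if (blkno > lastBlockLocked)
			lastBlockLocked = blkno;
		/* an XLOG_BTREE_VACUUM record is issued only if items are deleted */
		if (hasDeletable[blkno] && blkno > lastBlockVacuumed)
			lastBlockVacuumed = blkno;
	}

	/* locked (7) > vacuumed (6): the dummy WAL record would be needed */
	printf("vacuumed=%u locked=%u need_dummy=%s\n",
		   lastBlockVacuumed, lastBlockLocked,
		   lastBlockLocked > lastBlockVacuumed ? "yes" : "no");
	return 0;
}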
/*
- * InHotStandby we need to scan right up to the end of the index for
- * correct locking, so we may need to write a WAL record for the final
- * block in the index if it was not vacuumed. It's possible that VACUUMing
- * has actually removed zeroed pages at the end of the index so we need to
- * take care to issue the record for last actual block and not for the
- * last block that was scanned. Ignore empty indexes.
+ * If the WAL is replayed in hot standby, the replay process needs to get
+ * cleanup locks on all index leaf pages, just as we've been doing here.
+ * However, we won't issue any WAL records about pages that have no items
+ * to be deleted. For pages in between those we've vacuumed, the replay code
+ * will take locks under the direction of the lastBlockVacuumed fields in
+ * the XLOG_BTREE_VACUUM WAL records. To cover pages after the last one
+ * we vacuum, we need to issue a dummy XLOG_BTREE_VACUUM WAL record
+ * against the last leaf page in the index, if that one wasn't vacuumed.
*/
if (XLogStandbyInfoActive() &&
- num_pages > 1 && vstate.lastBlockVacuumed < (num_pages - 1))
+ vstate.lastBlockVacuumed < vstate.lastBlockLocked)
{
Buffer buf;
/*
- * We can't use _bt_getbuf() here because it always applies
- * _bt_checkpage(), which will barf on an all-zero page. We want to
- * recycle all-zero pages, not fail. Also, we want to use a
- * nondefault buffer access strategy.
+ * The page should be valid, but we can't use _bt_getbuf() because we
+ * want to use a nondefault buffer access strategy. Since we aren't
+ * going to delete any items, getting the cleanup lock again is probably
+ * overkill, but for consistency do that anyway.
*/
- buf = ReadBufferExtended(rel, MAIN_FORKNUM, num_pages - 1, RBM_NORMAL,
- info->strategy);
+ buf = ReadBufferExtended(rel, MAIN_FORKNUM, vstate.lastBlockLocked,
+ RBM_NORMAL, info->strategy);
LockBufferForCleanup(buf);
+ _bt_checkpage(rel, buf);
_bt_delitems_vacuum(rel, buf, NULL, 0, vstate.lastBlockVacuumed);
_bt_relbuf(rel, buf);
}
}
}
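The dummy record works because of the coverage invariant described in the comment above: an XLOG_BTREE_VACUUM record carrying (lastBlockVacuumed, block) makes replay cleanup-lock every page strictly between the two (via the sweep loop in btree_xlog_vacuum) and the target page itself (via the record's own redo). The standalone sketch below is toy code, not PostgreSQL code; the three-record sequence, ending in the dummy record (6, 7), is invented. It checks that the union of those ranges covers every leaf page of a small index.

#include <stdbool.h>
#include <stdio.h>

typedef unsigned int BlockNumber;

typedef struct
{
	BlockNumber lastBlockVacuumed;	/* value carried in the WAL record */
	BlockNumber block;			/* leaf page the record targets */
} VacuumRecord;

int
main(void)
{
	/* a plausible record sequence, ending with the dummy record (6, 7) */
	VacuumRecord recs[] = {{0, 3}, {3, 6}, {6, 7}};
	bool		locked[8] = {false};
	BlockNumber b;
	int			i;

	for (i = 0; i < 3; i++)
	{
		/* the sweep loop locks the strictly-in-between pages ... */
		for (b = recs[i].lastBlockVacuumed + 1; b < recs[i].block; b++)
			locked[b] = true;
		/* ... and the record's own redo locks the target page itself */
		locked[recs[i].block] = true;
	}

	for (b = 1; b < 8; b++)
		printf("page %u: %s\n", b, locked[b] ? "locked" : "MISSED");
	return 0;
}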
- /* If the page is in use, update lastUsedPage */
- if (!_bt_page_recyclable(page) && vstate->lastUsedPage < blkno)
- vstate->lastUsedPage = blkno;
-
/* Page is valid, see what to do with it */
if (_bt_page_recyclable(page))
{
LockBuffer(buf, BUFFER_LOCK_UNLOCK);
LockBufferForCleanup(buf);
+ /*
+ * Remember highest leaf page number we've taken cleanup lock on; see
+ * notes in btvacuumscan.
+ */
+ if (blkno > vstate->lastBlockLocked)
+ vstate->lastBlockLocked = blkno;
+
/*
* Check whether we need to recurse back to earlier pages. What we
* are concerned about is a page split that happened since we started
*/
if (ndeletable > 0)
{
- BlockNumber lastBlockVacuumed = BufferGetBlockNumber(buf);
-
+ /*
+ * Notice that the issued XLOG_BTREE_VACUUM WAL record includes an
+ * instruction to the replay code to get a cleanup lock on all pages
+ * between the previous lastBlockVacuumed and this page. This
+ * ensures that WAL replay locks all leaf pages at some point.
+ *
+ * Since we can visit leaf pages out-of-order when recursing,
+ * replay might end up locking such pages an extra time, but it
+ * doesn't seem worth the amount of bookkeeping it'd take to avoid
+ * that.
+ */
_bt_delitems_vacuum(rel, buf, deletable, ndeletable,
vstate->lastBlockVacuumed);
/*
- * Keep track of the block number of the lastBlockVacuumed, so we
- * can scan those blocks as well during WAL replay. This then
- * provides concurrency protection and allows btrees to be used
- * while in recovery.
+ * Remember highest leaf page number we've issued an
+ * XLOG_BTREE_VACUUM WAL record for.
*/
- if (lastBlockVacuumed > vstate->lastBlockVacuumed)
- vstate->lastBlockVacuumed = lastBlockVacuumed;
+ if (blkno > vstate->lastBlockVacuumed)
+ vstate->lastBlockVacuumed = blkno;
stats->tuples_removed += ndeletable;
/* must recompute maxoff */
BTPageOpaque opaque;
/*
- * If queries might be active then we need to ensure every block is
+ * If queries might be active then we need to ensure every leaf page is
* unpinned between the lastBlockVacuumed and the current block, if there
- * are any. This ensures that every block in the index is touched during
- * VACUUM as required to ensure scans work correctly.
+ * are any. This prevents replay of the VACUUM from reaching the stage of
+ * removing heap tuples while there could still be indexscans "in flight"
+ * to those particular tuples (see nbtree/README).
+ *
+ * It might be worth checking if there are actually any backends running;
+ * if not, we could just skip this.
+ *
+ * Since VACUUM can visit leaf pages out-of-order, it might issue records
+ * with lastBlockVacuumed >= block; that's not an error, it just means
+ * there's nothing to do now.
+ *
+ * Note: since we touch all pages in the range, we will lock non-leaf
+ * pages, and also any empty (all-zero) pages that may be in the index. It
+ * doesn't seem worth the complexity to avoid that. But it's important
+ * that HotStandbyActiveInReplay() will not return true if the database
+ * isn't yet consistent; so we need not fear reading still-corrupt blocks
+ * here during crash recovery.
*/
- if (standbyState == STANDBY_SNAPSHOT_READY &&
- (xlrec->lastBlockVacuumed + 1) != xlrec->block)
+ if (HotStandbyActiveInReplay())
{
- BlockNumber blkno = xlrec->lastBlockVacuumed + 1;
+ BlockNumber blkno;
- for (; blkno < xlrec->block; blkno++)
+ for (blkno = xlrec->lastBlockVacuumed + 1; blkno < xlrec->block; blkno++)
{
/*
+ * We use RBM_NORMAL_NO_LOG mode because it's not an error
+ * condition to see all-zero pages. The original btvacuumpage
+ * scan would have skipped over all-zero pages, noting them in FSM
+ * but not bothering to initialize them just yet; so we mustn't
+ * throw an error here. (We could skip acquiring the cleanup lock
+ * if PageIsNew, but it's probably not worth the cycles to test.)
+ *
* XXX we don't actually need to read the block, we just need to
* confirm it is unpinned. If we had a special call into the
* buffer manager we could optimise this so that if the block is
* not in shared_buffers we confirm it as unpinned.
- *
- * Another simple optimization would be to check if there's any
- * backends running; if not, we could just skip this.
*/
- buffer = XLogReadBufferExtended(xlrec->node, MAIN_FORKNUM, blkno, RBM_NORMAL);
+ buffer = XLogReadBufferExtended(xlrec->node, MAIN_FORKNUM, blkno,
+ RBM_NORMAL_NO_LOG);
if (BufferIsValid(buffer))
{
LockBufferForCleanup(buffer);
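As the comment notes, VACUUM's out-of-order leaf visits mean a record can arrive with lastBlockVacuumed >= block, in which case the sweep loop runs zero iterations. The toy standalone program below (not PostgreSQL code) exercises just the loop bounds to make that degenerate case concrete.

#include <stdio.h>

typedef unsigned int BlockNumber;

static void
replay_sweep(BlockNumber lastBlockVacuumed, BlockNumber block)
{
	BlockNumber blkno;
	int			n = 0;

	for (blkno = lastBlockVacuumed + 1; blkno < block; blkno++)
		n++;					/* stand-in for the pin check on one page */
	printf("record(lastBlockVacuumed=%u, block=%u): swept %d page(s)\n",
		   lastBlockVacuumed, block, n);
}

int
main(void)
{
	replay_sweep(0, 4);			/* forward progress: sweeps pages 1..3 */
	replay_sweep(4, 2);			/* out-of-order revisit: sweeps nothing */
	return 0;
}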
* true. Postmaster knows this by way of signal, not via shared memory.
*
* Unlike testing standbyState, this works in any process that's connected to
- * shared memory.
+ * shared memory. (And note that standbyState alone doesn't tell the truth
+ * anyway.)
*/
bool
HotStandbyActive(void)
}
}
+/*
+ * Like HotStandbyActive(), but to be used only in WAL replay code,
+ * where we don't need to ask any other process what the state is.
+ */
+bool
+HotStandbyActiveInReplay(void)
+{
+ Assert(AmStartupProcess());
+ return LocalHotStandbyActive;
+}
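HotStandbyActiveInReplay() can read the process-local flag directly because it runs only in the startup process, the process performing the replay. HotStandbyActive(), by contrast, must consult shared memory from an arbitrary backend, presumably caching the result once it is seen true, since hot standby does not get switched off again. The sketch below is a toy reconstruction of that caching idiom, with invented names (sharedActive, sharedLock) and a pthread mutex standing in for whatever synchronization the real code uses; it is not the actual xlog.c implementation.

#include <stdbool.h>
#include <stdio.h>
#include <pthread.h>

static bool localActive = false;	/* per-process cache */
static bool sharedActive = false;	/* stand-in for shared-memory state */
static pthread_mutex_t sharedLock = PTHREAD_MUTEX_INITIALIZER;

/* toy analogue of HotStandbyActive(): consult shared state until true */
static bool
toy_HotStandbyActive(void)
{
	if (!localActive)
	{
		pthread_mutex_lock(&sharedLock);
		localActive = sharedActive;
		pthread_mutex_unlock(&sharedLock);
	}
	return localActive;
}

/* toy analogue of HotStandbyActiveInReplay(): the local flag suffices */
static bool
toy_HotStandbyActiveInReplay(void)
{
	return localActive;
}

int
main(void)
{
	sharedActive = true;		/* the "startup process" enables hot standby */
	printf("%d %d\n", toy_HotStandbyActive(),
		   toy_HotStandbyActiveInReplay());
	return 0;
}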
+
/*
* Is this process allowed to insert new WAL records?
*
*
* In RBM_ZERO and RBM_ZERO_ON_ERROR modes, if the page doesn't exist, the
* relation is extended with all-zeroes pages up to the given block number.
+ *
+ * In RBM_NORMAL_NO_LOG mode, we return InvalidBuffer if the page doesn't
+ * exist, and we don't check for all-zeroes. Thus, no log entry is made
+ * to imply that the page should be dropped or truncated later.
*/
Buffer
XLogReadBufferExtended(RelFileNode rnode, ForkNumber forknum,
log_invalid_page(rnode, forknum, blkno, false);
return InvalidBuffer;
}
+ if (mode == RBM_NORMAL_NO_LOG)
+ return InvalidBuffer;
/* OK to extend the file */
/* we do this in recovery only - no rel-extension lock needed */
Assert(InRecovery);
* Assume when this function is called, that reln has been opened already.
*
* In RBM_NORMAL mode, the page is read from disk, and the page header is
- * validated. An error is thrown if the page header is not valid.
+ * validated. An error is thrown if the page header is not valid. (But
+ * note that an all-zero page is considered "valid"; see PageIsVerified().)
*
* RBM_ZERO_ON_ERROR is like the normal mode, but if the page header is not
* valid, the page is zeroed instead of throwing an error. This is intended
* current physical EOF; that is likely to cause problems in md.c when
* the page is modified and written out. P_NEW is OK, though.
*
+ * RBM_NORMAL_NO_LOG mode is treated the same as RBM_NORMAL here.
+ *
* If strategy is not NULL, a nondefault buffer access strategy is used.
* See buffer/README for details.
*/
extern bool RecoveryInProgress(void);
extern bool HotStandbyActive(void);
+extern bool HotStandbyActiveInReplay(void);
extern bool XLogInsertAllowed(void);
extern void GetXLogReceiptTime(TimestampTz *rtime, bool *fromStream);
extern XLogRecPtr GetXLogReplayRecPtr(TimeLineID *replayTLI);
RBM_NORMAL, /* Normal read */
RBM_ZERO, /* Don't read from disk, caller will
* initialize */
- RBM_ZERO_ON_ERROR /* Read, but return an all-zeros page on error */
+ RBM_ZERO_ON_ERROR, /* Read, but return an all-zeros page on error */
+ RBM_NORMAL_NO_LOG /* Don't log page as invalid during WAL
+ * replay; otherwise same as RBM_NORMAL */
} ReadBufferMode;
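Taken together, the comments above say the modes differ mainly in how XLogReadBufferExtended treats a read past the relation's current EOF. The standalone toy below (with a local copy of the enum; not PostgreSQL code) encodes that summary: RBM_NORMAL logs the page as invalid and returns InvalidBuffer, RBM_NORMAL_NO_LOG returns InvalidBuffer silently, and the RBM_ZERO variants extend the relation with all-zero pages.

#include <stdio.h>

/* local copy of the enum for this toy; see the real one above */
typedef enum
{
	RBM_NORMAL,					/* normal read */
	RBM_ZERO,					/* don't read; caller will initialize */
	RBM_ZERO_ON_ERROR,			/* read, zero the page on header error */
	RBM_NORMAL_NO_LOG			/* like RBM_NORMAL, but never log invalid */
} ReadBufferMode;

/* what XLogReadBufferExtended does when blkno is past the current EOF */
static const char *
past_eof_behavior(ReadBufferMode mode)
{
	switch (mode)
	{
		case RBM_NORMAL:
			return "log_invalid_page(), return InvalidBuffer";
		case RBM_NORMAL_NO_LOG:
			return "return InvalidBuffer without logging";
		case RBM_ZERO:
		case RBM_ZERO_ON_ERROR:
			return "extend relation with all-zero pages";
	}
	return "unreachable";
}

int
main(void)
{
	printf("RBM_NORMAL_NO_LOG past EOF: %s\n",
		   past_eof_behavior(RBM_NORMAL_NO_LOG));
	return 0;
}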
/* in globals.c ... this duplicates miscadmin.h */