From 647fd9a108581d875543fd601b09ebd05e850993 Mon Sep 17 00:00:00 2001 From: Tom Lane Date: Mon, 13 Aug 2007 19:08:26 +0000 Subject: [PATCH] Fix two bugs induced in VACUUM FULL by async-commit patch. First, we cannot assume that XLogAsyncCommitFlush guarantees hint bits will be settable, because clog.c's inexact LSN bookkeeping results in windows where a previously flushed transaction is considered unhintable because it shares an LSN slot with a later unflushed transaction. But repair_frag requires XMIN_COMMITTED to be correct so that it can distinguish tuples moved by the current vacuum. Since not being able to set the bit is an uncommon corner case, the most practical way of dealing with it seems to be to abandon shrinking (ie, don't invoke repair_frag) when we find a non-dead tuple whose XMIN_COMMITTED bit couldn't be set. Second, it is possible for the same reason that a RECENTLY_DEAD tuple does not get its XMAX_COMMITTED bit set during scan_heap. But by the time repair_frag examines the tuple it might be possible to set the bit. We therefore must take buffer content lock when calling HeapTupleSatisfiesVacuum a second time, else we can get an Assert failure in SetBufferCommitInfoNeedsSave. This latter bug is latent in existing releases, but I think it cannot actually occur without async commit, since the first HeapTupleSatisfiesVacuum call should always have set the bit. So I'm not going to back-patch it. In passing, reduce the existing "cannot shrink relation" messages from NOTICE to LOG level. The new message must be no higher than LOG if we don't want unpredictable regression test failures, and consistency seems like a good idea. Also arrange that only one such message is reported per VACUUM FULL; in typical scenarios you could get spammed with many such messages, which seems a bit useless. --- src/backend/access/transam/xlog.c | 6 +- src/backend/commands/vacuum.c | 119 +++++++++++++++++++++--------- 2 files changed, 90 insertions(+), 35 deletions(-) diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index 626143ff19..353fb02c94 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -7,7 +7,7 @@ * Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * - * $PostgreSQL: pgsql/src/backend/access/transam/xlog.c,v 1.277 2007/08/04 01:26:53 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/access/transam/xlog.c,v 1.278 2007/08/13 19:08:26 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -1858,6 +1858,10 @@ XLogBackgroundFlush(void) /* * Flush any previous asynchronously-committed transactions' commit records. + * + * NOTE: it is unwise to assume that this provides any strong guarantees. + * In particular, because of the inexact LSN bookkeeping used by clog.c, + * we cannot assume that hint bits will be settable for these transactions. */ void XLogAsyncCommitFlush(void) diff --git a/src/backend/commands/vacuum.c b/src/backend/commands/vacuum.c index 41c3b86791..358e9a5ad9 100644 --- a/src/backend/commands/vacuum.c +++ b/src/backend/commands/vacuum.c @@ -13,7 +13,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/commands/vacuum.c,v 1.354 2007/08/01 22:45:08 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/commands/vacuum.c,v 1.355 2007/08/13 19:08:26 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -1163,13 +1163,11 @@ full_vacuum_rel(Relation onerel, VacuumStmt *vacstmt) vacuum_set_xid_limits(vacstmt->freeze_min_age, onerel->rd_rel->relisshared, &OldestXmin, &FreezeLimit); - /* - * VACUUM FULL assumes that all tuple states are well-known prior to - * moving tuples around --- see comment "known dead" in repair_frag(), - * as well as simplifications in tqual.c. So before we start we must - * ensure that any asynchronously-committed transactions with changes - * against this table have been flushed to disk. It's sufficient to do - * this once after we've acquired AccessExclusiveLock. + /* + * Flush any previous async-commit transactions. This does not guarantee + * that we will be able to set hint bits for tuples they inserted, but + * it improves the probability, especially in simple sequential-commands + * cases. See scan_heap() and repair_frag() for more about this. */ XLogAsyncCommitFlush(); @@ -1386,15 +1384,38 @@ scan_heap(VRelStats *vacrelstats, Relation onerel, switch (HeapTupleSatisfiesVacuum(tuple.t_data, OldestXmin, buf)) { - case HEAPTUPLE_DEAD: - tupgone = true; /* we can delete the tuple */ - break; case HEAPTUPLE_LIVE: /* Tuple is good --- but let's do some validity checks */ if (onerel->rd_rel->relhasoids && !OidIsValid(HeapTupleGetOid(&tuple))) elog(WARNING, "relation \"%s\" TID %u/%u: OID is invalid", relname, blkno, offnum); + /* + * The shrinkage phase of VACUUM FULL requires that all + * live tuples have XMIN_COMMITTED set --- see comments in + * repair_frag()'s walk-along-page loop. Use of async + * commit may prevent HeapTupleSatisfiesVacuum from + * setting the bit for a recently committed tuple. Rather + * than trying to handle this corner case, we just give up + * and don't shrink. + */ + if (do_shrinking && + !(tuple.t_data->t_infomask & HEAP_XMIN_COMMITTED)) + { + ereport(LOG, + (errmsg("relation \"%s\" TID %u/%u: XMIN_COMMITTED not set for transaction %u --- cannot shrink relation", + relname, blkno, offnum, + HeapTupleHeaderGetXmin(tuple.t_data)))); + do_shrinking = false; + } + break; + case HEAPTUPLE_DEAD: + tupgone = true; /* we can delete the tuple */ + /* + * We need not require XMIN_COMMITTED or XMAX_COMMITTED to + * be set, since we will remove the tuple without any + * further examination of its hint bits. + */ break; case HEAPTUPLE_RECENTLY_DEAD: @@ -1404,6 +1425,20 @@ scan_heap(VRelStats *vacrelstats, Relation onerel, */ nkeep += 1; + /* + * As with the LIVE case, shrinkage requires XMIN_COMMITTED + * to be set. + */ + if (do_shrinking && + !(tuple.t_data->t_infomask & HEAP_XMIN_COMMITTED)) + { + ereport(LOG, + (errmsg("relation \"%s\" TID %u/%u: XMIN_COMMITTED not set for transaction %u --- cannot shrink relation", + relname, blkno, offnum, + HeapTupleHeaderGetXmin(tuple.t_data)))); + do_shrinking = false; + } + /* * If we do shrinking and this tuple is updated one then * remember it to construct updated tuple dependencies. @@ -1429,26 +1464,34 @@ scan_heap(VRelStats *vacrelstats, Relation onerel, /* * This should not happen, since we hold exclusive lock on - * the relation; shouldn't we raise an error? (Actually, + * the relation; shouldn't we raise an error? (Actually, * it can happen in system catalogs, since we tend to - * release write lock before commit there.) + * release write lock before commit there.) As above, + * we can't apply repair_frag() if the tuple state is + * uncertain. */ - ereport(NOTICE, - (errmsg("relation \"%s\" TID %u/%u: InsertTransactionInProgress %u --- cannot shrink relation", - relname, blkno, offnum, HeapTupleHeaderGetXmin(tuple.t_data)))); + if (do_shrinking) + ereport(LOG, + (errmsg("relation \"%s\" TID %u/%u: InsertTransactionInProgress %u --- cannot shrink relation", + relname, blkno, offnum, + HeapTupleHeaderGetXmin(tuple.t_data)))); do_shrinking = false; break; case HEAPTUPLE_DELETE_IN_PROGRESS: /* * This should not happen, since we hold exclusive lock on - * the relation; shouldn't we raise an error? (Actually, + * the relation; shouldn't we raise an error? (Actually, * it can happen in system catalogs, since we tend to - * release write lock before commit there.) + * release write lock before commit there.) As above, + * we can't apply repair_frag() if the tuple state is + * uncertain. */ - ereport(NOTICE, - (errmsg("relation \"%s\" TID %u/%u: DeleteTransactionInProgress %u --- cannot shrink relation", - relname, blkno, offnum, HeapTupleHeaderGetXmax(tuple.t_data)))); + if (do_shrinking) + ereport(LOG, + (errmsg("relation \"%s\" TID %u/%u: DeleteTransactionInProgress %u --- cannot shrink relation", + relname, blkno, offnum, + HeapTupleHeaderGetXmax(tuple.t_data)))); do_shrinking = false; break; default: @@ -1810,19 +1853,21 @@ repair_frag(VRelStats *vacrelstats, Relation onerel, * VACUUM FULL has an exclusive lock on the relation. So * normally no other transaction can have pending INSERTs or * DELETEs in this relation. A tuple is either: - * (a) a tuple in a system catalog, inserted or deleted - * by a not yet committed transaction + * (a) live (XMIN_COMMITTED) * (b) known dead (XMIN_INVALID, or XMAX_COMMITTED and xmax * is visible to all active transactions) - * (c) inserted by a committed xact (XMIN_COMMITTED) - * (d) moved by the currently running VACUUM. - * (e) deleted (XMAX_COMMITTED) but at least one active - * transaction does not see the deleting transaction - * In case (a) we wouldn't be in repair_frag() at all. - * In case (b) we cannot be here, because scan_heap() has - * already marked the item as unused, see continue above. Case - * (c) is what normally is to be expected. Case (d) is only - * possible, if a whole tuple chain has been moved while + * (c) inserted and deleted (XMIN_COMMITTED+XMAX_COMMITTED) + * but at least one active transaction does not see the + * deleting transaction (ie, it's RECENTLY_DEAD) + * (d) moved by the currently running VACUUM + * (e) inserted or deleted by a not yet committed transaction, + * or by a transaction we couldn't set XMIN_COMMITTED for. + * In case (e) we wouldn't be in repair_frag() at all, because + * scan_heap() detects those cases and shuts off shrinking. + * We can't see case (b) here either, because such tuples were + * already removed by vacuum_page(). Cases (a) and (c) are + * normal and will have XMIN_COMMITTED set. Case (d) is only + * possible if a whole tuple chain has been moved while * processing this or a higher numbered block. * --- */ @@ -1997,16 +2042,22 @@ repair_frag(VRelStats *vacrelstats, Relation onerel, ReleaseBuffer(nextBuf); break; } - /* must check for DEAD or MOVED_IN tuple, too */ + /* + * Must check for DEAD or MOVED_IN tuple, too. This + * could potentially update hint bits, so we'd better + * hold the buffer content lock. + */ + LockBuffer(nextBuf, BUFFER_LOCK_SHARE); nextTstatus = HeapTupleSatisfiesVacuum(nextTdata, OldestXmin, nextBuf); if (nextTstatus == HEAPTUPLE_DEAD || nextTstatus == HEAPTUPLE_INSERT_IN_PROGRESS) { - ReleaseBuffer(nextBuf); + UnlockReleaseBuffer(nextBuf); break; } + LockBuffer(nextBuf, BUFFER_LOCK_UNLOCK); /* if it's MOVED_OFF we shoulda moved this one with it */ if (nextTstatus == HEAPTUPLE_DELETE_IN_PROGRESS) elog(ERROR, "updated tuple is already HEAP_MOVED_OFF"); -- 2.40.0