From e6284649b9e30372b3990107a082bc7520325676 Mon Sep 17 00:00:00 2001
From: Tom Lane
Date: Tue, 25 Jul 2006 19:13:00 +0000
Subject: [PATCH] Modify btree to delete known-dead index entries without an
 actual VACUUM.

When we are about to split an index page to do an insertion, first look
to see if any entries marked LP_DELETE exist on the page, and if so
remove them to try to make enough space for the desired insert.  This
should reduce index bloat in heavily-updated tables, although of course
you still need VACUUM eventually to clean up the heap.

Junji Teramoto
---
 src/backend/access/nbtree/README      | 46 +++++++++++++--
 src/backend/access/nbtree/nbtinsert.c | 81 ++++++++++++++++++++++++---
 src/backend/access/nbtree/nbtpage.c   | 11 +++-
 src/backend/access/nbtree/nbtutils.c  |  8 ++-
 src/backend/access/nbtree/nbtxlog.c   | 10 +++-
 src/include/access/nbtree.h           |  4 +-
 6 files changed, 144 insertions(+), 16 deletions(-)

diff --git a/src/backend/access/nbtree/README b/src/backend/access/nbtree/README
index a9466aa634..e153a47a2a 100644
--- a/src/backend/access/nbtree/README
+++ b/src/backend/access/nbtree/README
@@ -1,4 +1,4 @@
-$PostgreSQL: pgsql/src/backend/access/nbtree/README,v 1.12 2006/05/08 00:00:09 tgl Exp $
+$PostgreSQL: pgsql/src/backend/access/nbtree/README,v 1.13 2006/07/25 19:13:00 tgl Exp $
 
 This directory contains a correct implementation of Lehman and Yao's
 high-concurrency B-tree management algorithm (P. Lehman and S. Yao,
@@ -146,11 +146,12 @@ Because a page can be split even while someone holds a pin on it, it is
 possible that an indexscan will return items that are no longer stored on
 the page it has a pin on, but rather somewhere to the right of that page.
 To ensure that VACUUM can't prematurely remove such heap tuples, we require
-btbulkdelete to obtain super-exclusive lock on every leaf page in the index
-(even pages that don't contain any deletable tuples).  This guarantees that
+btbulkdelete to obtain super-exclusive lock on every leaf page in the index,
+even pages that don't contain any deletable tuples.  This guarantees that
 the btbulkdelete call cannot return while any indexscan is still holding a
 copy of a deleted index tuple.  Note that this requirement does not say
-that btbulkdelete must visit the pages in any particular order.
+that btbulkdelete must visit the pages in any particular order.  (See also
+on-the-fly deletion, below.)
 
 There is no such interlocking for deletion of items in internal pages,
 since backends keep no lock nor pin on a page they have descended past.
@@ -320,6 +321,43 @@ positives, so long as it never gives a false negative.  This makes it
 possible to implement the test with a small counter value stored on each
 index page.
 
+On-the-fly deletion of index tuples
+-----------------------------------
+
+If a process visits a heap tuple and finds that it's dead and removable
+(ie, dead to all open transactions, not only that process), then we can
+return to the index and mark the corresponding index entry "known dead",
+allowing subsequent index scans to skip visiting the heap tuple.  The
+"known dead" marking uses the LP_DELETE bit in ItemIds.  This is currently
+only done in plain indexscans, not bitmap scans, because only plain scans
+visit the heap and index "in sync" and so there's not a convenient way
+to do it for bitmap scans.
+
+Once an index tuple has been marked LP_DELETE it can actually be removed
+from the index immediately; since index scans only stop "between" pages,
+no scan can lose its place from such a deletion.  We separate the steps
+because we allow LP_DELETE to be set with only a share lock (it's exactly
+like a hint bit for a heap tuple), but physically removing tuples requires
+exclusive lock.  In the current code we try to remove LP_DELETE tuples when
+we are otherwise faced with having to split a page to do an insertion (and
+hence have exclusive lock on it already).
+
+This leaves the index in a state where it has no entry for a dead tuple
+that still exists in the heap.  This is not a problem for the current
+implementation of VACUUM, but it could be a problem for anything that
+explicitly tries to find index entries for dead tuples.  (However, the
+same situation is created by REINDEX, since it doesn't enter dead
+tuples into the index.)
+
+It's sufficient to have an exclusive lock on the index page, not a
+super-exclusive lock, to do deletion of LP_DELETE items.  It might seem
+that this breaks the interlock between VACUUM and indexscans, but that is
+not so: as long as an indexscanning process has a pin on the page where
+the index item used to be, VACUUM cannot complete its btbulkdelete scan
+and so cannot remove the heap tuple.  This is another reason why
+btbulkdelete has to get super-exclusive lock on every leaf page, not only
+the ones where it actually sees items to delete.
+
 WAL considerations
 ------------------
 
diff --git a/src/backend/access/nbtree/nbtinsert.c b/src/backend/access/nbtree/nbtinsert.c
index 974ef92d18..597949aa2d 100644
--- a/src/backend/access/nbtree/nbtinsert.c
+++ b/src/backend/access/nbtree/nbtinsert.c
@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/access/nbtree/nbtinsert.c,v 1.141 2006/07/13 16:49:12 momjian Exp $
+ *	  $PostgreSQL: pgsql/src/backend/access/nbtree/nbtinsert.c,v 1.142 2006/07/25 19:13:00 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -65,6 +65,7 @@ static void _bt_pgaddtup(Relation rel, Page page,
 			 OffsetNumber itup_off, const char *where);
 static bool _bt_isequal(TupleDesc itupdesc, Page page, OffsetNumber offnum,
 			int keysz, ScanKey scankey);
+static void _bt_vacuum_one_page(Relation rel, Buffer buffer);
 
 
 /*
@@ -262,6 +263,8 @@ _bt_check_unique(Relation rel, IndexTuple itup, Relation heapRel,
 									 hbuffer) == HEAPTUPLE_DEAD)
 				{
 					curitemid->lp_flags |= LP_DELETE;
+					opaque->btpo_flags |= BTP_HAS_GARBAGE;
+					/* be sure to mark the proper buffer dirty... */
 					if (nbuf != InvalidBuffer)
 						SetBufferCommitInfoNeedsSave(nbuf);
 					else
@@ -424,11 +427,29 @@ _bt_insertonpg(Relation rel,
 		 */
 		bool		movedright = false;
 
-		while (PageGetFreeSpace(page) < itemsz &&
-			   !P_RIGHTMOST(lpageop) &&
-			   _bt_compare(rel, keysz, scankey, page, P_HIKEY) == 0 &&
-			   random() > (MAX_RANDOM_VALUE / 100))
+		while (PageGetFreeSpace(page) < itemsz)
 		{
+			Buffer		rbuf;
+
+			/*
+			 * before considering moving right, see if we can obtain enough
+			 * space by erasing LP_DELETE items
+			 */
+			if (P_ISLEAF(lpageop) && P_HAS_GARBAGE(lpageop))
+			{
+				_bt_vacuum_one_page(rel, buf);
+				if (PageGetFreeSpace(page) >= itemsz)
+					break;		/* OK, now we have enough space */
+			}
+
+			/*
+			 * nope, so check conditions (b) and (c) enumerated above
+			 */
+			if (P_RIGHTMOST(lpageop) ||
+				_bt_compare(rel, keysz, scankey, page, P_HIKEY) != 0 ||
+				random() <= (MAX_RANDOM_VALUE / 100))
+				break;
+
 			/*
 			 * step right to next non-dead page
 			 *
@@ -438,7 +459,7 @@ _bt_insertonpg(Relation rel,
 			 * pages won't do because we don't know when they will get
 			 * de-linked from the tree.
 			 */
-			Buffer		rbuf = InvalidBuffer;
+			rbuf = InvalidBuffer;
 
 			for (;;)
 			{
@@ -702,9 +723,9 @@ _bt_split(Relation rel, Buffer buf, OffsetNumber firstright,
 	ropaque = (BTPageOpaque) PageGetSpecialPointer(rightpage);
 
 	/* if we're splitting this page, it won't be the root when we're done */
-	/* also, clear the SPLIT_END flag in both pages */
+	/* also, clear the SPLIT_END and HAS_GARBAGE flags in both pages */
 	lopaque->btpo_flags = oopaque->btpo_flags;
-	lopaque->btpo_flags &= ~(BTP_ROOT | BTP_SPLIT_END);
+	lopaque->btpo_flags &= ~(BTP_ROOT | BTP_SPLIT_END | BTP_HAS_GARBAGE);
 	ropaque->btpo_flags = lopaque->btpo_flags;
 	lopaque->btpo_prev = oopaque->btpo_prev;
 	lopaque->btpo_next = BufferGetBlockNumber(rbuf);
@@ -1651,3 +1672,47 @@ _bt_isequal(TupleDesc itupdesc, Page page, OffsetNumber offnum,
 	/* if we get here, the keys are equal */
 	return true;
 }
+
+/*
+ * _bt_vacuum_one_page - vacuum just one index page.
+ *
+ * Try to remove LP_DELETE items from the given page.  The passed buffer
+ * must be exclusive-locked, but unlike a real VACUUM, we don't need a
+ * super-exclusive "cleanup" lock (see nbtree/README).
+ */
+static void
+_bt_vacuum_one_page(Relation rel, Buffer buffer)
+{
+	OffsetNumber deletable[MaxOffsetNumber];
+	int			ndeletable = 0;
+	OffsetNumber offnum,
+				minoff,
+				maxoff;
+	Page		page = BufferGetPage(buffer);
+	BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+
+	/*
+	 * Scan over all items to see which ones need to be deleted
+	 * according to LP_DELETE flags.
+	 */
+	minoff = P_FIRSTDATAKEY(opaque);
+	maxoff = PageGetMaxOffsetNumber(page);
+	for (offnum = minoff;
+		 offnum <= maxoff;
+		 offnum = OffsetNumberNext(offnum))
+	{
+		ItemId		itemId = PageGetItemId(page, offnum);
+
+		if (ItemIdDeleted(itemId))
+			deletable[ndeletable++] = offnum;
+	}
+
+	if (ndeletable > 0)
+		_bt_delitems(rel, buffer, deletable, ndeletable);
+
+	/*
+	 * Note: if we didn't find any LP_DELETE items, then the page's
+	 * BTP_HAS_GARBAGE hint bit is falsely set.  We do not bother
+	 * expending a separate write to clear it, however.  We will clear
+	 * it when we split the page.
+	 */
+}
diff --git a/src/backend/access/nbtree/nbtpage.c b/src/backend/access/nbtree/nbtpage.c
index e069495f94..080e10c88c 100644
--- a/src/backend/access/nbtree/nbtpage.c
+++ b/src/backend/access/nbtree/nbtpage.c
@@ -9,7 +9,7 @@
  *
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/access/nbtree/nbtpage.c,v 1.98 2006/07/13 16:49:12 momjian Exp $
+ *	  $PostgreSQL: pgsql/src/backend/access/nbtree/nbtpage.c,v 1.99 2006/07/25 19:13:00 tgl Exp $
  *
  * NOTES
  *	  Postgres btree pages look like ordinary relation pages.  The opaque
@@ -668,6 +668,15 @@ _bt_delitems(Relation rel, Buffer buf,
 	opaque = (BTPageOpaque) PageGetSpecialPointer(page);
 	opaque->btpo_cycleid = 0;
 
+	/*
+	 * Mark the page as not containing any LP_DELETE items.  This is not
+	 * certainly true (there might be some that have recently been marked,
+	 * but weren't included in our target-item list), but it will almost
+	 * always be true and it doesn't seem worth an additional page scan
+	 * to check it.  Remember that BTP_HAS_GARBAGE is only a hint anyway.
+	 */
+	opaque->btpo_flags &= ~BTP_HAS_GARBAGE;
+
 	MarkBufferDirty(buf);
 
 	/* XLOG stuff */
diff --git a/src/backend/access/nbtree/nbtutils.c b/src/backend/access/nbtree/nbtutils.c
index 840244e597..8b68054112 100644
--- a/src/backend/access/nbtree/nbtutils.c
+++ b/src/backend/access/nbtree/nbtutils.c
@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/access/nbtree/nbtutils.c,v 1.77 2006/07/13 16:49:12 momjian Exp $
+ *	  $PostgreSQL: pgsql/src/backend/access/nbtree/nbtutils.c,v 1.78 2006/07/25 19:13:00 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -884,9 +884,15 @@ _bt_killitems(IndexScanDesc scan, bool haveLock)
 	 * Since this can be redone later if needed, it's treated the same
 	 * as a commit-hint-bit status update for heap tuples: we mark the
 	 * buffer dirty but don't make a WAL log entry.
+	 *
+	 * Whenever we mark anything LP_DELETEd, we also set the page's
+	 * BTP_HAS_GARBAGE flag, which is likewise just a hint.
 	 */
 	if (killedsomething)
+	{
+		opaque->btpo_flags |= BTP_HAS_GARBAGE;
 		SetBufferCommitInfoNeedsSave(so->currPos.buf);
+	}
 
 	if (!haveLock)
 		LockBuffer(so->currPos.buf, BUFFER_LOCK_UNLOCK);
diff --git a/src/backend/access/nbtree/nbtxlog.c b/src/backend/access/nbtree/nbtxlog.c
index 63cb342e39..126f593cd9 100644
--- a/src/backend/access/nbtree/nbtxlog.c
+++ b/src/backend/access/nbtree/nbtxlog.c
@@ -8,7 +8,7 @@
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * IDENTIFICATION
-*	  $PostgreSQL: pgsql/src/backend/access/nbtree/nbtxlog.c,v 1.35 2006/07/14 14:52:17 momjian Exp $
+*	  $PostgreSQL: pgsql/src/backend/access/nbtree/nbtxlog.c,v 1.36 2006/07/25 19:13:00 tgl Exp $
 *
 *-------------------------------------------------------------------------
 */
@@ -346,6 +346,7 @@ btree_xlog_delete(XLogRecPtr lsn, XLogRecord *record)
 	Relation	reln;
 	Buffer		buffer;
 	Page		page;
+	BTPageOpaque opaque;
 
 	if (record->xl_info & XLR_BKP_BLOCK_1)
 		return;
@@ -374,6 +375,13 @@ btree_xlog_delete(XLogRecPtr lsn, XLogRecord *record)
 		PageIndexMultiDelete(page, unused, unend - unused);
 	}
 
+	/*
+	 * Mark the page as not containing any LP_DELETE items --- see comments
+	 * in _bt_delitems().
+ */ + opaque = (BTPageOpaque) PageGetSpecialPointer(page); + opaque->btpo_flags &= ~BTP_HAS_GARBAGE; + PageSetLSN(page, lsn); PageSetTLI(page, ThisTimeLineID); MarkBufferDirty(buffer); diff --git a/src/include/access/nbtree.h b/src/include/access/nbtree.h index b22422ffd6..232cbb92bd 100644 --- a/src/include/access/nbtree.h +++ b/src/include/access/nbtree.h @@ -7,7 +7,7 @@ * Portions Copyright (c) 1996-2006, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * - * $PostgreSQL: pgsql/src/include/access/nbtree.h,v 1.101 2006/07/11 21:05:57 tgl Exp $ + * $PostgreSQL: pgsql/src/include/access/nbtree.h,v 1.102 2006/07/25 19:13:00 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -71,6 +71,7 @@ typedef BTPageOpaqueData *BTPageOpaque; #define BTP_META (1 << 3) /* meta-page */ #define BTP_HALF_DEAD (1 << 4) /* empty, but still in tree */ #define BTP_SPLIT_END (1 << 5) /* rightmost page of split group */ +#define BTP_HAS_GARBAGE (1 << 6) /* page has LP_DELETEd tuples */ /* @@ -163,6 +164,7 @@ typedef struct BTMetaPageData #define P_ISROOT(opaque) ((opaque)->btpo_flags & BTP_ROOT) #define P_ISDELETED(opaque) ((opaque)->btpo_flags & BTP_DELETED) #define P_IGNORE(opaque) ((opaque)->btpo_flags & (BTP_DELETED|BTP_HALF_DEAD)) +#define P_HAS_GARBAGE(opaque) ((opaque)->btpo_flags & BTP_HAS_GARBAGE) /* * Lehman and Yao's algorithm requires a ``high key'' on every non-rightmost -- 2.40.0
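
To see the new insertion-path logic in isolation, here is a minimal standalone C
sketch.  It is not PostgreSQL source: MiniPage, mini_vacuum_one_page, and
mini_make_room are invented stand-ins that only model how the patched
_bt_insertonpg() loop tries to reclaim LP_DELETE'd space on a leaf page before
resorting to a move-right or a page split.

/*
 * Minimal standalone sketch, NOT PostgreSQL source: all names here are
 * invented stand-ins.  When an incoming item doesn't fit on a leaf page
 * whose BTP_HAS_GARBAGE hint is set, reclaim LP_DELETE'd items first and
 * fall back to move-right/split only if that still isn't enough.
 */
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

#define BTP_HAS_GARBAGE		(1 << 6)	/* page may hold LP_DELETE'd items */
#define P_HAS_GARBAGE(p)	(((p)->flags & BTP_HAS_GARBAGE) != 0)

typedef struct MiniPage
{
	uint16_t	flags;			/* BTP_* hint bits */
	bool		is_leaf;
	size_t		free_space;		/* bytes currently free on the page */
	size_t		dead_space;		/* bytes held by LP_DELETE'd items */
} MiniPage;

/* Model of _bt_vacuum_one_page(): reclaim dead items, clear the hint. */
static void
mini_vacuum_one_page(MiniPage *page)
{
	page->free_space += page->dead_space;
	page->dead_space = 0;
	page->flags &= ~BTP_HAS_GARBAGE;
}

/* Returns true if itemsz fits; false means the caller must move right
 * or split, just as in the real insertion loop. */
static bool
mini_make_room(MiniPage *page, size_t itemsz)
{
	if (page->free_space >= itemsz)
		return true;
	/* only leaf pages carry LP_DELETE'd entries worth reclaiming */
	if (page->is_leaf && P_HAS_GARBAGE(page))
	{
		mini_vacuum_one_page(page);
		if (page->free_space >= itemsz)
			return true;		/* split avoided */
	}
	return false;
}

int
main(void)
{
	/* 40 bytes free, 120 bytes of reclaimable LP_DELETE'd garbage */
	MiniPage	leaf = {BTP_HAS_GARBAGE, true, 40, 120};

	/* a 100-byte item doesn't fit at first, but reclaiming avoids a split */
	printf("fits without split: %s\n",
		   mini_make_room(&leaf, 100) ? "yes" : "no");
	return 0;
}

Note the ordering: reclamation is attempted first, so a page whose shortfall
is entirely dead entries is repaired in place and never reaches the split
path at all.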
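The marking side of the protocol can be sketched the same way, again with
invented stand-in types rather than the real ItemId and BTPageOpaque
structures.  As in _bt_killitems(), LP_DELETE is set on each known-dead item
and the page-level BTP_HAS_GARBAGE hint is raised; the real code does this
under a shared lock and dirties the buffer without writing WAL, because hint
bits may safely be lost in a crash.

/*
 * Companion sketch, NOT PostgreSQL source.  Marking items "known dead"
 * is a pure hint update: set LP_DELETE on each killed item, raise the
 * page hint, and dirty the buffer without emitting any WAL record.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define LP_DELETE		2		/* stand-in for the ItemId flag bit */
#define BTP_HAS_GARBAGE	(1 << 6)

typedef struct
{
	uint16_t	lp_flags;
} MiniItemId;

typedef struct
{
	uint16_t	btpo_flags;
	MiniItemId	items[256];
	bool		hint_dirty;		/* models SetBufferCommitInfoNeedsSave() */
} MiniLeafPage;

static void
mini_kill_items(MiniLeafPage *page, const int *killed, int nkilled)
{
	if (nkilled <= 0)
		return;					/* nothing marked, page untouched */
	for (int i = 0; i < nkilled; i++)
		page->items[killed[i]].lp_flags |= LP_DELETE;
	page->btpo_flags |= BTP_HAS_GARBAGE;	/* page-level hint */
	page->hint_dirty = true;	/* dirty the buffer, but write no WAL */
}

int
main(void)
{
	MiniLeafPage page = {0};
	int			killed[] = {3, 7};

	mini_kill_items(&page, killed, 2);
	printf("hint set: %s\n",
		   (page.btpo_flags & BTP_HAS_GARBAGE) ? "yes" : "no");
	return 0;
}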