From 1383e2a1a937116e1367c9584984f0730f9ef4d5 Mon Sep 17 00:00:00 2001
From: Tom Lane <tgl@sss.pgh.pa.us>
Date: Wed, 4 Apr 2018 14:26:04 -0400
Subject: [PATCH] Improve FSM management for BRIN indexes.

BRIN indexes like to propagate additions of free space into the upper
pages of their free space maps as soon as the new space is known, even
when it's just on one individual index page.  Previously this required
calling FreeSpaceMapVacuum, which is quite an expensive thing if the
map is large.  Use the FreeSpaceMapVacuumRange function recently added
by commit c79f6df75 to reduce the amount of work done for this purpose.

Fix a couple of places that neglected to do the upper-page vacuuming
at all after recording new free space.  If the policy is to be that
BRIN should do that, it should do it everywhere.

Do RecordPageWithFreeSpace unconditionally in brin_page_cleanup, and
do FreeSpaceMapVacuum unconditionally in brin_vacuum_scan.  Because of
the FSM's imprecise storage of free space, the old complications here
seldom bought anything; they just slowed things down.  This approach
also provides a predictable path for FSM corruption to be repaired.

Remove premature RecordPageWithFreeSpace call in brin_getinsertbuffer
where it's about to return an extended page to the caller.  The caller
should do that, instead, after it's inserted its new tuple.  Fix the
one caller that forgot to do so.

Simplify logic in brin_doupdate's same-page-update case by postponing
brin_initialize_empty_new_buffer to after the critical section; I see
little point in doing it before.

Avoid repeat calls of RelationGetNumberOfBlocks in brin_vacuum_scan.
Avoid duplicate BufferGetBlockNumber and BufferGetPage calls in a
couple of places where we already had the right values.

Move a BRIN_elog debug logging call out of a critical section; that's
pretty unsafe and I don't think it buys us anything to not wait till
after the critical section.

Move the "*extended = false" step in brin_getinsertbuffer into the
routine's main loop.  There's no actual bug there, since the loop
can't iterate with *extended still true, but it doesn't seem very
future-proof as coded; and it's certainly not documented as a loop
invariant.

This is all from follow-on investigation inspired by commit c79f6df75.

Discussion: https://postgr.es/m/5801.1522429460@sss.pgh.pa.us
---
 src/backend/access/brin/brin.c         |  29 +++--
 src/backend/access/brin/brin_pageops.c | 155 ++++++++++++++-----------
 src/include/access/brin_pageops.h      |   2 +-
 3 files changed, 103 insertions(+), 83 deletions(-)

diff --git a/src/backend/access/brin/brin.c b/src/backend/access/brin/brin.c
index 0e5849efdc..6ed115f81c 100644
--- a/src/backend/access/brin/brin.c
+++ b/src/backend/access/brin/brin.c
@@ -1121,16 +1121,22 @@ initialize_brin_buildstate(Relation idxRel, BrinRevmap *revmap,
 static void
 terminate_brin_buildstate(BrinBuildState *state)
 {
-	/* release the last index buffer used */
+	/*
+	 * Release the last index buffer used.  We might as well ensure that
+	 * whatever free space remains in that page is available in FSM, too.
+	 */
 	if (!BufferIsInvalid(state->bs_currentInsertBuf))
 	{
 		Page		page;
+		Size		freespace;
+		BlockNumber blk;
 
 		page = BufferGetPage(state->bs_currentInsertBuf);
-		RecordPageWithFreeSpace(state->bs_irel,
-								BufferGetBlockNumber(state->bs_currentInsertBuf),
-								PageGetFreeSpace(page));
+		freespace = PageGetFreeSpace(page);
+		blk = BufferGetBlockNumber(state->bs_currentInsertBuf);
 		ReleaseBuffer(state->bs_currentInsertBuf);
+		RecordPageWithFreeSpace(state->bs_irel, blk, freespace);
+		FreeSpaceMapVacuumRange(state->bs_irel, blk, blk + 1);
 	}
 
 	brin_free_desc(state->bs_bdesc);
@@ -1445,14 +1451,15 @@ union_tuples(BrinDesc *bdesc, BrinMemTuple *a, BrinTuple *b)
 static void
 brin_vacuum_scan(Relation idxrel, BufferAccessStrategy strategy)
 {
-	bool		vacuum_fsm = false;
+	BlockNumber nblocks;
 	BlockNumber blkno;
 
 	/*
 	 * Scan the index in physical order, and clean up any possible mess in
 	 * each page.
 	 */
-	for (blkno = 0; blkno < RelationGetNumberOfBlocks(idxrel); blkno++)
+	nblocks = RelationGetNumberOfBlocks(idxrel);
+	for (blkno = 0; blkno < nblocks; blkno++)
 	{
 		Buffer		buf;
 
@@ -1461,15 +1468,15 @@ brin_vacuum_scan(Relation idxrel, BufferAccessStrategy strategy)
 		buf = ReadBufferExtended(idxrel, MAIN_FORKNUM, blkno,
 								 RBM_NORMAL, strategy);
 
-		vacuum_fsm |= brin_page_cleanup(idxrel, buf);
+		brin_page_cleanup(idxrel, buf);
 
 		ReleaseBuffer(buf);
 	}
 
 	/*
-	 * If we made any change to the FSM, make sure the new info is visible all
-	 * the way to the top.
+	 * Update all upper pages in the index's FSM, as well.  This ensures not
+	 * only that we propagate leaf-page FSM updates made by brin_page_cleanup,
+	 * but also that any pre-existing damage or out-of-dateness is repaired.
 	 */
-	if (vacuum_fsm)
-		FreeSpaceMapVacuum(idxrel);
+	FreeSpaceMapVacuum(idxrel);
 }

diff --git a/src/backend/access/brin/brin_pageops.c b/src/backend/access/brin/brin_pageops.c
index 60a7025ec5..040cb62e55 100644
--- a/src/backend/access/brin/brin_pageops.c
+++ b/src/backend/access/brin/brin_pageops.c
@@ -64,6 +64,7 @@ brin_doupdate(Relation idxrel, BlockNumber pagesPerRange,
 	BrinTuple  *oldtup;
 	Size		oldsz;
 	Buffer		newbuf;
+	BlockNumber newblk = InvalidBlockNumber;
 	bool		extended;
 
 	Assert(newsz == MAXALIGN(newsz));
@@ -101,6 +102,8 @@ brin_doupdate(Relation idxrel, BlockNumber pagesPerRange,
 			Assert(!extended);
 			newbuf = InvalidBuffer;
 		}
+		else
+			newblk = BufferGetBlockNumber(newbuf);
 	}
 	else
 	{
@@ -136,7 +139,7 @@ brin_doupdate(Relation idxrel, BlockNumber pagesPerRange,
 				brin_initialize_empty_new_buffer(idxrel, newbuf);
 			UnlockReleaseBuffer(newbuf);
 			if (extended)
-				FreeSpaceMapVacuum(idxrel);
+				FreeSpaceMapVacuumRange(idxrel, newblk, newblk + 1);
 		}
 		return false;
 	}
@@ -152,11 +155,12 @@ brin_doupdate(Relation idxrel, BlockNumber pagesPerRange,
 		LockBuffer(oldbuf, BUFFER_LOCK_UNLOCK);
 		if (BufferIsValid(newbuf))
 		{
+			/* As above, initialize and record new page if we got one */
 			if (extended)
 				brin_initialize_empty_new_buffer(idxrel, newbuf);
 			UnlockReleaseBuffer(newbuf);
 			if (extended)
-				FreeSpaceMapVacuum(idxrel);
+				FreeSpaceMapVacuumRange(idxrel, newblk, newblk + 1);
 		}
 		return false;
 	}
@@ -173,14 +177,6 @@ brin_doupdate(Relation idxrel, BlockNumber pagesPerRange,
 	if (((BrinPageFlags(oldpage) & BRIN_EVACUATE_PAGE) == 0) &&
 		brin_can_do_samepage_update(oldbuf, origsz, newsz))
 	{
-		if (BufferIsValid(newbuf))
-		{
-			/* as above */
-			if (extended)
-				brin_initialize_empty_new_buffer(idxrel, newbuf);
-			UnlockReleaseBuffer(newbuf);
-		}
-
 		START_CRIT_SECTION();
 		if (!PageIndexTupleOverwrite(oldpage, oldoff, (Item) newtup, newsz))
 			elog(ERROR, "failed to replace BRIN tuple");
@@ -210,8 +206,15 @@ brin_doupdate(Relation idxrel, BlockNumber pagesPerRange,
 
 		LockBuffer(oldbuf, BUFFER_LOCK_UNLOCK);
 
-		if (extended)
-			FreeSpaceMapVacuum(idxrel);
+		if (BufferIsValid(newbuf))
+		{
+			/* As above, initialize and record new page if we got one */
+			if (extended)
+				brin_initialize_empty_new_buffer(idxrel, newbuf);
+			UnlockReleaseBuffer(newbuf);
+			if (extended)
+				FreeSpaceMapVacuumRange(idxrel, newblk, newblk + 1);
+		}
 
 		return true;
 	}
@@ -234,7 +237,6 @@ brin_doupdate(Relation idxrel, BlockNumber pagesPerRange,
 		Buffer		revmapbuf;
 		ItemPointerData newtid;
 		OffsetNumber newoff;
-		BlockNumber newblk = InvalidBlockNumber;
 		Size		freespace = 0;
 
 		revmapbuf = brinLockRevmapPageForUpdate(revmap, heapBlk);
@@ -247,7 +249,7 @@
 		 * need to do that here.
 		 */
 		if (extended)
-			brin_page_init(BufferGetPage(newbuf), BRIN_PAGETYPE_REGULAR);
+			brin_page_init(newpage, BRIN_PAGETYPE_REGULAR);
 
 		PageIndexTupleDeleteNoCompact(oldpage, oldoff);
 		newoff = PageAddItem(newpage, (Item) newtup, newsz,
@@ -259,12 +261,9 @@
 
 		/* needed to update FSM below */
 		if (extended)
-		{
-			newblk = BufferGetBlockNumber(newbuf);
 			freespace = br_page_get_freespace(newpage);
-		}
 
-		ItemPointerSet(&newtid, BufferGetBlockNumber(newbuf), newoff);
+		ItemPointerSet(&newtid, newblk, newoff);
 		brinSetHeapBlockItemptr(revmapbuf, pagesPerRange, heapBlk, newtid);
 		MarkBufferDirty(revmapbuf);
@@ -311,9 +310,8 @@
 
 		if (extended)
 		{
-			Assert(BlockNumberIsValid(newblk));
 			RecordPageWithFreeSpace(idxrel, newblk, freespace);
-			FreeSpaceMapVacuum(idxrel);
+			FreeSpaceMapVacuumRange(idxrel, newblk, newblk + 1);
 		}
 
 		return true;
@@ -350,6 +348,7 @@ brin_doinsert(Relation idxrel, BlockNumber pagesPerRange,
 	Page		page;
 	BlockNumber blk;
 	OffsetNumber off;
+	Size		freespace = 0;
 	Buffer		revmapbuf;
 	ItemPointerData tid;
 	bool		extended;
@@ -410,15 +409,16 @@
 	/* Execute the actual insertion */
 	START_CRIT_SECTION();
 	if (extended)
-		brin_page_init(BufferGetPage(*buffer), BRIN_PAGETYPE_REGULAR);
+		brin_page_init(page, BRIN_PAGETYPE_REGULAR);
 	off = PageAddItem(page, (Item) tup, itemsz, InvalidOffsetNumber,
 					  false, false);
 	if (off == InvalidOffsetNumber)
-		elog(ERROR, "could not insert new index tuple to page");
+		elog(ERROR, "failed to add BRIN tuple to new page");
 	MarkBufferDirty(*buffer);
 
-	BRIN_elog((DEBUG2, "inserted tuple (%u,%u) for range starting at %u",
-			   blk, off, heapBlk));
+	/* needed to update FSM below */
+	if (extended)
+		freespace = br_page_get_freespace(page);
 
 	ItemPointerSet(&tid, blk, off);
 	brinSetHeapBlockItemptr(revmapbuf, pagesPerRange, heapBlk, tid);
@@ -456,8 +456,14 @@
 	LockBuffer(*buffer, BUFFER_LOCK_UNLOCK);
 	LockBuffer(revmapbuf, BUFFER_LOCK_UNLOCK);
 
+	BRIN_elog((DEBUG2, "inserted tuple (%u,%u) for range starting at %u",
+			   blk, off, heapBlk));
+
 	if (extended)
-		FreeSpaceMapVacuum(idxrel);
+	{
+		RecordPageWithFreeSpace(idxrel, blk, freespace);
+		FreeSpaceMapVacuumRange(idxrel, blk, blk + 1);
+	}
 
 	return off;
 }
@@ -599,17 +605,22 @@ brin_evacuate_page(Relation idxRel, BlockNumber pagesPerRange,
 }
 
 /*
- * Given a BRIN index page, initialize it if necessary, and record it into the
- * FSM if necessary.  Return value is true if the FSM itself needs "vacuuming".
+ * Given a BRIN index page, initialize it if necessary, and record its
+ * current free space in the FSM.
+ *
  * The main use for this is when, during vacuuming, an uninitialized page is
  * found, which could be the result of relation extension followed by a crash
  * before the page can be used.
+ *
+ * Here, we don't bother to update upper FSM pages, instead expecting that our
+ * caller (brin_vacuum_scan) will fix them at the end of the scan.  Elsewhere
+ * in this file, it's generally a good idea to propagate additions of free
+ * space into the upper FSM pages immediately.
  */
-bool
+void
 brin_page_cleanup(Relation idxrel, Buffer buf)
 {
 	Page		page = BufferGetPage(buf);
-	Size		freespace;
 
 	/*
 	 * If a page was left uninitialized, initialize it now; also record it in
@@ -631,7 +642,7 @@ brin_page_cleanup(Relation idxrel, Buffer buf)
 		{
 			brin_initialize_empty_new_buffer(idxrel, buf);
 			LockBuffer(buf, BUFFER_LOCK_UNLOCK);
-			return true;
+			return;
 		}
 		LockBuffer(buf, BUFFER_LOCK_UNLOCK);
 	}
@@ -639,24 +650,18 @@ brin_page_cleanup(Relation idxrel, Buffer buf)
 	/* Nothing to be done for non-regular index pages */
 	if (BRIN_IS_META_PAGE(BufferGetPage(buf)) ||
 		BRIN_IS_REVMAP_PAGE(BufferGetPage(buf)))
-		return false;
+		return;
 
 	/* Measure free space and record it */
-	freespace = br_page_get_freespace(page);
-	if (freespace > GetRecordedFreeSpace(idxrel, BufferGetBlockNumber(buf)))
-	{
-		RecordPageWithFreeSpace(idxrel, BufferGetBlockNumber(buf), freespace);
-		return true;
-	}
-
-	return false;
+	RecordPageWithFreeSpace(idxrel, BufferGetBlockNumber(buf),
+							br_page_get_freespace(page));
 }
 
 /*
  * Return a pinned and exclusively locked buffer which can be used to insert an
  * index item of size itemsz (caller must ensure not to request sizes
  * impossible to fulfill).  If oldbuf is a valid buffer, it is also locked (in
- * an order determined to avoid deadlocks.)
+ * an order determined to avoid deadlocks).
  *
  * If we find that the old page is no longer a regular index page (because
  * of a revmap extension), the old buffer is unlocked and we return
@@ -665,12 +670,18 @@ brin_page_cleanup(Relation idxrel, Buffer buf)
  * If there's no existing page with enough free space to accommodate the new
  * item, the relation is extended.  If this happens, *extended is set to true,
  * and it is the caller's responsibility to initialize the page (and WAL-log
- * that fact) prior to use.
+ * that fact) prior to use.  The caller should also update the FSM with the
+ * page's remaining free space after the insertion.
  *
- * Note that in some corner cases it is possible for this routine to extend the
- * relation and then not return the buffer.  It is this routine's
+ * Note that the caller is not expected to update FSM unless *extended is set
+ * true.  This policy means that we'll update FSM when a page is created, and
+ * when it's found to have too little space for a desired tuple insertion,
+ * but not every single time we add a tuple to the page.
+ *
+ * Note that in some corner cases it is possible for this routine to extend
+ * the relation and then not return the new page.  It is this routine's
 * responsibility to WAL-log the page initialization and to record the page in
- * FSM if that happens.  Such a buffer may later be reused by this routine.
+ * FSM if that happens, since the caller certainly can't do it.
  */
 static Buffer
 brin_getinsertbuffer(Relation irel, Buffer oldbuf, Size itemsz,
@@ -684,22 +695,22 @@
 	/* callers must have checked */
 	Assert(itemsz <= BrinMaxItemSize);
 
-	*extended = false;
-
 	if (BufferIsValid(oldbuf))
 		oldblk = BufferGetBlockNumber(oldbuf);
 	else
 		oldblk = InvalidBlockNumber;
 
+	/* Choose initial target page, re-using existing target if known */
+	newblk = RelationGetTargetBlock(irel);
+	if (newblk == InvalidBlockNumber)
+		newblk = GetPageWithFreeSpace(irel, itemsz);
+
 	/*
 	 * Loop until we find a page with sufficient free space.  By the time we
 	 * return to caller out of this loop, both buffers are valid and locked;
-	 * if we have to restart here, neither buffer is locked and buf is not a
-	 * pinned buffer.
+	 * if we have to restart here, neither page is locked and newblk isn't
+	 * pinned (if it's even valid).
 	 */
-	newblk = RelationGetTargetBlock(irel);
-	if (newblk == InvalidBlockNumber)
-		newblk = GetPageWithFreeSpace(irel, itemsz);
 	for (;;)
 	{
 		Buffer		buf;
@@ -707,6 +718,8 @@
 
 		CHECK_FOR_INTERRUPTS();
 
+		*extended = false;
+
 		if (newblk == InvalidBlockNumber)
 		{
 			/*
@@ -741,9 +754,9 @@
 
 		/*
 		 * We lock the old buffer first, if it's earlier than the new one; but
-		 * before we do, we need to check that it hasn't been turned into a
-		 * revmap page concurrently; if we detect that it happened, give up
-		 * and tell caller to start over.
+		 * then we need to check that it hasn't been turned into a revmap page
+		 * concurrently.  If we detect that that happened, give up and tell
+		 * caller to start over.
 		 */
 		if (BufferIsValid(oldbuf) && oldblk < newblk)
 		{
@@ -761,16 +774,20 @@
 				 * it first.
 				 */
 				if (*extended)
-				{
 					brin_initialize_empty_new_buffer(irel, buf);
-					/* shouldn't matter, but don't confuse caller */
-					*extended = false;
-				}
 
 				if (extensionLockHeld)
 					UnlockRelationForExtension(irel, ExclusiveLock);
 
 				ReleaseBuffer(buf);
+
+				if (*extended)
+				{
+					FreeSpaceMapVacuumRange(irel, newblk, newblk + 1);
+					/* shouldn't matter, but don't confuse caller */
+					*extended = false;
+				}
+
 				return InvalidBuffer;
 			}
 		}
@@ -785,9 +802,6 @@
 		/*
 		 * We have a new buffer to insert into.  Check that the new page has
 		 * enough free space, and return it if it does; otherwise start over.
-		 * Note that we allow for the FSM to be out of date here, and in that
-		 * case we update it and move on.
-		 *
 		 * (br_page_get_freespace also checks that the FSM didn't hand us a
 		 * page that has since been repurposed for the revmap.)
 		 */
@@ -795,16 +809,7 @@
 			BrinMaxItemSize : br_page_get_freespace(page);
 		if (freespace >= itemsz)
 		{
-			RelationSetTargetBlock(irel, BufferGetBlockNumber(buf));
-
-			/*
-			 * Since the target block specification can get lost on cache
-			 * invalidations, make sure we update the more permanent FSM with
-			 * data about it before going away.
-			 */
-			if (*extended)
-				RecordPageWithFreeSpace(irel, BufferGetBlockNumber(buf),
-										freespace);
+			RelationSetTargetBlock(irel, newblk);
 
 			/*
 			 * Lock the old buffer if not locked already.  Note that in this
@@ -832,6 +837,7 @@ brin_getinsertbuffer(Relation irel, Buffer oldbuf, Size itemsz,
 		if (*extended)
 		{
 			brin_initialize_empty_new_buffer(irel, buf);
+			/* since this should not happen, skip FreeSpaceMapVacuum */
 
 			ereport(ERROR,
 					(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
@@ -845,6 +851,10 @@ brin_getinsertbuffer(Relation irel, Buffer oldbuf, Size itemsz,
 		if (BufferIsValid(oldbuf) && oldblk <= newblk)
 			LockBuffer(oldbuf, BUFFER_LOCK_UNLOCK);
 
+		/*
+		 * Update the FSM with the new, presumably smaller, freespace value
+		 * for this page, then search for a new target page.
+		 */
 		newblk = RecordAndGetPageWithFreeSpace(irel, newblk, freespace, itemsz);
 	}
 }
@@ -859,6 +869,9 @@ brin_getinsertbuffer(Relation irel, Buffer oldbuf, Size itemsz,
 * there is no mechanism to get the space back and the index would bloat.
 * Also, because we would not WAL-log the action that would initialize the
 * page, the page would go uninitialized in a standby (or after recovery).
+ *
+ * While we record the page in FSM here, caller is responsible for doing FSM
+ * upper-page update if that seems appropriate.
  */
 static void
 brin_initialize_empty_new_buffer(Relation idxrel, Buffer buffer)
diff --git a/src/include/access/brin_pageops.h b/src/include/access/brin_pageops.h
index 8b389de4a5..5189d5ddc2 100644
--- a/src/include/access/brin_pageops.h
+++ b/src/include/access/brin_pageops.h
@@ -33,6 +33,6 @@ extern bool brin_start_evacuating_page(Relation idxRel, Buffer buf);
 extern void brin_evacuate_page(Relation idxRel, BlockNumber pagesPerRange,
 				   BrinRevmap *revmap, Buffer buf);
 
-extern bool brin_page_cleanup(Relation idxrel, Buffer buf);
+extern void brin_page_cleanup(Relation idxrel, Buffer buf);
 
 #endif							/* BRIN_PAGEOPS_H */
-- 
2.40.0
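
The recurring idiom in this patch can be distilled into one small sketch.
The helper below is illustrative only and not part of the patch; the function
name record_and_propagate_free_space is invented here, while
RecordPageWithFreeSpace, FreeSpaceMapVacuumRange, and FreeSpaceMapVacuum are
the actual FSM entry points used throughout the diff (declared in
src/include/storage/freespace.h).

    #include "postgres.h"
    #include "storage/freespace.h"

    /*
     * Hypothetical helper showing the FSM update pattern this patch settles
     * on: record one page's new free space, then propagate just that page's
     * value into the upper FSM pages.  FreeSpaceMapVacuumRange takes a
     * half-open block range, so [blk, blk + 1) covers exactly the one page
     * we touched, doing O(FSM-tree-height) work instead of walking the
     * entire map as FreeSpaceMapVacuum(rel) would.
     */
    static void
    record_and_propagate_free_space(Relation rel, BlockNumber blk,
                                    Size freespace)
    {
        /* update the FSM leaf slot for this one page */
        RecordPageWithFreeSpace(rel, blk, freespace);

        /* push the change up through the FSM's upper pages */
        FreeSpaceMapVacuumRange(rel, blk, blk + 1);
    }

The whole-map FreeSpaceMapVacuum call survives only in brin_vacuum_scan,
where visiting every FSM page is the point: it repairs any pre-existing
FSM damage in one predictable pass.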