From d526575f893c1a4e05ebd307e80203536b213a6d Mon Sep 17 00:00:00 2001 From: Tom Lane Date: Wed, 30 May 2007 20:12:03 +0000 Subject: [PATCH] Make large sequential scans and VACUUMs work in a limited-size "ring" of buffers, rather than blowing out the whole shared-buffer arena. Aside from avoiding cache spoliation, this fixes the problem that VACUUM formerly tended to cause a WAL flush for every page it modified, because we had it hacked to use only a single buffer. Those flushes will now occur only once per ring-ful. The exact ring size, and the threshold for seqscans to switch into the ring usage pattern, remain under debate; but the infrastructure seems done. The key bit of infrastructure is a new optional BufferAccessStrategy object that can be passed to ReadBuffer operations; this replaces the former StrategyHintVacuum API. This patch also changes the buffer usage-count methodology a bit: we now advance usage_count when first pinning a buffer, rather than when last unpinning it. To preserve the behavior that a buffer's lifetime starts to decrease when it's released, the clock sweep code is modified to not decrement usage_count of pinned buffers. Work not done in this commit: teach GiST and GIN indexes to use the vacuum BufferAccessStrategy for vacuum-driven fetches. Original patch by Simon, reworked by Heikki and again by Tom. --- src/backend/access/hash/hash.c | 10 +- src/backend/access/hash/hashovfl.c | 64 ++++-- src/backend/access/hash/hashpage.c | 32 ++- src/backend/access/heap/heapam.c | 38 +++- src/backend/access/nbtree/nbtree.c | 7 +- src/backend/access/transam/xlog.c | 32 ++- src/backend/catalog/index.c | 3 +- src/backend/commands/analyze.c | 12 +- src/backend/commands/vacuum.c | 89 ++++---- src/backend/commands/vacuumlazy.c | 18 +- src/backend/postmaster/autovacuum.c | 21 +- src/backend/storage/buffer/README | 75 ++++--- src/backend/storage/buffer/bufmgr.c | 174 ++++++++------- src/backend/storage/buffer/freelist.c | 298 ++++++++++++++++++++++---- src/backend/storage/buffer/localbuf.c | 39 ++-- src/backend/tcop/utility.c | 4 +- src/include/access/genam.h | 3 +- src/include/access/hash.h | 11 +- src/include/access/relscan.h | 3 +- src/include/access/xlog.h | 3 +- src/include/commands/vacuum.h | 13 +- src/include/storage/buf.h | 9 +- src/include/storage/buf_internals.h | 13 +- src/include/storage/bufmgr.h | 15 +- 24 files changed, 723 insertions(+), 263 deletions(-) diff --git a/src/backend/access/hash/hash.c b/src/backend/access/hash/hash.c index 3d9b8064fc..57c5422471 100644 --- a/src/backend/access/hash/hash.c +++ b/src/backend/access/hash/hash.c @@ -8,7 +8,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/access/hash/hash.c,v 1.94 2007/05/03 16:45:58 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/access/hash/hash.c,v 1.95 2007/05/30 20:11:51 tgl Exp $ * * NOTES * This file contains only the public interface routines. 
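/*
 * Illustrative sketch (not part of the patch): the caller-facing shape of the
 * new BufferAccessStrategy API described in the commit message above.  Instead
 * of toggling the old global StrategyHintVacuum() flag, a caller now builds a
 * strategy object, passes it to each ReadBufferWithStrategy() call, and frees
 * it when done.  The function and type names are the ones added by this patch;
 * the scan loop itself is hypothetical.
 */
#include "postgres.h"
#include "storage/bufmgr.h"
#include "utils/rel.h"

static void
scan_relation_with_ring(Relation rel, BlockNumber nblocks)
{
	/* 256KB ring reused for the whole pass; VACUUM would use BAS_VACUUM */
	BufferAccessStrategy strategy = GetAccessStrategy(BAS_BULKREAD);
	BlockNumber blkno;

	for (blkno = 0; blkno < nblocks; blkno++)
	{
		Buffer		buf = ReadBufferWithStrategy(rel, blkno, strategy);

		/* ... examine or update the page here ... */

		ReleaseBuffer(buf);
	}

	FreeAccessStrategy(strategy);
}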
@@ -547,8 +547,9 @@ loop_top: vacuum_delay_point(); - buf = _hash_getbuf(rel, blkno, HASH_WRITE, - LH_BUCKET_PAGE | LH_OVERFLOW_PAGE); + buf = _hash_getbuf_with_strategy(rel, blkno, HASH_WRITE, + LH_BUCKET_PAGE | LH_OVERFLOW_PAGE, + info->strategy); page = BufferGetPage(buf); opaque = (HashPageOpaque) PageGetSpecialPointer(page); Assert(opaque->hasho_bucket == cur_bucket); @@ -596,7 +597,8 @@ loop_top: /* If we deleted anything, try to compact free space */ if (bucket_dirty) - _hash_squeezebucket(rel, cur_bucket, bucket_blkno); + _hash_squeezebucket(rel, cur_bucket, bucket_blkno, + info->strategy); /* Release bucket lock */ _hash_droplock(rel, bucket_blkno, HASH_EXCLUSIVE); diff --git a/src/backend/access/hash/hashovfl.c b/src/backend/access/hash/hashovfl.c index 1f71f18b7c..889bbcdb1a 100644 --- a/src/backend/access/hash/hashovfl.c +++ b/src/backend/access/hash/hashovfl.c @@ -8,7 +8,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/access/hash/hashovfl.c,v 1.57 2007/05/03 16:45:58 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/access/hash/hashovfl.c,v 1.58 2007/05/30 20:11:51 tgl Exp $ * * NOTES * Overflow pages look like ordinary relation pages. @@ -362,6 +362,9 @@ _hash_firstfreebit(uint32 map) * Remove this overflow page from its bucket's chain, and mark the page as * free. On entry, ovflbuf is write-locked; it is released before exiting. * + * Since this function is invoked in VACUUM, we provide an access strategy + * parameter that controls fetches of the bucket pages. + * * Returns the block number of the page that followed the given page * in the bucket, or InvalidBlockNumber if no following page. * @@ -370,7 +373,8 @@ _hash_firstfreebit(uint32 map) * on the bucket, too. */ BlockNumber -_hash_freeovflpage(Relation rel, Buffer ovflbuf) +_hash_freeovflpage(Relation rel, Buffer ovflbuf, + BufferAccessStrategy bstrategy) { HashMetaPage metap; Buffer metabuf; @@ -413,8 +417,11 @@ _hash_freeovflpage(Relation rel, Buffer ovflbuf) */ if (BlockNumberIsValid(prevblkno)) { - Buffer prevbuf = _hash_getbuf(rel, prevblkno, HASH_WRITE, - LH_BUCKET_PAGE | LH_OVERFLOW_PAGE); + Buffer prevbuf = _hash_getbuf_with_strategy(rel, + prevblkno, + HASH_WRITE, + LH_BUCKET_PAGE | LH_OVERFLOW_PAGE, + bstrategy); Page prevpage = BufferGetPage(prevbuf); HashPageOpaque prevopaque = (HashPageOpaque) PageGetSpecialPointer(prevpage); @@ -424,8 +431,11 @@ _hash_freeovflpage(Relation rel, Buffer ovflbuf) } if (BlockNumberIsValid(nextblkno)) { - Buffer nextbuf = _hash_getbuf(rel, nextblkno, HASH_WRITE, - LH_OVERFLOW_PAGE); + Buffer nextbuf = _hash_getbuf_with_strategy(rel, + nextblkno, + HASH_WRITE, + LH_OVERFLOW_PAGE, + bstrategy); Page nextpage = BufferGetPage(nextbuf); HashPageOpaque nextopaque = (HashPageOpaque) PageGetSpecialPointer(nextpage); @@ -434,6 +444,8 @@ _hash_freeovflpage(Relation rel, Buffer ovflbuf) _hash_wrtbuf(rel, nextbuf); } + /* Note: bstrategy is intentionally not used for metapage and bitmap */ + /* Read the metapage so we can determine which bitmap page to use */ metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_READ, LH_META_PAGE); metap = (HashMetaPage) BufferGetPage(metabuf); @@ -558,11 +570,15 @@ _hash_initbitmap(Relation rel, HashMetaPage metap, BlockNumber blkno) * * Caller must hold exclusive lock on the target bucket. This allows * us to safely lock multiple pages in the bucket. + * + * Since this function is invoked in VACUUM, we provide an access strategy + * parameter that controls fetches of the bucket pages. 
*/ void _hash_squeezebucket(Relation rel, Bucket bucket, - BlockNumber bucket_blkno) + BlockNumber bucket_blkno, + BufferAccessStrategy bstrategy) { Buffer wbuf; Buffer rbuf = 0; @@ -581,7 +597,11 @@ _hash_squeezebucket(Relation rel, * start squeezing into the base bucket page. */ wblkno = bucket_blkno; - wbuf = _hash_getbuf(rel, wblkno, HASH_WRITE, LH_BUCKET_PAGE); + wbuf = _hash_getbuf_with_strategy(rel, + wblkno, + HASH_WRITE, + LH_BUCKET_PAGE, + bstrategy); wpage = BufferGetPage(wbuf); wopaque = (HashPageOpaque) PageGetSpecialPointer(wpage); @@ -595,8 +615,10 @@ _hash_squeezebucket(Relation rel, } /* - * find the last page in the bucket chain by starting at the base bucket - * page and working forward. + * Find the last page in the bucket chain by starting at the base bucket + * page and working forward. Note: we assume that a hash bucket chain is + * usually smaller than the buffer ring being used by VACUUM, else using + * the access strategy here would be counterproductive. */ ropaque = wopaque; do @@ -604,7 +626,11 @@ _hash_squeezebucket(Relation rel, rblkno = ropaque->hasho_nextblkno; if (ropaque != wopaque) _hash_relbuf(rel, rbuf); - rbuf = _hash_getbuf(rel, rblkno, HASH_WRITE, LH_OVERFLOW_PAGE); + rbuf = _hash_getbuf_with_strategy(rel, + rblkno, + HASH_WRITE, + LH_OVERFLOW_PAGE, + bstrategy); rpage = BufferGetPage(rbuf); ropaque = (HashPageOpaque) PageGetSpecialPointer(rpage); Assert(ropaque->hasho_bucket == bucket); @@ -644,7 +670,11 @@ _hash_squeezebucket(Relation rel, return; } - wbuf = _hash_getbuf(rel, wblkno, HASH_WRITE, LH_OVERFLOW_PAGE); + wbuf = _hash_getbuf_with_strategy(rel, + wblkno, + HASH_WRITE, + LH_OVERFLOW_PAGE, + bstrategy); wpage = BufferGetPage(wbuf); wopaque = (HashPageOpaque) PageGetSpecialPointer(wpage); Assert(wopaque->hasho_bucket == bucket); @@ -688,15 +718,19 @@ _hash_squeezebucket(Relation rel, /* yes, so release wbuf lock first */ _hash_wrtbuf(rel, wbuf); /* free this overflow page (releases rbuf) */ - _hash_freeovflpage(rel, rbuf); + _hash_freeovflpage(rel, rbuf, bstrategy); /* done */ return; } /* free this overflow page, then get the previous one */ - _hash_freeovflpage(rel, rbuf); + _hash_freeovflpage(rel, rbuf, bstrategy); - rbuf = _hash_getbuf(rel, rblkno, HASH_WRITE, LH_OVERFLOW_PAGE); + rbuf = _hash_getbuf_with_strategy(rel, + rblkno, + HASH_WRITE, + LH_OVERFLOW_PAGE, + bstrategy); rpage = BufferGetPage(rbuf); ropaque = (HashPageOpaque) PageGetSpecialPointer(rpage); Assert(ropaque->hasho_bucket == bucket); diff --git a/src/backend/access/hash/hashpage.c b/src/backend/access/hash/hashpage.c index a27d83d4ff..29d861efb8 100644 --- a/src/backend/access/hash/hashpage.c +++ b/src/backend/access/hash/hashpage.c @@ -8,7 +8,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/access/hash/hashpage.c,v 1.67 2007/05/03 16:45:58 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/access/hash/hashpage.c,v 1.68 2007/05/30 20:11:51 tgl Exp $ * * NOTES * Postgres hash pages look like ordinary relation pages. The opaque @@ -214,6 +214,34 @@ _hash_getnewbuf(Relation rel, BlockNumber blkno) return buf; } +/* + * _hash_getbuf_with_strategy() -- Get a buffer with nondefault strategy. + * + * This is identical to _hash_getbuf() but also allows a buffer access + * strategy to be specified. We use this for VACUUM operations. 
+ */ +Buffer +_hash_getbuf_with_strategy(Relation rel, BlockNumber blkno, + int access, int flags, + BufferAccessStrategy bstrategy) +{ + Buffer buf; + + if (blkno == P_NEW) + elog(ERROR, "hash AM does not use P_NEW"); + + buf = ReadBufferWithStrategy(rel, blkno, bstrategy); + + if (access != HASH_NOLOCK) + LockBuffer(buf, access); + + /* ref count and lock type are correct */ + + _hash_checkpage(rel, buf, flags); + + return buf; +} + /* * _hash_relbuf() -- release a locked buffer. * @@ -840,5 +868,5 @@ _hash_splitbucket(Relation rel, _hash_wrtbuf(rel, obuf); _hash_wrtbuf(rel, nbuf); - _hash_squeezebucket(rel, obucket, start_oblkno); + _hash_squeezebucket(rel, obucket, start_oblkno, NULL); } diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c index 9edeaff130..0b20e5e9a8 100644 --- a/src/backend/access/heap/heapam.c +++ b/src/backend/access/heap/heapam.c @@ -8,7 +8,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/access/heap/heapam.c,v 1.233 2007/05/27 03:50:38 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/access/heap/heapam.c,v 1.234 2007/05/30 20:11:53 tgl Exp $ * * * INTERFACE ROUTINES @@ -83,6 +83,24 @@ initscan(HeapScanDesc scan, ScanKey key) */ scan->rs_nblocks = RelationGetNumberOfBlocks(scan->rs_rd); + /* + * If the table is large relative to NBuffers, use a bulk-read access + * strategy, else use the default random-access strategy. During a + * rescan, don't make a new strategy object if we don't have to. + */ + if (scan->rs_nblocks > NBuffers / 4 && + !scan->rs_rd->rd_istemp) + { + if (scan->rs_strategy == NULL) + scan->rs_strategy = GetAccessStrategy(BAS_BULKREAD); + } + else + { + if (scan->rs_strategy != NULL) + FreeAccessStrategy(scan->rs_strategy); + scan->rs_strategy = NULL; + } + scan->rs_inited = false; scan->rs_ctup.t_data = NULL; ItemPointerSetInvalid(&scan->rs_ctup.t_self); @@ -123,9 +141,17 @@ heapgetpage(HeapScanDesc scan, BlockNumber page) Assert(page < scan->rs_nblocks); - scan->rs_cbuf = ReleaseAndReadBuffer(scan->rs_cbuf, - scan->rs_rd, - page); + /* release previous scan buffer, if any */ + if (BufferIsValid(scan->rs_cbuf)) + { + ReleaseBuffer(scan->rs_cbuf); + scan->rs_cbuf = InvalidBuffer; + } + + /* read page using selected strategy */ + scan->rs_cbuf = ReadBufferWithStrategy(scan->rs_rd, + page, + scan->rs_strategy); scan->rs_cblock = page; if (!scan->rs_pageatatime) @@ -938,6 +964,7 @@ heap_beginscan(Relation relation, Snapshot snapshot, scan->rs_rd = relation; scan->rs_snapshot = snapshot; scan->rs_nkeys = nkeys; + scan->rs_strategy = NULL; /* set in initscan */ /* * we can use page-at-a-time mode if it's an MVCC-safe snapshot @@ -1007,6 +1034,9 @@ heap_endscan(HeapScanDesc scan) if (scan->rs_key) pfree(scan->rs_key); + if (scan->rs_strategy != NULL) + FreeAccessStrategy(scan->rs_strategy); + pfree(scan); } diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c index 87f8485505..a4ba3d3cdf 100644 --- a/src/backend/access/nbtree/nbtree.c +++ b/src/backend/access/nbtree/nbtree.c @@ -12,7 +12,7 @@ * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/access/nbtree/nbtree.c,v 1.154 2007/01/05 22:19:23 momjian Exp $ + * $PostgreSQL: pgsql/src/backend/access/nbtree/nbtree.c,v 1.155 2007/05/30 20:11:53 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -786,9 +786,10 @@ restart: /* * We can't use _bt_getbuf() here because it always applies * _bt_checkpage(), which will barf on an 
all-zero page. We want to - * recycle all-zero pages, not fail. + * recycle all-zero pages, not fail. Also, we want to use a nondefault + * buffer access strategy. */ - buf = ReadBuffer(rel, blkno); + buf = ReadBufferWithStrategy(rel, blkno, info->strategy); LockBuffer(buf, BT_READ); page = BufferGetPage(buf); opaque = (BTPageOpaque) PageGetSpecialPointer(page); diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index 3dc00499bf..4ca4aa754c 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -7,7 +7,7 @@ * Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * - * $PostgreSQL: pgsql/src/backend/access/transam/xlog.c,v 1.269 2007/05/20 21:08:19 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/access/transam/xlog.c,v 1.270 2007/05/30 20:11:55 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -1799,6 +1799,36 @@ XLogFlush(XLogRecPtr record) LogwrtResult.Flush.xlogid, LogwrtResult.Flush.xrecoff); } +/* + * Test whether XLOG data has been flushed up to (at least) the given position. + * + * Returns true if a flush is still needed. (It may be that someone else + * is already in process of flushing that far, however.) + */ +bool +XLogNeedsFlush(XLogRecPtr record) +{ + /* Quick exit if already known flushed */ + if (XLByteLE(record, LogwrtResult.Flush)) + return false; + + /* read LogwrtResult and update local state */ + { + /* use volatile pointer to prevent code rearrangement */ + volatile XLogCtlData *xlogctl = XLogCtl; + + SpinLockAcquire(&xlogctl->info_lck); + LogwrtResult = xlogctl->LogwrtResult; + SpinLockRelease(&xlogctl->info_lck); + } + + /* check again */ + if (XLByteLE(record, LogwrtResult.Flush)) + return false; + + return true; +} + /* * Create a new XLOG file segment, or open a pre-existing one. 
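/*
 * Illustrative sketch (not part of the patch): XLogNeedsFlush() gives callers
 * a cheap way to ask "would writing data covered by this LSN force a WAL
 * flush right now?" without performing the flush.  Later in this patch the
 * buffer manager uses it so that a bulk-read ring can discard a dirty buffer
 * instead of paying for the flush.  The wrapper below is hypothetical and
 * exists only to show the call; PageGetLSN() is the usual accessor for a
 * page's LSN.
 */
#include "postgres.h"
#include "access/xlog.h"
#include "storage/bufpage.h"

static bool
page_write_would_flush_wal(Page page)
{
	/*
	 * A page's LSN records how far WAL must be flushed before the page may
	 * be written out; if that point is already flushed, writing the page
	 * needs no additional WAL flush.
	 */
	return XLogNeedsFlush(PageGetLSN(page));
}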
* diff --git a/src/backend/catalog/index.c b/src/backend/catalog/index.c index bd3ae31b07..9aa58e35f9 100644 --- a/src/backend/catalog/index.c +++ b/src/backend/catalog/index.c @@ -8,7 +8,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/catalog/index.c,v 1.283 2007/05/16 17:28:20 alvherre Exp $ + * $PostgreSQL: pgsql/src/backend/catalog/index.c,v 1.284 2007/05/30 20:11:55 tgl Exp $ * * * INTERFACE ROUTINES @@ -1658,6 +1658,7 @@ validate_index(Oid heapId, Oid indexId, Snapshot snapshot) ivinfo.vacuum_full = false; ivinfo.message_level = DEBUG2; ivinfo.num_heap_tuples = -1; + ivinfo.strategy = NULL; state.tuplesort = tuplesort_begin_datum(TIDOID, TIDLessOperator, false, diff --git a/src/backend/commands/analyze.c b/src/backend/commands/analyze.c index 2754a6db6a..d77aec2dd7 100644 --- a/src/backend/commands/analyze.c +++ b/src/backend/commands/analyze.c @@ -8,7 +8,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/commands/analyze.c,v 1.107 2007/04/30 03:23:48 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/commands/analyze.c,v 1.108 2007/05/30 20:11:56 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -63,10 +63,13 @@ typedef struct AnlIndexData /* Default statistics target (GUC parameter) */ int default_statistics_target = 10; +/* A few variables that don't seem worth passing around as parameters */ static int elevel = -1; static MemoryContext anl_context = NULL; +static BufferAccessStrategy vac_strategy; + static void BlockSampler_Init(BlockSampler bs, BlockNumber nblocks, int samplesize); @@ -94,7 +97,8 @@ static bool std_typanalyze(VacAttrStats *stats); * analyze_rel() -- analyze one relation */ void -analyze_rel(Oid relid, VacuumStmt *vacstmt) +analyze_rel(Oid relid, VacuumStmt *vacstmt, + BufferAccessStrategy bstrategy) { Relation onerel; int attr_cnt, @@ -120,6 +124,8 @@ analyze_rel(Oid relid, VacuumStmt *vacstmt) else elevel = DEBUG2; + vac_strategy = bstrategy; + /* * Use the current context for storing analysis info. vacuum.c ensures * that this context will be cleared when I return, thus releasing the @@ -845,7 +851,7 @@ acquire_sample_rows(Relation onerel, HeapTuple *rows, int targrows, * looking at it. We don't maintain a lock on the page, so tuples * could get added to it, but we ignore such tuples. 
*/ - targbuffer = ReadBuffer(onerel, targblock); + targbuffer = ReadBufferWithStrategy(onerel, targblock, vac_strategy); LockBuffer(targbuffer, BUFFER_LOCK_SHARE); targpage = BufferGetPage(targbuffer); maxoffset = PageGetMaxOffsetNumber(targpage); diff --git a/src/backend/commands/vacuum.c b/src/backend/commands/vacuum.c index 93885579cb..cf4c341412 100644 --- a/src/backend/commands/vacuum.c +++ b/src/backend/commands/vacuum.c @@ -13,7 +13,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/commands/vacuum.c,v 1.351 2007/05/17 15:28:29 alvherre Exp $ + * $PostgreSQL: pgsql/src/backend/commands/vacuum.c,v 1.352 2007/05/30 20:11:57 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -191,6 +191,7 @@ ExecContext_Finish(ExecContext ec) *---------------------------------------------------------------------- */ +/* A few variables that don't seem worth passing around as parameters */ static MemoryContext vac_context = NULL; static int elevel = -1; @@ -198,6 +199,8 @@ static int elevel = -1; static TransactionId OldestXmin; static TransactionId FreezeLimit; +static BufferAccessStrategy vac_strategy; + /* non-export function prototypes */ static List *get_rel_oids(List *relids, const RangeVar *vacrel, @@ -257,14 +260,18 @@ static Size PageGetFreeSpaceWithFillFactor(Relation relation, Page page); * relation OIDs to be processed, and vacstmt->relation is ignored. * (The non-NIL case is currently only used by autovacuum.) * + * bstrategy is normally given as NULL, but in autovacuum it can be passed + * in to use the same buffer strategy object across multiple vacuum() calls. + * * isTopLevel should be passed down from ProcessUtility. * - * It is the caller's responsibility that both vacstmt and relids + * It is the caller's responsibility that vacstmt, relids, and bstrategy * (if given) be allocated in a memory context that won't disappear * at transaction commit. */ void -vacuum(VacuumStmt *vacstmt, List *relids, bool isTopLevel) +vacuum(VacuumStmt *vacstmt, List *relids, + BufferAccessStrategy bstrategy, bool isTopLevel) { const char *stmttype = vacstmt->vacuum ? "VACUUM" : "ANALYZE"; volatile MemoryContext anl_context = NULL; @@ -319,6 +326,19 @@ vacuum(VacuumStmt *vacstmt, List *relids, bool isTopLevel) ALLOCSET_DEFAULT_INITSIZE, ALLOCSET_DEFAULT_MAXSIZE); + /* + * If caller didn't give us a buffer strategy object, make one in the + * cross-transaction memory context. 
+ */ + if (bstrategy == NULL) + { + MemoryContext old_context = MemoryContextSwitchTo(vac_context); + + bstrategy = GetAccessStrategy(BAS_VACUUM); + MemoryContextSwitchTo(old_context); + } + vac_strategy = bstrategy; + /* Remember whether we are processing everything in the DB */ all_rels = (relids == NIL && vacstmt->relation == NULL); @@ -417,15 +437,7 @@ vacuum(VacuumStmt *vacstmt, List *relids, bool isTopLevel) else old_context = MemoryContextSwitchTo(anl_context); - /* - * Tell the buffer replacement strategy that vacuum is causing - * the IO - */ - StrategyHintVacuum(true); - - analyze_rel(relid, vacstmt); - - StrategyHintVacuum(false); + analyze_rel(relid, vacstmt, vac_strategy); if (use_own_xacts) CommitTransactionCommand(); @@ -441,8 +453,6 @@ vacuum(VacuumStmt *vacstmt, List *relids, bool isTopLevel) { /* Make sure cost accounting is turned off after error */ VacuumCostActive = false; - /* And reset buffer replacement strategy, too */ - StrategyHintVacuum(false); PG_RE_THROW(); } PG_END_TRY(); @@ -1084,21 +1094,13 @@ vacuum_rel(Oid relid, VacuumStmt *vacstmt, char expected_relkind) */ toast_relid = onerel->rd_rel->reltoastrelid; - /* - * Tell the cache replacement strategy that vacuum is causing all - * following IO - */ - StrategyHintVacuum(true); - /* * Do the actual work --- either FULL or "lazy" vacuum */ if (vacstmt->full) full_vacuum_rel(onerel, vacstmt); else - lazy_vacuum_rel(onerel, vacstmt); - - StrategyHintVacuum(false); + lazy_vacuum_rel(onerel, vacstmt, vac_strategy); /* all done with this class, but hold lock until commit */ relation_close(onerel, NoLock); @@ -1290,7 +1292,7 @@ scan_heap(VRelStats *vacrelstats, Relation onerel, vacuum_delay_point(); - buf = ReadBuffer(onerel, blkno); + buf = ReadBufferWithStrategy(onerel, blkno, vac_strategy); page = BufferGetPage(buf); /* @@ -1730,7 +1732,7 @@ repair_frag(VRelStats *vacrelstats, Relation onerel, /* * Process this page of relation. 
*/ - buf = ReadBuffer(onerel, blkno); + buf = ReadBufferWithStrategy(onerel, blkno, vac_strategy); page = BufferGetPage(buf); vacpage->offsets_free = 0; @@ -1954,8 +1956,9 @@ repair_frag(VRelStats *vacrelstats, Relation onerel, nextTid = tp.t_data->t_ctid; priorXmax = HeapTupleHeaderGetXmax(tp.t_data); /* assume block# is OK (see heap_fetch comments) */ - nextBuf = ReadBuffer(onerel, - ItemPointerGetBlockNumber(&nextTid)); + nextBuf = ReadBufferWithStrategy(onerel, + ItemPointerGetBlockNumber(&nextTid), + vac_strategy); nextPage = BufferGetPage(nextBuf); /* If bogus or unused slot, assume tp is end of chain */ nextOffnum = ItemPointerGetOffsetNumber(&nextTid); @@ -2091,8 +2094,9 @@ repair_frag(VRelStats *vacrelstats, Relation onerel, break; /* out of check-all-items loop */ } tp.t_self = vtlp->this_tid; - Pbuf = ReadBuffer(onerel, - ItemPointerGetBlockNumber(&(tp.t_self))); + Pbuf = ReadBufferWithStrategy(onerel, + ItemPointerGetBlockNumber(&(tp.t_self)), + vac_strategy); Ppage = BufferGetPage(Pbuf); Pitemid = PageGetItemId(Ppage, ItemPointerGetOffsetNumber(&(tp.t_self))); @@ -2174,11 +2178,14 @@ repair_frag(VRelStats *vacrelstats, Relation onerel, /* Get page to move from */ tuple.t_self = vtmove[ti].tid; - Cbuf = ReadBuffer(onerel, - ItemPointerGetBlockNumber(&(tuple.t_self))); + Cbuf = ReadBufferWithStrategy(onerel, + ItemPointerGetBlockNumber(&(tuple.t_self)), + vac_strategy); /* Get page to move to */ - dst_buffer = ReadBuffer(onerel, destvacpage->blkno); + dst_buffer = ReadBufferWithStrategy(onerel, + destvacpage->blkno, + vac_strategy); LockBuffer(dst_buffer, BUFFER_LOCK_EXCLUSIVE); if (dst_buffer != Cbuf) @@ -2239,7 +2246,9 @@ repair_frag(VRelStats *vacrelstats, Relation onerel, if (i == num_fraged_pages) break; /* can't move item anywhere */ dst_vacpage = fraged_pages->pagedesc[i]; - dst_buffer = ReadBuffer(onerel, dst_vacpage->blkno); + dst_buffer = ReadBufferWithStrategy(onerel, + dst_vacpage->blkno, + vac_strategy); LockBuffer(dst_buffer, BUFFER_LOCK_EXCLUSIVE); dst_page = BufferGetPage(dst_buffer); /* if this page was not used before - clean it */ @@ -2386,7 +2395,9 @@ repair_frag(VRelStats *vacrelstats, Relation onerel, Page page; /* this page was not used as a move target, so must clean it */ - buf = ReadBuffer(onerel, (*curpage)->blkno); + buf = ReadBufferWithStrategy(onerel, + (*curpage)->blkno, + vac_strategy); LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE); page = BufferGetPage(buf); if (!PageIsEmpty(page)) @@ -2470,7 +2481,7 @@ repair_frag(VRelStats *vacrelstats, Relation onerel, int uncnt; int num_tuples = 0; - buf = ReadBuffer(onerel, vacpage->blkno); + buf = ReadBufferWithStrategy(onerel, vacpage->blkno, vac_strategy); LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE); page = BufferGetPage(buf); maxoff = PageGetMaxOffsetNumber(page); @@ -2859,7 +2870,7 @@ update_hint_bits(Relation rel, VacPageList fraged_pages, int num_fraged_pages, break; /* no need to scan any further */ if ((*curpage)->offsets_used == 0) continue; /* this page was never used as a move dest */ - buf = ReadBuffer(rel, (*curpage)->blkno); + buf = ReadBufferWithStrategy(rel, (*curpage)->blkno, vac_strategy); LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE); page = BufferGetPage(buf); max_offset = PageGetMaxOffsetNumber(page); @@ -2925,7 +2936,9 @@ vacuum_heap(VRelStats *vacrelstats, Relation onerel, VacPageList vacuum_pages) if ((*vacpage)->offsets_free > 0) { - buf = ReadBuffer(onerel, (*vacpage)->blkno); + buf = ReadBufferWithStrategy(onerel, + (*vacpage)->blkno, + vac_strategy); LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE); 
vacuum_page(onerel, buf, *vacpage); UnlockReleaseBuffer(buf); @@ -3012,6 +3025,7 @@ scan_index(Relation indrel, double num_tuples) ivinfo.vacuum_full = true; ivinfo.message_level = elevel; ivinfo.num_heap_tuples = num_tuples; + ivinfo.strategy = vac_strategy; stats = index_vacuum_cleanup(&ivinfo, NULL); @@ -3077,6 +3091,7 @@ vacuum_index(VacPageList vacpagelist, Relation indrel, ivinfo.vacuum_full = true; ivinfo.message_level = elevel; ivinfo.num_heap_tuples = num_tuples + keep_tuples; + ivinfo.strategy = vac_strategy; /* Do bulk deletion */ stats = index_bulk_delete(&ivinfo, NULL, tid_reaped, (void *) vacpagelist); diff --git a/src/backend/commands/vacuumlazy.c b/src/backend/commands/vacuumlazy.c index 2c9a80540c..3ac097388b 100644 --- a/src/backend/commands/vacuumlazy.c +++ b/src/backend/commands/vacuumlazy.c @@ -36,7 +36,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/commands/vacuumlazy.c,v 1.89 2007/05/17 15:28:29 alvherre Exp $ + * $PostgreSQL: pgsql/src/backend/commands/vacuumlazy.c,v 1.90 2007/05/30 20:11:57 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -96,11 +96,14 @@ typedef struct LVRelStats } LVRelStats; +/* A few variables that don't seem worth passing around as parameters */ static int elevel = -1; static TransactionId OldestXmin; static TransactionId FreezeLimit; +static BufferAccessStrategy vac_strategy; + /* non-export function prototypes */ static void lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats, @@ -138,7 +141,8 @@ static int vac_cmp_page_spaces(const void *left, const void *right); * and locked the relation. */ void -lazy_vacuum_rel(Relation onerel, VacuumStmt *vacstmt) +lazy_vacuum_rel(Relation onerel, VacuumStmt *vacstmt, + BufferAccessStrategy bstrategy) { LVRelStats *vacrelstats; Relation *Irel; @@ -158,6 +162,8 @@ lazy_vacuum_rel(Relation onerel, VacuumStmt *vacstmt) else elevel = DEBUG2; + vac_strategy = bstrategy; + vacuum_set_xid_limits(vacstmt->freeze_min_age, onerel->rd_rel->relisshared, &OldestXmin, &FreezeLimit); @@ -318,7 +324,7 @@ lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats, vacrelstats->num_index_scans++; } - buf = ReadBuffer(onerel, blkno); + buf = ReadBufferWithStrategy(onerel, blkno, vac_strategy); /* Initially, we only need shared access to the buffer */ LockBuffer(buf, BUFFER_LOCK_SHARE); @@ -586,7 +592,7 @@ lazy_vacuum_heap(Relation onerel, LVRelStats *vacrelstats) vacuum_delay_point(); tblk = ItemPointerGetBlockNumber(&vacrelstats->dead_tuples[tupindex]); - buf = ReadBuffer(onerel, tblk); + buf = ReadBufferWithStrategy(onerel, tblk, vac_strategy); LockBufferForCleanup(buf); tupindex = lazy_vacuum_page(onerel, tblk, buf, tupindex, vacrelstats); /* Now that we've compacted the page, record its available space */ @@ -684,6 +690,7 @@ lazy_vacuum_index(Relation indrel, ivinfo.message_level = elevel; /* We don't yet know rel_tuples, so pass -1 */ ivinfo.num_heap_tuples = -1; + ivinfo.strategy = vac_strategy; /* Do bulk deletion */ *stats = index_bulk_delete(&ivinfo, *stats, @@ -713,6 +720,7 @@ lazy_cleanup_index(Relation indrel, ivinfo.vacuum_full = false; ivinfo.message_level = elevel; ivinfo.num_heap_tuples = vacrelstats->rel_tuples; + ivinfo.strategy = vac_strategy; stats = index_vacuum_cleanup(&ivinfo, stats); @@ -869,7 +877,7 @@ count_nondeletable_pages(Relation onerel, LVRelStats *vacrelstats) blkno--; - buf = ReadBuffer(onerel, blkno); + buf = ReadBufferWithStrategy(onerel, blkno, vac_strategy); /* In this phase we only need shared access to the buffer */ 
LockBuffer(buf, BUFFER_LOCK_SHARE); diff --git a/src/backend/postmaster/autovacuum.c b/src/backend/postmaster/autovacuum.c index 4d73c6c3f5..752af99831 100644 --- a/src/backend/postmaster/autovacuum.c +++ b/src/backend/postmaster/autovacuum.c @@ -10,7 +10,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/postmaster/autovacuum.c,v 1.46 2007/05/07 20:41:24 alvherre Exp $ + * $PostgreSQL: pgsql/src/backend/postmaster/autovacuum.c,v 1.47 2007/05/30 20:11:57 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -218,7 +218,8 @@ static void relation_needs_vacanalyze(Oid relid, Form_pg_autovacuum avForm, bool *doanalyze); static void autovacuum_do_vac_analyze(Oid relid, bool dovacuum, - bool doanalyze, int freeze_min_age); + bool doanalyze, int freeze_min_age, + BufferAccessStrategy bstrategy); static HeapTuple get_pg_autovacuum_tuple_relid(Relation avRel, Oid relid); static PgStat_StatTabEntry *get_pgstat_tabentry_relid(Oid relid, bool isshared, PgStat_StatDBEntry *shared, @@ -1673,6 +1674,7 @@ do_autovacuum(void) ListCell *cell; PgStat_StatDBEntry *shared; PgStat_StatDBEntry *dbentry; + BufferAccessStrategy bstrategy; /* * may be NULL if we couldn't find an entry (only happens if we @@ -1812,6 +1814,13 @@ do_autovacuum(void) list_free(toast_oids); toast_oids = NIL; + /* + * Create a buffer access strategy object for VACUUM to use. We want + * to use the same one across all the vacuum operations we perform, + * since the point is for VACUUM not to blow out the shared cache. + */ + bstrategy = GetAccessStrategy(BAS_VACUUM); + /* * Perform operations on collected tables. */ @@ -1910,7 +1919,8 @@ next_worker: autovacuum_do_vac_analyze(tab->at_relid, tab->at_dovacuum, tab->at_doanalyze, - tab->at_freeze_min_age); + tab->at_freeze_min_age, + bstrategy); /* be tidy */ pfree(tab); } @@ -2328,7 +2338,8 @@ relation_needs_vacanalyze(Oid relid, */ static void autovacuum_do_vac_analyze(Oid relid, bool dovacuum, bool doanalyze, - int freeze_min_age) + int freeze_min_age, + BufferAccessStrategy bstrategy) { VacuumStmt vacstmt; MemoryContext old_cxt; @@ -2354,7 +2365,7 @@ autovacuum_do_vac_analyze(Oid relid, bool dovacuum, bool doanalyze, /* Let pgstat know what we're doing */ autovac_report_activity(&vacstmt, relid); - vacuum(&vacstmt, list_make1_oid(relid), true); + vacuum(&vacstmt, list_make1_oid(relid), bstrategy, true); MemoryContextSwitchTo(old_cxt); } diff --git a/src/backend/storage/buffer/README b/src/backend/storage/buffer/README index afdea2af74..f6327f875e 100644 --- a/src/backend/storage/buffer/README +++ b/src/backend/storage/buffer/README @@ -1,4 +1,4 @@ -$PostgreSQL: pgsql/src/backend/storage/buffer/README,v 1.11 2006/07/23 03:07:58 tgl Exp $ +$PostgreSQL: pgsql/src/backend/storage/buffer/README,v 1.12 2007/05/30 20:11:58 tgl Exp $ Notes about shared buffer access rules -------------------------------------- @@ -152,20 +152,21 @@ we could use per-backend LWLocks instead (a buffer header would then contain a field to show which backend is doing its I/O). -Buffer replacement strategy ---------------------------- +Normal buffer replacement strategy +---------------------------------- There is a "free list" of buffers that are prime candidates for replacement. In particular, buffers that are completely free (contain no valid page) are -always in this list. We may also throw buffers into this list if we -consider their pages unlikely to be needed soon. 
The list is singly-linked -using fields in the buffer headers; we maintain head and tail pointers in -global variables. (Note: although the list links are in the buffer headers, -they are considered to be protected by the BufFreelistLock, not the -buffer-header spinlocks.) To choose a victim buffer to recycle when there -are no free buffers available, we use a simple clock-sweep algorithm, which -avoids the need to take system-wide locks during common operations. It -works like this: +always in this list. We could also throw buffers into this list if we +consider their pages unlikely to be needed soon; however, the current +algorithm never does that. The list is singly-linked using fields in the +buffer headers; we maintain head and tail pointers in global variables. +(Note: although the list links are in the buffer headers, they are +considered to be protected by the BufFreelistLock, not the buffer-header +spinlocks.) To choose a victim buffer to recycle when there are no free +buffers available, we use a simple clock-sweep algorithm, which avoids the +need to take system-wide locks during common operations. It works like +this: Each buffer header contains a usage counter, which is incremented (up to a small limit value) whenever the buffer is unpinned. (This requires only the @@ -199,22 +200,40 @@ before we can recycle it; if someone else pins the buffer meanwhile we will have to give up and try another buffer. This however is not a concern of the basic select-a-victim-buffer algorithm.) -A special provision is that while running VACUUM, a backend does not -increment the usage count on buffers it accesses. In fact, if ReleaseBuffer -sees that it is dropping the pin count to zero and the usage count is zero, -then it appends the buffer to the tail of the free list. (This implies that -VACUUM, but only VACUUM, must take the BufFreelistLock during ReleaseBuffer; -this shouldn't create much of a contention problem.) This provision -encourages VACUUM to work in a relatively small number of buffers rather -than blowing out the entire buffer cache. It is reasonable since a page -that has been touched only by VACUUM is unlikely to be needed again soon. - -Since VACUUM usually requests many pages very fast, the effect of this is that -it will get back the very buffers it filled and possibly modified on the next -call and will therefore do its work in a few shared memory buffers, while -being able to use whatever it finds in the cache already. This also implies -that most of the write traffic caused by a VACUUM will be done by the VACUUM -itself and not pushed off onto other processes. + +Buffer ring replacement strategy +--------------------------------- + +When running a query that needs to access a large number of pages just once, +such as VACUUM or a large sequential scan, a different strategy is used. +A page that has been touched only by such a scan is unlikely to be needed +again soon, so instead of running the normal clock sweep algorithm and +blowing out the entire buffer cache, a small ring of buffers is allocated +using the normal clock sweep algorithm and those buffers are reused for the +whole scan. This also implies that much of the write traffic caused by such +a statement will be done by the backend itself and not pushed off onto other +processes. + +For sequential scans, a 256KB ring is used. That's small enough to fit in L2 +cache, which makes transferring pages from OS cache to shared buffer cache +efficient. 
Even less would often be enough, but the ring must be big enough +to accommodate all pages in the scan that are pinned concurrently. 256KB +should also be enough to leave a small cache trail for other backends to +join in a synchronized seq scan. If a ring buffer is dirtied and its LSN +updated, we would normally have to write and flush WAL before we could +re-use the buffer; in this case we instead discard the buffer from the ring +and (later) choose a replacement using the normal clock-sweep algorithm. +Hence this strategy works best for scans that are read-only (or at worst +update hint bits). In a scan that modifies every page in the scan, like a +bulk UPDATE or DELETE, the buffers in the ring will always be dirtied and +the ring strategy effectively degrades to the normal strategy. + +VACUUM uses a 256KB ring like sequential scans, but dirty pages are not +removed from the ring. Instead, WAL is flushed if needed to allow reuse of +the buffers. Before introducing the buffer ring strategy in 8.3, VACUUM's +buffers were sent to the freelist, which was effectively a buffer ring of 1 +buffer, resulting in excessive WAL flushing. Allowing VACUUM to update +256KB between WAL flushes should be more efficient. Background writer's processing diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c index e2cfc870e2..bbb6e0bc04 100644 --- a/src/backend/storage/buffer/bufmgr.c +++ b/src/backend/storage/buffer/bufmgr.c @@ -8,7 +8,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/storage/buffer/bufmgr.c,v 1.219 2007/05/27 03:50:39 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/storage/buffer/bufmgr.c,v 1.220 2007/05/30 20:11:58 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -90,11 +90,11 @@ static volatile BufferDesc *PinCountWaitBuf = NULL; static Buffer ReadBuffer_common(Relation reln, BlockNumber blockNum, - bool zeroPage); -static bool PinBuffer(volatile BufferDesc *buf); + bool zeroPage, + BufferAccessStrategy strategy); +static bool PinBuffer(volatile BufferDesc *buf, BufferAccessStrategy strategy); static void PinBuffer_Locked(volatile BufferDesc *buf); -static void UnpinBuffer(volatile BufferDesc *buf, - bool fixOwner, bool normalAccess); +static void UnpinBuffer(volatile BufferDesc *buf, bool fixOwner); static bool SyncOneBuffer(int buf_id, bool skip_pinned); static void WaitIO(volatile BufferDesc *buf); static bool StartBufferIO(volatile BufferDesc *buf, bool forInput); @@ -102,7 +102,8 @@ static void TerminateBufferIO(volatile BufferDesc *buf, bool clear_dirty, int set_flag_bits); static void buffer_write_error_callback(void *arg); static volatile BufferDesc *BufferAlloc(Relation reln, BlockNumber blockNum, - bool *foundPtr); + BufferAccessStrategy strategy, + bool *foundPtr); static void FlushBuffer(volatile BufferDesc *buf, SMgrRelation reln); static void AtProcExit_Buffers(int code, Datum arg); @@ -125,7 +126,18 @@ static void AtProcExit_Buffers(int code, Datum arg); Buffer ReadBuffer(Relation reln, BlockNumber blockNum) { - return ReadBuffer_common(reln, blockNum, false); + return ReadBuffer_common(reln, blockNum, false, NULL); +} + +/* + * ReadBufferWithStrategy -- same as ReadBuffer, except caller can specify + * a nondefault buffer access strategy. See buffer/README for details. 
+ */ +Buffer +ReadBufferWithStrategy(Relation reln, BlockNumber blockNum, + BufferAccessStrategy strategy) +{ + return ReadBuffer_common(reln, blockNum, false, strategy); } /* @@ -140,14 +152,15 @@ ReadBuffer(Relation reln, BlockNumber blockNum) Buffer ReadOrZeroBuffer(Relation reln, BlockNumber blockNum) { - return ReadBuffer_common(reln, blockNum, true); + return ReadBuffer_common(reln, blockNum, true, NULL); } /* - * ReadBuffer_common -- common logic for ReadBuffer and ReadOrZeroBuffer + * ReadBuffer_common -- common logic for ReadBuffer variants */ static Buffer -ReadBuffer_common(Relation reln, BlockNumber blockNum, bool zeroPage) +ReadBuffer_common(Relation reln, BlockNumber blockNum, bool zeroPage, + BufferAccessStrategy strategy) { volatile BufferDesc *bufHdr; Block bufBlock; @@ -185,7 +198,7 @@ ReadBuffer_common(Relation reln, BlockNumber blockNum, bool zeroPage) * lookup the buffer. IO_IN_PROGRESS is set if the requested block is * not currently in memory. */ - bufHdr = BufferAlloc(reln, blockNum, &found); + bufHdr = BufferAlloc(reln, blockNum, strategy, &found); if (found) BufferHitCount++; } @@ -330,6 +343,10 @@ ReadBuffer_common(Relation reln, BlockNumber blockNum, bool zeroPage) * buffer. If no buffer exists already, selects a replacement * victim and evicts the old page, but does NOT read in new page. * + * "strategy" can be a buffer replacement strategy object, or NULL for + * the default strategy. The selected buffer's usage_count is advanced when + * using the default strategy, but otherwise possibly not (see PinBuffer). + * * The returned buffer is pinned and is already marked as holding the * desired page. If it already did have the desired page, *foundPtr is * set TRUE. Otherwise, *foundPtr is set FALSE and the buffer is marked @@ -343,6 +360,7 @@ ReadBuffer_common(Relation reln, BlockNumber blockNum, bool zeroPage) static volatile BufferDesc * BufferAlloc(Relation reln, BlockNumber blockNum, + BufferAccessStrategy strategy, bool *foundPtr) { BufferTag newTag; /* identity of requested block */ @@ -375,7 +393,7 @@ BufferAlloc(Relation reln, */ buf = &BufferDescriptors[buf_id]; - valid = PinBuffer(buf); + valid = PinBuffer(buf, strategy); /* Can release the mapping lock as soon as we've pinned it */ LWLockRelease(newPartitionLock); @@ -413,13 +431,15 @@ BufferAlloc(Relation reln, /* Loop here in case we have to try another victim buffer */ for (;;) { + bool lock_held; + /* * Select a victim buffer. The buffer is returned with its header - * spinlock still held! Also the BufFreelistLock is still held, since - * it would be bad to hold the spinlock while possibly waking up other - * processes. + * spinlock still held! Also (in most cases) the BufFreelistLock is + * still held, since it would be bad to hold the spinlock while + * possibly waking up other processes. */ - buf = StrategyGetBuffer(); + buf = StrategyGetBuffer(strategy, &lock_held); Assert(buf->refcount == 0); @@ -430,7 +450,8 @@ BufferAlloc(Relation reln, PinBuffer_Locked(buf); /* Now it's safe to release the freelist lock */ - LWLockRelease(BufFreelistLock); + if (lock_held) + LWLockRelease(BufFreelistLock); /* * If the buffer was dirty, try to write it out. There is a race @@ -458,16 +479,34 @@ BufferAlloc(Relation reln, */ if (LWLockConditionalAcquire(buf->content_lock, LW_SHARED)) { + /* + * If using a nondefault strategy, and writing the buffer + * would require a WAL flush, let the strategy decide whether + * to go ahead and write/reuse the buffer or to choose another + * victim. 
We need lock to inspect the page LSN, so this + * can't be done inside StrategyGetBuffer. + */ + if (strategy != NULL && + XLogNeedsFlush(BufferGetLSN(buf)) && + StrategyRejectBuffer(strategy, buf)) + { + /* Drop lock/pin and loop around for another buffer */ + LWLockRelease(buf->content_lock); + UnpinBuffer(buf, true); + continue; + } + + /* OK, do the I/O */ FlushBuffer(buf, NULL); LWLockRelease(buf->content_lock); } else { /* - * Someone else has pinned the buffer, so give it up and loop + * Someone else has locked the buffer, so give it up and loop * back to get another one. */ - UnpinBuffer(buf, true, false /* evidently recently used */ ); + UnpinBuffer(buf, true); continue; } } @@ -531,10 +570,9 @@ BufferAlloc(Relation reln, * Got a collision. Someone has already done what we were about to * do. We'll just handle this as if it were found in the buffer * pool in the first place. First, give up the buffer we were - * planning to use. Don't allow it to be thrown in the free list - * (we don't want to hold freelist and mapping locks at once). + * planning to use. */ - UnpinBuffer(buf, true, false); + UnpinBuffer(buf, true); /* Can give up that buffer's mapping partition lock now */ if ((oldFlags & BM_TAG_VALID) && @@ -545,7 +583,7 @@ BufferAlloc(Relation reln, buf = &BufferDescriptors[buf_id]; - valid = PinBuffer(buf); + valid = PinBuffer(buf, strategy); /* Can release the mapping lock as soon as we've pinned it */ LWLockRelease(newPartitionLock); @@ -595,20 +633,21 @@ BufferAlloc(Relation reln, oldPartitionLock != newPartitionLock) LWLockRelease(oldPartitionLock); LWLockRelease(newPartitionLock); - UnpinBuffer(buf, true, false /* evidently recently used */ ); + UnpinBuffer(buf, true); } /* * Okay, it's finally safe to rename the buffer. * * Clearing BM_VALID here is necessary, clearing the dirtybits is just - * paranoia. We also clear the usage_count since any recency of use of - * the old content is no longer relevant. + * paranoia. We also reset the usage_count since any recency of use of + * the old content is no longer relevant. (The usage_count starts out + * at 1 so that the buffer can survive one clock-sweep pass.) */ buf->tag = newTag; buf->flags &= ~(BM_VALID | BM_DIRTY | BM_JUST_DIRTIED | BM_IO_ERROR); buf->flags |= BM_TAG_VALID; - buf->usage_count = 0; + buf->usage_count = 1; UnlockBufHdr(buf); @@ -736,7 +775,7 @@ retry: /* * Insert the buffer at the head of the list of free buffers. */ - StrategyFreeBuffer(buf, true); + StrategyFreeBuffer(buf); } /* @@ -814,9 +853,6 @@ ReleaseAndReadBuffer(Buffer buffer, return buffer; ResourceOwnerForgetBuffer(CurrentResourceOwner, buffer); LocalRefCount[-buffer - 1]--; - if (LocalRefCount[-buffer - 1] == 0 && - bufHdr->usage_count < BM_MAX_USAGE_COUNT) - bufHdr->usage_count++; } else { @@ -826,7 +862,7 @@ ReleaseAndReadBuffer(Buffer buffer, if (bufHdr->tag.blockNum == blockNum && RelFileNodeEquals(bufHdr->tag.rnode, relation->rd_node)) return buffer; - UnpinBuffer(bufHdr, true, true); + UnpinBuffer(bufHdr, true); } } @@ -836,6 +872,14 @@ ReleaseAndReadBuffer(Buffer buffer, /* * PinBuffer -- make buffer unavailable for replacement. * + * For the default access strategy, the buffer's usage_count is incremented + * when we first pin it; for other strategies we just make sure the usage_count + * isn't zero. (The idea of the latter is that we don't want synchronized + * heap scans to inflate the count, but we need it to not be zero to discourage + * other backends from stealing buffers from our ring. 
As long as we cycle + * through the ring faster than the global clock-sweep cycles, buffers in + * our ring won't be chosen as victims for replacement by other backends.) + * * This should be applied only to shared buffers, never local ones. * * Note that ResourceOwnerEnlargeBuffers must have been done already. @@ -844,7 +888,7 @@ ReleaseAndReadBuffer(Buffer buffer, * some callers to avoid an extra spinlock cycle. */ static bool -PinBuffer(volatile BufferDesc *buf) +PinBuffer(volatile BufferDesc *buf, BufferAccessStrategy strategy) { int b = buf->buf_id; bool result; @@ -853,6 +897,16 @@ PinBuffer(volatile BufferDesc *buf) { LockBufHdr(buf); buf->refcount++; + if (strategy == NULL) + { + if (buf->usage_count < BM_MAX_USAGE_COUNT) + buf->usage_count++; + } + else + { + if (buf->usage_count == 0) + buf->usage_count = 1; + } result = (buf->flags & BM_VALID) != 0; UnlockBufHdr(buf); } @@ -872,6 +926,11 @@ PinBuffer(volatile BufferDesc *buf) * PinBuffer_Locked -- as above, but caller already locked the buffer header. * The spinlock is released before return. * + * Currently, no callers of this function want to modify the buffer's + * usage_count at all, so there's no need for a strategy parameter. + * Also we don't bother with a BM_VALID test (the caller could check that for + * itself). + * * Note: use of this routine is frequently mandatory, not just an optimization * to save a spin lock/unlock cycle, because we need to pin a buffer before * its state can change under us. @@ -897,17 +956,9 @@ PinBuffer_Locked(volatile BufferDesc *buf) * * Most but not all callers want CurrentResourceOwner to be adjusted. * Those that don't should pass fixOwner = FALSE. - * - * normalAccess indicates that we are finishing a "normal" page access, - * that is, one requested by something outside the buffer subsystem. - * Passing FALSE means it's an internal access that should not update the - * buffer's usage count nor cause a change in the freelist. - * - * If we are releasing a buffer during VACUUM, and it's not been otherwise - * used recently, and normalAccess is true, we send the buffer to the freelist. */ static void -UnpinBuffer(volatile BufferDesc *buf, bool fixOwner, bool normalAccess) +UnpinBuffer(volatile BufferDesc *buf, bool fixOwner) { int b = buf->buf_id; @@ -919,8 +970,6 @@ UnpinBuffer(volatile BufferDesc *buf, bool fixOwner, bool normalAccess) PrivateRefCount[b]--; if (PrivateRefCount[b] == 0) { - bool immed_free_buffer = false; - /* I'd better not still hold any locks on the buffer */ Assert(!LWLockHeldByMe(buf->content_lock)); Assert(!LWLockHeldByMe(buf->io_in_progress_lock)); @@ -931,22 +980,7 @@ UnpinBuffer(volatile BufferDesc *buf, bool fixOwner, bool normalAccess) Assert(buf->refcount > 0); buf->refcount--; - /* Update buffer usage info, unless this is an internal access */ - if (normalAccess) - { - if (!strategy_hint_vacuum) - { - if (buf->usage_count < BM_MAX_USAGE_COUNT) - buf->usage_count++; - } - else - { - /* VACUUM accesses don't bump usage count, instead... */ - if (buf->refcount == 0 && buf->usage_count == 0) - immed_free_buffer = true; - } - } - + /* Support LockBufferForCleanup() */ if ((buf->flags & BM_PIN_COUNT_WAITER) && buf->refcount == 1) { @@ -959,14 +993,6 @@ UnpinBuffer(volatile BufferDesc *buf, bool fixOwner, bool normalAccess) } else UnlockBufHdr(buf); - - /* - * If VACUUM is releasing an otherwise-unused buffer, send it to the - * freelist for near-term reuse. We put it at the tail so that it - * won't be used before any invalid buffers that may exist. 
- */ - if (immed_free_buffer) - StrategyFreeBuffer(buf, false); } } @@ -1150,7 +1176,7 @@ SyncOneBuffer(int buf_id, bool skip_pinned) FlushBuffer(bufHdr, NULL); LWLockRelease(bufHdr->content_lock); - UnpinBuffer(bufHdr, true, false /* don't change freelist */ ); + UnpinBuffer(bufHdr, true); return true; } @@ -1266,7 +1292,7 @@ AtProcExit_Buffers(int code, Datum arg) * here, it suggests that ResourceOwners are messed up. */ PrivateRefCount[i] = 1; /* make sure we release shared pin */ - UnpinBuffer(buf, false, false /* don't change freelist */ ); + UnpinBuffer(buf, false); Assert(PrivateRefCount[i] == 0); } } @@ -1700,7 +1726,7 @@ FlushRelationBuffers(Relation rel) LWLockAcquire(bufHdr->content_lock, LW_SHARED); FlushBuffer(bufHdr, rel->rd_smgr); LWLockRelease(bufHdr->content_lock); - UnpinBuffer(bufHdr, true, false /* no freelist change */ ); + UnpinBuffer(bufHdr, true); } else UnlockBufHdr(bufHdr); @@ -1723,11 +1749,7 @@ ReleaseBuffer(Buffer buffer) if (BufferIsLocal(buffer)) { Assert(LocalRefCount[-buffer - 1] > 0); - bufHdr = &LocalBufferDescriptors[-buffer - 1]; LocalRefCount[-buffer - 1]--; - if (LocalRefCount[-buffer - 1] == 0 && - bufHdr->usage_count < BM_MAX_USAGE_COUNT) - bufHdr->usage_count++; return; } @@ -1738,7 +1760,7 @@ ReleaseBuffer(Buffer buffer) if (PrivateRefCount[buffer - 1] > 1) PrivateRefCount[buffer - 1]--; else - UnpinBuffer(bufHdr, false, true); + UnpinBuffer(bufHdr, false); } /* diff --git a/src/backend/storage/buffer/freelist.c b/src/backend/storage/buffer/freelist.c index a8c3aa2dcd..d8eec0f823 100644 --- a/src/backend/storage/buffer/freelist.c +++ b/src/backend/storage/buffer/freelist.c @@ -9,7 +9,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/storage/buffer/freelist.c,v 1.58 2007/01/05 22:19:37 momjian Exp $ + * $PostgreSQL: pgsql/src/backend/storage/buffer/freelist.c,v 1.59 2007/05/30 20:11:59 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -39,8 +39,42 @@ typedef struct /* Pointers to shared state */ static BufferStrategyControl *StrategyControl = NULL; -/* Backend-local state about whether currently vacuuming */ -bool strategy_hint_vacuum = false; +/* + * Private (non-shared) state for managing a ring of shared buffers to re-use. + * This is currently the only kind of BufferAccessStrategy object, but someday + * we might have more kinds. + */ +typedef struct BufferAccessStrategyData +{ + /* Overall strategy type */ + BufferAccessStrategyType btype; + /* Number of elements in buffers[] array */ + int ring_size; + /* + * Index of the "current" slot in the ring, ie, the one most recently + * returned by GetBufferFromRing. + */ + int current; + /* + * True if the buffer just returned by StrategyGetBuffer had been in + * the ring already. + */ + bool current_was_in_ring; + + /* + * Array of buffer numbers. InvalidBuffer (that is, zero) indicates + * we have not yet selected a buffer for this ring slot. For allocation + * simplicity this is palloc'd together with the fixed fields of the + * struct. + */ + Buffer buffers[1]; /* VARIABLE SIZE ARRAY */ +} BufferAccessStrategyData; + + +/* Prototypes for internal functions */ +static volatile BufferDesc *GetBufferFromRing(BufferAccessStrategy strategy); +static void AddBufferToRing(BufferAccessStrategy strategy, + volatile BufferDesc *buf); /* @@ -50,17 +84,38 @@ bool strategy_hint_vacuum = false; * BufferAlloc(). The only hard requirement BufferAlloc() has is that * the selected buffer must not currently be pinned by anyone. 
* + * strategy is a BufferAccessStrategy object, or NULL for default strategy. + * * To ensure that no one else can pin the buffer before we do, we must - * return the buffer with the buffer header spinlock still held. That - * means that we return with the BufFreelistLock still held, as well; - * the caller must release that lock once the spinlock is dropped. + * return the buffer with the buffer header spinlock still held. If + * *lock_held is set on exit, we have returned with the BufFreelistLock + * still held, as well; the caller must release that lock once the spinlock + * is dropped. We do it that way because releasing the BufFreelistLock + * might awaken other processes, and it would be bad to do the associated + * kernel calls while holding the buffer header spinlock. */ volatile BufferDesc * -StrategyGetBuffer(void) +StrategyGetBuffer(BufferAccessStrategy strategy, bool *lock_held) { volatile BufferDesc *buf; int trycounter; + /* + * If given a strategy object, see whether it can select a buffer. + * We assume strategy objects don't need the BufFreelistLock. + */ + if (strategy != NULL) + { + buf = GetBufferFromRing(strategy); + if (buf != NULL) + { + *lock_held = false; + return buf; + } + } + + /* Nope, so lock the freelist */ + *lock_held = true; LWLockAcquire(BufFreelistLock, LW_EXCLUSIVE); /* @@ -82,11 +137,16 @@ StrategyGetBuffer(void) * If the buffer is pinned or has a nonzero usage_count, we cannot use * it; discard it and retry. (This can only happen if VACUUM put a * valid buffer in the freelist and then someone else used it before - * we got to it.) + * we got to it. It's probably impossible altogether as of 8.3, + * but we'd better check anyway.) */ LockBufHdr(buf); if (buf->refcount == 0 && buf->usage_count == 0) + { + if (strategy != NULL) + AddBufferToRing(strategy, buf); return buf; + } UnlockBufHdr(buf); } @@ -101,15 +161,23 @@ StrategyGetBuffer(void) /* * If the buffer is pinned or has a nonzero usage_count, we cannot use - * it; decrement the usage_count and keep scanning. + * it; decrement the usage_count (unless pinned) and keep scanning. */ LockBufHdr(buf); - if (buf->refcount == 0 && buf->usage_count == 0) - return buf; - if (buf->usage_count > 0) + if (buf->refcount == 0) { - buf->usage_count--; - trycounter = NBuffers; + if (buf->usage_count > 0) + { + buf->usage_count--; + trycounter = NBuffers; + } + else + { + /* Found a usable buffer */ + if (strategy != NULL) + AddBufferToRing(strategy, buf); + return buf; + } } else if (--trycounter == 0) { @@ -132,13 +200,9 @@ StrategyGetBuffer(void) /* * StrategyFreeBuffer: put a buffer on the freelist - * - * The buffer is added either at the head or the tail, according to the - * at_head parameter. This allows a small amount of control over how - * quickly the buffer is reused. 
*/ void -StrategyFreeBuffer(volatile BufferDesc *buf, bool at_head) +StrategyFreeBuffer(volatile BufferDesc *buf) { LWLockAcquire(BufFreelistLock, LW_EXCLUSIVE); @@ -148,22 +212,10 @@ StrategyFreeBuffer(volatile BufferDesc *buf, bool at_head) */ if (buf->freeNext == FREENEXT_NOT_IN_LIST) { - if (at_head) - { - buf->freeNext = StrategyControl->firstFreeBuffer; - if (buf->freeNext < 0) - StrategyControl->lastFreeBuffer = buf->buf_id; - StrategyControl->firstFreeBuffer = buf->buf_id; - } - else - { - buf->freeNext = FREENEXT_END_OF_LIST; - if (StrategyControl->firstFreeBuffer < 0) - StrategyControl->firstFreeBuffer = buf->buf_id; - else - BufferDescriptors[StrategyControl->lastFreeBuffer].freeNext = buf->buf_id; + buf->freeNext = StrategyControl->firstFreeBuffer; + if (buf->freeNext < 0) StrategyControl->lastFreeBuffer = buf->buf_id; - } + StrategyControl->firstFreeBuffer = buf->buf_id; } LWLockRelease(BufFreelistLock); @@ -190,15 +242,6 @@ StrategySyncStart(void) return result; } -/* - * StrategyHintVacuum -- tell us whether VACUUM is active - */ -void -StrategyHintVacuum(bool vacuum_active) -{ - strategy_hint_vacuum = vacuum_active; -} - /* * StrategyShmemSize @@ -274,3 +317,172 @@ StrategyInitialize(bool init) else Assert(!init); } + + +/* ---------------------------------------------------------------- + * Backend-private buffer ring management + * ---------------------------------------------------------------- + */ + + +/* + * GetAccessStrategy -- create a BufferAccessStrategy object + * + * The object is allocated in the current memory context. + */ +BufferAccessStrategy +GetAccessStrategy(BufferAccessStrategyType btype) +{ + BufferAccessStrategy strategy; + int ring_size; + + /* + * Select ring size to use. See buffer/README for rationales. + * (Currently all cases are the same size, but keep this code + * structure for flexibility.) + */ + switch (btype) + { + case BAS_NORMAL: + /* if someone asks for NORMAL, just give 'em a "default" object */ + return NULL; + + case BAS_BULKREAD: + ring_size = 256 * 1024 / BLCKSZ; + break; + case BAS_VACUUM: + ring_size = 256 * 1024 / BLCKSZ; + break; + + default: + elog(ERROR, "unrecognized buffer access strategy: %d", + (int) btype); + return NULL; /* keep compiler quiet */ + } + + /* Make sure ring isn't an undue fraction of shared buffers */ + ring_size = Min(NBuffers / 8, ring_size); + + /* Allocate the object and initialize all elements to zeroes */ + strategy = (BufferAccessStrategy) + palloc0(offsetof(BufferAccessStrategyData, buffers) + + ring_size * sizeof(Buffer)); + + /* Set fields that don't start out zero */ + strategy->btype = btype; + strategy->ring_size = ring_size; + + return strategy; +} + +/* + * FreeAccessStrategy -- release a BufferAccessStrategy object + * + * A simple pfree would do at the moment, but we would prefer that callers + * don't assume that much about the representation of BufferAccessStrategy. + */ +void +FreeAccessStrategy(BufferAccessStrategy strategy) +{ + /* don't crash if called on a "default" strategy */ + if (strategy != NULL) + pfree(strategy); +} + +/* + * GetBufferFromRing -- returns a buffer from the ring, or NULL if the + * ring is empty. + * + * The bufhdr spin lock is held on the returned buffer. 
+ */ +static volatile BufferDesc * +GetBufferFromRing(BufferAccessStrategy strategy) +{ + volatile BufferDesc *buf; + Buffer bufnum; + + /* Advance to next ring slot */ + if (++strategy->current >= strategy->ring_size) + strategy->current = 0; + + /* + * If the slot hasn't been filled yet, tell the caller to allocate + * a new buffer with the normal allocation strategy. He will then + * fill this slot by calling AddBufferToRing with the new buffer. + */ + bufnum = strategy->buffers[strategy->current]; + if (bufnum == InvalidBuffer) + { + strategy->current_was_in_ring = false; + return NULL; + } + + /* + * If the buffer is pinned we cannot use it under any circumstances. + * + * If usage_count is 0 or 1 then the buffer is fair game (we expect 1, + * since our own previous usage of the ring element would have left it + * there, but it might've been decremented by clock sweep since then). + * A higher usage_count indicates someone else has touched the buffer, + * so we shouldn't re-use it. + */ + buf = &BufferDescriptors[bufnum - 1]; + LockBufHdr(buf); + if (buf->refcount == 0 && buf->usage_count <= 1) + { + strategy->current_was_in_ring = true; + return buf; + } + UnlockBufHdr(buf); + + /* + * Tell caller to allocate a new buffer with the normal allocation + * strategy. He'll then replace this ring element via AddBufferToRing. + */ + strategy->current_was_in_ring = false; + return NULL; +} + +/* + * AddBufferToRing -- add a buffer to the buffer ring + * + * Caller must hold the buffer header spinlock on the buffer. Since this + * is called with the spinlock held, it had better be quite cheap. + */ +static void +AddBufferToRing(BufferAccessStrategy strategy, volatile BufferDesc *buf) +{ + strategy->buffers[strategy->current] = BufferDescriptorGetBuffer(buf); +} + +/* + * StrategyRejectBuffer -- consider rejecting a dirty buffer + * + * When a nondefault strategy is used, the buffer manager calls this function + * when it turns out that the buffer selected by StrategyGetBuffer needs to + * be written out and doing so would require flushing WAL too. This gives us + * a chance to choose a different victim. + * + * Returns true if buffer manager should ask for a new victim, and false + * if this buffer should be written and re-used. + */ +bool +StrategyRejectBuffer(BufferAccessStrategy strategy, volatile BufferDesc *buf) +{ + /* We only do this in bulkread mode */ + if (strategy->btype != BAS_BULKREAD) + return false; + + /* Don't muck with behavior of normal buffer-replacement strategy */ + if (!strategy->current_was_in_ring || + strategy->buffers[strategy->current] != BufferDescriptorGetBuffer(buf)) + return false; + + /* + * Remove the dirty buffer from the ring; necessary to prevent infinite + * loop if all ring members are dirty. 
+ */ + strategy->buffers[strategy->current] = InvalidBuffer; + + return true; +} diff --git a/src/backend/storage/buffer/localbuf.c b/src/backend/storage/buffer/localbuf.c index 306ffe4576..ad2bcf8dac 100644 --- a/src/backend/storage/buffer/localbuf.c +++ b/src/backend/storage/buffer/localbuf.c @@ -9,7 +9,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/storage/buffer/localbuf.c,v 1.76 2007/01/05 22:19:37 momjian Exp $ + * $PostgreSQL: pgsql/src/backend/storage/buffer/localbuf.c,v 1.77 2007/05/30 20:11:59 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -57,7 +57,8 @@ static Block GetLocalBufferStorage(void); * * API is similar to bufmgr.c's BufferAlloc, except that we do not need * to do any locking since this is all local. Also, IO_IN_PROGRESS - * does not get set. + * does not get set. Lastly, we support only default access strategy + * (hence, usage_count is always advanced). */ BufferDesc * LocalBufferAlloc(Relation reln, BlockNumber blockNum, bool *foundPtr) @@ -88,7 +89,12 @@ LocalBufferAlloc(Relation reln, BlockNumber blockNum, bool *foundPtr) fprintf(stderr, "LB ALLOC (%u,%d) %d\n", RelationGetRelid(reln), blockNum, -b - 1); #endif - + /* this part is equivalent to PinBuffer for a shared buffer */ + if (LocalRefCount[b] == 0) + { + if (bufHdr->usage_count < BM_MAX_USAGE_COUNT) + bufHdr->usage_count++; + } LocalRefCount[b]++; ResourceOwnerRememberBuffer(CurrentResourceOwner, BufferDescriptorGetBuffer(bufHdr)); @@ -121,18 +127,21 @@ LocalBufferAlloc(Relation reln, BlockNumber blockNum, bool *foundPtr) bufHdr = &LocalBufferDescriptors[b]; - if (LocalRefCount[b] == 0 && bufHdr->usage_count == 0) - { - LocalRefCount[b]++; - ResourceOwnerRememberBuffer(CurrentResourceOwner, - BufferDescriptorGetBuffer(bufHdr)); - break; - } - - if (bufHdr->usage_count > 0) + if (LocalRefCount[b] == 0) { - bufHdr->usage_count--; - trycounter = NLocBuffer; + if (bufHdr->usage_count > 0) + { + bufHdr->usage_count--; + trycounter = NLocBuffer; + } + else + { + /* Found a usable buffer */ + LocalRefCount[b]++; + ResourceOwnerRememberBuffer(CurrentResourceOwner, + BufferDescriptorGetBuffer(bufHdr)); + break; + } } else if (--trycounter == 0) ereport(ERROR, @@ -199,7 +208,7 @@ LocalBufferAlloc(Relation reln, BlockNumber blockNum, bool *foundPtr) bufHdr->tag = newTag; bufHdr->flags &= ~(BM_VALID | BM_DIRTY | BM_JUST_DIRTIED | BM_IO_ERROR); bufHdr->flags |= BM_TAG_VALID; - bufHdr->usage_count = 0; + bufHdr->usage_count = 1; *foundPtr = FALSE; return bufHdr; diff --git a/src/backend/tcop/utility.c b/src/backend/tcop/utility.c index 028cb47c7a..baa203a2d1 100644 --- a/src/backend/tcop/utility.c +++ b/src/backend/tcop/utility.c @@ -10,7 +10,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/tcop/utility.c,v 1.279 2007/04/27 22:05:49 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/tcop/utility.c,v 1.280 2007/05/30 20:12:01 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -931,7 +931,7 @@ ProcessUtility(Node *parsetree, break; case T_VacuumStmt: - vacuum((VacuumStmt *) parsetree, NIL, isTopLevel); + vacuum((VacuumStmt *) parsetree, NIL, NULL, isTopLevel); break; case T_ExplainStmt: diff --git a/src/include/access/genam.h b/src/include/access/genam.h index 1f31baf0e4..98296e62be 100644 --- a/src/include/access/genam.h +++ b/src/include/access/genam.h @@ -7,7 +7,7 @@ * Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * - * 
$PostgreSQL: pgsql/src/include/access/genam.h,v 1.66 2007/01/05 22:19:50 momjian Exp $ + * $PostgreSQL: pgsql/src/include/access/genam.h,v 1.67 2007/05/30 20:12:02 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -40,6 +40,7 @@ typedef struct IndexVacuumInfo bool vacuum_full; /* VACUUM FULL (we have exclusive lock) */ int message_level; /* ereport level for progress messages */ double num_heap_tuples; /* tuples remaining in heap */ + BufferAccessStrategy strategy; /* access strategy for reads */ } IndexVacuumInfo; /* diff --git a/src/include/access/hash.h b/src/include/access/hash.h index d382ee6ee9..2bd314a8aa 100644 --- a/src/include/access/hash.h +++ b/src/include/access/hash.h @@ -7,7 +7,7 @@ * Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * - * $PostgreSQL: pgsql/src/include/access/hash.h,v 1.80 2007/05/03 16:45:58 tgl Exp $ + * $PostgreSQL: pgsql/src/include/access/hash.h,v 1.81 2007/05/30 20:12:02 tgl Exp $ * * NOTES * modeled after Margo Seltzer's hash implementation for unix. @@ -273,11 +273,13 @@ extern void _hash_doinsert(Relation rel, IndexTuple itup); /* hashovfl.c */ extern Buffer _hash_addovflpage(Relation rel, Buffer metabuf, Buffer buf); -extern BlockNumber _hash_freeovflpage(Relation rel, Buffer ovflbuf); +extern BlockNumber _hash_freeovflpage(Relation rel, Buffer ovflbuf, + BufferAccessStrategy bstrategy); extern void _hash_initbitmap(Relation rel, HashMetaPage metap, BlockNumber blkno); extern void _hash_squeezebucket(Relation rel, - Bucket bucket, BlockNumber bucket_blkno); + Bucket bucket, BlockNumber bucket_blkno, + BufferAccessStrategy bstrategy); /* hashpage.c */ extern void _hash_getlock(Relation rel, BlockNumber whichlock, int access); @@ -287,6 +289,9 @@ extern Buffer _hash_getbuf(Relation rel, BlockNumber blkno, int access, int flags); extern Buffer _hash_getinitbuf(Relation rel, BlockNumber blkno); extern Buffer _hash_getnewbuf(Relation rel, BlockNumber blkno); +extern Buffer _hash_getbuf_with_strategy(Relation rel, BlockNumber blkno, + int access, int flags, + BufferAccessStrategy bstrategy); extern void _hash_relbuf(Relation rel, Buffer buf); extern void _hash_dropbuf(Relation rel, Buffer buf); extern void _hash_wrtbuf(Relation rel, Buffer buf); diff --git a/src/include/access/relscan.h b/src/include/access/relscan.h index 7a1ea39352..200b45713e 100644 --- a/src/include/access/relscan.h +++ b/src/include/access/relscan.h @@ -7,7 +7,7 @@ * Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * - * $PostgreSQL: pgsql/src/include/access/relscan.h,v 1.53 2007/05/27 03:50:39 tgl Exp $ + * $PostgreSQL: pgsql/src/include/access/relscan.h,v 1.54 2007/05/30 20:12:02 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -27,6 +27,7 @@ typedef struct HeapScanDescData int rs_nkeys; /* number of scan keys */ ScanKey rs_key; /* array of scan key descriptors */ BlockNumber rs_nblocks; /* number of blocks to scan */ + BufferAccessStrategy rs_strategy; /* access strategy for reads */ bool rs_pageatatime; /* verify visibility page-at-a-time? 
*/ /* scan current state */ diff --git a/src/include/access/xlog.h b/src/include/access/xlog.h index 87ff6aba50..1c741f38fd 100644 --- a/src/include/access/xlog.h +++ b/src/include/access/xlog.h @@ -6,7 +6,7 @@ * Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * - * $PostgreSQL: pgsql/src/include/access/xlog.h,v 1.77 2007/05/20 21:08:19 tgl Exp $ + * $PostgreSQL: pgsql/src/include/access/xlog.h,v 1.78 2007/05/30 20:12:02 tgl Exp $ */ #ifndef XLOG_H #define XLOG_H @@ -159,6 +159,7 @@ extern bool XLOG_DEBUG; extern XLogRecPtr XLogInsert(RmgrId rmid, uint8 info, XLogRecData *rdata); extern void XLogFlush(XLogRecPtr RecPtr); +extern bool XLogNeedsFlush(XLogRecPtr RecPtr); extern void xlog_redo(XLogRecPtr lsn, XLogRecord *record); extern void xlog_desc(StringInfo buf, uint8 xl_info, char *rec); diff --git a/src/include/commands/vacuum.h b/src/include/commands/vacuum.h index acb2f623e2..50a475bc5e 100644 --- a/src/include/commands/vacuum.h +++ b/src/include/commands/vacuum.h @@ -7,7 +7,7 @@ * Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * - * $PostgreSQL: pgsql/src/include/commands/vacuum.h,v 1.71 2007/05/17 15:28:29 alvherre Exp $ + * $PostgreSQL: pgsql/src/include/commands/vacuum.h,v 1.72 2007/05/30 20:12:03 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -18,9 +18,11 @@ #include "catalog/pg_statistic.h" #include "catalog/pg_type.h" #include "nodes/parsenodes.h" +#include "storage/buf.h" #include "storage/lock.h" #include "utils/rel.h" + /*---------- * ANALYZE builds one of these structs for each attribute (column) that is * to be analyzed. The struct and subsidiary data are in anl_context, @@ -110,7 +112,8 @@ extern int vacuum_freeze_min_age; /* in commands/vacuum.c */ -extern void vacuum(VacuumStmt *vacstmt, List *relids, bool isTopLevel); +extern void vacuum(VacuumStmt *vacstmt, List *relids, + BufferAccessStrategy bstrategy, bool isTopLevel); extern void vac_open_indexes(Relation relation, LOCKMODE lockmode, int *nindexes, Relation **Irel); extern void vac_close_indexes(int nindexes, Relation *Irel, LOCKMODE lockmode); @@ -127,9 +130,11 @@ extern bool vac_is_partial_index(Relation indrel); extern void vacuum_delay_point(void); /* in commands/vacuumlazy.c */ -extern void lazy_vacuum_rel(Relation onerel, VacuumStmt *vacstmt); +extern void lazy_vacuum_rel(Relation onerel, VacuumStmt *vacstmt, + BufferAccessStrategy bstrategy); /* in commands/analyze.c */ -extern void analyze_rel(Oid relid, VacuumStmt *vacstmt); +extern void analyze_rel(Oid relid, VacuumStmt *vacstmt, + BufferAccessStrategy bstrategy); #endif /* VACUUM_H */ diff --git a/src/include/storage/buf.h b/src/include/storage/buf.h index 94da564d1e..a812a9e269 100644 --- a/src/include/storage/buf.h +++ b/src/include/storage/buf.h @@ -7,7 +7,7 @@ * Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * - * $PostgreSQL: pgsql/src/include/storage/buf.h,v 1.21 2007/01/05 22:19:57 momjian Exp $ + * $PostgreSQL: pgsql/src/include/storage/buf.h,v 1.22 2007/05/30 20:12:03 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -36,4 +36,11 @@ typedef int Buffer; */ #define BufferIsLocal(buffer) ((buffer) < 0) +/* + * Buffer access strategy objects. 
+ * + * BufferAccessStrategyData is private to freelist.c + */ +typedef struct BufferAccessStrategyData *BufferAccessStrategy; + #endif /* BUF_H */ diff --git a/src/include/storage/buf_internals.h b/src/include/storage/buf_internals.h index 561b7e40f6..d5eef8734f 100644 --- a/src/include/storage/buf_internals.h +++ b/src/include/storage/buf_internals.h @@ -8,7 +8,7 @@ * Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * - * $PostgreSQL: pgsql/src/include/storage/buf_internals.h,v 1.89 2007/01/05 22:19:57 momjian Exp $ + * $PostgreSQL: pgsql/src/include/storage/buf_internals.h,v 1.90 2007/05/30 20:12:03 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -167,9 +167,6 @@ extern DLLIMPORT BufferDesc *BufferDescriptors; /* in localbuf.c */ extern BufferDesc *LocalBufferDescriptors; -/* in freelist.c */ -extern bool strategy_hint_vacuum; - /* event counters in buf_init.c */ extern long int ReadBufferCount; extern long int ReadLocalBufferCount; @@ -184,8 +181,12 @@ extern long int LocalBufferFlushCount; */ /* freelist.c */ -extern volatile BufferDesc *StrategyGetBuffer(void); -extern void StrategyFreeBuffer(volatile BufferDesc *buf, bool at_head); +extern volatile BufferDesc *StrategyGetBuffer(BufferAccessStrategy strategy, + bool *lock_held); +extern void StrategyFreeBuffer(volatile BufferDesc *buf); +extern bool StrategyRejectBuffer(BufferAccessStrategy strategy, + volatile BufferDesc *buf); + extern int StrategySyncStart(void); extern Size StrategyShmemSize(void); extern void StrategyInitialize(bool init); diff --git a/src/include/storage/bufmgr.h b/src/include/storage/bufmgr.h index ad20362179..9ae83b4253 100644 --- a/src/include/storage/bufmgr.h +++ b/src/include/storage/bufmgr.h @@ -7,7 +7,7 @@ * Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * - * $PostgreSQL: pgsql/src/include/storage/bufmgr.h,v 1.103 2007/05/02 23:18:03 tgl Exp $ + * $PostgreSQL: pgsql/src/include/storage/bufmgr.h,v 1.104 2007/05/30 20:12:03 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -19,6 +19,14 @@ typedef void *Block; +/* Possible arguments for GetAccessStrategy() */ +typedef enum BufferAccessStrategyType +{ + BAS_NORMAL, /* Normal random access */ + BAS_BULKREAD, /* Large read-only scan (hint bit updates are ok) */ + BAS_VACUUM /* VACUUM */ +} BufferAccessStrategyType; + /* in globals.c ... this duplicates miscadmin.h */ extern DLLIMPORT int NBuffers; @@ -111,6 +119,8 @@ extern DLLIMPORT int32 *LocalRefCount; * prototypes for functions in bufmgr.c */ extern Buffer ReadBuffer(Relation reln, BlockNumber blockNum); +extern Buffer ReadBufferWithStrategy(Relation reln, BlockNumber blockNum, + BufferAccessStrategy strategy); extern Buffer ReadOrZeroBuffer(Relation reln, BlockNumber blockNum); extern void ReleaseBuffer(Buffer buffer); extern void UnlockReleaseBuffer(Buffer buffer); @@ -157,6 +167,7 @@ extern void BgBufferSync(void); extern void AtProcExit_LocalBuffers(void); /* in freelist.c */ -extern void StrategyHintVacuum(bool vacuum_active); +extern BufferAccessStrategy GetAccessStrategy(BufferAccessStrategyType btype); +extern void FreeAccessStrategy(BufferAccessStrategy strategy); #endif -- 2.40.0
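[Editor's note, not part of the patch: the hunks above add the caller-visible API for buffer rings -- GetAccessStrategy(), ReadBufferWithStrategy(), and FreeAccessStrategy() -- but the patch itself only shows the plumbing. The sketch below is a hypothetical illustration of how a backend-side bulk scan could use that API so its reads recycle a small ring instead of evicting the rest of shared_buffers. The helper name count_tuple_slots_with_ring and the assumption that it runs inside an ordinary backend transaction are mine; only the strategy calls, buffer-manager calls, and heap_open/heap_close come from the PostgreSQL sources of this era.]

/*
 * Hypothetical example (not from the patch): scan every block of a relation
 * through a BAS_BULKREAD ring and count line pointers, purely as something
 * to do with each page.
 */
#include "postgres.h"

#include "access/heapam.h"
#include "storage/bufmgr.h"
#include "storage/bufpage.h"
#include "utils/rel.h"

static double
count_tuple_slots_with_ring(Oid relid)
{
	Relation	rel = heap_open(relid, AccessShareLock);
	BlockNumber	nblocks = RelationGetNumberOfBlocks(rel);
	BufferAccessStrategy strategy = GetAccessStrategy(BAS_BULKREAD);
	double		total = 0;
	BlockNumber	blkno;

	for (blkno = 0; blkno < nblocks; blkno++)
	{
		/*
		 * All reads for this scan go through the ring selected above, so
		 * repeated iterations keep reusing the same few shared buffers.
		 */
		Buffer		buf = ReadBufferWithStrategy(rel, blkno, strategy);
		Page		page;

		LockBuffer(buf, BUFFER_LOCK_SHARE);
		page = BufferGetPage(buf);
		total += PageGetMaxOffsetNumber(page);
		LockBuffer(buf, BUFFER_LOCK_UNLOCK);
		ReleaseBuffer(buf);
	}

	/* The strategy object is palloc'd in the current memory context. */
	FreeAccessStrategy(strategy);
	heap_close(rel, AccessShareLock);

	return total;
}

[Editor's note: per GetAccessStrategy() above, such a ring would be 256 kB worth of blocks, further capped at NBuffers/8, so a scan like this touches only a small, fixed slice of the shared-buffer arena; passing BAS_NORMAL (or a NULL strategy) instead falls back to the ordinary clock-sweep replacement behavior.]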