From 09cb5c0e7d6fbc9dee26dc429e4fc0f2a88e5272 Mon Sep 17 00:00:00 2001 From: Tom Lane Date: Sun, 7 May 2006 01:21:30 +0000 Subject: [PATCH] Rewrite btree index scans to work a page at a time in all cases (both btgettuple and btgetmulti). This eliminates the problem of "re-finding" the exact stopping point, since the stopping point is effectively always a page boundary, and index items are never moved across pre-existing page boundaries. A small penalty is that the keys_are_unique optimization is effectively disabled (and, therefore, is removed in this patch), causing us to apply _bt_checkkeys() to at least one more tuple than necessary when looking up a unique key. However, the advantages for non-unique cases seem great enough to accept this tradeoff. Aside from simplifying and (sometimes) speeding up the indexscan code, this will allow us to reimplement btbulkdelete as a largely sequential scan instead of index-order traversal, thereby significantly reducing the cost of VACUUM. Those changes will come in a separate patch. Original patch by Heikki Linnakangas, rework by Tom Lane. --- src/backend/access/index/genam.c | 7 +- src/backend/access/index/indexam.c | 89 +---- src/backend/access/nbtree/README | 62 ++- src/backend/access/nbtree/nbtree.c | 319 +++++---------- src/backend/access/nbtree/nbtsearch.c | 549 +++++++++++++++----------- src/backend/access/nbtree/nbtutils.c | 110 ++++-- src/include/access/itup.h | 13 +- src/include/access/nbtree.h | 86 +++- src/include/access/relscan.h | 18 +- 9 files changed, 624 insertions(+), 629 deletions(-) diff --git a/src/backend/access/index/genam.c b/src/backend/access/index/genam.c index 50c940de37..56eabbaf42 100644 --- a/src/backend/access/index/genam.c +++ b/src/backend/access/index/genam.c @@ -8,7 +8,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/access/index/genam.c,v 1.54 2006/03/05 15:58:21 momjian Exp $ + * $PostgreSQL: pgsql/src/backend/access/index/genam.c,v 1.55 2006/05/07 01:21:30 tgl Exp $ * * NOTES * many of the old access method routines have been turned into @@ -90,8 +90,6 @@ RelationGetIndexScan(Relation indexRelation, scan->have_lock = false; /* ditto */ scan->kill_prior_tuple = false; scan->ignore_killed_tuples = true; /* default setting */ - scan->keys_are_unique = false; /* may be set by index AM */ - scan->got_tuple = false; scan->opaque = NULL; @@ -102,9 +100,6 @@ RelationGetIndexScan(Relation indexRelation, scan->xs_ctup.t_data = NULL; scan->xs_cbuf = InvalidBuffer; - scan->unique_tuple_pos = 0; - scan->unique_tuple_mark = 0; - pgstat_initstats(&scan->xs_pgstat_info, indexRelation); /* diff --git a/src/backend/access/index/indexam.c b/src/backend/access/index/indexam.c index b54364acb6..900b34263d 100644 --- a/src/backend/access/index/indexam.c +++ b/src/backend/access/index/indexam.c @@ -8,7 +8,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/access/index/indexam.c,v 1.92 2006/05/02 22:25:10 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/access/index/indexam.c,v 1.93 2006/05/07 01:21:30 tgl Exp $ * * INTERFACE ROUTINES * index_open - open an index relation by relation OID @@ -362,10 +362,6 @@ index_rescan(IndexScanDesc scan, ScanKey key) } scan->kill_prior_tuple = false; /* for safety */ - scan->keys_are_unique = false; /* may be set by index AM */ - scan->got_tuple = false; - scan->unique_tuple_pos = 0; - scan->unique_tuple_mark = 0; FunctionCall2(procedure, PointerGetDatum(scan), @@ -417,8 +413,6 @@ index_markpos(IndexScanDesc scan) SCAN_CHECKS; GET_SCAN_PROCEDURE(ammarkpos); - 
scan->unique_tuple_mark = scan->unique_tuple_pos; - FunctionCall1(procedure, PointerGetDatum(scan)); } @@ -440,13 +434,6 @@ index_restrpos(IndexScanDesc scan) scan->kill_prior_tuple = false; /* for safety */ - /* - * We do not reset got_tuple; so if the scan is actually being - * short-circuited by index_getnext, the effective position restoration is - * done by restoring unique_tuple_pos. - */ - scan->unique_tuple_pos = scan->unique_tuple_mark; - FunctionCall1(procedure, PointerGetDatum(scan)); } @@ -456,8 +443,7 @@ index_restrpos(IndexScanDesc scan) * The result is the next heap tuple satisfying the scan keys and the * snapshot, or NULL if no more matching tuples exist. On success, * the buffer containing the heap tuple is pinned (the pin will be dropped - * at the next index_getnext or index_endscan). The index TID corresponding - * to the heap tuple can be obtained if needed from scan->currentItemData. + * at the next index_getnext or index_endscan). * ---------------- */ HeapTuple @@ -469,65 +455,6 @@ index_getnext(IndexScanDesc scan, ScanDirection direction) SCAN_CHECKS; GET_SCAN_PROCEDURE(amgettuple); - /* - * If we already got a tuple and it must be unique, there's no need to - * make the index AM look through any additional tuples. (This can save a - * useful amount of work in scenarios where there are many dead tuples due - * to heavy update activity.) - * - * To do this we must keep track of the logical scan position - * (before/on/after tuple). Also, we have to be sure to release scan - * resources before returning NULL; if we fail to do so then a multi-index - * scan can easily run the system out of free buffers. We can release - * index-level resources fairly cheaply by calling index_rescan. This - * means there are two persistent states as far as the index AM is - * concerned: on-tuple and rescanned. If we are actually asked to - * re-fetch the single tuple, we have to go through a fresh indexscan - * startup, which penalizes that (infrequent) case. - */ - if (scan->keys_are_unique && scan->got_tuple) - { - int new_tuple_pos = scan->unique_tuple_pos; - - if (ScanDirectionIsForward(direction)) - { - if (new_tuple_pos <= 0) - new_tuple_pos++; - } - else - { - if (new_tuple_pos >= 0) - new_tuple_pos--; - } - if (new_tuple_pos == 0) - { - /* - * We are moving onto the unique tuple from having been off it. We - * just fall through and let the index AM do the work. Note we - * should get the right answer regardless of scan direction. - */ - scan->unique_tuple_pos = 0; /* need to update position */ - } - else - { - /* - * Moving off the tuple; must do amrescan to release index-level - * pins before we return NULL. Since index_rescan will reset my - * state, must save and restore... - */ - int unique_tuple_mark = scan->unique_tuple_mark; - - index_rescan(scan, NULL /* no change to key */ ); - - scan->keys_are_unique = true; - scan->got_tuple = true; - scan->unique_tuple_pos = new_tuple_pos; - scan->unique_tuple_mark = unique_tuple_mark; - - return NULL; - } - } - /* just make sure this is false... */ scan->kill_prior_tuple = false; @@ -588,14 +515,6 @@ index_getnext(IndexScanDesc scan, ScanDirection direction) } /* Success exit */ - scan->got_tuple = true; - - /* - * If we just fetched a known-unique tuple, then subsequent calls will go - * through the short-circuit code above. unique_tuple_pos has been - * initialized to 0, which is the correct state ("on row"). 
- */ - return heapTuple; } @@ -608,8 +527,8 @@ index_getnext(IndexScanDesc scan, ScanDirection direction) * (which most callers of this routine will probably want to suppress by * setting scan->ignore_killed_tuples = false). * - * On success (TRUE return), the found index TID is in scan->currentItemData, - * and its heap TID is in scan->xs_ctup.t_self. scan->xs_cbuf is untouched. + * On success (TRUE return), the heap TID of the found index entry is in + * scan->xs_ctup.t_self. scan->xs_cbuf is untouched. * ---------------- */ bool diff --git a/src/backend/access/nbtree/README b/src/backend/access/nbtree/README index 9fd6c5731c..5c50972883 100644 --- a/src/backend/access/nbtree/README +++ b/src/backend/access/nbtree/README @@ -1,4 +1,4 @@ -$PostgreSQL: pgsql/src/backend/access/nbtree/README,v 1.10 2006/04/25 22:46:05 tgl Exp $ +$PostgreSQL: pgsql/src/backend/access/nbtree/README,v 1.11 2006/05/07 01:21:30 tgl Exp $ This directory contains a correct implementation of Lehman and Yao's high-concurrency B-tree management algorithm (P. Lehman and S. Yao, @@ -67,13 +67,22 @@ move right until we find a page whose right-link matches the page we came from. (Actually, it's even harder than that; see deletion discussion below.) -Read locks on a page are held for as long as a scan is examining a page. -But nbtree.c arranges to drop the read lock, but not the buffer pin, -on the current page of a scan before control leaves nbtree. When we -come back to resume the scan, we have to re-grab the read lock and -then move right if the current item moved (see _bt_restscan()). Keeping -the pin ensures that the current item cannot move left or be deleted -(see btbulkdelete). +Page read locks are held only for as long as a scan is examining a page. +To minimize lock/unlock traffic, an index scan always searches a leaf page +to identify all the matching items at once, copying their heap tuple IDs +into backend-local storage. The heap tuple IDs are then processed while +not holding any page lock within the index. We do continue to hold a pin +on the leaf page, to protect against concurrent deletions (see below). +In this state the scan is effectively stopped "between" pages, either +before or after the page it has pinned. This is safe in the presence of +concurrent insertions and even page splits, because items are never moved +across pre-existing page boundaries --- so the scan cannot miss any items +it should have seen, nor accidentally return the same item twice. The scan +must remember the page's right-link at the time it was scanned, since that +is the page to move right to; if we move right to the current right-link +then we'd re-scan any items moved by a page split. We don't similarly +remember the left-link, since it's best to use the most up-to-date +left-link when trying to move left (see detailed move-left algorithm below). In most cases we release our lock and pin on a page before attempting to acquire pin and lock on the page we are moving to. In a few places @@ -119,14 +128,33 @@ item doesn't fit on the split page where it needs to go! The deletion algorithm ---------------------- -Deletions of leaf items are handled by getting a super-exclusive lock on -the target page, so that no other backend has a pin on the page when the -deletion starts. This means no scan is pointing at the page, so no other -backend can lose its place due to the item deletion. - -The above does not work for deletion of items in internal pages, since -other backends keep no lock nor pin on a page they have descended past. 
-Instead, when a backend is ascending the tree using its stack, it must +Before deleting a leaf item, we get a super-exclusive lock on the target +page, so that no other backend has a pin on the page when the deletion +starts. This is not necessary for correctness in terms of the btree index +operations themselves; as explained above, index scans logically stop +"between" pages and so can't lose their place. The reason we do it is to +provide an interlock between non-full VACUUM and indexscans. Since VACUUM +deletes index entries before deleting tuples, the super-exclusive lock +guarantees that VACUUM can't delete any heap tuple that an indexscanning +process might be about to visit. (This guarantee works only for simple +indexscans that visit the heap in sync with the index scan, not for bitmap +scans. We only need the guarantee when using non-MVCC snapshot rules such +as SnapshotNow, so in practice this is only important for system catalog +accesses.) + +Because a page can be split even while someone holds a pin on it, it is +possible that an indexscan will return items that are no longer stored on +the page it has a pin on, but rather somewhere to the right of that page. +To ensure that VACUUM can't prematurely remove such heap tuples, we require +btbulkdelete to obtain super-exclusive lock on every leaf page in the index +(even pages that don't contain any deletable tuples). This guarantees that +the btbulkdelete call cannot return while any indexscan is still holding +a copy of a deleted index tuple. Note that this requirement does not say +that btbulkdelete must visit the pages in any particular order. + +There is no such interlocking for deletion of items in internal pages, +since backends keep no lock nor pin on a page they have descended past. +Hence, when a backend is ascending the tree using its stack, it must be prepared for the possibility that the item it wants is to the left of the recorded position (but it can't have moved left out of the recorded page). Since we hold a lock on the lower page (per L&Y) until we have @@ -201,7 +229,7 @@ accordingly. Searches and forward scans simply follow the right-link until they find a non-dead page --- this will be where the deleted page's key-space moved to. 
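+In code form, the forward walk over dead pages is roughly (a minimal
+sketch in the style of _bt_steppage(); error handling omitted):
+
+	blkno = so->currPos.nextPage;	/* right-link saved while reading the page */
+	for (;;)
+	{
+		if (blkno == P_NONE)
+			break;			/* fell off the rightmost page */
+		buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);
+		opaque = (BTPageOpaque) PageGetSpecialPointer(BufferGetPage(buf));
+		if (!P_IGNORE(opaque))
+			break;			/* found a live page to read */
+		blkno = opaque->btpo_next;	/* dead page, keep following right-links */
+	}
+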
-Stepping left in a backward scan is complicated because we must consider +Moving left in a backward scan is complicated because we must consider the possibility that the left sibling was just split (meaning we must find the rightmost page derived from the left sibling), plus the possibility that the page we were just on has now been deleted and hence isn't in the diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c index 1eece10398..c7aee07b3a 100644 --- a/src/backend/access/nbtree/nbtree.c +++ b/src/backend/access/nbtree/nbtree.c @@ -12,7 +12,7 @@ * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/access/nbtree/nbtree.c,v 1.146 2006/05/02 22:25:10 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/access/nbtree/nbtree.c,v 1.147 2006/05/07 01:21:30 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -48,7 +48,6 @@ typedef struct } BTBuildState; -static void _bt_restscan(IndexScanDesc scan); static void btbuildCallback(Relation index, HeapTuple htup, Datum *values, @@ -218,8 +217,6 @@ btgettuple(PG_FUNCTION_ARGS) IndexScanDesc scan = (IndexScanDesc) PG_GETARG_POINTER(0); ScanDirection dir = (ScanDirection) PG_GETARG_INT32(1); BTScanOpaque so = (BTScanOpaque) scan->opaque; - Page page; - OffsetNumber offnum; bool res; /* @@ -227,33 +224,26 @@ btgettuple(PG_FUNCTION_ARGS) * appropriate direction. If we haven't done so yet, we call a routine to * get the first item in the scan. */ - if (ItemPointerIsValid(&(scan->currentItemData))) + if (BTScanPosIsValid(so->currPos)) { - /* - * Restore scan position using heap TID returned by previous call to - * btgettuple(). _bt_restscan() re-grabs the read lock on the buffer, - * too. - */ - _bt_restscan(scan); - /* * Check to see if we should kill the previously-fetched tuple. */ if (scan->kill_prior_tuple) { /* - * Yes, so mark it by setting the LP_DELETE bit in the item flags. - */ - offnum = ItemPointerGetOffsetNumber(&(scan->currentItemData)); - page = BufferGetPage(so->btso_curbuf); - PageGetItemId(page, offnum)->lp_flags |= LP_DELETE; - - /* - * Since this can be redone later if needed, it's treated the same - * as a commit-hint-bit status update for heap tuples: we mark the - * buffer dirty but don't make a WAL log entry. + * Yes, remember it for later. (We'll deal with all such tuples + * at once right before leaving the index page.) The test for + * numKilled overrun is not just paranoia: if the caller reverses + * direction in the indexscan then the same item might get entered + * multiple times. It's not worth trying to optimize that, so we + * don't detect it, but instead just forget any excess entries. */ - SetBufferCommitInfoNeedsSave(so->btso_curbuf); + if (so->killedItems == NULL) + so->killedItems = (int *) + palloc(MaxIndexTuplesPerPage * sizeof(int)); + if (so->numKilled < MaxIndexTuplesPerPage) + so->killedItems[so->numKilled++] = so->currPos.itemIndex; } /* @@ -264,29 +254,15 @@ btgettuple(PG_FUNCTION_ARGS) else res = _bt_first(scan, dir); - /* - * Save heap TID to use it in _bt_restscan. Then release the read lock on - * the buffer so that we aren't blocking other backends. - * - * NOTE: we do keep the pin on the buffer! This is essential to ensure - * that someone else doesn't delete the index entry we are stopped on. 
- */ - if (res) - { - ((BTScanOpaque) scan->opaque)->curHeapIptr = scan->xs_ctup.t_self; - LockBuffer(((BTScanOpaque) scan->opaque)->btso_curbuf, - BUFFER_LOCK_UNLOCK); - } - PG_RETURN_BOOL(res); } /* * btgetmulti() -- get multiple tuples at once * - * This is a somewhat generic implementation: it avoids the _bt_restscan - * overhead, but there's no smarts about picking especially good stopping - * points such as index page boundaries. + * In the current implementation there seems no strong reason to stop at + * index page boundaries; we just press on until we fill the caller's buffer + * or run out of matches. */ Datum btgetmulti(PG_FUNCTION_ARGS) @@ -299,36 +275,41 @@ btgetmulti(PG_FUNCTION_ARGS) bool res = true; int32 ntids = 0; - /* - * Restore prior state if we were already called at least once. - */ - if (ItemPointerIsValid(&(scan->currentItemData))) - _bt_restscan(scan); + if (max_tids <= 0) /* behave correctly in boundary case */ + PG_RETURN_BOOL(true); - while (ntids < max_tids) + /* If we haven't started the scan yet, fetch the first page & tuple. */ + if (!BTScanPosIsValid(so->currPos)) { - /* - * Start scan, or advance to next tuple. - */ - if (ItemPointerIsValid(&(scan->currentItemData))) - res = _bt_next(scan, ForwardScanDirection); - else - res = _bt_first(scan, ForwardScanDirection); + res = _bt_first(scan, ForwardScanDirection); if (!res) - break; + { + /* empty scan */ + *returned_tids = ntids; + PG_RETURN_BOOL(res); + } /* Save tuple ID, and continue scanning */ tids[ntids] = scan->xs_ctup.t_self; ntids++; } - /* - * Save heap TID to use it in _bt_restscan. Then release the read lock on - * the buffer so that we aren't blocking other backends. - */ - if (res) + while (ntids < max_tids) { - so->curHeapIptr = scan->xs_ctup.t_self; - LockBuffer(so->btso_curbuf, BUFFER_LOCK_UNLOCK); + /* + * Advance to next tuple within page. This is the same as the + * easy case in _bt_next(). 
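+		 *
+		 * (That is, the common case just advances itemIndex through the
+		 * cached items[] array; we pay for a buffer lock, and possibly
+		 * a physical step right, only when the cached page is used up.)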
+ */ + if (++so->currPos.itemIndex > so->currPos.lastItem) + { + /* let _bt_next do the heavy lifting */ + res = _bt_next(scan, ForwardScanDirection); + if (!res) + break; + } + + /* Save tuple ID, and continue scanning */ + tids[ntids] = so->currPos.items[so->currPos.itemIndex].heapTid; + ntids++; } *returned_tids = ntids; @@ -360,7 +341,6 @@ btrescan(PG_FUNCTION_ARGS) { IndexScanDesc scan = (IndexScanDesc) PG_GETARG_POINTER(0); ScanKey scankey = (ScanKey) PG_GETARG_POINTER(1); - ItemPointer iptr; BTScanOpaque so; so = (BTScanOpaque) scan->opaque; @@ -368,31 +348,30 @@ btrescan(PG_FUNCTION_ARGS) if (so == NULL) /* if called from btbeginscan */ { so = (BTScanOpaque) palloc(sizeof(BTScanOpaqueData)); - so->btso_curbuf = so->btso_mrkbuf = InvalidBuffer; - ItemPointerSetInvalid(&(so->curHeapIptr)); - ItemPointerSetInvalid(&(so->mrkHeapIptr)); + so->currPos.buf = so->markPos.buf = InvalidBuffer; if (scan->numberOfKeys > 0) so->keyData = (ScanKey) palloc(scan->numberOfKeys * sizeof(ScanKeyData)); else so->keyData = NULL; + so->killedItems = NULL; /* until needed */ + so->numKilled = 0; scan->opaque = so; } /* we aren't holding any read locks, but gotta drop the pins */ - if (ItemPointerIsValid(iptr = &(scan->currentItemData))) + if (BTScanPosIsValid(so->currPos)) { - ReleaseBuffer(so->btso_curbuf); - so->btso_curbuf = InvalidBuffer; - ItemPointerSetInvalid(&(so->curHeapIptr)); - ItemPointerSetInvalid(iptr); + /* Before leaving current page, deal with any killed items */ + if (so->numKilled > 0) + _bt_killitems(scan, false); + ReleaseBuffer(so->currPos.buf); + so->currPos.buf = InvalidBuffer; } - if (ItemPointerIsValid(iptr = &(scan->currentMarkData))) + if (BTScanPosIsValid(so->markPos)) { - ReleaseBuffer(so->btso_mrkbuf); - so->btso_mrkbuf = InvalidBuffer; - ItemPointerSetInvalid(&(so->mrkHeapIptr)); - ItemPointerSetInvalid(iptr); + ReleaseBuffer(so->markPos.buf); + so->markPos.buf = InvalidBuffer; } /* @@ -415,28 +394,26 @@ Datum btendscan(PG_FUNCTION_ARGS) { IndexScanDesc scan = (IndexScanDesc) PG_GETARG_POINTER(0); - ItemPointer iptr; - BTScanOpaque so; - - so = (BTScanOpaque) scan->opaque; + BTScanOpaque so = (BTScanOpaque) scan->opaque; /* we aren't holding any read locks, but gotta drop the pins */ - if (ItemPointerIsValid(iptr = &(scan->currentItemData))) + if (BTScanPosIsValid(so->currPos)) { - if (BufferIsValid(so->btso_curbuf)) - ReleaseBuffer(so->btso_curbuf); - so->btso_curbuf = InvalidBuffer; - ItemPointerSetInvalid(iptr); + /* Before leaving current page, deal with any killed items */ + if (so->numKilled > 0) + _bt_killitems(scan, false); + ReleaseBuffer(so->currPos.buf); + so->currPos.buf = InvalidBuffer; } - if (ItemPointerIsValid(iptr = &(scan->currentMarkData))) + if (BTScanPosIsValid(so->markPos)) { - if (BufferIsValid(so->btso_mrkbuf)) - ReleaseBuffer(so->btso_mrkbuf); - so->btso_mrkbuf = InvalidBuffer; - ItemPointerSetInvalid(iptr); + ReleaseBuffer(so->markPos.buf); + so->markPos.buf = InvalidBuffer; } + if (so->killedItems != NULL) + pfree(so->killedItems); if (so->keyData != NULL) pfree(so->keyData); pfree(so); @@ -451,26 +428,22 @@ Datum btmarkpos(PG_FUNCTION_ARGS) { IndexScanDesc scan = (IndexScanDesc) PG_GETARG_POINTER(0); - ItemPointer iptr; - BTScanOpaque so; - - so = (BTScanOpaque) scan->opaque; + BTScanOpaque so = (BTScanOpaque) scan->opaque; /* we aren't holding any read locks, but gotta drop the pin */ - if (ItemPointerIsValid(iptr = &(scan->currentMarkData))) + if (BTScanPosIsValid(so->markPos)) { - ReleaseBuffer(so->btso_mrkbuf); - so->btso_mrkbuf = 
InvalidBuffer; - ItemPointerSetInvalid(iptr); + ReleaseBuffer(so->markPos.buf); + so->markPos.buf = InvalidBuffer; } /* bump pin on current buffer for assignment to mark buffer */ - if (ItemPointerIsValid(&(scan->currentItemData))) + if (BTScanPosIsValid(so->currPos)) { - IncrBufferRefCount(so->btso_curbuf); - so->btso_mrkbuf = so->btso_curbuf; - scan->currentMarkData = scan->currentItemData; - so->mrkHeapIptr = so->curHeapIptr; + IncrBufferRefCount(so->currPos.buf); + memcpy(&so->markPos, &so->currPos, + offsetof(BTScanPosData, items[1]) + + so->currPos.lastItem * sizeof(BTScanPosItem)); } PG_RETURN_VOID(); @@ -483,26 +456,26 @@ Datum btrestrpos(PG_FUNCTION_ARGS) { IndexScanDesc scan = (IndexScanDesc) PG_GETARG_POINTER(0); - ItemPointer iptr; - BTScanOpaque so; - - so = (BTScanOpaque) scan->opaque; + BTScanOpaque so = (BTScanOpaque) scan->opaque; /* we aren't holding any read locks, but gotta drop the pin */ - if (ItemPointerIsValid(iptr = &(scan->currentItemData))) + if (BTScanPosIsValid(so->currPos)) { - ReleaseBuffer(so->btso_curbuf); - so->btso_curbuf = InvalidBuffer; - ItemPointerSetInvalid(iptr); + /* Before leaving current page, deal with any killed items */ + if (so->numKilled > 0 && + so->currPos.buf != so->markPos.buf) + _bt_killitems(scan, false); + ReleaseBuffer(so->currPos.buf); + so->currPos.buf = InvalidBuffer; } /* bump pin on marked buffer */ - if (ItemPointerIsValid(&(scan->currentMarkData))) + if (BTScanPosIsValid(so->markPos)) { - IncrBufferRefCount(so->btso_mrkbuf); - so->btso_curbuf = so->btso_mrkbuf; - scan->currentItemData = scan->currentMarkData; - so->curHeapIptr = so->mrkHeapIptr; + IncrBufferRefCount(so->markPos.buf); + memcpy(&so->currPos, &so->markPos, + offsetof(BTScanPosData, items[1]) + + so->markPos.lastItem * sizeof(BTScanPosItem)); } PG_RETURN_VOID(); @@ -537,12 +510,11 @@ btbulkdelete(PG_FUNCTION_ARGS) * sequence left to right. It sounds attractive to only exclusive-lock * those containing items we need to delete, but unfortunately that is not * safe: we could then pass a stopped indexscan, which could in rare cases - * lead to deleting the item it needs to find when it resumes. (See - * _bt_restscan --- this could only happen if an indexscan stops on a - * deletable item and then a page split moves that item into a page - * further to its right, which the indexscan will have no pin on.) We can - * skip obtaining exclusive lock on empty pages though, since no indexscan - * could be stopped on those. + * lead to deleting items that the indexscan will still return later. + * (See discussion in nbtree/README.) We can skip obtaining exclusive + * lock on empty pages though, since no indexscan could be stopped on + * those. (Note: this presumes that a split couldn't have left either + * page totally empty.) */ buf = _bt_get_endpoint(rel, 0, false); @@ -823,108 +795,3 @@ btvacuumcleanup(PG_FUNCTION_ARGS) PG_RETURN_POINTER(stats); } - -/* - * Restore scan position when btgettuple is called to continue a scan. - * - * This is nontrivial because concurrent insertions might have moved the - * index tuple we stopped on. We assume the tuple can only have moved to - * the right from our stop point, because we kept a pin on the buffer, - * and so no deletion can have occurred on that page. - * - * On entry, we have a pin but no read lock on the buffer that contained - * the index tuple we stopped the scan on. On exit, we have pin and read - * lock on the buffer that now contains that index tuple, and the scandesc's - * current position is updated to point at it. 
- */ -static void -_bt_restscan(IndexScanDesc scan) -{ - Relation rel = scan->indexRelation; - BTScanOpaque so = (BTScanOpaque) scan->opaque; - Buffer buf = so->btso_curbuf; - Page page; - ItemPointer current = &(scan->currentItemData); - OffsetNumber offnum = ItemPointerGetOffsetNumber(current), - maxoff; - BTPageOpaque opaque; - Buffer nextbuf; - ItemPointer target = &(so->curHeapIptr); - IndexTuple itup; - BlockNumber blkno; - - /* - * Reacquire read lock on the buffer. (We should still have a - * reference-count pin on it, so need not get that.) - */ - LockBuffer(buf, BT_READ); - - page = BufferGetPage(buf); - maxoff = PageGetMaxOffsetNumber(page); - opaque = (BTPageOpaque) PageGetSpecialPointer(page); - - /* - * We use this as flag when first index tuple on page is deleted but we do - * not move left (this would slowdown vacuum) - so we set - * current->ip_posid before first index tuple on the current page - * (_bt_step will move it right)... XXX still needed? - */ - if (!ItemPointerIsValid(target)) - { - ItemPointerSetOffsetNumber(current, - OffsetNumberPrev(P_FIRSTDATAKEY(opaque))); - return; - } - - /* - * The item we were on may have moved right due to insertions. Find it - * again. We use the heap TID to identify the item uniquely. - */ - for (;;) - { - /* Check for item on this page */ - for (; - offnum <= maxoff; - offnum = OffsetNumberNext(offnum)) - { - itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum)); - if (BTTidSame(itup->t_tid, *target)) - { - /* Found it */ - current->ip_posid = offnum; - return; - } - } - - /* - * The item we're looking for moved right at least one page, so move - * right. We are careful here to pin and read-lock the next non-dead - * page before releasing the current one. This ensures that a - * concurrent btbulkdelete scan cannot pass our position --- if it - * did, it might be able to reach and delete our target item before we - * can find it again. 
- */ - if (P_RIGHTMOST(opaque)) - elog(ERROR, "failed to re-find previous key in \"%s\"", - RelationGetRelationName(rel)); - /* Advance to next non-dead page --- there must be one */ - nextbuf = InvalidBuffer; - for (;;) - { - blkno = opaque->btpo_next; - nextbuf = _bt_relandgetbuf(rel, nextbuf, blkno, BT_READ); - page = BufferGetPage(nextbuf); - opaque = (BTPageOpaque) PageGetSpecialPointer(page); - if (!P_IGNORE(opaque)) - break; - if (P_RIGHTMOST(opaque)) - elog(ERROR, "fell off the end of \"%s\"", - RelationGetRelationName(rel)); - } - _bt_relbuf(rel, buf); - so->btso_curbuf = buf = nextbuf; - maxoff = PageGetMaxOffsetNumber(page); - offnum = P_FIRSTDATAKEY(opaque); - ItemPointerSet(current, blkno, offnum); - } -} diff --git a/src/backend/access/nbtree/nbtsearch.c b/src/backend/access/nbtree/nbtsearch.c index 4afe9aa4f1..2c1dfc3eb4 100644 --- a/src/backend/access/nbtree/nbtsearch.c +++ b/src/backend/access/nbtree/nbtsearch.c @@ -8,7 +8,7 @@ * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/access/nbtree/nbtsearch.c,v 1.104 2006/03/05 15:58:21 momjian Exp $ + * $PostgreSQL: pgsql/src/backend/access/nbtree/nbtsearch.c,v 1.105 2006/05/07 01:21:30 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -21,6 +21,9 @@ #include "utils/lsyscache.h" +static bool _bt_readpage(IndexScanDesc scan, ScanDirection dir, + OffsetNumber offnum); +static bool _bt_steppage(IndexScanDesc scan, ScanDirection dir); static Buffer _bt_walk_left(Relation rel, Buffer buf); static bool _bt_endpoint(IndexScanDesc scan, ScanDirection dir); @@ -410,70 +413,19 @@ _bt_compare(Relation rel, return 0; } -/* - * _bt_next() -- Get the next item in a scan. - * - * On entry, we have a valid currentItemData in the scan, and a - * read lock and pin count on the page that contains that item. - * We return the next item in the scan, or false if no more. - * On successful exit, the page containing the new item is locked - * and pinned; on failure exit, no lock or pin is held. - */ -bool -_bt_next(IndexScanDesc scan, ScanDirection dir) -{ - Relation rel; - Buffer buf; - Page page; - OffsetNumber offnum; - ItemPointer current; - BTScanOpaque so; - bool continuescan; - - rel = scan->indexRelation; - so = (BTScanOpaque) scan->opaque; - current = &(scan->currentItemData); - - /* we still have the buffer pinned and locked */ - buf = so->btso_curbuf; - Assert(BufferIsValid(buf)); - - do - { - /* step one tuple in the appropriate direction */ - if (!_bt_step(scan, &buf, dir)) - return false; - - /* current is the next candidate tuple to return */ - offnum = ItemPointerGetOffsetNumber(current); - page = BufferGetPage(buf); - - if (_bt_checkkeys(scan, page, offnum, dir, &continuescan)) - { - /* tuple passes all scan key conditions, so return it */ - return true; - } - - /* This tuple doesn't pass, but there might be more that do */ - } while (continuescan); - - /* No more items, so close down the current-item info */ - ItemPointerSetInvalid(current); - so->btso_curbuf = InvalidBuffer; - _bt_relbuf(rel, buf); - - return false; -} - /* * _bt_first() -- Find the first item in a scan. * * We need to be clever about the direction of scan, the search * conditions, and the tree ordering. We find the first item (or, * if backwards scan, the last item) in the tree that satisfies the - * qualifications in the scan key. 
On exit, the page containing - * the current index tuple is read locked and pinned, and the scan's - * opaque data entry is updated to include the buffer. + * qualifications in the scan key. On success exit, the page containing + * the current index tuple is pinned but not locked, and data about + * the matching tuple(s) on the page has been loaded into so->currPos, + * and scan->xs_ctup.t_self is set to the heap TID of the current tuple. + * + * If there are no matching items in the index, we return FALSE, with no + * pins or locks held. * * Note that scan->keyData[], and the so->keyData[] scankey built from it, * are both search-type scankeys (see nbtree/README for more about this). @@ -486,16 +438,11 @@ _bt_first(IndexScanDesc scan, ScanDirection dir) Relation rel = scan->indexRelation; BTScanOpaque so = (BTScanOpaque) scan->opaque; Buffer buf; - Page page; BTStack stack; OffsetNumber offnum; - ItemPointer current; - BlockNumber blkno; StrategyNumber strat; - bool res; bool nextkey; bool goback; - bool continuescan; ScanKey startKeys[INDEX_MAX_KEYS]; ScanKeyData scankeys[INDEX_MAX_KEYS]; int keysCount = 0; @@ -847,26 +794,31 @@ _bt_first(IndexScanDesc scan, ScanDirection dir) /* don't need to keep the stack around... */ _bt_freestack(stack); - current = &(scan->currentItemData); + /* remember which buffer we have pinned, if any */ + so->currPos.buf = buf; if (!BufferIsValid(buf)) { /* Only get here if index is completely empty */ - ItemPointerSetInvalid(current); - so->btso_curbuf = InvalidBuffer; return false; } - /* remember which buffer we have pinned */ - so->btso_curbuf = buf; + /* initialize moreLeft/moreRight appropriately for scan direction */ + if (ScanDirectionIsForward(dir)) + { + so->currPos.moreLeft = false; + so->currPos.moreRight = true; + } + else + { + so->currPos.moreLeft = true; + so->currPos.moreRight = false; + } + so->numKilled = 0; /* just paranoia */ /* position to the precise item on the page */ offnum = _bt_binsrch(rel, buf, keysCount, scankeys, nextkey); - page = BufferGetPage(buf); - blkno = BufferGetBlockNumber(buf); - ItemPointerSet(current, blkno, offnum); - /* * If nextkey = false, we are positioned at the first item >= scan key, or * possibly at the end of a page on which all the existing items are less @@ -880,174 +832,310 @@ _bt_first(IndexScanDesc scan, ScanDirection dir) * * The actually desired starting point is either this item or the prior * one, or in the end-of-page case it's the first item on the next page or - * the last item on this page. We apply _bt_step if needed to get to the - * right place. - * - * If _bt_step fails (meaning we fell off the end of the index in one - * direction or the other), then there are no matches so we just return - * false. + * the last item on this page. Adjust the starting offset if needed. + * (If this results in an offset before the first item or after the last + * one, _bt_readpage will report no items found, and then we'll step to + * the next page as needed.) */ if (goback) + offnum = OffsetNumberPrev(offnum); + + /* + * Now load data from the first page of the scan. + */ + if (!_bt_readpage(scan, dir, offnum)) { - /* _bt_step will do the right thing if we are at end-of-page */ - if (!_bt_step(scan, &buf, BackwardScanDirection)) + /* + * There's no actually-matching data on this page. Try to advance to + * the next page. Return false if there's no matching data at all. 
+ */ + if (!_bt_steppage(scan, dir)) return false; } + + /* Drop the lock, but not pin, on the current page */ + LockBuffer(so->currPos.buf, BUFFER_LOCK_UNLOCK); + + /* OK, itemIndex says what to return */ + scan->xs_ctup.t_self = so->currPos.items[so->currPos.itemIndex].heapTid; + + return true; +} + +/* + * _bt_next() -- Get the next item in a scan. + * + * On entry, so->currPos describes the current page, which is pinned + * but not locked, and so->currPos.itemIndex identifies which item was + * previously returned. + * + * On successful exit, scan->xs_ctup.t_self is set to the TID of the + * next heap tuple, and so->currPos is updated as needed. + * + * On failure exit (no more tuples), we release pin and set + * so->currPos.buf to InvalidBuffer. + */ +bool +_bt_next(IndexScanDesc scan, ScanDirection dir) +{ + BTScanOpaque so = (BTScanOpaque) scan->opaque; + + /* + * Advance to next tuple on current page; or if there's no more, + * try to step to the next page with data. + */ + if (ScanDirectionIsForward(dir)) + { + if (++so->currPos.itemIndex > so->currPos.lastItem) + { + /* We must acquire lock before applying _bt_steppage */ + Assert(BufferIsValid(so->currPos.buf)); + LockBuffer(so->currPos.buf, BT_READ); + if (!_bt_steppage(scan, dir)) + return false; + /* Drop the lock, but not pin, on the new page */ + LockBuffer(so->currPos.buf, BUFFER_LOCK_UNLOCK); + } + } else { - /* If we're at end-of-page, must step forward to next page */ - if (offnum > PageGetMaxOffsetNumber(page)) + if (--so->currPos.itemIndex < so->currPos.firstItem) { - if (!_bt_step(scan, &buf, ForwardScanDirection)) + /* We must acquire lock before applying _bt_steppage */ + Assert(BufferIsValid(so->currPos.buf)); + LockBuffer(so->currPos.buf, BT_READ); + if (!_bt_steppage(scan, dir)) return false; + /* Drop the lock, but not pin, on the new page */ + LockBuffer(so->currPos.buf, BUFFER_LOCK_UNLOCK); } } - /* okay, current item pointer for the scan is right */ - offnum = ItemPointerGetOffsetNumber(current); - page = BufferGetPage(buf); + /* OK, itemIndex says what to return */ + scan->xs_ctup.t_self = so->currPos.items[so->currPos.itemIndex].heapTid; - /* is the first item actually acceptable? */ - if (_bt_checkkeys(scan, page, offnum, dir, &continuescan)) - { - /* yes, return it */ - res = true; - } - else if (continuescan) + return true; +} + +/* + * _bt_readpage() -- Load data from current index page into so->currPos + * + * Caller must have pinned and read-locked so->currPos.buf; the buffer's state + * is not changed here. Also, currPos.moreLeft and moreRight must be valid; + * they are updated as appropriate. All other fields of so->currPos are + * initialized from scratch here. + * + * We scan the current page starting at offnum and moving in the indicated + * direction. All items matching the scan keys are loaded into currPos.items. + * moreLeft or moreRight (as appropriate) is cleared if _bt_checkkeys reports + * that there can be no more matching tuples in the current scan direction. + * + * Returns true if any matching items found on the page, false if none. 
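+ *
+ * Example: with an equality qual on 42 and a page holding keys
+ * 40,41,42,42,43, _bt_first positions the scan at the first 42; the
+ * forward read then saves the two matching heap TIDs, and at key 43
+ * _bt_checkkeys reports continuescan = false, so we clear moreRight
+ * and stop without visiting any page further right.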
+ */ +static bool +_bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum) +{ + BTScanOpaque so = (BTScanOpaque) scan->opaque; + Page page; + BTPageOpaque opaque; + OffsetNumber minoff; + OffsetNumber maxoff; + int itemIndex; + bool continuescan; + + /* we must have the buffer pinned and locked */ + Assert(BufferIsValid(so->currPos.buf)); + + page = BufferGetPage(so->currPos.buf); + opaque = (BTPageOpaque) PageGetSpecialPointer(page); + minoff = P_FIRSTDATAKEY(opaque); + maxoff = PageGetMaxOffsetNumber(page); + + /* + * we must save the page's right-link while scanning it; this tells us + * where to step right to after we're done with these items. There is + * no corresponding need for the left-link, since splits always go right. + */ + so->currPos.nextPage = opaque->btpo_next; + + if (ScanDirectionIsForward(dir)) { - /* no, but there might be another one that is */ - res = _bt_next(scan, dir); + /* load items[] in ascending order */ + itemIndex = 0; + + offnum = Max(offnum, minoff); + + while (offnum <= maxoff) + { + if (_bt_checkkeys(scan, page, offnum, dir, &continuescan)) + { + /* tuple passes all scan key conditions, so remember it */ + /* _bt_checkkeys put the heap ptr into scan->xs_ctup.t_self */ + so->currPos.items[itemIndex].heapTid = scan->xs_ctup.t_self; + so->currPos.items[itemIndex].indexOffset = offnum; + itemIndex++; + } + if (!continuescan) + { + /* there can't be any more matches, so stop */ + so->currPos.moreRight = false; + break; + } + + offnum = OffsetNumberNext(offnum); + } + + Assert(itemIndex <= MaxIndexTuplesPerPage); + so->currPos.firstItem = 0; + so->currPos.lastItem = itemIndex - 1; + so->currPos.itemIndex = 0; } else { - /* no tuples in the index match this scan key */ - ItemPointerSetInvalid(current); - so->btso_curbuf = InvalidBuffer; - _bt_relbuf(rel, buf); - res = false; + /* load items[] in descending order */ + itemIndex = MaxIndexTuplesPerPage; + + offnum = Min(offnum, maxoff); + + while (offnum >= minoff) + { + if (_bt_checkkeys(scan, page, offnum, dir, &continuescan)) + { + /* tuple passes all scan key conditions, so remember it */ + /* _bt_checkkeys put the heap ptr into scan->xs_ctup.t_self */ + itemIndex--; + so->currPos.items[itemIndex].heapTid = scan->xs_ctup.t_self; + so->currPos.items[itemIndex].indexOffset = offnum; + } + if (!continuescan) + { + /* there can't be any more matches, so stop */ + so->currPos.moreLeft = false; + break; + } + + offnum = OffsetNumberPrev(offnum); + } + + Assert(itemIndex >= 0); + so->currPos.firstItem = itemIndex; + so->currPos.lastItem = MaxIndexTuplesPerPage - 1; + so->currPos.itemIndex = MaxIndexTuplesPerPage - 1; } - return res; + return (so->currPos.firstItem <= so->currPos.lastItem); } /* - * _bt_step() -- Step one item in the requested direction in a scan on - * the tree. + * _bt_steppage() -- Step to next page containing valid data for scan + * + * On entry, so->currPos.buf must be pinned and read-locked. We'll drop + * the lock and pin before moving to next page. * - * *bufP is the current buffer (read-locked and pinned). If we change - * pages, it's updated appropriately. + * On success exit, we hold pin and read-lock on the next interesting page, + * and so->currPos is updated to contain data from that page. * - * If successful, update scan's currentItemData and return true. - * If no adjacent record exists in the requested direction, - * release buffer pin/locks and return false. 
+ * If there are no more matching records in the given direction, we drop all + * locks and pins, set so->currPos.buf to InvalidBuffer, and return FALSE. */ -bool -_bt_step(IndexScanDesc scan, Buffer *bufP, ScanDirection dir) +static bool +_bt_steppage(IndexScanDesc scan, ScanDirection dir) { - ItemPointer current = &(scan->currentItemData); BTScanOpaque so = (BTScanOpaque) scan->opaque; Relation rel; Page page; BTPageOpaque opaque; - OffsetNumber offnum, - maxoff; - BlockNumber blkno; - /* - * Don't use ItemPointerGetOffsetNumber or you risk to get assertion due - * to ability of ip_posid to be equal 0. - */ - offnum = current->ip_posid; + /* we must have the buffer pinned and locked */ + Assert(BufferIsValid(so->currPos.buf)); - page = BufferGetPage(*bufP); - maxoff = PageGetMaxOffsetNumber(page); + /* Before leaving current page, deal with any killed items */ + if (so->numKilled > 0) + _bt_killitems(scan, true); + + rel = scan->indexRelation; if (ScanDirectionIsForward(dir)) { - if (offnum < maxoff) - offnum = OffsetNumberNext(offnum); - else + /* Walk right to the next page with data */ + /* We must rely on the previously saved nextPage link! */ + BlockNumber blkno = so->currPos.nextPage; + + /* Remember we left a page with data */ + so->currPos.moreLeft = true; + + for (;;) { - /* Walk right to the next page with data */ - rel = scan->indexRelation; + /* if we're at end of scan, release the buffer and return */ + if (blkno == P_NONE || !so->currPos.moreRight) + { + _bt_relbuf(rel, so->currPos.buf); + so->currPos.buf = InvalidBuffer; + return false; + } + /* step right one page */ + so->currPos.buf = _bt_relandgetbuf(rel, so->currPos.buf, + blkno, BT_READ); + /* check for deleted page */ + page = BufferGetPage(so->currPos.buf); opaque = (BTPageOpaque) PageGetSpecialPointer(page); - for (;;) + if (!P_IGNORE(opaque)) { - /* if we're at end of scan, release the buffer and return */ - if (P_RIGHTMOST(opaque)) - { - _bt_relbuf(rel, *bufP); - ItemPointerSetInvalid(current); - *bufP = so->btso_curbuf = InvalidBuffer; - return false; - } - /* step right one page */ - blkno = opaque->btpo_next; - *bufP = _bt_relandgetbuf(rel, *bufP, blkno, BT_READ); - page = BufferGetPage(*bufP); - opaque = (BTPageOpaque) PageGetSpecialPointer(page); - if (!P_IGNORE(opaque)) - { - /* done if it's not empty */ - maxoff = PageGetMaxOffsetNumber(page); - offnum = P_FIRSTDATAKEY(opaque); - if (offnum <= maxoff) - break; - } + /* see if there are any matches on this page */ + /* note that this will clear moreRight if we can stop */ + if (_bt_readpage(scan, dir, P_FIRSTDATAKEY(opaque))) + break; } + /* nope, keep going */ + blkno = opaque->btpo_next; } } else { - /* backwards scan */ - opaque = (BTPageOpaque) PageGetSpecialPointer(page); - if (offnum > P_FIRSTDATAKEY(opaque)) - offnum = OffsetNumberPrev(offnum); - else + /* Remember we left a page with data */ + so->currPos.moreRight = true; + + /* + * Walk left to the next page with data. This is much more + * complex than the walk-right case because of the possibility + * that the page to our left splits while we are in flight to it, + * plus the possibility that the page we were on gets deleted + * after we leave it. See nbtree/README for details. + */ + for (;;) { - /* - * Walk left to the next page with data. This is much more - * complex than the walk-right case because of the possibility - * that the page to our left splits while we are in flight to it, - * plus the possibility that the page we were on gets deleted - * after we leave it. 
See nbtree/README for details. - */ - rel = scan->indexRelation; - for (;;) + /* Done if we know there are no matching keys to the left */ + if (!so->currPos.moreLeft) { - *bufP = _bt_walk_left(rel, *bufP); + _bt_relbuf(rel, so->currPos.buf); + so->currPos.buf = InvalidBuffer; + return false; + } - /* if we're at end of scan, return failure */ - if (*bufP == InvalidBuffer) - { - ItemPointerSetInvalid(current); - so->btso_curbuf = InvalidBuffer; - return false; - } - page = BufferGetPage(*bufP); - opaque = (BTPageOpaque) PageGetSpecialPointer(page); + /* Step to next physical page */ + so->currPos.buf = _bt_walk_left(rel, so->currPos.buf); - /* - * Okay, we managed to move left to a non-deleted page. Done - * if it's not half-dead and not empty. Else loop back and do - * it all again. - */ - if (!P_IGNORE(opaque)) - { - maxoff = PageGetMaxOffsetNumber(page); - offnum = maxoff; - if (maxoff >= P_FIRSTDATAKEY(opaque)) - break; - } + /* if we're physically at end of index, return failure */ + if (so->currPos.buf == InvalidBuffer) + return false; + + /* + * Okay, we managed to move left to a non-deleted page. + * Done if it's not half-dead and contains matching tuples. + * Else loop back and do it all again. + */ + page = BufferGetPage(so->currPos.buf); + opaque = (BTPageOpaque) PageGetSpecialPointer(page); + if (!P_IGNORE(opaque)) + { + /* see if there are any matches on this page */ + /* note that this will clear moreLeft if we can stop */ + if (_bt_readpage(scan, dir, PageGetMaxOffsetNumber(page))) + break; } } } - /* Update scan state */ - so->btso_curbuf = *bufP; - blkno = BufferGetBlockNumber(*bufP); - ItemPointerSet(current, blkno, offnum); - return true; } @@ -1250,31 +1338,23 @@ _bt_get_endpoint(Relation rel, uint32 level, bool rightmost) } /* - * _bt_endpoint() -- Find the first or last key in the index, and scan + * _bt_endpoint() -- Find the first or last page in the index, and scan * from there to the first key satisfying all the quals. * * This is used by _bt_first() to set up a scan when we've determined * that the scan must start at the beginning or end of the index (for - * a forward or backward scan respectively). + * a forward or backward scan respectively). Exit conditions are the + * same as for _bt_first(). */ static bool _bt_endpoint(IndexScanDesc scan, ScanDirection dir) { - Relation rel; + Relation rel = scan->indexRelation; + BTScanOpaque so = (BTScanOpaque) scan->opaque; Buffer buf; Page page; BTPageOpaque opaque; - ItemPointer current; - OffsetNumber maxoff; OffsetNumber start; - BlockNumber blkno; - BTScanOpaque so; - bool res; - bool continuescan; - - rel = scan->indexRelation; - current = &(scan->currentItemData); - so = (BTScanOpaque) scan->opaque; /* * Scan down to the leftmost or rightmost leaf page. This is a simplified @@ -1286,18 +1366,14 @@ _bt_endpoint(IndexScanDesc scan, ScanDirection dir) if (!BufferIsValid(buf)) { /* empty index... 
*/ - ItemPointerSetInvalid(current); - so->btso_curbuf = InvalidBuffer; + so->currPos.buf = InvalidBuffer; return false; } - blkno = BufferGetBlockNumber(buf); page = BufferGetPage(buf); opaque = (BTPageOpaque) PageGetSpecialPointer(page); Assert(P_ISLEAF(opaque)); - maxoff = PageGetMaxOffsetNumber(page); - if (ScanDirectionIsForward(dir)) { /* There could be dead pages to the left, so not this: */ @@ -1310,8 +1386,6 @@ _bt_endpoint(IndexScanDesc scan, ScanDirection dir) Assert(P_RIGHTMOST(opaque)); start = PageGetMaxOffsetNumber(page); - if (start < P_FIRSTDATAKEY(opaque)) /* watch out for empty page */ - start = P_FIRSTDATAKEY(opaque); } else { @@ -1319,43 +1393,40 @@ _bt_endpoint(IndexScanDesc scan, ScanDirection dir) start = 0; /* keep compiler quiet */ } - ItemPointerSet(current, blkno, start); /* remember which buffer we have pinned */ - so->btso_curbuf = buf; + so->currPos.buf = buf; - /* - * Left/rightmost page could be empty due to deletions, if so step till we - * find a nonempty page. - */ - if (start > maxoff) + /* initialize moreLeft/moreRight appropriately for scan direction */ + if (ScanDirectionIsForward(dir)) { - if (!_bt_step(scan, &buf, dir)) - return false; - start = ItemPointerGetOffsetNumber(current); - page = BufferGetPage(buf); + so->currPos.moreLeft = false; + so->currPos.moreRight = true; + } + else + { + so->currPos.moreLeft = true; + so->currPos.moreRight = false; } + so->numKilled = 0; /* just paranoia */ /* - * Okay, we are on the first or last tuple. Does it pass all the quals? + * Now load data from the first page of the scan. */ - if (_bt_checkkeys(scan, page, start, dir, &continuescan)) - { - /* yes, return it */ - res = true; - } - else if (continuescan) - { - /* no, but there might be another one that does */ - res = _bt_next(scan, dir); - } - else + if (!_bt_readpage(scan, dir, start)) { - /* no tuples in the index match this scan key */ - ItemPointerSetInvalid(current); - so->btso_curbuf = InvalidBuffer; - _bt_relbuf(rel, buf); - res = false; + /* + * There's no actually-matching data on this page. Try to advance to + * the next page. Return false if there's no matching data at all. + */ + if (!_bt_steppage(scan, dir)) + return false; } - return res; + /* Drop the lock, but not pin, on the current page */ + LockBuffer(so->currPos.buf, BUFFER_LOCK_UNLOCK); + + /* OK, itemIndex says what to return */ + scan->xs_ctup.t_self = so->currPos.items[so->currPos.itemIndex].heapTid; + + return true; } diff --git a/src/backend/access/nbtree/nbtutils.c b/src/backend/access/nbtree/nbtutils.c index b8b9ff780e..74d0605258 100644 --- a/src/backend/access/nbtree/nbtutils.c +++ b/src/backend/access/nbtree/nbtutils.c @@ -8,7 +8,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/access/nbtree/nbtutils.c,v 1.72 2006/03/05 15:58:21 momjian Exp $ + * $PostgreSQL: pgsql/src/backend/access/nbtree/nbtutils.c,v 1.73 2006/05/07 01:21:30 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -196,11 +196,6 @@ _bt_freestack(BTStack stack) * Again though, only keys with RHS datatype equal to the index datatype * can be checked for contradictions. * - * Furthermore, we detect the case where the index is unique and we have - * equality quals for all columns. In this case there can be at most one - * (visible) matching tuple. index_getnext uses this to avoid uselessly - * continuing the scan after finding one match. 
- * * Row comparison keys are treated the same as comparisons to nondefault * datatypes: we just transfer them into the preprocessed array without any * editorialization. We can treat them the same as an ordinary inequality @@ -216,7 +211,6 @@ _bt_freestack(BTStack stack) void _bt_preprocess_keys(IndexScanDesc scan) { - Relation relation = scan->indexRelation; BTScanOpaque so = (BTScanOpaque) scan->opaque; int numberOfKeys = scan->numberOfKeys; int new_numberOfKeys; @@ -234,7 +228,6 @@ _bt_preprocess_keys(IndexScanDesc scan) /* initialize result variables */ so->qual_ok = true; so->numberOfKeys = 0; - scan->keys_are_unique = false; if (numberOfKeys < 1) return; /* done if qual-less scan */ @@ -256,13 +249,6 @@ _bt_preprocess_keys(IndexScanDesc scan) */ if (cur->sk_flags & SK_ISNULL) so->qual_ok = false; - else if (relation->rd_index->indisunique && - relation->rd_rel->relnatts == 1) - { - /* it's a unique index, do we have an equality qual? */ - if (cur->sk_strategy == BTEqualStrategyNumber) - scan->keys_are_unique = true; - } memcpy(outkeys, inkeys, sizeof(ScanKeyData)); so->numberOfKeys = 1; /* We can mark the qual as required if it's for first index col */ @@ -464,14 +450,6 @@ _bt_preprocess_keys(IndexScanDesc scan) } so->numberOfKeys = new_numberOfKeys; - - /* - * If unique index and we have equality keys for all columns, set - * keys_are_unique flag for higher levels. - */ - if (relation->rd_index->indisunique && - relation->rd_rel->relnatts == numberOfEqualCols) - scan->keys_are_unique = true; } /* @@ -826,3 +804,89 @@ _bt_check_rowcompare(ScanKey skey, IndexTuple tuple, TupleDesc tupdesc, return result; } + +/* + * _bt_killitems - set LP_DELETE bit for items an indexscan caller has + * told us were killed + * + * scan->so contains information about the current page and killed tuples + * thereon (generally, this should only be called if so->numKilled > 0). + * + * The caller must have pin on so->currPos.buf, but may or may not have + * read-lock, as indicated by haveLock. Note that we assume read-lock + * is sufficient for setting LP_DELETE hint bits. + * + * We match items by heap TID before assuming they are the right ones to + * delete. We cope with cases where items have moved right due to insertions. + * If an item has moved off the current page due to a split, we'll fail to + * find it and do nothing (this is not an error case --- we assume the item + * will eventually get marked in a future indexscan). Likewise, if the item + * has moved left due to deletions or disappeared itself, we'll not find it, + * but these cases are not worth optimizing. (Since deletions are only done + * by VACUUM, any deletion makes it highly likely that the dead item has been + * removed itself, and therefore searching left is not worthwhile.) 
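+ *
+ * (Concretely, the loop below starts at each killed item's remembered
+ * indexOffset and searches to the right comparing heap TIDs, since
+ * intervening insertions can only have pushed the item to a higher
+ * offset on this page.)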
+ */ +void +_bt_killitems(IndexScanDesc scan, bool haveLock) +{ + BTScanOpaque so = (BTScanOpaque) scan->opaque; + Page page; + BTPageOpaque opaque; + OffsetNumber minoff; + OffsetNumber maxoff; + int i; + bool killedsomething = false; + + Assert(BufferIsValid(so->currPos.buf)); + + if (!haveLock) + LockBuffer(so->currPos.buf, BT_READ); + + page = BufferGetPage(so->currPos.buf); + opaque = (BTPageOpaque) PageGetSpecialPointer(page); + minoff = P_FIRSTDATAKEY(opaque); + maxoff = PageGetMaxOffsetNumber(page); + + for (i = 0; i < so->numKilled; i++) + { + int itemIndex = so->killedItems[i]; + BTScanPosItem *kitem = &so->currPos.items[itemIndex]; + OffsetNumber offnum = kitem->indexOffset; + + Assert(itemIndex >= so->currPos.firstItem && + itemIndex <= so->currPos.lastItem); + if (offnum < minoff) + continue; /* pure paranoia */ + while (offnum <= maxoff) + { + ItemId iid = PageGetItemId(page, offnum); + IndexTuple ituple = (IndexTuple) PageGetItem(page, iid); + + if (ItemPointerEquals(&ituple->t_tid, &kitem->heapTid)) + { + /* found the item */ + iid->lp_flags |= LP_DELETE; + killedsomething = true; + break; /* out of inner search loop */ + } + offnum = OffsetNumberNext(offnum); + } + } + + /* + * Since this can be redone later if needed, it's treated the same + * as a commit-hint-bit status update for heap tuples: we mark the + * buffer dirty but don't make a WAL log entry. + */ + if (killedsomething) + SetBufferCommitInfoNeedsSave(so->currPos.buf); + + if (!haveLock) + LockBuffer(so->currPos.buf, BUFFER_LOCK_UNLOCK); + + /* + * Always reset the scan state, so we don't look for same items + * on other pages. + */ + so->numKilled = 0; +} diff --git a/src/include/access/itup.h b/src/include/access/itup.h index 26e489af6b..d93451a28f 100644 --- a/src/include/access/itup.h +++ b/src/include/access/itup.h @@ -7,7 +7,7 @@ * Portions Copyright (c) 1996-2006, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * - * $PostgreSQL: pgsql/src/include/access/itup.h,v 1.45 2006/03/05 15:58:53 momjian Exp $ + * $PostgreSQL: pgsql/src/include/access/itup.h,v 1.46 2006/05/07 01:21:30 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -126,6 +126,17 @@ typedef IndexAttributeBitMapData *IndexAttributeBitMap; ) \ ) +/* + * MaxIndexTuplesPerPage is an upper bound on the number of tuples that can + * fit on one index page. An index tuple must have either data or a null + * bitmap, so we can safely assume it's at least 1 byte bigger than a bare + * IndexTupleData struct. We arrive at the divisor because each tuple + * must be maxaligned, and it must have an associated item pointer. 
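+ *
+ * For instance, with the default 8K block size and 8-byte maximum
+ * alignment, this works out to (8192 - 20) / (16 + 4) = 408 items.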
+ */ +#define MaxIndexTuplesPerPage \ + ((int) ((BLCKSZ - offsetof(PageHeaderData, pd_linp)) / \ + (MAXALIGN(sizeof(IndexTupleData) + 1) + sizeof(ItemIdData)))) + /* routines in indextuple.c */ extern IndexTuple index_form_tuple(TupleDesc tupleDescriptor, diff --git a/src/include/access/nbtree.h b/src/include/access/nbtree.h index 4b254f1fd1..1831c66216 100644 --- a/src/include/access/nbtree.h +++ b/src/include/access/nbtree.h @@ -7,7 +7,7 @@ * Portions Copyright (c) 1996-2006, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * - * $PostgreSQL: pgsql/src/include/access/nbtree.h,v 1.96 2006/04/13 03:53:05 tgl Exp $ + * $PostgreSQL: pgsql/src/include/access/nbtree.h,v 1.97 2006/05/07 01:21:30 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -336,30 +336,82 @@ typedef struct BTStackData typedef BTStackData *BTStack; /* - * BTScanOpaqueData is used to remember which buffers we're currently - * examining in an indexscan. Between calls to btgettuple or btgetmulti, - * we keep these buffers pinned (but not locked, see nbtree.c) to avoid - * doing a ReadBuffer() for every tuple in the index. + * BTScanOpaqueData is the btree-private state needed for an indexscan. + * This consists of preprocessed scan keys (see _bt_preprocess_keys() for + * details of the preprocessing), information about the current location + * of the scan, and information about the marked location, if any. (We use + * BTScanPosData to represent the data needed for each of current and marked + * locations.) In addition we can remember some known-killed index entries + * that must be marked before we can move off the current page. * - * We also store preprocessed versions of the scan keys in this structure. - * See _bt_preprocess_keys() for details of the preprocessing. + * Index scans work a page at a time: we pin and read-lock the page, identify + * all the matching items on the page and save them in BTScanPosData, then + * release the read-lock while returning the items to the caller for + * processing. This approach minimizes lock/unlock traffic. Note that we + * keep the pin on the index page until the caller is done with all the items + * (this is needed for VACUUM synchronization, see nbtree/README). When we + * are ready to step to the next page, if the caller has told us any of the + * items were killed, we re-lock the page to mark them killed, then unlock. + * Finally we drop the pin and step to the next page in the appropriate + * direction. * - * curHeapIptr & mrkHeapIptr are heap iptr-s from current/marked - * index tuples: we don't adjust scans on insertions - instead we - * use these pointers to restore index scan positions... - * - vadim 07/29/98 + * NOTE: in this implementation, btree does not use or set the + * currentItemData and currentMarkData fields of IndexScanDesc at all. */ +typedef struct BTScanPosItem /* what we remember about each match */ +{ + ItemPointerData heapTid; /* TID of referenced heap item */ + OffsetNumber indexOffset; /* index item's location within page */ +} BTScanPosItem; + +typedef struct BTScanPosData +{ + Buffer buf; /* if valid, the buffer is pinned */ + + BlockNumber nextPage; /* page's right link when we scanned it */ + + /* + * moreLeft and moreRight track whether we think there may be matching + * index entries to the left and right of the current page, respectively. + * We can clear the appropriate one of these flags when _bt_checkkeys() + * returns continuescan = false. 
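+	 *
+	 * (_bt_first initializes them for the scan direction: a forward scan
+	 * starts with moreLeft = false and moreRight = true, a backward scan
+	 * with the reverse.)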
+ */ + bool moreLeft; + bool moreRight; + + /* + * The items array is always ordered in index order (ie, increasing + * indexoffset). When scanning backwards it is convenient to fill the + * array back-to-front, so we start at the last slot and fill downwards. + * Hence we need both a first-valid-entry and a last-valid-entry counter. + * itemIndex is a cursor showing which entry was last returned to caller. + */ + int firstItem; /* first valid index in items[] */ + int lastItem; /* last valid index in items[] */ + int itemIndex; /* current index in items[] */ + + BTScanPosItem items[MaxIndexTuplesPerPage]; /* MUST BE LAST */ +} BTScanPosData; + +typedef BTScanPosData *BTScanPos; + +#define BTScanPosIsValid(scanpos) BufferIsValid((scanpos).buf) + typedef struct BTScanOpaqueData { - Buffer btso_curbuf; - Buffer btso_mrkbuf; - ItemPointerData curHeapIptr; - ItemPointerData mrkHeapIptr; /* these fields are set by _bt_preprocess_keys(): */ bool qual_ok; /* false if qual can never be satisfied */ int numberOfKeys; /* number of preprocessed scan keys */ ScanKey keyData; /* array of preprocessed scan keys */ + + /* info about killed items if any (killedItems is NULL if never used) */ + int *killedItems; /* currPos.items indexes of killed items */ + int numKilled; /* number of currently stored items */ + + /* keep these last in struct for efficiency */ + BTScanPosData currPos; /* current position data */ + BTScanPosData markPos; /* marked position, if any */ } BTScanOpaqueData; typedef BTScanOpaqueData *BTScanOpaque; @@ -424,9 +476,8 @@ extern OffsetNumber _bt_binsrch(Relation rel, Buffer buf, int keysz, ScanKey scankey, bool nextkey); extern int32 _bt_compare(Relation rel, int keysz, ScanKey scankey, Page page, OffsetNumber offnum); -extern bool _bt_next(IndexScanDesc scan, ScanDirection dir); extern bool _bt_first(IndexScanDesc scan, ScanDirection dir); -extern bool _bt_step(IndexScanDesc scan, Buffer *bufP, ScanDirection dir); +extern bool _bt_next(IndexScanDesc scan, ScanDirection dir); extern Buffer _bt_get_endpoint(Relation rel, uint32 level, bool rightmost); /* @@ -440,6 +491,7 @@ extern void _bt_preprocess_keys(IndexScanDesc scan); extern bool _bt_checkkeys(IndexScanDesc scan, Page page, OffsetNumber offnum, ScanDirection dir, bool *continuescan); +extern void _bt_killitems(IndexScanDesc scan, bool haveLock); /* * prototypes for functions in nbtsort.c diff --git a/src/include/access/relscan.h b/src/include/access/relscan.h index 09e977c065..e7c409ed0e 100644 --- a/src/include/access/relscan.h +++ b/src/include/access/relscan.h @@ -7,7 +7,7 @@ * Portions Copyright (c) 1996-2006, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * - * $PostgreSQL: pgsql/src/include/access/relscan.h,v 1.44 2006/03/05 15:58:53 momjian Exp $ + * $PostgreSQL: pgsql/src/include/access/relscan.h,v 1.45 2006/05/07 01:21:30 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -67,12 +67,9 @@ typedef struct IndexScanDescData bool kill_prior_tuple; /* last-returned tuple is dead */ bool ignore_killed_tuples; /* do not return killed entries */ - /* set by index AM if scan keys satisfy index's uniqueness constraint */ - bool keys_are_unique; - - /* scan current state */ - bool got_tuple; /* true after successful index_getnext */ + /* index access method's private state */ void *opaque; /* access-method-specific info */ + /* these fields are used by some but not all AMs: */ ItemPointerData currentItemData; /* current index 
pointer */ ItemPointerData currentMarkData; /* marked position, if any */ @@ -85,15 +82,6 @@ typedef struct IndexScanDescData Buffer xs_cbuf; /* current heap buffer in scan, if any */ /* NB: if xs_cbuf is not InvalidBuffer, we hold a pin on that buffer */ - /* - * If keys_are_unique and got_tuple are both true, we stop calling the - * index AM; it is then necessary for index_getnext to keep track of the - * logical scan position for itself. It does that using unique_tuple_pos: - * -1 = before row, 0 = on row, +1 = after row. - */ - int unique_tuple_pos; /* logical position */ - int unique_tuple_mark; /* logical marked position */ - PgStat_Info xs_pgstat_info; /* statistics collector hook */ } IndexScanDescData; -- 2.40.0
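
A condensed sketch of the page-at-a-time flow the patch implements
(names as in the patch; error paths and the backward case omitted):

	/* _bt_first: find the start page, cache all matches, drop the lock */
	so->currPos.buf = buf;			/* pinned and read-locked */
	if (!_bt_readpage(scan, dir, offnum) &&
		!_bt_steppage(scan, dir))
		return false;			/* no matches anywhere */
	LockBuffer(so->currPos.buf, BUFFER_LOCK_UNLOCK);	/* keep only the pin */

	/* _bt_next: consume cached items; re-lock only to change pages */
	if (++so->currPos.itemIndex > so->currPos.lastItem)
	{
		LockBuffer(so->currPos.buf, BT_READ);
		if (!_bt_steppage(scan, dir))
			return false;
		LockBuffer(so->currPos.buf, BUFFER_LOCK_UNLOCK);
	}
	scan->xs_ctup.t_self = so->currPos.items[so->currPos.itemIndex].heapTid;
	return true;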