/*-------------------------------------------------------------------------
*
- * btsearch.c
- * search code for postgres btrees.
+ * nbtsearch.c
+ * Search code for postgres btrees.
*
- * Copyright (c) 1994, Regents of the University of California
*
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
- * $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtsearch.c,v 1.54 1999/09/27 18:20:21 momjian Exp $
+ * src/backend/access/nbtree/nbtsearch.c
*
*-------------------------------------------------------------------------
*/
#include "postgres.h"
-#include "access/genam.h"
#include "access/nbtree.h"
+#include "access/relscan.h"
+#include "miscadmin.h"
+#include "pgstat.h"
+#include "storage/predicate.h"
+#include "utils/lsyscache.h"
+#include "utils/rel.h"
+#include "utils/tqual.h"
+static bool _bt_readpage(IndexScanDesc scan, ScanDirection dir,
+ OffsetNumber offnum);
+static void _bt_saveitem(BTScanOpaque so, int itemIndex,
+ OffsetNumber offnum, IndexTuple itup);
+static bool _bt_steppage(IndexScanDesc scan, ScanDirection dir);
+static Buffer _bt_walk_left(Relation rel, Buffer buf, Snapshot snapshot);
+static bool _bt_endpoint(IndexScanDesc scan, ScanDirection dir);
+static void _bt_drop_lock_and_maybe_pin(IndexScanDesc scan, BTScanPos sp);
-static BTStack _bt_searchr(Relation rel, int keysz, ScanKey scankey,
- Buffer *bufP, BTStack stack_in);
-static int _bt_compare(Relation rel, TupleDesc itupdesc, Page page,
- int keysz, ScanKey scankey, OffsetNumber offnum);
-static bool
- _bt_twostep(IndexScanDesc scan, Buffer *bufP, ScanDirection dir);
-static RetrieveIndexResult
- _bt_endpoint(IndexScanDesc scan, ScanDirection dir);
/*
- * _bt_search() -- Search for a scan key in the index.
+ * _bt_drop_lock_and_maybe_pin()
+ *
+ * Unlock the buffer; and if it is safe to release the pin, do that, too. It
+ * is safe if the scan is using an MVCC snapshot and the index is WAL-logged.
+ * This will prevent vacuum from stalling in a blocked state trying to read a
+ * page when a cursor is sitting on it -- at least in many important cases.
*
- * This routine is actually just a helper that sets things up and
- * calls a recursive-descent search routine on the tree.
+ * Set the buffer to invalid if the pin is released, since the buffer may be
+ * re-used. If we need to go back to this block (for example, to apply
+ * LP_DEAD hints) we must get a fresh reference to the buffer. Hopefully it
+ * will remain in shared memory for as long as it takes to scan the index
+ * buffer page.
*/
-BTStack
-_bt_search(Relation rel, int keysz, ScanKey scankey, Buffer *bufP)
+static void
+_bt_drop_lock_and_maybe_pin(IndexScanDesc scan, BTScanPos sp)
{
- *bufP = _bt_getroot(rel, BT_READ);
- return _bt_searchr(rel, keysz, scankey, bufP, (BTStack) NULL);
+ LockBuffer(sp->buf, BUFFER_LOCK_UNLOCK);
+
+ if (IsMVCCSnapshot(scan->xs_snapshot) &&
+ RelationNeedsWAL(scan->indexRelation) &&
+ !scan->xs_want_itup)
+ {
+ ReleaseBuffer(sp->buf);
+ sp->buf = InvalidBuffer;
+ }
}
+
/*
- * _bt_searchr() -- Search the tree recursively for a particular scankey.
+ * _bt_search() -- Search the tree for a particular scankey,
+ * or more precisely for the first leaf page it could be on.
+ *
+ * The passed scankey must be an insertion-type scankey (see nbtree/README),
+ * but it can omit the rightmost column(s) of the index.
+ *
+ * When nextkey is false (the usual case), we are looking for the first
+ * item >= scankey. When nextkey is true, we are looking for the first
+ * item strictly greater than scankey.
+ *
+ * Return value is a stack of parent-page pointers. *bufP is set to the
+ * address of the leaf-page buffer, which is read-locked and pinned.
+ * No locks are held on the parent pages, however!
+ *
+ * If the snapshot parameter is not NULL, "old snapshot" checking will take
+ * place during the descent through the tree. This is not needed when
+ * positioning for an insert or delete, so NULL is used for those cases.
+ *
+ * NOTE that the returned buffer is read-locked regardless of the access
+ * parameter. However, access = BT_WRITE will allow an empty root page
+ * to be created and returned. When access = BT_READ, an empty index
+ * will result in *bufP being set to InvalidBuffer. Also, in BT_WRITE mode,
+ * any incomplete splits encountered during the search will be finished.
*/
-static BTStack
-_bt_searchr(Relation rel,
- int keysz,
- ScanKey scankey,
- Buffer *bufP,
- BTStack stack_in)
+BTStack
+_bt_search(Relation rel, int keysz, ScanKey scankey, bool nextkey,
+ Buffer *bufP, int access, Snapshot snapshot)
{
- BTStack stack;
- OffsetNumber offnum;
- Page page;
- BTPageOpaque opaque;
- BlockNumber par_blkno;
- BlockNumber blkno;
- ItemId itemid;
- BTItem btitem;
- BTItem item_save;
- int item_nbytes;
- IndexTuple itup;
+ BTStack stack_in = NULL;
- /* if this is a leaf page, we're done */
- page = BufferGetPage(*bufP);
- opaque = (BTPageOpaque) PageGetSpecialPointer(page);
- if (opaque->btpo_flags & BTP_LEAF)
- return stack_in;
+ /* Get the root page to start with */
+ *bufP = _bt_getroot(rel, access);
- /*
- * Find the appropriate item on the internal page, and get the child
- * page that it points to.
- */
+ /* If index is empty and access = BT_READ, no root page is created. */
+ if (!BufferIsValid(*bufP))
+ return (BTStack) NULL;
- par_blkno = BufferGetBlockNumber(*bufP);
- offnum = _bt_binsrch(rel, *bufP, keysz, scankey, BT_DESCENT);
- itemid = PageGetItemId(page, offnum);
- btitem = (BTItem) PageGetItem(page, itemid);
- itup = &(btitem->bti_itup);
- blkno = ItemPointerGetBlockNumber(&(itup->t_tid));
+ /* Loop iterates once per level descended in the tree */
+ for (;;)
+ {
+ Page page;
+ BTPageOpaque opaque;
+ OffsetNumber offnum;
+ ItemId itemid;
+ IndexTuple itup;
+ BlockNumber blkno;
+ BlockNumber par_blkno;
+ BTStack new_stack;
- /*
- * We need to save the bit image of the index entry we chose in the
- * parent page on a stack. In case we split the tree, we'll use this
- * bit image to figure out what our real parent page is, in case the
- * parent splits while we're working lower in the tree. See the paper
- * by Lehman and Yao for how this is detected and handled. (We use
- * unique OIDs to disambiguate duplicate keys in the index -- Lehman
- * and Yao disallow duplicate keys).
- */
+ /*
+ * Race -- the page we just grabbed may have split since we read its
+ * pointer in the parent (or metapage). If it has, we may need to
+ * move right to its new sibling. Do that.
+ *
+ * In write-mode, allow _bt_moveright to finish any incomplete splits
+ * along the way. Strictly speaking, we'd only need to finish an
+ * incomplete split on the leaf page we're about to insert to, not on
+ * any of the upper levels (they are taken care of in _bt_getstackbuf,
+ * if the leaf page is split and we insert to the parent page). But
+ * this is a good opportunity to finish splits of internal pages too.
+ */
+ *bufP = _bt_moveright(rel, *bufP, keysz, scankey, nextkey,
+ (access == BT_WRITE), stack_in,
+ BT_READ, snapshot);
+
+ /* if this is a leaf page, we're done */
+ page = BufferGetPage(*bufP);
+ opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+ if (P_ISLEAF(opaque))
+ break;
- item_nbytes = ItemIdGetLength(itemid);
- item_save = (BTItem) palloc(item_nbytes);
- memmove((char *) item_save, (char *) btitem, item_nbytes);
- stack = (BTStack) palloc(sizeof(BTStackData));
- stack->bts_blkno = par_blkno;
- stack->bts_offset = offnum;
- stack->bts_btitem = item_save;
- stack->bts_parent = stack_in;
+ /*
+ * Find the appropriate item on the internal page, and get the child
+ * page that it points to.
+ */
+ offnum = _bt_binsrch(rel, *bufP, keysz, scankey, nextkey);
+ itemid = PageGetItemId(page, offnum);
+ itup = (IndexTuple) PageGetItem(page, itemid);
+ blkno = ItemPointerGetBlockNumber(&(itup->t_tid));
+ par_blkno = BufferGetBlockNumber(*bufP);
- /* drop the read lock on the parent page and acquire one on the child */
- _bt_relbuf(rel, *bufP, BT_READ);
- *bufP = _bt_getbuf(rel, blkno, BT_READ);
+ /*
+ * We need to save the location of the index entry we chose in the
+ * parent page on a stack. In case we split the tree, we'll use the
+ * stack to work back up to the parent page. We also save the actual
+ * downlink (TID) to uniquely identify the index entry, in case it
+ * moves right while we're working lower in the tree. See the paper
+ * by Lehman and Yao for how this is detected and handled. (We use the
+ * child link to disambiguate duplicate keys in the index -- Lehman
+ * and Yao disallow duplicate keys.)
+ */
+ new_stack = (BTStack) palloc(sizeof(BTStackData));
+ new_stack->bts_blkno = par_blkno;
+ new_stack->bts_offset = offnum;
+ memcpy(&new_stack->bts_btentry, itup, sizeof(IndexTupleData));
+ new_stack->bts_parent = stack_in;
- /*
- * Race -- the page we just grabbed may have split since we read its
- * pointer in the parent. If it has, we may need to move right to its
- * new sibling. Do that.
- */
+ /* drop the read lock on the parent page, acquire one on the child */
+ *bufP = _bt_relandgetbuf(rel, *bufP, blkno, BT_READ);
- *bufP = _bt_moveright(rel, *bufP, keysz, scankey, BT_READ);
+ /* okay, all set to move down a level */
+ stack_in = new_stack;
+ }
- /* okay, all set to move down a level */
- return _bt_searchr(rel, keysz, scankey, bufP, stack);
+ return stack_in;
}
/*
* _bt_moveright() -- move right in the btree if necessary.
*
- * When we drop and reacquire a pointer to a page, it is possible that
- * the page has changed in the meanwhile. If this happens, we're
- * guaranteed that the page has "split right" -- that is, that any
- * data that appeared on the page originally is either on the page
- * or strictly to the right of it.
+ * When we follow a pointer to reach a page, it is possible that
+ * the page has changed in the meanwhile. If this happens, we're
+ * guaranteed that the page has "split right" -- that is, that any
+ * data that appeared on the page originally is either on the page
+ * or strictly to the right of it.
+ *
+ * This routine decides whether or not we need to move right in the
+ * tree by examining the high key entry on the page. If that entry
+ * is strictly less than the scankey, or <= the scankey in the nextkey=true
+ * case, then we followed the wrong link and we need to move right.
*
- * This routine decides whether or not we need to move right in the
- * tree by examining the high key entry on the page. If that entry
- * is strictly less than one we expect to be on the page, then our
- * picture of the page is incorrect and we need to move right.
+ * The passed scankey must be an insertion-type scankey (see nbtree/README),
+ * but it can omit the rightmost column(s) of the index.
*
- * On entry, we have the buffer pinned and a lock of the proper type.
- * If we move right, we release the buffer and lock and acquire the
- * same on the right sibling.
+ * When nextkey is false (the usual case), we are looking for the first
+ * item >= scankey. When nextkey is true, we are looking for the first
+ * item strictly greater than scankey.
+ *
+ * If forupdate is true, we will attempt to finish any incomplete splits
+ * that we encounter. This is required when locking a target page for an
+ * insertion, because we don't allow inserting on a page before the split
+ * is completed. 'stack' is only used if forupdate is true.
+ *
+ * On entry, we have the buffer pinned and a lock of the type specified by
+ * 'access'. If we move right, we release the buffer and lock and acquire
+ * the same on the right sibling. Return value is the buffer we stop at.
+ *
+ * If the snapshot parameter is not NULL, "old snapshot" checking will take
+ * place during the descent through the tree. This is not needed when
+ * positioning for an insert or delete, so NULL is used for those cases.
*/
Buffer
_bt_moveright(Relation rel,
Buffer buf,
int keysz,
ScanKey scankey,
- int access)
+ bool nextkey,
+ bool forupdate,
+ BTStack stack,
+ int access,
+ Snapshot snapshot)
{
Page page;
BTPageOpaque opaque;
- ItemId hikey;
- BlockNumber rblkno;
- int natts = rel->rd_rel->relnatts;
-
- page = BufferGetPage(buf);
- opaque = (BTPageOpaque) PageGetSpecialPointer(page);
-
- /* if we're on a rightmost page, we don't need to move right */
- if (P_RIGHTMOST(opaque))
- return buf;
-
- /* by convention, item 0 on non-rightmost pages is the high key */
- hikey = PageGetItemId(page, P_HIKEY);
+ int32 cmpval;
/*
- * If the scan key that brought us to this page is >= the high key
- * stored on the page, then the page has split and we need to move
- * right.
+ * When nextkey = false (normal case): if the scan key that brought us to
+ * this page is > the high key stored on the page, then the page has split
+ * and we need to move right. (If the scan key is equal to the high key,
+ * we might or might not need to move right; have to scan the page first
+ * anyway.)
+ *
+ * When nextkey = true: move right if the scan key is >= page's high key.
+ *
+ * The page could even have split more than once, so scan as far as
+ * needed.
+ *
+ * We also have to move right if we followed a link that brought us to a
+ * dead page.
*/
+ cmpval = nextkey ? 0 : 1;
- if (_bt_skeycmp(rel, keysz, scankey, page, hikey,
- BTGreaterEqualStrategyNumber))
+ for (;;)
{
- /* move right as long as we need to */
- do
- {
- OffsetNumber offmax = PageGetMaxOffsetNumber(page);
+ page = BufferGetPage(buf);
+ TestForOldSnapshot(snapshot, rel, page);
+ opaque = (BTPageOpaque) PageGetSpecialPointer(page);
- /*
- * If this page consists of all duplicate keys (hikey and
- * first key on the page have the same value), then we don't
- * need to step right.
- *
- * NOTE for multi-column indices: we may do scan using keys not
- * for all attrs. But we handle duplicates using all attrs in
- * _bt_insert/_bt_spool code. And so we've to compare scankey
- * with _last_ item on this page to do not lose "good" tuples
- * if number of attrs > keysize. Example: (2,0) - last items
- * on this page, (2,1) - first item on next page (hikey), our
- * scankey is x = 2. Scankey == (2,1) because of we compare
- * first attrs only, but we shouldn't to move right of here. -
- * vadim 04/15/97
- *
- * Also, if this page is not LEAF one (and # of attrs > keysize)
- * then we can't move too. - vadim 10/22/97
- */
+ if (P_RIGHTMOST(opaque))
+ break;
- if (_bt_skeycmp(rel, keysz, scankey, page, hikey,
- BTEqualStrategyNumber))
+ /*
+ * Finish any incomplete splits we encounter along the way.
+ */
+ if (forupdate && P_INCOMPLETE_SPLIT(opaque))
+ {
+ BlockNumber blkno = BufferGetBlockNumber(buf);
+
+ /* upgrade our lock if necessary */
+ if (access == BT_READ)
{
- if (opaque->btpo_flags & BTP_CHAIN)
- {
- Assert((opaque->btpo_flags & BTP_LEAF) || offmax > P_HIKEY);
- break;
- }
- if (offmax > P_HIKEY)
- {
- if (natts == keysz) /* sanity checks */
- {
- if (_bt_skeycmp(rel, keysz, scankey, page,
- PageGetItemId(page, P_FIRSTKEY),
- BTEqualStrategyNumber))
- elog(FATAL, "btree: BTP_CHAIN flag was expected in %s (access = %s)",
- rel->rd_rel->relname.data, access ? "bt_write" : "bt_read");
- if (_bt_skeycmp(rel, keysz, scankey, page,
- PageGetItemId(page, offmax),
- BTEqualStrategyNumber))
- elog(FATAL, "btree: unexpected equal last item");
- if (_bt_skeycmp(rel, keysz, scankey, page,
- PageGetItemId(page, offmax),
- BTLessStrategyNumber))
- elog(FATAL, "btree: unexpected greater last item");
- /* move right */
- }
- else if (!(opaque->btpo_flags & BTP_LEAF))
- break;
- else if (_bt_skeycmp(rel, keysz, scankey, page,
- PageGetItemId(page, offmax),
- BTLessEqualStrategyNumber))
- break;
- }
+ LockBuffer(buf, BUFFER_LOCK_UNLOCK);
+ LockBuffer(buf, BT_WRITE);
}
- /* step right one page */
- rblkno = opaque->btpo_next;
- _bt_relbuf(rel, buf, access);
- buf = _bt_getbuf(rel, rblkno, access);
- page = BufferGetPage(buf);
- opaque = (BTPageOpaque) PageGetSpecialPointer(page);
- hikey = PageGetItemId(page, P_HIKEY);
-
- } while (!P_RIGHTMOST(opaque)
- && _bt_skeycmp(rel, keysz, scankey, page, hikey,
- BTGreaterEqualStrategyNumber));
- }
- return buf;
-}
-
-/*
- * _bt_skeycmp() -- compare a scan key to a particular item on a page using
- * a requested strategy (<, <=, =, >=, >).
- *
- * We ignore the unique OIDs stored in the btree item here. Those
- * numbers are intended for use internally only, in repositioning a
- * scan after a page split. They do not impose any meaningful ordering.
- *
- * The comparison is A <op> B, where A is the scan key and B is the
- * tuple pointed at by itemid on page.
- */
-bool
-_bt_skeycmp(Relation rel,
- Size keysz,
- ScanKey scankey,
- Page page,
- ItemId itemid,
- StrategyNumber strat)
-{
- BTItem item;
- IndexTuple indexTuple;
- TupleDesc tupDes;
- ScanKey entry;
- int i;
- Datum attrDatum;
- Datum keyDatum;
- bool compare;
- bool isNull;
- bool useEqual = false;
- bool keyNull;
-
- if (strat == BTLessEqualStrategyNumber)
- {
- useEqual = true;
- strat = BTLessStrategyNumber;
- }
- else if (strat == BTGreaterEqualStrategyNumber)
- {
- useEqual = true;
- strat = BTGreaterStrategyNumber;
- }
-
- item = (BTItem) PageGetItem(page, itemid);
- indexTuple = &(item->bti_itup);
-
- tupDes = RelationGetDescr(rel);
-
- /* see if the comparison is true for all of the key attributes */
- for (i = 1; i <= keysz; i++)
- {
-
- entry = &scankey[i - 1];
- Assert(entry->sk_attno == i);
- attrDatum = index_getattr(indexTuple,
- entry->sk_attno,
- tupDes,
- &isNull);
- keyDatum = entry->sk_argument;
-
- /* see comments about NULLs handling in btbuild */
- if (entry->sk_flags & SK_ISNULL) /* key is NULL */
- {
- Assert(entry->sk_procedure == F_NULLVALUE);
- keyNull = true;
- if (isNull)
- compare = (strat == BTEqualStrategyNumber) ? true : false;
+ if (P_INCOMPLETE_SPLIT(opaque))
+ _bt_finish_split(rel, buf, stack);
else
- compare = (strat == BTGreaterStrategyNumber) ? true : false;
- }
- else if (isNull) /* key is NOT_NULL and item is NULL */
- {
- keyNull = false;
- compare = (strat == BTLessStrategyNumber) ? true : false;
- }
- else
- {
- keyNull = false;
- compare = _bt_invokestrat(rel, i, strat, keyDatum, attrDatum);
+ _bt_relbuf(rel, buf);
+
+ /* re-acquire the lock in the right mode, and re-check */
+ buf = _bt_getbuf(rel, blkno, access);
+ continue;
}
- if (compare) /* true for one of ">, <, =" */
+ if (P_IGNORE(opaque) || _bt_compare(rel, keysz, scankey, page, P_HIKEY) >= cmpval)
{
- if (strat != BTEqualStrategyNumber)
- return true;
+ /* step right one page */
+ buf = _bt_relandgetbuf(rel, buf, opaque->btpo_next, access);
+ continue;
}
else
-/* false for one of ">, <, =" */
- {
- if (strat == BTEqualStrategyNumber)
- return false;
-
- /*
- * if original strat was "<=, >=" OR "<, >" but some
- * attribute(s) left - need to test for Equality
- */
- if (useEqual || i < keysz)
- {
- if (keyNull || isNull)
- compare = (keyNull && isNull) ? true : false;
- else
- compare = _bt_invokestrat(rel, i, BTEqualStrategyNumber,
- keyDatum, attrDatum);
- if (compare) /* key' and item' attributes are equal */
- continue; /* - try to compare next attributes */
- }
- return false;
- }
+ break;
}
- return true;
+ if (P_IGNORE(opaque))
+ elog(ERROR, "fell off the end of index \"%s\"",
+ RelationGetRelationName(rel));
+
+ return buf;
}
/*
* _bt_binsrch() -- Do a binary search for a key on a particular page.
*
- * The scankey we get has the compare function stored in the procedure
- * entry of each data struct. We invoke this regproc to do the
- * comparison for every key in the scankey. _bt_binsrch() returns
- * the OffsetNumber of the first matching key on the page, or the
- * OffsetNumber at which the matching key would appear if it were
- * on this page. (NOTE: in particular, this means it is possible to
- * return a value 1 greater than the number of keys on the page, if
- * the scankey is > all keys on the page.)
+ * The passed scankey must be an insertion-type scankey (see nbtree/README),
+ * but it can omit the rightmost column(s) of the index.
+ *
+ * When nextkey is false (the usual case), we are looking for the first
+ * item >= scankey. When nextkey is true, we are looking for the first
+ * item strictly greater than scankey.
+ *
+ * On a leaf page, _bt_binsrch() returns the OffsetNumber of the first
+ * key >= given scankey, or > scankey if nextkey is true. (NOTE: in
+ * particular, this means it is possible to return a value 1 greater than the
+ * number of keys on the page, if the scankey is > all keys on the page.)
+ *
+ * On an internal (non-leaf) page, _bt_binsrch() returns the OffsetNumber
+ * of the last key < given scankey, or last key <= given scankey if nextkey
+ * is true. (Since _bt_compare treats the first data key of such a page as
+ * minus infinity, there will be at least one key < scankey, so the result
+ * always points at one of the keys on the page.) This key indicates the
+ * right place to descend to be sure we find all leaf keys >= given scankey
+ * (or leaf keys > given scankey when nextkey is true).
*
- * By the time this procedure is called, we're sure we're looking
- * at the right page -- don't need to walk right. _bt_binsrch() has
- * no lock or refcount side effects on the buffer.
+ * This procedure is not responsible for walking right, it just examines
+ * the given page. _bt_binsrch() has no lock or refcount side effects
+ * on the buffer.
*/
OffsetNumber
_bt_binsrch(Relation rel,
Buffer buf,
int keysz,
ScanKey scankey,
- int srchtype)
+ bool nextkey)
{
- TupleDesc itupdesc;
Page page;
BTPageOpaque opaque;
OffsetNumber low,
high;
- bool haveEq;
- int natts = rel->rd_rel->relnatts;
- int result;
+ int32 result,
+ cmpval;
- itupdesc = RelationGetDescr(rel);
page = BufferGetPage(buf);
opaque = (BTPageOpaque) PageGetSpecialPointer(page);
- /* by convention, item 1 on any non-rightmost page is the high key */
- low = P_RIGHTMOST(opaque) ? P_HIKEY : P_FIRSTKEY;
-
+ low = P_FIRSTDATAKEY(opaque);
high = PageGetMaxOffsetNumber(page);
/*
- * If there are no keys on the page, return the first available slot.
- * Note this covers two cases: the page is really empty (no keys),
- * or it contains only a high key. The latter case is possible after
- * vacuuming.
+ * If there are no keys on the page, return the first available slot. Note
+ * this covers two cases: the page is really empty (no keys), or it
+ * contains only a high key. The latter case is possible after vacuuming.
+ * This can never happen on an internal page, however, since they are
+ * never empty (an internal page must have children).
*/
if (high < low)
return low;
/*
- * Binary search to find the first key on the page >= scan key.
- * Loop invariant: all slots before 'low' are < scan key, all slots
- * at or after 'high' are >= scan key. Also, haveEq is true if the
- * tuple at 'high' is == scan key.
+ * Binary search to find the first key on the page >= scan key, or first
+ * key > scankey when nextkey is true.
+ *
+ * For nextkey=false (cmpval=1), the loop invariant is: all slots before
+ * 'low' are < scan key, all slots at or after 'high' are >= scan key.
+ *
+ * For nextkey=true (cmpval=0), the loop invariant is: all slots before
+ * 'low' are <= scan key, all slots at or after 'high' are > scan key.
+ *
* We can fall out when high == low.
*/
high++; /* establish the loop invariant for high */
- haveEq = false;
+
+ cmpval = nextkey ? 0 : 1; /* select comparison value */
while (high > low)
{
- OffsetNumber mid = low + ((high - low) / 2);
+ OffsetNumber mid = low + ((high - low) / 2);
+
/* We have low <= mid < high, so mid points at a real slot */
- result = _bt_compare(rel, itupdesc, page, keysz, scankey, mid);
+ result = _bt_compare(rel, keysz, scankey, page, mid);
- if (result > 0)
+ if (result >= cmpval)
low = mid + 1;
else
- {
high = mid;
- haveEq = (result == 0);
- }
}
- /*--------------------
+ /*
* At this point we have high == low, but be careful: they could point
- * past the last slot on the page. We also know that haveEq is true
- * if and only if there is an equal key (in which case high&low point
- * at the first equal key).
+ * past the last slot on the page.
*
- * On a leaf page, we always return the first key >= scan key
- * (which could be the last slot + 1).
- *--------------------
+ * On a leaf page, we always return the first key >= scan key (resp. >
+ * scan key), which could be the last slot + 1.
*/
-
- if (opaque->btpo_flags & BTP_LEAF)
+ if (P_ISLEAF(opaque))
return low;
- /*--------------------
- * On a non-leaf page, there are special cases:
- *
- * For an insertion (srchtype != BT_DESCENT and natts == keysz)
- * always return first key >= scan key (which could be off the end).
- *
- * For a standard search (srchtype == BT_DESCENT and natts == keysz)
- * return the first equal key if one exists, else the last lesser key
- * if one exists, else the first slot on the page.
- *
- * For a partial-match search (srchtype == BT_DESCENT and natts > keysz)
- * return the last lesser key if one exists, else the first slot.
- *
- * Old comments:
- * For multi-column indices, we may scan using keys
- * not for all attrs. But we handle duplicates using all attrs
- * in _bt_insert/_bt_spool code. And so while searching on
- * internal pages having number of attrs > keysize we want to
- * point at the last item < the scankey, not at the first item
- * = the scankey (!!!), and let _bt_moveright decide later
- * whether to move right or not (see comments and example
- * there). Note also that INSERTions are not affected by this
- * code (since natts == keysz for inserts). - vadim 04/15/97
- *--------------------
+ /*
+ * On a non-leaf page, return the last key < scan key (resp. <= scan key).
+ * There must be one if _bt_compare() is playing by the rules.
*/
-
- if (haveEq)
- {
- /*
- * There is an equal key. We return either the first equal key
- * (which we just found), or the last lesser key.
- *
- * We need not check srchtype != BT_DESCENT here, since if that
- * is true then natts == keysz by assumption.
- */
- if (natts == keysz)
- return low; /* return first equal key */
- }
- else
- {
- /*
- * There is no equal key. We return either the first greater key
- * (which we just found), or the last lesser key.
- */
- if (srchtype != BT_DESCENT)
- return low; /* return first greater key */
- }
-
-
- if (low == (P_RIGHTMOST(opaque) ? P_HIKEY : P_FIRSTKEY))
- return low; /* there is no prior item */
+ Assert(low > P_FIRSTDATAKEY(opaque));
return OffsetNumberPrev(low);
}
-/*
+/*----------
* _bt_compare() -- Compare scankey to a particular tuple on the page.
*
+ * The passed scankey must be an insertion-type scankey (see nbtree/README),
+ * but it can omit the rightmost column(s) of the index.
+ *
+ * keysz: number of key conditions to be checked (might be less than the
+ * number of index columns!)
+ * page/offnum: location of btree item to be compared to.
+ *
* This routine returns:
- * -1 if scankey < tuple at offnum;
+ * <0 if scankey < tuple at offnum;
* 0 if scankey == tuple at offnum;
- * +1 if scankey > tuple at offnum.
- *
- * -- Old comments:
- * In order to avoid having to propagate changes up the tree any time
- * a new minimal key is inserted, the leftmost entry on the leftmost
- * page is less than all possible keys, by definition.
+ * >0 if scankey > tuple at offnum.
+ * NULLs in the keys are treated as sortable values. Therefore
+ * "equality" does not necessarily mean that the item should be
+ * returned to the caller as a matching key!
*
- * -- New ones:
- * New insertion code (fix against updating _in_place_ if new minimal
- * key has bigger size than old one) may delete P_HIKEY entry on the
- * root page in order to insert new minimal key - and so this definition
- * does not work properly in this case and breaks key' order on root
- * page. BTW, this propagation occures only while page' splitting,
- * but not "any time a new min key is inserted" (see _bt_insertonpg).
- * - vadim 12/05/96
+ * CRUCIAL NOTE: on a non-leaf page, the first data key is assumed to be
+ * "minus infinity": this routine will always claim it is less than the
+ * scankey. The actual key value stored (if any, which there probably isn't)
+ * does not matter. This convention allows us to implement the Lehman and
+ * Yao convention that the first down-link pointer is before the first key.
+ * See backend/access/nbtree/README for details.
+ *----------
*/
-static int
+int32
_bt_compare(Relation rel,
- TupleDesc itupdesc,
- Page page,
int keysz,
ScanKey scankey,
+ Page page,
OffsetNumber offnum)
{
- Datum datum;
- BTItem btitem;
+ TupleDesc itupdesc = RelationGetDescr(rel);
+ BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page);
IndexTuple itup;
- BTPageOpaque opaque;
- ScanKey entry;
- AttrNumber attno;
- int result;
int i;
- bool null;
/*
- * If this is a leftmost internal page, and if our comparison is with
- * the first key on the page, then the item at that position is by
- * definition less than the scan key.
- *
- * - see new comments above...
+ * Force result ">" if target item is first data item on an internal page
+ * --- see NOTE above.
*/
-
- opaque = (BTPageOpaque) PageGetSpecialPointer(page);
-
- if (!(opaque->btpo_flags & BTP_LEAF)
- && P_LEFTMOST(opaque)
- && offnum == P_HIKEY)
- {
- /*
- * we just have to believe that this will only be called with
- * offnum == P_HIKEY when P_HIKEY is the OffsetNumber of the first
- * actual data key (i.e., this is also a rightmost page). there
- * doesn't seem to be any code that implies that the leftmost page
- * is normally missing a high key as well as the rightmost page.
- * but that implies that this code path only applies to the root
- * -- which seems unlikely..
- *
- * - see new comments above...
- */
- if (!P_RIGHTMOST(opaque))
- elog(ERROR, "_bt_compare: invalid comparison to high key");
-
-#ifdef NOT_USED
-
- /*
- * We just have to belive that right answer will not break
- * anything. I've checked code and all seems to be ok. See new
- * comments above...
- *
- * -- Old comments If the item on the page is equal to the scankey,
- * that's okay to admit. We just can't claim that the first key
- * on the page is greater than anything.
- */
-
- if (_bt_skeycmp(rel, keysz, scankey, page, PageGetItemId(page, offnum),
- BTEqualStrategyNumber))
- return 0;
+ if (!P_ISLEAF(opaque) && offnum == P_FIRSTDATAKEY(opaque))
return 1;
-#endif
- }
- btitem = (BTItem) PageGetItem(page, PageGetItemId(page, offnum));
- itup = &(btitem->bti_itup);
+ itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
/*
- * The scan key is set up with the attribute number associated with
- * each term in the key. It is important that, if the index is
- * multi-key, the scan contain the first k key attributes, and that
- * they be in order. If you think about how multi-key ordering works,
- * you'll understand why this is.
+ * The scan key is set up with the attribute number associated with each
+ * term in the key. It is important that, if the index is multi-key, the
+ * scan contain the first k key attributes, and that they be in order. If
+ * you think about how multi-key ordering works, you'll understand why
+ * this is.
*
- * We don't test for violation of this condition here.
+ * We don't test for violation of this condition here, however. The
+ * initial setup for the index scan had better have gotten it right (see
+ * _bt_first).
*/
for (i = 1; i <= keysz; i++)
{
- long tmpres;
+ Datum datum;
+ bool isNull;
+ int32 result;
- entry = &scankey[i - 1];
- attno = entry->sk_attno;
- datum = index_getattr(itup, attno, itupdesc, &null);
+ datum = index_getattr(itup, scankey->sk_attno, itupdesc, &isNull);
/* see comments about NULLs handling in btbuild */
- if (entry->sk_flags & SK_ISNULL) /* key is NULL */
+ if (scankey->sk_flags & SK_ISNULL) /* key is NULL */
{
- Assert(entry->sk_procedure == F_NULLVALUE);
- if (null)
- tmpres = (long) 0; /* NULL "=" NULL */
+ if (isNull)
+ result = 0; /* NULL "=" NULL */
+ else if (scankey->sk_flags & SK_BT_NULLS_FIRST)
+ result = -1; /* NULL "<" NOT_NULL */
else
- tmpres = (long) 1; /* NULL ">" NOT_NULL */
+ result = 1; /* NULL ">" NOT_NULL */
}
- else if (null) /* key is NOT_NULL and item is NULL */
+ else if (isNull) /* key is NOT_NULL and item is NULL */
{
- tmpres = (long) -1; /* NOT_NULL "<" NULL */
+ if (scankey->sk_flags & SK_BT_NULLS_FIRST)
+ result = 1; /* NOT_NULL ">" NULL */
+ else
+ result = -1; /* NOT_NULL "<" NULL */
}
else
- tmpres = (long) FMGR_PTR2(&entry->sk_func, entry->sk_argument, datum);
- result = tmpres;
+ {
+ /*
+ * The sk_func needs to be passed the index value as left arg and
+ * the sk_argument as right arg (they might be of different
+ * types). Since it is convenient for callers to think of
+ * _bt_compare as comparing the scankey to the index item, we have
+ * to flip the sign of the comparison result. (Unless it's a DESC
+ * column, in which case we *don't* flip the sign.)
+ */
+ result = DatumGetInt32(FunctionCall2Coll(&scankey->sk_func,
+ scankey->sk_collation,
+ datum,
+ scankey->sk_argument));
+
+ if (!(scankey->sk_flags & SK_BT_DESC))
+ result = -result;
+ }
/* if the keys are unequal, return the difference */
if (result != 0)
return result;
+
+ scankey++;
}
- /* by here, the keys are equal */
+ /* if we get here, the keys are equal */
return 0;
}
/*
- * _bt_next() -- Get the next item in a scan.
+ * _bt_first() -- Find the first item in a scan.
+ *
+ * We need to be clever about the direction of scan, the search
+ * conditions, and the tree ordering. We find the first item (or,
+ * if backwards scan, the last item) in the tree that satisfies the
+ * qualifications in the scan key. On success exit, the page containing
+ * the current index tuple is pinned but not locked, and data about
+ * the matching tuple(s) on the page has been loaded into so->currPos.
+ * scan->xs_ctup.t_self is set to the heap TID of the current tuple,
+ * and if requested, scan->xs_itup points to a copy of the index tuple.
*
- * On entry, we have a valid currentItemData in the scan, and a
- * read lock on the page that contains that item. We do not have
- * the page pinned. We return the next item in the scan. On
- * exit, we have the page containing the next item locked but not
- * pinned.
+ * If there are no matching items in the index, we return FALSE, with no
+ * pins or locks held.
+ *
+ * Note that scan->keyData[], and the so->keyData[] scankey built from it,
+ * are both search-type scankeys (see nbtree/README for more about this).
+ * Within this routine, we build a temporary insertion-type scankey to use
+ * in locating the scan start position.
*/
-RetrieveIndexResult
-_bt_next(IndexScanDesc scan, ScanDirection dir)
+bool
+_bt_first(IndexScanDesc scan, ScanDirection dir)
{
- Relation rel;
+ Relation rel = scan->indexRelation;
+ BTScanOpaque so = (BTScanOpaque) scan->opaque;
Buffer buf;
- Page page;
+ BTStack stack;
OffsetNumber offnum;
- RetrieveIndexResult res;
- ItemPointer current;
- BTItem btitem;
- IndexTuple itup;
- BTScanOpaque so;
- Size keysok;
+ StrategyNumber strat;
+ bool nextkey;
+ bool goback;
+ ScanKey startKeys[INDEX_MAX_KEYS];
+ ScanKeyData scankeys[INDEX_MAX_KEYS];
+ ScanKeyData notnullkeys[INDEX_MAX_KEYS];
+ int keysCount = 0;
+ int i;
+ StrategyNumber strat_total;
+ BTScanPosItem *currItem;
- rel = scan->relation;
- so = (BTScanOpaque) scan->opaque;
- current = &(scan->currentItemData);
+ Assert(!BTScanPosIsValid(so->currPos));
- Assert(BufferIsValid(so->btso_curbuf));
+ pgstat_count_index_scan(rel);
- /* we still have the buffer pinned and locked */
- buf = so->btso_curbuf;
+ /*
+ * Examine the scan keys and eliminate any redundant keys; also mark the
+ * keys that must be matched to continue the scan.
+ */
+ _bt_preprocess_keys(scan);
+
+ /*
+ * Quit now if _bt_preprocess_keys() discovered that the scan keys can
+ * never be satisfied (eg, x == 1 AND x > 2).
+ */
+ if (!so->qual_ok)
+ return false;
- do
+ /*----------
+ * Examine the scan keys to discover where we need to start the scan.
+ *
+ * We want to identify the keys that can be used as starting boundaries;
+ * these are =, >, or >= keys for a forward scan or =, <, <= keys for
+ * a backwards scan. We can use keys for multiple attributes so long as
+ * the prior attributes had only =, >= (resp. =, <=) keys. Once we accept
+ * a > or < boundary or find an attribute with no boundary (which can be
+ * thought of as the same as "> -infinity"), we can't use keys for any
+ * attributes to its right, because it would break our simplistic notion
+ * of what initial positioning strategy to use.
+ *
+ * When the scan keys include cross-type operators, _bt_preprocess_keys
+ * may not be able to eliminate redundant keys; in such cases we will
+ * arbitrarily pick a usable one for each attribute. This is correct
+ * but possibly not optimal behavior. (For example, with keys like
+ * "x >= 4 AND x >= 5" we would elect to scan starting at x=4 when
+ * x=5 would be more efficient.) Since the situation only arises given
+ * a poorly-worded query plus an incomplete opfamily, live with it.
+ *
+ * When both equality and inequality keys appear for a single attribute
+ * (again, only possible when cross-type operators appear), we *must*
+ * select one of the equality keys for the starting point, because
+ * _bt_checkkeys() will stop the scan as soon as an equality qual fails.
+ * For example, if we have keys like "x >= 4 AND x = 10" and we elect to
+ * start at x=4, we will fail and stop before reaching x=10. If multiple
+ * equality quals survive preprocessing, however, it doesn't matter which
+ * one we use --- by definition, they are either redundant or
+ * contradictory.
+ *
+ * Any regular (not SK_SEARCHNULL) key implies a NOT NULL qualifier.
+ * If the index stores nulls at the end of the index we'll be starting
+ * from, and we have no boundary key for the column (which means the key
+ * we deduced NOT NULL from is an inequality key that constrains the other
+ * end of the index), then we cons up an explicit SK_SEARCHNOTNULL key to
+ * use as a boundary key. If we didn't do this, we might find ourselves
+ * traversing a lot of null entries at the start of the scan.
+ *
+ * In this loop, row-comparison keys are treated the same as keys on their
+ * first (leftmost) columns. We'll add on lower-order columns of the row
+ * comparison below, if possible.
+ *
+ * The selected scan keys (at most one per index column) are remembered by
+ * storing their addresses into the local startKeys[] array.
+ *----------
+ */
+ strat_total = BTEqualStrategyNumber;
+ if (so->numberOfKeys > 0)
{
- /* step one tuple in the appropriate direction */
- if (!_bt_step(scan, &buf, dir))
- return (RetrieveIndexResult) NULL;
+ AttrNumber curattr;
+ ScanKey chosen;
+ ScanKey impliesNN;
+ ScanKey cur;
- /* by here, current is the tuple we want to return */
- offnum = ItemPointerGetOffsetNumber(current);
- page = BufferGetPage(buf);
- btitem = (BTItem) PageGetItem(page, PageGetItemId(page, offnum));
- itup = &btitem->bti_itup;
+ /*
+ * chosen is the so-far-chosen key for the current attribute, if any.
+ * We don't cast the decision in stone until we reach keys for the
+ * next attribute.
+ */
+ curattr = 1;
+ chosen = NULL;
+ /* Also remember any scankey that implies a NOT NULL constraint */
+ impliesNN = NULL;
- if (_bt_checkkeys(scan, itup, &keysok))
+ /*
+ * Loop iterates from 0 to numberOfKeys inclusive; we use the last
+ * pass to handle after-last-key processing. Actual exit from the
+ * loop is at one of the "break" statements below.
+ */
+ for (cur = so->keyData, i = 0;; cur++, i++)
{
- Assert(keysok == so->numberOfKeys);
- res = FormRetrieveIndexResult(current, &(itup->t_tid));
+ if (i >= so->numberOfKeys || cur->sk_attno != curattr)
+ {
+ /*
+ * Done looking at keys for curattr. If we didn't find a
+ * usable boundary key, see if we can deduce a NOT NULL key.
+ */
+ if (chosen == NULL && impliesNN != NULL &&
+ ((impliesNN->sk_flags & SK_BT_NULLS_FIRST) ?
+ ScanDirectionIsForward(dir) :
+ ScanDirectionIsBackward(dir)))
+ {
+ /* Yes, so build the key in notnullkeys[keysCount] */
+ chosen = ¬nullkeys[keysCount];
+ ScanKeyEntryInitialize(chosen,
+ (SK_SEARCHNOTNULL | SK_ISNULL |
+ (impliesNN->sk_flags &
+ (SK_BT_DESC | SK_BT_NULLS_FIRST))),
+ curattr,
+ ((impliesNN->sk_flags & SK_BT_NULLS_FIRST) ?
+ BTGreaterStrategyNumber :
+ BTLessStrategyNumber),
+ InvalidOid,
+ InvalidOid,
+ InvalidOid,
+ (Datum) 0);
+ }
+
+ /*
+ * If we still didn't find a usable boundary key, quit; else
+ * save the boundary key pointer in startKeys.
+ */
+ if (chosen == NULL)
+ break;
+ startKeys[keysCount++] = chosen;
+
+ /*
+ * Adjust strat_total, and quit if we have stored a > or <
+ * key.
+ */
+ strat = chosen->sk_strategy;
+ if (strat != BTEqualStrategyNumber)
+ {
+ strat_total = strat;
+ if (strat == BTGreaterStrategyNumber ||
+ strat == BTLessStrategyNumber)
+ break;
+ }
- /* remember which buffer we have pinned and locked */
- so->btso_curbuf = buf;
- return res;
+ /*
+ * Done if that was the last attribute, or if next key is not
+ * in sequence (implying no boundary key is available for the
+ * next attribute).
+ */
+ if (i >= so->numberOfKeys ||
+ cur->sk_attno != curattr + 1)
+ break;
+
+ /*
+ * Reset for next attr.
+ */
+ curattr = cur->sk_attno;
+ chosen = NULL;
+ impliesNN = NULL;
+ }
+
+ /*
+ * Can we use this key as a starting boundary for this attr?
+ *
+ * If not, does it imply a NOT NULL constraint? (Because
+ * SK_SEARCHNULL keys are always assigned BTEqualStrategyNumber,
+ * *any* inequality key works for that; we need not test.)
+ */
+ switch (cur->sk_strategy)
+ {
+ case BTLessStrategyNumber:
+ case BTLessEqualStrategyNumber:
+ if (chosen == NULL)
+ {
+ if (ScanDirectionIsBackward(dir))
+ chosen = cur;
+ else
+ impliesNN = cur;
+ }
+ break;
+ case BTEqualStrategyNumber:
+ /* override any non-equality choice */
+ chosen = cur;
+ break;
+ case BTGreaterEqualStrategyNumber:
+ case BTGreaterStrategyNumber:
+ if (chosen == NULL)
+ {
+ if (ScanDirectionIsForward(dir))
+ chosen = cur;
+ else
+ impliesNN = cur;
+ }
+ break;
+ }
}
+ }
- } while (keysok >= so->numberOfFirstKeys ||
- (keysok == -1 && ScanDirectionIsBackward(dir)));
+ /*
+ * If we found no usable boundary keys, we have to start from one end of
+ * the tree. Walk down that edge to the first or last key, and scan from
+ * there.
+ */
+ if (keysCount == 0)
+ return _bt_endpoint(scan, dir);
- ItemPointerSetInvalid(current);
- so->btso_curbuf = InvalidBuffer;
- _bt_relbuf(rel, buf, BT_READ);
+ /*
+ * We want to start the scan somewhere within the index. Set up an
+ * insertion scankey we can use to search for the boundary point we
+ * identified above. The insertion scankey is built in the local
+ * scankeys[] array, using the keys identified by startKeys[].
+ */
+ Assert(keysCount <= INDEX_MAX_KEYS);
+ for (i = 0; i < keysCount; i++)
+ {
+ ScanKey cur = startKeys[i];
- return (RetrieveIndexResult) NULL;
-}
+ Assert(cur->sk_attno == i + 1);
-/*
- * _bt_first() -- Find the first item in a scan.
- *
- * We need to be clever about the type of scan, the operation it's
- * performing, and the tree ordering. We return the RetrieveIndexResult
- * of the first item in the tree that satisfies the qualification
- * associated with the scan descriptor. On exit, the page containing
- * the current index tuple is read locked and pinned, and the scan's
- * opaque data entry is updated to include the buffer.
- */
-RetrieveIndexResult
-_bt_first(IndexScanDesc scan, ScanDirection dir)
-{
- Relation rel;
- TupleDesc itupdesc;
- Buffer buf;
- Page page;
- BTPageOpaque pop;
- BTStack stack;
- OffsetNumber offnum,
- maxoff;
- bool offGmax = false;
- BTItem btitem;
- IndexTuple itup;
- ItemPointer current;
- BlockNumber blkno;
- StrategyNumber strat;
- RetrieveIndexResult res;
- RegProcedure proc;
- int result;
- BTScanOpaque so;
- Size keysok;
-
- bool strategyCheck;
- ScanKey scankeys = 0;
- int keysCount = 0;
- int *nKeyIs = 0;
- int i, j;
- StrategyNumber strat_total;
-
- rel = scan->relation;
- so = (BTScanOpaque) scan->opaque;
-
- /*
- * Order the keys in the qualification and be sure that the scan
- * exploits the tree order.
- */
- so->numberOfFirstKeys = 0; /* may be changed by _bt_orderkeys */
- so->qual_ok = 1; /* may be changed by _bt_orderkeys */
- scan->scanFromEnd = false;
- strategyCheck = false;
- if (so->numberOfKeys > 0)
- {
- _bt_orderkeys(rel, so);
+ if (cur->sk_flags & SK_ROW_HEADER)
+ {
+ /*
+ * Row comparison header: look to the first row member instead.
+ *
+ * The member scankeys are already in insertion format (ie, they
+ * have sk_func = 3-way-comparison function), but we have to watch
+ * out for nulls, which _bt_preprocess_keys didn't check. A null
+ * in the first row member makes the condition unmatchable, just
+ * like qual_ok = false.
+ */
+ ScanKey subkey = (ScanKey) DatumGetPointer(cur->sk_argument);
- if (so->qual_ok)
- strategyCheck = true;
- }
- strat_total = BTEqualStrategyNumber;
- if (strategyCheck)
- {
- AttrNumber attno;
+ Assert(subkey->sk_flags & SK_ROW_MEMBER);
+ if (subkey->sk_flags & SK_ISNULL)
+ return false;
+ memcpy(scankeys + i, subkey, sizeof(ScanKeyData));
- nKeyIs = (int *)palloc(so->numberOfKeys*sizeof(int));
- for (i=0; i < so->numberOfKeys; i++)
- {
- attno = so->keyData[i].sk_attno;
- if (attno == keysCount)
- continue;
- if (attno > keysCount + 1)
- break;
- strat = _bt_getstrat(rel, attno,
- so->keyData[i].sk_procedure);
- if (strat == strat_total ||
- strat == BTEqualStrategyNumber)
+ /*
+ * If the row comparison is the last positioning key we accepted,
+ * try to add additional keys from the lower-order row members.
+ * (If we accepted independent conditions on additional index
+ * columns, we use those instead --- doesn't seem worth trying to
+ * determine which is more restrictive.) Note that this is OK
+ * even if the row comparison is of ">" or "<" type, because the
+ * condition applied to all but the last row member is effectively
+ * ">=" or "<=", and so the extra keys don't break the positioning
+ * scheme. But, by the same token, if we aren't able to use all
+ * the row members, then the part of the row comparison that we
+ * did use has to be treated as just a ">=" or "<=" condition, and
+ * so we'd better adjust strat_total accordingly.
+ */
+ if (i == keysCount - 1)
{
- nKeyIs[keysCount++] = i;
- continue;
+ bool used_all_subkeys = false;
+
+ Assert(!(subkey->sk_flags & SK_ROW_END));
+ for (;;)
+ {
+ subkey++;
+ Assert(subkey->sk_flags & SK_ROW_MEMBER);
+ if (subkey->sk_attno != keysCount + 1)
+ break; /* out-of-sequence, can't use it */
+ if (subkey->sk_strategy != cur->sk_strategy)
+ break; /* wrong direction, can't use it */
+ if (subkey->sk_flags & SK_ISNULL)
+ break; /* can't use null keys */
+ Assert(keysCount < INDEX_MAX_KEYS);
+ memcpy(scankeys + keysCount, subkey, sizeof(ScanKeyData));
+ keysCount++;
+ if (subkey->sk_flags & SK_ROW_END)
+ {
+ used_all_subkeys = true;
+ break;
+ }
+ }
+ if (!used_all_subkeys)
+ {
+ switch (strat_total)
+ {
+ case BTLessStrategyNumber:
+ strat_total = BTLessEqualStrategyNumber;
+ break;
+ case BTGreaterStrategyNumber:
+ strat_total = BTGreaterEqualStrategyNumber;
+ break;
+ }
+ }
+ break; /* done with outer loop */
}
- if (ScanDirectionIsBackward(dir) &&
- (strat == BTLessStrategyNumber ||
- strat == BTLessEqualStrategyNumber) )
+ }
+ else
+ {
+ /*
+ * Ordinary comparison key. Transform the search-style scan key
+ * to an insertion scan key by replacing the sk_func with the
+ * appropriate btree comparison function.
+ *
+ * If scankey operator is not a cross-type comparison, we can use
+ * the cached comparison function; otherwise gotta look it up in
+ * the catalogs. (That can't lead to infinite recursion, since no
+ * indexscan initiated by syscache lookup will use cross-data-type
+ * operators.)
+ *
+ * We support the convention that sk_subtype == InvalidOid means
+ * the opclass input type; this is a hack to simplify life for
+ * ScanKeyInit().
+ */
+ if (cur->sk_subtype == rel->rd_opcintype[i] ||
+ cur->sk_subtype == InvalidOid)
{
- nKeyIs[keysCount++] = i;
- strat_total = strat;
- if (strat == BTLessStrategyNumber)
- break;
- continue;
+ FmgrInfo *procinfo;
+
+ procinfo = index_getprocinfo(rel, cur->sk_attno, BTORDER_PROC);
+ ScanKeyEntryInitializeWithInfo(scankeys + i,
+ cur->sk_flags,
+ cur->sk_attno,
+ InvalidStrategy,
+ cur->sk_subtype,
+ cur->sk_collation,
+ procinfo,
+ cur->sk_argument);
}
- if (ScanDirectionIsForward(dir) &&
- (strat == BTGreaterStrategyNumber ||
- strat == BTGreaterEqualStrategyNumber) )
+ else
{
- nKeyIs[keysCount++] = i;
- strat_total = strat;
- if (strat == BTGreaterStrategyNumber)
- break;
- continue;
+ RegProcedure cmp_proc;
+
+ cmp_proc = get_opfamily_proc(rel->rd_opfamily[i],
+ rel->rd_opcintype[i],
+ cur->sk_subtype,
+ BTORDER_PROC);
+ if (!RegProcedureIsValid(cmp_proc))
+ elog(ERROR, "missing support function %d(%u,%u) for attribute %d of index \"%s\"",
+ BTORDER_PROC, rel->rd_opcintype[i], cur->sk_subtype,
+ cur->sk_attno, RelationGetRelationName(rel));
+ ScanKeyEntryInitialize(scankeys + i,
+ cur->sk_flags,
+ cur->sk_attno,
+ InvalidStrategy,
+ cur->sk_subtype,
+ cur->sk_collation,
+ cmp_proc,
+ cur->sk_argument);
}
}
- if (!keysCount)
- scan->scanFromEnd = true;
}
- else
- scan->scanFromEnd = true;
-
- if (so->qual_ok == 0)
- return (RetrieveIndexResult) NULL;
- /* if we just need to walk down one edge of the tree, do that */
- if (scan->scanFromEnd)
+ /*----------
+ * Examine the selected initial-positioning strategy to determine exactly
+ * where we need to start the scan, and set flag variables to control the
+ * code below.
+ *
+ * If nextkey = false, _bt_search and _bt_binsrch will locate the first
+ * item >= scan key. If nextkey = true, they will locate the first
+ * item > scan key.
+ *
+ * If goback = true, we will then step back one item, while if
+ * goback = false, we will start the scan on the located item.
+ *----------
+ */
+ switch (strat_total)
{
- if (nKeyIs)
- pfree(nKeyIs);
- return _bt_endpoint(scan, dir);
- }
+ case BTLessStrategyNumber:
+
+ /*
+ * Find first item >= scankey, then back up one to arrive at last
+ * item < scankey. (Note: this positioning strategy is only used
+ * for a backward scan, so that is always the correct starting
+ * position.)
+ */
+ nextkey = false;
+ goback = true;
+ break;
+
+ case BTLessEqualStrategyNumber:
+
+ /*
+ * Find first item > scankey, then back up one to arrive at last
+ * item <= scankey. (Note: this positioning strategy is only used
+ * for a backward scan, so that is always the correct starting
+ * position.)
+ */
+ nextkey = true;
+ goback = true;
+ break;
+
+ case BTEqualStrategyNumber:
+
+ /*
+ * If a backward scan was specified, need to start with last equal
+ * item not first one.
+ */
+ if (ScanDirectionIsBackward(dir))
+ {
+ /*
+ * This is the same as the <= strategy. We will check at the
+ * end whether the found item is actually =.
+ */
+ nextkey = true;
+ goback = true;
+ }
+ else
+ {
+ /*
+ * This is the same as the >= strategy. We will check at the
+ * end whether the found item is actually =.
+ */
+ nextkey = false;
+ goback = false;
+ }
+ break;
+
+ case BTGreaterEqualStrategyNumber:
+
+ /*
+ * Find first item >= scankey. (This is only used for forward
+ * scans.)
+ */
+ nextkey = false;
+ goback = false;
+ break;
- itupdesc = RelationGetDescr(rel);
- current = &(scan->currentItemData);
+ case BTGreaterStrategyNumber:
+
+ /*
+ * Find first item > scankey. (This is only used for forward
+ * scans.)
+ */
+ nextkey = true;
+ goback = false;
+ break;
+
+ default:
+ /* can't get here, but keep compiler quiet */
+ elog(ERROR, "unrecognized strat_total: %d", (int) strat_total);
+ return false;
+ }
/*
- * Okay, we want something more complicated. What we'll do is use the
- * first item in the scan key passed in (which has been correctly
- * ordered to take advantage of index ordering) to position ourselves
- * at the right place in the scan.
+ * Use the manufactured insertion scan key to descend the tree and
+ * position ourselves on the target leaf page.
*/
- /* _bt_orderkeys disallows it, but it's place to add some code latter */
- scankeys = (ScanKey)palloc(keysCount*sizeof(ScanKeyData));
- for (i=0; i < keysCount; i++)
+ stack = _bt_search(rel, keysCount, scankeys, nextkey, &buf, BT_READ,
+ scan->xs_snapshot);
+
+ /* don't need to keep the stack around... */
+ _bt_freestack(stack);
+
+ if (!BufferIsValid(buf))
{
- j = nKeyIs[i];
- if (so->keyData[j].sk_flags & SK_ISNULL)
- {
- pfree(nKeyIs);
- pfree(scankeys);
- elog(ERROR, "_bt_first: btree doesn't support is(not)null, yet");
- return ((RetrieveIndexResult) NULL);
- }
- proc = index_getprocid(rel, i+1, BTORDER_PROC);
- ScanKeyEntryInitialize(scankeys+i, so->keyData[j].sk_flags,
- i+1, proc, so->keyData[j].sk_argument);
+ /*
+ * We only get here if the index is completely empty. Lock relation
+ * because nothing finer to lock exists.
+ */
+ PredicateLockRelation(rel, scan->xs_snapshot);
+ return false;
}
- if (nKeyIs) pfree(nKeyIs);
+ else
+ PredicateLockPage(rel, BufferGetBlockNumber(buf),
+ scan->xs_snapshot);
- stack = _bt_search(rel, keysCount, scankeys, &buf);
- _bt_freestack(stack);
+ /* initialize moreLeft/moreRight appropriately for scan direction */
+ if (ScanDirectionIsForward(dir))
+ {
+ so->currPos.moreLeft = false;
+ so->currPos.moreRight = true;
+ }
+ else
+ {
+ so->currPos.moreLeft = true;
+ so->currPos.moreRight = false;
+ }
+ so->numKilled = 0; /* just paranoia */
+ Assert(so->markItemIndex == -1);
- blkno = BufferGetBlockNumber(buf);
- page = BufferGetPage(buf);
+ /* position to the precise item on the page */
+ offnum = _bt_binsrch(rel, buf, keysCount, scankeys, nextkey);
/*
- * This will happen if the tree we're searching is entirely empty, or
- * if we're doing a search for a key that would appear on an entirely
- * empty internal page. In either case, there are no matching tuples
- * in the index.
+ * If nextkey = false, we are positioned at the first item >= scan key, or
+ * possibly at the end of a page on which all the existing items are less
+ * than the scan key and we know that everything on later pages is greater
+ * than or equal to scan key.
+ *
+ * If nextkey = true, we are positioned at the first item > scan key, or
+ * possibly at the end of a page on which all the existing items are less
+ * than or equal to the scan key and we know that everything on later
+ * pages is greater than scan key.
+ *
+ * The actually desired starting point is either this item or the prior
+ * one, or in the end-of-page case it's the first item on the next page or
+ * the last item on this page. Adjust the starting offset if needed. (If
+ * this results in an offset before the first item or after the last one,
+ * _bt_readpage will report no items found, and then we'll step to the
+ * next page as needed.)
*/
+ if (goback)
+ offnum = OffsetNumberPrev(offnum);
+
+ /* remember which buffer we have pinned, if any */
+ Assert(!BTScanPosIsValid(so->currPos));
+ so->currPos.buf = buf;
- if (PageIsEmpty(page))
+ /*
+ * Now load data from the first page of the scan.
+ */
+ if (!_bt_readpage(scan, dir, offnum))
+ {
+ /*
+ * There's no actually-matching data on this page. Try to advance to
+ * the next page. Return false if there's no matching data at all.
+ */
+ LockBuffer(so->currPos.buf, BUFFER_LOCK_UNLOCK);
+ if (!_bt_steppage(scan, dir))
+ return false;
+ }
+ else
{
- ItemPointerSetInvalid(current);
- so->btso_curbuf = InvalidBuffer;
- _bt_relbuf(rel, buf, BT_READ);
- pfree(scankeys);
- return (RetrieveIndexResult) NULL;
+ /* Drop the lock, and maybe the pin, on the current page */
+ _bt_drop_lock_and_maybe_pin(scan, &so->currPos);
}
- maxoff = PageGetMaxOffsetNumber(page);
- pop = (BTPageOpaque) PageGetSpecialPointer(page);
+
+ /* OK, itemIndex says what to return */
+ currItem = &so->currPos.items[so->currPos.itemIndex];
+ scan->xs_ctup.t_self = currItem->heapTid;
+ if (scan->xs_want_itup)
+ scan->xs_itup = (IndexTuple) (so->currTuples + currItem->tupleOffset);
+
+ return true;
+}
+
+/*
+ * _bt_next() -- Get the next item in a scan.
+ *
+ * On entry, so->currPos describes the current page, which may be pinned
+ * but is not locked, and so->currPos.itemIndex identifies which item was
+ * previously returned.
+ *
+ * On successful exit, scan->xs_ctup.t_self is set to the TID of the
+ * next heap tuple, and if requested, scan->xs_itup points to a copy of
+ * the index tuple. so->currPos is updated as needed.
+ *
+ * On failure exit (no more tuples), we release pin and set
+ * so->currPos.buf to InvalidBuffer.
+ */
+bool
+_bt_next(IndexScanDesc scan, ScanDirection dir)
+{
+ BTScanOpaque so = (BTScanOpaque) scan->opaque;
+ BTScanPosItem *currItem;
/*
- * Now _bt_moveright doesn't move from non-rightmost leaf page if
- * scankey == hikey and there is only hikey there. It's good for
- * insertion, but we need to do work for scan here. - vadim 05/27/97
+ * Advance to next tuple on current page; or if there's no more, try to
+ * step to the next page with data.
*/
-
- while (maxoff == P_HIKEY && !P_RIGHTMOST(pop) &&
- _bt_skeycmp(rel, keysCount, scankeys, page,
- PageGetItemId(page, P_HIKEY),
- BTGreaterEqualStrategyNumber))
+ if (ScanDirectionIsForward(dir))
{
- /* step right one page */
- blkno = pop->btpo_next;
- _bt_relbuf(rel, buf, BT_READ);
- buf = _bt_getbuf(rel, blkno, BT_READ);
- page = BufferGetPage(buf);
- if (PageIsEmpty(page))
+ if (++so->currPos.itemIndex > so->currPos.lastItem)
{
- ItemPointerSetInvalid(current);
- so->btso_curbuf = InvalidBuffer;
- _bt_relbuf(rel, buf, BT_READ);
- pfree(scankeys);
- return (RetrieveIndexResult) NULL;
+ if (!_bt_steppage(scan, dir))
+ return false;
}
- maxoff = PageGetMaxOffsetNumber(page);
- pop = (BTPageOpaque) PageGetSpecialPointer(page);
}
-
-
- /* find the nearest match to the manufactured scan key on the page */
- offnum = _bt_binsrch(rel, buf, keysCount, scankeys, BT_DESCENT);
-
- if (offnum > maxoff)
+ else
{
- offnum = maxoff;
- offGmax = true;
+ if (--so->currPos.itemIndex < so->currPos.firstItem)
+ {
+ if (!_bt_steppage(scan, dir))
+ return false;
+ }
}
- ItemPointerSet(current, blkno, offnum);
+ /* OK, itemIndex says what to return */
+ currItem = &so->currPos.items[so->currPos.itemIndex];
+ scan->xs_ctup.t_self = currItem->heapTid;
+ if (scan->xs_want_itup)
+ scan->xs_itup = (IndexTuple) (so->currTuples + currItem->tupleOffset);
+
+ return true;
+}
+
+/*
+ * _bt_readpage() -- Load data from current index page into so->currPos
+ *
+ * Caller must have pinned and read-locked so->currPos.buf; the buffer's state
+ * is not changed here. Also, currPos.moreLeft and moreRight must be valid;
+ * they are updated as appropriate. All other fields of so->currPos are
+ * initialized from scratch here.
+ *
+ * We scan the current page starting at offnum and moving in the indicated
+ * direction. All items matching the scan keys are loaded into currPos.items.
+ * moreLeft or moreRight (as appropriate) is cleared if _bt_checkkeys reports
+ * that there can be no more matching tuples in the current scan direction.
+ *
+ * Returns true if any matching items found on the page, false if none.
+ */
+static bool
+_bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum)
+{
+ BTScanOpaque so = (BTScanOpaque) scan->opaque;
+ Page page;
+ BTPageOpaque opaque;
+ OffsetNumber minoff;
+ OffsetNumber maxoff;
+ int itemIndex;
+ IndexTuple itup;
+ bool continuescan;
/*
- * Now find the right place to start the scan. Result is the value
- * we're looking for minus the value we're looking at in the index.
+ * We must have the buffer pinned and locked, but the usual macro can't be
+ * used here; this function is what makes it good for currPos.
*/
+ Assert(BufferIsValid(so->currPos.buf));
- result = _bt_compare(rel, itupdesc, page, keysCount, scankeys, offnum);
+ page = BufferGetPage(so->currPos.buf);
+ opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+ minoff = P_FIRSTDATAKEY(opaque);
+ maxoff = PageGetMaxOffsetNumber(page);
- /* it's yet other place to add some code latter for is(not)null */
+ /*
+ * We note the buffer's block number so that we can release the pin later.
+ * This allows us to re-read the buffer if it is needed again for hinting.
+ */
+ so->currPos.currPage = BufferGetBlockNumber(so->currPos.buf);
- strat = strat_total;
- switch (strat)
- {
- case BTLessStrategyNumber:
- if (result <= 0)
- {
- do
- {
- if (!_bt_twostep(scan, &buf, BackwardScanDirection))
- break;
+ /*
+ * We save the LSN of the page as we read it, so that we know whether it
+ * safe to apply LP_DEAD hints to the page later. This allows us to drop
+ * the pin for MVCC scans, which allows vacuum to avoid blocking.
+ */
+ so->currPos.lsn = PageGetLSN(page);
- offnum = ItemPointerGetOffsetNumber(current);
- page = BufferGetPage(buf);
- result = _bt_compare(rel, itupdesc, page, keysCount, scankeys, offnum);
- } while (result <= 0);
+ /*
+ * we must save the page's right-link while scanning it; this tells us
+ * where to step right to after we're done with these items. There is no
+ * corresponding need for the left-link, since splits always go right.
+ */
+ so->currPos.nextPage = opaque->btpo_next;
- }
- break;
+ /* initialize tuple workspace to empty */
+ so->currPos.nextTupleOffset = 0;
- case BTLessEqualStrategyNumber:
- if (result >= 0)
- {
- do
- {
- if (!_bt_twostep(scan, &buf, ForwardScanDirection))
- break;
+ /*
+ * Now that the current page has been made consistent, the macro should be
+ * good.
+ */
+ Assert(BTScanPosIsPinned(so->currPos));
- offnum = ItemPointerGetOffsetNumber(current);
- page = BufferGetPage(buf);
- result = _bt_compare(rel, itupdesc, page, keysCount, scankeys, offnum);
- } while (result >= 0);
- }
- if (result < 0)
- _bt_twostep(scan, &buf, BackwardScanDirection);
- break;
+ if (ScanDirectionIsForward(dir))
+ {
+ /* load items[] in ascending order */
+ itemIndex = 0;
- case BTEqualStrategyNumber:
- if (result != 0)
+ offnum = Max(offnum, minoff);
+
+ while (offnum <= maxoff)
+ {
+ itup = _bt_checkkeys(scan, page, offnum, dir, &continuescan);
+ if (itup != NULL)
{
- _bt_relbuf(scan->relation, buf, BT_READ);
- so->btso_curbuf = InvalidBuffer;
- ItemPointerSetInvalid(&(scan->currentItemData));
- pfree(scankeys);
- return (RetrieveIndexResult) NULL;
+ /* tuple passes all scan key conditions, so remember it */
+ _bt_saveitem(so, itemIndex, offnum, itup);
+ itemIndex++;
}
- else if (ScanDirectionIsBackward(dir))
+ if (!continuescan)
{
- do
- {
- if (!_bt_twostep(scan, &buf, ForwardScanDirection))
- break;
+ /* there can't be any more matches, so stop */
+ so->currPos.moreRight = false;
+ break;
+ }
- offnum = ItemPointerGetOffsetNumber(current);
- page = BufferGetPage(buf);
- result = _bt_compare(rel, itupdesc, page, keysCount, scankeys, offnum);
- } while (result == 0);
+ offnum = OffsetNumberNext(offnum);
+ }
- if (result < 0)
- _bt_twostep(scan, &buf, BackwardScanDirection);
- }
- break;
+ Assert(itemIndex <= MaxIndexTuplesPerPage);
+ so->currPos.firstItem = 0;
+ so->currPos.lastItem = itemIndex - 1;
+ so->currPos.itemIndex = 0;
+ }
+ else
+ {
+ /* load items[] in descending order */
+ itemIndex = MaxIndexTuplesPerPage;
- case BTGreaterEqualStrategyNumber:
- if (offGmax)
+ offnum = Min(offnum, maxoff);
+
+ while (offnum >= minoff)
+ {
+ itup = _bt_checkkeys(scan, page, offnum, dir, &continuescan);
+ if (itup != NULL)
{
- if (result < 0)
- {
- Assert(!P_RIGHTMOST(pop) && maxoff == P_HIKEY);
- if (!_bt_step(scan, &buf, ForwardScanDirection))
- {
- _bt_relbuf(scan->relation, buf, BT_READ);
- so->btso_curbuf = InvalidBuffer;
- ItemPointerSetInvalid(&(scan->currentItemData));
- pfree(scankeys);
- return (RetrieveIndexResult) NULL;
- }
- }
- else if (result > 0)
- { /* Just remember: _bt_binsrch() returns
- * the OffsetNumber of the first matching
- * key on the page, or the OffsetNumber at
- * which the matching key WOULD APPEAR IF
- * IT WERE on this page. No key on this
- * page, but offnum from _bt_binsrch()
- * greater maxoff - have to move right. -
- * vadim 12/06/96 */
- _bt_twostep(scan, &buf, ForwardScanDirection);
- }
+ /* tuple passes all scan key conditions, so remember it */
+ itemIndex--;
+ _bt_saveitem(so, itemIndex, offnum, itup);
}
- else if (result < 0)
+ if (!continuescan)
{
- do
- {
- if (!_bt_twostep(scan, &buf, BackwardScanDirection))
- break;
-
- page = BufferGetPage(buf);
- offnum = ItemPointerGetOffsetNumber(current);
- result = _bt_compare(rel, itupdesc, page, keysCount, scankeys, offnum);
- } while (result < 0);
-
- if (result > 0)
- _bt_twostep(scan, &buf, ForwardScanDirection);
+ /* there can't be any more matches, so stop */
+ so->currPos.moreLeft = false;
+ break;
}
- break;
- case BTGreaterStrategyNumber:
- /* offGmax helps as above */
- if (result >= 0 || offGmax)
- {
- do
- {
- if (!_bt_twostep(scan, &buf, ForwardScanDirection))
- break;
+ offnum = OffsetNumberPrev(offnum);
+ }
- offnum = ItemPointerGetOffsetNumber(current);
- page = BufferGetPage(buf);
- result = _bt_compare(rel, itupdesc, page, keysCount, scankeys, offnum);
- } while (result >= 0);
- }
- break;
+ Assert(itemIndex >= 0);
+ so->currPos.firstItem = itemIndex;
+ so->currPos.lastItem = MaxIndexTuplesPerPage - 1;
+ so->currPos.itemIndex = MaxIndexTuplesPerPage - 1;
}
- pfree(scankeys);
- /* okay, current item pointer for the scan is right */
- offnum = ItemPointerGetOffsetNumber(current);
- page = BufferGetPage(buf);
- btitem = (BTItem) PageGetItem(page, PageGetItemId(page, offnum));
- itup = &btitem->bti_itup;
+ return (so->currPos.firstItem <= so->currPos.lastItem);
+}
- if (_bt_checkkeys(scan, itup, &keysok))
- {
- res = FormRetrieveIndexResult(current, &(itup->t_tid));
+/* Save an index item into so->currPos.items[itemIndex] */
+static void
+_bt_saveitem(BTScanOpaque so, int itemIndex,
+ OffsetNumber offnum, IndexTuple itup)
+{
+ BTScanPosItem *currItem = &so->currPos.items[itemIndex];
- /* remember which buffer we have pinned */
- so->btso_curbuf = buf;
- }
- else if (keysok >= so->numberOfFirstKeys)
- {
- so->btso_curbuf = buf;
- return _bt_next(scan, dir);
- }
- else if (keysok == -1 && ScanDirectionIsBackward(dir))
- {
- so->btso_curbuf = buf;
- return _bt_next(scan, dir);
- }
- else
+ currItem->heapTid = itup->t_tid;
+ currItem->indexOffset = offnum;
+ if (so->currTuples)
{
- ItemPointerSetInvalid(current);
- so->btso_curbuf = InvalidBuffer;
- _bt_relbuf(rel, buf, BT_READ);
- res = (RetrieveIndexResult) NULL;
- }
+ Size itupsz = IndexTupleSize(itup);
- return res;
+ currItem->tupleOffset = so->currPos.nextTupleOffset;
+ memcpy(so->currTuples + so->currPos.nextTupleOffset, itup, itupsz);
+ so->currPos.nextTupleOffset += MAXALIGN(itupsz);
+ }
}
/*
- * _bt_step() -- Step one item in the requested direction in a scan on
- * the tree.
+ * _bt_steppage() -- Step to next page containing valid data for scan
*
- * If no adjacent record exists in the requested direction, return
- * false. Else, return true and set the currentItemData for the
- * scan to the right thing.
+ * On entry, if so->currPos.buf is valid the buffer is pinned but not locked;
+ * if pinned, we'll drop the pin before moving to next page. The buffer is
+ * not locked on entry.
+ *
+ * On success exit, so->currPos is updated to contain data from the next
+ * interesting page. For success on a scan using a non-MVCC snapshot we hold
+ * a pin, but not a read lock, on that page. If we do not hold the pin, we
+ * set so->currPos.buf to InvalidBuffer. We return TRUE to indicate success.
+ *
+ * If there are no more matching records in the given direction, we drop all
+ * locks and pins, set so->currPos.buf to InvalidBuffer, and return FALSE.
*/
-bool
-_bt_step(IndexScanDesc scan, Buffer *bufP, ScanDirection dir)
+static bool
+_bt_steppage(IndexScanDesc scan, ScanDirection dir)
{
+ BTScanOpaque so = (BTScanOpaque) scan->opaque;
+ Relation rel;
Page page;
BTPageOpaque opaque;
- OffsetNumber offnum,
- maxoff;
- OffsetNumber start;
- BlockNumber blkno;
- BlockNumber obknum;
- BTScanOpaque so;
- ItemPointer current;
- Relation rel;
- rel = scan->relation;
- current = &(scan->currentItemData);
+ Assert(BTScanPosIsValid(so->currPos));
+
+ /* Before leaving current page, deal with any killed items */
+ if (so->numKilled > 0)
+ _bt_killitems(scan);
/*
- * Don't use ItemPointerGetOffsetNumber or you risk to get assertion
- * due to ability of ip_posid to be equal 0.
+ * Before we modify currPos, make a copy of the page data if there was a
+ * mark position that needs it.
*/
- offnum = current->ip_posid;
- page = BufferGetPage(*bufP);
- opaque = (BTPageOpaque) PageGetSpecialPointer(page);
- so = (BTScanOpaque) scan->opaque;
- maxoff = PageGetMaxOffsetNumber(page);
+ if (so->markItemIndex >= 0)
+ {
+ /* bump pin on current buffer for assignment to mark buffer */
+ if (BTScanPosIsPinned(so->currPos))
+ IncrBufferRefCount(so->currPos.buf);
+ memcpy(&so->markPos, &so->currPos,
+ offsetof(BTScanPosData, items[1]) +
+ so->currPos.lastItem * sizeof(BTScanPosItem));
+ if (so->markTuples)
+ memcpy(so->markTuples, so->currTuples,
+ so->currPos.nextTupleOffset);
+ so->markPos.itemIndex = so->markItemIndex;
+ so->markItemIndex = -1;
+ }
+
+ rel = scan->indexRelation;
- /* get the next tuple */
if (ScanDirectionIsForward(dir))
{
- if (!PageIsEmpty(page) && offnum < maxoff)
- offnum = OffsetNumberNext(offnum);
- else
- {
+ /* Walk right to the next page with data */
+ /* We must rely on the previously saved nextPage link! */
+ BlockNumber blkno = so->currPos.nextPage;
- /* if we're at end of scan, release the buffer and return */
- blkno = opaque->btpo_next;
- if (P_RIGHTMOST(opaque))
+ /* Remember we left a page with data */
+ so->currPos.moreLeft = true;
+
+ /* release the previous buffer, if pinned */
+ BTScanPosUnpinIfPinned(so->currPos);
+
+ for (;;)
+ {
+ /* if we're at end of scan, give up */
+ if (blkno == P_NONE || !so->currPos.moreRight)
{
- _bt_relbuf(rel, *bufP, BT_READ);
- ItemPointerSetInvalid(current);
- *bufP = so->btso_curbuf = InvalidBuffer;
+ BTScanPosInvalidate(so->currPos);
return false;
}
- else
+ /* check for interrupts while we're not holding any buffer lock */
+ CHECK_FOR_INTERRUPTS();
+ /* step right one page */
+ so->currPos.buf = _bt_getbuf(rel, blkno, BT_READ);
+ /* check for deleted page */
+ page = BufferGetPage(so->currPos.buf);
+ TestForOldSnapshot(scan->xs_snapshot, rel, page);
+ opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+ if (!P_IGNORE(opaque))
{
-
- /* walk right to the next page with data */
- _bt_relbuf(rel, *bufP, BT_READ);
- for (;;)
- {
- *bufP = _bt_getbuf(rel, blkno, BT_READ);
- page = BufferGetPage(*bufP);
- opaque = (BTPageOpaque) PageGetSpecialPointer(page);
- maxoff = PageGetMaxOffsetNumber(page);
- start = P_RIGHTMOST(opaque) ? P_HIKEY : P_FIRSTKEY;
-
- if (!PageIsEmpty(page) && start <= maxoff)
- break;
- else
- {
- blkno = opaque->btpo_next;
- _bt_relbuf(rel, *bufP, BT_READ);
- if (blkno == P_NONE)
- {
- *bufP = so->btso_curbuf = InvalidBuffer;
- ItemPointerSetInvalid(current);
- return false;
- }
- }
- }
- offnum = start;
+ PredicateLockPage(rel, blkno, scan->xs_snapshot);
+ /* see if there are any matches on this page */
+ /* note that this will clear moreRight if we can stop */
+ if (_bt_readpage(scan, dir, P_FIRSTDATAKEY(opaque)))
+ break;
}
+
+ /* nope, keep going */
+ blkno = opaque->btpo_next;
+ _bt_relbuf(rel, so->currPos.buf);
}
}
- else if (ScanDirectionIsBackward(dir))
+ else
{
+ /* Remember we left a page with data */
+ so->currPos.moreRight = true;
- /* remember that high key is item zero on non-rightmost pages */
- start = P_RIGHTMOST(opaque) ? P_HIKEY : P_FIRSTKEY;
-
- if (offnum > start)
- offnum = OffsetNumberPrev(offnum);
+ /*
+ * Walk left to the next page with data. This is much more complex
+ * than the walk-right case because of the possibility that the page
+ * to our left splits while we are in flight to it, plus the
+ * possibility that the page we were on gets deleted after we leave
+ * it. See nbtree/README for details.
+ *
+ * It might be possible to rearrange this code to have less overhead
+ * in pinning and locking, but that would require capturing the left
+ * pointer when the page is initially read, and using it here, along
+ * with big changes to _bt_walk_left() and the code below. It is not
+ * clear whether this would be a win, since if the page immediately to
+ * the left splits after we read this page and before we step left, we
+ * would need to visit more pages than with the current code.
+ *
+ * Note that if we change the code so that we drop the pin for a scan
+ * which uses a non-MVCC snapshot, we will need to modify the code for
+ * walking left, to allow for the possibility that a referenced page
+ * has been deleted. As long as the buffer is pinned or the snapshot
+ * is MVCC the page cannot move past the half-dead state to fully
+ * deleted.
+ */
+ if (BTScanPosIsPinned(so->currPos))
+ LockBuffer(so->currPos.buf, BT_READ);
else
- {
+ so->currPos.buf = _bt_getbuf(rel, so->currPos.currPage, BT_READ);
- /* if we're at end of scan, release the buffer and return */
- blkno = opaque->btpo_prev;
- if (P_LEFTMOST(opaque))
+ for (;;)
+ {
+ /* Done if we know there are no matching keys to the left */
+ if (!so->currPos.moreLeft)
{
- _bt_relbuf(rel, *bufP, BT_READ);
- *bufP = so->btso_curbuf = InvalidBuffer;
- ItemPointerSetInvalid(current);
+ _bt_relbuf(rel, so->currPos.buf);
+ BTScanPosInvalidate(so->currPos);
return false;
}
- else
- {
- obknum = BufferGetBlockNumber(*bufP);
+ /* Step to next physical page */
+ so->currPos.buf = _bt_walk_left(rel, so->currPos.buf,
+ scan->xs_snapshot);
- /* walk right to the next page with data */
- _bt_relbuf(rel, *bufP, BT_READ);
- for (;;)
- {
- *bufP = _bt_getbuf(rel, blkno, BT_READ);
- page = BufferGetPage(*bufP);
- opaque = (BTPageOpaque) PageGetSpecialPointer(page);
- maxoff = PageGetMaxOffsetNumber(page);
-
- /*
- * If the adjacent page just split, then we may have
- * the wrong block. Handle this case. Because pages
- * only split right, we don't have to worry about this
- * failing to terminate.
- */
-
- while (opaque->btpo_next != obknum)
- {
- blkno = opaque->btpo_next;
- _bt_relbuf(rel, *bufP, BT_READ);
- *bufP = _bt_getbuf(rel, blkno, BT_READ);
- page = BufferGetPage(*bufP);
- opaque = (BTPageOpaque) PageGetSpecialPointer(page);
- maxoff = PageGetMaxOffsetNumber(page);
- }
-
- /* don't consider the high key */
- start = P_RIGHTMOST(opaque) ? P_HIKEY : P_FIRSTKEY;
+ /* if we're physically at end of index, return failure */
+ if (so->currPos.buf == InvalidBuffer)
+ {
+ BTScanPosInvalidate(so->currPos);
+ return false;
+ }
- /* anything to look at here? */
- if (!PageIsEmpty(page) && maxoff >= start)
- break;
- else
- {
- blkno = opaque->btpo_prev;
- obknum = BufferGetBlockNumber(*bufP);
- _bt_relbuf(rel, *bufP, BT_READ);
- if (blkno == P_NONE)
- {
- *bufP = so->btso_curbuf = InvalidBuffer;
- ItemPointerSetInvalid(current);
- return false;
- }
- }
- }
- offnum = maxoff;/* XXX PageIsEmpty? */
+ /*
+ * Okay, we managed to move left to a non-deleted page. Done if
+ * it's not half-dead and contains matching tuples. Else loop back
+ * and do it all again.
+ */
+ page = BufferGetPage(so->currPos.buf);
+ TestForOldSnapshot(scan->xs_snapshot, rel, page);
+ opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+ if (!P_IGNORE(opaque))
+ {
+ PredicateLockPage(rel, BufferGetBlockNumber(so->currPos.buf), scan->xs_snapshot);
+ /* see if there are any matches on this page */
+ /* note that this will clear moreLeft if we can stop */
+ if (_bt_readpage(scan, dir, PageGetMaxOffsetNumber(page)))
+ break;
}
}
}
- blkno = BufferGetBlockNumber(*bufP);
- so->btso_curbuf = *bufP;
- ItemPointerSet(current, blkno, offnum);
+
+ /* Drop the lock, and maybe the pin, on the current page */
+ _bt_drop_lock_and_maybe_pin(scan, &so->currPos);
return true;
}
/*
- * _bt_twostep() -- Move to an adjacent record in a scan on the tree,
- * if an adjacent record exists.
+ * _bt_walk_left() -- step left one page, if possible
*
- * This is like _bt_step, except that if no adjacent record exists
- * it restores us to where we were before trying the step. This is
- * only hairy when you cross page boundaries, since the page you cross
- * from could have records inserted or deleted, or could even split.
- * This is unlikely, but we try to handle it correctly here anyway.
+ * The given buffer must be pinned and read-locked. This will be dropped
+ * before stepping left. On return, we have pin and read lock on the
+ * returned page, instead.
*
- * This routine contains the only case in which our changes to Lehman
- * and Yao's algorithm.
+ * Returns InvalidBuffer if there is no page to the left (no lock is held
+ * in that case).
*
- * Like step, this routine leaves the scan's currentItemData in the
- * proper state and acquires a lock and pin on *bufP. If the twostep
- * succeeded, we return true; otherwise, we return false.
+ * When working on a non-leaf level, it is possible for the returned page
+ * to be half-dead; the caller should check that condition and step left
+ * again if it's important.
*/
-static bool
-_bt_twostep(IndexScanDesc scan, Buffer *bufP, ScanDirection dir)
+static Buffer
+_bt_walk_left(Relation rel, Buffer buf, Snapshot snapshot)
{
Page page;
BTPageOpaque opaque;
- OffsetNumber offnum,
- maxoff;
- OffsetNumber start;
- ItemPointer current;
- ItemId itemid;
- int itemsz;
- BTItem btitem;
- BTItem svitem;
- BlockNumber blkno;
- blkno = BufferGetBlockNumber(*bufP);
- page = BufferGetPage(*bufP);
+ page = BufferGetPage(buf);
opaque = (BTPageOpaque) PageGetSpecialPointer(page);
- maxoff = PageGetMaxOffsetNumber(page);
- current = &(scan->currentItemData);
- offnum = ItemPointerGetOffsetNumber(current);
- start = P_RIGHTMOST(opaque) ? P_HIKEY : P_FIRSTKEY;
-
- /* if we're safe, just do it */
- if (ScanDirectionIsForward(dir) && offnum < maxoff)
- { /* XXX PageIsEmpty? */
- ItemPointerSet(current, blkno, OffsetNumberNext(offnum));
- return true;
- }
- else if (ScanDirectionIsBackward(dir) && offnum > start)
+ for (;;)
{
- ItemPointerSet(current, blkno, OffsetNumberPrev(offnum));
- return true;
- }
-
- /* if we've hit end of scan we don't have to do any work */
- if (ScanDirectionIsForward(dir) && P_RIGHTMOST(opaque))
- return false;
- else if (ScanDirectionIsBackward(dir) && P_LEFTMOST(opaque))
- return false;
+ BlockNumber obknum;
+ BlockNumber lblkno;
+ BlockNumber blkno;
+ int tries;
- /*
- * Okay, it's off the page; let _bt_step() do the hard work, and we'll
- * try to remember where we were. This is not guaranteed to work;
- * this is the only place in the code where concurrency can screw us
- * up, and it's because we want to be able to move in two directions
- * in the scan.
- */
-
- itemid = PageGetItemId(page, offnum);
- itemsz = ItemIdGetLength(itemid);
- btitem = (BTItem) PageGetItem(page, itemid);
- svitem = (BTItem) palloc(itemsz);
- memmove((char *) svitem, (char *) btitem, itemsz);
+ /* if we're at end of tree, release buf and return failure */
+ if (P_LEFTMOST(opaque))
+ {
+ _bt_relbuf(rel, buf);
+ break;
+ }
+ /* remember original page we are stepping left from */
+ obknum = BufferGetBlockNumber(buf);
+ /* step left */
+ blkno = lblkno = opaque->btpo_prev;
+ _bt_relbuf(rel, buf);
+ /* check for interrupts while we're not holding any buffer lock */
+ CHECK_FOR_INTERRUPTS();
+ buf = _bt_getbuf(rel, blkno, BT_READ);
+ page = BufferGetPage(buf);
+ TestForOldSnapshot(snapshot, rel, page);
+ opaque = (BTPageOpaque) PageGetSpecialPointer(page);
- if (_bt_step(scan, bufP, dir))
- {
- pfree(svitem);
- return true;
- }
+ /*
+ * If this isn't the page we want, walk right till we find what we
+ * want --- but go no more than four hops (an arbitrary limit). If we
+ * don't find the correct page by then, the most likely bet is that
+ * the original page got deleted and isn't in the sibling chain at all
+ * anymore, not that its left sibling got split more than four times.
+ *
+ * Note that it is correct to test P_ISDELETED not P_IGNORE here,
+ * because half-dead pages are still in the sibling chain. Caller
+ * must reject half-dead pages if wanted.
+ */
+ tries = 0;
+ for (;;)
+ {
+ if (!P_ISDELETED(opaque) && opaque->btpo_next == obknum)
+ {
+ /* Found desired page, return it */
+ return buf;
+ }
+ if (P_RIGHTMOST(opaque) || ++tries > 4)
+ break;
+ blkno = opaque->btpo_next;
+ buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);
+ page = BufferGetPage(buf);
+ TestForOldSnapshot(snapshot, rel, page);
+ opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+ }
- /* try to find our place again */
- *bufP = _bt_getbuf(scan->relation, blkno, BT_READ);
- page = BufferGetPage(*bufP);
- maxoff = PageGetMaxOffsetNumber(page);
+ /* Return to the original page to see what's up */
+ buf = _bt_relandgetbuf(rel, buf, obknum, BT_READ);
+ page = BufferGetPage(buf);
+ TestForOldSnapshot(snapshot, rel, page);
+ opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+ if (P_ISDELETED(opaque))
+ {
+ /*
+ * It was deleted. Move right to first nondeleted page (there
+ * must be one); that is the page that has acquired the deleted
+ * one's keyspace, so stepping left from it will take us where we
+ * want to be.
+ */
+ for (;;)
+ {
+ if (P_RIGHTMOST(opaque))
+ elog(ERROR, "fell off the end of index \"%s\"",
+ RelationGetRelationName(rel));
+ blkno = opaque->btpo_next;
+ buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);
+ page = BufferGetPage(buf);
+ TestForOldSnapshot(snapshot, rel, page);
+ opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+ if (!P_ISDELETED(opaque))
+ break;
+ }
- while (offnum <= maxoff)
- {
- itemid = PageGetItemId(page, offnum);
- btitem = (BTItem) PageGetItem(page, itemid);
- if (BTItemSame(btitem, svitem))
+ /*
+ * Now return to top of loop, resetting obknum to point to this
+ * nondeleted page, and try again.
+ */
+ }
+ else
{
- pfree(svitem);
- ItemPointerSet(current, blkno, offnum);
- return false;
+ /*
+ * It wasn't deleted; the explanation had better be that the page
+ * to the left got split or deleted. Without this check, we'd go
+ * into an infinite loop if there's anything wrong.
+ */
+ if (opaque->btpo_prev == lblkno)
+ elog(ERROR, "could not find left sibling of block %u in index \"%s\"",
+ obknum, RelationGetRelationName(rel));
+ /* Okay to try again with new lblkno value */
}
}
- /*
- * XXX crash and burn -- can't find our place. We can be a little
- * smarter -- walk to the next page to the right, for example, since
- * that's the only direction that splits happen in. Deletions screw
- * us up less often since they're only done by the vacuum daemon.
- */
-
- elog(ERROR, "btree synchronization error: concurrent update botched scan");
-
- return false;
+ return InvalidBuffer;
}
/*
- * _bt_endpoint() -- Find the first or last key in the index.
+ * _bt_get_endpoint() -- Find the first or last page on a given tree level
+ *
+ * If the index is empty, we will return InvalidBuffer; any other failure
+ * condition causes ereport(). We will not return a dead page.
+ *
+ * The returned buffer is pinned and read-locked.
*/
-static RetrieveIndexResult
-_bt_endpoint(IndexScanDesc scan, ScanDirection dir)
+Buffer
+_bt_get_endpoint(Relation rel, uint32 level, bool rightmost,
+ Snapshot snapshot)
{
- Relation rel;
Buffer buf;
Page page;
BTPageOpaque opaque;
- ItemPointer current;
- OffsetNumber offnum,
- maxoff;
- OffsetNumber start = 0;
+ OffsetNumber offnum;
BlockNumber blkno;
- BTItem btitem;
IndexTuple itup;
- BTScanOpaque so;
- RetrieveIndexResult res;
- Size keysok;
- rel = scan->relation;
- current = &(scan->currentItemData);
- so = (BTScanOpaque) scan->opaque;
+ /*
+ * If we are looking for a leaf page, okay to descend from fast root;
+ * otherwise better descend from true root. (There is no point in being
+ * smarter about intermediate levels.)
+ */
+ if (level == 0)
+ buf = _bt_getroot(rel, BT_READ);
+ else
+ buf = _bt_gettrueroot(rel);
+
+ if (!BufferIsValid(buf))
+ return InvalidBuffer;
- buf = _bt_getroot(rel, BT_READ);
- blkno = BufferGetBlockNumber(buf);
page = BufferGetPage(buf);
+ TestForOldSnapshot(snapshot, rel, page);
opaque = (BTPageOpaque) PageGetSpecialPointer(page);
for (;;)
{
- if (opaque->btpo_flags & BTP_LEAF)
+ /*
+ * If we landed on a deleted page, step right to find a live page
+ * (there must be one). Also, if we want the rightmost page, step
+ * right if needed to get to it (this could happen if the page split
+ * since we obtained a pointer to it).
+ */
+ while (P_IGNORE(opaque) ||
+ (rightmost && !P_RIGHTMOST(opaque)))
+ {
+ blkno = opaque->btpo_next;
+ if (blkno == P_NONE)
+ elog(ERROR, "fell off the end of index \"%s\"",
+ RelationGetRelationName(rel));
+ buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);
+ page = BufferGetPage(buf);
+ TestForOldSnapshot(snapshot, rel, page);
+ opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+ }
+
+ /* Done? */
+ if (opaque->btpo.level == level)
break;
+ if (opaque->btpo.level < level)
+ elog(ERROR, "btree level %u not found in index \"%s\"",
+ level, RelationGetRelationName(rel));
- if (ScanDirectionIsForward(dir))
- offnum = P_RIGHTMOST(opaque) ? P_HIKEY : P_FIRSTKEY;
- else
+ /* Descend to leftmost or rightmost child page */
+ if (rightmost)
offnum = PageGetMaxOffsetNumber(page);
+ else
+ offnum = P_FIRSTDATAKEY(opaque);
- btitem = (BTItem) PageGetItem(page, PageGetItemId(page, offnum));
- itup = &(btitem->bti_itup);
-
+ itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
blkno = ItemPointerGetBlockNumber(&(itup->t_tid));
- _bt_relbuf(rel, buf, BT_READ);
- buf = _bt_getbuf(rel, blkno, BT_READ);
+ buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);
page = BufferGetPage(buf);
opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+ }
- /*
- * Race condition: If the child page we just stepped onto is in
- * the process of being split, we need to make sure we're all the
- * way at the right edge of the tree. See the paper by Lehman and
- * Yao.
- */
+ return buf;
+}
- if (ScanDirectionIsBackward(dir) && !P_RIGHTMOST(opaque))
- {
- do
- {
- blkno = opaque->btpo_next;
- _bt_relbuf(rel, buf, BT_READ);
- buf = _bt_getbuf(rel, blkno, BT_READ);
- page = BufferGetPage(buf);
- opaque = (BTPageOpaque) PageGetSpecialPointer(page);
- } while (!P_RIGHTMOST(opaque));
- }
- }
+/*
+ * _bt_endpoint() -- Find the first or last page in the index, and scan
+ * from there to the first key satisfying all the quals.
+ *
+ * This is used by _bt_first() to set up a scan when we've determined
+ * that the scan must start at the beginning or end of the index (for
+ * a forward or backward scan respectively). Exit conditions are the
+ * same as for _bt_first().
+ */
+static bool
+_bt_endpoint(IndexScanDesc scan, ScanDirection dir)
+{
+ Relation rel = scan->indexRelation;
+ BTScanOpaque so = (BTScanOpaque) scan->opaque;
+ Buffer buf;
+ Page page;
+ BTPageOpaque opaque;
+ OffsetNumber start;
+ BTScanPosItem *currItem;
- /* okay, we've got the {left,right}-most page in the tree */
- maxoff = PageGetMaxOffsetNumber(page);
+ /*
+ * Scan down to the leftmost or rightmost leaf page. This is a simplified
+ * version of _bt_search(). We don't maintain a stack since we know we
+ * won't need it.
+ */
+ buf = _bt_get_endpoint(rel, 0, ScanDirectionIsBackward(dir), scan->xs_snapshot);
- if (ScanDirectionIsForward(dir))
+ if (!BufferIsValid(buf))
{
- if (!P_LEFTMOST(opaque))/* non-leftmost page ? */
- elog(ERROR, "_bt_endpoint: leftmost page (%u) has not leftmost flag", blkno);
- start = P_RIGHTMOST(opaque) ? P_HIKEY : P_FIRSTKEY;
-
/*
- * I don't understand this stuff! It doesn't work for
- * non-rightmost pages with only one element (P_HIKEY) which we
- * have after deletion itups by vacuum (it's case of start >
- * maxoff). Scanning in BackwardScanDirection is not
- * understandable at all. Well - new stuff. - vadim 12/06/96
+ * Empty index. Lock the whole relation, as nothing finer to lock
+ * exists.
*/
-#ifdef NOT_USED
- if (PageIsEmpty(page) || start > maxoff)
- {
- ItemPointerSet(current, blkno, maxoff);
- if (!_bt_step(scan, &buf, BackwardScanDirection))
- return (RetrieveIndexResult) NULL;
+ PredicateLockRelation(rel, scan->xs_snapshot);
+ BTScanPosInvalidate(so->currPos);
+ return false;
+ }
- start = ItemPointerGetOffsetNumber(current);
- page = BufferGetPage(buf);
- }
-#endif
- if (PageIsEmpty(page))
- {
- if (start != P_HIKEY) /* non-rightmost page */
- elog(ERROR, "_bt_endpoint: non-rightmost page (%u) is empty", blkno);
+ PredicateLockPage(rel, BufferGetBlockNumber(buf), scan->xs_snapshot);
+ page = BufferGetPage(buf);
+ opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+ Assert(P_ISLEAF(opaque));
- /*
- * It's left- & right- most page - root page, - and it's
- * empty...
- */
- _bt_relbuf(rel, buf, BT_READ);
- ItemPointerSetInvalid(current);
- so->btso_curbuf = InvalidBuffer;
- return (RetrieveIndexResult) NULL;
- }
- if (start > maxoff) /* start == 2 && maxoff == 1 */
- {
- ItemPointerSet(current, blkno, maxoff);
- if (!_bt_step(scan, &buf, ForwardScanDirection))
- return (RetrieveIndexResult) NULL;
+ if (ScanDirectionIsForward(dir))
+ {
+ /* There could be dead pages to the left, so not this: */
+ /* Assert(P_LEFTMOST(opaque)); */
- start = ItemPointerGetOffsetNumber(current);
- page = BufferGetPage(buf);
- }
- /* new stuff ends here */
- else
- ItemPointerSet(current, blkno, start);
+ start = P_FIRSTDATAKEY(opaque);
}
else if (ScanDirectionIsBackward(dir))
{
+ Assert(P_RIGHTMOST(opaque));
- /*
- * I don't understand this stuff too! If RIGHT-most leaf page is
- * empty why do scanning in ForwardScanDirection ??? Well - new
- * stuff. - vadim 12/06/96
- */
-#ifdef NOT_USED
- if (PageIsEmpty(page))
- {
- ItemPointerSet(current, blkno, FirstOffsetNumber);
- if (!_bt_step(scan, &buf, ForwardScanDirection))
- return (RetrieveIndexResult) NULL;
-
- start = ItemPointerGetOffsetNumber(current);
- page = BufferGetPage(buf);
- }
-#endif
- if (PageIsEmpty(page))
- {
- /* If it's leftmost page too - it's empty root page... */
- if (P_LEFTMOST(opaque))
- {
- _bt_relbuf(rel, buf, BT_READ);
- ItemPointerSetInvalid(current);
- so->btso_curbuf = InvalidBuffer;
- return (RetrieveIndexResult) NULL;
- }
- /* Go back ! */
- ItemPointerSet(current, blkno, FirstOffsetNumber);
- if (!_bt_step(scan, &buf, BackwardScanDirection))
- return (RetrieveIndexResult) NULL;
-
- start = ItemPointerGetOffsetNumber(current);
- page = BufferGetPage(buf);
- }
- /* new stuff ends here */
- else
- {
- start = PageGetMaxOffsetNumber(page);
- ItemPointerSet(current, blkno, start);
- }
+ start = PageGetMaxOffsetNumber(page);
}
else
- elog(ERROR, "Illegal scan direction %d", dir);
+ {
+ elog(ERROR, "invalid scan direction: %d", (int) dir);
+ start = 0; /* keep compiler quiet */
+ }
- btitem = (BTItem) PageGetItem(page, PageGetItemId(page, start));
- itup = &(btitem->bti_itup);
+ /* remember which buffer we have pinned */
+ so->currPos.buf = buf;
- /* see if we picked a winner */
- if (_bt_checkkeys(scan, itup, &keysok))
+ /* initialize moreLeft/moreRight appropriately for scan direction */
+ if (ScanDirectionIsForward(dir))
{
- res = FormRetrieveIndexResult(current, &(itup->t_tid));
-
- /* remember which buffer we have pinned */
- so->btso_curbuf = buf;
+ so->currPos.moreLeft = false;
+ so->currPos.moreRight = true;
}
- else if (keysok >= so->numberOfFirstKeys)
+ else
{
- so->btso_curbuf = buf;
- return _bt_next(scan, dir);
+ so->currPos.moreLeft = true;
+ so->currPos.moreRight = false;
}
- else if (keysok == -1 && ScanDirectionIsBackward(dir))
+ so->numKilled = 0; /* just paranoia */
+ so->markItemIndex = -1; /* ditto */
+
+ /*
+ * Now load data from the first page of the scan.
+ */
+ if (!_bt_readpage(scan, dir, start))
{
- so->btso_curbuf = buf;
- return _bt_next(scan, dir);
+ /*
+ * There's no actually-matching data on this page. Try to advance to
+ * the next page. Return false if there's no matching data at all.
+ */
+ LockBuffer(so->currPos.buf, BUFFER_LOCK_UNLOCK);
+ if (!_bt_steppage(scan, dir))
+ return false;
}
else
{
- ItemPointerSetInvalid(current);
- so->btso_curbuf = InvalidBuffer;
- _bt_relbuf(rel, buf, BT_READ);
- res = (RetrieveIndexResult) NULL;
+ /* Drop the lock, and maybe the pin, on the current page */
+ _bt_drop_lock_and_maybe_pin(scan, &so->currPos);
}
- return res;
+ /* OK, itemIndex says what to return */
+ currItem = &so->currPos.items[so->currPos.itemIndex];
+ scan->xs_ctup.t_self = currItem->heapTid;
+ if (scan->xs_want_itup)
+ scan->xs_itup = (IndexTuple) (so->currTuples + currItem->tupleOffset);
+
+ return true;
}