]> granicus.if.org Git - postgresql/blobdiff - src/backend/access/nbtree/nbtsearch.c
Update copyright via script for 2017
[postgresql] / src / backend / access / nbtree / nbtsearch.c
index 0ad8b4ab36f996f5efccfa17e1d0f990e8feb82c..4fba75aaf66a613ed198894384c2708f5d47ea29 100644 (file)
 /*-------------------------------------------------------------------------
  *
- * btsearch.c
- *       search code for postgres btrees.
+ * nbtsearch.c
+ *       Search code for postgres btrees.
  *
- * Copyright (c) 1994, Regents of the University of California
  *
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
  *
  * IDENTIFICATION
- *       $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtsearch.c,v 1.54 1999/09/27 18:20:21 momjian Exp $
+ *       src/backend/access/nbtree/nbtsearch.c
  *
  *-------------------------------------------------------------------------
  */
 
 #include "postgres.h"
 
-#include "access/genam.h"
 #include "access/nbtree.h"
+#include "access/relscan.h"
+#include "miscadmin.h"
+#include "pgstat.h"
+#include "storage/predicate.h"
+#include "utils/lsyscache.h"
+#include "utils/rel.h"
+#include "utils/tqual.h"
 
 
+static bool _bt_readpage(IndexScanDesc scan, ScanDirection dir,
+                        OffsetNumber offnum);
+static void _bt_saveitem(BTScanOpaque so, int itemIndex,
+                        OffsetNumber offnum, IndexTuple itup);
+static bool _bt_steppage(IndexScanDesc scan, ScanDirection dir);
+static Buffer _bt_walk_left(Relation rel, Buffer buf, Snapshot snapshot);
+static bool _bt_endpoint(IndexScanDesc scan, ScanDirection dir);
+static void _bt_drop_lock_and_maybe_pin(IndexScanDesc scan, BTScanPos sp);
 
-static BTStack _bt_searchr(Relation rel, int keysz, ScanKey scankey,
-                       Buffer *bufP, BTStack stack_in);
-static int _bt_compare(Relation rel, TupleDesc itupdesc, Page page,
-                       int keysz, ScanKey scankey, OffsetNumber offnum);
-static bool
-                       _bt_twostep(IndexScanDesc scan, Buffer *bufP, ScanDirection dir);
-static RetrieveIndexResult
-                       _bt_endpoint(IndexScanDesc scan, ScanDirection dir);
 
 /*
- *     _bt_search() -- Search for a scan key in the index.
+ *     _bt_drop_lock_and_maybe_pin()
+ *
+ * Unlock the buffer; and if it is safe to release the pin, do that, too.  It
+ * is safe if the scan is using an MVCC snapshot and the index is WAL-logged.
+ * This will prevent vacuum from stalling in a blocked state trying to read a
+ * page when a cursor is sitting on it -- at least in many important cases.
  *
- *             This routine is actually just a helper that sets things up and
- *             calls a recursive-descent search routine on the tree.
+ * Set the buffer to invalid if the pin is released, since the buffer may be
+ * re-used.  If we need to go back to this block (for example, to apply
+ * LP_DEAD hints) we must get a fresh reference to the buffer.  Hopefully it
+ * will remain in shared memory for as long as it takes to scan the index
+ * buffer page.
  */
-BTStack
-_bt_search(Relation rel, int keysz, ScanKey scankey, Buffer *bufP)
+static void
+_bt_drop_lock_and_maybe_pin(IndexScanDesc scan, BTScanPos sp)
 {
-       *bufP = _bt_getroot(rel, BT_READ);
-       return _bt_searchr(rel, keysz, scankey, bufP, (BTStack) NULL);
+       LockBuffer(sp->buf, BUFFER_LOCK_UNLOCK);
+
+       if (IsMVCCSnapshot(scan->xs_snapshot) &&
+               RelationNeedsWAL(scan->indexRelation) &&
+               !scan->xs_want_itup)
+       {
+               ReleaseBuffer(sp->buf);
+               sp->buf = InvalidBuffer;
+       }
 }
 
+
 /*
- *     _bt_searchr() -- Search the tree recursively for a particular scankey.
+ *     _bt_search() -- Search the tree for a particular scankey,
+ *             or more precisely for the first leaf page it could be on.
+ *
+ * The passed scankey must be an insertion-type scankey (see nbtree/README),
+ * but it can omit the rightmost column(s) of the index.
+ *
+ * When nextkey is false (the usual case), we are looking for the first
+ * item >= scankey.  When nextkey is true, we are looking for the first
+ * item strictly greater than scankey.
+ *
+ * Return value is a stack of parent-page pointers.  *bufP is set to the
+ * address of the leaf-page buffer, which is read-locked and pinned.
+ * No locks are held on the parent pages, however!
+ *
+ * If the snapshot parameter is not NULL, "old snapshot" checking will take
+ * place during the descent through the tree.  This is not needed when
+ * positioning for an insert or delete, so NULL is used for those cases.
+ *
+ * NOTE that the returned buffer is read-locked regardless of the access
+ * parameter.  However, access = BT_WRITE will allow an empty root page
+ * to be created and returned.  When access = BT_READ, an empty index
+ * will result in *bufP being set to InvalidBuffer.  Also, in BT_WRITE mode,
+ * any incomplete splits encountered during the search will be finished.
  */
-static BTStack
-_bt_searchr(Relation rel,
-                       int keysz,
-                       ScanKey scankey,
-                       Buffer *bufP,
-                       BTStack stack_in)
+BTStack
+_bt_search(Relation rel, int keysz, ScanKey scankey, bool nextkey,
+                  Buffer *bufP, int access, Snapshot snapshot)
 {
-       BTStack         stack;
-       OffsetNumber offnum;
-       Page            page;
-       BTPageOpaque opaque;
-       BlockNumber par_blkno;
-       BlockNumber blkno;
-       ItemId          itemid;
-       BTItem          btitem;
-       BTItem          item_save;
-       int                     item_nbytes;
-       IndexTuple      itup;
+       BTStack         stack_in = NULL;
 
-       /* if this is a leaf page, we're done */
-       page = BufferGetPage(*bufP);
-       opaque = (BTPageOpaque) PageGetSpecialPointer(page);
-       if (opaque->btpo_flags & BTP_LEAF)
-               return stack_in;
+       /* Get the root page to start with */
+       *bufP = _bt_getroot(rel, access);
 
-       /*
-        * Find the appropriate item on the internal page, and get the child
-        * page that it points to.
-        */
+       /* If index is empty and access = BT_READ, no root page is created. */
+       if (!BufferIsValid(*bufP))
+               return (BTStack) NULL;
 
-       par_blkno = BufferGetBlockNumber(*bufP);
-       offnum = _bt_binsrch(rel, *bufP, keysz, scankey, BT_DESCENT);
-       itemid = PageGetItemId(page, offnum);
-       btitem = (BTItem) PageGetItem(page, itemid);
-       itup = &(btitem->bti_itup);
-       blkno = ItemPointerGetBlockNumber(&(itup->t_tid));
+       /* Loop iterates once per level descended in the tree */
+       for (;;)
+       {
+               Page            page;
+               BTPageOpaque opaque;
+               OffsetNumber offnum;
+               ItemId          itemid;
+               IndexTuple      itup;
+               BlockNumber blkno;
+               BlockNumber par_blkno;
+               BTStack         new_stack;
 
-       /*
-        * We need to save the bit image of the index entry we chose in the
-        * parent page on a stack.      In case we split the tree, we'll use this
-        * bit image to figure out what our real parent page is, in case the
-        * parent splits while we're working lower in the tree.  See the paper
-        * by Lehman and Yao for how this is detected and handled.      (We use
-        * unique OIDs to disambiguate duplicate keys in the index -- Lehman
-        * and Yao disallow duplicate keys).
-        */
+               /*
+                * Race -- the page we just grabbed may have split since we read its
+                * pointer in the parent (or metapage).  If it has, we may need to
+                * move right to its new sibling.  Do that.
+                *
+                * In write-mode, allow _bt_moveright to finish any incomplete splits
+                * along the way.  Strictly speaking, we'd only need to finish an
+                * incomplete split on the leaf page we're about to insert to, not on
+                * any of the upper levels (they are taken care of in _bt_getstackbuf,
+                * if the leaf page is split and we insert to the parent page).  But
+                * this is a good opportunity to finish splits of internal pages too.
+                */
+               *bufP = _bt_moveright(rel, *bufP, keysz, scankey, nextkey,
+                                                         (access == BT_WRITE), stack_in,
+                                                         BT_READ, snapshot);
+
+               /* if this is a leaf page, we're done */
+               page = BufferGetPage(*bufP);
+               opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+               if (P_ISLEAF(opaque))
+                       break;
 
-       item_nbytes = ItemIdGetLength(itemid);
-       item_save = (BTItem) palloc(item_nbytes);
-       memmove((char *) item_save, (char *) btitem, item_nbytes);
-       stack = (BTStack) palloc(sizeof(BTStackData));
-       stack->bts_blkno = par_blkno;
-       stack->bts_offset = offnum;
-       stack->bts_btitem = item_save;
-       stack->bts_parent = stack_in;
+               /*
+                * Find the appropriate item on the internal page, and get the child
+                * page that it points to.
+                */
+               offnum = _bt_binsrch(rel, *bufP, keysz, scankey, nextkey);
+               itemid = PageGetItemId(page, offnum);
+               itup = (IndexTuple) PageGetItem(page, itemid);
+               blkno = ItemPointerGetBlockNumber(&(itup->t_tid));
+               par_blkno = BufferGetBlockNumber(*bufP);
 
-       /* drop the read lock on the parent page and acquire one on the child */
-       _bt_relbuf(rel, *bufP, BT_READ);
-       *bufP = _bt_getbuf(rel, blkno, BT_READ);
+               /*
+                * We need to save the location of the index entry we chose in the
+                * parent page on a stack. In case we split the tree, we'll use the
+                * stack to work back up to the parent page.  We also save the actual
+                * downlink (TID) to uniquely identify the index entry, in case it
+                * moves right while we're working lower in the tree.  See the paper
+                * by Lehman and Yao for how this is detected and handled. (We use the
+                * child link to disambiguate duplicate keys in the index -- Lehman
+                * and Yao disallow duplicate keys.)
+                */
+               new_stack = (BTStack) palloc(sizeof(BTStackData));
+               new_stack->bts_blkno = par_blkno;
+               new_stack->bts_offset = offnum;
+               memcpy(&new_stack->bts_btentry, itup, sizeof(IndexTupleData));
+               new_stack->bts_parent = stack_in;
 
-       /*
-        * Race -- the page we just grabbed may have split since we read its
-        * pointer in the parent.  If it has, we may need to move right to its
-        * new sibling.  Do that.
-        */
+               /* drop the read lock on the parent page, acquire one on the child */
+               *bufP = _bt_relandgetbuf(rel, *bufP, blkno, BT_READ);
 
-       *bufP = _bt_moveright(rel, *bufP, keysz, scankey, BT_READ);
+               /* okay, all set to move down a level */
+               stack_in = new_stack;
+       }
 
-       /* okay, all set to move down a level */
-       return _bt_searchr(rel, keysz, scankey, bufP, stack);
+       return stack_in;
 }
 
 /*
  *     _bt_moveright() -- move right in the btree if necessary.
  *
- *             When we drop and reacquire a pointer to a page, it is possible that
- *             the page has changed in the meanwhile.  If this happens, we're
- *             guaranteed that the page has "split right" -- that is, that any
- *             data that appeared on the page originally is either on the page
- *             or strictly to the right of it.
+ * When we follow a pointer to reach a page, it is possible that
+ * the page has changed in the meanwhile.  If this happens, we're
+ * guaranteed that the page has "split right" -- that is, that any
+ * data that appeared on the page originally is either on the page
+ * or strictly to the right of it.
+ *
+ * This routine decides whether or not we need to move right in the
+ * tree by examining the high key entry on the page.  If that entry
+ * is strictly less than the scankey, or <= the scankey in the nextkey=true
+ * case, then we followed the wrong link and we need to move right.
  *
- *             This routine decides whether or not we need to move right in the
- *             tree by examining the high key entry on the page.  If that entry
- *             is strictly less than one we expect to be on the page, then our
- *             picture of the page is incorrect and we need to move right.
+ * The passed scankey must be an insertion-type scankey (see nbtree/README),
+ * but it can omit the rightmost column(s) of the index.
  *
- *             On entry, we have the buffer pinned and a lock of the proper type.
- *             If we move right, we release the buffer and lock and acquire the
- *             same on the right sibling.
+ * When nextkey is false (the usual case), we are looking for the first
+ * item >= scankey.  When nextkey is true, we are looking for the first
+ * item strictly greater than scankey.
+ *
+ * If forupdate is true, we will attempt to finish any incomplete splits
+ * that we encounter.  This is required when locking a target page for an
+ * insertion, because we don't allow inserting on a page before the split
+ * is completed.  'stack' is only used if forupdate is true.
+ *
+ * On entry, we have the buffer pinned and a lock of the type specified by
+ * 'access'.  If we move right, we release the buffer and lock and acquire
+ * the same on the right sibling.  Return value is the buffer we stop at.
+ *
+ * If the snapshot parameter is not NULL, "old snapshot" checking will take
+ * place during the descent through the tree.  This is not needed when
+ * positioning for an insert or delete, so NULL is used for those cases.
  */
 Buffer
 _bt_moveright(Relation rel,
                          Buffer buf,
                          int keysz,
                          ScanKey scankey,
-                         int access)
+                         bool nextkey,
+                         bool forupdate,
+                         BTStack stack,
+                         int access,
+                         Snapshot snapshot)
 {
        Page            page;
        BTPageOpaque opaque;
-       ItemId          hikey;
-       BlockNumber rblkno;
-       int                     natts = rel->rd_rel->relnatts;
-
-       page = BufferGetPage(buf);
-       opaque = (BTPageOpaque) PageGetSpecialPointer(page);
-
-       /* if we're on a rightmost page, we don't need to move right */
-       if (P_RIGHTMOST(opaque))
-               return buf;
-
-       /* by convention, item 0 on non-rightmost pages is the high key */
-       hikey = PageGetItemId(page, P_HIKEY);
+       int32           cmpval;
 
        /*
-        * If the scan key that brought us to this page is >= the high key
-        * stored on the page, then the page has split and we need to move
-        * right.
+        * When nextkey = false (normal case): if the scan key that brought us to
+        * this page is > the high key stored on the page, then the page has split
+        * and we need to move right.  (If the scan key is equal to the high key,
+        * we might or might not need to move right; have to scan the page first
+        * anyway.)
+        *
+        * When nextkey = true: move right if the scan key is >= page's high key.
+        *
+        * The page could even have split more than once, so scan as far as
+        * needed.
+        *
+        * We also have to move right if we followed a link that brought us to a
+        * dead page.
         */
+       cmpval = nextkey ? 0 : 1;
 
-       if (_bt_skeycmp(rel, keysz, scankey, page, hikey,
-                                       BTGreaterEqualStrategyNumber))
+       for (;;)
        {
-               /* move right as long as we need to */
-               do
-               {
-                       OffsetNumber offmax = PageGetMaxOffsetNumber(page);
+               page = BufferGetPage(buf);
+               TestForOldSnapshot(snapshot, rel, page);
+               opaque = (BTPageOpaque) PageGetSpecialPointer(page);
 
-                       /*
-                        * If this page consists of all duplicate keys (hikey and
-                        * first key on the page have the same value), then we don't
-                        * need to step right.
-                        *
-                        * NOTE for multi-column indices: we may do scan using keys not
-                        * for all attrs. But we handle duplicates using all attrs in
-                        * _bt_insert/_bt_spool code. And so we've to compare scankey
-                        * with _last_ item on this page to do not lose "good" tuples
-                        * if number of attrs > keysize. Example: (2,0) - last items
-                        * on this page, (2,1) - first item on next page (hikey), our
-                        * scankey is x = 2. Scankey == (2,1) because of we compare
-                        * first attrs only, but we shouldn't to move right of here. -
-                        * vadim 04/15/97
-                        *
-                        * Also, if this page is not LEAF one (and # of attrs > keysize)
-                        * then we can't move too. - vadim 10/22/97
-                        */
+               if (P_RIGHTMOST(opaque))
+                       break;
 
-                       if (_bt_skeycmp(rel, keysz, scankey, page, hikey,
-                                                       BTEqualStrategyNumber))
+               /*
+                * Finish any incomplete splits we encounter along the way.
+                */
+               if (forupdate && P_INCOMPLETE_SPLIT(opaque))
+               {
+                       BlockNumber blkno = BufferGetBlockNumber(buf);
+
+                       /* upgrade our lock if necessary */
+                       if (access == BT_READ)
                        {
-                               if (opaque->btpo_flags & BTP_CHAIN)
-                               {
-                                       Assert((opaque->btpo_flags & BTP_LEAF) || offmax > P_HIKEY);
-                                       break;
-                               }
-                               if (offmax > P_HIKEY)
-                               {
-                                       if (natts == keysz) /* sanity checks */
-                                       {
-                                               if (_bt_skeycmp(rel, keysz, scankey, page,
-                                                                               PageGetItemId(page, P_FIRSTKEY),
-                                                                               BTEqualStrategyNumber))
-                                                       elog(FATAL, "btree: BTP_CHAIN flag was expected in %s (access = %s)",
-                                                                rel->rd_rel->relname.data, access ? "bt_write" : "bt_read");
-                                               if (_bt_skeycmp(rel, keysz, scankey, page,
-                                                                               PageGetItemId(page, offmax),
-                                                                               BTEqualStrategyNumber))
-                                                       elog(FATAL, "btree: unexpected equal last item");
-                                               if (_bt_skeycmp(rel, keysz, scankey, page,
-                                                                               PageGetItemId(page, offmax),
-                                                                               BTLessStrategyNumber))
-                                                       elog(FATAL, "btree: unexpected greater last item");
-                                               /* move right */
-                                       }
-                                       else if (!(opaque->btpo_flags & BTP_LEAF))
-                                               break;
-                                       else if (_bt_skeycmp(rel, keysz, scankey, page,
-                                                                                PageGetItemId(page, offmax),
-                                                                                BTLessEqualStrategyNumber))
-                                               break;
-                               }
+                               LockBuffer(buf, BUFFER_LOCK_UNLOCK);
+                               LockBuffer(buf, BT_WRITE);
                        }
 
-                       /* step right one page */
-                       rblkno = opaque->btpo_next;
-                       _bt_relbuf(rel, buf, access);
-                       buf = _bt_getbuf(rel, rblkno, access);
-                       page = BufferGetPage(buf);
-                       opaque = (BTPageOpaque) PageGetSpecialPointer(page);
-                       hikey = PageGetItemId(page, P_HIKEY);
-
-               } while (!P_RIGHTMOST(opaque)
-                                && _bt_skeycmp(rel, keysz, scankey, page, hikey,
-                                                               BTGreaterEqualStrategyNumber));
-       }
-       return buf;
-}
-
-/*
- *     _bt_skeycmp() -- compare a scan key to a particular item on a page using
- *                                      a requested strategy (<, <=, =, >=, >).
- *
- *             We ignore the unique OIDs stored in the btree item here.  Those
- *             numbers are intended for use internally only, in repositioning a
- *             scan after a page split.  They do not impose any meaningful ordering.
- *
- *             The comparison is A <op> B, where A is the scan key and B is the
- *             tuple pointed at by itemid on page.
- */
-bool
-_bt_skeycmp(Relation rel,
-                       Size keysz,
-                       ScanKey scankey,
-                       Page page,
-                       ItemId itemid,
-                       StrategyNumber strat)
-{
-       BTItem          item;
-       IndexTuple      indexTuple;
-       TupleDesc       tupDes;
-       ScanKey         entry;
-       int                     i;
-       Datum           attrDatum;
-       Datum           keyDatum;
-       bool            compare;
-       bool            isNull;
-       bool            useEqual = false;
-       bool            keyNull;
-
-       if (strat == BTLessEqualStrategyNumber)
-       {
-               useEqual = true;
-               strat = BTLessStrategyNumber;
-       }
-       else if (strat == BTGreaterEqualStrategyNumber)
-       {
-               useEqual = true;
-               strat = BTGreaterStrategyNumber;
-       }
-
-       item = (BTItem) PageGetItem(page, itemid);
-       indexTuple = &(item->bti_itup);
-
-       tupDes = RelationGetDescr(rel);
-
-       /* see if the comparison is true for all of the key attributes */
-       for (i = 1; i <= keysz; i++)
-       {
-
-               entry = &scankey[i - 1];
-               Assert(entry->sk_attno == i);
-               attrDatum = index_getattr(indexTuple,
-                                                                 entry->sk_attno,
-                                                                 tupDes,
-                                                                 &isNull);
-               keyDatum = entry->sk_argument;
-
-               /* see comments about NULLs handling in btbuild */
-               if (entry->sk_flags & SK_ISNULL)                /* key is NULL */
-               {
-                       Assert(entry->sk_procedure == F_NULLVALUE);
-                       keyNull = true;
-                       if (isNull)
-                               compare = (strat == BTEqualStrategyNumber) ? true : false;
+                       if (P_INCOMPLETE_SPLIT(opaque))
+                               _bt_finish_split(rel, buf, stack);
                        else
-                               compare = (strat == BTGreaterStrategyNumber) ? true : false;
-               }
-               else if (isNull)                /* key is NOT_NULL and item is NULL */
-               {
-                       keyNull = false;
-                       compare = (strat == BTLessStrategyNumber) ? true : false;
-               }
-               else
-               {
-                       keyNull = false;
-                       compare = _bt_invokestrat(rel, i, strat, keyDatum, attrDatum);
+                               _bt_relbuf(rel, buf);
+
+                       /* re-acquire the lock in the right mode, and re-check */
+                       buf = _bt_getbuf(rel, blkno, access);
+                       continue;
                }
 
-               if (compare)                    /* true for one of ">, <, =" */
+               if (P_IGNORE(opaque) || _bt_compare(rel, keysz, scankey, page, P_HIKEY) >= cmpval)
                {
-                       if (strat != BTEqualStrategyNumber)
-                               return true;
+                       /* step right one page */
+                       buf = _bt_relandgetbuf(rel, buf, opaque->btpo_next, access);
+                       continue;
                }
                else
-/* false for one of ">, <, =" */
-               {
-                       if (strat == BTEqualStrategyNumber)
-                               return false;
-
-                       /*
-                        * if original strat was "<=, >=" OR "<, >" but some
-                        * attribute(s) left - need to test for Equality
-                        */
-                       if (useEqual || i < keysz)
-                       {
-                               if (keyNull || isNull)
-                                       compare = (keyNull && isNull) ? true : false;
-                               else
-                                       compare = _bt_invokestrat(rel, i, BTEqualStrategyNumber,
-                                                                                         keyDatum, attrDatum);
-                               if (compare)    /* key' and item' attributes are equal */
-                                       continue;       /* - try to compare next attributes */
-                       }
-                       return false;
-               }
+                       break;
        }
 
-       return true;
+       if (P_IGNORE(opaque))
+               elog(ERROR, "fell off the end of index \"%s\"",
+                        RelationGetRelationName(rel));
+
+       return buf;
 }
 
 /*
  *     _bt_binsrch() -- Do a binary search for a key on a particular page.
  *
- *             The scankey we get has the compare function stored in the procedure
- *             entry of each data struct.      We invoke this regproc to do the
- *             comparison for every key in the scankey.  _bt_binsrch() returns
- *             the OffsetNumber of the first matching key on the page, or the
- *             OffsetNumber at which the matching key would appear if it were
- *             on this page.  (NOTE: in particular, this means it is possible to
- *             return a value 1 greater than the number of keys on the page, if
- *             the scankey is > all keys on the page.)
+ * The passed scankey must be an insertion-type scankey (see nbtree/README),
+ * but it can omit the rightmost column(s) of the index.
+ *
+ * When nextkey is false (the usual case), we are looking for the first
+ * item >= scankey.  When nextkey is true, we are looking for the first
+ * item strictly greater than scankey.
+ *
+ * On a leaf page, _bt_binsrch() returns the OffsetNumber of the first
+ * key >= given scankey, or > scankey if nextkey is true.  (NOTE: in
+ * particular, this means it is possible to return a value 1 greater than the
+ * number of keys on the page, if the scankey is > all keys on the page.)
+ *
+ * On an internal (non-leaf) page, _bt_binsrch() returns the OffsetNumber
+ * of the last key < given scankey, or last key <= given scankey if nextkey
+ * is true.  (Since _bt_compare treats the first data key of such a page as
+ * minus infinity, there will be at least one key < scankey, so the result
+ * always points at one of the keys on the page.)  This key indicates the
+ * right place to descend to be sure we find all leaf keys >= given scankey
+ * (or leaf keys > given scankey when nextkey is true).
  *
- *             By the time this procedure is called, we're sure we're looking
- *             at the right page -- don't need to walk right.  _bt_binsrch() has
- *             no lock or refcount side effects on the buffer.
+ * This procedure is not responsible for walking right, it just examines
+ * the given page.  _bt_binsrch() has no lock or refcount side effects
+ * on the buffer.
  */
 OffsetNumber
 _bt_binsrch(Relation rel,
                        Buffer buf,
                        int keysz,
                        ScanKey scankey,
-                       int srchtype)
+                       bool nextkey)
 {
-       TupleDesc       itupdesc;
        Page            page;
        BTPageOpaque opaque;
        OffsetNumber low,
                                high;
-       bool            haveEq;
-       int                     natts = rel->rd_rel->relnatts;
-       int                     result;
+       int32           result,
+                               cmpval;
 
-       itupdesc = RelationGetDescr(rel);
        page = BufferGetPage(buf);
        opaque = (BTPageOpaque) PageGetSpecialPointer(page);
 
-       /* by convention, item 1 on any non-rightmost page is the high key */
-       low = P_RIGHTMOST(opaque) ? P_HIKEY : P_FIRSTKEY;
-
+       low = P_FIRSTDATAKEY(opaque);
        high = PageGetMaxOffsetNumber(page);
 
        /*
-        * If there are no keys on the page, return the first available slot.
-        * Note this covers two cases: the page is really empty (no keys),
-        * or it contains only a high key.  The latter case is possible after
-        * vacuuming.
+        * If there are no keys on the page, return the first available slot. Note
+        * this covers two cases: the page is really empty (no keys), or it
+        * contains only a high key.  The latter case is possible after vacuuming.
+        * This can never happen on an internal page, however, since they are
+        * never empty (an internal page must have children).
         */
        if (high < low)
                return low;
 
        /*
-        * Binary search to find the first key on the page >= scan key.
-        * Loop invariant: all slots before 'low' are < scan key, all slots
-        * at or after 'high' are >= scan key.  Also, haveEq is true if the
-        * tuple at 'high' is == scan key.
+        * Binary search to find the first key on the page >= scan key, or first
+        * key > scankey when nextkey is true.
+        *
+        * For nextkey=false (cmpval=1), the loop invariant is: all slots before
+        * 'low' are < scan key, all slots at or after 'high' are >= scan key.
+        *
+        * For nextkey=true (cmpval=0), the loop invariant is: all slots before
+        * 'low' are <= scan key, all slots at or after 'high' are > scan key.
+        *
         * We can fall out when high == low.
         */
        high++;                                         /* establish the loop invariant for high */
-       haveEq = false;
+
+       cmpval = nextkey ? 0 : 1;       /* select comparison value */
 
        while (high > low)
        {
-               OffsetNumber    mid = low + ((high - low) / 2);
+               OffsetNumber mid = low + ((high - low) / 2);
+
                /* We have low <= mid < high, so mid points at a real slot */
 
-               result = _bt_compare(rel, itupdesc, page, keysz, scankey, mid);
+               result = _bt_compare(rel, keysz, scankey, page, mid);
 
-               if (result > 0)
+               if (result >= cmpval)
                        low = mid + 1;
                else
-               {
                        high = mid;
-                       haveEq = (result == 0);
-               }
        }
 
-       /*--------------------
+       /*
         * At this point we have high == low, but be careful: they could point
-        * past the last slot on the page.  We also know that haveEq is true
-        * if and only if there is an equal key (in which case high&low point
-        * at the first equal key).
+        * past the last slot on the page.
         *
-        * On a leaf page, we always return the first key >= scan key
-        * (which could be the last slot + 1).
-        *--------------------
+        * On a leaf page, we always return the first key >= scan key (resp. >
+        * scan key), which could be the last slot + 1.
         */
-
-       if (opaque->btpo_flags & BTP_LEAF)
+       if (P_ISLEAF(opaque))
                return low;
 
-       /*--------------------
-        * On a non-leaf page, there are special cases:
-        *
-        * For an insertion (srchtype != BT_DESCENT and natts == keysz)
-        * always return first key >= scan key (which could be off the end).
-        *
-        * For a standard search (srchtype == BT_DESCENT and natts == keysz)
-        * return the first equal key if one exists, else the last lesser key
-        * if one exists, else the first slot on the page.
-        *
-        * For a partial-match search (srchtype == BT_DESCENT and natts > keysz)
-        * return the last lesser key if one exists, else the first slot.
-        *
-        * Old comments:
-        *      For multi-column indices, we may scan using keys
-        *      not for all attrs. But we handle duplicates using all attrs
-        *      in _bt_insert/_bt_spool code. And so while searching on
-        *      internal pages having number of attrs > keysize we want to
-        *      point at the last item < the scankey, not at the first item
-        *      = the scankey (!!!), and let _bt_moveright decide later
-        *      whether to move right or not (see comments and example
-        *      there). Note also that INSERTions are not affected by this
-        *      code (since natts == keysz for inserts).                   - vadim 04/15/97
-        *--------------------
+       /*
+        * On a non-leaf page, return the last key < scan key (resp. <= scan key).
+        * There must be one if _bt_compare() is playing by the rules.
         */
-
-       if (haveEq)
-       {
-               /*
-                * There is an equal key.  We return either the first equal key
-                * (which we just found), or the last lesser key.
-                *
-                * We need not check srchtype != BT_DESCENT here, since if that
-                * is true then natts == keysz by assumption.
-                */
-               if (natts == keysz)
-                       return low;                     /* return first equal key */
-       }
-       else
-       {
-               /*
-                * There is no equal key.  We return either the first greater key
-                * (which we just found), or the last lesser key.
-                */
-               if (srchtype != BT_DESCENT)
-                       return low;                     /* return first greater key */
-       }
-
-
-       if (low == (P_RIGHTMOST(opaque) ? P_HIKEY : P_FIRSTKEY))
-               return low;                             /* there is no prior item */
+       Assert(low > P_FIRSTDATAKEY(opaque));
 
        return OffsetNumberPrev(low);
 }
 
-/*
+/*----------
  *     _bt_compare() -- Compare scankey to a particular tuple on the page.
  *
+ * The passed scankey must be an insertion-type scankey (see nbtree/README),
+ * but it can omit the rightmost column(s) of the index.
+ *
+ *     keysz: number of key conditions to be checked (might be less than the
+ *             number of index columns!)
+ *     page/offnum: location of btree item to be compared to.
+ *
  *             This routine returns:
- *                     -1 if scankey < tuple at offnum;
+ *                     <0 if scankey < tuple at offnum;
  *                      0 if scankey == tuple at offnum;
- *                     +1 if scankey > tuple at offnum.
- *
- *             -- Old comments:
- *             In order to avoid having to propagate changes up the tree any time
- *             a new minimal key is inserted, the leftmost entry on the leftmost
- *             page is less than all possible keys, by definition.
+ *                     >0 if scankey > tuple at offnum.
+ *             NULLs in the keys are treated as sortable values.  Therefore
+ *             "equality" does not necessarily mean that the item should be
+ *             returned to the caller as a matching key!
  *
- *             -- New ones:
- *             New insertion code (fix against updating _in_place_ if new minimal
- *             key has bigger size than old one) may delete P_HIKEY entry on the
- *             root page in order to insert new minimal key - and so this definition
- *             does not work properly in this case and breaks key' order on root
- *             page. BTW, this propagation occures only while page' splitting,
- *             but not "any time a new min key is inserted" (see _bt_insertonpg).
- *                             - vadim 12/05/96
+ * CRUCIAL NOTE: on a non-leaf page, the first data key is assumed to be
+ * "minus infinity": this routine will always claim it is less than the
+ * scankey.  The actual key value stored (if any, which there probably isn't)
+ * does not matter.  This convention allows us to implement the Lehman and
+ * Yao convention that the first down-link pointer is before the first key.
+ * See backend/access/nbtree/README for details.
+ *----------
  */
-static int
+int32
 _bt_compare(Relation rel,
-                       TupleDesc itupdesc,
-                       Page page,
                        int keysz,
                        ScanKey scankey,
+                       Page page,
                        OffsetNumber offnum)
 {
-       Datum           datum;
-       BTItem          btitem;
+       TupleDesc       itupdesc = RelationGetDescr(rel);
+       BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page);
        IndexTuple      itup;
-       BTPageOpaque opaque;
-       ScanKey         entry;
-       AttrNumber      attno;
-       int                     result;
        int                     i;
-       bool            null;
 
        /*
-        * If this is a leftmost internal page, and if our comparison is with
-        * the first key on the page, then the item at that position is by
-        * definition less than the scan key.
-        *
-        * - see new comments above...
+        * Force result ">" if target item is first data item on an internal page
+        * --- see NOTE above.
         */
-
-       opaque = (BTPageOpaque) PageGetSpecialPointer(page);
-
-       if (!(opaque->btpo_flags & BTP_LEAF)
-               && P_LEFTMOST(opaque)
-               && offnum == P_HIKEY)
-       {
-               /*
-                * we just have to believe that this will only be called with
-                * offnum == P_HIKEY when P_HIKEY is the OffsetNumber of the first
-                * actual data key (i.e., this is also a rightmost page).  there
-                * doesn't seem to be any code that implies that the leftmost page
-                * is normally missing a high key as well as the rightmost page.
-                * but that implies that this code path only applies to the root
-                * -- which seems unlikely..
-                *
-                * - see new comments above...
-                */
-               if (!P_RIGHTMOST(opaque))
-                       elog(ERROR, "_bt_compare: invalid comparison to high key");
-
-#ifdef NOT_USED
-
-               /*
-                * We just have to belive that right answer will not break
-                * anything. I've checked code and all seems to be ok. See new
-                * comments above...
-                *
-                * -- Old comments If the item on the page is equal to the scankey,
-                * that's okay to admit.  We just can't claim that the first key
-                * on the page is greater than anything.
-                */
-
-               if (_bt_skeycmp(rel, keysz, scankey, page, PageGetItemId(page, offnum),
-                                               BTEqualStrategyNumber))
-                       return 0;
+       if (!P_ISLEAF(opaque) && offnum == P_FIRSTDATAKEY(opaque))
                return 1;
-#endif
-       }
 
-       btitem = (BTItem) PageGetItem(page, PageGetItemId(page, offnum));
-       itup = &(btitem->bti_itup);
+       itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
 
        /*
-        * The scan key is set up with the attribute number associated with
-        * each term in the key.  It is important that, if the index is
-        * multi-key, the scan contain the first k key attributes, and that
-        * they be in order.  If you think about how multi-key ordering works,
-        * you'll understand why this is.
+        * The scan key is set up with the attribute number associated with each
+        * term in the key.  It is important that, if the index is multi-key, the
+        * scan contain the first k key attributes, and that they be in order.  If
+        * you think about how multi-key ordering works, you'll understand why
+        * this is.
         *
-        * We don't test for violation of this condition here.
+        * We don't test for violation of this condition here, however.  The
+        * initial setup for the index scan had better have gotten it right (see
+        * _bt_first).
         */
 
        for (i = 1; i <= keysz; i++)
        {
-               long            tmpres;
+               Datum           datum;
+               bool            isNull;
+               int32           result;
 
-               entry = &scankey[i - 1];
-               attno = entry->sk_attno;
-               datum = index_getattr(itup, attno, itupdesc, &null);
+               datum = index_getattr(itup, scankey->sk_attno, itupdesc, &isNull);
 
                /* see comments about NULLs handling in btbuild */
-               if (entry->sk_flags & SK_ISNULL)                /* key is NULL */
+               if (scankey->sk_flags & SK_ISNULL)              /* key is NULL */
                {
-                       Assert(entry->sk_procedure == F_NULLVALUE);
-                       if (null)
-                               tmpres = (long) 0;              /* NULL "=" NULL */
+                       if (isNull)
+                               result = 0;             /* NULL "=" NULL */
+                       else if (scankey->sk_flags & SK_BT_NULLS_FIRST)
+                               result = -1;    /* NULL "<" NOT_NULL */
                        else
-                               tmpres = (long) 1;              /* NULL ">" NOT_NULL */
+                               result = 1;             /* NULL ">" NOT_NULL */
                }
-               else if (null)                  /* key is NOT_NULL and item is NULL */
+               else if (isNull)                /* key is NOT_NULL and item is NULL */
                {
-                       tmpres = (long) -1; /* NOT_NULL "<" NULL */
+                       if (scankey->sk_flags & SK_BT_NULLS_FIRST)
+                               result = 1;             /* NOT_NULL ">" NULL */
+                       else
+                               result = -1;    /* NOT_NULL "<" NULL */
                }
                else
-                       tmpres = (long) FMGR_PTR2(&entry->sk_func, entry->sk_argument, datum);
-               result = tmpres;
+               {
+                       /*
+                        * The sk_func needs to be passed the index value as left arg and
+                        * the sk_argument as right arg (they might be of different
+                        * types).  Since it is convenient for callers to think of
+                        * _bt_compare as comparing the scankey to the index item, we have
+                        * to flip the sign of the comparison result.  (Unless it's a DESC
+                        * column, in which case we *don't* flip the sign.)
+                        */
+                       result = DatumGetInt32(FunctionCall2Coll(&scankey->sk_func,
+                                                                                                        scankey->sk_collation,
+                                                                                                        datum,
+                                                                                                        scankey->sk_argument));
+
+                       if (!(scankey->sk_flags & SK_BT_DESC))
+                               result = -result;
+               }
 
                /* if the keys are unequal, return the difference */
                if (result != 0)
                        return result;
+
+               scankey++;
        }
 
-       /* by here, the keys are equal */
+       /* if we get here, the keys are equal */
        return 0;
 }
 
 /*
- *     _bt_next() -- Get the next item in a scan.
+ *     _bt_first() -- Find the first item in a scan.
+ *
+ *             We need to be clever about the direction of scan, the search
+ *             conditions, and the tree ordering.  We find the first item (or,
+ *             if backwards scan, the last item) in the tree that satisfies the
+ *             qualifications in the scan key.  On success exit, the page containing
+ *             the current index tuple is pinned but not locked, and data about
+ *             the matching tuple(s) on the page has been loaded into so->currPos.
+ *             scan->xs_ctup.t_self is set to the heap TID of the current tuple,
+ *             and if requested, scan->xs_itup points to a copy of the index tuple.
  *
- *             On entry, we have a valid currentItemData in the scan, and a
- *             read lock on the page that contains that item.  We do not have
- *             the page pinned.  We return the next item in the scan.  On
- *             exit, we have the page containing the next item locked but not
- *             pinned.
+ * If there are no matching items in the index, we return FALSE, with no
+ * pins or locks held.
+ *
+ * Note that scan->keyData[], and the so->keyData[] scankey built from it,
+ * are both search-type scankeys (see nbtree/README for more about this).
+ * Within this routine, we build a temporary insertion-type scankey to use
+ * in locating the scan start position.
  */
-RetrieveIndexResult
-_bt_next(IndexScanDesc scan, ScanDirection dir)
+bool
+_bt_first(IndexScanDesc scan, ScanDirection dir)
 {
-       Relation        rel;
+       Relation        rel = scan->indexRelation;
+       BTScanOpaque so = (BTScanOpaque) scan->opaque;
        Buffer          buf;
-       Page            page;
+       BTStack         stack;
        OffsetNumber offnum;
-       RetrieveIndexResult res;
-       ItemPointer current;
-       BTItem          btitem;
-       IndexTuple      itup;
-       BTScanOpaque so;
-       Size            keysok;
+       StrategyNumber strat;
+       bool            nextkey;
+       bool            goback;
+       ScanKey         startKeys[INDEX_MAX_KEYS];
+       ScanKeyData scankeys[INDEX_MAX_KEYS];
+       ScanKeyData notnullkeys[INDEX_MAX_KEYS];
+       int                     keysCount = 0;
+       int                     i;
+       StrategyNumber strat_total;
+       BTScanPosItem *currItem;
 
-       rel = scan->relation;
-       so = (BTScanOpaque) scan->opaque;
-       current = &(scan->currentItemData);
+       Assert(!BTScanPosIsValid(so->currPos));
 
-       Assert(BufferIsValid(so->btso_curbuf));
+       pgstat_count_index_scan(rel);
 
-       /* we still have the buffer pinned and locked */
-       buf = so->btso_curbuf;
+       /*
+        * Examine the scan keys and eliminate any redundant keys; also mark the
+        * keys that must be matched to continue the scan.
+        */
+       _bt_preprocess_keys(scan);
+
+       /*
+        * Quit now if _bt_preprocess_keys() discovered that the scan keys can
+        * never be satisfied (eg, x == 1 AND x > 2).
+        */
+       if (!so->qual_ok)
+               return false;
 
-       do
+       /*----------
+        * Examine the scan keys to discover where we need to start the scan.
+        *
+        * We want to identify the keys that can be used as starting boundaries;
+        * these are =, >, or >= keys for a forward scan or =, <, <= keys for
+        * a backwards scan.  We can use keys for multiple attributes so long as
+        * the prior attributes had only =, >= (resp. =, <=) keys.  Once we accept
+        * a > or < boundary or find an attribute with no boundary (which can be
+        * thought of as the same as "> -infinity"), we can't use keys for any
+        * attributes to its right, because it would break our simplistic notion
+        * of what initial positioning strategy to use.
+        *
+        * When the scan keys include cross-type operators, _bt_preprocess_keys
+        * may not be able to eliminate redundant keys; in such cases we will
+        * arbitrarily pick a usable one for each attribute.  This is correct
+        * but possibly not optimal behavior.  (For example, with keys like
+        * "x >= 4 AND x >= 5" we would elect to scan starting at x=4 when
+        * x=5 would be more efficient.)  Since the situation only arises given
+        * a poorly-worded query plus an incomplete opfamily, live with it.
+        *
+        * When both equality and inequality keys appear for a single attribute
+        * (again, only possible when cross-type operators appear), we *must*
+        * select one of the equality keys for the starting point, because
+        * _bt_checkkeys() will stop the scan as soon as an equality qual fails.
+        * For example, if we have keys like "x >= 4 AND x = 10" and we elect to
+        * start at x=4, we will fail and stop before reaching x=10.  If multiple
+        * equality quals survive preprocessing, however, it doesn't matter which
+        * one we use --- by definition, they are either redundant or
+        * contradictory.
+        *
+        * Any regular (not SK_SEARCHNULL) key implies a NOT NULL qualifier.
+        * If the index stores nulls at the end of the index we'll be starting
+        * from, and we have no boundary key for the column (which means the key
+        * we deduced NOT NULL from is an inequality key that constrains the other
+        * end of the index), then we cons up an explicit SK_SEARCHNOTNULL key to
+        * use as a boundary key.  If we didn't do this, we might find ourselves
+        * traversing a lot of null entries at the start of the scan.
+        *
+        * In this loop, row-comparison keys are treated the same as keys on their
+        * first (leftmost) columns.  We'll add on lower-order columns of the row
+        * comparison below, if possible.
+        *
+        * The selected scan keys (at most one per index column) are remembered by
+        * storing their addresses into the local startKeys[] array.
+        *----------
+        */
+       strat_total = BTEqualStrategyNumber;
+       if (so->numberOfKeys > 0)
        {
-               /* step one tuple in the appropriate direction */
-               if (!_bt_step(scan, &buf, dir))
-                       return (RetrieveIndexResult) NULL;
+               AttrNumber      curattr;
+               ScanKey         chosen;
+               ScanKey         impliesNN;
+               ScanKey         cur;
 
-               /* by here, current is the tuple we want to return */
-               offnum = ItemPointerGetOffsetNumber(current);
-               page = BufferGetPage(buf);
-               btitem = (BTItem) PageGetItem(page, PageGetItemId(page, offnum));
-               itup = &btitem->bti_itup;
+               /*
+                * chosen is the so-far-chosen key for the current attribute, if any.
+                * We don't cast the decision in stone until we reach keys for the
+                * next attribute.
+                */
+               curattr = 1;
+               chosen = NULL;
+               /* Also remember any scankey that implies a NOT NULL constraint */
+               impliesNN = NULL;
 
-               if (_bt_checkkeys(scan, itup, &keysok))
+               /*
+                * Loop iterates from 0 to numberOfKeys inclusive; we use the last
+                * pass to handle after-last-key processing.  Actual exit from the
+                * loop is at one of the "break" statements below.
+                */
+               for (cur = so->keyData, i = 0;; cur++, i++)
                {
-                       Assert(keysok == so->numberOfKeys);
-                       res = FormRetrieveIndexResult(current, &(itup->t_tid));
+                       if (i >= so->numberOfKeys || cur->sk_attno != curattr)
+                       {
+                               /*
+                                * Done looking at keys for curattr.  If we didn't find a
+                                * usable boundary key, see if we can deduce a NOT NULL key.
+                                */
+                               if (chosen == NULL && impliesNN != NULL &&
+                                       ((impliesNN->sk_flags & SK_BT_NULLS_FIRST) ?
+                                        ScanDirectionIsForward(dir) :
+                                        ScanDirectionIsBackward(dir)))
+                               {
+                                       /* Yes, so build the key in notnullkeys[keysCount] */
+                                       chosen = &notnullkeys[keysCount];
+                                       ScanKeyEntryInitialize(chosen,
+                                                                                  (SK_SEARCHNOTNULL | SK_ISNULL |
+                                                                                       (impliesNN->sk_flags &
+                                                                                 (SK_BT_DESC | SK_BT_NULLS_FIRST))),
+                                                                                  curattr,
+                                                                ((impliesNN->sk_flags & SK_BT_NULLS_FIRST) ?
+                                                                 BTGreaterStrategyNumber :
+                                                                 BTLessStrategyNumber),
+                                                                                  InvalidOid,
+                                                                                  InvalidOid,
+                                                                                  InvalidOid,
+                                                                                  (Datum) 0);
+                               }
+
+                               /*
+                                * If we still didn't find a usable boundary key, quit; else
+                                * save the boundary key pointer in startKeys.
+                                */
+                               if (chosen == NULL)
+                                       break;
+                               startKeys[keysCount++] = chosen;
+
+                               /*
+                                * Adjust strat_total, and quit if we have stored a > or <
+                                * key.
+                                */
+                               strat = chosen->sk_strategy;
+                               if (strat != BTEqualStrategyNumber)
+                               {
+                                       strat_total = strat;
+                                       if (strat == BTGreaterStrategyNumber ||
+                                               strat == BTLessStrategyNumber)
+                                               break;
+                               }
 
-                       /* remember which buffer we have pinned and locked */
-                       so->btso_curbuf = buf;
-                       return res;
+                               /*
+                                * Done if that was the last attribute, or if next key is not
+                                * in sequence (implying no boundary key is available for the
+                                * next attribute).
+                                */
+                               if (i >= so->numberOfKeys ||
+                                       cur->sk_attno != curattr + 1)
+                                       break;
+
+                               /*
+                                * Reset for next attr.
+                                */
+                               curattr = cur->sk_attno;
+                               chosen = NULL;
+                               impliesNN = NULL;
+                       }
+
+                       /*
+                        * Can we use this key as a starting boundary for this attr?
+                        *
+                        * If not, does it imply a NOT NULL constraint?  (Because
+                        * SK_SEARCHNULL keys are always assigned BTEqualStrategyNumber,
+                        * *any* inequality key works for that; we need not test.)
+                        */
+                       switch (cur->sk_strategy)
+                       {
+                               case BTLessStrategyNumber:
+                               case BTLessEqualStrategyNumber:
+                                       if (chosen == NULL)
+                                       {
+                                               if (ScanDirectionIsBackward(dir))
+                                                       chosen = cur;
+                                               else
+                                                       impliesNN = cur;
+                                       }
+                                       break;
+                               case BTEqualStrategyNumber:
+                                       /* override any non-equality choice */
+                                       chosen = cur;
+                                       break;
+                               case BTGreaterEqualStrategyNumber:
+                               case BTGreaterStrategyNumber:
+                                       if (chosen == NULL)
+                                       {
+                                               if (ScanDirectionIsForward(dir))
+                                                       chosen = cur;
+                                               else
+                                                       impliesNN = cur;
+                                       }
+                                       break;
+                       }
                }
+       }
 
-       } while (keysok >= so->numberOfFirstKeys ||
-                        (keysok == -1 && ScanDirectionIsBackward(dir)));
+       /*
+        * If we found no usable boundary keys, we have to start from one end of
+        * the tree.  Walk down that edge to the first or last key, and scan from
+        * there.
+        */
+       if (keysCount == 0)
+               return _bt_endpoint(scan, dir);
 
-       ItemPointerSetInvalid(current);
-       so->btso_curbuf = InvalidBuffer;
-       _bt_relbuf(rel, buf, BT_READ);
+       /*
+        * We want to start the scan somewhere within the index.  Set up an
+        * insertion scankey we can use to search for the boundary point we
+        * identified above.  The insertion scankey is built in the local
+        * scankeys[] array, using the keys identified by startKeys[].
+        */
+       Assert(keysCount <= INDEX_MAX_KEYS);
+       for (i = 0; i < keysCount; i++)
+       {
+               ScanKey         cur = startKeys[i];
 
-       return (RetrieveIndexResult) NULL;
-}
+               Assert(cur->sk_attno == i + 1);
 
-/*
- *     _bt_first() -- Find the first item in a scan.
- *
- *             We need to be clever about the type of scan, the operation it's
- *             performing, and the tree ordering.      We return the RetrieveIndexResult
- *             of the first item in the tree that satisfies the qualification
- *             associated with the scan descriptor.  On exit, the page containing
- *             the current index tuple is read locked and pinned, and the scan's
- *             opaque data entry is updated to include the buffer.
- */
-RetrieveIndexResult
-_bt_first(IndexScanDesc scan, ScanDirection dir)
-{
-       Relation        rel;
-       TupleDesc       itupdesc;
-       Buffer          buf;
-       Page            page;
-       BTPageOpaque pop;
-       BTStack         stack;
-       OffsetNumber offnum,
-                               maxoff;
-       bool            offGmax = false;
-       BTItem          btitem;
-       IndexTuple      itup;
-       ItemPointer current;
-       BlockNumber blkno;
-       StrategyNumber strat;
-       RetrieveIndexResult res;
-       RegProcedure proc;
-       int                     result;
-       BTScanOpaque so;
-       Size            keysok;
-
-       bool            strategyCheck;
-       ScanKey         scankeys = 0;
-       int             keysCount = 0;
-       int             *nKeyIs = 0;
-       int             i, j;
-       StrategyNumber  strat_total;
-       
-       rel = scan->relation;
-       so = (BTScanOpaque) scan->opaque;
-
-       /*
-        * Order the keys in the qualification and be sure that the scan
-        * exploits the tree order.
-        */
-       so->numberOfFirstKeys = 0;      /* may be changed by _bt_orderkeys */
-       so->qual_ok = 1;                        /* may be changed by _bt_orderkeys */
-       scan->scanFromEnd = false;
-       strategyCheck = false;
-       if (so->numberOfKeys > 0)
-       {
-               _bt_orderkeys(rel, so);
+               if (cur->sk_flags & SK_ROW_HEADER)
+               {
+                       /*
+                        * Row comparison header: look to the first row member instead.
+                        *
+                        * The member scankeys are already in insertion format (ie, they
+                        * have sk_func = 3-way-comparison function), but we have to watch
+                        * out for nulls, which _bt_preprocess_keys didn't check. A null
+                        * in the first row member makes the condition unmatchable, just
+                        * like qual_ok = false.
+                        */
+                       ScanKey         subkey = (ScanKey) DatumGetPointer(cur->sk_argument);
 
-               if (so->qual_ok)
-                    strategyCheck = true;
-       }
-       strat_total = BTEqualStrategyNumber;
-       if (strategyCheck)
-       {
-               AttrNumber      attno;
+                       Assert(subkey->sk_flags & SK_ROW_MEMBER);
+                       if (subkey->sk_flags & SK_ISNULL)
+                               return false;
+                       memcpy(scankeys + i, subkey, sizeof(ScanKeyData));
 
-               nKeyIs = (int *)palloc(so->numberOfKeys*sizeof(int));
-               for (i=0; i < so->numberOfKeys; i++)
-               {
-                       attno = so->keyData[i].sk_attno;
-                       if (attno == keysCount)
-                               continue;
-                       if (attno > keysCount + 1)
-                               break;
-                       strat = _bt_getstrat(rel, attno,
-                                       so->keyData[i].sk_procedure);
-                       if (strat == strat_total ||
-                           strat == BTEqualStrategyNumber)
+                       /*
+                        * If the row comparison is the last positioning key we accepted,
+                        * try to add additional keys from the lower-order row members.
+                        * (If we accepted independent conditions on additional index
+                        * columns, we use those instead --- doesn't seem worth trying to
+                        * determine which is more restrictive.)  Note that this is OK
+                        * even if the row comparison is of ">" or "<" type, because the
+                        * condition applied to all but the last row member is effectively
+                        * ">=" or "<=", and so the extra keys don't break the positioning
+                        * scheme.  But, by the same token, if we aren't able to use all
+                        * the row members, then the part of the row comparison that we
+                        * did use has to be treated as just a ">=" or "<=" condition, and
+                        * so we'd better adjust strat_total accordingly.
+                        */
+                       if (i == keysCount - 1)
                        {
-                               nKeyIs[keysCount++] = i;
-                               continue;
+                               bool            used_all_subkeys = false;
+
+                               Assert(!(subkey->sk_flags & SK_ROW_END));
+                               for (;;)
+                               {
+                                       subkey++;
+                                       Assert(subkey->sk_flags & SK_ROW_MEMBER);
+                                       if (subkey->sk_attno != keysCount + 1)
+                                               break;  /* out-of-sequence, can't use it */
+                                       if (subkey->sk_strategy != cur->sk_strategy)
+                                               break;  /* wrong direction, can't use it */
+                                       if (subkey->sk_flags & SK_ISNULL)
+                                               break;  /* can't use null keys */
+                                       Assert(keysCount < INDEX_MAX_KEYS);
+                                       memcpy(scankeys + keysCount, subkey, sizeof(ScanKeyData));
+                                       keysCount++;
+                                       if (subkey->sk_flags & SK_ROW_END)
+                                       {
+                                               used_all_subkeys = true;
+                                               break;
+                                       }
+                               }
+                               if (!used_all_subkeys)
+                               {
+                                       switch (strat_total)
+                                       {
+                                               case BTLessStrategyNumber:
+                                                       strat_total = BTLessEqualStrategyNumber;
+                                                       break;
+                                               case BTGreaterStrategyNumber:
+                                                       strat_total = BTGreaterEqualStrategyNumber;
+                                                       break;
+                                       }
+                               }
+                               break;                  /* done with outer loop */
                        }
-                       if (ScanDirectionIsBackward(dir) &&
-                           (strat == BTLessStrategyNumber ||
-                            strat == BTLessEqualStrategyNumber) )
+               }
+               else
+               {
+                       /*
+                        * Ordinary comparison key.  Transform the search-style scan key
+                        * to an insertion scan key by replacing the sk_func with the
+                        * appropriate btree comparison function.
+                        *
+                        * If scankey operator is not a cross-type comparison, we can use
+                        * the cached comparison function; otherwise gotta look it up in
+                        * the catalogs.  (That can't lead to infinite recursion, since no
+                        * indexscan initiated by syscache lookup will use cross-data-type
+                        * operators.)
+                        *
+                        * We support the convention that sk_subtype == InvalidOid means
+                        * the opclass input type; this is a hack to simplify life for
+                        * ScanKeyInit().
+                        */
+                       if (cur->sk_subtype == rel->rd_opcintype[i] ||
+                               cur->sk_subtype == InvalidOid)
                        {
-                               nKeyIs[keysCount++] = i;
-                               strat_total = strat;
-                               if (strat == BTLessStrategyNumber)
-                                       break;
-                               continue;
+                               FmgrInfo   *procinfo;
+
+                               procinfo = index_getprocinfo(rel, cur->sk_attno, BTORDER_PROC);
+                               ScanKeyEntryInitializeWithInfo(scankeys + i,
+                                                                                          cur->sk_flags,
+                                                                                          cur->sk_attno,
+                                                                                          InvalidStrategy,
+                                                                                          cur->sk_subtype,
+                                                                                          cur->sk_collation,
+                                                                                          procinfo,
+                                                                                          cur->sk_argument);
                        }
-                       if (ScanDirectionIsForward(dir) &&
-                           (strat == BTGreaterStrategyNumber ||
-                            strat == BTGreaterEqualStrategyNumber) )
+                       else
                        {
-                               nKeyIs[keysCount++] = i;
-                               strat_total = strat;
-                               if (strat == BTGreaterStrategyNumber)
-                                       break;
-                               continue;
+                               RegProcedure cmp_proc;
+
+                               cmp_proc = get_opfamily_proc(rel->rd_opfamily[i],
+                                                                                        rel->rd_opcintype[i],
+                                                                                        cur->sk_subtype,
+                                                                                        BTORDER_PROC);
+                               if (!RegProcedureIsValid(cmp_proc))
+                                       elog(ERROR, "missing support function %d(%u,%u) for attribute %d of index \"%s\"",
+                                                BTORDER_PROC, rel->rd_opcintype[i], cur->sk_subtype,
+                                                cur->sk_attno, RelationGetRelationName(rel));
+                               ScanKeyEntryInitialize(scankeys + i,
+                                                                          cur->sk_flags,
+                                                                          cur->sk_attno,
+                                                                          InvalidStrategy,
+                                                                          cur->sk_subtype,
+                                                                          cur->sk_collation,
+                                                                          cmp_proc,
+                                                                          cur->sk_argument);
                        }
                }
-               if (!keysCount)
-                       scan->scanFromEnd = true;
        }
-       else
-               scan->scanFromEnd = true;
-
-       if (so->qual_ok == 0)
-               return (RetrieveIndexResult) NULL;
 
-       /* if we just need to walk down one edge of the tree, do that */
-       if (scan->scanFromEnd)
+       /*----------
+        * Examine the selected initial-positioning strategy to determine exactly
+        * where we need to start the scan, and set flag variables to control the
+        * code below.
+        *
+        * If nextkey = false, _bt_search and _bt_binsrch will locate the first
+        * item >= scan key.  If nextkey = true, they will locate the first
+        * item > scan key.
+        *
+        * If goback = true, we will then step back one item, while if
+        * goback = false, we will start the scan on the located item.
+        *----------
+        */
+       switch (strat_total)
        {
-               if (nKeyIs)
-                       pfree(nKeyIs);
-               return _bt_endpoint(scan, dir);
-       }
+               case BTLessStrategyNumber:
+
+                       /*
+                        * Find first item >= scankey, then back up one to arrive at last
+                        * item < scankey.  (Note: this positioning strategy is only used
+                        * for a backward scan, so that is always the correct starting
+                        * position.)
+                        */
+                       nextkey = false;
+                       goback = true;
+                       break;
+
+               case BTLessEqualStrategyNumber:
+
+                       /*
+                        * Find first item > scankey, then back up one to arrive at last
+                        * item <= scankey.  (Note: this positioning strategy is only used
+                        * for a backward scan, so that is always the correct starting
+                        * position.)
+                        */
+                       nextkey = true;
+                       goback = true;
+                       break;
+
+               case BTEqualStrategyNumber:
+
+                       /*
+                        * If a backward scan was specified, need to start with last equal
+                        * item not first one.
+                        */
+                       if (ScanDirectionIsBackward(dir))
+                       {
+                               /*
+                                * This is the same as the <= strategy.  We will check at the
+                                * end whether the found item is actually =.
+                                */
+                               nextkey = true;
+                               goback = true;
+                       }
+                       else
+                       {
+                               /*
+                                * This is the same as the >= strategy.  We will check at the
+                                * end whether the found item is actually =.
+                                */
+                               nextkey = false;
+                               goback = false;
+                       }
+                       break;
+
+               case BTGreaterEqualStrategyNumber:
+
+                       /*
+                        * Find first item >= scankey.  (This is only used for forward
+                        * scans.)
+                        */
+                       nextkey = false;
+                       goback = false;
+                       break;
 
-       itupdesc = RelationGetDescr(rel);
-       current = &(scan->currentItemData);
+               case BTGreaterStrategyNumber:
+
+                       /*
+                        * Find first item > scankey.  (This is only used for forward
+                        * scans.)
+                        */
+                       nextkey = true;
+                       goback = false;
+                       break;
+
+               default:
+                       /* can't get here, but keep compiler quiet */
+                       elog(ERROR, "unrecognized strat_total: %d", (int) strat_total);
+                       return false;
+       }
 
        /*
-        * Okay, we want something more complicated.  What we'll do is use the
-        * first item in the scan key passed in (which has been correctly
-        * ordered to take advantage of index ordering) to position ourselves
-        * at the right place in the scan.
+        * Use the manufactured insertion scan key to descend the tree and
+        * position ourselves on the target leaf page.
         */
-       /* _bt_orderkeys disallows it, but it's place to add some code latter */
-       scankeys = (ScanKey)palloc(keysCount*sizeof(ScanKeyData));
-       for (i=0; i < keysCount; i++)
+       stack = _bt_search(rel, keysCount, scankeys, nextkey, &buf, BT_READ,
+                                          scan->xs_snapshot);
+
+       /* don't need to keep the stack around... */
+       _bt_freestack(stack);
+
+       if (!BufferIsValid(buf))
        {
-               j = nKeyIs[i];
-               if (so->keyData[j].sk_flags & SK_ISNULL)
-               {
-                       pfree(nKeyIs);
-                       pfree(scankeys);
-                       elog(ERROR, "_bt_first: btree doesn't support is(not)null, yet");
-                       return ((RetrieveIndexResult) NULL);
-               } 
-               proc = index_getprocid(rel, i+1, BTORDER_PROC);
-               ScanKeyEntryInitialize(scankeys+i, so->keyData[j].sk_flags,
-                               i+1, proc, so->keyData[j].sk_argument);
+               /*
+                * We only get here if the index is completely empty. Lock relation
+                * because nothing finer to lock exists.
+                */
+               PredicateLockRelation(rel, scan->xs_snapshot);
+               return false;
        }
-       if   (nKeyIs)   pfree(nKeyIs);
+       else
+               PredicateLockPage(rel, BufferGetBlockNumber(buf),
+                                                 scan->xs_snapshot);
 
-       stack = _bt_search(rel, keysCount, scankeys, &buf);
-       _bt_freestack(stack);
+       /* initialize moreLeft/moreRight appropriately for scan direction */
+       if (ScanDirectionIsForward(dir))
+       {
+               so->currPos.moreLeft = false;
+               so->currPos.moreRight = true;
+       }
+       else
+       {
+               so->currPos.moreLeft = true;
+               so->currPos.moreRight = false;
+       }
+       so->numKilled = 0;                      /* just paranoia */
+       Assert(so->markItemIndex == -1);
 
-       blkno = BufferGetBlockNumber(buf);
-       page = BufferGetPage(buf);
+       /* position to the precise item on the page */
+       offnum = _bt_binsrch(rel, buf, keysCount, scankeys, nextkey);
 
        /*
-        * This will happen if the tree we're searching is entirely empty, or
-        * if we're doing a search for a key that would appear on an entirely
-        * empty internal page.  In either case, there are no matching tuples
-        * in the index.
+        * If nextkey = false, we are positioned at the first item >= scan key, or
+        * possibly at the end of a page on which all the existing items are less
+        * than the scan key and we know that everything on later pages is greater
+        * than or equal to scan key.
+        *
+        * If nextkey = true, we are positioned at the first item > scan key, or
+        * possibly at the end of a page on which all the existing items are less
+        * than or equal to the scan key and we know that everything on later
+        * pages is greater than scan key.
+        *
+        * The actually desired starting point is either this item or the prior
+        * one, or in the end-of-page case it's the first item on the next page or
+        * the last item on this page.  Adjust the starting offset if needed. (If
+        * this results in an offset before the first item or after the last one,
+        * _bt_readpage will report no items found, and then we'll step to the
+        * next page as needed.)
         */
+       if (goback)
+               offnum = OffsetNumberPrev(offnum);
+
+       /* remember which buffer we have pinned, if any */
+       Assert(!BTScanPosIsValid(so->currPos));
+       so->currPos.buf = buf;
 
-       if (PageIsEmpty(page))
+       /*
+        * Now load data from the first page of the scan.
+        */
+       if (!_bt_readpage(scan, dir, offnum))
+       {
+               /*
+                * There's no actually-matching data on this page.  Try to advance to
+                * the next page.  Return false if there's no matching data at all.
+                */
+               LockBuffer(so->currPos.buf, BUFFER_LOCK_UNLOCK);
+               if (!_bt_steppage(scan, dir))
+                       return false;
+       }
+       else
        {
-               ItemPointerSetInvalid(current);
-               so->btso_curbuf = InvalidBuffer;
-               _bt_relbuf(rel, buf, BT_READ);
-               pfree(scankeys);
-               return (RetrieveIndexResult) NULL;
+               /* Drop the lock, and maybe the pin, on the current page */
+               _bt_drop_lock_and_maybe_pin(scan, &so->currPos);
        }
-       maxoff = PageGetMaxOffsetNumber(page);
-       pop = (BTPageOpaque) PageGetSpecialPointer(page);
+
+       /* OK, itemIndex says what to return */
+       currItem = &so->currPos.items[so->currPos.itemIndex];
+       scan->xs_ctup.t_self = currItem->heapTid;
+       if (scan->xs_want_itup)
+               scan->xs_itup = (IndexTuple) (so->currTuples + currItem->tupleOffset);
+
+       return true;
+}
+
+/*
+ *     _bt_next() -- Get the next item in a scan.
+ *
+ *             On entry, so->currPos describes the current page, which may be pinned
+ *             but is not locked, and so->currPos.itemIndex identifies which item was
+ *             previously returned.
+ *
+ *             On successful exit, scan->xs_ctup.t_self is set to the TID of the
+ *             next heap tuple, and if requested, scan->xs_itup points to a copy of
+ *             the index tuple.  so->currPos is updated as needed.
+ *
+ *             On failure exit (no more tuples), we release pin and set
+ *             so->currPos.buf to InvalidBuffer.
+ */
+bool
+_bt_next(IndexScanDesc scan, ScanDirection dir)
+{
+       BTScanOpaque so = (BTScanOpaque) scan->opaque;
+       BTScanPosItem *currItem;
 
        /*
-        * Now _bt_moveright doesn't move from non-rightmost leaf page if
-        * scankey == hikey and there is only hikey there. It's good for
-        * insertion, but we need to do work for scan here. - vadim 05/27/97
+        * Advance to next tuple on current page; or if there's no more, try to
+        * step to the next page with data.
         */
-
-       while (maxoff == P_HIKEY && !P_RIGHTMOST(pop) &&
-                  _bt_skeycmp(rel, keysCount, scankeys, page,
-                                          PageGetItemId(page, P_HIKEY),
-                                          BTGreaterEqualStrategyNumber))
+       if (ScanDirectionIsForward(dir))
        {
-               /* step right one page */
-               blkno = pop->btpo_next;
-               _bt_relbuf(rel, buf, BT_READ);
-               buf = _bt_getbuf(rel, blkno, BT_READ);
-               page = BufferGetPage(buf);
-               if (PageIsEmpty(page))
+               if (++so->currPos.itemIndex > so->currPos.lastItem)
                {
-                       ItemPointerSetInvalid(current);
-                       so->btso_curbuf = InvalidBuffer;
-                       _bt_relbuf(rel, buf, BT_READ);
-                       pfree(scankeys);
-                       return (RetrieveIndexResult) NULL;
+                       if (!_bt_steppage(scan, dir))
+                               return false;
                }
-               maxoff = PageGetMaxOffsetNumber(page);
-               pop = (BTPageOpaque) PageGetSpecialPointer(page);
        }
-
-
-       /* find the nearest match to the manufactured scan key on the page */
-       offnum = _bt_binsrch(rel, buf, keysCount, scankeys, BT_DESCENT);
-
-       if (offnum > maxoff)
+       else
        {
-               offnum = maxoff;
-               offGmax = true;
+               if (--so->currPos.itemIndex < so->currPos.firstItem)
+               {
+                       if (!_bt_steppage(scan, dir))
+                               return false;
+               }
        }
 
-       ItemPointerSet(current, blkno, offnum);
+       /* OK, itemIndex says what to return */
+       currItem = &so->currPos.items[so->currPos.itemIndex];
+       scan->xs_ctup.t_self = currItem->heapTid;
+       if (scan->xs_want_itup)
+               scan->xs_itup = (IndexTuple) (so->currTuples + currItem->tupleOffset);
+
+       return true;
+}
+
+/*
+ *     _bt_readpage() -- Load data from current index page into so->currPos
+ *
+ * Caller must have pinned and read-locked so->currPos.buf; the buffer's state
+ * is not changed here.  Also, currPos.moreLeft and moreRight must be valid;
+ * they are updated as appropriate.  All other fields of so->currPos are
+ * initialized from scratch here.
+ *
+ * We scan the current page starting at offnum and moving in the indicated
+ * direction.  All items matching the scan keys are loaded into currPos.items.
+ * moreLeft or moreRight (as appropriate) is cleared if _bt_checkkeys reports
+ * that there can be no more matching tuples in the current scan direction.
+ *
+ * Returns true if any matching items found on the page, false if none.
+ */
+static bool
+_bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum)
+{
+       BTScanOpaque so = (BTScanOpaque) scan->opaque;
+       Page            page;
+       BTPageOpaque opaque;
+       OffsetNumber minoff;
+       OffsetNumber maxoff;
+       int                     itemIndex;
+       IndexTuple      itup;
+       bool            continuescan;
 
        /*
-        * Now find the right place to start the scan.  Result is the value
-        * we're looking for minus the value we're looking at in the index.
+        * We must have the buffer pinned and locked, but the usual macro can't be
+        * used here; this function is what makes it good for currPos.
         */
+       Assert(BufferIsValid(so->currPos.buf));
 
-       result = _bt_compare(rel, itupdesc, page, keysCount, scankeys, offnum);
+       page = BufferGetPage(so->currPos.buf);
+       opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+       minoff = P_FIRSTDATAKEY(opaque);
+       maxoff = PageGetMaxOffsetNumber(page);
 
-       /* it's yet other place to add some code latter for is(not)null */
+       /*
+        * We note the buffer's block number so that we can release the pin later.
+        * This allows us to re-read the buffer if it is needed again for hinting.
+        */
+       so->currPos.currPage = BufferGetBlockNumber(so->currPos.buf);
 
-       strat = strat_total;
-       switch (strat)
-       {
-               case BTLessStrategyNumber:
-                       if (result <= 0)
-                       {
-                               do
-                               {
-                                       if (!_bt_twostep(scan, &buf, BackwardScanDirection))
-                                               break;
+       /*
+        * We save the LSN of the page as we read it, so that we know whether it
+        * safe to apply LP_DEAD hints to the page later.  This allows us to drop
+        * the pin for MVCC scans, which allows vacuum to avoid blocking.
+        */
+       so->currPos.lsn = PageGetLSN(page);
 
-                                       offnum = ItemPointerGetOffsetNumber(current);
-                                       page = BufferGetPage(buf);
-                                       result = _bt_compare(rel, itupdesc, page, keysCount, scankeys, offnum);
-                               } while (result <= 0);
+       /*
+        * we must save the page's right-link while scanning it; this tells us
+        * where to step right to after we're done with these items.  There is no
+        * corresponding need for the left-link, since splits always go right.
+        */
+       so->currPos.nextPage = opaque->btpo_next;
 
-                       }
-                       break;
+       /* initialize tuple workspace to empty */
+       so->currPos.nextTupleOffset = 0;
 
-               case BTLessEqualStrategyNumber:
-                       if (result >= 0)
-                       {
-                               do
-                               {
-                                       if (!_bt_twostep(scan, &buf, ForwardScanDirection))
-                                               break;
+       /*
+        * Now that the current page has been made consistent, the macro should be
+        * good.
+        */
+       Assert(BTScanPosIsPinned(so->currPos));
 
-                                       offnum = ItemPointerGetOffsetNumber(current);
-                                       page = BufferGetPage(buf);
-                                       result = _bt_compare(rel, itupdesc, page, keysCount, scankeys, offnum);
-                               } while (result >= 0);
-                       }
-                       if (result < 0)
-                               _bt_twostep(scan, &buf, BackwardScanDirection);
-                       break;
+       if (ScanDirectionIsForward(dir))
+       {
+               /* load items[] in ascending order */
+               itemIndex = 0;
 
-               case BTEqualStrategyNumber:
-                       if (result != 0)
+               offnum = Max(offnum, minoff);
+
+               while (offnum <= maxoff)
+               {
+                       itup = _bt_checkkeys(scan, page, offnum, dir, &continuescan);
+                       if (itup != NULL)
                        {
-                               _bt_relbuf(scan->relation, buf, BT_READ);
-                               so->btso_curbuf = InvalidBuffer;
-                               ItemPointerSetInvalid(&(scan->currentItemData));
-                               pfree(scankeys);
-                               return (RetrieveIndexResult) NULL;
+                               /* tuple passes all scan key conditions, so remember it */
+                               _bt_saveitem(so, itemIndex, offnum, itup);
+                               itemIndex++;
                        }
-                       else if (ScanDirectionIsBackward(dir))
+                       if (!continuescan)
                        {
-                               do
-                               {
-                                       if (!_bt_twostep(scan, &buf, ForwardScanDirection))
-                                               break;
+                               /* there can't be any more matches, so stop */
+                               so->currPos.moreRight = false;
+                               break;
+                       }
 
-                                       offnum = ItemPointerGetOffsetNumber(current);
-                                       page = BufferGetPage(buf);
-                                       result = _bt_compare(rel, itupdesc, page, keysCount, scankeys, offnum);
-                               } while (result == 0);
+                       offnum = OffsetNumberNext(offnum);
+               }
 
-                               if (result < 0)
-                                       _bt_twostep(scan, &buf, BackwardScanDirection);
-                       }
-                       break;
+               Assert(itemIndex <= MaxIndexTuplesPerPage);
+               so->currPos.firstItem = 0;
+               so->currPos.lastItem = itemIndex - 1;
+               so->currPos.itemIndex = 0;
+       }
+       else
+       {
+               /* load items[] in descending order */
+               itemIndex = MaxIndexTuplesPerPage;
 
-               case BTGreaterEqualStrategyNumber:
-                       if (offGmax)
+               offnum = Min(offnum, maxoff);
+
+               while (offnum >= minoff)
+               {
+                       itup = _bt_checkkeys(scan, page, offnum, dir, &continuescan);
+                       if (itup != NULL)
                        {
-                               if (result < 0)
-                               {
-                                       Assert(!P_RIGHTMOST(pop) && maxoff == P_HIKEY);
-                                       if (!_bt_step(scan, &buf, ForwardScanDirection))
-                                       {
-                                               _bt_relbuf(scan->relation, buf, BT_READ);
-                                               so->btso_curbuf = InvalidBuffer;
-                                               ItemPointerSetInvalid(&(scan->currentItemData));
-                                               pfree(scankeys);
-                                               return (RetrieveIndexResult) NULL;
-                                       }
-                               }
-                               else if (result > 0)
-                               {                               /* Just remember:  _bt_binsrch() returns
-                                                                * the OffsetNumber of the first matching
-                                                                * key on the page, or the OffsetNumber at
-                                                                * which the matching key WOULD APPEAR IF
-                                                                * IT WERE on this page. No key on this
-                                                                * page, but offnum from _bt_binsrch()
-                                                                * greater maxoff - have to move right. -
-                                                                * vadim 12/06/96 */
-                                       _bt_twostep(scan, &buf, ForwardScanDirection);
-                               }
+                               /* tuple passes all scan key conditions, so remember it */
+                               itemIndex--;
+                               _bt_saveitem(so, itemIndex, offnum, itup);
                        }
-                       else if (result < 0)
+                       if (!continuescan)
                        {
-                               do
-                               {
-                                       if (!_bt_twostep(scan, &buf, BackwardScanDirection))
-                                               break;
-
-                                       page = BufferGetPage(buf);
-                                       offnum = ItemPointerGetOffsetNumber(current);
-                                       result = _bt_compare(rel, itupdesc, page, keysCount, scankeys, offnum);
-                               } while (result < 0);
-
-                               if (result > 0)
-                                       _bt_twostep(scan, &buf, ForwardScanDirection);
+                               /* there can't be any more matches, so stop */
+                               so->currPos.moreLeft = false;
+                               break;
                        }
-                       break;
 
-               case BTGreaterStrategyNumber:
-                       /* offGmax helps as above */
-                       if (result >= 0 || offGmax)
-                       {
-                               do
-                               {
-                                       if (!_bt_twostep(scan, &buf, ForwardScanDirection))
-                                               break;
+                       offnum = OffsetNumberPrev(offnum);
+               }
 
-                                       offnum = ItemPointerGetOffsetNumber(current);
-                                       page = BufferGetPage(buf);
-                                       result = _bt_compare(rel, itupdesc, page, keysCount, scankeys, offnum);
-                               } while (result >= 0);
-                       }
-                       break;
+               Assert(itemIndex >= 0);
+               so->currPos.firstItem = itemIndex;
+               so->currPos.lastItem = MaxIndexTuplesPerPage - 1;
+               so->currPos.itemIndex = MaxIndexTuplesPerPage - 1;
        }
 
-       pfree(scankeys);
-       /* okay, current item pointer for the scan is right */
-       offnum = ItemPointerGetOffsetNumber(current);
-       page = BufferGetPage(buf);
-       btitem = (BTItem) PageGetItem(page, PageGetItemId(page, offnum));
-       itup = &btitem->bti_itup;
+       return (so->currPos.firstItem <= so->currPos.lastItem);
+}
 
-       if (_bt_checkkeys(scan, itup, &keysok))
-       {
-               res = FormRetrieveIndexResult(current, &(itup->t_tid));
+/* Save an index item into so->currPos.items[itemIndex] */
+static void
+_bt_saveitem(BTScanOpaque so, int itemIndex,
+                        OffsetNumber offnum, IndexTuple itup)
+{
+       BTScanPosItem *currItem = &so->currPos.items[itemIndex];
 
-               /* remember which buffer we have pinned */
-               so->btso_curbuf = buf;
-       }
-       else if (keysok >= so->numberOfFirstKeys)
-       {
-               so->btso_curbuf = buf;
-               return _bt_next(scan, dir);
-       }
-       else if (keysok == -1 && ScanDirectionIsBackward(dir))
-       {
-               so->btso_curbuf = buf;
-               return _bt_next(scan, dir);
-       }
-       else
+       currItem->heapTid = itup->t_tid;
+       currItem->indexOffset = offnum;
+       if (so->currTuples)
        {
-               ItemPointerSetInvalid(current);
-               so->btso_curbuf = InvalidBuffer;
-               _bt_relbuf(rel, buf, BT_READ);
-               res = (RetrieveIndexResult) NULL;
-       }
+               Size            itupsz = IndexTupleSize(itup);
 
-       return res;
+               currItem->tupleOffset = so->currPos.nextTupleOffset;
+               memcpy(so->currTuples + so->currPos.nextTupleOffset, itup, itupsz);
+               so->currPos.nextTupleOffset += MAXALIGN(itupsz);
+       }
 }
 
 /*
- *     _bt_step() -- Step one item in the requested direction in a scan on
- *                               the tree.
+ *     _bt_steppage() -- Step to next page containing valid data for scan
  *
- *             If no adjacent record exists in the requested direction, return
- *             false.  Else, return true and set the currentItemData for the
- *             scan to the right thing.
+ * On entry, if so->currPos.buf is valid the buffer is pinned but not locked;
+ * if pinned, we'll drop the pin before moving to next page.  The buffer is
+ * not locked on entry.
+ *
+ * On success exit, so->currPos is updated to contain data from the next
+ * interesting page.  For success on a scan using a non-MVCC snapshot we hold
+ * a pin, but not a read lock, on that page.  If we do not hold the pin, we
+ * set so->currPos.buf to InvalidBuffer.  We return TRUE to indicate success.
+ *
+ * If there are no more matching records in the given direction, we drop all
+ * locks and pins, set so->currPos.buf to InvalidBuffer, and return FALSE.
  */
-bool
-_bt_step(IndexScanDesc scan, Buffer *bufP, ScanDirection dir)
+static bool
+_bt_steppage(IndexScanDesc scan, ScanDirection dir)
 {
+       BTScanOpaque so = (BTScanOpaque) scan->opaque;
+       Relation        rel;
        Page            page;
        BTPageOpaque opaque;
-       OffsetNumber offnum,
-                               maxoff;
-       OffsetNumber start;
-       BlockNumber blkno;
-       BlockNumber obknum;
-       BTScanOpaque so;
-       ItemPointer current;
-       Relation        rel;
 
-       rel = scan->relation;
-       current = &(scan->currentItemData);
+       Assert(BTScanPosIsValid(so->currPos));
+
+       /* Before leaving current page, deal with any killed items */
+       if (so->numKilled > 0)
+               _bt_killitems(scan);
 
        /*
-        * Don't use ItemPointerGetOffsetNumber or you risk to get assertion
-        * due to ability of ip_posid to be equal 0.
+        * Before we modify currPos, make a copy of the page data if there was a
+        * mark position that needs it.
         */
-       offnum = current->ip_posid;
-       page = BufferGetPage(*bufP);
-       opaque = (BTPageOpaque) PageGetSpecialPointer(page);
-       so = (BTScanOpaque) scan->opaque;
-       maxoff = PageGetMaxOffsetNumber(page);
+       if (so->markItemIndex >= 0)
+       {
+               /* bump pin on current buffer for assignment to mark buffer */
+               if (BTScanPosIsPinned(so->currPos))
+                       IncrBufferRefCount(so->currPos.buf);
+               memcpy(&so->markPos, &so->currPos,
+                          offsetof(BTScanPosData, items[1]) +
+                          so->currPos.lastItem * sizeof(BTScanPosItem));
+               if (so->markTuples)
+                       memcpy(so->markTuples, so->currTuples,
+                                  so->currPos.nextTupleOffset);
+               so->markPos.itemIndex = so->markItemIndex;
+               so->markItemIndex = -1;
+       }
+
+       rel = scan->indexRelation;
 
-       /* get the next tuple */
        if (ScanDirectionIsForward(dir))
        {
-               if (!PageIsEmpty(page) && offnum < maxoff)
-                       offnum = OffsetNumberNext(offnum);
-               else
-               {
+               /* Walk right to the next page with data */
+               /* We must rely on the previously saved nextPage link! */
+               BlockNumber blkno = so->currPos.nextPage;
 
-                       /* if we're at end of scan, release the buffer and return */
-                       blkno = opaque->btpo_next;
-                       if (P_RIGHTMOST(opaque))
+               /* Remember we left a page with data */
+               so->currPos.moreLeft = true;
+
+               /* release the previous buffer, if pinned */
+               BTScanPosUnpinIfPinned(so->currPos);
+
+               for (;;)
+               {
+                       /* if we're at end of scan, give up */
+                       if (blkno == P_NONE || !so->currPos.moreRight)
                        {
-                               _bt_relbuf(rel, *bufP, BT_READ);
-                               ItemPointerSetInvalid(current);
-                               *bufP = so->btso_curbuf = InvalidBuffer;
+                               BTScanPosInvalidate(so->currPos);
                                return false;
                        }
-                       else
+                       /* check for interrupts while we're not holding any buffer lock */
+                       CHECK_FOR_INTERRUPTS();
+                       /* step right one page */
+                       so->currPos.buf = _bt_getbuf(rel, blkno, BT_READ);
+                       /* check for deleted page */
+                       page = BufferGetPage(so->currPos.buf);
+                       TestForOldSnapshot(scan->xs_snapshot, rel, page);
+                       opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+                       if (!P_IGNORE(opaque))
                        {
-
-                               /* walk right to the next page with data */
-                               _bt_relbuf(rel, *bufP, BT_READ);
-                               for (;;)
-                               {
-                                       *bufP = _bt_getbuf(rel, blkno, BT_READ);
-                                       page = BufferGetPage(*bufP);
-                                       opaque = (BTPageOpaque) PageGetSpecialPointer(page);
-                                       maxoff = PageGetMaxOffsetNumber(page);
-                                       start = P_RIGHTMOST(opaque) ? P_HIKEY : P_FIRSTKEY;
-
-                                       if (!PageIsEmpty(page) && start <= maxoff)
-                                               break;
-                                       else
-                                       {
-                                               blkno = opaque->btpo_next;
-                                               _bt_relbuf(rel, *bufP, BT_READ);
-                                               if (blkno == P_NONE)
-                                               {
-                                                       *bufP = so->btso_curbuf = InvalidBuffer;
-                                                       ItemPointerSetInvalid(current);
-                                                       return false;
-                                               }
-                                       }
-                               }
-                               offnum = start;
+                               PredicateLockPage(rel, blkno, scan->xs_snapshot);
+                               /* see if there are any matches on this page */
+                               /* note that this will clear moreRight if we can stop */
+                               if (_bt_readpage(scan, dir, P_FIRSTDATAKEY(opaque)))
+                                       break;
                        }
+
+                       /* nope, keep going */
+                       blkno = opaque->btpo_next;
+                       _bt_relbuf(rel, so->currPos.buf);
                }
        }
-       else if (ScanDirectionIsBackward(dir))
+       else
        {
+               /* Remember we left a page with data */
+               so->currPos.moreRight = true;
 
-               /* remember that high key is item zero on non-rightmost pages */
-               start = P_RIGHTMOST(opaque) ? P_HIKEY : P_FIRSTKEY;
-
-               if (offnum > start)
-                       offnum = OffsetNumberPrev(offnum);
+               /*
+                * Walk left to the next page with data.  This is much more complex
+                * than the walk-right case because of the possibility that the page
+                * to our left splits while we are in flight to it, plus the
+                * possibility that the page we were on gets deleted after we leave
+                * it.  See nbtree/README for details.
+                *
+                * It might be possible to rearrange this code to have less overhead
+                * in pinning and locking, but that would require capturing the left
+                * pointer when the page is initially read, and using it here, along
+                * with big changes to _bt_walk_left() and the code below.  It is not
+                * clear whether this would be a win, since if the page immediately to
+                * the left splits after we read this page and before we step left, we
+                * would need to visit more pages than with the current code.
+                *
+                * Note that if we change the code so that we drop the pin for a scan
+                * which uses a non-MVCC snapshot, we will need to modify the code for
+                * walking left, to allow for the possibility that a referenced page
+                * has been deleted.  As long as the buffer is pinned or the snapshot
+                * is MVCC the page cannot move past the half-dead state to fully
+                * deleted.
+                */
+               if (BTScanPosIsPinned(so->currPos))
+                       LockBuffer(so->currPos.buf, BT_READ);
                else
-               {
+                       so->currPos.buf = _bt_getbuf(rel, so->currPos.currPage, BT_READ);
 
-                       /* if we're at end of scan, release the buffer and return */
-                       blkno = opaque->btpo_prev;
-                       if (P_LEFTMOST(opaque))
+               for (;;)
+               {
+                       /* Done if we know there are no matching keys to the left */
+                       if (!so->currPos.moreLeft)
                        {
-                               _bt_relbuf(rel, *bufP, BT_READ);
-                               *bufP = so->btso_curbuf = InvalidBuffer;
-                               ItemPointerSetInvalid(current);
+                               _bt_relbuf(rel, so->currPos.buf);
+                               BTScanPosInvalidate(so->currPos);
                                return false;
                        }
-                       else
-                       {
 
-                               obknum = BufferGetBlockNumber(*bufP);
+                       /* Step to next physical page */
+                       so->currPos.buf = _bt_walk_left(rel, so->currPos.buf,
+                                                                                       scan->xs_snapshot);
 
-                               /* walk right to the next page with data */
-                               _bt_relbuf(rel, *bufP, BT_READ);
-                               for (;;)
-                               {
-                                       *bufP = _bt_getbuf(rel, blkno, BT_READ);
-                                       page = BufferGetPage(*bufP);
-                                       opaque = (BTPageOpaque) PageGetSpecialPointer(page);
-                                       maxoff = PageGetMaxOffsetNumber(page);
-
-                                       /*
-                                        * If the adjacent page just split, then we may have
-                                        * the wrong block.  Handle this case.  Because pages
-                                        * only split right, we don't have to worry about this
-                                        * failing to terminate.
-                                        */
-
-                                       while (opaque->btpo_next != obknum)
-                                       {
-                                               blkno = opaque->btpo_next;
-                                               _bt_relbuf(rel, *bufP, BT_READ);
-                                               *bufP = _bt_getbuf(rel, blkno, BT_READ);
-                                               page = BufferGetPage(*bufP);
-                                               opaque = (BTPageOpaque) PageGetSpecialPointer(page);
-                                               maxoff = PageGetMaxOffsetNumber(page);
-                                       }
-
-                                       /* don't consider the high key */
-                                       start = P_RIGHTMOST(opaque) ? P_HIKEY : P_FIRSTKEY;
+                       /* if we're physically at end of index, return failure */
+                       if (so->currPos.buf == InvalidBuffer)
+                       {
+                               BTScanPosInvalidate(so->currPos);
+                               return false;
+                       }
 
-                                       /* anything to look at here? */
-                                       if (!PageIsEmpty(page) && maxoff >= start)
-                                               break;
-                                       else
-                                       {
-                                               blkno = opaque->btpo_prev;
-                                               obknum = BufferGetBlockNumber(*bufP);
-                                               _bt_relbuf(rel, *bufP, BT_READ);
-                                               if (blkno == P_NONE)
-                                               {
-                                                       *bufP = so->btso_curbuf = InvalidBuffer;
-                                                       ItemPointerSetInvalid(current);
-                                                       return false;
-                                               }
-                                       }
-                               }
-                               offnum = maxoff;/* XXX PageIsEmpty? */
+                       /*
+                        * Okay, we managed to move left to a non-deleted page. Done if
+                        * it's not half-dead and contains matching tuples. Else loop back
+                        * and do it all again.
+                        */
+                       page = BufferGetPage(so->currPos.buf);
+                       TestForOldSnapshot(scan->xs_snapshot, rel, page);
+                       opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+                       if (!P_IGNORE(opaque))
+                       {
+                               PredicateLockPage(rel, BufferGetBlockNumber(so->currPos.buf), scan->xs_snapshot);
+                               /* see if there are any matches on this page */
+                               /* note that this will clear moreLeft if we can stop */
+                               if (_bt_readpage(scan, dir, PageGetMaxOffsetNumber(page)))
+                                       break;
                        }
                }
        }
-       blkno = BufferGetBlockNumber(*bufP);
-       so->btso_curbuf = *bufP;
-       ItemPointerSet(current, blkno, offnum);
+
+       /* Drop the lock, and maybe the pin, on the current page */
+       _bt_drop_lock_and_maybe_pin(scan, &so->currPos);
 
        return true;
 }
 
 /*
- *     _bt_twostep() -- Move to an adjacent record in a scan on the tree,
- *                                      if an adjacent record exists.
+ * _bt_walk_left() -- step left one page, if possible
  *
- *             This is like _bt_step, except that if no adjacent record exists
- *             it restores us to where we were before trying the step.  This is
- *             only hairy when you cross page boundaries, since the page you cross
- *             from could have records inserted or deleted, or could even split.
- *             This is unlikely, but we try to handle it correctly here anyway.
+ * The given buffer must be pinned and read-locked.  This will be dropped
+ * before stepping left.  On return, we have pin and read lock on the
+ * returned page, instead.
  *
- *             This routine contains the only case in which our changes to Lehman
- *             and Yao's algorithm.
+ * Returns InvalidBuffer if there is no page to the left (no lock is held
+ * in that case).
  *
- *             Like step, this routine leaves the scan's currentItemData in the
- *             proper state and acquires a lock and pin on *bufP.      If the twostep
- *             succeeded, we return true; otherwise, we return false.
+ * When working on a non-leaf level, it is possible for the returned page
+ * to be half-dead; the caller should check that condition and step left
+ * again if it's important.
  */
-static bool
-_bt_twostep(IndexScanDesc scan, Buffer *bufP, ScanDirection dir)
+static Buffer
+_bt_walk_left(Relation rel, Buffer buf, Snapshot snapshot)
 {
        Page            page;
        BTPageOpaque opaque;
-       OffsetNumber offnum,
-                               maxoff;
-       OffsetNumber start;
-       ItemPointer current;
-       ItemId          itemid;
-       int                     itemsz;
-       BTItem          btitem;
-       BTItem          svitem;
-       BlockNumber blkno;
 
-       blkno = BufferGetBlockNumber(*bufP);
-       page = BufferGetPage(*bufP);
+       page = BufferGetPage(buf);
        opaque = (BTPageOpaque) PageGetSpecialPointer(page);
-       maxoff = PageGetMaxOffsetNumber(page);
-       current = &(scan->currentItemData);
-       offnum = ItemPointerGetOffsetNumber(current);
 
-       start = P_RIGHTMOST(opaque) ? P_HIKEY : P_FIRSTKEY;
-
-       /* if we're safe, just do it */
-       if (ScanDirectionIsForward(dir) && offnum < maxoff)
-       {                                                       /* XXX PageIsEmpty? */
-               ItemPointerSet(current, blkno, OffsetNumberNext(offnum));
-               return true;
-       }
-       else if (ScanDirectionIsBackward(dir) && offnum > start)
+       for (;;)
        {
-               ItemPointerSet(current, blkno, OffsetNumberPrev(offnum));
-               return true;
-       }
-
-       /* if we've hit end of scan we don't have to do any work */
-       if (ScanDirectionIsForward(dir) && P_RIGHTMOST(opaque))
-               return false;
-       else if (ScanDirectionIsBackward(dir) && P_LEFTMOST(opaque))
-               return false;
+               BlockNumber obknum;
+               BlockNumber lblkno;
+               BlockNumber blkno;
+               int                     tries;
 
-       /*
-        * Okay, it's off the page; let _bt_step() do the hard work, and we'll
-        * try to remember where we were.  This is not guaranteed to work;
-        * this is the only place in the code where concurrency can screw us
-        * up, and it's because we want to be able to move in two directions
-        * in the scan.
-        */
-
-       itemid = PageGetItemId(page, offnum);
-       itemsz = ItemIdGetLength(itemid);
-       btitem = (BTItem) PageGetItem(page, itemid);
-       svitem = (BTItem) palloc(itemsz);
-       memmove((char *) svitem, (char *) btitem, itemsz);
+               /* if we're at end of tree, release buf and return failure */
+               if (P_LEFTMOST(opaque))
+               {
+                       _bt_relbuf(rel, buf);
+                       break;
+               }
+               /* remember original page we are stepping left from */
+               obknum = BufferGetBlockNumber(buf);
+               /* step left */
+               blkno = lblkno = opaque->btpo_prev;
+               _bt_relbuf(rel, buf);
+               /* check for interrupts while we're not holding any buffer lock */
+               CHECK_FOR_INTERRUPTS();
+               buf = _bt_getbuf(rel, blkno, BT_READ);
+               page = BufferGetPage(buf);
+               TestForOldSnapshot(snapshot, rel, page);
+               opaque = (BTPageOpaque) PageGetSpecialPointer(page);
 
-       if (_bt_step(scan, bufP, dir))
-       {
-               pfree(svitem);
-               return true;
-       }
+               /*
+                * If this isn't the page we want, walk right till we find what we
+                * want --- but go no more than four hops (an arbitrary limit). If we
+                * don't find the correct page by then, the most likely bet is that
+                * the original page got deleted and isn't in the sibling chain at all
+                * anymore, not that its left sibling got split more than four times.
+                *
+                * Note that it is correct to test P_ISDELETED not P_IGNORE here,
+                * because half-dead pages are still in the sibling chain.  Caller
+                * must reject half-dead pages if wanted.
+                */
+               tries = 0;
+               for (;;)
+               {
+                       if (!P_ISDELETED(opaque) && opaque->btpo_next == obknum)
+                       {
+                               /* Found desired page, return it */
+                               return buf;
+                       }
+                       if (P_RIGHTMOST(opaque) || ++tries > 4)
+                               break;
+                       blkno = opaque->btpo_next;
+                       buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);
+                       page = BufferGetPage(buf);
+                       TestForOldSnapshot(snapshot, rel, page);
+                       opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+               }
 
-       /* try to find our place again */
-       *bufP = _bt_getbuf(scan->relation, blkno, BT_READ);
-       page = BufferGetPage(*bufP);
-       maxoff = PageGetMaxOffsetNumber(page);
+               /* Return to the original page to see what's up */
+               buf = _bt_relandgetbuf(rel, buf, obknum, BT_READ);
+               page = BufferGetPage(buf);
+               TestForOldSnapshot(snapshot, rel, page);
+               opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+               if (P_ISDELETED(opaque))
+               {
+                       /*
+                        * It was deleted.  Move right to first nondeleted page (there
+                        * must be one); that is the page that has acquired the deleted
+                        * one's keyspace, so stepping left from it will take us where we
+                        * want to be.
+                        */
+                       for (;;)
+                       {
+                               if (P_RIGHTMOST(opaque))
+                                       elog(ERROR, "fell off the end of index \"%s\"",
+                                                RelationGetRelationName(rel));
+                               blkno = opaque->btpo_next;
+                               buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);
+                               page = BufferGetPage(buf);
+                               TestForOldSnapshot(snapshot, rel, page);
+                               opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+                               if (!P_ISDELETED(opaque))
+                                       break;
+                       }
 
-       while (offnum <= maxoff)
-       {
-               itemid = PageGetItemId(page, offnum);
-               btitem = (BTItem) PageGetItem(page, itemid);
-               if (BTItemSame(btitem, svitem))
+                       /*
+                        * Now return to top of loop, resetting obknum to point to this
+                        * nondeleted page, and try again.
+                        */
+               }
+               else
                {
-                       pfree(svitem);
-                       ItemPointerSet(current, blkno, offnum);
-                       return false;
+                       /*
+                        * It wasn't deleted; the explanation had better be that the page
+                        * to the left got split or deleted. Without this check, we'd go
+                        * into an infinite loop if there's anything wrong.
+                        */
+                       if (opaque->btpo_prev == lblkno)
+                               elog(ERROR, "could not find left sibling of block %u in index \"%s\"",
+                                        obknum, RelationGetRelationName(rel));
+                       /* Okay to try again with new lblkno value */
                }
        }
 
-       /*
-        * XXX crash and burn -- can't find our place.  We can be a little
-        * smarter -- walk to the next page to the right, for example, since
-        * that's the only direction that splits happen in.  Deletions screw
-        * us up less often since they're only done by the vacuum daemon.
-        */
-
-       elog(ERROR, "btree synchronization error:  concurrent update botched scan");
-
-       return false;
+       return InvalidBuffer;
 }
 
 /*
- *     _bt_endpoint() -- Find the first or last key in the index.
+ * _bt_get_endpoint() -- Find the first or last page on a given tree level
+ *
+ * If the index is empty, we will return InvalidBuffer; any other failure
+ * condition causes ereport().  We will not return a dead page.
+ *
+ * The returned buffer is pinned and read-locked.
  */
-static RetrieveIndexResult
-_bt_endpoint(IndexScanDesc scan, ScanDirection dir)
+Buffer
+_bt_get_endpoint(Relation rel, uint32 level, bool rightmost,
+                                Snapshot snapshot)
 {
-       Relation        rel;
        Buffer          buf;
        Page            page;
        BTPageOpaque opaque;
-       ItemPointer current;
-       OffsetNumber offnum,
-                               maxoff;
-       OffsetNumber start = 0;
+       OffsetNumber offnum;
        BlockNumber blkno;
-       BTItem          btitem;
        IndexTuple      itup;
-       BTScanOpaque so;
-       RetrieveIndexResult res;
-       Size            keysok;
 
-       rel = scan->relation;
-       current = &(scan->currentItemData);
-       so = (BTScanOpaque) scan->opaque;
+       /*
+        * If we are looking for a leaf page, okay to descend from fast root;
+        * otherwise better descend from true root.  (There is no point in being
+        * smarter about intermediate levels.)
+        */
+       if (level == 0)
+               buf = _bt_getroot(rel, BT_READ);
+       else
+               buf = _bt_gettrueroot(rel);
+
+       if (!BufferIsValid(buf))
+               return InvalidBuffer;
 
-       buf = _bt_getroot(rel, BT_READ);
-       blkno = BufferGetBlockNumber(buf);
        page = BufferGetPage(buf);
+       TestForOldSnapshot(snapshot, rel, page);
        opaque = (BTPageOpaque) PageGetSpecialPointer(page);
 
        for (;;)
        {
-               if (opaque->btpo_flags & BTP_LEAF)
+               /*
+                * If we landed on a deleted page, step right to find a live page
+                * (there must be one).  Also, if we want the rightmost page, step
+                * right if needed to get to it (this could happen if the page split
+                * since we obtained a pointer to it).
+                */
+               while (P_IGNORE(opaque) ||
+                          (rightmost && !P_RIGHTMOST(opaque)))
+               {
+                       blkno = opaque->btpo_next;
+                       if (blkno == P_NONE)
+                               elog(ERROR, "fell off the end of index \"%s\"",
+                                        RelationGetRelationName(rel));
+                       buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);
+                       page = BufferGetPage(buf);
+                       TestForOldSnapshot(snapshot, rel, page);
+                       opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+               }
+
+               /* Done? */
+               if (opaque->btpo.level == level)
                        break;
+               if (opaque->btpo.level < level)
+                       elog(ERROR, "btree level %u not found in index \"%s\"",
+                                level, RelationGetRelationName(rel));
 
-               if (ScanDirectionIsForward(dir))
-                       offnum = P_RIGHTMOST(opaque) ? P_HIKEY : P_FIRSTKEY;
-               else
+               /* Descend to leftmost or rightmost child page */
+               if (rightmost)
                        offnum = PageGetMaxOffsetNumber(page);
+               else
+                       offnum = P_FIRSTDATAKEY(opaque);
 
-               btitem = (BTItem) PageGetItem(page, PageGetItemId(page, offnum));
-               itup = &(btitem->bti_itup);
-
+               itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
                blkno = ItemPointerGetBlockNumber(&(itup->t_tid));
 
-               _bt_relbuf(rel, buf, BT_READ);
-               buf = _bt_getbuf(rel, blkno, BT_READ);
+               buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);
                page = BufferGetPage(buf);
                opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+       }
 
-               /*
-                * Race condition: If the child page we just stepped onto is in
-                * the process of being split, we need to make sure we're all the
-                * way at the right edge of the tree.  See the paper by Lehman and
-                * Yao.
-                */
+       return buf;
+}
 
-               if (ScanDirectionIsBackward(dir) && !P_RIGHTMOST(opaque))
-               {
-                       do
-                       {
-                               blkno = opaque->btpo_next;
-                               _bt_relbuf(rel, buf, BT_READ);
-                               buf = _bt_getbuf(rel, blkno, BT_READ);
-                               page = BufferGetPage(buf);
-                               opaque = (BTPageOpaque) PageGetSpecialPointer(page);
-                       } while (!P_RIGHTMOST(opaque));
-               }
-       }
+/*
+ *     _bt_endpoint() -- Find the first or last page in the index, and scan
+ * from there to the first key satisfying all the quals.
+ *
+ * This is used by _bt_first() to set up a scan when we've determined
+ * that the scan must start at the beginning or end of the index (for
+ * a forward or backward scan respectively).  Exit conditions are the
+ * same as for _bt_first().
+ */
+static bool
+_bt_endpoint(IndexScanDesc scan, ScanDirection dir)
+{
+       Relation        rel = scan->indexRelation;
+       BTScanOpaque so = (BTScanOpaque) scan->opaque;
+       Buffer          buf;
+       Page            page;
+       BTPageOpaque opaque;
+       OffsetNumber start;
+       BTScanPosItem *currItem;
 
-       /* okay, we've got the {left,right}-most page in the tree */
-       maxoff = PageGetMaxOffsetNumber(page);
+       /*
+        * Scan down to the leftmost or rightmost leaf page.  This is a simplified
+        * version of _bt_search().  We don't maintain a stack since we know we
+        * won't need it.
+        */
+       buf = _bt_get_endpoint(rel, 0, ScanDirectionIsBackward(dir), scan->xs_snapshot);
 
-       if (ScanDirectionIsForward(dir))
+       if (!BufferIsValid(buf))
        {
-               if (!P_LEFTMOST(opaque))/* non-leftmost page ? */
-                       elog(ERROR, "_bt_endpoint: leftmost page (%u) has not leftmost flag", blkno);
-               start = P_RIGHTMOST(opaque) ? P_HIKEY : P_FIRSTKEY;
-
                /*
-                * I don't understand this stuff! It doesn't work for
-                * non-rightmost pages with only one element (P_HIKEY) which we
-                * have after deletion itups by vacuum (it's case of start >
-                * maxoff). Scanning in BackwardScanDirection is not
-                * understandable at all. Well - new stuff. - vadim 12/06/96
+                * Empty index. Lock the whole relation, as nothing finer to lock
+                * exists.
                 */
-#ifdef NOT_USED
-               if (PageIsEmpty(page) || start > maxoff)
-               {
-                       ItemPointerSet(current, blkno, maxoff);
-                       if (!_bt_step(scan, &buf, BackwardScanDirection))
-                               return (RetrieveIndexResult) NULL;
+               PredicateLockRelation(rel, scan->xs_snapshot);
+               BTScanPosInvalidate(so->currPos);
+               return false;
+       }
 
-                       start = ItemPointerGetOffsetNumber(current);
-                       page = BufferGetPage(buf);
-               }
-#endif
-               if (PageIsEmpty(page))
-               {
-                       if (start != P_HIKEY)           /* non-rightmost page */
-                               elog(ERROR, "_bt_endpoint: non-rightmost page (%u) is empty", blkno);
+       PredicateLockPage(rel, BufferGetBlockNumber(buf), scan->xs_snapshot);
+       page = BufferGetPage(buf);
+       opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+       Assert(P_ISLEAF(opaque));
 
-                       /*
-                        * It's left- & right- most page - root page, - and it's
-                        * empty...
-                        */
-                       _bt_relbuf(rel, buf, BT_READ);
-                       ItemPointerSetInvalid(current);
-                       so->btso_curbuf = InvalidBuffer;
-                       return (RetrieveIndexResult) NULL;
-               }
-               if (start > maxoff)             /* start == 2 && maxoff == 1 */
-               {
-                       ItemPointerSet(current, blkno, maxoff);
-                       if (!_bt_step(scan, &buf, ForwardScanDirection))
-                               return (RetrieveIndexResult) NULL;
+       if (ScanDirectionIsForward(dir))
+       {
+               /* There could be dead pages to the left, so not this: */
+               /* Assert(P_LEFTMOST(opaque)); */
 
-                       start = ItemPointerGetOffsetNumber(current);
-                       page = BufferGetPage(buf);
-               }
-               /* new stuff ends here */
-               else
-                       ItemPointerSet(current, blkno, start);
+               start = P_FIRSTDATAKEY(opaque);
        }
        else if (ScanDirectionIsBackward(dir))
        {
+               Assert(P_RIGHTMOST(opaque));
 
-               /*
-                * I don't understand this stuff too! If RIGHT-most leaf page is
-                * empty why do scanning in ForwardScanDirection ??? Well - new
-                * stuff. - vadim 12/06/96
-                */
-#ifdef NOT_USED
-               if (PageIsEmpty(page))
-               {
-                       ItemPointerSet(current, blkno, FirstOffsetNumber);
-                       if (!_bt_step(scan, &buf, ForwardScanDirection))
-                               return (RetrieveIndexResult) NULL;
-
-                       start = ItemPointerGetOffsetNumber(current);
-                       page = BufferGetPage(buf);
-               }
-#endif
-               if (PageIsEmpty(page))
-               {
-                       /* If it's leftmost page too - it's empty root page... */
-                       if (P_LEFTMOST(opaque))
-                       {
-                               _bt_relbuf(rel, buf, BT_READ);
-                               ItemPointerSetInvalid(current);
-                               so->btso_curbuf = InvalidBuffer;
-                               return (RetrieveIndexResult) NULL;
-                       }
-                       /* Go back ! */
-                       ItemPointerSet(current, blkno, FirstOffsetNumber);
-                       if (!_bt_step(scan, &buf, BackwardScanDirection))
-                               return (RetrieveIndexResult) NULL;
-
-                       start = ItemPointerGetOffsetNumber(current);
-                       page = BufferGetPage(buf);
-               }
-               /* new stuff ends here */
-               else
-               {
-                       start = PageGetMaxOffsetNumber(page);
-                       ItemPointerSet(current, blkno, start);
-               }
+               start = PageGetMaxOffsetNumber(page);
        }
        else
-               elog(ERROR, "Illegal scan direction %d", dir);
+       {
+               elog(ERROR, "invalid scan direction: %d", (int) dir);
+               start = 0;                              /* keep compiler quiet */
+       }
 
-       btitem = (BTItem) PageGetItem(page, PageGetItemId(page, start));
-       itup = &(btitem->bti_itup);
+       /* remember which buffer we have pinned */
+       so->currPos.buf = buf;
 
-       /* see if we picked a winner */
-       if (_bt_checkkeys(scan, itup, &keysok))
+       /* initialize moreLeft/moreRight appropriately for scan direction */
+       if (ScanDirectionIsForward(dir))
        {
-               res = FormRetrieveIndexResult(current, &(itup->t_tid));
-
-               /* remember which buffer we have pinned */
-               so->btso_curbuf = buf;
+               so->currPos.moreLeft = false;
+               so->currPos.moreRight = true;
        }
-       else if (keysok >= so->numberOfFirstKeys)
+       else
        {
-               so->btso_curbuf = buf;
-               return _bt_next(scan, dir);
+               so->currPos.moreLeft = true;
+               so->currPos.moreRight = false;
        }
-       else if (keysok == -1 && ScanDirectionIsBackward(dir))
+       so->numKilled = 0;                      /* just paranoia */
+       so->markItemIndex = -1;         /* ditto */
+
+       /*
+        * Now load data from the first page of the scan.
+        */
+       if (!_bt_readpage(scan, dir, start))
        {
-               so->btso_curbuf = buf;
-               return _bt_next(scan, dir);
+               /*
+                * There's no actually-matching data on this page.  Try to advance to
+                * the next page.  Return false if there's no matching data at all.
+                */
+               LockBuffer(so->currPos.buf, BUFFER_LOCK_UNLOCK);
+               if (!_bt_steppage(scan, dir))
+                       return false;
        }
        else
        {
-               ItemPointerSetInvalid(current);
-               so->btso_curbuf = InvalidBuffer;
-               _bt_relbuf(rel, buf, BT_READ);
-               res = (RetrieveIndexResult) NULL;
+               /* Drop the lock, and maybe the pin, on the current page */
+               _bt_drop_lock_and_maybe_pin(scan, &so->currPos);
        }
 
-       return res;
+       /* OK, itemIndex says what to return */
+       currItem = &so->currPos.items[so->currPos.itemIndex];
+       scan->xs_ctup.t_self = currItem->heapTid;
+       if (scan->xs_want_itup)
+               scan->xs_itup = (IndexTuple) (so->currTuples + currItem->tupleOffset);
+
+       return true;
 }