Update copyright via script for 2017

[postgresql] / src / backend / access / nbtree / nbtsearch.c
diff --git a/src/backend/access/nbtree/nbtsearch.c b/src/backend/access/nbtree/nbtsearch.c

index cc852984ce28f69ce83e4af3c7333bc8fa6c0154..4fba75aaf66a613ed198894384c2708f5d47ea29 100644 (file)
--- a/src/backend/access/nbtree/nbtsearch.c
+++ b/src/backend/access/nbtree/nbtsearch.c
@@ -4,30 +4,73 @@
   *       Search code for postgres btrees.
   *
   *
- * Portions Copyright (c) 1996-2003, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
   * Portions Copyright (c) 1994, Regents of the University of California
   *
   * IDENTIFICATION
- *       $PostgreSQL: pgsql/src/backend/access/nbtree/nbtsearch.c,v 1.85 2003/12/21 03:00:04 tgl Exp $
+ *       src/backend/access/nbtree/nbtsearch.c
   *
   *-------------------------------------------------------------------------
   */
  
  #include "postgres.h"
  
-#include "access/genam.h"
  #include "access/nbtree.h"
+#include "access/relscan.h"
+#include "miscadmin.h"
+#include "pgstat.h"
+#include "storage/predicate.h"
  #include "utils/lsyscache.h"
+#include "utils/rel.h"
+#include "utils/tqual.h"
  
  
-static Buffer _bt_walk_left(Relation rel, Buffer buf);
+static bool _bt_readpage(IndexScanDesc scan, ScanDirection dir,
+                        OffsetNumber offnum);
+static void _bt_saveitem(BTScanOpaque so, int itemIndex,
+                        OffsetNumber offnum, IndexTuple itup);
+static bool _bt_steppage(IndexScanDesc scan, ScanDirection dir);
+static Buffer _bt_walk_left(Relation rel, Buffer buf, Snapshot snapshot);
  static bool _bt_endpoint(IndexScanDesc scan, ScanDirection dir);
+static void _bt_drop_lock_and_maybe_pin(IndexScanDesc scan, BTScanPos sp);
+
+
+/*
+ *     _bt_drop_lock_and_maybe_pin()
+ *
+ * Unlock the buffer; and if it is safe to release the pin, do that, too.  It
+ * is safe if the scan is using an MVCC snapshot and the index is WAL-logged.
+ * This will prevent vacuum from stalling in a blocked state trying to read a
+ * page when a cursor is sitting on it -- at least in many important cases.
+ *
+ * Set the buffer to invalid if the pin is released, since the buffer may be
+ * re-used.  If we need to go back to this block (for example, to apply
+ * LP_DEAD hints) we must get a fresh reference to the buffer.  Hopefully it
+ * will remain in shared memory for as long as it takes to scan the index
+ * buffer page.
+ */
+static void
+_bt_drop_lock_and_maybe_pin(IndexScanDesc scan, BTScanPos sp)
+{
+       LockBuffer(sp->buf, BUFFER_LOCK_UNLOCK);
+
+       if (IsMVCCSnapshot(scan->xs_snapshot) &&
+               RelationNeedsWAL(scan->indexRelation) &&
+               !scan->xs_want_itup)
+       {
+               ReleaseBuffer(sp->buf);
+               sp->buf = InvalidBuffer;
+       }
+}
  
  
  /*
   *     _bt_search() -- Search the tree for a particular scankey,
   *             or more precisely for the first leaf page it could be on.
   *
+ * The passed scankey must be an insertion-type scankey (see nbtree/README),
+ * but it can omit the rightmost column(s) of the index.
+ *
   * When nextkey is false (the usual case), we are looking for the first
   * item >= scankey.  When nextkey is true, we are looking for the first
   * item strictly greater than scankey.
@@ -36,14 +79,19 @@ static bool _bt_endpoint(IndexScanDesc scan, ScanDirection dir);
   * address of the leaf-page buffer, which is read-locked and pinned.
   * No locks are held on the parent pages, however!
   *
+ * If the snapshot parameter is not NULL, "old snapshot" checking will take
+ * place during the descent through the tree.  This is not needed when
+ * positioning for an insert or delete, so NULL is used for those cases.
+ *
   * NOTE that the returned buffer is read-locked regardless of the access
   * parameter.  However, access = BT_WRITE will allow an empty root page
- * to be created and returned. When access = BT_READ, an empty index
- * will result in *bufP being set to InvalidBuffer.
+ * to be created and returned.  When access = BT_READ, an empty index
+ * will result in *bufP being set to InvalidBuffer.  Also, in BT_WRITE mode,
+ * any incomplete splits encountered during the search will be finished.
   */
  BTStack
  _bt_search(Relation rel, int keysz, ScanKey scankey, bool nextkey,
-                  Buffer *bufP, int access)
+                  Buffer *bufP, int access, Snapshot snapshot)
  {
         BTStack         stack_in = NULL;
  
@@ -61,18 +109,26 @@ _bt_search(Relation rel, int keysz, ScanKey scankey, bool nextkey,
                 BTPageOpaque opaque;
                 OffsetNumber offnum;
                 ItemId          itemid;
-               BTItem          btitem;
                 IndexTuple      itup;
                 BlockNumber blkno;
                 BlockNumber par_blkno;
                 BTStack         new_stack;
  
                 /*
-                * Race -- the page we just grabbed may have split since we read
-                * its pointer in the parent (or metapage).  If it has, we may
-                * need to move right to its new sibling.  Do that.
+                * Race -- the page we just grabbed may have split since we read its
+                * pointer in the parent (or metapage).  If it has, we may need to
+                * move right to its new sibling.  Do that.
+                *
+                * In write-mode, allow _bt_moveright to finish any incomplete splits
+                * along the way.  Strictly speaking, we'd only need to finish an
+                * incomplete split on the leaf page we're about to insert to, not on
+                * any of the upper levels (they are taken care of in _bt_getstackbuf,
+                * if the leaf page is split and we insert to the parent page).  But
+                * this is a good opportunity to finish splits of internal pages too.
                  */
-               *bufP = _bt_moveright(rel, *bufP, keysz, scankey, nextkey, BT_READ);
+               *bufP = _bt_moveright(rel, *bufP, keysz, scankey, nextkey,
+                                                         (access == BT_WRITE), stack_in,
+                                                         BT_READ, snapshot);
  
                 /* if this is a leaf page, we're done */
                 page = BufferGetPage(*bufP);
@@ -81,35 +137,33 @@ _bt_search(Relation rel, int keysz, ScanKey scankey, bool nextkey,
                         break;
  
                 /*
-                * Find the appropriate item on the internal page, and get the
-                * child page that it points to.
+                * Find the appropriate item on the internal page, and get the child
+                * page that it points to.
                  */
                 offnum = _bt_binsrch(rel, *bufP, keysz, scankey, nextkey);
                 itemid = PageGetItemId(page, offnum);
-               btitem = (BTItem) PageGetItem(page, itemid);
-               itup = &(btitem->bti_itup);
+               itup = (IndexTuple) PageGetItem(page, itemid);
                 blkno = ItemPointerGetBlockNumber(&(itup->t_tid));
                 par_blkno = BufferGetBlockNumber(*bufP);
  
                 /*
                  * We need to save the location of the index entry we chose in the
-                * parent page on a stack. In case we split the tree, we'll use
-                * the stack to work back up to the parent page.  We also save the
-                * actual downlink (TID) to uniquely identify the index entry, in
-                * case it moves right while we're working lower in the tree.  See
-                * the paper by Lehman and Yao for how this is detected and
-                * handled. (We use the child link to disambiguate duplicate keys
-                * in the index -- Lehman and Yao disallow duplicate keys.)
+                * parent page on a stack. In case we split the tree, we'll use the
+                * stack to work back up to the parent page.  We also save the actual
+                * downlink (TID) to uniquely identify the index entry, in case it
+                * moves right while we're working lower in the tree.  See the paper
+                * by Lehman and Yao for how this is detected and handled. (We use the
+                * child link to disambiguate duplicate keys in the index -- Lehman
+                * and Yao disallow duplicate keys.)
                  */
                 new_stack = (BTStack) palloc(sizeof(BTStackData));
                 new_stack->bts_blkno = par_blkno;
                 new_stack->bts_offset = offnum;
-               memcpy(&new_stack->bts_btitem, btitem, sizeof(BTItemData));
+               memcpy(&new_stack->bts_btentry, itup, sizeof(IndexTupleData));
                 new_stack->bts_parent = stack_in;
  
                 /* drop the read lock on the parent page, acquire one on the child */
-               _bt_relbuf(rel, *bufP);
-               *bufP = _bt_getbuf(rel, blkno, BT_READ);
+               *bufP = _bt_relandgetbuf(rel, *bufP, blkno, BT_READ);
  
                 /* okay, all set to move down a level */
                 stack_in = new_stack;
@@ -127,18 +181,30 @@ _bt_search(Relation rel, int keysz, ScanKey scankey, bool nextkey,
   * data that appeared on the page originally is either on the page
   * or strictly to the right of it.
   *
- * When nextkey is false (the usual case), we are looking for the first
- * item >= scankey.  When nextkey is true, we are looking for the first
- * item strictly greater than scankey.
- *
   * This routine decides whether or not we need to move right in the
   * tree by examining the high key entry on the page.  If that entry
   * is strictly less than the scankey, or <= the scankey in the nextkey=true
   * case, then we followed the wrong link and we need to move right.
   *
+ * The passed scankey must be an insertion-type scankey (see nbtree/README),
+ * but it can omit the rightmost column(s) of the index.
+ *
+ * When nextkey is false (the usual case), we are looking for the first
+ * item >= scankey.  When nextkey is true, we are looking for the first
+ * item strictly greater than scankey.
+ *
+ * If forupdate is true, we will attempt to finish any incomplete splits
+ * that we encounter.  This is required when locking a target page for an
+ * insertion, because we don't allow inserting on a page before the split
+ * is completed.  'stack' is only used if forupdate is true.
+ *
   * On entry, we have the buffer pinned and a lock of the type specified by
   * 'access'.  If we move right, we release the buffer and lock and acquire
   * the same on the right sibling.  Return value is the buffer we stop at.
+ *
+ * If the snapshot parameter is not NULL, "old snapshot" checking will take
+ * place during the descent through the tree.  This is not needed when
+ * positioning for an insert or delete, so NULL is used for those cases.
   */
  Buffer
  _bt_moveright(Relation rel,
@@ -146,15 +212,15 @@ _bt_moveright(Relation rel,
                           int keysz,
                           ScanKey scankey,
                           bool nextkey,
-                         int access)
+                         bool forupdate,
+                         BTStack stack,
+                         int access,
+                         Snapshot snapshot)
  {
         Page            page;
         BTPageOpaque opaque;
         int32           cmpval;
  
-       page = BufferGetPage(buf);
-       opaque = (BTPageOpaque) PageGetSpecialPointer(page);
-
         /*
          * When nextkey = false (normal case): if the scan key that brought us to
          * this page is > the high key stored on the page, then the page has split
@@ -164,28 +230,59 @@ _bt_moveright(Relation rel,
          *
          * When nextkey = true: move right if the scan key is >= page's high key.
          *
-        * The page could even have split more than once, so scan as far as needed.
+        * The page could even have split more than once, so scan as far as
+        * needed.
          *
          * We also have to move right if we followed a link that brought us to a
          * dead page.
          */
         cmpval = nextkey ? 0 : 1;
  
-       while (!P_RIGHTMOST(opaque) &&
-                  (P_IGNORE(opaque) ||
-                       _bt_compare(rel, keysz, scankey, page, P_HIKEY) >= cmpval))
+       for (;;)
         {
-               /* step right one page */
-               BlockNumber rblkno = opaque->btpo_next;
-
-               _bt_relbuf(rel, buf);
-               buf = _bt_getbuf(rel, rblkno, access);
                 page = BufferGetPage(buf);
+               TestForOldSnapshot(snapshot, rel, page);
                 opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+
+               if (P_RIGHTMOST(opaque))
+                       break;
+
+               /*
+                * Finish any incomplete splits we encounter along the way.
+                */
+               if (forupdate && P_INCOMPLETE_SPLIT(opaque))
+               {
+                       BlockNumber blkno = BufferGetBlockNumber(buf);
+
+                       /* upgrade our lock if necessary */
+                       if (access == BT_READ)
+                       {
+                               LockBuffer(buf, BUFFER_LOCK_UNLOCK);
+                               LockBuffer(buf, BT_WRITE);
+                       }
+
+                       if (P_INCOMPLETE_SPLIT(opaque))
+                               _bt_finish_split(rel, buf, stack);
+                       else
+                               _bt_relbuf(rel, buf);
+
+                       /* re-acquire the lock in the right mode, and re-check */
+                       buf = _bt_getbuf(rel, blkno, access);
+                       continue;
+               }
+
+               if (P_IGNORE(opaque) || _bt_compare(rel, keysz, scankey, page, P_HIKEY) >= cmpval)
+               {
+                       /* step right one page */
+                       buf = _bt_relandgetbuf(rel, buf, opaque->btpo_next, access);
+                       continue;
+               }
+               else
+                       break;
         }
  
         if (P_IGNORE(opaque))
-               elog(ERROR, "fell off the end of \"%s\"",
+               elog(ERROR, "fell off the end of index \"%s\"",
                          RelationGetRelationName(rel));
  
         return buf;
@@ -194,14 +291,13 @@ _bt_moveright(Relation rel,
  /*
   *     _bt_binsrch() -- Do a binary search for a key on a particular page.
   *
+ * The passed scankey must be an insertion-type scankey (see nbtree/README),
+ * but it can omit the rightmost column(s) of the index.
+ *
   * When nextkey is false (the usual case), we are looking for the first
   * item >= scankey.  When nextkey is true, we are looking for the first
   * item strictly greater than scankey.
   *
- * The scankey we get has the compare function stored in the procedure
- * entry of each data struct.  We invoke this regproc to do the
- * comparison for every key in the scankey.
- *
   * On a leaf page, _bt_binsrch() returns the OffsetNumber of the first
   * key >= given scankey, or > scankey if nextkey is true.  (NOTE: in
   * particular, this means it is possible to return a value 1 greater than the
@@ -216,7 +312,7 @@ _bt_moveright(Relation rel,
   * (or leaf keys > given scankey when nextkey is true).
   *
   * This procedure is not responsible for walking right, it just examines
- * the given page.     _bt_binsrch() has no lock or refcount side effects
+ * the given page.  _bt_binsrch() has no lock or refcount side effects
   * on the buffer.
   */
  OffsetNumber
@@ -226,7 +322,6 @@ _bt_binsrch(Relation rel,
                         ScanKey scankey,
                         bool nextkey)
  {
-       TupleDesc       itupdesc;
         Page            page;
         BTPageOpaque opaque;
         OffsetNumber low,
@@ -234,7 +329,6 @@ _bt_binsrch(Relation rel,
         int32           result,
                                 cmpval;
  
-       itupdesc = RelationGetDescr(rel);
         page = BufferGetPage(buf);
         opaque = (BTPageOpaque) PageGetSpecialPointer(page);
  
@@ -242,26 +336,24 @@ _bt_binsrch(Relation rel,
         high = PageGetMaxOffsetNumber(page);
  
         /*
-        * If there are no keys on the page, return the first available slot.
-        * Note this covers two cases: the page is really empty (no keys), or
-        * it contains only a high key.  The latter case is possible after
-        * vacuuming.  This can never happen on an internal page, however,
-        * since they are never empty (an internal page must have children).
+        * If there are no keys on the page, return the first available slot. Note
+        * this covers two cases: the page is really empty (no keys), or it
+        * contains only a high key.  The latter case is possible after vacuuming.
+        * This can never happen on an internal page, however, since they are
+        * never empty (an internal page must have children).
          */
         if (high < low)
                 return low;
  
         /*
-        * Binary search to find the first key on the page >= scan key, or
-        * first key > scankey when nextkey is true.
+        * Binary search to find the first key on the page >= scan key, or first
+        * key > scankey when nextkey is true.
          *
-        * For nextkey=false (cmpval=1), the loop invariant is: all slots
-        * before 'low' are < scan key, all slots at or after 'high'
-        * are >= scan key.
+        * For nextkey=false (cmpval=1), the loop invariant is: all slots before
+        * 'low' are < scan key, all slots at or after 'high' are >= scan key.
          *
-        * For nextkey=true (cmpval=0), the loop invariant is: all slots
-        * before 'low' are <= scan key, all slots at or after 'high'
-        * are > scan key.
+        * For nextkey=true (cmpval=0), the loop invariant is: all slots before
+        * 'low' are <= scan key, all slots at or after 'high' are > scan key.
          *
          * We can fall out when high == low.
          */
@@ -287,8 +379,8 @@ _bt_binsrch(Relation rel,
          * At this point we have high == low, but be careful: they could point
          * past the last slot on the page.
          *
-        * On a leaf page, we always return the first key >= scan key (resp.
-        * > scan key), which could be the last slot + 1.
+        * On a leaf page, we always return the first key >= scan key (resp. >
+        * scan key), which could be the last slot + 1.
          */
         if (P_ISLEAF(opaque))
                 return low;
@@ -305,8 +397,11 @@ _bt_binsrch(Relation rel,
  /*----------
   *     _bt_compare() -- Compare scankey to a particular tuple on the page.
   *
+ * The passed scankey must be an insertion-type scankey (see nbtree/README),
+ * but it can omit the rightmost column(s) of the index.
+ *
   *     keysz: number of key conditions to be checked (might be less than the
- *     total length of the scan key!)
+ *             number of index columns!)
   *     page/offnum: location of btree item to be compared to.
   *
   *             This routine returns:
@@ -334,30 +429,28 @@ _bt_compare(Relation rel,
  {
         TupleDesc       itupdesc = RelationGetDescr(rel);
         BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page);
-       BTItem          btitem;
         IndexTuple      itup;
         int                     i;
  
         /*
-        * Force result ">" if target item is first data item on an internal
-        * page --- see NOTE above.
+        * Force result ">" if target item is first data item on an internal page
+        * --- see NOTE above.
          */
         if (!P_ISLEAF(opaque) && offnum == P_FIRSTDATAKEY(opaque))
                 return 1;
  
-       btitem = (BTItem) PageGetItem(page, PageGetItemId(page, offnum));
-       itup = &(btitem->bti_itup);
+       itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
  
         /*
-        * The scan key is set up with the attribute number associated with
-        * each term in the key.  It is important that, if the index is
-        * multi-key, the scan contain the first k key attributes, and that
-        * they be in order.  If you think about how multi-key ordering works,
-        * you'll understand why this is.
+        * The scan key is set up with the attribute number associated with each
+        * term in the key.  It is important that, if the index is multi-key, the
+        * scan contain the first k key attributes, and that they be in order.  If
+        * you think about how multi-key ordering works, you'll understand why
+        * this is.
          *
          * We don't test for violation of this condition here, however.  The
-        * initial setup for the index scan had better have gotten it right
-        * (see _bt_first).
+        * initial setup for the index scan had better have gotten it right (see
+        * _bt_first).
          */
  
         for (i = 1; i <= keysz; i++)
@@ -373,30 +466,35 @@ _bt_compare(Relation rel,
                 {
                         if (isNull)
                                 result = 0;             /* NULL "=" NULL */
+                       else if (scankey->sk_flags & SK_BT_NULLS_FIRST)
+                               result = -1;    /* NULL "<" NOT_NULL */
                         else
                                 result = 1;             /* NULL ">" NOT_NULL */
                 }
                 else if (isNull)                /* key is NOT_NULL and item is NULL */
                 {
-                       result = -1;            /* NOT_NULL "<" NULL */
+                       if (scankey->sk_flags & SK_BT_NULLS_FIRST)
+                               result = 1;             /* NOT_NULL ">" NULL */
+                       else
+                               result = -1;    /* NOT_NULL "<" NULL */
                 }
                 else
                 {
                         /*
-                        * The sk_func needs to be passed the index value as left arg
-                        * and the sk_argument as right arg (they might be of different
+                        * The sk_func needs to be passed the index value as left arg and
+                        * the sk_argument as right arg (they might be of different
                          * types).  Since it is convenient for callers to think of
-                        * _bt_compare as comparing the scankey to the index item,
-                        * we have to flip the sign of the comparison result.
-                        *
-                        * Note: curious-looking coding is to avoid overflow if
-                        * comparison function returns INT_MIN.  There is no risk of
-                        * overflow for positive results.
+                        * _bt_compare as comparing the scankey to the index item, we have
+                        * to flip the sign of the comparison result.  (Unless it's a DESC
+                        * column, in which case we *don't* flip the sign.)
                          */
-                       result = DatumGetInt32(FunctionCall2(&scankey->sk_func,
-                                                                                                datum,
-                                                                                                scankey->sk_argument));
-                       result = (result < 0) ? 1 : -result;
+                       result = DatumGetInt32(FunctionCall2Coll(&scankey->sk_func,
+                                                                                                        scankey->sk_collation,
+                                                                                                        datum,
+                                                                                                        scankey->sk_argument));
+
+                       if (!(scankey->sk_flags & SK_BT_DESC))
+                               result = -result;
                 }
  
                 /* if the keys are unequal, return the difference */
@@ -410,75 +508,25 @@ _bt_compare(Relation rel,
         return 0;
  }
  
-/*
- *     _bt_next() -- Get the next item in a scan.
- *
- *             On entry, we have a valid currentItemData in the scan, and a
- *             read lock and pin count on the page that contains that item.
- *             We return the next item in the scan, or false if no more.
- *             On successful exit, the page containing the new item is locked
- *             and pinned; on failure exit, no lock or pin is held.
- */
-bool
-_bt_next(IndexScanDesc scan, ScanDirection dir)
-{
-       Relation        rel;
-       Buffer          buf;
-       Page            page;
-       OffsetNumber offnum;
-       ItemPointer current;
-       BTItem          btitem;
-       IndexTuple      itup;
-       BTScanOpaque so;
-       bool            continuescan;
-
-       rel = scan->indexRelation;
-       so = (BTScanOpaque) scan->opaque;
-       current = &(scan->currentItemData);
-
-       /* we still have the buffer pinned and locked */
-       buf = so->btso_curbuf;
-       Assert(BufferIsValid(buf));
-
-       do
-       {
-               /* step one tuple in the appropriate direction */
-               if (!_bt_step(scan, &buf, dir))
-                       return false;
-
-               /* current is the next candidate tuple to return */
-               offnum = ItemPointerGetOffsetNumber(current);
-               page = BufferGetPage(buf);
-               btitem = (BTItem) PageGetItem(page, PageGetItemId(page, offnum));
-               itup = &btitem->bti_itup;
-
-               if (_bt_checkkeys(scan, itup, dir, &continuescan))
-               {
-                       /* tuple passes all scan key conditions, so return it */
-                       scan->xs_ctup.t_self = itup->t_tid;
-                       return true;
-               }
-
-               /* This tuple doesn't pass, but there might be more that do */
-       } while (continuescan);
-
-       /* No more items, so close down the current-item info */
-       ItemPointerSetInvalid(current);
-       so->btso_curbuf = InvalidBuffer;
-       _bt_relbuf(rel, buf);
-
-       return false;
-}
-
  /*
   *     _bt_first() -- Find the first item in a scan.
   *
- *             We need to be clever about the type of scan, the operation it's
- *             performing, and the tree ordering.      We find the
- *             first item in the tree that satisfies the qualification
- *             associated with the scan descriptor.  On exit, the page containing
- *             the current index tuple is read locked and pinned, and the scan's
- *             opaque data entry is updated to include the buffer.
+ *             We need to be clever about the direction of scan, the search
+ *             conditions, and the tree ordering.  We find the first item (or,
+ *             if backwards scan, the last item) in the tree that satisfies the
+ *             qualifications in the scan key.  On success exit, the page containing
+ *             the current index tuple is pinned but not locked, and data about
+ *             the matching tuple(s) on the page has been loaded into so->currPos.
+ *             scan->xs_ctup.t_self is set to the heap TID of the current tuple,
+ *             and if requested, scan->xs_itup points to a copy of the index tuple.
+ *
+ * If there are no matching items in the index, we return FALSE, with no
+ * pins or locks held.
+ *
+ * Note that scan->keyData[], and the so->keyData[] scankey built from it,
+ * are both search-type scankeys (see nbtree/README for more about this).
+ * Within this routine, we build a temporary insertion-type scankey to use
+ * in locating the scan start position.
   */
  bool
  _bt_first(IndexScanDesc scan, ScanDirection dir)
@@ -486,26 +534,26 @@ _bt_first(IndexScanDesc scan, ScanDirection dir)
         Relation        rel = scan->indexRelation;
         BTScanOpaque so = (BTScanOpaque) scan->opaque;
         Buffer          buf;
-       Page            page;
         BTStack         stack;
         OffsetNumber offnum;
-       BTItem          btitem;
-       IndexTuple      itup;
-       ItemPointer current;
-       BlockNumber blkno;
         StrategyNumber strat;
-       bool            res;
         bool            nextkey;
-       bool            continuescan;
-       ScanKey         scankeys;
-       ScanKey    *startKeys = NULL;
+       bool            goback;
+       ScanKey         startKeys[INDEX_MAX_KEYS];
+       ScanKeyData scankeys[INDEX_MAX_KEYS];
+       ScanKeyData notnullkeys[INDEX_MAX_KEYS];
         int                     keysCount = 0;
         int                     i;
         StrategyNumber strat_total;
+       BTScanPosItem *currItem;
+
+       Assert(!BTScanPosIsValid(so->currPos));
+
+       pgstat_count_index_scan(rel);
  
         /*
-        * Examine the scan keys and eliminate any redundant keys; also
-        * discover how many keys must be matched to continue the scan.
+        * Examine the scan keys and eliminate any redundant keys; also mark the
+        * keys that must be matched to continue the scan.
          */
         _bt_preprocess_keys(scan);
  
@@ -528,16 +576,16 @@ _bt_first(IndexScanDesc scan, ScanDirection dir)
          * attributes to its right, because it would break our simplistic notion
          * of what initial positioning strategy to use.
          *
-        * When the scan keys include non-default operators, _bt_preprocess_keys
+        * When the scan keys include cross-type operators, _bt_preprocess_keys
          * may not be able to eliminate redundant keys; in such cases we will
          * arbitrarily pick a usable one for each attribute.  This is correct
          * but possibly not optimal behavior.  (For example, with keys like
          * "x >= 4 AND x >= 5" we would elect to scan starting at x=4 when
-        * x=5 would be more efficient.)  Since the situation only arises in
-        * hokily-worded queries, live with it.
+        * x=5 would be more efficient.)  Since the situation only arises given
+        * a poorly-worded query plus an incomplete opfamily, live with it.
          *
          * When both equality and inequality keys appear for a single attribute
-        * (again, only possible when non-default operators appear), we *must*
+        * (again, only possible when cross-type operators appear), we *must*
          * select one of the equality keys for the starting point, because
          * _bt_checkkeys() will stop the scan as soon as an equality qual fails.
          * For example, if we have keys like "x >= 4 AND x = 10" and we elect to
@@ -545,6 +593,21 @@ _bt_first(IndexScanDesc scan, ScanDirection dir)
          * equality quals survive preprocessing, however, it doesn't matter which
          * one we use --- by definition, they are either redundant or
          * contradictory.
+        *
+        * Any regular (not SK_SEARCHNULL) key implies a NOT NULL qualifier.
+        * If the index stores nulls at the end of the index we'll be starting
+        * from, and we have no boundary key for the column (which means the key
+        * we deduced NOT NULL from is an inequality key that constrains the other
+        * end of the index), then we cons up an explicit SK_SEARCHNOTNULL key to
+        * use as a boundary key.  If we didn't do this, we might find ourselves
+        * traversing a lot of null entries at the start of the scan.
+        *
+        * In this loop, row-comparison keys are treated the same as keys on their
+        * first (leftmost) columns.  We'll add on lower-order columns of the row
+        * comparison below, if possible.
+        *
+        * The selected scan keys (at most one per index column) are remembered by
+        * storing their addresses into the local startKeys[] array.
          *----------
          */
         strat_total = BTEqualStrategyNumber;
@@ -552,9 +615,9 @@ _bt_first(IndexScanDesc scan, ScanDirection dir)
         {
                 AttrNumber      curattr;
                 ScanKey         chosen;
+               ScanKey         impliesNN;
                 ScanKey         cur;
  
-               startKeys = (ScanKey *) palloc(so->numberOfKeys * sizeof(ScanKey));
                 /*
                  * chosen is the so-far-chosen key for the current attribute, if any.
                  * We don't cast the decision in stone until we reach keys for the
@@ -562,6 +625,9 @@ _bt_first(IndexScanDesc scan, ScanDirection dir)
                  */
                 curattr = 1;
                 chosen = NULL;
+               /* Also remember any scankey that implies a NOT NULL constraint */
+               impliesNN = NULL;
+
                 /*
                  * Loop iterates from 0 to numberOfKeys inclusive; we use the last
                  * pass to handle after-last-key processing.  Actual exit from the
@@ -573,14 +639,40 @@ _bt_first(IndexScanDesc scan, ScanDirection dir)
                         {
                                 /*
                                  * Done looking at keys for curattr.  If we didn't find a
-                                * usable boundary key, quit; else save the boundary key
-                                * pointer in startKeys.
+                                * usable boundary key, see if we can deduce a NOT NULL key.
+                                */
+                               if (chosen == NULL && impliesNN != NULL &&
+                                       ((impliesNN->sk_flags & SK_BT_NULLS_FIRST) ?
+                                        ScanDirectionIsForward(dir) :
+                                        ScanDirectionIsBackward(dir)))
+                               {
+                                       /* Yes, so build the key in notnullkeys[keysCount] */
+                                       chosen = &notnullkeys[keysCount];
+                                       ScanKeyEntryInitialize(chosen,
+                                                                                  (SK_SEARCHNOTNULL | SK_ISNULL |
+                                                                                       (impliesNN->sk_flags &
+                                                                                 (SK_BT_DESC | SK_BT_NULLS_FIRST))),
+                                                                                  curattr,
+                                                                ((impliesNN->sk_flags & SK_BT_NULLS_FIRST) ?
+                                                                 BTGreaterStrategyNumber :
+                                                                 BTLessStrategyNumber),
+                                                                                  InvalidOid,
+                                                                                  InvalidOid,
+                                                                                  InvalidOid,
+                                                                                  (Datum) 0);
+                               }
+
+                               /*
+                                * If we still didn't find a usable boundary key, quit; else
+                                * save the boundary key pointer in startKeys.
                                  */
                                 if (chosen == NULL)
                                         break;
                                 startKeys[keysCount++] = chosen;
+
                                 /*
-                                * Adjust strat_total, and quit if we have stored a > or < key.
+                                * Adjust strat_total, and quit if we have stored a > or <
+                                * key.
                                  */
                                 strat = chosen->sk_strategy;
                                 if (strat != BTEqualStrategyNumber)
@@ -590,26 +682,42 @@ _bt_first(IndexScanDesc scan, ScanDirection dir)
                                                 strat == BTLessStrategyNumber)
                                                 break;
                                 }
+
                                 /*
-                                * Done if that was the last attribute.
+                                * Done if that was the last attribute, or if next key is not
+                                * in sequence (implying no boundary key is available for the
+                                * next attribute).
                                  */
-                               if (i >= so->numberOfKeys)
+                               if (i >= so->numberOfKeys ||
+                                       cur->sk_attno != curattr + 1)
                                         break;
+
                                 /*
-                                * Reset for next attr, which should be in sequence.
+                                * Reset for next attr.
                                  */
-                               Assert(cur->sk_attno == curattr + 1);
                                 curattr = cur->sk_attno;
                                 chosen = NULL;
+                               impliesNN = NULL;
                         }
  
-                       /* Can we use this key as a starting boundary for this attr? */
+                       /*
+                        * Can we use this key as a starting boundary for this attr?
+                        *
+                        * If not, does it imply a NOT NULL constraint?  (Because
+                        * SK_SEARCHNULL keys are always assigned BTEqualStrategyNumber,
+                        * *any* inequality key works for that; we need not test.)
+                        */
                         switch (cur->sk_strategy)
                         {
                                 case BTLessStrategyNumber:
                                 case BTLessEqualStrategyNumber:
-                                       if (chosen == NULL && ScanDirectionIsBackward(dir))
-                                               chosen = cur;
+                                       if (chosen == NULL)
+                                       {
+                                               if (ScanDirectionIsBackward(dir))
+                                                       chosen = cur;
+                                               else
+                                                       impliesNN = cur;
+                                       }
                                         break;
                                 case BTEqualStrategyNumber:
                                         /* override any non-equality choice */
@@ -617,115 +725,249 @@ _bt_first(IndexScanDesc scan, ScanDirection dir)
                                         break;
                                 case BTGreaterEqualStrategyNumber:
                                 case BTGreaterStrategyNumber:
-                                       if (chosen == NULL && ScanDirectionIsForward(dir))
-                                               chosen = cur;
+                                       if (chosen == NULL)
+                                       {
+                                               if (ScanDirectionIsForward(dir))
+                                                       chosen = cur;
+                                               else
+                                                       impliesNN = cur;
+                                       }
                                         break;
                         }
                 }
         }
  
         /*
-        * If we found no usable boundary keys, we have to start from one end
-        * of the tree.  Walk down that edge to the first or last key, and
-        * scan from there.
+        * If we found no usable boundary keys, we have to start from one end of
+        * the tree.  Walk down that edge to the first or last key, and scan from
+        * there.
          */
         if (keysCount == 0)
-       {
-               if (startKeys)
-                       pfree(startKeys);
                 return _bt_endpoint(scan, dir);
-       }
  
         /*
-        * We want to start the scan somewhere within the index.  Set up a
-        * 3-way-comparison scankey we can use to search for the boundary
-        * point we identified above.
+        * We want to start the scan somewhere within the index.  Set up an
+        * insertion scankey we can use to search for the boundary point we
+        * identified above.  The insertion scankey is built in the local
+        * scankeys[] array, using the keys identified by startKeys[].
          */
-       scankeys = (ScanKey) palloc(keysCount * sizeof(ScanKeyData));
+       Assert(keysCount <= INDEX_MAX_KEYS);
         for (i = 0; i < keysCount; i++)
         {
                 ScanKey         cur = startKeys[i];
  
-               /*
-                * _bt_preprocess_keys disallows it, but it's place to add some code
-                * later
-                */
-               if (cur->sk_flags & SK_ISNULL)
-               {
-                       pfree(startKeys);
-                       pfree(scankeys);
-                       elog(ERROR, "btree doesn't support is(not)null, yet");
-                       return false;
-               }
-               /*
-                * If scankey operator is of default subtype, we can use the
-                * cached comparison procedure; otherwise gotta look it up in
-                * the catalogs.
-                */
-               if (cur->sk_subtype == InvalidOid)
+               Assert(cur->sk_attno == i + 1);
+
+               if (cur->sk_flags & SK_ROW_HEADER)
                 {
-                       FmgrInfo   *procinfo;
+                       /*
+                        * Row comparison header: look to the first row member instead.
+                        *
+                        * The member scankeys are already in insertion format (ie, they
+                        * have sk_func = 3-way-comparison function), but we have to watch
+                        * out for nulls, which _bt_preprocess_keys didn't check. A null
+                        * in the first row member makes the condition unmatchable, just
+                        * like qual_ok = false.
+                        */
+                       ScanKey         subkey = (ScanKey) DatumGetPointer(cur->sk_argument);
  
-                       procinfo = index_getprocinfo(rel, i + 1, BTORDER_PROC);
-                       ScanKeyEntryInitializeWithInfo(scankeys + i,
-                                                                                  cur->sk_flags,
-                                                                                  i + 1,
-                                                                                  InvalidStrategy,
-                                                                                  InvalidOid,
-                                                                                  procinfo,
-                                                                                  cur->sk_argument);
+                       Assert(subkey->sk_flags & SK_ROW_MEMBER);
+                       if (subkey->sk_flags & SK_ISNULL)
+                               return false;
+                       memcpy(scankeys + i, subkey, sizeof(ScanKeyData));
+
+                       /*
+                        * If the row comparison is the last positioning key we accepted,
+                        * try to add additional keys from the lower-order row members.
+                        * (If we accepted independent conditions on additional index
+                        * columns, we use those instead --- doesn't seem worth trying to
+                        * determine which is more restrictive.)  Note that this is OK
+                        * even if the row comparison is of ">" or "<" type, because the
+                        * condition applied to all but the last row member is effectively
+                        * ">=" or "<=", and so the extra keys don't break the positioning
+                        * scheme.  But, by the same token, if we aren't able to use all
+                        * the row members, then the part of the row comparison that we
+                        * did use has to be treated as just a ">=" or "<=" condition, and
+                        * so we'd better adjust strat_total accordingly.
+                        */
+                       if (i == keysCount - 1)
+                       {
+                               bool            used_all_subkeys = false;
+
+                               Assert(!(subkey->sk_flags & SK_ROW_END));
+                               for (;;)
+                               {
+                                       subkey++;
+                                       Assert(subkey->sk_flags & SK_ROW_MEMBER);
+                                       if (subkey->sk_attno != keysCount + 1)
+                                               break;  /* out-of-sequence, can't use it */
+                                       if (subkey->sk_strategy != cur->sk_strategy)
+                                               break;  /* wrong direction, can't use it */
+                                       if (subkey->sk_flags & SK_ISNULL)
+                                               break;  /* can't use null keys */
+                                       Assert(keysCount < INDEX_MAX_KEYS);
+                                       memcpy(scankeys + keysCount, subkey, sizeof(ScanKeyData));
+                                       keysCount++;
+                                       if (subkey->sk_flags & SK_ROW_END)
+                                       {
+                                               used_all_subkeys = true;
+                                               break;
+                                       }
+                               }
+                               if (!used_all_subkeys)
+                               {
+                                       switch (strat_total)
+                                       {
+                                               case BTLessStrategyNumber:
+                                                       strat_total = BTLessEqualStrategyNumber;
+                                                       break;
+                                               case BTGreaterStrategyNumber:
+                                                       strat_total = BTGreaterEqualStrategyNumber;
+                                                       break;
+                                       }
+                               }
+                               break;                  /* done with outer loop */
+                       }
                 }
                 else
                 {
-                       RegProcedure cmp_proc;
-
-                       cmp_proc = get_opclass_proc(rel->rd_index->indclass[i],
-                                                                               cur->sk_subtype,
-                                                                               BTORDER_PROC);
-                       ScanKeyEntryInitialize(scankeys + i,
-                                                                  cur->sk_flags,
-                                                                  i + 1,
-                                                                  InvalidStrategy,
-                                                                  cur->sk_subtype,
-                                                                  cmp_proc,
-                                                                  cur->sk_argument);
+                       /*
+                        * Ordinary comparison key.  Transform the search-style scan key
+                        * to an insertion scan key by replacing the sk_func with the
+                        * appropriate btree comparison function.
+                        *
+                        * If scankey operator is not a cross-type comparison, we can use
+                        * the cached comparison function; otherwise gotta look it up in
+                        * the catalogs.  (That can't lead to infinite recursion, since no
+                        * indexscan initiated by syscache lookup will use cross-data-type
+                        * operators.)
+                        *
+                        * We support the convention that sk_subtype == InvalidOid means
+                        * the opclass input type; this is a hack to simplify life for
+                        * ScanKeyInit().
+                        */
+                       if (cur->sk_subtype == rel->rd_opcintype[i] ||
+                               cur->sk_subtype == InvalidOid)
+                       {
+                               FmgrInfo   *procinfo;
+
+                               procinfo = index_getprocinfo(rel, cur->sk_attno, BTORDER_PROC);
+                               ScanKeyEntryInitializeWithInfo(scankeys + i,
+                                                                                          cur->sk_flags,
+                                                                                          cur->sk_attno,
+                                                                                          InvalidStrategy,
+                                                                                          cur->sk_subtype,
+                                                                                          cur->sk_collation,
+                                                                                          procinfo,
+                                                                                          cur->sk_argument);
+                       }
+                       else
+                       {
+                               RegProcedure cmp_proc;
+
+                               cmp_proc = get_opfamily_proc(rel->rd_opfamily[i],
+                                                                                        rel->rd_opcintype[i],
+                                                                                        cur->sk_subtype,
+                                                                                        BTORDER_PROC);
+                               if (!RegProcedureIsValid(cmp_proc))
+                                       elog(ERROR, "missing support function %d(%u,%u) for attribute %d of index \"%s\"",
+                                                BTORDER_PROC, rel->rd_opcintype[i], cur->sk_subtype,
+                                                cur->sk_attno, RelationGetRelationName(rel));
+                               ScanKeyEntryInitialize(scankeys + i,
+                                                                          cur->sk_flags,
+                                                                          cur->sk_attno,
+                                                                          InvalidStrategy,
+                                                                          cur->sk_subtype,
+                                                                          cur->sk_collation,
+                                                                          cmp_proc,
+                                                                          cur->sk_argument);
+                       }
                 }
         }
  
-       pfree(startKeys);
-
-       /*
-        * We want to locate either the first item >= boundary point, or
-        * first item > boundary point, depending on the initial-positioning
-        * strategy we just chose.
+       /*----------
+        * Examine the selected initial-positioning strategy to determine exactly
+        * where we need to start the scan, and set flag variables to control the
+        * code below.
+        *
+        * If nextkey = false, _bt_search and _bt_binsrch will locate the first
+        * item >= scan key.  If nextkey = true, they will locate the first
+        * item > scan key.
+        *
+        * If goback = true, we will then step back one item, while if
+        * goback = false, we will start the scan on the located item.
+        *----------
          */
         switch (strat_total)
         {
                 case BTLessStrategyNumber:
+
+                       /*
+                        * Find first item >= scankey, then back up one to arrive at last
+                        * item < scankey.  (Note: this positioning strategy is only used
+                        * for a backward scan, so that is always the correct starting
+                        * position.)
+                        */
                         nextkey = false;
+                       goback = true;
                         break;
  
                 case BTLessEqualStrategyNumber:
+
+                       /*
+                        * Find first item > scankey, then back up one to arrive at last
+                        * item <= scankey.  (Note: this positioning strategy is only used
+                        * for a backward scan, so that is always the correct starting
+                        * position.)
+                        */
                         nextkey = true;
+                       goback = true;
                         break;
  
                 case BTEqualStrategyNumber:
+
                         /*
-                        * If a backward scan was specified, need to start with last
-                        * equal item not first one.
+                        * If a backward scan was specified, need to start with last equal
+                        * item not first one.
                          */
                         if (ScanDirectionIsBackward(dir))
+                       {
+                               /*
+                                * This is the same as the <= strategy.  We will check at the
+                                * end whether the found item is actually =.
+                                */
                                 nextkey = true;
+                               goback = true;
+                       }
                         else
+                       {
+                               /*
+                                * This is the same as the >= strategy.  We will check at the
+                                * end whether the found item is actually =.
+                                */
                                 nextkey = false;
+                               goback = false;
+                       }
                         break;
  
                 case BTGreaterEqualStrategyNumber:
+
+                       /*
+                        * Find first item >= scankey.  (This is only used for forward
+                        * scans.)
+                        */
                         nextkey = false;
+                       goback = false;
                         break;
  
                 case BTGreaterStrategyNumber:
+
+                       /*
+                        * Find first item > scankey.  (This is only used for forward
+                        * scans.)
+                        */
                         nextkey = true;
+                       goback = false;
                         break;
  
                 default:
@@ -735,297 +977,466 @@ _bt_first(IndexScanDesc scan, ScanDirection dir)
         }
  
         /*
-        * Use the manufactured scan key to descend the tree and position
-        * ourselves on the target leaf page.
+        * Use the manufactured insertion scan key to descend the tree and
+        * position ourselves on the target leaf page.
          */
-       stack = _bt_search(rel, keysCount, scankeys, nextkey, &buf, BT_READ);
+       stack = _bt_search(rel, keysCount, scankeys, nextkey, &buf, BT_READ,
+                                          scan->xs_snapshot);
  
         /* don't need to keep the stack around... */
         _bt_freestack(stack);
  
-       current = &(scan->currentItemData);
-
         if (!BufferIsValid(buf))
         {
-               /* Only get here if index is completely empty */
-               ItemPointerSetInvalid(current);
-               so->btso_curbuf = InvalidBuffer;
-               pfree(scankeys);
+               /*
+                * We only get here if the index is completely empty. Lock relation
+                * because nothing finer to lock exists.
+                */
+               PredicateLockRelation(rel, scan->xs_snapshot);
                 return false;
         }
+       else
+               PredicateLockPage(rel, BufferGetBlockNumber(buf),
+                                                 scan->xs_snapshot);
  
-       /* remember which buffer we have pinned */
-       so->btso_curbuf = buf;
-       blkno = BufferGetBlockNumber(buf);
-       page = BufferGetPage(buf);
+       /* initialize moreLeft/moreRight appropriately for scan direction */
+       if (ScanDirectionIsForward(dir))
+       {
+               so->currPos.moreLeft = false;
+               so->currPos.moreRight = true;
+       }
+       else
+       {
+               so->currPos.moreLeft = true;
+               so->currPos.moreRight = false;
+       }
+       so->numKilled = 0;                      /* just paranoia */
+       Assert(so->markItemIndex == -1);
  
         /* position to the precise item on the page */
         offnum = _bt_binsrch(rel, buf, keysCount, scankeys, nextkey);
  
-       ItemPointerSet(current, blkno, offnum);
-
-       /* done with manufactured scankey, now */
-       pfree(scankeys);
-
         /*
-        * It's now time to examine the initial-positioning strategy to find the
-        * exact place to start the scan.
-        *
-        * If nextkey = false, we are positioned at the first item >= scan key,
-        * or possibly at the end of a page on which all the existing items are
-        * less than the scan key and we know that everything on later pages
-        * is greater than or equal to scan key.
-        *
-        * If nextkey = true, we are positioned at the first item > scan key,
-        * or possibly at the end of a page on which all the existing items are
-        * less than or equal to the scan key and we know that everything on
-        * later pages is greater than scan key.
-        *
-        * The actually desired starting point is either this item or an adjacent
-        * one, or in the end-of-page case it's the last item on this page or
-        * the first item on the next.  We apply _bt_step if needed to get to
-        * the right place.
+        * If nextkey = false, we are positioned at the first item >= scan key, or
+        * possibly at the end of a page on which all the existing items are less
+        * than the scan key and we know that everything on later pages is greater
+        * than or equal to scan key.
          *
-        * Note: if _bt_step fails (meaning we fell off the end of the index in
-        * one direction or the other), then there are no matches so we just
-        * return false.
+        * If nextkey = true, we are positioned at the first item > scan key, or
+        * possibly at the end of a page on which all the existing items are less
+        * than or equal to the scan key and we know that everything on later
+        * pages is greater than scan key.
          *
-        * it's yet other place to add some code later for is(not)null ...
+        * The actually desired starting point is either this item or the prior
+        * one, or in the end-of-page case it's the first item on the next page or
+        * the last item on this page.  Adjust the starting offset if needed. (If
+        * this results in an offset before the first item or after the last one,
+        * _bt_readpage will report no items found, and then we'll step to the
+        * next page as needed.)
          */
-       switch (strat_total)
+       if (goback)
+               offnum = OffsetNumberPrev(offnum);
+
+       /* remember which buffer we have pinned, if any */
+       Assert(!BTScanPosIsValid(so->currPos));
+       so->currPos.buf = buf;
+
+       /*
+        * Now load data from the first page of the scan.
+        */
+       if (!_bt_readpage(scan, dir, offnum))
         {
-               case BTLessStrategyNumber:
+               /*
+                * There's no actually-matching data on this page.  Try to advance to
+                * the next page.  Return false if there's no matching data at all.
+                */
+               LockBuffer(so->currPos.buf, BUFFER_LOCK_UNLOCK);
+               if (!_bt_steppage(scan, dir))
+                       return false;
+       }
+       else
+       {
+               /* Drop the lock, and maybe the pin, on the current page */
+               _bt_drop_lock_and_maybe_pin(scan, &so->currPos);
+       }
  
-                       /*
-                        * We are on first item >= scankey.
-                        *
-                        * Back up one to arrive at last item < scankey.  (Note: this
-                        * positioning strategy is only used for a backward scan, so
-                        * that is always the correct starting position.)
-                        */
-                       if (!_bt_step(scan, &buf, BackwardScanDirection))
-                               return false;
-                       break;
+       /* OK, itemIndex says what to return */
+       currItem = &so->currPos.items[so->currPos.itemIndex];
+       scan->xs_ctup.t_self = currItem->heapTid;
+       if (scan->xs_want_itup)
+               scan->xs_itup = (IndexTuple) (so->currTuples + currItem->tupleOffset);
  
-               case BTLessEqualStrategyNumber:
+       return true;
+}
  
-                       /*
-                        * We are on first item > scankey.
-                        *
-                        * Back up one to arrive at last item <= scankey.  (Note: this
-                        * positioning strategy is only used for a backward scan, so
-                        * that is always the correct starting position.)
-                        */
-                       if (!_bt_step(scan, &buf, BackwardScanDirection))
+/*
+ *     _bt_next() -- Get the next item in a scan.
+ *
+ *             On entry, so->currPos describes the current page, which may be pinned
+ *             but is not locked, and so->currPos.itemIndex identifies which item was
+ *             previously returned.
+ *
+ *             On successful exit, scan->xs_ctup.t_self is set to the TID of the
+ *             next heap tuple, and if requested, scan->xs_itup points to a copy of
+ *             the index tuple.  so->currPos is updated as needed.
+ *
+ *             On failure exit (no more tuples), we release pin and set
+ *             so->currPos.buf to InvalidBuffer.
+ */
+bool
+_bt_next(IndexScanDesc scan, ScanDirection dir)
+{
+       BTScanOpaque so = (BTScanOpaque) scan->opaque;
+       BTScanPosItem *currItem;
+
+       /*
+        * Advance to next tuple on current page; or if there's no more, try to
+        * step to the next page with data.
+        */
+       if (ScanDirectionIsForward(dir))
+       {
+               if (++so->currPos.itemIndex > so->currPos.lastItem)
+               {
+                       if (!_bt_steppage(scan, dir))
                                 return false;
-                       break;
+               }
+       }
+       else
+       {
+               if (--so->currPos.itemIndex < so->currPos.firstItem)
+               {
+                       if (!_bt_steppage(scan, dir))
+                               return false;
+               }
+       }
  
-               case BTEqualStrategyNumber:
-                       /*
-                        * If a backward scan was specified, need to start with last
-                        * equal item not first one.
-                        */
-                       if (ScanDirectionIsBackward(dir))
+       /* OK, itemIndex says what to return */
+       currItem = &so->currPos.items[so->currPos.itemIndex];
+       scan->xs_ctup.t_self = currItem->heapTid;
+       if (scan->xs_want_itup)
+               scan->xs_itup = (IndexTuple) (so->currTuples + currItem->tupleOffset);
+
+       return true;
+}
+
+/*
+ *     _bt_readpage() -- Load data from current index page into so->currPos
+ *
+ * Caller must have pinned and read-locked so->currPos.buf; the buffer's state
+ * is not changed here.  Also, currPos.moreLeft and moreRight must be valid;
+ * they are updated as appropriate.  All other fields of so->currPos are
+ * initialized from scratch here.
+ *
+ * We scan the current page starting at offnum and moving in the indicated
+ * direction.  All items matching the scan keys are loaded into currPos.items.
+ * moreLeft or moreRight (as appropriate) is cleared if _bt_checkkeys reports
+ * that there can be no more matching tuples in the current scan direction.
+ *
+ * Returns true if any matching items found on the page, false if none.
+ */
+static bool
+_bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum)
+{
+       BTScanOpaque so = (BTScanOpaque) scan->opaque;
+       Page            page;
+       BTPageOpaque opaque;
+       OffsetNumber minoff;
+       OffsetNumber maxoff;
+       int                     itemIndex;
+       IndexTuple      itup;
+       bool            continuescan;
+
+       /*
+        * We must have the buffer pinned and locked, but the usual macro can't be
+        * used here; this function is what makes it good for currPos.
+        */
+       Assert(BufferIsValid(so->currPos.buf));
+
+       page = BufferGetPage(so->currPos.buf);
+       opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+       minoff = P_FIRSTDATAKEY(opaque);
+       maxoff = PageGetMaxOffsetNumber(page);
+
+       /*
+        * We note the buffer's block number so that we can release the pin later.
+        * This allows us to re-read the buffer if it is needed again for hinting.
+        */
+       so->currPos.currPage = BufferGetBlockNumber(so->currPos.buf);
+
+       /*
+        * We save the LSN of the page as we read it, so that we know whether it
+        * safe to apply LP_DEAD hints to the page later.  This allows us to drop
+        * the pin for MVCC scans, which allows vacuum to avoid blocking.
+        */
+       so->currPos.lsn = PageGetLSN(page);
+
+       /*
+        * we must save the page's right-link while scanning it; this tells us
+        * where to step right to after we're done with these items.  There is no
+        * corresponding need for the left-link, since splits always go right.
+        */
+       so->currPos.nextPage = opaque->btpo_next;
+
+       /* initialize tuple workspace to empty */
+       so->currPos.nextTupleOffset = 0;
+
+       /*
+        * Now that the current page has been made consistent, the macro should be
+        * good.
+        */
+       Assert(BTScanPosIsPinned(so->currPos));
+
+       if (ScanDirectionIsForward(dir))
+       {
+               /* load items[] in ascending order */
+               itemIndex = 0;
+
+               offnum = Max(offnum, minoff);
+
+               while (offnum <= maxoff)
+               {
+                       itup = _bt_checkkeys(scan, page, offnum, dir, &continuescan);
+                       if (itup != NULL)
                         {
-                               /*
-                                * We are on first item > scankey.
-                                *
-                                * Back up one to arrive at last item <= scankey.
-                                * We will check below to see if it is equal to scankey.
-                                */
-                               if (!_bt_step(scan, &buf, BackwardScanDirection))
-                                       return false;
+                               /* tuple passes all scan key conditions, so remember it */
+                               _bt_saveitem(so, itemIndex, offnum, itup);
+                               itemIndex++;
                         }
-                       else
+                       if (!continuescan)
                         {
-                               /*
-                                * We are on first item >= scankey.
-                                *
-                                * Make sure we are on a real item; might have to
-                                * step forward if currently at end of page.
-                                * We will check below to see if it is equal to scankey.
-                                */
-                               if (offnum > PageGetMaxOffsetNumber(page))
-                               {
-                                       if (!_bt_step(scan, &buf, ForwardScanDirection))
-                                               return false;
-                               }
+                               /* there can't be any more matches, so stop */
+                               so->currPos.moreRight = false;
+                               break;
                         }
-                       break;
  
-               case BTGreaterEqualStrategyNumber:
+                       offnum = OffsetNumberNext(offnum);
+               }
  
-                       /*
-                        * We want the first item >= scankey, which is where we are...
-                        * unless we're not anywhere at all...
-                        */
-                       if (offnum > PageGetMaxOffsetNumber(page))
-                       {
-                               if (!_bt_step(scan, &buf, ForwardScanDirection))
-                                       return false;
-                       }
-                       break;
+               Assert(itemIndex <= MaxIndexTuplesPerPage);
+               so->currPos.firstItem = 0;
+               so->currPos.lastItem = itemIndex - 1;
+               so->currPos.itemIndex = 0;
+       }
+       else
+       {
+               /* load items[] in descending order */
+               itemIndex = MaxIndexTuplesPerPage;
  
-               case BTGreaterStrategyNumber:
+               offnum = Min(offnum, maxoff);
  
-                       /*
-                        * We want the first item > scankey, which is where we are...
-                        * unless we're not anywhere at all...
-                        */
-                       if (offnum > PageGetMaxOffsetNumber(page))
+               while (offnum >= minoff)
+               {
+                       itup = _bt_checkkeys(scan, page, offnum, dir, &continuescan);
+                       if (itup != NULL)
                         {
-                               if (!_bt_step(scan, &buf, ForwardScanDirection))
-                                       return false;
+                               /* tuple passes all scan key conditions, so remember it */
+                               itemIndex--;
+                               _bt_saveitem(so, itemIndex, offnum, itup);
+                       }
+                       if (!continuescan)
+                       {
+                               /* there can't be any more matches, so stop */
+                               so->currPos.moreLeft = false;
+                               break;
                         }
-                       break;
-       }
  
-       /* okay, current item pointer for the scan is right */
-       offnum = ItemPointerGetOffsetNumber(current);
-       page = BufferGetPage(buf);
-       btitem = (BTItem) PageGetItem(page, PageGetItemId(page, offnum));
-       itup = &btitem->bti_itup;
+                       offnum = OffsetNumberPrev(offnum);
+               }
  
-       /* is the first item actually acceptable? */
-       if (_bt_checkkeys(scan, itup, dir, &continuescan))
-       {
-               /* yes, return it */
-               scan->xs_ctup.t_self = itup->t_tid;
-               res = true;
+               Assert(itemIndex >= 0);
+               so->currPos.firstItem = itemIndex;
+               so->currPos.lastItem = MaxIndexTuplesPerPage - 1;
+               so->currPos.itemIndex = MaxIndexTuplesPerPage - 1;
         }
-       else if (continuescan)
-       {
-               /* no, but there might be another one that is */
-               res = _bt_next(scan, dir);
-       }
-       else
+
+       return (so->currPos.firstItem <= so->currPos.lastItem);
+}
+
+/* Save an index item into so->currPos.items[itemIndex] */
+static void
+_bt_saveitem(BTScanOpaque so, int itemIndex,
+                        OffsetNumber offnum, IndexTuple itup)
+{
+       BTScanPosItem *currItem = &so->currPos.items[itemIndex];
+
+       currItem->heapTid = itup->t_tid;
+       currItem->indexOffset = offnum;
+       if (so->currTuples)
         {
-               /* no tuples in the index match this scan key */
-               ItemPointerSetInvalid(current);
-               so->btso_curbuf = InvalidBuffer;
-               _bt_relbuf(rel, buf);
-               res = false;
-       }
+               Size            itupsz = IndexTupleSize(itup);
  
-       return res;
+               currItem->tupleOffset = so->currPos.nextTupleOffset;
+               memcpy(so->currTuples + so->currPos.nextTupleOffset, itup, itupsz);
+               so->currPos.nextTupleOffset += MAXALIGN(itupsz);
+       }
  }
  
  /*
- *     _bt_step() -- Step one item in the requested direction in a scan on
- *                               the tree.
+ *     _bt_steppage() -- Step to next page containing valid data for scan
   *
- *             *bufP is the current buffer (read-locked and pinned).  If we change
- *             pages, it's updated appropriately.
+ * On entry, if so->currPos.buf is valid the buffer is pinned but not locked;
+ * if pinned, we'll drop the pin before moving to next page.  The buffer is
+ * not locked on entry.
   *
- *             If successful, update scan's currentItemData and return true.
- *             If no adjacent record exists in the requested direction,
- *             release buffer pin/locks and return false.
+ * On success exit, so->currPos is updated to contain data from the next
+ * interesting page.  For success on a scan using a non-MVCC snapshot we hold
+ * a pin, but not a read lock, on that page.  If we do not hold the pin, we
+ * set so->currPos.buf to InvalidBuffer.  We return TRUE to indicate success.
+ *
+ * If there are no more matching records in the given direction, we drop all
+ * locks and pins, set so->currPos.buf to InvalidBuffer, and return FALSE.
   */
-bool
-_bt_step(IndexScanDesc scan, Buffer *bufP, ScanDirection dir)
+static bool
+_bt_steppage(IndexScanDesc scan, ScanDirection dir)
  {
-       Relation        rel = scan->indexRelation;
-       ItemPointer current = &(scan->currentItemData);
         BTScanOpaque so = (BTScanOpaque) scan->opaque;
+       Relation        rel;
         Page            page;
         BTPageOpaque opaque;
-       OffsetNumber offnum,
-                               maxoff;
-       BlockNumber blkno;
+
+       Assert(BTScanPosIsValid(so->currPos));
+
+       /* Before leaving current page, deal with any killed items */
+       if (so->numKilled > 0)
+               _bt_killitems(scan);
  
         /*
-        * Don't use ItemPointerGetOffsetNumber or you risk to get assertion
-        * due to ability of ip_posid to be equal 0.
+        * Before we modify currPos, make a copy of the page data if there was a
+        * mark position that needs it.
          */
-       offnum = current->ip_posid;
+       if (so->markItemIndex >= 0)
+       {
+               /* bump pin on current buffer for assignment to mark buffer */
+               if (BTScanPosIsPinned(so->currPos))
+                       IncrBufferRefCount(so->currPos.buf);
+               memcpy(&so->markPos, &so->currPos,
+                          offsetof(BTScanPosData, items[1]) +
+                          so->currPos.lastItem * sizeof(BTScanPosItem));
+               if (so->markTuples)
+                       memcpy(so->markTuples, so->currTuples,
+                                  so->currPos.nextTupleOffset);
+               so->markPos.itemIndex = so->markItemIndex;
+               so->markItemIndex = -1;
+       }
  
-       page = BufferGetPage(*bufP);
-       opaque = (BTPageOpaque) PageGetSpecialPointer(page);
-       maxoff = PageGetMaxOffsetNumber(page);
+       rel = scan->indexRelation;
  
         if (ScanDirectionIsForward(dir))
         {
-               if (!PageIsEmpty(page) && offnum < maxoff)
-                       offnum = OffsetNumberNext(offnum);
-               else
+               /* Walk right to the next page with data */
+               /* We must rely on the previously saved nextPage link! */
+               BlockNumber blkno = so->currPos.nextPage;
+
+               /* Remember we left a page with data */
+               so->currPos.moreLeft = true;
+
+               /* release the previous buffer, if pinned */
+               BTScanPosUnpinIfPinned(so->currPos);
+
+               for (;;)
                 {
-                       /* Walk right to the next page with data */
-                       for (;;)
+                       /* if we're at end of scan, give up */
+                       if (blkno == P_NONE || !so->currPos.moreRight)
                         {
-                               /* if we're at end of scan, release the buffer and return */
-                               if (P_RIGHTMOST(opaque))
-                               {
-                                       _bt_relbuf(rel, *bufP);
-                                       ItemPointerSetInvalid(current);
-                                       *bufP = so->btso_curbuf = InvalidBuffer;
-                                       return false;
-                               }
-                               /* step right one page */
-                               blkno = opaque->btpo_next;
-                               _bt_relbuf(rel, *bufP);
-                               *bufP = _bt_getbuf(rel, blkno, BT_READ);
-                               page = BufferGetPage(*bufP);
-                               opaque = (BTPageOpaque) PageGetSpecialPointer(page);
-                               if (!P_IGNORE(opaque))
-                               {
-                                       maxoff = PageGetMaxOffsetNumber(page);
-                                       /* done if it's not empty */
-                                       offnum = P_FIRSTDATAKEY(opaque);
-                                       if (!PageIsEmpty(page) && offnum <= maxoff)
-                                               break;
-                               }
+                               BTScanPosInvalidate(so->currPos);
+                               return false;
+                       }
+                       /* check for interrupts while we're not holding any buffer lock */
+                       CHECK_FOR_INTERRUPTS();
+                       /* step right one page */
+                       so->currPos.buf = _bt_getbuf(rel, blkno, BT_READ);
+                       /* check for deleted page */
+                       page = BufferGetPage(so->currPos.buf);
+                       TestForOldSnapshot(scan->xs_snapshot, rel, page);
+                       opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+                       if (!P_IGNORE(opaque))
+                       {
+                               PredicateLockPage(rel, blkno, scan->xs_snapshot);
+                               /* see if there are any matches on this page */
+                               /* note that this will clear moreRight if we can stop */
+                               if (_bt_readpage(scan, dir, P_FIRSTDATAKEY(opaque)))
+                                       break;
                         }
+
+                       /* nope, keep going */
+                       blkno = opaque->btpo_next;
+                       _bt_relbuf(rel, so->currPos.buf);
                 }
         }
         else
         {
-               /* backwards scan */
-               if (offnum > P_FIRSTDATAKEY(opaque))
-                       offnum = OffsetNumberPrev(offnum);
+               /* Remember we left a page with data */
+               so->currPos.moreRight = true;
+
+               /*
+                * Walk left to the next page with data.  This is much more complex
+                * than the walk-right case because of the possibility that the page
+                * to our left splits while we are in flight to it, plus the
+                * possibility that the page we were on gets deleted after we leave
+                * it.  See nbtree/README for details.
+                *
+                * It might be possible to rearrange this code to have less overhead
+                * in pinning and locking, but that would require capturing the left
+                * pointer when the page is initially read, and using it here, along
+                * with big changes to _bt_walk_left() and the code below.  It is not
+                * clear whether this would be a win, since if the page immediately to
+                * the left splits after we read this page and before we step left, we
+                * would need to visit more pages than with the current code.
+                *
+                * Note that if we change the code so that we drop the pin for a scan
+                * which uses a non-MVCC snapshot, we will need to modify the code for
+                * walking left, to allow for the possibility that a referenced page
+                * has been deleted.  As long as the buffer is pinned or the snapshot
+                * is MVCC the page cannot move past the half-dead state to fully
+                * deleted.
+                */
+               if (BTScanPosIsPinned(so->currPos))
+                       LockBuffer(so->currPos.buf, BT_READ);
                 else
+                       so->currPos.buf = _bt_getbuf(rel, so->currPos.currPage, BT_READ);
+
+               for (;;)
                 {
-                       /*
-                        * Walk left to the next page with data.  This is much more
-                        * complex than the walk-right case because of the possibility
-                        * that the page to our left splits while we are in flight to
-                        * it, plus the possibility that the page we were on gets
-                        * deleted after we leave it.  See nbtree/README for details.
-                        */
-                       for (;;)
+                       /* Done if we know there are no matching keys to the left */
+                       if (!so->currPos.moreLeft)
                         {
-                               *bufP = _bt_walk_left(rel, *bufP);
+                               _bt_relbuf(rel, so->currPos.buf);
+                               BTScanPosInvalidate(so->currPos);
+                               return false;
+                       }
  
-                               /* if we're at end of scan, return failure */
-                               if (*bufP == InvalidBuffer)
-                               {
-                                       ItemPointerSetInvalid(current);
-                                       so->btso_curbuf = InvalidBuffer;
-                                       return false;
-                               }
-                               page = BufferGetPage(*bufP);
-                               opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+                       /* Step to next physical page */
+                       so->currPos.buf = _bt_walk_left(rel, so->currPos.buf,
+                                                                                       scan->xs_snapshot);
  
-                               /*
-                                * Okay, we managed to move left to a non-deleted page.
-                                * Done if it's not half-dead and not empty.  Else loop
-                                * back and do it all again.
-                                */
-                               if (!P_IGNORE(opaque))
-                               {
-                                       maxoff = PageGetMaxOffsetNumber(page);
-                                       offnum = maxoff;
-                                       if (!PageIsEmpty(page) &&
-                                               maxoff >= P_FIRSTDATAKEY(opaque))
-                                               break;
-                               }
+                       /* if we're physically at end of index, return failure */
+                       if (so->currPos.buf == InvalidBuffer)
+                       {
+                               BTScanPosInvalidate(so->currPos);
+                               return false;
+                       }
+
+                       /*
+                        * Okay, we managed to move left to a non-deleted page. Done if
+                        * it's not half-dead and contains matching tuples. Else loop back
+                        * and do it all again.
+                        */
+                       page = BufferGetPage(so->currPos.buf);
+                       TestForOldSnapshot(scan->xs_snapshot, rel, page);
+                       opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+                       if (!P_IGNORE(opaque))
+                       {
+                               PredicateLockPage(rel, BufferGetBlockNumber(so->currPos.buf), scan->xs_snapshot);
+                               /* see if there are any matches on this page */
+                               /* note that this will clear moreLeft if we can stop */
+                               if (_bt_readpage(scan, dir, PageGetMaxOffsetNumber(page)))
+                                       break;
                         }
                 }
         }
  
-       /* Update scan state */
-       so->btso_curbuf = *bufP;
-       blkno = BufferGetBlockNumber(*bufP);
-       ItemPointerSet(current, blkno, offnum);
+       /* Drop the lock, and maybe the pin, on the current page */
+       _bt_drop_lock_and_maybe_pin(scan, &so->currPos);
  
         return true;
  }
@@ -1045,7 +1456,7 @@ _bt_step(IndexScanDesc scan, Buffer *bufP, ScanDirection dir)
   * again if it's important.
   */
  static Buffer
-_bt_walk_left(Relation rel, Buffer buf)
+_bt_walk_left(Relation rel, Buffer buf, Snapshot snapshot)
  {
         Page            page;
         BTPageOpaque opaque;
@@ -1071,20 +1482,22 @@ _bt_walk_left(Relation rel, Buffer buf)
                 /* step left */
                 blkno = lblkno = opaque->btpo_prev;
                 _bt_relbuf(rel, buf);
+               /* check for interrupts while we're not holding any buffer lock */
+               CHECK_FOR_INTERRUPTS();
                 buf = _bt_getbuf(rel, blkno, BT_READ);
                 page = BufferGetPage(buf);
+               TestForOldSnapshot(snapshot, rel, page);
                 opaque = (BTPageOpaque) PageGetSpecialPointer(page);
  
                 /*
                  * If this isn't the page we want, walk right till we find what we
-                * want --- but go no more than four hops (an arbitrary limit). If
-                * we don't find the correct page by then, the most likely bet is
-                * that the original page got deleted and isn't in the sibling
-                * chain at all anymore, not that its left sibling got split more
-                * than four times.
+                * want --- but go no more than four hops (an arbitrary limit). If we
+                * don't find the correct page by then, the most likely bet is that
+                * the original page got deleted and isn't in the sibling chain at all
+                * anymore, not that its left sibling got split more than four times.
                  *
                  * Note that it is correct to test P_ISDELETED not P_IGNORE here,
-                * because half-dead pages are still in the sibling chain.      Caller
+                * because half-dead pages are still in the sibling chain.  Caller
                  * must reject half-dead pages if wanted.
                  */
                 tries = 0;
@@ -1098,54 +1511,54 @@ _bt_walk_left(Relation rel, Buffer buf)
                         if (P_RIGHTMOST(opaque) || ++tries > 4)
                                 break;
                         blkno = opaque->btpo_next;
-                       _bt_relbuf(rel, buf);
-                       buf = _bt_getbuf(rel, blkno, BT_READ);
+                       buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);
                         page = BufferGetPage(buf);
+                       TestForOldSnapshot(snapshot, rel, page);
                         opaque = (BTPageOpaque) PageGetSpecialPointer(page);
                 }
  
                 /* Return to the original page to see what's up */
-               _bt_relbuf(rel, buf);
-               buf = _bt_getbuf(rel, obknum, BT_READ);
+               buf = _bt_relandgetbuf(rel, buf, obknum, BT_READ);
                 page = BufferGetPage(buf);
+               TestForOldSnapshot(snapshot, rel, page);
                 opaque = (BTPageOpaque) PageGetSpecialPointer(page);
                 if (P_ISDELETED(opaque))
                 {
                         /*
-                        * It was deleted.      Move right to first nondeleted page (there
-                        * must be one); that is the page that has acquired the
-                        * deleted one's keyspace, so stepping left from it will take
-                        * us where we want to be.
+                        * It was deleted.  Move right to first nondeleted page (there
+                        * must be one); that is the page that has acquired the deleted
+                        * one's keyspace, so stepping left from it will take us where we
+                        * want to be.
                          */
                         for (;;)
                         {
                                 if (P_RIGHTMOST(opaque))
-                                       elog(ERROR, "fell off the end of \"%s\"",
+                                       elog(ERROR, "fell off the end of index \"%s\"",
                                                  RelationGetRelationName(rel));
                                 blkno = opaque->btpo_next;
-                               _bt_relbuf(rel, buf);
-                               buf = _bt_getbuf(rel, blkno, BT_READ);
+                               buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);
                                 page = BufferGetPage(buf);
+                               TestForOldSnapshot(snapshot, rel, page);
                                 opaque = (BTPageOpaque) PageGetSpecialPointer(page);
                                 if (!P_ISDELETED(opaque))
                                         break;
                         }
  
                         /*
-                        * Now return to top of loop, resetting obknum to point to
-                        * this nondeleted page, and try again.
+                        * Now return to top of loop, resetting obknum to point to this
+                        * nondeleted page, and try again.
                          */
                 }
                 else
                 {
                         /*
-                        * It wasn't deleted; the explanation had better be that the
-                        * page to the left got split or deleted. Without this check,
-                        * we'd go into an infinite loop if there's anything wrong.
+                        * It wasn't deleted; the explanation had better be that the page
+                        * to the left got split or deleted. Without this check, we'd go
+                        * into an infinite loop if there's anything wrong.
                          */
                         if (opaque->btpo_prev == lblkno)
-                               elog(ERROR, "could not find left sibling in \"%s\"",
-                                        RelationGetRelationName(rel));
+                               elog(ERROR, "could not find left sibling of block %u in index \"%s\"",
+                                        obknum, RelationGetRelationName(rel));
                         /* Okay to try again with new lblkno value */
                 }
         }
@@ -1157,25 +1570,25 @@ _bt_walk_left(Relation rel, Buffer buf)
   * _bt_get_endpoint() -- Find the first or last page on a given tree level
   *
   * If the index is empty, we will return InvalidBuffer; any other failure
- * condition causes ereport(). We will not return a dead page.
+ * condition causes ereport().  We will not return a dead page.
   *
   * The returned buffer is pinned and read-locked.
   */
  Buffer
-_bt_get_endpoint(Relation rel, uint32 level, bool rightmost)
+_bt_get_endpoint(Relation rel, uint32 level, bool rightmost,
+                                Snapshot snapshot)
  {
         Buffer          buf;
         Page            page;
         BTPageOpaque opaque;
         OffsetNumber offnum;
         BlockNumber blkno;
-       BTItem          btitem;
         IndexTuple      itup;
  
         /*
          * If we are looking for a leaf page, okay to descend from fast root;
-        * otherwise better descend from true root.  (There is no point in
-        * being smarter about intermediate levels.)
+        * otherwise better descend from true root.  (There is no point in being
+        * smarter about intermediate levels.)
          */
         if (level == 0)
                 buf = _bt_getroot(rel, BT_READ);
@@ -1183,12 +1596,10 @@ _bt_get_endpoint(Relation rel, uint32 level, bool rightmost)
                 buf = _bt_gettrueroot(rel);
  
         if (!BufferIsValid(buf))
-       {
-               /* empty index... */
                 return InvalidBuffer;
-       }
  
         page = BufferGetPage(buf);
+       TestForOldSnapshot(snapshot, rel, page);
         opaque = (BTPageOpaque) PageGetSpecialPointer(page);
  
         for (;;)
@@ -1196,19 +1607,19 @@ _bt_get_endpoint(Relation rel, uint32 level, bool rightmost)
                 /*
                  * If we landed on a deleted page, step right to find a live page
                  * (there must be one).  Also, if we want the rightmost page, step
-                * right if needed to get to it (this could happen if the page
-                * split since we obtained a pointer to it).
+                * right if needed to get to it (this could happen if the page split
+                * since we obtained a pointer to it).
                  */
                 while (P_IGNORE(opaque) ||
                            (rightmost && !P_RIGHTMOST(opaque)))
                 {
                         blkno = opaque->btpo_next;
                         if (blkno == P_NONE)
-                               elog(ERROR, "fell off the end of \"%s\"",
+                               elog(ERROR, "fell off the end of index \"%s\"",
                                          RelationGetRelationName(rel));
-                       _bt_relbuf(rel, buf);
-                       buf = _bt_getbuf(rel, blkno, BT_READ);
+                       buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);
                         page = BufferGetPage(buf);
+                       TestForOldSnapshot(snapshot, rel, page);
                         opaque = (BTPageOpaque) PageGetSpecialPointer(page);
                 }
  
@@ -1216,7 +1627,8 @@ _bt_get_endpoint(Relation rel, uint32 level, bool rightmost)
                 if (opaque->btpo.level == level)
                         break;
                 if (opaque->btpo.level < level)
-                       elog(ERROR, "btree level %u not found", level);
+                       elog(ERROR, "btree level %u not found in index \"%s\"",
+                                level, RelationGetRelationName(rel));
  
                 /* Descend to leftmost or rightmost child page */
                 if (rightmost)
@@ -1224,12 +1636,10 @@ _bt_get_endpoint(Relation rel, uint32 level, bool rightmost)
                 else
                         offnum = P_FIRSTDATAKEY(opaque);
  
-               btitem = (BTItem) PageGetItem(page, PageGetItemId(page, offnum));
-               itup = &(btitem->bti_itup);
+               itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
                 blkno = ItemPointerGetBlockNumber(&(itup->t_tid));
  
-               _bt_relbuf(rel, buf);
-               buf = _bt_getbuf(rel, blkno, BT_READ);
+               buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);
                 page = BufferGetPage(buf);
                 opaque = (BTPageOpaque) PageGetSpecialPointer(page);
         }
@@ -1238,56 +1648,48 @@ _bt_get_endpoint(Relation rel, uint32 level, bool rightmost)
  }
  
  /*
- *     _bt_endpoint() -- Find the first or last key in the index, and scan
+ *     _bt_endpoint() -- Find the first or last page in the index, and scan
   * from there to the first key satisfying all the quals.
   *
   * This is used by _bt_first() to set up a scan when we've determined
   * that the scan must start at the beginning or end of the index (for
- * a forward or backward scan respectively).
+ * a forward or backward scan respectively).  Exit conditions are the
+ * same as for _bt_first().
   */
  static bool
  _bt_endpoint(IndexScanDesc scan, ScanDirection dir)
  {
-       Relation        rel;
+       Relation        rel = scan->indexRelation;
+       BTScanOpaque so = (BTScanOpaque) scan->opaque;
         Buffer          buf;
         Page            page;
         BTPageOpaque opaque;
-       ItemPointer current;
-       OffsetNumber maxoff;
         OffsetNumber start;
-       BlockNumber blkno;
-       BTItem          btitem;
-       IndexTuple      itup;
-       BTScanOpaque so;
-       bool            res;
-       bool            continuescan;
-
-       rel = scan->indexRelation;
-       current = &(scan->currentItemData);
-       so = (BTScanOpaque) scan->opaque;
+       BTScanPosItem *currItem;
  
         /*
-        * Scan down to the leftmost or rightmost leaf page.  This is a
-        * simplified version of _bt_search().  We don't maintain a stack
-        * since we know we won't need it.
+        * Scan down to the leftmost or rightmost leaf page.  This is a simplified
+        * version of _bt_search().  We don't maintain a stack since we know we
+        * won't need it.
          */
-       buf = _bt_get_endpoint(rel, 0, ScanDirectionIsBackward(dir));
+       buf = _bt_get_endpoint(rel, 0, ScanDirectionIsBackward(dir), scan->xs_snapshot);
  
         if (!BufferIsValid(buf))
         {
-               /* empty index... */
-               ItemPointerSetInvalid(current);
-               so->btso_curbuf = InvalidBuffer;
+               /*
+                * Empty index. Lock the whole relation, as nothing finer to lock
+                * exists.
+                */
+               PredicateLockRelation(rel, scan->xs_snapshot);
+               BTScanPosInvalidate(so->currPos);
                 return false;
         }
  
-       blkno = BufferGetBlockNumber(buf);
+       PredicateLockPage(rel, BufferGetBlockNumber(buf), scan->xs_snapshot);
         page = BufferGetPage(buf);
         opaque = (BTPageOpaque) PageGetSpecialPointer(page);
         Assert(P_ISLEAF(opaque));
  
-       maxoff = PageGetMaxOffsetNumber(page);
-
         if (ScanDirectionIsForward(dir))
         {
                 /* There could be dead pages to the left, so not this: */
@@ -1300,9 +1702,6 @@ _bt_endpoint(IndexScanDesc scan, ScanDirection dir)
                 Assert(P_RIGHTMOST(opaque));
  
                 start = PageGetMaxOffsetNumber(page);
-               if (start < P_FIRSTDATAKEY(opaque))             /* watch out for empty
-                                                                                                * page */
-                       start = P_FIRSTDATAKEY(opaque);
         }
         else
         {
@@ -1310,47 +1709,47 @@ _bt_endpoint(IndexScanDesc scan, ScanDirection dir)
                 start = 0;                              /* keep compiler quiet */
         }
  
-       ItemPointerSet(current, blkno, start);
         /* remember which buffer we have pinned */
-       so->btso_curbuf = buf;
+       so->currPos.buf = buf;
  
-       /*
-        * Left/rightmost page could be empty due to deletions, if so step
-        * till we find a nonempty page.
-        */
-       if (start > maxoff)
+       /* initialize moreLeft/moreRight appropriately for scan direction */
+       if (ScanDirectionIsForward(dir))
         {
-               if (!_bt_step(scan, &buf, dir))
-                       return false;
-               start = ItemPointerGetOffsetNumber(current);
-               page = BufferGetPage(buf);
+               so->currPos.moreLeft = false;
+               so->currPos.moreRight = true;
         }
-
-       btitem = (BTItem) PageGetItem(page, PageGetItemId(page, start));
-       itup = &(btitem->bti_itup);
+       else
+       {
+               so->currPos.moreLeft = true;
+               so->currPos.moreRight = false;
+       }
+       so->numKilled = 0;                      /* just paranoia */
+       so->markItemIndex = -1;         /* ditto */
  
         /*
-        * Okay, we are on the first or last tuple.  Does it pass all the quals?
+        * Now load data from the first page of the scan.
          */
-       if (_bt_checkkeys(scan, itup, dir, &continuescan))
-       {
-               /* yes, return it */
-               scan->xs_ctup.t_self = itup->t_tid;
-               res = true;
-       }
-       else if (continuescan)
+       if (!_bt_readpage(scan, dir, start))
         {
-               /* no, but there might be another one that does */
-               res = _bt_next(scan, dir);
+               /*
+                * There's no actually-matching data on this page.  Try to advance to
+                * the next page.  Return false if there's no matching data at all.
+                */
+               LockBuffer(so->currPos.buf, BUFFER_LOCK_UNLOCK);
+               if (!_bt_steppage(scan, dir))
+                       return false;
         }
         else
         {
-               /* no tuples in the index match this scan key */
-               ItemPointerSetInvalid(current);
-               so->btso_curbuf = InvalidBuffer;
-               _bt_relbuf(rel, buf);
-               res = false;
+               /* Drop the lock, and maybe the pin, on the current page */
+               _bt_drop_lock_and_maybe_pin(scan, &so->currPos);
         }
  
-       return res;
+       /* OK, itemIndex says what to return */
+       currItem = &so->currPos.items[so->currPos.itemIndex];
+       scan->xs_ctup.t_self = currItem->heapTid;
+       if (scan->xs_want_itup)
+               scan->xs_itup = (IndexTuple) (so->currTuples + currItem->tupleOffset);
+
+       return true;
  }