1 /*-------------------------------------------------------------------------
4 * Search code for postgres btrees.
7 * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
8 * Portions Copyright (c) 1994, Regents of the University of California
11 * src/backend/access/nbtree/nbtsearch.c
13 *-------------------------------------------------------------------------
18 #include "access/nbtree.h"
19 #include "access/relscan.h"
20 #include "miscadmin.h"
22 #include "storage/predicate.h"
23 #include "utils/lsyscache.h"
24 #include "utils/rel.h"
/*
 * Forward declarations for routines private to this file (descent,
 * page-reading, and scan-positioning helpers defined further below).
 */
27 static void _bt_drop_lock_and_maybe_pin(IndexScanDesc scan, BTScanPos sp);
28 static OffsetNumber _bt_binsrch(Relation rel, BTScanInsert key, Buffer buf);
29 static bool _bt_readpage(IndexScanDesc scan, ScanDirection dir,
31 static void _bt_saveitem(BTScanOpaque so, int itemIndex,
32 OffsetNumber offnum, IndexTuple itup);
33 static bool _bt_steppage(IndexScanDesc scan, ScanDirection dir);
34 static bool _bt_readnextpage(IndexScanDesc scan, BlockNumber blkno, ScanDirection dir);
35 static bool _bt_parallel_readpage(IndexScanDesc scan, BlockNumber blkno,
37 static Buffer _bt_walk_left(Relation rel, Buffer buf, Snapshot snapshot);
38 static bool _bt_endpoint(IndexScanDesc scan, ScanDirection dir);
39 static inline void _bt_initialize_more_data(BTScanOpaque so, ScanDirection dir);
43 * _bt_drop_lock_and_maybe_pin()
45 * Unlock the buffer; and if it is safe to release the pin, do that, too. It
46 * is safe if the scan is using an MVCC snapshot and the index is WAL-logged.
47 * This will prevent vacuum from stalling in a blocked state trying to read a
48 * page when a cursor is sitting on it -- at least in many important cases.
50 * Set the buffer to invalid if the pin is released, since the buffer may be
51 * re-used. If we need to go back to this block (for example, to apply
52 * LP_DEAD hints) we must get a fresh reference to the buffer. Hopefully it
53 * will remain in shared memory for as long as it takes to scan the index
57 _bt_drop_lock_and_maybe_pin(IndexScanDesc scan, BTScanPos sp)
/* Drop the lock unconditionally; the pin is conditionally released below. */
59 LockBuffer(sp->buf, BUFFER_LOCK_UNLOCK);
/*
 * Pin is released only for MVCC snapshots on WAL-logged indexes, per the
 * header comment.  NOTE(review): the conjunction continues past what is
 * visible here; there may be further conditions -- confirm against the
 * complete source.
 */
61 if (IsMVCCSnapshot(scan->xs_snapshot) &&
62 RelationNeedsWAL(scan->indexRelation) &&
65 ReleaseBuffer(sp->buf);
/* Invalidate the saved buffer so callers know to re-fetch this block. */
66 sp->buf = InvalidBuffer;
71 * _bt_search() -- Search the tree for a particular scankey,
72 * or more precisely for the first leaf page it could be on.
74 * The passed scankey is an insertion-type scankey (see nbtree/README),
75 * but it can omit the rightmost column(s) of the index.
77 * Return value is a stack of parent-page pointers. *bufP is set to the
78 * address of the leaf-page buffer, which is read-locked and pinned.
79 * No locks are held on the parent pages, however!
81 * If the snapshot parameter is not NULL, "old snapshot" checking will take
82 * place during the descent through the tree. This is not needed when
83 * positioning for an insert or delete, so NULL is used for those cases.
85 * The returned buffer is locked according to access parameter. Additionally,
86 * access = BT_WRITE will allow an empty root page to be created and returned.
87 * When access = BT_READ, an empty index will result in *bufP being set to
88 * InvalidBuffer. Also, in BT_WRITE mode, any incomplete splits encountered
89 * during the search will be finished.
92 _bt_search(Relation rel, BTScanInsert key, Buffer *bufP, int access,
95 BTStack stack_in = NULL;
/*
 * Descend holding only read locks; page_access is upgraded to BT_WRITE
 * for the final (leaf) step when access == BT_WRITE -- see the level-1
 * check below.
 */
96 int page_access = BT_READ;
98 /* Get the root page to start with */
99 *bufP = _bt_getroot(rel, access);
101 /* If index is empty and access = BT_READ, no root page is created. */
102 if (!BufferIsValid(*bufP))
103 return (BTStack) NULL;
105 /* Loop iterates once per level descended in the tree */
114 BlockNumber par_blkno;
118 * Race -- the page we just grabbed may have split since we read its
119 * pointer in the parent (or metapage). If it has, we may need to
120 * move right to its new sibling. Do that.
122 * In write-mode, allow _bt_moveright to finish any incomplete splits
123 * along the way. Strictly speaking, we'd only need to finish an
124 * incomplete split on the leaf page we're about to insert to, not on
125 * any of the upper levels (they are taken care of in _bt_getstackbuf,
126 * if the leaf page is split and we insert to the parent page). But
127 * this is a good opportunity to finish splits of internal pages too.
129 *bufP = _bt_moveright(rel, key, *bufP, (access == BT_WRITE), stack_in,
130 page_access, snapshot);
132 /* if this is a leaf page, we're done */
133 page = BufferGetPage(*bufP);
134 opaque = (BTPageOpaque) PageGetSpecialPointer(page);
135 if (P_ISLEAF(opaque))
139 * Find the appropriate item on the internal page, and get the child
140 * page that it points to.
142 offnum = _bt_binsrch(rel, key, *bufP);
143 itemid = PageGetItemId(page, offnum);
144 itup = (IndexTuple) PageGetItem(page, itemid);
145 blkno = BTreeInnerTupleGetDownLink(itup);
146 par_blkno = BufferGetBlockNumber(*bufP);
149 * We need to save the location of the index entry we chose in the
150 * parent page on a stack. In case we split the tree, we'll use the
151 * stack to work back up to the parent page. We also save the actual
152 * downlink (block) to uniquely identify the index entry, in case it
153 * moves right while we're working lower in the tree. See the paper
154 * by Lehman and Yao for how this is detected and handled. (We use the
155 * child link during the second half of a page split -- if caller ends
156 * up splitting the child it usually ends up inserting a new pivot
157 * tuple for child's new right sibling immediately after the original
158 * bts_offset offset recorded here. The downlink block will be needed
159 * to check if bts_offset remains the position of this same pivot
162 new_stack = (BTStack) palloc(sizeof(BTStackData));
163 new_stack->bts_blkno = par_blkno;
164 new_stack->bts_offset = offnum;
165 new_stack->bts_btentry = blkno;
166 new_stack->bts_parent = stack_in;
169 * Page level 1 is lowest non-leaf page level prior to leaves. So, if
170 * we're on the level 1 and asked to lock leaf page in write mode,
171 * then lock next page in write mode, because it must be a leaf.
173 if (opaque->btpo.level == 1 && access == BT_WRITE)
174 page_access = BT_WRITE;
176 /* drop the read lock on the parent page, acquire one on the child */
177 *bufP = _bt_relandgetbuf(rel, *bufP, blkno, page_access);
179 /* okay, all set to move down a level */
180 stack_in = new_stack;
184 * If we're asked to lock leaf in write mode, but didn't manage to, then
185 * relock. This should only happen when the root page is a leaf page (and
186 * the only page in the index other than the metapage).
188 if (access == BT_WRITE && page_access == BT_READ)
190 /* trade in our read lock for a write lock */
191 LockBuffer(*bufP, BUFFER_LOCK_UNLOCK);
192 LockBuffer(*bufP, BT_WRITE);
195 * If the page was split between the time that we surrendered our read
196 * lock and acquired our write lock, then this page may no longer be
197 * the right place for the key we want to insert. In this case, we
198 * need to move right in the tree. See Lehman and Yao for an
199 * excruciatingly precise description.
201 *bufP = _bt_moveright(rel, key, *bufP, true, stack_in, BT_WRITE,
209 * _bt_moveright() -- move right in the btree if necessary.
211 * When we follow a pointer to reach a page, it is possible that
212 * the page has changed in the meanwhile. If this happens, we're
213 * guaranteed that the page has "split right" -- that is, that any
214 * data that appeared on the page originally is either on the page
215 * or strictly to the right of it.
217 * This routine decides whether or not we need to move right in the
218 * tree by examining the high key entry on the page. If that entry is
219 * strictly less than the scankey, or <= the scankey in the
220 * key.nextkey=true case, then we followed the wrong link and we need
223 * The passed insertion-type scankey can omit the rightmost column(s) of the
224 * index. (see nbtree/README)
226 * When key.nextkey is false (the usual case), we are looking for the first
227 * item >= key. When key.nextkey is true, we are looking for the first item
228 * strictly greater than key.
230 * If forupdate is true, we will attempt to finish any incomplete splits
231 * that we encounter. This is required when locking a target page for an
232 * insertion, because we don't allow inserting on a page before the split
233 * is completed. 'stack' is only used if forupdate is true.
235 * On entry, we have the buffer pinned and a lock of the type specified by
236 * 'access'. If we move right, we release the buffer and lock and acquire
237 * the same on the right sibling. Return value is the buffer we stop at.
239 * If the snapshot parameter is not NULL, "old snapshot" checking will take
240 * place during the descent through the tree. This is not needed when
241 * positioning for an insert or delete, so NULL is used for those cases.
244 _bt_moveright(Relation rel,
257 * When nextkey = false (normal case): if the scan key that brought us to
258 * this page is > the high key stored on the page, then the page has split
259 * and we need to move right. (pg_upgrade'd !heapkeyspace indexes could
260 * have some duplicates to the right as well as the left, but that's
261 * something that's only ever dealt with on the leaf level, after
262 * _bt_search has found an initial leaf page.)
264 * When nextkey = true: move right if the scan key is >= page's high key.
265 * (Note that key.scantid cannot be set in this case.)
267 * The page could even have split more than once, so scan as far as
270 * We also have to move right if we followed a link that brought us to a
/*
 * Threshold for the _bt_compare() test against the high key below:
 * with nextkey=false we step right when scankey > high key (result >= 1);
 * with nextkey=true we step right when scankey >= high key (result >= 0).
 */
273 cmpval = key->nextkey ? 0 : 1;
277 page = BufferGetPage(buf);
278 TestForOldSnapshot(snapshot, rel, page);
279 opaque = (BTPageOpaque) PageGetSpecialPointer(page);
281 if (P_RIGHTMOST(opaque))
285 * Finish any incomplete splits we encounter along the way.
287 if (forupdate && P_INCOMPLETE_SPLIT(opaque))
289 BlockNumber blkno = BufferGetBlockNumber(buf);
291 /* upgrade our lock if necessary */
292 if (access == BT_READ)
294 LockBuffer(buf, BUFFER_LOCK_UNLOCK);
295 LockBuffer(buf, BT_WRITE);
/* Re-test after the lock upgrade: another backend may have finished it. */
298 if (P_INCOMPLETE_SPLIT(opaque))
299 _bt_finish_split(rel, buf, stack);
301 _bt_relbuf(rel, buf);
303 /* re-acquire the lock in the right mode, and re-check */
304 buf = _bt_getbuf(rel, blkno, access);
/* Step right while the page is ignorable or its high key doesn't bound us. */
308 if (P_IGNORE(opaque) || _bt_compare(rel, key, page, P_HIKEY) >= cmpval)
310 /* step right one page */
311 buf = _bt_relandgetbuf(rel, buf, opaque->btpo_next, access);
/* Ending the walk on an ignorable page indicates a corrupt index. */
318 if (P_IGNORE(opaque))
319 elog(ERROR, "fell off the end of index \"%s\"",
320 RelationGetRelationName(rel));
326 * _bt_binsrch() -- Do a binary search for a key on a particular page.
328 * On a leaf page, _bt_binsrch() returns the OffsetNumber of the first
329 * key >= given scankey, or > scankey if nextkey is true. (NOTE: in
330 * particular, this means it is possible to return a value 1 greater than the
331 * number of keys on the page, if the scankey is > all keys on the page.)
333 * On an internal (non-leaf) page, _bt_binsrch() returns the OffsetNumber
334 * of the last key < given scankey, or last key <= given scankey if nextkey
335 * is true. (Since _bt_compare treats the first data key of such a page as
336 * minus infinity, there will be at least one key < scankey, so the result
337 * always points at one of the keys on the page.) This key indicates the
338 * right place to descend to be sure we find all leaf keys >= given scankey
339 * (or leaf keys > given scankey when nextkey is true).
341 * This procedure is not responsible for walking right, it just examines
342 * the given page. _bt_binsrch() has no lock or refcount side effects
346 _bt_binsrch(Relation rel,
357 /* Requesting nextkey semantics while using scantid seems nonsensical */
358 Assert(!key->nextkey || key->scantid == NULL);
360 page = BufferGetPage(buf);
361 opaque = (BTPageOpaque) PageGetSpecialPointer(page);
363 low = P_FIRSTDATAKEY(opaque);
364 high = PageGetMaxOffsetNumber(page);
367 * If there are no keys on the page, return the first available slot. Note
368 * this covers two cases: the page is really empty (no keys), or it
369 * contains only a high key. The latter case is possible after vacuuming.
370 * This can never happen on an internal page, however, since they are
371 * never empty (an internal page must have children).
373 if (unlikely(high < low))
377 * Binary search to find the first key on the page >= scan key, or first
378 * key > scankey when nextkey is true.
380 * For nextkey=false (cmpval=1), the loop invariant is: all slots before
381 * 'low' are < scan key, all slots at or after 'high' are >= scan key.
383 * For nextkey=true (cmpval=0), the loop invariant is: all slots before
384 * 'low' are <= scan key, all slots at or after 'high' are > scan key.
386 * We can fall out when high == low.
388 high++; /* establish the loop invariant for high */
390 cmpval = key->nextkey ? 0 : 1; /* select comparison value */
/* Overflow-safe midpoint: low + (high - low)/2 rather than (low + high)/2 */
394 OffsetNumber mid = low + ((high - low) / 2);
396 /* We have low <= mid < high, so mid points at a real slot */
398 result = _bt_compare(rel, key, page, mid);
400 if (result >= cmpval)
407 * At this point we have high == low, but be careful: they could point
408 * past the last slot on the page.
410 * On a leaf page, we always return the first key >= scan key (resp. >
411 * scan key), which could be the last slot + 1.
413 if (P_ISLEAF(opaque))
417 * On a non-leaf page, return the last key < scan key (resp. <= scan key).
418 * There must be one if _bt_compare() is playing by the rules.
420 Assert(low > P_FIRSTDATAKEY(opaque));
422 return OffsetNumberPrev(low);
427 * _bt_binsrch_insert() -- Cacheable, incremental leaf page binary search.
429 * Like _bt_binsrch(), but with support for caching the binary search
430 * bounds. Only used during insertion, and only on the leaf page that it
431 * looks like caller will insert tuple on. Exclusive-locked and pinned
432 * leaf page is contained within insertstate.
434 * Caches the bounds fields in insertstate so that a subsequent call can
435 * reuse the low and strict high bounds of original binary search. Callers
436 * that use these fields directly must be prepared for the case where low
437 * and/or stricthigh are not on the same page (one or both exceed maxoff
438 * for the page). The case where there are no items on the page (high <
439 * low) makes bounds invalid.
441 * Caller is responsible for invalidating bounds when it modifies the page
442 * before calling here a second time.
445 _bt_binsrch_insert(Relation rel, BTInsertState insertstate)
447 BTScanInsert key = insertstate->itup_key;
456 page = BufferGetPage(insertstate->buf);
457 opaque = (BTPageOpaque) PageGetSpecialPointer(page);
/* Only called on leaf pages, and insertion never uses nextkey semantics */
459 Assert(P_ISLEAF(opaque));
460 Assert(!key->nextkey);
462 if (!insertstate->bounds_valid)
464 /* Start new binary search */
465 low = P_FIRSTDATAKEY(opaque);
466 high = PageGetMaxOffsetNumber(page);
470 /* Restore result of previous binary search against same page */
471 low = insertstate->low;
472 high = insertstate->stricthigh;
475 /* If there are no keys on the page, return the first available slot */
476 if (unlikely(high < low))
478 /* Caller can't reuse bounds */
479 insertstate->low = InvalidOffsetNumber;
480 insertstate->stricthigh = InvalidOffsetNumber;
481 insertstate->bounds_valid = false;
486 * Binary search to find the first key on the page >= scan key. (nextkey
487 * is always false when inserting).
489 * The loop invariant is: all slots before 'low' are < scan key, all slots
490 * at or after 'high' are >= scan key. 'stricthigh' is > scan key, and is
491 * maintained to save additional search effort for caller.
493 * We can fall out when high == low.
/*
 * Only a fresh search needs the +1 adjustment; cached 'stricthigh' (used
 * as 'high' above) already satisfies the loop invariant.
 */
495 if (!insertstate->bounds_valid)
496 high++; /* establish the loop invariant for high */
497 stricthigh = high; /* high initially strictly higher */
499 cmpval = 1; /* !nextkey comparison value */
/* Overflow-safe midpoint: low + (high - low)/2 rather than (low + high)/2 */
503 OffsetNumber mid = low + ((high - low) / 2);
505 /* We have low <= mid < high, so mid points at a real slot */
507 result = _bt_compare(rel, key, page, mid);
509 if (result >= cmpval)
520 * On a leaf page, a binary search always returns the first key >= scan
521 * key (at least in !nextkey case), which could be the last slot + 1. This
522 * is also the lower bound of cached search.
524 * stricthigh may also be the last slot + 1, which prevents caller from
525 * using bounds directly, but is still useful to us if we're called a
526 * second time with cached bounds (cached low will be < stricthigh when
529 insertstate->low = low;
530 insertstate->stricthigh = stricthigh;
531 insertstate->bounds_valid = true;
537 * _bt_compare() -- Compare insertion-type scankey to tuple on a page.
539 * page/offnum: location of btree item to be compared to.
541 * This routine returns:
542 * <0 if scankey < tuple at offnum;
543 * 0 if scankey == tuple at offnum;
544 * >0 if scankey > tuple at offnum.
545 * NULLs in the keys are treated as sortable values. Therefore
546 * "equality" does not necessarily mean that the item should be
547 * returned to the caller as a matching key!
549 * CRUCIAL NOTE: on a non-leaf page, the first data key is assumed to be
550 * "minus infinity": this routine will always claim it is less than the
551 * scankey. The actual key value stored is explicitly truncated to 0
552 * attributes (explicitly minus infinity) with version 3+ indexes, but
553 * that isn't relied upon. This allows us to implement the Lehman and
554 * Yao convention that the first down-link pointer is before the first
555 * key. See backend/access/nbtree/README for details.
559 _bt_compare(Relation rel,
564 TupleDesc itupdesc = RelationGetDescr(rel);
565 BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page);
572 Assert(_bt_check_natts(rel, key->heapkeyspace, page, offnum));
573 Assert(key->keysz <= IndexRelationGetNumberOfKeyAttributes(rel));
574 Assert(key->heapkeyspace || key->scantid == NULL);
577 * Force result ">" if target item is first data item on an internal page
578 * --- see NOTE above.
580 if (!P_ISLEAF(opaque) && offnum == P_FIRSTDATAKEY(opaque))
583 itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
584 ntupatts = BTreeTupleGetNAtts(itup, rel);
587 * The scan key is set up with the attribute number associated with each
588 * term in the key. It is important that, if the index is multi-key, the
589 * scan contain the first k key attributes, and that they be in order. If
590 * you think about how multi-key ordering works, you'll understand why
593 * We don't test for violation of this condition here, however. The
594 * initial setup for the index scan had better have gotten it right (see
/* Compare only attributes present in both scankey and (possibly truncated) tuple */
598 ncmpkey = Min(ntupatts, key->keysz);
599 Assert(key->heapkeyspace || ncmpkey == key->keysz);
600 scankey = key->scankeys;
601 for (int i = 1; i <= ncmpkey; i++)
607 datum = index_getattr(itup, scankey->sk_attno, itupdesc, &isNull);
609 /* see comments about NULLs handling in btbuild */
610 if (scankey->sk_flags & SK_ISNULL) /* key is NULL */
613 result = 0; /* NULL "=" NULL */
614 else if (scankey->sk_flags & SK_BT_NULLS_FIRST)
615 result = -1; /* NULL "<" NOT_NULL */
617 result = 1; /* NULL ">" NOT_NULL */
619 else if (isNull) /* key is NOT_NULL and item is NULL */
621 if (scankey->sk_flags & SK_BT_NULLS_FIRST)
622 result = 1; /* NOT_NULL ">" NULL */
624 result = -1; /* NOT_NULL "<" NULL */
629 * The sk_func needs to be passed the index value as left arg and
630 * the sk_argument as right arg (they might be of different
631 * types). Since it is convenient for callers to think of
632 * _bt_compare as comparing the scankey to the index item, we have
633 * to flip the sign of the comparison result. (Unless it's a DESC
634 * column, in which case we *don't* flip the sign.)
636 result = DatumGetInt32(FunctionCall2Coll(&scankey->sk_func,
637 scankey->sk_collation,
639 scankey->sk_argument));
641 if (!(scankey->sk_flags & SK_BT_DESC))
642 INVERT_COMPARE_RESULT(result);
645 /* if the keys are unequal, return the difference */
653 * All non-truncated attributes (other than heap TID) were found to be
654 * equal. Treat truncated attributes as minus infinity when scankey has a
655 * key attribute value that would otherwise be compared directly.
657 * Note: it doesn't matter if ntupatts includes non-key attributes;
658 * scankey won't, so explicitly excluding non-key attributes isn't
/* Scankey has more attributes than the tuple: tuple was truncated */
661 if (key->keysz > ntupatts)
665 * Use the heap TID attribute and scantid to try to break the tie. The
666 * rules are the same as any other key attribute -- only the
667 * representation differs.
669 heapTid = BTreeTupleGetHeapTID(itup);
670 if (key->scantid == NULL)
673 * Most searches have a scankey that is considered greater than a
674 * truncated pivot tuple if and when the scankey has equal values for
675 * attributes up to and including the least significant untruncated
676 * attribute in tuple.
678 * For example, if an index has the minimum two attributes (single
679 * user key attribute, plus heap TID attribute), and a page's high key
680 * is ('foo', -inf), and scankey is ('foo', <omitted>), the search
681 * will not descend to the page to the left. The search will descend
682 * right instead. The truncated attribute in pivot tuple means that
683 * all non-pivot tuples on the page to the left are strictly < 'foo',
684 * so it isn't necessary to descend left. In other words, search
685 * doesn't have to descend left because it isn't interested in a match
686 * that has a heap TID value of -inf.
688 * However, some searches (pivotsearch searches) actually require that
689 * we descend left when this happens. -inf is treated as a possible
690 * match for omitted scankey attribute(s). This is needed by page
691 * deletion, which must re-find leaf pages that are targets for
692 * deletion using their high keys.
694 * Note: the heap TID part of the test ensures that scankey is being
695 * compared to a pivot tuple with one or more truncated key
698 * Note: pg_upgrade'd !heapkeyspace indexes must always descend to the
699 * left here, since they have no heap TID attribute (and cannot have
700 * any -inf key values in any case, since truncation can only remove
701 * non-key attributes). !heapkeyspace searches must always be
702 * prepared to deal with matches on both sides of the pivot once the
703 * leaf level is reached.
705 if (key->heapkeyspace && !key->pivotsearch &&
706 key->keysz == ntupatts && heapTid == NULL)
709 /* All provided scankey arguments found to be equal */
714 * Treat truncated heap TID as minus infinity, since scankey has a key
715 * attribute value (scantid) that would otherwise be compared directly
717 Assert(key->keysz == IndexRelationGetNumberOfKeyAttributes(rel));
721 Assert(ntupatts >= IndexRelationGetNumberOfKeyAttributes(rel));
/* Final tiebreak: compare scan's heap TID against the tuple's heap TID */
722 return ItemPointerCompare(key->scantid, heapTid);
726 * _bt_first() -- Find the first item in a scan.
728 * We need to be clever about the direction of scan, the search
729 * conditions, and the tree ordering. We find the first item (or,
730 * if backwards scan, the last item) in the tree that satisfies the
731 * qualifications in the scan key. On success exit, the page containing
732 * the current index tuple is pinned but not locked, and data about
733 * the matching tuple(s) on the page has been loaded into so->currPos.
734 * scan->xs_ctup.t_self is set to the heap TID of the current tuple,
735 * and if requested, scan->xs_itup points to a copy of the index tuple.
737 * If there are no matching items in the index, we return false, with no
738 * pins or locks held.
740 * Note that scan->keyData[], and the so->keyData[] scankey built from it,
741 * are both search-type scankeys (see nbtree/README for more about this).
742 * Within this routine, we build a temporary insertion-type scankey to use
743 * in locating the scan start position.
746 _bt_first(IndexScanDesc scan, ScanDirection dir)
748 Relation rel = scan->indexRelation;
749 BTScanOpaque so = (BTScanOpaque) scan->opaque;
753 StrategyNumber strat;
756 BTScanInsertData inskey;
757 ScanKey startKeys[INDEX_MAX_KEYS];
758 ScanKeyData notnullkeys[INDEX_MAX_KEYS];
762 StrategyNumber strat_total;
763 BTScanPosItem *currItem;
766 Assert(!BTScanPosIsValid(so->currPos));
768 pgstat_count_index_scan(rel);
771 * Examine the scan keys and eliminate any redundant keys; also mark the
772 * keys that must be matched to continue the scan.
774 _bt_preprocess_keys(scan);
777 * Quit now if _bt_preprocess_keys() discovered that the scan keys can
778 * never be satisfied (eg, x == 1 AND x > 2).
784 * For parallel scans, get the starting page from shared state. If the
785 * scan has not started, proceed to find out first leaf page in the usual
786 * way while keeping other participating processes waiting. If the scan
787 * has already begun, use the page number from the shared structure.
789 if (scan->parallel_scan != NULL)
791 status = _bt_parallel_seize(scan, &blkno);
794 else if (blkno == P_NONE)
796 _bt_parallel_done(scan);
799 else if (blkno != InvalidBlockNumber)
801 if (!_bt_parallel_readpage(scan, blkno, dir))
808 * Examine the scan keys to discover where we need to start the scan.
810 * We want to identify the keys that can be used as starting boundaries;
811 * these are =, >, or >= keys for a forward scan or =, <, <= keys for
812 * a backwards scan. We can use keys for multiple attributes so long as
813 * the prior attributes had only =, >= (resp. =, <=) keys. Once we accept
814 * a > or < boundary or find an attribute with no boundary (which can be
815 * thought of as the same as "> -infinity"), we can't use keys for any
816 * attributes to its right, because it would break our simplistic notion
817 * of what initial positioning strategy to use.
819 * When the scan keys include cross-type operators, _bt_preprocess_keys
820 * may not be able to eliminate redundant keys; in such cases we will
821 * arbitrarily pick a usable one for each attribute. This is correct
822 * but possibly not optimal behavior. (For example, with keys like
823 * "x >= 4 AND x >= 5" we would elect to scan starting at x=4 when
824 * x=5 would be more efficient.) Since the situation only arises given
825 * a poorly-worded query plus an incomplete opfamily, live with it.
827 * When both equality and inequality keys appear for a single attribute
828 * (again, only possible when cross-type operators appear), we *must*
829 * select one of the equality keys for the starting point, because
830 * _bt_checkkeys() will stop the scan as soon as an equality qual fails.
831 * For example, if we have keys like "x >= 4 AND x = 10" and we elect to
832 * start at x=4, we will fail and stop before reaching x=10. If multiple
833 * equality quals survive preprocessing, however, it doesn't matter which
834 * one we use --- by definition, they are either redundant or
837 * Any regular (not SK_SEARCHNULL) key implies a NOT NULL qualifier.
838 * If the index stores nulls at the end of the index we'll be starting
839 * from, and we have no boundary key for the column (which means the key
840 * we deduced NOT NULL from is an inequality key that constrains the other
841 * end of the index), then we cons up an explicit SK_SEARCHNOTNULL key to
842 * use as a boundary key. If we didn't do this, we might find ourselves
843 * traversing a lot of null entries at the start of the scan.
845 * In this loop, row-comparison keys are treated the same as keys on their
846 * first (leftmost) columns. We'll add on lower-order columns of the row
847 * comparison below, if possible.
849 * The selected scan keys (at most one per index column) are remembered by
850 * storing their addresses into the local startKeys[] array.
853 strat_total = BTEqualStrategyNumber;
854 if (so->numberOfKeys > 0)
862 * chosen is the so-far-chosen key for the current attribute, if any.
863 * We don't cast the decision in stone until we reach keys for the
868 /* Also remember any scankey that implies a NOT NULL constraint */
872 * Loop iterates from 0 to numberOfKeys inclusive; we use the last
873 * pass to handle after-last-key processing. Actual exit from the
874 * loop is at one of the "break" statements below.
876 for (cur = so->keyData, i = 0;; cur++, i++)
878 if (i >= so->numberOfKeys || cur->sk_attno != curattr)
881 * Done looking at keys for curattr. If we didn't find a
882 * usable boundary key, see if we can deduce a NOT NULL key.
884 if (chosen == NULL && impliesNN != NULL &&
885 ((impliesNN->sk_flags & SK_BT_NULLS_FIRST) ?
886 ScanDirectionIsForward(dir) :
887 ScanDirectionIsBackward(dir)))
889 /* Yes, so build the key in notnullkeys[keysCount] */
890 chosen = ¬nullkeys[keysCount];
891 ScanKeyEntryInitialize(chosen,
892 (SK_SEARCHNOTNULL | SK_ISNULL |
893 (impliesNN->sk_flags &
894 (SK_BT_DESC | SK_BT_NULLS_FIRST))),
896 ((impliesNN->sk_flags & SK_BT_NULLS_FIRST) ?
897 BTGreaterStrategyNumber :
898 BTLessStrategyNumber),
906 * If we still didn't find a usable boundary key, quit; else
907 * save the boundary key pointer in startKeys.
911 startKeys[keysCount++] = chosen;
914 * Adjust strat_total, and quit if we have stored a > or <
917 strat = chosen->sk_strategy;
918 if (strat != BTEqualStrategyNumber)
921 if (strat == BTGreaterStrategyNumber ||
922 strat == BTLessStrategyNumber)
927 * Done if that was the last attribute, or if next key is not
928 * in sequence (implying no boundary key is available for the
931 if (i >= so->numberOfKeys ||
932 cur->sk_attno != curattr + 1)
936 * Reset for next attr.
938 curattr = cur->sk_attno;
944 * Can we use this key as a starting boundary for this attr?
946 * If not, does it imply a NOT NULL constraint? (Because
947 * SK_SEARCHNULL keys are always assigned BTEqualStrategyNumber,
948 * *any* inequality key works for that; we need not test.)
950 switch (cur->sk_strategy)
952 case BTLessStrategyNumber:
953 case BTLessEqualStrategyNumber:
956 if (ScanDirectionIsBackward(dir))
962 case BTEqualStrategyNumber:
963 /* override any non-equality choice */
966 case BTGreaterEqualStrategyNumber:
967 case BTGreaterStrategyNumber:
970 if (ScanDirectionIsForward(dir))
981 * If we found no usable boundary keys, we have to start from one end of
982 * the tree. Walk down that edge to the first or last key, and scan from
989 match = _bt_endpoint(scan, dir);
993 /* No match, so mark (parallel) scan finished */
994 _bt_parallel_done(scan);
1001 * We want to start the scan somewhere within the index. Set up an
1002 * insertion scankey we can use to search for the boundary point we
1003 * identified above. The insertion scankey is built using the keys
1004 * identified by startKeys[]. (Remaining insertion scankey fields are
1005 * initialized after initial-positioning strategy is finalized.)
1007 Assert(keysCount <= INDEX_MAX_KEYS);
1008 for (i = 0; i < keysCount; i++)
1010 ScanKey cur = startKeys[i];
1012 Assert(cur->sk_attno == i + 1);
1014 if (cur->sk_flags & SK_ROW_HEADER)
1017 * Row comparison header: look to the first row member instead.
1019 * The member scankeys are already in insertion format (ie, they
1020 * have sk_func = 3-way-comparison function), but we have to watch
1021 * out for nulls, which _bt_preprocess_keys didn't check. A null
1022 * in the first row member makes the condition unmatchable, just
1023 * like qual_ok = false.
1025 ScanKey subkey = (ScanKey) DatumGetPointer(cur->sk_argument);
1027 Assert(subkey->sk_flags & SK_ROW_MEMBER);
1028 if (subkey->sk_flags & SK_ISNULL)
1030 _bt_parallel_done(scan);
1033 memcpy(inskey.scankeys + i, subkey, sizeof(ScanKeyData));
1036 * If the row comparison is the last positioning key we accepted,
1037 * try to add additional keys from the lower-order row members.
1038 * (If we accepted independent conditions on additional index
1039 * columns, we use those instead --- doesn't seem worth trying to
1040 * determine which is more restrictive.) Note that this is OK
1041 * even if the row comparison is of ">" or "<" type, because the
1042 * condition applied to all but the last row member is effectively
1043 * ">=" or "<=", and so the extra keys don't break the positioning
1044 * scheme. But, by the same token, if we aren't able to use all
1045 * the row members, then the part of the row comparison that we
1046 * did use has to be treated as just a ">=" or "<=" condition, and
1047 * so we'd better adjust strat_total accordingly.
1049 if (i == keysCount - 1)
1051 bool used_all_subkeys = false;
1053 Assert(!(subkey->sk_flags & SK_ROW_END));
1057 Assert(subkey->sk_flags & SK_ROW_MEMBER);
1058 if (subkey->sk_attno != keysCount + 1)
1059 break; /* out-of-sequence, can't use it */
1060 if (subkey->sk_strategy != cur->sk_strategy)
1061 break; /* wrong direction, can't use it */
1062 if (subkey->sk_flags & SK_ISNULL)
1063 break; /* can't use null keys */
1064 Assert(keysCount < INDEX_MAX_KEYS);
1065 memcpy(inskey.scankeys + keysCount, subkey,
1066 sizeof(ScanKeyData));
1068 if (subkey->sk_flags & SK_ROW_END)
1070 used_all_subkeys = true;
1074 if (!used_all_subkeys)
1076 switch (strat_total)
1078 case BTLessStrategyNumber:
1079 strat_total = BTLessEqualStrategyNumber;
1081 case BTGreaterStrategyNumber:
1082 strat_total = BTGreaterEqualStrategyNumber;
1086 break; /* done with outer loop */
1092 * Ordinary comparison key. Transform the search-style scan key
1093 * to an insertion scan key by replacing the sk_func with the
1094 * appropriate btree comparison function.
1096 * If scankey operator is not a cross-type comparison, we can use
1097 * the cached comparison function; otherwise gotta look it up in
1098 * the catalogs. (That can't lead to infinite recursion, since no
1099 * indexscan initiated by syscache lookup will use cross-data-type
1102 * We support the convention that sk_subtype == InvalidOid means
1103 * the opclass input type; this is a hack to simplify life for
1106 if (cur->sk_subtype == rel->rd_opcintype[i] ||
1107 cur->sk_subtype == InvalidOid)
1111 procinfo = index_getprocinfo(rel, cur->sk_attno, BTORDER_PROC);
1112 ScanKeyEntryInitializeWithInfo(inskey.scankeys + i,
1123 RegProcedure cmp_proc;
1125 cmp_proc = get_opfamily_proc(rel->rd_opfamily[i],
1126 rel->rd_opcintype[i],
1129 if (!RegProcedureIsValid(cmp_proc))
1130 elog(ERROR, "missing support function %d(%u,%u) for attribute %d of index \"%s\"",
1131 BTORDER_PROC, rel->rd_opcintype[i], cur->sk_subtype,
1132 cur->sk_attno, RelationGetRelationName(rel));
1133 ScanKeyEntryInitialize(inskey.scankeys + i,
1146 * Examine the selected initial-positioning strategy to determine exactly
1147 * where we need to start the scan, and set flag variables to control the
1150 * If nextkey = false, _bt_search and _bt_binsrch will locate the first
1151 * item >= scan key. If nextkey = true, they will locate the first
1154 * If goback = true, we will then step back one item, while if
1155 * goback = false, we will start the scan on the located item.
1158 switch (strat_total)
1160 case BTLessStrategyNumber:
1163 * Find first item >= scankey, then back up one to arrive at last
1164 * item < scankey. (Note: this positioning strategy is only used
1165 * for a backward scan, so that is always the correct starting
1172 case BTLessEqualStrategyNumber:
1175 * Find first item > scankey, then back up one to arrive at last
1176 * item <= scankey. (Note: this positioning strategy is only used
1177 * for a backward scan, so that is always the correct starting
1184 case BTEqualStrategyNumber:
1187 * If a backward scan was specified, need to start with last equal
1188 * item not first one.
1190 if (ScanDirectionIsBackward(dir))
1193 * This is the same as the <= strategy. We will check at the
1194 * end whether the found item is actually =.
1202 * This is the same as the >= strategy. We will check at the
1203 * end whether the found item is actually =.
1210 case BTGreaterEqualStrategyNumber:
1213 * Find first item >= scankey. (This is only used for forward
1220 case BTGreaterStrategyNumber:
1223 * Find first item > scankey. (This is only used for forward
1231 /* can't get here, but keep compiler quiet */
1232 elog(ERROR, "unrecognized strat_total: %d", (int) strat_total);
1236 /* Initialize remaining insertion scan key fields */
1237 inskey.heapkeyspace = _bt_heapkeyspace(rel);
1238 inskey.anynullkeys = false; /* unused */
1239 inskey.nextkey = nextkey;
1240 inskey.pivotsearch = false;
1241 inskey.scantid = NULL;
1242 inskey.keysz = keysCount;
1245 * Use the manufactured insertion scan key to descend the tree and
1246 * position ourselves on the target leaf page.
1248 stack = _bt_search(rel, &inskey, &buf, BT_READ, scan->xs_snapshot);
1250 /* don't need to keep the stack around... */
1251 _bt_freestack(stack);
1253 if (!BufferIsValid(buf))
1256 * We only get here if the index is completely empty. Lock relation
1257 * because nothing finer to lock exists.
1259 PredicateLockRelation(rel, scan->xs_snapshot);
1262 * mark parallel scan as done, so that all the workers can finish
1265 _bt_parallel_done(scan);
1266 BTScanPosInvalidate(so->currPos);
1271 PredicateLockPage(rel, BufferGetBlockNumber(buf),
1274 _bt_initialize_more_data(so, dir);
1276 /* position to the precise item on the page */
1277 offnum = _bt_binsrch(rel, &inskey, buf);
1280 * If nextkey = false, we are positioned at the first item >= scan key, or
1281 * possibly at the end of a page on which all the existing items are less
1282 * than the scan key and we know that everything on later pages is greater
1283 * than or equal to scan key.
1285 * If nextkey = true, we are positioned at the first item > scan key, or
1286 * possibly at the end of a page on which all the existing items are less
1287 * than or equal to the scan key and we know that everything on later
1288 * pages is greater than scan key.
1290 * The actually desired starting point is either this item or the prior
1291 * one, or in the end-of-page case it's the first item on the next page or
1292 * the last item on this page. Adjust the starting offset if needed. (If
1293 * this results in an offset before the first item or after the last one,
1294 * _bt_readpage will report no items found, and then we'll step to the
1295 * next page as needed.)
1298 offnum = OffsetNumberPrev(offnum);
1300 /* remember which buffer we have pinned, if any */
1301 Assert(!BTScanPosIsValid(so->currPos));
1302 so->currPos.buf = buf;
1305 * Now load data from the first page of the scan.
1307 if (!_bt_readpage(scan, dir, offnum))
1310 * There's no actually-matching data on this page. Try to advance to
1311 * the next page. Return false if there's no matching data at all.
1313 LockBuffer(so->currPos.buf, BUFFER_LOCK_UNLOCK);
1314 if (!_bt_steppage(scan, dir))
1319 /* Drop the lock, and maybe the pin, on the current page */
1320 _bt_drop_lock_and_maybe_pin(scan, &so->currPos);
1324 /* OK, itemIndex says what to return */
1325 currItem = &so->currPos.items[so->currPos.itemIndex];
1326 scan->xs_heaptid = currItem->heapTid;
1327 if (scan->xs_want_itup)
1328 scan->xs_itup = (IndexTuple) (so->currTuples + currItem->tupleOffset);
1334 * _bt_next() -- Get the next item in a scan.
1336 * On entry, so->currPos describes the current page, which may be pinned
1337 * but is not locked, and so->currPos.itemIndex identifies which item was
1338 * previously returned.
1340 * On successful exit, scan->xs_ctup.t_self is set to the TID of the
1341 * next heap tuple, and if requested, scan->xs_itup points to a copy of
1342 * the index tuple. so->currPos is updated as needed.
1344 * On failure exit (no more tuples), we release pin and set
1345 * so->currPos.buf to InvalidBuffer.
1348 _bt_next(IndexScanDesc scan, ScanDirection dir)
1350 BTScanOpaque so = (BTScanOpaque) scan->opaque;
1351 BTScanPosItem *currItem;
1354 * Advance to next tuple on current page; or if there's no more, try to
1355 * step to the next page with data.
1357 if (ScanDirectionIsForward(dir))
1359 if (++so->currPos.itemIndex > so->currPos.lastItem)
1361 if (!_bt_steppage(scan, dir))
1367 if (--so->currPos.itemIndex < so->currPos.firstItem)
1369 if (!_bt_steppage(scan, dir))
1374 /* OK, itemIndex says what to return */
1375 currItem = &so->currPos.items[so->currPos.itemIndex];
1376 scan->xs_heaptid = currItem->heapTid;
1377 if (scan->xs_want_itup)
1378 scan->xs_itup = (IndexTuple) (so->currTuples + currItem->tupleOffset);
1384 * _bt_readpage() -- Load data from current index page into so->currPos
1386 * Caller must have pinned and read-locked so->currPos.buf; the buffer's state
1387 * is not changed here. Also, currPos.moreLeft and moreRight must be valid;
1388 * they are updated as appropriate. All other fields of so->currPos are
1389 * initialized from scratch here.
1391 * We scan the current page starting at offnum and moving in the indicated
1392 * direction. All items matching the scan keys are loaded into currPos.items.
1393 * moreLeft or moreRight (as appropriate) is cleared if _bt_checkkeys reports
1394 * that there can be no more matching tuples in the current scan direction.
1396 * In the case of a parallel scan, caller must have called _bt_parallel_seize
1397 * prior to calling this function; this function will invoke
1398 * _bt_parallel_release before returning.
1400 * Returns true if any matching items found on the page, false if none.
1403 _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum)
1405 BTScanOpaque so = (BTScanOpaque) scan->opaque;
1407 BTPageOpaque opaque;
1408 OffsetNumber minoff;
1409 OffsetNumber maxoff;
1415 * We must have the buffer pinned and locked, but the usual macro can't be
1416 * used here; this function is what makes it good for currPos.
1418 Assert(BufferIsValid(so->currPos.buf));
1420 page = BufferGetPage(so->currPos.buf);
1421 opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1423 /* allow next page be processed by parallel worker */
1424 if (scan->parallel_scan)
1426 if (ScanDirectionIsForward(dir))
1427 _bt_parallel_release(scan, opaque->btpo_next);
1429 _bt_parallel_release(scan, BufferGetBlockNumber(so->currPos.buf));
1432 continuescan = true; /* default assumption */
1433 indnatts = IndexRelationGetNumberOfAttributes(scan->indexRelation);
1434 minoff = P_FIRSTDATAKEY(opaque);
1435 maxoff = PageGetMaxOffsetNumber(page);
1438 * We note the buffer's block number so that we can release the pin later.
1439 * This allows us to re-read the buffer if it is needed again for hinting.
1441 so->currPos.currPage = BufferGetBlockNumber(so->currPos.buf);
1444 * We save the LSN of the page as we read it, so that we know whether it
1445 * safe to apply LP_DEAD hints to the page later. This allows us to drop
1446 * the pin for MVCC scans, which allows vacuum to avoid blocking.
1448 so->currPos.lsn = BufferGetLSNAtomic(so->currPos.buf);
1451 * we must save the page's right-link while scanning it; this tells us
1452 * where to step right to after we're done with these items. There is no
1453 * corresponding need for the left-link, since splits always go right.
1455 so->currPos.nextPage = opaque->btpo_next;
1457 /* initialize tuple workspace to empty */
1458 so->currPos.nextTupleOffset = 0;
1461 * Now that the current page has been made consistent, the macro should be
1464 Assert(BTScanPosIsPinned(so->currPos));
1466 if (ScanDirectionIsForward(dir))
1468 /* load items[] in ascending order */
1471 offnum = Max(offnum, minoff);
1473 while (offnum <= maxoff)
1475 ItemId iid = PageGetItemId(page, offnum);
1479 * If the scan specifies not to return killed tuples, then we
1480 * treat a killed tuple as not passing the qual
1482 if (scan->ignore_killed_tuples && ItemIdIsDead(iid))
1484 offnum = OffsetNumberNext(offnum);
1488 itup = (IndexTuple) PageGetItem(page, iid);
1490 if (_bt_checkkeys(scan, itup, indnatts, dir, &continuescan))
1492 /* tuple passes all scan key conditions, so remember it */
1493 _bt_saveitem(so, itemIndex, offnum, itup);
1496 /* When !continuescan, there can't be any more matches, so stop */
1500 offnum = OffsetNumberNext(offnum);
1504 * We don't need to visit page to the right when the high key
1505 * indicates that no more matches will be found there.
1507 * Checking the high key like this works out more often than you might
1508 * think. Leaf page splits pick a split point between the two most
1509 * dissimilar tuples (this is weighed against the need to evenly share
1510 * free space). Leaf pages with high key attribute values that can
1511 * only appear on non-pivot tuples on the right sibling page are
1514 if (continuescan && !P_RIGHTMOST(opaque))
1516 ItemId iid = PageGetItemId(page, P_HIKEY);
1517 IndexTuple itup = (IndexTuple) PageGetItem(page, iid);
1520 truncatt = BTreeTupleGetNAtts(itup, scan->indexRelation);
1521 _bt_checkkeys(scan, itup, truncatt, dir, &continuescan);
1525 so->currPos.moreRight = false;
1527 Assert(itemIndex <= MaxIndexTuplesPerPage);
1528 so->currPos.firstItem = 0;
1529 so->currPos.lastItem = itemIndex - 1;
1530 so->currPos.itemIndex = 0;
1534 /* load items[] in descending order */
1535 itemIndex = MaxIndexTuplesPerPage;
1537 offnum = Min(offnum, maxoff);
1539 while (offnum >= minoff)
1541 ItemId iid = PageGetItemId(page, offnum);
1547 * If the scan specifies not to return killed tuples, then we
1548 * treat a killed tuple as not passing the qual. Most of the
1549 * time, it's a win to not bother examining the tuple's index
1550 * keys, but just skip to the next tuple (previous, actually,
1551 * since we're scanning backwards). However, if this is the first
1552 * tuple on the page, we do check the index keys, to prevent
1553 * uselessly advancing to the page to the left. This is similar
1554 * to the high key optimization used by forward scans.
1556 if (scan->ignore_killed_tuples && ItemIdIsDead(iid))
1558 Assert(offnum >= P_FIRSTDATAKEY(opaque));
1559 if (offnum > P_FIRSTDATAKEY(opaque))
1561 offnum = OffsetNumberPrev(offnum);
1565 tuple_alive = false;
1570 itup = (IndexTuple) PageGetItem(page, iid);
1572 passes_quals = _bt_checkkeys(scan, itup, indnatts, dir,
1574 if (passes_quals && tuple_alive)
1576 /* tuple passes all scan key conditions, so remember it */
1578 _bt_saveitem(so, itemIndex, offnum, itup);
1582 /* there can't be any more matches, so stop */
1583 so->currPos.moreLeft = false;
1587 offnum = OffsetNumberPrev(offnum);
1590 Assert(itemIndex >= 0);
1591 so->currPos.firstItem = itemIndex;
1592 so->currPos.lastItem = MaxIndexTuplesPerPage - 1;
1593 so->currPos.itemIndex = MaxIndexTuplesPerPage - 1;
1596 return (so->currPos.firstItem <= so->currPos.lastItem);
1599 /* Save an index item into so->currPos.items[itemIndex] */
1601 _bt_saveitem(BTScanOpaque so, int itemIndex,
1602 OffsetNumber offnum, IndexTuple itup)
1604 BTScanPosItem *currItem = &so->currPos.items[itemIndex];
1606 currItem->heapTid = itup->t_tid;
1607 currItem->indexOffset = offnum;
1610 Size itupsz = IndexTupleSize(itup);
1612 currItem->tupleOffset = so->currPos.nextTupleOffset;
1613 memcpy(so->currTuples + so->currPos.nextTupleOffset, itup, itupsz);
1614 so->currPos.nextTupleOffset += MAXALIGN(itupsz);
1619 * _bt_steppage() -- Step to next page containing valid data for scan
1621 * On entry, if so->currPos.buf is valid the buffer is pinned but not locked;
1622 * if pinned, we'll drop the pin before moving to next page. The buffer is
1623 * not locked on entry.
1625 * For success on a scan using a non-MVCC snapshot we hold a pin, but not a
1626 * read lock, on that page. If we do not hold the pin, we set so->currPos.buf
1627 * to InvalidBuffer. We return true to indicate success.
1630 _bt_steppage(IndexScanDesc scan, ScanDirection dir)
1632 BTScanOpaque so = (BTScanOpaque) scan->opaque;
1633 BlockNumber blkno = InvalidBlockNumber;
1636 Assert(BTScanPosIsValid(so->currPos));
1638 /* Before leaving current page, deal with any killed items */
1639 if (so->numKilled > 0)
1640 _bt_killitems(scan);
1643 * Before we modify currPos, make a copy of the page data if there was a
1644 * mark position that needs it.
1646 if (so->markItemIndex >= 0)
1648 /* bump pin on current buffer for assignment to mark buffer */
1649 if (BTScanPosIsPinned(so->currPos))
1650 IncrBufferRefCount(so->currPos.buf);
1651 memcpy(&so->markPos, &so->currPos,
1652 offsetof(BTScanPosData, items[1]) +
1653 so->currPos.lastItem * sizeof(BTScanPosItem));
1655 memcpy(so->markTuples, so->currTuples,
1656 so->currPos.nextTupleOffset);
1657 so->markPos.itemIndex = so->markItemIndex;
1658 so->markItemIndex = -1;
1661 if (ScanDirectionIsForward(dir))
1663 /* Walk right to the next page with data */
1664 if (scan->parallel_scan != NULL)
1667 * Seize the scan to get the next block number; if the scan has
1668 * ended already, bail out.
1670 status = _bt_parallel_seize(scan, &blkno);
1673 /* release the previous buffer, if pinned */
1674 BTScanPosUnpinIfPinned(so->currPos);
1675 BTScanPosInvalidate(so->currPos);
1681 /* Not parallel, so use the previously-saved nextPage link. */
1682 blkno = so->currPos.nextPage;
1685 /* Remember we left a page with data */
1686 so->currPos.moreLeft = true;
1688 /* release the previous buffer, if pinned */
1689 BTScanPosUnpinIfPinned(so->currPos);
1693 /* Remember we left a page with data */
1694 so->currPos.moreRight = true;
1696 if (scan->parallel_scan != NULL)
1699 * Seize the scan to get the current block number; if the scan has
1700 * ended already, bail out.
1702 status = _bt_parallel_seize(scan, &blkno);
1703 BTScanPosUnpinIfPinned(so->currPos);
1706 BTScanPosInvalidate(so->currPos);
1712 /* Not parallel, so just use our own notion of the current page */
1713 blkno = so->currPos.currPage;
1717 if (!_bt_readnextpage(scan, blkno, dir))
1720 /* Drop the lock, and maybe the pin, on the current page */
1721 _bt_drop_lock_and_maybe_pin(scan, &so->currPos);
1727 * _bt_readnextpage() -- Read next page containing valid data for scan
1729 * On success exit, so->currPos is updated to contain data from the next
1730 * interesting page. Caller is responsible to release lock and pin on
1731 * buffer on success. We return true to indicate success.
1733 * If there are no more matching records in the given direction, we drop all
1734 * locks and pins, set so->currPos.buf to InvalidBuffer, and return false.
1737 _bt_readnextpage(IndexScanDesc scan, BlockNumber blkno, ScanDirection dir)
1739 BTScanOpaque so = (BTScanOpaque) scan->opaque;
1742 BTPageOpaque opaque;
1745 rel = scan->indexRelation;
1747 if (ScanDirectionIsForward(dir))
1752 * if we're at end of scan, give up and mark parallel scan as
1753 * done, so that all the workers can finish their scan
1755 if (blkno == P_NONE || !so->currPos.moreRight)
1757 _bt_parallel_done(scan);
1758 BTScanPosInvalidate(so->currPos);
1761 /* check for interrupts while we're not holding any buffer lock */
1762 CHECK_FOR_INTERRUPTS();
1763 /* step right one page */
1764 so->currPos.buf = _bt_getbuf(rel, blkno, BT_READ);
1765 page = BufferGetPage(so->currPos.buf);
1766 TestForOldSnapshot(scan->xs_snapshot, rel, page);
1767 opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1768 /* check for deleted page */
1769 if (!P_IGNORE(opaque))
1771 PredicateLockPage(rel, blkno, scan->xs_snapshot);
1772 /* see if there are any matches on this page */
1773 /* note that this will clear moreRight if we can stop */
1774 if (_bt_readpage(scan, dir, P_FIRSTDATAKEY(opaque)))
1777 else if (scan->parallel_scan != NULL)
1779 /* allow next page be processed by parallel worker */
1780 _bt_parallel_release(scan, opaque->btpo_next);
1783 /* nope, keep going */
1784 if (scan->parallel_scan != NULL)
1786 _bt_relbuf(rel, so->currPos.buf);
1787 status = _bt_parallel_seize(scan, &blkno);
1790 BTScanPosInvalidate(so->currPos);
1796 blkno = opaque->btpo_next;
1797 _bt_relbuf(rel, so->currPos.buf);
1804 * Should only happen in parallel cases, when some other backend
1805 * advanced the scan.
1807 if (so->currPos.currPage != blkno)
1809 BTScanPosUnpinIfPinned(so->currPos);
1810 so->currPos.currPage = blkno;
1814 * Walk left to the next page with data. This is much more complex
1815 * than the walk-right case because of the possibility that the page
1816 * to our left splits while we are in flight to it, plus the
1817 * possibility that the page we were on gets deleted after we leave
1818 * it. See nbtree/README for details.
1820 * It might be possible to rearrange this code to have less overhead
1821 * in pinning and locking, but that would require capturing the left
1822 * pointer when the page is initially read, and using it here, along
1823 * with big changes to _bt_walk_left() and the code below. It is not
1824 * clear whether this would be a win, since if the page immediately to
1825 * the left splits after we read this page and before we step left, we
1826 * would need to visit more pages than with the current code.
1828 * Note that if we change the code so that we drop the pin for a scan
1829 * which uses a non-MVCC snapshot, we will need to modify the code for
1830 * walking left, to allow for the possibility that a referenced page
1831 * has been deleted. As long as the buffer is pinned or the snapshot
1832 * is MVCC the page cannot move past the half-dead state to fully
1835 if (BTScanPosIsPinned(so->currPos))
1836 LockBuffer(so->currPos.buf, BT_READ);
1838 so->currPos.buf = _bt_getbuf(rel, so->currPos.currPage, BT_READ);
1842 /* Done if we know there are no matching keys to the left */
1843 if (!so->currPos.moreLeft)
1845 _bt_relbuf(rel, so->currPos.buf);
1846 _bt_parallel_done(scan);
1847 BTScanPosInvalidate(so->currPos);
1851 /* Step to next physical page */
1852 so->currPos.buf = _bt_walk_left(rel, so->currPos.buf,
1855 /* if we're physically at end of index, return failure */
1856 if (so->currPos.buf == InvalidBuffer)
1858 _bt_parallel_done(scan);
1859 BTScanPosInvalidate(so->currPos);
1864 * Okay, we managed to move left to a non-deleted page. Done if
1865 * it's not half-dead and contains matching tuples. Else loop back
1866 * and do it all again.
1868 page = BufferGetPage(so->currPos.buf);
1869 TestForOldSnapshot(scan->xs_snapshot, rel, page);
1870 opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1871 if (!P_IGNORE(opaque))
1873 PredicateLockPage(rel, BufferGetBlockNumber(so->currPos.buf), scan->xs_snapshot);
1874 /* see if there are any matches on this page */
1875 /* note that this will clear moreLeft if we can stop */
1876 if (_bt_readpage(scan, dir, PageGetMaxOffsetNumber(page)))
1879 else if (scan->parallel_scan != NULL)
1881 /* allow next page be processed by parallel worker */
1882 _bt_parallel_release(scan, BufferGetBlockNumber(so->currPos.buf));
1886 * For parallel scans, get the last page scanned as it is quite
1887 * possible that by the time we try to seize the scan, some other
1888 * worker has already advanced the scan to a different page. We
1889 * must continue based on the latest page scanned by any worker.
1891 if (scan->parallel_scan != NULL)
1893 _bt_relbuf(rel, so->currPos.buf);
1894 status = _bt_parallel_seize(scan, &blkno);
1897 BTScanPosInvalidate(so->currPos);
1900 so->currPos.buf = _bt_getbuf(rel, blkno, BT_READ);
1909 * _bt_parallel_readpage() -- Read current page containing valid data for scan
1911 * On success, release lock and maybe pin on buffer. We return true to
1915 _bt_parallel_readpage(IndexScanDesc scan, BlockNumber blkno, ScanDirection dir)
1917 BTScanOpaque so = (BTScanOpaque) scan->opaque;
1919 _bt_initialize_more_data(so, dir);
1921 if (!_bt_readnextpage(scan, blkno, dir))
1924 /* Drop the lock, and maybe the pin, on the current page */
1925 _bt_drop_lock_and_maybe_pin(scan, &so->currPos);
1931 * _bt_walk_left() -- step left one page, if possible
1933 * The given buffer must be pinned and read-locked. This will be dropped
1934 * before stepping left. On return, we have pin and read lock on the
1935 * returned page, instead.
1937 * Returns InvalidBuffer if there is no page to the left (no lock is held
1940 * When working on a non-leaf level, it is possible for the returned page
1941 * to be half-dead; the caller should check that condition and step left
1942 * again if it's important.
1945 _bt_walk_left(Relation rel, Buffer buf, Snapshot snapshot)
1948 BTPageOpaque opaque;
1950 page = BufferGetPage(buf);
1951 opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1960 /* if we're at end of tree, release buf and return failure */
1961 if (P_LEFTMOST(opaque))
1963 _bt_relbuf(rel, buf);
1966 /* remember original page we are stepping left from */
1967 obknum = BufferGetBlockNumber(buf);
1969 blkno = lblkno = opaque->btpo_prev;
1970 _bt_relbuf(rel, buf);
1971 /* check for interrupts while we're not holding any buffer lock */
1972 CHECK_FOR_INTERRUPTS();
1973 buf = _bt_getbuf(rel, blkno, BT_READ);
1974 page = BufferGetPage(buf);
1975 TestForOldSnapshot(snapshot, rel, page);
1976 opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1979 * If this isn't the page we want, walk right till we find what we
1980 * want --- but go no more than four hops (an arbitrary limit). If we
1981 * don't find the correct page by then, the most likely bet is that
1982 * the original page got deleted and isn't in the sibling chain at all
1983 * anymore, not that its left sibling got split more than four times.
1985 * Note that it is correct to test P_ISDELETED not P_IGNORE here,
1986 * because half-dead pages are still in the sibling chain. Caller
1987 * must reject half-dead pages if wanted.
1992 if (!P_ISDELETED(opaque) && opaque->btpo_next == obknum)
1994 /* Found desired page, return it */
1997 if (P_RIGHTMOST(opaque) || ++tries > 4)
1999 blkno = opaque->btpo_next;
2000 buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);
2001 page = BufferGetPage(buf);
2002 TestForOldSnapshot(snapshot, rel, page);
2003 opaque = (BTPageOpaque) PageGetSpecialPointer(page);
2006 /* Return to the original page to see what's up */
2007 buf = _bt_relandgetbuf(rel, buf, obknum, BT_READ);
2008 page = BufferGetPage(buf);
2009 TestForOldSnapshot(snapshot, rel, page);
2010 opaque = (BTPageOpaque) PageGetSpecialPointer(page);
2011 if (P_ISDELETED(opaque))
2014 * It was deleted. Move right to first nondeleted page (there
2015 * must be one); that is the page that has acquired the deleted
2016 * one's keyspace, so stepping left from it will take us where we
2021 if (P_RIGHTMOST(opaque))
2022 elog(ERROR, "fell off the end of index \"%s\"",
2023 RelationGetRelationName(rel));
2024 blkno = opaque->btpo_next;
2025 buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);
2026 page = BufferGetPage(buf);
2027 TestForOldSnapshot(snapshot, rel, page);
2028 opaque = (BTPageOpaque) PageGetSpecialPointer(page);
2029 if (!P_ISDELETED(opaque))
2034 * Now return to top of loop, resetting obknum to point to this
2035 * nondeleted page, and try again.
2041 * It wasn't deleted; the explanation had better be that the page
2042 * to the left got split or deleted. Without this check, we'd go
2043 * into an infinite loop if there's anything wrong.
2045 if (opaque->btpo_prev == lblkno)
2046 elog(ERROR, "could not find left sibling of block %u in index \"%s\"",
2047 obknum, RelationGetRelationName(rel));
2048 /* Okay to try again with new lblkno value */
2052 return InvalidBuffer;
2056 * _bt_get_endpoint() -- Find the first or last page on a given tree level
2058 * If the index is empty, we will return InvalidBuffer; any other failure
2059 * condition causes ereport(). We will not return a dead page.
2061 * The returned buffer is pinned and read-locked.
2064 _bt_get_endpoint(Relation rel, uint32 level, bool rightmost,
2069 BTPageOpaque opaque;
2070 OffsetNumber offnum;
2075 * If we are looking for a leaf page, okay to descend from fast root;
2076 * otherwise better descend from true root. (There is no point in being
2077 * smarter about intermediate levels.)
2080 buf = _bt_getroot(rel, BT_READ);
2082 buf = _bt_gettrueroot(rel);
2084 if (!BufferIsValid(buf))
2085 return InvalidBuffer;
2087 page = BufferGetPage(buf);
2088 TestForOldSnapshot(snapshot, rel, page);
2089 opaque = (BTPageOpaque) PageGetSpecialPointer(page);
2094 * If we landed on a deleted page, step right to find a live page
2095 * (there must be one). Also, if we want the rightmost page, step
2096 * right if needed to get to it (this could happen if the page split
2097 * since we obtained a pointer to it).
2099 while (P_IGNORE(opaque) ||
2100 (rightmost && !P_RIGHTMOST(opaque)))
2102 blkno = opaque->btpo_next;
2103 if (blkno == P_NONE)
2104 elog(ERROR, "fell off the end of index \"%s\"",
2105 RelationGetRelationName(rel));
2106 buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);
2107 page = BufferGetPage(buf);
2108 TestForOldSnapshot(snapshot, rel, page);
2109 opaque = (BTPageOpaque) PageGetSpecialPointer(page);
2113 if (opaque->btpo.level == level)
2115 if (opaque->btpo.level < level)
2117 (errcode(ERRCODE_INDEX_CORRUPTED),
2118 errmsg_internal("btree level %u not found in index \"%s\"",
2119 level, RelationGetRelationName(rel))));
2121 /* Descend to leftmost or rightmost child page */
2123 offnum = PageGetMaxOffsetNumber(page);
2125 offnum = P_FIRSTDATAKEY(opaque);
2127 itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
2128 blkno = BTreeInnerTupleGetDownLink(itup);
2130 buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);
2131 page = BufferGetPage(buf);
2132 opaque = (BTPageOpaque) PageGetSpecialPointer(page);
2139 * _bt_endpoint() -- Find the first or last page in the index, and scan
2140 * from there to the first key satisfying all the quals.
2142 * This is used by _bt_first() to set up a scan when we've determined
2143 * that the scan must start at the beginning or end of the index (for
2144 * a forward or backward scan respectively). Exit conditions are the
2145 * same as for _bt_first().
2148 _bt_endpoint(IndexScanDesc scan, ScanDirection dir)
2150 Relation rel = scan->indexRelation;
2151 BTScanOpaque so = (BTScanOpaque) scan->opaque;
2154 BTPageOpaque opaque;
2156 BTScanPosItem *currItem;
2159 * Scan down to the leftmost or rightmost leaf page. This is a simplified
2160 * version of _bt_search(). We don't maintain a stack since we know we
2163 buf = _bt_get_endpoint(rel, 0, ScanDirectionIsBackward(dir), scan->xs_snapshot);
2165 if (!BufferIsValid(buf))
2168 * Empty index. Lock the whole relation, as nothing finer to lock
2171 PredicateLockRelation(rel, scan->xs_snapshot);
2172 BTScanPosInvalidate(so->currPos);
2176 PredicateLockPage(rel, BufferGetBlockNumber(buf), scan->xs_snapshot);
2177 page = BufferGetPage(buf);
2178 opaque = (BTPageOpaque) PageGetSpecialPointer(page);
2179 Assert(P_ISLEAF(opaque));
2181 if (ScanDirectionIsForward(dir))
2183 /* There could be dead pages to the left, so not this: */
2184 /* Assert(P_LEFTMOST(opaque)); */
2186 start = P_FIRSTDATAKEY(opaque);
2188 else if (ScanDirectionIsBackward(dir))
2190 Assert(P_RIGHTMOST(opaque));
2192 start = PageGetMaxOffsetNumber(page);
2196 elog(ERROR, "invalid scan direction: %d", (int) dir);
2197 start = 0; /* keep compiler quiet */
2200 /* remember which buffer we have pinned */
2201 so->currPos.buf = buf;
2203 _bt_initialize_more_data(so, dir);
2206 * Now load data from the first page of the scan.
2208 if (!_bt_readpage(scan, dir, start))
2211 * There's no actually-matching data on this page. Try to advance to
2212 * the next page. Return false if there's no matching data at all.
2214 LockBuffer(so->currPos.buf, BUFFER_LOCK_UNLOCK);
2215 if (!_bt_steppage(scan, dir))
2220 /* Drop the lock, and maybe the pin, on the current page */
2221 _bt_drop_lock_and_maybe_pin(scan, &so->currPos);
2224 /* OK, itemIndex says what to return */
2225 currItem = &so->currPos.items[so->currPos.itemIndex];
2226 scan->xs_heaptid = currItem->heapTid;
2227 if (scan->xs_want_itup)
2228 scan->xs_itup = (IndexTuple) (so->currTuples + currItem->tupleOffset);
2234 * _bt_initialize_more_data() -- initialize moreLeft/moreRight appropriately
2235 * for scan direction
2238 _bt_initialize_more_data(BTScanOpaque so, ScanDirection dir)
2240 /* initialize moreLeft/moreRight appropriately for scan direction */
2241 if (ScanDirectionIsForward(dir))
2243 so->currPos.moreLeft = false;
2244 so->currPos.moreRight = true;
2248 so->currPos.moreLeft = true;
2249 so->currPos.moreRight = false;
2251 so->numKilled = 0; /* just paranoia */
2252 so->markItemIndex = -1; /* ditto */