1 /*-------------------------------------------------------------------------
4 * Search code for postgres btrees.
7 * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
8 * Portions Copyright (c) 1994, Regents of the University of California
11 * src/backend/access/nbtree/nbtsearch.c
13 *-------------------------------------------------------------------------
18 #include "access/nbtree.h"
19 #include "access/relscan.h"
20 #include "miscadmin.h"
22 #include "storage/predicate.h"
23 #include "utils/lsyscache.h"
24 #include "utils/rel.h"
25 #include "utils/tqual.h"
28 static bool _bt_readpage(IndexScanDesc scan, ScanDirection dir,
30 static void _bt_saveitem(BTScanOpaque so, int itemIndex,
31 OffsetNumber offnum, IndexTuple itup);
32 static bool _bt_steppage(IndexScanDesc scan, ScanDirection dir);
33 static Buffer _bt_walk_left(Relation rel, Buffer buf, Snapshot snapshot);
34 static bool _bt_endpoint(IndexScanDesc scan, ScanDirection dir);
35 static void _bt_drop_lock_and_maybe_pin(IndexScanDesc scan, BTScanPos sp);
39 * _bt_drop_lock_and_maybe_pin()
41 * Unlock the buffer; and if it is safe to release the pin, do that, too. It
42 * is safe if the scan is using an MVCC snapshot and the index is WAL-logged.
43 * This will prevent vacuum from stalling in a blocked state trying to read a
44 * page when a cursor is sitting on it -- at least in many important cases.
46 * Set the buffer to invalid if the pin is released, since the buffer may be
47 * re-used. If we need to go back to this block (for example, to apply
48 * LP_DEAD hints) we must get a fresh reference to the buffer. Hopefully it
49 * will remain in shared memory for as long as it takes to scan the index
53 _bt_drop_lock_and_maybe_pin(IndexScanDesc scan, BTScanPos sp)
55 LockBuffer(sp->buf, BUFFER_LOCK_UNLOCK);
57 if (IsMVCCSnapshot(scan->xs_snapshot) &&
58 RelationNeedsWAL(scan->indexRelation) &&
61 ReleaseBuffer(sp->buf);
62 sp->buf = InvalidBuffer;
68 * _bt_search() -- Search the tree for a particular scankey,
69 * or more precisely for the first leaf page it could be on.
71 * The passed scankey must be an insertion-type scankey (see nbtree/README),
72 * but it can omit the rightmost column(s) of the index.
74 * When nextkey is false (the usual case), we are looking for the first
75 * item >= scankey. When nextkey is true, we are looking for the first
76 * item strictly greater than scankey.
78 * Return value is a stack of parent-page pointers. *bufP is set to the
79 * address of the leaf-page buffer, which is read-locked and pinned.
80 * No locks are held on the parent pages, however!
82 * NOTE that the returned buffer is read-locked regardless of the access
83 * parameter. However, access = BT_WRITE will allow an empty root page
84 * to be created and returned. When access = BT_READ, an empty index
85 * will result in *bufP being set to InvalidBuffer. Also, in BT_WRITE mode,
86 * any incomplete splits encountered during the search will be finished.
89 _bt_search(Relation rel, int keysz, ScanKey scankey, bool nextkey,
90 Buffer *bufP, int access)
92 BTStack stack_in = NULL;
94 /* Get the root page to start with */
95 *bufP = _bt_getroot(rel, access);
97 /* If index is empty and access = BT_READ, no root page is created. */
98 if (!BufferIsValid(*bufP))
100 return (BTStack) NULL;
103 /* Loop iterates once per level descended in the tree */
112 BlockNumber par_blkno;
116 * Race -- the page we just grabbed may have split since we read its
117 * pointer in the parent (or metapage). If it has, we may need to
118 * move right to its new sibling. Do that.
120 * In write-mode, allow _bt_moveright to finish any incomplete splits
121 * along the way. Strictly speaking, we'd only need to finish an
122 * incomplete split on the leaf page we're about to insert to, not on
123 * any of the upper levels (they are taken care of in _bt_getstackbuf,
124 * if the leaf page is split and we insert to the parent page). But
125 * this is a good opportunity to finish splits of internal pages too.
127 *bufP = _bt_moveright(rel, *bufP, keysz, scankey, nextkey,
128 (access == BT_WRITE), stack_in,
131 /* if this is a leaf page, we're done */
132 page = BufferGetPage(*bufP, NULL, NULL, BGP_NO_SNAPSHOT_TEST);
133 opaque = (BTPageOpaque) PageGetSpecialPointer(page);
134 if (P_ISLEAF(opaque))
138 * Find the appropriate item on the internal page, and get the child
139 * page that it points to.
141 offnum = _bt_binsrch(rel, *bufP, keysz, scankey, nextkey);
142 itemid = PageGetItemId(page, offnum);
143 itup = (IndexTuple) PageGetItem(page, itemid);
144 blkno = ItemPointerGetBlockNumber(&(itup->t_tid));
145 par_blkno = BufferGetBlockNumber(*bufP);
148 * We need to save the location of the index entry we chose in the
149 * parent page on a stack. In case we split the tree, we'll use the
150 * stack to work back up to the parent page. We also save the actual
151 * downlink (TID) to uniquely identify the index entry, in case it
152 * moves right while we're working lower in the tree. See the paper
153 * by Lehman and Yao for how this is detected and handled. (We use the
154 * child link to disambiguate duplicate keys in the index -- Lehman
155 * and Yao disallow duplicate keys.)
157 new_stack = (BTStack) palloc(sizeof(BTStackData));
158 new_stack->bts_blkno = par_blkno;
159 new_stack->bts_offset = offnum;
160 memcpy(&new_stack->bts_btentry, itup, sizeof(IndexTupleData));
161 new_stack->bts_parent = stack_in;
163 /* drop the read lock on the parent page, acquire one on the child */
164 *bufP = _bt_relandgetbuf(rel, *bufP, blkno, BT_READ);
166 /* okay, all set to move down a level */
167 stack_in = new_stack;
174 * _bt_moveright() -- move right in the btree if necessary.
176 * When we follow a pointer to reach a page, it is possible that
177 * the page has changed in the meanwhile. If this happens, we're
178 * guaranteed that the page has "split right" -- that is, that any
179 * data that appeared on the page originally is either on the page
180 * or strictly to the right of it.
182 * This routine decides whether or not we need to move right in the
183 * tree by examining the high key entry on the page. If that entry
184 * is strictly less than the scankey, or <= the scankey in the nextkey=true
185 * case, then we followed the wrong link and we need to move right.
187 * The passed scankey must be an insertion-type scankey (see nbtree/README),
188 * but it can omit the rightmost column(s) of the index.
190 * When nextkey is false (the usual case), we are looking for the first
191 * item >= scankey. When nextkey is true, we are looking for the first
192 * item strictly greater than scankey.
194 * If forupdate is true, we will attempt to finish any incomplete splits
195 * that we encounter. This is required when locking a target page for an
196 * insertion, because we don't allow inserting on a page before the split
197 * is completed. 'stack' is only used if forupdate is true.
199 * On entry, we have the buffer pinned and a lock of the type specified by
200 * 'access'. If we move right, we release the buffer and lock and acquire
201 * the same on the right sibling. Return value is the buffer we stop at.
204 _bt_moveright(Relation rel,
218 * When nextkey = false (normal case): if the scan key that brought us to
219 * this page is > the high key stored on the page, then the page has split
220 * and we need to move right. (If the scan key is equal to the high key,
221 * we might or might not need to move right; have to scan the page first
224 * When nextkey = true: move right if the scan key is >= page's high key.
226 * The page could even have split more than once, so scan as far as
229 * We also have to move right if we followed a link that brought us to a
232 cmpval = nextkey ? 0 : 1;
236 page = BufferGetPage(buf, NULL, NULL, BGP_NO_SNAPSHOT_TEST);
237 opaque = (BTPageOpaque) PageGetSpecialPointer(page);
239 if (P_RIGHTMOST(opaque))
243 * Finish any incomplete splits we encounter along the way.
245 if (forupdate && P_INCOMPLETE_SPLIT(opaque))
247 BlockNumber blkno = BufferGetBlockNumber(buf);
249 /* upgrade our lock if necessary */
250 if (access == BT_READ)
252 LockBuffer(buf, BUFFER_LOCK_UNLOCK);
253 LockBuffer(buf, BT_WRITE);
256 if (P_INCOMPLETE_SPLIT(opaque))
257 _bt_finish_split(rel, buf, stack);
259 _bt_relbuf(rel, buf);
261 /* re-acquire the lock in the right mode, and re-check */
262 buf = _bt_getbuf(rel, blkno, access);
266 if (P_IGNORE(opaque) || _bt_compare(rel, keysz, scankey, page, P_HIKEY) >= cmpval)
268 /* step right one page */
269 buf = _bt_relandgetbuf(rel, buf, opaque->btpo_next, access);
276 if (P_IGNORE(opaque))
277 elog(ERROR, "fell off the end of index \"%s\"",
278 RelationGetRelationName(rel));
284 * _bt_binsrch() -- Do a binary search for a key on a particular page.
286 * The passed scankey must be an insertion-type scankey (see nbtree/README),
287 * but it can omit the rightmost column(s) of the index.
289 * When nextkey is false (the usual case), we are looking for the first
290 * item >= scankey. When nextkey is true, we are looking for the first
291 * item strictly greater than scankey.
293 * On a leaf page, _bt_binsrch() returns the OffsetNumber of the first
294 * key >= given scankey, or > scankey if nextkey is true. (NOTE: in
295 * particular, this means it is possible to return a value 1 greater than the
296 * number of keys on the page, if the scankey is > all keys on the page.)
298 * On an internal (non-leaf) page, _bt_binsrch() returns the OffsetNumber
299 * of the last key < given scankey, or last key <= given scankey if nextkey
300 * is true. (Since _bt_compare treats the first data key of such a page as
301 * minus infinity, there will be at least one key < scankey, so the result
302 * always points at one of the keys on the page.) This key indicates the
303 * right place to descend to be sure we find all leaf keys >= given scankey
304 * (or leaf keys > given scankey when nextkey is true).
306 * This procedure is not responsible for walking right, it just examines
307 * the given page. _bt_binsrch() has no lock or refcount side effects
311 _bt_binsrch(Relation rel,
324 page = BufferGetPage(buf, NULL, NULL, BGP_NO_SNAPSHOT_TEST);
325 opaque = (BTPageOpaque) PageGetSpecialPointer(page);
327 low = P_FIRSTDATAKEY(opaque);
328 high = PageGetMaxOffsetNumber(page);
331 * If there are no keys on the page, return the first available slot. Note
332 * this covers two cases: the page is really empty (no keys), or it
333 * contains only a high key. The latter case is possible after vacuuming.
334 * This can never happen on an internal page, however, since they are
335 * never empty (an internal page must have children).
341 * Binary search to find the first key on the page >= scan key, or first
342 * key > scankey when nextkey is true.
344 * For nextkey=false (cmpval=1), the loop invariant is: all slots before
345 * 'low' are < scan key, all slots at or after 'high' are >= scan key.
347 * For nextkey=true (cmpval=0), the loop invariant is: all slots before
348 * 'low' are <= scan key, all slots at or after 'high' are > scan key.
350 * We can fall out when high == low.
352 high++; /* establish the loop invariant for high */
354 cmpval = nextkey ? 0 : 1; /* select comparison value */
358 OffsetNumber mid = low + ((high - low) / 2);
360 /* We have low <= mid < high, so mid points at a real slot */
362 result = _bt_compare(rel, keysz, scankey, page, mid);
364 if (result >= cmpval)
371 * At this point we have high == low, but be careful: they could point
372 * past the last slot on the page.
374 * On a leaf page, we always return the first key >= scan key (resp. >
375 * scan key), which could be the last slot + 1.
377 if (P_ISLEAF(opaque))
381 * On a non-leaf page, return the last key < scan key (resp. <= scan key).
382 * There must be one if _bt_compare() is playing by the rules.
384 Assert(low > P_FIRSTDATAKEY(opaque));
386 return OffsetNumberPrev(low);
390 * _bt_compare() -- Compare scankey to a particular tuple on the page.
392 * The passed scankey must be an insertion-type scankey (see nbtree/README),
393 * but it can omit the rightmost column(s) of the index.
395 * keysz: number of key conditions to be checked (might be less than the
396 * number of index columns!)
397 * page/offnum: location of btree item to be compared to.
399 * This routine returns:
400 * <0 if scankey < tuple at offnum;
401 * 0 if scankey == tuple at offnum;
402 * >0 if scankey > tuple at offnum.
403 * NULLs in the keys are treated as sortable values. Therefore
404 * "equality" does not necessarily mean that the item should be
405 * returned to the caller as a matching key!
407 * CRUCIAL NOTE: on a non-leaf page, the first data key is assumed to be
408 * "minus infinity": this routine will always claim it is less than the
409 * scankey. The actual key value stored (if any, which there probably isn't)
410 * does not matter. This convention allows us to implement the Lehman and
411 * Yao convention that the first down-link pointer is before the first key.
412 * See backend/access/nbtree/README for details.
416 _bt_compare(Relation rel,
422 TupleDesc itupdesc = RelationGetDescr(rel);
423 BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page);
428 * Force result ">" if target item is first data item on an internal page
429 * --- see NOTE above.
431 if (!P_ISLEAF(opaque) && offnum == P_FIRSTDATAKEY(opaque))
434 itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
437 * The scan key is set up with the attribute number associated with each
438 * term in the key. It is important that, if the index is multi-key, the
439 * scan contain the first k key attributes, and that they be in order. If
440 * you think about how multi-key ordering works, you'll understand why
443 * We don't test for violation of this condition here, however. The
444 * initial setup for the index scan had better have gotten it right (see
448 for (i = 1; i <= keysz; i++)
454 datum = index_getattr(itup, scankey->sk_attno, itupdesc, &isNull);
456 /* see comments about NULLs handling in btbuild */
457 if (scankey->sk_flags & SK_ISNULL) /* key is NULL */
460 result = 0; /* NULL "=" NULL */
461 else if (scankey->sk_flags & SK_BT_NULLS_FIRST)
462 result = -1; /* NULL "<" NOT_NULL */
464 result = 1; /* NULL ">" NOT_NULL */
466 else if (isNull) /* key is NOT_NULL and item is NULL */
468 if (scankey->sk_flags & SK_BT_NULLS_FIRST)
469 result = 1; /* NOT_NULL ">" NULL */
471 result = -1; /* NOT_NULL "<" NULL */
476 * The sk_func needs to be passed the index value as left arg and
477 * the sk_argument as right arg (they might be of different
478 * types). Since it is convenient for callers to think of
479 * _bt_compare as comparing the scankey to the index item, we have
480 * to flip the sign of the comparison result. (Unless it's a DESC
481 * column, in which case we *don't* flip the sign.)
483 result = DatumGetInt32(FunctionCall2Coll(&scankey->sk_func,
484 scankey->sk_collation,
486 scankey->sk_argument));
488 if (!(scankey->sk_flags & SK_BT_DESC))
492 /* if the keys are unequal, return the difference */
499 /* if we get here, the keys are equal */
504 * _bt_first() -- Find the first item in a scan.
506 * We need to be clever about the direction of scan, the search
507 * conditions, and the tree ordering. We find the first item (or,
508 * if backwards scan, the last item) in the tree that satisfies the
509 * qualifications in the scan key. On success exit, the page containing
510 * the current index tuple is pinned but not locked, and data about
511 * the matching tuple(s) on the page has been loaded into so->currPos.
512 * scan->xs_ctup.t_self is set to the heap TID of the current tuple,
513 * and if requested, scan->xs_itup points to a copy of the index tuple.
515 * If there are no matching items in the index, we return FALSE, with no
516 * pins or locks held.
518 * Note that scan->keyData[], and the so->keyData[] scankey built from it,
519 * are both search-type scankeys (see nbtree/README for more about this).
520 * Within this routine, we build a temporary insertion-type scankey to use
521 * in locating the scan start position.
524 _bt_first(IndexScanDesc scan, ScanDirection dir)
526 Relation rel = scan->indexRelation;
527 BTScanOpaque so = (BTScanOpaque) scan->opaque;
531 StrategyNumber strat;
534 ScanKey startKeys[INDEX_MAX_KEYS];
535 ScanKeyData scankeys[INDEX_MAX_KEYS];
536 ScanKeyData notnullkeys[INDEX_MAX_KEYS];
539 StrategyNumber strat_total;
540 BTScanPosItem *currItem;
542 Assert(!BTScanPosIsValid(so->currPos));
544 pgstat_count_index_scan(rel);
547 * Examine the scan keys and eliminate any redundant keys; also mark the
548 * keys that must be matched to continue the scan.
550 _bt_preprocess_keys(scan);
553 * Quit now if _bt_preprocess_keys() discovered that the scan keys can
554 * never be satisfied (eg, x == 1 AND x > 2).
560 * Examine the scan keys to discover where we need to start the scan.
562 * We want to identify the keys that can be used as starting boundaries;
563 * these are =, >, or >= keys for a forward scan or =, <, <= keys for
564 * a backwards scan. We can use keys for multiple attributes so long as
565 * the prior attributes had only =, >= (resp. =, <=) keys. Once we accept
566 * a > or < boundary or find an attribute with no boundary (which can be
567 * thought of as the same as "> -infinity"), we can't use keys for any
568 * attributes to its right, because it would break our simplistic notion
569 * of what initial positioning strategy to use.
571 * When the scan keys include cross-type operators, _bt_preprocess_keys
572 * may not be able to eliminate redundant keys; in such cases we will
573 * arbitrarily pick a usable one for each attribute. This is correct
574 * but possibly not optimal behavior. (For example, with keys like
575 * "x >= 4 AND x >= 5" we would elect to scan starting at x=4 when
576 * x=5 would be more efficient.) Since the situation only arises given
577 * a poorly-worded query plus an incomplete opfamily, live with it.
579 * When both equality and inequality keys appear for a single attribute
580 * (again, only possible when cross-type operators appear), we *must*
581 * select one of the equality keys for the starting point, because
582 * _bt_checkkeys() will stop the scan as soon as an equality qual fails.
583 * For example, if we have keys like "x >= 4 AND x = 10" and we elect to
584 * start at x=4, we will fail and stop before reaching x=10. If multiple
585 * equality quals survive preprocessing, however, it doesn't matter which
586 * one we use --- by definition, they are either redundant or
589 * Any regular (not SK_SEARCHNULL) key implies a NOT NULL qualifier.
590 * If the index stores nulls at the end of the index we'll be starting
591 * from, and we have no boundary key for the column (which means the key
592 * we deduced NOT NULL from is an inequality key that constrains the other
593 * end of the index), then we cons up an explicit SK_SEARCHNOTNULL key to
594 * use as a boundary key. If we didn't do this, we might find ourselves
595 * traversing a lot of null entries at the start of the scan.
597 * In this loop, row-comparison keys are treated the same as keys on their
598 * first (leftmost) columns. We'll add on lower-order columns of the row
599 * comparison below, if possible.
601 * The selected scan keys (at most one per index column) are remembered by
602 * storing their addresses into the local startKeys[] array.
605 strat_total = BTEqualStrategyNumber;
606 if (so->numberOfKeys > 0)
614 * chosen is the so-far-chosen key for the current attribute, if any.
615 * We don't cast the decision in stone until we reach keys for the
620 /* Also remember any scankey that implies a NOT NULL constraint */
624 * Loop iterates from 0 to numberOfKeys inclusive; we use the last
625 * pass to handle after-last-key processing. Actual exit from the
626 * loop is at one of the "break" statements below.
628 for (cur = so->keyData, i = 0;; cur++, i++)
630 if (i >= so->numberOfKeys || cur->sk_attno != curattr)
633 * Done looking at keys for curattr. If we didn't find a
634 * usable boundary key, see if we can deduce a NOT NULL key.
636 if (chosen == NULL && impliesNN != NULL &&
637 ((impliesNN->sk_flags & SK_BT_NULLS_FIRST) ?
638 ScanDirectionIsForward(dir) :
639 ScanDirectionIsBackward(dir)))
641 /* Yes, so build the key in notnullkeys[keysCount] */
642 chosen = ¬nullkeys[keysCount];
643 ScanKeyEntryInitialize(chosen,
644 (SK_SEARCHNOTNULL | SK_ISNULL |
645 (impliesNN->sk_flags &
646 (SK_BT_DESC | SK_BT_NULLS_FIRST))),
648 ((impliesNN->sk_flags & SK_BT_NULLS_FIRST) ?
649 BTGreaterStrategyNumber :
650 BTLessStrategyNumber),
658 * If we still didn't find a usable boundary key, quit; else
659 * save the boundary key pointer in startKeys.
663 startKeys[keysCount++] = chosen;
666 * Adjust strat_total, and quit if we have stored a > or <
669 strat = chosen->sk_strategy;
670 if (strat != BTEqualStrategyNumber)
673 if (strat == BTGreaterStrategyNumber ||
674 strat == BTLessStrategyNumber)
679 * Done if that was the last attribute, or if next key is not
680 * in sequence (implying no boundary key is available for the
683 if (i >= so->numberOfKeys ||
684 cur->sk_attno != curattr + 1)
688 * Reset for next attr.
690 curattr = cur->sk_attno;
696 * Can we use this key as a starting boundary for this attr?
698 * If not, does it imply a NOT NULL constraint? (Because
699 * SK_SEARCHNULL keys are always assigned BTEqualStrategyNumber,
700 * *any* inequality key works for that; we need not test.)
702 switch (cur->sk_strategy)
704 case BTLessStrategyNumber:
705 case BTLessEqualStrategyNumber:
708 if (ScanDirectionIsBackward(dir))
714 case BTEqualStrategyNumber:
715 /* override any non-equality choice */
718 case BTGreaterEqualStrategyNumber:
719 case BTGreaterStrategyNumber:
722 if (ScanDirectionIsForward(dir))
733 * If we found no usable boundary keys, we have to start from one end of
734 * the tree. Walk down that edge to the first or last key, and scan from
738 return _bt_endpoint(scan, dir);
741 * We want to start the scan somewhere within the index. Set up an
742 * insertion scankey we can use to search for the boundary point we
743 * identified above. The insertion scankey is built in the local
744 * scankeys[] array, using the keys identified by startKeys[].
746 Assert(keysCount <= INDEX_MAX_KEYS);
747 for (i = 0; i < keysCount; i++)
749 ScanKey cur = startKeys[i];
751 Assert(cur->sk_attno == i + 1);
753 if (cur->sk_flags & SK_ROW_HEADER)
756 * Row comparison header: look to the first row member instead.
758 * The member scankeys are already in insertion format (ie, they
759 * have sk_func = 3-way-comparison function), but we have to watch
760 * out for nulls, which _bt_preprocess_keys didn't check. A null
761 * in the first row member makes the condition unmatchable, just
762 * like qual_ok = false.
764 ScanKey subkey = (ScanKey) DatumGetPointer(cur->sk_argument);
766 Assert(subkey->sk_flags & SK_ROW_MEMBER);
767 if (subkey->sk_flags & SK_ISNULL)
769 memcpy(scankeys + i, subkey, sizeof(ScanKeyData));
772 * If the row comparison is the last positioning key we accepted,
773 * try to add additional keys from the lower-order row members.
774 * (If we accepted independent conditions on additional index
775 * columns, we use those instead --- doesn't seem worth trying to
776 * determine which is more restrictive.) Note that this is OK
777 * even if the row comparison is of ">" or "<" type, because the
778 * condition applied to all but the last row member is effectively
779 * ">=" or "<=", and so the extra keys don't break the positioning
780 * scheme. But, by the same token, if we aren't able to use all
781 * the row members, then the part of the row comparison that we
782 * did use has to be treated as just a ">=" or "<=" condition, and
783 * so we'd better adjust strat_total accordingly.
785 if (i == keysCount - 1)
787 bool used_all_subkeys = false;
789 Assert(!(subkey->sk_flags & SK_ROW_END));
793 Assert(subkey->sk_flags & SK_ROW_MEMBER);
794 if (subkey->sk_attno != keysCount + 1)
795 break; /* out-of-sequence, can't use it */
796 if (subkey->sk_strategy != cur->sk_strategy)
797 break; /* wrong direction, can't use it */
798 if (subkey->sk_flags & SK_ISNULL)
799 break; /* can't use null keys */
800 Assert(keysCount < INDEX_MAX_KEYS);
801 memcpy(scankeys + keysCount, subkey, sizeof(ScanKeyData));
803 if (subkey->sk_flags & SK_ROW_END)
805 used_all_subkeys = true;
809 if (!used_all_subkeys)
813 case BTLessStrategyNumber:
814 strat_total = BTLessEqualStrategyNumber;
816 case BTGreaterStrategyNumber:
817 strat_total = BTGreaterEqualStrategyNumber;
821 break; /* done with outer loop */
827 * Ordinary comparison key. Transform the search-style scan key
828 * to an insertion scan key by replacing the sk_func with the
829 * appropriate btree comparison function.
831 * If scankey operator is not a cross-type comparison, we can use
832 * the cached comparison function; otherwise gotta look it up in
833 * the catalogs. (That can't lead to infinite recursion, since no
834 * indexscan initiated by syscache lookup will use cross-data-type
837 * We support the convention that sk_subtype == InvalidOid means
838 * the opclass input type; this is a hack to simplify life for
841 if (cur->sk_subtype == rel->rd_opcintype[i] ||
842 cur->sk_subtype == InvalidOid)
846 procinfo = index_getprocinfo(rel, cur->sk_attno, BTORDER_PROC);
847 ScanKeyEntryInitializeWithInfo(scankeys + i,
858 RegProcedure cmp_proc;
860 cmp_proc = get_opfamily_proc(rel->rd_opfamily[i],
861 rel->rd_opcintype[i],
864 if (!RegProcedureIsValid(cmp_proc))
865 elog(ERROR, "missing support function %d(%u,%u) for attribute %d of index \"%s\"",
866 BTORDER_PROC, rel->rd_opcintype[i], cur->sk_subtype,
867 cur->sk_attno, RelationGetRelationName(rel));
868 ScanKeyEntryInitialize(scankeys + i,
881 * Examine the selected initial-positioning strategy to determine exactly
882 * where we need to start the scan, and set flag variables to control the
885 * If nextkey = false, _bt_search and _bt_binsrch will locate the first
886 * item >= scan key. If nextkey = true, they will locate the first
889 * If goback = true, we will then step back one item, while if
890 * goback = false, we will start the scan on the located item.
895 case BTLessStrategyNumber:
898 * Find first item >= scankey, then back up one to arrive at last
899 * item < scankey. (Note: this positioning strategy is only used
900 * for a backward scan, so that is always the correct starting
907 case BTLessEqualStrategyNumber:
910 * Find first item > scankey, then back up one to arrive at last
911 * item <= scankey. (Note: this positioning strategy is only used
912 * for a backward scan, so that is always the correct starting
919 case BTEqualStrategyNumber:
922 * If a backward scan was specified, need to start with last equal
923 * item not first one.
925 if (ScanDirectionIsBackward(dir))
928 * This is the same as the <= strategy. We will check at the
929 * end whether the found item is actually =.
937 * This is the same as the >= strategy. We will check at the
938 * end whether the found item is actually =.
945 case BTGreaterEqualStrategyNumber:
948 * Find first item >= scankey. (This is only used for forward
955 case BTGreaterStrategyNumber:
958 * Find first item > scankey. (This is only used for forward
966 /* can't get here, but keep compiler quiet */
967 elog(ERROR, "unrecognized strat_total: %d", (int) strat_total);
972 * Use the manufactured insertion scan key to descend the tree and
973 * position ourselves on the target leaf page.
975 stack = _bt_search(rel, keysCount, scankeys, nextkey, &buf, BT_READ);
977 /* don't need to keep the stack around... */
978 _bt_freestack(stack);
980 if (!BufferIsValid(buf))
983 * We only get here if the index is completely empty. Lock relation
984 * because nothing finer to lock exists.
986 PredicateLockRelation(rel, scan->xs_snapshot);
990 PredicateLockPage(rel, BufferGetBlockNumber(buf),
993 /* initialize moreLeft/moreRight appropriately for scan direction */
994 if (ScanDirectionIsForward(dir))
996 so->currPos.moreLeft = false;
997 so->currPos.moreRight = true;
1001 so->currPos.moreLeft = true;
1002 so->currPos.moreRight = false;
1004 so->numKilled = 0; /* just paranoia */
1005 so->markItemIndex = -1; /* ditto */
1007 /* position to the precise item on the page */
1008 offnum = _bt_binsrch(rel, buf, keysCount, scankeys, nextkey);
1011 * If nextkey = false, we are positioned at the first item >= scan key, or
1012 * possibly at the end of a page on which all the existing items are less
1013 * than the scan key and we know that everything on later pages is greater
1014 * than or equal to scan key.
1016 * If nextkey = true, we are positioned at the first item > scan key, or
1017 * possibly at the end of a page on which all the existing items are less
1018 * than or equal to the scan key and we know that everything on later
1019 * pages is greater than scan key.
1021 * The actually desired starting point is either this item or the prior
1022 * one, or in the end-of-page case it's the first item on the next page or
1023 * the last item on this page. Adjust the starting offset if needed. (If
1024 * this results in an offset before the first item or after the last one,
1025 * _bt_readpage will report no items found, and then we'll step to the
1026 * next page as needed.)
1029 offnum = OffsetNumberPrev(offnum);
1031 /* remember which buffer we have pinned, if any */
1032 Assert(!BTScanPosIsValid(so->currPos));
1033 so->currPos.buf = buf;
1036 * Now load data from the first page of the scan.
1038 if (!_bt_readpage(scan, dir, offnum))
1041 * There's no actually-matching data on this page. Try to advance to
1042 * the next page. Return false if there's no matching data at all.
1044 LockBuffer(so->currPos.buf, BUFFER_LOCK_UNLOCK);
1045 if (!_bt_steppage(scan, dir))
1050 /* Drop the lock, and maybe the pin, on the current page */
1051 _bt_drop_lock_and_maybe_pin(scan, &so->currPos);
1054 /* OK, itemIndex says what to return */
1055 currItem = &so->currPos.items[so->currPos.itemIndex];
1056 scan->xs_ctup.t_self = currItem->heapTid;
1057 if (scan->xs_want_itup)
1058 scan->xs_itup = (IndexTuple) (so->currTuples + currItem->tupleOffset);
1064 * _bt_next() -- Get the next item in a scan.
1066 * On entry, so->currPos describes the current page, which may be pinned
1067 * but is not locked, and so->currPos.itemIndex identifies which item was
1068 * previously returned.
1070 * On successful exit, scan->xs_ctup.t_self is set to the TID of the
1071 * next heap tuple, and if requested, scan->xs_itup points to a copy of
1072 * the index tuple. so->currPos is updated as needed.
1074 * On failure exit (no more tuples), we release pin and set
1075 * so->currPos.buf to InvalidBuffer.
1078 _bt_next(IndexScanDesc scan, ScanDirection dir)
1080 BTScanOpaque so = (BTScanOpaque) scan->opaque;
1081 BTScanPosItem *currItem;
1084 * Advance to next tuple on current page; or if there's no more, try to
1085 * step to the next page with data.
1087 if (ScanDirectionIsForward(dir))
1089 if (++so->currPos.itemIndex > so->currPos.lastItem)
1091 if (!_bt_steppage(scan, dir))
1097 if (--so->currPos.itemIndex < so->currPos.firstItem)
1099 if (!_bt_steppage(scan, dir))
1104 /* OK, itemIndex says what to return */
1105 currItem = &so->currPos.items[so->currPos.itemIndex];
1106 scan->xs_ctup.t_self = currItem->heapTid;
1107 if (scan->xs_want_itup)
1108 scan->xs_itup = (IndexTuple) (so->currTuples + currItem->tupleOffset);
1114 * _bt_readpage() -- Load data from current index page into so->currPos
1116 * Caller must have pinned and read-locked so->currPos.buf; the buffer's state
1117 * is not changed here. Also, currPos.moreLeft and moreRight must be valid;
1118 * they are updated as appropriate. All other fields of so->currPos are
1119 * initialized from scratch here.
1121 * We scan the current page starting at offnum and moving in the indicated
1122 * direction. All items matching the scan keys are loaded into currPos.items.
1123 * moreLeft or moreRight (as appropriate) is cleared if _bt_checkkeys reports
1124 * that there can be no more matching tuples in the current scan direction.
1126 * Returns true if any matching items found on the page, false if none.
1129 _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum)
1131 BTScanOpaque so = (BTScanOpaque) scan->opaque;
1133 BTPageOpaque opaque;
1134 OffsetNumber minoff;
1135 OffsetNumber maxoff;
1141 * We must have the buffer pinned and locked, but the usual macro can't be
1142 * used here; this function is what makes it good for currPos.
1144 Assert(BufferIsValid(so->currPos.buf));
1146 page = BufferGetPage(so->currPos.buf, NULL, NULL, BGP_NO_SNAPSHOT_TEST);
1147 opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1148 minoff = P_FIRSTDATAKEY(opaque);
1149 maxoff = PageGetMaxOffsetNumber(page);
1152 * We note the buffer's block number so that we can release the pin later.
1153 * This allows us to re-read the buffer if it is needed again for hinting.
1155 so->currPos.currPage = BufferGetBlockNumber(so->currPos.buf);
1158 * We save the LSN of the page as we read it, so that we know whether it
1159 * safe to apply LP_DEAD hints to the page later. This allows us to drop
1160 * the pin for MVCC scans, which allows vacuum to avoid blocking.
1162 so->currPos.lsn = PageGetLSN(page);
1165 * we must save the page's right-link while scanning it; this tells us
1166 * where to step right to after we're done with these items. There is no
1167 * corresponding need for the left-link, since splits always go right.
1169 so->currPos.nextPage = opaque->btpo_next;
1171 /* initialize tuple workspace to empty */
1172 so->currPos.nextTupleOffset = 0;
1175 * Now that the current page has been made consistent, the macro should be
1178 Assert(BTScanPosIsPinned(so->currPos));
1180 if (ScanDirectionIsForward(dir))
1182 /* load items[] in ascending order */
1185 offnum = Max(offnum, minoff);
1187 while (offnum <= maxoff)
1189 itup = _bt_checkkeys(scan, page, offnum, dir, &continuescan);
1192 /* tuple passes all scan key conditions, so remember it */
1193 _bt_saveitem(so, itemIndex, offnum, itup);
1198 /* there can't be any more matches, so stop */
1199 so->currPos.moreRight = false;
1203 offnum = OffsetNumberNext(offnum);
1206 Assert(itemIndex <= MaxIndexTuplesPerPage);
1207 so->currPos.firstItem = 0;
1208 so->currPos.lastItem = itemIndex - 1;
1209 so->currPos.itemIndex = 0;
1213 /* load items[] in descending order */
1214 itemIndex = MaxIndexTuplesPerPage;
1216 offnum = Min(offnum, maxoff);
1218 while (offnum >= minoff)
1220 itup = _bt_checkkeys(scan, page, offnum, dir, &continuescan);
1223 /* tuple passes all scan key conditions, so remember it */
1225 _bt_saveitem(so, itemIndex, offnum, itup);
1229 /* there can't be any more matches, so stop */
1230 so->currPos.moreLeft = false;
1234 offnum = OffsetNumberPrev(offnum);
1237 Assert(itemIndex >= 0);
1238 so->currPos.firstItem = itemIndex;
1239 so->currPos.lastItem = MaxIndexTuplesPerPage - 1;
1240 so->currPos.itemIndex = MaxIndexTuplesPerPage - 1;
1243 return (so->currPos.firstItem <= so->currPos.lastItem);
1246 /* Save an index item into so->currPos.items[itemIndex] */
1248 _bt_saveitem(BTScanOpaque so, int itemIndex,
1249 OffsetNumber offnum, IndexTuple itup)
1251 BTScanPosItem *currItem = &so->currPos.items[itemIndex];
1253 currItem->heapTid = itup->t_tid;
1254 currItem->indexOffset = offnum;
1257 Size itupsz = IndexTupleSize(itup);
1259 currItem->tupleOffset = so->currPos.nextTupleOffset;
1260 memcpy(so->currTuples + so->currPos.nextTupleOffset, itup, itupsz);
1261 so->currPos.nextTupleOffset += MAXALIGN(itupsz);
1266 * _bt_steppage() -- Step to next page containing valid data for scan
1268 * On entry, if so->currPos.buf is valid the buffer is pinned but not locked;
1269 * if pinned, we'll drop the pin before moving to next page. The buffer is
1270 * not locked on entry.
1272 * On success exit, so->currPos is updated to contain data from the next
1273 * interesting page. For success on a scan using a non-MVCC snapshot we hold
1274 * a pin, but not a read lock, on that page. If we do not hold the pin, we
1275 * set so->currPos.buf to InvalidBuffer. We return TRUE to indicate success.
1277 * If there are no more matching records in the given direction, we drop all
1278 * locks and pins, set so->currPos.buf to InvalidBuffer, and return FALSE.
1281 _bt_steppage(IndexScanDesc scan, ScanDirection dir)
1283 BTScanOpaque so = (BTScanOpaque) scan->opaque;
1286 BTPageOpaque opaque;
1288 Assert(BTScanPosIsValid(so->currPos));
1290 /* Before leaving current page, deal with any killed items */
1291 if (so->numKilled > 0)
1292 _bt_killitems(scan);
1295 * Before we modify currPos, make a copy of the page data if there was a
1296 * mark position that needs it.
1298 if (so->markItemIndex >= 0)
1300 /* bump pin on current buffer for assignment to mark buffer */
1301 if (BTScanPosIsPinned(so->currPos))
1302 IncrBufferRefCount(so->currPos.buf);
1303 memcpy(&so->markPos, &so->currPos,
1304 offsetof(BTScanPosData, items[1]) +
1305 so->currPos.lastItem * sizeof(BTScanPosItem));
1307 memcpy(so->markTuples, so->currTuples,
1308 so->currPos.nextTupleOffset);
1309 so->markPos.itemIndex = so->markItemIndex;
1310 so->markItemIndex = -1;
1313 rel = scan->indexRelation;
1315 if (ScanDirectionIsForward(dir))
1317 /* Walk right to the next page with data */
1318 /* We must rely on the previously saved nextPage link! */
1319 BlockNumber blkno = so->currPos.nextPage;
1321 /* Remember we left a page with data */
1322 so->currPos.moreLeft = true;
1324 /* release the previous buffer, if pinned */
1325 BTScanPosUnpinIfPinned(so->currPos);
1329 /* if we're at end of scan, give up */
1330 if (blkno == P_NONE || !so->currPos.moreRight)
1332 BTScanPosInvalidate(so->currPos);
1335 /* check for interrupts while we're not holding any buffer lock */
1336 CHECK_FOR_INTERRUPTS();
1337 /* step right one page */
1338 so->currPos.buf = _bt_getbuf(rel, blkno, BT_READ);
1339 /* check for deleted page */
1340 page = BufferGetPage(so->currPos.buf, NULL, NULL,
1341 BGP_NO_SNAPSHOT_TEST);
1342 opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1343 if (!P_IGNORE(opaque))
1345 PredicateLockPage(rel, blkno, scan->xs_snapshot);
1346 /* see if there are any matches on this page */
1347 /* note that this will clear moreRight if we can stop */
1348 if (_bt_readpage(scan, dir, P_FIRSTDATAKEY(opaque)))
1352 /* nope, keep going */
1353 blkno = opaque->btpo_next;
1354 _bt_relbuf(rel, so->currPos.buf);
1359 /* Remember we left a page with data */
1360 so->currPos.moreRight = true;
1363 * Walk left to the next page with data. This is much more complex
1364 * than the walk-right case because of the possibility that the page
1365 * to our left splits while we are in flight to it, plus the
1366 * possibility that the page we were on gets deleted after we leave
1367 * it. See nbtree/README for details.
1369 * It might be possible to rearrange this code to have less overhead
1370 * in pinning and locking, but that would require capturing the left
1371 * pointer when the page is initially read, and using it here, along
1372 * with big changes to _bt_walk_left() and the code below. It is not
1373 * clear whether this would be a win, since if the page immediately to
1374 * the left splits after we read this page and before we step left, we
1375 * would need to visit more pages than with the current code.
1377 * Note that if we change the code so that we drop the pin for a scan
1378 * which uses a non-MVCC snapshot, we will need to modify the code for
1379 * walking left, to allow for the possibility that a referenced page
1380 * has been deleted. As long as the buffer is pinned or the snapshot
1381 * is MVCC the page cannot move past the half-dead state to fully
1384 if (BTScanPosIsPinned(so->currPos))
1385 LockBuffer(so->currPos.buf, BT_READ);
1387 so->currPos.buf = _bt_getbuf(rel, so->currPos.currPage, BT_READ);
1391 /* Done if we know there are no matching keys to the left */
1392 if (!so->currPos.moreLeft)
1394 _bt_relbuf(rel, so->currPos.buf);
1395 BTScanPosInvalidate(so->currPos);
1399 /* Step to next physical page */
1400 so->currPos.buf = _bt_walk_left(rel, so->currPos.buf,
1403 /* if we're physically at end of index, return failure */
1404 if (so->currPos.buf == InvalidBuffer)
1406 BTScanPosInvalidate(so->currPos);
1411 * Okay, we managed to move left to a non-deleted page. Done if
1412 * it's not half-dead and contains matching tuples. Else loop back
1413 * and do it all again.
1415 page = BufferGetPage(so->currPos.buf, NULL, NULL,
1416 BGP_NO_SNAPSHOT_TEST);
1417 opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1418 if (!P_IGNORE(opaque))
1420 PredicateLockPage(rel, BufferGetBlockNumber(so->currPos.buf), scan->xs_snapshot);
1421 /* see if there are any matches on this page */
1422 /* note that this will clear moreLeft if we can stop */
1423 if (_bt_readpage(scan, dir, PageGetMaxOffsetNumber(page)))
1429 /* Drop the lock, and maybe the pin, on the current page */
1430 _bt_drop_lock_and_maybe_pin(scan, &so->currPos);
1436 * _bt_walk_left() -- step left one page, if possible
1438 * The given buffer must be pinned and read-locked. This will be dropped
1439 * before stepping left. On return, we have pin and read lock on the
1440 * returned page, instead.
1442 * Returns InvalidBuffer if there is no page to the left (no lock is held
1445 * When working on a non-leaf level, it is possible for the returned page
1446 * to be half-dead; the caller should check that condition and step left
1447 * again if it's important.
1450 _bt_walk_left(Relation rel, Buffer buf, Snapshot snapshot)
1453 BTPageOpaque opaque;
1455 page = BufferGetPage(buf, NULL, NULL, BGP_NO_SNAPSHOT_TEST);
1456 opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1465 /* if we're at end of tree, release buf and return failure */
1466 if (P_LEFTMOST(opaque))
1468 _bt_relbuf(rel, buf);
1471 /* remember original page we are stepping left from */
1472 obknum = BufferGetBlockNumber(buf);
1474 blkno = lblkno = opaque->btpo_prev;
1475 _bt_relbuf(rel, buf);
1476 /* check for interrupts while we're not holding any buffer lock */
1477 CHECK_FOR_INTERRUPTS();
1478 buf = _bt_getbuf(rel, blkno, BT_READ);
1479 page = BufferGetPage(buf, NULL, NULL, BGP_NO_SNAPSHOT_TEST);
1480 opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1483 * If this isn't the page we want, walk right till we find what we
1484 * want --- but go no more than four hops (an arbitrary limit). If we
1485 * don't find the correct page by then, the most likely bet is that
1486 * the original page got deleted and isn't in the sibling chain at all
1487 * anymore, not that its left sibling got split more than four times.
1489 * Note that it is correct to test P_ISDELETED not P_IGNORE here,
1490 * because half-dead pages are still in the sibling chain. Caller
1491 * must reject half-dead pages if wanted.
1496 if (!P_ISDELETED(opaque) && opaque->btpo_next == obknum)
1498 /* Found desired page, return it */
1501 if (P_RIGHTMOST(opaque) || ++tries > 4)
1503 blkno = opaque->btpo_next;
1504 buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);
1505 page = BufferGetPage(buf, NULL, NULL,
1506 BGP_NO_SNAPSHOT_TEST);
1507 opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1510 /* Return to the original page to see what's up */
1511 buf = _bt_relandgetbuf(rel, buf, obknum, BT_READ);
1512 page = BufferGetPage(buf, NULL, NULL, BGP_NO_SNAPSHOT_TEST);
1513 opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1514 if (P_ISDELETED(opaque))
1517 * It was deleted. Move right to first nondeleted page (there
1518 * must be one); that is the page that has acquired the deleted
1519 * one's keyspace, so stepping left from it will take us where we
1524 if (P_RIGHTMOST(opaque))
1525 elog(ERROR, "fell off the end of index \"%s\"",
1526 RelationGetRelationName(rel));
1527 blkno = opaque->btpo_next;
1528 buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);
1529 page = BufferGetPage(buf, NULL, NULL,
1530 BGP_NO_SNAPSHOT_TEST);
1531 opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1532 if (!P_ISDELETED(opaque))
1537 * Now return to top of loop, resetting obknum to point to this
1538 * nondeleted page, and try again.
1544 * It wasn't deleted; the explanation had better be that the page
1545 * to the left got split or deleted. Without this check, we'd go
1546 * into an infinite loop if there's anything wrong.
1548 if (opaque->btpo_prev == lblkno)
1549 elog(ERROR, "could not find left sibling of block %u in index \"%s\"",
1550 obknum, RelationGetRelationName(rel));
1551 /* Okay to try again with new lblkno value */
1555 return InvalidBuffer;
1559 * _bt_get_endpoint() -- Find the first or last page on a given tree level
1561 * If the index is empty, we will return InvalidBuffer; any other failure
1562 * condition causes ereport(). We will not return a dead page.
1564 * The returned buffer is pinned and read-locked.
1567 _bt_get_endpoint(Relation rel, uint32 level, bool rightmost)
1571 BTPageOpaque opaque;
1572 OffsetNumber offnum;
1577 * If we are looking for a leaf page, okay to descend from fast root;
1578 * otherwise better descend from true root. (There is no point in being
1579 * smarter about intermediate levels.)
1582 buf = _bt_getroot(rel, BT_READ);
1584 buf = _bt_gettrueroot(rel);
1586 if (!BufferIsValid(buf))
1587 return InvalidBuffer;
1589 page = BufferGetPage(buf, NULL, NULL, BGP_NO_SNAPSHOT_TEST);
1590 opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1595 * If we landed on a deleted page, step right to find a live page
1596 * (there must be one). Also, if we want the rightmost page, step
1597 * right if needed to get to it (this could happen if the page split
1598 * since we obtained a pointer to it).
1600 while (P_IGNORE(opaque) ||
1601 (rightmost && !P_RIGHTMOST(opaque)))
1603 blkno = opaque->btpo_next;
1604 if (blkno == P_NONE)
1605 elog(ERROR, "fell off the end of index \"%s\"",
1606 RelationGetRelationName(rel));
1607 buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);
1608 page = BufferGetPage(buf, NULL, NULL,
1609 BGP_NO_SNAPSHOT_TEST);
1610 opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1614 if (opaque->btpo.level == level)
1616 if (opaque->btpo.level < level)
1617 elog(ERROR, "btree level %u not found in index \"%s\"",
1618 level, RelationGetRelationName(rel));
1620 /* Descend to leftmost or rightmost child page */
1622 offnum = PageGetMaxOffsetNumber(page);
1624 offnum = P_FIRSTDATAKEY(opaque);
1626 itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
1627 blkno = ItemPointerGetBlockNumber(&(itup->t_tid));
1629 buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);
1630 page = BufferGetPage(buf, NULL, NULL, BGP_NO_SNAPSHOT_TEST);
1631 opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1638 * _bt_endpoint() -- Find the first or last page in the index, and scan
1639 * from there to the first key satisfying all the quals.
1641 * This is used by _bt_first() to set up a scan when we've determined
1642 * that the scan must start at the beginning or end of the index (for
1643 * a forward or backward scan respectively). Exit conditions are the
1644 * same as for _bt_first().
1647 _bt_endpoint(IndexScanDesc scan, ScanDirection dir)
1649 Relation rel = scan->indexRelation;
1650 BTScanOpaque so = (BTScanOpaque) scan->opaque;
1653 BTPageOpaque opaque;
1655 BTScanPosItem *currItem;
1658 * Scan down to the leftmost or rightmost leaf page. This is a simplified
1659 * version of _bt_search(). We don't maintain a stack since we know we
1662 buf = _bt_get_endpoint(rel, 0, ScanDirectionIsBackward(dir));
1664 if (!BufferIsValid(buf))
1667 * Empty index. Lock the whole relation, as nothing finer to lock
1670 PredicateLockRelation(rel, scan->xs_snapshot);
1671 BTScanPosInvalidate(so->currPos);
1675 PredicateLockPage(rel, BufferGetBlockNumber(buf), scan->xs_snapshot);
1676 page = BufferGetPage(buf, NULL, NULL, BGP_NO_SNAPSHOT_TEST);
1677 opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1678 Assert(P_ISLEAF(opaque));
1680 if (ScanDirectionIsForward(dir))
1682 /* There could be dead pages to the left, so not this: */
1683 /* Assert(P_LEFTMOST(opaque)); */
1685 start = P_FIRSTDATAKEY(opaque);
1687 else if (ScanDirectionIsBackward(dir))
1689 Assert(P_RIGHTMOST(opaque));
1691 start = PageGetMaxOffsetNumber(page);
1695 elog(ERROR, "invalid scan direction: %d", (int) dir);
1696 start = 0; /* keep compiler quiet */
1699 /* remember which buffer we have pinned */
1700 so->currPos.buf = buf;
1702 /* initialize moreLeft/moreRight appropriately for scan direction */
1703 if (ScanDirectionIsForward(dir))
1705 so->currPos.moreLeft = false;
1706 so->currPos.moreRight = true;
1710 so->currPos.moreLeft = true;
1711 so->currPos.moreRight = false;
1713 so->numKilled = 0; /* just paranoia */
1714 so->markItemIndex = -1; /* ditto */
1717 * Now load data from the first page of the scan.
1719 if (!_bt_readpage(scan, dir, start))
1722 * There's no actually-matching data on this page. Try to advance to
1723 * the next page. Return false if there's no matching data at all.
1725 LockBuffer(so->currPos.buf, BUFFER_LOCK_UNLOCK);
1726 if (!_bt_steppage(scan, dir))
1731 /* Drop the lock, and maybe the pin, on the current page */
1732 _bt_drop_lock_and_maybe_pin(scan, &so->currPos);
1735 /* OK, itemIndex says what to return */
1736 currItem = &so->currPos.items[so->currPos.itemIndex];
1737 scan->xs_ctup.t_self = currItem->heapTid;
1738 if (scan->xs_want_itup)
1739 scan->xs_itup = (IndexTuple) (so->currTuples + currItem->tupleOffset);