1 /*-------------------------------------------------------------------------
4 * Search code for postgres btrees.
7 * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
8 * Portions Copyright (c) 1994, Regents of the University of California
11 * src/backend/access/nbtree/nbtsearch.c
13 *-------------------------------------------------------------------------
18 #include "access/nbtree.h"
19 #include "access/relscan.h"
20 #include "miscadmin.h"
22 #include "storage/predicate.h"
23 #include "utils/lsyscache.h"
24 #include "utils/rel.h"
/*
 * Forward declarations for routines private to this file (descent,
 * page-reading, and scan-positioning helpers defined further below).
 */
27 static void _bt_drop_lock_and_maybe_pin(IndexScanDesc scan, BTScanPos sp);
28 static OffsetNumber _bt_binsrch(Relation rel, BTScanInsert key, Buffer buf);
29 static bool _bt_readpage(IndexScanDesc scan, ScanDirection dir,
31 static void _bt_saveitem(BTScanOpaque so, int itemIndex,
32 OffsetNumber offnum, IndexTuple itup);
33 static bool _bt_steppage(IndexScanDesc scan, ScanDirection dir);
34 static bool _bt_readnextpage(IndexScanDesc scan, BlockNumber blkno, ScanDirection dir);
35 static bool _bt_parallel_readpage(IndexScanDesc scan, BlockNumber blkno,
37 static Buffer _bt_walk_left(Relation rel, Buffer buf, Snapshot snapshot);
38 static bool _bt_endpoint(IndexScanDesc scan, ScanDirection dir);
39 static inline void _bt_initialize_more_data(BTScanOpaque so, ScanDirection dir);
43 * _bt_drop_lock_and_maybe_pin()
45 * Unlock the buffer; and if it is safe to release the pin, do that, too. It
46 * is safe if the scan is using an MVCC snapshot and the index is WAL-logged.
47 * This will prevent vacuum from stalling in a blocked state trying to read a
48 * page when a cursor is sitting on it -- at least in many important cases.
50 * Set the buffer to invalid if the pin is released, since the buffer may be
51 * re-used. If we need to go back to this block (for example, to apply
52 * LP_DEAD hints) we must get a fresh reference to the buffer. Hopefully it
53 * will remain in shared memory for as long as it takes to scan the index
57 _bt_drop_lock_and_maybe_pin(IndexScanDesc scan, BTScanPos sp)
/* Drop the lock unconditionally; the pin is conditionally released below. */
59 LockBuffer(sp->buf, BUFFER_LOCK_UNLOCK);
/*
 * Pin is released only for MVCC snapshots on WAL-logged indexes, per the
 * header comment.  NOTE(review): the conjunction continues past what is
 * visible here; there may be further conditions -- confirm against the
 * complete source.
 */
61 if (IsMVCCSnapshot(scan->xs_snapshot) &&
62 RelationNeedsWAL(scan->indexRelation) &&
65 ReleaseBuffer(sp->buf);
/* Invalidate the saved buffer so callers know to re-fetch this block. */
66 sp->buf = InvalidBuffer;
71 * _bt_search() -- Search the tree for a particular scankey,
72 * or more precisely for the first leaf page it could be on.
74 * The passed scankey is an insertion-type scankey (see nbtree/README),
75 * but it can omit the rightmost column(s) of the index.
77 * Return value is a stack of parent-page pointers. *bufP is set to the
78 * address of the leaf-page buffer, which is read-locked and pinned.
79 * No locks are held on the parent pages, however!
81 * If the snapshot parameter is not NULL, "old snapshot" checking will take
82 * place during the descent through the tree. This is not needed when
83 * positioning for an insert or delete, so NULL is used for those cases.
85 * The returned buffer is locked according to access parameter. Additionally,
86 * access = BT_WRITE will allow an empty root page to be created and returned.
87 * When access = BT_READ, an empty index will result in *bufP being set to
88 * InvalidBuffer. Also, in BT_WRITE mode, any incomplete splits encountered
89 * during the search will be finished.
92 _bt_search(Relation rel, BTScanInsert key, Buffer *bufP, int access,
95 BTStack stack_in = NULL;
/*
 * Descend holding only read locks; page_access is upgraded to BT_WRITE
 * for the final (leaf) step when access == BT_WRITE -- see the level-1
 * check below.
 */
96 int page_access = BT_READ;
98 /* Get the root page to start with */
99 *bufP = _bt_getroot(rel, access);
101 /* If index is empty and access = BT_READ, no root page is created. */
102 if (!BufferIsValid(*bufP))
103 return (BTStack) NULL;
105 /* Loop iterates once per level descended in the tree */
114 BlockNumber par_blkno;
118 * Race -- the page we just grabbed may have split since we read its
119 * pointer in the parent (or metapage). If it has, we may need to
120 * move right to its new sibling. Do that.
122 * In write-mode, allow _bt_moveright to finish any incomplete splits
123 * along the way. Strictly speaking, we'd only need to finish an
124 * incomplete split on the leaf page we're about to insert to, not on
125 * any of the upper levels (they are taken care of in _bt_getstackbuf,
126 * if the leaf page is split and we insert to the parent page). But
127 * this is a good opportunity to finish splits of internal pages too.
129 *bufP = _bt_moveright(rel, key, *bufP, (access == BT_WRITE), stack_in,
130 page_access, snapshot);
132 /* if this is a leaf page, we're done */
133 page = BufferGetPage(*bufP);
134 opaque = (BTPageOpaque) PageGetSpecialPointer(page);
135 if (P_ISLEAF(opaque))
139 * Find the appropriate item on the internal page, and get the child
140 * page that it points to.
142 offnum = _bt_binsrch(rel, key, *bufP);
143 itemid = PageGetItemId(page, offnum);
144 itup = (IndexTuple) PageGetItem(page, itemid);
145 blkno = BTreeInnerTupleGetDownLink(itup);
146 par_blkno = BufferGetBlockNumber(*bufP);
149 * We need to save the location of the index entry we chose in the
150 * parent page on a stack. In case we split the tree, we'll use the
151 * stack to work back up to the parent page. We also save the actual
152 * downlink (block) to uniquely identify the index entry, in case it
153 * moves right while we're working lower in the tree. See the paper
154 * by Lehman and Yao for how this is detected and handled. (We use the
155 * child link during the second half of a page split -- if caller ends
156 * up splitting the child it usually ends up inserting a new pivot
157 * tuple for child's new right sibling immediately after the original
158 * bts_offset offset recorded here. The downlink block will be needed
159 * to check if bts_offset remains the position of this same pivot
162 new_stack = (BTStack) palloc(sizeof(BTStackData));
163 new_stack->bts_blkno = par_blkno;
164 new_stack->bts_offset = offnum;
165 new_stack->bts_btentry = blkno;
166 new_stack->bts_parent = stack_in;
169 * Page level 1 is lowest non-leaf page level prior to leaves. So, if
170 * we're on the level 1 and asked to lock leaf page in write mode,
171 * then lock next page in write mode, because it must be a leaf.
173 if (opaque->btpo.level == 1 && access == BT_WRITE)
174 page_access = BT_WRITE;
176 /* drop the read lock on the parent page, acquire one on the child */
177 *bufP = _bt_relandgetbuf(rel, *bufP, blkno, page_access);
179 /* okay, all set to move down a level */
180 stack_in = new_stack;
184 * If we're asked to lock leaf in write mode, but didn't manage to, then
185 * relock. This should only happen when the root page is a leaf page (and
186 * the only page in the index other than the metapage).
188 if (access == BT_WRITE && page_access == BT_READ)
190 /* trade in our read lock for a write lock */
191 LockBuffer(*bufP, BUFFER_LOCK_UNLOCK);
192 LockBuffer(*bufP, BT_WRITE);
195 * If the page was split between the time that we surrendered our read
196 * lock and acquired our write lock, then this page may no longer be
197 * the right place for the key we want to insert. In this case, we
198 * need to move right in the tree. See Lehman and Yao for an
199 * excruciatingly precise description.
201 *bufP = _bt_moveright(rel, key, *bufP, true, stack_in, BT_WRITE,
209 * _bt_moveright() -- move right in the btree if necessary.
211 * When we follow a pointer to reach a page, it is possible that
212 * the page has changed in the meanwhile. If this happens, we're
213 * guaranteed that the page has "split right" -- that is, that any
214 * data that appeared on the page originally is either on the page
215 * or strictly to the right of it.
217 * This routine decides whether or not we need to move right in the
218 * tree by examining the high key entry on the page. If that entry is
219 * strictly less than the scankey, or <= the scankey in the
220 * key.nextkey=true case, then we followed the wrong link and we need
223 * The passed insertion-type scankey can omit the rightmost column(s) of the
224 * index. (see nbtree/README)
226 * When key.nextkey is false (the usual case), we are looking for the first
227 * item >= key. When key.nextkey is true, we are looking for the first item
228 * strictly greater than key.
230 * If forupdate is true, we will attempt to finish any incomplete splits
231 * that we encounter. This is required when locking a target page for an
232 * insertion, because we don't allow inserting on a page before the split
233 * is completed. 'stack' is only used if forupdate is true.
235 * On entry, we have the buffer pinned and a lock of the type specified by
236 * 'access'. If we move right, we release the buffer and lock and acquire
237 * the same on the right sibling. Return value is the buffer we stop at.
239 * If the snapshot parameter is not NULL, "old snapshot" checking will take
240 * place during the descent through the tree. This is not needed when
241 * positioning for an insert or delete, so NULL is used for those cases.
244 _bt_moveright(Relation rel,
257 * When nextkey = false (normal case): if the scan key that brought us to
258 * this page is > the high key stored on the page, then the page has split
259 * and we need to move right. (pg_upgrade'd !heapkeyspace indexes could
260 * have some duplicates to the right as well as the left, but that's
261 * something that's only ever dealt with on the leaf level, after
262 * _bt_search has found an initial leaf page.)
264 * When nextkey = true: move right if the scan key is >= page's high key.
265 * (Note that key.scantid cannot be set in this case.)
267 * The page could even have split more than once, so scan as far as
270 * We also have to move right if we followed a link that brought us to a
/*
 * Threshold for the _bt_compare() test against the high key below:
 * with nextkey=false we step right when scankey > high key (result >= 1);
 * with nextkey=true we step right when scankey >= high key (result >= 0).
 */
273 cmpval = key->nextkey ? 0 : 1;
277 page = BufferGetPage(buf);
278 TestForOldSnapshot(snapshot, rel, page);
279 opaque = (BTPageOpaque) PageGetSpecialPointer(page);
281 if (P_RIGHTMOST(opaque))
285 * Finish any incomplete splits we encounter along the way.
287 if (forupdate && P_INCOMPLETE_SPLIT(opaque))
289 BlockNumber blkno = BufferGetBlockNumber(buf);
291 /* upgrade our lock if necessary */
292 if (access == BT_READ)
294 LockBuffer(buf, BUFFER_LOCK_UNLOCK);
295 LockBuffer(buf, BT_WRITE);
/* Re-test after the lock upgrade: another backend may have finished it. */
298 if (P_INCOMPLETE_SPLIT(opaque))
299 _bt_finish_split(rel, buf, stack);
301 _bt_relbuf(rel, buf);
303 /* re-acquire the lock in the right mode, and re-check */
304 buf = _bt_getbuf(rel, blkno, access);
/* Step right while the page is ignorable or its high key doesn't bound us. */
308 if (P_IGNORE(opaque) || _bt_compare(rel, key, page, P_HIKEY) >= cmpval)
310 /* step right one page */
311 buf = _bt_relandgetbuf(rel, buf, opaque->btpo_next, access);
/* Ending the walk on an ignorable page indicates a corrupt index. */
318 if (P_IGNORE(opaque))
319 elog(ERROR, "fell off the end of index \"%s\"",
320 RelationGetRelationName(rel));
326 * _bt_binsrch() -- Do a binary search for a key on a particular page.
328 * On a leaf page, _bt_binsrch() returns the OffsetNumber of the first
329 * key >= given scankey, or > scankey if nextkey is true. (NOTE: in
330 * particular, this means it is possible to return a value 1 greater than the
331 * number of keys on the page, if the scankey is > all keys on the page.)
333 * On an internal (non-leaf) page, _bt_binsrch() returns the OffsetNumber
334 * of the last key < given scankey, or last key <= given scankey if nextkey
335 * is true. (Since _bt_compare treats the first data key of such a page as
336 * minus infinity, there will be at least one key < scankey, so the result
337 * always points at one of the keys on the page.) This key indicates the
338 * right place to descend to be sure we find all leaf keys >= given scankey
339 * (or leaf keys > given scankey when nextkey is true).
341 * This procedure is not responsible for walking right, it just examines
342 * the given page. _bt_binsrch() has no lock or refcount side effects
346 _bt_binsrch(Relation rel,
357 /* Requesting nextkey semantics while using scantid seems nonsensical */
358 Assert(!key->nextkey || key->scantid == NULL);
360 page = BufferGetPage(buf);
361 opaque = (BTPageOpaque) PageGetSpecialPointer(page);
363 low = P_FIRSTDATAKEY(opaque);
364 high = PageGetMaxOffsetNumber(page);
367 * If there are no keys on the page, return the first available slot. Note
368 * this covers two cases: the page is really empty (no keys), or it
369 * contains only a high key. The latter case is possible after vacuuming.
370 * This can never happen on an internal page, however, since they are
371 * never empty (an internal page must have children).
373 if (unlikely(high < low))
377 * Binary search to find the first key on the page >= scan key, or first
378 * key > scankey when nextkey is true.
380 * For nextkey=false (cmpval=1), the loop invariant is: all slots before
381 * 'low' are < scan key, all slots at or after 'high' are >= scan key.
383 * For nextkey=true (cmpval=0), the loop invariant is: all slots before
384 * 'low' are <= scan key, all slots at or after 'high' are > scan key.
386 * We can fall out when high == low.
388 high++; /* establish the loop invariant for high */
390 cmpval = key->nextkey ? 0 : 1; /* select comparison value */
/* Overflow-safe midpoint: low + (high - low)/2 rather than (low + high)/2 */
394 OffsetNumber mid = low + ((high - low) / 2);
396 /* We have low <= mid < high, so mid points at a real slot */
398 result = _bt_compare(rel, key, page, mid);
400 if (result >= cmpval)
407 * At this point we have high == low, but be careful: they could point
408 * past the last slot on the page.
410 * On a leaf page, we always return the first key >= scan key (resp. >
411 * scan key), which could be the last slot + 1.
413 if (P_ISLEAF(opaque))
417 * On a non-leaf page, return the last key < scan key (resp. <= scan key).
418 * There must be one if _bt_compare() is playing by the rules.
420 Assert(low > P_FIRSTDATAKEY(opaque));
422 return OffsetNumberPrev(low);
427 * _bt_binsrch_insert() -- Cacheable, incremental leaf page binary search.
429 * Like _bt_binsrch(), but with support for caching the binary search
430 * bounds. Only used during insertion, and only on the leaf page that it
431 * looks like caller will insert tuple on. Exclusive-locked and pinned
432 * leaf page is contained within insertstate.
434 * Caches the bounds fields in insertstate so that a subsequent call can
435 * reuse the low and strict high bounds of original binary search. Callers
436 * that use these fields directly must be prepared for the case where low
437 * and/or stricthigh are not on the same page (one or both exceed maxoff
438 * for the page). The case where there are no items on the page (high <
439 * low) makes bounds invalid.
441 * Caller is responsible for invalidating bounds when it modifies the page
442 * before calling here a second time.
445 _bt_binsrch_insert(Relation rel, BTInsertState insertstate)
447 BTScanInsert key = insertstate->itup_key;
456 page = BufferGetPage(insertstate->buf);
457 opaque = (BTPageOpaque) PageGetSpecialPointer(page);
/* Only called on leaf pages, and insertion never uses nextkey semantics */
459 Assert(P_ISLEAF(opaque));
460 Assert(!key->nextkey);
462 if (!insertstate->bounds_valid)
464 /* Start new binary search */
465 low = P_FIRSTDATAKEY(opaque);
466 high = PageGetMaxOffsetNumber(page);
470 /* Restore result of previous binary search against same page */
471 low = insertstate->low;
472 high = insertstate->stricthigh;
475 /* If there are no keys on the page, return the first available slot */
476 if (unlikely(high < low))
478 /* Caller can't reuse bounds */
479 insertstate->low = InvalidOffsetNumber;
480 insertstate->stricthigh = InvalidOffsetNumber;
481 insertstate->bounds_valid = false;
486 * Binary search to find the first key on the page >= scan key. (nextkey
487 * is always false when inserting).
489 * The loop invariant is: all slots before 'low' are < scan key, all slots
490 * at or after 'high' are >= scan key. 'stricthigh' is > scan key, and is
491 * maintained to save additional search effort for caller.
493 * We can fall out when high == low.
/*
 * Only a fresh search needs the +1 adjustment; cached 'stricthigh' (used
 * as 'high' above) already satisfies the loop invariant.
 */
495 if (!insertstate->bounds_valid)
496 high++; /* establish the loop invariant for high */
497 stricthigh = high; /* high initially strictly higher */
499 cmpval = 1; /* !nextkey comparison value */
/* Overflow-safe midpoint: low + (high - low)/2 rather than (low + high)/2 */
503 OffsetNumber mid = low + ((high - low) / 2);
505 /* We have low <= mid < high, so mid points at a real slot */
507 result = _bt_compare(rel, key, page, mid);
509 if (result >= cmpval)
520 * On a leaf page, a binary search always returns the first key >= scan
521 * key (at least in !nextkey case), which could be the last slot + 1. This
522 * is also the lower bound of cached search.
524 * stricthigh may also be the last slot + 1, which prevents caller from
525 * using bounds directly, but is still useful to us if we're called a
526 * second time with cached bounds (cached low will be < stricthigh when
529 insertstate->low = low;
530 insertstate->stricthigh = stricthigh;
531 insertstate->bounds_valid = true;
537 * _bt_compare() -- Compare insertion-type scankey to tuple on a page.
539 * page/offnum: location of btree item to be compared to.
541 * This routine returns:
542 * <0 if scankey < tuple at offnum;
543 * 0 if scankey == tuple at offnum;
544 * >0 if scankey > tuple at offnum.
545 * NULLs in the keys are treated as sortable values. Therefore
546 * "equality" does not necessarily mean that the item should be
547 * returned to the caller as a matching key!
549 * CRUCIAL NOTE: on a non-leaf page, the first data key is assumed to be
550 * "minus infinity": this routine will always claim it is less than the
551 * scankey. The actual key value stored is explicitly truncated to 0
552 * attributes (explicitly minus infinity) with version 3+ indexes, but
553 * that isn't relied upon. This allows us to implement the Lehman and
554 * Yao convention that the first down-link pointer is before the first
555 * key. See backend/access/nbtree/README for details.
559 _bt_compare(Relation rel,
564 TupleDesc itupdesc = RelationGetDescr(rel);
565 BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page);
572 Assert(_bt_check_natts(rel, key->heapkeyspace, page, offnum));
573 Assert(key->keysz <= IndexRelationGetNumberOfKeyAttributes(rel));
574 Assert(key->heapkeyspace || key->scantid == NULL);
577 * Force result ">" if target item is first data item on an internal page
578 * --- see NOTE above.
580 if (!P_ISLEAF(opaque) && offnum == P_FIRSTDATAKEY(opaque))
583 itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
584 ntupatts = BTreeTupleGetNAtts(itup, rel);
587 * The scan key is set up with the attribute number associated with each
588 * term in the key. It is important that, if the index is multi-key, the
589 * scan contain the first k key attributes, and that they be in order. If
590 * you think about how multi-key ordering works, you'll understand why
593 * We don't test for violation of this condition here, however. The
594 * initial setup for the index scan had better have gotten it right (see
/* Compare only attributes present in both scankey and (possibly truncated) tuple */
598 ncmpkey = Min(ntupatts, key->keysz);
599 Assert(key->heapkeyspace || ncmpkey == key->keysz);
600 scankey = key->scankeys;
601 for (int i = 1; i <= ncmpkey; i++)
607 datum = index_getattr(itup, scankey->sk_attno, itupdesc, &isNull);
609 /* see comments about NULLs handling in btbuild */
610 if (scankey->sk_flags & SK_ISNULL) /* key is NULL */
613 result = 0; /* NULL "=" NULL */
614 else if (scankey->sk_flags & SK_BT_NULLS_FIRST)
615 result = -1; /* NULL "<" NOT_NULL */
617 result = 1; /* NULL ">" NOT_NULL */
619 else if (isNull) /* key is NOT_NULL and item is NULL */
621 if (scankey->sk_flags & SK_BT_NULLS_FIRST)
622 result = 1; /* NOT_NULL ">" NULL */
624 result = -1; /* NOT_NULL "<" NULL */
629 * The sk_func needs to be passed the index value as left arg and
630 * the sk_argument as right arg (they might be of different
631 * types). Since it is convenient for callers to think of
632 * _bt_compare as comparing the scankey to the index item, we have
633 * to flip the sign of the comparison result. (Unless it's a DESC
634 * column, in which case we *don't* flip the sign.)
636 result = DatumGetInt32(FunctionCall2Coll(&scankey->sk_func,
637 scankey->sk_collation,
639 scankey->sk_argument));
641 if (!(scankey->sk_flags & SK_BT_DESC))
642 INVERT_COMPARE_RESULT(result);
645 /* if the keys are unequal, return the difference */
653 * All non-truncated attributes (other than heap TID) were found to be
654 * equal. Treat truncated attributes as minus infinity when scankey has a
655 * key attribute value that would otherwise be compared directly.
657 * Note: it doesn't matter if ntupatts includes non-key attributes;
658 * scankey won't, so explicitly excluding non-key attributes isn't
/* Scankey has more attributes than the tuple: tuple was truncated */
661 if (key->keysz > ntupatts)
665 * Use the heap TID attribute and scantid to try to break the tie. The
666 * rules are the same as any other key attribute -- only the
667 * representation differs.
669 heapTid = BTreeTupleGetHeapTID(itup);
670 if (key->scantid == NULL)
673 * Most searches have a scankey that is considered greater than a
674 * truncated pivot tuple if and when the scankey has equal values for
675 * attributes up to and including the least significant untruncated
676 * attribute in tuple.
678 * For example, if an index has the minimum two attributes (single
679 * user key attribute, plus heap TID attribute), and a page's high key
680 * is ('foo', -inf), and scankey is ('foo', <omitted>), the search
681 * will not descend to the page to the left. The search will descend
682 * right instead. The truncated attribute in pivot tuple means that
683 * all non-pivot tuples on the page to the left are strictly < 'foo',
684 * so it isn't necessary to descend left. In other words, search
685 * doesn't have to descend left because it isn't interested in a match
686 * that has a heap TID value of -inf.
688 * However, some searches (pivotsearch searches) actually require that
689 * we descend left when this happens. -inf is treated as a possible
690 * match for omitted scankey attribute(s). This is needed by page
691 * deletion, which must re-find leaf pages that are targets for
692 * deletion using their high keys.
694 * Note: the heap TID part of the test ensures that scankey is being
695 * compared to a pivot tuple with one or more truncated key
698 * Note: pg_upgrade'd !heapkeyspace indexes must always descend to the
699 * left here, since they have no heap TID attribute (and cannot have
700 * any -inf key values in any case, since truncation can only remove
701 * non-key attributes). !heapkeyspace searches must always be
702 * prepared to deal with matches on both sides of the pivot once the
703 * leaf level is reached.
705 if (key->heapkeyspace && !key->pivotsearch &&
706 key->keysz == ntupatts && heapTid == NULL)
709 /* All provided scankey arguments found to be equal */
714 * Treat truncated heap TID as minus infinity, since scankey has a key
715 * attribute value (scantid) that would otherwise be compared directly
717 Assert(key->keysz == IndexRelationGetNumberOfKeyAttributes(rel));
721 Assert(ntupatts >= IndexRelationGetNumberOfKeyAttributes(rel));
/* Final tiebreak: compare scan's heap TID against the tuple's heap TID */
722 return ItemPointerCompare(key->scantid, heapTid);
726 * _bt_first() -- Find the first item in a scan.
728 * We need to be clever about the direction of scan, the search
729 * conditions, and the tree ordering. We find the first item (or,
730 * if backwards scan, the last item) in the tree that satisfies the
731 * qualifications in the scan key. On success exit, the page containing
732 * the current index tuple is pinned but not locked, and data about
733 * the matching tuple(s) on the page has been loaded into so->currPos.
734 * scan->xs_ctup.t_self is set to the heap TID of the current tuple,
735 * and if requested, scan->xs_itup points to a copy of the index tuple.
737 * If there are no matching items in the index, we return false, with no
738 * pins or locks held.
740 * Note that scan->keyData[], and the so->keyData[] scankey built from it,
741 * are both search-type scankeys (see nbtree/README for more about this).
742 * Within this routine, we build a temporary insertion-type scankey to use
743 * in locating the scan start position.
746 _bt_first(IndexScanDesc scan, ScanDirection dir)
748 Relation rel = scan->indexRelation;
749 BTScanOpaque so = (BTScanOpaque) scan->opaque;
753 StrategyNumber strat;
756 BTScanInsertData inskey;
757 ScanKey startKeys[INDEX_MAX_KEYS];
758 ScanKeyData notnullkeys[INDEX_MAX_KEYS];
762 StrategyNumber strat_total;
763 BTScanPosItem *currItem;
766 Assert(!BTScanPosIsValid(so->currPos));
768 pgstat_count_index_scan(rel);
771 * Examine the scan keys and eliminate any redundant keys; also mark the
772 * keys that must be matched to continue the scan.
774 _bt_preprocess_keys(scan);
777 * Quit now if _bt_preprocess_keys() discovered that the scan keys can
778 * never be satisfied (eg, x == 1 AND x > 2).
784 * For parallel scans, get the starting page from shared state. If the
785 * scan has not started, proceed to find out first leaf page in the usual
786 * way while keeping other participating processes waiting. If the scan
787 * has already begun, use the page number from the shared structure.
789 if (scan->parallel_scan != NULL)
791 status = _bt_parallel_seize(scan, &blkno);
794 else if (blkno == P_NONE)
796 _bt_parallel_done(scan);
799 else if (blkno != InvalidBlockNumber)
801 if (!_bt_parallel_readpage(scan, blkno, dir))
808 * Examine the scan keys to discover where we need to start the scan.
810 * We want to identify the keys that can be used as starting boundaries;
811 * these are =, >, or >= keys for a forward scan or =, <, <= keys for
812 * a backwards scan. We can use keys for multiple attributes so long as
813 * the prior attributes had only =, >= (resp. =, <=) keys. Once we accept
814 * a > or < boundary or find an attribute with no boundary (which can be
815 * thought of as the same as "> -infinity"), we can't use keys for any
816 * attributes to its right, because it would break our simplistic notion
817 * of what initial positioning strategy to use.
819 * When the scan keys include cross-type operators, _bt_preprocess_keys
820 * may not be able to eliminate redundant keys; in such cases we will
821 * arbitrarily pick a usable one for each attribute. This is correct
822 * but possibly not optimal behavior. (For example, with keys like
823 * "x >= 4 AND x >= 5" we would elect to scan starting at x=4 when
824 * x=5 would be more efficient.) Since the situation only arises given
825 * a poorly-worded query plus an incomplete opfamily, live with it.
827 * When both equality and inequality keys appear for a single attribute
828 * (again, only possible when cross-type operators appear), we *must*
829 * select one of the equality keys for the starting point, because
830 * _bt_checkkeys() will stop the scan as soon as an equality qual fails.
831 * For example, if we have keys like "x >= 4 AND x = 10" and we elect to
832 * start at x=4, we will fail and stop before reaching x=10. If multiple
833 * equality quals survive preprocessing, however, it doesn't matter which
834 * one we use --- by definition, they are either redundant or
837 * Any regular (not SK_SEARCHNULL) key implies a NOT NULL qualifier.
838 * If the index stores nulls at the end of the index we'll be starting
839 * from, and we have no boundary key for the column (which means the key
840 * we deduced NOT NULL from is an inequality key that constrains the other
841 * end of the index), then we cons up an explicit SK_SEARCHNOTNULL key to
842 * use as a boundary key. If we didn't do this, we might find ourselves
843 * traversing a lot of null entries at the start of the scan.
845 * In this loop, row-comparison keys are treated the same as keys on their
846 * first (leftmost) columns. We'll add on lower-order columns of the row
847 * comparison below, if possible.
849 * The selected scan keys (at most one per index column) are remembered by
850 * storing their addresses into the local startKeys[] array.
853 strat_total = BTEqualStrategyNumber;
854 if (so->numberOfKeys > 0)
862 * chosen is the so-far-chosen key for the current attribute, if any.
863 * We don't cast the decision in stone until we reach keys for the
868 /* Also remember any scankey that implies a NOT NULL constraint */
872 * Loop iterates from 0 to numberOfKeys inclusive; we use the last
873 * pass to handle after-last-key processing. Actual exit from the
874 * loop is at one of the "break" statements below.
876 for (cur = so->keyData, i = 0;; cur++, i++)
878 if (i >= so->numberOfKeys || cur->sk_attno != curattr)
881 * Done looking at keys for curattr. If we didn't find a
882 * usable boundary key, see if we can deduce a NOT NULL key.
884 if (chosen == NULL && impliesNN != NULL &&
885 ((impliesNN->sk_flags & SK_BT_NULLS_FIRST) ?
886 ScanDirectionIsForward(dir) :
887 ScanDirectionIsBackward(dir)))
889 /* Yes, so build the key in notnullkeys[keysCount] */
890 chosen = ¬nullkeys[keysCount];
891 ScanKeyEntryInitialize(chosen,
892 (SK_SEARCHNOTNULL | SK_ISNULL |
893 (impliesNN->sk_flags &
894 (SK_BT_DESC | SK_BT_NULLS_FIRST))),
896 ((impliesNN->sk_flags & SK_BT_NULLS_FIRST) ?
897 BTGreaterStrategyNumber :
898 BTLessStrategyNumber),
906 * If we still didn't find a usable boundary key, quit; else
907 * save the boundary key pointer in startKeys.
911 startKeys[keysCount++] = chosen;
914 * Adjust strat_total, and quit if we have stored a > or <
917 strat = chosen->sk_strategy;
918 if (strat != BTEqualStrategyNumber)
921 if (strat == BTGreaterStrategyNumber ||
922 strat == BTLessStrategyNumber)
927 * Done if that was the last attribute, or if next key is not
928 * in sequence (implying no boundary key is available for the
931 if (i >= so->numberOfKeys ||
932 cur->sk_attno != curattr + 1)
936 * Reset for next attr.
938 curattr = cur->sk_attno;
944 * Can we use this key as a starting boundary for this attr?
946 * If not, does it imply a NOT NULL constraint? (Because
947 * SK_SEARCHNULL keys are always assigned BTEqualStrategyNumber,
948 * *any* inequality key works for that; we need not test.)
950 switch (cur->sk_strategy)
952 case BTLessStrategyNumber:
953 case BTLessEqualStrategyNumber:
956 if (ScanDirectionIsBackward(dir))
962 case BTEqualStrategyNumber:
963 /* override any non-equality choice */
966 case BTGreaterEqualStrategyNumber:
967 case BTGreaterStrategyNumber:
970 if (ScanDirectionIsForward(dir))
981 * If we found no usable boundary keys, we have to start from one end of
982 * the tree. Walk down that edge to the first or last key, and scan from
989 match = _bt_endpoint(scan, dir);
993 /* No match, so mark (parallel) scan finished */
994 _bt_parallel_done(scan);
1001 * We want to start the scan somewhere within the index. Set up an
1002 * insertion scankey we can use to search for the boundary point we
1003 * identified above. The insertion scankey is built using the keys
1004 * identified by startKeys[]. (Remaining insertion scankey fields are
1005 * initialized after initial-positioning strategy is finalized.)
1007 Assert(keysCount <= INDEX_MAX_KEYS);
1008 for (i = 0; i < keysCount; i++)
1010 ScanKey cur = startKeys[i];
1012 Assert(cur->sk_attno == i + 1);
1014 if (cur->sk_flags & SK_ROW_HEADER)
1017 * Row comparison header: look to the first row member instead.
1019 * The member scankeys are already in insertion format (ie, they
1020 * have sk_func = 3-way-comparison function), but we have to watch
1021 * out for nulls, which _bt_preprocess_keys didn't check. A null
1022 * in the first row member makes the condition unmatchable, just
1023 * like qual_ok = false.
1025 ScanKey subkey = (ScanKey) DatumGetPointer(cur->sk_argument);
1027 Assert(subkey->sk_flags & SK_ROW_MEMBER);
1028 if (subkey->sk_flags & SK_ISNULL)
1030 _bt_parallel_done(scan);
1033 memcpy(inskey.scankeys + i, subkey, sizeof(ScanKeyData));
1036 * If the row comparison is the last positioning key we accepted,
1037 * try to add additional keys from the lower-order row members.
1038 * (If we accepted independent conditions on additional index
1039 * columns, we use those instead --- doesn't seem worth trying to
1040 * determine which is more restrictive.) Note that this is OK
1041 * even if the row comparison is of ">" or "<" type, because the
1042 * condition applied to all but the last row member is effectively
1043 * ">=" or "<=", and so the extra keys don't break the positioning
1044 * scheme. But, by the same token, if we aren't able to use all
1045 * the row members, then the part of the row comparison that we
1046 * did use has to be treated as just a ">=" or "<=" condition, and
1047 * so we'd better adjust strat_total accordingly.
1049 if (i == keysCount - 1)
1051 bool used_all_subkeys = false;
1053 Assert(!(subkey->sk_flags & SK_ROW_END));
1057 Assert(subkey->sk_flags & SK_ROW_MEMBER);
1058 if (subkey->sk_attno != keysCount + 1)
1059 break; /* out-of-sequence, can't use it */
1060 if (subkey->sk_strategy != cur->sk_strategy)
1061 break; /* wrong direction, can't use it */
1062 if (subkey->sk_flags & SK_ISNULL)
1063 break; /* can't use null keys */
1064 Assert(keysCount < INDEX_MAX_KEYS);
1065 memcpy(inskey.scankeys + keysCount, subkey,
1066 sizeof(ScanKeyData));
1068 if (subkey->sk_flags & SK_ROW_END)
1070 used_all_subkeys = true;
1074 if (!used_all_subkeys)
1076 switch (strat_total)
1078 case BTLessStrategyNumber:
1079 strat_total = BTLessEqualStrategyNumber;
1081 case BTGreaterStrategyNumber:
1082 strat_total = BTGreaterEqualStrategyNumber;
1086 break; /* done with outer loop */
1092 * Ordinary comparison key. Transform the search-style scan key
1093 * to an insertion scan key by replacing the sk_func with the
1094 * appropriate btree comparison function.
1096 * If scankey operator is not a cross-type comparison, we can use
1097 * the cached comparison function; otherwise gotta look it up in
1098 * the catalogs. (That can't lead to infinite recursion, since no
1099 * indexscan initiated by syscache lookup will use cross-data-type
1102 * We support the convention that sk_subtype == InvalidOid means
1103 * the opclass input type; this is a hack to simplify life for
1106 if (cur->sk_subtype == rel->rd_opcintype[i] ||
1107 cur->sk_subtype == InvalidOid)
1111 procinfo = index_getprocinfo(rel, cur->sk_attno, BTORDER_PROC);
1112 ScanKeyEntryInitializeWithInfo(inskey.scankeys + i,
1123 RegProcedure cmp_proc;
1125 cmp_proc = get_opfamily_proc(rel->rd_opfamily[i],
1126 rel->rd_opcintype[i],
1129 if (!RegProcedureIsValid(cmp_proc))
1130 elog(ERROR, "missing support function %d(%u,%u) for attribute %d of index \"%s\"",
1131 BTORDER_PROC, rel->rd_opcintype[i], cur->sk_subtype,
1132 cur->sk_attno, RelationGetRelationName(rel));
1133 ScanKeyEntryInitialize(inskey.scankeys + i,
1146 * Examine the selected initial-positioning strategy to determine exactly
1147 * where we need to start the scan, and set flag variables to control the
1150 * If nextkey = false, _bt_search and _bt_binsrch will locate the first
1151 * item >= scan key. If nextkey = true, they will locate the first
1154 * If goback = true, we will then step back one item, while if
1155 * goback = false, we will start the scan on the located item.
1158 switch (strat_total)
1160 case BTLessStrategyNumber:
1163 * Find first item >= scankey, then back up one to arrive at last
1164 * item < scankey. (Note: this positioning strategy is only used
1165 * for a backward scan, so that is always the correct starting
1172 case BTLessEqualStrategyNumber:
1175 * Find first item > scankey, then back up one to arrive at last
1176 * item <= scankey. (Note: this positioning strategy is only used
1177 * for a backward scan, so that is always the correct starting
1184 case BTEqualStrategyNumber:
1187 * If a backward scan was specified, need to start with last equal
1188 * item not first one.
1190 if (ScanDirectionIsBackward(dir))
1193 * This is the same as the <= strategy. We will check at the
1194 * end whether the found item is actually =.
1202 * This is the same as the >= strategy. We will check at the
1203 * end whether the found item is actually =.
1210 case BTGreaterEqualStrategyNumber:
1213 * Find first item >= scankey. (This is only used for forward
1220 case BTGreaterStrategyNumber:
1223 * Find first item > scankey. (This is only used for forward
1231 /* can't get here, but keep compiler quiet */
1232 elog(ERROR, "unrecognized strat_total: %d", (int) strat_total);
1236 /* Initialize remaining insertion scan key fields */
1237 inskey.heapkeyspace = _bt_heapkeyspace(rel);
1238 inskey.anynullkeys = false; /* unused */
1239 inskey.nextkey = nextkey;
1240 inskey.pivotsearch = false;
1241 inskey.scantid = NULL;
1242 inskey.keysz = keysCount;
1245 * Use the manufactured insertion scan key to descend the tree and
1246 * position ourselves on the target leaf page.
1248 stack = _bt_search(rel, &inskey, &buf, BT_READ, scan->xs_snapshot);
1250 /* don't need to keep the stack around... */
1251 _bt_freestack(stack);
1253 if (!BufferIsValid(buf))
1256 * We only get here if the index is completely empty. Lock relation
1257 * because nothing finer to lock exists.
1259 PredicateLockRelation(rel, scan->xs_snapshot);
1262 * mark parallel scan as done, so that all the workers can finish
1265 _bt_parallel_done(scan);
1266 BTScanPosInvalidate(so->currPos);
1271 PredicateLockPage(rel, BufferGetBlockNumber(buf),
1274 _bt_initialize_more_data(so, dir);
1276 /* position to the precise item on the page */
1277 offnum = _bt_binsrch(rel, &inskey, buf);
1280 * If nextkey = false, we are positioned at the first item >= scan key, or
1281 * possibly at the end of a page on which all the existing items are less
1282 * than the scan key and we know that everything on later pages is greater
1283 * than or equal to scan key.
1285 * If nextkey = true, we are positioned at the first item > scan key, or
1286 * possibly at the end of a page on which all the existing items are less
1287 * than or equal to the scan key and we know that everything on later
1288 * pages is greater than scan key.
1290 * The actually desired starting point is either this item or the prior
1291 * one, or in the end-of-page case it's the first item on the next page or
1292 * the last item on this page. Adjust the starting offset if needed. (If
1293 * this results in an offset before the first item or after the last one,
1294 * _bt_readpage will report no items found, and then we'll step to the
1295 * next page as needed.)
1298 offnum = OffsetNumberPrev(offnum);
1300 /* remember which buffer we have pinned, if any */
1301 Assert(!BTScanPosIsValid(so->currPos));
1302 so->currPos.buf = buf;
1305 * Now load data from the first page of the scan.
1307 if (!_bt_readpage(scan, dir, offnum))
1310 * There's no actually-matching data on this page. Try to advance to
1311 * the next page. Return false if there's no matching data at all.
1313 LockBuffer(so->currPos.buf, BUFFER_LOCK_UNLOCK);
1314 if (!_bt_steppage(scan, dir))
1319 /* Drop the lock, and maybe the pin, on the current page */
1320 _bt_drop_lock_and_maybe_pin(scan, &so->currPos);
1324 /* OK, itemIndex says what to return */
1325 currItem = &so->currPos.items[so->currPos.itemIndex];
1326 scan->xs_heaptid = currItem->heapTid;
1327 if (scan->xs_want_itup)
1328 scan->xs_itup = (IndexTuple) (so->currTuples + currItem->tupleOffset);
1334 * _bt_next() -- Get the next item in a scan.
1336 * On entry, so->currPos describes the current page, which may be pinned
1337 * but is not locked, and so->currPos.itemIndex identifies which item was
1338 * previously returned.
1340 * On successful exit, scan->xs_ctup.t_self is set to the TID of the
1341 * next heap tuple, and if requested, scan->xs_itup points to a copy of
1342 * the index tuple. so->currPos is updated as needed.
1344 * On failure exit (no more tuples), we release pin and set
1345 * so->currPos.buf to InvalidBuffer.
1348 _bt_next(IndexScanDesc scan, ScanDirection dir)
1350 BTScanOpaque so = (BTScanOpaque) scan->opaque;
1351 BTScanPosItem *currItem;
1354 * Advance to next tuple on current page; or if there's no more, try to
1355 * step to the next page with data.
1357 if (ScanDirectionIsForward(dir))
1359 if (++so->currPos.itemIndex > so->currPos.lastItem)
1361 if (!_bt_steppage(scan, dir))
1367 if (--so->currPos.itemIndex < so->currPos.firstItem)
1369 if (!_bt_steppage(scan, dir))
1374 /* OK, itemIndex says what to return */
1375 currItem = &so->currPos.items[so->currPos.itemIndex];
1376 scan->xs_heaptid = currItem->heapTid;
1377 if (scan->xs_want_itup)
1378 scan->xs_itup = (IndexTuple) (so->currTuples + currItem->tupleOffset);
1384 * _bt_readpage() -- Load data from current index page into so->currPos
1386 * Caller must have pinned and read-locked so->currPos.buf; the buffer's state
1387 * is not changed here. Also, currPos.moreLeft and moreRight must be valid;
1388 * they are updated as appropriate. All other fields of so->currPos are
1389 * initialized from scratch here.
1391 * We scan the current page starting at offnum and moving in the indicated
1392 * direction. All items matching the scan keys are loaded into currPos.items.
1393 * moreLeft or moreRight (as appropriate) is cleared if _bt_checkkeys reports
1394 * that there can be no more matching tuples in the current scan direction.
1396 * In the case of a parallel scan, caller must have called _bt_parallel_seize
1397 * prior to calling this function; this function will invoke
1398 * _bt_parallel_release before returning.
1400 * Returns true if any matching items found on the page, false if none.
1403 _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum)
1405 BTScanOpaque so = (BTScanOpaque) scan->opaque;
1407 BTPageOpaque opaque;
1408 OffsetNumber minoff;
1409 OffsetNumber maxoff;
1415 * We must have the buffer pinned and locked, but the usual macro can't be
1416 * used here; this function is what makes it good for currPos.
1418 Assert(BufferIsValid(so->currPos.buf));
1420 page = BufferGetPage(so->currPos.buf);
1421 opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1423 /* allow next page be processed by parallel worker */
1424 if (scan->parallel_scan)
1426 if (ScanDirectionIsForward(dir))
1427 _bt_parallel_release(scan, opaque->btpo_next);
1429 _bt_parallel_release(scan, BufferGetBlockNumber(so->currPos.buf));
1432 continuescan = true; /* default assumption */
1433 indnatts = IndexRelationGetNumberOfAttributes(scan->indexRelation);
1434 minoff = P_FIRSTDATAKEY(opaque);
1435 maxoff = PageGetMaxOffsetNumber(page);
1438 * We note the buffer's block number so that we can release the pin later.
1439 * This allows us to re-read the buffer if it is needed again for hinting.
1441 so->currPos.currPage = BufferGetBlockNumber(so->currPos.buf);
1444 * We save the LSN of the page as we read it, so that we know whether it
1445 * safe to apply LP_DEAD hints to the page later. This allows us to drop
1446 * the pin for MVCC scans, which allows vacuum to avoid blocking.
1448 so->currPos.lsn = BufferGetLSNAtomic(so->currPos.buf);
1451 * we must save the page's right-link while scanning it; this tells us
1452 * where to step right to after we're done with these items. There is no
1453 * corresponding need for the left-link, since splits always go right.
1455 so->currPos.nextPage = opaque->btpo_next;
1457 /* initialize tuple workspace to empty */
1458 so->currPos.nextTupleOffset = 0;
1461 * Now that the current page has been made consistent, the macro should be
1464 Assert(BTScanPosIsPinned(so->currPos));
1466 if (ScanDirectionIsForward(dir))
1468 /* load items[] in ascending order */
1471 offnum = Max(offnum, minoff);
1473 while (offnum <= maxoff)
1475 ItemId iid = PageGetItemId(page, offnum);
1479 * If the scan specifies not to return killed tuples, then we
1480 * treat a killed tuple as not passing the qual
1482 if (scan->ignore_killed_tuples && ItemIdIsDead(iid))
1484 offnum = OffsetNumberNext(offnum);
1488 itup = (IndexTuple) PageGetItem(page, iid);
1490 if (_bt_checkkeys(scan, itup, indnatts, dir, &continuescan))
1492 /* tuple passes all scan key conditions, so remember it */
1493 _bt_saveitem(so, itemIndex, offnum, itup);
1496 /* When !continuescan, there can't be any more matches, so stop */
1500 offnum = OffsetNumberNext(offnum);
1504 * We don't need to visit page to the right when the high key
1505 * indicates that no more matches will be found there.
1507 * Checking the high key like this works out more often than you might
1508 * think. Leaf page splits pick a split point between the two most
1509 * dissimilar tuples (this is weighed against the need to evenly share
1510 * free space). Leaf pages with high key attribute values that can
1511 * only appear on non-pivot tuples on the right sibling page are
1514 if (continuescan && !P_RIGHTMOST(opaque))
1516 ItemId iid = PageGetItemId(page, P_HIKEY);
1517 IndexTuple itup = (IndexTuple) PageGetItem(page, iid);
1520 truncatt = BTreeTupleGetNAtts(itup, scan->indexRelation);
1521 _bt_checkkeys(scan, itup, truncatt, dir, &continuescan);
1525 so->currPos.moreRight = false;
1527 Assert(itemIndex <= MaxIndexTuplesPerPage);
1528 so->currPos.firstItem = 0;
1529 so->currPos.lastItem = itemIndex - 1;
1530 so->currPos.itemIndex = 0;
1534 /* load items[] in descending order */
1535 itemIndex = MaxIndexTuplesPerPage;
1537 offnum = Min(offnum, maxoff);
1539 while (offnum >= minoff)
1541 ItemId iid = PageGetItemId(page, offnum);
1547 * If the scan specifies not to return killed tuples, then we
1548 * treat a killed tuple as not passing the qual. Most of the
1549 * time, it's a win to not bother examining the tuple's index
1550 * keys, but just skip to the next tuple (previous, actually,
1551 * since we're scanning backwards). However, if this is the first
1552 * tuple on the page, we do check the index keys, to prevent
1553 * uselessly advancing to the page to the left. This is similar
1554 * to the high key optimization used by forward scans.
1556 if (scan->ignore_killed_tuples && ItemIdIsDead(iid))
1558 Assert(offnum >= P_FIRSTDATAKEY(opaque));
1559 if (offnum > P_FIRSTDATAKEY(opaque))
1561 offnum = OffsetNumberPrev(offnum);
1565 tuple_alive = false;
1570 itup = (IndexTuple) PageGetItem(page, iid);
1572 passes_quals = _bt_checkkeys(scan, itup, indnatts, dir,
1574 if (passes_quals && tuple_alive)
1576 /* tuple passes all scan key conditions, so remember it */
1578 _bt_saveitem(so, itemIndex, offnum, itup);
1582 /* there can't be any more matches, so stop */
1583 so->currPos.moreLeft = false;
1587 offnum = OffsetNumberPrev(offnum);
1590 Assert(itemIndex >= 0);
1591 so->currPos.firstItem = itemIndex;
1592 so->currPos.lastItem = MaxIndexTuplesPerPage - 1;
1593 so->currPos.itemIndex = MaxIndexTuplesPerPage - 1;
1596 return (so->currPos.firstItem <= so->currPos.lastItem);
1599 /* Save an index item into so->currPos.items[itemIndex] */
1601 _bt_saveitem(BTScanOpaque so, int itemIndex,
1602 OffsetNumber offnum, IndexTuple itup)
1604 BTScanPosItem *currItem = &so->currPos.items[itemIndex];
1606 currItem->heapTid = itup->t_tid;
1607 currItem->indexOffset = offnum;
1610 Size itupsz = IndexTupleSize(itup);
1612 currItem->tupleOffset = so->currPos.nextTupleOffset;
1613 memcpy(so->currTuples + so->currPos.nextTupleOffset, itup, itupsz);
1614 so->currPos.nextTupleOffset += MAXALIGN(itupsz);
1619 * _bt_steppage() -- Step to next page containing valid data for scan
1621 * On entry, if so->currPos.buf is valid the buffer is pinned but not locked;
1622 * if pinned, we'll drop the pin before moving to next page. The buffer is
1623 * not locked on entry.
1625 * For success on a scan using a non-MVCC snapshot we hold a pin, but not a
1626 * read lock, on that page. If we do not hold the pin, we set so->currPos.buf
1627 * to InvalidBuffer. We return true to indicate success.
1630 _bt_steppage(IndexScanDesc scan, ScanDirection dir)
1632 BTScanOpaque so = (BTScanOpaque) scan->opaque;
1633 BlockNumber blkno = InvalidBlockNumber;
1636 Assert(BTScanPosIsValid(so->currPos));
1638 /* Before leaving current page, deal with any killed items */
1639 if (so->numKilled > 0)
1640 _bt_killitems(scan);
1643 * Before we modify currPos, make a copy of the page data if there was a
1644 * mark position that needs it.
1646 if (so->markItemIndex >= 0)
1648 /* bump pin on current buffer for assignment to mark buffer */
1649 if (BTScanPosIsPinned(so->currPos))
1650 IncrBufferRefCount(so->currPos.buf);
1651 memcpy(&so->markPos, &so->currPos,
1652 offsetof(BTScanPosData, items[1]) +
1653 so->currPos.lastItem * sizeof(BTScanPosItem));
1655 memcpy(so->markTuples, so->currTuples,
1656 so->currPos.nextTupleOffset);
1657 so->markPos.itemIndex = so->markItemIndex;
1658 so->markItemIndex = -1;
1661 if (ScanDirectionIsForward(dir))
1663 /* Walk right to the next page with data */
1664 if (scan->parallel_scan != NULL)
1667 * Seize the scan to get the next block number; if the scan has
1668 * ended already, bail out.
1670 status = _bt_parallel_seize(scan, &blkno);
1673 /* release the previous buffer, if pinned */
1674 BTScanPosUnpinIfPinned(so->currPos);
1675 BTScanPosInvalidate(so->currPos);
1681 /* Not parallel, so use the previously-saved nextPage link. */
1682 blkno = so->currPos.nextPage;
1685 /* Remember we left a page with data */
1686 so->currPos.moreLeft = true;
1688 /* release the previous buffer, if pinned */
1689 BTScanPosUnpinIfPinned(so->currPos);
1693 /* Remember we left a page with data */
1694 so->currPos.moreRight = true;
1696 if (scan->parallel_scan != NULL)
1699 * Seize the scan to get the current block number; if the scan has
1700 * ended already, bail out.
1702 status = _bt_parallel_seize(scan, &blkno);
1703 BTScanPosUnpinIfPinned(so->currPos);
1706 BTScanPosInvalidate(so->currPos);
1712 /* Not parallel, so just use our own notion of the current page */
1713 blkno = so->currPos.currPage;
1717 if (!_bt_readnextpage(scan, blkno, dir))
1720 /* Drop the lock, and maybe the pin, on the current page */
1721 _bt_drop_lock_and_maybe_pin(scan, &so->currPos);
1727 * _bt_readnextpage() -- Read next page containing valid data for scan
1729 * On success exit, so->currPos is updated to contain data from the next
1730 * interesting page. Caller is responsible to release lock and pin on
1731 * buffer on success. We return true to indicate success.
1733 * If there are no more matching records in the given direction, we drop all
1734 * locks and pins, set so->currPos.buf to InvalidBuffer, and return false.
1737 _bt_readnextpage(IndexScanDesc scan, BlockNumber blkno, ScanDirection dir)
1739 BTScanOpaque so = (BTScanOpaque) scan->opaque;
1742 BTPageOpaque opaque;
1745 rel = scan->indexRelation;
1747 if (ScanDirectionIsForward(dir))
1752 * if we're at end of scan, give up and mark parallel scan as
1753 * done, so that all the workers can finish their scan
1755 if (blkno == P_NONE || !so->currPos.moreRight)
1757 _bt_parallel_done(scan);
1758 BTScanPosInvalidate(so->currPos);
1761 /* check for interrupts while we're not holding any buffer lock */
1762 CHECK_FOR_INTERRUPTS();
1763 /* step right one page */
1764 so->currPos.buf = _bt_getbuf(rel, blkno, BT_READ);
1765 page = BufferGetPage(so->currPos.buf);
1766 TestForOldSnapshot(scan->xs_snapshot, rel, page);
1767 opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1768 /* check for deleted page */
1769 if (!P_IGNORE(opaque))
1771 PredicateLockPage(rel, blkno, scan->xs_snapshot);
1772 /* see if there are any matches on this page */
1773 /* note that this will clear moreRight if we can stop */
1774 if (_bt_readpage(scan, dir, P_FIRSTDATAKEY(opaque)))
1777 else if (scan->parallel_scan != NULL)
1779 /* allow next page be processed by parallel worker */
1780 _bt_parallel_release(scan, opaque->btpo_next);
1783 /* nope, keep going */
1784 if (scan->parallel_scan != NULL)
1786 _bt_relbuf(rel, so->currPos.buf);
1787 status = _bt_parallel_seize(scan, &blkno);
1790 BTScanPosInvalidate(so->currPos);
1796 blkno = opaque->btpo_next;
1797 _bt_relbuf(rel, so->currPos.buf);
1804 * Should only happen in parallel cases, when some other backend
1805 * advanced the scan.
1807 if (so->currPos.currPage != blkno)
1809 BTScanPosUnpinIfPinned(so->currPos);
1810 so->currPos.currPage = blkno;
1814 * Walk left to the next page with data. This is much more complex
1815 * than the walk-right case because of the possibility that the page
1816 * to our left splits while we are in flight to it, plus the
1817 * possibility that the page we were on gets deleted after we leave
1818 * it. See nbtree/README for details.
1820 * It might be possible to rearrange this code to have less overhead
1821 * in pinning and locking, but that would require capturing the left
1822 * pointer when the page is initially read, and using it here, along
1823 * with big changes to _bt_walk_left() and the code below. It is not
1824 * clear whether this would be a win, since if the page immediately to
1825 * the left splits after we read this page and before we step left, we
1826 * would need to visit more pages than with the current code.
1828 * Note that if we change the code so that we drop the pin for a scan
1829 * which uses a non-MVCC snapshot, we will need to modify the code for
1830 * walking left, to allow for the possibility that a referenced page
1831 * has been deleted. As long as the buffer is pinned or the snapshot
1832 * is MVCC the page cannot move past the half-dead state to fully
1835 if (BTScanPosIsPinned(so->currPos))
1836 LockBuffer(so->currPos.buf, BT_READ);
1838 so->currPos.buf = _bt_getbuf(rel, so->currPos.currPage, BT_READ);
1842 /* Done if we know there are no matching keys to the left */
1843 if (!so->currPos.moreLeft)
1845 _bt_relbuf(rel, so->currPos.buf);
1846 _bt_parallel_done(scan);
1847 BTScanPosInvalidate(so->currPos);
1851 /* Step to next physical page */
1852 so->currPos.buf = _bt_walk_left(rel, so->currPos.buf,
1855 /* if we're physically at end of index, return failure */
1856 if (so->currPos.buf == InvalidBuffer)
1858 _bt_parallel_done(scan);
1859 BTScanPosInvalidate(so->currPos);
1864 * Okay, we managed to move left to a non-deleted page. Done if
1865 * it's not half-dead and contains matching tuples. Else loop back
1866 * and do it all again.
1868 page = BufferGetPage(so->currPos.buf);
1869 TestForOldSnapshot(scan->xs_snapshot, rel, page);
1870 opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1871 if (!P_IGNORE(opaque))
1873 PredicateLockPage(rel, BufferGetBlockNumber(so->currPos.buf), scan->xs_snapshot);
1874 /* see if there are any matches on this page */
1875 /* note that this will clear moreLeft if we can stop */
1876 if (_bt_readpage(scan, dir, PageGetMaxOffsetNumber(page)))
1879 else if (scan->parallel_scan != NULL)
1881 /* allow next page be processed by parallel worker */
1882 _bt_parallel_release(scan, BufferGetBlockNumber(so->currPos.buf));
1886 * For parallel scans, get the last page scanned as it is quite
1887 * possible that by the time we try to seize the scan, some other
1888 * worker has already advanced the scan to a different page. We
1889 * must continue based on the latest page scanned by any worker.
1891 if (scan->parallel_scan != NULL)
1893 _bt_relbuf(rel, so->currPos.buf);
1894 status = _bt_parallel_seize(scan, &blkno);
1897 BTScanPosInvalidate(so->currPos);
1900 so->currPos.buf = _bt_getbuf(rel, blkno, BT_READ);
1909 * _bt_parallel_readpage() -- Read current page containing valid data for scan
1911 * On success, release lock and maybe pin on buffer. We return true to
1915 _bt_parallel_readpage(IndexScanDesc scan, BlockNumber blkno, ScanDirection dir)
1917 BTScanOpaque so = (BTScanOpaque) scan->opaque;
1919 _bt_initialize_more_data(so, dir);
1921 if (!_bt_readnextpage(scan, blkno, dir))
1924 /* Drop the lock, and maybe the pin, on the current page */
1925 _bt_drop_lock_and_maybe_pin(scan, &so->currPos);
1931 * _bt_walk_left() -- step left one page, if possible
1933 * The given buffer must be pinned and read-locked. This will be dropped
1934 * before stepping left. On return, we have pin and read lock on the
1935 * returned page, instead.
1937 * Returns InvalidBuffer if there is no page to the left (no lock is held
1940 * When working on a non-leaf level, it is possible for the returned page
1941 * to be half-dead; the caller should check that condition and step left
1942 * again if it's important.
1945 _bt_walk_left(Relation rel, Buffer buf, Snapshot snapshot)
1948 BTPageOpaque opaque;
1950 page = BufferGetPage(buf);
1951 opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1960 /* if we're at end of tree, release buf and return failure */
1961 if (P_LEFTMOST(opaque))
1963 _bt_relbuf(rel, buf);
1966 /* remember original page we are stepping left from */
1967 obknum = BufferGetBlockNumber(buf);
1969 blkno = lblkno = opaque->btpo_prev;
1970 _bt_relbuf(rel, buf);
1971 /* check for interrupts while we're not holding any buffer lock */
1972 CHECK_FOR_INTERRUPTS();
1973 buf = _bt_getbuf(rel, blkno, BT_READ);
1974 page = BufferGetPage(buf);
1975 TestForOldSnapshot(snapshot, rel, page);
1976 opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1979 * If this isn't the page we want, walk right till we find what we
1980 * want --- but go no more than four hops (an arbitrary limit). If we
1981 * don't find the correct page by then, the most likely bet is that
1982 * the original page got deleted and isn't in the sibling chain at all
1983 * anymore, not that its left sibling got split more than four times.
1985 * Note that it is correct to test P_ISDELETED not P_IGNORE here,
1986 * because half-dead pages are still in the sibling chain. Caller
1987 * must reject half-dead pages if wanted.
1992 if (!P_ISDELETED(opaque) && opaque->btpo_next == obknum)
1994 /* Found desired page, return it */
1997 if (P_RIGHTMOST(opaque) || ++tries > 4)
1999 blkno = opaque->btpo_next;
2000 buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);
2001 page = BufferGetPage(buf);
2002 TestForOldSnapshot(snapshot, rel, page);
2003 opaque = (BTPageOpaque) PageGetSpecialPointer(page);
2006 /* Return to the original page to see what's up */
2007 buf = _bt_relandgetbuf(rel, buf, obknum, BT_READ);
2008 page = BufferGetPage(buf);
2009 TestForOldSnapshot(snapshot, rel, page);
2010 opaque = (BTPageOpaque) PageGetSpecialPointer(page);
2011 if (P_ISDELETED(opaque))
2014 * It was deleted. Move right to first nondeleted page (there
2015 * must be one); that is the page that has acquired the deleted
2016 * one's keyspace, so stepping left from it will take us where we
2021 if (P_RIGHTMOST(opaque))
2022 elog(ERROR, "fell off the end of index \"%s\"",
2023 RelationGetRelationName(rel));
2024 blkno = opaque->btpo_next;
2025 buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);
2026 page = BufferGetPage(buf);
2027 TestForOldSnapshot(snapshot, rel, page);
2028 opaque = (BTPageOpaque) PageGetSpecialPointer(page);
2029 if (!P_ISDELETED(opaque))
2034 * Now return to top of loop, resetting obknum to point to this
2035 * nondeleted page, and try again.
2041 * It wasn't deleted; the explanation had better be that the page
2042 * to the left got split or deleted. Without this check, we'd go
2043 * into an infinite loop if there's anything wrong.
2045 if (opaque->btpo_prev == lblkno)
2046 elog(ERROR, "could not find left sibling of block %u in index \"%s\"",
2047 obknum, RelationGetRelationName(rel));
2048 /* Okay to try again with new lblkno value */
2052 return InvalidBuffer;
2056 * _bt_get_endpoint() -- Find the first or last page on a given tree level
2058 * If the index is empty, we will return InvalidBuffer; any other failure
2059 * condition causes ereport(). We will not return a dead page.
2061 * The returned buffer is pinned and read-locked.
2064 _bt_get_endpoint(Relation rel, uint32 level, bool rightmost,
2069 BTPageOpaque opaque;
2070 OffsetNumber offnum;
2075 * If we are looking for a leaf page, okay to descend from fast root;
2076 * otherwise better descend from true root. (There is no point in being
2077 * smarter about intermediate levels.)
2080 buf = _bt_getroot(rel, BT_READ);
2082 buf = _bt_gettrueroot(rel);
2084 if (!BufferIsValid(buf))
2085 return InvalidBuffer;
2087 page = BufferGetPage(buf);
2088 TestForOldSnapshot(snapshot, rel, page);
2089 opaque = (BTPageOpaque) PageGetSpecialPointer(page);
2094 * If we landed on a deleted page, step right to find a live page
2095 * (there must be one). Also, if we want the rightmost page, step
2096 * right if needed to get to it (this could happen if the page split
2097 * since we obtained a pointer to it).
2099 while (P_IGNORE(opaque) ||
2100 (rightmost && !P_RIGHTMOST(opaque)))
2102 blkno = opaque->btpo_next;
2103 if (blkno == P_NONE)
2104 elog(ERROR, "fell off the end of index \"%s\"",
2105 RelationGetRelationName(rel));
2106 buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);
2107 page = BufferGetPage(buf);
2108 TestForOldSnapshot(snapshot, rel, page);
2109 opaque = (BTPageOpaque) PageGetSpecialPointer(page);
2113 if (opaque->btpo.level == level)
2115 if (opaque->btpo.level < level)
2117 (errcode(ERRCODE_INDEX_CORRUPTED),
2118 errmsg_internal("btree level %u not found in index \"%s\"",
2119 level, RelationGetRelationName(rel))));
2121 /* Descend to leftmost or rightmost child page */
2123 offnum = PageGetMaxOffsetNumber(page);
2125 offnum = P_FIRSTDATAKEY(opaque);
2127 itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
2128 blkno = BTreeInnerTupleGetDownLink(itup);
2130 buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);
2131 page = BufferGetPage(buf);
2132 opaque = (BTPageOpaque) PageGetSpecialPointer(page);
2139 * _bt_endpoint() -- Find the first or last page in the index, and scan
2140 * from there to the first key satisfying all the quals.
2142 * This is used by _bt_first() to set up a scan when we've determined
2143 * that the scan must start at the beginning or end of the index (for
2144 * a forward or backward scan respectively). Exit conditions are the
2145 * same as for _bt_first().
2148 _bt_endpoint(IndexScanDesc scan, ScanDirection dir)
2150 Relation rel = scan->indexRelation;
2151 BTScanOpaque so = (BTScanOpaque) scan->opaque;
2154 BTPageOpaque opaque;
2156 BTScanPosItem *currItem;
2159 * Scan down to the leftmost or rightmost leaf page. This is a simplified
2160 * version of _bt_search(). We don't maintain a stack since we know we
2163 buf = _bt_get_endpoint(rel, 0, ScanDirectionIsBackward(dir), scan->xs_snapshot);
2165 if (!BufferIsValid(buf))
2168 * Empty index. Lock the whole relation, as nothing finer to lock
2171 PredicateLockRelation(rel, scan->xs_snapshot);
2172 BTScanPosInvalidate(so->currPos);
2176 PredicateLockPage(rel, BufferGetBlockNumber(buf), scan->xs_snapshot);
2177 page = BufferGetPage(buf);
2178 opaque = (BTPageOpaque) PageGetSpecialPointer(page);
2179 Assert(P_ISLEAF(opaque));
2181 if (ScanDirectionIsForward(dir))
2183 /* There could be dead pages to the left, so not this: */
2184 /* Assert(P_LEFTMOST(opaque)); */
2186 start = P_FIRSTDATAKEY(opaque);
2188 else if (ScanDirectionIsBackward(dir))
2190 Assert(P_RIGHTMOST(opaque));
2192 start = PageGetMaxOffsetNumber(page);
2196 elog(ERROR, "invalid scan direction: %d", (int) dir);
2197 start = 0; /* keep compiler quiet */
2200 /* remember which buffer we have pinned */
2201 so->currPos.buf = buf;
2203 _bt_initialize_more_data(so, dir);
2206 * Now load data from the first page of the scan.
2208 if (!_bt_readpage(scan, dir, start))
2211 * There's no actually-matching data on this page. Try to advance to
2212 * the next page. Return false if there's no matching data at all.
2214 LockBuffer(so->currPos.buf, BUFFER_LOCK_UNLOCK);
2215 if (!_bt_steppage(scan, dir))
2220 /* Drop the lock, and maybe the pin, on the current page */
2221 _bt_drop_lock_and_maybe_pin(scan, &so->currPos);
2224 /* OK, itemIndex says what to return */
2225 currItem = &so->currPos.items[so->currPos.itemIndex];
2226 scan->xs_heaptid = currItem->heapTid;
2227 if (scan->xs_want_itup)
2228 scan->xs_itup = (IndexTuple) (so->currTuples + currItem->tupleOffset);
2234 * _bt_initialize_more_data() -- initialize moreLeft/moreRight appropriately
2235 * for scan direction
2238 _bt_initialize_more_data(BTScanOpaque so, ScanDirection dir)
2240 /* initialize moreLeft/moreRight appropriately for scan direction */
2241 if (ScanDirectionIsForward(dir))
2243 so->currPos.moreLeft = false;
2244 so->currPos.moreRight = true;
2248 so->currPos.moreLeft = true;
2249 so->currPos.moreRight = false;
2251 so->numKilled = 0; /* just paranoia */
2252 so->markItemIndex = -1; /* ditto */