1 /*-------------------------------------------------------------------------
4 * Search code for postgres btrees.
7 * Portions Copyright (c) 1996-2003, PostgreSQL Global Development Group
8 * Portions Copyright (c) 1994, Regents of the University of California
11 * $PostgreSQL: pgsql/src/backend/access/nbtree/nbtsearch.c,v 1.84 2003/12/21 01:23:06 tgl Exp $
13 *-------------------------------------------------------------------------
18 #include "access/genam.h"
19 #include "access/nbtree.h"
20 #include "utils/lsyscache.h"
23 static Buffer _bt_walk_left(Relation rel, Buffer buf);
24 static bool _bt_endpoint(IndexScanDesc scan, ScanDirection dir);
28 * _bt_search() -- Search the tree for a particular scankey,
29 * or more precisely for the first leaf page it could be on.
31 * When nextkey is false (the usual case), we are looking for the first
32 * item >= scankey. When nextkey is true, we are looking for the first
33 * item strictly greater than scankey.
35 * Return value is a stack of parent-page pointers. *bufP is set to the
36 * address of the leaf-page buffer, which is read-locked and pinned.
37 * No locks are held on the parent pages, however!
39 * NOTE that the returned buffer is read-locked regardless of the access
40 * parameter. However, access = BT_WRITE will allow an empty root page
41 * to be created and returned. When access = BT_READ, an empty index
42 * will result in *bufP being set to InvalidBuffer.
45 _bt_search(Relation rel, int keysz, ScanKey scankey, bool nextkey,
46 Buffer *bufP, int access)
48 BTStack stack_in = NULL;
50 /* Get the root page to start with */
51 *bufP = _bt_getroot(rel, access);
53 /* If index is empty and access = BT_READ, no root page is created. */
54 if (!BufferIsValid(*bufP))
55 return (BTStack) NULL;
57 /* Loop iterates once per level descended in the tree */
67 BlockNumber par_blkno;
71 * Race -- the page we just grabbed may have split since we read
72 * its pointer in the parent (or metapage). If it has, we may
73 * need to move right to its new sibling. Do that.
75 *bufP = _bt_moveright(rel, *bufP, keysz, scankey, nextkey, BT_READ);
77 /* if this is a leaf page, we're done */
78 page = BufferGetPage(*bufP);
79 opaque = (BTPageOpaque) PageGetSpecialPointer(page);
84 * Find the appropriate item on the internal page, and get the
85 * child page that it points to.
87 offnum = _bt_binsrch(rel, *bufP, keysz, scankey, nextkey);
88 itemid = PageGetItemId(page, offnum);
89 btitem = (BTItem) PageGetItem(page, itemid);
90 itup = &(btitem->bti_itup);
91 blkno = ItemPointerGetBlockNumber(&(itup->t_tid));
92 par_blkno = BufferGetBlockNumber(*bufP);
95 * We need to save the location of the index entry we chose in the
96 * parent page on a stack. In case we split the tree, we'll use
97 * the stack to work back up to the parent page. We also save the
98 * actual downlink (TID) to uniquely identify the index entry, in
99 * case it moves right while we're working lower in the tree. See
100 * the paper by Lehman and Yao for how this is detected and
101 * handled. (We use the child link to disambiguate duplicate keys
102 * in the index -- Lehman and Yao disallow duplicate keys.)
104 new_stack = (BTStack) palloc(sizeof(BTStackData));
105 new_stack->bts_blkno = par_blkno;
106 new_stack->bts_offset = offnum;
107 memcpy(&new_stack->bts_btitem, btitem, sizeof(BTItemData));
108 new_stack->bts_parent = stack_in;
110 /* drop the read lock on the parent page, acquire one on the child */
111 _bt_relbuf(rel, *bufP);
112 *bufP = _bt_getbuf(rel, blkno, BT_READ);
114 /* okay, all set to move down a level */
115 stack_in = new_stack;
122 * _bt_moveright() -- move right in the btree if necessary.
124 * When we follow a pointer to reach a page, it is possible that
125 * the page has changed in the meanwhile. If this happens, we're
126 * guaranteed that the page has "split right" -- that is, that any
127 * data that appeared on the page originally is either on the page
128 * or strictly to the right of it.
130 * When nextkey is false (the usual case), we are looking for the first
131 * item >= scankey. When nextkey is true, we are looking for the first
132 * item strictly greater than scankey.
134 * This routine decides whether or not we need to move right in the
135 * tree by examining the high key entry on the page. If that entry
136 * is strictly less than the scankey, or <= the scankey in the nextkey=true
137 * case, then we followed the wrong link and we need to move right.
139 * On entry, we have the buffer pinned and a lock of the type specified by
140 * 'access'. If we move right, we release the buffer and lock and acquire
141 * the same on the right sibling. Return value is the buffer we stop at.
144 _bt_moveright(Relation rel,
155 page = BufferGetPage(buf);
156 opaque = (BTPageOpaque) PageGetSpecialPointer(page);
159 * When nextkey = false (normal case): if the scan key that brought us to
160 * this page is > the high key stored on the page, then the page has split
161 * and we need to move right. (If the scan key is equal to the high key,
162 * we might or might not need to move right; have to scan the page first
165 * When nextkey = true: move right if the scan key is >= page's high key.
167 * The page could even have split more than once, so scan as far as needed.
169 * We also have to move right if we followed a link that brought us to a
172 cmpval = nextkey ? 0 : 1;
174 while (!P_RIGHTMOST(opaque) &&
176 _bt_compare(rel, keysz, scankey, page, P_HIKEY) >= cmpval))
178 /* step right one page */
179 BlockNumber rblkno = opaque->btpo_next;
181 _bt_relbuf(rel, buf);
182 buf = _bt_getbuf(rel, rblkno, access);
183 page = BufferGetPage(buf);
184 opaque = (BTPageOpaque) PageGetSpecialPointer(page);
187 if (P_IGNORE(opaque))
188 elog(ERROR, "fell off the end of \"%s\"",
189 RelationGetRelationName(rel));
195 * _bt_binsrch() -- Do a binary search for a key on a particular page.
197 * When nextkey is false (the usual case), we are looking for the first
198 * item >= scankey. When nextkey is true, we are looking for the first
199 * item strictly greater than scankey.
201 * The scankey we get has the compare function stored in the procedure
202 * entry of each data struct. We invoke this regproc to do the
203 * comparison for every key in the scankey.
205 * On a leaf page, _bt_binsrch() returns the OffsetNumber of the first
206 * key >= given scankey, or > scankey if nextkey is true. (NOTE: in
207 * particular, this means it is possible to return a value 1 greater than the
208 * number of keys on the page, if the scankey is > all keys on the page.)
210 * On an internal (non-leaf) page, _bt_binsrch() returns the OffsetNumber
211 * of the last key < given scankey, or last key <= given scankey if nextkey
212 * is true. (Since _bt_compare treats the first data key of such a page as
213 * minus infinity, there will be at least one key < scankey, so the result
214 * always points at one of the keys on the page.) This key indicates the
215 * right place to descend to be sure we find all leaf keys >= given scankey
216 * (or leaf keys > given scankey when nextkey is true).
218 * This procedure is not responsible for walking right, it just examines
219 * the given page. _bt_binsrch() has no lock or refcount side effects
223 _bt_binsrch(Relation rel,
237 itupdesc = RelationGetDescr(rel);
238 page = BufferGetPage(buf);
239 opaque = (BTPageOpaque) PageGetSpecialPointer(page);
241 low = P_FIRSTDATAKEY(opaque);
242 high = PageGetMaxOffsetNumber(page);
245 * If there are no keys on the page, return the first available slot.
246 * Note this covers two cases: the page is really empty (no keys), or
247 * it contains only a high key. The latter case is possible after
248 * vacuuming. This can never happen on an internal page, however,
249 * since they are never empty (an internal page must have children).
255 * Binary search to find the first key on the page >= scan key, or
256 * first key > scankey when nextkey is true.
258 * For nextkey=false (cmpval=1), the loop invariant is: all slots
259 * before 'low' are < scan key, all slots at or after 'high'
262 * For nextkey=true (cmpval=0), the loop invariant is: all slots
263 * before 'low' are <= scan key, all slots at or after 'high'
266 * We can fall out when high == low.
268 high++; /* establish the loop invariant for high */
270 cmpval = nextkey ? 0 : 1; /* select comparison value */
274 OffsetNumber mid = low + ((high - low) / 2);
276 /* We have low <= mid < high, so mid points at a real slot */
278 result = _bt_compare(rel, keysz, scankey, page, mid);
280 if (result >= cmpval)
287 * At this point we have high == low, but be careful: they could point
288 * past the last slot on the page.
290 * On a leaf page, we always return the first key >= scan key (resp.
291 * > scan key), which could be the last slot + 1.
293 if (P_ISLEAF(opaque))
297 * On a non-leaf page, return the last key < scan key (resp. <= scan key).
298 * There must be one if _bt_compare() is playing by the rules.
300 Assert(low > P_FIRSTDATAKEY(opaque));
302 return OffsetNumberPrev(low);
306 * _bt_compare() -- Compare scankey to a particular tuple on the page.
308 * keysz: number of key conditions to be checked (might be less than the
309 * total length of the scan key!)
310 * page/offnum: location of btree item to be compared to.
312 * This routine returns:
313 * <0 if scankey < tuple at offnum;
314 * 0 if scankey == tuple at offnum;
315 * >0 if scankey > tuple at offnum.
316 * NULLs in the keys are treated as sortable values. Therefore
317 * "equality" does not necessarily mean that the item should be
318 * returned to the caller as a matching key!
320 * CRUCIAL NOTE: on a non-leaf page, the first data key is assumed to be
321 * "minus infinity": this routine will always claim it is less than the
322 * scankey. The actual key value stored (if any, which there probably isn't)
323 * does not matter. This convention allows us to implement the Lehman and
324 * Yao convention that the first down-link pointer is before the first key.
325 * See backend/access/nbtree/README for details.
329 _bt_compare(Relation rel,
335 TupleDesc itupdesc = RelationGetDescr(rel);
336 BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page);
342 * Force result ">" if target item is first data item on an internal
343 * page --- see NOTE above.
345 if (!P_ISLEAF(opaque) && offnum == P_FIRSTDATAKEY(opaque))
348 btitem = (BTItem) PageGetItem(page, PageGetItemId(page, offnum));
349 itup = &(btitem->bti_itup);
352 * The scan key is set up with the attribute number associated with
353 * each term in the key. It is important that, if the index is
354 * multi-key, the scan contain the first k key attributes, and that
355 * they be in order. If you think about how multi-key ordering works,
356 * you'll understand why this is.
358 * We don't test for violation of this condition here, however. The
359 * initial setup for the index scan had better have gotten it right
363 for (i = 1; i <= keysz; i++)
369 datum = index_getattr(itup, scankey->sk_attno, itupdesc, &isNull);
371 /* see comments about NULLs handling in btbuild */
372 if (scankey->sk_flags & SK_ISNULL) /* key is NULL */
375 result = 0; /* NULL "=" NULL */
377 result = 1; /* NULL ">" NOT_NULL */
379 else if (isNull) /* key is NOT_NULL and item is NULL */
381 result = -1; /* NOT_NULL "<" NULL */
386 * The sk_func needs to be passed the index value as left arg
387 * and the sk_argument as right arg (they might be of different
388 * types). Since it is convenient for callers to think of
389 * _bt_compare as comparing the scankey to the index item,
390 * we have to flip the sign of the comparison result.
392 * Note: curious-looking coding is to avoid overflow if
393 * comparison function returns INT_MIN. There is no risk of
394 * overflow for positive results.
396 result = DatumGetInt32(FunctionCall2(&scankey->sk_func,
398 scankey->sk_argument));
399 result = (result < 0) ? 1 : -result;
402 /* if the keys are unequal, return the difference */
409 /* if we get here, the keys are equal */
414 * _bt_next() -- Get the next item in a scan.
416 * On entry, we have a valid currentItemData in the scan, and a
417 * read lock and pin count on the page that contains that item.
418 * We return the next item in the scan, or false if no more.
419 * On successful exit, the page containing the new item is locked
420 * and pinned; on failure exit, no lock or pin is held.
423 _bt_next(IndexScanDesc scan, ScanDirection dir)
435 rel = scan->indexRelation;
436 so = (BTScanOpaque) scan->opaque;
437 current = &(scan->currentItemData);
439 /* we still have the buffer pinned and locked */
440 buf = so->btso_curbuf;
441 Assert(BufferIsValid(buf));
445 /* step one tuple in the appropriate direction */
446 if (!_bt_step(scan, &buf, dir))
449 /* current is the next candidate tuple to return */
450 offnum = ItemPointerGetOffsetNumber(current);
451 page = BufferGetPage(buf);
452 btitem = (BTItem) PageGetItem(page, PageGetItemId(page, offnum));
453 itup = &btitem->bti_itup;
455 if (_bt_checkkeys(scan, itup, dir, &continuescan))
457 /* tuple passes all scan key conditions, so return it */
458 scan->xs_ctup.t_self = itup->t_tid;
462 /* This tuple doesn't pass, but there might be more that do */
463 } while (continuescan);
465 /* No more items, so close down the current-item info */
466 ItemPointerSetInvalid(current);
467 so->btso_curbuf = InvalidBuffer;
468 _bt_relbuf(rel, buf);
474 * _bt_first() -- Find the first item in a scan.
476 * We need to be clever about the type of scan, the operation it's
477 * performing, and the tree ordering. We find the
478 * first item in the tree that satisfies the qualification
479 * associated with the scan descriptor. On exit, the page containing
480 * the current index tuple is read locked and pinned, and the scan's
481 * opaque data entry is updated to include the buffer.
484 _bt_first(IndexScanDesc scan, ScanDirection dir)
486 Relation rel = scan->indexRelation;
487 BTScanOpaque so = (BTScanOpaque) scan->opaque;
496 StrategyNumber strat;
501 ScanKey scankeys = NULL;
502 ScanKey *startKeys = NULL;
505 StrategyNumber strat_total;
508 * Examine the scan keys and eliminate any redundant keys; also
509 * discover how many keys must be matched to continue the scan.
511 _bt_preprocess_keys(scan);
514 * Quit now if _bt_preprocess_keys() discovered that the scan keys can
515 * never be satisfied (eg, x == 1 AND x > 2).
521 * Examine the scan keys to discover where we need to start the scan.
523 * We want to identify the keys that can be used as starting boundaries;
524 * these are =, >, or >= keys for a forward scan or =, <, <= keys for
525 * a backwards scan. We can use keys for multiple attributes so long as
526 * the prior attributes had only =, >= (resp. =, <=) keys. Once we accept
527 * a > or < boundary or find an attribute with no boundary (which can be
528 * thought of as the same as "> -infinity"), we can't use keys for any
529 * attributes to its right, because it would break our simplistic notion
530 * of what initial positioning strategy to use.
532 * When the scan keys include non-default operators, _bt_preprocess_keys
533 * may not be able to eliminate redundant keys; in such cases we will
534 * arbitrarily pick a usable one for each attribute. This is correct
535 * but possibly not optimal behavior. (For example, with keys like
536 * "x >= 4 AND x >= 5" we would elect to scan starting at x=4 when
537 * x=5 would be more efficient.) Since the situation only arises in
538 * hokily-worded queries, live with it.
540 * When both equality and inequality keys appear for a single attribute
541 * (again, only possible when non-default operators appear), we *must*
542 * select one of the equality keys for the starting point, because
543 * _bt_checkkeys() will stop the scan as soon as an equality qual fails.
544 * For example, if we have keys like "x >= 4 AND x = 10" and we elect to
545 * start at x=4, we will fail and stop before reaching x=10. If multiple
546 * equality quals survive preprocessing, however, it doesn't matter which
547 * one we use --- by definition, they are either redundant or
551 strat_total = BTEqualStrategyNumber;
552 if (so->numberOfKeys > 0)
558 startKeys = (ScanKey *) palloc(so->numberOfKeys * sizeof(ScanKey));
560 * chosen is the so-far-chosen key for the current attribute, if any.
561 * We don't cast the decision in stone until we reach keys for the
567 * Loop iterates from 0 to numberOfKeys inclusive; we use the last
568 * pass to handle after-last-key processing. Actual exit from the
569 * loop is at one of the "break" statements below.
571 for (cur = so->keyData, i = 0;; cur++, i++)
573 if (i >= so->numberOfKeys || cur->sk_attno != curattr)
576 * Done looking at keys for curattr. If we didn't find a
577 * usable boundary key, quit; else save the boundary key
578 * pointer in startKeys.
582 startKeys[keysCount++] = chosen;
584 * Adjust strat_total, and quit if we have stored a > or < key.
586 strat = chosen->sk_strategy;
587 if (strat != BTEqualStrategyNumber)
590 if (strat == BTGreaterStrategyNumber ||
591 strat == BTLessStrategyNumber)
595 * Done if that was the last attribute.
597 if (i >= so->numberOfKeys)
600 * Reset for next attr, which should be in sequence.
602 Assert(cur->sk_attno == curattr + 1);
603 curattr = cur->sk_attno;
607 /* Can we use this key as a starting boundary for this attr? */
608 switch (cur->sk_strategy)
610 case BTLessStrategyNumber:
611 case BTLessEqualStrategyNumber:
612 if (chosen == NULL && ScanDirectionIsBackward(dir))
615 case BTEqualStrategyNumber:
616 /* override any non-equality choice */
619 case BTGreaterEqualStrategyNumber:
620 case BTGreaterStrategyNumber:
621 if (chosen == NULL && ScanDirectionIsForward(dir))
629 * If we found no usable boundary keys, we have to start from one end
630 * of the tree. Walk down that edge to the first or last key, and
637 return _bt_endpoint(scan, dir);
641 * We want to start the scan somewhere within the index. Set up a
642 * 3-way-comparison scankey we can use to search for the boundary
643 * point we identified above.
645 scankeys = (ScanKey) palloc(keysCount * sizeof(ScanKeyData));
646 for (i = 0; i < keysCount; i++)
648 ScanKey cur = startKeys[i];
651 * _bt_preprocess_keys disallows it, but it's place to add some code
654 if (cur->sk_flags & SK_ISNULL)
658 elog(ERROR, "btree doesn't support is(not)null, yet");
662 * If scankey operator is of default subtype, we can use the
663 * cached comparison procedure; otherwise gotta look it up in
666 if (cur->sk_subtype == InvalidOid)
670 procinfo = index_getprocinfo(rel, i + 1, BTORDER_PROC);
671 ScanKeyEntryInitializeWithInfo(scankeys + i,
681 RegProcedure cmp_proc;
683 cmp_proc = get_opclass_proc(rel->rd_index->indclass[i],
686 ScanKeyEntryInitialize(scankeys + i,
698 current = &(scan->currentItemData);
701 * We want to locate either the first item >= boundary point, or
702 * first item > boundary point, depending on the initial-positioning
703 * strategy we just chose.
707 case BTLessStrategyNumber:
711 case BTLessEqualStrategyNumber:
715 case BTEqualStrategyNumber:
717 * If a backward scan was specified, need to start with last
718 * equal item not first one.
720 if (ScanDirectionIsBackward(dir))
726 case BTGreaterEqualStrategyNumber:
730 case BTGreaterStrategyNumber:
735 /* can't get here, but keep compiler quiet */
736 elog(ERROR, "unrecognized strat_total: %d", (int) strat_total);
741 * Use the manufactured scan key to descend the tree and position
742 * ourselves on the target leaf page.
744 stack = _bt_search(rel, keysCount, scankeys, nextkey, &buf, BT_READ);
746 /* don't need to keep the stack around... */
747 _bt_freestack(stack);
749 if (!BufferIsValid(buf))
751 /* Only get here if index is completely empty */
752 ItemPointerSetInvalid(current);
753 so->btso_curbuf = InvalidBuffer;
758 /* remember which buffer we have pinned */
759 so->btso_curbuf = buf;
760 blkno = BufferGetBlockNumber(buf);
761 page = BufferGetPage(buf);
763 /* position to the precise item on the page */
764 offnum = _bt_binsrch(rel, buf, keysCount, scankeys, nextkey);
766 ItemPointerSet(current, blkno, offnum);
769 * It's now time to examine the initial-positioning strategy to find the
770 * exact place to start the scan.
772 * If nextkey = false, we are positioned at the first item >= scan key,
773 * or possibly at the end of a page on which all the existing items are
774 * less than the scan key and we know that everything on later pages
775 * is greater than or equal to scan key.
777 * If nextkey = true, we are positioned at the first item > scan key,
778 * or possibly at the end of a page on which all the existing items are
779 * less than or equal to the scan key and we know that everything on
780 * later pages is greater than scan key.
782 * The actually desired starting point is either this item or an adjacent
783 * one, or in the end-of-page case it's the last item on this page or
784 * the first item on the next. We apply _bt_step if needed to get to
787 * Note: if _bt_step fails (meaning we fell off the end of the index in
788 * one direction or the other), then there are no matches so we just
791 * it's yet other place to add some code later for is(not)null ...
795 case BTLessStrategyNumber:
798 * We are on first item >= scankey.
800 * Back up one to arrive at last item < scankey. (Note: this
801 * positioning strategy is only used for a backward scan, so
802 * that is always the correct starting position.)
804 if (!_bt_step(scan, &buf, BackwardScanDirection))
811 case BTLessEqualStrategyNumber:
814 * We are on first item > scankey.
816 * Back up one to arrive at last item <= scankey. (Note: this
817 * positioning strategy is only used for a backward scan, so
818 * that is always the correct starting position.)
820 if (!_bt_step(scan, &buf, BackwardScanDirection))
827 case BTEqualStrategyNumber:
829 * If a backward scan was specified, need to start with last
830 * equal item not first one.
832 if (ScanDirectionIsBackward(dir))
835 * We are on first item > scankey.
837 * Back up one to arrive at last item <= scankey, then
838 * check to see if it is equal to scankey.
840 if (!_bt_step(scan, &buf, BackwardScanDirection))
849 * We are on first item >= scankey.
851 * Make sure we are on a real item; might have to
852 * step forward if currently at end of page. Then check
853 * to see if it is equal to scankey.
855 if (offnum > PageGetMaxOffsetNumber(page))
857 if (!_bt_step(scan, &buf, ForwardScanDirection))
865 /* If we are not now on an equal item, then there ain't any. */
866 offnum = ItemPointerGetOffsetNumber(current);
867 page = BufferGetPage(buf);
868 result = _bt_compare(rel, keysCount, scankeys, page, offnum);
870 goto nomatches; /* no equal items! */
873 case BTGreaterEqualStrategyNumber:
876 * We want the first item >= scankey, which is where we are...
877 * unless we're not anywhere at all...
879 if (offnum > PageGetMaxOffsetNumber(page))
881 if (!_bt_step(scan, &buf, ForwardScanDirection))
889 case BTGreaterStrategyNumber:
892 * We want the first item > scankey, which is where we are...
893 * unless we're not anywhere at all...
895 if (offnum > PageGetMaxOffsetNumber(page))
897 if (!_bt_step(scan, &buf, ForwardScanDirection))
906 /* okay, current item pointer for the scan is right */
907 offnum = ItemPointerGetOffsetNumber(current);
908 page = BufferGetPage(buf);
909 btitem = (BTItem) PageGetItem(page, PageGetItemId(page, offnum));
910 itup = &btitem->bti_itup;
912 /* is the first item actually acceptable? */
913 if (_bt_checkkeys(scan, itup, dir, &continuescan))
916 scan->xs_ctup.t_self = itup->t_tid;
919 else if (continuescan)
921 /* no, but there might be another one that is */
922 res = _bt_next(scan, dir);
926 /* no tuples in the index match this scan key */
928 ItemPointerSetInvalid(current);
929 so->btso_curbuf = InvalidBuffer;
930 _bt_relbuf(rel, buf);
940 * _bt_step() -- Step one item in the requested direction in a scan on
943 * *bufP is the current buffer (read-locked and pinned). If we change
944 * pages, it's updated appropriately.
946 * If successful, update scan's currentItemData and return true.
947 * If no adjacent record exists in the requested direction,
948 * release buffer pin/locks and return false.
951 _bt_step(IndexScanDesc scan, Buffer *bufP, ScanDirection dir)
953 Relation rel = scan->indexRelation;
954 ItemPointer current = &(scan->currentItemData);
955 BTScanOpaque so = (BTScanOpaque) scan->opaque;
963 * Don't use ItemPointerGetOffsetNumber or you risk to get assertion
964 * due to ability of ip_posid to be equal 0.
966 offnum = current->ip_posid;
968 page = BufferGetPage(*bufP);
969 opaque = (BTPageOpaque) PageGetSpecialPointer(page);
970 maxoff = PageGetMaxOffsetNumber(page);
972 if (ScanDirectionIsForward(dir))
974 if (!PageIsEmpty(page) && offnum < maxoff)
975 offnum = OffsetNumberNext(offnum);
978 /* Walk right to the next page with data */
981 /* if we're at end of scan, release the buffer and return */
982 if (P_RIGHTMOST(opaque))
984 _bt_relbuf(rel, *bufP);
985 ItemPointerSetInvalid(current);
986 *bufP = so->btso_curbuf = InvalidBuffer;
989 /* step right one page */
990 blkno = opaque->btpo_next;
991 _bt_relbuf(rel, *bufP);
992 *bufP = _bt_getbuf(rel, blkno, BT_READ);
993 page = BufferGetPage(*bufP);
994 opaque = (BTPageOpaque) PageGetSpecialPointer(page);
995 if (!P_IGNORE(opaque))
997 maxoff = PageGetMaxOffsetNumber(page);
998 /* done if it's not empty */
999 offnum = P_FIRSTDATAKEY(opaque);
1000 if (!PageIsEmpty(page) && offnum <= maxoff)
1008 /* backwards scan */
1009 if (offnum > P_FIRSTDATAKEY(opaque))
1010 offnum = OffsetNumberPrev(offnum);
1014 * Walk left to the next page with data. This is much more
1015 * complex than the walk-right case because of the possibility
1016 * that the page to our left splits while we are in flight to
1017 * it, plus the possibility that the page we were on gets
1018 * deleted after we leave it. See nbtree/README for details.
1022 *bufP = _bt_walk_left(rel, *bufP);
1024 /* if we're at end of scan, return failure */
1025 if (*bufP == InvalidBuffer)
1027 ItemPointerSetInvalid(current);
1028 so->btso_curbuf = InvalidBuffer;
1031 page = BufferGetPage(*bufP);
1032 opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1035 * Okay, we managed to move left to a non-deleted page.
1036 * Done if it's not half-dead and not empty. Else loop
1037 * back and do it all again.
1039 if (!P_IGNORE(opaque))
1041 maxoff = PageGetMaxOffsetNumber(page);
1043 if (!PageIsEmpty(page) &&
1044 maxoff >= P_FIRSTDATAKEY(opaque))
1051 /* Update scan state */
1052 so->btso_curbuf = *bufP;
1053 blkno = BufferGetBlockNumber(*bufP);
1054 ItemPointerSet(current, blkno, offnum);
1060 * _bt_walk_left() -- step left one page, if possible
1062 * The given buffer must be pinned and read-locked. This will be dropped
1063 * before stepping left. On return, we have pin and read lock on the
1064 * returned page, instead.
1066 * Returns InvalidBuffer if there is no page to the left (no lock is held
1069 * When working on a non-leaf level, it is possible for the returned page
1070 * to be half-dead; the caller should check that condition and step left
1071 * again if it's important.
1074 _bt_walk_left(Relation rel, Buffer buf)
1077 BTPageOpaque opaque;
1079 page = BufferGetPage(buf);
1080 opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1089 /* if we're at end of tree, release buf and return failure */
1090 if (P_LEFTMOST(opaque))
1092 _bt_relbuf(rel, buf);
1095 /* remember original page we are stepping left from */
1096 obknum = BufferGetBlockNumber(buf);
1098 blkno = lblkno = opaque->btpo_prev;
1099 _bt_relbuf(rel, buf);
1100 buf = _bt_getbuf(rel, blkno, BT_READ);
1101 page = BufferGetPage(buf);
1102 opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1105 * If this isn't the page we want, walk right till we find what we
1106 * want --- but go no more than four hops (an arbitrary limit). If
1107 * we don't find the correct page by then, the most likely bet is
1108 * that the original page got deleted and isn't in the sibling
1109 * chain at all anymore, not that its left sibling got split more
1112 * Note that it is correct to test P_ISDELETED not P_IGNORE here,
1113 * because half-dead pages are still in the sibling chain. Caller
1114 * must reject half-dead pages if wanted.
1119 if (!P_ISDELETED(opaque) && opaque->btpo_next == obknum)
1121 /* Found desired page, return it */
1124 if (P_RIGHTMOST(opaque) || ++tries > 4)
1126 blkno = opaque->btpo_next;
1127 _bt_relbuf(rel, buf);
1128 buf = _bt_getbuf(rel, blkno, BT_READ);
1129 page = BufferGetPage(buf);
1130 opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1133 /* Return to the original page to see what's up */
1134 _bt_relbuf(rel, buf);
1135 buf = _bt_getbuf(rel, obknum, BT_READ);
1136 page = BufferGetPage(buf);
1137 opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1138 if (P_ISDELETED(opaque))
1141 * It was deleted. Move right to first nondeleted page (there
1142 * must be one); that is the page that has acquired the
1143 * deleted one's keyspace, so stepping left from it will take
1144 * us where we want to be.
1148 if (P_RIGHTMOST(opaque))
1149 elog(ERROR, "fell off the end of \"%s\"",
1150 RelationGetRelationName(rel));
1151 blkno = opaque->btpo_next;
1152 _bt_relbuf(rel, buf);
1153 buf = _bt_getbuf(rel, blkno, BT_READ);
1154 page = BufferGetPage(buf);
1155 opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1156 if (!P_ISDELETED(opaque))
1161 * Now return to top of loop, resetting obknum to point to
1162 * this nondeleted page, and try again.
1168 * It wasn't deleted; the explanation had better be that the
1169 * page to the left got split or deleted. Without this check,
1170 * we'd go into an infinite loop if there's anything wrong.
1172 if (opaque->btpo_prev == lblkno)
1173 elog(ERROR, "could not find left sibling in \"%s\"",
1174 RelationGetRelationName(rel));
1175 /* Okay to try again with new lblkno value */
1179 return InvalidBuffer;
1183 * _bt_get_endpoint() -- Find the first or last page on a given tree level
1185 * If the index is empty, we will return InvalidBuffer; any other failure
1186 * condition causes ereport(). We will not return a dead page.
1188 * The returned buffer is pinned and read-locked.
1191 _bt_get_endpoint(Relation rel, uint32 level, bool rightmost)
1195 BTPageOpaque opaque;
1196 OffsetNumber offnum;
1202 * If we are looking for a leaf page, okay to descend from fast root;
1203 * otherwise better descend from true root. (There is no point in
1204 * being smarter about intermediate levels.)
1207 buf = _bt_getroot(rel, BT_READ);
1209 buf = _bt_gettrueroot(rel);
1211 if (!BufferIsValid(buf))
1213 /* empty index... */
1214 return InvalidBuffer;
1217 page = BufferGetPage(buf);
1218 opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1223 * If we landed on a deleted page, step right to find a live page
1224 * (there must be one). Also, if we want the rightmost page, step
1225 * right if needed to get to it (this could happen if the page
1226 * split since we obtained a pointer to it).
1228 while (P_IGNORE(opaque) ||
1229 (rightmost && !P_RIGHTMOST(opaque)))
1231 blkno = opaque->btpo_next;
1232 if (blkno == P_NONE)
1233 elog(ERROR, "fell off the end of \"%s\"",
1234 RelationGetRelationName(rel));
1235 _bt_relbuf(rel, buf);
1236 buf = _bt_getbuf(rel, blkno, BT_READ);
1237 page = BufferGetPage(buf);
1238 opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1242 if (opaque->btpo.level == level)
1244 if (opaque->btpo.level < level)
1245 elog(ERROR, "btree level %u not found", level);
1247 /* Descend to leftmost or rightmost child page */
1249 offnum = PageGetMaxOffsetNumber(page);
1251 offnum = P_FIRSTDATAKEY(opaque);
1253 btitem = (BTItem) PageGetItem(page, PageGetItemId(page, offnum));
1254 itup = &(btitem->bti_itup);
1255 blkno = ItemPointerGetBlockNumber(&(itup->t_tid));
1257 _bt_relbuf(rel, buf);
1258 buf = _bt_getbuf(rel, blkno, BT_READ);
1259 page = BufferGetPage(buf);
1260 opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1267 * _bt_endpoint() -- Find the first or last key in the index, and scan
1268 * from there to the first key satisfying all the quals.
1270 * This is used by _bt_first() to set up a scan when we've determined
1271 * that the scan must start at the beginning or end of the index (for
1272 * a forward or backward scan respectively).
1275 _bt_endpoint(IndexScanDesc scan, ScanDirection dir)
1280 BTPageOpaque opaque;
1281 ItemPointer current;
1282 OffsetNumber maxoff;
1291 rel = scan->indexRelation;
1292 current = &(scan->currentItemData);
1293 so = (BTScanOpaque) scan->opaque;
1296 * Scan down to the leftmost or rightmost leaf page. This is a
1297 * simplified version of _bt_search(). We don't maintain a stack
1298 * since we know we won't need it.
1300 buf = _bt_get_endpoint(rel, 0, ScanDirectionIsBackward(dir));
1302 if (!BufferIsValid(buf))
1304 /* empty index... */
1305 ItemPointerSetInvalid(current);
1306 so->btso_curbuf = InvalidBuffer;
1310 blkno = BufferGetBlockNumber(buf);
1311 page = BufferGetPage(buf);
1312 opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1313 Assert(P_ISLEAF(opaque));
1315 maxoff = PageGetMaxOffsetNumber(page);
1317 if (ScanDirectionIsForward(dir))
1319 /* There could be dead pages to the left, so not this: */
1320 /* Assert(P_LEFTMOST(opaque)); */
1322 start = P_FIRSTDATAKEY(opaque);
1324 else if (ScanDirectionIsBackward(dir))
1326 Assert(P_RIGHTMOST(opaque));
1328 start = PageGetMaxOffsetNumber(page);
1329 if (start < P_FIRSTDATAKEY(opaque)) /* watch out for empty
1331 start = P_FIRSTDATAKEY(opaque);
1335 elog(ERROR, "invalid scan direction: %d", (int) dir);
1336 start = 0; /* keep compiler quiet */
1339 ItemPointerSet(current, blkno, start);
1340 /* remember which buffer we have pinned */
1341 so->btso_curbuf = buf;
1344 * Left/rightmost page could be empty due to deletions, if so step
1345 * till we find a nonempty page.
1349 if (!_bt_step(scan, &buf, dir))
1351 start = ItemPointerGetOffsetNumber(current);
1352 page = BufferGetPage(buf);
1355 btitem = (BTItem) PageGetItem(page, PageGetItemId(page, start));
1356 itup = &(btitem->bti_itup);
1359 * Okay, we are on the first or last tuple. Does it pass all the quals?
1361 if (_bt_checkkeys(scan, itup, dir, &continuescan))
1363 /* yes, return it */
1364 scan->xs_ctup.t_self = itup->t_tid;
1367 else if (continuescan)
1369 /* no, but there might be another one that does */
1370 res = _bt_next(scan, dir);
1374 /* no tuples in the index match this scan key */
1375 ItemPointerSetInvalid(current);
1376 so->btso_curbuf = InvalidBuffer;
1377 _bt_relbuf(rel, buf);