1 /*-------------------------------------------------------------------------
4 * Search code for postgres btrees.
7 * Portions Copyright (c) 1996-2003, PostgreSQL Global Development Group
8 * Portions Copyright (c) 1994, Regents of the University of California
11 * $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtsearch.c,v 1.79 2003/08/04 02:39:57 momjian Exp $
13 *-------------------------------------------------------------------------
18 #include "access/genam.h"
19 #include "access/nbtree.h"
22 static Buffer _bt_walk_left(Relation rel, Buffer buf);
23 static bool _bt_endpoint(IndexScanDesc scan, ScanDirection dir);
27 * _bt_search() -- Search the tree for a particular scankey,
28 * or more precisely for the first leaf page it could be on.
30 * Return value is a stack of parent-page pointers. *bufP is set to the
31 * address of the leaf-page buffer, which is read-locked and pinned.
32 * No locks are held on the parent pages, however!
34 * NOTE that the returned buffer is read-locked regardless of the access
35 * parameter. However, access = BT_WRITE will allow an empty root page
36 * to be created and returned. When access = BT_READ, an empty index
37 * will result in *bufP being set to InvalidBuffer.
40 _bt_search(Relation rel, int keysz, ScanKey scankey,
41 Buffer *bufP, int access)
43 BTStack stack_in = NULL;
45 /* Get the root page to start with */
46 *bufP = _bt_getroot(rel, access);
48 /* If index is empty and access = BT_READ, no root page is created. */
49 if (!BufferIsValid(*bufP))
50 return (BTStack) NULL;
52 /* Loop iterates once per level descended in the tree */
62 BlockNumber par_blkno;
66 * Race -- the page we just grabbed may have split since we read
67 * its pointer in the parent (or metapage). If it has, we may
68 * need to move right to its new sibling. Do that.
70 *bufP = _bt_moveright(rel, *bufP, keysz, scankey, BT_READ);
72 /* if this is a leaf page, we're done */
73 page = BufferGetPage(*bufP);
74 opaque = (BTPageOpaque) PageGetSpecialPointer(page);
79 * Find the appropriate item on the internal page, and get the
80 * child page that it points to.
82 offnum = _bt_binsrch(rel, *bufP, keysz, scankey);
83 itemid = PageGetItemId(page, offnum);
84 btitem = (BTItem) PageGetItem(page, itemid);
85 itup = &(btitem->bti_itup);
86 blkno = ItemPointerGetBlockNumber(&(itup->t_tid));
87 par_blkno = BufferGetBlockNumber(*bufP);
90 * We need to save the location of the index entry we chose in the
91 * parent page on a stack. In case we split the tree, we'll use
92 * the stack to work back up to the parent page. We also save the
93 * actual downlink (TID) to uniquely identify the index entry, in
94 * case it moves right while we're working lower in the tree. See
95 * the paper by Lehman and Yao for how this is detected and
96 * handled. (We use the child link to disambiguate duplicate keys
97 * in the index -- Lehman and Yao disallow duplicate keys.)
99 new_stack = (BTStack) palloc(sizeof(BTStackData));
100 new_stack->bts_blkno = par_blkno;
101 new_stack->bts_offset = offnum;
102 memcpy(&new_stack->bts_btitem, btitem, sizeof(BTItemData));
103 new_stack->bts_parent = stack_in;
105 /* drop the read lock on the parent page, acquire one on the child */
106 _bt_relbuf(rel, *bufP);
107 *bufP = _bt_getbuf(rel, blkno, BT_READ);
109 /* okay, all set to move down a level */
110 stack_in = new_stack;
117 * _bt_moveright() -- move right in the btree if necessary.
119 * When we follow a pointer to reach a page, it is possible that
120 * the page has changed in the meanwhile. If this happens, we're
121 * guaranteed that the page has "split right" -- that is, that any
122 * data that appeared on the page originally is either on the page
123 * or strictly to the right of it.
125 * This routine decides whether or not we need to move right in the
126 * tree by examining the high key entry on the page. If that entry
127 * is strictly less than one we expect to be on the page, then our
128 * picture of the page is incorrect and we need to move right.
130 * On entry, we have the buffer pinned and a lock of the proper type.
131 * If we move right, we release the buffer and lock and acquire the
132 * same on the right sibling. Return value is the buffer we stop at.
135 _bt_moveright(Relation rel,
144 page = BufferGetPage(buf);
145 opaque = (BTPageOpaque) PageGetSpecialPointer(page);
148 * If the scan key that brought us to this page is > the high key
149 * stored on the page, then the page has split and we need to move
150 * right. (If the scan key is equal to the high key, we might or
151 * might not need to move right; have to scan the page first anyway.)
152 * It could even have split more than once, so scan as far as needed.
154 * We also have to move right if we followed a link that brought us to a
157 while (!P_RIGHTMOST(opaque) &&
159 _bt_compare(rel, keysz, scankey, page, P_HIKEY) > 0))
161 /* step right one page */
162 BlockNumber rblkno = opaque->btpo_next;
164 _bt_relbuf(rel, buf);
165 buf = _bt_getbuf(rel, rblkno, access);
166 page = BufferGetPage(buf);
167 opaque = (BTPageOpaque) PageGetSpecialPointer(page);
170 if (P_IGNORE(opaque))
171 elog(ERROR, "fell off the end of \"%s\"",
172 RelationGetRelationName(rel));
178 * _bt_binsrch() -- Do a binary search for a key on a particular page.
180 * The scankey we get has the compare function stored in the procedure
181 * entry of each data struct. We invoke this regproc to do the
182 * comparison for every key in the scankey.
184 * On a leaf page, _bt_binsrch() returns the OffsetNumber of the first
185 * key >= given scankey. (NOTE: in particular, this means it is possible
186 * to return a value 1 greater than the number of keys on the page,
187 * if the scankey is > all keys on the page.)
189 * On an internal (non-leaf) page, _bt_binsrch() returns the OffsetNumber
190 * of the last key < given scankey. (Since _bt_compare treats the first
191 * data key of such a page as minus infinity, there will be at least one
192 * key < scankey, so the result always points at one of the keys on the
193 * page.) This key indicates the right place to descend to be sure we
194 * find all leaf keys >= given scankey.
196 * This procedure is not responsible for walking right, it just examines
197 * the given page. _bt_binsrch() has no lock or refcount side effects
201 _bt_binsrch(Relation rel,
213 itupdesc = RelationGetDescr(rel);
214 page = BufferGetPage(buf);
215 opaque = (BTPageOpaque) PageGetSpecialPointer(page);
217 low = P_FIRSTDATAKEY(opaque);
218 high = PageGetMaxOffsetNumber(page);
221 * If there are no keys on the page, return the first available slot.
222 * Note this covers two cases: the page is really empty (no keys), or
223 * it contains only a high key. The latter case is possible after
224 * vacuuming. This can never happen on an internal page, however,
225 * since they are never empty (an internal page must have children).
231 * Binary search to find the first key on the page >= scan key. Loop
232 * invariant: all slots before 'low' are < scan key, all slots at or
233 * after 'high' are >= scan key. We can fall out when high == low.
235 high++; /* establish the loop invariant for high */
239 OffsetNumber mid = low + ((high - low) / 2);
241 /* We have low <= mid < high, so mid points at a real slot */
243 result = _bt_compare(rel, keysz, scankey, page, mid);
252 * At this point we have high == low, but be careful: they could point
253 * past the last slot on the page.
255 * On a leaf page, we always return the first key >= scan key (which
256 * could be the last slot + 1).
258 if (P_ISLEAF(opaque))
262 * On a non-leaf page, return the last key < scan key. There must be
263 * one if _bt_compare() is playing by the rules.
265 Assert(low > P_FIRSTDATAKEY(opaque));
267 return OffsetNumberPrev(low);
271 * _bt_compare() -- Compare scankey to a particular tuple on the page.
273 * keysz: number of key conditions to be checked (might be less than the
274 * total length of the scan key!)
275 * page/offnum: location of btree item to be compared to.
277 * This routine returns:
278 * <0 if scankey < tuple at offnum;
279 * 0 if scankey == tuple at offnum;
280 * >0 if scankey > tuple at offnum.
281 * NULLs in the keys are treated as sortable values. Therefore
282 * "equality" does not necessarily mean that the item should be
283 * returned to the caller as a matching key!
285 * CRUCIAL NOTE: on a non-leaf page, the first data key is assumed to be
286 * "minus infinity": this routine will always claim it is less than the
287 * scankey. The actual key value stored (if any, which there probably isn't)
288 * does not matter. This convention allows us to implement the Lehman and
289 * Yao convention that the first down-link pointer is before the first key.
290 * See backend/access/nbtree/README for details.
294 _bt_compare(Relation rel,
300 TupleDesc itupdesc = RelationGetDescr(rel);
301 BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page);
307 * Force result ">" if target item is first data item on an internal
308 * page --- see NOTE above.
310 if (!P_ISLEAF(opaque) && offnum == P_FIRSTDATAKEY(opaque))
313 btitem = (BTItem) PageGetItem(page, PageGetItemId(page, offnum));
314 itup = &(btitem->bti_itup);
317 * The scan key is set up with the attribute number associated with
318 * each term in the key. It is important that, if the index is
319 * multi-key, the scan contain the first k key attributes, and that
320 * they be in order. If you think about how multi-key ordering works,
321 * you'll understand why this is.
323 * We don't test for violation of this condition here, however. The
324 * initial setup for the index scan had better have gotten it right
328 for (i = 0; i < keysz; i++)
330 ScanKey entry = &scankey[i];
335 datum = index_getattr(itup, entry->sk_attno, itupdesc, &isNull);
337 /* see comments about NULLs handling in btbuild */
338 if (entry->sk_flags & SK_ISNULL) /* key is NULL */
341 result = 0; /* NULL "=" NULL */
343 result = 1; /* NULL ">" NOT_NULL */
345 else if (isNull) /* key is NOT_NULL and item is NULL */
347 result = -1; /* NOT_NULL "<" NULL */
351 result = DatumGetInt32(FunctionCall2(&entry->sk_func,
356 /* if the keys are unequal, return the difference */
361 /* if we get here, the keys are equal */
366 * _bt_next() -- Get the next item in a scan.
368 * On entry, we have a valid currentItemData in the scan, and a
369 * read lock and pin count on the page that contains that item.
370 * We return the next item in the scan, or false if no more.
371 * On successful exit, the page containing the new item is locked
372 * and pinned; on failure exit, no lock or pin is held.
375 _bt_next(IndexScanDesc scan, ScanDirection dir)
387 rel = scan->indexRelation;
388 so = (BTScanOpaque) scan->opaque;
389 current = &(scan->currentItemData);
391 /* we still have the buffer pinned and locked */
392 buf = so->btso_curbuf;
393 Assert(BufferIsValid(buf));
397 /* step one tuple in the appropriate direction */
398 if (!_bt_step(scan, &buf, dir))
401 /* current is the next candidate tuple to return */
402 offnum = ItemPointerGetOffsetNumber(current);
403 page = BufferGetPage(buf);
404 btitem = (BTItem) PageGetItem(page, PageGetItemId(page, offnum));
405 itup = &btitem->bti_itup;
407 if (_bt_checkkeys(scan, itup, dir, &continuescan))
409 /* tuple passes all scan key conditions, so return it */
410 scan->xs_ctup.t_self = itup->t_tid;
414 /* This tuple doesn't pass, but there might be more that do */
415 } while (continuescan);
417 /* No more items, so close down the current-item info */
418 ItemPointerSetInvalid(current);
419 so->btso_curbuf = InvalidBuffer;
420 _bt_relbuf(rel, buf);
426 * _bt_first() -- Find the first item in a scan.
428 * We need to be clever about the type of scan, the operation it's
429 * performing, and the tree ordering. We find the
430 * first item in the tree that satisfies the qualification
431 * associated with the scan descriptor. On exit, the page containing
432 * the current index tuple is read locked and pinned, and the scan's
433 * opaque data entry is updated to include the buffer.
436 _bt_first(IndexScanDesc scan, ScanDirection dir)
438 Relation rel = scan->indexRelation;
439 BTScanOpaque so = (BTScanOpaque) scan->opaque;
448 StrategyNumber strat;
453 ScanKey scankeys = NULL;
458 StrategyNumber strat_total;
461 * Order the scan keys in our canonical fashion and eliminate any
467 * Quit now if _bt_orderkeys() discovered that the scan keys can never
468 * be satisfied (eg, x == 1 AND x > 2).
474 * Examine the scan keys to discover where we need to start the scan.
477 strat_total = BTEqualStrategyNumber;
478 if (so->numberOfKeys > 0)
480 nKeyIs = (int *) palloc(so->numberOfKeys * sizeof(int));
481 for (i = 0; i < so->numberOfKeys; i++)
483 AttrNumber attno = so->keyData[i].sk_attno;
485 /* ignore keys for already-determined attrs */
486 if (attno <= keysCount)
488 /* if we didn't find a boundary for the preceding attr, quit */
489 if (attno > keysCount + 1)
491 strat = _bt_getstrat(rel, attno,
492 so->keyData[i].sk_procedure);
495 * Can we use this key as a starting boundary for this attr?
497 * We can use multiple keys if they look like, say, = >= = but we
498 * have to stop after accepting a > or < boundary.
500 if (strat == strat_total ||
501 strat == BTEqualStrategyNumber)
502 nKeyIs[keysCount++] = i;
503 else if (ScanDirectionIsBackward(dir) &&
504 (strat == BTLessStrategyNumber ||
505 strat == BTLessEqualStrategyNumber))
507 nKeyIs[keysCount++] = i;
509 if (strat == BTLessStrategyNumber)
512 else if (ScanDirectionIsForward(dir) &&
513 (strat == BTGreaterStrategyNumber ||
514 strat == BTGreaterEqualStrategyNumber))
516 nKeyIs[keysCount++] = i;
518 if (strat == BTGreaterStrategyNumber)
528 /* if we just need to walk down one edge of the tree, do that */
533 return _bt_endpoint(scan, dir);
537 * We want to start the scan somewhere within the index. Set up a
538 * scankey we can use to search for the correct starting point.
540 scankeys = (ScanKey) palloc(keysCount * sizeof(ScanKeyData));
541 for (i = 0; i < keysCount; i++)
548 * _bt_orderkeys disallows it, but it's place to add some code
551 if (so->keyData[j].sk_flags & SK_ISNULL)
555 elog(ERROR, "btree doesn't support is(not)null, yet");
558 procinfo = index_getprocinfo(rel, i + 1, BTORDER_PROC);
559 ScanKeyEntryInitializeWithInfo(scankeys + i,
560 so->keyData[j].sk_flags,
563 CurrentMemoryContext,
564 so->keyData[j].sk_argument);
569 current = &(scan->currentItemData);
572 * Use the manufactured scan key to descend the tree and position
573 * ourselves on the target leaf page.
575 stack = _bt_search(rel, keysCount, scankeys, &buf, BT_READ);
577 /* don't need to keep the stack around... */
578 _bt_freestack(stack);
580 if (!BufferIsValid(buf))
582 /* Only get here if index is completely empty */
583 ItemPointerSetInvalid(current);
584 so->btso_curbuf = InvalidBuffer;
589 /* remember which buffer we have pinned */
590 so->btso_curbuf = buf;
591 blkno = BufferGetBlockNumber(buf);
592 page = BufferGetPage(buf);
594 /* position to the precise item on the page */
595 offnum = _bt_binsrch(rel, buf, keysCount, scankeys);
597 ItemPointerSet(current, blkno, offnum);
600 * At this point we are positioned at the first item >= scan key, or
601 * possibly at the end of a page on which all the existing items are
602 * less than the scan key and we know that everything on later pages
603 * is greater than or equal to scan key.
605 * We could step forward in the latter case, but that'd be a waste of
606 * time if we want to scan backwards. So, it's now time to examine
607 * the scan strategy to find the exact place to start the scan.
609 * Note: if _bt_step fails (meaning we fell off the end of the index in
610 * one direction or the other), we either return false (no matches) or
611 * call _bt_endpoint() to set up a scan starting at that index
612 * endpoint, as appropriate for the desired scan type.
614 * it's yet other place to add some code later for is(not)null ...
619 case BTLessStrategyNumber:
622 * Back up one to arrive at last item < scankey
624 if (!_bt_step(scan, &buf, BackwardScanDirection))
631 case BTLessEqualStrategyNumber:
634 * We need to find the last item <= scankey, so step forward
635 * till we find one > scankey, then step back one.
637 if (offnum > PageGetMaxOffsetNumber(page))
639 if (!_bt_step(scan, &buf, ForwardScanDirection))
642 return _bt_endpoint(scan, dir);
647 offnum = ItemPointerGetOffsetNumber(current);
648 page = BufferGetPage(buf);
649 result = _bt_compare(rel, keysCount, scankeys, page, offnum);
652 if (!_bt_step(scan, &buf, ForwardScanDirection))
655 return _bt_endpoint(scan, dir);
658 if (!_bt_step(scan, &buf, BackwardScanDirection))
665 case BTEqualStrategyNumber:
668 * Make sure we are on the first equal item; might have to
669 * step forward if currently at end of page.
671 if (offnum > PageGetMaxOffsetNumber(page))
673 if (!_bt_step(scan, &buf, ForwardScanDirection))
678 offnum = ItemPointerGetOffsetNumber(current);
679 page = BufferGetPage(buf);
681 result = _bt_compare(rel, keysCount, scankeys, page, offnum);
683 goto nomatches; /* no equal items! */
686 * If a backward scan was specified, need to start with last
687 * equal item not first one.
689 if (ScanDirectionIsBackward(dir))
693 if (!_bt_step(scan, &buf, ForwardScanDirection))
696 return _bt_endpoint(scan, dir);
698 offnum = ItemPointerGetOffsetNumber(current);
699 page = BufferGetPage(buf);
700 result = _bt_compare(rel, keysCount, scankeys, page, offnum);
701 } while (result == 0);
702 if (!_bt_step(scan, &buf, BackwardScanDirection))
703 elog(ERROR, "equal items disappeared?");
707 case BTGreaterEqualStrategyNumber:
710 * We want the first item >= scankey, which is where we are...
711 * unless we're not anywhere at all...
713 if (offnum > PageGetMaxOffsetNumber(page))
715 if (!_bt_step(scan, &buf, ForwardScanDirection))
723 case BTGreaterStrategyNumber:
726 * We want the first item > scankey, so make sure we are on an
727 * item and then step over any equal items.
729 if (offnum > PageGetMaxOffsetNumber(page))
731 if (!_bt_step(scan, &buf, ForwardScanDirection))
736 offnum = ItemPointerGetOffsetNumber(current);
737 page = BufferGetPage(buf);
739 result = _bt_compare(rel, keysCount, scankeys, page, offnum);
742 if (!_bt_step(scan, &buf, ForwardScanDirection))
747 offnum = ItemPointerGetOffsetNumber(current);
748 page = BufferGetPage(buf);
749 result = _bt_compare(rel, keysCount, scankeys, page, offnum);
754 /* okay, current item pointer for the scan is right */
755 offnum = ItemPointerGetOffsetNumber(current);
756 page = BufferGetPage(buf);
757 btitem = (BTItem) PageGetItem(page, PageGetItemId(page, offnum));
758 itup = &btitem->bti_itup;
760 /* is the first item actually acceptable? */
761 if (_bt_checkkeys(scan, itup, dir, &continuescan))
764 scan->xs_ctup.t_self = itup->t_tid;
767 else if (continuescan)
769 /* no, but there might be another one that is */
770 res = _bt_next(scan, dir);
774 /* no tuples in the index match this scan key */
776 ItemPointerSetInvalid(current);
777 so->btso_curbuf = InvalidBuffer;
778 _bt_relbuf(rel, buf);
788 * _bt_step() -- Step one item in the requested direction in a scan on
791 * *bufP is the current buffer (read-locked and pinned). If we change
792 * pages, it's updated appropriately.
794 * If successful, update scan's currentItemData and return true.
795 * If no adjacent record exists in the requested direction,
796 * release buffer pin/locks and return false.
799 _bt_step(IndexScanDesc scan, Buffer *bufP, ScanDirection dir)
801 Relation rel = scan->indexRelation;
802 ItemPointer current = &(scan->currentItemData);
803 BTScanOpaque so = (BTScanOpaque) scan->opaque;
811 * Don't use ItemPointerGetOffsetNumber or you risk to get assertion
812 * due to ability of ip_posid to be equal 0.
814 offnum = current->ip_posid;
816 page = BufferGetPage(*bufP);
817 opaque = (BTPageOpaque) PageGetSpecialPointer(page);
818 maxoff = PageGetMaxOffsetNumber(page);
820 if (ScanDirectionIsForward(dir))
822 if (!PageIsEmpty(page) && offnum < maxoff)
823 offnum = OffsetNumberNext(offnum);
826 /* Walk right to the next page with data */
829 /* if we're at end of scan, release the buffer and return */
830 if (P_RIGHTMOST(opaque))
832 _bt_relbuf(rel, *bufP);
833 ItemPointerSetInvalid(current);
834 *bufP = so->btso_curbuf = InvalidBuffer;
837 /* step right one page */
838 blkno = opaque->btpo_next;
839 _bt_relbuf(rel, *bufP);
840 *bufP = _bt_getbuf(rel, blkno, BT_READ);
841 page = BufferGetPage(*bufP);
842 opaque = (BTPageOpaque) PageGetSpecialPointer(page);
843 if (!P_IGNORE(opaque))
845 maxoff = PageGetMaxOffsetNumber(page);
846 /* done if it's not empty */
847 offnum = P_FIRSTDATAKEY(opaque);
848 if (!PageIsEmpty(page) && offnum <= maxoff)
857 if (offnum > P_FIRSTDATAKEY(opaque))
858 offnum = OffsetNumberPrev(offnum);
862 * Walk left to the next page with data. This is much more
863 * complex than the walk-right case because of the possibility
864 * that the page to our left splits while we are in flight to
865 * it, plus the possibility that the page we were on gets
866 * deleted after we leave it. See nbtree/README for details.
870 *bufP = _bt_walk_left(rel, *bufP);
872 /* if we're at end of scan, return failure */
873 if (*bufP == InvalidBuffer)
875 ItemPointerSetInvalid(current);
876 so->btso_curbuf = InvalidBuffer;
879 page = BufferGetPage(*bufP);
880 opaque = (BTPageOpaque) PageGetSpecialPointer(page);
883 * Okay, we managed to move left to a non-deleted page.
884 * Done if it's not half-dead and not empty. Else loop
885 * back and do it all again.
887 if (!P_IGNORE(opaque))
889 maxoff = PageGetMaxOffsetNumber(page);
891 if (!PageIsEmpty(page) &&
892 maxoff >= P_FIRSTDATAKEY(opaque))
899 /* Update scan state */
900 so->btso_curbuf = *bufP;
901 blkno = BufferGetBlockNumber(*bufP);
902 ItemPointerSet(current, blkno, offnum);
908 * _bt_walk_left() -- step left one page, if possible
910 * The given buffer must be pinned and read-locked. This will be dropped
911 * before stepping left. On return, we have pin and read lock on the
912 * returned page, instead.
914 * Returns InvalidBuffer if there is no page to the left (no lock is held
917 * When working on a non-leaf level, it is possible for the returned page
918 * to be half-dead; the caller should check that condition and step left
919 * again if it's important.
922 _bt_walk_left(Relation rel, Buffer buf)
927 page = BufferGetPage(buf);
928 opaque = (BTPageOpaque) PageGetSpecialPointer(page);
937 /* if we're at end of tree, release buf and return failure */
938 if (P_LEFTMOST(opaque))
940 _bt_relbuf(rel, buf);
943 /* remember original page we are stepping left from */
944 obknum = BufferGetBlockNumber(buf);
946 blkno = lblkno = opaque->btpo_prev;
947 _bt_relbuf(rel, buf);
948 buf = _bt_getbuf(rel, blkno, BT_READ);
949 page = BufferGetPage(buf);
950 opaque = (BTPageOpaque) PageGetSpecialPointer(page);
953 * If this isn't the page we want, walk right till we find what we
954 * want --- but go no more than four hops (an arbitrary limit).
955 * If we don't find the correct page by then, the most likely bet
956 * is that the original page got deleted and isn't in the sibling
957 * chain at all anymore, not that its left sibling got split more
960 * Note that it is correct to test P_ISDELETED not P_IGNORE here,
961 * because half-dead pages are still in the sibling chain. Caller
962 * must reject half-dead pages if wanted.
967 if (!P_ISDELETED(opaque) && opaque->btpo_next == obknum)
969 /* Found desired page, return it */
972 if (P_RIGHTMOST(opaque) || ++tries > 4)
974 blkno = opaque->btpo_next;
975 _bt_relbuf(rel, buf);
976 buf = _bt_getbuf(rel, blkno, BT_READ);
977 page = BufferGetPage(buf);
978 opaque = (BTPageOpaque) PageGetSpecialPointer(page);
981 /* Return to the original page to see what's up */
982 _bt_relbuf(rel, buf);
983 buf = _bt_getbuf(rel, obknum, BT_READ);
984 page = BufferGetPage(buf);
985 opaque = (BTPageOpaque) PageGetSpecialPointer(page);
986 if (P_ISDELETED(opaque))
989 * It was deleted. Move right to first nondeleted page (there
990 * must be one); that is the page that has acquired the
991 * deleted one's keyspace, so stepping left from it will take
992 * us where we want to be.
996 if (P_RIGHTMOST(opaque))
997 elog(ERROR, "fell off the end of \"%s\"",
998 RelationGetRelationName(rel));
999 blkno = opaque->btpo_next;
1000 _bt_relbuf(rel, buf);
1001 buf = _bt_getbuf(rel, blkno, BT_READ);
1002 page = BufferGetPage(buf);
1003 opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1004 if (!P_ISDELETED(opaque))
1009 * Now return to top of loop, resetting obknum to point to
1010 * this nondeleted page, and try again.
1016 * It wasn't deleted; the explanation had better be that the
1017 * page to the left got split or deleted. Without this check,
1018 * we'd go into an infinite loop if there's anything wrong.
1020 if (opaque->btpo_prev == lblkno)
1021 elog(ERROR, "could not find left sibling in \"%s\"",
1022 RelationGetRelationName(rel));
1023 /* Okay to try again with new lblkno value */
1027 return InvalidBuffer;
1031 * _bt_get_endpoint() -- Find the first or last page on a given tree level
1033 * If the index is empty, we will return InvalidBuffer; any other failure
1034 * condition causes ereport(). We will not return a dead page.
1036 * The returned buffer is pinned and read-locked.
1039 _bt_get_endpoint(Relation rel, uint32 level, bool rightmost)
1043 BTPageOpaque opaque;
1044 OffsetNumber offnum;
1050 * If we are looking for a leaf page, okay to descend from fast root;
1051 * otherwise better descend from true root. (There is no point in
1052 * being smarter about intermediate levels.)
1055 buf = _bt_getroot(rel, BT_READ);
1057 buf = _bt_gettrueroot(rel);
1059 if (!BufferIsValid(buf))
1061 /* empty index... */
1062 return InvalidBuffer;
1065 page = BufferGetPage(buf);
1066 opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1071 * If we landed on a deleted page, step right to find a live page
1072 * (there must be one). Also, if we want the rightmost page, step
1073 * right if needed to get to it (this could happen if the page
1074 * split since we obtained a pointer to it).
1076 while (P_IGNORE(opaque) ||
1077 (rightmost && !P_RIGHTMOST(opaque)))
1079 blkno = opaque->btpo_next;
1080 if (blkno == P_NONE)
1081 elog(ERROR, "fell off the end of \"%s\"",
1082 RelationGetRelationName(rel));
1083 _bt_relbuf(rel, buf);
1084 buf = _bt_getbuf(rel, blkno, BT_READ);
1085 page = BufferGetPage(buf);
1086 opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1090 if (opaque->btpo.level == level)
1092 if (opaque->btpo.level < level)
1093 elog(ERROR, "btree level %u not found", level);
1095 /* Descend to leftmost or rightmost child page */
1097 offnum = PageGetMaxOffsetNumber(page);
1099 offnum = P_FIRSTDATAKEY(opaque);
1101 btitem = (BTItem) PageGetItem(page, PageGetItemId(page, offnum));
1102 itup = &(btitem->bti_itup);
1103 blkno = ItemPointerGetBlockNumber(&(itup->t_tid));
1105 _bt_relbuf(rel, buf);
1106 buf = _bt_getbuf(rel, blkno, BT_READ);
1107 page = BufferGetPage(buf);
1108 opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1115 * _bt_endpoint() -- Find the first or last key in the index.
1117 * This is used by _bt_first() to set up a scan when we've determined
1118 * that the scan must start at the beginning or end of the index (for
1119 * a forward or backward scan respectively).
1122 _bt_endpoint(IndexScanDesc scan, ScanDirection dir)
1127 BTPageOpaque opaque;
1128 ItemPointer current;
1129 OffsetNumber maxoff;
1138 rel = scan->indexRelation;
1139 current = &(scan->currentItemData);
1140 so = (BTScanOpaque) scan->opaque;
1143 * Scan down to the leftmost or rightmost leaf page. This is a
1144 * simplified version of _bt_search(). We don't maintain a stack
1145 * since we know we won't need it.
1147 buf = _bt_get_endpoint(rel, 0, ScanDirectionIsBackward(dir));
1149 if (!BufferIsValid(buf))
1151 /* empty index... */
1152 ItemPointerSetInvalid(current);
1153 so->btso_curbuf = InvalidBuffer;
1157 blkno = BufferGetBlockNumber(buf);
1158 page = BufferGetPage(buf);
1159 opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1160 Assert(P_ISLEAF(opaque));
1162 maxoff = PageGetMaxOffsetNumber(page);
1164 if (ScanDirectionIsForward(dir))
1166 /* There could be dead pages to the left, so not this: */
1167 /* Assert(P_LEFTMOST(opaque)); */
1169 start = P_FIRSTDATAKEY(opaque);
1171 else if (ScanDirectionIsBackward(dir))
1173 Assert(P_RIGHTMOST(opaque));
1175 start = PageGetMaxOffsetNumber(page);
1176 if (start < P_FIRSTDATAKEY(opaque)) /* watch out for empty
1178 start = P_FIRSTDATAKEY(opaque);
1182 elog(ERROR, "invalid scan direction: %d", (int) dir);
1183 start = 0; /* keep compiler quiet */
1186 ItemPointerSet(current, blkno, start);
1187 /* remember which buffer we have pinned */
1188 so->btso_curbuf = buf;
1191 * Left/rightmost page could be empty due to deletions, if so step
1192 * till we find a nonempty page.
1196 if (!_bt_step(scan, &buf, dir))
1198 start = ItemPointerGetOffsetNumber(current);
1199 page = BufferGetPage(buf);
1202 btitem = (BTItem) PageGetItem(page, PageGetItemId(page, start));
1203 itup = &(btitem->bti_itup);
1205 /* see if we picked a winner */
1206 if (_bt_checkkeys(scan, itup, dir, &continuescan))
1208 /* yes, return it */
1209 scan->xs_ctup.t_self = itup->t_tid;
1212 else if (continuescan)
1214 /* no, but there might be another one that is */
1215 res = _bt_next(scan, dir);
1219 /* no tuples in the index match this scan key */
1220 ItemPointerSetInvalid(current);
1221 so->btso_curbuf = InvalidBuffer;
1222 _bt_relbuf(rel, buf);