1 /*-------------------------------------------------------------------------
4 * Search code for postgres btrees.
7 * Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group
8 * Portions Copyright (c) 1994, Regents of the University of California
11 * $PostgreSQL: pgsql/src/backend/access/nbtree/nbtsearch.c,v 1.110 2007/01/05 22:19:23 momjian Exp $
13 *-------------------------------------------------------------------------
18 #include "access/genam.h"
19 #include "access/nbtree.h"
21 #include "utils/lsyscache.h"
24 static bool _bt_readpage(IndexScanDesc scan, ScanDirection dir,
26 static bool _bt_steppage(IndexScanDesc scan, ScanDirection dir);
27 static Buffer _bt_walk_left(Relation rel, Buffer buf);
28 static bool _bt_endpoint(IndexScanDesc scan, ScanDirection dir);
32 * _bt_search() -- Search the tree for a particular scankey,
33 * or more precisely for the first leaf page it could be on.
35 * The passed scankey must be an insertion-type scankey (see nbtree/README),
36 * but it can omit the rightmost column(s) of the index.
38 * When nextkey is false (the usual case), we are looking for the first
39 * item >= scankey. When nextkey is true, we are looking for the first
40 * item strictly greater than scankey.
42 * Return value is a stack of parent-page pointers. *bufP is set to the
43 * address of the leaf-page buffer, which is read-locked and pinned.
44 * No locks are held on the parent pages, however!
46 * NOTE that the returned buffer is read-locked regardless of the access
47 * parameter. However, access = BT_WRITE will allow an empty root page
48 * to be created and returned. When access = BT_READ, an empty index
49 * will result in *bufP being set to InvalidBuffer.
52 _bt_search(Relation rel, int keysz, ScanKey scankey, bool nextkey,
53 Buffer *bufP, int access)
55 BTStack stack_in = NULL;
57 /* Get the root page to start with */
58 *bufP = _bt_getroot(rel, access);
60 /* If index is empty and access = BT_READ, no root page is created. */
61 if (!BufferIsValid(*bufP))
62 return (BTStack) NULL;
64 /* Loop iterates once per level descended in the tree */
73 BlockNumber par_blkno;
77 * Race -- the page we just grabbed may have split since we read its
78 * pointer in the parent (or metapage). If it has, we may need to
79 * move right to its new sibling. Do that.
81 *bufP = _bt_moveright(rel, *bufP, keysz, scankey, nextkey, BT_READ);
83 /* if this is a leaf page, we're done */
84 page = BufferGetPage(*bufP);
85 opaque = (BTPageOpaque) PageGetSpecialPointer(page);
90 * Find the appropriate item on the internal page, and get the child
91 * page that it points to.
93 offnum = _bt_binsrch(rel, *bufP, keysz, scankey, nextkey);
94 itemid = PageGetItemId(page, offnum);
95 itup = (IndexTuple) PageGetItem(page, itemid);
96 blkno = ItemPointerGetBlockNumber(&(itup->t_tid));
97 par_blkno = BufferGetBlockNumber(*bufP);
100 * We need to save the location of the index entry we chose in the
101 * parent page on a stack. In case we split the tree, we'll use the
102 * stack to work back up to the parent page. We also save the actual
103 * downlink (TID) to uniquely identify the index entry, in case it
104 * moves right while we're working lower in the tree. See the paper
105 * by Lehman and Yao for how this is detected and handled. (We use the
106 * child link to disambiguate duplicate keys in the index -- Lehman
107 * and Yao disallow duplicate keys.)
109 new_stack = (BTStack) palloc(sizeof(BTStackData));
110 new_stack->bts_blkno = par_blkno;
111 new_stack->bts_offset = offnum;
112 memcpy(&new_stack->bts_btentry, itup, sizeof(IndexTupleData));
113 new_stack->bts_parent = stack_in;
115 /* drop the read lock on the parent page, acquire one on the child */
116 *bufP = _bt_relandgetbuf(rel, *bufP, blkno, BT_READ);
118 /* okay, all set to move down a level */
119 stack_in = new_stack;
126 * _bt_moveright() -- move right in the btree if necessary.
128 * When we follow a pointer to reach a page, it is possible that
129 * the page has changed in the meanwhile. If this happens, we're
130 * guaranteed that the page has "split right" -- that is, that any
131 * data that appeared on the page originally is either on the page
132 * or strictly to the right of it.
134 * This routine decides whether or not we need to move right in the
135 * tree by examining the high key entry on the page. If that entry
136 * is strictly less than the scankey, or <= the scankey in the nextkey=true
137 * case, then we followed the wrong link and we need to move right.
139 * The passed scankey must be an insertion-type scankey (see nbtree/README),
140 * but it can omit the rightmost column(s) of the index.
142 * When nextkey is false (the usual case), we are looking for the first
143 * item >= scankey. When nextkey is true, we are looking for the first
144 * item strictly greater than scankey.
146 * On entry, we have the buffer pinned and a lock of the type specified by
147 * 'access'. If we move right, we release the buffer and lock and acquire
148 * the same on the right sibling. Return value is the buffer we stop at.
151 _bt_moveright(Relation rel,
162 page = BufferGetPage(buf);
163 opaque = (BTPageOpaque) PageGetSpecialPointer(page);
166 * When nextkey = false (normal case): if the scan key that brought us to
167 * this page is > the high key stored on the page, then the page has split
168 * and we need to move right. (If the scan key is equal to the high key,
169 * we might or might not need to move right; have to scan the page first
172 * When nextkey = true: move right if the scan key is >= page's high key.
174 * The page could even have split more than once, so scan as far as
177 * We also have to move right if we followed a link that brought us to a
180 cmpval = nextkey ? 0 : 1;
182 while (!P_RIGHTMOST(opaque) &&
184 _bt_compare(rel, keysz, scankey, page, P_HIKEY) >= cmpval))
186 /* step right one page */
187 BlockNumber rblkno = opaque->btpo_next;
189 buf = _bt_relandgetbuf(rel, buf, rblkno, access);
190 page = BufferGetPage(buf);
191 opaque = (BTPageOpaque) PageGetSpecialPointer(page);
194 if (P_IGNORE(opaque))
195 elog(ERROR, "fell off the end of \"%s\"",
196 RelationGetRelationName(rel));
202 * _bt_binsrch() -- Do a binary search for a key on a particular page.
204 * The passed scankey must be an insertion-type scankey (see nbtree/README),
205 * but it can omit the rightmost column(s) of the index.
207 * When nextkey is false (the usual case), we are looking for the first
208 * item >= scankey. When nextkey is true, we are looking for the first
209 * item strictly greater than scankey.
211 * On a leaf page, _bt_binsrch() returns the OffsetNumber of the first
212 * key >= given scankey, or > scankey if nextkey is true. (NOTE: in
213 * particular, this means it is possible to return a value 1 greater than the
214 * number of keys on the page, if the scankey is > all keys on the page.)
216 * On an internal (non-leaf) page, _bt_binsrch() returns the OffsetNumber
217 * of the last key < given scankey, or last key <= given scankey if nextkey
218 * is true. (Since _bt_compare treats the first data key of such a page as
219 * minus infinity, there will be at least one key < scankey, so the result
220 * always points at one of the keys on the page.) This key indicates the
221 * right place to descend to be sure we find all leaf keys >= given scankey
222 * (or leaf keys > given scankey when nextkey is true).
224 * This procedure is not responsible for walking right, it just examines
225 * the given page. _bt_binsrch() has no lock or refcount side effects
229 _bt_binsrch(Relation rel,
242 page = BufferGetPage(buf);
243 opaque = (BTPageOpaque) PageGetSpecialPointer(page);
245 low = P_FIRSTDATAKEY(opaque);
246 high = PageGetMaxOffsetNumber(page);
249 * If there are no keys on the page, return the first available slot. Note
250 * this covers two cases: the page is really empty (no keys), or it
251 * contains only a high key. The latter case is possible after vacuuming.
252 * This can never happen on an internal page, however, since they are
253 * never empty (an internal page must have children).
259 * Binary search to find the first key on the page >= scan key, or first
260 * key > scankey when nextkey is true.
262 * For nextkey=false (cmpval=1), the loop invariant is: all slots before
263 * 'low' are < scan key, all slots at or after 'high' are >= scan key.
265 * For nextkey=true (cmpval=0), the loop invariant is: all slots before
266 * 'low' are <= scan key, all slots at or after 'high' are > scan key.
268 * We can fall out when high == low.
270 high++; /* establish the loop invariant for high */
272 cmpval = nextkey ? 0 : 1; /* select comparison value */
276 OffsetNumber mid = low + ((high - low) / 2);
278 /* We have low <= mid < high, so mid points at a real slot */
280 result = _bt_compare(rel, keysz, scankey, page, mid);
282 if (result >= cmpval)
289 * At this point we have high == low, but be careful: they could point
290 * past the last slot on the page.
292 * On a leaf page, we always return the first key >= scan key (resp. >
293 * scan key), which could be the last slot + 1.
295 if (P_ISLEAF(opaque))
299 * On a non-leaf page, return the last key < scan key (resp. <= scan key).
300 * There must be one if _bt_compare() is playing by the rules.
302 Assert(low > P_FIRSTDATAKEY(opaque));
304 return OffsetNumberPrev(low);
308 * _bt_compare() -- Compare scankey to a particular tuple on the page.
310 * The passed scankey must be an insertion-type scankey (see nbtree/README),
311 * but it can omit the rightmost column(s) of the index.
313 * keysz: number of key conditions to be checked (might be less than the
314 * number of index columns!)
315 * page/offnum: location of btree item to be compared to.
317 * This routine returns:
318 * <0 if scankey < tuple at offnum;
319 * 0 if scankey == tuple at offnum;
320 * >0 if scankey > tuple at offnum.
321 * NULLs in the keys are treated as sortable values. Therefore
322 * "equality" does not necessarily mean that the item should be
323 * returned to the caller as a matching key!
325 * CRUCIAL NOTE: on a non-leaf page, the first data key is assumed to be
326 * "minus infinity": this routine will always claim it is less than the
327 * scankey. The actual key value stored (if any, which there probably isn't)
328 * does not matter. This convention allows us to implement the Lehman and
329 * Yao convention that the first down-link pointer is before the first key.
330 * See backend/access/nbtree/README for details.
334 _bt_compare(Relation rel,
340 TupleDesc itupdesc = RelationGetDescr(rel);
341 BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page);
346 * Force result ">" if target item is first data item on an internal page
347 * --- see NOTE above.
349 if (!P_ISLEAF(opaque) && offnum == P_FIRSTDATAKEY(opaque))
352 itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
355 * The scan key is set up with the attribute number associated with each
356 * term in the key. It is important that, if the index is multi-key, the
357 * scan contain the first k key attributes, and that they be in order. If
358 * you think about how multi-key ordering works, you'll understand why
361 * We don't test for violation of this condition here, however. The
362 * initial setup for the index scan had better have gotten it right (see
366 for (i = 1; i <= keysz; i++)
372 datum = index_getattr(itup, scankey->sk_attno, itupdesc, &isNull);
374 /* see comments about NULLs handling in btbuild */
375 if (scankey->sk_flags & SK_ISNULL) /* key is NULL */
378 result = 0; /* NULL "=" NULL */
380 result = 1; /* NULL ">" NOT_NULL */
382 else if (isNull) /* key is NOT_NULL and item is NULL */
384 result = -1; /* NOT_NULL "<" NULL */
389 * The sk_func needs to be passed the index value as left arg and
390 * the sk_argument as right arg (they might be of different
391 * types). Since it is convenient for callers to think of
392 * _bt_compare as comparing the scankey to the index item, we have
393 * to flip the sign of the comparison result.
395 * Note: curious-looking coding is to avoid overflow if comparison
396 * function returns INT_MIN. There is no risk of overflow for
399 result = DatumGetInt32(FunctionCall2(&scankey->sk_func,
401 scankey->sk_argument));
402 result = (result < 0) ? 1 : -result;
405 /* if the keys are unequal, return the difference */
412 /* if we get here, the keys are equal */
417 * _bt_first() -- Find the first item in a scan.
419 * We need to be clever about the direction of scan, the search
420 * conditions, and the tree ordering. We find the first item (or,
421 * if backwards scan, the last item) in the tree that satisfies the
422 * qualifications in the scan key. On success exit, the page containing
423 * the current index tuple is pinned but not locked, and data about
424 * the matching tuple(s) on the page has been loaded into so->currPos,
425 * and scan->xs_ctup.t_self is set to the heap TID of the current tuple.
427 * If there are no matching items in the index, we return FALSE, with no
428 * pins or locks held.
430 * Note that scan->keyData[], and the so->keyData[] scankey built from it,
431 * are both search-type scankeys (see nbtree/README for more about this).
432 * Within this routine, we build a temporary insertion-type scankey to use
433 * in locating the scan start position.
436 _bt_first(IndexScanDesc scan, ScanDirection dir)
438 Relation rel = scan->indexRelation;
439 BTScanOpaque so = (BTScanOpaque) scan->opaque;
443 StrategyNumber strat;
446 ScanKey startKeys[INDEX_MAX_KEYS];
447 ScanKeyData scankeys[INDEX_MAX_KEYS];
450 StrategyNumber strat_total;
452 pgstat_count_index_scan(&scan->xs_pgstat_info);
455 * Examine the scan keys and eliminate any redundant keys; also mark the
456 * keys that must be matched to continue the scan.
458 _bt_preprocess_keys(scan);
461 * Quit now if _bt_preprocess_keys() discovered that the scan keys can
462 * never be satisfied (eg, x == 1 AND x > 2).
468 * Examine the scan keys to discover where we need to start the scan.
470 * We want to identify the keys that can be used as starting boundaries;
471 * these are =, >, or >= keys for a forward scan or =, <, <= keys for
472 * a backwards scan. We can use keys for multiple attributes so long as
473 * the prior attributes had only =, >= (resp. =, <=) keys. Once we accept
474 * a > or < boundary or find an attribute with no boundary (which can be
475 * thought of as the same as "> -infinity"), we can't use keys for any
476 * attributes to its right, because it would break our simplistic notion
477 * of what initial positioning strategy to use.
479 * When the scan keys include cross-type operators, _bt_preprocess_keys
480 * may not be able to eliminate redundant keys; in such cases we will
481 * arbitrarily pick a usable one for each attribute. This is correct
482 * but possibly not optimal behavior. (For example, with keys like
483 * "x >= 4 AND x >= 5" we would elect to scan starting at x=4 when
484 * x=5 would be more efficient.) Since the situation only arises given
485 * a poorly-worded query plus an incomplete opfamily, live with it.
487 * When both equality and inequality keys appear for a single attribute
488 * (again, only possible when cross-type operators appear), we *must*
489 * select one of the equality keys for the starting point, because
490 * _bt_checkkeys() will stop the scan as soon as an equality qual fails.
491 * For example, if we have keys like "x >= 4 AND x = 10" and we elect to
492 * start at x=4, we will fail and stop before reaching x=10. If multiple
493 * equality quals survive preprocessing, however, it doesn't matter which
494 * one we use --- by definition, they are either redundant or
497 * In this loop, row-comparison keys are treated the same as keys on their
498 * first (leftmost) columns. We'll add on lower-order columns of the row
499 * comparison below, if possible.
501 * The selected scan keys (at most one per index column) are remembered by
502 * storing their addresses into the local startKeys[] array.
505 strat_total = BTEqualStrategyNumber;
506 if (so->numberOfKeys > 0)
513 * chosen is the so-far-chosen key for the current attribute, if any.
514 * We don't cast the decision in stone until we reach keys for the
521 * Loop iterates from 0 to numberOfKeys inclusive; we use the last
522 * pass to handle after-last-key processing. Actual exit from the
523 * loop is at one of the "break" statements below.
525 for (cur = so->keyData, i = 0;; cur++, i++)
527 if (i >= so->numberOfKeys || cur->sk_attno != curattr)
530 * Done looking at keys for curattr. If we didn't find a
531 * usable boundary key, quit; else save the boundary key
532 * pointer in startKeys.
536 startKeys[keysCount++] = chosen;
539 * Adjust strat_total, and quit if we have stored a > or <
542 strat = chosen->sk_strategy;
543 if (strat != BTEqualStrategyNumber)
546 if (strat == BTGreaterStrategyNumber ||
547 strat == BTLessStrategyNumber)
552 * Done if that was the last attribute, or if next key is not
553 * in sequence (implying no boundary key is available for the
556 if (i >= so->numberOfKeys ||
557 cur->sk_attno != curattr + 1)
561 * Reset for next attr.
563 curattr = cur->sk_attno;
567 /* Can we use this key as a starting boundary for this attr? */
568 switch (cur->sk_strategy)
570 case BTLessStrategyNumber:
571 case BTLessEqualStrategyNumber:
572 if (chosen == NULL && ScanDirectionIsBackward(dir))
575 case BTEqualStrategyNumber:
576 /* override any non-equality choice */
579 case BTGreaterEqualStrategyNumber:
580 case BTGreaterStrategyNumber:
581 if (chosen == NULL && ScanDirectionIsForward(dir))
589 * If we found no usable boundary keys, we have to start from one end of
590 * the tree. Walk down that edge to the first or last key, and scan from
594 return _bt_endpoint(scan, dir);
597 * We want to start the scan somewhere within the index. Set up an
598 * insertion scankey we can use to search for the boundary point we
599 * identified above. The insertion scankey is built in the local
600 * scankeys[] array, using the keys identified by startKeys[].
602 Assert(keysCount <= INDEX_MAX_KEYS);
603 for (i = 0; i < keysCount; i++)
605 ScanKey cur = startKeys[i];
607 Assert(cur->sk_attno == i + 1);
609 if (cur->sk_flags & SK_ROW_HEADER)
612 * Row comparison header: look to the first row member instead.
614 * The member scankeys are already in insertion format (ie, they
615 * have sk_func = 3-way-comparison function), but we have to watch
616 * out for nulls, which _bt_preprocess_keys didn't check. A null
617 * in the first row member makes the condition unmatchable, just
618 * like qual_ok = false.
620 cur = (ScanKey) DatumGetPointer(cur->sk_argument);
621 Assert(cur->sk_flags & SK_ROW_MEMBER);
622 if (cur->sk_flags & SK_ISNULL)
624 memcpy(scankeys + i, cur, sizeof(ScanKeyData));
627 * If the row comparison is the last positioning key we accepted,
628 * try to add additional keys from the lower-order row members.
629 * (If we accepted independent conditions on additional index
630 * columns, we use those instead --- doesn't seem worth trying to
631 * determine which is more restrictive.) Note that this is OK
632 * even if the row comparison is of ">" or "<" type, because the
633 * condition applied to all but the last row member is effectively
634 * ">=" or "<=", and so the extra keys don't break the positioning
637 if (i == keysCount - 1)
639 while (!(cur->sk_flags & SK_ROW_END))
642 Assert(cur->sk_flags & SK_ROW_MEMBER);
643 if (cur->sk_attno != keysCount + 1)
644 break; /* out-of-sequence, can't use it */
645 if (cur->sk_flags & SK_ISNULL)
646 break; /* can't use null keys */
647 Assert(keysCount < INDEX_MAX_KEYS);
648 memcpy(scankeys + keysCount, cur, sizeof(ScanKeyData));
651 break; /* done with outer loop */
657 * Ordinary comparison key. Transform the search-style scan key
658 * to an insertion scan key by replacing the sk_func with the
659 * appropriate btree comparison function.
661 * If scankey operator is not a cross-type comparison, we can use
662 * the cached comparison function; otherwise gotta look it up in
663 * the catalogs. (That can't lead to infinite recursion, since no
664 * indexscan initiated by syscache lookup will use cross-data-type
667 * We support the convention that sk_subtype == InvalidOid means
668 * the opclass input type; this is a hack to simplify life for
671 if (cur->sk_subtype == rel->rd_opcintype[i] ||
672 cur->sk_subtype == InvalidOid)
676 procinfo = index_getprocinfo(rel, cur->sk_attno, BTORDER_PROC);
677 ScanKeyEntryInitializeWithInfo(scankeys + i,
687 RegProcedure cmp_proc;
689 cmp_proc = get_opfamily_proc(rel->rd_opfamily[i],
690 rel->rd_opcintype[i],
693 if (!RegProcedureIsValid(cmp_proc))
694 elog(ERROR, "missing support function %d(%u,%u) for attribute %d of index \"%s\"",
695 BTORDER_PROC, rel->rd_opcintype[i], cur->sk_subtype,
696 cur->sk_attno, RelationGetRelationName(rel));
697 ScanKeyEntryInitialize(scankeys + i,
709 * Examine the selected initial-positioning strategy to determine exactly
710 * where we need to start the scan, and set flag variables to control the
713 * If nextkey = false, _bt_search and _bt_binsrch will locate the first
714 * item >= scan key. If nextkey = true, they will locate the first
717 * If goback = true, we will then step back one item, while if
718 * goback = false, we will start the scan on the located item.
720 * it's yet other place to add some code later for is(not)null ...
725 case BTLessStrategyNumber:
728 * Find first item >= scankey, then back up one to arrive at last
729 * item < scankey. (Note: this positioning strategy is only used
730 * for a backward scan, so that is always the correct starting
737 case BTLessEqualStrategyNumber:
740 * Find first item > scankey, then back up one to arrive at last
741 * item <= scankey. (Note: this positioning strategy is only used
742 * for a backward scan, so that is always the correct starting
749 case BTEqualStrategyNumber:
752 * If a backward scan was specified, need to start with last equal
753 * item not first one.
755 if (ScanDirectionIsBackward(dir))
758 * This is the same as the <= strategy. We will check at the
759 * end whether the found item is actually =.
767 * This is the same as the >= strategy. We will check at the
768 * end whether the found item is actually =.
775 case BTGreaterEqualStrategyNumber:
778 * Find first item >= scankey. (This is only used for forward
785 case BTGreaterStrategyNumber:
788 * Find first item > scankey. (This is only used for forward
796 /* can't get here, but keep compiler quiet */
797 elog(ERROR, "unrecognized strat_total: %d", (int) strat_total);
802 * Use the manufactured insertion scan key to descend the tree and
803 * position ourselves on the target leaf page.
805 stack = _bt_search(rel, keysCount, scankeys, nextkey, &buf, BT_READ);
807 /* don't need to keep the stack around... */
808 _bt_freestack(stack);
810 /* remember which buffer we have pinned, if any */
811 so->currPos.buf = buf;
813 if (!BufferIsValid(buf))
815 /* Only get here if index is completely empty */
819 /* initialize moreLeft/moreRight appropriately for scan direction */
820 if (ScanDirectionIsForward(dir))
822 so->currPos.moreLeft = false;
823 so->currPos.moreRight = true;
827 so->currPos.moreLeft = true;
828 so->currPos.moreRight = false;
830 so->numKilled = 0; /* just paranoia */
831 so->markItemIndex = -1; /* ditto */
833 /* position to the precise item on the page */
834 offnum = _bt_binsrch(rel, buf, keysCount, scankeys, nextkey);
837 * If nextkey = false, we are positioned at the first item >= scan key, or
838 * possibly at the end of a page on which all the existing items are less
839 * than the scan key and we know that everything on later pages is greater
840 * than or equal to scan key.
842 * If nextkey = true, we are positioned at the first item > scan key, or
843 * possibly at the end of a page on which all the existing items are less
844 * than or equal to the scan key and we know that everything on later
845 * pages is greater than scan key.
847 * The actually desired starting point is either this item or the prior
848 * one, or in the end-of-page case it's the first item on the next page or
849 * the last item on this page. Adjust the starting offset if needed. (If
850 * this results in an offset before the first item or after the last one,
851 * _bt_readpage will report no items found, and then we'll step to the
852 * next page as needed.)
855 offnum = OffsetNumberPrev(offnum);
858 * Now load data from the first page of the scan.
860 if (!_bt_readpage(scan, dir, offnum))
863 * There's no actually-matching data on this page. Try to advance to
864 * the next page. Return false if there's no matching data at all.
866 if (!_bt_steppage(scan, dir))
870 /* Drop the lock, but not pin, on the current page */
871 LockBuffer(so->currPos.buf, BUFFER_LOCK_UNLOCK);
873 /* OK, itemIndex says what to return */
874 scan->xs_ctup.t_self = so->currPos.items[so->currPos.itemIndex].heapTid;
880 * _bt_next() -- Get the next item in a scan.
882 * On entry, so->currPos describes the current page, which is pinned
883 * but not locked, and so->currPos.itemIndex identifies which item was
884 * previously returned.
886 * On successful exit, scan->xs_ctup.t_self is set to the TID of the
887 * next heap tuple, and so->currPos is updated as needed.
889 * On failure exit (no more tuples), we release pin and set
890 * so->currPos.buf to InvalidBuffer.
893 _bt_next(IndexScanDesc scan, ScanDirection dir)
895 BTScanOpaque so = (BTScanOpaque) scan->opaque;
898 * Advance to next tuple on current page; or if there's no more, try to
899 * step to the next page with data.
901 if (ScanDirectionIsForward(dir))
903 if (++so->currPos.itemIndex > so->currPos.lastItem)
905 /* We must acquire lock before applying _bt_steppage */
906 Assert(BufferIsValid(so->currPos.buf));
907 LockBuffer(so->currPos.buf, BT_READ);
908 if (!_bt_steppage(scan, dir))
910 /* Drop the lock, but not pin, on the new page */
911 LockBuffer(so->currPos.buf, BUFFER_LOCK_UNLOCK);
916 if (--so->currPos.itemIndex < so->currPos.firstItem)
918 /* We must acquire lock before applying _bt_steppage */
919 Assert(BufferIsValid(so->currPos.buf));
920 LockBuffer(so->currPos.buf, BT_READ);
921 if (!_bt_steppage(scan, dir))
923 /* Drop the lock, but not pin, on the new page */
924 LockBuffer(so->currPos.buf, BUFFER_LOCK_UNLOCK);
928 /* OK, itemIndex says what to return */
929 scan->xs_ctup.t_self = so->currPos.items[so->currPos.itemIndex].heapTid;
935 * _bt_readpage() -- Load data from current index page into so->currPos
937 * Caller must have pinned and read-locked so->currPos.buf; the buffer's state
938 * is not changed here. Also, currPos.moreLeft and moreRight must be valid;
939 * they are updated as appropriate. All other fields of so->currPos are
940 * initialized from scratch here.
942 * We scan the current page starting at offnum and moving in the indicated
943 * direction. All items matching the scan keys are loaded into currPos.items.
944 * moreLeft or moreRight (as appropriate) is cleared if _bt_checkkeys reports
945 * that there can be no more matching tuples in the current scan direction.
947 * Returns true if any matching items found on the page, false if none.
950 _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum)
952 BTScanOpaque so = (BTScanOpaque) scan->opaque;
960 /* we must have the buffer pinned and locked */
961 Assert(BufferIsValid(so->currPos.buf));
963 page = BufferGetPage(so->currPos.buf);
964 opaque = (BTPageOpaque) PageGetSpecialPointer(page);
965 minoff = P_FIRSTDATAKEY(opaque);
966 maxoff = PageGetMaxOffsetNumber(page);
969 * we must save the page's right-link while scanning it; this tells us
970 * where to step right to after we're done with these items. There is no
971 * corresponding need for the left-link, since splits always go right.
973 so->currPos.nextPage = opaque->btpo_next;
975 if (ScanDirectionIsForward(dir))
977 /* load items[] in ascending order */
980 offnum = Max(offnum, minoff);
982 while (offnum <= maxoff)
984 if (_bt_checkkeys(scan, page, offnum, dir, &continuescan))
986 /* tuple passes all scan key conditions, so remember it */
987 /* _bt_checkkeys put the heap ptr into scan->xs_ctup.t_self */
988 so->currPos.items[itemIndex].heapTid = scan->xs_ctup.t_self;
989 so->currPos.items[itemIndex].indexOffset = offnum;
994 /* there can't be any more matches, so stop */
995 so->currPos.moreRight = false;
999 offnum = OffsetNumberNext(offnum);
1002 Assert(itemIndex <= MaxIndexTuplesPerPage);
1003 so->currPos.firstItem = 0;
1004 so->currPos.lastItem = itemIndex - 1;
1005 so->currPos.itemIndex = 0;
1009 /* load items[] in descending order */
1010 itemIndex = MaxIndexTuplesPerPage;
1012 offnum = Min(offnum, maxoff);
1014 while (offnum >= minoff)
1016 if (_bt_checkkeys(scan, page, offnum, dir, &continuescan))
1018 /* tuple passes all scan key conditions, so remember it */
1019 /* _bt_checkkeys put the heap ptr into scan->xs_ctup.t_self */
1021 so->currPos.items[itemIndex].heapTid = scan->xs_ctup.t_self;
1022 so->currPos.items[itemIndex].indexOffset = offnum;
1026 /* there can't be any more matches, so stop */
1027 so->currPos.moreLeft = false;
1031 offnum = OffsetNumberPrev(offnum);
1034 Assert(itemIndex >= 0);
1035 so->currPos.firstItem = itemIndex;
1036 so->currPos.lastItem = MaxIndexTuplesPerPage - 1;
1037 so->currPos.itemIndex = MaxIndexTuplesPerPage - 1;
1040 return (so->currPos.firstItem <= so->currPos.lastItem);
1044 * _bt_steppage() -- Step to next page containing valid data for scan
1046 * On entry, so->currPos.buf must be pinned and read-locked. We'll drop
1047 * the lock and pin before moving to next page.
1049 * On success exit, we hold pin and read-lock on the next interesting page,
1050 * and so->currPos is updated to contain data from that page.
1052 * If there are no more matching records in the given direction, we drop all
1053 * locks and pins, set so->currPos.buf to InvalidBuffer, and return FALSE.
1056 _bt_steppage(IndexScanDesc scan, ScanDirection dir)
1058 BTScanOpaque so = (BTScanOpaque) scan->opaque;
1061 BTPageOpaque opaque;
1063 /* we must have the buffer pinned and locked */
1064 Assert(BufferIsValid(so->currPos.buf));
1066 /* Before leaving current page, deal with any killed items */
1067 if (so->numKilled > 0)
1068 _bt_killitems(scan, true);
1071 * Before we modify currPos, make a copy of the page data if there was a
1072 * mark position that needs it.
1074 if (so->markItemIndex >= 0)
1076 /* bump pin on current buffer for assignment to mark buffer */
1077 IncrBufferRefCount(so->currPos.buf);
1078 memcpy(&so->markPos, &so->currPos,
1079 offsetof(BTScanPosData, items[1]) +
1080 so->currPos.lastItem * sizeof(BTScanPosItem));
1081 so->markPos.itemIndex = so->markItemIndex;
1082 so->markItemIndex = -1;
1085 rel = scan->indexRelation;
1087 if (ScanDirectionIsForward(dir))
1089 /* Walk right to the next page with data */
1090 /* We must rely on the previously saved nextPage link! */
1091 BlockNumber blkno = so->currPos.nextPage;
1093 /* Remember we left a page with data */
1094 so->currPos.moreLeft = true;
1098 /* if we're at end of scan, release the buffer and return */
1099 if (blkno == P_NONE || !so->currPos.moreRight)
1101 _bt_relbuf(rel, so->currPos.buf);
1102 so->currPos.buf = InvalidBuffer;
1105 /* step right one page */
1106 so->currPos.buf = _bt_relandgetbuf(rel, so->currPos.buf,
1108 /* check for deleted page */
1109 page = BufferGetPage(so->currPos.buf);
1110 opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1111 if (!P_IGNORE(opaque))
1113 /* see if there are any matches on this page */
1114 /* note that this will clear moreRight if we can stop */
1115 if (_bt_readpage(scan, dir, P_FIRSTDATAKEY(opaque)))
1118 /* nope, keep going */
1119 blkno = opaque->btpo_next;
1124 /* Remember we left a page with data */
1125 so->currPos.moreRight = true;
1128 * Walk left to the next page with data. This is much more complex
1129 * than the walk-right case because of the possibility that the page
1130 * to our left splits while we are in flight to it, plus the
1131 * possibility that the page we were on gets deleted after we leave
1132 * it. See nbtree/README for details.
1136 /* Done if we know there are no matching keys to the left */
1137 if (!so->currPos.moreLeft)
1139 _bt_relbuf(rel, so->currPos.buf);
1140 so->currPos.buf = InvalidBuffer;
1144 /* Step to next physical page */
1145 so->currPos.buf = _bt_walk_left(rel, so->currPos.buf);
1147 /* if we're physically at end of index, return failure */
1148 if (so->currPos.buf == InvalidBuffer)
1152 * Okay, we managed to move left to a non-deleted page. Done if
1153 * it's not half-dead and contains matching tuples. Else loop back
1154 * and do it all again.
1156 page = BufferGetPage(so->currPos.buf);
1157 opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1158 if (!P_IGNORE(opaque))
1160 /* see if there are any matches on this page */
1161 /* note that this will clear moreLeft if we can stop */
1162 if (_bt_readpage(scan, dir, PageGetMaxOffsetNumber(page)))
1172 * _bt_walk_left() -- step left one page, if possible
1174 * The given buffer must be pinned and read-locked. This will be dropped
1175 * before stepping left. On return, we have pin and read lock on the
1176 * returned page, instead.
1178 * Returns InvalidBuffer if there is no page to the left (no lock is held
1181 * When working on a non-leaf level, it is possible for the returned page
1182 * to be half-dead; the caller should check that condition and step left
1183 * again if it's important.
1186 _bt_walk_left(Relation rel, Buffer buf)
1189 BTPageOpaque opaque;
1191 page = BufferGetPage(buf);
1192 opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1201 /* if we're at end of tree, release buf and return failure */
1202 if (P_LEFTMOST(opaque))
1204 _bt_relbuf(rel, buf);
1207 /* remember original page we are stepping left from */
1208 obknum = BufferGetBlockNumber(buf);
1210 blkno = lblkno = opaque->btpo_prev;
1211 buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);
1212 page = BufferGetPage(buf);
1213 opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1216 * If this isn't the page we want, walk right till we find what we
1217 * want --- but go no more than four hops (an arbitrary limit). If we
1218 * don't find the correct page by then, the most likely bet is that
1219 * the original page got deleted and isn't in the sibling chain at all
1220 * anymore, not that its left sibling got split more than four times.
1222 * Note that it is correct to test P_ISDELETED not P_IGNORE here,
1223 * because half-dead pages are still in the sibling chain. Caller
1224 * must reject half-dead pages if wanted.
1229 if (!P_ISDELETED(opaque) && opaque->btpo_next == obknum)
1231 /* Found desired page, return it */
1234 if (P_RIGHTMOST(opaque) || ++tries > 4)
1236 blkno = opaque->btpo_next;
1237 buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);
1238 page = BufferGetPage(buf);
1239 opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1242 /* Return to the original page to see what's up */
1243 buf = _bt_relandgetbuf(rel, buf, obknum, BT_READ);
1244 page = BufferGetPage(buf);
1245 opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1246 if (P_ISDELETED(opaque))
1249 * It was deleted. Move right to first nondeleted page (there
1250 * must be one); that is the page that has acquired the deleted
1251 * one's keyspace, so stepping left from it will take us where we
1256 if (P_RIGHTMOST(opaque))
1257 elog(ERROR, "fell off the end of \"%s\"",
1258 RelationGetRelationName(rel));
1259 blkno = opaque->btpo_next;
1260 buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);
1261 page = BufferGetPage(buf);
1262 opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1263 if (!P_ISDELETED(opaque))
1268 * Now return to top of loop, resetting obknum to point to this
1269 * nondeleted page, and try again.
1275 * It wasn't deleted; the explanation had better be that the page
1276 * to the left got split or deleted. Without this check, we'd go
1277 * into an infinite loop if there's anything wrong.
1279 if (opaque->btpo_prev == lblkno)
1280 elog(ERROR, "could not find left sibling in \"%s\"",
1281 RelationGetRelationName(rel));
1282 /* Okay to try again with new lblkno value */
1286 return InvalidBuffer;
1290 * _bt_get_endpoint() -- Find the first or last page on a given tree level
1292 * If the index is empty, we will return InvalidBuffer; any other failure
1293 * condition causes ereport(). We will not return a dead page.
1295 * The returned buffer is pinned and read-locked.
1298 _bt_get_endpoint(Relation rel, uint32 level, bool rightmost)
1302 BTPageOpaque opaque;
1303 OffsetNumber offnum;
1308 * If we are looking for a leaf page, okay to descend from fast root;
1309 * otherwise better descend from true root. (There is no point in being
1310 * smarter about intermediate levels.)
1313 buf = _bt_getroot(rel, BT_READ);
1315 buf = _bt_gettrueroot(rel);
1317 if (!BufferIsValid(buf))
1319 /* empty index... */
1320 return InvalidBuffer;
1323 page = BufferGetPage(buf);
1324 opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1329 * If we landed on a deleted page, step right to find a live page
1330 * (there must be one). Also, if we want the rightmost page, step
1331 * right if needed to get to it (this could happen if the page split
1332 * since we obtained a pointer to it).
1334 while (P_IGNORE(opaque) ||
1335 (rightmost && !P_RIGHTMOST(opaque)))
1337 blkno = opaque->btpo_next;
1338 if (blkno == P_NONE)
1339 elog(ERROR, "fell off the end of \"%s\"",
1340 RelationGetRelationName(rel));
1341 buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);
1342 page = BufferGetPage(buf);
1343 opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1347 if (opaque->btpo.level == level)
1349 if (opaque->btpo.level < level)
1350 elog(ERROR, "btree level %u not found", level);
1352 /* Descend to leftmost or rightmost child page */
1354 offnum = PageGetMaxOffsetNumber(page);
1356 offnum = P_FIRSTDATAKEY(opaque);
1358 itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
1359 blkno = ItemPointerGetBlockNumber(&(itup->t_tid));
1361 buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);
1362 page = BufferGetPage(buf);
1363 opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1370 * _bt_endpoint() -- Find the first or last page in the index, and scan
1371 * from there to the first key satisfying all the quals.
1373 * This is used by _bt_first() to set up a scan when we've determined
1374 * that the scan must start at the beginning or end of the index (for
1375 * a forward or backward scan respectively). Exit conditions are the
1376 * same as for _bt_first().
1379 _bt_endpoint(IndexScanDesc scan, ScanDirection dir)
1381 Relation rel = scan->indexRelation;
1382 BTScanOpaque so = (BTScanOpaque) scan->opaque;
1385 BTPageOpaque opaque;
1389 * Scan down to the leftmost or rightmost leaf page. This is a simplified
1390 * version of _bt_search(). We don't maintain a stack since we know we
1393 buf = _bt_get_endpoint(rel, 0, ScanDirectionIsBackward(dir));
1395 if (!BufferIsValid(buf))
1397 /* empty index... */
1398 so->currPos.buf = InvalidBuffer;
1402 page = BufferGetPage(buf);
1403 opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1404 Assert(P_ISLEAF(opaque));
1406 if (ScanDirectionIsForward(dir))
1408 /* There could be dead pages to the left, so not this: */
1409 /* Assert(P_LEFTMOST(opaque)); */
1411 start = P_FIRSTDATAKEY(opaque);
1413 else if (ScanDirectionIsBackward(dir))
1415 Assert(P_RIGHTMOST(opaque));
1417 start = PageGetMaxOffsetNumber(page);
1421 elog(ERROR, "invalid scan direction: %d", (int) dir);
1422 start = 0; /* keep compiler quiet */
1425 /* remember which buffer we have pinned */
1426 so->currPos.buf = buf;
1428 /* initialize moreLeft/moreRight appropriately for scan direction */
1429 if (ScanDirectionIsForward(dir))
1431 so->currPos.moreLeft = false;
1432 so->currPos.moreRight = true;
1436 so->currPos.moreLeft = true;
1437 so->currPos.moreRight = false;
1439 so->numKilled = 0; /* just paranoia */
1440 so->markItemIndex = -1; /* ditto */
1443 * Now load data from the first page of the scan.
1445 if (!_bt_readpage(scan, dir, start))
1448 * There's no actually-matching data on this page. Try to advance to
1449 * the next page. Return false if there's no matching data at all.
1451 if (!_bt_steppage(scan, dir))
1455 /* Drop the lock, but not pin, on the current page */
1456 LockBuffer(so->currPos.buf, BUFFER_LOCK_UNLOCK);
1458 /* OK, itemIndex says what to return */
1459 scan->xs_ctup.t_self = so->currPos.items[so->currPos.itemIndex].heapTid;