1 /*-------------------------------------------------------------------------
4 * Item insertion in Lehman and Yao btrees for Postgres.
6 * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
7 * Portions Copyright (c) 1994, Regents of the University of California
11 * $PostgreSQL: pgsql/src/backend/access/nbtree/nbtinsert.c,v 1.175 2009/12/19 01:32:32 sriggs Exp $
13 *-------------------------------------------------------------------------
18 #include "access/heapam.h"
19 #include "access/nbtree.h"
20 #include "access/transam.h"
21 #include "miscadmin.h"
22 #include "storage/bufmgr.h"
23 #include "storage/lmgr.h"
24 #include "utils/inval.h"
25 #include "utils/tqual.h"
/*
 * NOTE(review): these are the fields of the FindSplitData struct that is
 * passed to _bt_checksplitloc (see the prototype below); the "typedef
 * struct" opener and closing "} FindSplitData;" lines are elided from this
 * listing — confirm against the full source.
 */
30 /* context data for _bt_checksplitloc */
31 Size newitemsz; /* size of new item to be inserted */
32 int fillfactor; /* needed when splitting rightmost page */
33 bool is_leaf; /* T if splitting a leaf page */
34 bool is_rightmost; /* T if splitting a rightmost page */
35 OffsetNumber newitemoff; /* where the new item is to be inserted */
36 int leftspace; /* space available for items on left page */
37 int rightspace; /* space available for items on right page */
38 int olddataitemstotal; /* space taken by old items */
40 bool have_split; /* found a valid split? */
42 /* these fields valid only if have_split is true */
43 bool newitemonleft; /* new item on left or right of best split */
44 OffsetNumber firstright; /* best split point */
45 int best_delta; /* best size delta so far */
/*
 * Forward declarations for the static helpers defined later in this file.
 * (Several prototypes in this listing are missing intermediate parameter
 * lines due to elision.)
 */
49 static Buffer _bt_newroot(Relation rel, Buffer lbuf, Buffer rbuf);
51 static TransactionId _bt_check_unique(Relation rel, IndexTuple itup,
52 Relation heapRel, Buffer buf, OffsetNumber offset,
54 IndexUniqueCheck checkUnique, bool *is_unique);
55 static void _bt_findinsertloc(Relation rel,
57 OffsetNumber *offsetptr,
61 static void _bt_insertonpg(Relation rel, Buffer buf,
64 OffsetNumber newitemoff,
65 bool split_only_page);
66 static Buffer _bt_split(Relation rel, Buffer buf, OffsetNumber firstright,
67 OffsetNumber newitemoff, Size newitemsz,
68 IndexTuple newitem, bool newitemonleft);
69 static OffsetNumber _bt_findsplitloc(Relation rel, Page page,
70 OffsetNumber newitemoff,
73 static void _bt_checksplitloc(FindSplitData *state,
74 OffsetNumber firstoldonright, bool newitemonleft,
75 int dataitemstoleft, Size firstoldonrightsz);
76 static void _bt_pgaddtup(Relation rel, Page page,
77 Size itemsize, IndexTuple itup,
78 OffsetNumber itup_off, const char *where);
79 static bool _bt_isequal(TupleDesc itupdesc, Page page, OffsetNumber offnum,
80 int keysz, ScanKey scankey);
81 static void _bt_vacuum_one_page(Relation rel, Buffer buffer);
85 * _bt_doinsert() -- Handle insertion of a single index tuple in the tree.
87 * This routine is called by the public interface routines, btbuild
88 * and btinsert. By here, itup is filled in, including the TID.
90 * If checkUnique is UNIQUE_CHECK_NO or UNIQUE_CHECK_PARTIAL, this
91 * will allow duplicates. Otherwise (UNIQUE_CHECK_YES or
92 * UNIQUE_CHECK_EXISTING) it will throw error for a duplicate.
93 * For UNIQUE_CHECK_EXISTING we merely run the duplicate check, and
94 * don't actually insert.
96 * The result value is only significant for UNIQUE_CHECK_PARTIAL:
97 * it must be TRUE if the entry is known unique, else FALSE.
98 * (In the current implementation we'll also return TRUE after a
99 * successful UNIQUE_CHECK_YES or UNIQUE_CHECK_EXISTING call, but
100 * that's just a coding artifact.)
103 _bt_doinsert(Relation rel, IndexTuple itup,
104 IndexUniqueCheck checkUnique, Relation heapRel)
/*
 * NOTE(review): this listing elides many original lines (local declarations,
 * braces, and — presumably — the retry label that the "start over
 * completely" comment below refers to).  The fragments are not contiguous;
 * confirm control flow against the full source before editing.
 */
106 bool is_unique = false;
107 int natts = rel->rd_rel->relnatts;
108 ScanKey itup_scankey;
113 /* we need an insertion scan key to do our search, so build one */
114 itup_scankey = _bt_mkscankey(rel, itup);
117 /* find the first page containing this key */
118 stack = _bt_search(rel, natts, itup_scankey, false, &buf, BT_WRITE);
120 offset = InvalidOffsetNumber;
122 /* trade in our read lock for a write lock */
123 LockBuffer(buf, BUFFER_LOCK_UNLOCK);
124 LockBuffer(buf, BT_WRITE);
/*
 * There is a window here where the page is unlocked; the _bt_moveright
 * call below compensates for any concurrent split, per the comment.
 */
127 * If the page was split between the time that we surrendered our read
128 * lock and acquired our write lock, then this page may no longer be the
129 * right place for the key we want to insert. In this case, we need to
130 * move right in the tree. See Lehman and Yao for an excruciatingly
131 * precise description.
133 buf = _bt_moveright(rel, buf, natts, itup_scankey, false, BT_WRITE);
136 * If we're not allowing duplicates, make sure the key isn't already in
139 * NOTE: obviously, _bt_check_unique can only detect keys that are already
140 * in the index; so it cannot defend against concurrent insertions of the
141 * same key. We protect against that by means of holding a write lock on
142 * the target page. Any other would-be inserter of the same key must
143 * acquire a write lock on the same target page, so only one would-be
144 * inserter can be making the check at one time. Furthermore, once we are
145 * past the check we hold write locks continuously until we have performed
146 * our insertion, so no later inserter can fail to see our insertion.
147 * (This requires some care in _bt_insertonpg.)
149 * If we must wait for another xact, we release the lock while waiting,
150 * and then must start over completely.
152 * For a partial uniqueness check, we don't wait for the other xact.
153 * Just let the tuple in and return false for possibly non-unique,
154 * or true for definitely unique.
156 if (checkUnique != UNIQUE_CHECK_NO)
160 offset = _bt_binsrch(rel, buf, natts, itup_scankey, false);
161 xwait = _bt_check_unique(rel, itup, heapRel, buf, offset, itup_scankey,
162 checkUnique, &is_unique);
164 if (TransactionIdIsValid(xwait))
/* A conflicting transaction is in progress: drop our lock, wait for it,
 * then (per the comment above) start the whole search over. */
166 /* Have to wait for the other guy ... */
167 _bt_relbuf(rel, buf);
168 XactLockTableWait(xwait);
170 _bt_freestack(stack);
/* UNIQUE_CHECK_EXISTING only verifies the duplicate check; it does not
 * actually insert (see the function header comment). */
175 if (checkUnique != UNIQUE_CHECK_EXISTING)
177 /* do the insertion */
178 _bt_findinsertloc(rel, &buf, &offset, natts, itup_scankey, itup);
179 _bt_insertonpg(rel, buf, stack, itup, offset, false);
183 /* just release the buffer */
184 _bt_relbuf(rel, buf);
/* Clean up search-path stack and the scan key built above. */
188 _bt_freestack(stack);
189 _bt_freeskey(itup_scankey);
195 * _bt_check_unique() -- Check for violation of unique index constraint
197 * offset points to the first possible item that could conflict. It can
198 * also point to end-of-page, which means that the first tuple to check
199 * is the first tuple on the next page.
201 * Returns InvalidTransactionId if there is no conflict, else an xact ID
202 * we must wait for to see if it commits a conflicting tuple. If an actual
203 * conflict is detected, no return --- just ereport().
205 * However, if checkUnique == UNIQUE_CHECK_PARTIAL, we always return
206 * InvalidTransactionId because we don't want to wait. In this case we
207 * set *is_unique to false if there is a potential conflict, and the
208 * core code must redo the uniqueness check later.
211 _bt_check_unique(Relation rel, IndexTuple itup, Relation heapRel,
212 Buffer buf, OffsetNumber offset, ScanKey itup_scankey,
213 IndexUniqueCheck checkUnique, bool *is_unique)
/*
 * NOTE(review): many lines (declarations, braces, loop headers, some
 * statements) are elided from this listing, so the fragments below are not
 * contiguous.  The scan walks all index entries equal to the new key —
 * possibly crossing onto right-sibling pages via nbuf — looking for a live
 * conflicting heap tuple (see the function header comment above).
 */
215 TupleDesc itupdesc = RelationGetDescr(rel);
216 int natts = rel->rd_rel->relnatts;
217 SnapshotData SnapshotDirty;
221 Buffer nbuf = InvalidBuffer;
224 /* Assume unique until we find a duplicate */
227 InitDirtySnapshot(SnapshotDirty);
229 page = BufferGetPage(buf);
230 opaque = (BTPageOpaque) PageGetSpecialPointer(page);
231 maxoff = PageGetMaxOffsetNumber(page);
234 * Scan over all equal tuples, looking for live conflicts.
243 * make sure the offset points to an actual item before trying to
246 if (offset <= maxoff)
248 curitemid = PageGetItemId(page, offset);
251 * We can skip items that are marked killed.
253 * Formerly, we applied _bt_isequal() before checking the kill
254 * flag, so as to fall out of the item loop as soon as possible.
255 * However, in the presence of heavy update activity an index may
256 * contain many killed items with the same key; running
257 * _bt_isequal() on each killed item gets expensive. Furthermore
258 * it is likely that the non-killed version of each key appears
259 * first, so that we didn't actually get to exit any sooner
260 * anyway. So now we just advance over killed items as quickly as
261 * we can. We only apply _bt_isequal() when we get to a non-killed
262 * item or the end of the page.
264 if (!ItemIdIsDead(curitemid))
266 ItemPointerData htid;
270 * _bt_compare returns 0 for (1,NULL) and (1,NULL) - this is
271 * how we handle NULLs - and so we must not use _bt_compare
272 * for a real equality comparison, but only for ordering and
273 * finding items on pages. - vadim 03/24/97
275 if (!_bt_isequal(itupdesc, page, offset, natts, itup_scankey))
276 break; /* we're past all the equal tuples */
278 /* okay, we gotta fetch the heap tuple ... */
279 curitup = (IndexTuple) PageGetItem(page, curitemid);
280 htid = curitup->t_tid;
283 * If we are doing a recheck, we expect to find the tuple we
284 * are rechecking. It's not a duplicate, but we have to keep
287 if (checkUnique == UNIQUE_CHECK_EXISTING &&
288 ItemPointerCompare(&htid, &itup->t_tid) == 0)
294 * We check the whole HOT-chain to see if there is any tuple
295 * that satisfies SnapshotDirty. This is necessary because we
296 * have just a single index entry for the entire chain.
298 else if (heap_hot_search(&htid, heapRel, &SnapshotDirty,
304 * It is a duplicate. If we are only doing a partial
305 * check, then don't bother checking if the tuple is
306 * being updated in another transaction. Just return
307 * the fact that it is a potential conflict and leave
308 * the full check till later.
310 if (checkUnique == UNIQUE_CHECK_PARTIAL)
/* NOTE(review): *is_unique is presumably set false on this path
 * on an elided line — confirm against the full source. */
312 if (nbuf != InvalidBuffer)
313 _bt_relbuf(rel, nbuf);
315 return InvalidTransactionId;
319 * If this tuple is being updated by other transaction
320 * then we have to wait for its commit/abort.
322 xwait = (TransactionIdIsValid(SnapshotDirty.xmin)) ?
323 SnapshotDirty.xmin : SnapshotDirty.xmax;
325 if (TransactionIdIsValid(xwait))
327 if (nbuf != InvalidBuffer)
328 _bt_relbuf(rel, nbuf);
329 /* Tell _bt_doinsert to wait... */
334 * Otherwise we have a definite conflict. But before
335 * complaining, look to see if the tuple we want to insert
336 * is itself now committed dead --- if so, don't complain.
337 * This is a waste of time in normal scenarios but we must
338 * do it to support CREATE INDEX CONCURRENTLY.
340 * We must follow HOT-chains here because during
341 * concurrent index build, we insert the root TID though
342 * the actual tuple may be somewhere in the HOT-chain.
343 * While following the chain we might not stop at the
344 * exact tuple which triggered the insert, but that's OK
345 * because if we find a live tuple anywhere in this chain,
346 * we have a unique key conflict. The other live tuple is
347 * not part of this chain because it had a different index
351 if (heap_hot_search(&htid, heapRel, SnapshotSelf, NULL))
353 /* Normal case --- it's still live */
358 * It's been deleted, so no error, and no need to
365 * This is a definite conflict. Break the tuple down
366 * into datums and report the error. But first, make
367 * sure we release the buffer locks we're holding ---
368 * BuildIndexValueDescription could make catalog accesses,
369 * which in the worst case might touch this same index
370 * and cause deadlocks.
372 if (nbuf != InvalidBuffer)
373 _bt_relbuf(rel, nbuf);
374 _bt_relbuf(rel, buf);
377 Datum values[INDEX_MAX_KEYS];
378 bool isnull[INDEX_MAX_KEYS];
380 index_deform_tuple(itup, RelationGetDescr(rel),
383 (errcode(ERRCODE_UNIQUE_VIOLATION),
384 errmsg("duplicate key value violates unique constraint \"%s\"",
385 RelationGetRelationName(rel)),
386 errdetail("Key %s already exists.",
387 BuildIndexValueDescription(rel,
394 * The conflicting tuple (or whole HOT chain) is dead to
395 * everyone, so we may as well mark the index entry
398 ItemIdMarkDead(curitemid);
399 opaque->btpo_flags |= BTP_HAS_GARBAGE;
/*
 * Hint-bit-style change: dirty whichever buffer actually holds the
 * current page (nbuf if we have stepped onto a sibling, else buf).
 */
400 /* be sure to mark the proper buffer dirty... */
401 if (nbuf != InvalidBuffer)
402 SetBufferCommitInfoNeedsSave(nbuf);
404 SetBufferCommitInfoNeedsSave(buf);
410 * Advance to next tuple to continue checking.
413 offset = OffsetNumberNext(offset);
416 /* If scankey == hikey we gotta check the next page too */
417 if (P_RIGHTMOST(opaque))
419 if (!_bt_isequal(itupdesc, page, P_HIKEY,
420 natts, itup_scankey))
422 /* Advance to next non-dead page --- there must be one */
425 nblkno = opaque->btpo_next;
426 nbuf = _bt_relandgetbuf(rel, nbuf, nblkno, BT_READ);
427 page = BufferGetPage(nbuf);
428 opaque = (BTPageOpaque) PageGetSpecialPointer(page);
429 if (!P_IGNORE(opaque))
431 if (P_RIGHTMOST(opaque))
432 elog(ERROR, "fell off the end of index \"%s\"",
433 RelationGetRelationName(rel));
435 maxoff = PageGetMaxOffsetNumber(page);
436 offset = P_FIRSTDATAKEY(opaque);
441 * If we are doing a recheck then we should have found the tuple we
442 * are checking. Otherwise there's something very wrong --- probably,
443 * the index is on a non-immutable expression.
445 if (checkUnique == UNIQUE_CHECK_EXISTING && !found)
447 (errcode(ERRCODE_INTERNAL_ERROR),
448 errmsg("failed to re-find tuple within index \"%s\"",
449 RelationGetRelationName(rel)),
450 errhint("This may be because of a non-immutable index expression.")));
452 if (nbuf != InvalidBuffer)
453 _bt_relbuf(rel, nbuf);
455 return InvalidTransactionId;
460 * _bt_findinsertloc() -- Finds an insert location for a tuple
462 * If the new key is equal to one or more existing keys, we can
463 * legitimately place it anywhere in the series of equal keys --- in fact,
464 * if the new key is equal to the page's "high key" we can place it on
465 * the next page. If it is equal to the high key, and there's not room
466 * to insert the new tuple on the current page without splitting, then
467 * we can move right hoping to find more free space and avoid a split.
468 * (We should not move right indefinitely, however, since that leads to
469 * O(N^2) insertion behavior in the presence of many equal keys.)
470 * Once we have chosen the page to put the key on, we'll insert it before
471 * any existing equal keys because of the way _bt_binsrch() works.
473 * If there's not enough room in the space, we try to make room by
474 * removing any LP_DEAD tuples.
476 * On entry, *buf and *offsetptr point to the first legal position
477 * where the new tuple could be inserted. The caller should hold an
478 * exclusive lock on *buf. *offsetptr can also be set to
479 * InvalidOffsetNumber, in which case the function will search for the
480 * right location within the page if needed. On exit, they point to the
481 * chosen insert location. If _bt_findinsertloc decides to move right,
482 * the lock and pin on the original page will be released and the new
483 * page returned to the caller is exclusively locked instead.
485 * newtup is the new tuple we're inserting, and scankey is an insertion
486 * type scan key for it.
489 _bt_findinsertloc(Relation rel,
491 OffsetNumber *offsetptr,
/*
 * NOTE(review): parameter lines (bufptr, keysz, scankey, newtup) and many
 * body lines are elided from this listing; the fragments below are not
 * contiguous.  See the function header comment above for the contract:
 * on exit *bufptr/*offsetptr identify the chosen insert location, and the
 * write lock may have migrated to a right sibling.
 */
496 Buffer buf = *bufptr;
497 Page page = BufferGetPage(buf);
499 BTPageOpaque lpageop;
502 OffsetNumber newitemoff;
503 OffsetNumber firstlegaloff = *offsetptr;
505 lpageop = (BTPageOpaque) PageGetSpecialPointer(page);
507 itemsz = IndexTupleDSize(*newtup);
508 itemsz = MAXALIGN(itemsz); /* be safe, PageAddItem will do this but we
509 * need to be consistent */
512 * Check whether the item can fit on a btree page at all. (Eventually, we
513 * ought to try to apply TOAST methods if not.) We actually need to be
514 * able to fit three items on every page, so restrict any one item to 1/3
515 * the per-page available space. Note that at this point, itemsz doesn't
516 * include the ItemId.
518 if (itemsz > BTMaxItemSize(page))
520 (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
521 errmsg("index row size %lu exceeds maximum %lu for index \"%s\"",
522 (unsigned long) itemsz,
523 (unsigned long) BTMaxItemSize(page),
524 RelationGetRelationName(rel)),
525 errhint("Values larger than 1/3 of a buffer page cannot be indexed.\n"
526 "Consider a function index of an MD5 hash of the value, "
527 "or use full text indexing.")));
530 * If we will need to split the page to put the item on this page,
531 * check whether we can put the tuple somewhere to the right,
532 * instead. Keep scanning right until we
533 * (a) find a page with enough free space,
534 * (b) reach the last page where the tuple can legally go, or
535 * (c) get tired of searching.
536 * (c) is not flippant; it is important because if there are many
537 * pages' worth of equal keys, it's better to split one of the early
538 * pages than to scan all the way to the end of the run of equal keys
539 * on every insert. We implement "get tired" as a random choice,
540 * since stopping after scanning a fixed number of pages wouldn't work
541 * well (we'd never reach the right-hand side of previously split
542 * pages). Currently the probability of moving right is set at 0.99,
543 * which may seem too high to change the behavior much, but it does an
544 * excellent job of preventing O(N^2) behavior with many equal keys.
549 while (PageGetFreeSpace(page) < itemsz)
554 * before considering moving right, see if we can obtain enough space
555 * by erasing LP_DEAD items
557 if (P_ISLEAF(lpageop) && P_HAS_GARBAGE(lpageop))
559 _bt_vacuum_one_page(rel, buf);
562 * remember that we vacuumed this page, because that makes the
563 * hint supplied by the caller invalid
/* NOTE(review): the "vacuumed = true;" assignment referenced by the
 * comment above is on an elided line — confirm against the full source. */
567 if (PageGetFreeSpace(page) >= itemsz)
568 break; /* OK, now we have enough space */
572 * nope, so check conditions (b) and (c) enumerated above
574 if (P_RIGHTMOST(lpageop) ||
575 _bt_compare(rel, keysz, scankey, page, P_HIKEY) != 0 ||
576 random() <= (MAX_RANDOM_VALUE / 100))
580 * step right to next non-dead page
582 * must write-lock that page before releasing write lock on current
583 * page; else someone else's _bt_check_unique scan could fail to see
584 * our insertion. write locks on intermediate dead pages won't do
585 * because we don't know when they will get de-linked from the tree.
587 rbuf = InvalidBuffer;
591 BlockNumber rblkno = lpageop->btpo_next;
593 rbuf = _bt_relandgetbuf(rel, rbuf, rblkno, BT_WRITE);
594 page = BufferGetPage(rbuf);
595 lpageop = (BTPageOpaque) PageGetSpecialPointer(page);
596 if (!P_IGNORE(lpageop))
598 if (P_RIGHTMOST(lpageop))
599 elog(ERROR, "fell off the end of index \"%s\"",
600 RelationGetRelationName(rel));
602 _bt_relbuf(rel, buf);
609 * Now we are on the right page, so find the insert position. If we moved
610 * right at all, we know we should insert at the start of the page. If we
611 * didn't move right, we can use the firstlegaloff hint if the caller
612 * supplied one, unless we vacuumed the page which might have moved tuples
613 * around making the hint invalid. If we didn't move right or can't use
614 * the hint, find the position by searching.
617 newitemoff = P_FIRSTDATAKEY(lpageop);
618 else if (firstlegaloff != InvalidOffsetNumber && !vacuumed)
619 newitemoff = firstlegaloff;
621 newitemoff = _bt_binsrch(rel, buf, keysz, scankey, false);
/* Report the chosen position back to the caller. */
624 *offsetptr = newitemoff;
628 * _bt_insertonpg() -- Insert a tuple on a particular page in the index.
630 * This recursive procedure does the following things:
632 * + if necessary, splits the target page (making sure that the
633 * split is equitable as far as post-insert free space goes).
634 * + inserts the tuple.
635 * + if the page was split, pops the parent stack, and finds the
636 * right place to insert the new child pointer (by walking
637 * right using information stored in the parent stack).
638 * + invokes itself with the appropriate tuple for the right
639 * child page on the parent.
640 * + updates the metapage if a true root or fast root is split.
642 * On entry, we must have the right buffer in which to do the
643 * insertion, and the buffer must be pinned and write-locked. On return,
644 * we will have dropped both the pin and the lock on the buffer.
646 * The locking interactions in this code are critical. You should
647 * grok Lehman and Yao's paper before making any changes. In addition,
648 * you need to understand how we disambiguate duplicate keys in this
649 * implementation, in order to be able to find our location using
650 * L&Y "move right" operations. Since we may insert duplicate user
651 * keys, and since these dups may propagate up the tree, we use the
652 * 'afteritem' parameter to position ourselves correctly for the
653 * insertion on internal pages.
657 _bt_insertonpg(Relation rel,
661 OffsetNumber newitemoff,
662 bool split_only_page)
/*
 * NOTE(review): parameter lines (buf, stack, itup) and many body lines —
 * including braces and the if/else joining the split path to the simple
 * insert path — are elided from this listing.  See the function header
 * comment above for the overall algorithm; confirm control flow against
 * the full source before editing.
 */
665 BTPageOpaque lpageop;
666 OffsetNumber firstright = InvalidOffsetNumber;
669 page = BufferGetPage(buf);
670 lpageop = (BTPageOpaque) PageGetSpecialPointer(page);
672 itemsz = IndexTupleDSize(*itup);
673 itemsz = MAXALIGN(itemsz); /* be safe, PageAddItem will do this but we
674 * need to be consistent */
677 * Do we need to split the page to fit the item on it?
679 * Note: PageGetFreeSpace() subtracts sizeof(ItemIdData) from its result,
680 * so this comparison is correct even though we appear to be accounting
681 * only for the item and not for its line pointer.
683 if (PageGetFreeSpace(page) < itemsz)
685 bool is_root = P_ISROOT(lpageop);
686 bool is_only = P_LEFTMOST(lpageop) && P_RIGHTMOST(lpageop);
690 /* Choose the split point */
691 firstright = _bt_findsplitloc(rel, page,
695 /* split the buffer into left and right halves */
696 rbuf = _bt_split(rel, buf, firstright,
697 newitemoff, itemsz, itup, newitemonleft);
702 * + our target page has been split;
703 * + the original tuple has been inserted;
704 * + we have write locks on both the old (left half)
705 * and new (right half) buffers, after the split; and
706 * + we know the key we want to insert into the parent
707 * (it's the "high key" on the left child page).
709 * We're ready to do the parent insertion. We need to hold onto the
710 * locks for the child pages until we locate the parent, but we can
711 * release them before doing the actual insertion (see Lehman and Yao
712 * for the reasoning).
715 _bt_insert_parent(rel, buf, rbuf, stack, is_root, is_only);
/* --- No-split path: insert in place, then WAL-log --- */
719 Buffer metabuf = InvalidBuffer;
721 BTMetaPageData *metad = NULL;
722 OffsetNumber itup_off;
723 BlockNumber itup_blkno;
725 itup_off = newitemoff;
726 itup_blkno = BufferGetBlockNumber(buf);
729 * If we are doing this insert because we split a page that was the
730 * only one on its tree level, but was not the root, it may have been
731 * the "fast root". We need to ensure that the fast root link points
732 * at or above the current page. We can safely acquire a lock on the
733 * metapage here --- see comments for _bt_newroot().
/* NOTE(review): the "if (split_only_page)" guard implied by the comment
 * above is on an elided line — confirm against the full source. */
737 Assert(!P_ISLEAF(lpageop));
739 metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_WRITE);
740 metapg = BufferGetPage(metabuf);
741 metad = BTPageGetMeta(metapg);
743 if (metad->btm_fastlevel >= lpageop->btpo.level)
745 /* no update wanted */
746 _bt_relbuf(rel, metabuf);
747 metabuf = InvalidBuffer;
751 /* Do the update. No ereport(ERROR) until changes are logged */
752 START_CRIT_SECTION();
754 _bt_pgaddtup(rel, page, itemsz, itup, newitemoff, "page");
756 MarkBufferDirty(buf);
/* metabuf is valid only when the fast-root link needs raising (above). */
758 if (BufferIsValid(metabuf))
760 metad->btm_fastroot = itup_blkno;
761 metad->btm_fastlevel = lpageop->btpo.level;
762 MarkBufferDirty(metabuf);
/* --- Build the WAL record (skipped for temp relations, presumably via an
 * elided "if (!rel->rd_istemp)" as in _bt_split below — confirm). --- */
768 xl_btree_insert xlrec;
769 BlockNumber xldownlink;
770 xl_btree_metadata xlmeta;
773 XLogRecData rdata[4];
774 XLogRecData *nextrdata;
775 IndexTupleData trunctuple;
777 xlrec.target.node = rel->rd_node;
778 ItemPointerSet(&(xlrec.target.tid), itup_blkno, itup_off);
780 rdata[0].data = (char *) &xlrec;
781 rdata[0].len = SizeOfBtreeInsert;
782 rdata[0].buffer = InvalidBuffer;
783 rdata[0].next = nextrdata = &(rdata[1]);
785 if (P_ISLEAF(lpageop))
786 xlinfo = XLOG_BTREE_INSERT_LEAF;
/* Non-leaf insert: also log the downlink block number. */
789 xldownlink = ItemPointerGetBlockNumber(&(itup->t_tid));
790 Assert(ItemPointerGetOffsetNumber(&(itup->t_tid)) == P_HIKEY);
792 nextrdata->data = (char *) &xldownlink;
793 nextrdata->len = sizeof(BlockNumber);
794 nextrdata->buffer = InvalidBuffer;
795 nextrdata->next = nextrdata + 1;
798 xlinfo = XLOG_BTREE_INSERT_UPPER;
/* If the metapage was updated, append its new contents to the record. */
801 if (BufferIsValid(metabuf))
803 xlmeta.root = metad->btm_root;
804 xlmeta.level = metad->btm_level;
805 xlmeta.fastroot = metad->btm_fastroot;
806 xlmeta.fastlevel = metad->btm_fastlevel;
808 nextrdata->data = (char *) &xlmeta;
809 nextrdata->len = sizeof(xl_btree_metadata);
810 nextrdata->buffer = InvalidBuffer;
811 nextrdata->next = nextrdata + 1;
814 xlinfo = XLOG_BTREE_INSERT_META;
817 /* Read comments in _bt_pgaddtup */
818 if (!P_ISLEAF(lpageop) && newitemoff == P_FIRSTDATAKEY(lpageop))
821 trunctuple.t_info = sizeof(IndexTupleData);
822 nextrdata->data = (char *) &trunctuple;
823 nextrdata->len = sizeof(IndexTupleData);
827 nextrdata->data = (char *) itup;
828 nextrdata->len = IndexTupleDSize(*itup);
830 nextrdata->buffer = buf;
831 nextrdata->buffer_std = true;
832 nextrdata->next = NULL;
834 recptr = XLogInsert(RM_BTREE_ID, xlinfo, rdata);
836 if (BufferIsValid(metabuf))
838 PageSetLSN(metapg, recptr);
839 PageSetTLI(metapg, ThisTimeLineID);
842 PageSetLSN(page, recptr);
843 PageSetTLI(page, ThisTimeLineID);
848 /* release buffers; send out relcache inval if metapage changed */
849 if (BufferIsValid(metabuf))
852 CacheInvalidateRelcache(rel);
853 _bt_relbuf(rel, metabuf);
856 _bt_relbuf(rel, buf);
861 * _bt_split() -- split a page in the btree.
863 * On entry, buf is the page to split, and is pinned and write-locked.
864 * firstright is the item index of the first item to be moved to the
865 * new right page. newitemoff etc. tell us about the new item that
866 * must be inserted along with the data from the old page.
868 * Returns the new right sibling of buf, pinned and write-locked.
869 * The pin and lock on buf are maintained.
872 _bt_split(Relation rel, Buffer buf, OffsetNumber firstright,
873 OffsetNumber newitemoff, Size newitemsz, IndexTuple newitem,
880 BTPageOpaque ropaque,
883 Buffer sbuf = InvalidBuffer;
885 BTPageOpaque sopaque = NULL;
889 OffsetNumber leftoff,
895 rbuf = _bt_getbuf(rel, P_NEW, BT_WRITE);
896 origpage = BufferGetPage(buf);
897 leftpage = PageGetTempPage(origpage);
898 rightpage = BufferGetPage(rbuf);
900 _bt_pageinit(leftpage, BufferGetPageSize(buf));
901 /* rightpage was already initialized by _bt_getbuf */
904 * Copy the original page's LSN and TLI into leftpage, which will become
905 * the updated version of the page. We need this because XLogInsert will
906 * examine these fields and possibly dump them in a page image.
908 PageSetLSN(leftpage, PageGetLSN(origpage));
909 PageSetTLI(leftpage, PageGetTLI(origpage));
911 /* init btree private data */
912 oopaque = (BTPageOpaque) PageGetSpecialPointer(origpage);
913 lopaque = (BTPageOpaque) PageGetSpecialPointer(leftpage);
914 ropaque = (BTPageOpaque) PageGetSpecialPointer(rightpage);
916 isroot = P_ISROOT(oopaque);
918 /* if we're splitting this page, it won't be the root when we're done */
919 /* also, clear the SPLIT_END and HAS_GARBAGE flags in both pages */
920 lopaque->btpo_flags = oopaque->btpo_flags;
921 lopaque->btpo_flags &= ~(BTP_ROOT | BTP_SPLIT_END | BTP_HAS_GARBAGE);
922 ropaque->btpo_flags = lopaque->btpo_flags;
923 lopaque->btpo_prev = oopaque->btpo_prev;
924 lopaque->btpo_next = BufferGetBlockNumber(rbuf);
925 ropaque->btpo_prev = BufferGetBlockNumber(buf);
926 ropaque->btpo_next = oopaque->btpo_next;
927 lopaque->btpo.level = ropaque->btpo.level = oopaque->btpo.level;
928 /* Since we already have write-lock on both pages, ok to read cycleid */
929 lopaque->btpo_cycleid = _bt_vacuum_cycleid(rel);
930 ropaque->btpo_cycleid = lopaque->btpo_cycleid;
933 * If the page we're splitting is not the rightmost page at its level in
934 * the tree, then the first entry on the page is the high key for the
935 * page. We need to copy that to the right half. Otherwise (meaning the
936 * rightmost page case), all the items on the right half will be user
941 if (!P_RIGHTMOST(oopaque))
943 itemid = PageGetItemId(origpage, P_HIKEY);
944 itemsz = ItemIdGetLength(itemid);
945 item = (IndexTuple) PageGetItem(origpage, itemid);
946 if (PageAddItem(rightpage, (Item) item, itemsz, rightoff,
947 false, false) == InvalidOffsetNumber)
948 elog(PANIC, "failed to add hikey to the right sibling"
949 " while splitting block %u of index \"%s\"",
950 BufferGetBlockNumber(buf), RelationGetRelationName(rel));
951 rightoff = OffsetNumberNext(rightoff);
955 * The "high key" for the new left page will be the first key that's going
956 * to go into the new right page. This might be either the existing data
957 * item at position firstright, or the incoming tuple.
960 if (!newitemonleft && newitemoff == firstright)
962 /* incoming tuple will become first on right page */
968 /* existing item at firstright will become first on right page */
969 itemid = PageGetItemId(origpage, firstright);
970 itemsz = ItemIdGetLength(itemid);
971 item = (IndexTuple) PageGetItem(origpage, itemid);
973 if (PageAddItem(leftpage, (Item) item, itemsz, leftoff,
974 false, false) == InvalidOffsetNumber)
975 elog(PANIC, "failed to add hikey to the left sibling"
976 " while splitting block %u of index \"%s\"",
977 BufferGetBlockNumber(buf), RelationGetRelationName(rel));
978 leftoff = OffsetNumberNext(leftoff);
981 * Now transfer all the data items to the appropriate page.
983 * Note: we *must* insert at least the right page's items in item-number
984 * order, for the benefit of _bt_restore_page().
986 maxoff = PageGetMaxOffsetNumber(origpage);
988 for (i = P_FIRSTDATAKEY(oopaque); i <= maxoff; i = OffsetNumberNext(i))
990 itemid = PageGetItemId(origpage, i);
991 itemsz = ItemIdGetLength(itemid);
992 item = (IndexTuple) PageGetItem(origpage, itemid);
994 /* does new item belong before this one? */
999 _bt_pgaddtup(rel, leftpage, newitemsz, newitem, leftoff,
1001 leftoff = OffsetNumberNext(leftoff);
1005 _bt_pgaddtup(rel, rightpage, newitemsz, newitem, rightoff,
1007 rightoff = OffsetNumberNext(rightoff);
1011 /* decide which page to put it on */
1014 _bt_pgaddtup(rel, leftpage, itemsz, item, leftoff,
1016 leftoff = OffsetNumberNext(leftoff);
1020 _bt_pgaddtup(rel, rightpage, itemsz, item, rightoff,
1022 rightoff = OffsetNumberNext(rightoff);
1026 /* cope with possibility that newitem goes at the end */
1027 if (i <= newitemoff)
1030 * Can't have newitemonleft here; that would imply we were told to put
1031 * *everything* on the left page, which cannot fit (if it could, we'd
1032 * not be splitting the page).
1034 Assert(!newitemonleft);
1035 _bt_pgaddtup(rel, rightpage, newitemsz, newitem, rightoff,
1037 rightoff = OffsetNumberNext(rightoff);
1041 * We have to grab the right sibling (if any) and fix the prev pointer
1042 * there. We are guaranteed that this is deadlock-free since no other
1043 * writer will be holding a lock on that page and trying to move left, and
1044 * all readers release locks on a page before trying to fetch its
1048 if (!P_RIGHTMOST(ropaque))
1050 sbuf = _bt_getbuf(rel, ropaque->btpo_next, BT_WRITE);
1051 spage = BufferGetPage(sbuf);
1052 sopaque = (BTPageOpaque) PageGetSpecialPointer(spage);
1053 if (sopaque->btpo_prev != ropaque->btpo_prev)
1054 elog(PANIC, "right sibling's left-link doesn't match: "
1055 "block %u links to %u instead of expected %u in index \"%s\"",
1056 ropaque->btpo_next, sopaque->btpo_prev, ropaque->btpo_prev,
1057 RelationGetRelationName(rel));
1060 * Check to see if we can set the SPLIT_END flag in the right-hand
1061 * split page; this can save some I/O for vacuum since it need not
1062 * proceed to the right sibling. We can set the flag if the right
1063 * sibling has a different cycleid: that means it could not be part of
1064 * a group of pages that were all split off from the same ancestor
1065 * page. If you're confused, imagine that page A splits to A B and
1066 * then again, yielding A C B, while vacuum is in progress. Tuples
1067 * originally in A could now be in either B or C, hence vacuum must
1068 * examine both pages. But if D, our right sibling, has a different
1069 * cycleid then it could not contain any tuples that were in A when
1070 * the vacuum started.
1072 if (sopaque->btpo_cycleid != ropaque->btpo_cycleid)
1073 ropaque->btpo_flags |= BTP_SPLIT_END;
1077 * Right sibling is locked, new siblings are prepared, but original page
1078 * is not updated yet.
1080 * NO EREPORT(ERROR) till right sibling is updated. We can get away with
1081 * not starting the critical section till here because we haven't been
1082 * scribbling on the original page yet, and we don't care about the new
1083 * sibling until it's linked into the btree.
1085 START_CRIT_SECTION();
1088 * By here, the original data page has been split into two new halves, and
1089 * these are correct. The algorithm requires that the left page never
1090 * move during a split, so we copy the new left page back on top of the
1091 * original. Note that this is not a waste of time, since we also require
1092 * (in the page management code) that the center of a page always be
1093 * clean, and the most efficient way to guarantee this is just to compact
1094 * the data by reinserting it into a new left page. (XXX the latter
1095 * comment is probably obsolete.)
1097 * We need to do this before writing the WAL record, so that XLogInsert
1098 * can WAL log an image of the page if necessary.
1100 PageRestoreTempPage(leftpage, origpage);
1102 MarkBufferDirty(buf);
1103 MarkBufferDirty(rbuf);
1105 if (!P_RIGHTMOST(ropaque))
1107 sopaque->btpo_prev = BufferGetBlockNumber(rbuf);
1108 MarkBufferDirty(sbuf);
1112 if (!rel->rd_istemp)
1114 xl_btree_split xlrec;
1117 XLogRecData rdata[7];
1118 XLogRecData *lastrdata;
1120 xlrec.node = rel->rd_node;
1121 xlrec.leftsib = BufferGetBlockNumber(buf);
1122 xlrec.rightsib = BufferGetBlockNumber(rbuf);
1123 xlrec.rnext = ropaque->btpo_next;
1124 xlrec.level = ropaque->btpo.level;
1125 xlrec.firstright = firstright;
1127 rdata[0].data = (char *) &xlrec;
1128 rdata[0].len = SizeOfBtreeSplit;
1129 rdata[0].buffer = InvalidBuffer;
1131 lastrdata = &rdata[0];
1133 if (ropaque->btpo.level > 0)
1135 /* Log downlink on non-leaf pages */
1136 lastrdata->next = lastrdata + 1;
1139 lastrdata->data = (char *) &newitem->t_tid.ip_blkid;
1140 lastrdata->len = sizeof(BlockIdData);
1141 lastrdata->buffer = InvalidBuffer;
1144 * We must also log the left page's high key, because the right
1145 * page's leftmost key is suppressed on non-leaf levels. Show it
1146 * as belonging to the left page buffer, so that it is not stored
1147 * if XLogInsert decides it needs a full-page image of the left
1150 lastrdata->next = lastrdata + 1;
1153 itemid = PageGetItemId(origpage, P_HIKEY);
1154 item = (IndexTuple) PageGetItem(origpage, itemid);
1155 lastrdata->data = (char *) item;
1156 lastrdata->len = MAXALIGN(IndexTupleSize(item));
1157 lastrdata->buffer = buf; /* backup block 1 */
1158 lastrdata->buffer_std = true;
1162 * Log the new item and its offset, if it was inserted on the left
1163 * page. (If it was put on the right page, we don't need to explicitly
1164 * WAL log it because it's included with all the other items on the
1165 * right page.) Show the new item as belonging to the left page
1166 * buffer, so that it is not stored if XLogInsert decides it needs a
1167 * full-page image of the left page. We store the offset anyway,
1168 * though, to support archive compression of these records.
1172 lastrdata->next = lastrdata + 1;
1175 lastrdata->data = (char *) &newitemoff;
1176 lastrdata->len = sizeof(OffsetNumber);
1177 lastrdata->buffer = InvalidBuffer;
1179 lastrdata->next = lastrdata + 1;
1182 lastrdata->data = (char *) newitem;
1183 lastrdata->len = MAXALIGN(newitemsz);
1184 lastrdata->buffer = buf; /* backup block 1 */
1185 lastrdata->buffer_std = true;
1187 else if (ropaque->btpo.level == 0)
1190 * Although we don't need to WAL-log the new item, we still need
1191 * XLogInsert to consider storing a full-page image of the left
1192 * page, so make an empty entry referencing that buffer. This also
1193 * ensures that the left page is always backup block 1.
1195 lastrdata->next = lastrdata + 1;
1198 lastrdata->data = NULL;
1200 lastrdata->buffer = buf; /* backup block 1 */
1201 lastrdata->buffer_std = true;
1205 * Log the contents of the right page in the format understood by
1206 * _bt_restore_page(). We set lastrdata->buffer to InvalidBuffer,
1207 * because we're going to recreate the whole page anyway, so it should
1208 * never be stored by XLogInsert.
1210 * Direct access to page is not good but faster - we should implement
1211 * some new func in page API. Note we only store the tuples
1212 * themselves, knowing that they were inserted in item-number order
1213 * and so the item pointers can be reconstructed. See comments for
1214 * _bt_restore_page().
1216 lastrdata->next = lastrdata + 1;
1219 lastrdata->data = (char *) rightpage +
1220 ((PageHeader) rightpage)->pd_upper;
1221 lastrdata->len = ((PageHeader) rightpage)->pd_special -
1222 ((PageHeader) rightpage)->pd_upper;
1223 lastrdata->buffer = InvalidBuffer;
1225 /* Log the right sibling, because we've changed its prev-pointer. */
1226 if (!P_RIGHTMOST(ropaque))
1228 lastrdata->next = lastrdata + 1;
1231 lastrdata->data = NULL;
1233 lastrdata->buffer = sbuf; /* backup block 2 */
1234 lastrdata->buffer_std = true;
1237 lastrdata->next = NULL;
1240 xlinfo = newitemonleft ? XLOG_BTREE_SPLIT_L_ROOT : XLOG_BTREE_SPLIT_R_ROOT;
1242 xlinfo = newitemonleft ? XLOG_BTREE_SPLIT_L : XLOG_BTREE_SPLIT_R;
1244 recptr = XLogInsert(RM_BTREE_ID, xlinfo, rdata);
1246 PageSetLSN(origpage, recptr);
1247 PageSetTLI(origpage, ThisTimeLineID);
1248 PageSetLSN(rightpage, recptr);
1249 PageSetTLI(rightpage, ThisTimeLineID);
1250 if (!P_RIGHTMOST(ropaque))
1252 PageSetLSN(spage, recptr);
1253 PageSetTLI(spage, ThisTimeLineID);
1259 /* release the old right sibling */
1260 if (!P_RIGHTMOST(ropaque))
1261 _bt_relbuf(rel, sbuf);
1268 * _bt_findsplitloc() -- find an appropriate place to split a page.
1270 * The idea here is to equalize the free space that will be on each split
1271 * page, *after accounting for the inserted tuple*. (If we fail to account
1272 * for it, we might find ourselves with too little room on the page that
1273 * it needs to go into!)
1275 * If the page is the rightmost page on its level, we instead try to arrange
1276 * to leave the left split page fillfactor% full. In this way, when we are
1277 * inserting successively increasing keys (consider sequences, timestamps,
1278 * etc) we will end up with a tree whose pages are about fillfactor% full,
1279 * instead of the 50% full result that we'd get without this special case.
1280 * This is the same as nbtsort.c produces for a newly-created tree. Note
1281 * that leaf and nonleaf pages use different fillfactors.
1283 * We are passed the intended insert position of the new tuple, expressed as
1284 * the offsetnumber of the tuple it must go in front of. (This could be
1285 * maxoff+1 if the tuple is to go at the end.)
1287 * We return the index of the first existing tuple that should go on the
1288 * righthand page, plus a boolean indicating whether the new tuple goes on
1289 * the left or right page. The bool is necessary to disambiguate the case
1290 * where firstright == newitemoff.
1293 _bt_findsplitloc(Relation rel,
1295 OffsetNumber newitemoff,
1297 bool *newitemonleft)
1299 BTPageOpaque opaque;
1300 OffsetNumber offnum;
1301 OffsetNumber maxoff;
1303 FindSplitData state;
1309 bool goodenoughfound;
1311 opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1313 /* Passed-in newitemsz is MAXALIGNED but does not include line pointer */
1314 newitemsz += sizeof(ItemIdData);
1316 /* Total free space available on a btree page, after fixed overhead */
1317 leftspace = rightspace =
1318 PageGetPageSize(page) - SizeOfPageHeaderData -
1319 MAXALIGN(sizeof(BTPageOpaqueData));
1321 /* The right page will have the same high key as the old page */
1322 if (!P_RIGHTMOST(opaque))
1324 itemid = PageGetItemId(page, P_HIKEY);
1325 rightspace -= (int) (MAXALIGN(ItemIdGetLength(itemid)) +
1326 sizeof(ItemIdData));
1329 /* Count up total space in data items without actually scanning 'em */
1330 olddataitemstotal = rightspace - (int) PageGetExactFreeSpace(page);
1332 state.newitemsz = newitemsz;
1333 state.is_leaf = P_ISLEAF(opaque);
1334 state.is_rightmost = P_RIGHTMOST(opaque);
1335 state.have_split = false;
1337 state.fillfactor = RelationGetFillFactor(rel,
1338 BTREE_DEFAULT_FILLFACTOR);
1340 state.fillfactor = BTREE_NONLEAF_FILLFACTOR;
1341 state.newitemonleft = false; /* these just to keep compiler quiet */
1342 state.firstright = 0;
1343 state.best_delta = 0;
1344 state.leftspace = leftspace;
1345 state.rightspace = rightspace;
1346 state.olddataitemstotal = olddataitemstotal;
1347 state.newitemoff = newitemoff;
1350 * Finding the best possible split would require checking all the possible
1351 * split points, because of the high-key and left-key special cases.
1352 * That's probably more work than it's worth; instead, stop as soon as we
1353 * find a "good-enough" split, where good-enough is defined as an
1354 * imbalance in free space of no more than pagesize/16 (arbitrary...) This
1355 * should let us stop near the middle on most pages, instead of plowing to
1358 goodenough = leftspace / 16;
1361 * Scan through the data items and calculate space usage for a split at
1362 * each possible position.
1364 olddataitemstoleft = 0;
1365 goodenoughfound = false;
1366 maxoff = PageGetMaxOffsetNumber(page);
1368 for (offnum = P_FIRSTDATAKEY(opaque);
1370 offnum = OffsetNumberNext(offnum))
1374 itemid = PageGetItemId(page, offnum);
1375 itemsz = MAXALIGN(ItemIdGetLength(itemid)) + sizeof(ItemIdData);
1378 * Will the new item go to left or right of split?
1380 if (offnum > newitemoff)
1381 _bt_checksplitloc(&state, offnum, true,
1382 olddataitemstoleft, itemsz);
1384 else if (offnum < newitemoff)
1385 _bt_checksplitloc(&state, offnum, false,
1386 olddataitemstoleft, itemsz);
1389 /* need to try it both ways! */
1390 _bt_checksplitloc(&state, offnum, true,
1391 olddataitemstoleft, itemsz);
1393 _bt_checksplitloc(&state, offnum, false,
1394 olddataitemstoleft, itemsz);
1397 /* Abort scan once we find a good-enough choice */
1398 if (state.have_split && state.best_delta <= goodenough)
1400 goodenoughfound = true;
1404 olddataitemstoleft += itemsz;
1408 * If the new item goes as the last item, check for splitting so that all
1409 * the old items go to the left page and the new item goes to the right
1412 if (newitemoff > maxoff && !goodenoughfound)
1413 _bt_checksplitloc(&state, newitemoff, false, olddataitemstotal, 0);
1416 * I believe it is not possible to fail to find a feasible split, but just
1419 if (!state.have_split)
1420 elog(ERROR, "could not find a feasible split point for index \"%s\"",
1421 RelationGetRelationName(rel));
1423 *newitemonleft = state.newitemonleft;
1424 return state.firstright;
1428 * Subroutine to analyze a particular possible split choice (ie, firstright
1429 * and newitemonleft settings), and record the best split so far in *state.
1431 * firstoldonright is the offset of the first item on the original page
1432 * that goes to the right page, and firstoldonrightsz is the size of that
1433 * tuple. firstoldonright can be > max offset, which means that all the old
1434 * items go to the left page and only the new item goes to the right page.
1435 * In that case, firstoldonrightsz is not used.
1437 * olddataitemstoleft is the total size of all old items to the left of
1441 _bt_checksplitloc(FindSplitData *state,
1442 OffsetNumber firstoldonright,
1444 int olddataitemstoleft,
1445 Size firstoldonrightsz)
1449 Size firstrightitemsz;
1450 bool newitemisfirstonright;
1452 /* Is the new item going to be the first item on the right page? */
1453 newitemisfirstonright = (firstoldonright == state->newitemoff
1456 if (newitemisfirstonright)
1457 firstrightitemsz = state->newitemsz;
1459 firstrightitemsz = firstoldonrightsz;
1461 /* Account for all the old tuples */
1462 leftfree = state->leftspace - olddataitemstoleft;
1463 rightfree = state->rightspace -
1464 (state->olddataitemstotal - olddataitemstoleft);
1467 * The first item on the right page becomes the high key of the left page;
1468 * therefore it counts against left space as well as right space.
1470 leftfree -= firstrightitemsz;
1472 /* account for the new item */
1474 leftfree -= (int) state->newitemsz;
1476 rightfree -= (int) state->newitemsz;
1479 * If we are not on the leaf level, we will be able to discard the key
1480 * data from the first item that winds up on the right page.
1482 if (!state->is_leaf)
1483 rightfree += (int) firstrightitemsz -
1484 (int) (MAXALIGN(sizeof(IndexTupleData)) + sizeof(ItemIdData));
1487 * If feasible split point, remember best delta.
1489 if (leftfree >= 0 && rightfree >= 0)
1493 if (state->is_rightmost)
1496 * If splitting a rightmost page, try to put (100-fillfactor)% of
1497 * free space on left page. See comments for _bt_findsplitloc.
1499 delta = (state->fillfactor * leftfree)
1500 - ((100 - state->fillfactor) * rightfree);
1504 /* Otherwise, aim for equal free space on both sides */
1505 delta = leftfree - rightfree;
1510 if (!state->have_split || delta < state->best_delta)
1512 state->have_split = true;
1513 state->newitemonleft = newitemonleft;
1514 state->firstright = firstoldonright;
1515 state->best_delta = delta;
1521 * _bt_insert_parent() -- Insert downlink into parent after a page split.
1523 * On entry, buf and rbuf are the left and right split pages, which we
1524 * still hold write locks on per the L&Y algorithm. We release the
1525 * write locks once we have write lock on the parent page. (Any sooner,
1526 * and it'd be possible for some other process to try to split or delete
1527 * one of these pages, and get confused because it cannot find the downlink.)
1529 * stack - stack showing how we got here. May be NULL in cases that don't
1530 * have to be efficient (concurrent ROOT split, WAL recovery)
1531 * is_root - we split the true root
1532 * is_only - we split a page alone on its level (might have been fast root)
1534 * This is exported so it can be called by nbtxlog.c.
1537 _bt_insert_parent(Relation rel,
1545 * Here we have to do something Lehman and Yao don't talk about: deal with
1546 * a root split and construction of a new root. If our stack is empty
1547 * then we have just split a node on what had been the root level when we
1548 * descended the tree. If it was still the root then we perform a
1549 * new-root construction. If it *wasn't* the root anymore, search to find
1550 * the next higher level that someone constructed meanwhile, and find the
1551 * right place to insert as for the normal case.
1553 * If we have to search for the parent level, we do so by re-descending
1554 * from the root. This is not super-efficient, but it's rare enough not
1555 * to matter. (This path is also taken when called from WAL recovery ---
1556 * we have no stack in that case.)
1562 Assert(stack == NULL);
1564 /* create a new root node and update the metapage */
1565 rootbuf = _bt_newroot(rel, buf, rbuf);
1566 /* release the split buffers */
1567 _bt_relbuf(rel, rootbuf);
1568 _bt_relbuf(rel, rbuf);
1569 _bt_relbuf(rel, buf);
1573 BlockNumber bknum = BufferGetBlockNumber(buf);
1574 BlockNumber rbknum = BufferGetBlockNumber(rbuf);
1575 Page page = BufferGetPage(buf);
1576 IndexTuple new_item;
1577 BTStackData fakestack;
1583 BTPageOpaque lpageop;
1586 elog(DEBUG2, "concurrent ROOT page split");
1587 lpageop = (BTPageOpaque) PageGetSpecialPointer(page);
1588 /* Find the leftmost page at the next level up */
1589 pbuf = _bt_get_endpoint(rel, lpageop->btpo.level + 1, false);
1590 /* Set up a phony stack entry pointing there */
1592 stack->bts_blkno = BufferGetBlockNumber(pbuf);
1593 stack->bts_offset = InvalidOffsetNumber;
1594 /* bts_btentry will be initialized below */
1595 stack->bts_parent = NULL;
1596 _bt_relbuf(rel, pbuf);
1599 /* get high key from left page == lowest key on new right page */
1600 ritem = (IndexTuple) PageGetItem(page,
1601 PageGetItemId(page, P_HIKEY));
1603 /* form an index tuple that points at the new right page */
1604 new_item = CopyIndexTuple(ritem);
1605 ItemPointerSet(&(new_item->t_tid), rbknum, P_HIKEY);
1608 * Find the parent buffer and get the parent page.
1610 * Oops - if we were moved right then we need to change stack item! We
1611 * want to find parent pointing to where we are, right ? - vadim
1614 ItemPointerSet(&(stack->bts_btentry.t_tid), bknum, P_HIKEY);
1616 pbuf = _bt_getstackbuf(rel, stack, BT_WRITE);
1618 /* Now we can unlock the children */
1619 _bt_relbuf(rel, rbuf);
1620 _bt_relbuf(rel, buf);
1622 /* Check for error only after writing children */
1623 if (pbuf == InvalidBuffer)
1624 elog(ERROR, "failed to re-find parent key in index \"%s\" for split pages %u/%u",
1625 RelationGetRelationName(rel), bknum, rbknum);
1627 /* Recursively update the parent */
1628 _bt_insertonpg(rel, pbuf, stack->bts_parent,
1629 new_item, stack->bts_offset + 1,
1638 * _bt_getstackbuf() -- Walk back up the tree one step, and find the item
1639 * we last looked at in the parent.
1641 * This is possible because we save the downlink from the parent item,
1642 * which is enough to uniquely identify it. Insertions into the parent
1643 * level could cause the item to move right; deletions could cause it
1644 * to move left, but not left of the page we previously found it in.
1646 * Adjusts bts_blkno & bts_offset if changed.
1648 * Returns InvalidBuffer if item not found (should not happen).
1651 _bt_getstackbuf(Relation rel, BTStack stack, int access)
1656 blkno = stack->bts_blkno;
1657 start = stack->bts_offset;
1663 BTPageOpaque opaque;
1665 buf = _bt_getbuf(rel, blkno, access);
1666 page = BufferGetPage(buf);
1667 opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1669 if (!P_IGNORE(opaque))
1671 OffsetNumber offnum,
1677 minoff = P_FIRSTDATAKEY(opaque);
1678 maxoff = PageGetMaxOffsetNumber(page);
1681 * start = InvalidOffsetNumber means "search the whole page". We
1682 * need this test anyway due to possibility that page has a high
1683 * key now when it didn't before.
1689 * Need this check too, to guard against possibility that page
1690 * split since we visited it originally.
1693 start = OffsetNumberNext(maxoff);
1696 * These loops will check every item on the page --- but in an
1697 * order that's attuned to the probability of where it actually
1698 * is. Scan to the right first, then to the left.
1700 for (offnum = start;
1702 offnum = OffsetNumberNext(offnum))
1704 itemid = PageGetItemId(page, offnum);
1705 item = (IndexTuple) PageGetItem(page, itemid);
1706 if (BTEntrySame(item, &stack->bts_btentry))
1708 /* Return accurate pointer to where link is now */
1709 stack->bts_blkno = blkno;
1710 stack->bts_offset = offnum;
1715 for (offnum = OffsetNumberPrev(start);
1717 offnum = OffsetNumberPrev(offnum))
1719 itemid = PageGetItemId(page, offnum);
1720 item = (IndexTuple) PageGetItem(page, itemid);
1721 if (BTEntrySame(item, &stack->bts_btentry))
1723 /* Return accurate pointer to where link is now */
1724 stack->bts_blkno = blkno;
1725 stack->bts_offset = offnum;
1732 * The item we're looking for moved right at least one page.
1734 if (P_RIGHTMOST(opaque))
1736 _bt_relbuf(rel, buf);
1737 return InvalidBuffer;
1739 blkno = opaque->btpo_next;
1740 start = InvalidOffsetNumber;
1741 _bt_relbuf(rel, buf);
1746 * _bt_newroot() -- Create a new root page for the index.
1748 * We've just split the old root page and need to create a new one.
1749 * In order to do this, we add a new root page to the file, then lock
1750 * the metadata page and update it. This is guaranteed to be deadlock-
1751 * free, because all readers release their locks on the metadata page
1752 * before trying to lock the root, and all writers lock the root before
1753 * trying to lock the metadata page. We have a write lock on the old
1754 * root page, so we have not introduced any cycles into the waits-for
1757 * On entry, lbuf (the old root) and rbuf (its new peer) are write-
1758 * locked. On exit, a new root page exists with entries for the
1759 * two new children, metapage is updated and unlocked/unpinned.
1760 * The new root buffer is returned to caller which has to unlock/unpin
1761 * lbuf, rbuf & rootbuf.
1764 _bt_newroot(Relation rel, Buffer lbuf, Buffer rbuf)
1771 BlockNumber rootblknum;
1772 BTPageOpaque rootopaque;
1776 IndexTuple new_item;
1779 BTMetaPageData *metad;
1781 lbkno = BufferGetBlockNumber(lbuf);
1782 rbkno = BufferGetBlockNumber(rbuf);
1783 lpage = BufferGetPage(lbuf);
1785 /* get a new root page */
1786 rootbuf = _bt_getbuf(rel, P_NEW, BT_WRITE);
1787 rootpage = BufferGetPage(rootbuf);
1788 rootblknum = BufferGetBlockNumber(rootbuf);
1790 /* acquire lock on the metapage */
1791 metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_WRITE);
1792 metapg = BufferGetPage(metabuf);
1793 metad = BTPageGetMeta(metapg);
1795 /* NO EREPORT(ERROR) from here till newroot op is logged */
1796 START_CRIT_SECTION();
1798 /* set btree special data */
1799 rootopaque = (BTPageOpaque) PageGetSpecialPointer(rootpage);
1800 rootopaque->btpo_prev = rootopaque->btpo_next = P_NONE;
1801 rootopaque->btpo_flags = BTP_ROOT;
1802 rootopaque->btpo.level =
1803 ((BTPageOpaque) PageGetSpecialPointer(lpage))->btpo.level + 1;
1804 rootopaque->btpo_cycleid = 0;
1806 /* update metapage data */
1807 metad->btm_root = rootblknum;
1808 metad->btm_level = rootopaque->btpo.level;
1809 metad->btm_fastroot = rootblknum;
1810 metad->btm_fastlevel = rootopaque->btpo.level;
1813 * Create downlink item for left page (old root). Since this will be the
1814 * first item in a non-leaf page, it implicitly has minus-infinity key
1815 * value, so we need not store any actual key in it.
1817 itemsz = sizeof(IndexTupleData);
1818 new_item = (IndexTuple) palloc(itemsz);
1819 new_item->t_info = itemsz;
1820 ItemPointerSet(&(new_item->t_tid), lbkno, P_HIKEY);
1823 * Insert the left page pointer into the new root page. The root page is
1824 * the rightmost page on its level so there is no "high key" in it; the
1825 * two items will go into positions P_HIKEY and P_FIRSTKEY.
1827 * Note: we *must* insert the two items in item-number order, for the
1828 * benefit of _bt_restore_page().
1830 if (PageAddItem(rootpage, (Item) new_item, itemsz, P_HIKEY,
1831 false, false) == InvalidOffsetNumber)
1832 elog(PANIC, "failed to add leftkey to new root page"
1833 " while splitting block %u of index \"%s\"",
1834 BufferGetBlockNumber(lbuf), RelationGetRelationName(rel));
1838 * Create downlink item for right page. The key for it is obtained from
1839 * the "high key" position in the left page.
1841 itemid = PageGetItemId(lpage, P_HIKEY);
1842 itemsz = ItemIdGetLength(itemid);
1843 item = (IndexTuple) PageGetItem(lpage, itemid);
1844 new_item = CopyIndexTuple(item);
1845 ItemPointerSet(&(new_item->t_tid), rbkno, P_HIKEY);
1848 * insert the right page pointer into the new root page.
1850 if (PageAddItem(rootpage, (Item) new_item, itemsz, P_FIRSTKEY,
1851 false, false) == InvalidOffsetNumber)
1852 elog(PANIC, "failed to add rightkey to new root page"
1853 " while splitting block %u of index \"%s\"",
1854 BufferGetBlockNumber(lbuf), RelationGetRelationName(rel));
1857 MarkBufferDirty(rootbuf);
1858 MarkBufferDirty(metabuf);
1861 if (!rel->rd_istemp)
1863 xl_btree_newroot xlrec;
1865 XLogRecData rdata[2];
1867 xlrec.node = rel->rd_node;
1868 xlrec.rootblk = rootblknum;
1869 xlrec.level = metad->btm_level;
1871 rdata[0].data = (char *) &xlrec;
1872 rdata[0].len = SizeOfBtreeNewroot;
1873 rdata[0].buffer = InvalidBuffer;
1874 rdata[0].next = &(rdata[1]);
1877 * Direct access to page is not good but faster - we should implement
1878 * some new func in page API.
1880 rdata[1].data = (char *) rootpage + ((PageHeader) rootpage)->pd_upper;
1881 rdata[1].len = ((PageHeader) rootpage)->pd_special -
1882 ((PageHeader) rootpage)->pd_upper;
1883 rdata[1].buffer = InvalidBuffer;
1884 rdata[1].next = NULL;
1886 recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_NEWROOT, rdata);
1888 PageSetLSN(rootpage, recptr);
1889 PageSetTLI(rootpage, ThisTimeLineID);
1890 PageSetLSN(metapg, recptr);
1891 PageSetTLI(metapg, ThisTimeLineID);
1896 /* send out relcache inval for metapage change */
1898 CacheInvalidateRelcache(rel);
1900 /* done with metapage */
1901 _bt_relbuf(rel, metabuf);
1907 * _bt_pgaddtup() -- add a tuple to a particular page in the index.
1909 * This routine adds the tuple to the page as requested. It does
1910 * not affect pin/lock status, but you'd better have a write lock
1911 * and pin on the target buffer! Don't forget to write and release
1912 * the buffer afterwards, either.
1914 * The main difference between this routine and a bare PageAddItem call
1915 * is that this code knows that the leftmost index tuple on a non-leaf
1916 * btree page doesn't need to have a key. Therefore, it strips such
1917 * tuples down to just the tuple header. CAUTION: this works ONLY if
1918 * we insert the tuples in order, so that the given itup_off does
1919 * represent the final position of the tuple!
1922 _bt_pgaddtup(Relation rel,
1926 OffsetNumber itup_off,
1929 BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1930 IndexTupleData trunctuple;
1932 if (!P_ISLEAF(opaque) && itup_off == P_FIRSTDATAKEY(opaque))
1935 trunctuple.t_info = sizeof(IndexTupleData);
1937 itemsize = sizeof(IndexTupleData);
1940 if (PageAddItem(page, (Item) itup, itemsize, itup_off,
1941 false, false) == InvalidOffsetNumber)
1942 elog(PANIC, "failed to add item to the %s in index \"%s\"",
1943 where, RelationGetRelationName(rel));
1947 * _bt_isequal - used in _bt_doinsert in check for duplicates.
1949 * This is very similar to _bt_compare, except for NULL handling.
1950 * Rule is simple: NOT_NULL not equal NULL, NULL not equal NULL too.
1953 _bt_isequal(TupleDesc itupdesc, Page page, OffsetNumber offnum,
1954 int keysz, ScanKey scankey)
1959 /* Better be comparing to a leaf item */
1960 Assert(P_ISLEAF((BTPageOpaque) PageGetSpecialPointer(page)));
1962 itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
1964 for (i = 1; i <= keysz; i++)
1971 attno = scankey->sk_attno;
1973 datum = index_getattr(itup, attno, itupdesc, &isNull);
1975 /* NULLs are never equal to anything */
1976 if (isNull || (scankey->sk_flags & SK_ISNULL))
1979 result = DatumGetInt32(FunctionCall2(&scankey->sk_func,
1981 scankey->sk_argument));
1989 /* if we get here, the keys are equal */
1994 * _bt_vacuum_one_page - vacuum just one index page.
1996 * Try to remove LP_DEAD items from the given page. The passed buffer
1997 * must be exclusive-locked, but unlike a real VACUUM, we don't need a
1998 * super-exclusive "cleanup" lock (see nbtree/README).
2001 _bt_vacuum_one_page(Relation rel, Buffer buffer)
2003 OffsetNumber deletable[MaxOffsetNumber];
2005 OffsetNumber offnum,
2008 Page page = BufferGetPage(buffer);
2009 BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page);
2012 * Scan over all items to see which ones need to be deleted according to
2015 minoff = P_FIRSTDATAKEY(opaque);
2016 maxoff = PageGetMaxOffsetNumber(page);
2017 for (offnum = minoff;
2019 offnum = OffsetNumberNext(offnum))
2021 ItemId itemId = PageGetItemId(page, offnum);
2023 if (ItemIdIsDead(itemId))
2024 deletable[ndeletable++] = offnum;
2028 _bt_delitems(rel, buffer, deletable, ndeletable, false, 0);
2031 * Note: if we didn't find any LP_DEAD items, then the page's
2032 * BTP_HAS_GARBAGE hint bit is falsely set. We do not bother expending a
2033 * separate write to clear it, however. We will clear it when we split