1 /*-------------------------------------------------------------------------
4 * Item insertion in Lehman and Yao btrees for Postgres.
6 * Portions Copyright (c) 1996-2005, PostgreSQL Global Development Group
7 * Portions Copyright (c) 1994, Regents of the University of California
11 * $PostgreSQL: pgsql/src/backend/access/nbtree/nbtinsert.c,v 1.129 2005/11/22 18:17:06 momjian Exp $
13 *-------------------------------------------------------------------------
18 #include "access/heapam.h"
19 #include "access/nbtree.h"
20 #include "miscadmin.h"
25 /* context data for _bt_checksplitloc */
26 Size newitemsz; /* size of new item to be inserted */
27 bool is_leaf; /* T if splitting a leaf page */
28 bool is_rightmost; /* T if splitting a rightmost page */
30 bool have_split; /* found a valid split? */
32 /* these fields valid only if have_split is true */
33 bool newitemonleft; /* new item on left or right of best split */
34 OffsetNumber firstright; /* best split point */
35 int best_delta; /* best size delta so far */
39 static Buffer _bt_newroot(Relation rel, Buffer lbuf, Buffer rbuf);
41 static TransactionId _bt_check_unique(Relation rel, BTItem btitem,
42 Relation heapRel, Buffer buf,
43 ScanKey itup_scankey);
44 static void _bt_insertonpg(Relation rel, Buffer buf,
46 int keysz, ScanKey scankey,
48 OffsetNumber afteritem,
49 bool split_only_page);
50 static Buffer _bt_split(Relation rel, Buffer buf, OffsetNumber firstright,
51 OffsetNumber newitemoff, Size newitemsz,
52 BTItem newitem, bool newitemonleft);
53 static OffsetNumber _bt_findsplitloc(Relation rel, Page page,
54 OffsetNumber newitemoff,
57 static void _bt_checksplitloc(FindSplitData *state, OffsetNumber firstright,
58 int leftfree, int rightfree,
59 bool newitemonleft, Size firstrightitemsz);
60 static void _bt_pgaddtup(Relation rel, Page page,
61 Size itemsize, BTItem btitem,
62 OffsetNumber itup_off, const char *where);
63 static bool _bt_isequal(TupleDesc itupdesc, Page page, OffsetNumber offnum,
64 int keysz, ScanKey scankey);
68 * _bt_doinsert() -- Handle insertion of a single btitem in the tree.
70 * This routine is called by the public interface routines, btbuild
71 * and btinsert. By here, btitem is filled in, including the TID.
74 _bt_doinsert(Relation rel, BTItem btitem,
75 bool index_is_unique, Relation heapRel)
77 IndexTuple itup = &(btitem->bti_itup);
78 int natts = rel->rd_rel->relnatts;
83 /* we need a scan key to do our search, so build one */
84 itup_scankey = _bt_mkscankey(rel, itup);
87 /* find the first page containing this key */
88 stack = _bt_search(rel, natts, itup_scankey, false, &buf, BT_WRITE);
90 /* trade in our read lock for a write lock */
91 LockBuffer(buf, BUFFER_LOCK_UNLOCK);
92 LockBuffer(buf, BT_WRITE);
95 * If the page was split between the time that we surrendered our read
96 * lock and acquired our write lock, then this page may no longer be the
97 * right place for the key we want to insert. In this case, we need to
98 * move right in the tree. See Lehman and Yao for an excruciatingly
99 * precise description.
101 buf = _bt_moveright(rel, buf, natts, itup_scankey, false, BT_WRITE);
104 * If we're not allowing duplicates, make sure the key isn't already in
107 * NOTE: obviously, _bt_check_unique can only detect keys that are already
108 * in the index; so it cannot defend against concurrent insertions of the
109 * same key. We protect against that by means of holding a write lock on
110 * the target page. Any other would-be inserter of the same key must
111 * acquire a write lock on the same target page, so only one would-be
112 * inserter can be making the check at one time. Furthermore, once we are
113 * past the check we hold write locks continuously until we have performed
114 * our insertion, so no later inserter can fail to see our insertion.
115 * (This requires some care in _bt_insertonpg.)
117 * If we must wait for another xact, we release the lock while waiting,
118 * and then must start over completely.
124 xwait = _bt_check_unique(rel, btitem, heapRel, buf, itup_scankey);
126 if (TransactionIdIsValid(xwait))
128 /* Have to wait for the other guy ... */
129 _bt_relbuf(rel, buf);
130 XactLockTableWait(xwait);
132 _bt_freestack(stack);
137 /* do the insertion */
138 _bt_insertonpg(rel, buf, stack, natts, itup_scankey, btitem, 0, false);
141 _bt_freestack(stack);
142 _bt_freeskey(itup_scankey);
146 * _bt_check_unique() -- Check for violation of unique index constraint
148 * Returns InvalidTransactionId if there is no conflict, else an xact ID
149 * we must wait for to see if it commits a conflicting tuple. If an actual
150 * conflict is detected, no return --- just ereport().
153 _bt_check_unique(Relation rel, BTItem btitem, Relation heapRel,
154 Buffer buf, ScanKey itup_scankey)
156 TupleDesc itupdesc = RelationGetDescr(rel);
157 int natts = rel->rd_rel->relnatts;
162 Buffer nbuf = InvalidBuffer;
164 page = BufferGetPage(buf);
165 opaque = (BTPageOpaque) PageGetSpecialPointer(page);
166 maxoff = PageGetMaxOffsetNumber(page);
169 * Find first item >= proposed new item. Note we could also get a pointer
170 * to end-of-page here.
172 offset = _bt_binsrch(rel, buf, natts, itup_scankey, false);
175 * Scan over all equal tuples, looking for live conflicts.
186 * make sure the offset points to an actual item before trying to
189 if (offset <= maxoff)
191 curitemid = PageGetItemId(page, offset);
194 * We can skip items that are marked killed.
196 * Formerly, we applied _bt_isequal() before checking the kill
197 * flag, so as to fall out of the item loop as soon as possible.
198 * However, in the presence of heavy update activity an index may
199 * contain many killed items with the same key; running
200 * _bt_isequal() on each killed item gets expensive. Furthermore
201 * it is likely that the non-killed version of each key appears
202 * first, so that we didn't actually get to exit any sooner
203 * anyway. So now we just advance over killed items as quickly as
204 * we can. We only apply _bt_isequal() when we get to a non-killed
205 * item or the end of the page.
207 if (!ItemIdDeleted(curitemid))
210 * _bt_compare returns 0 for (1,NULL) and (1,NULL) - this's
211 * how we handling NULLs - and so we must not use _bt_compare
212 * in real comparison, but only for ordering/finding items on
213 * pages. - vadim 03/24/97
215 if (!_bt_isequal(itupdesc, page, offset, natts, itup_scankey))
216 break; /* we're past all the equal tuples */
218 /* okay, we gotta fetch the heap tuple ... */
219 cbti = (BTItem) PageGetItem(page, curitemid);
220 htup.t_self = cbti->bti_itup.t_tid;
221 if (heap_fetch(heapRel, SnapshotDirty, &htup, &hbuffer,
224 /* it is a duplicate */
225 TransactionId xwait =
226 (TransactionIdIsValid(SnapshotDirty->xmin)) ?
227 SnapshotDirty->xmin : SnapshotDirty->xmax;
229 ReleaseBuffer(hbuffer);
232 * If this tuple is being updated by other transaction
233 * then we have to wait for its commit/abort.
235 if (TransactionIdIsValid(xwait))
237 if (nbuf != InvalidBuffer)
238 _bt_relbuf(rel, nbuf);
239 /* Tell _bt_doinsert to wait... */
244 * Otherwise we have a definite conflict.
247 (errcode(ERRCODE_UNIQUE_VIOLATION),
248 errmsg("duplicate key violates unique constraint \"%s\"",
249 RelationGetRelationName(rel))));
251 else if (htup.t_data != NULL)
254 * Hmm, if we can't see the tuple, maybe it can be marked
255 * killed. This logic should match index_getnext and
258 LockBuffer(hbuffer, BUFFER_LOCK_SHARE);
259 if (HeapTupleSatisfiesVacuum(htup.t_data, RecentGlobalXmin,
260 hbuffer) == HEAPTUPLE_DEAD)
262 curitemid->lp_flags |= LP_DELETE;
263 if (nbuf != InvalidBuffer)
264 SetBufferCommitInfoNeedsSave(nbuf);
266 SetBufferCommitInfoNeedsSave(buf);
268 LockBuffer(hbuffer, BUFFER_LOCK_UNLOCK);
270 ReleaseBuffer(hbuffer);
275 * Advance to next tuple to continue checking.
278 offset = OffsetNumberNext(offset);
281 /* If scankey == hikey we gotta check the next page too */
282 if (P_RIGHTMOST(opaque))
284 if (!_bt_isequal(itupdesc, page, P_HIKEY,
285 natts, itup_scankey))
287 /* Advance to next non-dead page --- there must be one */
290 nblkno = opaque->btpo_next;
291 nbuf = _bt_relandgetbuf(rel, nbuf, nblkno, BT_READ);
292 page = BufferGetPage(nbuf);
293 opaque = (BTPageOpaque) PageGetSpecialPointer(page);
294 if (!P_IGNORE(opaque))
296 if (P_RIGHTMOST(opaque))
297 elog(ERROR, "fell off the end of \"%s\"",
298 RelationGetRelationName(rel));
300 maxoff = PageGetMaxOffsetNumber(page);
301 offset = P_FIRSTDATAKEY(opaque);
305 if (nbuf != InvalidBuffer)
306 _bt_relbuf(rel, nbuf);
308 return InvalidTransactionId;
312 * _bt_insertonpg() -- Insert a tuple on a particular page in the index.
314 * This recursive procedure does the following things:
316 * + finds the right place to insert the tuple.
317 * + if necessary, splits the target page (making sure that the
318 * split is equitable as far as post-insert free space goes).
319 * + inserts the tuple.
320 * + if the page was split, pops the parent stack, and finds the
321 * right place to insert the new child pointer (by walking
322 * right using information stored in the parent stack).
323 * + invokes itself with the appropriate tuple for the right
324 * child page on the parent.
325 * + updates the metapage if a true root or fast root is split.
327 * On entry, we must have the right buffer on which to do the
328 * insertion, and the buffer must be pinned and locked. On return,
329 * we will have dropped both the pin and the write lock on the buffer.
331 * If 'afteritem' is >0 then the new tuple must be inserted after the
332 * existing item of that number, noplace else. If 'afteritem' is 0
333 * then the procedure finds the exact spot to insert it by searching.
334 * (keysz and scankey parameters are used ONLY if afteritem == 0.)
336 * NOTE: if the new key is equal to one or more existing keys, we can
337 * legitimately place it anywhere in the series of equal keys --- in fact,
338 * if the new key is equal to the page's "high key" we can place it on
339 * the next page. If it is equal to the high key, and there's not room
340 * to insert the new tuple on the current page without splitting, then
341 * we can move right hoping to find more free space and avoid a split.
342 * (We should not move right indefinitely, however, since that leads to
343 * O(N^2) insertion behavior in the presence of many equal keys.)
344 * Once we have chosen the page to put the key on, we'll insert it before
345 * any existing equal keys because of the way _bt_binsrch() works.
347 * The locking interactions in this code are critical. You should
348 * grok Lehman and Yao's paper before making any changes. In addition,
349 * you need to understand how we disambiguate duplicate keys in this
350 * implementation, in order to be able to find our location using
351 * L&Y "move right" operations. Since we may insert duplicate user
352 * keys, and since these dups may propagate up the tree, we use the
353 * 'afteritem' parameter to position ourselves correctly for the
354 * insertion on internal pages.
358 _bt_insertonpg(Relation rel,
364 OffsetNumber afteritem,
365 bool split_only_page)
368 BTPageOpaque lpageop;
369 OffsetNumber newitemoff;
370 OffsetNumber firstright = InvalidOffsetNumber;
373 page = BufferGetPage(buf);
374 lpageop = (BTPageOpaque) PageGetSpecialPointer(page);
376 itemsz = IndexTupleDSize(btitem->bti_itup)
377 + (sizeof(BTItemData) - sizeof(IndexTupleData));
379 itemsz = MAXALIGN(itemsz); /* be safe, PageAddItem will do this but we
380 * need to be consistent */
383 * Check whether the item can fit on a btree page at all. (Eventually, we
384 * ought to try to apply TOAST methods if not.) We actually need to be
385 * able to fit three items on every page, so restrict any one item to 1/3
386 * the per-page available space. Note that at this point, itemsz doesn't
387 * include the ItemId.
389 if (itemsz > BTMaxItemSize(page))
391 (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
392 errmsg("index row size %lu exceeds btree maximum, %lu",
393 (unsigned long) itemsz,
394 (unsigned long) BTMaxItemSize(page)),
395 errhint("Values larger than 1/3 of a buffer page cannot be indexed.\n"
396 "Consider a function index of an MD5 hash of the value, "
397 "or use full text indexing.")));
400 * Determine exactly where new item will go.
403 newitemoff = afteritem + 1;
407 * If we will need to split the page to put the item here,
408 * check whether we can put the tuple somewhere to the right,
409 * instead. Keep scanning right until we
410 * (a) find a page with enough free space,
411 * (b) reach the last page where the tuple can legally go, or
412 * (c) get tired of searching.
413 * (c) is not flippant; it is important because if there are many
414 * pages' worth of equal keys, it's better to split one of the early
415 * pages than to scan all the way to the end of the run of equal keys
416 * on every insert. We implement "get tired" as a random choice,
417 * since stopping after scanning a fixed number of pages wouldn't work
418 * well (we'd never reach the right-hand side of previously split
419 * pages). Currently the probability of moving right is set at 0.99,
420 * which may seem too high to change the behavior much, but it does an
421 * excellent job of preventing O(N^2) behavior with many equal keys.
424 bool movedright = false;
426 while (PageGetFreeSpace(page) < itemsz &&
427 !P_RIGHTMOST(lpageop) &&
428 _bt_compare(rel, keysz, scankey, page, P_HIKEY) == 0 &&
429 random() > (MAX_RANDOM_VALUE / 100))
432 * step right to next non-dead page
434 * must write-lock that page before releasing write lock on
435 * current page; else someone else's _bt_check_unique scan could
436 * fail to see our insertion. write locks on intermediate dead
437 * pages won't do because we don't know when they will get
438 * de-linked from the tree.
440 Buffer rbuf = InvalidBuffer;
444 BlockNumber rblkno = lpageop->btpo_next;
446 rbuf = _bt_relandgetbuf(rel, rbuf, rblkno, BT_WRITE);
447 page = BufferGetPage(rbuf);
448 lpageop = (BTPageOpaque) PageGetSpecialPointer(page);
449 if (!P_IGNORE(lpageop))
451 if (P_RIGHTMOST(lpageop))
452 elog(ERROR, "fell off the end of \"%s\"",
453 RelationGetRelationName(rel));
455 _bt_relbuf(rel, buf);
461 * Now we are on the right page, so find the insert position. If we
462 * moved right at all, we know we should insert at the start of the
463 * page, else must find the position by searching.
466 newitemoff = P_FIRSTDATAKEY(lpageop);
468 newitemoff = _bt_binsrch(rel, buf, keysz, scankey, false);
472 * Do we need to split the page to fit the item on it?
474 * Note: PageGetFreeSpace() subtracts sizeof(ItemIdData) from its result,
475 * so this comparison is correct even though we appear to be accounting
476 * only for the item and not for its line pointer.
478 if (PageGetFreeSpace(page) < itemsz)
480 bool is_root = P_ISROOT(lpageop);
481 bool is_only = P_LEFTMOST(lpageop) && P_RIGHTMOST(lpageop);
485 /* Choose the split point */
486 firstright = _bt_findsplitloc(rel, page,
490 /* split the buffer into left and right halves */
491 rbuf = _bt_split(rel, buf, firstright,
492 newitemoff, itemsz, btitem, newitemonleft);
497 * + our target page has been split;
498 * + the original tuple has been inserted;
499 * + we have write locks on both the old (left half)
500 * and new (right half) buffers, after the split; and
501 * + we know the key we want to insert into the parent
502 * (it's the "high key" on the left child page).
504 * We're ready to do the parent insertion. We need to hold onto the
505 * locks for the child pages until we locate the parent, but we can
506 * release them before doing the actual insertion (see Lehman and Yao
507 * for the reasoning).
510 _bt_insert_parent(rel, buf, rbuf, stack, is_root, is_only);
514 Buffer metabuf = InvalidBuffer;
516 BTMetaPageData *metad = NULL;
517 OffsetNumber itup_off;
518 BlockNumber itup_blkno;
520 itup_off = newitemoff;
521 itup_blkno = BufferGetBlockNumber(buf);
524 * If we are doing this insert because we split a page that was the
525 * only one on its tree level, but was not the root, it may have been
526 * the "fast root". We need to ensure that the fast root link points
527 * at or above the current page. We can safely acquire a lock on the
528 * metapage here --- see comments for _bt_newroot().
532 metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_WRITE);
533 metapg = BufferGetPage(metabuf);
534 metad = BTPageGetMeta(metapg);
536 if (metad->btm_fastlevel >= lpageop->btpo.level)
538 /* no update wanted */
539 _bt_relbuf(rel, metabuf);
540 metabuf = InvalidBuffer;
544 /* Do the update. No ereport(ERROR) until changes are logged */
545 START_CRIT_SECTION();
547 _bt_pgaddtup(rel, page, itemsz, btitem, newitemoff, "page");
549 if (BufferIsValid(metabuf))
551 metad->btm_fastroot = itup_blkno;
552 metad->btm_fastlevel = lpageop->btpo.level;
558 xl_btree_insert xlrec;
559 xl_btree_metadata xlmeta;
562 XLogRecData rdata[3];
563 XLogRecData *nextrdata;
564 BTItemData truncitem;
566 xlrec.target.node = rel->rd_node;
567 ItemPointerSet(&(xlrec.target.tid), itup_blkno, itup_off);
569 rdata[0].data = (char *) &xlrec;
570 rdata[0].len = SizeOfBtreeInsert;
571 rdata[0].buffer = InvalidBuffer;
572 rdata[0].next = nextrdata = &(rdata[1]);
574 if (BufferIsValid(metabuf))
576 xlmeta.root = metad->btm_root;
577 xlmeta.level = metad->btm_level;
578 xlmeta.fastroot = metad->btm_fastroot;
579 xlmeta.fastlevel = metad->btm_fastlevel;
581 nextrdata->data = (char *) &xlmeta;
582 nextrdata->len = sizeof(xl_btree_metadata);
583 nextrdata->buffer = InvalidBuffer;
584 nextrdata->next = nextrdata + 1;
586 xlinfo = XLOG_BTREE_INSERT_META;
588 else if (P_ISLEAF(lpageop))
589 xlinfo = XLOG_BTREE_INSERT_LEAF;
591 xlinfo = XLOG_BTREE_INSERT_UPPER;
593 /* Read comments in _bt_pgaddtup */
594 if (!P_ISLEAF(lpageop) && newitemoff == P_FIRSTDATAKEY(lpageop))
597 truncitem.bti_itup.t_info = sizeof(BTItemData);
598 nextrdata->data = (char *) &truncitem;
599 nextrdata->len = sizeof(BTItemData);
603 nextrdata->data = (char *) btitem;
604 nextrdata->len = IndexTupleDSize(btitem->bti_itup) +
605 (sizeof(BTItemData) - sizeof(IndexTupleData));
607 nextrdata->buffer = buf;
608 nextrdata->buffer_std = true;
609 nextrdata->next = NULL;
611 recptr = XLogInsert(RM_BTREE_ID, xlinfo, rdata);
613 if (BufferIsValid(metabuf))
615 PageSetLSN(metapg, recptr);
616 PageSetTLI(metapg, ThisTimeLineID);
619 PageSetLSN(page, recptr);
620 PageSetTLI(page, ThisTimeLineID);
625 /* Write out the updated page and release pin/lock */
626 if (BufferIsValid(metabuf))
627 _bt_wrtbuf(rel, metabuf);
629 _bt_wrtbuf(rel, buf);
634 * _bt_split() -- split a page in the btree.
636 * On entry, buf is the page to split, and is write-locked and pinned.
637 * firstright is the item index of the first item to be moved to the
638 * new right page. newitemoff etc. tell us about the new item that
639 * must be inserted along with the data from the old page.
641 * Returns the new right sibling of buf, pinned and write-locked.
642 * The pin and lock on buf are maintained.
645 _bt_split(Relation rel, Buffer buf, OffsetNumber firstright,
646 OffsetNumber newitemoff, Size newitemsz, BTItem newitem,
653 BTPageOpaque ropaque,
656 Buffer sbuf = InvalidBuffer;
658 BTPageOpaque sopaque = NULL;
659 OffsetNumber itup_off = 0;
660 BlockNumber itup_blkno = 0;
664 OffsetNumber leftoff,
669 rbuf = _bt_getbuf(rel, P_NEW, BT_WRITE);
670 origpage = BufferGetPage(buf);
671 leftpage = PageGetTempPage(origpage, sizeof(BTPageOpaqueData));
672 rightpage = BufferGetPage(rbuf);
674 _bt_pageinit(leftpage, BufferGetPageSize(buf));
675 /* rightpage was already initialized by _bt_getbuf */
677 /* init btree private data */
678 oopaque = (BTPageOpaque) PageGetSpecialPointer(origpage);
679 lopaque = (BTPageOpaque) PageGetSpecialPointer(leftpage);
680 ropaque = (BTPageOpaque) PageGetSpecialPointer(rightpage);
682 /* if we're splitting this page, it won't be the root when we're done */
683 lopaque->btpo_flags = oopaque->btpo_flags;
684 lopaque->btpo_flags &= ~BTP_ROOT;
685 ropaque->btpo_flags = lopaque->btpo_flags;
686 lopaque->btpo_prev = oopaque->btpo_prev;
687 lopaque->btpo_next = BufferGetBlockNumber(rbuf);
688 ropaque->btpo_prev = BufferGetBlockNumber(buf);
689 ropaque->btpo_next = oopaque->btpo_next;
690 lopaque->btpo.level = ropaque->btpo.level = oopaque->btpo.level;
693 * If the page we're splitting is not the rightmost page at its level in
694 * the tree, then the first entry on the page is the high key for the
695 * page. We need to copy that to the right half. Otherwise (meaning the
696 * rightmost page case), all the items on the right half will be user
701 if (!P_RIGHTMOST(oopaque))
703 itemid = PageGetItemId(origpage, P_HIKEY);
704 itemsz = ItemIdGetLength(itemid);
705 item = (BTItem) PageGetItem(origpage, itemid);
706 if (PageAddItem(rightpage, (Item) item, itemsz, rightoff,
707 LP_USED) == InvalidOffsetNumber)
708 elog(PANIC, "failed to add hikey to the right sibling");
709 rightoff = OffsetNumberNext(rightoff);
713 * The "high key" for the new left page will be the first key that's going
714 * to go into the new right page. This might be either the existing data
715 * item at position firstright, or the incoming tuple.
718 if (!newitemonleft && newitemoff == firstright)
720 /* incoming tuple will become first on right page */
726 /* existing item at firstright will become first on right page */
727 itemid = PageGetItemId(origpage, firstright);
728 itemsz = ItemIdGetLength(itemid);
729 item = (BTItem) PageGetItem(origpage, itemid);
731 if (PageAddItem(leftpage, (Item) item, itemsz, leftoff,
732 LP_USED) == InvalidOffsetNumber)
733 elog(PANIC, "failed to add hikey to the left sibling");
734 leftoff = OffsetNumberNext(leftoff);
737 * Now transfer all the data items to the appropriate page
739 maxoff = PageGetMaxOffsetNumber(origpage);
741 for (i = P_FIRSTDATAKEY(oopaque); i <= maxoff; i = OffsetNumberNext(i))
743 itemid = PageGetItemId(origpage, i);
744 itemsz = ItemIdGetLength(itemid);
745 item = (BTItem) PageGetItem(origpage, itemid);
747 /* does new item belong before this one? */
752 _bt_pgaddtup(rel, leftpage, newitemsz, newitem, leftoff,
755 itup_blkno = BufferGetBlockNumber(buf);
756 leftoff = OffsetNumberNext(leftoff);
760 _bt_pgaddtup(rel, rightpage, newitemsz, newitem, rightoff,
763 itup_blkno = BufferGetBlockNumber(rbuf);
764 rightoff = OffsetNumberNext(rightoff);
768 /* decide which page to put it on */
771 _bt_pgaddtup(rel, leftpage, itemsz, item, leftoff,
773 leftoff = OffsetNumberNext(leftoff);
777 _bt_pgaddtup(rel, rightpage, itemsz, item, rightoff,
779 rightoff = OffsetNumberNext(rightoff);
783 /* cope with possibility that newitem goes at the end */
788 _bt_pgaddtup(rel, leftpage, newitemsz, newitem, leftoff,
791 itup_blkno = BufferGetBlockNumber(buf);
792 leftoff = OffsetNumberNext(leftoff);
796 _bt_pgaddtup(rel, rightpage, newitemsz, newitem, rightoff,
799 itup_blkno = BufferGetBlockNumber(rbuf);
800 rightoff = OffsetNumberNext(rightoff);
805 * We have to grab the right sibling (if any) and fix the prev pointer
806 * there. We are guaranteed that this is deadlock-free since no other
807 * writer will be holding a lock on that page and trying to move left, and
808 * all readers release locks on a page before trying to fetch its
812 if (!P_RIGHTMOST(ropaque))
814 sbuf = _bt_getbuf(rel, ropaque->btpo_next, BT_WRITE);
815 spage = BufferGetPage(sbuf);
816 sopaque = (BTPageOpaque) PageGetSpecialPointer(spage);
817 if (sopaque->btpo_prev != ropaque->btpo_prev)
818 elog(PANIC, "right sibling's left-link doesn't match");
822 * Right sibling is locked, new siblings are prepared, but original page
823 * is not updated yet. Log changes before continuing.
825 * NO EREPORT(ERROR) till right sibling is updated.
827 START_CRIT_SECTION();
829 if (!P_RIGHTMOST(ropaque))
830 sopaque->btpo_prev = BufferGetBlockNumber(rbuf);
835 xl_btree_split xlrec;
838 XLogRecData rdata[4];
840 xlrec.target.node = rel->rd_node;
841 ItemPointerSet(&(xlrec.target.tid), itup_blkno, itup_off);
843 xlrec.otherblk = BufferGetBlockNumber(rbuf);
845 xlrec.otherblk = BufferGetBlockNumber(buf);
846 xlrec.leftblk = lopaque->btpo_prev;
847 xlrec.rightblk = ropaque->btpo_next;
848 xlrec.level = lopaque->btpo.level;
851 * Direct access to page is not good but faster - we should implement
852 * some new func in page API. Note we only store the tuples
853 * themselves, knowing that the item pointers are in the same order
854 * and can be reconstructed by scanning the tuples.
856 xlrec.leftlen = ((PageHeader) leftpage)->pd_special -
857 ((PageHeader) leftpage)->pd_upper;
859 rdata[0].data = (char *) &xlrec;
860 rdata[0].len = SizeOfBtreeSplit;
861 rdata[0].buffer = InvalidBuffer;
862 rdata[0].next = &(rdata[1]);
864 rdata[1].data = (char *) leftpage + ((PageHeader) leftpage)->pd_upper;
865 rdata[1].len = xlrec.leftlen;
866 rdata[1].buffer = InvalidBuffer;
867 rdata[1].next = &(rdata[2]);
869 rdata[2].data = (char *) rightpage + ((PageHeader) rightpage)->pd_upper;
870 rdata[2].len = ((PageHeader) rightpage)->pd_special -
871 ((PageHeader) rightpage)->pd_upper;
872 rdata[2].buffer = InvalidBuffer;
873 rdata[2].next = NULL;
875 if (!P_RIGHTMOST(ropaque))
877 rdata[2].next = &(rdata[3]);
878 rdata[3].data = NULL;
880 rdata[3].buffer = sbuf;
881 rdata[3].buffer_std = true;
882 rdata[3].next = NULL;
885 if (P_ISROOT(oopaque))
886 xlinfo = newitemonleft ? XLOG_BTREE_SPLIT_L_ROOT : XLOG_BTREE_SPLIT_R_ROOT;
888 xlinfo = newitemonleft ? XLOG_BTREE_SPLIT_L : XLOG_BTREE_SPLIT_R;
890 recptr = XLogInsert(RM_BTREE_ID, xlinfo, rdata);
892 PageSetLSN(leftpage, recptr);
893 PageSetTLI(leftpage, ThisTimeLineID);
894 PageSetLSN(rightpage, recptr);
895 PageSetTLI(rightpage, ThisTimeLineID);
896 if (!P_RIGHTMOST(ropaque))
898 PageSetLSN(spage, recptr);
899 PageSetTLI(spage, ThisTimeLineID);
904 * By here, the original data page has been split into two new halves, and
905 * these are correct. The algorithm requires that the left page never
906 * move during a split, so we copy the new left page back on top of the
907 * original. Note that this is not a waste of time, since we also require
908 * (in the page management code) that the center of a page always be
909 * clean, and the most efficient way to guarantee this is just to compact
910 * the data by reinserting it into a new left page.
913 PageRestoreTempPage(leftpage, origpage);
917 /* write and release the old right sibling */
918 if (!P_RIGHTMOST(ropaque))
919 _bt_wrtbuf(rel, sbuf);
926 * _bt_findsplitloc() -- find an appropriate place to split a page.
928 * The idea here is to equalize the free space that will be on each split
929 * page, *after accounting for the inserted tuple*. (If we fail to account
930 * for it, we might find ourselves with too little room on the page that
931 * it needs to go into!)
933 * If the page is the rightmost page on its level, we instead try to arrange
934 * for twice as much free space on the right as on the left. In this way,
935 * when we are inserting successively increasing keys (consider sequences,
936 * timestamps, etc) we will end up with a tree whose pages are about 67% full,
937 * instead of the 50% full result that we'd get without this special case.
938 * (We could bias it even further to make the initially-loaded tree more full.
939 * But since the steady-state load for a btree is about 70%, we'd likely just
940 * be making more page-splitting work for ourselves later on, when we start
941 * seeing updates to existing tuples.)
943 * We are passed the intended insert position of the new tuple, expressed as
944 * the offsetnumber of the tuple it must go in front of. (This could be
945 * maxoff+1 if the tuple is to go at the end.)
947 * We return the index of the first existing tuple that should go on the
948 * righthand page, plus a boolean indicating whether the new tuple goes on
949 * the left or right page. The bool is necessary to disambiguate the case
950 * where firstright == newitemoff.
953 _bt_findsplitloc(Relation rel,
955 OffsetNumber newitemoff,
970 opaque = (BTPageOpaque) PageGetSpecialPointer(page);
972 /* Passed-in newitemsz is MAXALIGNED but does not include line pointer */
973 newitemsz += sizeof(ItemIdData);
974 state.newitemsz = newitemsz;
975 state.is_leaf = P_ISLEAF(opaque);
976 state.is_rightmost = P_RIGHTMOST(opaque);
977 state.have_split = false;
979 /* Total free space available on a btree page, after fixed overhead */
980 leftspace = rightspace =
981 PageGetPageSize(page) - SizeOfPageHeaderData -
982 MAXALIGN(sizeof(BTPageOpaqueData));
985 * Finding the best possible split would require checking all the possible
986 * split points, because of the high-key and left-key special cases.
987 * That's probably more work than it's worth; instead, stop as soon as we
988 * find a "good-enough" split, where good-enough is defined as an
989 * imbalance in free space of no more than pagesize/16 (arbitrary...) This
990 * should let us stop near the middle on most pages, instead of plowing to
993 goodenough = leftspace / 16;
995 /* The right page will have the same high key as the old page */
996 if (!P_RIGHTMOST(opaque))
998 itemid = PageGetItemId(page, P_HIKEY);
999 rightspace -= (int) (MAXALIGN(ItemIdGetLength(itemid)) +
1000 sizeof(ItemIdData));
1003 /* Count up total space in data items without actually scanning 'em */
1004 dataitemtotal = rightspace - (int) PageGetFreeSpace(page);
1007 * Scan through the data items and calculate space usage for a split at
1008 * each possible position.
1010 dataitemstoleft = 0;
1011 maxoff = PageGetMaxOffsetNumber(page);
1013 for (offnum = P_FIRSTDATAKEY(opaque);
1015 offnum = OffsetNumberNext(offnum))
1021 itemid = PageGetItemId(page, offnum);
1022 itemsz = MAXALIGN(ItemIdGetLength(itemid)) + sizeof(ItemIdData);
1025 * We have to allow for the current item becoming the high key of the
1026 * left page; therefore it counts against left space as well as right
1029 leftfree = leftspace - dataitemstoleft - (int) itemsz;
1030 rightfree = rightspace - (dataitemtotal - dataitemstoleft);
1033 * Will the new item go to left or right of split?
1035 if (offnum > newitemoff)
1036 _bt_checksplitloc(&state, offnum, leftfree, rightfree,
1038 else if (offnum < newitemoff)
1039 _bt_checksplitloc(&state, offnum, leftfree, rightfree,
1043 /* need to try it both ways! */
1044 _bt_checksplitloc(&state, offnum, leftfree, rightfree,
1046 /* here we are contemplating newitem as first on right */
1047 _bt_checksplitloc(&state, offnum, leftfree, rightfree,
1051 /* Abort scan once we find a good-enough choice */
1052 if (state.have_split && state.best_delta <= goodenough)
1055 dataitemstoleft += itemsz;
1059 * I believe it is not possible to fail to find a feasible split, but just
1062 if (!state.have_split)
1063 elog(ERROR, "could not find a feasible split point for \"%s\"",
1064 RelationGetRelationName(rel));
1066 *newitemonleft = state.newitemonleft;
1067 return state.firstright;
1071 * Subroutine to analyze a particular possible split choice (ie, firstright
1072 * and newitemonleft settings), and record the best split so far in *state.
1075 _bt_checksplitloc(FindSplitData *state, OffsetNumber firstright,
1076 int leftfree, int rightfree,
1077 bool newitemonleft, Size firstrightitemsz)
1080 * Account for the new item on whichever side it is to be put.
1083 leftfree -= (int) state->newitemsz;
1085 rightfree -= (int) state->newitemsz;
1088 * If we are not on the leaf level, we will be able to discard the key
1089 * data from the first item that winds up on the right page.
1091 if (!state->is_leaf)
1092 rightfree += (int) firstrightitemsz -
1093 (int) (MAXALIGN(sizeof(BTItemData)) + sizeof(ItemIdData));
1096 * If feasible split point, remember best delta.
1098 if (leftfree >= 0 && rightfree >= 0)
1102 if (state->is_rightmost)
1105 * On a rightmost page, try to equalize right free space with
1106 * twice the left free space. See comments for _bt_findsplitloc.
1108 delta = (2 * leftfree) - rightfree;
1112 /* Otherwise, aim for equal free space on both sides */
1113 delta = leftfree - rightfree;
1118 if (!state->have_split || delta < state->best_delta)
1120 state->have_split = true;
1121 state->newitemonleft = newitemonleft;
1122 state->firstright = firstright;
1123 state->best_delta = delta;
1129 * _bt_insert_parent() -- Insert downlink into parent after a page split.
1131 * On entry, buf and rbuf are the left and right split pages, which we
1132 * still hold write locks on per the L&Y algorithm. We release the
1133 * write locks once we have write lock on the parent page. (Any sooner,
1134 * and it'd be possible for some other process to try to split or delete
1135 * one of these pages, and get confused because it cannot find the downlink.)
1137 * stack - stack showing how we got here. May be NULL in cases that don't
1138 * have to be efficient (concurrent ROOT split, WAL recovery)
1139 * is_root - we split the true root
1140 * is_only - we split a page alone on its level (might have been fast root)
1142 * This is exported so it can be called by nbtxlog.c.
1145 _bt_insert_parent(Relation rel,
1153 * Here we have to do something Lehman and Yao don't talk about: deal with
1154 * a root split and construction of a new root. If our stack is empty
1155 * then we have just split a node on what had been the root level when we
1156 * descended the tree. If it was still the root then we perform a
1157 * new-root construction. If it *wasn't* the root anymore, search to find
1158 * the next higher level that someone constructed meanwhile, and find the
1159 * right place to insert as for the normal case.
1161 * If we have to search for the parent level, we do so by re-descending
1162 * from the root. This is not super-efficient, but it's rare enough not
1163 * to matter. (This path is also taken when called from WAL recovery ---
1164 * we have no stack in that case.)
1170 Assert(stack == NULL);
1172 /* create a new root node and update the metapage */
1173 rootbuf = _bt_newroot(rel, buf, rbuf);
1174 /* release the split buffers */
1175 _bt_wrtbuf(rel, rootbuf);
1176 _bt_wrtbuf(rel, rbuf);
1177 _bt_wrtbuf(rel, buf);
1181 BlockNumber bknum = BufferGetBlockNumber(buf);
1182 BlockNumber rbknum = BufferGetBlockNumber(rbuf);
1183 Page page = BufferGetPage(buf);
1185 BTStackData fakestack;
1191 BTPageOpaque lpageop;
1194 elog(DEBUG2, "concurrent ROOT page split");
1195 lpageop = (BTPageOpaque) PageGetSpecialPointer(page);
1196 /* Find the leftmost page at the next level up */
1197 pbuf = _bt_get_endpoint(rel, lpageop->btpo.level + 1, false);
1198 /* Set up a phony stack entry pointing there */
1200 stack->bts_blkno = BufferGetBlockNumber(pbuf);
1201 stack->bts_offset = InvalidOffsetNumber;
1202 /* bts_btitem will be initialized below */
1203 stack->bts_parent = NULL;
1204 _bt_relbuf(rel, pbuf);
1207 /* get high key from left page == lowest key on new right page */
1208 ritem = (BTItem) PageGetItem(page,
1209 PageGetItemId(page, P_HIKEY));
1211 /* form an index tuple that points at the new right page */
1212 new_item = _bt_formitem(&(ritem->bti_itup));
1213 ItemPointerSet(&(new_item->bti_itup.t_tid), rbknum, P_HIKEY);
1216 * Find the parent buffer and get the parent page.
1218 * Oops - if we were moved right then we need to change stack item! We
1219 * want to find parent pointing to where we are, right ? - vadim
1222 ItemPointerSet(&(stack->bts_btitem.bti_itup.t_tid),
1225 pbuf = _bt_getstackbuf(rel, stack, BT_WRITE);
1227 /* Now we can write and unlock the children */
1228 _bt_wrtbuf(rel, rbuf);
1229 _bt_wrtbuf(rel, buf);
1231 /* Check for error only after writing children */
1232 if (pbuf == InvalidBuffer)
1233 elog(ERROR, "failed to re-find parent key in \"%s\"",
1234 RelationGetRelationName(rel));
1236 /* Recursively update the parent */
1237 _bt_insertonpg(rel, pbuf, stack->bts_parent,
1238 0, NULL, new_item, stack->bts_offset,
1247 * _bt_getstackbuf() -- Walk back up the tree one step, and find the item
1248 * we last looked at in the parent.
1250 * This is possible because we save the downlink from the parent item,
1251 * which is enough to uniquely identify it. Insertions into the parent
1252 * level could cause the item to move right; deletions could cause it
1253 * to move left, but not left of the page we previously found it in.
1255 * Adjusts bts_blkno & bts_offset if changed.
1257 * Returns InvalidBuffer if item not found (should not happen).
1260 _bt_getstackbuf(Relation rel, BTStack stack, int access)
1265 blkno = stack->bts_blkno;
1266 start = stack->bts_offset;
1272 BTPageOpaque opaque;
1274 buf = _bt_getbuf(rel, blkno, access);
1275 page = BufferGetPage(buf);
1276 opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1278 if (!P_IGNORE(opaque))
1280 OffsetNumber offnum,
1286 minoff = P_FIRSTDATAKEY(opaque);
1287 maxoff = PageGetMaxOffsetNumber(page);
1290 * start = InvalidOffsetNumber means "search the whole page". We
1291 * need this test anyway due to possibility that page has a high
1292 * key now when it didn't before.
1298 * Need this check too, to guard against possibility that page
1299 * split since we visited it originally.
1302 start = OffsetNumberNext(maxoff);
1305 * These loops will check every item on the page --- but in an
1306 * order that's attuned to the probability of where it actually
1307 * is. Scan to the right first, then to the left.
1309 for (offnum = start;
1311 offnum = OffsetNumberNext(offnum))
1313 itemid = PageGetItemId(page, offnum);
1314 item = (BTItem) PageGetItem(page, itemid);
1315 if (BTItemSame(item, &stack->bts_btitem))
1317 /* Return accurate pointer to where link is now */
1318 stack->bts_blkno = blkno;
1319 stack->bts_offset = offnum;
1324 for (offnum = OffsetNumberPrev(start);
1326 offnum = OffsetNumberPrev(offnum))
1328 itemid = PageGetItemId(page, offnum);
1329 item = (BTItem) PageGetItem(page, itemid);
1330 if (BTItemSame(item, &stack->bts_btitem))
1332 /* Return accurate pointer to where link is now */
1333 stack->bts_blkno = blkno;
1334 stack->bts_offset = offnum;
1341 * The item we're looking for moved right at least one page.
1343 if (P_RIGHTMOST(opaque))
1345 _bt_relbuf(rel, buf);
1346 return InvalidBuffer;
1348 blkno = opaque->btpo_next;
1349 start = InvalidOffsetNumber;
1350 _bt_relbuf(rel, buf);
1355 * _bt_newroot() -- Create a new root page for the index.
1357 * We've just split the old root page and need to create a new one.
1358 * In order to do this, we add a new root page to the file, then lock
1359 * the metadata page and update it. This is guaranteed to be deadlock-
1360 * free, because all readers release their locks on the metadata page
1361 * before trying to lock the root, and all writers lock the root before
1362 * trying to lock the metadata page. We have a write lock on the old
1363 * root page, so we have not introduced any cycles into the waits-for
1366 * On entry, lbuf (the old root) and rbuf (its new peer) are write-
1367 * locked. On exit, a new root page exists with entries for the
1368 * two new children, metapage is updated and unlocked/unpinned.
1369 * The new root buffer is returned to caller which has to unlock/unpin
1370 * lbuf, rbuf & rootbuf.
1373 _bt_newroot(Relation rel, Buffer lbuf, Buffer rbuf)
1381 BlockNumber rootblknum;
1382 BTPageOpaque rootopaque;
1389 BTMetaPageData *metad;
1391 lbkno = BufferGetBlockNumber(lbuf);
1392 rbkno = BufferGetBlockNumber(rbuf);
1393 lpage = BufferGetPage(lbuf);
1394 rpage = BufferGetPage(rbuf);
1396 /* get a new root page */
1397 rootbuf = _bt_getbuf(rel, P_NEW, BT_WRITE);
1398 rootpage = BufferGetPage(rootbuf);
1399 rootblknum = BufferGetBlockNumber(rootbuf);
1401 /* acquire lock on the metapage */
1402 metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_WRITE);
1403 metapg = BufferGetPage(metabuf);
1404 metad = BTPageGetMeta(metapg);
1406 /* NO EREPORT(ERROR) from here till newroot op is logged */
1407 START_CRIT_SECTION();
1409 /* set btree special data */
1410 rootopaque = (BTPageOpaque) PageGetSpecialPointer(rootpage);
1411 rootopaque->btpo_prev = rootopaque->btpo_next = P_NONE;
1412 rootopaque->btpo_flags = BTP_ROOT;
1413 rootopaque->btpo.level =
1414 ((BTPageOpaque) PageGetSpecialPointer(lpage))->btpo.level + 1;
1416 /* update metapage data */
1417 metad->btm_root = rootblknum;
1418 metad->btm_level = rootopaque->btpo.level;
1419 metad->btm_fastroot = rootblknum;
1420 metad->btm_fastlevel = rootopaque->btpo.level;
1423 * Create downlink item for left page (old root). Since this will be the
1424 * first item in a non-leaf page, it implicitly has minus-infinity key
1425 * value, so we need not store any actual key in it.
1427 itemsz = sizeof(BTItemData);
1428 new_item = (BTItem) palloc(itemsz);
1429 new_item->bti_itup.t_info = itemsz;
1430 ItemPointerSet(&(new_item->bti_itup.t_tid), lbkno, P_HIKEY);
1433 * Insert the left page pointer into the new root page. The root page is
1434 * the rightmost page on its level so there is no "high key" in it; the
1435 * two items will go into positions P_HIKEY and P_FIRSTKEY.
1437 if (PageAddItem(rootpage, (Item) new_item, itemsz, P_HIKEY, LP_USED) == InvalidOffsetNumber)
1438 elog(PANIC, "failed to add leftkey to new root page");
1442 * Create downlink item for right page. The key for it is obtained from
1443 * the "high key" position in the left page.
1445 itemid = PageGetItemId(lpage, P_HIKEY);
1446 itemsz = ItemIdGetLength(itemid);
1447 item = (BTItem) PageGetItem(lpage, itemid);
1448 new_item = _bt_formitem(&(item->bti_itup));
1449 ItemPointerSet(&(new_item->bti_itup.t_tid), rbkno, P_HIKEY);
1452 * insert the right page pointer into the new root page.
1454 if (PageAddItem(rootpage, (Item) new_item, itemsz, P_FIRSTKEY, LP_USED) == InvalidOffsetNumber)
1455 elog(PANIC, "failed to add rightkey to new root page");
1459 if (!rel->rd_istemp)
1461 xl_btree_newroot xlrec;
1463 XLogRecData rdata[2];
1465 xlrec.node = rel->rd_node;
1466 xlrec.rootblk = rootblknum;
1467 xlrec.level = metad->btm_level;
1469 rdata[0].data = (char *) &xlrec;
1470 rdata[0].len = SizeOfBtreeNewroot;
1471 rdata[0].buffer = InvalidBuffer;
1472 rdata[0].next = &(rdata[1]);
1475 * Direct access to page is not good but faster - we should implement
1476 * some new func in page API.
1478 rdata[1].data = (char *) rootpage + ((PageHeader) rootpage)->pd_upper;
1479 rdata[1].len = ((PageHeader) rootpage)->pd_special -
1480 ((PageHeader) rootpage)->pd_upper;
1481 rdata[1].buffer = InvalidBuffer;
1482 rdata[1].next = NULL;
1484 recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_NEWROOT, rdata);
1486 PageSetLSN(rootpage, recptr);
1487 PageSetTLI(rootpage, ThisTimeLineID);
1488 PageSetLSN(metapg, recptr);
1489 PageSetTLI(metapg, ThisTimeLineID);
1490 PageSetLSN(lpage, recptr);
1491 PageSetTLI(lpage, ThisTimeLineID);
1492 PageSetLSN(rpage, recptr);
1493 PageSetTLI(rpage, ThisTimeLineID);
1498 /* write and let go of metapage buffer */
1499 _bt_wrtbuf(rel, metabuf);
1505 * _bt_pgaddtup() -- add a tuple to a particular page in the index.
1507 * This routine adds the tuple to the page as requested. It does
1508 * not affect pin/lock status, but you'd better have a write lock
1509 * and pin on the target buffer! Don't forget to write and release
1510 * the buffer afterwards, either.
1512 * The main difference between this routine and a bare PageAddItem call
1513 * is that this code knows that the leftmost data item on a non-leaf
1514 * btree page doesn't need to have a key. Therefore, it strips such
1515 * items down to just the item header. CAUTION: this works ONLY if
1516 * we insert the items in order, so that the given itup_off does
1517 * represent the final position of the item!
1520 _bt_pgaddtup(Relation rel,
1524 OffsetNumber itup_off,
1527 BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1528 BTItemData truncitem;
1530 if (!P_ISLEAF(opaque) && itup_off == P_FIRSTDATAKEY(opaque))
1532 memcpy(&truncitem, btitem, sizeof(BTItemData));
1533 truncitem.bti_itup.t_info = sizeof(BTItemData);
1534 btitem = &truncitem;
1535 itemsize = sizeof(BTItemData);
1538 if (PageAddItem(page, (Item) btitem, itemsize, itup_off,
1539 LP_USED) == InvalidOffsetNumber)
1540 elog(PANIC, "failed to add item to the %s for \"%s\"",
1541 where, RelationGetRelationName(rel));
1545 * _bt_isequal - used in _bt_doinsert in check for duplicates.
1547 * This is very similar to _bt_compare, except for NULL handling.
1548 * Rule is simple: NOT_NULL not equal NULL, NULL not equal NULL too.
1551 _bt_isequal(TupleDesc itupdesc, Page page, OffsetNumber offnum,
1552 int keysz, ScanKey scankey)
1558 /* Better be comparing to a leaf item */
1559 Assert(P_ISLEAF((BTPageOpaque) PageGetSpecialPointer(page)));
1561 btitem = (BTItem) PageGetItem(page, PageGetItemId(page, offnum));
1562 itup = &(btitem->bti_itup);
1564 for (i = 1; i <= keysz; i++)
1571 attno = scankey->sk_attno;
1573 datum = index_getattr(itup, attno, itupdesc, &isNull);
1575 /* NULLs are never equal to anything */
1576 if (isNull || (scankey->sk_flags & SK_ISNULL))
1579 result = DatumGetInt32(FunctionCall2(&scankey->sk_func,
1581 scankey->sk_argument));
1589 /* if we get here, the keys are equal */