1 /*-------------------------------------------------------------------------
4 * routines for handling GIN entry tree pages.
7 * Portions Copyright (c) 1996-2014, PostgreSQL Global Development Group
8 * Portions Copyright (c) 1994, Regents of the University of California
11 * src/backend/access/gin/ginentrypage.c
12 *-------------------------------------------------------------------------
17 #include "access/gin_private.h"
18 #include "access/xloginsert.h"
19 #include "miscadmin.h"
20 #include "utils/rel.h"
22 static void entrySplitPage(GinBtree btree, Buffer origbuf,
25 BlockNumber updateblkno, XLogRecData **prdata,
26 Page *newlpage, Page *newrpage);
29 * Form a tuple for entry tree.
31 * If the tuple would be too big to be stored, function throws a suitable
32 * error if errorTooBig is TRUE, or returns NULL if errorTooBig is FALSE.
34 * See src/backend/access/gin/README for a description of the index tuple
35 * format that is being built here. We build on the assumption that we
36 * are making a leaf-level key entry containing a posting list of nipd items.
37 * If the caller is actually trying to make a posting-tree entry, non-leaf
38 * entry, or pending-list entry, it should pass dataSize = 0 and then overwrite
39 * the t_tid fields as necessary. In any case, 'data' can be NULL to skip
40 * filling in the posting list; the caller is responsible for filling it
41 * afterwards if data = NULL and nipd > 0.
44 GinFormTuple(GinState *ginstate,
45 OffsetNumber attnum, Datum key, GinNullCategory category,
46 Pointer data, Size dataSize, int nipd,
54 /* Build the basic tuple: optional column number, plus key datum */
58 isnull[0] = (category != GIN_CAT_NORM_KEY);
62 datums[0] = UInt16GetDatum(attnum);
65 isnull[1] = (category != GIN_CAT_NORM_KEY);
68 itup = index_form_tuple(ginstate->tupdesc[attnum - 1], datums, isnull);
71 * Determine and store offset to the posting list, making sure there is
72 * room for the category byte if needed.
74 * Note: because index_form_tuple MAXALIGNs the tuple size, there may well
75 * be some wasted pad space. Is it worth recomputing the data length to
76 * prevent that? That would also allow us to Assert that the real data
77 * doesn't overlap the GinNullCategory byte, which this code currently
80 newsize = IndexTupleSize(itup);
82 if (IndexTupleHasNulls(itup))
86 Assert(category != GIN_CAT_NORM_KEY);
87 minsize = GinCategoryOffset(itup, ginstate) + sizeof(GinNullCategory);
88 newsize = Max(newsize, minsize);
91 newsize = SHORTALIGN(newsize);
93 GinSetPostingOffset(itup, newsize);
94 GinSetNPosting(itup, nipd);
97 * Add space needed for posting list, if any. Then check that the tuple
98 * won't be too big to store.
102 newsize = MAXALIGN(newsize);
104 if (newsize > GinMaxItemSize)
108 (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
109 errmsg("index row size %zu exceeds maximum %zu for index \"%s\"",
110 (Size) newsize, (Size) GinMaxItemSize,
111 RelationGetRelationName(ginstate->index))));
117 * Resize tuple if needed
119 if (newsize != IndexTupleSize(itup))
121 itup = repalloc(itup, newsize);
124 * PostgreSQL 9.3 and earlier did not clear this new space, so we
125 * might find uninitialized padding when reading tuples from disk.
127 memset((char *) itup + IndexTupleSize(itup),
128 0, newsize - IndexTupleSize(itup));
129 /* set new size in tuple header */
130 itup->t_info &= ~INDEX_SIZE_MASK;
131 itup->t_info |= newsize;
135 * Copy in the posting list, if provided
139 char *ptr = GinGetPosting(itup);
141 memcpy(ptr, data, dataSize);
145 * Insert category byte, if needed
147 if (category != GIN_CAT_NORM_KEY)
149 Assert(IndexTupleHasNulls(itup));
150 GinSetNullCategory(itup, ginstate, category);
156 * Read item pointers from leaf entry tuple.
158 * Returns a palloc'd array of ItemPointers. The number of items is returned
162 ginReadTuple(GinState *ginstate, OffsetNumber attnum, IndexTuple itup,
165 Pointer ptr = GinGetPosting(itup);
166 int nipd = GinGetNPosting(itup);
170 if (GinItupIsCompressed(itup))
174 ipd = ginPostingListDecode((GinPostingList *) ptr, &ndecoded);
175 if (nipd != ndecoded)
176 elog(ERROR, "number of items mismatch in GIN entry tuple, %d in tuple header, %d decoded",
186 ipd = (ItemPointer) palloc(sizeof(ItemPointerData) * nipd);
187 memcpy(ipd, ptr, sizeof(ItemPointerData) * nipd);
194 * Form a non-leaf entry tuple by copying the key data from the given tuple,
195 * which can be either a leaf or non-leaf entry tuple.
197 * Any posting list in the source tuple is not copied. The specified child
198 * block number is inserted into t_tid.
201 GinFormInteriorTuple(IndexTuple itup, Page page, BlockNumber childblk)
205 if (GinPageIsLeaf(page) && !GinIsPostingTree(itup))
207 /* Tuple contains a posting list, just copy stuff before that */
208 uint32 origsize = GinGetPostingOffset(itup);
210 origsize = MAXALIGN(origsize);
211 nitup = (IndexTuple) palloc(origsize);
212 memcpy(nitup, itup, origsize);
213 /* ... be sure to fix the size header field ... */
214 nitup->t_info &= ~INDEX_SIZE_MASK;
215 nitup->t_info |= origsize;
219 /* Copy the tuple as-is */
220 nitup = (IndexTuple) palloc(IndexTupleSize(itup));
221 memcpy(nitup, itup, IndexTupleSize(itup));
224 /* Now insert the correct downlink */
225 GinSetDownlink(nitup, childblk);
231 * Entry tree is a "static", ie tuple never deletes from it,
232 * so we don't use right bound, we use rightmost key instead.
235 getRightMostTuple(Page page)
237 OffsetNumber maxoff = PageGetMaxOffsetNumber(page);
239 return (IndexTuple) PageGetItem(page, PageGetItemId(page, maxoff));
243 entryIsMoveRight(GinBtree btree, Page page)
248 GinNullCategory category;
250 if (GinPageRightMost(page))
253 itup = getRightMostTuple(page);
254 attnum = gintuple_get_attrnum(btree->ginstate, itup);
255 key = gintuple_get_key(btree->ginstate, itup, &category);
257 if (ginCompareAttEntries(btree->ginstate,
258 btree->entryAttnum, btree->entryKey, btree->entryCategory,
259 attnum, key, category) > 0)
266 * Find correct tuple in non-leaf page. It supposed that
267 * page correctly chosen and searching value SHOULD be on page
270 entryLocateEntry(GinBtree btree, GinBtreeStack *stack)
275 IndexTuple itup = NULL;
277 Page page = BufferGetPage(stack->buffer);
279 Assert(!GinPageIsLeaf(page));
280 Assert(!GinPageIsData(page));
284 stack->off = FirstOffsetNumber;
285 stack->predictNumber *= PageGetMaxOffsetNumber(page);
286 return btree->getLeftMostChild(btree, page);
289 low = FirstOffsetNumber;
290 maxoff = high = PageGetMaxOffsetNumber(page);
297 OffsetNumber mid = low + ((high - low) / 2);
299 if (mid == maxoff && GinPageRightMost(page))
308 GinNullCategory category;
310 itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, mid));
311 attnum = gintuple_get_attrnum(btree->ginstate, itup);
312 key = gintuple_get_key(btree->ginstate, itup, &category);
313 result = ginCompareAttEntries(btree->ginstate,
316 btree->entryCategory,
317 attnum, key, category);
323 Assert(GinGetDownlink(itup) != GIN_ROOT_BLKNO);
324 return GinGetDownlink(itup);
332 Assert(high >= FirstOffsetNumber && high <= maxoff);
335 itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, high));
336 Assert(GinGetDownlink(itup) != GIN_ROOT_BLKNO);
337 return GinGetDownlink(itup);
341 * Searches correct position for value on leaf page.
342 * Page should be correctly chosen.
343 * Returns true if value found on page.
346 entryLocateLeafEntry(GinBtree btree, GinBtreeStack *stack)
348 Page page = BufferGetPage(stack->buffer);
352 Assert(GinPageIsLeaf(page));
353 Assert(!GinPageIsData(page));
357 stack->off = FirstOffsetNumber;
361 low = FirstOffsetNumber;
362 high = PageGetMaxOffsetNumber(page);
366 stack->off = FirstOffsetNumber;
374 OffsetNumber mid = low + ((high - low) / 2);
378 GinNullCategory category;
381 itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, mid));
382 attnum = gintuple_get_attrnum(btree->ginstate, itup);
383 key = gintuple_get_key(btree->ginstate, itup, &category);
384 result = ginCompareAttEntries(btree->ginstate,
387 btree->entryCategory,
388 attnum, key, category);
405 entryFindChildPtr(GinBtree btree, Page page, BlockNumber blkno, OffsetNumber storedOff)
408 maxoff = PageGetMaxOffsetNumber(page);
411 Assert(!GinPageIsLeaf(page));
412 Assert(!GinPageIsData(page));
414 /* if page isn't changed, we returns storedOff */
415 if (storedOff >= FirstOffsetNumber && storedOff <= maxoff)
417 itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, storedOff));
418 if (GinGetDownlink(itup) == blkno)
422 * we hope, that needed pointer goes to right. It's true if there
425 for (i = storedOff + 1; i <= maxoff; i++)
427 itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, i));
428 if (GinGetDownlink(itup) == blkno)
431 maxoff = storedOff - 1;
435 for (i = FirstOffsetNumber; i <= maxoff; i++)
437 itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, i));
438 if (GinGetDownlink(itup) == blkno)
442 return InvalidOffsetNumber;
446 entryGetLeftMostPage(GinBtree btree, Page page)
450 Assert(!GinPageIsLeaf(page));
451 Assert(!GinPageIsData(page));
452 Assert(PageGetMaxOffsetNumber(page) >= FirstOffsetNumber);
454 itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, FirstOffsetNumber));
455 return GinGetDownlink(itup);
459 entryIsEnoughSpace(GinBtree btree, Buffer buf, OffsetNumber off,
460 GinBtreeEntryInsertData *insertData)
464 Page page = BufferGetPage(buf);
466 Assert(insertData->entry);
467 Assert(!GinPageIsData(page));
469 if (insertData->isDelete)
471 IndexTuple itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, off));
473 releasedsz = MAXALIGN(IndexTupleSize(itup)) + sizeof(ItemIdData);
476 addedsz = MAXALIGN(IndexTupleSize(insertData->entry)) + sizeof(ItemIdData);
478 if (PageGetFreeSpace(page) + releasedsz >= addedsz)
485 * Delete tuple on leaf page if tuples existed and we
486 * should update it, update old child blkno to new right page
487 * if child split occurred
490 entryPreparePage(GinBtree btree, Page page, OffsetNumber off,
491 GinBtreeEntryInsertData *insertData, BlockNumber updateblkno)
493 Assert(insertData->entry);
494 Assert(!GinPageIsData(page));
496 if (insertData->isDelete)
498 Assert(GinPageIsLeaf(page));
499 PageIndexTupleDelete(page, off);
502 if (!GinPageIsLeaf(page) && updateblkno != InvalidBlockNumber)
504 IndexTuple itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, off));
506 GinSetDownlink(itup, updateblkno);
511 * Place tuple on page and fills WAL record
513 * If the tuple doesn't fit, returns false without modifying the page.
515 * On insertion to an internal node, in addition to inserting the given item,
516 * the downlink of the existing item at 'off' is updated to point to
519 static GinPlaceToPageRC
520 entryPlaceToPage(GinBtree btree, Buffer buf, GinBtreeStack *stack,
521 void *insertPayload, BlockNumber updateblkno,
522 XLogRecData **prdata, Page *newlpage, Page *newrpage)
524 GinBtreeEntryInsertData *insertData = insertPayload;
525 Page page = BufferGetPage(buf);
526 OffsetNumber off = stack->off;
530 /* these must be static so they can be returned to caller */
531 static XLogRecData rdata[3];
532 static ginxlogInsertEntry data;
534 /* quick exit if it doesn't fit */
535 if (!entryIsEnoughSpace(btree, buf, off, insertData))
537 entrySplitPage(btree, buf, stack, insertPayload, updateblkno,
538 prdata, newlpage, newrpage);
542 START_CRIT_SECTION();
545 entryPreparePage(btree, page, off, insertData, updateblkno);
547 placed = PageAddItem(page,
548 (Item) insertData->entry,
549 IndexTupleSize(insertData->entry),
552 elog(ERROR, "failed to add item to index page in \"%s\"",
553 RelationGetRelationName(btree->index));
555 data.isDelete = insertData->isDelete;
558 rdata[cnt].buffer = buf;
559 rdata[cnt].buffer_std = true;
560 rdata[cnt].data = (char *) &data;
561 rdata[cnt].len = offsetof(ginxlogInsertEntry, tuple);
562 rdata[cnt].next = &rdata[cnt + 1];
565 rdata[cnt].buffer = buf;
566 rdata[cnt].buffer_std = true;
567 rdata[cnt].data = (char *) insertData->entry;
568 rdata[cnt].len = IndexTupleSize(insertData->entry);
569 rdata[cnt].next = NULL;
575 * Place tuple and split page, original buffer(lbuf) leaves untouched,
576 * returns shadow pages filled with new data.
577 * Tuples are distributed between pages by equal size on its, not
581 entrySplitPage(GinBtree btree, Buffer origbuf,
582 GinBtreeStack *stack,
584 BlockNumber updateblkno, XLogRecData **prdata,
585 Page *newlpage, Page *newrpage)
587 GinBtreeEntryInsertData *insertData = insertPayload;
588 OffsetNumber off = stack->off;
591 separator = InvalidOffsetNumber;
599 Page lpage = PageGetTempPageCopy(BufferGetPage(origbuf));
600 Page rpage = PageGetTempPageCopy(BufferGetPage(origbuf));
601 Size pageSize = PageGetPageSize(lpage);
603 /* these must be static so they can be returned to caller */
604 static XLogRecData rdata[2];
605 static ginxlogSplitEntry data;
606 static char tupstore[2 * BLCKSZ];
609 entryPreparePage(btree, lpage, off, insertData, updateblkno);
612 * First, append all the existing tuples and the new tuple we're inserting
613 * one after another in a temporary workspace.
615 maxoff = PageGetMaxOffsetNumber(lpage);
617 for (i = FirstOffsetNumber; i <= maxoff; i++)
621 size = MAXALIGN(IndexTupleSize(insertData->entry));
622 memcpy(ptr, insertData->entry, size);
624 totalsize += size + sizeof(ItemIdData);
627 itup = (IndexTuple) PageGetItem(lpage, PageGetItemId(lpage, i));
628 size = MAXALIGN(IndexTupleSize(itup));
629 memcpy(ptr, itup, size);
631 totalsize += size + sizeof(ItemIdData);
634 if (off == maxoff + 1)
636 size = MAXALIGN(IndexTupleSize(insertData->entry));
637 memcpy(ptr, insertData->entry, size);
639 totalsize += size + sizeof(ItemIdData);
641 tupstoresize = ptr - tupstore;
644 * Initialize the left and right pages, and copy all the tuples back to
647 GinInitPage(rpage, GinPageGetOpaque(lpage)->flags, pageSize);
648 GinInitPage(lpage, GinPageGetOpaque(rpage)->flags, pageSize);
655 for (i = FirstOffsetNumber; i <= maxoff; i++)
657 itup = (IndexTuple) ptr;
659 if (lsize > totalsize / 2)
661 if (separator == InvalidOffsetNumber)
667 lsize += MAXALIGN(IndexTupleSize(itup)) + sizeof(ItemIdData);
670 if (PageAddItem(page, (Item) itup, IndexTupleSize(itup), InvalidOffsetNumber, false, false) == InvalidOffsetNumber)
671 elog(ERROR, "failed to add item to index page in \"%s\"",
672 RelationGetRelationName(btree->index));
673 ptr += MAXALIGN(IndexTupleSize(itup));
676 data.separator = separator;
679 rdata[0].buffer = InvalidBuffer;
680 rdata[0].data = (char *) &data;
681 rdata[0].len = sizeof(ginxlogSplitEntry);
682 rdata[0].next = &rdata[1];
684 rdata[1].buffer = InvalidBuffer;
685 rdata[1].data = tupstore;
686 rdata[1].len = tupstoresize;
687 rdata[1].next = NULL;
694 * Construct insertion payload for inserting the downlink for given buffer.
697 entryPrepareDownlink(GinBtree btree, Buffer lbuf)
699 GinBtreeEntryInsertData *insertData;
700 Page lpage = BufferGetPage(lbuf);
701 BlockNumber lblkno = BufferGetBlockNumber(lbuf);
704 itup = getRightMostTuple(lpage);
706 insertData = palloc(sizeof(GinBtreeEntryInsertData));
707 insertData->entry = GinFormInteriorTuple(itup, lpage, lblkno);
708 insertData->isDelete = false;
714 * Fills new root by rightest values from child.
715 * Also called from ginxlog, should not use btree
718 ginEntryFillRoot(GinBtree btree, Page root,
719 BlockNumber lblkno, Page lpage,
720 BlockNumber rblkno, Page rpage)
724 itup = GinFormInteriorTuple(getRightMostTuple(lpage), lpage, lblkno);
725 if (PageAddItem(root, (Item) itup, IndexTupleSize(itup), InvalidOffsetNumber, false, false) == InvalidOffsetNumber)
726 elog(ERROR, "failed to add item to index root page");
729 itup = GinFormInteriorTuple(getRightMostTuple(rpage), rpage, rblkno);
730 if (PageAddItem(root, (Item) itup, IndexTupleSize(itup), InvalidOffsetNumber, false, false) == InvalidOffsetNumber)
731 elog(ERROR, "failed to add item to index root page");
736 * Set up GinBtree for entry page access
738 * Note: during WAL recovery, there may be no valid data in ginstate
739 * other than a faked-up Relation pointer; the key datum is bogus too.
742 ginPrepareEntryScan(GinBtree btree, OffsetNumber attnum,
743 Datum key, GinNullCategory category,
746 memset(btree, 0, sizeof(GinBtreeData));
748 btree->index = ginstate->index;
749 btree->rootBlkno = GIN_ROOT_BLKNO;
750 btree->ginstate = ginstate;
752 btree->findChildPage = entryLocateEntry;
753 btree->getLeftMostChild = entryGetLeftMostPage;
754 btree->isMoveRight = entryIsMoveRight;
755 btree->findItem = entryLocateLeafEntry;
756 btree->findChildPtr = entryFindChildPtr;
757 btree->placeToPage = entryPlaceToPage;
758 btree->fillRoot = ginEntryFillRoot;
759 btree->prepareDownlink = entryPrepareDownlink;
761 btree->isData = FALSE;
762 btree->fullScan = FALSE;
763 btree->isBuild = FALSE;
765 btree->entryAttnum = attnum;
766 btree->entryKey = key;
767 btree->entryCategory = category;