1 /*-------------------------------------------------------------------------
4 * page utilities routines for the postgres inverted index access method.
7 * Portions Copyright (c) 1996-2013, PostgreSQL Global Development Group
8 * Portions Copyright (c) 1994, Regents of the University of California
11 * src/backend/access/gin/ginentrypage.c
12 *-------------------------------------------------------------------------
17 #include "access/gin_private.h"
18 #include "utils/rel.h"
21 * Form a tuple for entry tree.
23 * If the tuple would be too big to be stored, function throws a suitable
24 * error if errorTooBig is TRUE, or returns NULL if errorTooBig is FALSE.
26 * See src/backend/access/gin/README for a description of the index tuple
27 * format that is being built here. We build on the assumption that we
28 * are making a leaf-level key entry containing a posting list of nipd items.
29 * If the caller is actually trying to make a posting-tree entry, non-leaf
30 * entry, or pending-list entry, it should pass nipd = 0 and then overwrite
31 * the t_tid fields as necessary. In any case, ipd can be NULL to skip
32 * copying any itempointers into the posting list; the caller is responsible
33 * for filling the posting list afterwards, if ipd = NULL and nipd > 0.
36 GinFormTuple(GinState *ginstate,
37 OffsetNumber attnum, Datum key, GinNullCategory category,
38 ItemPointerData *ipd, uint32 nipd,
46 /* Build the basic tuple: optional column number, plus key datum */
50 isnull[0] = (category != GIN_CAT_NORM_KEY);
54 datums[0] = UInt16GetDatum(attnum);
57 isnull[1] = (category != GIN_CAT_NORM_KEY);
60 itup = index_form_tuple(ginstate->tupdesc[attnum - 1], datums, isnull);
63 * Determine and store offset to the posting list, making sure there is
64 * room for the category byte if needed.
66 * Note: because index_form_tuple MAXALIGNs the tuple size, there may well
67 * be some wasted pad space. Is it worth recomputing the data length to
68 * prevent that? That would also allow us to Assert that the real data
69 * doesn't overlap the GinNullCategory byte, which this code currently
72 newsize = IndexTupleSize(itup);
74 if (IndexTupleHasNulls(itup))
78 Assert(category != GIN_CAT_NORM_KEY);
79 minsize = GinCategoryOffset(itup, ginstate) + sizeof(GinNullCategory);
80 newsize = Max(newsize, minsize);
83 newsize = SHORTALIGN(newsize);
85 GinSetPostingOffset(itup, newsize);
87 GinSetNPosting(itup, nipd);
90 * Add space needed for posting list, if any. Then check that the tuple
91 * won't be too big to store.
93 newsize += sizeof(ItemPointerData) * nipd;
94 newsize = MAXALIGN(newsize);
95 if (newsize > Min(INDEX_SIZE_MASK, GinMaxItemSize))
99 (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
100 errmsg("index row size %lu exceeds maximum %lu for index \"%s\"",
101 (unsigned long) newsize,
102 (unsigned long) Min(INDEX_SIZE_MASK,
104 RelationGetRelationName(ginstate->index))));
110 * Resize tuple if needed
112 if (newsize != IndexTupleSize(itup))
114 itup = repalloc(itup, newsize);
116 * PostgreSQL 9.3 and earlier did not clear this new space, so we
117 * might find uninitialized padding when reading tuples from disk.
119 memset((char *) itup + IndexTupleSize(itup),
120 0, newsize - IndexTupleSize(itup));
122 /* set new size in tuple header */
123 itup->t_info &= ~INDEX_SIZE_MASK;
124 itup->t_info |= newsize;
128 * Insert category byte, if needed
130 if (category != GIN_CAT_NORM_KEY)
132 Assert(IndexTupleHasNulls(itup));
133 GinSetNullCategory(itup, ginstate, category);
137 * Copy in the posting list, if provided
140 memcpy(GinGetPosting(itup), ipd, sizeof(ItemPointerData) * nipd);
146 * Sometimes we reduce the number of posting list items in a tuple after
147 * having built it with GinFormTuple. This function adjusts the size
151 GinShortenTuple(IndexTuple itup, uint32 nipd)
155 Assert(nipd <= GinGetNPosting(itup));
157 newsize = GinGetPostingOffset(itup) + sizeof(ItemPointerData) * nipd;
158 newsize = MAXALIGN(newsize);
160 Assert(newsize <= (itup->t_info & INDEX_SIZE_MASK));
162 itup->t_info &= ~INDEX_SIZE_MASK;
163 itup->t_info |= newsize;
165 GinSetNPosting(itup, nipd);
169 * Form a non-leaf entry tuple by copying the key data from the given tuple,
170 * which can be either a leaf or non-leaf entry tuple.
172 * Any posting list in the source tuple is not copied. The specified child
173 * block number is inserted into t_tid.
176 GinFormInteriorTuple(IndexTuple itup, Page page, BlockNumber childblk)
180 if (GinPageIsLeaf(page) && !GinIsPostingTree(itup))
182 /* Tuple contains a posting list, just copy stuff before that */
183 uint32 origsize = GinGetPostingOffset(itup);
185 origsize = MAXALIGN(origsize);
186 nitup = (IndexTuple) palloc(origsize);
187 memcpy(nitup, itup, origsize);
188 /* ... be sure to fix the size header field ... */
189 nitup->t_info &= ~INDEX_SIZE_MASK;
190 nitup->t_info |= origsize;
194 /* Copy the tuple as-is */
195 nitup = (IndexTuple) palloc(IndexTupleSize(itup));
196 memcpy(nitup, itup, IndexTupleSize(itup));
199 /* Now insert the correct downlink */
200 GinSetDownlink(nitup, childblk);
206 * Entry tree is a "static", ie tuple never deletes from it,
207 * so we don't use right bound, we use rightmost key instead.
210 getRightMostTuple(Page page)
212 OffsetNumber maxoff = PageGetMaxOffsetNumber(page);
214 return (IndexTuple) PageGetItem(page, PageGetItemId(page, maxoff));
218 entryIsMoveRight(GinBtree btree, Page page)
223 GinNullCategory category;
225 if (GinPageRightMost(page))
228 itup = getRightMostTuple(page);
229 attnum = gintuple_get_attrnum(btree->ginstate, itup);
230 key = gintuple_get_key(btree->ginstate, itup, &category);
232 if (ginCompareAttEntries(btree->ginstate,
233 btree->entryAttnum, btree->entryKey, btree->entryCategory,
234 attnum, key, category) > 0)
241 * Find correct tuple in non-leaf page. It supposed that
242 * page correctly chosen and searching value SHOULD be on page
245 entryLocateEntry(GinBtree btree, GinBtreeStack *stack)
250 IndexTuple itup = NULL;
252 Page page = BufferGetPage(stack->buffer);
254 Assert(!GinPageIsLeaf(page));
255 Assert(!GinPageIsData(page));
259 stack->off = FirstOffsetNumber;
260 stack->predictNumber *= PageGetMaxOffsetNumber(page);
261 return btree->getLeftMostChild(btree, page);
264 low = FirstOffsetNumber;
265 maxoff = high = PageGetMaxOffsetNumber(page);
272 OffsetNumber mid = low + ((high - low) / 2);
274 if (mid == maxoff && GinPageRightMost(page))
283 GinNullCategory category;
285 itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, mid));
286 attnum = gintuple_get_attrnum(btree->ginstate, itup);
287 key = gintuple_get_key(btree->ginstate, itup, &category);
288 result = ginCompareAttEntries(btree->ginstate,
291 btree->entryCategory,
292 attnum, key, category);
298 Assert(GinGetDownlink(itup) != GIN_ROOT_BLKNO);
299 return GinGetDownlink(itup);
307 Assert(high >= FirstOffsetNumber && high <= maxoff);
310 itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, high));
311 Assert(GinGetDownlink(itup) != GIN_ROOT_BLKNO);
312 return GinGetDownlink(itup);
316 * Searches correct position for value on leaf page.
317 * Page should be correctly chosen.
318 * Returns true if value found on page.
321 entryLocateLeafEntry(GinBtree btree, GinBtreeStack *stack)
323 Page page = BufferGetPage(stack->buffer);
327 Assert(GinPageIsLeaf(page));
328 Assert(!GinPageIsData(page));
332 stack->off = FirstOffsetNumber;
336 low = FirstOffsetNumber;
337 high = PageGetMaxOffsetNumber(page);
341 stack->off = FirstOffsetNumber;
349 OffsetNumber mid = low + ((high - low) / 2);
353 GinNullCategory category;
356 itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, mid));
357 attnum = gintuple_get_attrnum(btree->ginstate, itup);
358 key = gintuple_get_key(btree->ginstate, itup, &category);
359 result = ginCompareAttEntries(btree->ginstate,
362 btree->entryCategory,
363 attnum, key, category);
380 entryFindChildPtr(GinBtree btree, Page page, BlockNumber blkno, OffsetNumber storedOff)
383 maxoff = PageGetMaxOffsetNumber(page);
386 Assert(!GinPageIsLeaf(page));
387 Assert(!GinPageIsData(page));
389 /* if page isn't changed, we returns storedOff */
390 if (storedOff >= FirstOffsetNumber && storedOff <= maxoff)
392 itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, storedOff));
393 if (GinGetDownlink(itup) == blkno)
397 * we hope, that needed pointer goes to right. It's true if there
400 for (i = storedOff + 1; i <= maxoff; i++)
402 itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, i));
403 if (GinGetDownlink(itup) == blkno)
406 maxoff = storedOff - 1;
410 for (i = FirstOffsetNumber; i <= maxoff; i++)
412 itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, i));
413 if (GinGetDownlink(itup) == blkno)
417 return InvalidOffsetNumber;
421 entryGetLeftMostPage(GinBtree btree, Page page)
425 Assert(!GinPageIsLeaf(page));
426 Assert(!GinPageIsData(page));
427 Assert(PageGetMaxOffsetNumber(page) >= FirstOffsetNumber);
429 itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, FirstOffsetNumber));
430 return GinGetDownlink(itup);
434 entryIsEnoughSpace(GinBtree btree, Buffer buf, OffsetNumber off)
437 Page page = BufferGetPage(buf);
439 Assert(btree->entry);
440 Assert(!GinPageIsData(page));
444 IndexTuple itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, off));
446 itupsz = MAXALIGN(IndexTupleSize(itup)) + sizeof(ItemIdData);
449 if (PageGetFreeSpace(page) + itupsz >= MAXALIGN(IndexTupleSize(btree->entry)) + sizeof(ItemIdData))
456 * Delete tuple on leaf page if tuples existed and we
457 * should update it, update old child blkno to new right page
458 * if child split occurred
461 entryPreparePage(GinBtree btree, Page page, OffsetNumber off)
463 BlockNumber ret = InvalidBlockNumber;
465 Assert(btree->entry);
466 Assert(!GinPageIsData(page));
470 Assert(GinPageIsLeaf(page));
471 PageIndexTupleDelete(page, off);
474 if (!GinPageIsLeaf(page) && btree->rightblkno != InvalidBlockNumber)
476 IndexTuple itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, off));
478 GinSetDownlink(itup, btree->rightblkno);
479 ret = btree->rightblkno;
482 btree->rightblkno = InvalidBlockNumber;
488 * Place tuple on page and fills WAL record
490 * If the tuple doesn't fit, returns false without modifying the page.
493 entryPlaceToPage(GinBtree btree, Buffer buf, OffsetNumber off,
494 XLogRecData **prdata)
496 Page page = BufferGetPage(buf);
500 /* these must be static so they can be returned to caller */
501 static XLogRecData rdata[3];
502 static ginxlogInsert data;
504 /* quick exit if it doesn't fit */
505 if (!entryIsEnoughSpace(btree, buf, off))
509 data.updateBlkno = entryPreparePage(btree, page, off);
511 placed = PageAddItem(page, (Item) btree->entry, IndexTupleSize(btree->entry), off, false, false);
513 elog(ERROR, "failed to add item to index page in \"%s\"",
514 RelationGetRelationName(btree->index));
516 data.node = btree->index->rd_node;
517 data.blkno = BufferGetBlockNumber(buf);
520 data.isDelete = btree->isDelete;
522 data.isLeaf = GinPageIsLeaf(page) ? TRUE : FALSE;
525 * Prevent full page write if child's split occurs. That is needed to
526 * remove incomplete splits while replaying WAL
528 * data.updateBlkno contains new block number (of newly created right
529 * page) for recently splited page.
531 if (data.updateBlkno == InvalidBlockNumber)
533 rdata[0].buffer = buf;
534 rdata[0].buffer_std = TRUE;
535 rdata[0].data = NULL;
537 rdata[0].next = &rdata[1];
541 rdata[cnt].buffer = InvalidBuffer;
542 rdata[cnt].data = (char *) &data;
543 rdata[cnt].len = sizeof(ginxlogInsert);
544 rdata[cnt].next = &rdata[cnt + 1];
547 rdata[cnt].buffer = InvalidBuffer;
548 rdata[cnt].data = (char *) btree->entry;
549 rdata[cnt].len = IndexTupleSize(btree->entry);
550 rdata[cnt].next = NULL;
558 * Place tuple and split page, original buffer(lbuf) leaves untouched,
559 * returns shadow page of lbuf filled new data.
560 * Tuples are distributed between pages by equal size on its, not
564 entrySplitPage(GinBtree btree, Buffer lbuf, Buffer rbuf, OffsetNumber off, XLogRecData **prdata)
568 separator = InvalidOffsetNumber;
575 Page lpage = PageGetTempPageCopy(BufferGetPage(lbuf));
576 Page rpage = BufferGetPage(rbuf);
577 Size pageSize = PageGetPageSize(lpage);
579 /* these must be static so they can be returned to caller */
580 static XLogRecData rdata[2];
581 static ginxlogSplit data;
582 static char tupstore[2 * BLCKSZ];
585 data.leftChildBlkno = (GinPageIsLeaf(lpage)) ?
586 InvalidOffsetNumber : GinGetDownlink(btree->entry);
587 data.updateBlkno = entryPreparePage(btree, lpage, off);
589 maxoff = PageGetMaxOffsetNumber(lpage);
592 for (i = FirstOffsetNumber; i <= maxoff; i++)
596 size = MAXALIGN(IndexTupleSize(btree->entry));
597 memcpy(ptr, btree->entry, size);
599 totalsize += size + sizeof(ItemIdData);
602 itup = (IndexTuple) PageGetItem(lpage, PageGetItemId(lpage, i));
603 size = MAXALIGN(IndexTupleSize(itup));
604 memcpy(ptr, itup, size);
606 totalsize += size + sizeof(ItemIdData);
609 if (off == maxoff + 1)
611 size = MAXALIGN(IndexTupleSize(btree->entry));
612 memcpy(ptr, btree->entry, size);
614 totalsize += size + sizeof(ItemIdData);
617 GinInitPage(rpage, GinPageGetOpaque(lpage)->flags, pageSize);
618 GinInitPage(lpage, GinPageGetOpaque(rpage)->flags, pageSize);
625 for (i = FirstOffsetNumber; i <= maxoff; i++)
627 itup = (IndexTuple) ptr;
629 if (lsize > totalsize / 2)
631 if (separator == InvalidOffsetNumber)
637 lsize += MAXALIGN(IndexTupleSize(itup)) + sizeof(ItemIdData);
640 if (PageAddItem(page, (Item) itup, IndexTupleSize(itup), InvalidOffsetNumber, false, false) == InvalidOffsetNumber)
641 elog(ERROR, "failed to add item to index page in \"%s\"",
642 RelationGetRelationName(btree->index));
643 ptr += MAXALIGN(IndexTupleSize(itup));
646 data.node = btree->index->rd_node;
647 data.rootBlkno = InvalidBlockNumber;
648 data.lblkno = BufferGetBlockNumber(lbuf);
649 data.rblkno = BufferGetBlockNumber(rbuf);
650 data.separator = separator;
653 data.isLeaf = GinPageIsLeaf(lpage) ? TRUE : FALSE;
654 data.isRootSplit = FALSE;
656 rdata[0].buffer = InvalidBuffer;
657 rdata[0].data = (char *) &data;
658 rdata[0].len = sizeof(ginxlogSplit);
659 rdata[0].next = &rdata[1];
661 rdata[1].buffer = InvalidBuffer;
662 rdata[1].data = tupstore;
663 rdata[1].len = MAXALIGN(totalsize);
664 rdata[1].next = NULL;
670 * Prepare the state in 'btree' for inserting a downlink for given buffer.
673 entryPrepareDownlink(GinBtree btree, Buffer lbuf)
675 Page lpage = BufferGetPage(lbuf);
678 itup = getRightMostTuple(lpage);
680 btree->entry = GinFormInteriorTuple(itup,
682 BufferGetBlockNumber(lbuf));
683 btree->rightblkno = GinPageGetOpaque(lpage)->rightlink;
687 * Fills new root by rightest values from child.
688 * Also called from ginxlog, should not use btree
691 ginEntryFillRoot(GinBtree btree, Buffer root, Buffer lbuf, Buffer rbuf)
693 Page page = BufferGetPage(root);
694 Page lpage = BufferGetPage(lbuf);
695 Page rpage = BufferGetPage(rbuf);
698 itup = GinFormInteriorTuple(getRightMostTuple(lpage),
700 BufferGetBlockNumber(lbuf));
701 if (PageAddItem(page, (Item) itup, IndexTupleSize(itup), InvalidOffsetNumber, false, false) == InvalidOffsetNumber)
702 elog(ERROR, "failed to add item to index root page");
705 itup = GinFormInteriorTuple(getRightMostTuple(rpage),
707 BufferGetBlockNumber(rbuf));
708 if (PageAddItem(page, (Item) itup, IndexTupleSize(itup), InvalidOffsetNumber, false, false) == InvalidOffsetNumber)
709 elog(ERROR, "failed to add item to index root page");
714 * Set up GinBtree for entry page access
716 * Note: during WAL recovery, there may be no valid data in ginstate
717 * other than a faked-up Relation pointer; the key datum is bogus too.
720 ginPrepareEntryScan(GinBtree btree, OffsetNumber attnum,
721 Datum key, GinNullCategory category,
724 memset(btree, 0, sizeof(GinBtreeData));
726 btree->index = ginstate->index;
727 btree->ginstate = ginstate;
729 btree->findChildPage = entryLocateEntry;
730 btree->getLeftMostChild = entryGetLeftMostPage;
731 btree->isMoveRight = entryIsMoveRight;
732 btree->findItem = entryLocateLeafEntry;
733 btree->findChildPtr = entryFindChildPtr;
734 btree->placeToPage = entryPlaceToPage;
735 btree->splitPage = entrySplitPage;
736 btree->fillRoot = ginEntryFillRoot;
737 btree->prepareDownlink = entryPrepareDownlink;
739 btree->isData = FALSE;
740 btree->fullScan = FALSE;
741 btree->isBuild = FALSE;
743 btree->entryAttnum = attnum;
744 btree->entryKey = key;
745 btree->entryCategory = category;
746 btree->isDelete = FALSE;