1 /*-------------------------------------------------------------------------
4 * page utilities routines for the postgres inverted index access method.
7 * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
8 * Portions Copyright (c) 1994, Regents of the University of California
11 * $PostgreSQL: pgsql/src/backend/access/gin/ginentrypage.c,v 1.22 2009/10/02 21:14:04 tgl Exp $
12 *-------------------------------------------------------------------------
17 #include "access/gin.h"
18 #include "storage/bufmgr.h"
19 #include "utils/rel.h"
22 * Form a tuple for entry tree.
24 * If the tuple would be too big to be stored, function throws a suitable
25 * error if errorTooBig is TRUE, or returns NULL if errorTooBig is FALSE.
27 * On leaf pages, Index tuple has non-traditional layout. Tuple may contain
28 * posting list or root blocknumber of posting tree.
29 * Macros: GinIsPostingTree(itup) / GinSetPostingTree(itup, blkno)
31 * - itup->t_info & INDEX_SIZE_MASK contains total size of tuple as usual
32 * - ItemPointerGetBlockNumber(&itup->t_tid) contains original
33 * size of tuple (without posting list).
34 * Macros: GinGetOrigSizePosting(itup) / GinSetOrigSizePosting(itup,n)
35 * - ItemPointerGetOffsetNumber(&itup->t_tid) contains number
36 * of elements in posting list (number of heap itempointers)
37 * Macros: GinGetNPosting(itup) / GinSetNPosting(itup,n)
38 * - After standard part of tuple there is a posting list, ie, array
39 * of heap itempointers
40 * Macros: GinGetPosting(itup)
42 * - itup->t_info & INDEX_SIZE_MASK contains size of tuple as usual
43 * - ItemPointerGetBlockNumber(&itup->t_tid) contains block number of
44 * root of posting tree
45 * - ItemPointerGetOffsetNumber(&itup->t_tid) contains magic number
46 * GIN_TREE_POSTING, which distinguishes this from posting-list case
48 * Attributes of an index tuple are different for single and multicolumn index.
49 * For single-column case, index tuple stores only value to be indexed.
50 * For multicolumn case, it stores two attributes: column number of value
54 GinFormTuple(Relation index, GinState *ginstate,
55 OffsetNumber attnum, Datum key,
56 ItemPointerData *ipd, uint32 nipd, bool errorTooBig)
58 bool isnull[2] = {FALSE, FALSE};
63 itup = index_form_tuple(ginstate->origTupdesc, &key, isnull);
68 datums[0] = UInt16GetDatum(attnum);
70 itup = index_form_tuple(ginstate->tupdesc[attnum - 1], datums, isnull);
73 GinSetOrigSizePosting(itup, IndexTupleSize(itup));
77 newsize = MAXALIGN(SHORTALIGN(IndexTupleSize(itup)) + sizeof(ItemPointerData) * nipd);
78 if (newsize > Min(INDEX_SIZE_MASK, GinMaxItemSize))
82 (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
83 errmsg("index row size %lu exceeds maximum %lu for index \"%s\"",
84 (unsigned long) newsize,
85 (unsigned long) Min(INDEX_SIZE_MASK,
87 RelationGetRelationName(index))));
91 itup = repalloc(itup, newsize);
94 itup->t_info &= ~INDEX_SIZE_MASK;
95 itup->t_info |= newsize;
98 memcpy(GinGetPosting(itup), ipd, sizeof(ItemPointerData) * nipd);
99 GinSetNPosting(itup, nipd);
104 * Gin tuple without any ItemPointers should be large enough to keep
105 * one ItemPointer, to prevent inconsistency between
106 * ginHeapTupleFastCollect and ginEntryInsert called by
107 * ginHeapTupleInsert. ginHeapTupleFastCollect forms tuple without
108 * extra pointer to heap, but ginEntryInsert (called for pending list
109 * cleanup during vacuum) will form the same tuple with one
112 newsize = MAXALIGN(SHORTALIGN(IndexTupleSize(itup)) + sizeof(ItemPointerData));
113 if (newsize > Min(INDEX_SIZE_MASK, GinMaxItemSize))
117 (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
118 errmsg("index row size %lu exceeds maximum %lu for index \"%s\"",
119 (unsigned long) newsize,
120 (unsigned long) Min(INDEX_SIZE_MASK,
122 RelationGetRelationName(index))));
126 GinSetNPosting(itup, 0);
132 * Sometimes we reduce the number of posting list items in a tuple after
133 * having built it with GinFormTuple. This function adjusts the size
137 GinShortenTuple(IndexTuple itup, uint32 nipd)
141 Assert(nipd <= GinGetNPosting(itup));
143 newsize = MAXALIGN(SHORTALIGN(GinGetOrigSizePosting(itup)) + sizeof(ItemPointerData) * nipd);
145 Assert(newsize <= (itup->t_info & INDEX_SIZE_MASK));
147 itup->t_info &= ~INDEX_SIZE_MASK;
148 itup->t_info |= newsize;
150 GinSetNPosting(itup, nipd);
154 * Entry tree is a "static", ie tuple never deletes from it,
155 * so we don't use right bound, we use rightest key instead.
158 getRightMostTuple(Page page)
160 OffsetNumber maxoff = PageGetMaxOffsetNumber(page);
162 return (IndexTuple) PageGetItem(page, PageGetItemId(page, maxoff));
166 entryIsMoveRight(GinBtree btree, Page page)
170 if (GinPageRightMost(page))
173 itup = getRightMostTuple(page);
175 if (compareAttEntries(btree->ginstate,
176 btree->entryAttnum, btree->entryValue,
177 gintuple_get_attrnum(btree->ginstate, itup),
178 gin_index_getattr(btree->ginstate, itup)) > 0)
185 * Find correct tuple in non-leaf page. It supposed that
186 * page correctly choosen and searching value SHOULD be on page
189 entryLocateEntry(GinBtree btree, GinBtreeStack *stack)
194 IndexTuple itup = NULL;
196 Page page = BufferGetPage(stack->buffer);
198 Assert(!GinPageIsLeaf(page));
199 Assert(!GinPageIsData(page));
203 stack->off = FirstOffsetNumber;
204 stack->predictNumber *= PageGetMaxOffsetNumber(page);
205 return btree->getLeftMostPage(btree, page);
208 low = FirstOffsetNumber;
209 maxoff = high = PageGetMaxOffsetNumber(page);
216 OffsetNumber mid = low + ((high - low) / 2);
218 if (mid == maxoff && GinPageRightMost(page))
223 itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, mid));
224 result = compareAttEntries(btree->ginstate,
225 btree->entryAttnum, btree->entryValue,
226 gintuple_get_attrnum(btree->ginstate, itup),
227 gin_index_getattr(btree->ginstate, itup));
233 Assert(GinItemPointerGetBlockNumber(&(itup)->t_tid) != GIN_ROOT_BLKNO);
234 return GinItemPointerGetBlockNumber(&(itup)->t_tid);
242 Assert(high >= FirstOffsetNumber && high <= maxoff);
245 itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, high));
246 Assert(GinItemPointerGetBlockNumber(&(itup)->t_tid) != GIN_ROOT_BLKNO);
247 return GinItemPointerGetBlockNumber(&(itup)->t_tid);
251 * Searches correct position for value on leaf page.
252 * Page should be corrrectly choosen.
253 * Returns true if value found on page.
256 entryLocateLeafEntry(GinBtree btree, GinBtreeStack *stack)
258 Page page = BufferGetPage(stack->buffer);
263 Assert(GinPageIsLeaf(page));
264 Assert(!GinPageIsData(page));
268 stack->off = FirstOffsetNumber;
272 low = FirstOffsetNumber;
273 high = PageGetMaxOffsetNumber(page);
277 stack->off = FirstOffsetNumber;
285 OffsetNumber mid = low + ((high - low) / 2);
288 itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, mid));
289 result = compareAttEntries(btree->ginstate,
290 btree->entryAttnum, btree->entryValue,
291 gintuple_get_attrnum(btree->ginstate, itup),
292 gin_index_getattr(btree->ginstate, itup));
309 entryFindChildPtr(GinBtree btree, Page page, BlockNumber blkno, OffsetNumber storedOff)
312 maxoff = PageGetMaxOffsetNumber(page);
315 Assert(!GinPageIsLeaf(page));
316 Assert(!GinPageIsData(page));
318 /* if page isn't changed, we returns storedOff */
319 if (storedOff >= FirstOffsetNumber && storedOff <= maxoff)
321 itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, storedOff));
322 if (GinItemPointerGetBlockNumber(&(itup)->t_tid) == blkno)
326 * we hope, that needed pointer goes to right. It's true if there
329 for (i = storedOff + 1; i <= maxoff; i++)
331 itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, i));
332 if (GinItemPointerGetBlockNumber(&(itup)->t_tid) == blkno)
335 maxoff = storedOff - 1;
339 for (i = FirstOffsetNumber; i <= maxoff; i++)
341 itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, i));
342 if (GinItemPointerGetBlockNumber(&(itup)->t_tid) == blkno)
346 return InvalidOffsetNumber;
350 entryGetLeftMostPage(GinBtree btree, Page page)
354 Assert(!GinPageIsLeaf(page));
355 Assert(!GinPageIsData(page));
356 Assert(PageGetMaxOffsetNumber(page) >= FirstOffsetNumber);
358 itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, FirstOffsetNumber));
359 return GinItemPointerGetBlockNumber(&(itup)->t_tid);
363 entryIsEnoughSpace(GinBtree btree, Buffer buf, OffsetNumber off)
366 Page page = BufferGetPage(buf);
368 Assert(btree->entry);
369 Assert(!GinPageIsData(page));
373 IndexTuple itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, off));
375 itupsz = MAXALIGN(IndexTupleSize(itup)) + sizeof(ItemIdData);
378 if (PageGetFreeSpace(page) + itupsz >= MAXALIGN(IndexTupleSize(btree->entry)) + sizeof(ItemIdData))
385 * Delete tuple on leaf page if tuples was existed and we
386 * should update it, update old child blkno to new right page
387 * if child split is occured
390 entryPreparePage(GinBtree btree, Page page, OffsetNumber off)
392 BlockNumber ret = InvalidBlockNumber;
394 Assert(btree->entry);
395 Assert(!GinPageIsData(page));
399 Assert(GinPageIsLeaf(page));
400 PageIndexTupleDelete(page, off);
403 if (!GinPageIsLeaf(page) && btree->rightblkno != InvalidBlockNumber)
405 IndexTuple itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, off));
407 ItemPointerSet(&itup->t_tid, btree->rightblkno, InvalidOffsetNumber);
408 ret = btree->rightblkno;
411 btree->rightblkno = InvalidBlockNumber;
417 * Place tuple on page and fills WAL record
420 entryPlaceToPage(GinBtree btree, Buffer buf, OffsetNumber off, XLogRecData **prdata)
422 Page page = BufferGetPage(buf);
423 static XLogRecData rdata[3];
425 static ginxlogInsert data;
429 data.updateBlkno = entryPreparePage(btree, page, off);
431 placed = PageAddItem(page, (Item) btree->entry, IndexTupleSize(btree->entry), off, false, false);
433 elog(ERROR, "failed to add item to index page in \"%s\"",
434 RelationGetRelationName(btree->index));
436 data.node = btree->index->rd_node;
437 data.blkno = BufferGetBlockNumber(buf);
440 data.isDelete = btree->isDelete;
442 data.isLeaf = GinPageIsLeaf(page) ? TRUE : FALSE;
445 * Prevent full page write if child's split occurs. That is needed to
446 * remove incomplete splits while replaying WAL
448 * data.updateBlkno contains new block number (of newly created right
449 * page) for recently splited page.
451 if (data.updateBlkno == InvalidBlockNumber)
453 rdata[0].buffer = buf;
454 rdata[0].buffer_std = TRUE;
455 rdata[0].data = NULL;
457 rdata[0].next = &rdata[1];
461 rdata[cnt].buffer = InvalidBuffer;
462 rdata[cnt].data = (char *) &data;
463 rdata[cnt].len = sizeof(ginxlogInsert);
464 rdata[cnt].next = &rdata[cnt + 1];
467 rdata[cnt].buffer = InvalidBuffer;
468 rdata[cnt].data = (char *) btree->entry;
469 rdata[cnt].len = IndexTupleSize(btree->entry);
470 rdata[cnt].next = NULL;
476 * Returns new tuple with copied value from source tuple.
477 * New tuple will not store posting list
480 copyIndexTuple(IndexTuple itup, Page page)
484 if (GinPageIsLeaf(page) && !GinIsPostingTree(itup))
486 nitup = (IndexTuple) palloc(MAXALIGN(GinGetOrigSizePosting(itup)));
487 memcpy(nitup, itup, GinGetOrigSizePosting(itup));
488 nitup->t_info &= ~INDEX_SIZE_MASK;
489 nitup->t_info |= GinGetOrigSizePosting(itup);
493 nitup = (IndexTuple) palloc(MAXALIGN(IndexTupleSize(itup)));
494 memcpy(nitup, itup, IndexTupleSize(itup));
501 * Place tuple and split page, original buffer(lbuf) leaves untouched,
502 * returns shadow page of lbuf filled new data.
503 * Tuples are distributed between pages by equal size on its, not
507 entrySplitPage(GinBtree btree, Buffer lbuf, Buffer rbuf, OffsetNumber off, XLogRecData **prdata)
509 static XLogRecData rdata[2];
512 separator = InvalidOffsetNumber;
516 static char tupstore[2 * BLCKSZ];
519 leftrightmost = NULL;
520 static ginxlogSplit data;
522 Page lpage = PageGetTempPageCopy(BufferGetPage(lbuf));
523 Page rpage = BufferGetPage(rbuf);
524 Size pageSize = PageGetPageSize(lpage);
527 data.leftChildBlkno = (GinPageIsLeaf(lpage)) ?
528 InvalidOffsetNumber : GinItemPointerGetBlockNumber(&(btree->entry->t_tid));
529 data.updateBlkno = entryPreparePage(btree, lpage, off);
531 maxoff = PageGetMaxOffsetNumber(lpage);
534 for (i = FirstOffsetNumber; i <= maxoff; i++)
538 size = MAXALIGN(IndexTupleSize(btree->entry));
539 memcpy(ptr, btree->entry, size);
541 totalsize += size + sizeof(ItemIdData);
544 itup = (IndexTuple) PageGetItem(lpage, PageGetItemId(lpage, i));
545 size = MAXALIGN(IndexTupleSize(itup));
546 memcpy(ptr, itup, size);
548 totalsize += size + sizeof(ItemIdData);
551 if (off == maxoff + 1)
553 size = MAXALIGN(IndexTupleSize(btree->entry));
554 memcpy(ptr, btree->entry, size);
556 totalsize += size + sizeof(ItemIdData);
559 GinInitPage(rpage, GinPageGetOpaque(lpage)->flags, pageSize);
560 GinInitPage(lpage, GinPageGetOpaque(rpage)->flags, pageSize);
567 for (i = FirstOffsetNumber; i <= maxoff; i++)
569 itup = (IndexTuple) ptr;
571 if (lsize > totalsize / 2)
573 if (separator == InvalidOffsetNumber)
579 leftrightmost = itup;
580 lsize += MAXALIGN(IndexTupleSize(itup)) + sizeof(ItemIdData);
583 if (PageAddItem(page, (Item) itup, IndexTupleSize(itup), InvalidOffsetNumber, false, false) == InvalidOffsetNumber)
584 elog(ERROR, "failed to add item to index page in \"%s\"",
585 RelationGetRelationName(btree->index));
586 ptr += MAXALIGN(IndexTupleSize(itup));
589 btree->entry = copyIndexTuple(leftrightmost, lpage);
590 ItemPointerSet(&(btree->entry)->t_tid, BufferGetBlockNumber(lbuf), InvalidOffsetNumber);
592 btree->rightblkno = BufferGetBlockNumber(rbuf);
594 data.node = btree->index->rd_node;
595 data.rootBlkno = InvalidBlockNumber;
596 data.lblkno = BufferGetBlockNumber(lbuf);
597 data.rblkno = BufferGetBlockNumber(rbuf);
598 data.separator = separator;
601 data.isLeaf = GinPageIsLeaf(lpage) ? TRUE : FALSE;
602 data.isRootSplit = FALSE;
604 rdata[0].buffer = InvalidBuffer;
605 rdata[0].data = (char *) &data;
606 rdata[0].len = sizeof(ginxlogSplit);
607 rdata[0].next = &rdata[1];
609 rdata[1].buffer = InvalidBuffer;
610 rdata[1].data = tupstore;
611 rdata[1].len = MAXALIGN(totalsize);
612 rdata[1].next = NULL;
618 * return newly allocate rightmost tuple
621 ginPageGetLinkItup(Buffer buf)
625 Page page = BufferGetPage(buf);
627 itup = getRightMostTuple(page);
628 nitup = copyIndexTuple(itup, page);
629 ItemPointerSet(&nitup->t_tid, BufferGetBlockNumber(buf), InvalidOffsetNumber);
635 * Fills new root by rightest values from child.
636 * Also called from ginxlog, should not use btree
639 entryFillRoot(GinBtree btree, Buffer root, Buffer lbuf, Buffer rbuf)
644 page = BufferGetPage(root);
646 itup = ginPageGetLinkItup(lbuf);
647 if (PageAddItem(page, (Item) itup, IndexTupleSize(itup), InvalidOffsetNumber, false, false) == InvalidOffsetNumber)
648 elog(ERROR, "failed to add item to index root page");
650 itup = ginPageGetLinkItup(rbuf);
651 if (PageAddItem(page, (Item) itup, IndexTupleSize(itup), InvalidOffsetNumber, false, false) == InvalidOffsetNumber)
652 elog(ERROR, "failed to add item to index root page");
656 prepareEntryScan(GinBtree btree, Relation index, OffsetNumber attnum, Datum value, GinState *ginstate)
658 memset(btree, 0, sizeof(GinBtreeData));
660 btree->isMoveRight = entryIsMoveRight;
661 btree->findChildPage = entryLocateEntry;
662 btree->findItem = entryLocateLeafEntry;
663 btree->findChildPtr = entryFindChildPtr;
664 btree->getLeftMostPage = entryGetLeftMostPage;
665 btree->isEnoughSpace = entryIsEnoughSpace;
666 btree->placeToPage = entryPlaceToPage;
667 btree->splitPage = entrySplitPage;
668 btree->fillRoot = entryFillRoot;
670 btree->index = index;
671 btree->ginstate = ginstate;
672 btree->entryAttnum = attnum;
673 btree->entryValue = value;
675 btree->isDelete = FALSE;
676 btree->searchMode = FALSE;
677 btree->fullScan = FALSE;
678 btree->isBuild = FALSE;