1 /*-------------------------------------------------------------------------
4 * page utilities routines for the postgres inverted index access method.
7 * Portions Copyright (c) 1996-2008, PostgreSQL Global Development Group
8 * Portions Copyright (c) 1994, Regents of the University of California
11 * $PostgreSQL: pgsql/src/backend/access/gin/gindatapage.c,v 1.10 2008/05/12 00:00:44 alvherre Exp $
12 *-------------------------------------------------------------------------
17 #include "access/gin.h"
18 #include "storage/bufmgr.h"
21 compareItemPointers(ItemPointer a, ItemPointer b)
23 if (GinItemPointerGetBlockNumber(a) == GinItemPointerGetBlockNumber(b))
25 if (GinItemPointerGetOffsetNumber(a) == GinItemPointerGetOffsetNumber(b))
27 return (GinItemPointerGetOffsetNumber(a) > GinItemPointerGetOffsetNumber(b)) ? 1 : -1;
30 return (GinItemPointerGetBlockNumber(a) > GinItemPointerGetBlockNumber(b)) ? 1 : -1;
34 * Merge two ordered array of itempointer
37 MergeItemPointers(ItemPointerData *dst, ItemPointerData *a, uint32 na, ItemPointerData *b, uint32 nb)
39 ItemPointerData *dptr = dst;
40 ItemPointerData *aptr = a,
43 while (aptr - a < na && bptr - b < nb)
45 if (compareItemPointers(aptr, bptr) > 0)
59 * Checks, should we move to right link...
60 * Compares inserting itemp pointer with right bound of current page
63 dataIsMoveRight(GinBtree btree, Page page)
65 ItemPointer iptr = GinDataPageGetRightBound(page);
67 if (GinPageRightMost(page))
70 return (compareItemPointers(btree->items + btree->curitem, iptr) > 0) ? TRUE : FALSE;
74 * Find correct PostingItem in non-leaf page. It supposed that page
75 * correctly chosen and searching value SHOULD be on page
78 dataLocateItem(GinBtree btree, GinBtreeStack *stack)
83 PostingItem *pitem = NULL;
85 Page page = BufferGetPage(stack->buffer);
87 Assert(!GinPageIsLeaf(page));
88 Assert(GinPageIsData(page));
92 stack->off = FirstOffsetNumber;
93 stack->predictNumber *= GinPageGetOpaque(page)->maxoff;
94 return btree->getLeftMostPage(btree, page);
97 low = FirstOffsetNumber;
98 maxoff = high = GinPageGetOpaque(page)->maxoff;
105 OffsetNumber mid = low + ((high - low) / 2);
107 pitem = (PostingItem *) GinDataPageGetItem(page, mid);
112 * Right infinity, page already correctly chosen with a help of
118 pitem = (PostingItem *) GinDataPageGetItem(page, mid);
119 result = compareItemPointers(btree->items + btree->curitem, &(pitem->key));
125 return PostingItemGetBlockNumber(pitem);
133 Assert(high >= FirstOffsetNumber && high <= maxoff);
136 pitem = (PostingItem *) GinDataPageGetItem(page, high);
137 return PostingItemGetBlockNumber(pitem);
141 * Searches correct position for value on leaf page.
142 * Page should be correctly chosen.
143 * Returns true if value found on page.
146 dataLocateLeafItem(GinBtree btree, GinBtreeStack *stack)
148 Page page = BufferGetPage(stack->buffer);
153 Assert(GinPageIsLeaf(page));
154 Assert(GinPageIsData(page));
158 stack->off = FirstOffsetNumber;
162 low = FirstOffsetNumber;
163 high = GinPageGetOpaque(page)->maxoff;
167 stack->off = FirstOffsetNumber;
175 OffsetNumber mid = low + ((high - low) / 2);
177 result = compareItemPointers(btree->items + btree->curitem, (ItemPointer) GinDataPageGetItem(page, mid));
195 * Finds links to blkno on non-leaf page, returns
196 * offset of PostingItem
199 dataFindChildPtr(GinBtree btree, Page page, BlockNumber blkno, OffsetNumber storedOff)
202 maxoff = GinPageGetOpaque(page)->maxoff;
205 Assert(!GinPageIsLeaf(page));
206 Assert(GinPageIsData(page));
208 /* if page isn't changed, we returns storedOff */
209 if (storedOff >= FirstOffsetNumber && storedOff <= maxoff)
211 pitem = (PostingItem *) GinDataPageGetItem(page, storedOff);
212 if (PostingItemGetBlockNumber(pitem) == blkno)
216 * we hope, that needed pointer goes to right. It's true if there
219 for (i = storedOff + 1; i <= maxoff; i++)
221 pitem = (PostingItem *) GinDataPageGetItem(page, i);
222 if (PostingItemGetBlockNumber(pitem) == blkno)
226 maxoff = storedOff - 1;
230 for (i = FirstOffsetNumber; i <= maxoff; i++)
232 pitem = (PostingItem *) GinDataPageGetItem(page, i);
233 if (PostingItemGetBlockNumber(pitem) == blkno)
237 return InvalidOffsetNumber;
241 * returns blkno of leftmost child
244 dataGetLeftMostPage(GinBtree btree, Page page)
248 Assert(!GinPageIsLeaf(page));
249 Assert(GinPageIsData(page));
250 Assert(GinPageGetOpaque(page)->maxoff >= FirstOffsetNumber);
252 pitem = (PostingItem *) GinDataPageGetItem(page, FirstOffsetNumber);
253 return PostingItemGetBlockNumber(pitem);
257 * add ItemPointer or PostingItem to page. data should point to
258 * correct value! depending on leaf or non-leaf page
261 GinDataPageAddItem(Page page, void *data, OffsetNumber offset)
263 OffsetNumber maxoff = GinPageGetOpaque(page)->maxoff;
266 if (offset == InvalidOffsetNumber)
268 ptr = GinDataPageGetItem(page, maxoff + 1);
272 ptr = GinDataPageGetItem(page, offset);
273 if (maxoff + 1 - offset != 0)
274 memmove(ptr + GinSizeOfItem(page), ptr, (maxoff - offset + 1) * GinSizeOfItem(page));
276 memcpy(ptr, data, GinSizeOfItem(page));
278 GinPageGetOpaque(page)->maxoff++;
282 * Deletes posting item from non-leaf page
285 PageDeletePostingItem(Page page, OffsetNumber offset)
287 OffsetNumber maxoff = GinPageGetOpaque(page)->maxoff;
289 Assert(!GinPageIsLeaf(page));
290 Assert(offset >= FirstOffsetNumber && offset <= maxoff);
292 if (offset != maxoff)
293 memmove(GinDataPageGetItem(page, offset), GinDataPageGetItem(page, offset + 1),
294 sizeof(PostingItem) * (maxoff - offset));
296 GinPageGetOpaque(page)->maxoff--;
300 * checks space to install new value,
301 * item pointer never deletes!
304 dataIsEnoughSpace(GinBtree btree, Buffer buf, OffsetNumber off)
306 Page page = BufferGetPage(buf);
308 Assert(GinPageIsData(page));
309 Assert(!btree->isDelete);
311 if (GinPageIsLeaf(page))
313 if (GinPageRightMost(page) && off > GinPageGetOpaque(page)->maxoff)
315 if ((btree->nitem - btree->curitem) * sizeof(ItemPointerData) <= GinDataPageGetFreeSpace(page))
318 else if (sizeof(ItemPointerData) <= GinDataPageGetFreeSpace(page))
321 else if (sizeof(PostingItem) <= GinDataPageGetFreeSpace(page))
328 * In case of previous split update old child blkno to
330 * item pointer never deletes!
333 dataPrepareData(GinBtree btree, Page page, OffsetNumber off)
335 BlockNumber ret = InvalidBlockNumber;
337 Assert(GinPageIsData(page));
339 if (!GinPageIsLeaf(page) && btree->rightblkno != InvalidBlockNumber)
341 PostingItem *pitem = (PostingItem *) GinDataPageGetItem(page, off);
343 PostingItemSetBlockNumber(pitem, btree->rightblkno);
344 ret = btree->rightblkno;
347 btree->rightblkno = InvalidBlockNumber;
353 * Places keys to page and fills WAL record. In case leaf page and
354 * build mode puts all ItemPointers to page.
357 dataPlaceToPage(GinBtree btree, Buffer buf, OffsetNumber off, XLogRecData **prdata)
359 Page page = BufferGetPage(buf);
360 static XLogRecData rdata[3];
361 int sizeofitem = GinSizeOfItem(page);
362 static ginxlogInsert data;
366 Assert(GinPageIsData(page));
368 data.updateBlkno = dataPrepareData(btree, page, off);
370 data.node = btree->index->rd_node;
371 data.blkno = BufferGetBlockNumber(buf);
374 data.isDelete = FALSE;
376 data.isLeaf = GinPageIsLeaf(page) ? TRUE : FALSE;
379 * Prevent full page write if child's split occurs. That is needed to
380 * remove incomplete splits while replaying WAL
382 * data.updateBlkno contains new block number (of newly created right
383 * page) for recently splited page.
385 if (data.updateBlkno == InvalidBlockNumber)
387 rdata[0].buffer = buf;
388 rdata[0].buffer_std = FALSE;
389 rdata[0].data = NULL;
391 rdata[0].next = &rdata[1];
395 rdata[cnt].buffer = InvalidBuffer;
396 rdata[cnt].data = (char *) &data;
397 rdata[cnt].len = sizeof(ginxlogInsert);
398 rdata[cnt].next = &rdata[cnt + 1];
401 rdata[cnt].buffer = InvalidBuffer;
402 rdata[cnt].data = (GinPageIsLeaf(page)) ? ((char *) (btree->items + btree->curitem)) : ((char *) &(btree->pitem));
403 rdata[cnt].len = sizeofitem;
404 rdata[cnt].next = NULL;
406 if (GinPageIsLeaf(page))
408 if (GinPageRightMost(page) && off > GinPageGetOpaque(page)->maxoff)
410 /* usually, create index... */
411 uint32 savedPos = btree->curitem;
413 while (btree->curitem < btree->nitem)
415 GinDataPageAddItem(page, btree->items + btree->curitem, off);
419 data.nitem = btree->curitem - savedPos;
420 rdata[cnt].len = sizeofitem * data.nitem;
424 GinDataPageAddItem(page, btree->items + btree->curitem, off);
429 GinDataPageAddItem(page, &(btree->pitem), off);
433 * split page and fills WAL record. original buffer(lbuf) leaves untouched,
434 * returns shadow page of lbuf filled new data. In leaf page and build mode puts all
435 * ItemPointers to pages. Also, in build mode splits data by way to full fulled
439 dataSplitPage(GinBtree btree, Buffer lbuf, Buffer rbuf, OffsetNumber off, XLogRecData **prdata)
441 static ginxlogSplit data;
442 static XLogRecData rdata[4];
443 static char vector[2 * BLCKSZ];
445 OffsetNumber separator;
447 Page lpage = GinPageGetCopyPage(BufferGetPage(lbuf));
448 ItemPointerData oldbound = *GinDataPageGetRightBound(lpage);
449 int sizeofitem = GinSizeOfItem(lpage);
450 OffsetNumber maxoff = GinPageGetOpaque(lpage)->maxoff;
451 Page rpage = BufferGetPage(rbuf);
452 Size pageSize = PageGetPageSize(lpage);
456 GinInitPage(rpage, GinPageGetOpaque(lpage)->flags, pageSize);
457 freeSpace = GinDataPageGetFreeSpace(rpage);
460 data.leftChildBlkno = (GinPageIsLeaf(lpage)) ?
461 InvalidOffsetNumber : PostingItemGetBlockNumber(&(btree->pitem));
462 data.updateBlkno = dataPrepareData(btree, lpage, off);
464 memcpy(vector, GinDataPageGetItem(lpage, FirstOffsetNumber),
465 maxoff * sizeofitem);
467 if (GinPageIsLeaf(lpage) && GinPageRightMost(lpage) && off > GinPageGetOpaque(lpage)->maxoff)
470 while (btree->curitem < btree->nitem && maxoff * sizeof(ItemPointerData) < 2 * (freeSpace - sizeof(ItemPointerData)))
472 memcpy(vector + maxoff * sizeof(ItemPointerData), btree->items + btree->curitem,
473 sizeof(ItemPointerData));
481 ptr = vector + (off - 1) * sizeofitem;
482 if (maxoff + 1 - off != 0)
483 memmove(ptr + sizeofitem, ptr, (maxoff - off + 1) * sizeofitem);
484 if (GinPageIsLeaf(lpage))
486 memcpy(ptr, btree->items + btree->curitem, sizeofitem);
490 memcpy(ptr, &(btree->pitem), sizeofitem);
496 * we suppose that during index creation table scaned from begin to end,
497 * so ItemPointers are monotonically increased..
499 if (btree->isBuild && GinPageRightMost(lpage))
500 separator = freeSpace / sizeofitem;
502 separator = maxoff / 2;
504 GinInitPage(rpage, GinPageGetOpaque(lpage)->flags, pageSize);
505 GinInitPage(lpage, GinPageGetOpaque(rpage)->flags, pageSize);
507 memcpy(GinDataPageGetItem(lpage, FirstOffsetNumber), vector, separator * sizeofitem);
508 GinPageGetOpaque(lpage)->maxoff = separator;
509 memcpy(GinDataPageGetItem(rpage, FirstOffsetNumber),
510 vector + separator * sizeofitem, (maxoff - separator) * sizeofitem);
511 GinPageGetOpaque(rpage)->maxoff = maxoff - separator;
513 PostingItemSetBlockNumber(&(btree->pitem), BufferGetBlockNumber(lbuf));
514 if (GinPageIsLeaf(lpage))
515 btree->pitem.key = *(ItemPointerData *) GinDataPageGetItem(lpage,
516 GinPageGetOpaque(lpage)->maxoff);
518 btree->pitem.key = ((PostingItem *) GinDataPageGetItem(lpage,
519 GinPageGetOpaque(lpage)->maxoff))->key;
520 btree->rightblkno = BufferGetBlockNumber(rbuf);
522 /* set up right bound for left page */
523 bound = GinDataPageGetRightBound(lpage);
524 *bound = btree->pitem.key;
526 /* set up right bound for right page */
527 bound = GinDataPageGetRightBound(rpage);
530 data.node = btree->index->rd_node;
531 data.rootBlkno = InvalidBlockNumber;
532 data.lblkno = BufferGetBlockNumber(lbuf);
533 data.rblkno = BufferGetBlockNumber(rbuf);
534 data.separator = separator;
537 data.isLeaf = GinPageIsLeaf(lpage) ? TRUE : FALSE;
538 data.isRootSplit = FALSE;
539 data.rightbound = oldbound;
541 rdata[0].buffer = InvalidBuffer;
542 rdata[0].data = (char *) &data;
543 rdata[0].len = sizeof(ginxlogSplit);
544 rdata[0].next = &rdata[1];
546 rdata[1].buffer = InvalidBuffer;
547 rdata[1].data = vector;
548 rdata[1].len = MAXALIGN(maxoff * sizeofitem);
549 rdata[1].next = NULL;
555 * Fills new root by right bound values from child.
556 * Also called from ginxlog, should not use btree
559 dataFillRoot(GinBtree btree, Buffer root, Buffer lbuf, Buffer rbuf)
561 Page page = BufferGetPage(root),
562 lpage = BufferGetPage(lbuf),
563 rpage = BufferGetPage(rbuf);
567 li.key = *GinDataPageGetRightBound(lpage);
568 PostingItemSetBlockNumber(&li, BufferGetBlockNumber(lbuf));
569 GinDataPageAddItem(page, &li, InvalidOffsetNumber);
571 ri.key = *GinDataPageGetRightBound(rpage);
572 PostingItemSetBlockNumber(&ri, BufferGetBlockNumber(rbuf));
573 GinDataPageAddItem(page, &ri, InvalidOffsetNumber);
577 prepareDataScan(GinBtree btree, Relation index)
579 memset(btree, 0, sizeof(GinBtreeData));
580 btree->index = index;
581 btree->isMoveRight = dataIsMoveRight;
582 btree->findChildPage = dataLocateItem;
583 btree->findItem = dataLocateLeafItem;
584 btree->findChildPtr = dataFindChildPtr;
585 btree->getLeftMostPage = dataGetLeftMostPage;
586 btree->isEnoughSpace = dataIsEnoughSpace;
587 btree->placeToPage = dataPlaceToPage;
588 btree->splitPage = dataSplitPage;
589 btree->fillRoot = dataFillRoot;
591 btree->searchMode = FALSE;
592 btree->isDelete = FALSE;
593 btree->fullScan = FALSE;
594 btree->isBuild = FALSE;
598 prepareScanPostingTree(Relation index, BlockNumber rootBlkno, bool searchMode)
600 GinPostingTreeScan *gdi = (GinPostingTreeScan *) palloc0(sizeof(GinPostingTreeScan));
602 prepareDataScan(&gdi->btree, index);
604 gdi->btree.searchMode = searchMode;
605 gdi->btree.fullScan = searchMode;
607 gdi->stack = ginPrepareFindLeafPage(&gdi->btree, rootBlkno);
613 * Inserts array of item pointers, may execute several tree scan (very rare)
616 insertItemPointer(GinPostingTreeScan *gdi, ItemPointerData *items, uint32 nitem)
618 BlockNumber rootBlkno = gdi->stack->blkno;
620 gdi->btree.items = items;
621 gdi->btree.nitem = nitem;
622 gdi->btree.curitem = 0;
624 while (gdi->btree.curitem < gdi->btree.nitem)
627 gdi->stack = ginPrepareFindLeafPage(&gdi->btree, rootBlkno);
629 gdi->stack = ginFindLeafPage(&gdi->btree, gdi->stack);
631 if (gdi->btree.findItem(&(gdi->btree), gdi->stack))
632 elog(ERROR, "item pointer (%u,%d) already exists",
633 ItemPointerGetBlockNumber(gdi->btree.items + gdi->btree.curitem),
634 ItemPointerGetOffsetNumber(gdi->btree.items + gdi->btree.curitem));
636 ginInsertValue(&(gdi->btree), gdi->stack);
643 scanBeginPostingTree(GinPostingTreeScan *gdi)
645 gdi->stack = ginFindLeafPage(&gdi->btree, gdi->stack);
646 return gdi->stack->buffer;