1 /*-------------------------------------------------------------------------
4 * page utilities routines for the postgres inverted index access method.
7 * Portions Copyright (c) 1996-2015, PostgreSQL Global Development Group
8 * Portions Copyright (c) 1994, Regents of the University of California
11 * src/backend/access/gin/ginbtree.c
12 *-------------------------------------------------------------------------
17 #include "access/gin_private.h"
18 #include "access/xloginsert.h"
19 #include "miscadmin.h"
20 #include "utils/rel.h"
22 static void ginFindParents(GinBtree btree, GinBtreeStack *stack);
23 static bool ginPlaceToPage(GinBtree btree, GinBtreeStack *stack,
24 void *insertdata, BlockNumber updateblkno,
25 Buffer childbuf, GinStatsData *buildStats);
26 static void ginFinishSplit(GinBtree btree, GinBtreeStack *stack,
27 bool freestack, GinStatsData *buildStats);
30 * Lock buffer by needed method for search.
33 ginTraverseLock(Buffer buffer, bool searchMode)
36 int access = GIN_SHARE;
38 LockBuffer(buffer, GIN_SHARE);
39 page = BufferGetPage(buffer);
40 if (GinPageIsLeaf(page))
42 if (searchMode == FALSE)
44 /* we should relock our page */
45 LockBuffer(buffer, GIN_UNLOCK);
46 LockBuffer(buffer, GIN_EXCLUSIVE);
48 /* But root can become non-leaf during relock */
49 if (!GinPageIsLeaf(page))
51 /* restore old lock type (very rare) */
52 LockBuffer(buffer, GIN_UNLOCK);
53 LockBuffer(buffer, GIN_SHARE);
56 access = GIN_EXCLUSIVE;
64 * Descend the tree to the leaf page that contains or would contain the key
65 * we're searching for. The key should already be filled in 'btree', in
66 * tree-type specific manner. If btree->fullScan is true, descends to the
69 * If 'searchmode' is false, on return stack->buffer is exclusively locked,
70 * and the stack represents the full path to the root. Otherwise stack->buffer
71 * is share-locked, and stack->parent is NULL.
74 ginFindLeafPage(GinBtree btree, bool searchMode)
78 stack = (GinBtreeStack *) palloc(sizeof(GinBtreeStack));
79 stack->blkno = btree->rootBlkno;
80 stack->buffer = ReadBuffer(btree->index, btree->rootBlkno);
82 stack->predictNumber = 1;
90 stack->off = InvalidOffsetNumber;
92 page = BufferGetPage(stack->buffer);
94 access = ginTraverseLock(stack->buffer, searchMode);
97 * If we're going to modify the tree, finish any incomplete splits we
98 * encounter on the way.
100 if (!searchMode && GinPageIsIncompleteSplit(page))
101 ginFinishSplit(btree, stack, false, NULL);
104 * ok, page is correctly locked, we should check to move right ..,
105 * root never has a right link, so small optimization
107 while (btree->fullScan == FALSE && stack->blkno != btree->rootBlkno &&
108 btree->isMoveRight(btree, page))
110 BlockNumber rightlink = GinPageGetOpaque(page)->rightlink;
112 if (rightlink == InvalidBlockNumber)
116 stack->buffer = ginStepRight(stack->buffer, btree->index, access);
117 stack->blkno = rightlink;
118 page = BufferGetPage(stack->buffer);
120 if (!searchMode && GinPageIsIncompleteSplit(page))
121 ginFinishSplit(btree, stack, false, NULL);
124 if (GinPageIsLeaf(page)) /* we found, return locked page */
127 /* now we have correct buffer, try to find child */
128 child = btree->findChildPage(btree, stack);
130 LockBuffer(stack->buffer, GIN_UNLOCK);
131 Assert(child != InvalidBlockNumber);
132 Assert(stack->blkno != child);
136 /* in search mode we may forget path to leaf */
137 stack->blkno = child;
138 stack->buffer = ReleaseAndReadBuffer(stack->buffer, btree->index, stack->blkno);
142 GinBtreeStack *ptr = (GinBtreeStack *) palloc(sizeof(GinBtreeStack));
146 stack->blkno = child;
147 stack->buffer = ReadBuffer(btree->index, stack->blkno);
148 stack->predictNumber = 1;
154 * Step right from current page.
156 * The next page is locked first, before releasing the current page. This is
157 * crucial to protect from concurrent page deletion (see comment in
161 ginStepRight(Buffer buffer, Relation index, int lockmode)
164 Page page = BufferGetPage(buffer);
165 bool isLeaf = GinPageIsLeaf(page);
166 bool isData = GinPageIsData(page);
167 BlockNumber blkno = GinPageGetOpaque(page)->rightlink;
169 nextbuffer = ReadBuffer(index, blkno);
170 LockBuffer(nextbuffer, lockmode);
171 UnlockReleaseBuffer(buffer);
173 /* Sanity check that the page we stepped to is of similar kind. */
174 page = BufferGetPage(nextbuffer);
175 if (isLeaf != GinPageIsLeaf(page) || isData != GinPageIsData(page))
176 elog(ERROR, "right sibling of GIN page is of different type");
179 * Given the proper lock sequence above, we should never land on a deleted
182 if (GinPageIsDeleted(page))
183 elog(ERROR, "right sibling of GIN page was deleted");
189 freeGinBtreeStack(GinBtreeStack *stack)
193 GinBtreeStack *tmp = stack->parent;
195 if (stack->buffer != InvalidBuffer)
196 ReleaseBuffer(stack->buffer);
204 * Try to find parent for current stack position. Returns correct parent and
205 * child's offset in stack->parent. The root page is never released, to
206 * to prevent conflict with vacuum process.
209 ginFindParents(GinBtree btree, GinBtreeStack *stack)
220 * Unwind the stack all the way up to the root, leaving only the root
223 * Be careful not to release the pin on the root page! The pin on root
224 * page is required to lock out concurrent vacuums on the tree.
226 root = stack->parent;
229 ReleaseBuffer(root->buffer);
233 Assert(root->blkno == btree->rootBlkno);
234 Assert(BufferGetBlockNumber(root->buffer) == btree->rootBlkno);
235 root->off = InvalidOffsetNumber;
238 buffer = root->buffer;
239 offset = InvalidOffsetNumber;
241 ptr = (GinBtreeStack *) palloc(sizeof(GinBtreeStack));
245 LockBuffer(buffer, GIN_EXCLUSIVE);
246 page = BufferGetPage(buffer);
247 if (GinPageIsLeaf(page))
248 elog(ERROR, "Lost path");
250 if (GinPageIsIncompleteSplit(page))
252 Assert(blkno != btree->rootBlkno);
254 ptr->buffer = buffer;
257 * parent may be wrong, but if so, the ginFinishSplit call will
258 * recurse to call ginFindParents again to fix it.
261 ptr->off = InvalidOffsetNumber;
263 ginFinishSplit(btree, ptr, false, NULL);
266 leftmostBlkno = btree->getLeftMostChild(btree, page);
268 while ((offset = btree->findChildPtr(btree, page, stack->blkno, InvalidOffsetNumber)) == InvalidOffsetNumber)
270 blkno = GinPageGetOpaque(page)->rightlink;
271 if (blkno == InvalidBlockNumber)
273 UnlockReleaseBuffer(buffer);
276 buffer = ginStepRight(buffer, btree->index, GIN_EXCLUSIVE);
277 page = BufferGetPage(buffer);
279 /* finish any incomplete splits, as above */
280 if (GinPageIsIncompleteSplit(page))
282 Assert(blkno != btree->rootBlkno);
284 ptr->buffer = buffer;
286 ptr->off = InvalidOffsetNumber;
288 ginFinishSplit(btree, ptr, false, NULL);
292 if (blkno != InvalidBlockNumber)
295 ptr->buffer = buffer;
296 ptr->parent = root; /* it may be wrong, but in next call we will
303 /* Descend down to next level */
304 blkno = leftmostBlkno;
305 buffer = ReadBuffer(btree->index, blkno);
310 * Insert a new item to a page.
312 * Returns true if the insertion was finished. On false, the page was split and
313 * the parent needs to be updated. (a root split returns true as it doesn't
314 * need any further action by the caller to complete)
316 * When inserting a downlink to a internal page, 'childbuf' contains the
317 * child page that was split. Its GIN_INCOMPLETE_SPLIT flag will be cleared
318 * atomically with the insert. Also, the existing item at the given location
319 * is updated to point to 'updateblkno'.
321 * stack->buffer is locked on entry, and is kept locked.
324 ginPlaceToPage(GinBtree btree, GinBtreeStack *stack,
325 void *insertdata, BlockNumber updateblkno,
326 Buffer childbuf, GinStatsData *buildStats)
328 Page page = BufferGetPage(stack->buffer);
331 Page childpage = NULL;
332 Page newlpage = NULL,
335 if (GinPageIsData(page))
336 xlflags |= GIN_INSERT_ISDATA;
337 if (GinPageIsLeaf(page))
339 xlflags |= GIN_INSERT_ISLEAF;
340 Assert(!BufferIsValid(childbuf));
341 Assert(updateblkno == InvalidBlockNumber);
345 Assert(BufferIsValid(childbuf));
346 Assert(updateblkno != InvalidBlockNumber);
347 childpage = BufferGetPage(childbuf);
351 * Try to put the incoming tuple on the page. placeToPage will decide if
352 * the page needs to be split.
354 * WAL-logging this operation is a bit funny:
356 * We're responsible for calling XLogBeginInsert() and XLogInsert().
357 * XLogBeginInsert() must be called before placeToPage, because
358 * placeToPage can register some data to the WAL record.
360 * If placeToPage returns INSERTED, placeToPage has already called
361 * START_CRIT_SECTION(), and we're responsible for calling
362 * END_CRIT_SECTION. When it returns INSERTED, it is also responsible for
363 * registering any data required to replay the operation with
364 * XLogRegisterData(0, ...). It may only add data to block index 0; the
365 * main data of the WAL record is reserved for this function.
367 * If placeToPage returns SPLIT, we're wholly responsible for WAL logging.
368 * Splits happen infrequently, so we just make a full-page image of all
369 * the pages involved.
372 if (RelationNeedsWAL(btree->index))
375 rc = btree->placeToPage(btree, stack->buffer, stack,
376 insertdata, updateblkno,
377 &newlpage, &newrpage);
378 if (rc == UNMODIFIED)
380 XLogResetInsertion();
383 else if (rc == INSERTED)
385 /* placeToPage did START_CRIT_SECTION() */
386 MarkBufferDirty(stack->buffer);
388 /* An insert to an internal page finishes the split of the child. */
389 if (childbuf != InvalidBuffer)
391 GinPageGetOpaque(childpage)->flags &= ~GIN_INCOMPLETE_SPLIT;
392 MarkBufferDirty(childbuf);
395 if (RelationNeedsWAL(btree->index))
399 BlockIdData childblknos[2];
402 * placetopage already registered stack->buffer as block 0.
404 xlrec.flags = xlflags;
406 if (childbuf != InvalidBuffer)
407 XLogRegisterBuffer(1, childbuf, REGBUF_STANDARD);
409 XLogRegisterData((char *) &xlrec, sizeof(ginxlogInsert));
412 * Log information about child if this was an insertion of a
415 if (childbuf != InvalidBuffer)
417 BlockIdSet(&childblknos[0], BufferGetBlockNumber(childbuf));
418 BlockIdSet(&childblknos[1], GinPageGetOpaque(childpage)->rightlink);
419 XLogRegisterData((char *) childblknos,
420 sizeof(BlockIdData) * 2);
423 recptr = XLogInsert(RM_GIN_ID, XLOG_GIN_INSERT);
424 PageSetLSN(page, recptr);
425 if (childbuf != InvalidBuffer)
426 PageSetLSN(childpage, recptr);
433 else if (rc == SPLIT)
435 /* Didn't fit, had to split */
437 BlockNumber savedRightLink;
439 Buffer lbuffer = InvalidBuffer;
440 Page newrootpg = NULL;
442 rbuffer = GinNewBuffer(btree->index);
444 /* During index build, count the new page */
448 buildStats->nDataPages++;
450 buildStats->nEntryPages++;
453 savedRightLink = GinPageGetOpaque(page)->rightlink;
456 * newlpage and newrpage are pointers to memory pages, not associated
457 * with buffers. stack->buffer is not touched yet.
460 data.node = btree->index->rd_node;
461 data.flags = xlflags;
462 if (childbuf != InvalidBuffer)
464 Page childpage = BufferGetPage(childbuf);
466 GinPageGetOpaque(childpage)->flags &= ~GIN_INCOMPLETE_SPLIT;
468 data.leftChildBlkno = BufferGetBlockNumber(childbuf);
469 data.rightChildBlkno = GinPageGetOpaque(childpage)->rightlink;
472 data.leftChildBlkno = data.rightChildBlkno = InvalidBlockNumber;
474 if (stack->parent == NULL)
477 * split root, so we need to allocate new left page and place
478 * pointer on root to left and right page
480 lbuffer = GinNewBuffer(btree->index);
482 /* During index build, count the newly-added root page */
486 buildStats->nDataPages++;
488 buildStats->nEntryPages++;
491 data.rrlink = InvalidBlockNumber;
492 data.flags |= GIN_SPLIT_ROOT;
494 GinPageGetOpaque(newrpage)->rightlink = InvalidBlockNumber;
495 GinPageGetOpaque(newlpage)->rightlink = BufferGetBlockNumber(rbuffer);
498 * Construct a new root page containing downlinks to the new left
499 * and right pages. (do this in a temporary copy first rather than
500 * overwriting the original page directly, so that we can still
501 * abort gracefully if this fails.)
503 newrootpg = PageGetTempPage(newrpage);
504 GinInitPage(newrootpg, GinPageGetOpaque(newlpage)->flags & ~(GIN_LEAF | GIN_COMPRESSED), BLCKSZ);
506 btree->fillRoot(btree, newrootpg,
507 BufferGetBlockNumber(lbuffer), newlpage,
508 BufferGetBlockNumber(rbuffer), newrpage);
512 /* split non-root page */
513 data.rrlink = savedRightLink;
515 GinPageGetOpaque(newrpage)->rightlink = savedRightLink;
516 GinPageGetOpaque(newlpage)->flags |= GIN_INCOMPLETE_SPLIT;
517 GinPageGetOpaque(newlpage)->rightlink = BufferGetBlockNumber(rbuffer);
521 * Ok, we have the new contents of the left page in a temporary copy
522 * now (newlpage), and the newly-allocated right block has been filled
523 * in. The original page is still unchanged.
525 * If this is a root split, we also have a temporary page containing
526 * the new contents of the root. Copy the new left page to a
527 * newly-allocated block, and initialize the (original) root page the
528 * new copy. Otherwise, copy over the temporary copy of the new left
529 * page over the old left page.
532 START_CRIT_SECTION();
534 MarkBufferDirty(rbuffer);
535 MarkBufferDirty(stack->buffer);
536 if (BufferIsValid(childbuf))
537 MarkBufferDirty(childbuf);
540 * Restore the temporary copies over the real buffers. But don't free
541 * the temporary copies yet, WAL record data points to them.
543 if (stack->parent == NULL)
545 MarkBufferDirty(lbuffer);
546 memcpy(BufferGetPage(stack->buffer), newrootpg, BLCKSZ);
547 memcpy(BufferGetPage(lbuffer), newlpage, BLCKSZ);
548 memcpy(BufferGetPage(rbuffer), newrpage, BLCKSZ);
552 memcpy(BufferGetPage(stack->buffer), newlpage, BLCKSZ);
553 memcpy(BufferGetPage(rbuffer), newrpage, BLCKSZ);
556 /* write WAL record */
557 if (RelationNeedsWAL(btree->index))
562 * We just take full page images of all the split pages. Splits
563 * are uncommon enough that it's not worth complicating the code
564 * to be more efficient.
566 if (stack->parent == NULL)
568 XLogRegisterBuffer(0, lbuffer, REGBUF_FORCE_IMAGE | REGBUF_STANDARD);
569 XLogRegisterBuffer(1, rbuffer, REGBUF_FORCE_IMAGE | REGBUF_STANDARD);
570 XLogRegisterBuffer(2, stack->buffer, REGBUF_FORCE_IMAGE | REGBUF_STANDARD);
574 XLogRegisterBuffer(0, stack->buffer, REGBUF_FORCE_IMAGE | REGBUF_STANDARD);
575 XLogRegisterBuffer(1, rbuffer, REGBUF_FORCE_IMAGE | REGBUF_STANDARD);
577 if (BufferIsValid(childbuf))
578 XLogRegisterBuffer(3, childbuf, 0);
580 XLogRegisterData((char *) &data, sizeof(ginxlogSplit));
582 recptr = XLogInsert(RM_GIN_ID, XLOG_GIN_SPLIT);
583 PageSetLSN(BufferGetPage(stack->buffer), recptr);
584 PageSetLSN(BufferGetPage(rbuffer), recptr);
585 if (stack->parent == NULL)
586 PageSetLSN(BufferGetPage(lbuffer), recptr);
587 if (BufferIsValid(childbuf))
588 PageSetLSN(childpage, recptr);
593 * We can release the lock on the right page now, but keep the
594 * original buffer locked.
596 UnlockReleaseBuffer(rbuffer);
597 if (stack->parent == NULL)
598 UnlockReleaseBuffer(lbuffer);
606 * If we split the root, we're done. Otherwise the split is not
607 * complete until the downlink for the new page has been inserted to
610 if (stack->parent == NULL)
617 elog(ERROR, "unknown return code from GIN placeToPage method: %d", rc);
618 return false; /* keep compiler quiet */
623 * Finish a split by inserting the downlink for the new page to parent.
625 * On entry, stack->buffer is exclusively locked.
627 * If freestack is true, all the buffers are released and unlocked as we
628 * crawl up the tree, and 'stack' is freed. Otherwise stack->buffer is kept
629 * locked, and stack is unmodified, except for possibly moving right to find
630 * the correct parent of page.
633 ginFinishSplit(GinBtree btree, GinBtreeStack *stack, bool freestack,
634 GinStatsData *buildStats)
641 * freestack == false when we encounter an incompletely split page during
642 * a scan, while freestack == true is used in the normal scenario that a
643 * split is finished right after the initial insert.
646 elog(DEBUG1, "finishing incomplete split of block %u in gin index \"%s\"",
647 stack->blkno, RelationGetRelationName(btree->index));
649 /* this loop crawls up the stack until the insertion is complete */
652 GinBtreeStack *parent = stack->parent;
654 BlockNumber updateblkno;
656 /* search parent to lock */
657 LockBuffer(parent->buffer, GIN_EXCLUSIVE);
660 * If the parent page was incompletely split, finish that split first,
661 * then continue with the current one.
663 * Note: we have to finish *all* incomplete splits we encounter, even
664 * if we have to move right. Otherwise we might choose as the target a
665 * page that has no downlink in the parent, and splitting it further
668 if (GinPageIsIncompleteSplit(BufferGetPage(parent->buffer)))
669 ginFinishSplit(btree, parent, false, buildStats);
671 /* move right if it's needed */
672 page = BufferGetPage(parent->buffer);
673 while ((parent->off = btree->findChildPtr(btree, page, stack->blkno, parent->off)) == InvalidOffsetNumber)
675 if (GinPageRightMost(page))
678 * rightmost page, but we don't find parent, we should use
681 LockBuffer(parent->buffer, GIN_UNLOCK);
682 ginFindParents(btree, stack);
683 parent = stack->parent;
684 Assert(parent != NULL);
688 parent->buffer = ginStepRight(parent->buffer, btree->index, GIN_EXCLUSIVE);
689 parent->blkno = BufferGetBlockNumber(parent->buffer);
690 page = BufferGetPage(parent->buffer);
692 if (GinPageIsIncompleteSplit(BufferGetPage(parent->buffer)))
693 ginFinishSplit(btree, parent, false, buildStats);
696 /* insert the downlink */
697 insertdata = btree->prepareDownlink(btree, stack->buffer);
698 updateblkno = GinPageGetOpaque(BufferGetPage(stack->buffer))->rightlink;
699 done = ginPlaceToPage(btree, parent,
700 insertdata, updateblkno,
701 stack->buffer, buildStats);
705 * If the caller requested to free the stack, unlock and release the
706 * child buffer now. Otherwise keep it pinned and locked, but if we
707 * have to recurse up the tree, we can unlock the upper pages, only
708 * keeping the page at the bottom of the stack locked.
710 if (!first || freestack)
711 LockBuffer(stack->buffer, GIN_UNLOCK);
714 ReleaseBuffer(stack->buffer);
722 /* unlock the parent */
723 LockBuffer(stack->buffer, GIN_UNLOCK);
726 freeGinBtreeStack(stack);
730 * Insert a value to tree described by stack.
732 * The value to be inserted is given in 'insertdata'. Its format depends
733 * on whether this is an entry or data tree, ginInsertValue just passes it
734 * through to the tree-specific callback function.
736 * During an index build, buildStats is non-null and the counters it contains
737 * are incremented as needed.
739 * NB: the passed-in stack is freed, as though by freeGinBtreeStack.
742 ginInsertValue(GinBtree btree, GinBtreeStack *stack, void *insertdata,
743 GinStatsData *buildStats)
747 /* If the leaf page was incompletely split, finish the split first */
748 if (GinPageIsIncompleteSplit(BufferGetPage(stack->buffer)))
749 ginFinishSplit(btree, stack, false, buildStats);
751 done = ginPlaceToPage(btree, stack,
752 insertdata, InvalidBlockNumber,
753 InvalidBuffer, buildStats);
756 LockBuffer(stack->buffer, GIN_UNLOCK);
757 freeGinBtreeStack(stack);
760 ginFinishSplit(btree, stack, true, buildStats);