results in wasting much of the space on the preceding page and the last
page for the tuple.)
+Concurrency
+-----------
+
+The entry tree and each posting tree is a B-tree, with right-links connecting
+sibling pages at the same level. This is the same structure that is used in
+the regular B-tree indexam (invented by Lehman & Yao), but we don't support
+scanning a GIN trees backwards, so we don't need left-links.
+
+To avoid deadlocks, B-tree pages must always be locked in the same order:
+left to right, and bottom to top. When searching, the tree is traversed from
+top to bottom, so the lock on the parent page must be released before
+descending to the next level. Concurrent page splits move the keyspace to
+right, so after following a downlink, the page actually containing the key
+we're looking for might be somewhere to the right of the page we landed on.
+In that case, we follow the right-links until we find the page we're looking
+for.
+
+To delete a page, the page's left sibling, the target page, and its parent,
+are locked in that order, and the page is marked as deleted. However, a
+concurrent search might already have read a pointer to the page, and might be
+just about to follow it. A page can be reached via the right-link of its left
+sibling, or via its downlink in the parent.
+
+To prevent a backend from reaching a deleted page via a right-link, when
+following a right-link the lock on the previous page is not released until
+the lock on next page has been acquired.
+
+The downlink is more tricky. A search descending the tree must release the
+lock on the parent page before locking the child, or it could deadlock with
+a concurrent split of the child page; a page split locks the parent, while
+already holding a lock on the child page. However, posting trees are only
+fully searched from left to right, starting from the leftmost leaf. (The
+tree-structure is only needed by insertions, to quickly find the correct
+insert location). So as long as we don't delete the leftmost page on each
+level, a search can never follow a downlink to page that's about to be
+deleted.
+
+The previous paragraph's reasoning only applies to searches, and only to
+posting trees. To protect from inserters following a downlink to a deleted
+page, vacuum simply locks out all concurrent insertions to the posting tree,
+by holding a super-exclusive lock on the posting tree root. Inserters hold a
+pin on the root page, but searches do not, so while new searches cannot begin
+while root page is locked, any already-in-progress scans can continue
+concurrently with vacuum. In the entry tree, we never delete pages.
+
+(This is quite different from the mechanism the btree indexam uses to make
+page-deletions safe; it stamps the deleted pages with an XID and keeps the
+deleted pages around with the right-link intact until all concurrent scans
+have finished.)
+
Limitations
-----------
/* rightmost page */
break;
+ stack->buffer = ginStepRight(stack->buffer, btree->index, access);
stack->blkno = rightlink;
- LockBuffer(stack->buffer, GIN_UNLOCK);
- stack->buffer = ReleaseAndReadBuffer(stack->buffer, btree->index, stack->blkno);
- LockBuffer(stack->buffer, access);
page = BufferGetPage(stack->buffer);
}
return NULL;
}
+/*
+ * Step right from current page.
+ *
+ * The next page is locked first, before releasing the current page. This is
+ * crucial to protect from concurrent page deletion (see comment in
+ * ginDeletePage).
+ */
+Buffer
+ginStepRight(Buffer buffer, Relation index, int lockmode)
+{
+ Buffer nextbuffer;
+ Page page = BufferGetPage(buffer);
+ bool isLeaf = GinPageIsLeaf(page);
+ bool isData = GinPageIsData(page);
+ BlockNumber blkno = GinPageGetOpaque(page)->rightlink;
+
+ nextbuffer = ReadBuffer(index, blkno);
+ LockBuffer(nextbuffer, lockmode);
+ UnlockReleaseBuffer(buffer);
+
+ /* Sanity check that the page we stepped to is of similar kind. */
+ page = BufferGetPage(nextbuffer);
+ if (isLeaf != GinPageIsLeaf(page) || isData != GinPageIsData(page))
+ elog(ERROR, "right sibling of GIN page is of different type");
+
+ /*
+ * Given the proper lock sequence above, we should never land on a
+ * deleted page.
+ */
+ if (GinPageIsDeleted(page))
+ elog(ERROR, "right sibling of GIN page was deleted");
+
+ return nextbuffer;
+}
+
void
freeGinBtreeStack(GinBtreeStack *stack)
{
while ((offset = btree->findChildPtr(btree, page, stack->blkno, InvalidOffsetNumber)) == InvalidOffsetNumber)
{
blkno = GinPageGetOpaque(page)->rightlink;
- LockBuffer(buffer, GIN_UNLOCK);
- ReleaseBuffer(buffer);
if (blkno == InvalidBlockNumber)
+ {
+ UnlockReleaseBuffer(buffer);
break;
- buffer = ReadBuffer(btree->index, blkno);
- LockBuffer(buffer, GIN_EXCLUSIVE);
+ }
+ buffer = ginStepRight(buffer, btree->index, GIN_EXCLUSIVE);
page = BufferGetPage(buffer);
}
{
BlockNumber rightlink = GinPageGetOpaque(page)->rightlink;
- LockBuffer(parent->buffer, GIN_UNLOCK);
-
if (rightlink == InvalidBlockNumber)
{
/*
* rightmost page, but we don't find parent, we should use
* plain search...
*/
+ LockBuffer(parent->buffer, GIN_UNLOCK);
ginFindParents(btree, stack, rootBlkno);
parent = stack->parent;
page = BufferGetPage(parent->buffer);
break;
}
+ parent->buffer = ginStepRight(parent->buffer, btree->index, GIN_EXCLUSIVE);
parent->blkno = rightlink;
- parent->buffer = ReleaseAndReadBuffer(parent->buffer, btree->index, parent->blkno);
- LockBuffer(parent->buffer, GIN_EXCLUSIVE);
page = BufferGetPage(parent->buffer);
}
/*
* We scanned the whole page, so we should take right page
*/
- stack->blkno = GinPageGetOpaque(page)->rightlink;
-
if (GinPageRightMost(page))
return false; /* no more pages */
- LockBuffer(stack->buffer, GIN_UNLOCK);
- stack->buffer = ReleaseAndReadBuffer(stack->buffer,
- btree->index,
- stack->blkno);
- LockBuffer(stack->buffer, GIN_SHARE);
+ stack->buffer = ginStepRight(stack->buffer, btree->index, GIN_SHARE);
+ stack->blkno = BufferGetBlockNumber(stack->buffer);
stack->off = FirstOffsetNumber;
}
GinPostingTreeScan *gdi;
Buffer buffer;
Page page;
- BlockNumber blkno;
/* Descend to the leftmost leaf page */
gdi = ginPrepareScanPostingTree(index, rootPostingTree, TRUE);
if (GinPageRightMost(page))
break; /* no more pages */
- blkno = GinPageGetOpaque(page)->rightlink;
- LockBuffer(buffer, GIN_UNLOCK);
- buffer = ReleaseAndReadBuffer(buffer, index, blkno);
- LockBuffer(buffer, GIN_SHARE);
+ buffer = ginStepRight(buffer, index, GIN_SHARE);
}
UnlockReleaseBuffer(buffer);
entryGetNextItem(GinState *ginstate, GinScanEntry entry)
{
Page page;
- BlockNumber blkno;
for (;;)
{
* It's needed to go by right link. During that we should refind
* first ItemPointer greater that stored
*/
-
- blkno = GinPageGetOpaque(page)->rightlink;
-
- LockBuffer(entry->buffer, GIN_UNLOCK);
- if (blkno == InvalidBlockNumber)
+ if (GinPageRightMost(page))
{
- ReleaseBuffer(entry->buffer);
+ UnlockReleaseBuffer(entry->buffer);
ItemPointerSetInvalid(&entry->curItem);
entry->buffer = InvalidBuffer;
entry->isFinished = TRUE;
return;
}
- entry->buffer = ReleaseAndReadBuffer(entry->buffer,
- ginstate->index,
- blkno);
- LockBuffer(entry->buffer, GIN_SHARE);
+ entry->buffer = ginStepRight(entry->buffer,
+ ginstate->index,
+ GIN_SHARE);
page = BufferGetPage(entry->buffer);
entry->offset = InvalidOffsetNumber;
return hasVoidPage;
}
+/*
+ * Delete a posting tree page.
+ */
static void
ginDeletePage(GinVacuumState *gvs, BlockNumber deleteBlkno, BlockNumber leftBlkno,
BlockNumber parentBlkno, OffsetNumber myoff, bool isParentRoot)
Buffer pBuffer;
Page page,
parentPage;
+ BlockNumber rightlink;
+ /*
+ * Lock the pages in the same order as an insertion would, to avoid
+ * deadlocks: left, then right, then parent.
+ */
+ lBuffer = ReadBufferExtended(gvs->index, MAIN_FORKNUM, leftBlkno,
+ RBM_NORMAL, gvs->strategy);
dBuffer = ReadBufferExtended(gvs->index, MAIN_FORKNUM, deleteBlkno,
RBM_NORMAL, gvs->strategy);
-
- if (leftBlkno != InvalidBlockNumber)
- lBuffer = ReadBufferExtended(gvs->index, MAIN_FORKNUM, leftBlkno,
- RBM_NORMAL, gvs->strategy);
- else
- lBuffer = InvalidBuffer;
-
pBuffer = ReadBufferExtended(gvs->index, MAIN_FORKNUM, parentBlkno,
RBM_NORMAL, gvs->strategy);
+ LockBuffer(lBuffer, GIN_EXCLUSIVE);
LockBuffer(dBuffer, GIN_EXCLUSIVE);
if (!isParentRoot) /* parent is already locked by
* LockBufferForCleanup() */
LockBuffer(pBuffer, GIN_EXCLUSIVE);
- if (leftBlkno != InvalidBlockNumber)
- LockBuffer(lBuffer, GIN_EXCLUSIVE);
START_CRIT_SECTION();
- if (leftBlkno != InvalidBlockNumber)
- {
- BlockNumber rightlink;
-
- page = BufferGetPage(dBuffer);
- rightlink = GinPageGetOpaque(page)->rightlink;
+ /* Unlink the page by changing left sibling's rightlink */
+ page = BufferGetPage(dBuffer);
+ rightlink = GinPageGetOpaque(page)->rightlink;
- page = BufferGetPage(lBuffer);
- GinPageGetOpaque(page)->rightlink = rightlink;
- }
+ page = BufferGetPage(lBuffer);
+ GinPageGetOpaque(page)->rightlink = rightlink;
+ /* Delete downlink from parent */
parentPage = BufferGetPage(pBuffer);
#ifdef USE_ASSERT_CHECKING
do
if (!isParentRoot)
LockBuffer(pBuffer, GIN_UNLOCK);
ReleaseBuffer(pBuffer);
-
- if (leftBlkno != InvalidBlockNumber)
- UnlockReleaseBuffer(lBuffer);
-
+ UnlockReleaseBuffer(lBuffer);
UnlockReleaseBuffer(dBuffer);
END_CRIT_SECTION();
if (GinPageGetOpaque(page)->maxoff < FirstOffsetNumber)
{
- if (!(me->leftBlkno == InvalidBlockNumber && GinPageRightMost(page)))
+ /* we never delete the left- or rightmost branch */
+ if (me->leftBlkno != InvalidBlockNumber && !GinPageRightMost(page))
{
- /* we never delete right most branch */
Assert(!isRoot);
- if (GinPageGetOpaque(page)->maxoff < FirstOffsetNumber)
- {
- ginDeletePage(gvs, blkno, me->leftBlkno, me->parent->blkno, myoff, me->parent->isRoot);
- meDelete = TRUE;
- }
+ ginDeletePage(gvs, blkno, me->leftBlkno, me->parent->blkno, myoff, me->parent->isRoot);
+ meDelete = TRUE;
}
}
extern GinBtreeStack *ginPrepareFindLeafPage(GinBtree btree, BlockNumber blkno);
extern GinBtreeStack *ginFindLeafPage(GinBtree btree, GinBtreeStack *stack);
+extern Buffer ginStepRight(Buffer buffer, Relation index, int lockmode);
extern void freeGinBtreeStack(GinBtreeStack *stack);
extern void ginInsertValue(GinBtree btree, GinBtreeStack *stack,
GinStatsData *buildStats);