]> granicus.if.org Git - postgresql/commitdiff
Fix race condition in GIN posting tree page deletion.
authorHeikki Linnakangas <heikki.linnakangas@iki.fi>
Fri, 8 Nov 2013 20:21:42 +0000 (22:21 +0200)
committerHeikki Linnakangas <heikki.linnakangas@iki.fi>
Fri, 8 Nov 2013 20:23:11 +0000 (22:23 +0200)
If a page is deleted, and reused for something else, just as a search is
following a rightlink to it from its left sibling, the search would continue
scanning whatever the new contents of the page are. That could lead to
incorrect query results, or even something more curious if the page is
reused for a different kind of a page.

To fix, modify the search algorithm to lock the next page before releasing
the previous one, and refrain from deleting pages from the leftmost branch
of the tree.

Add a new Concurrency section to the README, explaining why this works.
There is a lot more one could say about concurrency in GIN, but that's for
another patch.

Backpatch to all supported versions.

src/backend/access/gin/README
src/backend/access/gin/ginbtree.c
src/backend/access/gin/ginget.c
src/backend/access/gin/ginvacuum.c
src/include/access/gin_private.h

index 67159d85294f151abb46e0fe87e29336e011dab3..a2634a09279172929e578b7494a4eef408d1eef4 100644 (file)
@@ -210,6 +210,56 @@ fit on one pending-list page must have those pages to itself, even if this
 results in wasting much of the space on the preceding page and the last
 page for the tuple.)
 
+Concurrency
+-----------
+
+The entry tree and each posting tree is a B-tree, with right-links connecting
+sibling pages at the same level. This is the same structure that is used in
+the regular B-tree indexam (invented by Lehman & Yao), but we don't support
+scanning a GIN trees backwards, so we don't need left-links.
+
+To avoid deadlocks, B-tree pages must always be locked in the same order:
+left to right, and bottom to top. When searching, the tree is traversed from
+top to bottom, so the lock on the parent page must be released before
+descending to the next level. Concurrent page splits move the keyspace to
+right, so after following a downlink, the page actually containing the key
+we're looking for might be somewhere to the right of the page we landed on.
+In that case, we follow the right-links until we find the page we're looking
+for.
+
+To delete a page, the page's left sibling, the target page, and its parent,
+are locked in that order, and the page is marked as deleted. However, a
+concurrent search might already have read a pointer to the page, and might be
+just about to follow it. A page can be reached via the right-link of its left
+sibling, or via its downlink in the parent.
+
+To prevent a backend from reaching a deleted page via a right-link, when
+following a right-link the lock on the previous page is not released until
+the lock on next page has been acquired.
+
+The downlink is more tricky. A search descending the tree must release the
+lock on the parent page before locking the child, or it could deadlock with
+a concurrent split of the child page; a page split locks the parent, while
+already holding a lock on the child page. However, posting trees are only
+fully searched from left to right, starting from the leftmost leaf. (The
+tree-structure is only needed by insertions, to quickly find the correct
+insert location). So as long as we don't delete the leftmost page on each
+level, a search can never follow a downlink to page that's about to be
+deleted.
+
+The previous paragraph's reasoning only applies to searches, and only to
+posting trees. To protect from inserters following a downlink to a deleted
+page, vacuum simply locks out all concurrent insertions to the posting tree,
+by holding a super-exclusive lock on the posting tree root. Inserters hold a
+pin on the root page, but searches do not, so while new searches cannot begin
+while root page is locked, any already-in-progress scans can continue
+concurrently with vacuum. In the entry tree, we never delete pages.
+
+(This is quite different from the mechanism the btree indexam uses to make
+page-deletions safe; it stamps the deleted pages with an XID and keeps the
+deleted pages around with the right-link intact until all concurrent scans
+have finished.)
+
 Limitations
 -----------
 
index b160551b5ed6ed16693904488b9c3bda7e0146d3..d2ac679fc418d405bb25455f0af74ed761e0dcdf 100644 (file)
@@ -112,10 +112,8 @@ ginFindLeafPage(GinBtree btree, GinBtreeStack *stack)
                                /* rightmost page */
                                break;
 
+                       stack->buffer = ginStepRight(stack->buffer, btree->index, access);
                        stack->blkno = rightlink;
-                       LockBuffer(stack->buffer, GIN_UNLOCK);
-                       stack->buffer = ReleaseAndReadBuffer(stack->buffer, btree->index, stack->blkno);
-                       LockBuffer(stack->buffer, access);
                        page = BufferGetPage(stack->buffer);
                }
 
@@ -151,6 +149,41 @@ ginFindLeafPage(GinBtree btree, GinBtreeStack *stack)
        return NULL;
 }
 
+/*
+ * Step right from current page.
+ *
+ * The next page is locked first, before releasing the current page. This is
+ * crucial to protect from concurrent page deletion (see comment in
+ * ginDeletePage).
+ */
+Buffer
+ginStepRight(Buffer buffer, Relation index, int lockmode)
+{
+       Buffer          nextbuffer;
+       Page            page = BufferGetPage(buffer);
+       bool            isLeaf = GinPageIsLeaf(page);
+       bool            isData = GinPageIsData(page);
+       BlockNumber     blkno = GinPageGetOpaque(page)->rightlink;
+
+       nextbuffer = ReadBuffer(index, blkno);
+       LockBuffer(nextbuffer, lockmode);
+       UnlockReleaseBuffer(buffer);
+
+       /* Sanity check that the page we stepped to is of similar kind. */
+       page = BufferGetPage(nextbuffer);
+       if (isLeaf != GinPageIsLeaf(page) || isData != GinPageIsData(page))
+               elog(ERROR, "right sibling of GIN page is of different type");
+
+       /*
+        * Given the proper lock sequence above, we should never land on a
+        * deleted page.
+        */
+       if  (GinPageIsDeleted(page))
+               elog(ERROR, "right sibling of GIN page was deleted");
+
+       return nextbuffer;
+}
+
 void
 freeGinBtreeStack(GinBtreeStack *stack)
 {
@@ -239,12 +272,12 @@ ginFindParents(GinBtree btree, GinBtreeStack *stack,
                while ((offset = btree->findChildPtr(btree, page, stack->blkno, InvalidOffsetNumber)) == InvalidOffsetNumber)
                {
                        blkno = GinPageGetOpaque(page)->rightlink;
-                       LockBuffer(buffer, GIN_UNLOCK);
-                       ReleaseBuffer(buffer);
                        if (blkno == InvalidBlockNumber)
+                       {
+                               UnlockReleaseBuffer(buffer);
                                break;
-                       buffer = ReadBuffer(btree->index, blkno);
-                       LockBuffer(buffer, GIN_EXCLUSIVE);
+                       }
+                       buffer = ginStepRight(buffer, btree->index, GIN_EXCLUSIVE);
                        page = BufferGetPage(buffer);
                }
 
@@ -447,23 +480,21 @@ ginInsertValue(GinBtree btree, GinBtreeStack *stack, GinStatsData *buildStats)
                {
                        BlockNumber rightlink = GinPageGetOpaque(page)->rightlink;
 
-                       LockBuffer(parent->buffer, GIN_UNLOCK);
-
                        if (rightlink == InvalidBlockNumber)
                        {
                                /*
                                 * rightmost page, but we don't find parent, we should use
                                 * plain search...
                                 */
+                               LockBuffer(parent->buffer, GIN_UNLOCK);
                                ginFindParents(btree, stack, rootBlkno);
                                parent = stack->parent;
                                page = BufferGetPage(parent->buffer);
                                break;
                        }
 
+                       parent->buffer = ginStepRight(parent->buffer, btree->index, GIN_EXCLUSIVE);
                        parent->blkno = rightlink;
-                       parent->buffer = ReleaseAndReadBuffer(parent->buffer, btree->index, parent->blkno);
-                       LockBuffer(parent->buffer, GIN_EXCLUSIVE);
                        page = BufferGetPage(parent->buffer);
                }
 
index 022bd27b2341f776e71aa1994712a6b7840eb061..0630e7ee9f67987c9e0a3f808d56278e811c4f7e 100644 (file)
@@ -105,16 +105,11 @@ moveRightIfItNeeded(GinBtreeData *btree, GinBtreeStack *stack)
                /*
                 * We scanned the whole page, so we should take right page
                 */
-               stack->blkno = GinPageGetOpaque(page)->rightlink;
-
                if (GinPageRightMost(page))
                        return false;           /* no more pages */
 
-               LockBuffer(stack->buffer, GIN_UNLOCK);
-               stack->buffer = ReleaseAndReadBuffer(stack->buffer,
-                                                                                        btree->index,
-                                                                                        stack->blkno);
-               LockBuffer(stack->buffer, GIN_SHARE);
+               stack->buffer = ginStepRight(stack->buffer, btree->index, GIN_SHARE);
+               stack->blkno = BufferGetBlockNumber(stack->buffer);
                stack->off = FirstOffsetNumber;
        }
 
@@ -132,7 +127,6 @@ scanPostingTree(Relation index, GinScanEntry scanEntry,
        GinPostingTreeScan *gdi;
        Buffer          buffer;
        Page            page;
-       BlockNumber blkno;
 
        /* Descend to the leftmost leaf page */
        gdi = ginPrepareScanPostingTree(index, rootPostingTree, TRUE);
@@ -162,10 +156,7 @@ scanPostingTree(Relation index, GinScanEntry scanEntry,
                if (GinPageRightMost(page))
                        break;                          /* no more pages */
 
-               blkno = GinPageGetOpaque(page)->rightlink;
-               LockBuffer(buffer, GIN_UNLOCK);
-               buffer = ReleaseAndReadBuffer(buffer, index, blkno);
-               LockBuffer(buffer, GIN_SHARE);
+               buffer = ginStepRight(buffer, index, GIN_SHARE);
        }
 
        UnlockReleaseBuffer(buffer);
@@ -544,7 +535,6 @@ static void
 entryGetNextItem(GinState *ginstate, GinScanEntry entry)
 {
        Page            page;
-       BlockNumber blkno;
 
        for (;;)
        {
@@ -562,23 +552,18 @@ entryGetNextItem(GinState *ginstate, GinScanEntry entry)
                         * It's needed to go by right link. During that we should refind
                         * first ItemPointer greater that stored
                         */
-
-                       blkno = GinPageGetOpaque(page)->rightlink;
-
-                       LockBuffer(entry->buffer, GIN_UNLOCK);
-                       if (blkno == InvalidBlockNumber)
+                       if (GinPageRightMost(page))
                        {
-                               ReleaseBuffer(entry->buffer);
+                               UnlockReleaseBuffer(entry->buffer);
                                ItemPointerSetInvalid(&entry->curItem);
                                entry->buffer = InvalidBuffer;
                                entry->isFinished = TRUE;
                                return;
                        }
 
-                       entry->buffer = ReleaseAndReadBuffer(entry->buffer,
-                                                                                                ginstate->index,
-                                                                                                blkno);
-                       LockBuffer(entry->buffer, GIN_SHARE);
+                       entry->buffer = ginStepRight(entry->buffer,
+                                                                                ginstate->index,
+                                                                                GIN_SHARE);
                        page = BufferGetPage(entry->buffer);
 
                        entry->offset = InvalidOffsetNumber;
index 5418fa0f74e411774e3fe4996482d54c9a6ecc53..be9c5e9ce4d7c8c5bbfd4e6b565cbac64ad7ef1b 100644 (file)
@@ -238,6 +238,9 @@ ginVacuumPostingTreeLeaves(GinVacuumState *gvs, BlockNumber blkno, bool isRoot,
        return hasVoidPage;
 }
 
+/*
+ * Delete a posting tree page.
+ */
 static void
 ginDeletePage(GinVacuumState *gvs, BlockNumber deleteBlkno, BlockNumber leftBlkno,
                          BlockNumber parentBlkno, OffsetNumber myoff, bool isParentRoot)
@@ -247,39 +250,35 @@ ginDeletePage(GinVacuumState *gvs, BlockNumber deleteBlkno, BlockNumber leftBlkn
        Buffer          pBuffer;
        Page            page,
                                parentPage;
+       BlockNumber     rightlink;
 
+       /*
+        * Lock the pages in the same order as an insertion would, to avoid
+        * deadlocks: left, then right, then parent.
+        */
+       lBuffer = ReadBufferExtended(gvs->index, MAIN_FORKNUM, leftBlkno,
+                                                                RBM_NORMAL, gvs->strategy);
        dBuffer = ReadBufferExtended(gvs->index, MAIN_FORKNUM, deleteBlkno,
                                                                 RBM_NORMAL, gvs->strategy);
-
-       if (leftBlkno != InvalidBlockNumber)
-               lBuffer = ReadBufferExtended(gvs->index, MAIN_FORKNUM, leftBlkno,
-                                                                        RBM_NORMAL, gvs->strategy);
-       else
-               lBuffer = InvalidBuffer;
-
        pBuffer = ReadBufferExtended(gvs->index, MAIN_FORKNUM, parentBlkno,
                                                                 RBM_NORMAL, gvs->strategy);
 
+       LockBuffer(lBuffer, GIN_EXCLUSIVE);
        LockBuffer(dBuffer, GIN_EXCLUSIVE);
        if (!isParentRoot)                      /* parent is already locked by
                                                                 * LockBufferForCleanup() */
                LockBuffer(pBuffer, GIN_EXCLUSIVE);
-       if (leftBlkno != InvalidBlockNumber)
-               LockBuffer(lBuffer, GIN_EXCLUSIVE);
 
        START_CRIT_SECTION();
 
-       if (leftBlkno != InvalidBlockNumber)
-       {
-               BlockNumber rightlink;
-
-               page = BufferGetPage(dBuffer);
-               rightlink = GinPageGetOpaque(page)->rightlink;
+       /* Unlink the page by changing left sibling's rightlink */
+       page = BufferGetPage(dBuffer);
+       rightlink = GinPageGetOpaque(page)->rightlink;
 
-               page = BufferGetPage(lBuffer);
-               GinPageGetOpaque(page)->rightlink = rightlink;
-       }
+       page = BufferGetPage(lBuffer);
+       GinPageGetOpaque(page)->rightlink = rightlink;
 
+       /* Delete downlink from parent */
        parentPage = BufferGetPage(pBuffer);
 #ifdef USE_ASSERT_CHECKING
        do
@@ -364,10 +363,7 @@ ginDeletePage(GinVacuumState *gvs, BlockNumber deleteBlkno, BlockNumber leftBlkn
        if (!isParentRoot)
                LockBuffer(pBuffer, GIN_UNLOCK);
        ReleaseBuffer(pBuffer);
-
-       if (leftBlkno != InvalidBlockNumber)
-               UnlockReleaseBuffer(lBuffer);
-
+       UnlockReleaseBuffer(lBuffer);
        UnlockReleaseBuffer(dBuffer);
 
        END_CRIT_SECTION();
@@ -435,15 +431,12 @@ ginScanToDelete(GinVacuumState *gvs, BlockNumber blkno, bool isRoot, DataPageDel
 
        if (GinPageGetOpaque(page)->maxoff < FirstOffsetNumber)
        {
-               if (!(me->leftBlkno == InvalidBlockNumber && GinPageRightMost(page)))
+               /* we never delete the left- or rightmost branch */
+               if (me->leftBlkno != InvalidBlockNumber && !GinPageRightMost(page))
                {
-                       /* we never delete right most branch */
                        Assert(!isRoot);
-                       if (GinPageGetOpaque(page)->maxoff < FirstOffsetNumber)
-                       {
-                               ginDeletePage(gvs, blkno, me->leftBlkno, me->parent->blkno, myoff, me->parent->isRoot);
-                               meDelete = TRUE;
-                       }
+                       ginDeletePage(gvs, blkno, me->leftBlkno, me->parent->blkno, myoff, me->parent->isRoot);
+                       meDelete = TRUE;
                }
        }
 
index e56a92358c75b564347499f6b72c8b6806a2a007..5068d931cc339b31b3562d395d2a6cc1894331f0 100644 (file)
@@ -513,6 +513,7 @@ typedef struct GinBtreeData
 
 extern GinBtreeStack *ginPrepareFindLeafPage(GinBtree btree, BlockNumber blkno);
 extern GinBtreeStack *ginFindLeafPage(GinBtree btree, GinBtreeStack *stack);
+extern Buffer ginStepRight(Buffer buffer, Relation index, int lockmode);
 extern void freeGinBtreeStack(GinBtreeStack *stack);
 extern void ginInsertValue(GinBtree btree, GinBtreeStack *stack,
                           GinStatsData *buildStats);