]> granicus.if.org Git - postgresql/commitdiff
Prevent GIN deleted pages from being reclaimed too early
authorAlexander Korotkov <akorotkov@postgresql.org>
Thu, 13 Dec 2018 03:12:31 +0000 (06:12 +0300)
committerAlexander Korotkov <akorotkov@postgresql.org>
Thu, 13 Dec 2018 03:39:53 +0000 (06:39 +0300)
When GIN vacuum deletes a posting tree page, it assumes that no concurrent
searchers can access it, thanks to ginStepRight() locking two pages at once.
However, since 9.4 searches can skip parts of posting trees descending from the
root.  That leads to the risk that page is deleted and reclaimed before
concurrent search can access it.

This commit prevents the risk of above by waiting for every transaction, which
might wait to reference this page, to finish.  Due to binary compatibility
we can't change GinPageOpaqueData to store corresponding transaction id.
Instead we reuse page header pd_prune_xid field, which is unused in index pages.

Discussion: https://postgr.es/m/31a702a.14dd.166c1366ac1.Coremail.chjischj%40163.com
Author: Andrey Borodin, Alexander Korotkov
Reviewed-by: Alexander Korotkov
Backpatch-through: 9.4

src/backend/access/gin/README
src/backend/access/gin/ginutil.c
src/backend/access/gin/ginvacuum.c
src/backend/access/gin/ginxlog.c
src/include/access/ginblock.h
src/include/access/ginxlog.h

index 421b5b26d5bd53ce0d577a063257230beaa561a0..30c0867829e6823e1882ce3ac04c58196e98239d 100644 (file)
@@ -304,12 +304,10 @@ the lock on next page has been acquired.
 The downlink is more tricky. A search descending the tree must release the
 lock on the parent page before locking the child, or it could deadlock with
 a concurrent split of the child page; a page split locks the parent, while
-already holding a lock on the child page. However, posting trees are only
-fully searched from left to right, starting from the leftmost leaf. (The
-tree-structure is only needed by insertions, to quickly find the correct
-insert location). So as long as we don't delete the leftmost page on each
-level, a search can never follow a downlink to page that's about to be
-deleted.
+already holding a lock on the child page. So, deleted page cannot be reclaimed
+immediately. Instead, we have to wait for every transaction, which might wait
+to reference this page, to finish. Corresponding processes must observe that
+the page is marked deleted and recover accordingly.
 
 The previous paragraph's reasoning only applies to searches, and only to
 posting trees. To protect from inserters following a downlink to a deleted
index 0a32182dd7fbafbedb4548a458f37b7286be9a37..64dd1a64860a7fd579695f88209f2702034ce555 100644 (file)
@@ -309,12 +309,7 @@ GinNewBuffer(Relation index)
                 */
                if (ConditionalLockBuffer(buffer))
                {
-                       Page            page = BufferGetPage(buffer);
-
-                       if (PageIsNew(page))
-                               return buffer;  /* OK to use, if never initialized */
-
-                       if (GinPageIsDeleted(page))
+                       if (GinPageIsRecyclable(BufferGetPage(buffer)))
                                return buffer;  /* OK to use */
 
                        LockBuffer(buffer, GIN_UNLOCK);
index bdeb0bf4f528bb2f81a70fdd8e8e2a7e6efe6f8e..96609835eebd5981ddd773124b9405747c98b753 100644 (file)
@@ -157,6 +157,9 @@ ginDeletePage(GinVacuumState *gvs, BlockNumber deleteBlkno, BlockNumber leftBlkn
        page = BufferGetPage(dBuffer);
        rightlink = GinPageGetOpaque(page)->rightlink;
 
+       /* For deleted page remember last xid which could knew its address */
+       GinPageSetDeleteXid(page, ReadNewTransactionId());
+
        /*
         * Any insert which would have gone on the leaf block will now go to its
         * right sibling.
@@ -213,6 +216,7 @@ ginDeletePage(GinVacuumState *gvs, BlockNumber deleteBlkno, BlockNumber leftBlkn
 
                data.parentOffset = myoff;
                data.rightLink = GinPageGetOpaque(page)->rightlink;
+               data.deleteXid = GinPageGetDeleteXid(page);
 
                XLogRegisterData((char *) &data, sizeof(ginxlogDeletePage));
 
@@ -732,7 +736,7 @@ ginvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats)
                LockBuffer(buffer, GIN_SHARE);
                page = (Page) BufferGetPage(buffer);
 
-               if (PageIsNew(page) || GinPageIsDeleted(page))
+               if (GinPageIsRecyclable(page))
                {
                        Assert(blkno != GIN_ROOT_BLKNO);
                        RecordFreeIndexPage(index, blkno);
index b626a219dec6c82208b1c9d9e799b5bd15fddca1..b84ecf2ab1580c243aa7e216b55b37e0f7f9b26f 100644 (file)
@@ -531,6 +531,7 @@ ginRedoDeletePage(XLogReaderState *record)
                page = BufferGetPage(dbuffer);
                Assert(GinPageIsData(page));
                GinPageGetOpaque(page)->flags = GIN_DELETED;
+               GinPageSetDeleteXid(page, data->deleteXid);
                PageSetLSN(page, lsn);
                MarkBufferDirty(dbuffer);
        }
index 553566529a47281a3eaee8f23b1a034f794e9f3d..f626c40ae3b0bb6de91da146071edab8eb07ae8b 100644 (file)
@@ -10,6 +10,7 @@
 #ifndef GINBLOCK_H
 #define GINBLOCK_H
 
+#include "access/transam.h"
 #include "storage/block.h"
 #include "storage/itemptr.h"
 #include "storage/off.h"
@@ -127,6 +128,15 @@ typedef struct GinMetaPageData
 
 #define GinPageRightMost(page) ( GinPageGetOpaque(page)->rightlink == InvalidBlockNumber)
 
+/*
+ * We should reclaim deleted page only once every transaction started before
+ * its deletion is over.
+ */
+#define GinPageGetDeleteXid(page) ( ((PageHeader) (page))->pd_prune_xid )
+#define GinPageSetDeleteXid(page, xid) ( ((PageHeader) (page))->pd_prune_xid = xid)
+#define GinPageIsRecyclable(page) ( PageIsNew(page) || (GinPageIsDeleted(page) \
+       && TransactionIdPrecedes(GinPageGetDeleteXid(page), RecentGlobalXmin)))
+
 /*
  * We use our own ItemPointerGet(BlockNumber|OffsetNumber)
  * to avoid Asserts, since sometimes the ip_posid isn't "valid"
index 64a3c9e18b4ba27d17e0ec6579d2acced265c58c..b2f3126aa8a6e8fe7ed40d75201071038158f6ae 100644 (file)
@@ -158,6 +158,7 @@ typedef struct ginxlogDeletePage
 {
        OffsetNumber parentOffset;
        BlockNumber rightLink;
+       TransactionId deleteXid;        /* last Xid which could see this page in scan */
 } ginxlogDeletePage;
 
 #define XLOG_GIN_UPDATE_META_PAGE 0x60