From cd24b4eaece3ee816b8a7decacc8dd00e5f16906 Mon Sep 17 00:00:00 2001 From: Alexander Korotkov Date: Thu, 13 Dec 2018 06:12:31 +0300 Subject: [PATCH] Prevent GIN deleted pages from being reclaimed too early When GIN vacuum deletes a posting tree page, it assumes that no concurrent searchers can access it, thanks to ginStepRight() locking two pages at once. However, since 9.4 searches can skip parts of posting trees descending from the root. That leads to the risk that page is deleted and reclaimed before concurrent search can access it. This commit prevents the risk of above by waiting for every transaction, which might wait to reference this page, to finish. Due to binary compatibility we can't change GinPageOpaqueData to store corresponding transaction id. Instead we reuse page header pd_prune_xid field, which is unused in index pages. Discussion: https://postgr.es/m/31a702a.14dd.166c1366ac1.Coremail.chjischj%40163.com Author: Andrey Borodin, Alexander Korotkov Reviewed-by: Alexander Korotkov Backpatch-through: 9.4 --- src/backend/access/gin/README | 10 ++++------ src/backend/access/gin/ginutil.c | 7 +------ src/backend/access/gin/ginvacuum.c | 6 +++++- src/backend/access/gin/ginxlog.c | 1 + src/include/access/gin_private.h | 11 +++++++++++ 5 files changed, 22 insertions(+), 13 deletions(-) diff --git a/src/backend/access/gin/README b/src/backend/access/gin/README index fade0cbb61..d551df1166 100644 --- a/src/backend/access/gin/README +++ b/src/backend/access/gin/README @@ -304,12 +304,10 @@ the lock on next page has been acquired. The downlink is more tricky. A search descending the tree must release the lock on the parent page before locking the child, or it could deadlock with a concurrent split of the child page; a page split locks the parent, while -already holding a lock on the child page. However, posting trees are only -fully searched from left to right, starting from the leftmost leaf. (The -tree-structure is only needed by insertions, to quickly find the correct -insert location). So as long as we don't delete the leftmost page on each -level, a search can never follow a downlink to page that's about to be -deleted. +already holding a lock on the child page. So, deleted page cannot be reclaimed +immediately. Instead, we have to wait for every transaction, which might wait +to reference this page, to finish. Corresponding processes must observe that +the page is marked deleted and recover accordingly. The previous paragraph's reasoning only applies to searches, and only to posting trees. To protect from inserters following a downlink to a deleted diff --git a/src/backend/access/gin/ginutil.c b/src/backend/access/gin/ginutil.c index d9146488c4..637caf9fdc 100644 --- a/src/backend/access/gin/ginutil.c +++ b/src/backend/access/gin/ginutil.c @@ -274,12 +274,7 @@ GinNewBuffer(Relation index) */ if (ConditionalLockBuffer(buffer)) { - Page page = BufferGetPage(buffer); - - if (PageIsNew(page)) - return buffer; /* OK to use, if never initialized */ - - if (GinPageIsDeleted(page)) + if (GinPageIsRecyclable(BufferGetPage(buffer))) return buffer; /* OK to use */ LockBuffer(buffer, GIN_UNLOCK); diff --git a/src/backend/access/gin/ginvacuum.c b/src/backend/access/gin/ginvacuum.c index 3d23cebf62..58f0d03fd1 100644 --- a/src/backend/access/gin/ginvacuum.c +++ b/src/backend/access/gin/ginvacuum.c @@ -215,6 +215,9 @@ ginDeletePage(GinVacuumState *gvs, BlockNumber deleteBlkno, BlockNumber leftBlkn page = BufferGetPage(dBuffer); rightlink = GinPageGetOpaque(page)->rightlink; + /* For deleted page remember last xid which could knew its address */ + GinPageSetDeleteXid(page, ReadNewTransactionId()); + page = BufferGetPage(lBuffer); GinPageGetOpaque(page)->rightlink = rightlink; @@ -262,6 +265,7 @@ ginDeletePage(GinVacuumState *gvs, BlockNumber deleteBlkno, BlockNumber leftBlkn data.parentOffset = myoff; data.rightLink = GinPageGetOpaque(page)->rightlink; + data.deleteXid = GinPageGetDeleteXid(page); XLogRegisterData((char *) &data, sizeof(ginxlogDeletePage)); @@ -708,7 +712,7 @@ ginvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats) LockBuffer(buffer, GIN_SHARE); page = (Page) BufferGetPage(buffer); - if (PageIsNew(page) || GinPageIsDeleted(page)) + if (GinPageIsRecyclable(page)) { Assert(blkno != GIN_ROOT_BLKNO); RecordFreeIndexPage(index, blkno); diff --git a/src/backend/access/gin/ginxlog.c b/src/backend/access/gin/ginxlog.c index 19ab1b61dc..bfe88827e4 100644 --- a/src/backend/access/gin/ginxlog.c +++ b/src/backend/access/gin/ginxlog.c @@ -529,6 +529,7 @@ ginRedoDeletePage(XLogReaderState *record) page = BufferGetPage(dbuffer); Assert(GinPageIsData(page)); GinPageGetOpaque(page)->flags = GIN_DELETED; + GinPageSetDeleteXid(page, data->deleteXid); PageSetLSN(page, lsn); MarkBufferDirty(dbuffer); } diff --git a/src/include/access/gin_private.h b/src/include/access/gin_private.h index 0b6006f0ad..8ee17b72a5 100644 --- a/src/include/access/gin_private.h +++ b/src/include/access/gin_private.h @@ -13,6 +13,7 @@ #include "access/amapi.h" #include "access/gin.h" #include "access/itup.h" +#include "access/transam.h" #include "fmgr.h" #include "storage/bufmgr.h" #include "lib/rbtree.h" @@ -131,6 +132,15 @@ typedef struct GinMetaPageData #define GinPageRightMost(page) ( GinPageGetOpaque(page)->rightlink == InvalidBlockNumber) +/* + * We should reclaim deleted page only once every transaction started before + * its deletion is over. + */ +#define GinPageGetDeleteXid(page) ( ((PageHeader) (page))->pd_prune_xid ) +#define GinPageSetDeleteXid(page, xid) ( ((PageHeader) (page))->pd_prune_xid = xid) +#define GinPageIsRecyclable(page) ( PageIsNew(page) || (GinPageIsDeleted(page) \ + && TransactionIdPrecedes(GinPageGetDeleteXid(page), RecentGlobalXmin))) + /* * We use our own ItemPointerGet(BlockNumber|OffsetNumber) * to avoid Asserts, since sometimes the ip_posid isn't "valid" @@ -540,6 +550,7 @@ typedef struct ginxlogDeletePage { OffsetNumber parentOffset; BlockNumber rightLink; + TransactionId deleteXid; /* last Xid which could see this page in scan */ } ginxlogDeletePage; #define XLOG_GIN_UPDATE_META_PAGE 0x60 -- 2.40.0