deleted pages around with the right-link intact until all concurrent scans
have finished.)
+Predicate Locking
+-----------------
+
+GIN supports predicate locking, for serializable snapshot isolation.
+Predicate locks represent that a scan has scanned a range of values. They
+are not concerned with physical pages as such, but the logical key values.
+A predicate lock on a page covers the key range that would belong on that
+page, whether or not there are any matching tuples there currently. In other
+words, a predicate lock on an index page covers the "gaps" between the index
+tuples. To minimize false positives, predicate locks are acquired at the
+finest level possible.
+
+* Like in the B-tree index, it is enough to lock only leaf pages, because all
+ insertions happen at the leaf level.
+
+* In an equality search (i.e. not a partial match search), if a key entry has
+ a posting tree, we lock the posting tree root page, to represent a lock on
+ just that key entry. Otherwise, we lock the entry tree page. We also lock
+ the entry tree page if no match is found, to lock the "gap" where the entry
+ would've been, had there been one.
+
+* In a partial match search, we lock all the entry leaf pages that we scan,
+ in addition to locks on posting tree roots, to represent the "gaps" between
+ values.
+
+* In addition to the locks on entry leaf pages and posting tree roots, all
+ scans grab a lock on the metapage. This is to interlock with insertions to
+ the fast update pending list. An insertion to the pending list can really
+ belong anywhere in the tree, and the lock on the metapage represents that.
+
+The interlock for fastupdate pending lists means that with fastupdate=on,
+we effectively always grab a full-index lock, so you could get a lot of false
+positives.
+
Compatibility
-------------
stack->parent = NULL;
stack->predictNumber = 1;
+ if (!searchMode)
+ CheckForSerializableConflictIn(btree->index, NULL, stack->buffer);
+
for (;;)
{
Page page;
blkno = BufferGetBlockNumber(buffer);
/*
- * Copy a predicate lock from entry tree leaf (containing posting list) to
- * posting tree.
+ * Copy any predicate locks from the entry tree leaf (containing posting
+ * list) to the posting tree.
*/
PredicateLockPageSplit(index, BufferGetBlockNumber(entrybuffer), blkno);
return blkno;
}
-void
+static void
ginPrepareDataScan(GinBtree btree, Relation index, BlockNumber rootBlkno)
{
memset(btree, 0, sizeof(GinBtreeData));
btree.itemptr = insertdata.items[insertdata.curitem];
stack = ginFindLeafPage(&btree, false, NULL);
- GinCheckForSerializableConflictIn(btree.index, NULL, stack->buffer);
ginInsertValue(&btree, stack, &insertdata, buildStats);
}
}
#include "postmaster/autovacuum.h"
#include "storage/indexfsm.h"
#include "storage/lmgr.h"
+#include "storage/predicate.h"
#include "utils/builtins.h"
/* GUC parameter */
metabuffer = ReadBuffer(index, GIN_METAPAGE_BLKNO);
metapage = BufferGetPage(metabuffer);
+ /*
+ * An insertion to the pending list could logically belong anywhere in
+ * the tree, so it conflicts with all serializable scans. All scans
+ * acquire a predicate lock on the metabuffer to represent that.
+ */
+ CheckForSerializableConflictIn(index, NULL, metabuffer);
+
if (collector->sumsize + collector->ntuples * sizeof(ItemIdData) > GinListPageSize)
{
/*
} pendingPosition;
-/*
- * Place predicate lock on GIN page if needed.
- */
-static void
-GinPredicateLockPage(Relation index, BlockNumber blkno, Snapshot snapshot)
-{
- /*
- * When fast update is on then no need in locking pages, because we anyway
- * need to lock the whole index.
- */
- if (!GinGetUseFastUpdate(index))
- PredicateLockPage(index, blkno, snapshot);
-}
-
/*
* Goes to the next page if current offset is outside of bounds
*/
stack->buffer = ginStepRight(stack->buffer, btree->index, GIN_SHARE);
stack->blkno = BufferGetBlockNumber(stack->buffer);
stack->off = FirstOffsetNumber;
- GinPredicateLockPage(btree->index, stack->blkno, snapshot);
+ PredicateLockPage(btree->index, stack->blkno, snapshot);
}
return true;
*/
for (;;)
{
- /*
- * Predicate lock each leaf page in posting tree
- */
- GinPredicateLockPage(index, BufferGetBlockNumber(buffer), snapshot);
-
page = BufferGetPage(buffer);
if ((GinPageGetOpaque(page)->flags & GIN_DELETED) == 0)
{
* Predicate lock entry leaf page, following pages will be locked by
* moveRightIfItNeeded()
*/
- GinPredicateLockPage(btree->index, stack->buffer, snapshot);
+ PredicateLockPage(btree->index, stack->buffer, snapshot);
for (;;)
{
LockBuffer(stack->buffer, GIN_UNLOCK);
+ /*
+ * Acquire predicate lock on the posting tree. We already hold
+ * a lock on the entry page, but insertions to the posting tree
+ * don't check for conflicts on that level.
+ */
+ PredicateLockPage(btree->index, rootPostingTree, snapshot);
+
/* Collect all the TIDs in this entry's posting tree */
scanPostingTree(btree->index, scanEntry, rootPostingTree,
snapshot);
{
IndexTuple itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, stackEntry->off));
- /* Predicate lock visited entry leaf page */
- GinPredicateLockPage(ginstate->index,
- BufferGetBlockNumber(stackEntry->buffer), snapshot);
-
if (GinIsPostingTree(itup))
{
BlockNumber rootPostingTree = GinGetPostingTree(itup);
Page page;
ItemPointerData minItem;
+ /*
+ * This is an equality scan, so lock the root of the posting tree.
+ * It represents a lock on the exact key value, and covers all the
+ * items in the posting tree.
+ */
+ PredicateLockPage(ginstate->index, rootPostingTree, snapshot);
+
/*
* We should unlock entry page before touching posting tree to
* prevent deadlocks with vacuum processes. Because entry is never
rootPostingTree, snapshot);
entry->buffer = stack->buffer;
- /*
- * Predicate lock visited posting tree page, following pages will
- * be locked by moveRightIfItNeeded or entryLoadMoreItems
- */
- GinPredicateLockPage(ginstate->index, BufferGetBlockNumber(entry->buffer), snapshot);
-
/*
* We keep buffer pinned because we need to prevent deletion of
* page during scan. See GIN's vacuum implementation. RefCount is
freeGinBtreeStack(stack);
entry->isFinished = false;
}
- else if (GinGetNPosting(itup) > 0)
+ else
{
- entry->list = ginReadTuple(ginstate, entry->attnum, itup,
- &entry->nlist);
- entry->predictNumberResult = entry->nlist;
+ /*
+ * Lock the entry leaf page. This is more coarse-grained than
+ * necessary, because it will conflict with any insertions that
+ * land on the same leaf page, not only the exact key we searched
+ * for. But locking an individual tuple would require updating
+ * that lock whenever it moves because of insertions or vacuums,
+ * which seems too complicated.
+ */
+ PredicateLockPage(ginstate->index,
+ BufferGetBlockNumber(stackEntry->buffer),
+ snapshot);
+ if (GinGetNPosting(itup) > 0)
+ {
+ entry->list = ginReadTuple(ginstate, entry->attnum, itup,
+ &entry->nlist);
+ entry->predictNumberResult = entry->nlist;
- entry->isFinished = false;
+ entry->isFinished = false;
+ }
}
}
+ else
+ {
+ /*
+ * No entry found. Predicate lock the leaf page, to lock the place
+ * where the entry would've been, had there been one.
+ */
+ PredicateLockPage(ginstate->index,
+ BufferGetBlockNumber(stackEntry->buffer), snapshot);
+ }
if (needUnlock)
LockBuffer(stackEntry->buffer, GIN_UNLOCK);
for (i = 0; i < key->nentries - 1; i++)
{
- /* Pass all entries <= i as false, and the rest as MAYBE */
+ /* Pass all entries <= i as FALSE, and the rest as MAYBE */
for (j = 0; j <= i; j++)
key->entryRes[entryIndexes[j]] = GIN_FALSE;
for (j = i + 1; j < key->nentries; j++)
entry->btree.fullScan = false;
stack = ginFindLeafPage(&entry->btree, true, snapshot);
- GinPredicateLockPage(ginstate->index, BufferGetBlockNumber(stack->buffer), snapshot);
-
/* we don't need the stack, just the buffer. */
entry->buffer = stack->buffer;
IncrBufferRefCount(entry->buffer);
entry->buffer = ginStepRight(entry->buffer,
ginstate->index,
GIN_SHARE);
-
- GinPredicateLockPage(ginstate->index, BufferGetBlockNumber(entry->buffer), snapshot);
-
-
page = BufferGetPage(entry->buffer);
}
stepright = true;
* lossy page even when none of the other entries match.
*
* Our strategy is to call the tri-state consistent function, with the
- * lossy-page entries set to MAYBE, and all the other entries false. If it
- * returns false, none of the lossy items alone are enough for a match, so
+ * lossy-page entries set to MAYBE, and all the other entries FALSE. If it
+ * returns FALSE, none of the lossy items alone are enough for a match, so
* we don't need to return a lossy-page pointer. Otherwise, return a
* lossy-page pointer to indicate that the whole heap page must be
* checked. (On subsequent calls, we'll do nothing until minItem is past
}
/*
- * Collect all matched rows from pending list into bitmap. Also function
- * takes PendingLockRelation if it's needed.
+ * Collect all matched rows from pending list into bitmap.
*/
static void
scanPendingInsert(IndexScanDesc scan, TIDBitmap *tbm, int64 *ntids)
*ntids = 0;
+ /*
+ * Acquire predicate lock on the metapage, to conflict with any
+ * fastupdate insertions.
+ */
+ PredicateLockPage(scan->indexRelation, GIN_METAPAGE_BLKNO, scan->xs_snapshot);
+
LockBuffer(metabuffer, GIN_SHARE);
page = BufferGetPage(metabuffer);
TestForOldSnapshot(scan->xs_snapshot, scan->indexRelation, page);
{
/* No pending list, so proceed with normal scan */
UnlockReleaseBuffer(metabuffer);
-
- /*
- * If fast update is enabled, we acquire a predicate lock on the
- * entire relation as fast update postpones the insertion of tuples
- * into index structure due to which we can't detect rw conflicts.
- */
- if (GinGetUseFastUpdate(scan->indexRelation))
- PredicateLockRelation(scan->indexRelation, scan->xs_snapshot);
-
return;
}
- /*
- * Pending list is not empty, we need to lock the index doesn't despite on
- * fastupdate state
- */
- PredicateLockRelation(scan->indexRelation, scan->xs_snapshot);
-
pos.pendingBuffer = ReadBuffer(scan->indexRelation, blkno);
LockBuffer(pos.pendingBuffer, GIN_SHARE);
pos.firstOffset = FirstOffsetNumber;
return;
}
- GinCheckForSerializableConflictIn(btree.index, NULL, stack->buffer);
+ CheckForSerializableConflictIn(ginstate->index, NULL, stack->buffer);
/* modify an existing leaf entry */
itup = addItemPointersToLeafTuple(ginstate, itup,
items, nitem, buildStats, stack->buffer);
}
else
{
- GinCheckForSerializableConflictIn(btree.index, NULL, stack->buffer);
+ CheckForSerializableConflictIn(ginstate->index, NULL, stack->buffer);
/* no match, so construct a new leaf entry */
itup = buildFreshLeafTuple(ginstate, attnum, key, category,
items, nitem, buildStats, stack->buffer);
memset(&collector, 0, sizeof(GinTupleCollector));
- /*
- * With fastupdate on each scan and each insert begin with access to
- * pending list, so it effectively lock entire index. In this case we
- * aquire predicate lock and check for conflicts over index relation,
- * and hope that it will reduce locking overhead.
- *
- * Do not use GinCheckForSerializableConflictIn() here, because it
- * will do nothing (it does actual work only with fastupdate off).
- * Check for conflicts for entire index.
- */
- CheckForSerializableConflictIn(index, NULL, InvalidBuffer);
-
for (i = 0; i < ginstate->origTupdesc->natts; i++)
ginHeapTupleFastCollect(ginstate, &collector,
(OffsetNumber) (i + 1),
}
else
{
- GinStatsData stats;
-
- /*
- * Fastupdate is off but if pending list isn't empty then we need to
- * check conflicts with PredicateLockRelation in scanPendingInsert().
- */
- ginGetStats(index, &stats);
- if (stats.nPendingPages > 0)
- CheckForSerializableConflictIn(index, NULL, InvalidBuffer);
-
for (i = 0; i < ginstate->origTupdesc->natts; i++)
ginHeapTupleInsert(ginstate, (OffsetNumber) (i + 1),
values[i], isnull[i],
END_CRIT_SECTION();
}
-
-void
-GinCheckForSerializableConflictIn(Relation relation, HeapTuple tuple, Buffer buffer)
-{
- if (!GinGetUseFastUpdate(relation))
- CheckForSerializableConflictIn(relation, tuple, buffer);
-}
START_CRIT_SECTION();
/* Unlink the page by changing left sibling's rightlink */
-
page = BufferGetPage(lBuffer);
GinPageGetOpaque(page)->rightlink = rightlink;
bool is_split;
/*
- * Check for any rw conflicts (in serialisation isolation level) just
+ * Check for any rw conflicts (in serializable isolation level) just
* before we intend to modify the page
*/
CheckForSerializableConflictIn(state->r, NULL, stack->buffer);
however, a search discovers that no root page has yet been created, a
predicate lock on the index relation is required.
+ * Like a B-tree, GIN searches acquire predicate locks only on the
+leaf pages of the entry tree. When performing an equality scan, and an
+entry has a posting tree, the posting tree root is locked instead, to
+lock only that key value. However, fastupdate=on postpones the
+insertion of tuples into the index structure by temporarily storing
+them in the pending list. That makes us unable to detect r-w conflicts
+using page-level locks. To cope with that, insertions to the pending
+list conflict with all scans.
+
* GiST searches can determine that there are no matches at any
level of the index, so we acquire predicate lock at each index
level during a GiST search. An index insert at the leaf level can
then be trusted to ripple up to all levels and locations where
conflicting predicate locks may exist. In case there is a page split,
-we need to copy predicate lock from an original page to all new pages.
-
- * GIN searches acquire predicate locks only on the leaf pages
-of entry tree and posting tree. During a page split, a predicate locks are
-copied from the original page to the new page. In the same way predicate locks
-are copied from entry tree leaf page to freshly created posting tree root.
-However, when fast update is enabled, a predicate lock on the whole index
-relation is required. Fast update postpones the insertion of tuples into index
-structure by temporarily storing them into pending list. That makes us unable
-to detect r-w conflicts using page-level locks.
+we need to copy the predicate locks from the original page to all the
+new pages.
* Hash index searches acquire predicate locks on the primary
page of a bucket. It acquires a lock on both the old and new buckets
split, a predicate lock is copied from the primary page of an old
bucket to the primary page of a new bucket.
-
* The effects of page splits, overflows, consolidations, and
removals must be carefully reviewed to ensure that predicate locks
aren't "lost" during those operations, or kept with pages which could
extern OffsetNumber gintuple_get_attrnum(GinState *ginstate, IndexTuple tuple);
extern Datum gintuple_get_key(GinState *ginstate, IndexTuple tuple,
GinNullCategory *category);
-extern void GinCheckForSerializableConflictIn(Relation relation,
- HeapTuple tuple, Buffer buffer);
/* gininsert.c */
extern IndexBuildResult *ginbuild(Relation heap, Relation index,
GinStatsData *buildStats);
extern GinBtreeStack *ginScanBeginPostingTree(GinBtree btree, Relation index, BlockNumber rootBlkno, Snapshot snapshot);
extern void ginDataFillRoot(GinBtree btree, Page root, BlockNumber lblkno, Page lpage, BlockNumber rblkno, Page rpage);
-extern void ginPrepareDataScan(GinBtree btree, Relation index, BlockNumber rootBlkno);
/*
* This is declared in ginvacuum.c, but is passed between ginVacuumItemPointers
--- /dev/null
+Parsed test spec with 3 sessions
+
+starting permutation: r1 r2 w1 c1 w2 c2
+step r1: SELECT count(*) FROM gin_tbl WHERE p @> array[1000];
+count
+
+2
+step r2: SELECT * FROM other_tbl;
+id
+
+step w1: INSERT INTO other_tbl VALUES (42);
+step c1: COMMIT;
+step w2: INSERT INTO gin_tbl SELECT array[1000,19001];
+ERROR: could not serialize access due to read/write dependencies among transactions
+step c2: COMMIT;
+
+starting permutation: r1 r2 w1 c1 fastupdate_on w2 c2
+step r1: SELECT count(*) FROM gin_tbl WHERE p @> array[1000];
+count
+
+2
+step r2: SELECT * FROM other_tbl;
+id
+
+step w1: INSERT INTO other_tbl VALUES (42);
+step c1: COMMIT;
+step fastupdate_on: ALTER INDEX ginidx SET (fastupdate = on);
+step w2: INSERT INTO gin_tbl SELECT array[1000,19001];
+ERROR: could not serialize access due to read/write dependencies among transactions
+step c2: COMMIT;
--- /dev/null
+Parsed test spec with 2 sessions
+
+starting permutation: r1 r2 w1 c1 w2 c2
+step r1: SELECT count(*) FROM gin_tbl WHERE p @> array[-1];
+count
+
+0
+step r2: SELECT * FROM other_tbl;
+id
+
+step w1: INSERT INTO other_tbl VALUES (42);
+step c1: COMMIT;
+step w2: INSERT INTO gin_tbl SELECT array[-1];
+ERROR: could not serialize access due to read/write dependencies among transactions
+step c2: COMMIT;
starting permutation: fu1 rxy1 rxy2fu wx1 c1 wy2fu c2
step fu1: alter index ginidx set (fastupdate = on);
commit;
- begin isolation level serializable;
- set enable_seqscan=off;
+ begin isolation level serializable;
+ set enable_seqscan=off;
step rxy1: select count(*) from gin_tbl where p @> array[4,5];
count
test: predicate-hash
test: predicate-gist
test: predicate-gin
+test: predicate-gin-fastupdate
+test: predicate-gin-nomatch
test: partition-key-update-1
test: partition-key-update-2
test: partition-key-update-3
--- /dev/null
+#
+# Test that predicate locking on a GIN index works correctly, even if
+# fastupdate is turned on concurrently.
+#
+# 0. fastupdate is off
+# 1. Session 's1' acquires predicate lock on page X
+# 2. fastupdate is turned on
+# 3. Session 's2' inserts a new tuple to the pending list
+#
+# This test tests that if the insertion in step 3 would conflict with
+# the predicate lock acquired in step 1, we detect that conflict correctly,
+# even if fastupdate was turned on in-between.
+#
+setup
+{
+ create table gin_tbl(p int4[]);
+ insert into gin_tbl select array[g, g*2,g*3] from generate_series(1, 10000) g;
+ insert into gin_tbl select array[4,5,6] from generate_series(10001, 20000) g;
+ create index ginidx on gin_tbl using gin(p) with (fastupdate = off);
+
+ create table other_tbl (id int4);
+}
+
+teardown
+{
+ drop table gin_tbl;
+ drop table other_tbl;
+}
+
+session "s1"
+setup { BEGIN ISOLATION LEVEL SERIALIZABLE; SET enable_seqscan=off; }
+step "r1" { SELECT count(*) FROM gin_tbl WHERE p @> array[1000]; }
+step "w1" { INSERT INTO other_tbl VALUES (42); }
+step "c1" { COMMIT; }
+
+session "s2"
+setup { BEGIN ISOLATION LEVEL SERIALIZABLE; SET enable_seqscan=off; }
+step "r2" { SELECT * FROM other_tbl; }
+step "w2" { INSERT INTO gin_tbl SELECT array[1000,19001]; }
+step "c2" { COMMIT; }
+
+session "s3"
+step "fastupdate_on" { ALTER INDEX ginidx SET (fastupdate = on); }
+
+# This correctly throws serialization failure.
+permutation "r1" "r2" "w1" "c1" "w2" "c2"
+
+# Even if fastupdate is turned on in the middle, the conflict must still be detected.
+permutation "r1" "r2" "w1" "c1" "fastupdate_on" "w2" "c2"
--- /dev/null
+#
+# Check that GIN index grabs an appropriate lock, even if there is no match.
+#
+setup
+{
+ create table gin_tbl(p int4[]);
+ insert into gin_tbl select array[g, g*2,g*3] from generate_series(1, 10000) g;
+ insert into gin_tbl select array[4,5,6] from generate_series(10001, 20000) g;
+ create index ginidx on gin_tbl using gin(p) with (fastupdate = off);
+
+ create table other_tbl (id int4);
+}
+
+teardown
+{
+ drop table gin_tbl;
+ drop table other_tbl;
+}
+
+session "s1"
+setup { BEGIN ISOLATION LEVEL SERIALIZABLE; SET enable_seqscan=off; }
+# Scan with no match.
+step "r1" { SELECT count(*) FROM gin_tbl WHERE p @> array[-1]; }
+step "w1" { INSERT INTO other_tbl VALUES (42); }
+step "c1" { COMMIT; }
+
+session "s2"
+setup { BEGIN ISOLATION LEVEL SERIALIZABLE; SET enable_seqscan=off; }
+step "r2" { SELECT * FROM other_tbl; }
+# Insert row that would've matched in step "r1"
+step "w2" { INSERT INTO gin_tbl SELECT array[-1]; }
+step "c2" { COMMIT; }
+
+# This should throw serialization failure.
+permutation "r1" "r2" "w1" "c1" "w2" "c2"
# enable pending list for a small subset of tests
step "fu1" { alter index ginidx set (fastupdate = on);
commit;
- begin isolation level serializable;
- set enable_seqscan=off; }
+ begin isolation level serializable;
+ set enable_seqscan=off; }
step "rxy1" { select count(*) from gin_tbl where p @> array[4,5]; }
step "wx1" { insert into gin_tbl select g, array[5,6] from generate_series