X-Git-Url: https://granicus.if.org/sourcecode?a=blobdiff_plain;f=src%2Fbackend%2Fstorage%2Flmgr%2Fpredicate.c;h=3b355641c250198f331935dcaf28363940073083;hb=c7b8998ebbf310a156aa38022555a24d98fdbfb4;hp=8b5a95ceaa0ff460d34231dadc2d3562fa7b0c63;hpb=7abaa6b9d3eeccf777c0210b7340d960b9cceb97;p=postgresql diff --git a/src/backend/storage/lmgr/predicate.c b/src/backend/storage/lmgr/predicate.c index 8b5a95ceaa..3b355641c2 100644 --- a/src/backend/storage/lmgr/predicate.c +++ b/src/backend/storage/lmgr/predicate.c @@ -32,11 +32,11 @@ * examining the MVCC data.) * * (1) Besides tuples actually read, they must cover ranges of tuples - * which would have been read based on the predicate. This will + * which would have been read based on the predicate. This will * require modelling the predicates through locks against database * objects such as pages, index ranges, or entire tables. * - * (2) They must be kept in RAM for quick access. Because of this, it + * (2) They must be kept in RAM for quick access. Because of this, it * isn't possible to always maintain tuple-level granularity -- when * the space allocated to store these approaches exhaustion, a * request for a lock may need to scan for situations where a single @@ -49,7 +49,7 @@ * * (4) While they are associated with a transaction, they must survive * a successful COMMIT of that transaction, and remain until all - * overlapping transactions complete. This even means that they + * overlapping transactions complete. This even means that they * must survive termination of the transaction's process. If a * top level transaction is rolled back, however, it is immediately * flagged so that it can be ignored, and its SIREAD locks can be @@ -62,7 +62,7 @@ * an existing SIREAD lock for the same transaction, the SIREAD lock * can be deleted. * - * (7) A write from a serializable transaction must ensure that a xact + * (7) A write from a serializable transaction must ensure that an xact * record exists for the transaction, with the same lifespan (until * all concurrent transaction complete or the transaction is rolled * back) so that rw-dependencies to that transaction can be @@ -90,7 +90,7 @@ * may yet matter because they overlap still-active transactions. * * SerializablePredicateLockListLock - * - Protects the linked list of locks held by a transaction. Note + * - Protects the linked list of locks held by a transaction. Note * that the locks themselves are also covered by the partition * locks of their respective lock targets; this lock only affects * the linked list connecting the locks related to a transaction. @@ -101,11 +101,11 @@ * - It is relatively infrequent that another process needs to * modify the list for a transaction, but it does happen for such * things as index page splits for pages with predicate locks and - * freeing of predicate locked pages by a vacuum process. When + * freeing of predicate locked pages by a vacuum process. When * removing a lock in such cases, the lock itself contains the * pointers needed to remove it from the list. When adding a * lock in such cases, the lock can be added using the anchor in - * the transaction structure. Neither requires walking the list. + * the transaction structure. Neither requires walking the list. * - Cleaning up the list for a terminated transaction is sometimes * not done on a retail basis, in which case no lock is required. * - Due to the above, a process accessing its active transaction's @@ -125,7 +125,7 @@ * - Protects both PredXact and SerializableXidHash. * * - * Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group + * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * @@ -148,7 +148,7 @@ * predicate lock maintenance * GetSerializableTransactionSnapshot(Snapshot snapshot) * SetSerializableTransactionSnapshot(Snapshot snapshot, - * TransactionId sourcexid) + * VirtualTransactionId *sourcevxid) * RegisterPredicateLockingXid(void) * PredicateLockRelation(Relation relation, Snapshot snapshot) * PredicateLockPage(Relation relation, BlockNumber blkno, @@ -156,9 +156,9 @@ * PredicateLockTuple(Relation relation, HeapTuple tuple, * Snapshot snapshot) * PredicateLockPageSplit(Relation relation, BlockNumber oldblkno, - * BlockNumber newblkno); + * BlockNumber newblkno) * PredicateLockPageCombine(Relation relation, BlockNumber oldblkno, - * BlockNumber newblkno); + * BlockNumber newblkno) * TransferPredicateLocksToHeapRelation(Relation relation) * ReleasePredicateLocks(bool isCommit) * @@ -183,13 +183,16 @@ #include "postgres.h" +#include "access/htup_details.h" #include "access/slru.h" #include "access/subtrans.h" #include "access/transam.h" #include "access/twophase.h" #include "access/twophase_rmgr.h" #include "access/xact.h" +#include "access/xlog.h" #include "miscadmin.h" +#include "pgstat.h" #include "storage/bufmgr.h" #include "storage/predicate.h" #include "storage/predicate_internals.h" @@ -240,7 +243,10 @@ #define PredicateLockHashPartition(hashcode) \ ((hashcode) % NUM_PREDICATELOCK_PARTITIONS) #define PredicateLockHashPartitionLock(hashcode) \ - ((LWLockId) (FirstPredicateLockMgrLock + PredicateLockHashPartition(hashcode))) + (&MainLWLockArray[PREDICATELOCK_MANAGER_LWLOCK_OFFSET + \ + PredicateLockHashPartition(hashcode)].lock) +#define PredicateLockHashPartitionLockByIndex(i) \ + (&MainLWLockArray[PREDICATELOCK_MANAGER_LWLOCK_OFFSET + (i)].lock) #define NPREDICATELOCKTARGETENTS() \ mul_size(max_predicate_locks_per_xact, add_size(MaxBackends, max_prepared_xacts)) @@ -281,7 +287,7 @@ * the lock partition number from the hashcode. */ #define PredicateLockTargetTagHashCode(predicatelocktargettag) \ - (tag_hash((predicatelocktargettag), sizeof(PREDICATELOCKTARGETTAG))) + get_hash_value(PredicateLockTargetHash, predicatelocktargettag) /* * Given a predicate lock tag, and the hash for its target, @@ -331,7 +337,7 @@ typedef struct OldSerXidControlData TransactionId headXid; /* newest valid Xid in the SLRU */ TransactionId tailXid; /* oldest xmin we might be interested in */ bool warningIssued; /* have we issued SLRU wrap-around warning? */ -} OldSerXidControlData; +} OldSerXidControlData; typedef struct OldSerXidControlData *OldSerXidControl; @@ -346,12 +352,19 @@ static OldSerXidControl oldSerXidControl; static SERIALIZABLEXACT *OldCommittedSxact; -/* This configuration variable is used to set the predicate lock table size */ -int max_predicate_locks_per_xact; /* set by guc.c */ +/* + * These configuration variables are used to set the predicate lock table size + * and to control promotion of predicate locks to coarser granularity in an + * attempt to degrade performance (mostly as false positive serialization + * failure) gracefully in the face of memory pressurel + */ +int max_predicate_locks_per_xact; /* set by guc.c */ +int max_predicate_locks_per_relation; /* set by guc.c */ +int max_predicate_locks_per_page; /* set by guc.c */ /* * This provides a list of objects in order to track transactions - * participating in predicate locking. Entries in the list are fixed size, + * participating in predicate locking. Entries in the list are fixed size, * and reside in shared memory. The memory address of an entry must remain * fixed during its lifetime. The list will be protected from concurrent * update externally; no provision is made in this code to manage that. The @@ -380,9 +393,9 @@ static SHM_QUEUE *FinishedSerializableTransactions; * this entry, you can ensure that there's enough scratch space available for * inserting one entry in the hash table. This is an otherwise-invalid tag. */ -static const PREDICATELOCKTARGETTAG ScratchTargetTag = {0, 0, 0, 0, 0}; +static const PREDICATELOCKTARGETTAG ScratchTargetTag = {0, 0, 0, 0}; static uint32 ScratchTargetTagHash; -static int ScratchPartitionLock; +static LWLock *ScratchPartitionLock; /* * The local hash table used to determine when to combine multiple fine- @@ -421,7 +434,8 @@ static uint32 predicatelock_hash(const void *key, Size keysize); static void SummarizeOldestCommittedSxact(void); static Snapshot GetSafeSnapshot(Snapshot snapshot); static Snapshot GetSerializableTransactionSnapshotInt(Snapshot snapshot, - TransactionId sourcexid); + VirtualTransactionId *sourcevxid, + int sourcepid); static bool PredicateLockExists(const PREDICATELOCKTARGETTAG *targettag); static bool GetParentPredicateLockTag(const PREDICATELOCKTARGETTAG *tag, PREDICATELOCKTARGETTAG *parent); @@ -431,7 +445,7 @@ static void RestoreScratchTarget(bool lockheld); static void RemoveTargetIfNoLongerUsed(PREDICATELOCKTARGET *target, uint32 targettaghash); static void DeleteChildTargetLocks(const PREDICATELOCKTARGETTAG *newtargettag); -static int PredicateLockPromotionThreshold(const PREDICATELOCKTARGETTAG *tag); +static int MaxPredicateChildLocks(const PREDICATELOCKTARGETTAG *tag); static bool CheckAndPromotePredicateLockRequest(const PREDICATELOCKTARGETTAG *reqtag); static void DecrementParentLocks(const PREDICATELOCKTARGETTAG *targettag); static void CreatePredicateLock(const PREDICATELOCKTARGETTAG *targettag, @@ -459,13 +473,14 @@ static void OnConflict_CheckForSerializationFailure(const SERIALIZABLEXACT *read /* * Does this relation participate in predicate locking? Temporary and system - * relations are exempt. + * relations are exempt, as are materialized views. */ static inline bool PredicateLockingNeededForRelation(Relation relation) { return !(relation->rd_id < FirstBootstrapObjectId || - RelationUsesLocalBuffers(relation)); + RelationUsesLocalBuffers(relation) || + relation->rd_rel->relkind == RELKIND_MATVIEW); } /* @@ -490,8 +505,8 @@ SerializationNeededForRead(Relation relation, Snapshot snapshot) * Don't acquire locks or conflict when scanning with a special snapshot. * This excludes things like CLUSTER and REINDEX. They use the wholesale * functions TransferPredicateLocksToHeapRelation() and - * CheckTableForSerializableConflictIn() to participate serialization, but - * the scans involved don't need serialization. + * CheckTableForSerializableConflictIn() to participate in serialization, + * but the scans involved don't need serialization. */ if (!IsMVCCSnapshot(snapshot)) return false; @@ -542,7 +557,7 @@ SerializationNeededForWrite(Relation relation) /* * These functions are a simple implementation of a list for this specific - * type of struct. If there is ever a generalized shared memory list, we + * type of struct. If there is ever a generalized shared memory list, we * should probably switch to that. */ static SERIALIZABLEXACT * @@ -762,7 +777,7 @@ OldSerXidPagePrecedesLogically(int p, int q) int diff; /* - * We have to compare modulo (OLDSERXID_MAX_PAGE+1)/2. Both inputs should + * We have to compare modulo (OLDSERXID_MAX_PAGE+1)/2. Both inputs should * be in the range 0..OLDSERXID_MAX_PAGE. */ Assert(p >= 0 && p <= OLDSERXID_MAX_PAGE); @@ -788,8 +803,9 @@ OldSerXidInit(void) * Set up SLRU management of the pg_serial data. */ OldSerXidSlruCtl->PagePrecedes = OldSerXidPagePrecedesLogically; - SimpleLruInit(OldSerXidSlruCtl, "OldSerXid SLRU Ctl", - NUM_OLDSERXID_BUFFERS, 0, OldSerXidLock, "pg_serial"); + SimpleLruInit(OldSerXidSlruCtl, "oldserxid", + NUM_OLDSERXID_BUFFERS, 0, OldSerXidLock, "pg_serial", + LWTRANCHE_OLDSERXID_BUFFERS); /* Override default assumption that writes should be fsync'd */ OldSerXidSlruCtl->do_fsync = false; @@ -924,7 +940,7 @@ OldSerXidAdd(TransactionId xid, SerCommitSeqNo minConflictCommitSeqNo) } /* - * Get the minimum commitSeqNo for any conflict out for the given xid. For + * Get the minimum commitSeqNo for any conflict out for the given xid. For * a transaction which exists but has no conflict out, InvalidSerCommitSeqNo * will be returned. */ @@ -977,7 +993,7 @@ OldSerXidSetActiveSerXmin(TransactionId xid) /* * When no sxacts are active, nothing overlaps, set the xid values to * invalid to show that there are no valid entries. Don't clear headPage, - * though. A new xmin might still land on that page, and we don't want to + * though. A new xmin might still land on that page, and we don't want to * repeatedly zero out the same page. */ if (!TransactionIdIsValid(xid)) @@ -1089,7 +1105,6 @@ void InitPredicateLocks(void) { HASHCTL info; - int hash_flags; long max_table_size; Size requestSize; bool found; @@ -1107,15 +1122,14 @@ InitPredicateLocks(void) MemSet(&info, 0, sizeof(info)); info.keysize = sizeof(PREDICATELOCKTARGETTAG); info.entrysize = sizeof(PREDICATELOCKTARGET); - info.hash = tag_hash; info.num_partitions = NUM_PREDICATELOCK_PARTITIONS; - hash_flags = (HASH_ELEM | HASH_FUNCTION | HASH_PARTITION | HASH_FIXED_SIZE); PredicateLockTargetHash = ShmemInitHash("PREDICATELOCKTARGET hash", max_table_size, max_table_size, &info, - hash_flags); + HASH_ELEM | HASH_BLOBS | + HASH_PARTITION | HASH_FIXED_SIZE); /* Assume an average of 2 xacts per target */ max_table_size *= 2; @@ -1137,13 +1151,13 @@ InitPredicateLocks(void) info.entrysize = sizeof(PREDICATELOCK); info.hash = predicatelock_hash; info.num_partitions = NUM_PREDICATELOCK_PARTITIONS; - hash_flags = (HASH_ELEM | HASH_FUNCTION | HASH_PARTITION | HASH_FIXED_SIZE); PredicateLockHash = ShmemInitHash("PREDICATELOCK hash", max_table_size, max_table_size, &info, - hash_flags); + HASH_ELEM | HASH_FUNCTION | + HASH_PARTITION | HASH_FIXED_SIZE); /* * Compute size for serializable transaction hashtable. Note these @@ -1179,12 +1193,6 @@ InitPredicateLocks(void) requestSize = mul_size((Size) max_table_size, PredXactListElementDataSize); PredXact->element = ShmemAlloc(requestSize); - if (PredXact->element == NULL) - ereport(ERROR, - (errcode(ERRCODE_OUT_OF_MEMORY), - errmsg("not enough shared memory for elements of data structure" - " \"%s\" (%lu bytes requested)", - "PredXactList", (unsigned long) requestSize))); /* Add all elements to available list, clean. */ memset(PredXact->element, 0, requestSize); for (i = 0; i < max_table_size; i++) @@ -1218,14 +1226,13 @@ InitPredicateLocks(void) MemSet(&info, 0, sizeof(info)); info.keysize = sizeof(SERIALIZABLEXIDTAG); info.entrysize = sizeof(SERIALIZABLEXID); - info.hash = tag_hash; - hash_flags = (HASH_ELEM | HASH_FUNCTION | HASH_FIXED_SIZE); SerializableXidHash = ShmemInitHash("SERIALIZABLEXID hash", max_table_size, max_table_size, &info, - hash_flags); + HASH_ELEM | HASH_BLOBS | + HASH_FIXED_SIZE); /* * Allocate space for tracking rw-conflicts in lists attached to the @@ -1251,12 +1258,6 @@ InitPredicateLocks(void) requestSize = mul_size((Size) max_table_size, RWConflictDataSize); RWConflictPool->element = ShmemAlloc(requestSize); - if (RWConflictPool->element == NULL) - ereport(ERROR, - (errcode(ERRCODE_OUT_OF_MEMORY), - errmsg("not enough shared memory for elements of data structure" - " \"%s\" (%lu bytes requested)", - "RWConflictPool", (unsigned long) requestSize))); /* Add all elements to available list, clean. */ memset(RWConflictPool->element, 0, requestSize); for (i = 0; i < max_table_size; i++) @@ -1396,7 +1397,7 @@ GetPredicateLockStatusData(void) * in ascending order, then SerializableXactHashLock. */ for (i = 0; i < NUM_PREDICATELOCK_PARTITIONS; i++) - LWLockAcquire(FirstPredicateLockMgrLock + i, LW_SHARED); + LWLockAcquire(PredicateLockHashPartitionLockByIndex(i), LW_SHARED); LWLockAcquire(SerializableXactHashLock, LW_SHARED); /* Get number of locks and allocate appropriately-sized arrays. */ @@ -1425,7 +1426,7 @@ GetPredicateLockStatusData(void) /* Release locks in reverse order */ LWLockRelease(SerializableXactHashLock); for (i = NUM_PREDICATELOCK_PARTITIONS - 1; i >= 0; i--) - LWLockRelease(FirstPredicateLockMgrLock + i); + LWLockRelease(PredicateLockHashPartitionLockByIndex(i)); return data; } @@ -1462,7 +1463,7 @@ SummarizeOldestCommittedSxact(void) /* * Grab the first sxact off the finished list -- this will be the earliest - * commit. Remove it from the list. + * commit. Remove it from the list. */ sxact = (SERIALIZABLEXACT *) SHMQueueNext(FinishedSerializableTransactions, @@ -1510,7 +1511,7 @@ GetSafeSnapshot(Snapshot origSnapshot) * one passed to it, but we avoid assuming that here. */ snapshot = GetSerializableTransactionSnapshotInt(origSnapshot, - InvalidTransactionId); + NULL, InvalidPid); if (MySerializableXact == InvalidSerializableXact) return snapshot; /* no concurrent r/w xacts; it's safe */ @@ -1526,7 +1527,7 @@ GetSafeSnapshot(Snapshot origSnapshot) SxactIsROUnsafe(MySerializableXact))) { LWLockRelease(SerializableXactHashLock); - ProcWaitForSignal(); + ProcWaitForSignal(WAIT_EVENT_SAFE_SNAPSHOT); LWLockAcquire(SerializableXactHashLock, LW_EXCLUSIVE); } MySerializableXact->flags &= ~SXACT_FLAG_DEFERRABLE_WAITING; @@ -1555,6 +1556,56 @@ GetSafeSnapshot(Snapshot origSnapshot) return snapshot; } +/* + * GetSafeSnapshotBlockingPids + * If the specified process is currently blocked in GetSafeSnapshot, + * write the process IDs of all processes that it is blocked by + * into the caller-supplied buffer output[]. The list is truncated at + * output_size, and the number of PIDs written into the buffer is + * returned. Returns zero if the given PID is not currently blocked + * in GetSafeSnapshot. + */ +int +GetSafeSnapshotBlockingPids(int blocked_pid, int *output, int output_size) +{ + int num_written = 0; + SERIALIZABLEXACT *sxact; + + LWLockAcquire(SerializableXactHashLock, LW_SHARED); + + /* Find blocked_pid's SERIALIZABLEXACT by linear search. */ + for (sxact = FirstPredXact(); sxact != NULL; sxact = NextPredXact(sxact)) + { + if (sxact->pid == blocked_pid) + break; + } + + /* Did we find it, and is it currently waiting in GetSafeSnapshot? */ + if (sxact != NULL && SxactIsDeferrableWaiting(sxact)) + { + RWConflict possibleUnsafeConflict; + + /* Traverse the list of possible unsafe conflicts collecting PIDs. */ + possibleUnsafeConflict = (RWConflict) + SHMQueueNext(&sxact->possibleUnsafeConflicts, + &sxact->possibleUnsafeConflicts, + offsetof(RWConflictData, inLink)); + + while (possibleUnsafeConflict != NULL && num_written < output_size) + { + output[num_written++] = possibleUnsafeConflict->sxactOut->pid; + possibleUnsafeConflict = (RWConflict) + SHMQueueNext(&sxact->possibleUnsafeConflicts, + &possibleUnsafeConflict->inLink, + offsetof(RWConflictData, inLink)); + } + } + + LWLockRelease(SerializableXactHashLock); + + return num_written; +} + /* * Acquire a snapshot that can be used for the current transaction. * @@ -1573,8 +1624,8 @@ GetSerializableTransactionSnapshot(Snapshot snapshot) /* * Can't use serializable mode while recovery is still active, as it is, - * for example, on a hot standby. We could get here despite the check - * in check_XactIsoLevel() if default_transaction_isolation is set to + * for example, on a hot standby. We could get here despite the check in + * check_XactIsoLevel() if default_transaction_isolation is set to * serializable, so phrase the hint accordingly. */ if (RecoveryInProgress()) @@ -1593,7 +1644,7 @@ GetSerializableTransactionSnapshot(Snapshot snapshot) return GetSafeSnapshot(snapshot); return GetSerializableTransactionSnapshotInt(snapshot, - InvalidTransactionId); + NULL, InvalidPid); } /* @@ -1608,14 +1659,15 @@ GetSerializableTransactionSnapshot(Snapshot snapshot) */ void SetSerializableTransactionSnapshot(Snapshot snapshot, - TransactionId sourcexid) + VirtualTransactionId *sourcevxid, + int sourcepid) { Assert(IsolationIsSerializable()); /* * We do not allow SERIALIZABLE READ ONLY DEFERRABLE transactions to * import snapshots, since there's no way to wait for a safe snapshot when - * we're using the snap we're told to. (XXX instead of throwing an error, + * we're using the snap we're told to. (XXX instead of throwing an error, * we could just ignore the XactDeferrable flag?) */ if (XactReadOnly && XactDeferrable) @@ -1623,7 +1675,8 @@ SetSerializableTransactionSnapshot(Snapshot snapshot, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), errmsg("a snapshot-importing transaction must not be READ ONLY DEFERRABLE"))); - (void) GetSerializableTransactionSnapshotInt(snapshot, sourcexid); + (void) GetSerializableTransactionSnapshotInt(snapshot, sourcevxid, + sourcepid); } /* @@ -1637,7 +1690,8 @@ SetSerializableTransactionSnapshot(Snapshot snapshot, */ static Snapshot GetSerializableTransactionSnapshotInt(Snapshot snapshot, - TransactionId sourcexid) + VirtualTransactionId *sourcevxid, + int sourcepid) { PGPROC *proc; VirtualTransactionId vxid; @@ -1650,6 +1704,14 @@ GetSerializableTransactionSnapshotInt(Snapshot snapshot, Assert(!RecoveryInProgress()); + /* + * Since all parts of a serializable transaction must use the same + * snapshot, it is too late to establish one after a parallel operation + * has begun. + */ + if (IsInParallelMode()) + elog(ERROR, "cannot establish serializable snapshot during a parallel operation"); + proc = MyProc; Assert(proc != NULL); GET_VXID_FROM_PGPROC(vxid, *proc); @@ -1664,7 +1726,7 @@ GetSerializableTransactionSnapshotInt(Snapshot snapshot, * release SerializableXactHashLock to call SummarizeOldestCommittedSxact, * this means we have to create the sxact first, which is a bit annoying * (in particular, an elog(ERROR) in procarray.c would cause us to leak - * the sxact). Consider refactoring to avoid this. + * the sxact). Consider refactoring to avoid this. */ #ifdef TEST_OLDSERXID SummarizeOldestCommittedSxact(); @@ -1683,17 +1745,17 @@ GetSerializableTransactionSnapshotInt(Snapshot snapshot, } while (!sxact); /* Get the snapshot, or check that it's safe to use */ - if (!TransactionIdIsValid(sourcexid)) + if (!sourcevxid) snapshot = GetSnapshotData(snapshot); - else if (!ProcArrayInstallImportedXmin(snapshot->xmin, sourcexid)) + else if (!ProcArrayInstallImportedXmin(snapshot->xmin, sourcevxid)) { ReleasePredXact(sxact); LWLockRelease(SerializableXactHashLock); ereport(ERROR, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), errmsg("could not import the requested snapshot"), - errdetail("The source transaction %u is not running anymore.", - sourcexid))); + errdetail("The source process with pid %d is not running anymore.", + sourcepid))); } /* @@ -1787,11 +1849,10 @@ GetSerializableTransactionSnapshotInt(Snapshot snapshot, MemSet(&hash_ctl, 0, sizeof(hash_ctl)); hash_ctl.keysize = sizeof(PREDICATELOCKTARGETTAG); hash_ctl.entrysize = sizeof(LOCALPREDICATELOCK); - hash_ctl.hash = tag_hash; LocalPredicateLockHash = hash_create("Local predicate lock", max_predicate_locks_per_xact, &hash_ctl, - HASH_ELEM | HASH_FUNCTION); + HASH_ELEM | HASH_BLOBS); return snapshot; } @@ -1854,7 +1915,7 @@ PageIsPredicateLocked(Relation relation, BlockNumber blkno) { PREDICATELOCKTARGETTAG targettag; uint32 targettaghash; - LWLockId partitionLock; + LWLock *partitionLock; PREDICATELOCKTARGET *target; SET_PREDICATELOCKTARGETTAG_PAGE(targettag, @@ -2046,7 +2107,7 @@ RemoveTargetIfNoLongerUsed(PREDICATELOCKTARGET *target, uint32 targettaghash) /* * Delete child target locks owned by this process. * This implementation is assuming that the usage of each target tag field - * is uniform. No need to make this hard if we don't have to. + * is uniform. No need to make this hard if we don't have to. * * We aren't acquiring lightweight locks for the predicate lock or lock * target structures associated with this transaction unless we're going @@ -2087,7 +2148,7 @@ DeleteChildTargetLocks(const PREDICATELOCKTARGETTAG *newtargettag) if (TargetTagIsCoveredBy(oldtargettag, *newtargettag)) { uint32 oldtargettaghash; - LWLockId partitionLock; + LWLock *partitionLock; PREDICATELOCK *rmpredlock PG_USED_FOR_ASSERTS_ONLY; oldtargettaghash = PredicateLockTargetTagHashCode(&oldtargettag); @@ -2118,28 +2179,35 @@ DeleteChildTargetLocks(const PREDICATELOCKTARGETTAG *newtargettag) } /* - * Returns the promotion threshold for a given predicate lock - * target. This is the number of descendant locks required to promote - * to the specified tag. Note that the threshold includes non-direct - * descendants, e.g. both tuples and pages for a relation lock. + * Returns the promotion limit for a given predicate lock target. This is the + * max number of descendant locks allowed before promoting to the specified + * tag. Note that the limit includes non-direct descendants (e.g., both tuples + * and pages for a relation lock). * - * TODO SSI: We should do something more intelligent about what the - * thresholds are, either making it proportional to the number of - * tuples in a page & pages in a relation, or at least making it a - * GUC. Currently the threshold is 3 for a page lock, and - * max_pred_locks_per_transaction/2 for a relation lock, chosen - * entirely arbitrarily (and without benchmarking). + * Currently the default limit is 2 for a page lock, and half of the value of + * max_pred_locks_per_transaction - 1 for a relation lock, to match behavior + * of earlier releases when upgrading. + * + * TODO SSI: We should probably add additional GUCs to allow a maximum ratio + * of page and tuple locks based on the pages in a relation, and the maximum + * ratio of tuple locks to tuples in a page. This would provide more + * generally "balanced" allocation of locks to where they are most useful, + * while still allowing the absolute numbers to prevent one relation from + * tying up all predicate lock resources. */ static int -PredicateLockPromotionThreshold(const PREDICATELOCKTARGETTAG *tag) +MaxPredicateChildLocks(const PREDICATELOCKTARGETTAG *tag) { switch (GET_PREDICATELOCKTARGETTAG_TYPE(*tag)) { case PREDLOCKTAG_RELATION: - return max_predicate_locks_per_xact / 2; + return max_predicate_locks_per_relation < 0 + ? (max_predicate_locks_per_xact + / (-max_predicate_locks_per_relation)) - 1 + : max_predicate_locks_per_relation; case PREDLOCKTAG_PAGE: - return 3; + return max_predicate_locks_per_page; case PREDLOCKTAG_TUPLE: @@ -2194,8 +2262,8 @@ CheckAndPromotePredicateLockRequest(const PREDICATELOCKTARGETTAG *reqtag) else parentlock->childLocks++; - if (parentlock->childLocks >= - PredicateLockPromotionThreshold(&targettag)) + if (parentlock->childLocks > + MaxPredicateChildLocks(&targettag)) { /* * We should promote to this parent lock. Continue to check its @@ -2299,7 +2367,7 @@ CreatePredicateLock(const PREDICATELOCKTARGETTAG *targettag, PREDICATELOCKTARGET *target; PREDICATELOCKTAG locktag; PREDICATELOCK *lock; - LWLockId partitionLock; + LWLock *partitionLock; bool found; partitionLock = PredicateLockHashPartitionLock(targettaghash); @@ -2490,11 +2558,9 @@ PredicateLockTuple(Relation relation, HeapTuple tuple, Snapshot snapshot) } } } - else - targetxmin = InvalidTransactionId; /* - * Do quick-but-not-definitive test for a relation lock first. This will + * Do quick-but-not-definitive test for a relation lock first. This will * never cause a return when the relation is *not* locked, but will * occasionally let the check continue when there really *is* a relation * level lock. @@ -2510,8 +2576,7 @@ PredicateLockTuple(Relation relation, HeapTuple tuple, Snapshot snapshot) relation->rd_node.dbNode, relation->rd_id, ItemPointerGetBlockNumber(tid), - ItemPointerGetOffsetNumber(tid), - targetxmin); + ItemPointerGetOffsetNumber(tid)); PredicateLockAcquire(&tag); } @@ -2600,10 +2665,10 @@ TransferPredicateLocksToNewTarget(PREDICATELOCKTARGETTAG oldtargettag, bool removeOld) { uint32 oldtargettaghash; - LWLockId oldpartitionLock; + LWLock *oldpartitionLock; PREDICATELOCKTARGET *oldtarget; uint32 newtargettaghash; - LWLockId newpartitionLock; + LWLock *newpartitionLock; bool found; bool outOfShmem = false; @@ -2780,7 +2845,7 @@ exit: /* We shouldn't run out of memory if we're moving locks */ Assert(!outOfShmem); - /* Put the scrach entry back */ + /* Put the scratch entry back */ RestoreScratchTarget(false); } @@ -2807,7 +2872,7 @@ exit: * transaction which is not serializable. * * NOTE: This is currently only called with transfer set to true, but that may - * change. If we decide to clean up the locks from a table on commit of a + * change. If we decide to clean up the locks from a table on commit of a * transaction which executed DROP TABLE, the false condition will be useful. */ static void @@ -2849,8 +2914,8 @@ DropAllPredicateLocksFromTable(Relation relation, bool transfer) heapId = relation->rd_index->indrelid; } Assert(heapId != InvalidOid); - Assert(transfer || !isIndex); /* index OID only makes sense with - * transfer */ + Assert(transfer || !isIndex); /* index OID only makes sense with + * transfer */ /* Retrieve first time needed, then keep. */ heaptargettaghash = 0; @@ -2859,7 +2924,7 @@ DropAllPredicateLocksFromTable(Relation relation, bool transfer) /* Acquire locks on all lock partitions */ LWLockAcquire(SerializablePredicateLockListLock, LW_EXCLUSIVE); for (i = 0; i < NUM_PREDICATELOCK_PARTITIONS; i++) - LWLockAcquire(FirstPredicateLockMgrLock + i, LW_EXCLUSIVE); + LWLockAcquire(PredicateLockHashPartitionLockByIndex(i), LW_EXCLUSIVE); LWLockAcquire(SerializableXactHashLock, LW_EXCLUSIVE); /* @@ -2888,7 +2953,7 @@ DropAllPredicateLocksFromTable(Relation relation, bool transfer) continue; /* already the right lock */ /* - * If we made it here, we have work to do. We make sure the heap + * If we made it here, we have work to do. We make sure the heap * relation lock exists, then we walk the list of predicate locks for * the old target we found, moving all locks to the heap relation lock * -- unless they already hold that. @@ -2997,7 +3062,7 @@ DropAllPredicateLocksFromTable(Relation relation, bool transfer) /* Release locks in reverse order */ LWLockRelease(SerializableXactHashLock); for (i = NUM_PREDICATELOCK_PARTITIONS - 1; i >= 0; i--) - LWLockRelease(FirstPredicateLockMgrLock + i); + LWLockRelease(PredicateLockHashPartitionLockByIndex(i)); LWLockRelease(SerializablePredicateLockListLock); } @@ -3196,7 +3261,7 @@ ReleasePredicateLocks(bool isCommit) /* * We can't trust XactReadOnly here, because a transaction which started * as READ WRITE can show as READ ONLY later, e.g., within - * substransactions. We want to flag a transaction as READ ONLY if it + * subtransactions. We want to flag a transaction as READ ONLY if it * commits without writing so that de facto READ ONLY transactions get the * benefit of some RO optimizations, so we will use this local variable to * get some cleanup logic right which is based on whether the transaction @@ -3210,30 +3275,29 @@ ReleasePredicateLocks(bool isCommit) return; } + LWLockAcquire(SerializableXactHashLock, LW_EXCLUSIVE); + Assert(!isCommit || SxactIsPrepared(MySerializableXact)); Assert(!isCommit || !SxactIsDoomed(MySerializableXact)); Assert(!SxactIsCommitted(MySerializableXact)); Assert(!SxactIsRolledBack(MySerializableXact)); /* may not be serializable during COMMIT/ROLLBACK PREPARED */ - if (MySerializableXact->pid != 0) - Assert(IsolationIsSerializable()); + Assert(MySerializableXact->pid == 0 || IsolationIsSerializable()); /* We'd better not already be on the cleanup list. */ Assert(!SxactIsOnFinishedList(MySerializableXact)); topLevelIsDeclaredReadOnly = SxactIsReadOnly(MySerializableXact); - LWLockAcquire(SerializableXactHashLock, LW_EXCLUSIVE); - /* * We don't hold XidGenLock lock here, assuming that TransactionId is * atomic! * * If this value is changing, we don't care that much whether we get the * old or new value -- it is just used to determine how far - * GlobalSerizableXmin must advance before this transaction can be fully - * cleaned up. The worst that could happen is we wait for one more + * GlobalSerializableXmin must advance before this transaction can be + * fully cleaned up. The worst that could happen is we wait for one more * transaction to complete before freeing some RAM; correctness of visible * behavior is not affected. */ @@ -3336,7 +3400,7 @@ ReleasePredicateLocks(bool isCommit) } /* - * Release all outConflicts to committed transactions. If we're rolling + * Release all outConflicts to committed transactions. If we're rolling * back clear them all. Set SXACT_FLAG_CONFLICT_OUT if any point to * previously committed transactions. */ @@ -3612,7 +3676,7 @@ ClearOldPredicateLocks(void) PREDICATELOCKTARGET *target; PREDICATELOCKTARGETTAG targettag; uint32 targettaghash; - LWLockId partitionLock; + LWLock *partitionLock; tag = predlock->tag; target = tag.myTarget; @@ -3655,7 +3719,7 @@ ClearOldPredicateLocks(void) * matter -- but keep the transaction entry itself and any outConflicts. * * When the summarize flag is set, we've run short of room for sxact data - * and must summarize to the SLRU. Predicate locks are transferred to a + * and must summarize to the SLRU. Predicate locks are transferred to a * dummy "old" transaction, with duplicate locks on a single target * collapsing to a single lock with the "latest" commitSeqNo from among * the conflicting locks.. @@ -3691,7 +3755,7 @@ ReleaseOneSerializableXact(SERIALIZABLEXACT *sxact, bool partial, PREDICATELOCKTARGET *target; PREDICATELOCKTARGETTAG targettag; uint32 targettaghash; - LWLockId partitionLock; + LWLock *partitionLock; nextpredlock = (PREDICATELOCK *) SHMQueueNext(&(sxact->predicateLocks), @@ -3848,7 +3912,7 @@ XidIsConcurrent(TransactionId xid) /* * CheckForSerializableConflictOut * We are reading a tuple which has been modified. If it is visible to - * us but has been deleted, that indicates a rw-conflict out. If it's + * us but has been deleted, that indicates a rw-conflict out. If it's * not visible and was created by a concurrent (overlapping) * serializable transaction, that is also a rw-conflict out, * @@ -3893,7 +3957,7 @@ CheckForSerializableConflictOut(bool visible, Relation relation, * tuple is visible to us, while HeapTupleSatisfiesVacuum checks what else * is going on with it. */ - htsvResult = HeapTupleSatisfiesVacuum(tuple->t_data, TransactionXmin, buffer); + htsvResult = HeapTupleSatisfiesVacuum(tuple, TransactionXmin, buffer); switch (htsvResult) { case HEAPTUPLE_LIVE: @@ -3904,10 +3968,10 @@ CheckForSerializableConflictOut(bool visible, Relation relation, case HEAPTUPLE_RECENTLY_DEAD: if (!visible) return; - xid = HeapTupleHeaderGetXmax(tuple->t_data); + xid = HeapTupleHeaderGetUpdateXid(tuple->t_data); break; case HEAPTUPLE_DELETE_IN_PROGRESS: - xid = HeapTupleHeaderGetXmax(tuple->t_data); + xid = HeapTupleHeaderGetUpdateXid(tuple->t_data); break; case HEAPTUPLE_INSERT_IN_PROGRESS: xid = HeapTupleHeaderGetXmin(tuple->t_data); @@ -3935,7 +3999,7 @@ CheckForSerializableConflictOut(bool visible, Relation relation, Assert(TransactionIdFollowsOrEquals(xid, TransactionXmin)); /* - * Find top level xid. Bail out if xid is too early to be a conflict, or + * Find top level xid. Bail out if xid is too early to be a conflict, or * if it's our own xid. */ if (TransactionIdEquals(xid, GetTopTransactionIdIfAny())) @@ -4000,7 +4064,7 @@ CheckForSerializableConflictOut(bool visible, Relation relation, /* * We have a conflict out to a transaction which has a conflict out to a - * summarized transaction. That summarized transaction must have + * summarized transaction. That summarized transaction must have * committed first, and we can't tell when it committed in relation to our * snapshot acquisition, so something needs to be canceled. */ @@ -4034,7 +4098,7 @@ CheckForSerializableConflictOut(bool visible, Relation relation, && (!SxactHasConflictOut(sxact) || MySerializableXact->SeqNo.lastCommitBeforeSnapshot < sxact->SeqNo.earliestOutConflictCommit)) { - /* Read-only transaction will appear to run first. No conflict. */ + /* Read-only transaction will appear to run first. No conflict. */ LWLockRelease(SerializableXactHashLock); return; } @@ -4069,7 +4133,7 @@ static void CheckTargetForConflictsIn(PREDICATELOCKTARGETTAG *targettag) { uint32 targettaghash; - LWLockId partitionLock; + LWLock *partitionLock; PREDICATELOCKTARGET *target; PREDICATELOCK *predlock; PREDICATELOCK *mypredlock = NULL; @@ -4280,9 +4344,8 @@ CheckForSerializableConflictIn(Relation relation, HeapTuple tuple, SET_PREDICATELOCKTARGETTAG_TUPLE(targettag, relation->rd_node.dbNode, relation->rd_id, - ItemPointerGetBlockNumber(&(tuple->t_data->t_ctid)), - ItemPointerGetOffsetNumber(&(tuple->t_data->t_ctid)), - HeapTupleHeaderGetXmin(tuple->t_data)); + ItemPointerGetBlockNumber(&(tuple->t_self)), + ItemPointerGetOffsetNumber(&(tuple->t_self))); CheckTargetForConflictsIn(&targettag); } @@ -4362,8 +4425,8 @@ CheckTableForSerializableConflictIn(Relation relation) LWLockAcquire(SerializablePredicateLockListLock, LW_EXCLUSIVE); for (i = 0; i < NUM_PREDICATELOCK_PARTITIONS; i++) - LWLockAcquire(FirstPredicateLockMgrLock + i, LW_SHARED); - LWLockAcquire(SerializableXactHashLock, LW_SHARED); + LWLockAcquire(PredicateLockHashPartitionLockByIndex(i), LW_SHARED); + LWLockAcquire(SerializableXactHashLock, LW_EXCLUSIVE); /* Scan through target list */ hash_seq_init(&seqstat, PredicateLockTargetHash); @@ -4409,7 +4472,7 @@ CheckTableForSerializableConflictIn(Relation relation) /* Release locks in reverse order */ LWLockRelease(SerializableXactHashLock); for (i = NUM_PREDICATELOCK_PARTITIONS - 1; i >= 0; i--) - LWLockRelease(FirstPredicateLockMgrLock + i); + LWLockRelease(PredicateLockHashPartitionLockByIndex(i)); LWLockRelease(SerializablePredicateLockListLock); } @@ -4626,7 +4689,7 @@ OnConflict_CheckForSerializationFailure(const SERIALIZABLEXACT *reader, * * If a dangerous structure is found, the pivot (the near conflict) is * marked for death, because rolling back another transaction might mean - * that we flail without ever making progress. This transaction is + * that we flail without ever making progress. This transaction is * committing writes, so letting it commit ensures progress. If we * canceled the far conflict, it might immediately fail again on retry. */ @@ -4739,7 +4802,7 @@ AtPrepare_PredicateLocks(void) if (MySerializableXact == InvalidSerializableXact) return; - /* Generate a xact record for our SERIALIZABLEXACT */ + /* Generate an xact record for our SERIALIZABLEXACT */ record.type = TWOPHASEPREDICATERECORD_XACT; xactRecord->xmin = MySerializableXact->xmin; xactRecord->flags = MySerializableXact->flags;