]> granicus.if.org Git - postgresql/blob - src/backend/storage/lmgr/predicate.c
Fix outdated comment
[postgresql] / src / backend / storage / lmgr / predicate.c
1 /*-------------------------------------------------------------------------
2  *
3  * predicate.c
4  *        POSTGRES predicate locking
5  *        to support full serializable transaction isolation
6  *
7  *
8  * The approach taken is to implement Serializable Snapshot Isolation (SSI)
9  * as initially described in this paper:
10  *
11  *      Michael J. Cahill, Uwe Röhm, and Alan D. Fekete. 2008.
12  *      Serializable isolation for snapshot databases.
13  *      In SIGMOD '08: Proceedings of the 2008 ACM SIGMOD
14  *      international conference on Management of data,
15  *      pages 729-738, New York, NY, USA. ACM.
16  *      http://doi.acm.org/10.1145/1376616.1376690
17  *
18  * and further elaborated in Cahill's doctoral thesis:
19  *
20  *      Michael James Cahill. 2009.
21  *      Serializable Isolation for Snapshot Databases.
22  *      Sydney Digital Theses.
23  *      University of Sydney, School of Information Technologies.
24  *      http://hdl.handle.net/2123/5353
25  *
26  *
27  * Predicate locks for Serializable Snapshot Isolation (SSI) are SIREAD
28  * locks, which are so different from normal locks that a distinct set of
29  * structures is required to handle them.  They are needed to detect
30  * rw-conflicts when the read happens before the write.  (When the write
31  * occurs first, the reading transaction can check for a conflict by
32  * examining the MVCC data.)
33  *
34  * (1)  Besides tuples actually read, they must cover ranges of tuples
35  *              which would have been read based on the predicate.  This will
36  *              require modelling the predicates through locks against database
37  *              objects such as pages, index ranges, or entire tables.
38  *
39  * (2)  They must be kept in RAM for quick access.  Because of this, it
40  *              isn't possible to always maintain tuple-level granularity -- when
41  *              the space allocated to store these approaches exhaustion, a
42  *              request for a lock may need to scan for situations where a single
43  *              transaction holds many fine-grained locks which can be coalesced
44  *              into a single coarser-grained lock.
45  *
46  * (3)  They never block anything; they are more like flags than locks
47  *              in that regard; although they refer to database objects and are
48  *              used to identify rw-conflicts with normal write locks.
49  *
50  * (4)  While they are associated with a transaction, they must survive
51  *              a successful COMMIT of that transaction, and remain until all
52  *              overlapping transactions complete.  This even means that they
53  *              must survive termination of the transaction's process.  If a
54  *              top level transaction is rolled back, however, it is immediately
55  *              flagged so that it can be ignored, and its SIREAD locks can be
56  *              released any time after that.
57  *
58  * (5)  The only transactions which create SIREAD locks or check for
59  *              conflicts with them are serializable transactions.
60  *
61  * (6)  When a write lock for a top level transaction is found to cover
62  *              an existing SIREAD lock for the same transaction, the SIREAD lock
63  *              can be deleted.
64  *
65  * (7)  A write from a serializable transaction must ensure that an xact
66  *              record exists for the transaction, with the same lifespan (until
67  *              all concurrent transaction complete or the transaction is rolled
68  *              back) so that rw-dependencies to that transaction can be
69  *              detected.
70  *
71  * We use an optimization for read-only transactions. Under certain
72  * circumstances, a read-only transaction's snapshot can be shown to
73  * never have conflicts with other transactions.  This is referred to
74  * as a "safe" snapshot (and one known not to be is "unsafe").
75  * However, it can't be determined whether a snapshot is safe until
76  * all concurrent read/write transactions complete.
77  *
78  * Once a read-only transaction is known to have a safe snapshot, it
79  * can release its predicate locks and exempt itself from further
80  * predicate lock tracking. READ ONLY DEFERRABLE transactions run only
81  * on safe snapshots, waiting as necessary for one to be available.
82  *
83  *
84  * Lightweight locks to manage access to the predicate locking shared
85  * memory objects must be taken in this order, and should be released in
86  * reverse order:
87  *
88  *      SerializableFinishedListLock
89  *              - Protects the list of transactions which have completed but which
90  *                      may yet matter because they overlap still-active transactions.
91  *
92  *      SerializablePredicateLockListLock
93  *              - Protects the linked list of locks held by a transaction.  Note
94  *                      that the locks themselves are also covered by the partition
95  *                      locks of their respective lock targets; this lock only affects
96  *                      the linked list connecting the locks related to a transaction.
97  *              - All transactions share this single lock (with no partitioning).
98  *              - There is never a need for a process other than the one running
99  *                      an active transaction to walk the list of locks held by that
100  *                      transaction.
101  *              - It is relatively infrequent that another process needs to
102  *                      modify the list for a transaction, but it does happen for such
103  *                      things as index page splits for pages with predicate locks and
104  *                      freeing of predicate locked pages by a vacuum process.  When
105  *                      removing a lock in such cases, the lock itself contains the
106  *                      pointers needed to remove it from the list.  When adding a
107  *                      lock in such cases, the lock can be added using the anchor in
108  *                      the transaction structure.  Neither requires walking the list.
109  *              - Cleaning up the list for a terminated transaction is sometimes
110  *                      not done on a retail basis, in which case no lock is required.
111  *              - Due to the above, a process accessing its active transaction's
112  *                      list always uses a shared lock, regardless of whether it is
113  *                      walking or maintaining the list.  This improves concurrency
114  *                      for the common access patterns.
115  *              - A process which needs to alter the list of a transaction other
116  *                      than its own active transaction must acquire an exclusive
117  *                      lock.
118  *
119  *      PredicateLockHashPartitionLock(hashcode)
120  *              - The same lock protects a target, all locks on that target, and
121  *                      the linked list of locks on the target.
122  *              - When more than one is needed, acquire in ascending address order.
123  *              - When all are needed (rare), acquire in ascending index order with
124  *                      PredicateLockHashPartitionLockByIndex(index).
125  *
126  *      SerializableXactHashLock
127  *              - Protects both PredXact and SerializableXidHash.
128  *
129  *
130  * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
131  * Portions Copyright (c) 1994, Regents of the University of California
132  *
133  *
134  * IDENTIFICATION
135  *        src/backend/storage/lmgr/predicate.c
136  *
137  *-------------------------------------------------------------------------
138  */
139 /*
140  * INTERFACE ROUTINES
141  *
142  * housekeeping for setting up shared memory predicate lock structures
143  *              InitPredicateLocks(void)
144  *              PredicateLockShmemSize(void)
145  *
146  * predicate lock reporting
147  *              GetPredicateLockStatusData(void)
148  *              PageIsPredicateLocked(Relation relation, BlockNumber blkno)
149  *
150  * predicate lock maintenance
151  *              GetSerializableTransactionSnapshot(Snapshot snapshot)
152  *              SetSerializableTransactionSnapshot(Snapshot snapshot,
153  *                                                                                 VirtualTransactionId *sourcevxid)
154  *              RegisterPredicateLockingXid(void)
155  *              PredicateLockRelation(Relation relation, Snapshot snapshot)
156  *              PredicateLockPage(Relation relation, BlockNumber blkno,
157  *                                              Snapshot snapshot)
158  *              PredicateLockTuple(Relation relation, HeapTuple tuple,
159  *                                              Snapshot snapshot)
160  *              PredicateLockPageSplit(Relation relation, BlockNumber oldblkno,
161  *                                                         BlockNumber newblkno)
162  *              PredicateLockPageCombine(Relation relation, BlockNumber oldblkno,
163  *                                                               BlockNumber newblkno)
164  *              TransferPredicateLocksToHeapRelation(Relation relation)
165  *              ReleasePredicateLocks(bool isCommit)
166  *
167  * conflict detection (may also trigger rollback)
168  *              CheckForSerializableConflictOut(bool visible, Relation relation,
169  *                                                                              HeapTupleData *tup, Buffer buffer,
170  *                                                                              Snapshot snapshot)
171  *              CheckForSerializableConflictIn(Relation relation, HeapTupleData *tup,
172  *                                                                         Buffer buffer)
173  *              CheckTableForSerializableConflictIn(Relation relation)
174  *
175  * final rollback checking
176  *              PreCommit_CheckForSerializationFailure(void)
177  *
178  * two-phase commit support
179  *              AtPrepare_PredicateLocks(void);
180  *              PostPrepare_PredicateLocks(TransactionId xid);
181  *              PredicateLockTwoPhaseFinish(TransactionId xid, bool isCommit);
182  *              predicatelock_twophase_recover(TransactionId xid, uint16 info,
183  *                                                                         void *recdata, uint32 len);
184  */
185
186 #include "postgres.h"
187
188 #include "access/htup_details.h"
189 #include "access/slru.h"
190 #include "access/subtrans.h"
191 #include "access/transam.h"
192 #include "access/twophase.h"
193 #include "access/twophase_rmgr.h"
194 #include "access/xact.h"
195 #include "access/xlog.h"
196 #include "miscadmin.h"
197 #include "pgstat.h"
198 #include "storage/bufmgr.h"
199 #include "storage/predicate.h"
200 #include "storage/predicate_internals.h"
201 #include "storage/proc.h"
202 #include "storage/procarray.h"
203 #include "utils/rel.h"
204 #include "utils/snapmgr.h"
205 #include "utils/tqual.h"
206
207 /* Uncomment the next line to test the graceful degradation code. */
208 /* #define TEST_OLDSERXID */
209
210 /*
211  * Test the most selective fields first, for performance.
212  *
213  * a is covered by b if all of the following hold:
214  *      1) a.database = b.database
215  *      2) a.relation = b.relation
216  *      3) b.offset is invalid (b is page-granularity or higher)
217  *      4) either of the following:
218  *              4a) a.offset is valid (a is tuple-granularity) and a.page = b.page
219  *       or 4b) a.offset is invalid and b.page is invalid (a is
220  *                      page-granularity and b is relation-granularity
221  */
222 #define TargetTagIsCoveredBy(covered_target, covering_target)                   \
223         ((GET_PREDICATELOCKTARGETTAG_RELATION(covered_target) == /* (2) */      \
224           GET_PREDICATELOCKTARGETTAG_RELATION(covering_target))                         \
225          && (GET_PREDICATELOCKTARGETTAG_OFFSET(covering_target) ==                      \
226                  InvalidOffsetNumber)                                                            /* (3) */      \
227          && (((GET_PREDICATELOCKTARGETTAG_OFFSET(covered_target) !=                     \
228                    InvalidOffsetNumber)                                                          /* (4a) */ \
229                   && (GET_PREDICATELOCKTARGETTAG_PAGE(covering_target) ==               \
230                           GET_PREDICATELOCKTARGETTAG_PAGE(covered_target)))                     \
231                  || ((GET_PREDICATELOCKTARGETTAG_PAGE(covering_target) ==               \
232                           InvalidBlockNumber)                                                    /* (4b) */ \
233                          && (GET_PREDICATELOCKTARGETTAG_PAGE(covered_target)            \
234                                  != InvalidBlockNumber)))                                                               \
235          && (GET_PREDICATELOCKTARGETTAG_DB(covered_target) ==    /* (1) */      \
236                  GET_PREDICATELOCKTARGETTAG_DB(covering_target)))
237
238 /*
239  * The predicate locking target and lock shared hash tables are partitioned to
240  * reduce contention.  To determine which partition a given target belongs to,
241  * compute the tag's hash code with PredicateLockTargetTagHashCode(), then
242  * apply one of these macros.
243  * NB: NUM_PREDICATELOCK_PARTITIONS must be a power of 2!
244  */
245 #define PredicateLockHashPartition(hashcode) \
246         ((hashcode) % NUM_PREDICATELOCK_PARTITIONS)
247 #define PredicateLockHashPartitionLock(hashcode) \
248         (&MainLWLockArray[PREDICATELOCK_MANAGER_LWLOCK_OFFSET + \
249                 PredicateLockHashPartition(hashcode)].lock)
250 #define PredicateLockHashPartitionLockByIndex(i) \
251         (&MainLWLockArray[PREDICATELOCK_MANAGER_LWLOCK_OFFSET + (i)].lock)
252
253 #define NPREDICATELOCKTARGETENTS() \
254         mul_size(max_predicate_locks_per_xact, add_size(MaxBackends, max_prepared_xacts))
255
256 #define SxactIsOnFinishedList(sxact) (!SHMQueueIsDetached(&((sxact)->finishedLink)))
257
258 /*
259  * Note that a sxact is marked "prepared" once it has passed
260  * PreCommit_CheckForSerializationFailure, even if it isn't using
261  * 2PC. This is the point at which it can no longer be aborted.
262  *
263  * The PREPARED flag remains set after commit, so SxactIsCommitted
264  * implies SxactIsPrepared.
265  */
266 #define SxactIsCommitted(sxact) (((sxact)->flags & SXACT_FLAG_COMMITTED) != 0)
267 #define SxactIsPrepared(sxact) (((sxact)->flags & SXACT_FLAG_PREPARED) != 0)
268 #define SxactIsRolledBack(sxact) (((sxact)->flags & SXACT_FLAG_ROLLED_BACK) != 0)
269 #define SxactIsDoomed(sxact) (((sxact)->flags & SXACT_FLAG_DOOMED) != 0)
270 #define SxactIsReadOnly(sxact) (((sxact)->flags & SXACT_FLAG_READ_ONLY) != 0)
271 #define SxactHasSummaryConflictIn(sxact) (((sxact)->flags & SXACT_FLAG_SUMMARY_CONFLICT_IN) != 0)
272 #define SxactHasSummaryConflictOut(sxact) (((sxact)->flags & SXACT_FLAG_SUMMARY_CONFLICT_OUT) != 0)
273 /*
274  * The following macro actually means that the specified transaction has a
275  * conflict out *to a transaction which committed ahead of it*.  It's hard
276  * to get that into a name of a reasonable length.
277  */
278 #define SxactHasConflictOut(sxact) (((sxact)->flags & SXACT_FLAG_CONFLICT_OUT) != 0)
279 #define SxactIsDeferrableWaiting(sxact) (((sxact)->flags & SXACT_FLAG_DEFERRABLE_WAITING) != 0)
280 #define SxactIsROSafe(sxact) (((sxact)->flags & SXACT_FLAG_RO_SAFE) != 0)
281 #define SxactIsROUnsafe(sxact) (((sxact)->flags & SXACT_FLAG_RO_UNSAFE) != 0)
282
283 /*
284  * Compute the hash code associated with a PREDICATELOCKTARGETTAG.
285  *
286  * To avoid unnecessary recomputations of the hash code, we try to do this
287  * just once per function, and then pass it around as needed.  Aside from
288  * passing the hashcode to hash_search_with_hash_value(), we can extract
289  * the lock partition number from the hashcode.
290  */
291 #define PredicateLockTargetTagHashCode(predicatelocktargettag) \
292         get_hash_value(PredicateLockTargetHash, predicatelocktargettag)
293
294 /*
295  * Given a predicate lock tag, and the hash for its target,
296  * compute the lock hash.
297  *
298  * To make the hash code also depend on the transaction, we xor the sxid
299  * struct's address into the hash code, left-shifted so that the
300  * partition-number bits don't change.  Since this is only a hash, we
301  * don't care if we lose high-order bits of the address; use an
302  * intermediate variable to suppress cast-pointer-to-int warnings.
303  */
304 #define PredicateLockHashCodeFromTargetHashCode(predicatelocktag, targethash) \
305         ((targethash) ^ ((uint32) PointerGetDatum((predicatelocktag)->myXact)) \
306          << LOG2_NUM_PREDICATELOCK_PARTITIONS)
307
308
309 /*
310  * The SLRU buffer area through which we access the old xids.
311  */
312 static SlruCtlData OldSerXidSlruCtlData;
313
314 #define OldSerXidSlruCtl                        (&OldSerXidSlruCtlData)
315
316 #define OLDSERXID_PAGESIZE                      BLCKSZ
317 #define OLDSERXID_ENTRYSIZE                     sizeof(SerCommitSeqNo)
318 #define OLDSERXID_ENTRIESPERPAGE        (OLDSERXID_PAGESIZE / OLDSERXID_ENTRYSIZE)
319
320 /*
321  * Set maximum pages based on the lesser of the number needed to track all
322  * transactions and the maximum that SLRU supports.
323  */
324 #define OLDSERXID_MAX_PAGE                      Min(SLRU_PAGES_PER_SEGMENT * 0x10000 - 1, \
325                                                                                 (MaxTransactionId) / OLDSERXID_ENTRIESPERPAGE)
326
327 #define OldSerXidNextPage(page) (((page) >= OLDSERXID_MAX_PAGE) ? 0 : (page) + 1)
328
329 #define OldSerXidValue(slotno, xid) (*((SerCommitSeqNo *) \
330         (OldSerXidSlruCtl->shared->page_buffer[slotno] + \
331         ((((uint32) (xid)) % OLDSERXID_ENTRIESPERPAGE) * OLDSERXID_ENTRYSIZE))))
332
333 #define OldSerXidPage(xid)      ((((uint32) (xid)) / OLDSERXID_ENTRIESPERPAGE) % (OLDSERXID_MAX_PAGE + 1))
334 #define OldSerXidSegment(page)  ((page) / SLRU_PAGES_PER_SEGMENT)
335
336 typedef struct OldSerXidControlData
337 {
338         int                     headPage;               /* newest initialized page */
339         TransactionId headXid;          /* newest valid Xid in the SLRU */
340         TransactionId tailXid;          /* oldest xmin we might be interested in */
341         bool            warningIssued;  /* have we issued SLRU wrap-around warning? */
342 }                       OldSerXidControlData;
343
344 typedef struct OldSerXidControlData *OldSerXidControl;
345
346 static OldSerXidControl oldSerXidControl;
347
348 /*
349  * When the oldest committed transaction on the "finished" list is moved to
350  * SLRU, its predicate locks will be moved to this "dummy" transaction,
351  * collapsing duplicate targets.  When a duplicate is found, the later
352  * commitSeqNo is used.
353  */
354 static SERIALIZABLEXACT *OldCommittedSxact;
355
356
357 /*
358  * These configuration variables are used to set the predicate lock table size
359  * and to control promotion of predicate locks to coarser granularity in an
360  * attempt to degrade performance (mostly as false positive serialization
361  * failure) gracefully in the face of memory pressurel
362  */
363 int                     max_predicate_locks_per_xact;   /* set by guc.c */
364 int                     max_predicate_locks_per_relation;       /* set by guc.c */
365 int                     max_predicate_locks_per_page;   /* set by guc.c */
366
367 /*
368  * This provides a list of objects in order to track transactions
369  * participating in predicate locking.  Entries in the list are fixed size,
370  * and reside in shared memory.  The memory address of an entry must remain
371  * fixed during its lifetime.  The list will be protected from concurrent
372  * update externally; no provision is made in this code to manage that.  The
373  * number of entries in the list, and the size allowed for each entry is
374  * fixed upon creation.
375  */
376 static PredXactList PredXact;
377
378 /*
379  * This provides a pool of RWConflict data elements to use in conflict lists
380  * between transactions.
381  */
382 static RWConflictPoolHeader RWConflictPool;
383
384 /*
385  * The predicate locking hash tables are in shared memory.
386  * Each backend keeps pointers to them.
387  */
388 static HTAB *SerializableXidHash;
389 static HTAB *PredicateLockTargetHash;
390 static HTAB *PredicateLockHash;
391 static SHM_QUEUE *FinishedSerializableTransactions;
392
393 /*
394  * Tag for a dummy entry in PredicateLockTargetHash. By temporarily removing
395  * this entry, you can ensure that there's enough scratch space available for
396  * inserting one entry in the hash table. This is an otherwise-invalid tag.
397  */
398 static const PREDICATELOCKTARGETTAG ScratchTargetTag = {0, 0, 0, 0};
399 static uint32 ScratchTargetTagHash;
400 static LWLock *ScratchPartitionLock;
401
402 /*
403  * The local hash table used to determine when to combine multiple fine-
404  * grained locks into a single courser-grained lock.
405  */
406 static HTAB *LocalPredicateLockHash = NULL;
407
408 /*
409  * Keep a pointer to the currently-running serializable transaction (if any)
410  * for quick reference. Also, remember if we have written anything that could
411  * cause a rw-conflict.
412  */
413 static SERIALIZABLEXACT *MySerializableXact = InvalidSerializableXact;
414 static bool MyXactDidWrite = false;
415
416 /* local functions */
417
418 static SERIALIZABLEXACT *CreatePredXact(void);
419 static void ReleasePredXact(SERIALIZABLEXACT *sxact);
420 static SERIALIZABLEXACT *FirstPredXact(void);
421 static SERIALIZABLEXACT *NextPredXact(SERIALIZABLEXACT *sxact);
422
423 static bool RWConflictExists(const SERIALIZABLEXACT *reader, const SERIALIZABLEXACT *writer);
424 static void SetRWConflict(SERIALIZABLEXACT *reader, SERIALIZABLEXACT *writer);
425 static void SetPossibleUnsafeConflict(SERIALIZABLEXACT *roXact, SERIALIZABLEXACT *activeXact);
426 static void ReleaseRWConflict(RWConflict conflict);
427 static void FlagSxactUnsafe(SERIALIZABLEXACT *sxact);
428
429 static bool OldSerXidPagePrecedesLogically(int p, int q);
430 static void OldSerXidInit(void);
431 static void OldSerXidAdd(TransactionId xid, SerCommitSeqNo minConflictCommitSeqNo);
432 static SerCommitSeqNo OldSerXidGetMinConflictCommitSeqNo(TransactionId xid);
433 static void OldSerXidSetActiveSerXmin(TransactionId xid);
434
435 static uint32 predicatelock_hash(const void *key, Size keysize);
436 static void SummarizeOldestCommittedSxact(void);
437 static Snapshot GetSafeSnapshot(Snapshot snapshot);
438 static Snapshot GetSerializableTransactionSnapshotInt(Snapshot snapshot,
439                                                                           VirtualTransactionId *sourcevxid,
440                                                                           int sourcepid);
441 static bool PredicateLockExists(const PREDICATELOCKTARGETTAG *targettag);
442 static bool GetParentPredicateLockTag(const PREDICATELOCKTARGETTAG *tag,
443                                                   PREDICATELOCKTARGETTAG *parent);
444 static bool CoarserLockCovers(const PREDICATELOCKTARGETTAG *newtargettag);
445 static void RemoveScratchTarget(bool lockheld);
446 static void RestoreScratchTarget(bool lockheld);
447 static void RemoveTargetIfNoLongerUsed(PREDICATELOCKTARGET *target,
448                                                    uint32 targettaghash);
449 static void DeleteChildTargetLocks(const PREDICATELOCKTARGETTAG *newtargettag);
450 static int      MaxPredicateChildLocks(const PREDICATELOCKTARGETTAG *tag);
451 static bool CheckAndPromotePredicateLockRequest(const PREDICATELOCKTARGETTAG *reqtag);
452 static void DecrementParentLocks(const PREDICATELOCKTARGETTAG *targettag);
453 static void CreatePredicateLock(const PREDICATELOCKTARGETTAG *targettag,
454                                         uint32 targettaghash,
455                                         SERIALIZABLEXACT *sxact);
456 static void DeleteLockTarget(PREDICATELOCKTARGET *target, uint32 targettaghash);
457 static bool TransferPredicateLocksToNewTarget(PREDICATELOCKTARGETTAG oldtargettag,
458                                                                   PREDICATELOCKTARGETTAG newtargettag,
459                                                                   bool removeOld);
460 static void PredicateLockAcquire(const PREDICATELOCKTARGETTAG *targettag);
461 static void DropAllPredicateLocksFromTable(Relation relation,
462                                                            bool transfer);
463 static void SetNewSxactGlobalXmin(void);
464 static void ClearOldPredicateLocks(void);
465 static void ReleaseOneSerializableXact(SERIALIZABLEXACT *sxact, bool partial,
466                                                    bool summarize);
467 static bool XidIsConcurrent(TransactionId xid);
468 static void CheckTargetForConflictsIn(PREDICATELOCKTARGETTAG *targettag);
469 static void FlagRWConflict(SERIALIZABLEXACT *reader, SERIALIZABLEXACT *writer);
470 static void OnConflict_CheckForSerializationFailure(const SERIALIZABLEXACT *reader,
471                                                                                 SERIALIZABLEXACT *writer);
472
473
474 /*------------------------------------------------------------------------*/
475
476 /*
477  * Does this relation participate in predicate locking? Temporary and system
478  * relations are exempt, as are materialized views.
479  */
480 static inline bool
481 PredicateLockingNeededForRelation(Relation relation)
482 {
483         return !(relation->rd_id < FirstBootstrapObjectId ||
484                          RelationUsesLocalBuffers(relation) ||
485                          relation->rd_rel->relkind == RELKIND_MATVIEW);
486 }
487
488 /*
489  * When a public interface method is called for a read, this is the test to
490  * see if we should do a quick return.
491  *
492  * Note: this function has side-effects! If this transaction has been flagged
493  * as RO-safe since the last call, we release all predicate locks and reset
494  * MySerializableXact. That makes subsequent calls to return quickly.
495  *
496  * This is marked as 'inline' to make to eliminate the function call overhead
497  * in the common case that serialization is not needed.
498  */
499 static inline bool
500 SerializationNeededForRead(Relation relation, Snapshot snapshot)
501 {
502         /* Nothing to do if this is not a serializable transaction */
503         if (MySerializableXact == InvalidSerializableXact)
504                 return false;
505
506         /*
507          * Don't acquire locks or conflict when scanning with a special snapshot.
508          * This excludes things like CLUSTER and REINDEX. They use the wholesale
509          * functions TransferPredicateLocksToHeapRelation() and
510          * CheckTableForSerializableConflictIn() to participate in serialization,
511          * but the scans involved don't need serialization.
512          */
513         if (!IsMVCCSnapshot(snapshot))
514                 return false;
515
516         /*
517          * Check if we have just become "RO-safe". If we have, immediately release
518          * all locks as they're not needed anymore. This also resets
519          * MySerializableXact, so that subsequent calls to this function can exit
520          * quickly.
521          *
522          * A transaction is flagged as RO_SAFE if all concurrent R/W transactions
523          * commit without having conflicts out to an earlier snapshot, thus
524          * ensuring that no conflicts are possible for this transaction.
525          */
526         if (SxactIsROSafe(MySerializableXact))
527         {
528                 ReleasePredicateLocks(false);
529                 return false;
530         }
531
532         /* Check if the relation doesn't participate in predicate locking */
533         if (!PredicateLockingNeededForRelation(relation))
534                 return false;
535
536         return true;                            /* no excuse to skip predicate locking */
537 }
538
539 /*
540  * Like SerializationNeededForRead(), but called on writes.
541  * The logic is the same, but there is no snapshot and we can't be RO-safe.
542  */
543 static inline bool
544 SerializationNeededForWrite(Relation relation)
545 {
546         /* Nothing to do if this is not a serializable transaction */
547         if (MySerializableXact == InvalidSerializableXact)
548                 return false;
549
550         /* Check if the relation doesn't participate in predicate locking */
551         if (!PredicateLockingNeededForRelation(relation))
552                 return false;
553
554         return true;                            /* no excuse to skip predicate locking */
555 }
556
557
558 /*------------------------------------------------------------------------*/
559
560 /*
561  * These functions are a simple implementation of a list for this specific
562  * type of struct.  If there is ever a generalized shared memory list, we
563  * should probably switch to that.
564  */
565 static SERIALIZABLEXACT *
566 CreatePredXact(void)
567 {
568         PredXactListElement ptle;
569
570         ptle = (PredXactListElement)
571                 SHMQueueNext(&PredXact->availableList,
572                                          &PredXact->availableList,
573                                          offsetof(PredXactListElementData, link));
574         if (!ptle)
575                 return NULL;
576
577         SHMQueueDelete(&ptle->link);
578         SHMQueueInsertBefore(&PredXact->activeList, &ptle->link);
579         return &ptle->sxact;
580 }
581
582 static void
583 ReleasePredXact(SERIALIZABLEXACT *sxact)
584 {
585         PredXactListElement ptle;
586
587         Assert(ShmemAddrIsValid(sxact));
588
589         ptle = (PredXactListElement)
590                 (((char *) sxact)
591                  - offsetof(PredXactListElementData, sxact)
592                  + offsetof(PredXactListElementData, link));
593         SHMQueueDelete(&ptle->link);
594         SHMQueueInsertBefore(&PredXact->availableList, &ptle->link);
595 }
596
597 static SERIALIZABLEXACT *
598 FirstPredXact(void)
599 {
600         PredXactListElement ptle;
601
602         ptle = (PredXactListElement)
603                 SHMQueueNext(&PredXact->activeList,
604                                          &PredXact->activeList,
605                                          offsetof(PredXactListElementData, link));
606         if (!ptle)
607                 return NULL;
608
609         return &ptle->sxact;
610 }
611
612 static SERIALIZABLEXACT *
613 NextPredXact(SERIALIZABLEXACT *sxact)
614 {
615         PredXactListElement ptle;
616
617         Assert(ShmemAddrIsValid(sxact));
618
619         ptle = (PredXactListElement)
620                 (((char *) sxact)
621                  - offsetof(PredXactListElementData, sxact)
622                  + offsetof(PredXactListElementData, link));
623         ptle = (PredXactListElement)
624                 SHMQueueNext(&PredXact->activeList,
625                                          &ptle->link,
626                                          offsetof(PredXactListElementData, link));
627         if (!ptle)
628                 return NULL;
629
630         return &ptle->sxact;
631 }
632
633 /*------------------------------------------------------------------------*/
634
635 /*
636  * These functions manage primitive access to the RWConflict pool and lists.
637  */
638 static bool
639 RWConflictExists(const SERIALIZABLEXACT *reader, const SERIALIZABLEXACT *writer)
640 {
641         RWConflict      conflict;
642
643         Assert(reader != writer);
644
645         /* Check the ends of the purported conflict first. */
646         if (SxactIsDoomed(reader)
647                 || SxactIsDoomed(writer)
648                 || SHMQueueEmpty(&reader->outConflicts)
649                 || SHMQueueEmpty(&writer->inConflicts))
650                 return false;
651
652         /* A conflict is possible; walk the list to find out. */
653         conflict = (RWConflict)
654                 SHMQueueNext(&reader->outConflicts,
655                                          &reader->outConflicts,
656                                          offsetof(RWConflictData, outLink));
657         while (conflict)
658         {
659                 if (conflict->sxactIn == writer)
660                         return true;
661                 conflict = (RWConflict)
662                         SHMQueueNext(&reader->outConflicts,
663                                                  &conflict->outLink,
664                                                  offsetof(RWConflictData, outLink));
665         }
666
667         /* No conflict found. */
668         return false;
669 }
670
671 static void
672 SetRWConflict(SERIALIZABLEXACT *reader, SERIALIZABLEXACT *writer)
673 {
674         RWConflict      conflict;
675
676         Assert(reader != writer);
677         Assert(!RWConflictExists(reader, writer));
678
679         conflict = (RWConflict)
680                 SHMQueueNext(&RWConflictPool->availableList,
681                                          &RWConflictPool->availableList,
682                                          offsetof(RWConflictData, outLink));
683         if (!conflict)
684                 ereport(ERROR,
685                                 (errcode(ERRCODE_OUT_OF_MEMORY),
686                                  errmsg("not enough elements in RWConflictPool to record a read/write conflict"),
687                                  errhint("You might need to run fewer transactions at a time or increase max_connections.")));
688
689         SHMQueueDelete(&conflict->outLink);
690
691         conflict->sxactOut = reader;
692         conflict->sxactIn = writer;
693         SHMQueueInsertBefore(&reader->outConflicts, &conflict->outLink);
694         SHMQueueInsertBefore(&writer->inConflicts, &conflict->inLink);
695 }
696
697 static void
698 SetPossibleUnsafeConflict(SERIALIZABLEXACT *roXact,
699                                                   SERIALIZABLEXACT *activeXact)
700 {
701         RWConflict      conflict;
702
703         Assert(roXact != activeXact);
704         Assert(SxactIsReadOnly(roXact));
705         Assert(!SxactIsReadOnly(activeXact));
706
707         conflict = (RWConflict)
708                 SHMQueueNext(&RWConflictPool->availableList,
709                                          &RWConflictPool->availableList,
710                                          offsetof(RWConflictData, outLink));
711         if (!conflict)
712                 ereport(ERROR,
713                                 (errcode(ERRCODE_OUT_OF_MEMORY),
714                                  errmsg("not enough elements in RWConflictPool to record a potential read/write conflict"),
715                                  errhint("You might need to run fewer transactions at a time or increase max_connections.")));
716
717         SHMQueueDelete(&conflict->outLink);
718
719         conflict->sxactOut = activeXact;
720         conflict->sxactIn = roXact;
721         SHMQueueInsertBefore(&activeXact->possibleUnsafeConflicts,
722                                                  &conflict->outLink);
723         SHMQueueInsertBefore(&roXact->possibleUnsafeConflicts,
724                                                  &conflict->inLink);
725 }
726
727 static void
728 ReleaseRWConflict(RWConflict conflict)
729 {
730         SHMQueueDelete(&conflict->inLink);
731         SHMQueueDelete(&conflict->outLink);
732         SHMQueueInsertBefore(&RWConflictPool->availableList, &conflict->outLink);
733 }
734
735 static void
736 FlagSxactUnsafe(SERIALIZABLEXACT *sxact)
737 {
738         RWConflict      conflict,
739                                 nextConflict;
740
741         Assert(SxactIsReadOnly(sxact));
742         Assert(!SxactIsROSafe(sxact));
743
744         sxact->flags |= SXACT_FLAG_RO_UNSAFE;
745
746         /*
747          * We know this isn't a safe snapshot, so we can stop looking for other
748          * potential conflicts.
749          */
750         conflict = (RWConflict)
751                 SHMQueueNext(&sxact->possibleUnsafeConflicts,
752                                          &sxact->possibleUnsafeConflicts,
753                                          offsetof(RWConflictData, inLink));
754         while (conflict)
755         {
756                 nextConflict = (RWConflict)
757                         SHMQueueNext(&sxact->possibleUnsafeConflicts,
758                                                  &conflict->inLink,
759                                                  offsetof(RWConflictData, inLink));
760
761                 Assert(!SxactIsReadOnly(conflict->sxactOut));
762                 Assert(sxact == conflict->sxactIn);
763
764                 ReleaseRWConflict(conflict);
765
766                 conflict = nextConflict;
767         }
768 }
769
770 /*------------------------------------------------------------------------*/
771
772 /*
773  * We will work on the page range of 0..OLDSERXID_MAX_PAGE.
774  * Compares using wraparound logic, as is required by slru.c.
775  */
776 static bool
777 OldSerXidPagePrecedesLogically(int p, int q)
778 {
779         int                     diff;
780
781         /*
782          * We have to compare modulo (OLDSERXID_MAX_PAGE+1)/2.  Both inputs should
783          * be in the range 0..OLDSERXID_MAX_PAGE.
784          */
785         Assert(p >= 0 && p <= OLDSERXID_MAX_PAGE);
786         Assert(q >= 0 && q <= OLDSERXID_MAX_PAGE);
787
788         diff = p - q;
789         if (diff >= ((OLDSERXID_MAX_PAGE + 1) / 2))
790                 diff -= OLDSERXID_MAX_PAGE + 1;
791         else if (diff < -((int) (OLDSERXID_MAX_PAGE + 1) / 2))
792                 diff += OLDSERXID_MAX_PAGE + 1;
793         return diff < 0;
794 }
795
796 /*
797  * Initialize for the tracking of old serializable committed xids.
798  */
799 static void
800 OldSerXidInit(void)
801 {
802         bool            found;
803
804         /*
805          * Set up SLRU management of the pg_serial data.
806          */
807         OldSerXidSlruCtl->PagePrecedes = OldSerXidPagePrecedesLogically;
808         SimpleLruInit(OldSerXidSlruCtl, "oldserxid",
809                                   NUM_OLDSERXID_BUFFERS, 0, OldSerXidLock, "pg_serial",
810                                   LWTRANCHE_OLDSERXID_BUFFERS);
811         /* Override default assumption that writes should be fsync'd */
812         OldSerXidSlruCtl->do_fsync = false;
813
814         /*
815          * Create or attach to the OldSerXidControl structure.
816          */
817         oldSerXidControl = (OldSerXidControl)
818                 ShmemInitStruct("OldSerXidControlData", sizeof(OldSerXidControlData), &found);
819
820         Assert(found == IsUnderPostmaster);
821         if (!found)
822         {
823                 /*
824                  * Set control information to reflect empty SLRU.
825                  */
826                 oldSerXidControl->headPage = -1;
827                 oldSerXidControl->headXid = InvalidTransactionId;
828                 oldSerXidControl->tailXid = InvalidTransactionId;
829                 oldSerXidControl->warningIssued = false;
830         }
831 }
832
833 /*
834  * Record a committed read write serializable xid and the minimum
835  * commitSeqNo of any transactions to which this xid had a rw-conflict out.
836  * An invalid seqNo means that there were no conflicts out from xid.
837  */
838 static void
839 OldSerXidAdd(TransactionId xid, SerCommitSeqNo minConflictCommitSeqNo)
840 {
841         TransactionId tailXid;
842         int                     targetPage;
843         int                     slotno;
844         int                     firstZeroPage;
845         bool            isNewPage;
846
847         Assert(TransactionIdIsValid(xid));
848
849         targetPage = OldSerXidPage(xid);
850
851         LWLockAcquire(OldSerXidLock, LW_EXCLUSIVE);
852
853         /*
854          * If no serializable transactions are active, there shouldn't be anything
855          * to push out to the SLRU.  Hitting this assert would mean there's
856          * something wrong with the earlier cleanup logic.
857          */
858         tailXid = oldSerXidControl->tailXid;
859         Assert(TransactionIdIsValid(tailXid));
860
861         /*
862          * If the SLRU is currently unused, zero out the whole active region from
863          * tailXid to headXid before taking it into use. Otherwise zero out only
864          * any new pages that enter the tailXid-headXid range as we advance
865          * headXid.
866          */
867         if (oldSerXidControl->headPage < 0)
868         {
869                 firstZeroPage = OldSerXidPage(tailXid);
870                 isNewPage = true;
871         }
872         else
873         {
874                 firstZeroPage = OldSerXidNextPage(oldSerXidControl->headPage);
875                 isNewPage = OldSerXidPagePrecedesLogically(oldSerXidControl->headPage,
876                                                                                                    targetPage);
877         }
878
879         if (!TransactionIdIsValid(oldSerXidControl->headXid)
880                 || TransactionIdFollows(xid, oldSerXidControl->headXid))
881                 oldSerXidControl->headXid = xid;
882         if (isNewPage)
883                 oldSerXidControl->headPage = targetPage;
884
885         /*
886          * Give a warning if we're about to run out of SLRU pages.
887          *
888          * slru.c has a maximum of 64k segments, with 32 (SLRU_PAGES_PER_SEGMENT)
889          * pages each. We need to store a 64-bit integer for each Xid, and with
890          * default 8k block size, 65536*32 pages is only enough to cover 2^30
891          * XIDs. If we're about to hit that limit and wrap around, warn the user.
892          *
893          * To avoid spamming the user, we only give one warning when we've used 1
894          * billion XIDs, and stay silent until the situation is fixed and the
895          * number of XIDs used falls below 800 million again.
896          *
897          * XXX: We have no safeguard to actually *prevent* the wrap-around,
898          * though. All you get is a warning.
899          */
900         if (oldSerXidControl->warningIssued)
901         {
902                 TransactionId lowWatermark;
903
904                 lowWatermark = tailXid + 800000000;
905                 if (lowWatermark < FirstNormalTransactionId)
906                         lowWatermark = FirstNormalTransactionId;
907                 if (TransactionIdPrecedes(xid, lowWatermark))
908                         oldSerXidControl->warningIssued = false;
909         }
910         else
911         {
912                 TransactionId highWatermark;
913
914                 highWatermark = tailXid + 1000000000;
915                 if (highWatermark < FirstNormalTransactionId)
916                         highWatermark = FirstNormalTransactionId;
917                 if (TransactionIdFollows(xid, highWatermark))
918                 {
919                         oldSerXidControl->warningIssued = true;
920                         ereport(WARNING,
921                                         (errmsg("memory for serializable conflict tracking is nearly exhausted"),
922                                          errhint("There might be an idle transaction or a forgotten prepared transaction causing this.")));
923                 }
924         }
925
926         if (isNewPage)
927         {
928                 /* Initialize intervening pages. */
929                 while (firstZeroPage != targetPage)
930                 {
931                         (void) SimpleLruZeroPage(OldSerXidSlruCtl, firstZeroPage);
932                         firstZeroPage = OldSerXidNextPage(firstZeroPage);
933                 }
934                 slotno = SimpleLruZeroPage(OldSerXidSlruCtl, targetPage);
935         }
936         else
937                 slotno = SimpleLruReadPage(OldSerXidSlruCtl, targetPage, true, xid);
938
939         OldSerXidValue(slotno, xid) = minConflictCommitSeqNo;
940         OldSerXidSlruCtl->shared->page_dirty[slotno] = true;
941
942         LWLockRelease(OldSerXidLock);
943 }
944
945 /*
946  * Get the minimum commitSeqNo for any conflict out for the given xid.  For
947  * a transaction which exists but has no conflict out, InvalidSerCommitSeqNo
948  * will be returned.
949  */
950 static SerCommitSeqNo
951 OldSerXidGetMinConflictCommitSeqNo(TransactionId xid)
952 {
953         TransactionId headXid;
954         TransactionId tailXid;
955         SerCommitSeqNo val;
956         int                     slotno;
957
958         Assert(TransactionIdIsValid(xid));
959
960         LWLockAcquire(OldSerXidLock, LW_SHARED);
961         headXid = oldSerXidControl->headXid;
962         tailXid = oldSerXidControl->tailXid;
963         LWLockRelease(OldSerXidLock);
964
965         if (!TransactionIdIsValid(headXid))
966                 return 0;
967
968         Assert(TransactionIdIsValid(tailXid));
969
970         if (TransactionIdPrecedes(xid, tailXid)
971                 || TransactionIdFollows(xid, headXid))
972                 return 0;
973
974         /*
975          * The following function must be called without holding OldSerXidLock,
976          * but will return with that lock held, which must then be released.
977          */
978         slotno = SimpleLruReadPage_ReadOnly(OldSerXidSlruCtl,
979                                                                                 OldSerXidPage(xid), xid);
980         val = OldSerXidValue(slotno, xid);
981         LWLockRelease(OldSerXidLock);
982         return val;
983 }
984
985 /*
986  * Call this whenever there is a new xmin for active serializable
987  * transactions.  We don't need to keep information on transactions which
988  * precede that.  InvalidTransactionId means none active, so everything in
989  * the SLRU can be discarded.
990  */
991 static void
992 OldSerXidSetActiveSerXmin(TransactionId xid)
993 {
994         LWLockAcquire(OldSerXidLock, LW_EXCLUSIVE);
995
996         /*
997          * When no sxacts are active, nothing overlaps, set the xid values to
998          * invalid to show that there are no valid entries.  Don't clear headPage,
999          * though.  A new xmin might still land on that page, and we don't want to
1000          * repeatedly zero out the same page.
1001          */
1002         if (!TransactionIdIsValid(xid))
1003         {
1004                 oldSerXidControl->tailXid = InvalidTransactionId;
1005                 oldSerXidControl->headXid = InvalidTransactionId;
1006                 LWLockRelease(OldSerXidLock);
1007                 return;
1008         }
1009
1010         /*
1011          * When we're recovering prepared transactions, the global xmin might move
1012          * backwards depending on the order they're recovered. Normally that's not
1013          * OK, but during recovery no serializable transactions will commit, so
1014          * the SLRU is empty and we can get away with it.
1015          */
1016         if (RecoveryInProgress())
1017         {
1018                 Assert(oldSerXidControl->headPage < 0);
1019                 if (!TransactionIdIsValid(oldSerXidControl->tailXid)
1020                         || TransactionIdPrecedes(xid, oldSerXidControl->tailXid))
1021                 {
1022                         oldSerXidControl->tailXid = xid;
1023                 }
1024                 LWLockRelease(OldSerXidLock);
1025                 return;
1026         }
1027
1028         Assert(!TransactionIdIsValid(oldSerXidControl->tailXid)
1029                    || TransactionIdFollows(xid, oldSerXidControl->tailXid));
1030
1031         oldSerXidControl->tailXid = xid;
1032
1033         LWLockRelease(OldSerXidLock);
1034 }
1035
1036 /*
1037  * Perform a checkpoint --- either during shutdown, or on-the-fly
1038  *
1039  * We don't have any data that needs to survive a restart, but this is a
1040  * convenient place to truncate the SLRU.
1041  */
1042 void
1043 CheckPointPredicate(void)
1044 {
1045         int                     tailPage;
1046
1047         LWLockAcquire(OldSerXidLock, LW_EXCLUSIVE);
1048
1049         /* Exit quickly if the SLRU is currently not in use. */
1050         if (oldSerXidControl->headPage < 0)
1051         {
1052                 LWLockRelease(OldSerXidLock);
1053                 return;
1054         }
1055
1056         if (TransactionIdIsValid(oldSerXidControl->tailXid))
1057         {
1058                 /* We can truncate the SLRU up to the page containing tailXid */
1059                 tailPage = OldSerXidPage(oldSerXidControl->tailXid);
1060         }
1061         else
1062         {
1063                 /*
1064                  * The SLRU is no longer needed. Truncate to head before we set head
1065                  * invalid.
1066                  *
1067                  * XXX: It's possible that the SLRU is not needed again until XID
1068                  * wrap-around has happened, so that the segment containing headPage
1069                  * that we leave behind will appear to be new again. In that case it
1070                  * won't be removed until XID horizon advances enough to make it
1071                  * current again.
1072                  */
1073                 tailPage = oldSerXidControl->headPage;
1074                 oldSerXidControl->headPage = -1;
1075         }
1076
1077         LWLockRelease(OldSerXidLock);
1078
1079         /* Truncate away pages that are no longer required */
1080         SimpleLruTruncate(OldSerXidSlruCtl, tailPage);
1081
1082         /*
1083          * Flush dirty SLRU pages to disk
1084          *
1085          * This is not actually necessary from a correctness point of view. We do
1086          * it merely as a debugging aid.
1087          *
1088          * We're doing this after the truncation to avoid writing pages right
1089          * before deleting the file in which they sit, which would be completely
1090          * pointless.
1091          */
1092         SimpleLruFlush(OldSerXidSlruCtl, true);
1093 }
1094
1095 /*------------------------------------------------------------------------*/
1096
1097 /*
1098  * InitPredicateLocks -- Initialize the predicate locking data structures.
1099  *
1100  * This is called from CreateSharedMemoryAndSemaphores(), which see for
1101  * more comments.  In the normal postmaster case, the shared hash tables
1102  * are created here.  Backends inherit the pointers
1103  * to the shared tables via fork().  In the EXEC_BACKEND case, each
1104  * backend re-executes this code to obtain pointers to the already existing
1105  * shared hash tables.
1106  */
1107 void
1108 InitPredicateLocks(void)
1109 {
1110         HASHCTL         info;
1111         long            max_table_size;
1112         Size            requestSize;
1113         bool            found;
1114
1115 #ifndef EXEC_BACKEND
1116         Assert(!IsUnderPostmaster);
1117 #endif
1118
1119         /*
1120          * Compute size of predicate lock target hashtable. Note these
1121          * calculations must agree with PredicateLockShmemSize!
1122          */
1123         max_table_size = NPREDICATELOCKTARGETENTS();
1124
1125         /*
1126          * Allocate hash table for PREDICATELOCKTARGET structs.  This stores
1127          * per-predicate-lock-target information.
1128          */
1129         MemSet(&info, 0, sizeof(info));
1130         info.keysize = sizeof(PREDICATELOCKTARGETTAG);
1131         info.entrysize = sizeof(PREDICATELOCKTARGET);
1132         info.num_partitions = NUM_PREDICATELOCK_PARTITIONS;
1133
1134         PredicateLockTargetHash = ShmemInitHash("PREDICATELOCKTARGET hash",
1135                                                                                         max_table_size,
1136                                                                                         max_table_size,
1137                                                                                         &info,
1138                                                                                         HASH_ELEM | HASH_BLOBS |
1139                                                                                         HASH_PARTITION | HASH_FIXED_SIZE);
1140
1141         /*
1142          * Reserve a dummy entry in the hash table; we use it to make sure there's
1143          * always one entry available when we need to split or combine a page,
1144          * because running out of space there could mean aborting a
1145          * non-serializable transaction.
1146          */
1147         if (!IsUnderPostmaster)
1148         {
1149                 (void) hash_search(PredicateLockTargetHash, &ScratchTargetTag,
1150                                                    HASH_ENTER, &found);
1151                 Assert(!found);
1152         }
1153
1154         /* Pre-calculate the hash and partition lock of the scratch entry */
1155         ScratchTargetTagHash = PredicateLockTargetTagHashCode(&ScratchTargetTag);
1156         ScratchPartitionLock = PredicateLockHashPartitionLock(ScratchTargetTagHash);
1157
1158         /*
1159          * Allocate hash table for PREDICATELOCK structs.  This stores per
1160          * xact-lock-of-a-target information.
1161          */
1162         MemSet(&info, 0, sizeof(info));
1163         info.keysize = sizeof(PREDICATELOCKTAG);
1164         info.entrysize = sizeof(PREDICATELOCK);
1165         info.hash = predicatelock_hash;
1166         info.num_partitions = NUM_PREDICATELOCK_PARTITIONS;
1167
1168         /* Assume an average of 2 xacts per target */
1169         max_table_size *= 2;
1170
1171         PredicateLockHash = ShmemInitHash("PREDICATELOCK hash",
1172                                                                           max_table_size,
1173                                                                           max_table_size,
1174                                                                           &info,
1175                                                                           HASH_ELEM | HASH_FUNCTION |
1176                                                                           HASH_PARTITION | HASH_FIXED_SIZE);
1177
1178         /*
1179          * Compute size for serializable transaction hashtable. Note these
1180          * calculations must agree with PredicateLockShmemSize!
1181          */
1182         max_table_size = (MaxBackends + max_prepared_xacts);
1183
1184         /*
1185          * Allocate a list to hold information on transactions participating in
1186          * predicate locking.
1187          *
1188          * Assume an average of 10 predicate locking transactions per backend.
1189          * This allows aggressive cleanup while detail is present before data must
1190          * be summarized for storage in SLRU and the "dummy" transaction.
1191          */
1192         max_table_size *= 10;
1193
1194         PredXact = ShmemInitStruct("PredXactList",
1195                                                            PredXactListDataSize,
1196                                                            &found);
1197         Assert(found == IsUnderPostmaster);
1198         if (!found)
1199         {
1200                 int                     i;
1201
1202                 SHMQueueInit(&PredXact->availableList);
1203                 SHMQueueInit(&PredXact->activeList);
1204                 PredXact->SxactGlobalXmin = InvalidTransactionId;
1205                 PredXact->SxactGlobalXminCount = 0;
1206                 PredXact->WritableSxactCount = 0;
1207                 PredXact->LastSxactCommitSeqNo = FirstNormalSerCommitSeqNo - 1;
1208                 PredXact->CanPartialClearThrough = 0;
1209                 PredXact->HavePartialClearedThrough = 0;
1210                 requestSize = mul_size((Size) max_table_size,
1211                                                            PredXactListElementDataSize);
1212                 PredXact->element = ShmemAlloc(requestSize);
1213                 /* Add all elements to available list, clean. */
1214                 memset(PredXact->element, 0, requestSize);
1215                 for (i = 0; i < max_table_size; i++)
1216                 {
1217                         SHMQueueInsertBefore(&(PredXact->availableList),
1218                                                                  &(PredXact->element[i].link));
1219                 }
1220                 PredXact->OldCommittedSxact = CreatePredXact();
1221                 SetInvalidVirtualTransactionId(PredXact->OldCommittedSxact->vxid);
1222                 PredXact->OldCommittedSxact->prepareSeqNo = 0;
1223                 PredXact->OldCommittedSxact->commitSeqNo = 0;
1224                 PredXact->OldCommittedSxact->SeqNo.lastCommitBeforeSnapshot = 0;
1225                 SHMQueueInit(&PredXact->OldCommittedSxact->outConflicts);
1226                 SHMQueueInit(&PredXact->OldCommittedSxact->inConflicts);
1227                 SHMQueueInit(&PredXact->OldCommittedSxact->predicateLocks);
1228                 SHMQueueInit(&PredXact->OldCommittedSxact->finishedLink);
1229                 SHMQueueInit(&PredXact->OldCommittedSxact->possibleUnsafeConflicts);
1230                 PredXact->OldCommittedSxact->topXid = InvalidTransactionId;
1231                 PredXact->OldCommittedSxact->finishedBefore = InvalidTransactionId;
1232                 PredXact->OldCommittedSxact->xmin = InvalidTransactionId;
1233                 PredXact->OldCommittedSxact->flags = SXACT_FLAG_COMMITTED;
1234                 PredXact->OldCommittedSxact->pid = 0;
1235         }
1236         /* This never changes, so let's keep a local copy. */
1237         OldCommittedSxact = PredXact->OldCommittedSxact;
1238
1239         /*
1240          * Allocate hash table for SERIALIZABLEXID structs.  This stores per-xid
1241          * information for serializable transactions which have accessed data.
1242          */
1243         MemSet(&info, 0, sizeof(info));
1244         info.keysize = sizeof(SERIALIZABLEXIDTAG);
1245         info.entrysize = sizeof(SERIALIZABLEXID);
1246
1247         SerializableXidHash = ShmemInitHash("SERIALIZABLEXID hash",
1248                                                                                 max_table_size,
1249                                                                                 max_table_size,
1250                                                                                 &info,
1251                                                                                 HASH_ELEM | HASH_BLOBS |
1252                                                                                 HASH_FIXED_SIZE);
1253
1254         /*
1255          * Allocate space for tracking rw-conflicts in lists attached to the
1256          * transactions.
1257          *
1258          * Assume an average of 5 conflicts per transaction.  Calculations suggest
1259          * that this will prevent resource exhaustion in even the most pessimal
1260          * loads up to max_connections = 200 with all 200 connections pounding the
1261          * database with serializable transactions.  Beyond that, there may be
1262          * occasional transactions canceled when trying to flag conflicts. That's
1263          * probably OK.
1264          */
1265         max_table_size *= 5;
1266
1267         RWConflictPool = ShmemInitStruct("RWConflictPool",
1268                                                                          RWConflictPoolHeaderDataSize,
1269                                                                          &found);
1270         Assert(found == IsUnderPostmaster);
1271         if (!found)
1272         {
1273                 int                     i;
1274
1275                 SHMQueueInit(&RWConflictPool->availableList);
1276                 requestSize = mul_size((Size) max_table_size,
1277                                                            RWConflictDataSize);
1278                 RWConflictPool->element = ShmemAlloc(requestSize);
1279                 /* Add all elements to available list, clean. */
1280                 memset(RWConflictPool->element, 0, requestSize);
1281                 for (i = 0; i < max_table_size; i++)
1282                 {
1283                         SHMQueueInsertBefore(&(RWConflictPool->availableList),
1284                                                                  &(RWConflictPool->element[i].outLink));
1285                 }
1286         }
1287
1288         /*
1289          * Create or attach to the header for the list of finished serializable
1290          * transactions.
1291          */
1292         FinishedSerializableTransactions = (SHM_QUEUE *)
1293                 ShmemInitStruct("FinishedSerializableTransactions",
1294                                                 sizeof(SHM_QUEUE),
1295                                                 &found);
1296         Assert(found == IsUnderPostmaster);
1297         if (!found)
1298                 SHMQueueInit(FinishedSerializableTransactions);
1299
1300         /*
1301          * Initialize the SLRU storage for old committed serializable
1302          * transactions.
1303          */
1304         OldSerXidInit();
1305 }
1306
1307 /*
1308  * Estimate shared-memory space used for predicate lock table
1309  */
1310 Size
1311 PredicateLockShmemSize(void)
1312 {
1313         Size            size = 0;
1314         long            max_table_size;
1315
1316         /* predicate lock target hash table */
1317         max_table_size = NPREDICATELOCKTARGETENTS();
1318         size = add_size(size, hash_estimate_size(max_table_size,
1319                                                                                          sizeof(PREDICATELOCKTARGET)));
1320
1321         /* predicate lock hash table */
1322         max_table_size *= 2;
1323         size = add_size(size, hash_estimate_size(max_table_size,
1324                                                                                          sizeof(PREDICATELOCK)));
1325
1326         /*
1327          * Since NPREDICATELOCKTARGETENTS is only an estimate, add 10% safety
1328          * margin.
1329          */
1330         size = add_size(size, size / 10);
1331
1332         /* transaction list */
1333         max_table_size = MaxBackends + max_prepared_xacts;
1334         max_table_size *= 10;
1335         size = add_size(size, PredXactListDataSize);
1336         size = add_size(size, mul_size((Size) max_table_size,
1337                                                                    PredXactListElementDataSize));
1338
1339         /* transaction xid table */
1340         size = add_size(size, hash_estimate_size(max_table_size,
1341                                                                                          sizeof(SERIALIZABLEXID)));
1342
1343         /* rw-conflict pool */
1344         max_table_size *= 5;
1345         size = add_size(size, RWConflictPoolHeaderDataSize);
1346         size = add_size(size, mul_size((Size) max_table_size,
1347                                                                    RWConflictDataSize));
1348
1349         /* Head for list of finished serializable transactions. */
1350         size = add_size(size, sizeof(SHM_QUEUE));
1351
1352         /* Shared memory structures for SLRU tracking of old committed xids. */
1353         size = add_size(size, sizeof(OldSerXidControlData));
1354         size = add_size(size, SimpleLruShmemSize(NUM_OLDSERXID_BUFFERS, 0));
1355
1356         return size;
1357 }
1358
1359
1360 /*
1361  * Compute the hash code associated with a PREDICATELOCKTAG.
1362  *
1363  * Because we want to use just one set of partition locks for both the
1364  * PREDICATELOCKTARGET and PREDICATELOCK hash tables, we have to make sure
1365  * that PREDICATELOCKs fall into the same partition number as their
1366  * associated PREDICATELOCKTARGETs.  dynahash.c expects the partition number
1367  * to be the low-order bits of the hash code, and therefore a
1368  * PREDICATELOCKTAG's hash code must have the same low-order bits as the
1369  * associated PREDICATELOCKTARGETTAG's hash code.  We achieve this with this
1370  * specialized hash function.
1371  */
1372 static uint32
1373 predicatelock_hash(const void *key, Size keysize)
1374 {
1375         const PREDICATELOCKTAG *predicatelocktag = (const PREDICATELOCKTAG *) key;
1376         uint32          targethash;
1377
1378         Assert(keysize == sizeof(PREDICATELOCKTAG));
1379
1380         /* Look into the associated target object, and compute its hash code */
1381         targethash = PredicateLockTargetTagHashCode(&predicatelocktag->myTarget->tag);
1382
1383         return PredicateLockHashCodeFromTargetHashCode(predicatelocktag, targethash);
1384 }
1385
1386
1387 /*
1388  * GetPredicateLockStatusData
1389  *              Return a table containing the internal state of the predicate
1390  *              lock manager for use in pg_lock_status.
1391  *
1392  * Like GetLockStatusData, this function tries to hold the partition LWLocks
1393  * for as short a time as possible by returning two arrays that simply
1394  * contain the PREDICATELOCKTARGETTAG and SERIALIZABLEXACT for each lock
1395  * table entry. Multiple copies of the same PREDICATELOCKTARGETTAG and
1396  * SERIALIZABLEXACT will likely appear.
1397  */
1398 PredicateLockData *
1399 GetPredicateLockStatusData(void)
1400 {
1401         PredicateLockData *data;
1402         int                     i;
1403         int                     els,
1404                                 el;
1405         HASH_SEQ_STATUS seqstat;
1406         PREDICATELOCK *predlock;
1407
1408         data = (PredicateLockData *) palloc(sizeof(PredicateLockData));
1409
1410         /*
1411          * To ensure consistency, take simultaneous locks on all partition locks
1412          * in ascending order, then SerializableXactHashLock.
1413          */
1414         for (i = 0; i < NUM_PREDICATELOCK_PARTITIONS; i++)
1415                 LWLockAcquire(PredicateLockHashPartitionLockByIndex(i), LW_SHARED);
1416         LWLockAcquire(SerializableXactHashLock, LW_SHARED);
1417
1418         /* Get number of locks and allocate appropriately-sized arrays. */
1419         els = hash_get_num_entries(PredicateLockHash);
1420         data->nelements = els;
1421         data->locktags = (PREDICATELOCKTARGETTAG *)
1422                 palloc(sizeof(PREDICATELOCKTARGETTAG) * els);
1423         data->xacts = (SERIALIZABLEXACT *)
1424                 palloc(sizeof(SERIALIZABLEXACT) * els);
1425
1426
1427         /* Scan through PredicateLockHash and copy contents */
1428         hash_seq_init(&seqstat, PredicateLockHash);
1429
1430         el = 0;
1431
1432         while ((predlock = (PREDICATELOCK *) hash_seq_search(&seqstat)))
1433         {
1434                 data->locktags[el] = predlock->tag.myTarget->tag;
1435                 data->xacts[el] = *predlock->tag.myXact;
1436                 el++;
1437         }
1438
1439         Assert(el == els);
1440
1441         /* Release locks in reverse order */
1442         LWLockRelease(SerializableXactHashLock);
1443         for (i = NUM_PREDICATELOCK_PARTITIONS - 1; i >= 0; i--)
1444                 LWLockRelease(PredicateLockHashPartitionLockByIndex(i));
1445
1446         return data;
1447 }
1448
1449 /*
1450  * Free up shared memory structures by pushing the oldest sxact (the one at
1451  * the front of the SummarizeOldestCommittedSxact queue) into summary form.
1452  * Each call will free exactly one SERIALIZABLEXACT structure and may also
1453  * free one or more of these structures: SERIALIZABLEXID, PREDICATELOCK,
1454  * PREDICATELOCKTARGET, RWConflictData.
1455  */
1456 static void
1457 SummarizeOldestCommittedSxact(void)
1458 {
1459         SERIALIZABLEXACT *sxact;
1460
1461         LWLockAcquire(SerializableFinishedListLock, LW_EXCLUSIVE);
1462
1463         /*
1464          * This function is only called if there are no sxact slots available.
1465          * Some of them must belong to old, already-finished transactions, so
1466          * there should be something in FinishedSerializableTransactions list that
1467          * we can summarize. However, there's a race condition: while we were not
1468          * holding any locks, a transaction might have ended and cleaned up all
1469          * the finished sxact entries already, freeing up their sxact slots. In
1470          * that case, we have nothing to do here. The caller will find one of the
1471          * slots released by the other backend when it retries.
1472          */
1473         if (SHMQueueEmpty(FinishedSerializableTransactions))
1474         {
1475                 LWLockRelease(SerializableFinishedListLock);
1476                 return;
1477         }
1478
1479         /*
1480          * Grab the first sxact off the finished list -- this will be the earliest
1481          * commit.  Remove it from the list.
1482          */
1483         sxact = (SERIALIZABLEXACT *)
1484                 SHMQueueNext(FinishedSerializableTransactions,
1485                                          FinishedSerializableTransactions,
1486                                          offsetof(SERIALIZABLEXACT, finishedLink));
1487         SHMQueueDelete(&(sxact->finishedLink));
1488
1489         /* Add to SLRU summary information. */
1490         if (TransactionIdIsValid(sxact->topXid) && !SxactIsReadOnly(sxact))
1491                 OldSerXidAdd(sxact->topXid, SxactHasConflictOut(sxact)
1492                                          ? sxact->SeqNo.earliestOutConflictCommit : InvalidSerCommitSeqNo);
1493
1494         /* Summarize and release the detail. */
1495         ReleaseOneSerializableXact(sxact, false, true);
1496
1497         LWLockRelease(SerializableFinishedListLock);
1498 }
1499
1500 /*
1501  * GetSafeSnapshot
1502  *              Obtain and register a snapshot for a READ ONLY DEFERRABLE
1503  *              transaction. Ensures that the snapshot is "safe", i.e. a
1504  *              read-only transaction running on it can execute serializably
1505  *              without further checks. This requires waiting for concurrent
1506  *              transactions to complete, and retrying with a new snapshot if
1507  *              one of them could possibly create a conflict.
1508  *
1509  *              As with GetSerializableTransactionSnapshot (which this is a subroutine
1510  *              for), the passed-in Snapshot pointer should reference a static data
1511  *              area that can safely be passed to GetSnapshotData.
1512  */
1513 static Snapshot
1514 GetSafeSnapshot(Snapshot origSnapshot)
1515 {
1516         Snapshot        snapshot;
1517
1518         Assert(XactReadOnly && XactDeferrable);
1519
1520         while (true)
1521         {
1522                 /*
1523                  * GetSerializableTransactionSnapshotInt is going to call
1524                  * GetSnapshotData, so we need to provide it the static snapshot area
1525                  * our caller passed to us.  The pointer returned is actually the same
1526                  * one passed to it, but we avoid assuming that here.
1527                  */
1528                 snapshot = GetSerializableTransactionSnapshotInt(origSnapshot,
1529                                                                                                                  NULL, InvalidPid);
1530
1531                 if (MySerializableXact == InvalidSerializableXact)
1532                         return snapshot;        /* no concurrent r/w xacts; it's safe */
1533
1534                 LWLockAcquire(SerializableXactHashLock, LW_EXCLUSIVE);
1535
1536                 /*
1537                  * Wait for concurrent transactions to finish. Stop early if one of
1538                  * them marked us as conflicted.
1539                  */
1540                 MySerializableXact->flags |= SXACT_FLAG_DEFERRABLE_WAITING;
1541                 while (!(SHMQueueEmpty(&MySerializableXact->possibleUnsafeConflicts) ||
1542                                  SxactIsROUnsafe(MySerializableXact)))
1543                 {
1544                         LWLockRelease(SerializableXactHashLock);
1545                         ProcWaitForSignal(WAIT_EVENT_SAFE_SNAPSHOT);
1546                         LWLockAcquire(SerializableXactHashLock, LW_EXCLUSIVE);
1547                 }
1548                 MySerializableXact->flags &= ~SXACT_FLAG_DEFERRABLE_WAITING;
1549
1550                 if (!SxactIsROUnsafe(MySerializableXact))
1551                 {
1552                         LWLockRelease(SerializableXactHashLock);
1553                         break;                          /* success */
1554                 }
1555
1556                 LWLockRelease(SerializableXactHashLock);
1557
1558                 /* else, need to retry... */
1559                 ereport(DEBUG2,
1560                                 (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE),
1561                                  errmsg("deferrable snapshot was unsafe; trying a new one")));
1562                 ReleasePredicateLocks(false);
1563         }
1564
1565         /*
1566          * Now we have a safe snapshot, so we don't need to do any further checks.
1567          */
1568         Assert(SxactIsROSafe(MySerializableXact));
1569         ReleasePredicateLocks(false);
1570
1571         return snapshot;
1572 }
1573
1574 /*
1575  * GetSafeSnapshotBlockingPids
1576  *              If the specified process is currently blocked in GetSafeSnapshot,
1577  *              write the process IDs of all processes that it is blocked by
1578  *              into the caller-supplied buffer output[].  The list is truncated at
1579  *              output_size, and the number of PIDs written into the buffer is
1580  *              returned.  Returns zero if the given PID is not currently blocked
1581  *              in GetSafeSnapshot.
1582  */
1583 int
1584 GetSafeSnapshotBlockingPids(int blocked_pid, int *output, int output_size)
1585 {
1586         int                     num_written = 0;
1587         SERIALIZABLEXACT *sxact;
1588
1589         LWLockAcquire(SerializableXactHashLock, LW_SHARED);
1590
1591         /* Find blocked_pid's SERIALIZABLEXACT by linear search. */
1592         for (sxact = FirstPredXact(); sxact != NULL; sxact = NextPredXact(sxact))
1593         {
1594                 if (sxact->pid == blocked_pid)
1595                         break;
1596         }
1597
1598         /* Did we find it, and is it currently waiting in GetSafeSnapshot? */
1599         if (sxact != NULL && SxactIsDeferrableWaiting(sxact))
1600         {
1601                 RWConflict      possibleUnsafeConflict;
1602
1603                 /* Traverse the list of possible unsafe conflicts collecting PIDs. */
1604                 possibleUnsafeConflict = (RWConflict)
1605                         SHMQueueNext(&sxact->possibleUnsafeConflicts,
1606                                                  &sxact->possibleUnsafeConflicts,
1607                                                  offsetof(RWConflictData, inLink));
1608
1609                 while (possibleUnsafeConflict != NULL && num_written < output_size)
1610                 {
1611                         output[num_written++] = possibleUnsafeConflict->sxactOut->pid;
1612                         possibleUnsafeConflict = (RWConflict)
1613                                 SHMQueueNext(&sxact->possibleUnsafeConflicts,
1614                                                          &possibleUnsafeConflict->inLink,
1615                                                          offsetof(RWConflictData, inLink));
1616                 }
1617         }
1618
1619         LWLockRelease(SerializableXactHashLock);
1620
1621         return num_written;
1622 }
1623
1624 /*
1625  * Acquire a snapshot that can be used for the current transaction.
1626  *
1627  * Make sure we have a SERIALIZABLEXACT reference in MySerializableXact.
1628  * It should be current for this process and be contained in PredXact.
1629  *
1630  * The passed-in Snapshot pointer should reference a static data area that
1631  * can safely be passed to GetSnapshotData.  The return value is actually
1632  * always this same pointer; no new snapshot data structure is allocated
1633  * within this function.
1634  */
1635 Snapshot
1636 GetSerializableTransactionSnapshot(Snapshot snapshot)
1637 {
1638         Assert(IsolationIsSerializable());
1639
1640         /*
1641          * Can't use serializable mode while recovery is still active, as it is,
1642          * for example, on a hot standby.  We could get here despite the check in
1643          * check_XactIsoLevel() if default_transaction_isolation is set to
1644          * serializable, so phrase the hint accordingly.
1645          */
1646         if (RecoveryInProgress())
1647                 ereport(ERROR,
1648                                 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
1649                                  errmsg("cannot use serializable mode in a hot standby"),
1650                                  errdetail("\"default_transaction_isolation\" is set to \"serializable\"."),
1651                                  errhint("You can use \"SET default_transaction_isolation = 'repeatable read'\" to change the default.")));
1652
1653         /*
1654          * A special optimization is available for SERIALIZABLE READ ONLY
1655          * DEFERRABLE transactions -- we can wait for a suitable snapshot and
1656          * thereby avoid all SSI overhead once it's running.
1657          */
1658         if (XactReadOnly && XactDeferrable)
1659                 return GetSafeSnapshot(snapshot);
1660
1661         return GetSerializableTransactionSnapshotInt(snapshot,
1662                                                                                                  NULL, InvalidPid);
1663 }
1664
1665 /*
1666  * Import a snapshot to be used for the current transaction.
1667  *
1668  * This is nearly the same as GetSerializableTransactionSnapshot, except that
1669  * we don't take a new snapshot, but rather use the data we're handed.
1670  *
1671  * The caller must have verified that the snapshot came from a serializable
1672  * transaction; and if we're read-write, the source transaction must not be
1673  * read-only.
1674  */
1675 void
1676 SetSerializableTransactionSnapshot(Snapshot snapshot,
1677                                                                    VirtualTransactionId *sourcevxid,
1678                                                                    int sourcepid)
1679 {
1680         Assert(IsolationIsSerializable());
1681
1682         /*
1683          * We do not allow SERIALIZABLE READ ONLY DEFERRABLE transactions to
1684          * import snapshots, since there's no way to wait for a safe snapshot when
1685          * we're using the snap we're told to.  (XXX instead of throwing an error,
1686          * we could just ignore the XactDeferrable flag?)
1687          */
1688         if (XactReadOnly && XactDeferrable)
1689                 ereport(ERROR,
1690                                 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
1691                                  errmsg("a snapshot-importing transaction must not be READ ONLY DEFERRABLE")));
1692
1693         (void) GetSerializableTransactionSnapshotInt(snapshot, sourcevxid,
1694                                                                                                  sourcepid);
1695 }
1696
1697 /*
1698  * Guts of GetSerializableTransactionSnapshot
1699  *
1700  * If sourcexid is valid, this is actually an import operation and we should
1701  * skip calling GetSnapshotData, because the snapshot contents are already
1702  * loaded up.  HOWEVER: to avoid race conditions, we must check that the
1703  * source xact is still running after we acquire SerializableXactHashLock.
1704  * We do that by calling ProcArrayInstallImportedXmin.
1705  */
1706 static Snapshot
1707 GetSerializableTransactionSnapshotInt(Snapshot snapshot,
1708                                                                           VirtualTransactionId *sourcevxid,
1709                                                                           int sourcepid)
1710 {
1711         PGPROC     *proc;
1712         VirtualTransactionId vxid;
1713         SERIALIZABLEXACT *sxact,
1714                            *othersxact;
1715         HASHCTL         hash_ctl;
1716
1717         /* We only do this for serializable transactions.  Once. */
1718         Assert(MySerializableXact == InvalidSerializableXact);
1719
1720         Assert(!RecoveryInProgress());
1721
1722         /*
1723          * Since all parts of a serializable transaction must use the same
1724          * snapshot, it is too late to establish one after a parallel operation
1725          * has begun.
1726          */
1727         if (IsInParallelMode())
1728                 elog(ERROR, "cannot establish serializable snapshot during a parallel operation");
1729
1730         proc = MyProc;
1731         Assert(proc != NULL);
1732         GET_VXID_FROM_PGPROC(vxid, *proc);
1733
1734         /*
1735          * First we get the sxact structure, which may involve looping and access
1736          * to the "finished" list to free a structure for use.
1737          *
1738          * We must hold SerializableXactHashLock when taking/checking the snapshot
1739          * to avoid race conditions, for much the same reasons that
1740          * GetSnapshotData takes the ProcArrayLock.  Since we might have to
1741          * release SerializableXactHashLock to call SummarizeOldestCommittedSxact,
1742          * this means we have to create the sxact first, which is a bit annoying
1743          * (in particular, an elog(ERROR) in procarray.c would cause us to leak
1744          * the sxact).  Consider refactoring to avoid this.
1745          */
1746 #ifdef TEST_OLDSERXID
1747         SummarizeOldestCommittedSxact();
1748 #endif
1749         LWLockAcquire(SerializableXactHashLock, LW_EXCLUSIVE);
1750         do
1751         {
1752                 sxact = CreatePredXact();
1753                 /* If null, push out committed sxact to SLRU summary & retry. */
1754                 if (!sxact)
1755                 {
1756                         LWLockRelease(SerializableXactHashLock);
1757                         SummarizeOldestCommittedSxact();
1758                         LWLockAcquire(SerializableXactHashLock, LW_EXCLUSIVE);
1759                 }
1760         } while (!sxact);
1761
1762         /* Get the snapshot, or check that it's safe to use */
1763         if (!sourcevxid)
1764                 snapshot = GetSnapshotData(snapshot);
1765         else if (!ProcArrayInstallImportedXmin(snapshot->xmin, sourcevxid))
1766         {
1767                 ReleasePredXact(sxact);
1768                 LWLockRelease(SerializableXactHashLock);
1769                 ereport(ERROR,
1770                                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
1771                                  errmsg("could not import the requested snapshot"),
1772                                  errdetail("The source process with pid %d is not running anymore.",
1773                                                    sourcepid)));
1774         }
1775
1776         /*
1777          * If there are no serializable transactions which are not read-only, we
1778          * can "opt out" of predicate locking and conflict checking for a
1779          * read-only transaction.
1780          *
1781          * The reason this is safe is that a read-only transaction can only become
1782          * part of a dangerous structure if it overlaps a writable transaction
1783          * which in turn overlaps a writable transaction which committed before
1784          * the read-only transaction started.  A new writable transaction can
1785          * overlap this one, but it can't meet the other condition of overlapping
1786          * a transaction which committed before this one started.
1787          */
1788         if (XactReadOnly && PredXact->WritableSxactCount == 0)
1789         {
1790                 ReleasePredXact(sxact);
1791                 LWLockRelease(SerializableXactHashLock);
1792                 return snapshot;
1793         }
1794
1795         /* Maintain serializable global xmin info. */
1796         if (!TransactionIdIsValid(PredXact->SxactGlobalXmin))
1797         {
1798                 Assert(PredXact->SxactGlobalXminCount == 0);
1799                 PredXact->SxactGlobalXmin = snapshot->xmin;
1800                 PredXact->SxactGlobalXminCount = 1;
1801                 OldSerXidSetActiveSerXmin(snapshot->xmin);
1802         }
1803         else if (TransactionIdEquals(snapshot->xmin, PredXact->SxactGlobalXmin))
1804         {
1805                 Assert(PredXact->SxactGlobalXminCount > 0);
1806                 PredXact->SxactGlobalXminCount++;
1807         }
1808         else
1809         {
1810                 Assert(TransactionIdFollows(snapshot->xmin, PredXact->SxactGlobalXmin));
1811         }
1812
1813         /* Initialize the structure. */
1814         sxact->vxid = vxid;
1815         sxact->SeqNo.lastCommitBeforeSnapshot = PredXact->LastSxactCommitSeqNo;
1816         sxact->prepareSeqNo = InvalidSerCommitSeqNo;
1817         sxact->commitSeqNo = InvalidSerCommitSeqNo;
1818         SHMQueueInit(&(sxact->outConflicts));
1819         SHMQueueInit(&(sxact->inConflicts));
1820         SHMQueueInit(&(sxact->possibleUnsafeConflicts));
1821         sxact->topXid = GetTopTransactionIdIfAny();
1822         sxact->finishedBefore = InvalidTransactionId;
1823         sxact->xmin = snapshot->xmin;
1824         sxact->pid = MyProcPid;
1825         SHMQueueInit(&(sxact->predicateLocks));
1826         SHMQueueElemInit(&(sxact->finishedLink));
1827         sxact->flags = 0;
1828         if (XactReadOnly)
1829         {
1830                 sxact->flags |= SXACT_FLAG_READ_ONLY;
1831
1832                 /*
1833                  * Register all concurrent r/w transactions as possible conflicts; if
1834                  * all of them commit without any outgoing conflicts to earlier
1835                  * transactions then this snapshot can be deemed safe (and we can run
1836                  * without tracking predicate locks).
1837                  */
1838                 for (othersxact = FirstPredXact();
1839                          othersxact != NULL;
1840                          othersxact = NextPredXact(othersxact))
1841                 {
1842                         if (!SxactIsCommitted(othersxact)
1843                                 && !SxactIsDoomed(othersxact)
1844                                 && !SxactIsReadOnly(othersxact))
1845                         {
1846                                 SetPossibleUnsafeConflict(sxact, othersxact);
1847                         }
1848                 }
1849         }
1850         else
1851         {
1852                 ++(PredXact->WritableSxactCount);
1853                 Assert(PredXact->WritableSxactCount <=
1854                            (MaxBackends + max_prepared_xacts));
1855         }
1856
1857         MySerializableXact = sxact;
1858         MyXactDidWrite = false;         /* haven't written anything yet */
1859
1860         LWLockRelease(SerializableXactHashLock);
1861
1862         /* Initialize the backend-local hash table of parent locks */
1863         Assert(LocalPredicateLockHash == NULL);
1864         MemSet(&hash_ctl, 0, sizeof(hash_ctl));
1865         hash_ctl.keysize = sizeof(PREDICATELOCKTARGETTAG);
1866         hash_ctl.entrysize = sizeof(LOCALPREDICATELOCK);
1867         LocalPredicateLockHash = hash_create("Local predicate lock",
1868                                                                                  max_predicate_locks_per_xact,
1869                                                                                  &hash_ctl,
1870                                                                                  HASH_ELEM | HASH_BLOBS);
1871
1872         return snapshot;
1873 }
1874
1875 /*
1876  * Register the top level XID in SerializableXidHash.
1877  * Also store it for easy reference in MySerializableXact.
1878  */
1879 void
1880 RegisterPredicateLockingXid(TransactionId xid)
1881 {
1882         SERIALIZABLEXIDTAG sxidtag;
1883         SERIALIZABLEXID *sxid;
1884         bool            found;
1885
1886         /*
1887          * If we're not tracking predicate lock data for this transaction, we
1888          * should ignore the request and return quickly.
1889          */
1890         if (MySerializableXact == InvalidSerializableXact)
1891                 return;
1892
1893         /* We should have a valid XID and be at the top level. */
1894         Assert(TransactionIdIsValid(xid));
1895
1896         LWLockAcquire(SerializableXactHashLock, LW_EXCLUSIVE);
1897
1898         /* This should only be done once per transaction. */
1899         Assert(MySerializableXact->topXid == InvalidTransactionId);
1900
1901         MySerializableXact->topXid = xid;
1902
1903         sxidtag.xid = xid;
1904         sxid = (SERIALIZABLEXID *) hash_search(SerializableXidHash,
1905                                                                                    &sxidtag,
1906                                                                                    HASH_ENTER, &found);
1907         Assert(!found);
1908
1909         /* Initialize the structure. */
1910         sxid->myXact = MySerializableXact;
1911         LWLockRelease(SerializableXactHashLock);
1912 }
1913
1914
1915 /*
1916  * Check whether there are any predicate locks held by any transaction
1917  * for the page at the given block number.
1918  *
1919  * Note that the transaction may be completed but not yet subject to
1920  * cleanup due to overlapping serializable transactions.  This must
1921  * return valid information regardless of transaction isolation level.
1922  *
1923  * Also note that this doesn't check for a conflicting relation lock,
1924  * just a lock specifically on the given page.
1925  *
1926  * One use is to support proper behavior during GiST index vacuum.
1927  */
1928 bool
1929 PageIsPredicateLocked(Relation relation, BlockNumber blkno)
1930 {
1931         PREDICATELOCKTARGETTAG targettag;
1932         uint32          targettaghash;
1933         LWLock     *partitionLock;
1934         PREDICATELOCKTARGET *target;
1935
1936         SET_PREDICATELOCKTARGETTAG_PAGE(targettag,
1937                                                                         relation->rd_node.dbNode,
1938                                                                         relation->rd_id,
1939                                                                         blkno);
1940
1941         targettaghash = PredicateLockTargetTagHashCode(&targettag);
1942         partitionLock = PredicateLockHashPartitionLock(targettaghash);
1943         LWLockAcquire(partitionLock, LW_SHARED);
1944         target = (PREDICATELOCKTARGET *)
1945                 hash_search_with_hash_value(PredicateLockTargetHash,
1946                                                                         &targettag, targettaghash,
1947                                                                         HASH_FIND, NULL);
1948         LWLockRelease(partitionLock);
1949
1950         return (target != NULL);
1951 }
1952
1953
1954 /*
1955  * Check whether a particular lock is held by this transaction.
1956  *
1957  * Important note: this function may return false even if the lock is
1958  * being held, because it uses the local lock table which is not
1959  * updated if another transaction modifies our lock list (e.g. to
1960  * split an index page). It can also return true when a coarser
1961  * granularity lock that covers this target is being held. Be careful
1962  * to only use this function in circumstances where such errors are
1963  * acceptable!
1964  */
1965 static bool
1966 PredicateLockExists(const PREDICATELOCKTARGETTAG *targettag)
1967 {
1968         LOCALPREDICATELOCK *lock;
1969
1970         /* check local hash table */
1971         lock = (LOCALPREDICATELOCK *) hash_search(LocalPredicateLockHash,
1972                                                                                           targettag,
1973                                                                                           HASH_FIND, NULL);
1974
1975         if (!lock)
1976                 return false;
1977
1978         /*
1979          * Found entry in the table, but still need to check whether it's actually
1980          * held -- it could just be a parent of some held lock.
1981          */
1982         return lock->held;
1983 }
1984
1985 /*
1986  * Return the parent lock tag in the lock hierarchy: the next coarser
1987  * lock that covers the provided tag.
1988  *
1989  * Returns true and sets *parent to the parent tag if one exists,
1990  * returns false if none exists.
1991  */
1992 static bool
1993 GetParentPredicateLockTag(const PREDICATELOCKTARGETTAG *tag,
1994                                                   PREDICATELOCKTARGETTAG *parent)
1995 {
1996         switch (GET_PREDICATELOCKTARGETTAG_TYPE(*tag))
1997         {
1998                 case PREDLOCKTAG_RELATION:
1999                         /* relation locks have no parent lock */
2000                         return false;
2001
2002                 case PREDLOCKTAG_PAGE:
2003                         /* parent lock is relation lock */
2004                         SET_PREDICATELOCKTARGETTAG_RELATION(*parent,
2005                                                                                                 GET_PREDICATELOCKTARGETTAG_DB(*tag),
2006                                                                                                 GET_PREDICATELOCKTARGETTAG_RELATION(*tag));
2007
2008                         return true;
2009
2010                 case PREDLOCKTAG_TUPLE:
2011                         /* parent lock is page lock */
2012                         SET_PREDICATELOCKTARGETTAG_PAGE(*parent,
2013                                                                                         GET_PREDICATELOCKTARGETTAG_DB(*tag),
2014                                                                                         GET_PREDICATELOCKTARGETTAG_RELATION(*tag),
2015                                                                                         GET_PREDICATELOCKTARGETTAG_PAGE(*tag));
2016                         return true;
2017         }
2018
2019         /* not reachable */
2020         Assert(false);
2021         return false;
2022 }
2023
2024 /*
2025  * Check whether the lock we are considering is already covered by a
2026  * coarser lock for our transaction.
2027  *
2028  * Like PredicateLockExists, this function might return a false
2029  * negative, but it will never return a false positive.
2030  */
2031 static bool
2032 CoarserLockCovers(const PREDICATELOCKTARGETTAG *newtargettag)
2033 {
2034         PREDICATELOCKTARGETTAG targettag,
2035                                 parenttag;
2036
2037         targettag = *newtargettag;
2038
2039         /* check parents iteratively until no more */
2040         while (GetParentPredicateLockTag(&targettag, &parenttag))
2041         {
2042                 targettag = parenttag;
2043                 if (PredicateLockExists(&targettag))
2044                         return true;
2045         }
2046
2047         /* no more parents to check; lock is not covered */
2048         return false;
2049 }
2050
2051 /*
2052  * Remove the dummy entry from the predicate lock target hash, to free up some
2053  * scratch space. The caller must be holding SerializablePredicateLockListLock,
2054  * and must restore the entry with RestoreScratchTarget() before releasing the
2055  * lock.
2056  *
2057  * If lockheld is true, the caller is already holding the partition lock
2058  * of the partition containing the scratch entry.
2059  */
2060 static void
2061 RemoveScratchTarget(bool lockheld)
2062 {
2063         bool            found;
2064
2065         Assert(LWLockHeldByMe(SerializablePredicateLockListLock));
2066
2067         if (!lockheld)
2068                 LWLockAcquire(ScratchPartitionLock, LW_EXCLUSIVE);
2069         hash_search_with_hash_value(PredicateLockTargetHash,
2070                                                                 &ScratchTargetTag,
2071                                                                 ScratchTargetTagHash,
2072                                                                 HASH_REMOVE, &found);
2073         Assert(found);
2074         if (!lockheld)
2075                 LWLockRelease(ScratchPartitionLock);
2076 }
2077
2078 /*
2079  * Re-insert the dummy entry in predicate lock target hash.
2080  */
2081 static void
2082 RestoreScratchTarget(bool lockheld)
2083 {
2084         bool            found;
2085
2086         Assert(LWLockHeldByMe(SerializablePredicateLockListLock));
2087
2088         if (!lockheld)
2089                 LWLockAcquire(ScratchPartitionLock, LW_EXCLUSIVE);
2090         hash_search_with_hash_value(PredicateLockTargetHash,
2091                                                                 &ScratchTargetTag,
2092                                                                 ScratchTargetTagHash,
2093                                                                 HASH_ENTER, &found);
2094         Assert(!found);
2095         if (!lockheld)
2096                 LWLockRelease(ScratchPartitionLock);
2097 }
2098
2099 /*
2100  * Check whether the list of related predicate locks is empty for a
2101  * predicate lock target, and remove the target if it is.
2102  */
2103 static void
2104 RemoveTargetIfNoLongerUsed(PREDICATELOCKTARGET *target, uint32 targettaghash)
2105 {
2106         PREDICATELOCKTARGET *rmtarget PG_USED_FOR_ASSERTS_ONLY;
2107
2108         Assert(LWLockHeldByMe(SerializablePredicateLockListLock));
2109
2110         /* Can't remove it until no locks at this target. */
2111         if (!SHMQueueEmpty(&target->predicateLocks))
2112                 return;
2113
2114         /* Actually remove the target. */
2115         rmtarget = hash_search_with_hash_value(PredicateLockTargetHash,
2116                                                                                    &target->tag,
2117                                                                                    targettaghash,
2118                                                                                    HASH_REMOVE, NULL);
2119         Assert(rmtarget == target);
2120 }
2121
2122 /*
2123  * Delete child target locks owned by this process.
2124  * This implementation is assuming that the usage of each target tag field
2125  * is uniform.  No need to make this hard if we don't have to.
2126  *
2127  * We aren't acquiring lightweight locks for the predicate lock or lock
2128  * target structures associated with this transaction unless we're going
2129  * to modify them, because no other process is permitted to modify our
2130  * locks.
2131  */
2132 static void
2133 DeleteChildTargetLocks(const PREDICATELOCKTARGETTAG *newtargettag)
2134 {
2135         SERIALIZABLEXACT *sxact;
2136         PREDICATELOCK *predlock;
2137
2138         LWLockAcquire(SerializablePredicateLockListLock, LW_SHARED);
2139         sxact = MySerializableXact;
2140         predlock = (PREDICATELOCK *)
2141                 SHMQueueNext(&(sxact->predicateLocks),
2142                                          &(sxact->predicateLocks),
2143                                          offsetof(PREDICATELOCK, xactLink));
2144         while (predlock)
2145         {
2146                 SHM_QUEUE  *predlocksxactlink;
2147                 PREDICATELOCK *nextpredlock;
2148                 PREDICATELOCKTAG oldlocktag;
2149                 PREDICATELOCKTARGET *oldtarget;
2150                 PREDICATELOCKTARGETTAG oldtargettag;
2151
2152                 predlocksxactlink = &(predlock->xactLink);
2153                 nextpredlock = (PREDICATELOCK *)
2154                         SHMQueueNext(&(sxact->predicateLocks),
2155                                                  predlocksxactlink,
2156                                                  offsetof(PREDICATELOCK, xactLink));
2157
2158                 oldlocktag = predlock->tag;
2159                 Assert(oldlocktag.myXact == sxact);
2160                 oldtarget = oldlocktag.myTarget;
2161                 oldtargettag = oldtarget->tag;
2162
2163                 if (TargetTagIsCoveredBy(oldtargettag, *newtargettag))
2164                 {
2165                         uint32          oldtargettaghash;
2166                         LWLock     *partitionLock;
2167                         PREDICATELOCK *rmpredlock PG_USED_FOR_ASSERTS_ONLY;
2168
2169                         oldtargettaghash = PredicateLockTargetTagHashCode(&oldtargettag);
2170                         partitionLock = PredicateLockHashPartitionLock(oldtargettaghash);
2171
2172                         LWLockAcquire(partitionLock, LW_EXCLUSIVE);
2173
2174                         SHMQueueDelete(predlocksxactlink);
2175                         SHMQueueDelete(&(predlock->targetLink));
2176                         rmpredlock = hash_search_with_hash_value
2177                                 (PredicateLockHash,
2178                                  &oldlocktag,
2179                                  PredicateLockHashCodeFromTargetHashCode(&oldlocktag,
2180                                                                                                                  oldtargettaghash),
2181                                  HASH_REMOVE, NULL);
2182                         Assert(rmpredlock == predlock);
2183
2184                         RemoveTargetIfNoLongerUsed(oldtarget, oldtargettaghash);
2185
2186                         LWLockRelease(partitionLock);
2187
2188                         DecrementParentLocks(&oldtargettag);
2189                 }
2190
2191                 predlock = nextpredlock;
2192         }
2193         LWLockRelease(SerializablePredicateLockListLock);
2194 }
2195
2196 /*
2197  * Returns the promotion limit for a given predicate lock target.  This is the
2198  * max number of descendant locks allowed before promoting to the specified
2199  * tag. Note that the limit includes non-direct descendants (e.g., both tuples
2200  * and pages for a relation lock).
2201  *
2202  * Currently the default limit is 2 for a page lock, and half of the value of
2203  * max_pred_locks_per_transaction - 1 for a relation lock, to match behavior
2204  * of earlier releases when upgrading.
2205  *
2206  * TODO SSI: We should probably add additional GUCs to allow a maximum ratio
2207  * of page and tuple locks based on the pages in a relation, and the maximum
2208  * ratio of tuple locks to tuples in a page.  This would provide more
2209  * generally "balanced" allocation of locks to where they are most useful,
2210  * while still allowing the absolute numbers to prevent one relation from
2211  * tying up all predicate lock resources.
2212  */
2213 static int
2214 MaxPredicateChildLocks(const PREDICATELOCKTARGETTAG *tag)
2215 {
2216         switch (GET_PREDICATELOCKTARGETTAG_TYPE(*tag))
2217         {
2218                 case PREDLOCKTAG_RELATION:
2219                         return max_predicate_locks_per_relation < 0
2220                                 ? (max_predicate_locks_per_xact
2221                                    / (-max_predicate_locks_per_relation)) - 1
2222                                 : max_predicate_locks_per_relation;
2223
2224                 case PREDLOCKTAG_PAGE:
2225                         return max_predicate_locks_per_page;
2226
2227                 case PREDLOCKTAG_TUPLE:
2228
2229                         /*
2230                          * not reachable: nothing is finer-granularity than a tuple, so we
2231                          * should never try to promote to it.
2232                          */
2233                         Assert(false);
2234                         return 0;
2235         }
2236
2237         /* not reachable */
2238         Assert(false);
2239         return 0;
2240 }
2241
2242 /*
2243  * For all ancestors of a newly-acquired predicate lock, increment
2244  * their child count in the parent hash table. If any of them have
2245  * more descendants than their promotion threshold, acquire the
2246  * coarsest such lock.
2247  *
2248  * Returns true if a parent lock was acquired and false otherwise.
2249  */
2250 static bool
2251 CheckAndPromotePredicateLockRequest(const PREDICATELOCKTARGETTAG *reqtag)
2252 {
2253         PREDICATELOCKTARGETTAG targettag,
2254                                 nexttag,
2255                                 promotiontag;
2256         LOCALPREDICATELOCK *parentlock;
2257         bool            found,
2258                                 promote;
2259
2260         promote = false;
2261
2262         targettag = *reqtag;
2263
2264         /* check parents iteratively */
2265         while (GetParentPredicateLockTag(&targettag, &nexttag))
2266         {
2267                 targettag = nexttag;
2268                 parentlock = (LOCALPREDICATELOCK *) hash_search(LocalPredicateLockHash,
2269                                                                                                                 &targettag,
2270                                                                                                                 HASH_ENTER,
2271                                                                                                                 &found);
2272                 if (!found)
2273                 {
2274                         parentlock->held = false;
2275                         parentlock->childLocks = 1;
2276                 }
2277                 else
2278                         parentlock->childLocks++;
2279
2280                 if (parentlock->childLocks >
2281                         MaxPredicateChildLocks(&targettag))
2282                 {
2283                         /*
2284                          * We should promote to this parent lock. Continue to check its
2285                          * ancestors, however, both to get their child counts right and to
2286                          * check whether we should just go ahead and promote to one of
2287                          * them.
2288                          */
2289                         promotiontag = targettag;
2290                         promote = true;
2291                 }
2292         }
2293
2294         if (promote)
2295         {
2296                 /* acquire coarsest ancestor eligible for promotion */
2297                 PredicateLockAcquire(&promotiontag);
2298                 return true;
2299         }
2300         else
2301                 return false;
2302 }
2303
2304 /*
2305  * When releasing a lock, decrement the child count on all ancestor
2306  * locks.
2307  *
2308  * This is called only when releasing a lock via
2309  * DeleteChildTargetLocks (i.e. when a lock becomes redundant because
2310  * we've acquired its parent, possibly due to promotion) or when a new
2311  * MVCC write lock makes the predicate lock unnecessary. There's no
2312  * point in calling it when locks are released at transaction end, as
2313  * this information is no longer needed.
2314  */
2315 static void
2316 DecrementParentLocks(const PREDICATELOCKTARGETTAG *targettag)
2317 {
2318         PREDICATELOCKTARGETTAG parenttag,
2319                                 nexttag;
2320
2321         parenttag = *targettag;
2322
2323         while (GetParentPredicateLockTag(&parenttag, &nexttag))
2324         {
2325                 uint32          targettaghash;
2326                 LOCALPREDICATELOCK *parentlock,
2327                                    *rmlock PG_USED_FOR_ASSERTS_ONLY;
2328
2329                 parenttag = nexttag;
2330                 targettaghash = PredicateLockTargetTagHashCode(&parenttag);
2331                 parentlock = (LOCALPREDICATELOCK *)
2332                         hash_search_with_hash_value(LocalPredicateLockHash,
2333                                                                                 &parenttag, targettaghash,
2334                                                                                 HASH_FIND, NULL);
2335
2336                 /*
2337                  * There's a small chance the parent lock doesn't exist in the lock
2338                  * table. This can happen if we prematurely removed it because an
2339                  * index split caused the child refcount to be off.
2340                  */
2341                 if (parentlock == NULL)
2342                         continue;
2343
2344                 parentlock->childLocks--;
2345
2346                 /*
2347                  * Under similar circumstances the parent lock's refcount might be
2348                  * zero. This only happens if we're holding that lock (otherwise we
2349                  * would have removed the entry).
2350                  */
2351                 if (parentlock->childLocks < 0)
2352                 {
2353                         Assert(parentlock->held);
2354                         parentlock->childLocks = 0;
2355                 }
2356
2357                 if ((parentlock->childLocks == 0) && (!parentlock->held))
2358                 {
2359                         rmlock = (LOCALPREDICATELOCK *)
2360                                 hash_search_with_hash_value(LocalPredicateLockHash,
2361                                                                                         &parenttag, targettaghash,
2362                                                                                         HASH_REMOVE, NULL);
2363                         Assert(rmlock == parentlock);
2364                 }
2365         }
2366 }
2367
2368 /*
2369  * Indicate that a predicate lock on the given target is held by the
2370  * specified transaction. Has no effect if the lock is already held.
2371  *
2372  * This updates the lock table and the sxact's lock list, and creates
2373  * the lock target if necessary, but does *not* do anything related to
2374  * granularity promotion or the local lock table. See
2375  * PredicateLockAcquire for that.
2376  */
2377 static void
2378 CreatePredicateLock(const PREDICATELOCKTARGETTAG *targettag,
2379                                         uint32 targettaghash,
2380                                         SERIALIZABLEXACT *sxact)
2381 {
2382         PREDICATELOCKTARGET *target;
2383         PREDICATELOCKTAG locktag;
2384         PREDICATELOCK *lock;
2385         LWLock     *partitionLock;
2386         bool            found;
2387
2388         partitionLock = PredicateLockHashPartitionLock(targettaghash);
2389
2390         LWLockAcquire(SerializablePredicateLockListLock, LW_SHARED);
2391         LWLockAcquire(partitionLock, LW_EXCLUSIVE);
2392
2393         /* Make sure that the target is represented. */
2394         target = (PREDICATELOCKTARGET *)
2395                 hash_search_with_hash_value(PredicateLockTargetHash,
2396                                                                         targettag, targettaghash,
2397                                                                         HASH_ENTER_NULL, &found);
2398         if (!target)
2399                 ereport(ERROR,
2400                                 (errcode(ERRCODE_OUT_OF_MEMORY),
2401                                  errmsg("out of shared memory"),
2402                                  errhint("You might need to increase max_pred_locks_per_transaction.")));
2403         if (!found)
2404                 SHMQueueInit(&(target->predicateLocks));
2405
2406         /* We've got the sxact and target, make sure they're joined. */
2407         locktag.myTarget = target;
2408         locktag.myXact = sxact;
2409         lock = (PREDICATELOCK *)
2410                 hash_search_with_hash_value(PredicateLockHash, &locktag,
2411                                                                         PredicateLockHashCodeFromTargetHashCode(&locktag, targettaghash),
2412                                                                         HASH_ENTER_NULL, &found);
2413         if (!lock)
2414                 ereport(ERROR,
2415                                 (errcode(ERRCODE_OUT_OF_MEMORY),
2416                                  errmsg("out of shared memory"),
2417                                  errhint("You might need to increase max_pred_locks_per_transaction.")));
2418
2419         if (!found)
2420         {
2421                 SHMQueueInsertBefore(&(target->predicateLocks), &(lock->targetLink));
2422                 SHMQueueInsertBefore(&(sxact->predicateLocks),
2423                                                          &(lock->xactLink));
2424                 lock->commitSeqNo = InvalidSerCommitSeqNo;
2425         }
2426
2427         LWLockRelease(partitionLock);
2428         LWLockRelease(SerializablePredicateLockListLock);
2429 }
2430
2431 /*
2432  * Acquire a predicate lock on the specified target for the current
2433  * connection if not already held. This updates the local lock table
2434  * and uses it to implement granularity promotion. It will consolidate
2435  * multiple locks into a coarser lock if warranted, and will release
2436  * any finer-grained locks covered by the new one.
2437  */
2438 static void
2439 PredicateLockAcquire(const PREDICATELOCKTARGETTAG *targettag)
2440 {
2441         uint32          targettaghash;
2442         bool            found;
2443         LOCALPREDICATELOCK *locallock;
2444
2445         /* Do we have the lock already, or a covering lock? */
2446         if (PredicateLockExists(targettag))
2447                 return;
2448
2449         if (CoarserLockCovers(targettag))
2450                 return;
2451
2452         /* the same hash and LW lock apply to the lock target and the local lock. */
2453         targettaghash = PredicateLockTargetTagHashCode(targettag);
2454
2455         /* Acquire lock in local table */
2456         locallock = (LOCALPREDICATELOCK *)
2457                 hash_search_with_hash_value(LocalPredicateLockHash,
2458                                                                         targettag, targettaghash,
2459                                                                         HASH_ENTER, &found);
2460         locallock->held = true;
2461         if (!found)
2462                 locallock->childLocks = 0;
2463
2464         /* Actually create the lock */
2465         CreatePredicateLock(targettag, targettaghash, MySerializableXact);
2466
2467         /*
2468          * Lock has been acquired. Check whether it should be promoted to a
2469          * coarser granularity, or whether there are finer-granularity locks to
2470          * clean up.
2471          */
2472         if (CheckAndPromotePredicateLockRequest(targettag))
2473         {
2474                 /*
2475                  * Lock request was promoted to a coarser-granularity lock, and that
2476                  * lock was acquired. It will delete this lock and any of its
2477                  * children, so we're done.
2478                  */
2479         }
2480         else
2481         {
2482                 /* Clean up any finer-granularity locks */
2483                 if (GET_PREDICATELOCKTARGETTAG_TYPE(*targettag) != PREDLOCKTAG_TUPLE)
2484                         DeleteChildTargetLocks(targettag);
2485         }
2486 }
2487
2488
2489 /*
2490  *              PredicateLockRelation
2491  *
2492  * Gets a predicate lock at the relation level.
2493  * Skip if not in full serializable transaction isolation level.
2494  * Skip if this is a temporary table.
2495  * Clear any finer-grained predicate locks this session has on the relation.
2496  */
2497 void
2498 PredicateLockRelation(Relation relation, Snapshot snapshot)
2499 {
2500         PREDICATELOCKTARGETTAG tag;
2501
2502         if (!SerializationNeededForRead(relation, snapshot))
2503                 return;
2504
2505         SET_PREDICATELOCKTARGETTAG_RELATION(tag,
2506                                                                                 relation->rd_node.dbNode,
2507                                                                                 relation->rd_id);
2508         PredicateLockAcquire(&tag);
2509 }
2510
2511 /*
2512  *              PredicateLockPage
2513  *
2514  * Gets a predicate lock at the page level.
2515  * Skip if not in full serializable transaction isolation level.
2516  * Skip if this is a temporary table.
2517  * Skip if a coarser predicate lock already covers this page.
2518  * Clear any finer-grained predicate locks this session has on the relation.
2519  */
2520 void
2521 PredicateLockPage(Relation relation, BlockNumber blkno, Snapshot snapshot)
2522 {
2523         PREDICATELOCKTARGETTAG tag;
2524
2525         if (!SerializationNeededForRead(relation, snapshot))
2526                 return;
2527
2528         SET_PREDICATELOCKTARGETTAG_PAGE(tag,
2529                                                                         relation->rd_node.dbNode,
2530                                                                         relation->rd_id,
2531                                                                         blkno);
2532         PredicateLockAcquire(&tag);
2533 }
2534
2535 /*
2536  *              PredicateLockTuple
2537  *
2538  * Gets a predicate lock at the tuple level.
2539  * Skip if not in full serializable transaction isolation level.
2540  * Skip if this is a temporary table.
2541  */
2542 void
2543 PredicateLockTuple(Relation relation, HeapTuple tuple, Snapshot snapshot)
2544 {
2545         PREDICATELOCKTARGETTAG tag;
2546         ItemPointer tid;
2547         TransactionId targetxmin;
2548
2549         if (!SerializationNeededForRead(relation, snapshot))
2550                 return;
2551
2552         /*
2553          * If it's a heap tuple, return if this xact wrote it.
2554          */
2555         if (relation->rd_index == NULL)
2556         {
2557                 TransactionId myxid;
2558
2559                 targetxmin = HeapTupleHeaderGetXmin(tuple->t_data);
2560
2561                 myxid = GetTopTransactionIdIfAny();
2562                 if (TransactionIdIsValid(myxid))
2563                 {
2564                         if (TransactionIdFollowsOrEquals(targetxmin, TransactionXmin))
2565                         {
2566                                 TransactionId xid = SubTransGetTopmostTransaction(targetxmin);
2567
2568                                 if (TransactionIdEquals(xid, myxid))
2569                                 {
2570                                         /* We wrote it; we already have a write lock. */
2571                                         return;
2572                                 }
2573                         }
2574                 }
2575         }
2576
2577         /*
2578          * Do quick-but-not-definitive test for a relation lock first.  This will
2579          * never cause a return when the relation is *not* locked, but will
2580          * occasionally let the check continue when there really *is* a relation
2581          * level lock.
2582          */
2583         SET_PREDICATELOCKTARGETTAG_RELATION(tag,
2584                                                                                 relation->rd_node.dbNode,
2585                                                                                 relation->rd_id);
2586         if (PredicateLockExists(&tag))
2587                 return;
2588
2589         tid = &(tuple->t_self);
2590         SET_PREDICATELOCKTARGETTAG_TUPLE(tag,
2591                                                                          relation->rd_node.dbNode,
2592                                                                          relation->rd_id,
2593                                                                          ItemPointerGetBlockNumber(tid),
2594                                                                          ItemPointerGetOffsetNumber(tid));
2595         PredicateLockAcquire(&tag);
2596 }
2597
2598
2599 /*
2600  *              DeleteLockTarget
2601  *
2602  * Remove a predicate lock target along with any locks held for it.
2603  *
2604  * Caller must hold SerializablePredicateLockListLock and the
2605  * appropriate hash partition lock for the target.
2606  */
2607 static void
2608 DeleteLockTarget(PREDICATELOCKTARGET *target, uint32 targettaghash)
2609 {
2610         PREDICATELOCK *predlock;
2611         SHM_QUEUE  *predlocktargetlink;
2612         PREDICATELOCK *nextpredlock;
2613         bool            found;
2614
2615         Assert(LWLockHeldByMe(SerializablePredicateLockListLock));
2616         Assert(LWLockHeldByMe(PredicateLockHashPartitionLock(targettaghash)));
2617
2618         predlock = (PREDICATELOCK *)
2619                 SHMQueueNext(&(target->predicateLocks),
2620                                          &(target->predicateLocks),
2621                                          offsetof(PREDICATELOCK, targetLink));
2622         LWLockAcquire(SerializableXactHashLock, LW_EXCLUSIVE);
2623         while (predlock)
2624         {
2625                 predlocktargetlink = &(predlock->targetLink);
2626                 nextpredlock = (PREDICATELOCK *)
2627                         SHMQueueNext(&(target->predicateLocks),
2628                                                  predlocktargetlink,
2629                                                  offsetof(PREDICATELOCK, targetLink));
2630
2631                 SHMQueueDelete(&(predlock->xactLink));
2632                 SHMQueueDelete(&(predlock->targetLink));
2633
2634                 hash_search_with_hash_value
2635                         (PredicateLockHash,
2636                          &predlock->tag,
2637                          PredicateLockHashCodeFromTargetHashCode(&predlock->tag,
2638                                                                                                          targettaghash),
2639                          HASH_REMOVE, &found);
2640                 Assert(found);
2641
2642                 predlock = nextpredlock;
2643         }
2644         LWLockRelease(SerializableXactHashLock);
2645
2646         /* Remove the target itself, if possible. */
2647         RemoveTargetIfNoLongerUsed(target, targettaghash);
2648 }
2649
2650
2651 /*
2652  *              TransferPredicateLocksToNewTarget
2653  *
2654  * Move or copy all the predicate locks for a lock target, for use by
2655  * index page splits/combines and other things that create or replace
2656  * lock targets. If 'removeOld' is true, the old locks and the target
2657  * will be removed.
2658  *
2659  * Returns true on success, or false if we ran out of shared memory to
2660  * allocate the new target or locks. Guaranteed to always succeed if
2661  * removeOld is set (by using the scratch entry in PredicateLockTargetHash
2662  * for scratch space).
2663  *
2664  * Warning: the "removeOld" option should be used only with care,
2665  * because this function does not (indeed, can not) update other
2666  * backends' LocalPredicateLockHash. If we are only adding new
2667  * entries, this is not a problem: the local lock table is used only
2668  * as a hint, so missing entries for locks that are held are
2669  * OK. Having entries for locks that are no longer held, as can happen
2670  * when using "removeOld", is not in general OK. We can only use it
2671  * safely when replacing a lock with a coarser-granularity lock that
2672  * covers it, or if we are absolutely certain that no one will need to
2673  * refer to that lock in the future.
2674  *
2675  * Caller must hold SerializablePredicateLockListLock.
2676  */
2677 static bool
2678 TransferPredicateLocksToNewTarget(PREDICATELOCKTARGETTAG oldtargettag,
2679                                                                   PREDICATELOCKTARGETTAG newtargettag,
2680                                                                   bool removeOld)
2681 {
2682         uint32          oldtargettaghash;
2683         LWLock     *oldpartitionLock;
2684         PREDICATELOCKTARGET *oldtarget;
2685         uint32          newtargettaghash;
2686         LWLock     *newpartitionLock;
2687         bool            found;
2688         bool            outOfShmem = false;
2689
2690         Assert(LWLockHeldByMe(SerializablePredicateLockListLock));
2691
2692         oldtargettaghash = PredicateLockTargetTagHashCode(&oldtargettag);
2693         newtargettaghash = PredicateLockTargetTagHashCode(&newtargettag);
2694         oldpartitionLock = PredicateLockHashPartitionLock(oldtargettaghash);
2695         newpartitionLock = PredicateLockHashPartitionLock(newtargettaghash);
2696
2697         if (removeOld)
2698         {
2699                 /*
2700                  * Remove the dummy entry to give us scratch space, so we know we'll
2701                  * be able to create the new lock target.
2702                  */
2703                 RemoveScratchTarget(false);
2704         }
2705
2706         /*
2707          * We must get the partition locks in ascending sequence to avoid
2708          * deadlocks. If old and new partitions are the same, we must request the
2709          * lock only once.
2710          */
2711         if (oldpartitionLock < newpartitionLock)
2712         {
2713                 LWLockAcquire(oldpartitionLock,
2714                                           (removeOld ? LW_EXCLUSIVE : LW_SHARED));
2715                 LWLockAcquire(newpartitionLock, LW_EXCLUSIVE);
2716         }
2717         else if (oldpartitionLock > newpartitionLock)
2718         {
2719                 LWLockAcquire(newpartitionLock, LW_EXCLUSIVE);
2720                 LWLockAcquire(oldpartitionLock,
2721                                           (removeOld ? LW_EXCLUSIVE : LW_SHARED));
2722         }
2723         else
2724                 LWLockAcquire(newpartitionLock, LW_EXCLUSIVE);
2725
2726         /*
2727          * Look for the old target.  If not found, that's OK; no predicate locks
2728          * are affected, so we can just clean up and return. If it does exist,
2729          * walk its list of predicate locks and move or copy them to the new
2730          * target.
2731          */
2732         oldtarget = hash_search_with_hash_value(PredicateLockTargetHash,
2733                                                                                         &oldtargettag,
2734                                                                                         oldtargettaghash,
2735                                                                                         HASH_FIND, NULL);
2736
2737         if (oldtarget)
2738         {
2739                 PREDICATELOCKTARGET *newtarget;
2740                 PREDICATELOCK *oldpredlock;
2741                 PREDICATELOCKTAG newpredlocktag;
2742
2743                 newtarget = hash_search_with_hash_value(PredicateLockTargetHash,
2744                                                                                                 &newtargettag,
2745                                                                                                 newtargettaghash,
2746                                                                                                 HASH_ENTER_NULL, &found);
2747
2748                 if (!newtarget)
2749                 {
2750                         /* Failed to allocate due to insufficient shmem */
2751                         outOfShmem = true;
2752                         goto exit;
2753                 }
2754
2755                 /* If we created a new entry, initialize it */
2756                 if (!found)
2757                         SHMQueueInit(&(newtarget->predicateLocks));
2758
2759                 newpredlocktag.myTarget = newtarget;
2760
2761                 /*
2762                  * Loop through all the locks on the old target, replacing them with
2763                  * locks on the new target.
2764                  */
2765                 oldpredlock = (PREDICATELOCK *)
2766                         SHMQueueNext(&(oldtarget->predicateLocks),
2767                                                  &(oldtarget->predicateLocks),
2768                                                  offsetof(PREDICATELOCK, targetLink));
2769                 LWLockAcquire(SerializableXactHashLock, LW_EXCLUSIVE);
2770                 while (oldpredlock)
2771                 {
2772                         SHM_QUEUE  *predlocktargetlink;
2773                         PREDICATELOCK *nextpredlock;
2774                         PREDICATELOCK *newpredlock;
2775                         SerCommitSeqNo oldCommitSeqNo = oldpredlock->commitSeqNo;
2776
2777                         predlocktargetlink = &(oldpredlock->targetLink);
2778                         nextpredlock = (PREDICATELOCK *)
2779                                 SHMQueueNext(&(oldtarget->predicateLocks),
2780                                                          predlocktargetlink,
2781                                                          offsetof(PREDICATELOCK, targetLink));
2782                         newpredlocktag.myXact = oldpredlock->tag.myXact;
2783
2784                         if (removeOld)
2785                         {
2786                                 SHMQueueDelete(&(oldpredlock->xactLink));
2787                                 SHMQueueDelete(&(oldpredlock->targetLink));
2788
2789                                 hash_search_with_hash_value
2790                                         (PredicateLockHash,
2791                                          &oldpredlock->tag,
2792                                          PredicateLockHashCodeFromTargetHashCode(&oldpredlock->tag,
2793                                                                                                                          oldtargettaghash),
2794                                          HASH_REMOVE, &found);
2795                                 Assert(found);
2796                         }
2797
2798                         newpredlock = (PREDICATELOCK *)
2799                                 hash_search_with_hash_value(PredicateLockHash,
2800                                                                                         &newpredlocktag,
2801                                                                                         PredicateLockHashCodeFromTargetHashCode(&newpredlocktag,
2802                                                                                                                                                                         newtargettaghash),
2803                                                                                         HASH_ENTER_NULL,
2804                                                                                         &found);
2805                         if (!newpredlock)
2806                         {
2807                                 /* Out of shared memory. Undo what we've done so far. */
2808                                 LWLockRelease(SerializableXactHashLock);
2809                                 DeleteLockTarget(newtarget, newtargettaghash);
2810                                 outOfShmem = true;
2811                                 goto exit;
2812                         }
2813                         if (!found)
2814                         {
2815                                 SHMQueueInsertBefore(&(newtarget->predicateLocks),
2816                                                                          &(newpredlock->targetLink));
2817                                 SHMQueueInsertBefore(&(newpredlocktag.myXact->predicateLocks),
2818                                                                          &(newpredlock->xactLink));
2819                                 newpredlock->commitSeqNo = oldCommitSeqNo;
2820                         }
2821                         else
2822                         {
2823                                 if (newpredlock->commitSeqNo < oldCommitSeqNo)
2824                                         newpredlock->commitSeqNo = oldCommitSeqNo;
2825                         }
2826
2827                         Assert(newpredlock->commitSeqNo != 0);
2828                         Assert((newpredlock->commitSeqNo == InvalidSerCommitSeqNo)
2829                                    || (newpredlock->tag.myXact == OldCommittedSxact));
2830
2831                         oldpredlock = nextpredlock;
2832                 }
2833                 LWLockRelease(SerializableXactHashLock);
2834
2835                 if (removeOld)
2836                 {
2837                         Assert(SHMQueueEmpty(&oldtarget->predicateLocks));
2838                         RemoveTargetIfNoLongerUsed(oldtarget, oldtargettaghash);
2839                 }
2840         }
2841
2842
2843 exit:
2844         /* Release partition locks in reverse order of acquisition. */
2845         if (oldpartitionLock < newpartitionLock)
2846         {
2847                 LWLockRelease(newpartitionLock);
2848                 LWLockRelease(oldpartitionLock);
2849         }
2850         else if (oldpartitionLock > newpartitionLock)
2851         {
2852                 LWLockRelease(oldpartitionLock);
2853                 LWLockRelease(newpartitionLock);
2854         }
2855         else
2856                 LWLockRelease(newpartitionLock);
2857
2858         if (removeOld)
2859         {
2860                 /* We shouldn't run out of memory if we're moving locks */
2861                 Assert(!outOfShmem);
2862
2863                 /* Put the scratch entry back */
2864                 RestoreScratchTarget(false);
2865         }
2866
2867         return !outOfShmem;
2868 }
2869
2870 /*
2871  * Drop all predicate locks of any granularity from the specified relation,
2872  * which can be a heap relation or an index relation.  If 'transfer' is true,
2873  * acquire a relation lock on the heap for any transactions with any lock(s)
2874  * on the specified relation.
2875  *
2876  * This requires grabbing a lot of LW locks and scanning the entire lock
2877  * target table for matches.  That makes this more expensive than most
2878  * predicate lock management functions, but it will only be called for DDL
2879  * type commands that are expensive anyway, and there are fast returns when
2880  * no serializable transactions are active or the relation is temporary.
2881  *
2882  * We don't use the TransferPredicateLocksToNewTarget function because it
2883  * acquires its own locks on the partitions of the two targets involved,
2884  * and we'll already be holding all partition locks.
2885  *
2886  * We can't throw an error from here, because the call could be from a
2887  * transaction which is not serializable.
2888  *
2889  * NOTE: This is currently only called with transfer set to true, but that may
2890  * change.  If we decide to clean up the locks from a table on commit of a
2891  * transaction which executed DROP TABLE, the false condition will be useful.
2892  */
2893 static void
2894 DropAllPredicateLocksFromTable(Relation relation, bool transfer)
2895 {
2896         HASH_SEQ_STATUS seqstat;
2897         PREDICATELOCKTARGET *oldtarget;
2898         PREDICATELOCKTARGET *heaptarget;
2899         Oid                     dbId;
2900         Oid                     relId;
2901         Oid                     heapId;
2902         int                     i;
2903         bool            isIndex;
2904         bool            found;
2905         uint32          heaptargettaghash;
2906
2907         /*
2908          * Bail out quickly if there are no serializable transactions running.
2909          * It's safe to check this without taking locks because the caller is
2910          * holding an ACCESS EXCLUSIVE lock on the relation.  No new locks which
2911          * would matter here can be acquired while that is held.
2912          */
2913         if (!TransactionIdIsValid(PredXact->SxactGlobalXmin))
2914                 return;
2915
2916         if (!PredicateLockingNeededForRelation(relation))
2917                 return;
2918
2919         dbId = relation->rd_node.dbNode;
2920         relId = relation->rd_id;
2921         if (relation->rd_index == NULL)
2922         {
2923                 isIndex = false;
2924                 heapId = relId;
2925         }
2926         else
2927         {
2928                 isIndex = true;
2929                 heapId = relation->rd_index->indrelid;
2930         }
2931         Assert(heapId != InvalidOid);
2932         Assert(transfer || !isIndex);   /* index OID only makes sense with
2933                                                                          * transfer */
2934
2935         /* Retrieve first time needed, then keep. */
2936         heaptargettaghash = 0;
2937         heaptarget = NULL;
2938
2939         /* Acquire locks on all lock partitions */
2940         LWLockAcquire(SerializablePredicateLockListLock, LW_EXCLUSIVE);
2941         for (i = 0; i < NUM_PREDICATELOCK_PARTITIONS; i++)
2942                 LWLockAcquire(PredicateLockHashPartitionLockByIndex(i), LW_EXCLUSIVE);
2943         LWLockAcquire(SerializableXactHashLock, LW_EXCLUSIVE);
2944
2945         /*
2946          * Remove the dummy entry to give us scratch space, so we know we'll be
2947          * able to create the new lock target.
2948          */
2949         if (transfer)
2950                 RemoveScratchTarget(true);
2951
2952         /* Scan through target map */
2953         hash_seq_init(&seqstat, PredicateLockTargetHash);
2954
2955         while ((oldtarget = (PREDICATELOCKTARGET *) hash_seq_search(&seqstat)))
2956         {
2957                 PREDICATELOCK *oldpredlock;
2958
2959                 /*
2960                  * Check whether this is a target which needs attention.
2961                  */
2962                 if (GET_PREDICATELOCKTARGETTAG_RELATION(oldtarget->tag) != relId)
2963                         continue;                       /* wrong relation id */
2964                 if (GET_PREDICATELOCKTARGETTAG_DB(oldtarget->tag) != dbId)
2965                         continue;                       /* wrong database id */
2966                 if (transfer && !isIndex
2967                         && GET_PREDICATELOCKTARGETTAG_TYPE(oldtarget->tag) == PREDLOCKTAG_RELATION)
2968                         continue;                       /* already the right lock */
2969
2970                 /*
2971                  * If we made it here, we have work to do.  We make sure the heap
2972                  * relation lock exists, then we walk the list of predicate locks for
2973                  * the old target we found, moving all locks to the heap relation lock
2974                  * -- unless they already hold that.
2975                  */
2976
2977                 /*
2978                  * First make sure we have the heap relation target.  We only need to
2979                  * do this once.
2980                  */
2981                 if (transfer && heaptarget == NULL)
2982                 {
2983                         PREDICATELOCKTARGETTAG heaptargettag;
2984
2985                         SET_PREDICATELOCKTARGETTAG_RELATION(heaptargettag, dbId, heapId);
2986                         heaptargettaghash = PredicateLockTargetTagHashCode(&heaptargettag);
2987                         heaptarget = hash_search_with_hash_value(PredicateLockTargetHash,
2988                                                                                                          &heaptargettag,
2989                                                                                                          heaptargettaghash,
2990                                                                                                          HASH_ENTER, &found);
2991                         if (!found)
2992                                 SHMQueueInit(&heaptarget->predicateLocks);
2993                 }
2994
2995                 /*
2996                  * Loop through all the locks on the old target, replacing them with
2997                  * locks on the new target.
2998                  */
2999                 oldpredlock = (PREDICATELOCK *)
3000                         SHMQueueNext(&(oldtarget->predicateLocks),
3001                                                  &(oldtarget->predicateLocks),
3002                                                  offsetof(PREDICATELOCK, targetLink));
3003                 while (oldpredlock)
3004                 {
3005                         PREDICATELOCK *nextpredlock;
3006                         PREDICATELOCK *newpredlock;
3007                         SerCommitSeqNo oldCommitSeqNo;
3008                         SERIALIZABLEXACT *oldXact;
3009
3010                         nextpredlock = (PREDICATELOCK *)
3011                                 SHMQueueNext(&(oldtarget->predicateLocks),
3012                                                          &(oldpredlock->targetLink),
3013                                                          offsetof(PREDICATELOCK, targetLink));
3014
3015                         /*
3016                          * Remove the old lock first. This avoids the chance of running
3017                          * out of lock structure entries for the hash table.
3018                          */
3019                         oldCommitSeqNo = oldpredlock->commitSeqNo;
3020                         oldXact = oldpredlock->tag.myXact;
3021
3022                         SHMQueueDelete(&(oldpredlock->xactLink));
3023
3024                         /*
3025                          * No need for retail delete from oldtarget list, we're removing
3026                          * the whole target anyway.
3027                          */
3028                         hash_search(PredicateLockHash,
3029                                                 &oldpredlock->tag,
3030                                                 HASH_REMOVE, &found);
3031                         Assert(found);
3032
3033                         if (transfer)
3034                         {
3035                                 PREDICATELOCKTAG newpredlocktag;
3036
3037                                 newpredlocktag.myTarget = heaptarget;
3038                                 newpredlocktag.myXact = oldXact;
3039                                 newpredlock = (PREDICATELOCK *)
3040                                         hash_search_with_hash_value(PredicateLockHash,
3041                                                                                                 &newpredlocktag,
3042                                                                                                 PredicateLockHashCodeFromTargetHashCode(&newpredlocktag,
3043                                                                                                                                                                                 heaptargettaghash),
3044                                                                                                 HASH_ENTER,
3045                                                                                                 &found);
3046                                 if (!found)
3047                                 {
3048                                         SHMQueueInsertBefore(&(heaptarget->predicateLocks),
3049                                                                                  &(newpredlock->targetLink));
3050                                         SHMQueueInsertBefore(&(newpredlocktag.myXact->predicateLocks),
3051                                                                                  &(newpredlock->xactLink));
3052                                         newpredlock->commitSeqNo = oldCommitSeqNo;
3053                                 }
3054                                 else
3055                                 {
3056                                         if (newpredlock->commitSeqNo < oldCommitSeqNo)
3057                                                 newpredlock->commitSeqNo = oldCommitSeqNo;
3058                                 }
3059
3060                                 Assert(newpredlock->commitSeqNo != 0);
3061                                 Assert((newpredlock->commitSeqNo == InvalidSerCommitSeqNo)
3062                                            || (newpredlock->tag.myXact == OldCommittedSxact));
3063                         }
3064
3065                         oldpredlock = nextpredlock;
3066                 }
3067
3068                 hash_search(PredicateLockTargetHash, &oldtarget->tag, HASH_REMOVE,
3069                                         &found);
3070                 Assert(found);
3071         }
3072
3073         /* Put the scratch entry back */
3074         if (transfer)
3075                 RestoreScratchTarget(true);
3076
3077         /* Release locks in reverse order */
3078         LWLockRelease(SerializableXactHashLock);
3079         for (i = NUM_PREDICATELOCK_PARTITIONS - 1; i >= 0; i--)
3080                 LWLockRelease(PredicateLockHashPartitionLockByIndex(i));
3081         LWLockRelease(SerializablePredicateLockListLock);
3082 }
3083
3084 /*
3085  * TransferPredicateLocksToHeapRelation
3086  *              For all transactions, transfer all predicate locks for the given
3087  *              relation to a single relation lock on the heap.
3088  */
3089 void
3090 TransferPredicateLocksToHeapRelation(Relation relation)
3091 {
3092         DropAllPredicateLocksFromTable(relation, true);
3093 }
3094
3095
3096 /*
3097  *              PredicateLockPageSplit
3098  *
3099  * Copies any predicate locks for the old page to the new page.
3100  * Skip if this is a temporary table or toast table.
3101  *
3102  * NOTE: A page split (or overflow) affects all serializable transactions,
3103  * even if it occurs in the context of another transaction isolation level.
3104  *
3105  * NOTE: This currently leaves the local copy of the locks without
3106  * information on the new lock which is in shared memory.  This could cause
3107  * problems if enough page splits occur on locked pages without the processes
3108  * which hold the locks getting in and noticing.
3109  */
3110 void
3111 PredicateLockPageSplit(Relation relation, BlockNumber oldblkno,
3112                                            BlockNumber newblkno)
3113 {
3114         PREDICATELOCKTARGETTAG oldtargettag;
3115         PREDICATELOCKTARGETTAG newtargettag;
3116         bool            success;
3117
3118         /*
3119          * Bail out quickly if there are no serializable transactions running.
3120          *
3121          * It's safe to do this check without taking any additional locks. Even if
3122          * a serializable transaction starts concurrently, we know it can't take
3123          * any SIREAD locks on the page being split because the caller is holding
3124          * the associated buffer page lock. Memory reordering isn't an issue; the
3125          * memory barrier in the LWLock acquisition guarantees that this read
3126          * occurs while the buffer page lock is held.
3127          */
3128         if (!TransactionIdIsValid(PredXact->SxactGlobalXmin))
3129                 return;
3130
3131         if (!PredicateLockingNeededForRelation(relation))
3132                 return;
3133
3134         Assert(oldblkno != newblkno);
3135         Assert(BlockNumberIsValid(oldblkno));
3136         Assert(BlockNumberIsValid(newblkno));
3137
3138         SET_PREDICATELOCKTARGETTAG_PAGE(oldtargettag,
3139                                                                         relation->rd_node.dbNode,
3140                                                                         relation->rd_id,
3141                                                                         oldblkno);
3142         SET_PREDICATELOCKTARGETTAG_PAGE(newtargettag,
3143                                                                         relation->rd_node.dbNode,
3144                                                                         relation->rd_id,
3145                                                                         newblkno);
3146
3147         LWLockAcquire(SerializablePredicateLockListLock, LW_EXCLUSIVE);
3148
3149         /*
3150          * Try copying the locks over to the new page's tag, creating it if
3151          * necessary.
3152          */
3153         success = TransferPredicateLocksToNewTarget(oldtargettag,
3154                                                                                                 newtargettag,
3155                                                                                                 false);
3156
3157         if (!success)
3158         {
3159                 /*
3160                  * No more predicate lock entries are available. Failure isn't an
3161                  * option here, so promote the page lock to a relation lock.
3162                  */
3163
3164                 /* Get the parent relation lock's lock tag */
3165                 success = GetParentPredicateLockTag(&oldtargettag,
3166                                                                                         &newtargettag);
3167                 Assert(success);
3168
3169                 /*
3170                  * Move the locks to the parent. This shouldn't fail.
3171                  *
3172                  * Note that here we are removing locks held by other backends,
3173                  * leading to a possible inconsistency in their local lock hash table.
3174                  * This is OK because we're replacing it with a lock that covers the
3175                  * old one.
3176                  */
3177                 success = TransferPredicateLocksToNewTarget(oldtargettag,
3178                                                                                                         newtargettag,
3179                                                                                                         true);
3180                 Assert(success);
3181         }
3182
3183         LWLockRelease(SerializablePredicateLockListLock);
3184 }
3185
3186 /*
3187  *              PredicateLockPageCombine
3188  *
3189  * Combines predicate locks for two existing pages.
3190  * Skip if this is a temporary table or toast table.
3191  *
3192  * NOTE: A page combine affects all serializable transactions, even if it
3193  * occurs in the context of another transaction isolation level.
3194  */
3195 void
3196 PredicateLockPageCombine(Relation relation, BlockNumber oldblkno,
3197                                                  BlockNumber newblkno)
3198 {
3199         /*
3200          * Page combines differ from page splits in that we ought to be able to
3201          * remove the locks on the old page after transferring them to the new
3202          * page, instead of duplicating them. However, because we can't edit other
3203          * backends' local lock tables, removing the old lock would leave them
3204          * with an entry in their LocalPredicateLockHash for a lock they're not
3205          * holding, which isn't acceptable. So we wind up having to do the same
3206          * work as a page split, acquiring a lock on the new page and keeping the
3207          * old page locked too. That can lead to some false positives, but should
3208          * be rare in practice.
3209          */
3210         PredicateLockPageSplit(relation, oldblkno, newblkno);
3211 }
3212
3213 /*
3214  * Walk the list of in-progress serializable transactions and find the new
3215  * xmin.
3216  */
3217 static void
3218 SetNewSxactGlobalXmin(void)
3219 {
3220         SERIALIZABLEXACT *sxact;
3221
3222         Assert(LWLockHeldByMe(SerializableXactHashLock));
3223
3224         PredXact->SxactGlobalXmin = InvalidTransactionId;
3225         PredXact->SxactGlobalXminCount = 0;
3226
3227         for (sxact = FirstPredXact(); sxact != NULL; sxact = NextPredXact(sxact))
3228         {
3229                 if (!SxactIsRolledBack(sxact)
3230                         && !SxactIsCommitted(sxact)
3231                         && sxact != OldCommittedSxact)
3232                 {
3233                         Assert(sxact->xmin != InvalidTransactionId);
3234                         if (!TransactionIdIsValid(PredXact->SxactGlobalXmin)
3235                                 || TransactionIdPrecedes(sxact->xmin,
3236                                                                                  PredXact->SxactGlobalXmin))
3237                         {
3238                                 PredXact->SxactGlobalXmin = sxact->xmin;
3239                                 PredXact->SxactGlobalXminCount = 1;
3240                         }
3241                         else if (TransactionIdEquals(sxact->xmin,
3242                                                                                  PredXact->SxactGlobalXmin))
3243                                 PredXact->SxactGlobalXminCount++;
3244                 }
3245         }
3246
3247         OldSerXidSetActiveSerXmin(PredXact->SxactGlobalXmin);
3248 }
3249
3250 /*
3251  *              ReleasePredicateLocks
3252  *
3253  * Releases predicate locks based on completion of the current transaction,
3254  * whether committed or rolled back.  It can also be called for a read only
3255  * transaction when it becomes impossible for the transaction to become
3256  * part of a dangerous structure.
3257  *
3258  * We do nothing unless this is a serializable transaction.
3259  *
3260  * This method must ensure that shared memory hash tables are cleaned
3261  * up in some relatively timely fashion.
3262  *
3263  * If this transaction is committing and is holding any predicate locks,
3264  * it must be added to a list of completed serializable transactions still
3265  * holding locks.
3266  */
3267 void
3268 ReleasePredicateLocks(bool isCommit)
3269 {
3270         bool            needToClear;
3271         RWConflict      conflict,
3272                                 nextConflict,
3273                                 possibleUnsafeConflict;
3274         SERIALIZABLEXACT *roXact;
3275
3276         /*
3277          * We can't trust XactReadOnly here, because a transaction which started
3278          * as READ WRITE can show as READ ONLY later, e.g., within
3279          * subtransactions.  We want to flag a transaction as READ ONLY if it
3280          * commits without writing so that de facto READ ONLY transactions get the
3281          * benefit of some RO optimizations, so we will use this local variable to
3282          * get some cleanup logic right which is based on whether the transaction
3283          * was declared READ ONLY at the top level.
3284          */
3285         bool            topLevelIsDeclaredReadOnly;
3286
3287         if (MySerializableXact == InvalidSerializableXact)
3288         {
3289                 Assert(LocalPredicateLockHash == NULL);
3290                 return;
3291         }
3292
3293         LWLockAcquire(SerializableXactHashLock, LW_EXCLUSIVE);
3294
3295         Assert(!isCommit || SxactIsPrepared(MySerializableXact));
3296         Assert(!isCommit || !SxactIsDoomed(MySerializableXact));
3297         Assert(!SxactIsCommitted(MySerializableXact));
3298         Assert(!SxactIsRolledBack(MySerializableXact));
3299
3300         /* may not be serializable during COMMIT/ROLLBACK PREPARED */
3301         Assert(MySerializableXact->pid == 0 || IsolationIsSerializable());
3302
3303         /* We'd better not already be on the cleanup list. */
3304         Assert(!SxactIsOnFinishedList(MySerializableXact));
3305
3306         topLevelIsDeclaredReadOnly = SxactIsReadOnly(MySerializableXact);
3307
3308         /*
3309          * We don't hold XidGenLock lock here, assuming that TransactionId is
3310          * atomic!
3311          *
3312          * If this value is changing, we don't care that much whether we get the
3313          * old or new value -- it is just used to determine how far
3314          * GlobalSerializableXmin must advance before this transaction can be
3315          * fully cleaned up.  The worst that could happen is we wait for one more
3316          * transaction to complete before freeing some RAM; correctness of visible
3317          * behavior is not affected.
3318          */
3319         MySerializableXact->finishedBefore = ShmemVariableCache->nextXid;
3320
3321         /*
3322          * If it's not a commit it's a rollback, and we can clear our locks
3323          * immediately.
3324          */
3325         if (isCommit)
3326         {
3327                 MySerializableXact->flags |= SXACT_FLAG_COMMITTED;
3328                 MySerializableXact->commitSeqNo = ++(PredXact->LastSxactCommitSeqNo);
3329                 /* Recognize implicit read-only transaction (commit without write). */
3330                 if (!MyXactDidWrite)
3331                         MySerializableXact->flags |= SXACT_FLAG_READ_ONLY;
3332         }
3333         else
3334         {
3335                 /*
3336                  * The DOOMED flag indicates that we intend to roll back this
3337                  * transaction and so it should not cause serialization failures for
3338                  * other transactions that conflict with it. Note that this flag might
3339                  * already be set, if another backend marked this transaction for
3340                  * abort.
3341                  *
3342                  * The ROLLED_BACK flag further indicates that ReleasePredicateLocks
3343                  * has been called, and so the SerializableXact is eligible for
3344                  * cleanup. This means it should not be considered when calculating
3345                  * SxactGlobalXmin.
3346                  */
3347                 MySerializableXact->flags |= SXACT_FLAG_DOOMED;
3348                 MySerializableXact->flags |= SXACT_FLAG_ROLLED_BACK;
3349
3350                 /*
3351                  * If the transaction was previously prepared, but is now failing due
3352                  * to a ROLLBACK PREPARED or (hopefully very rare) error after the
3353                  * prepare, clear the prepared flag.  This simplifies conflict
3354                  * checking.
3355                  */
3356                 MySerializableXact->flags &= ~SXACT_FLAG_PREPARED;
3357         }
3358
3359         if (!topLevelIsDeclaredReadOnly)
3360         {
3361                 Assert(PredXact->WritableSxactCount > 0);
3362                 if (--(PredXact->WritableSxactCount) == 0)
3363                 {
3364                         /*
3365                          * Release predicate locks and rw-conflicts in for all committed
3366                          * transactions.  There are no longer any transactions which might
3367                          * conflict with the locks and no chance for new transactions to
3368                          * overlap.  Similarly, existing conflicts in can't cause pivots,
3369                          * and any conflicts in which could have completed a dangerous
3370                          * structure would already have caused a rollback, so any
3371                          * remaining ones must be benign.
3372                          */
3373                         PredXact->CanPartialClearThrough = PredXact->LastSxactCommitSeqNo;
3374                 }
3375         }
3376         else
3377         {
3378                 /*
3379                  * Read-only transactions: clear the list of transactions that might
3380                  * make us unsafe. Note that we use 'inLink' for the iteration as
3381                  * opposed to 'outLink' for the r/w xacts.
3382                  */
3383                 possibleUnsafeConflict = (RWConflict)
3384                         SHMQueueNext(&MySerializableXact->possibleUnsafeConflicts,
3385                                                  &MySerializableXact->possibleUnsafeConflicts,
3386                                                  offsetof(RWConflictData, inLink));
3387                 while (possibleUnsafeConflict)
3388                 {
3389                         nextConflict = (RWConflict)
3390                                 SHMQueueNext(&MySerializableXact->possibleUnsafeConflicts,
3391                                                          &possibleUnsafeConflict->inLink,
3392                                                          offsetof(RWConflictData, inLink));
3393
3394                         Assert(!SxactIsReadOnly(possibleUnsafeConflict->sxactOut));
3395                         Assert(MySerializableXact == possibleUnsafeConflict->sxactIn);
3396
3397                         ReleaseRWConflict(possibleUnsafeConflict);
3398
3399                         possibleUnsafeConflict = nextConflict;
3400                 }
3401         }
3402
3403         /* Check for conflict out to old committed transactions. */
3404         if (isCommit
3405                 && !SxactIsReadOnly(MySerializableXact)
3406                 && SxactHasSummaryConflictOut(MySerializableXact))
3407         {
3408                 /*
3409                  * we don't know which old committed transaction we conflicted with,
3410                  * so be conservative and use FirstNormalSerCommitSeqNo here
3411                  */
3412                 MySerializableXact->SeqNo.earliestOutConflictCommit =
3413                         FirstNormalSerCommitSeqNo;
3414                 MySerializableXact->flags |= SXACT_FLAG_CONFLICT_OUT;
3415         }
3416
3417         /*
3418          * Release all outConflicts to committed transactions.  If we're rolling
3419          * back clear them all.  Set SXACT_FLAG_CONFLICT_OUT if any point to
3420          * previously committed transactions.
3421          */
3422         conflict = (RWConflict)
3423                 SHMQueueNext(&MySerializableXact->outConflicts,
3424                                          &MySerializableXact->outConflicts,
3425                                          offsetof(RWConflictData, outLink));
3426         while (conflict)
3427         {
3428                 nextConflict = (RWConflict)
3429                         SHMQueueNext(&MySerializableXact->outConflicts,
3430                                                  &conflict->outLink,
3431                                                  offsetof(RWConflictData, outLink));
3432
3433                 if (isCommit
3434                         && !SxactIsReadOnly(MySerializableXact)
3435                         && SxactIsCommitted(conflict->sxactIn))
3436                 {
3437                         if ((MySerializableXact->flags & SXACT_FLAG_CONFLICT_OUT) == 0
3438                                 || conflict->sxactIn->prepareSeqNo < MySerializableXact->SeqNo.earliestOutConflictCommit)
3439                                 MySerializableXact->SeqNo.earliestOutConflictCommit = conflict->sxactIn->prepareSeqNo;
3440                         MySerializableXact->flags |= SXACT_FLAG_CONFLICT_OUT;
3441                 }
3442
3443                 if (!isCommit
3444                         || SxactIsCommitted(conflict->sxactIn)
3445                         || (conflict->sxactIn->SeqNo.lastCommitBeforeSnapshot >= PredXact->LastSxactCommitSeqNo))
3446                         ReleaseRWConflict(conflict);
3447
3448                 conflict = nextConflict;
3449         }
3450
3451         /*
3452          * Release all inConflicts from committed and read-only transactions. If
3453          * we're rolling back, clear them all.
3454          */
3455         conflict = (RWConflict)
3456                 SHMQueueNext(&MySerializableXact->inConflicts,
3457                                          &MySerializableXact->inConflicts,
3458                                          offsetof(RWConflictData, inLink));
3459         while (conflict)
3460         {
3461                 nextConflict = (RWConflict)
3462                         SHMQueueNext(&MySerializableXact->inConflicts,
3463                                                  &conflict->inLink,
3464                                                  offsetof(RWConflictData, inLink));
3465
3466                 if (!isCommit
3467                         || SxactIsCommitted(conflict->sxactOut)
3468                         || SxactIsReadOnly(conflict->sxactOut))
3469                         ReleaseRWConflict(conflict);
3470
3471                 conflict = nextConflict;
3472         }
3473
3474         if (!topLevelIsDeclaredReadOnly)
3475         {
3476                 /*
3477                  * Remove ourselves from the list of possible conflicts for concurrent
3478                  * READ ONLY transactions, flagging them as unsafe if we have a
3479                  * conflict out. If any are waiting DEFERRABLE transactions, wake them
3480                  * up if they are known safe or known unsafe.
3481                  */
3482                 possibleUnsafeConflict = (RWConflict)
3483                         SHMQueueNext(&MySerializableXact->possibleUnsafeConflicts,
3484                                                  &MySerializableXact->possibleUnsafeConflicts,
3485                                                  offsetof(RWConflictData, outLink));
3486                 while (possibleUnsafeConflict)
3487                 {
3488                         nextConflict = (RWConflict)
3489                                 SHMQueueNext(&MySerializableXact->possibleUnsafeConflicts,
3490                                                          &possibleUnsafeConflict->outLink,
3491                                                          offsetof(RWConflictData, outLink));
3492
3493                         roXact = possibleUnsafeConflict->sxactIn;
3494                         Assert(MySerializableXact == possibleUnsafeConflict->sxactOut);
3495                         Assert(SxactIsReadOnly(roXact));
3496
3497                         /* Mark conflicted if necessary. */
3498                         if (isCommit
3499                                 && MyXactDidWrite
3500                                 && SxactHasConflictOut(MySerializableXact)
3501                                 && (MySerializableXact->SeqNo.earliestOutConflictCommit
3502                                         <= roXact->SeqNo.lastCommitBeforeSnapshot))
3503                         {
3504                                 /*
3505                                  * This releases possibleUnsafeConflict (as well as all other
3506                                  * possible conflicts for roXact)
3507                                  */
3508                                 FlagSxactUnsafe(roXact);
3509                         }
3510                         else
3511                         {
3512                                 ReleaseRWConflict(possibleUnsafeConflict);
3513
3514                                 /*
3515                                  * If we were the last possible conflict, flag it safe. The
3516                                  * transaction can now safely release its predicate locks (but
3517                                  * that transaction's backend has to do that itself).
3518                                  */
3519                                 if (SHMQueueEmpty(&roXact->possibleUnsafeConflicts))
3520                                         roXact->flags |= SXACT_FLAG_RO_SAFE;
3521                         }
3522
3523                         /*
3524                          * Wake up the process for a waiting DEFERRABLE transaction if we
3525                          * now know it's either safe or conflicted.
3526                          */
3527                         if (SxactIsDeferrableWaiting(roXact) &&
3528                                 (SxactIsROUnsafe(roXact) || SxactIsROSafe(roXact)))
3529                                 ProcSendSignal(roXact->pid);
3530
3531                         possibleUnsafeConflict = nextConflict;
3532                 }
3533         }
3534
3535         /*
3536          * Check whether it's time to clean up old transactions. This can only be
3537          * done when the last serializable transaction with the oldest xmin among
3538          * serializable transactions completes.  We then find the "new oldest"
3539          * xmin and purge any transactions which finished before this transaction
3540          * was launched.
3541          */
3542         needToClear = false;
3543         if (TransactionIdEquals(MySerializableXact->xmin, PredXact->SxactGlobalXmin))
3544         {
3545                 Assert(PredXact->SxactGlobalXminCount > 0);
3546                 if (--(PredXact->SxactGlobalXminCount) == 0)
3547                 {
3548                         SetNewSxactGlobalXmin();
3549                         needToClear = true;
3550                 }
3551         }
3552
3553         LWLockRelease(SerializableXactHashLock);
3554
3555         LWLockAcquire(SerializableFinishedListLock, LW_EXCLUSIVE);
3556
3557         /* Add this to the list of transactions to check for later cleanup. */
3558         if (isCommit)
3559                 SHMQueueInsertBefore(FinishedSerializableTransactions,
3560                                                          &MySerializableXact->finishedLink);
3561
3562         if (!isCommit)
3563                 ReleaseOneSerializableXact(MySerializableXact, false, false);
3564
3565         LWLockRelease(SerializableFinishedListLock);
3566
3567         if (needToClear)
3568                 ClearOldPredicateLocks();
3569
3570         MySerializableXact = InvalidSerializableXact;
3571         MyXactDidWrite = false;
3572
3573         /* Delete per-transaction lock table */
3574         if (LocalPredicateLockHash != NULL)
3575         {
3576                 hash_destroy(LocalPredicateLockHash);
3577                 LocalPredicateLockHash = NULL;
3578         }
3579 }
3580
3581 /*
3582  * Clear old predicate locks, belonging to committed transactions that are no
3583  * longer interesting to any in-progress transaction.
3584  */
3585 static void
3586 ClearOldPredicateLocks(void)
3587 {
3588         SERIALIZABLEXACT *finishedSxact;
3589         PREDICATELOCK *predlock;
3590
3591         /*
3592          * Loop through finished transactions. They are in commit order, so we can
3593          * stop as soon as we find one that's still interesting.
3594          */
3595         LWLockAcquire(SerializableFinishedListLock, LW_EXCLUSIVE);
3596         finishedSxact = (SERIALIZABLEXACT *)
3597                 SHMQueueNext(FinishedSerializableTransactions,
3598                                          FinishedSerializableTransactions,
3599                                          offsetof(SERIALIZABLEXACT, finishedLink));
3600         LWLockAcquire(SerializableXactHashLock, LW_SHARED);
3601         while (finishedSxact)
3602         {
3603                 SERIALIZABLEXACT *nextSxact;
3604
3605                 nextSxact = (SERIALIZABLEXACT *)
3606                         SHMQueueNext(FinishedSerializableTransactions,
3607                                                  &(finishedSxact->finishedLink),
3608                                                  offsetof(SERIALIZABLEXACT, finishedLink));
3609                 if (!TransactionIdIsValid(PredXact->SxactGlobalXmin)
3610                         || TransactionIdPrecedesOrEquals(finishedSxact->finishedBefore,
3611                                                                                          PredXact->SxactGlobalXmin))
3612                 {
3613                         /*
3614                          * This transaction committed before any in-progress transaction
3615                          * took its snapshot. It's no longer interesting.
3616                          */
3617                         LWLockRelease(SerializableXactHashLock);
3618                         SHMQueueDelete(&(finishedSxact->finishedLink));
3619                         ReleaseOneSerializableXact(finishedSxact, false, false);
3620                         LWLockAcquire(SerializableXactHashLock, LW_SHARED);
3621                 }
3622                 else if (finishedSxact->commitSeqNo > PredXact->HavePartialClearedThrough
3623                                  && finishedSxact->commitSeqNo <= PredXact->CanPartialClearThrough)
3624                 {
3625                         /*
3626                          * Any active transactions that took their snapshot before this
3627                          * transaction committed are read-only, so we can clear part of
3628                          * its state.
3629                          */
3630                         LWLockRelease(SerializableXactHashLock);
3631
3632                         if (SxactIsReadOnly(finishedSxact))
3633                         {
3634                                 /* A read-only transaction can be removed entirely */
3635                                 SHMQueueDelete(&(finishedSxact->finishedLink));
3636                                 ReleaseOneSerializableXact(finishedSxact, false, false);
3637                         }
3638                         else
3639                         {
3640                                 /*
3641                                  * A read-write transaction can only be partially cleared. We
3642                                  * need to keep the SERIALIZABLEXACT but can release the
3643                                  * SIREAD locks and conflicts in.
3644                                  */
3645                                 ReleaseOneSerializableXact(finishedSxact, true, false);
3646                         }
3647
3648                         PredXact->HavePartialClearedThrough = finishedSxact->commitSeqNo;
3649                         LWLockAcquire(SerializableXactHashLock, LW_SHARED);
3650                 }
3651                 else
3652                 {
3653                         /* Still interesting. */
3654                         break;
3655                 }
3656                 finishedSxact = nextSxact;
3657         }
3658         LWLockRelease(SerializableXactHashLock);
3659
3660         /*
3661          * Loop through predicate locks on dummy transaction for summarized data.
3662          */
3663         LWLockAcquire(SerializablePredicateLockListLock, LW_SHARED);
3664         predlock = (PREDICATELOCK *)
3665                 SHMQueueNext(&OldCommittedSxact->predicateLocks,
3666                                          &OldCommittedSxact->predicateLocks,
3667                                          offsetof(PREDICATELOCK, xactLink));
3668         while (predlock)
3669         {
3670                 PREDICATELOCK *nextpredlock;
3671                 bool            canDoPartialCleanup;
3672
3673                 nextpredlock = (PREDICATELOCK *)
3674                         SHMQueueNext(&OldCommittedSxact->predicateLocks,
3675                                                  &predlock->xactLink,
3676                                                  offsetof(PREDICATELOCK, xactLink));
3677
3678                 LWLockAcquire(SerializableXactHashLock, LW_SHARED);
3679                 Assert(predlock->commitSeqNo != 0);
3680                 Assert(predlock->commitSeqNo != InvalidSerCommitSeqNo);
3681                 canDoPartialCleanup = (predlock->commitSeqNo <= PredXact->CanPartialClearThrough);
3682                 LWLockRelease(SerializableXactHashLock);
3683
3684                 /*
3685                  * If this lock originally belonged to an old enough transaction, we
3686                  * can release it.
3687                  */
3688                 if (canDoPartialCleanup)
3689                 {
3690                         PREDICATELOCKTAG tag;
3691                         PREDICATELOCKTARGET *target;
3692                         PREDICATELOCKTARGETTAG targettag;
3693                         uint32          targettaghash;
3694                         LWLock     *partitionLock;
3695
3696                         tag = predlock->tag;
3697                         target = tag.myTarget;
3698                         targettag = target->tag;
3699                         targettaghash = PredicateLockTargetTagHashCode(&targettag);
3700                         partitionLock = PredicateLockHashPartitionLock(targettaghash);
3701
3702                         LWLockAcquire(partitionLock, LW_EXCLUSIVE);
3703
3704                         SHMQueueDelete(&(predlock->targetLink));
3705                         SHMQueueDelete(&(predlock->xactLink));
3706
3707                         hash_search_with_hash_value(PredicateLockHash, &tag,
3708                                                                                 PredicateLockHashCodeFromTargetHashCode(&tag,
3709                                                                                                                                                                 targettaghash),
3710                                                                                 HASH_REMOVE, NULL);
3711                         RemoveTargetIfNoLongerUsed(target, targettaghash);
3712
3713                         LWLockRelease(partitionLock);
3714                 }
3715
3716                 predlock = nextpredlock;
3717         }
3718
3719         LWLockRelease(SerializablePredicateLockListLock);
3720         LWLockRelease(SerializableFinishedListLock);
3721 }
3722
3723 /*
3724  * This is the normal way to delete anything from any of the predicate
3725  * locking hash tables.  Given a transaction which we know can be deleted:
3726  * delete all predicate locks held by that transaction and any predicate
3727  * lock targets which are now unreferenced by a lock; delete all conflicts
3728  * for the transaction; delete all xid values for the transaction; then
3729  * delete the transaction.
3730  *
3731  * When the partial flag is set, we can release all predicate locks and
3732  * in-conflict information -- we've established that there are no longer
3733  * any overlapping read write transactions for which this transaction could
3734  * matter -- but keep the transaction entry itself and any outConflicts.
3735  *
3736  * When the summarize flag is set, we've run short of room for sxact data
3737  * and must summarize to the SLRU.  Predicate locks are transferred to a
3738  * dummy "old" transaction, with duplicate locks on a single target
3739  * collapsing to a single lock with the "latest" commitSeqNo from among
3740  * the conflicting locks..
3741  */
3742 static void
3743 ReleaseOneSerializableXact(SERIALIZABLEXACT *sxact, bool partial,
3744                                                    bool summarize)
3745 {
3746         PREDICATELOCK *predlock;
3747         SERIALIZABLEXIDTAG sxidtag;
3748         RWConflict      conflict,
3749                                 nextConflict;
3750
3751         Assert(sxact != NULL);
3752         Assert(SxactIsRolledBack(sxact) || SxactIsCommitted(sxact));
3753         Assert(partial || !SxactIsOnFinishedList(sxact));
3754         Assert(LWLockHeldByMe(SerializableFinishedListLock));
3755
3756         /*
3757          * First release all the predicate locks held by this xact (or transfer
3758          * them to OldCommittedSxact if summarize is true)
3759          */
3760         LWLockAcquire(SerializablePredicateLockListLock, LW_SHARED);
3761         predlock = (PREDICATELOCK *)
3762                 SHMQueueNext(&(sxact->predicateLocks),
3763                                          &(sxact->predicateLocks),
3764                                          offsetof(PREDICATELOCK, xactLink));
3765         while (predlock)
3766         {
3767                 PREDICATELOCK *nextpredlock;
3768                 PREDICATELOCKTAG tag;
3769                 SHM_QUEUE  *targetLink;
3770                 PREDICATELOCKTARGET *target;
3771                 PREDICATELOCKTARGETTAG targettag;
3772                 uint32          targettaghash;
3773                 LWLock     *partitionLock;
3774
3775                 nextpredlock = (PREDICATELOCK *)
3776                         SHMQueueNext(&(sxact->predicateLocks),
3777                                                  &(predlock->xactLink),
3778                                                  offsetof(PREDICATELOCK, xactLink));
3779
3780                 tag = predlock->tag;
3781                 targetLink = &(predlock->targetLink);
3782                 target = tag.myTarget;
3783                 targettag = target->tag;
3784                 targettaghash = PredicateLockTargetTagHashCode(&targettag);
3785                 partitionLock = PredicateLockHashPartitionLock(targettaghash);
3786
3787                 LWLockAcquire(partitionLock, LW_EXCLUSIVE);
3788
3789                 SHMQueueDelete(targetLink);
3790
3791                 hash_search_with_hash_value(PredicateLockHash, &tag,
3792                                                                         PredicateLockHashCodeFromTargetHashCode(&tag,
3793                                                                                                                                                         targettaghash),
3794                                                                         HASH_REMOVE, NULL);
3795                 if (summarize)
3796                 {
3797                         bool            found;
3798
3799                         /* Fold into dummy transaction list. */
3800                         tag.myXact = OldCommittedSxact;
3801                         predlock = hash_search_with_hash_value(PredicateLockHash, &tag,
3802                                                                                                    PredicateLockHashCodeFromTargetHashCode(&tag,
3803                                                                                                                                                                                    targettaghash),
3804                                                                                                    HASH_ENTER_NULL, &found);
3805                         if (!predlock)
3806                                 ereport(ERROR,
3807                                                 (errcode(ERRCODE_OUT_OF_MEMORY),
3808                                                  errmsg("out of shared memory"),
3809                                                  errhint("You might need to increase max_pred_locks_per_transaction.")));
3810                         if (found)
3811                         {
3812                                 Assert(predlock->commitSeqNo != 0);
3813                                 Assert(predlock->commitSeqNo != InvalidSerCommitSeqNo);
3814                                 if (predlock->commitSeqNo < sxact->commitSeqNo)
3815                                         predlock->commitSeqNo = sxact->commitSeqNo;
3816                         }
3817                         else
3818                         {
3819                                 SHMQueueInsertBefore(&(target->predicateLocks),
3820                                                                          &(predlock->targetLink));
3821                                 SHMQueueInsertBefore(&(OldCommittedSxact->predicateLocks),
3822                                                                          &(predlock->xactLink));
3823                                 predlock->commitSeqNo = sxact->commitSeqNo;
3824                         }
3825                 }
3826                 else
3827                         RemoveTargetIfNoLongerUsed(target, targettaghash);
3828
3829                 LWLockRelease(partitionLock);
3830
3831                 predlock = nextpredlock;
3832         }
3833
3834         /*
3835          * Rather than retail removal, just re-init the head after we've run
3836          * through the list.
3837          */
3838         SHMQueueInit(&sxact->predicateLocks);
3839
3840         LWLockRelease(SerializablePredicateLockListLock);
3841
3842         sxidtag.xid = sxact->topXid;
3843         LWLockAcquire(SerializableXactHashLock, LW_EXCLUSIVE);
3844
3845         /* Release all outConflicts (unless 'partial' is true) */
3846         if (!partial)
3847         {
3848                 conflict = (RWConflict)
3849                         SHMQueueNext(&sxact->outConflicts,
3850                                                  &sxact->outConflicts,
3851                                                  offsetof(RWConflictData, outLink));
3852                 while (conflict)
3853                 {
3854                         nextConflict = (RWConflict)
3855                                 SHMQueueNext(&sxact->outConflicts,
3856                                                          &conflict->outLink,
3857                                                          offsetof(RWConflictData, outLink));
3858                         if (summarize)
3859                                 conflict->sxactIn->flags |= SXACT_FLAG_SUMMARY_CONFLICT_IN;
3860                         ReleaseRWConflict(conflict);
3861                         conflict = nextConflict;
3862                 }
3863         }
3864
3865         /* Release all inConflicts. */
3866         conflict = (RWConflict)
3867                 SHMQueueNext(&sxact->inConflicts,
3868                                          &sxact->inConflicts,
3869                                          offsetof(RWConflictData, inLink));
3870         while (conflict)
3871         {
3872                 nextConflict = (RWConflict)
3873                         SHMQueueNext(&sxact->inConflicts,
3874                                                  &conflict->inLink,
3875                                                  offsetof(RWConflictData, inLink));
3876                 if (summarize)
3877                         conflict->sxactOut->flags |= SXACT_FLAG_SUMMARY_CONFLICT_OUT;
3878                 ReleaseRWConflict(conflict);
3879                 conflict = nextConflict;
3880         }
3881
3882         /* Finally, get rid of the xid and the record of the transaction itself. */
3883         if (!partial)
3884         {
3885                 if (sxidtag.xid != InvalidTransactionId)
3886                         hash_search(SerializableXidHash, &sxidtag, HASH_REMOVE, NULL);
3887                 ReleasePredXact(sxact);
3888         }
3889
3890         LWLockRelease(SerializableXactHashLock);
3891 }
3892
3893 /*
3894  * Tests whether the given top level transaction is concurrent with
3895  * (overlaps) our current transaction.
3896  *
3897  * We need to identify the top level transaction for SSI, anyway, so pass
3898  * that to this function to save the overhead of checking the snapshot's
3899  * subxip array.
3900  */
3901 static bool
3902 XidIsConcurrent(TransactionId xid)
3903 {
3904         Snapshot        snap;
3905         uint32          i;
3906
3907         Assert(TransactionIdIsValid(xid));
3908         Assert(!TransactionIdEquals(xid, GetTopTransactionIdIfAny()));
3909
3910         snap = GetTransactionSnapshot();
3911
3912         if (TransactionIdPrecedes(xid, snap->xmin))
3913                 return false;
3914
3915         if (TransactionIdFollowsOrEquals(xid, snap->xmax))
3916                 return true;
3917
3918         for (i = 0; i < snap->xcnt; i++)
3919         {
3920                 if (xid == snap->xip[i])
3921                         return true;
3922         }
3923
3924         return false;
3925 }
3926
3927 /*
3928  * CheckForSerializableConflictOut
3929  *              We are reading a tuple which has been modified.  If it is visible to
3930  *              us but has been deleted, that indicates a rw-conflict out.  If it's
3931  *              not visible and was created by a concurrent (overlapping)
3932  *              serializable transaction, that is also a rw-conflict out,
3933  *
3934  * We will determine the top level xid of the writing transaction with which
3935  * we may be in conflict, and check for overlap with our own transaction.
3936  * If the transactions overlap (i.e., they cannot see each other's writes),
3937  * then we have a conflict out.
3938  *
3939  * This function should be called just about anywhere in heapam.c where a
3940  * tuple has been read. The caller must hold at least a shared lock on the
3941  * buffer, because this function might set hint bits on the tuple. There is
3942  * currently no known reason to call this function from an index AM.
3943  */
3944 void
3945 CheckForSerializableConflictOut(bool visible, Relation relation,
3946                                                                 HeapTuple tuple, Buffer buffer,
3947                                                                 Snapshot snapshot)
3948 {
3949         TransactionId xid;
3950         SERIALIZABLEXIDTAG sxidtag;
3951         SERIALIZABLEXID *sxid;
3952         SERIALIZABLEXACT *sxact;
3953         HTSV_Result htsvResult;
3954
3955         if (!SerializationNeededForRead(relation, snapshot))
3956                 return;
3957
3958         /* Check if someone else has already decided that we need to die */
3959         if (SxactIsDoomed(MySerializableXact))
3960         {
3961                 ereport(ERROR,
3962                                 (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE),
3963                                  errmsg("could not serialize access due to read/write dependencies among transactions"),
3964                                  errdetail_internal("Reason code: Canceled on identification as a pivot, during conflict out checking."),
3965                                  errhint("The transaction might succeed if retried.")));
3966         }
3967
3968         /*
3969          * Check to see whether the tuple has been written to by a concurrent
3970          * transaction, either to create it not visible to us, or to delete it
3971          * while it is visible to us.  The "visible" bool indicates whether the
3972          * tuple is visible to us, while HeapTupleSatisfiesVacuum checks what else
3973          * is going on with it.
3974          */
3975         htsvResult = HeapTupleSatisfiesVacuum(tuple, TransactionXmin, buffer);
3976         switch (htsvResult)
3977         {
3978                 case HEAPTUPLE_LIVE:
3979                         if (visible)
3980                                 return;
3981                         xid = HeapTupleHeaderGetXmin(tuple->t_data);
3982                         break;
3983                 case HEAPTUPLE_RECENTLY_DEAD:
3984                         if (!visible)
3985                                 return;
3986                         xid = HeapTupleHeaderGetUpdateXid(tuple->t_data);
3987                         break;
3988                 case HEAPTUPLE_DELETE_IN_PROGRESS:
3989                         xid = HeapTupleHeaderGetUpdateXid(tuple->t_data);
3990                         break;
3991                 case HEAPTUPLE_INSERT_IN_PROGRESS:
3992                         xid = HeapTupleHeaderGetXmin(tuple->t_data);
3993                         break;
3994                 case HEAPTUPLE_DEAD:
3995                         return;
3996                 default:
3997
3998                         /*
3999                          * The only way to get to this default clause is if a new value is
4000                          * added to the enum type without adding it to this switch
4001                          * statement.  That's a bug, so elog.
4002                          */
4003                         elog(ERROR, "unrecognized return value from HeapTupleSatisfiesVacuum: %u", htsvResult);
4004
4005                         /*
4006                          * In spite of having all enum values covered and calling elog on
4007                          * this default, some compilers think this is a code path which
4008                          * allows xid to be used below without initialization. Silence
4009                          * that warning.
4010                          */
4011                         xid = InvalidTransactionId;
4012         }
4013         Assert(TransactionIdIsValid(xid));
4014         Assert(TransactionIdFollowsOrEquals(xid, TransactionXmin));
4015
4016         /*
4017          * Find top level xid.  Bail out if xid is too early to be a conflict, or
4018          * if it's our own xid.
4019          */
4020         if (TransactionIdEquals(xid, GetTopTransactionIdIfAny()))
4021                 return;
4022         xid = SubTransGetTopmostTransaction(xid);
4023         if (TransactionIdPrecedes(xid, TransactionXmin))
4024                 return;
4025         if (TransactionIdEquals(xid, GetTopTransactionIdIfAny()))
4026                 return;
4027
4028         /*
4029          * Find sxact or summarized info for the top level xid.
4030          */
4031         sxidtag.xid = xid;
4032         LWLockAcquire(SerializableXactHashLock, LW_EXCLUSIVE);
4033         sxid = (SERIALIZABLEXID *)
4034                 hash_search(SerializableXidHash, &sxidtag, HASH_FIND, NULL);
4035         if (!sxid)
4036         {
4037                 /*
4038                  * Transaction not found in "normal" SSI structures.  Check whether it
4039                  * got pushed out to SLRU storage for "old committed" transactions.
4040                  */
4041                 SerCommitSeqNo conflictCommitSeqNo;
4042
4043                 conflictCommitSeqNo = OldSerXidGetMinConflictCommitSeqNo(xid);
4044                 if (conflictCommitSeqNo != 0)
4045                 {
4046                         if (conflictCommitSeqNo != InvalidSerCommitSeqNo
4047                                 && (!SxactIsReadOnly(MySerializableXact)
4048                                         || conflictCommitSeqNo
4049                                         <= MySerializableXact->SeqNo.lastCommitBeforeSnapshot))
4050                                 ereport(ERROR,
4051                                                 (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE),
4052                                                  errmsg("could not serialize access due to read/write dependencies among transactions"),
4053                                                  errdetail_internal("Reason code: Canceled on conflict out to old pivot %u.", xid),
4054                                                  errhint("The transaction might succeed if retried.")));
4055
4056                         if (SxactHasSummaryConflictIn(MySerializableXact)
4057                                 || !SHMQueueEmpty(&MySerializableXact->inConflicts))
4058                                 ereport(ERROR,
4059                                                 (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE),
4060                                                  errmsg("could not serialize access due to read/write dependencies among transactions"),
4061                                                  errdetail_internal("Reason code: Canceled on identification as a pivot, with conflict out to old committed transaction %u.", xid),
4062                                                  errhint("The transaction might succeed if retried.")));
4063
4064                         MySerializableXact->flags |= SXACT_FLAG_SUMMARY_CONFLICT_OUT;
4065                 }
4066
4067                 /* It's not serializable or otherwise not important. */
4068                 LWLockRelease(SerializableXactHashLock);
4069                 return;
4070         }
4071         sxact = sxid->myXact;
4072         Assert(TransactionIdEquals(sxact->topXid, xid));
4073         if (sxact == MySerializableXact || SxactIsDoomed(sxact))
4074         {
4075                 /* Can't conflict with ourself or a transaction that will roll back. */
4076                 LWLockRelease(SerializableXactHashLock);
4077                 return;
4078         }
4079
4080         /*
4081          * We have a conflict out to a transaction which has a conflict out to a
4082          * summarized transaction.  That summarized transaction must have
4083          * committed first, and we can't tell when it committed in relation to our
4084          * snapshot acquisition, so something needs to be canceled.
4085          */
4086         if (SxactHasSummaryConflictOut(sxact))
4087         {
4088                 if (!SxactIsPrepared(sxact))
4089                 {
4090                         sxact->flags |= SXACT_FLAG_DOOMED;
4091                         LWLockRelease(SerializableXactHashLock);
4092                         return;
4093                 }
4094                 else
4095                 {
4096                         LWLockRelease(SerializableXactHashLock);
4097                         ereport(ERROR,
4098                                         (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE),
4099                                          errmsg("could not serialize access due to read/write dependencies among transactions"),
4100                                          errdetail_internal("Reason code: Canceled on conflict out to old pivot."),
4101                                          errhint("The transaction might succeed if retried.")));
4102                 }
4103         }
4104
4105         /*
4106          * If this is a read-only transaction and the writing transaction has
4107          * committed, and it doesn't have a rw-conflict to a transaction which
4108          * committed before it, no conflict.
4109          */
4110         if (SxactIsReadOnly(MySerializableXact)
4111                 && SxactIsCommitted(sxact)
4112                 && !SxactHasSummaryConflictOut(sxact)
4113                 && (!SxactHasConflictOut(sxact)
4114                         || MySerializableXact->SeqNo.lastCommitBeforeSnapshot < sxact->SeqNo.earliestOutConflictCommit))
4115         {
4116                 /* Read-only transaction will appear to run first.  No conflict. */
4117                 LWLockRelease(SerializableXactHashLock);
4118                 return;
4119         }
4120
4121         if (!XidIsConcurrent(xid))
4122         {
4123                 /* This write was already in our snapshot; no conflict. */
4124                 LWLockRelease(SerializableXactHashLock);
4125                 return;
4126         }
4127
4128         if (RWConflictExists(MySerializableXact, sxact))
4129         {
4130                 /* We don't want duplicate conflict records in the list. */
4131                 LWLockRelease(SerializableXactHashLock);
4132                 return;
4133         }
4134
4135         /*
4136          * Flag the conflict.  But first, if this conflict creates a dangerous
4137          * structure, ereport an error.
4138          */
4139         FlagRWConflict(MySerializableXact, sxact);
4140         LWLockRelease(SerializableXactHashLock);
4141 }
4142
4143 /*
4144  * Check a particular target for rw-dependency conflict in. A subroutine of
4145  * CheckForSerializableConflictIn().
4146  */
4147 static void
4148 CheckTargetForConflictsIn(PREDICATELOCKTARGETTAG *targettag)
4149 {
4150         uint32          targettaghash;
4151         LWLock     *partitionLock;
4152         PREDICATELOCKTARGET *target;
4153         PREDICATELOCK *predlock;
4154         PREDICATELOCK *mypredlock = NULL;
4155         PREDICATELOCKTAG mypredlocktag;
4156
4157         Assert(MySerializableXact != InvalidSerializableXact);
4158
4159         /*
4160          * The same hash and LW lock apply to the lock target and the lock itself.
4161          */
4162         targettaghash = PredicateLockTargetTagHashCode(targettag);
4163         partitionLock = PredicateLockHashPartitionLock(targettaghash);
4164         LWLockAcquire(partitionLock, LW_SHARED);
4165         target = (PREDICATELOCKTARGET *)
4166                 hash_search_with_hash_value(PredicateLockTargetHash,
4167                                                                         targettag, targettaghash,
4168                                                                         HASH_FIND, NULL);
4169         if (!target)
4170         {
4171                 /* Nothing has this target locked; we're done here. */
4172                 LWLockRelease(partitionLock);
4173                 return;
4174         }
4175
4176         /*
4177          * Each lock for an overlapping transaction represents a conflict: a
4178          * rw-dependency in to this transaction.
4179          */
4180         predlock = (PREDICATELOCK *)
4181                 SHMQueueNext(&(target->predicateLocks),
4182                                          &(target->predicateLocks),
4183                                          offsetof(PREDICATELOCK, targetLink));
4184         LWLockAcquire(SerializableXactHashLock, LW_SHARED);
4185         while (predlock)
4186         {
4187                 SHM_QUEUE  *predlocktargetlink;
4188                 PREDICATELOCK *nextpredlock;
4189                 SERIALIZABLEXACT *sxact;
4190
4191                 predlocktargetlink = &(predlock->targetLink);
4192                 nextpredlock = (PREDICATELOCK *)
4193                         SHMQueueNext(&(target->predicateLocks),
4194                                                  predlocktargetlink,
4195                                                  offsetof(PREDICATELOCK, targetLink));
4196
4197                 sxact = predlock->tag.myXact;
4198                 if (sxact == MySerializableXact)
4199                 {
4200                         /*
4201                          * If we're getting a write lock on a tuple, we don't need a
4202                          * predicate (SIREAD) lock on the same tuple. We can safely remove
4203                          * our SIREAD lock, but we'll defer doing so until after the loop
4204                          * because that requires upgrading to an exclusive partition lock.
4205                          *
4206                          * We can't use this optimization within a subtransaction because
4207                          * the subtransaction could roll back, and we would be left
4208                          * without any lock at the top level.
4209                          */
4210                         if (!IsSubTransaction()
4211                                 && GET_PREDICATELOCKTARGETTAG_OFFSET(*targettag))
4212                         {
4213                                 mypredlock = predlock;
4214                                 mypredlocktag = predlock->tag;
4215                         }
4216                 }
4217                 else if (!SxactIsDoomed(sxact)
4218                                  && (!SxactIsCommitted(sxact)
4219                                          || TransactionIdPrecedes(GetTransactionSnapshot()->xmin,
4220                                                                                           sxact->finishedBefore))
4221                                  && !RWConflictExists(sxact, MySerializableXact))
4222                 {
4223                         LWLockRelease(SerializableXactHashLock);
4224                         LWLockAcquire(SerializableXactHashLock, LW_EXCLUSIVE);
4225
4226                         /*
4227                          * Re-check after getting exclusive lock because the other
4228                          * transaction may have flagged a conflict.
4229                          */
4230                         if (!SxactIsDoomed(sxact)
4231                                 && (!SxactIsCommitted(sxact)
4232                                         || TransactionIdPrecedes(GetTransactionSnapshot()->xmin,
4233                                                                                          sxact->finishedBefore))
4234                                 && !RWConflictExists(sxact, MySerializableXact))
4235                         {
4236                                 FlagRWConflict(sxact, MySerializableXact);
4237                         }
4238
4239                         LWLockRelease(SerializableXactHashLock);
4240                         LWLockAcquire(SerializableXactHashLock, LW_SHARED);
4241                 }
4242
4243                 predlock = nextpredlock;
4244         }
4245         LWLockRelease(SerializableXactHashLock);
4246         LWLockRelease(partitionLock);
4247
4248         /*
4249          * If we found one of our own SIREAD locks to remove, remove it now.
4250          *
4251          * At this point our transaction already has an ExclusiveRowLock on the
4252          * relation, so we are OK to drop the predicate lock on the tuple, if
4253          * found, without fearing that another write against the tuple will occur
4254          * before the MVCC information makes it to the buffer.
4255          */
4256         if (mypredlock != NULL)
4257         {
4258                 uint32          predlockhashcode;
4259                 PREDICATELOCK *rmpredlock;
4260
4261                 LWLockAcquire(SerializablePredicateLockListLock, LW_SHARED);
4262                 LWLockAcquire(partitionLock, LW_EXCLUSIVE);
4263                 LWLockAcquire(SerializableXactHashLock, LW_EXCLUSIVE);
4264
4265                 /*
4266                  * Remove the predicate lock from shared memory, if it wasn't removed
4267                  * while the locks were released.  One way that could happen is from
4268                  * autovacuum cleaning up an index.
4269                  */
4270                 predlockhashcode = PredicateLockHashCodeFromTargetHashCode
4271                         (&mypredlocktag, targettaghash);
4272                 rmpredlock = (PREDICATELOCK *)
4273                         hash_search_with_hash_value(PredicateLockHash,
4274                                                                                 &mypredlocktag,
4275                                                                                 predlockhashcode,
4276                                                                                 HASH_FIND, NULL);
4277                 if (rmpredlock != NULL)
4278                 {
4279                         Assert(rmpredlock == mypredlock);
4280
4281                         SHMQueueDelete(&(mypredlock->targetLink));
4282                         SHMQueueDelete(&(mypredlock->xactLink));
4283
4284                         rmpredlock = (PREDICATELOCK *)
4285                                 hash_search_with_hash_value(PredicateLockHash,
4286                                                                                         &mypredlocktag,
4287                                                                                         predlockhashcode,
4288                                                                                         HASH_REMOVE, NULL);
4289                         Assert(rmpredlock == mypredlock);
4290
4291                         RemoveTargetIfNoLongerUsed(target, targettaghash);
4292                 }
4293
4294                 LWLockRelease(SerializableXactHashLock);
4295                 LWLockRelease(partitionLock);
4296                 LWLockRelease(SerializablePredicateLockListLock);
4297
4298                 if (rmpredlock != NULL)
4299                 {
4300                         /*
4301                          * Remove entry in local lock table if it exists. It's OK if it
4302                          * doesn't exist; that means the lock was transferred to a new
4303                          * target by a different backend.
4304                          */
4305                         hash_search_with_hash_value(LocalPredicateLockHash,
4306                                                                                 targettag, targettaghash,
4307                                                                                 HASH_REMOVE, NULL);
4308
4309                         DecrementParentLocks(targettag);
4310                 }
4311         }
4312 }
4313
4314 /*
4315  * CheckForSerializableConflictIn
4316  *              We are writing the given tuple.  If that indicates a rw-conflict
4317  *              in from another serializable transaction, take appropriate action.
4318  *
4319  * Skip checking for any granularity for which a parameter is missing.
4320  *
4321  * A tuple update or delete is in conflict if we have a predicate lock
4322  * against the relation or page in which the tuple exists, or against the
4323  * tuple itself.
4324  */
4325 void
4326 CheckForSerializableConflictIn(Relation relation, HeapTuple tuple,
4327                                                            Buffer buffer)
4328 {
4329         PREDICATELOCKTARGETTAG targettag;
4330
4331         if (!SerializationNeededForWrite(relation))
4332                 return;
4333
4334         /* Check if someone else has already decided that we need to die */
4335         if (SxactIsDoomed(MySerializableXact))
4336                 ereport(ERROR,
4337                                 (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE),
4338                                  errmsg("could not serialize access due to read/write dependencies among transactions"),
4339                                  errdetail_internal("Reason code: Canceled on identification as a pivot, during conflict in checking."),
4340                                  errhint("The transaction might succeed if retried.")));
4341
4342         /*
4343          * We're doing a write which might cause rw-conflicts now or later.
4344          * Memorize that fact.
4345          */
4346         MyXactDidWrite = true;
4347
4348         /*
4349          * It is important that we check for locks from the finest granularity to
4350          * the coarsest granularity, so that granularity promotion doesn't cause
4351          * us to miss a lock.  The new (coarser) lock will be acquired before the
4352          * old (finer) locks are released.
4353          *
4354          * It is not possible to take and hold a lock across the checks for all
4355          * granularities because each target could be in a separate partition.
4356          */
4357         if (tuple != NULL)
4358         {
4359                 SET_PREDICATELOCKTARGETTAG_TUPLE(targettag,
4360                                                                                  relation->rd_node.dbNode,
4361                                                                                  relation->rd_id,
4362                                                                                  ItemPointerGetBlockNumber(&(tuple->t_self)),
4363                                                                                  ItemPointerGetOffsetNumber(&(tuple->t_self)));
4364                 CheckTargetForConflictsIn(&targettag);
4365         }
4366
4367         if (BufferIsValid(buffer))
4368         {
4369                 SET_PREDICATELOCKTARGETTAG_PAGE(targettag,
4370                                                                                 relation->rd_node.dbNode,
4371                                                                                 relation->rd_id,
4372                                                                                 BufferGetBlockNumber(buffer));
4373                 CheckTargetForConflictsIn(&targettag);
4374         }
4375
4376         SET_PREDICATELOCKTARGETTAG_RELATION(targettag,
4377                                                                                 relation->rd_node.dbNode,
4378                                                                                 relation->rd_id);
4379         CheckTargetForConflictsIn(&targettag);
4380 }
4381
4382 /*
4383  * CheckTableForSerializableConflictIn
4384  *              The entire table is going through a DDL-style logical mass delete
4385  *              like TRUNCATE or DROP TABLE.  If that causes a rw-conflict in from
4386  *              another serializable transaction, take appropriate action.
4387  *
4388  * While these operations do not operate entirely within the bounds of
4389  * snapshot isolation, they can occur inside a serializable transaction, and
4390  * will logically occur after any reads which saw rows which were destroyed
4391  * by these operations, so we do what we can to serialize properly under
4392  * SSI.
4393  *
4394  * The relation passed in must be a heap relation. Any predicate lock of any
4395  * granularity on the heap will cause a rw-conflict in to this transaction.
4396  * Predicate locks on indexes do not matter because they only exist to guard
4397  * against conflicting inserts into the index, and this is a mass *delete*.
4398  * When a table is truncated or dropped, the index will also be truncated
4399  * or dropped, and we'll deal with locks on the index when that happens.
4400  *
4401  * Dropping or truncating a table also needs to drop any existing predicate
4402  * locks on heap tuples or pages, because they're about to go away. This
4403  * should be done before altering the predicate locks because the transaction
4404  * could be rolled back because of a conflict, in which case the lock changes
4405  * are not needed. (At the moment, we don't actually bother to drop the
4406  * existing locks on a dropped or truncated table at the moment. That might
4407  * lead to some false positives, but it doesn't seem worth the trouble.)
4408  */
4409 void
4410 CheckTableForSerializableConflictIn(Relation relation)
4411 {
4412         HASH_SEQ_STATUS seqstat;
4413         PREDICATELOCKTARGET *target;
4414         Oid                     dbId;
4415         Oid                     heapId;
4416         int                     i;
4417
4418         /*
4419          * Bail out quickly if there are no serializable transactions running.
4420          * It's safe to check this without taking locks because the caller is
4421          * holding an ACCESS EXCLUSIVE lock on the relation.  No new locks which
4422          * would matter here can be acquired while that is held.
4423          */
4424         if (!TransactionIdIsValid(PredXact->SxactGlobalXmin))
4425                 return;
4426
4427         if (!SerializationNeededForWrite(relation))
4428                 return;
4429
4430         /*
4431          * We're doing a write which might cause rw-conflicts now or later.
4432          * Memorize that fact.
4433          */
4434         MyXactDidWrite = true;
4435
4436         Assert(relation->rd_index == NULL); /* not an index relation */
4437
4438         dbId = relation->rd_node.dbNode;
4439         heapId = relation->rd_id;
4440
4441         LWLockAcquire(SerializablePredicateLockListLock, LW_EXCLUSIVE);
4442         for (i = 0; i < NUM_PREDICATELOCK_PARTITIONS; i++)
4443                 LWLockAcquire(PredicateLockHashPartitionLockByIndex(i), LW_SHARED);
4444         LWLockAcquire(SerializableXactHashLock, LW_EXCLUSIVE);
4445
4446         /* Scan through target list */
4447         hash_seq_init(&seqstat, PredicateLockTargetHash);
4448
4449         while ((target = (PREDICATELOCKTARGET *) hash_seq_search(&seqstat)))
4450         {
4451                 PREDICATELOCK *predlock;
4452
4453                 /*
4454                  * Check whether this is a target which needs attention.
4455                  */
4456                 if (GET_PREDICATELOCKTARGETTAG_RELATION(target->tag) != heapId)
4457                         continue;                       /* wrong relation id */
4458                 if (GET_PREDICATELOCKTARGETTAG_DB(target->tag) != dbId)
4459                         continue;                       /* wrong database id */
4460
4461                 /*
4462                  * Loop through locks for this target and flag conflicts.
4463                  */
4464                 predlock = (PREDICATELOCK *)
4465                         SHMQueueNext(&(target->predicateLocks),
4466                                                  &(target->predicateLocks),
4467                                                  offsetof(PREDICATELOCK, targetLink));
4468                 while (predlock)
4469                 {
4470                         PREDICATELOCK *nextpredlock;
4471
4472                         nextpredlock = (PREDICATELOCK *)
4473                                 SHMQueueNext(&(target->predicateLocks),
4474                                                          &(predlock->targetLink),
4475                                                          offsetof(PREDICATELOCK, targetLink));
4476
4477                         if (predlock->tag.myXact != MySerializableXact
4478                                 && !RWConflictExists(predlock->tag.myXact, MySerializableXact))
4479                         {
4480                                 FlagRWConflict(predlock->tag.myXact, MySerializableXact);
4481                         }
4482
4483                         predlock = nextpredlock;
4484                 }
4485         }
4486
4487         /* Release locks in reverse order */
4488         LWLockRelease(SerializableXactHashLock);
4489         for (i = NUM_PREDICATELOCK_PARTITIONS - 1; i >= 0; i--)
4490                 LWLockRelease(PredicateLockHashPartitionLockByIndex(i));
4491         LWLockRelease(SerializablePredicateLockListLock);
4492 }
4493
4494
4495 /*
4496  * Flag a rw-dependency between two serializable transactions.
4497  *
4498  * The caller is responsible for ensuring that we have a LW lock on
4499  * the transaction hash table.
4500  */
4501 static void
4502 FlagRWConflict(SERIALIZABLEXACT *reader, SERIALIZABLEXACT *writer)
4503 {
4504         Assert(reader != writer);
4505
4506         /* First, see if this conflict causes failure. */
4507         OnConflict_CheckForSerializationFailure(reader, writer);
4508
4509         /* Actually do the conflict flagging. */
4510         if (reader == OldCommittedSxact)
4511                 writer->flags |= SXACT_FLAG_SUMMARY_CONFLICT_IN;
4512         else if (writer == OldCommittedSxact)
4513                 reader->flags |= SXACT_FLAG_SUMMARY_CONFLICT_OUT;
4514         else
4515                 SetRWConflict(reader, writer);
4516 }
4517
4518 /*----------------------------------------------------------------------------
4519  * We are about to add a RW-edge to the dependency graph - check that we don't
4520  * introduce a dangerous structure by doing so, and abort one of the
4521  * transactions if so.
4522  *
4523  * A serialization failure can only occur if there is a dangerous structure
4524  * in the dependency graph:
4525  *
4526  *              Tin ------> Tpivot ------> Tout
4527  *                        rw                     rw
4528  *
4529  * Furthermore, Tout must commit first.
4530  *
4531  * One more optimization is that if Tin is declared READ ONLY (or commits
4532  * without writing), we can only have a problem if Tout committed before Tin
4533  * acquired its snapshot.
4534  *----------------------------------------------------------------------------
4535  */
4536 static void
4537 OnConflict_CheckForSerializationFailure(const SERIALIZABLEXACT *reader,
4538                                                                                 SERIALIZABLEXACT *writer)
4539 {
4540         bool            failure;
4541         RWConflict      conflict;
4542
4543         Assert(LWLockHeldByMe(SerializableXactHashLock));
4544
4545         failure = false;
4546
4547         /*------------------------------------------------------------------------
4548          * Check for already-committed writer with rw-conflict out flagged
4549          * (conflict-flag on W means that T2 committed before W):
4550          *
4551          *              R ------> W ------> T2
4552          *                      rw                rw
4553          *
4554          * That is a dangerous structure, so we must abort. (Since the writer
4555          * has already committed, we must be the reader)
4556          *------------------------------------------------------------------------
4557          */
4558         if (SxactIsCommitted(writer)
4559                 && (SxactHasConflictOut(writer) || SxactHasSummaryConflictOut(writer)))
4560                 failure = true;
4561
4562         /*------------------------------------------------------------------------
4563          * Check whether the writer has become a pivot with an out-conflict
4564          * committed transaction (T2), and T2 committed first:
4565          *
4566          *              R ------> W ------> T2
4567          *                      rw                rw
4568          *
4569          * Because T2 must've committed first, there is no anomaly if:
4570          * - the reader committed before T2
4571          * - the writer committed before T2
4572          * - the reader is a READ ONLY transaction and the reader was concurrent
4573          *       with T2 (= reader acquired its snapshot before T2 committed)
4574          *
4575          * We also handle the case that T2 is prepared but not yet committed
4576          * here. In that case T2 has already checked for conflicts, so if it
4577          * commits first, making the above conflict real, it's too late for it
4578          * to abort.
4579          *------------------------------------------------------------------------
4580          */
4581         if (!failure)
4582         {
4583                 if (SxactHasSummaryConflictOut(writer))
4584                 {
4585                         failure = true;
4586                         conflict = NULL;
4587                 }
4588                 else
4589                         conflict = (RWConflict)
4590                                 SHMQueueNext(&writer->outConflicts,
4591                                                          &writer->outConflicts,
4592                                                          offsetof(RWConflictData, outLink));
4593                 while (conflict)
4594                 {
4595                         SERIALIZABLEXACT *t2 = conflict->sxactIn;
4596
4597                         if (SxactIsPrepared(t2)
4598                                 && (!SxactIsCommitted(reader)
4599                                         || t2->prepareSeqNo <= reader->commitSeqNo)
4600                                 && (!SxactIsCommitted(writer)
4601                                         || t2->prepareSeqNo <= writer->commitSeqNo)
4602                                 && (!SxactIsReadOnly(reader)
4603                                         || t2->prepareSeqNo <= reader->SeqNo.lastCommitBeforeSnapshot))
4604                         {
4605                                 failure = true;
4606                                 break;
4607                         }
4608                         conflict = (RWConflict)
4609                                 SHMQueueNext(&writer->outConflicts,
4610                                                          &conflict->outLink,
4611                                                          offsetof(RWConflictData, outLink));
4612                 }
4613         }
4614
4615         /*------------------------------------------------------------------------
4616          * Check whether the reader has become a pivot with a writer
4617          * that's committed (or prepared):
4618          *
4619          *              T0 ------> R ------> W
4620          *                       rw                rw
4621          *
4622          * Because W must've committed first for an anomaly to occur, there is no
4623          * anomaly if:
4624          * - T0 committed before the writer
4625          * - T0 is READ ONLY, and overlaps the writer
4626          *------------------------------------------------------------------------
4627          */
4628         if (!failure && SxactIsPrepared(writer) && !SxactIsReadOnly(reader))
4629         {
4630                 if (SxactHasSummaryConflictIn(reader))
4631                 {
4632                         failure = true;
4633                         conflict = NULL;
4634                 }
4635                 else
4636                         conflict = (RWConflict)
4637                                 SHMQueueNext(&reader->inConflicts,
4638                                                          &reader->inConflicts,
4639                                                          offsetof(RWConflictData, inLink));
4640                 while (conflict)
4641                 {
4642                         SERIALIZABLEXACT *t0 = conflict->sxactOut;
4643
4644                         if (!SxactIsDoomed(t0)
4645                                 && (!SxactIsCommitted(t0)
4646                                         || t0->commitSeqNo >= writer->prepareSeqNo)
4647                                 && (!SxactIsReadOnly(t0)
4648                                         || t0->SeqNo.lastCommitBeforeSnapshot >= writer->prepareSeqNo))
4649                         {
4650                                 failure = true;
4651                                 break;
4652                         }
4653                         conflict = (RWConflict)
4654                                 SHMQueueNext(&reader->inConflicts,
4655                                                          &conflict->inLink,
4656                                                          offsetof(RWConflictData, inLink));
4657                 }
4658         }
4659
4660         if (failure)
4661         {
4662                 /*
4663                  * We have to kill a transaction to avoid a possible anomaly from
4664                  * occurring. If the writer is us, we can just ereport() to cause a
4665                  * transaction abort. Otherwise we flag the writer for termination,
4666                  * causing it to abort when it tries to commit. However, if the writer
4667                  * is a prepared transaction, already prepared, we can't abort it
4668                  * anymore, so we have to kill the reader instead.
4669                  */
4670                 if (MySerializableXact == writer)
4671                 {
4672                         LWLockRelease(SerializableXactHashLock);
4673                         ereport(ERROR,
4674                                         (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE),
4675                                          errmsg("could not serialize access due to read/write dependencies among transactions"),
4676                                          errdetail_internal("Reason code: Canceled on identification as a pivot, during write."),
4677                                          errhint("The transaction might succeed if retried.")));
4678                 }
4679                 else if (SxactIsPrepared(writer))
4680                 {
4681                         LWLockRelease(SerializableXactHashLock);
4682
4683                         /* if we're not the writer, we have to be the reader */
4684                         Assert(MySerializableXact == reader);
4685                         ereport(ERROR,
4686                                         (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE),
4687                                          errmsg("could not serialize access due to read/write dependencies among transactions"),
4688                                          errdetail_internal("Reason code: Canceled on conflict out to pivot %u, during read.", writer->topXid),
4689                                          errhint("The transaction might succeed if retried.")));
4690                 }
4691                 writer->flags |= SXACT_FLAG_DOOMED;
4692         }
4693 }
4694
4695 /*
4696  * PreCommit_CheckForSerializableConflicts
4697  *              Check for dangerous structures in a serializable transaction
4698  *              at commit.
4699  *
4700  * We're checking for a dangerous structure as each conflict is recorded.
4701  * The only way we could have a problem at commit is if this is the "out"
4702  * side of a pivot, and neither the "in" side nor the pivot has yet
4703  * committed.
4704  *
4705  * If a dangerous structure is found, the pivot (the near conflict) is
4706  * marked for death, because rolling back another transaction might mean
4707  * that we flail without ever making progress.  This transaction is
4708  * committing writes, so letting it commit ensures progress.  If we
4709  * canceled the far conflict, it might immediately fail again on retry.
4710  */
4711 void
4712 PreCommit_CheckForSerializationFailure(void)
4713 {
4714         RWConflict      nearConflict;
4715
4716         if (MySerializableXact == InvalidSerializableXact)
4717                 return;
4718
4719         Assert(IsolationIsSerializable());
4720
4721         LWLockAcquire(SerializableXactHashLock, LW_EXCLUSIVE);
4722
4723         /* Check if someone else has already decided that we need to die */
4724         if (SxactIsDoomed(MySerializableXact))
4725         {
4726                 LWLockRelease(SerializableXactHashLock);
4727                 ereport(ERROR,
4728                                 (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE),
4729                                  errmsg("could not serialize access due to read/write dependencies among transactions"),
4730                                  errdetail_internal("Reason code: Canceled on identification as a pivot, during commit attempt."),
4731                                  errhint("The transaction might succeed if retried.")));
4732         }
4733
4734         nearConflict = (RWConflict)
4735                 SHMQueueNext(&MySerializableXact->inConflicts,
4736                                          &MySerializableXact->inConflicts,
4737                                          offsetof(RWConflictData, inLink));
4738         while (nearConflict)
4739         {
4740                 if (!SxactIsCommitted(nearConflict->sxactOut)
4741                         && !SxactIsDoomed(nearConflict->sxactOut))
4742                 {
4743                         RWConflict      farConflict;
4744
4745                         farConflict = (RWConflict)
4746                                 SHMQueueNext(&nearConflict->sxactOut->inConflicts,
4747                                                          &nearConflict->sxactOut->inConflicts,
4748                                                          offsetof(RWConflictData, inLink));
4749                         while (farConflict)
4750                         {
4751                                 if (farConflict->sxactOut == MySerializableXact
4752                                         || (!SxactIsCommitted(farConflict->sxactOut)
4753                                                 && !SxactIsReadOnly(farConflict->sxactOut)
4754                                                 && !SxactIsDoomed(farConflict->sxactOut)))
4755                                 {
4756                                         /*
4757                                          * Normally, we kill the pivot transaction to make sure we
4758                                          * make progress if the failing transaction is retried.
4759                                          * However, we can't kill it if it's already prepared, so
4760                                          * in that case we commit suicide instead.
4761                                          */
4762                                         if (SxactIsPrepared(nearConflict->sxactOut))
4763                                         {
4764                                                 LWLockRelease(SerializableXactHashLock);
4765                                                 ereport(ERROR,
4766                                                                 (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE),
4767                                                                  errmsg("could not serialize access due to read/write dependencies among transactions"),
4768                                                                  errdetail_internal("Reason code: Canceled on commit attempt with conflict in from prepared pivot."),
4769                                                                  errhint("The transaction might succeed if retried.")));
4770                                         }
4771                                         nearConflict->sxactOut->flags |= SXACT_FLAG_DOOMED;
4772                                         break;
4773                                 }
4774                                 farConflict = (RWConflict)
4775                                         SHMQueueNext(&nearConflict->sxactOut->inConflicts,
4776                                                                  &farConflict->inLink,
4777                                                                  offsetof(RWConflictData, inLink));
4778                         }
4779                 }
4780
4781                 nearConflict = (RWConflict)
4782                         SHMQueueNext(&MySerializableXact->inConflicts,
4783                                                  &nearConflict->inLink,
4784                                                  offsetof(RWConflictData, inLink));
4785         }
4786
4787         MySerializableXact->prepareSeqNo = ++(PredXact->LastSxactCommitSeqNo);
4788         MySerializableXact->flags |= SXACT_FLAG_PREPARED;
4789
4790         LWLockRelease(SerializableXactHashLock);
4791 }
4792
4793 /*------------------------------------------------------------------------*/
4794
4795 /*
4796  * Two-phase commit support
4797  */
4798
4799 /*
4800  * AtPrepare_Locks
4801  *              Do the preparatory work for a PREPARE: make 2PC state file
4802  *              records for all predicate locks currently held.
4803  */
4804 void
4805 AtPrepare_PredicateLocks(void)
4806 {
4807         PREDICATELOCK *predlock;
4808         SERIALIZABLEXACT *sxact;
4809         TwoPhasePredicateRecord record;
4810         TwoPhasePredicateXactRecord *xactRecord;
4811         TwoPhasePredicateLockRecord *lockRecord;
4812
4813         sxact = MySerializableXact;
4814         xactRecord = &(record.data.xactRecord);
4815         lockRecord = &(record.data.lockRecord);
4816
4817         if (MySerializableXact == InvalidSerializableXact)
4818                 return;
4819
4820         /* Generate an xact record for our SERIALIZABLEXACT */
4821         record.type = TWOPHASEPREDICATERECORD_XACT;
4822         xactRecord->xmin = MySerializableXact->xmin;
4823         xactRecord->flags = MySerializableXact->flags;
4824
4825         /*
4826          * Note that we don't include the list of conflicts in our out in the
4827          * statefile, because new conflicts can be added even after the
4828          * transaction prepares. We'll just make a conservative assumption during
4829          * recovery instead.
4830          */
4831
4832         RegisterTwoPhaseRecord(TWOPHASE_RM_PREDICATELOCK_ID, 0,
4833                                                    &record, sizeof(record));
4834
4835         /*
4836          * Generate a lock record for each lock.
4837          *
4838          * To do this, we need to walk the predicate lock list in our sxact rather
4839          * than using the local predicate lock table because the latter is not
4840          * guaranteed to be accurate.
4841          */
4842         LWLockAcquire(SerializablePredicateLockListLock, LW_SHARED);
4843
4844         predlock = (PREDICATELOCK *)
4845                 SHMQueueNext(&(sxact->predicateLocks),
4846                                          &(sxact->predicateLocks),
4847                                          offsetof(PREDICATELOCK, xactLink));
4848
4849         while (predlock != NULL)
4850         {
4851                 record.type = TWOPHASEPREDICATERECORD_LOCK;
4852                 lockRecord->target = predlock->tag.myTarget->tag;
4853
4854                 RegisterTwoPhaseRecord(TWOPHASE_RM_PREDICATELOCK_ID, 0,
4855                                                            &record, sizeof(record));
4856
4857                 predlock = (PREDICATELOCK *)
4858                         SHMQueueNext(&(sxact->predicateLocks),
4859                                                  &(predlock->xactLink),
4860                                                  offsetof(PREDICATELOCK, xactLink));
4861         }
4862
4863         LWLockRelease(SerializablePredicateLockListLock);
4864 }
4865
4866 /*
4867  * PostPrepare_Locks
4868  *              Clean up after successful PREPARE. Unlike the non-predicate
4869  *              lock manager, we do not need to transfer locks to a dummy
4870  *              PGPROC because our SERIALIZABLEXACT will stay around
4871  *              anyway. We only need to clean up our local state.
4872  */
4873 void
4874 PostPrepare_PredicateLocks(TransactionId xid)
4875 {
4876         if (MySerializableXact == InvalidSerializableXact)
4877                 return;
4878
4879         Assert(SxactIsPrepared(MySerializableXact));
4880
4881         MySerializableXact->pid = 0;
4882
4883         hash_destroy(LocalPredicateLockHash);
4884         LocalPredicateLockHash = NULL;
4885
4886         MySerializableXact = InvalidSerializableXact;
4887         MyXactDidWrite = false;
4888 }
4889
4890 /*
4891  * PredicateLockTwoPhaseFinish
4892  *              Release a prepared transaction's predicate locks once it
4893  *              commits or aborts.
4894  */
4895 void
4896 PredicateLockTwoPhaseFinish(TransactionId xid, bool isCommit)
4897 {
4898         SERIALIZABLEXID *sxid;
4899         SERIALIZABLEXIDTAG sxidtag;
4900
4901         sxidtag.xid = xid;
4902
4903         LWLockAcquire(SerializableXactHashLock, LW_SHARED);
4904         sxid = (SERIALIZABLEXID *)
4905                 hash_search(SerializableXidHash, &sxidtag, HASH_FIND, NULL);
4906         LWLockRelease(SerializableXactHashLock);
4907
4908         /* xid will not be found if it wasn't a serializable transaction */
4909         if (sxid == NULL)
4910                 return;
4911
4912         /* Release its locks */
4913         MySerializableXact = sxid->myXact;
4914         MyXactDidWrite = true;          /* conservatively assume that we wrote
4915                                                                  * something */
4916         ReleasePredicateLocks(isCommit);
4917 }
4918
4919 /*
4920  * Re-acquire a predicate lock belonging to a transaction that was prepared.
4921  */
4922 void
4923 predicatelock_twophase_recover(TransactionId xid, uint16 info,
4924                                                            void *recdata, uint32 len)
4925 {
4926         TwoPhasePredicateRecord *record;
4927
4928         Assert(len == sizeof(TwoPhasePredicateRecord));
4929
4930         record = (TwoPhasePredicateRecord *) recdata;
4931
4932         Assert((record->type == TWOPHASEPREDICATERECORD_XACT) ||
4933                    (record->type == TWOPHASEPREDICATERECORD_LOCK));
4934
4935         if (record->type == TWOPHASEPREDICATERECORD_XACT)
4936         {
4937                 /* Per-transaction record. Set up a SERIALIZABLEXACT. */
4938                 TwoPhasePredicateXactRecord *xactRecord;
4939                 SERIALIZABLEXACT *sxact;
4940                 SERIALIZABLEXID *sxid;
4941                 SERIALIZABLEXIDTAG sxidtag;
4942                 bool            found;
4943
4944                 xactRecord = (TwoPhasePredicateXactRecord *) &record->data.xactRecord;
4945
4946                 LWLockAcquire(SerializableXactHashLock, LW_EXCLUSIVE);
4947                 sxact = CreatePredXact();
4948                 if (!sxact)
4949                         ereport(ERROR,
4950                                         (errcode(ERRCODE_OUT_OF_MEMORY),
4951                                          errmsg("out of shared memory")));
4952
4953                 /* vxid for a prepared xact is InvalidBackendId/xid; no pid */
4954                 sxact->vxid.backendId = InvalidBackendId;
4955                 sxact->vxid.localTransactionId = (LocalTransactionId) xid;
4956                 sxact->pid = 0;
4957
4958                 /* a prepared xact hasn't committed yet */
4959                 sxact->prepareSeqNo = RecoverySerCommitSeqNo;
4960                 sxact->commitSeqNo = InvalidSerCommitSeqNo;
4961                 sxact->finishedBefore = InvalidTransactionId;
4962
4963                 sxact->SeqNo.lastCommitBeforeSnapshot = RecoverySerCommitSeqNo;
4964
4965                 /*
4966                  * Don't need to track this; no transactions running at the time the
4967                  * recovered xact started are still active, except possibly other
4968                  * prepared xacts and we don't care whether those are RO_SAFE or not.
4969                  */
4970                 SHMQueueInit(&(sxact->possibleUnsafeConflicts));
4971
4972                 SHMQueueInit(&(sxact->predicateLocks));
4973                 SHMQueueElemInit(&(sxact->finishedLink));
4974
4975                 sxact->topXid = xid;
4976                 sxact->xmin = xactRecord->xmin;
4977                 sxact->flags = xactRecord->flags;
4978                 Assert(SxactIsPrepared(sxact));
4979                 if (!SxactIsReadOnly(sxact))
4980                 {
4981                         ++(PredXact->WritableSxactCount);
4982                         Assert(PredXact->WritableSxactCount <=
4983                                    (MaxBackends + max_prepared_xacts));
4984                 }
4985
4986                 /*
4987                  * We don't know whether the transaction had any conflicts or not, so
4988                  * we'll conservatively assume that it had both a conflict in and a
4989                  * conflict out, and represent that with the summary conflict flags.
4990                  */
4991                 SHMQueueInit(&(sxact->outConflicts));
4992                 SHMQueueInit(&(sxact->inConflicts));
4993                 sxact->flags |= SXACT_FLAG_SUMMARY_CONFLICT_IN;
4994                 sxact->flags |= SXACT_FLAG_SUMMARY_CONFLICT_OUT;
4995
4996                 /* Register the transaction's xid */
4997                 sxidtag.xid = xid;
4998                 sxid = (SERIALIZABLEXID *) hash_search(SerializableXidHash,
4999                                                                                            &sxidtag,
5000                                                                                            HASH_ENTER, &found);
5001                 Assert(sxid != NULL);
5002                 Assert(!found);
5003                 sxid->myXact = (SERIALIZABLEXACT *) sxact;
5004
5005                 /*
5006                  * Update global xmin. Note that this is a special case compared to
5007                  * registering a normal transaction, because the global xmin might go
5008                  * backwards. That's OK, because until recovery is over we're not
5009                  * going to complete any transactions or create any non-prepared
5010                  * transactions, so there's no danger of throwing away.
5011                  */
5012                 if ((!TransactionIdIsValid(PredXact->SxactGlobalXmin)) ||
5013                         (TransactionIdFollows(PredXact->SxactGlobalXmin, sxact->xmin)))
5014                 {
5015                         PredXact->SxactGlobalXmin = sxact->xmin;
5016                         PredXact->SxactGlobalXminCount = 1;
5017                         OldSerXidSetActiveSerXmin(sxact->xmin);
5018                 }
5019                 else if (TransactionIdEquals(sxact->xmin, PredXact->SxactGlobalXmin))
5020                 {
5021                         Assert(PredXact->SxactGlobalXminCount > 0);
5022                         PredXact->SxactGlobalXminCount++;
5023                 }
5024
5025                 LWLockRelease(SerializableXactHashLock);
5026         }
5027         else if (record->type == TWOPHASEPREDICATERECORD_LOCK)
5028         {
5029                 /* Lock record. Recreate the PREDICATELOCK */
5030                 TwoPhasePredicateLockRecord *lockRecord;
5031                 SERIALIZABLEXID *sxid;
5032                 SERIALIZABLEXACT *sxact;
5033                 SERIALIZABLEXIDTAG sxidtag;
5034                 uint32          targettaghash;
5035
5036                 lockRecord = (TwoPhasePredicateLockRecord *) &record->data.lockRecord;
5037                 targettaghash = PredicateLockTargetTagHashCode(&lockRecord->target);
5038
5039                 LWLockAcquire(SerializableXactHashLock, LW_SHARED);
5040                 sxidtag.xid = xid;
5041                 sxid = (SERIALIZABLEXID *)
5042                         hash_search(SerializableXidHash, &sxidtag, HASH_FIND, NULL);
5043                 LWLockRelease(SerializableXactHashLock);
5044
5045                 Assert(sxid != NULL);
5046                 sxact = sxid->myXact;
5047                 Assert(sxact != InvalidSerializableXact);
5048
5049                 CreatePredicateLock(&lockRecord->target, targettaghash, sxact);
5050         }
5051 }