1 /*-------------------------------------------------------------------------
3 * predicate_internals.h
4 * POSTGRES internal predicate locking definitions.
7 * Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
8 * Portions Copyright (c) 1994, Regents of the University of California
10 * src/include/storage/predicate_internals.h
12 *-------------------------------------------------------------------------
14 #ifndef PREDICATE_INTERNALS_H
15 #define PREDICATE_INTERNALS_H
17 #include "storage/lock.h"
22 typedef uint64 SerCommitSeqNo;
25 * Reserved commit sequence numbers:
26 * - 0 is reserved to indicate a non-existent SLRU entry; it cannot be
27 * used as a SerCommitSeqNo, even an invalid one
28 * - InvalidSerCommitSeqNo is used to indicate a transaction that
29 * hasn't committed yet, so use a number greater than all valid
30 * ones to make comparison do the expected thing
31 * - RecoverySerCommitSeqNo is used to refer to transactions that
32 * happened before a crash/recovery, since we restart the sequence
33 * at that point. It's earlier than all normal sequence numbers,
34 * and is only used by recovered prepared transactions
36 #define InvalidSerCommitSeqNo ((SerCommitSeqNo) UINT64CONST(0xFFFFFFFFFFFFFFFF))
37 #define RecoverySerCommitSeqNo ((SerCommitSeqNo) 1)
38 #define FirstNormalSerCommitSeqNo ((SerCommitSeqNo) 2)
41 * The SERIALIZABLEXACT struct contains information needed for each
42 * serializable database transaction to support SSI techniques.
44 * A home-grown list is maintained in shared memory to manage these.
45 * An entry is used when the serializable transaction acquires a snapshot.
46 * Unless the transaction is rolled back, this entry must generally remain
47 * until all concurrent transactions have completed. (There are special
48 * optimizations for READ ONLY transactions which often allow them to be
49 * cleaned up earlier.) A transaction which is rolled back is cleaned up
50 * as soon as possible.
52 * Eligibility for cleanup of committed transactions is generally determined
53 * by comparing the transaction's finishedBefore field to
54 * SerializableGlobalXmin.
56 typedef struct SERIALIZABLEXACT
58 VirtualTransactionId vxid; /* The executing process always has one of
62 * We use two numbers to track the order that transactions commit. Before
63 * commit, a transaction is marked as prepared, and prepareSeqNo is set.
64 * Shortly after commit, it's marked as committed, and commitSeqNo is set.
65 * This doesn't give a strict commit order, but these two values together
66 * are good enough for us, as we can always err on the safe side and
67 * assume that there's a conflict, if we can't be sure of the exact
68 * ordering of two commits.
70 * Note that a transaction is marked as prepared for a short period during
71 * commit processing, even if two-phase commit is not used. But with
72 * two-phase commit, a transaction can stay in prepared state for some
75 SerCommitSeqNo prepareSeqNo;
76 SerCommitSeqNo commitSeqNo;
78 /* these values are not both interesting at the same time */
81 SerCommitSeqNo earliestOutConflictCommit; /* when committed with
83 SerCommitSeqNo lastCommitBeforeSnapshot; /* when not committed or
86 SHM_QUEUE outConflicts; /* list of write transactions whose data we
88 SHM_QUEUE inConflicts; /* list of read transactions which couldn't
90 SHM_QUEUE predicateLocks; /* list of associated PREDICATELOCK objects */
91 SHM_QUEUE finishedLink; /* list link in
92 * FinishedSerializableTransactions */
95 * for r/o transactions: list of concurrent r/w transactions that we could
96 * potentially have conflicts with, and vice versa for r/w transactions
98 SHM_QUEUE possibleUnsafeConflicts;
100 TransactionId topXid; /* top level xid for the transaction, if one
101 * exists; else invalid */
102 TransactionId finishedBefore; /* invalid means still running; else
103 * the struct expires when no
104 * serializable xids are before this. */
105 TransactionId xmin; /* the transaction's snapshot xmin */
106 uint32 flags; /* OR'd combination of values defined below */
107 int pid; /* pid of associated process */
110 #define SXACT_FLAG_COMMITTED 0x00000001 /* already committed */
111 #define SXACT_FLAG_PREPARED 0x00000002 /* about to commit */
112 #define SXACT_FLAG_ROLLED_BACK 0x00000004 /* already rolled back */
113 #define SXACT_FLAG_DOOMED 0x00000008 /* will roll back */
115 * The following flag actually means that the flagged transaction has a
116 * conflict out *to a transaction which committed ahead of it*. It's hard
117 * to get that into a name of a reasonable length.
119 #define SXACT_FLAG_CONFLICT_OUT 0x00000010
120 #define SXACT_FLAG_READ_ONLY 0x00000020
121 #define SXACT_FLAG_DEFERRABLE_WAITING 0x00000040
122 #define SXACT_FLAG_RO_SAFE 0x00000080
123 #define SXACT_FLAG_RO_UNSAFE 0x00000100
124 #define SXACT_FLAG_SUMMARY_CONFLICT_IN 0x00000200
125 #define SXACT_FLAG_SUMMARY_CONFLICT_OUT 0x00000400
128 * The following types are used to provide an ad hoc list for holding
129 * SERIALIZABLEXACT objects. An HTAB is overkill, since there is no need to
130 * access these by key -- there are direct pointers to these objects where
131 * needed. If a shared memory list is created, these types can probably be
132 * eliminated in favor of using the general solution.
134 typedef struct PredXactListElementData
137 SERIALIZABLEXACT sxact;
138 } PredXactListElementData;
140 typedef struct PredXactListElementData *PredXactListElement;
142 #define PredXactListElementDataSize \
143 ((Size)MAXALIGN(sizeof(PredXactListElementData)))
145 typedef struct PredXactListData
147 SHM_QUEUE availableList;
148 SHM_QUEUE activeList;
151 * These global variables are maintained when registering and cleaning up
152 * serializable transactions. They must be global across all backends,
153 * but are not needed outside the predicate.c source file. Protected by
154 * SerializableXactHashLock.
156 TransactionId SxactGlobalXmin; /* global xmin for active serializable
158 int SxactGlobalXminCount; /* how many active serializable
159 * transactions have this xmin */
160 int WritableSxactCount; /* how many non-read-only serializable
161 * transactions are active */
162 SerCommitSeqNo LastSxactCommitSeqNo; /* a strictly monotonically
163 * increasing number for
164 * commits of serializable
166 /* Protected by SerializableXactHashLock. */
167 SerCommitSeqNo CanPartialClearThrough; /* can clear predicate locks
168 * and inConflicts for
169 * committed transactions
170 * through this seq no */
171 /* Protected by SerializableFinishedListLock. */
172 SerCommitSeqNo HavePartialClearedThrough; /* have cleared through this
174 SERIALIZABLEXACT *OldCommittedSxact; /* shared copy of dummy sxact */
176 PredXactListElement element;
179 typedef struct PredXactListData *PredXactList;
181 #define PredXactListDataSize \
182 ((Size)MAXALIGN(sizeof(PredXactListData)))
186 * The following types are used to provide lists of rw-conflicts between
187 * pairs of transactions. Since exactly the same information is needed,
188 * they are also used to record possible unsafe transaction relationships
189 * for purposes of identifying safe snapshots for read-only transactions.
191 * When a RWConflictData is not in use to record either type of relationship
192 * between a pair of transactions, it is kept on an "available" list. The
193 * outLink field is used for maintaining that list.
195 typedef struct RWConflictData
197 SHM_QUEUE outLink; /* link for list of conflicts out from a sxact */
198 SHM_QUEUE inLink; /* link for list of conflicts in to a sxact */
199 SERIALIZABLEXACT *sxactOut;
200 SERIALIZABLEXACT *sxactIn;
203 typedef struct RWConflictData *RWConflict;
205 #define RWConflictDataSize \
206 ((Size)MAXALIGN(sizeof(RWConflictData)))
208 typedef struct RWConflictPoolHeaderData
210 SHM_QUEUE availableList;
212 } RWConflictPoolHeaderData;
214 typedef struct RWConflictPoolHeaderData *RWConflictPoolHeader;
216 #define RWConflictPoolHeaderDataSize \
217 ((Size)MAXALIGN(sizeof(RWConflictPoolHeaderData)))
221 * The SERIALIZABLEXIDTAG struct identifies an xid assigned to a serializable
222 * transaction or any of its subtransactions.
224 typedef struct SERIALIZABLEXIDTAG
227 } SERIALIZABLEXIDTAG;
230 * The SERIALIZABLEXID struct provides a link from a TransactionId for a
231 * serializable transaction to the related SERIALIZABLEXACT record, even if
232 * the transaction has completed and its connection has been closed.
234 * These are created as new top level transaction IDs are first assigned to
235 * transactions which are participating in predicate locking. This may
236 * never happen for a particular transaction if it doesn't write anything.
237 * They are removed with their related serializable transaction objects.
239 * The SubTransGetTopmostTransaction method is used where necessary to get
240 * from an XID which might be from a subtransaction to the top level XID.
242 typedef struct SERIALIZABLEXID
245 SERIALIZABLEXIDTAG tag;
248 SERIALIZABLEXACT *myXact; /* pointer to the top level transaction data */
253 * The PREDICATELOCKTARGETTAG struct identifies a database object which can
254 * be the target of predicate locks.
256 * Note that the hash function being used doesn't properly respect tag
257 * length -- it will go to a four byte boundary past the end of the tag.
258 * If you change this struct, make sure any slack space is initialized,
259 * so that any random bytes in the middle or at the end are not included
262 * TODO SSI: If we always use the same fields for the same type of value, we
263 * should rename these. Holding off until it's clear there are no exceptions.
264 * Since indexes are relations with blocks and tuples, it's looking likely that
265 * the rename will be possible. If not, we may need to divide the last field
266 * and use part of it for a target type, so that we know how to interpret the
269 typedef struct PREDICATELOCKTARGETTAG
271 uint32 locktag_field1; /* a 32-bit ID field */
272 uint32 locktag_field2; /* a 32-bit ID field */
273 uint32 locktag_field3; /* a 32-bit ID field */
274 uint32 locktag_field4; /* a 32-bit ID field */
275 uint32 locktag_field5; /* a 32-bit ID field */
276 } PREDICATELOCKTARGETTAG;
279 * The PREDICATELOCKTARGET struct represents a database object on which there
280 * are predicate locks.
282 * A hash list of these objects is maintained in shared memory. An entry is
283 * added when a predicate lock is requested on an object which doesn't
284 * already have one. An entry is removed when the last lock is removed from
287 * Because a particular target might become obsolete, due to update to a new
288 * version, before the reading transaction is obsolete, we need some way to
289 * prevent errors from reuse of a tuple ID. Rather than attempting to clean
290 * up the targets as the related tuples are pruned or vacuumed, we check the
291 * xmin on access. This should be far less costly.
293 typedef struct PREDICATELOCKTARGET
296 PREDICATELOCKTARGETTAG tag; /* unique identifier of lockable object */
299 SHM_QUEUE predicateLocks; /* list of PREDICATELOCK objects assoc. with
300 * predicate lock target */
301 } PREDICATELOCKTARGET;
305 * The PREDICATELOCKTAG struct identifies an individual predicate lock.
307 * It is the combination of predicate lock target (which is a lockable
308 * object) and a serializable transaction which has acquired a lock on that
311 typedef struct PREDICATELOCKTAG
313 PREDICATELOCKTARGET *myTarget;
314 SERIALIZABLEXACT *myXact;
318 * The PREDICATELOCK struct represents an individual lock.
320 * An entry can be created here when the related database object is read, or
321 * by promotion of multiple finer-grained targets. All entries related to a
322 * serializable transaction are removed when that serializable transaction is
323 * cleaned up. Entries can also be removed when they are combined into a
324 * single coarser-grained lock entry.
326 typedef struct PREDICATELOCK
329 PREDICATELOCKTAG tag; /* unique identifier of lock */
332 SHM_QUEUE targetLink; /* list link in PREDICATELOCKTARGET's list of
334 SHM_QUEUE xactLink; /* list link in SERIALIZABLEXACT's list of
336 SerCommitSeqNo commitSeqNo; /* only used for summarized predicate locks */
341 * The LOCALPREDICATELOCK struct represents a local copy of data which is
342 * also present in the PREDICATELOCK table, organized for fast access without
343 * needing to acquire a LWLock. It is strictly for optimization.
345 * Each serializable transaction creates its own local hash table to hold a
346 * collection of these. This information is used to determine when a number
347 * of fine-grained locks should be promoted to a single coarser-grained lock.
348 * The information is maintained more-or-less in parallel to the
349 * PREDICATELOCK data, but because this data is not protected by locks and is
350 * only used in an optimization heuristic, it is allowed to drift in a few
351 * corner cases where maintaining exact data would be expensive.
353 * The hash table is created when the serializable transaction acquires its
354 * snapshot, and its memory is released upon completion of the transaction.
356 typedef struct LOCALPREDICATELOCK
359 PREDICATELOCKTARGETTAG tag; /* unique identifier of lockable object */
362 bool held; /* is lock held, or just its children? */
363 int childLocks; /* number of child locks currently held */
364 } LOCALPREDICATELOCK;
368 * The types of predicate locks which can be acquired.
370 typedef enum PredicateLockTargetType
372 PREDLOCKTAG_RELATION,
375 /* TODO SSI: Other types may be needed for index locking */
376 } PredicateLockTargetType;
380 * This structure is used to quickly capture a copy of all predicate
381 * locks. This is currently used only by the pg_lock_status function,
382 * which in turn is used by the pg_locks view.
384 typedef struct PredicateLockData
387 PREDICATELOCKTARGETTAG *locktags;
388 SERIALIZABLEXACT *xacts;
393 * These macros define how we map logical IDs of lockable objects into the
394 * physical fields of PREDICATELOCKTARGETTAG. Use these to set up values,
395 * rather than accessing the fields directly. Note multiple eval of target!
397 #define SET_PREDICATELOCKTARGETTAG_RELATION(locktag,dboid,reloid) \
398 ((locktag).locktag_field1 = (dboid), \
399 (locktag).locktag_field2 = (reloid), \
400 (locktag).locktag_field3 = InvalidBlockNumber, \
401 (locktag).locktag_field4 = InvalidOffsetNumber, \
402 (locktag).locktag_field5 = InvalidTransactionId)
404 #define SET_PREDICATELOCKTARGETTAG_PAGE(locktag,dboid,reloid,blocknum) \
405 ((locktag).locktag_field1 = (dboid), \
406 (locktag).locktag_field2 = (reloid), \
407 (locktag).locktag_field3 = (blocknum), \
408 (locktag).locktag_field4 = InvalidOffsetNumber, \
409 (locktag).locktag_field5 = InvalidTransactionId)
411 #define SET_PREDICATELOCKTARGETTAG_TUPLE(locktag,dboid,reloid,blocknum,offnum,xmin) \
412 ((locktag).locktag_field1 = (dboid), \
413 (locktag).locktag_field2 = (reloid), \
414 (locktag).locktag_field3 = (blocknum), \
415 (locktag).locktag_field4 = (offnum), \
416 (locktag).locktag_field5 = (xmin))
418 #define GET_PREDICATELOCKTARGETTAG_DB(locktag) \
419 ((Oid) (locktag).locktag_field1)
420 #define GET_PREDICATELOCKTARGETTAG_RELATION(locktag) \
421 ((Oid) (locktag).locktag_field2)
422 #define GET_PREDICATELOCKTARGETTAG_PAGE(locktag) \
423 ((BlockNumber) (locktag).locktag_field3)
424 #define GET_PREDICATELOCKTARGETTAG_OFFSET(locktag) \
425 ((OffsetNumber) (locktag).locktag_field4)
426 #define GET_PREDICATELOCKTARGETTAG_XMIN(locktag) \
427 ((TransactionId) (locktag).locktag_field5)
428 #define GET_PREDICATELOCKTARGETTAG_TYPE(locktag) \
429 (((locktag).locktag_field4 != InvalidOffsetNumber) ? PREDLOCKTAG_TUPLE : \
430 (((locktag).locktag_field3 != InvalidBlockNumber) ? PREDLOCKTAG_PAGE : \
431 PREDLOCKTAG_RELATION))
434 * Two-phase commit statefile records. There are two types: for each
435 * transaction, we generate one per-transaction record and a variable
436 * number of per-predicate-lock records.
438 typedef enum TwoPhasePredicateRecordType
440 TWOPHASEPREDICATERECORD_XACT,
441 TWOPHASEPREDICATERECORD_LOCK
442 } TwoPhasePredicateRecordType;
445 * Per-transaction information to reconstruct a SERIALIZABLEXACT. Not
446 * much is needed because most of it not meaningful for a recovered
447 * prepared transaction.
449 * In particular, we do not record the in and out conflict lists for a
450 * prepared transaction because the associated SERIALIZABLEXACTs will
451 * not be available after recovery. Instead, we simply record the
452 * existence of each type of conflict by setting the transaction's
453 * summary conflict in/out flag.
455 typedef struct TwoPhasePredicateXactRecord
459 } TwoPhasePredicateXactRecord;
462 typedef struct TwoPhasePredicateLockRecord
464 PREDICATELOCKTARGETTAG target;
465 } TwoPhasePredicateLockRecord;
467 typedef struct TwoPhasePredicateRecord
469 TwoPhasePredicateRecordType type;
472 TwoPhasePredicateXactRecord xactRecord;
473 TwoPhasePredicateLockRecord lockRecord;
475 } TwoPhasePredicateRecord;
478 * Define a macro to use for an "empty" SERIALIZABLEXACT reference.
480 #define InvalidSerializableXact ((SERIALIZABLEXACT *) NULL)
484 * Function definitions for functions needing awareness of predicate
487 extern PredicateLockData *GetPredicateLockStatusData(void);
490 #endif /* PREDICATE_INTERNALS_H */