1 /*-------------------------------------------------------------------------
4 * POSTGRES process array code.
7 * This module maintains an unsorted array of the PGPROC structures for all
8 * active backends. Although there are several uses for this, the principal
9 * one is as a means of determining the set of currently running transactions.
11 * Because of various subtle race conditions it is critical that a backend
12 * hold the correct locks while setting or clearing its MyProc->xid field.
13 * See notes in src/backend/access/transam/README.
15 * The process array now also includes PGPROC structures representing
16 * prepared transactions. The xid and subxids fields of these are valid,
17 * as are the myProcLocks lists. They can be distinguished from regular
18 * backend PGPROCs at need by checking for pid == 0.
20 * During hot standby, we also keep a list of XIDs representing transactions
21 * that are known to be running in the master (or more precisely, were running
22 * as of the current point in the WAL stream). This list is kept in the
23 * KnownAssignedXids array, and is updated by watching the sequence of
24 * arriving XIDs. This is necessary because if we leave those XIDs out of
25 * snapshots taken for standby queries, then they will appear to be already
26 * complete, leading to MVCC failures. Note that in hot standby, the PGPROC
27 * array represents standby processes, which by definition are not running
28 * transactions that have XIDs.
30 * It is perhaps possible for a backend on the master to terminate without
31 * writing an abort record for its transaction. While that shouldn't really
32 * happen, it would tie up KnownAssignedXids indefinitely, so we protect
33 * ourselves by pruning the array when a valid list of running XIDs arrives.
35 * Portions Copyright (c) 1996-2011, PostgreSQL Global Development Group
36 * Portions Copyright (c) 1994, Regents of the University of California
40 * src/backend/storage/ipc/procarray.c
42 *-------------------------------------------------------------------------
48 #include "access/clog.h"
49 #include "access/subtrans.h"
50 #include "access/transam.h"
51 #include "access/xact.h"
52 #include "access/twophase.h"
53 #include "miscadmin.h"
54 #include "storage/procarray.h"
55 #include "storage/spin.h"
56 #include "utils/builtins.h"
57 #include "utils/snapmgr.h"
60 /* Our shared memory area */
61 typedef struct ProcArrayStruct
63 int numProcs; /* number of valid procs entries */
64 int maxProcs; /* allocated size of procs array */
67 * Known assigned XIDs handling
69 int maxKnownAssignedXids; /* allocated size of array */
70 int numKnownAssignedXids; /* currrent # of valid entries */
71 int tailKnownAssignedXids; /* index of oldest valid element */
72 int headKnownAssignedXids; /* index of newest element, + 1 */
73 slock_t known_assigned_xids_lck; /* protects head/tail pointers */
76 * Highest subxid that has been removed from KnownAssignedXids array to
77 * prevent overflow; or InvalidTransactionId if none. We track this for
78 * similar reasons to tracking overflowing cached subxids in PGPROC
79 * entries. Must hold exclusive ProcArrayLock to change this, and shared
82 TransactionId lastOverflowedXid;
85 * We declare procs[] as 1 entry because C wants a fixed-size array, but
86 * actually it is maxProcs entries long.
88 PGPROC *procs[1]; /* VARIABLE LENGTH ARRAY */
91 static ProcArrayStruct *procArray;
94 * Bookkeeping for tracking emulated transactions in recovery
96 static TransactionId *KnownAssignedXids;
97 static bool *KnownAssignedXidsValid;
98 static TransactionId latestObservedXid = InvalidTransactionId;
101 * If we're in STANDBY_SNAPSHOT_PENDING state, standbySnapshotPendingXmin is
102 * the highest xid that might still be running that we don't have in
105 static TransactionId standbySnapshotPendingXmin;
107 #ifdef XIDCACHE_DEBUG
109 /* counters for XidCache measurement */
110 static long xc_by_recent_xmin = 0;
111 static long xc_by_known_xact = 0;
112 static long xc_by_my_xact = 0;
113 static long xc_by_latest_xid = 0;
114 static long xc_by_main_xid = 0;
115 static long xc_by_child_xid = 0;
116 static long xc_by_known_assigned = 0;
117 static long xc_no_overflow = 0;
118 static long xc_slow_answer = 0;
120 #define xc_by_recent_xmin_inc() (xc_by_recent_xmin++)
121 #define xc_by_known_xact_inc() (xc_by_known_xact++)
122 #define xc_by_my_xact_inc() (xc_by_my_xact++)
123 #define xc_by_latest_xid_inc() (xc_by_latest_xid++)
124 #define xc_by_main_xid_inc() (xc_by_main_xid++)
125 #define xc_by_child_xid_inc() (xc_by_child_xid++)
126 #define xc_by_known_assigned_inc() (xc_by_known_assigned++)
127 #define xc_no_overflow_inc() (xc_no_overflow++)
128 #define xc_slow_answer_inc() (xc_slow_answer++)
130 static void DisplayXidCache(void);
131 #else /* !XIDCACHE_DEBUG */
133 #define xc_by_recent_xmin_inc() ((void) 0)
134 #define xc_by_known_xact_inc() ((void) 0)
135 #define xc_by_my_xact_inc() ((void) 0)
136 #define xc_by_latest_xid_inc() ((void) 0)
137 #define xc_by_main_xid_inc() ((void) 0)
138 #define xc_by_child_xid_inc() ((void) 0)
139 #define xc_by_known_assigned_inc() ((void) 0)
140 #define xc_no_overflow_inc() ((void) 0)
141 #define xc_slow_answer_inc() ((void) 0)
142 #endif /* XIDCACHE_DEBUG */
144 /* Primitives for KnownAssignedXids array handling for standby */
145 static void KnownAssignedXidsCompress(bool force);
146 static void KnownAssignedXidsAdd(TransactionId from_xid, TransactionId to_xid,
147 bool exclusive_lock);
148 static bool KnownAssignedXidsSearch(TransactionId xid, bool remove);
149 static bool KnownAssignedXidExists(TransactionId xid);
150 static void KnownAssignedXidsRemove(TransactionId xid);
151 static void KnownAssignedXidsRemoveTree(TransactionId xid, int nsubxids,
152 TransactionId *subxids);
153 static void KnownAssignedXidsRemovePreceding(TransactionId xid);
154 static int KnownAssignedXidsGet(TransactionId *xarray, TransactionId xmax);
155 static int KnownAssignedXidsGetAndSetXmin(TransactionId *xarray,
158 static TransactionId KnownAssignedXidsGetOldestXmin(void);
159 static void KnownAssignedXidsDisplay(int trace_level);
162 * Report shared-memory space needed by CreateSharedProcArray.
165 ProcArrayShmemSize(void)
169 /* Size of the ProcArray structure itself */
170 #define PROCARRAY_MAXPROCS (MaxBackends + max_prepared_xacts)
172 size = offsetof(ProcArrayStruct, procs);
173 size = add_size(size, mul_size(sizeof(PGPROC *), PROCARRAY_MAXPROCS));
176 * During Hot Standby processing we have a data structure called
177 * KnownAssignedXids, created in shared memory. Local data structures are
178 * also created in various backends during GetSnapshotData(),
179 * TransactionIdIsInProgress() and GetRunningTransactionData(). All of the
180 * main structures created in those functions must be identically sized,
181 * since we may at times copy the whole of the data structures around. We
182 * refer to this size as TOTAL_MAX_CACHED_SUBXIDS.
184 * Ideally we'd only create this structure if we were actually doing hot
185 * standby in the current run, but we don't know that yet at the time
186 * shared memory is being set up.
188 #define TOTAL_MAX_CACHED_SUBXIDS \
189 ((PGPROC_MAX_CACHED_SUBXIDS + 1) * PROCARRAY_MAXPROCS)
191 if (EnableHotStandby)
193 size = add_size(size,
194 mul_size(sizeof(TransactionId),
195 TOTAL_MAX_CACHED_SUBXIDS));
196 size = add_size(size,
197 mul_size(sizeof(bool), TOTAL_MAX_CACHED_SUBXIDS));
204 * Initialize the shared PGPROC array during postmaster startup.
207 CreateSharedProcArray(void)
211 /* Create or attach to the ProcArray shared structure */
212 procArray = (ProcArrayStruct *)
213 ShmemInitStruct("Proc Array",
214 add_size(offsetof(ProcArrayStruct, procs),
215 mul_size(sizeof(PGPROC *),
216 PROCARRAY_MAXPROCS)),
222 * We're the first - initialize.
224 procArray->numProcs = 0;
225 procArray->maxProcs = PROCARRAY_MAXPROCS;
226 procArray->maxKnownAssignedXids = TOTAL_MAX_CACHED_SUBXIDS;
227 procArray->numKnownAssignedXids = 0;
228 procArray->tailKnownAssignedXids = 0;
229 procArray->headKnownAssignedXids = 0;
230 SpinLockInit(&procArray->known_assigned_xids_lck);
231 procArray->lastOverflowedXid = InvalidTransactionId;
234 /* Create or attach to the KnownAssignedXids arrays too, if needed */
235 if (EnableHotStandby)
237 KnownAssignedXids = (TransactionId *)
238 ShmemInitStruct("KnownAssignedXids",
239 mul_size(sizeof(TransactionId),
240 TOTAL_MAX_CACHED_SUBXIDS),
242 KnownAssignedXidsValid = (bool *)
243 ShmemInitStruct("KnownAssignedXidsValid",
244 mul_size(sizeof(bool), TOTAL_MAX_CACHED_SUBXIDS),
250 * Add the specified PGPROC to the shared array.
253 ProcArrayAdd(PGPROC *proc)
255 ProcArrayStruct *arrayP = procArray;
257 LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);
259 if (arrayP->numProcs >= arrayP->maxProcs)
262 * Ooops, no room. (This really shouldn't happen, since there is a
263 * fixed supply of PGPROC structs too, and so we should have failed
266 LWLockRelease(ProcArrayLock);
268 (errcode(ERRCODE_TOO_MANY_CONNECTIONS),
269 errmsg("sorry, too many clients already")));
272 arrayP->procs[arrayP->numProcs] = proc;
275 LWLockRelease(ProcArrayLock);
279 * Remove the specified PGPROC from the shared array.
281 * When latestXid is a valid XID, we are removing a live 2PC gxact from the
282 * array, and thus causing it to appear as "not running" anymore. In this
283 * case we must advance latestCompletedXid. (This is essentially the same
284 * as ProcArrayEndTransaction followed by removal of the PGPROC, but we take
285 * the ProcArrayLock only once, and don't damage the content of the PGPROC;
286 * twophase.c depends on the latter.)
289 ProcArrayRemove(PGPROC *proc, TransactionId latestXid)
291 ProcArrayStruct *arrayP = procArray;
294 #ifdef XIDCACHE_DEBUG
295 /* dump stats at backend shutdown, but not prepared-xact end */
300 LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);
302 if (TransactionIdIsValid(latestXid))
304 Assert(TransactionIdIsValid(proc->xid));
306 /* Advance global latestCompletedXid while holding the lock */
307 if (TransactionIdPrecedes(ShmemVariableCache->latestCompletedXid,
309 ShmemVariableCache->latestCompletedXid = latestXid;
313 /* Shouldn't be trying to remove a live transaction here */
314 Assert(!TransactionIdIsValid(proc->xid));
317 for (index = 0; index < arrayP->numProcs; index++)
319 if (arrayP->procs[index] == proc)
321 arrayP->procs[index] = arrayP->procs[arrayP->numProcs - 1];
322 arrayP->procs[arrayP->numProcs - 1] = NULL; /* for debugging */
324 LWLockRelease(ProcArrayLock);
330 LWLockRelease(ProcArrayLock);
332 elog(LOG, "failed to find proc %p in ProcArray", proc);
337 * ProcArrayEndTransaction -- mark a transaction as no longer running
339 * This is used interchangeably for commit and abort cases. The transaction
340 * commit/abort must already be reported to WAL and pg_clog.
342 * proc is currently always MyProc, but we pass it explicitly for flexibility.
343 * latestXid is the latest Xid among the transaction's main XID and
344 * subtransactions, or InvalidTransactionId if it has no XID. (We must ask
345 * the caller to pass latestXid, instead of computing it from the PGPROC's
346 * contents, because the subxid information in the PGPROC might be
350 ProcArrayEndTransaction(PGPROC *proc, TransactionId latestXid)
352 if (TransactionIdIsValid(latestXid))
355 * We must lock ProcArrayLock while clearing proc->xid, so that we do
356 * not exit the set of "running" transactions while someone else is
357 * taking a snapshot. See discussion in
358 * src/backend/access/transam/README.
360 Assert(TransactionIdIsValid(proc->xid));
362 LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);
364 proc->xid = InvalidTransactionId;
365 proc->lxid = InvalidLocalTransactionId;
366 proc->xmin = InvalidTransactionId;
367 /* must be cleared with xid/xmin: */
368 proc->vacuumFlags &= ~PROC_VACUUM_STATE_MASK;
369 proc->inCommit = false; /* be sure this is cleared in abort */
370 proc->recoveryConflictPending = false;
372 /* Clear the subtransaction-XID cache too while holding the lock */
373 proc->subxids.nxids = 0;
374 proc->subxids.overflowed = false;
376 /* Also advance global latestCompletedXid while holding the lock */
377 if (TransactionIdPrecedes(ShmemVariableCache->latestCompletedXid,
379 ShmemVariableCache->latestCompletedXid = latestXid;
381 LWLockRelease(ProcArrayLock);
386 * If we have no XID, we don't need to lock, since we won't affect
387 * anyone else's calculation of a snapshot. We might change their
388 * estimate of global xmin, but that's OK.
390 Assert(!TransactionIdIsValid(proc->xid));
392 proc->lxid = InvalidLocalTransactionId;
393 proc->xmin = InvalidTransactionId;
394 /* must be cleared with xid/xmin: */
395 proc->vacuumFlags &= ~PROC_VACUUM_STATE_MASK;
396 proc->inCommit = false; /* be sure this is cleared in abort */
397 proc->recoveryConflictPending = false;
399 Assert(proc->subxids.nxids == 0);
400 Assert(proc->subxids.overflowed == false);
406 * ProcArrayClearTransaction -- clear the transaction fields
408 * This is used after successfully preparing a 2-phase transaction. We are
409 * not actually reporting the transaction's XID as no longer running --- it
410 * will still appear as running because the 2PC's gxact is in the ProcArray
411 * too. We just have to clear out our own PGPROC.
414 ProcArrayClearTransaction(PGPROC *proc)
417 * We can skip locking ProcArrayLock here, because this action does not
418 * actually change anyone's view of the set of running XIDs: our entry is
419 * duplicate with the gxact that has already been inserted into the
422 proc->xid = InvalidTransactionId;
423 proc->lxid = InvalidLocalTransactionId;
424 proc->xmin = InvalidTransactionId;
425 proc->recoveryConflictPending = false;
427 /* redundant, but just in case */
428 proc->vacuumFlags &= ~PROC_VACUUM_STATE_MASK;
429 proc->inCommit = false;
431 /* Clear the subtransaction-XID cache too */
432 proc->subxids.nxids = 0;
433 proc->subxids.overflowed = false;
437 * ProcArrayApplyRecoveryInfo -- apply recovery info about xids
439 * Takes us through 3 states: Initialized, Pending and Ready.
440 * Normal case is to go all the way to Ready straight away, though there
441 * are atypical cases where we need to take it in steps.
443 * Use the data about running transactions on master to create the initial
444 * state of KnownAssignedXids. We also use these records to regularly prune
445 * KnownAssignedXids because we know it is possible that some transactions
446 * with FATAL errors fail to write abort records, which could cause eventual
449 * See comments for LogStandbySnapshot().
452 ProcArrayApplyRecoveryInfo(RunningTransactions running)
456 TransactionId nextXid;
459 Assert(standbyState >= STANDBY_INITIALIZED);
460 Assert(TransactionIdIsValid(running->nextXid));
461 Assert(TransactionIdIsValid(running->oldestRunningXid));
462 Assert(TransactionIdIsNormal(running->latestCompletedXid));
465 * Remove stale transactions, if any.
467 ExpireOldKnownAssignedTransactionIds(running->oldestRunningXid);
468 StandbyReleaseOldLocks(running->oldestRunningXid);
471 * If our snapshot is already valid, nothing else to do...
473 if (standbyState == STANDBY_SNAPSHOT_READY)
477 * If our initial RunningTransactionsData had an overflowed snapshot then
478 * we knew we were missing some subxids from our snapshot. We can use this
479 * data as an initial snapshot, but we cannot yet mark it valid. We know
480 * that the missing subxids are equal to or earlier than nextXid. After we
481 * initialise we continue to apply changes during recovery, so once the
482 * oldestRunningXid is later than the nextXid from the initial snapshot we
483 * know that we no longer have missing information and can mark the
486 if (standbyState == STANDBY_SNAPSHOT_PENDING)
488 if (TransactionIdPrecedes(standbySnapshotPendingXmin,
489 running->oldestRunningXid))
491 standbyState = STANDBY_SNAPSHOT_READY;
492 elog(trace_recovery(DEBUG2),
493 "running xact data now proven complete");
494 elog(trace_recovery(DEBUG2),
495 "recovery snapshots are now enabled");
498 elog(trace_recovery(DEBUG2),
499 "recovery snapshot waiting for %u oldest active xid on standby is %u",
500 standbySnapshotPendingXmin,
501 running->oldestRunningXid);
505 Assert(standbyState == STANDBY_INITIALIZED);
508 * OK, we need to initialise from the RunningTransactionsData record
512 * Release any locks belonging to old transactions that are not running
513 * according to the running-xacts record.
515 StandbyReleaseOldLocks(running->nextXid);
518 * Nobody else is running yet, but take locks anyhow
520 LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);
523 * KnownAssignedXids is sorted so we cannot just add the xids, we have to
526 * Some of the new xids are top-level xids and some are subtransactions.
527 * We don't call SubtransSetParent because it doesn't matter yet. If we
528 * aren't overflowed then all xids will fit in snapshot and so we don't
529 * need subtrans. If we later overflow, an xid assignment record will add
530 * xids to subtrans. If RunningXacts is overflowed then we don't have
531 * enough information to correctly update subtrans anyway.
533 Assert(procArray->numKnownAssignedXids == 0);
536 * Allocate a temporary array to avoid modifying the array passed as
539 xids = palloc(sizeof(TransactionId) * running->xcnt);
542 * Add to the temp array any xids which have not already completed.
545 for (i = 0; i < running->xcnt; i++)
547 TransactionId xid = running->xids[i];
550 * The running-xacts snapshot can contain xids that were still visible
551 * in the procarray when the snapshot was taken, but were already
552 * WAL-logged as completed. They're not running anymore, so ignore
555 if (TransactionIdDidCommit(xid) || TransactionIdDidAbort(xid))
564 * Sort the array so that we can add them safely into
567 qsort(xids, nxids, sizeof(TransactionId), xidComparator);
570 * Add the sorted snapshot into KnownAssignedXids
572 for (i = 0; i < nxids; i++)
573 KnownAssignedXidsAdd(xids[i], xids[i], true);
575 KnownAssignedXidsDisplay(trace_recovery(DEBUG3));
581 * Now we've got the running xids we need to set the global values that
582 * are used to track snapshots as they evolve further.
584 * - latestCompletedXid which will be the xmax for snapshots -
585 * lastOverflowedXid which shows whether snapshots overflow - nextXid
587 * If the snapshot overflowed, then we still initialise with what we know,
588 * but the recovery snapshot isn't fully valid yet because we know there
589 * are some subxids missing. We don't know the specific subxids that are
590 * missing, so conservatively assume the last one is latestObservedXid.
592 latestObservedXid = running->nextXid;
593 TransactionIdRetreat(latestObservedXid);
595 if (running->subxid_overflow)
597 standbyState = STANDBY_SNAPSHOT_PENDING;
599 standbySnapshotPendingXmin = latestObservedXid;
600 procArray->lastOverflowedXid = latestObservedXid;
604 standbyState = STANDBY_SNAPSHOT_READY;
606 standbySnapshotPendingXmin = InvalidTransactionId;
607 procArray->lastOverflowedXid = InvalidTransactionId;
611 * If a transaction wrote a commit record in the gap between taking and
612 * logging the snapshot then latestCompletedXid may already be higher than
613 * the value from the snapshot, so check before we use the incoming value.
615 if (TransactionIdPrecedes(ShmemVariableCache->latestCompletedXid,
616 running->latestCompletedXid))
617 ShmemVariableCache->latestCompletedXid = running->latestCompletedXid;
619 /* nextXid must be beyond any observed xid */
620 nextXid = latestObservedXid;
621 TransactionIdAdvance(nextXid);
622 if (TransactionIdFollows(nextXid, ShmemVariableCache->nextXid))
623 ShmemVariableCache->nextXid = nextXid;
625 Assert(TransactionIdIsNormal(ShmemVariableCache->latestCompletedXid));
626 Assert(TransactionIdIsValid(ShmemVariableCache->nextXid));
628 LWLockRelease(ProcArrayLock);
630 elog(trace_recovery(DEBUG2), "running transaction data initialized");
631 KnownAssignedXidsDisplay(trace_recovery(DEBUG3));
632 if (standbyState == STANDBY_SNAPSHOT_READY)
633 elog(trace_recovery(DEBUG2), "recovery snapshots are now enabled");
636 (errmsg("consistent state delayed because recovery snapshot incomplete")));
640 * ProcArrayApplyXidAssignment
641 * Process an XLOG_XACT_ASSIGNMENT WAL record
644 ProcArrayApplyXidAssignment(TransactionId topxid,
645 int nsubxids, TransactionId *subxids)
647 TransactionId max_xid;
650 Assert(standbyState >= STANDBY_INITIALIZED);
652 max_xid = TransactionIdLatest(topxid, nsubxids, subxids);
655 * Mark all the subtransactions as observed.
657 * NOTE: This will fail if the subxid contains too many previously
658 * unobserved xids to fit into known-assigned-xids. That shouldn't happen
659 * as the code stands, because xid-assignment records should never contain
660 * more than PGPROC_MAX_CACHED_SUBXIDS entries.
662 RecordKnownAssignedTransactionIds(max_xid);
665 * Notice that we update pg_subtrans with the top-level xid, rather than
666 * the parent xid. This is a difference between normal processing and
667 * recovery, yet is still correct in all cases. The reason is that
668 * subtransaction commit is not marked in clog until commit processing, so
669 * all aborted subtransactions have already been clearly marked in clog.
670 * As a result we are able to refer directly to the top-level
671 * transaction's state rather than skipping through all the intermediate
672 * states in the subtransaction tree. This should be the first time we
673 * have attempted to SubTransSetParent().
675 for (i = 0; i < nsubxids; i++)
676 SubTransSetParent(subxids[i], topxid, false);
679 * Uses same locking as transaction commit
681 LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);
684 * Remove subxids from known-assigned-xacts.
686 KnownAssignedXidsRemoveTree(InvalidTransactionId, nsubxids, subxids);
689 * Advance lastOverflowedXid to be at least the last of these subxids.
691 if (TransactionIdPrecedes(procArray->lastOverflowedXid, max_xid))
692 procArray->lastOverflowedXid = max_xid;
694 LWLockRelease(ProcArrayLock);
698 * TransactionIdIsInProgress -- is given transaction running in some backend
700 * Aside from some shortcuts such as checking RecentXmin and our own Xid,
701 * there are four possibilities for finding a running transaction:
703 * 1. The given Xid is a main transaction Id. We will find this out cheaply
704 * by looking at the PGPROC struct for each backend.
706 * 2. The given Xid is one of the cached subxact Xids in the PGPROC array.
707 * We can find this out cheaply too.
709 * 3. In Hot Standby mode, we must search the KnownAssignedXids list to see
710 * if the Xid is running on the master.
712 * 4. Search the SubTrans tree to find the Xid's topmost parent, and then
713 * see if that is running according to PGPROC or KnownAssignedXids. This is
714 * the slowest way, but sadly it has to be done always if the others failed,
715 * unless we see that the cached subxact sets are complete (none have
718 * ProcArrayLock has to be held while we do 1, 2, 3. If we save the top Xids
719 * while doing 1 and 3, we can release the ProcArrayLock while we do 4.
720 * This buys back some concurrency (and we can't retrieve the main Xids from
721 * PGPROC again anyway; see GetNewTransactionId).
724 TransactionIdIsInProgress(TransactionId xid)
726 static TransactionId *xids = NULL;
728 ProcArrayStruct *arrayP = procArray;
729 TransactionId topxid;
734 * Don't bother checking a transaction older than RecentXmin; it could not
735 * possibly still be running. (Note: in particular, this guarantees that
736 * we reject InvalidTransactionId, FrozenTransactionId, etc as not
739 if (TransactionIdPrecedes(xid, RecentXmin))
741 xc_by_recent_xmin_inc();
746 * We may have just checked the status of this transaction, so if it is
747 * already known to be completed, we can fall out without any access to
750 if (TransactionIdIsKnownCompleted(xid))
752 xc_by_known_xact_inc();
757 * Also, we can handle our own transaction (and subtransactions) without
758 * any access to shared memory.
760 if (TransactionIdIsCurrentTransactionId(xid))
767 * If first time through, get workspace to remember main XIDs in. We
768 * malloc it permanently to avoid repeated palloc/pfree overhead.
773 * In hot standby mode, reserve enough space to hold all xids in the
774 * known-assigned list. If we later finish recovery, we no longer need
775 * the bigger array, but we don't bother to shrink it.
777 int maxxids = RecoveryInProgress() ? TOTAL_MAX_CACHED_SUBXIDS : arrayP->maxProcs;
779 xids = (TransactionId *) malloc(maxxids * sizeof(TransactionId));
782 (errcode(ERRCODE_OUT_OF_MEMORY),
783 errmsg("out of memory")));
786 LWLockAcquire(ProcArrayLock, LW_SHARED);
789 * Now that we have the lock, we can check latestCompletedXid; if the
790 * target Xid is after that, it's surely still running.
792 if (TransactionIdPrecedes(ShmemVariableCache->latestCompletedXid, xid))
794 LWLockRelease(ProcArrayLock);
795 xc_by_latest_xid_inc();
799 /* No shortcuts, gotta grovel through the array */
800 for (i = 0; i < arrayP->numProcs; i++)
802 volatile PGPROC *proc = arrayP->procs[i];
805 /* Ignore my own proc --- dealt with it above */
809 /* Fetch xid just once - see GetNewTransactionId */
812 if (!TransactionIdIsValid(pxid))
816 * Step 1: check the main Xid
818 if (TransactionIdEquals(pxid, xid))
820 LWLockRelease(ProcArrayLock);
821 xc_by_main_xid_inc();
826 * We can ignore main Xids that are younger than the target Xid, since
827 * the target could not possibly be their child.
829 if (TransactionIdPrecedes(xid, pxid))
833 * Step 2: check the cached child-Xids arrays
835 for (j = proc->subxids.nxids - 1; j >= 0; j--)
837 /* Fetch xid just once - see GetNewTransactionId */
838 TransactionId cxid = proc->subxids.xids[j];
840 if (TransactionIdEquals(cxid, xid))
842 LWLockRelease(ProcArrayLock);
843 xc_by_child_xid_inc();
849 * Save the main Xid for step 4. We only need to remember main Xids
850 * that have uncached children. (Note: there is no race condition
851 * here because the overflowed flag cannot be cleared, only set, while
852 * we hold ProcArrayLock. So we can't miss an Xid that we need to
855 if (proc->subxids.overflowed)
856 xids[nxids++] = pxid;
860 * Step 3: in hot standby mode, check the known-assigned-xids list. XIDs
861 * in the list must be treated as running.
863 if (RecoveryInProgress())
865 /* none of the PGPROC entries should have XIDs in hot standby mode */
868 if (KnownAssignedXidExists(xid))
870 LWLockRelease(ProcArrayLock);
871 xc_by_known_assigned_inc();
876 * If the KnownAssignedXids overflowed, we have to check pg_subtrans
877 * too. Fetch all xids from KnownAssignedXids that are lower than
878 * xid, since if xid is a subtransaction its parent will always have a
879 * lower value. Note we will collect both main and subXIDs here, but
880 * there's no help for it.
882 if (TransactionIdPrecedesOrEquals(xid, procArray->lastOverflowedXid))
883 nxids = KnownAssignedXidsGet(xids, xid);
886 LWLockRelease(ProcArrayLock);
889 * If none of the relevant caches overflowed, we know the Xid is not
890 * running without even looking at pg_subtrans.
894 xc_no_overflow_inc();
899 * Step 4: have to check pg_subtrans.
901 * At this point, we know it's either a subtransaction of one of the Xids
902 * in xids[], or it's not running. If it's an already-failed
903 * subtransaction, we want to say "not running" even though its parent may
904 * still be running. So first, check pg_clog to see if it's been aborted.
906 xc_slow_answer_inc();
908 if (TransactionIdDidAbort(xid))
912 * It isn't aborted, so check whether the transaction tree it belongs to
913 * is still running (or, more precisely, whether it was running when we
914 * held ProcArrayLock).
916 topxid = SubTransGetTopmostTransaction(xid);
917 Assert(TransactionIdIsValid(topxid));
918 if (!TransactionIdEquals(topxid, xid))
920 for (i = 0; i < nxids; i++)
922 if (TransactionIdEquals(xids[i], topxid))
931 * TransactionIdIsActive -- is xid the top-level XID of an active backend?
933 * This differs from TransactionIdIsInProgress in that it ignores prepared
934 * transactions, as well as transactions running on the master if we're in
935 * hot standby. Also, we ignore subtransactions since that's not needed
939 TransactionIdIsActive(TransactionId xid)
942 ProcArrayStruct *arrayP = procArray;
946 * Don't bother checking a transaction older than RecentXmin; it could not
947 * possibly still be running.
949 if (TransactionIdPrecedes(xid, RecentXmin))
952 LWLockAcquire(ProcArrayLock, LW_SHARED);
954 for (i = 0; i < arrayP->numProcs; i++)
956 volatile PGPROC *proc = arrayP->procs[i];
958 /* Fetch xid just once - see GetNewTransactionId */
959 TransactionId pxid = proc->xid;
961 if (!TransactionIdIsValid(pxid))
965 continue; /* ignore prepared transactions */
967 if (TransactionIdEquals(pxid, xid))
974 LWLockRelease(ProcArrayLock);
981 * GetOldestXmin -- returns oldest transaction that was running
982 * when any current transaction was started.
984 * If allDbs is TRUE then all backends are considered; if allDbs is FALSE
985 * then only backends running in my own database are considered.
987 * If ignoreVacuum is TRUE then backends with the PROC_IN_VACUUM flag set are
990 * This is used by VACUUM to decide which deleted tuples must be preserved
991 * in a table. allDbs = TRUE is needed for shared relations, but allDbs =
992 * FALSE is sufficient for non-shared relations, since only backends in my
993 * own database could ever see the tuples in them. Also, we can ignore
994 * concurrently running lazy VACUUMs because (a) they must be working on other
995 * tables, and (b) they don't need to do snapshot-based lookups.
997 * This is also used to determine where to truncate pg_subtrans. allDbs
998 * must be TRUE for that case, and ignoreVacuum FALSE.
1000 * Note: it's possible for the calculated value to move backwards on repeated
1001 * calls. The calculated value is conservative, so that anything older is
1002 * definitely not considered as running by anyone anymore, but the exact
1003 * value calculated depends on a number of things. For example, if allDbs is
1004 * TRUE and there are no transactions running in the current database,
1005 * GetOldestXmin() returns latestCompletedXid. If a transaction begins after
1006 * that, its xmin will include in-progress transactions in other databases
1007 * that started earlier, so another call will return an lower value. The
1008 * return value is also adjusted with vacuum_defer_cleanup_age, so increasing
1009 * that setting on the fly is an easy way to have GetOldestXmin() move
1012 * Note: we include all currently running xids in the set of considered xids.
1013 * This ensures that if a just-started xact has not yet set its snapshot,
1014 * when it does set the snapshot it cannot set xmin less than what we compute.
1015 * See notes in src/backend/access/transam/README.
1018 GetOldestXmin(bool allDbs, bool ignoreVacuum)
1020 ProcArrayStruct *arrayP = procArray;
1021 TransactionId result;
1024 /* Cannot look for individual databases during recovery */
1025 Assert(allDbs || !RecoveryInProgress());
1027 LWLockAcquire(ProcArrayLock, LW_SHARED);
1030 * We initialize the MIN() calculation with latestCompletedXid + 1. This
1031 * is a lower bound for the XIDs that might appear in the ProcArray later,
1032 * and so protects us against overestimating the result due to future
1035 result = ShmemVariableCache->latestCompletedXid;
1036 Assert(TransactionIdIsNormal(result));
1037 TransactionIdAdvance(result);
1039 for (index = 0; index < arrayP->numProcs; index++)
1041 volatile PGPROC *proc = arrayP->procs[index];
1043 if (ignoreVacuum && (proc->vacuumFlags & PROC_IN_VACUUM))
1047 proc->databaseId == MyDatabaseId ||
1048 proc->databaseId == 0) /* include WalSender */
1050 /* Fetch xid just once - see GetNewTransactionId */
1051 TransactionId xid = proc->xid;
1053 /* First consider the transaction's own Xid, if any */
1054 if (TransactionIdIsNormal(xid) &&
1055 TransactionIdPrecedes(xid, result))
1059 * Also consider the transaction's Xmin, if set.
1061 * We must check both Xid and Xmin because a transaction might
1062 * have an Xmin but not (yet) an Xid; conversely, if it has an
1063 * Xid, that could determine some not-yet-set Xmin.
1065 xid = proc->xmin; /* Fetch just once */
1066 if (TransactionIdIsNormal(xid) &&
1067 TransactionIdPrecedes(xid, result))
1072 if (RecoveryInProgress())
1075 * Check to see whether KnownAssignedXids contains an xid value older
1076 * than the main procarray.
1078 TransactionId kaxmin = KnownAssignedXidsGetOldestXmin();
1080 LWLockRelease(ProcArrayLock);
1082 if (TransactionIdIsNormal(kaxmin) &&
1083 TransactionIdPrecedes(kaxmin, result))
1089 * No other information needed, so release the lock immediately.
1091 LWLockRelease(ProcArrayLock);
1094 * Compute the cutoff XID, being careful not to generate a "permanent"
1095 * XID. We need do this only on the primary, never on standby.
1097 * vacuum_defer_cleanup_age provides some additional "slop" for the
1098 * benefit of hot standby queries on slave servers. This is quick and
1099 * dirty, and perhaps not all that useful unless the master has a
1100 * predictable transaction rate, but it's what we've got. Note that
1101 * we are assuming vacuum_defer_cleanup_age isn't large enough to
1102 * cause wraparound --- so guc.c should limit it to no more than the
1103 * xidStopLimit threshold in varsup.c.
1105 result -= vacuum_defer_cleanup_age;
1106 if (!TransactionIdIsNormal(result))
1107 result = FirstNormalTransactionId;
1114 * GetSnapshotData -- returns information about running transactions.
1116 * The returned snapshot includes xmin (lowest still-running xact ID),
1117 * xmax (highest completed xact ID + 1), and a list of running xact IDs
1118 * in the range xmin <= xid < xmax. It is used as follows:
1119 * All xact IDs < xmin are considered finished.
1120 * All xact IDs >= xmax are considered still running.
1121 * For an xact ID xmin <= xid < xmax, consult list to see whether
1122 * it is considered running or not.
1123 * This ensures that the set of transactions seen as "running" by the
1124 * current xact will not change after it takes the snapshot.
1126 * All running top-level XIDs are included in the snapshot, except for lazy
1127 * VACUUM processes. We also try to include running subtransaction XIDs,
1128 * but since PGPROC has only a limited cache area for subxact XIDs, full
1129 * information may not be available. If we find any overflowed subxid arrays,
1130 * we have to mark the snapshot's subxid data as overflowed, and extra work
1131 * *may* need to be done to determine what's running (see XidInMVCCSnapshot()
1134 * We also update the following backend-global variables:
1135 * TransactionXmin: the oldest xmin of any snapshot in use in the
1136 * current transaction (this is the same as MyProc->xmin).
1137 * RecentXmin: the xmin computed for the most recent snapshot. XIDs
1138 * older than this are known not running any more.
1139 * RecentGlobalXmin: the global xmin (oldest TransactionXmin across all
1140 * running transactions, except those running LAZY VACUUM). This is
1141 * the same computation done by GetOldestXmin(true, true).
1143 * Note: this function should probably not be called with an argument that's
1144 * not statically allocated (see xip allocation below).
1147 GetSnapshotData(Snapshot snapshot)
1149 ProcArrayStruct *arrayP = procArray;
1152 TransactionId globalxmin;
1156 bool suboverflowed = false;
1158 Assert(snapshot != NULL);
1161 * Allocating space for maxProcs xids is usually overkill; numProcs would
1162 * be sufficient. But it seems better to do the malloc while not holding
1163 * the lock, so we can't look at numProcs. Likewise, we allocate much
1164 * more subxip storage than is probably needed.
1166 * This does open a possibility for avoiding repeated malloc/free: since
1167 * maxProcs does not change at runtime, we can simply reuse the previous
1168 * xip arrays if any. (This relies on the fact that all callers pass
1169 * static SnapshotData structs.)
1171 if (snapshot->xip == NULL)
1174 * First call for this snapshot. Snapshot is same size whether or not
1175 * we are in recovery, see later comments.
1177 snapshot->xip = (TransactionId *)
1178 malloc(arrayP->maxProcs * sizeof(TransactionId));
1179 if (snapshot->xip == NULL)
1181 (errcode(ERRCODE_OUT_OF_MEMORY),
1182 errmsg("out of memory")));
1183 Assert(snapshot->subxip == NULL);
1184 snapshot->subxip = (TransactionId *)
1185 malloc(TOTAL_MAX_CACHED_SUBXIDS * sizeof(TransactionId));
1186 if (snapshot->subxip == NULL)
1188 (errcode(ERRCODE_OUT_OF_MEMORY),
1189 errmsg("out of memory")));
1193 * It is sufficient to get shared lock on ProcArrayLock, even if we are
1194 * going to set MyProc->xmin.
1196 LWLockAcquire(ProcArrayLock, LW_SHARED);
1198 /* xmax is always latestCompletedXid + 1 */
1199 xmax = ShmemVariableCache->latestCompletedXid;
1200 Assert(TransactionIdIsNormal(xmax));
1201 TransactionIdAdvance(xmax);
1203 /* initialize xmin calculation with xmax */
1204 globalxmin = xmin = xmax;
1207 * If we're in recovery then snapshot data comes from a different place,
1208 * so decide which route we take before grab the lock. It is possible for
1209 * recovery to end before we finish taking snapshot, and for newly
1210 * assigned transaction ids to be added to the procarray. Xmax cannot
1211 * change while we hold ProcArrayLock, so those newly added transaction
1212 * ids would be filtered away, so we need not be concerned about them.
1214 snapshot->takenDuringRecovery = RecoveryInProgress();
1216 if (!snapshot->takenDuringRecovery)
1219 * Spin over procArray checking xid, xmin, and subxids. The goal is
1220 * to gather all active xids, find the lowest xmin, and try to record
1221 * subxids. During recovery no xids will be assigned, so all normal
1222 * backends can be ignored, nor are there any VACUUMs running. All
1223 * prepared transaction xids are held in KnownAssignedXids, so these
1224 * will be seen without needing to loop through procs here.
1226 for (index = 0; index < arrayP->numProcs; index++)
1228 volatile PGPROC *proc = arrayP->procs[index];
1231 /* Ignore procs running LAZY VACUUM */
1232 if (proc->vacuumFlags & PROC_IN_VACUUM)
1235 /* Update globalxmin to be the smallest valid xmin */
1236 xid = proc->xmin; /* fetch just once */
1237 if (TransactionIdIsNormal(xid) &&
1238 TransactionIdPrecedes(xid, globalxmin))
1241 /* Fetch xid just once - see GetNewTransactionId */
1245 * If the transaction has been assigned an xid < xmax we add it to
1246 * the snapshot, and update xmin if necessary. There's no need to
1247 * store XIDs >= xmax, since we'll treat them as running anyway.
1248 * We don't bother to examine their subxids either.
1250 * We don't include our own XID (if any) in the snapshot, but we
1251 * must include it into xmin.
1253 if (TransactionIdIsNormal(xid))
1255 if (TransactionIdFollowsOrEquals(xid, xmax))
1258 snapshot->xip[count++] = xid;
1259 if (TransactionIdPrecedes(xid, xmin))
1264 * Save subtransaction XIDs if possible (if we've already
1265 * overflowed, there's no point). Note that the subxact XIDs must
1266 * be later than their parent, so no need to check them against
1267 * xmin. We could filter against xmax, but it seems better not to
1268 * do that much work while holding the ProcArrayLock.
1270 * The other backend can add more subxids concurrently, but cannot
1271 * remove any. Hence it's important to fetch nxids just once.
1272 * Should be safe to use memcpy, though. (We needn't worry about
1273 * missing any xids added concurrently, because they must postdate
1276 * Again, our own XIDs are not included in the snapshot.
1278 if (!suboverflowed && proc != MyProc)
1280 if (proc->subxids.overflowed)
1281 suboverflowed = true;
1284 int nxids = proc->subxids.nxids;
1288 memcpy(snapshot->subxip + subcount,
1289 (void *) proc->subxids.xids,
1290 nxids * sizeof(TransactionId));
1300 * We're in hot standby, so get XIDs from KnownAssignedXids.
1302 * We store all xids directly into subxip[]. Here's why:
1304 * In recovery we don't know which xids are top-level and which are
1305 * subxacts, a design choice that greatly simplifies xid processing.
1307 * It seems like we would want to try to put xids into xip[] only, but
1308 * that is fairly small. We would either need to make that bigger or
1309 * to increase the rate at which we WAL-log xid assignment; neither is
1310 * an appealing choice.
1312 * We could try to store xids into xip[] first and then into subxip[]
1313 * if there are too many xids. That only works if the snapshot doesn't
1314 * overflow because we do not search subxip[] in that case. A simpler
1315 * way is to just store all xids in the subxact array because this is
1316 * by far the bigger array. We just leave the xip array empty.
1318 * Either way we need to change the way XidInMVCCSnapshot() works
1319 * depending upon when the snapshot was taken, or change normal
1320 * snapshot processing so it matches.
1322 subcount = KnownAssignedXidsGetAndSetXmin(snapshot->subxip, &xmin,
1325 if (TransactionIdPrecedesOrEquals(xmin, procArray->lastOverflowedXid))
1326 suboverflowed = true;
1329 if (!TransactionIdIsValid(MyProc->xmin))
1330 MyProc->xmin = TransactionXmin = xmin;
1332 LWLockRelease(ProcArrayLock);
1335 * Update globalxmin to include actual process xids. This is a slightly
1336 * different way of computing it than GetOldestXmin uses, but should give
1339 if (TransactionIdPrecedes(xmin, globalxmin))
1342 /* Update global variables too */
1343 RecentGlobalXmin = globalxmin - vacuum_defer_cleanup_age;
1344 if (!TransactionIdIsNormal(RecentGlobalXmin))
1345 RecentGlobalXmin = FirstNormalTransactionId;
1348 snapshot->xmin = xmin;
1349 snapshot->xmax = xmax;
1350 snapshot->xcnt = count;
1351 snapshot->subxcnt = subcount;
1352 snapshot->suboverflowed = suboverflowed;
1354 snapshot->curcid = GetCurrentCommandId(false);
1357 * This is a new snapshot, so set both refcounts are zero, and mark it as
1358 * not copied in persistent memory.
1360 snapshot->active_count = 0;
1361 snapshot->regd_count = 0;
1362 snapshot->copied = false;
1368 * GetRunningTransactionData -- returns information about running transactions.
1370 * Similar to GetSnapshotData but returns more information. We include
1371 * all PGPROCs with an assigned TransactionId, even VACUUM processes.
1373 * We acquire XidGenLock, but the caller is responsible for releasing it.
1374 * This ensures that no new XIDs enter the proc array until the caller has
1375 * WAL-logged this snapshot, and releases the lock.
1377 * The returned data structure is statically allocated; caller should not
1378 * modify it, and must not assume it is valid past the next call.
1380 * This is never executed during recovery so there is no need to look at
1381 * KnownAssignedXids.
1383 * We don't worry about updating other counters, we want to keep this as
1384 * simple as possible and leave GetSnapshotData() as the primary code for
1388 GetRunningTransactionData(void)
1390 /* result workspace */
1391 static RunningTransactionsData CurrentRunningXactsData;
1393 ProcArrayStruct *arrayP = procArray;
1394 RunningTransactions CurrentRunningXacts = &CurrentRunningXactsData;
1395 TransactionId latestCompletedXid;
1396 TransactionId oldestRunningXid;
1397 TransactionId *xids;
1403 Assert(!RecoveryInProgress());
1406 * Allocating space for maxProcs xids is usually overkill; numProcs would
1407 * be sufficient. But it seems better to do the malloc while not holding
1408 * the lock, so we can't look at numProcs. Likewise, we allocate much
1409 * more subxip storage than is probably needed.
1411 * Should only be allocated in bgwriter, since only ever executed during
1414 if (CurrentRunningXacts->xids == NULL)
1419 CurrentRunningXacts->xids = (TransactionId *)
1420 malloc(TOTAL_MAX_CACHED_SUBXIDS * sizeof(TransactionId));
1421 if (CurrentRunningXacts->xids == NULL)
1423 (errcode(ERRCODE_OUT_OF_MEMORY),
1424 errmsg("out of memory")));
1427 xids = CurrentRunningXacts->xids;
1429 count = subcount = 0;
1430 suboverflowed = false;
1433 * Ensure that no xids enter or leave the procarray while we obtain
1436 LWLockAcquire(ProcArrayLock, LW_SHARED);
1437 LWLockAcquire(XidGenLock, LW_SHARED);
1439 latestCompletedXid = ShmemVariableCache->latestCompletedXid;
1441 oldestRunningXid = ShmemVariableCache->nextXid;
1444 * Spin over procArray collecting all xids and subxids.
1446 for (index = 0; index < arrayP->numProcs; index++)
1448 volatile PGPROC *proc = arrayP->procs[index];
1452 /* Fetch xid just once - see GetNewTransactionId */
1456 * We don't need to store transactions that don't have a TransactionId
1457 * yet because they will not show as running on a standby server.
1459 if (!TransactionIdIsValid(xid))
1462 xids[count++] = xid;
1464 if (TransactionIdPrecedes(xid, oldestRunningXid))
1465 oldestRunningXid = xid;
1468 * Save subtransaction XIDs. Other backends can't add or remove
1469 * entries while we're holding XidGenLock.
1471 nxids = proc->subxids.nxids;
1474 memcpy(&xids[count], (void *) proc->subxids.xids,
1475 nxids * sizeof(TransactionId));
1479 if (proc->subxids.overflowed)
1480 suboverflowed = true;
1483 * Top-level XID of a transaction is always less than any of its
1484 * subxids, so we don't need to check if any of the subxids are
1485 * smaller than oldestRunningXid
1490 CurrentRunningXacts->xcnt = count;
1491 CurrentRunningXacts->subxid_overflow = suboverflowed;
1492 CurrentRunningXacts->nextXid = ShmemVariableCache->nextXid;
1493 CurrentRunningXacts->oldestRunningXid = oldestRunningXid;
1494 CurrentRunningXacts->latestCompletedXid = latestCompletedXid;
1496 /* We don't release XidGenLock here, the caller is responsible for that */
1497 LWLockRelease(ProcArrayLock);
1499 Assert(TransactionIdIsValid(CurrentRunningXacts->nextXid));
1500 Assert(TransactionIdIsValid(CurrentRunningXacts->oldestRunningXid));
1501 Assert(TransactionIdIsNormal(CurrentRunningXacts->latestCompletedXid));
1503 return CurrentRunningXacts;
1507 * GetTransactionsInCommit -- Get the XIDs of transactions that are committing
1509 * Constructs an array of XIDs of transactions that are currently in commit
1510 * critical sections, as shown by having inCommit set in their PGPROC entries.
1512 * *xids_p is set to a palloc'd array that should be freed by the caller.
1513 * The return value is the number of valid entries.
1515 * Note that because backends set or clear inCommit without holding any lock,
1516 * the result is somewhat indeterminate, but we don't really care. Even in
1517 * a multiprocessor with delayed writes to shared memory, it should be certain
1518 * that setting of inCommit will propagate to shared memory when the backend
1519 * takes the WALInsertLock, so we cannot fail to see an xact as inCommit if
1520 * it's already inserted its commit record. Whether it takes a little while
1521 * for clearing of inCommit to propagate is unimportant for correctness.
1524 GetTransactionsInCommit(TransactionId **xids_p)
1526 ProcArrayStruct *arrayP = procArray;
1527 TransactionId *xids;
1531 xids = (TransactionId *) palloc(arrayP->maxProcs * sizeof(TransactionId));
1534 LWLockAcquire(ProcArrayLock, LW_SHARED);
1536 for (index = 0; index < arrayP->numProcs; index++)
1538 volatile PGPROC *proc = arrayP->procs[index];
1540 /* Fetch xid just once - see GetNewTransactionId */
1541 TransactionId pxid = proc->xid;
1543 if (proc->inCommit && TransactionIdIsValid(pxid))
1544 xids[nxids++] = pxid;
1547 LWLockRelease(ProcArrayLock);
1554 * HaveTransactionsInCommit -- Are any of the specified XIDs in commit?
1556 * This is used with the results of GetTransactionsInCommit to see if any
1557 * of the specified XIDs are still in their commit critical sections.
1559 * Note: this is O(N^2) in the number of xacts that are/were in commit, but
1560 * those numbers should be small enough for it not to be a problem.
1563 HaveTransactionsInCommit(TransactionId *xids, int nxids)
1565 bool result = false;
1566 ProcArrayStruct *arrayP = procArray;
1569 LWLockAcquire(ProcArrayLock, LW_SHARED);
1571 for (index = 0; index < arrayP->numProcs; index++)
1573 volatile PGPROC *proc = arrayP->procs[index];
1575 /* Fetch xid just once - see GetNewTransactionId */
1576 TransactionId pxid = proc->xid;
1578 if (proc->inCommit && TransactionIdIsValid(pxid))
1582 for (i = 0; i < nxids; i++)
1584 if (xids[i] == pxid)
1595 LWLockRelease(ProcArrayLock);
1601 * BackendPidGetProc -- get a backend's PGPROC given its PID
1603 * Returns NULL if not found. Note that it is up to the caller to be
1604 * sure that the question remains meaningful for long enough for the
1605 * answer to be used ...
1608 BackendPidGetProc(int pid)
1610 PGPROC *result = NULL;
1611 ProcArrayStruct *arrayP = procArray;
1614 if (pid == 0) /* never match dummy PGPROCs */
1617 LWLockAcquire(ProcArrayLock, LW_SHARED);
1619 for (index = 0; index < arrayP->numProcs; index++)
1621 PGPROC *proc = arrayP->procs[index];
1623 if (proc->pid == pid)
1630 LWLockRelease(ProcArrayLock);
1636 * BackendXidGetPid -- get a backend's pid given its XID
1638 * Returns 0 if not found or it's a prepared transaction. Note that
1639 * it is up to the caller to be sure that the question remains
1640 * meaningful for long enough for the answer to be used ...
1642 * Only main transaction Ids are considered. This function is mainly
1643 * useful for determining what backend owns a lock.
1645 * Beware that not every xact has an XID assigned. However, as long as you
1646 * only call this using an XID found on disk, you're safe.
1649 BackendXidGetPid(TransactionId xid)
1652 ProcArrayStruct *arrayP = procArray;
1655 if (xid == InvalidTransactionId) /* never match invalid xid */
1658 LWLockAcquire(ProcArrayLock, LW_SHARED);
1660 for (index = 0; index < arrayP->numProcs; index++)
1662 volatile PGPROC *proc = arrayP->procs[index];
1664 if (proc->xid == xid)
1671 LWLockRelease(ProcArrayLock);
1677 * IsBackendPid -- is a given pid a running backend
1680 IsBackendPid(int pid)
1682 return (BackendPidGetProc(pid) != NULL);
1687 * GetCurrentVirtualXIDs -- returns an array of currently active VXIDs.
1689 * The array is palloc'd. The number of valid entries is returned into *nvxids.
1691 * The arguments allow filtering the set of VXIDs returned. Our own process
1692 * is always skipped. In addition:
1693 * If limitXmin is not InvalidTransactionId, skip processes with
1695 * If excludeXmin0 is true, skip processes with xmin = 0.
1696 * If allDbs is false, skip processes attached to other databases.
1697 * If excludeVacuum isn't zero, skip processes for which
1698 * (vacuumFlags & excludeVacuum) is not zero.
1700 * Note: the purpose of the limitXmin and excludeXmin0 parameters is to
1701 * allow skipping backends whose oldest live snapshot is no older than
1702 * some snapshot we have. Since we examine the procarray with only shared
1703 * lock, there are race conditions: a backend could set its xmin just after
1704 * we look. Indeed, on multiprocessors with weak memory ordering, the
1705 * other backend could have set its xmin *before* we look. We know however
1706 * that such a backend must have held shared ProcArrayLock overlapping our
1707 * own hold of ProcArrayLock, else we would see its xmin update. Therefore,
1708 * any snapshot the other backend is taking concurrently with our scan cannot
1709 * consider any transactions as still running that we think are committed
1710 * (since backends must hold ProcArrayLock exclusive to commit).
1712 VirtualTransactionId *
1713 GetCurrentVirtualXIDs(TransactionId limitXmin, bool excludeXmin0,
1714 bool allDbs, int excludeVacuum,
1717 VirtualTransactionId *vxids;
1718 ProcArrayStruct *arrayP = procArray;
1722 /* allocate what's certainly enough result space */
1723 vxids = (VirtualTransactionId *)
1724 palloc(sizeof(VirtualTransactionId) * arrayP->maxProcs);
1726 LWLockAcquire(ProcArrayLock, LW_SHARED);
1728 for (index = 0; index < arrayP->numProcs; index++)
1730 volatile PGPROC *proc = arrayP->procs[index];
1735 if (excludeVacuum & proc->vacuumFlags)
1738 if (allDbs || proc->databaseId == MyDatabaseId)
1740 /* Fetch xmin just once - might change on us */
1741 TransactionId pxmin = proc->xmin;
1743 if (excludeXmin0 && !TransactionIdIsValid(pxmin))
1747 * InvalidTransactionId precedes all other XIDs, so a proc that
1748 * hasn't set xmin yet will not be rejected by this test.
1750 if (!TransactionIdIsValid(limitXmin) ||
1751 TransactionIdPrecedesOrEquals(pxmin, limitXmin))
1753 VirtualTransactionId vxid;
1755 GET_VXID_FROM_PGPROC(vxid, *proc);
1756 if (VirtualTransactionIdIsValid(vxid))
1757 vxids[count++] = vxid;
1762 LWLockRelease(ProcArrayLock);
1769 * GetConflictingVirtualXIDs -- returns an array of currently active VXIDs.
1771 * Usage is limited to conflict resolution during recovery on standby servers.
1772 * limitXmin is supplied as either latestRemovedXid, or InvalidTransactionId
1773 * in cases where we cannot accurately determine a value for latestRemovedXid.
1775 * If limitXmin is InvalidTransactionId then we want to kill everybody,
1776 * so we're not worried if they have a snapshot or not, nor does it really
1777 * matter what type of lock we hold.
1779 * All callers that are checking xmins always now supply a valid and useful
1780 * value for limitXmin. The limitXmin is always lower than the lowest
1781 * numbered KnownAssignedXid that is not already a FATAL error. This is
1782 * because we only care about cleanup records that are cleaning up tuple
1783 * versions from committed transactions. In that case they will only occur
1784 * at the point where the record is less than the lowest running xid. That
1785 * allows us to say that if any backend takes a snapshot concurrently with
1786 * us then the conflict assessment made here would never include the snapshot
1787 * that is being derived. So we take LW_SHARED on the ProcArray and allow
1788 * concurrent snapshots when limitXmin is valid. We might think about adding
1789 * Assert(limitXmin < lowest(KnownAssignedXids))
1790 * but that would not be true in the case of FATAL errors lagging in array,
1791 * but we already know those are bogus anyway, so we skip that test.
1793 * If dbOid is valid we skip backends attached to other databases.
1795 * Be careful to *not* pfree the result from this function. We reuse
1796 * this array sufficiently often that we use malloc for the result.
1798 VirtualTransactionId *
1799 GetConflictingVirtualXIDs(TransactionId limitXmin, Oid dbOid)
1801 static VirtualTransactionId *vxids;
1802 ProcArrayStruct *arrayP = procArray;
1807 * If first time through, get workspace to remember main XIDs in. We
1808 * malloc it permanently to avoid repeated palloc/pfree overhead. Allow
1809 * result space, remembering room for a terminator.
1813 vxids = (VirtualTransactionId *)
1814 malloc(sizeof(VirtualTransactionId) * (arrayP->maxProcs + 1));
1817 (errcode(ERRCODE_OUT_OF_MEMORY),
1818 errmsg("out of memory")));
1821 LWLockAcquire(ProcArrayLock, LW_SHARED);
1823 for (index = 0; index < arrayP->numProcs; index++)
1825 volatile PGPROC *proc = arrayP->procs[index];
1827 /* Exclude prepared transactions */
1831 if (!OidIsValid(dbOid) ||
1832 proc->databaseId == dbOid)
1834 /* Fetch xmin just once - can't change on us, but good coding */
1835 TransactionId pxmin = proc->xmin;
1838 * We ignore an invalid pxmin because this means that backend has
1839 * no snapshot and cannot get another one while we hold exclusive
1842 if (!TransactionIdIsValid(limitXmin) ||
1843 (TransactionIdIsValid(pxmin) && !TransactionIdFollows(pxmin, limitXmin)))
1845 VirtualTransactionId vxid;
1847 GET_VXID_FROM_PGPROC(vxid, *proc);
1848 if (VirtualTransactionIdIsValid(vxid))
1849 vxids[count++] = vxid;
1854 LWLockRelease(ProcArrayLock);
1856 /* add the terminator */
1857 vxids[count].backendId = InvalidBackendId;
1858 vxids[count].localTransactionId = InvalidLocalTransactionId;
1864 * CancelVirtualTransaction - used in recovery conflict processing
1866 * Returns pid of the process signaled, or 0 if not found.
1869 CancelVirtualTransaction(VirtualTransactionId vxid, ProcSignalReason sigmode)
1871 ProcArrayStruct *arrayP = procArray;
1875 LWLockAcquire(ProcArrayLock, LW_SHARED);
1877 for (index = 0; index < arrayP->numProcs; index++)
1879 VirtualTransactionId procvxid;
1880 PGPROC *proc = arrayP->procs[index];
1882 GET_VXID_FROM_PGPROC(procvxid, *proc);
1884 if (procvxid.backendId == vxid.backendId &&
1885 procvxid.localTransactionId == vxid.localTransactionId)
1887 proc->recoveryConflictPending = true;
1892 * Kill the pid if it's still here. If not, that's what we
1893 * wanted so ignore any errors.
1895 (void) SendProcSignal(pid, sigmode, vxid.backendId);
1901 LWLockRelease(ProcArrayLock);
1907 * MinimumActiveBackends --- count backends (other than myself) that are
1908 * in active transactions. Return true if the count exceeds the
1909 * minimum threshold passed. This is used as a heuristic to decide if
1910 * a pre-XLOG-flush delay is worthwhile during commit.
1912 * Do not count backends that are blocked waiting for locks, since they are
1913 * not going to get to run until someone else commits.
1916 MinimumActiveBackends(int min)
1918 ProcArrayStruct *arrayP = procArray;
1922 /* Quick short-circuit if no minimum is specified */
1927 * Note: for speed, we don't acquire ProcArrayLock. This is a little bit
1928 * bogus, but since we are only testing fields for zero or nonzero, it
1929 * should be OK. The result is only used for heuristic purposes anyway...
1931 for (index = 0; index < arrayP->numProcs; index++)
1933 volatile PGPROC *proc = arrayP->procs[index];
1936 * Since we're not holding a lock, need to check that the pointer is
1937 * valid. Someone holding the lock could have incremented numProcs
1938 * already, but not yet inserted a valid pointer to the array.
1940 * If someone just decremented numProcs, 'proc' could also point to a
1941 * PGPROC entry that's no longer in the array. It still points to a
1942 * PGPROC struct, though, because freed PGPPROC entries just go to the
1943 * free list and are recycled. Its contents are nonsense in that case,
1944 * but that's acceptable for this function.
1950 continue; /* do not count myself */
1952 continue; /* do not count prepared xacts */
1953 if (proc->xid == InvalidTransactionId)
1954 continue; /* do not count if no XID assigned */
1955 if (proc->waitLock != NULL)
1956 continue; /* do not count if blocked on a lock */
1962 return count >= min;
1966 * CountDBBackends --- count backends that are using specified database
1969 CountDBBackends(Oid databaseid)
1971 ProcArrayStruct *arrayP = procArray;
1975 LWLockAcquire(ProcArrayLock, LW_SHARED);
1977 for (index = 0; index < arrayP->numProcs; index++)
1979 volatile PGPROC *proc = arrayP->procs[index];
1982 continue; /* do not count prepared xacts */
1983 if (!OidIsValid(databaseid) ||
1984 proc->databaseId == databaseid)
1988 LWLockRelease(ProcArrayLock);
1994 * CancelDBBackends --- cancel backends that are using specified database
1997 CancelDBBackends(Oid databaseid, ProcSignalReason sigmode, bool conflictPending)
1999 ProcArrayStruct *arrayP = procArray;
2003 /* tell all backends to die */
2004 LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);
2006 for (index = 0; index < arrayP->numProcs; index++)
2008 volatile PGPROC *proc = arrayP->procs[index];
2010 if (databaseid == InvalidOid || proc->databaseId == databaseid)
2012 VirtualTransactionId procvxid;
2014 GET_VXID_FROM_PGPROC(procvxid, *proc);
2016 proc->recoveryConflictPending = conflictPending;
2021 * Kill the pid if it's still here. If not, that's what we
2022 * wanted so ignore any errors.
2024 (void) SendProcSignal(pid, sigmode, procvxid.backendId);
2029 LWLockRelease(ProcArrayLock);
2033 * CountUserBackends --- count backends that are used by specified user
2036 CountUserBackends(Oid roleid)
2038 ProcArrayStruct *arrayP = procArray;
2042 LWLockAcquire(ProcArrayLock, LW_SHARED);
2044 for (index = 0; index < arrayP->numProcs; index++)
2046 volatile PGPROC *proc = arrayP->procs[index];
2049 continue; /* do not count prepared xacts */
2050 if (proc->roleId == roleid)
2054 LWLockRelease(ProcArrayLock);
2060 * CountOtherDBBackends -- check for other backends running in the given DB
2062 * If there are other backends in the DB, we will wait a maximum of 5 seconds
2063 * for them to exit. Autovacuum backends are encouraged to exit early by
2064 * sending them SIGTERM, but normal user backends are just waited for.
2066 * The current backend is always ignored; it is caller's responsibility to
2067 * check whether the current backend uses the given DB, if it's important.
2069 * Returns TRUE if there are (still) other backends in the DB, FALSE if not.
2070 * Also, *nbackends and *nprepared are set to the number of other backends
2071 * and prepared transactions in the DB, respectively.
2073 * This function is used to interlock DROP DATABASE and related commands
2074 * against there being any active backends in the target DB --- dropping the
2075 * DB while active backends remain would be a Bad Thing. Note that we cannot
2076 * detect here the possibility of a newly-started backend that is trying to
2077 * connect to the doomed database, so additional interlocking is needed during
2078 * backend startup. The caller should normally hold an exclusive lock on the
2079 * target DB before calling this, which is one reason we mustn't wait
2083 CountOtherDBBackends(Oid databaseId, int *nbackends, int *nprepared)
2085 ProcArrayStruct *arrayP = procArray;
2087 #define MAXAUTOVACPIDS 10 /* max autovacs to SIGTERM per iteration */
2088 int autovac_pids[MAXAUTOVACPIDS];
2091 /* 50 tries with 100ms sleep between tries makes 5 sec total wait */
2092 for (tries = 0; tries < 50; tries++)
2098 CHECK_FOR_INTERRUPTS();
2100 *nbackends = *nprepared = 0;
2102 LWLockAcquire(ProcArrayLock, LW_SHARED);
2104 for (index = 0; index < arrayP->numProcs; index++)
2106 volatile PGPROC *proc = arrayP->procs[index];
2108 if (proc->databaseId != databaseId)
2120 if ((proc->vacuumFlags & PROC_IS_AUTOVACUUM) &&
2121 nautovacs < MAXAUTOVACPIDS)
2122 autovac_pids[nautovacs++] = proc->pid;
2126 LWLockRelease(ProcArrayLock);
2129 return false; /* no conflicting backends, so done */
2132 * Send SIGTERM to any conflicting autovacuums before sleeping. We
2133 * postpone this step until after the loop because we don't want to
2134 * hold ProcArrayLock while issuing kill(). We have no idea what might
2135 * block kill() inside the kernel...
2137 for (index = 0; index < nautovacs; index++)
2138 (void) kill(autovac_pids[index], SIGTERM); /* ignore any error */
2140 /* sleep, then try again */
2141 pg_usleep(100 * 1000L); /* 100ms */
2144 return true; /* timed out, still conflicts */
2148 #define XidCacheRemove(i) \
2150 MyProc->subxids.xids[i] = MyProc->subxids.xids[MyProc->subxids.nxids - 1]; \
2151 MyProc->subxids.nxids--; \
2155 * XidCacheRemoveRunningXids
2157 * Remove a bunch of TransactionIds from the list of known-running
2158 * subtransactions for my backend. Both the specified xid and those in
2159 * the xids[] array (of length nxids) are removed from the subxids cache.
2160 * latestXid must be the latest XID among the group.
2163 XidCacheRemoveRunningXids(TransactionId xid,
2164 int nxids, const TransactionId *xids,
2165 TransactionId latestXid)
2170 Assert(TransactionIdIsValid(xid));
2173 * We must hold ProcArrayLock exclusively in order to remove transactions
2174 * from the PGPROC array. (See src/backend/access/transam/README.) It's
2175 * possible this could be relaxed since we know this routine is only used
2176 * to abort subtransactions, but pending closer analysis we'd best be
2179 LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);
2182 * Under normal circumstances xid and xids[] will be in increasing order,
2183 * as will be the entries in subxids. Scan backwards to avoid O(N^2)
2184 * behavior when removing a lot of xids.
2186 for (i = nxids - 1; i >= 0; i--)
2188 TransactionId anxid = xids[i];
2190 for (j = MyProc->subxids.nxids - 1; j >= 0; j--)
2192 if (TransactionIdEquals(MyProc->subxids.xids[j], anxid))
2200 * Ordinarily we should have found it, unless the cache has
2201 * overflowed. However it's also possible for this routine to be
2202 * invoked multiple times for the same subtransaction, in case of an
2203 * error during AbortSubTransaction. So instead of Assert, emit a
2206 if (j < 0 && !MyProc->subxids.overflowed)
2207 elog(WARNING, "did not find subXID %u in MyProc", anxid);
2210 for (j = MyProc->subxids.nxids - 1; j >= 0; j--)
2212 if (TransactionIdEquals(MyProc->subxids.xids[j], xid))
2218 /* Ordinarily we should have found it, unless the cache has overflowed */
2219 if (j < 0 && !MyProc->subxids.overflowed)
2220 elog(WARNING, "did not find subXID %u in MyProc", xid);
2222 /* Also advance global latestCompletedXid while holding the lock */
2223 if (TransactionIdPrecedes(ShmemVariableCache->latestCompletedXid,
2225 ShmemVariableCache->latestCompletedXid = latestXid;
2227 LWLockRelease(ProcArrayLock);
2230 #ifdef XIDCACHE_DEBUG
2233 * Print stats about effectiveness of XID cache
2236 DisplayXidCache(void)
2239 "XidCache: xmin: %ld, known: %ld, myxact: %ld, latest: %ld, mainxid: %ld, childxid: %ld, knownassigned: %ld, nooflo: %ld, slow: %ld\n",
2246 xc_by_known_assigned,
2250 #endif /* XIDCACHE_DEBUG */
2253 /* ----------------------------------------------
2254 * KnownAssignedTransactions sub-module
2255 * ----------------------------------------------
2259 * In Hot Standby mode, we maintain a list of transactions that are (or were)
2260 * running in the master at the current point in WAL. These XIDs must be
2261 * treated as running by standby transactions, even though they are not in
2262 * the standby server's PGPROC array.
2264 * We record all XIDs that we know have been assigned. That includes all the
2265 * XIDs seen in WAL records, plus all unobserved XIDs that we can deduce have
2266 * been assigned. We can deduce the existence of unobserved XIDs because we
2267 * know XIDs are assigned in sequence, with no gaps. The KnownAssignedXids
2268 * list expands as new XIDs are observed or inferred, and contracts when
2269 * transaction completion records arrive.
2271 * During hot standby we do not fret too much about the distinction between
2272 * top-level XIDs and subtransaction XIDs. We store both together in the
2273 * KnownAssignedXids list. In backends, this is copied into snapshots in
2274 * GetSnapshotData(), taking advantage of the fact that XidInMVCCSnapshot()
2275 * doesn't care about the distinction either. Subtransaction XIDs are
2276 * effectively treated as top-level XIDs and in the typical case pg_subtrans
2277 * links are *not* maintained (which does not affect visibility).
2279 * We have room in KnownAssignedXids and in snapshots to hold maxProcs *
2280 * (1 + PGPROC_MAX_CACHED_SUBXIDS) XIDs, so every master transaction must
2281 * report its subtransaction XIDs in a WAL XLOG_XACT_ASSIGNMENT record at
2282 * least every PGPROC_MAX_CACHED_SUBXIDS. When we receive one of these
2283 * records, we mark the subXIDs as children of the top XID in pg_subtrans,
2284 * and then remove them from KnownAssignedXids. This prevents overflow of
2285 * KnownAssignedXids and snapshots, at the cost that status checks for these
2286 * subXIDs will take a slower path through TransactionIdIsInProgress().
2287 * This means that KnownAssignedXids is not necessarily complete for subXIDs,
2288 * though it should be complete for top-level XIDs; this is the same situation
2289 * that holds with respect to the PGPROC entries in normal running.
2291 * When we throw away subXIDs from KnownAssignedXids, we need to keep track of
2292 * that, similarly to tracking overflow of a PGPROC's subxids array. We do
2293 * that by remembering the lastOverflowedXID, ie the last thrown-away subXID.
2294 * As long as that is within the range of interesting XIDs, we have to assume
2295 * that subXIDs are missing from snapshots. (Note that subXID overflow occurs
2296 * on primary when 65th subXID arrives, whereas on standby it occurs when 64th
2297 * subXID arrives - that is not an error.)
2299 * Should a backend on primary somehow disappear before it can write an abort
2300 * record, then we just leave those XIDs in KnownAssignedXids. They actually
2301 * aborted but we think they were running; the distinction is irrelevant
2302 * because either way any changes done by the transaction are not visible to
2303 * backends in the standby. We prune KnownAssignedXids when
2304 * XLOG_RUNNING_XACTS arrives, to forestall possible overflow of the
2305 * array due to such dead XIDs.
2309 * RecordKnownAssignedTransactionIds
2310 * Record the given XID in KnownAssignedXids, as well as any preceding
2313 * RecordKnownAssignedTransactionIds() should be run for *every* WAL record
2314 * associated with a transaction. Must be called for each record after we
2315 * have executed StartupCLOG() et al, since we must ExtendCLOG() etc..
2317 * Called during recovery in analogy with and in place of GetNewTransactionId()
2320 RecordKnownAssignedTransactionIds(TransactionId xid)
2322 Assert(standbyState >= STANDBY_INITIALIZED);
2323 Assert(TransactionIdIsValid(xid));
2325 elog(trace_recovery(DEBUG4), "record known xact %u latestObservedXid %u",
2326 xid, latestObservedXid);
2329 * If the KnownAssignedXids machinery isn't up yet, do nothing.
2331 if (standbyState <= STANDBY_INITIALIZED)
2334 Assert(TransactionIdIsValid(latestObservedXid));
2337 * When a newly observed xid arrives, it is frequently the case that it is
2338 * *not* the next xid in sequence. When this occurs, we must treat the
2339 * intervening xids as running also.
2341 if (TransactionIdFollows(xid, latestObservedXid))
2343 TransactionId next_expected_xid;
2346 * Extend clog and subtrans like we do in GetNewTransactionId() during
2347 * normal operation using individual extend steps. Typical case
2348 * requires almost no activity.
2350 next_expected_xid = latestObservedXid;
2351 TransactionIdAdvance(next_expected_xid);
2352 while (TransactionIdPrecedesOrEquals(next_expected_xid, xid))
2354 ExtendCLOG(next_expected_xid);
2355 ExtendSUBTRANS(next_expected_xid);
2357 TransactionIdAdvance(next_expected_xid);
2361 * Add the new xids onto the KnownAssignedXids array.
2363 next_expected_xid = latestObservedXid;
2364 TransactionIdAdvance(next_expected_xid);
2365 KnownAssignedXidsAdd(next_expected_xid, xid, false);
2368 * Now we can advance latestObservedXid
2370 latestObservedXid = xid;
2372 /* ShmemVariableCache->nextXid must be beyond any observed xid */
2373 next_expected_xid = latestObservedXid;
2374 TransactionIdAdvance(next_expected_xid);
2375 ShmemVariableCache->nextXid = next_expected_xid;
2380 * ExpireTreeKnownAssignedTransactionIds
2381 * Remove the given XIDs from KnownAssignedXids.
2383 * Called during recovery in analogy with and in place of ProcArrayEndTransaction()
2386 ExpireTreeKnownAssignedTransactionIds(TransactionId xid, int nsubxids,
2387 TransactionId *subxids, TransactionId max_xid)
2389 Assert(standbyState >= STANDBY_INITIALIZED);
2392 * Uses same locking as transaction commit
2394 LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);
2396 KnownAssignedXidsRemoveTree(xid, nsubxids, subxids);
2398 /* As in ProcArrayEndTransaction, advance latestCompletedXid */
2399 if (TransactionIdPrecedes(ShmemVariableCache->latestCompletedXid,
2401 ShmemVariableCache->latestCompletedXid = max_xid;
2403 LWLockRelease(ProcArrayLock);
2407 * ExpireAllKnownAssignedTransactionIds
2408 * Remove all entries in KnownAssignedXids
2411 ExpireAllKnownAssignedTransactionIds(void)
2413 LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);
2414 KnownAssignedXidsRemovePreceding(InvalidTransactionId);
2415 LWLockRelease(ProcArrayLock);
2419 * ExpireOldKnownAssignedTransactionIds
2420 * Remove KnownAssignedXids entries preceding the given XID
2423 ExpireOldKnownAssignedTransactionIds(TransactionId xid)
2425 LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);
2426 KnownAssignedXidsRemovePreceding(xid);
2427 LWLockRelease(ProcArrayLock);
2432 * Private module functions to manipulate KnownAssignedXids
2434 * There are 5 main uses of the KnownAssignedXids data structure:
2436 * * backends taking snapshots - all valid XIDs need to be copied out
2437 * * backends seeking to determine presence of a specific XID
2438 * * startup process adding new known-assigned XIDs
2439 * * startup process removing specific XIDs as transactions end
2440 * * startup process pruning array when special WAL records arrive
2442 * This data structure is known to be a hot spot during Hot Standby, so we
2443 * go to some lengths to make these operations as efficient and as concurrent
2446 * The XIDs are stored in an array in sorted order --- TransactionIdPrecedes
2447 * order, to be exact --- to allow binary search for specific XIDs. Note:
2448 * in general TransactionIdPrecedes would not provide a total order, but
2449 * we know that the entries present at any instant should not extend across
2450 * a large enough fraction of XID space to wrap around (the master would
2451 * shut down for fear of XID wrap long before that happens). So it's OK to
2452 * use TransactionIdPrecedes as a binary-search comparator.
2454 * It's cheap to maintain the sortedness during insertions, since new known
2455 * XIDs are always reported in XID order; we just append them at the right.
2457 * To keep individual deletions cheap, we need to allow gaps in the array.
2458 * This is implemented by marking array elements as valid or invalid using
2459 * the parallel boolean array KnownAssignedXidsValid[]. A deletion is done
2460 * by setting KnownAssignedXidsValid[i] to false, *without* clearing the
2461 * XID entry itself. This preserves the property that the XID entries are
2462 * sorted, so we can do binary searches easily. Periodically we compress
2463 * out the unused entries; that's much cheaper than having to compress the
2464 * array immediately on every deletion.
2466 * The actually valid items in KnownAssignedXids[] and KnownAssignedXidsValid[]
2467 * are those with indexes tail <= i < head; items outside this subscript range
2468 * have unspecified contents. When head reaches the end of the array, we
2469 * force compression of unused entries rather than wrapping around, since
2470 * allowing wraparound would greatly complicate the search logic. We maintain
2471 * an explicit tail pointer so that pruning of old XIDs can be done without
2472 * immediately moving the array contents. In most cases only a small fraction
2473 * of the array contains valid entries at any instant.
2475 * Although only the startup process can ever change the KnownAssignedXids
2476 * data structure, we still need interlocking so that standby backends will
2477 * not observe invalid intermediate states. The convention is that backends
2478 * must hold shared ProcArrayLock to examine the array. To remove XIDs from
2479 * the array, the startup process must hold ProcArrayLock exclusively, for
2480 * the usual transactional reasons (compare commit/abort of a transaction
2481 * during normal running). Compressing unused entries out of the array
2482 * likewise requires exclusive lock. To add XIDs to the array, we just insert
2483 * them into slots to the right of the head pointer and then advance the head
2484 * pointer. This wouldn't require any lock at all, except that on machines
2485 * with weak memory ordering we need to be careful that other processors
2486 * see the array element changes before they see the head pointer change.
2487 * We handle this by using a spinlock to protect reads and writes of the
2488 * head/tail pointers. (We could dispense with the spinlock if we were to
2489 * create suitable memory access barrier primitives and use those instead.)
2490 * The spinlock must be taken to read or write the head/tail pointers unless
2491 * the caller holds ProcArrayLock exclusively.
2493 * Algorithmic analysis:
2495 * If we have a maximum of M slots, with N XIDs currently spread across
2496 * S elements then we have N <= S <= M always.
2498 * * Adding a new XID is O(1) and needs little locking (unless compression
2500 * * Compressing the array is O(S) and requires exclusive lock
2501 * * Removing an XID is O(logS) and requires exclusive lock
2502 * * Taking a snapshot is O(S) and requires shared lock
2503 * * Checking for an XID is O(logS) and requires shared lock
2505 * In comparison, using a hash table for KnownAssignedXids would mean that
2506 * taking snapshots would be O(M). If we can maintain S << M then the
2507 * sorted array technique will deliver significantly faster snapshots.
2508 * If we try to keep S too small then we will spend too much time compressing,
2509 * so there is an optimal point for any workload mix. We use a heuristic to
2510 * decide when to compress the array, though trimming also helps reduce
2511 * frequency of compressing. The heuristic requires us to track the number of
2512 * currently valid XIDs in the array.
2517 * Compress KnownAssignedXids by shifting valid data down to the start of the
2518 * array, removing any gaps.
2520 * A compression step is forced if "force" is true, otherwise we do it
2521 * only if a heuristic indicates it's a good time to do it.
2523 * Caller must hold ProcArrayLock in exclusive mode.
2526 KnownAssignedXidsCompress(bool force)
2528 /* use volatile pointer to prevent code rearrangement */
2529 volatile ProcArrayStruct *pArray = procArray;
2535 /* no spinlock required since we hold ProcArrayLock exclusively */
2536 head = pArray->headKnownAssignedXids;
2537 tail = pArray->tailKnownAssignedXids;
2542 * If we can choose how much to compress, use a heuristic to avoid
2543 * compressing too often or not often enough.
2545 * Heuristic is if we have a large enough current spread and less than
2546 * 50% of the elements are currently in use, then compress. This
2547 * should ensure we compress fairly infrequently. We could compress
2548 * less often though the virtual array would spread out more and
2549 * snapshots would become more expensive.
2551 int nelements = head - tail;
2553 if (nelements < 4 * PROCARRAY_MAXPROCS ||
2554 nelements < 2 * pArray->numKnownAssignedXids)
2559 * We compress the array by reading the valid values from tail to head,
2560 * re-aligning data to 0th element.
2563 for (i = tail; i < head; i++)
2565 if (KnownAssignedXidsValid[i])
2567 KnownAssignedXids[compress_index] = KnownAssignedXids[i];
2568 KnownAssignedXidsValid[compress_index] = true;
2573 pArray->tailKnownAssignedXids = 0;
2574 pArray->headKnownAssignedXids = compress_index;
2578 * Add xids into KnownAssignedXids at the head of the array.
2580 * xids from from_xid to to_xid, inclusive, are added to the array.
2582 * If exclusive_lock is true then caller already holds ProcArrayLock in
2583 * exclusive mode, so we need no extra locking here. Else caller holds no
2584 * lock, so we need to be sure we maintain sufficient interlocks against
2585 * concurrent readers. (Only the startup process ever calls this, so no need
2586 * to worry about concurrent writers.)
2589 KnownAssignedXidsAdd(TransactionId from_xid, TransactionId to_xid,
2590 bool exclusive_lock)
2592 /* use volatile pointer to prevent code rearrangement */
2593 volatile ProcArrayStruct *pArray = procArray;
2594 TransactionId next_xid;
2600 Assert(TransactionIdPrecedesOrEquals(from_xid, to_xid));
2603 * Calculate how many array slots we'll need. Normally this is cheap; in
2604 * the unusual case where the XIDs cross the wrap point, we do it the hard
2607 if (to_xid >= from_xid)
2608 nxids = to_xid - from_xid + 1;
2612 next_xid = from_xid;
2613 while (TransactionIdPrecedes(next_xid, to_xid))
2616 TransactionIdAdvance(next_xid);
2621 * Since only the startup process modifies the head/tail pointers, we
2622 * don't need a lock to read them here.
2624 head = pArray->headKnownAssignedXids;
2625 tail = pArray->tailKnownAssignedXids;
2627 Assert(head >= 0 && head <= pArray->maxKnownAssignedXids);
2628 Assert(tail >= 0 && tail < pArray->maxKnownAssignedXids);
2631 * Verify that insertions occur in TransactionId sequence. Note that even
2632 * if the last existing element is marked invalid, it must still have a
2633 * correctly sequenced XID value.
2636 TransactionIdFollowsOrEquals(KnownAssignedXids[head - 1], from_xid))
2638 KnownAssignedXidsDisplay(LOG);
2639 elog(ERROR, "out-of-order XID insertion in KnownAssignedXids");
2643 * If our xids won't fit in the remaining space, compress out free space
2645 if (head + nxids > pArray->maxKnownAssignedXids)
2647 /* must hold lock to compress */
2648 if (!exclusive_lock)
2649 LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);
2651 KnownAssignedXidsCompress(true);
2653 head = pArray->headKnownAssignedXids;
2654 /* note: we no longer care about the tail pointer */
2656 if (!exclusive_lock)
2657 LWLockRelease(ProcArrayLock);
2660 * If it still won't fit then we're out of memory
2662 if (head + nxids > pArray->maxKnownAssignedXids)
2663 elog(ERROR, "too many KnownAssignedXids");
2666 /* Now we can insert the xids into the space starting at head */
2667 next_xid = from_xid;
2668 for (i = 0; i < nxids; i++)
2670 KnownAssignedXids[head] = next_xid;
2671 KnownAssignedXidsValid[head] = true;
2672 TransactionIdAdvance(next_xid);
2676 /* Adjust count of number of valid entries */
2677 pArray->numKnownAssignedXids += nxids;
2680 * Now update the head pointer. We use a spinlock to protect this
2681 * pointer, not because the update is likely to be non-atomic, but to
2682 * ensure that other processors see the above array updates before they
2683 * see the head pointer change.
2685 * If we're holding ProcArrayLock exclusively, there's no need to take the
2689 pArray->headKnownAssignedXids = head;
2692 SpinLockAcquire(&pArray->known_assigned_xids_lck);
2693 pArray->headKnownAssignedXids = head;
2694 SpinLockRelease(&pArray->known_assigned_xids_lck);
2699 * KnownAssignedXidsSearch
2701 * Searches KnownAssignedXids for a specific xid and optionally removes it.
2702 * Returns true if it was found, false if not.
2704 * Caller must hold ProcArrayLock in shared or exclusive mode.
2705 * Exclusive lock must be held for remove = true.
2708 KnownAssignedXidsSearch(TransactionId xid, bool remove)
2710 /* use volatile pointer to prevent code rearrangement */
2711 volatile ProcArrayStruct *pArray = procArray;
2716 int result_index = -1;
2720 /* we hold ProcArrayLock exclusively, so no need for spinlock */
2721 tail = pArray->tailKnownAssignedXids;
2722 head = pArray->headKnownAssignedXids;
2726 /* take spinlock to ensure we see up-to-date array contents */
2727 SpinLockAcquire(&pArray->known_assigned_xids_lck);
2728 tail = pArray->tailKnownAssignedXids;
2729 head = pArray->headKnownAssignedXids;
2730 SpinLockRelease(&pArray->known_assigned_xids_lck);
2734 * Standard binary search. Note we can ignore the KnownAssignedXidsValid
2735 * array here, since even invalid entries will contain sorted XIDs.
2739 while (first <= last)
2742 TransactionId mid_xid;
2744 mid_index = (first + last) / 2;
2745 mid_xid = KnownAssignedXids[mid_index];
2749 result_index = mid_index;
2752 else if (TransactionIdPrecedes(xid, mid_xid))
2753 last = mid_index - 1;
2755 first = mid_index + 1;
2758 if (result_index < 0)
2759 return false; /* not in array */
2761 if (!KnownAssignedXidsValid[result_index])
2762 return false; /* in array, but invalid */
2766 KnownAssignedXidsValid[result_index] = false;
2768 pArray->numKnownAssignedXids--;
2769 Assert(pArray->numKnownAssignedXids >= 0);
2772 * If we're removing the tail element then advance tail pointer over
2773 * any invalid elements. This will speed future searches.
2775 if (result_index == tail)
2778 while (tail < head && !KnownAssignedXidsValid[tail])
2782 /* Array is empty, so we can reset both pointers */
2783 pArray->headKnownAssignedXids = 0;
2784 pArray->tailKnownAssignedXids = 0;
2788 pArray->tailKnownAssignedXids = tail;
2797 * Is the specified XID present in KnownAssignedXids[]?
2799 * Caller must hold ProcArrayLock in shared or exclusive mode.
2802 KnownAssignedXidExists(TransactionId xid)
2804 Assert(TransactionIdIsValid(xid));
2806 return KnownAssignedXidsSearch(xid, false);
2810 * Remove the specified XID from KnownAssignedXids[].
2812 * Caller must hold ProcArrayLock in exclusive mode.
2815 KnownAssignedXidsRemove(TransactionId xid)
2817 Assert(TransactionIdIsValid(xid));
2819 elog(trace_recovery(DEBUG4), "remove KnownAssignedXid %u", xid);
2822 * Note: we cannot consider it an error to remove an XID that's not
2823 * present. We intentionally remove subxact IDs while processing
2824 * XLOG_XACT_ASSIGNMENT, to avoid array overflow. Then those XIDs will be
2825 * removed again when the top-level xact commits or aborts.
2827 * It might be possible to track such XIDs to distinguish this case from
2828 * actual errors, but it would be complicated and probably not worth it.
2829 * So, just ignore the search result.
2831 (void) KnownAssignedXidsSearch(xid, true);
2835 * KnownAssignedXidsRemoveTree
2836 * Remove xid (if it's not InvalidTransactionId) and all the subxids.
2838 * Caller must hold ProcArrayLock in exclusive mode.
2841 KnownAssignedXidsRemoveTree(TransactionId xid, int nsubxids,
2842 TransactionId *subxids)
2846 if (TransactionIdIsValid(xid))
2847 KnownAssignedXidsRemove(xid);
2849 for (i = 0; i < nsubxids; i++)
2850 KnownAssignedXidsRemove(subxids[i]);
2852 /* Opportunistically compress the array */
2853 KnownAssignedXidsCompress(false);
2857 * Prune KnownAssignedXids up to, but *not* including xid. If xid is invalid
2858 * then clear the whole table.
2860 * Caller must hold ProcArrayLock in exclusive mode.
2863 KnownAssignedXidsRemovePreceding(TransactionId removeXid)
2865 /* use volatile pointer to prevent code rearrangement */
2866 volatile ProcArrayStruct *pArray = procArray;
2872 if (!TransactionIdIsValid(removeXid))
2874 elog(trace_recovery(DEBUG4), "removing all KnownAssignedXids");
2875 pArray->numKnownAssignedXids = 0;
2876 pArray->headKnownAssignedXids = pArray->tailKnownAssignedXids = 0;
2880 elog(trace_recovery(DEBUG4), "prune KnownAssignedXids to %u", removeXid);
2883 * Mark entries invalid starting at the tail. Since array is sorted, we
2884 * can stop as soon as we reach a entry >= removeXid.
2886 tail = pArray->tailKnownAssignedXids;
2887 head = pArray->headKnownAssignedXids;
2889 for (i = tail; i < head; i++)
2891 if (KnownAssignedXidsValid[i])
2893 TransactionId knownXid = KnownAssignedXids[i];
2895 if (TransactionIdFollowsOrEquals(knownXid, removeXid))
2898 if (!StandbyTransactionIdIsPrepared(knownXid))
2900 KnownAssignedXidsValid[i] = false;
2906 pArray->numKnownAssignedXids -= count;
2907 Assert(pArray->numKnownAssignedXids >= 0);
2910 * Advance the tail pointer if we've marked the tail item invalid.
2912 for (i = tail; i < head; i++)
2914 if (KnownAssignedXidsValid[i])
2919 /* Array is empty, so we can reset both pointers */
2920 pArray->headKnownAssignedXids = 0;
2921 pArray->tailKnownAssignedXids = 0;
2925 pArray->tailKnownAssignedXids = i;
2928 /* Opportunistically compress the array */
2929 KnownAssignedXidsCompress(false);
2933 * KnownAssignedXidsGet - Get an array of xids by scanning KnownAssignedXids.
2934 * We filter out anything >= xmax.
2936 * Returns the number of XIDs stored into xarray[]. Caller is responsible
2937 * that array is large enough.
2939 * Caller must hold ProcArrayLock in (at least) shared mode.
2942 KnownAssignedXidsGet(TransactionId *xarray, TransactionId xmax)
2944 TransactionId xtmp = InvalidTransactionId;
2946 return KnownAssignedXidsGetAndSetXmin(xarray, &xtmp, xmax);
2950 * KnownAssignedXidsGetAndSetXmin - as KnownAssignedXidsGet, plus
2951 * we reduce *xmin to the lowest xid value seen if not already lower.
2953 * Caller must hold ProcArrayLock in (at least) shared mode.
2956 KnownAssignedXidsGetAndSetXmin(TransactionId *xarray, TransactionId *xmin,
2959 /* use volatile pointer to prevent code rearrangement */
2960 volatile ProcArrayStruct *pArray = procArray;
2967 * Fetch head just once, since it may change while we loop. We can stop
2968 * once we reach the initially seen head, since we are certain that an xid
2969 * cannot enter and then leave the array while we hold ProcArrayLock. We
2970 * might miss newly-added xids, but they should be >= xmax so irrelevant
2973 * Must take spinlock to ensure we see up-to-date array contents.
2975 SpinLockAcquire(&pArray->known_assigned_xids_lck);
2976 tail = pArray->tailKnownAssignedXids;
2977 head = pArray->headKnownAssignedXids;
2978 SpinLockRelease(&pArray->known_assigned_xids_lck);
2980 for (i = tail; i < head; i++)
2982 /* Skip any gaps in the array */
2983 if (KnownAssignedXidsValid[i])
2985 TransactionId knownXid = KnownAssignedXids[i];
2988 * Update xmin if required. Only the first XID need be checked,
2989 * since the array is sorted.
2992 TransactionIdPrecedes(knownXid, *xmin))
2996 * Filter out anything >= xmax, again relying on sorted property
2999 if (TransactionIdIsValid(xmax) &&
3000 TransactionIdFollowsOrEquals(knownXid, xmax))
3003 /* Add knownXid into output array */
3004 xarray[count++] = knownXid;
3012 * Get oldest XID in the KnownAssignedXids array, or InvalidTransactionId
3015 static TransactionId
3016 KnownAssignedXidsGetOldestXmin(void)
3018 /* use volatile pointer to prevent code rearrangement */
3019 volatile ProcArrayStruct *pArray = procArray;
3025 * Fetch head just once, since it may change while we loop.
3027 SpinLockAcquire(&pArray->known_assigned_xids_lck);
3028 tail = pArray->tailKnownAssignedXids;
3029 head = pArray->headKnownAssignedXids;
3030 SpinLockRelease(&pArray->known_assigned_xids_lck);
3032 for (i = tail; i < head; i++)
3034 /* Skip any gaps in the array */
3035 if (KnownAssignedXidsValid[i])
3036 return KnownAssignedXids[i];
3039 return InvalidTransactionId;
3043 * Display KnownAssignedXids to provide debug trail
3045 * Currently this is only called within startup process, so we need no
3048 * Note this is pretty expensive, and much of the expense will be incurred
3049 * even if the elog message will get discarded. It's not currently called
3050 * in any performance-critical places, however, so no need to be tenser.
3053 KnownAssignedXidsDisplay(int trace_level)
3055 /* use volatile pointer to prevent code rearrangement */
3056 volatile ProcArrayStruct *pArray = procArray;
3063 tail = pArray->tailKnownAssignedXids;
3064 head = pArray->headKnownAssignedXids;
3066 initStringInfo(&buf);
3068 for (i = tail; i < head; i++)
3070 if (KnownAssignedXidsValid[i])
3073 appendStringInfo(&buf, "[%d]=%u ", i, KnownAssignedXids[i]);
3077 elog(trace_level, "%d KnownAssignedXids (num=%d tail=%d head=%d) %s",
3079 pArray->numKnownAssignedXids,
3080 pArray->tailKnownAssignedXids,
3081 pArray->headKnownAssignedXids,