1 /*-------------------------------------------------------------------------
4 * POSTGRES process array code.
7 * This module maintains an unsorted array of the PGPROC structures for all
8 * active backends. Although there are several uses for this, the principal
9 * one is as a means of determining the set of currently running transactions.
11 * Because of various subtle race conditions it is critical that a backend
12 * hold the correct locks while setting or clearing its MyProc->xid field.
13 * See notes in src/backend/access/transam/README.
15 * The process array now also includes PGPROC structures representing
16 * prepared transactions. The xid and subxids fields of these are valid,
17 * as are the myProcLocks lists. They can be distinguished from regular
18 * backend PGPROCs at need by checking for pid == 0.
20 * During hot standby, we also keep a list of XIDs representing transactions
21 * that are known to be running in the master (or more precisely, were running
22 * as of the current point in the WAL stream). This list is kept in the
23 * KnownAssignedXids array, and is updated by watching the sequence of
24 * arriving XIDs. This is necessary because if we leave those XIDs out of
25 * snapshots taken for standby queries, then they will appear to be already
26 * complete, leading to MVCC failures. Note that in hot standby, the PGPROC
27 * array represents standby processes, which by definition are not running
28 * transactions that have XIDs.
30 * It is perhaps possible for a backend on the master to terminate without
31 * writing an abort record for its transaction. While that shouldn't really
32 * happen, it would tie up KnownAssignedXids indefinitely, so we protect
33 * ourselves by pruning the array when a valid list of running XIDs arrives.
35 * Portions Copyright (c) 1996-2011, PostgreSQL Global Development Group
36 * Portions Copyright (c) 1994, Regents of the University of California
40 * src/backend/storage/ipc/procarray.c
42 *-------------------------------------------------------------------------
48 #include "access/clog.h"
49 #include "access/subtrans.h"
50 #include "access/transam.h"
51 #include "access/xact.h"
52 #include "access/twophase.h"
53 #include "miscadmin.h"
54 #include "storage/procarray.h"
55 #include "storage/spin.h"
56 #include "utils/builtins.h"
57 #include "utils/snapmgr.h"
60 /* Our shared memory area */
61 typedef struct ProcArrayStruct
63 int numProcs; /* number of valid procs entries */
64 int maxProcs; /* allocated size of procs array */
67 * Known assigned XIDs handling
69 int maxKnownAssignedXids; /* allocated size of array */
70 int numKnownAssignedXids; /* currrent # of valid entries */
71 int tailKnownAssignedXids; /* index of oldest valid element */
72 int headKnownAssignedXids; /* index of newest element, + 1 */
73 slock_t known_assigned_xids_lck; /* protects head/tail pointers */
76 * Highest subxid that has been removed from KnownAssignedXids array to
77 * prevent overflow; or InvalidTransactionId if none. We track this for
78 * similar reasons to tracking overflowing cached subxids in PGPROC
79 * entries. Must hold exclusive ProcArrayLock to change this, and shared
82 TransactionId lastOverflowedXid;
85 * We declare procs[] as 1 entry because C wants a fixed-size array, but
86 * actually it is maxProcs entries long.
88 PGPROC *procs[1]; /* VARIABLE LENGTH ARRAY */
91 static ProcArrayStruct *procArray;
94 * Bookkeeping for tracking emulated transactions in recovery
96 static TransactionId *KnownAssignedXids;
97 static bool *KnownAssignedXidsValid;
98 static TransactionId latestObservedXid = InvalidTransactionId;
101 * If we're in STANDBY_SNAPSHOT_PENDING state, standbySnapshotPendingXmin is
102 * the highest xid that might still be running that we don't have in
105 static TransactionId standbySnapshotPendingXmin;
107 #ifdef XIDCACHE_DEBUG
109 /* counters for XidCache measurement */
110 static long xc_by_recent_xmin = 0;
111 static long xc_by_known_xact = 0;
112 static long xc_by_my_xact = 0;
113 static long xc_by_latest_xid = 0;
114 static long xc_by_main_xid = 0;
115 static long xc_by_child_xid = 0;
116 static long xc_by_known_assigned = 0;
117 static long xc_no_overflow = 0;
118 static long xc_slow_answer = 0;
120 #define xc_by_recent_xmin_inc() (xc_by_recent_xmin++)
121 #define xc_by_known_xact_inc() (xc_by_known_xact++)
122 #define xc_by_my_xact_inc() (xc_by_my_xact++)
123 #define xc_by_latest_xid_inc() (xc_by_latest_xid++)
124 #define xc_by_main_xid_inc() (xc_by_main_xid++)
125 #define xc_by_child_xid_inc() (xc_by_child_xid++)
126 #define xc_by_known_assigned_inc() (xc_by_known_assigned++)
127 #define xc_no_overflow_inc() (xc_no_overflow++)
128 #define xc_slow_answer_inc() (xc_slow_answer++)
130 static void DisplayXidCache(void);
131 #else /* !XIDCACHE_DEBUG */
133 #define xc_by_recent_xmin_inc() ((void) 0)
134 #define xc_by_known_xact_inc() ((void) 0)
135 #define xc_by_my_xact_inc() ((void) 0)
136 #define xc_by_latest_xid_inc() ((void) 0)
137 #define xc_by_main_xid_inc() ((void) 0)
138 #define xc_by_child_xid_inc() ((void) 0)
139 #define xc_by_known_assigned_inc() ((void) 0)
140 #define xc_no_overflow_inc() ((void) 0)
141 #define xc_slow_answer_inc() ((void) 0)
142 #endif /* XIDCACHE_DEBUG */
144 /* Primitives for KnownAssignedXids array handling for standby */
145 static void KnownAssignedXidsCompress(bool force);
146 static void KnownAssignedXidsAdd(TransactionId from_xid, TransactionId to_xid,
147 bool exclusive_lock);
148 static bool KnownAssignedXidsSearch(TransactionId xid, bool remove);
149 static bool KnownAssignedXidExists(TransactionId xid);
150 static void KnownAssignedXidsRemove(TransactionId xid);
151 static void KnownAssignedXidsRemoveTree(TransactionId xid, int nsubxids,
152 TransactionId *subxids);
153 static void KnownAssignedXidsRemovePreceding(TransactionId xid);
154 static int KnownAssignedXidsGet(TransactionId *xarray, TransactionId xmax);
155 static int KnownAssignedXidsGetAndSetXmin(TransactionId *xarray,
158 static TransactionId KnownAssignedXidsGetOldestXmin(void);
159 static void KnownAssignedXidsDisplay(int trace_level);
162 * Report shared-memory space needed by CreateSharedProcArray.
165 ProcArrayShmemSize(void)
169 /* Size of the ProcArray structure itself */
170 #define PROCARRAY_MAXPROCS (MaxBackends + max_prepared_xacts)
172 size = offsetof(ProcArrayStruct, procs);
173 size = add_size(size, mul_size(sizeof(PGPROC *), PROCARRAY_MAXPROCS));
176 * During Hot Standby processing we have a data structure called
177 * KnownAssignedXids, created in shared memory. Local data structures are
178 * also created in various backends during GetSnapshotData(),
179 * TransactionIdIsInProgress() and GetRunningTransactionData(). All of the
180 * main structures created in those functions must be identically sized,
181 * since we may at times copy the whole of the data structures around. We
182 * refer to this size as TOTAL_MAX_CACHED_SUBXIDS.
184 * Ideally we'd only create this structure if we were actually doing hot
185 * standby in the current run, but we don't know that yet at the time
186 * shared memory is being set up.
188 #define TOTAL_MAX_CACHED_SUBXIDS \
189 ((PGPROC_MAX_CACHED_SUBXIDS + 1) * PROCARRAY_MAXPROCS)
191 if (EnableHotStandby)
193 size = add_size(size,
194 mul_size(sizeof(TransactionId),
195 TOTAL_MAX_CACHED_SUBXIDS));
196 size = add_size(size,
197 mul_size(sizeof(bool), TOTAL_MAX_CACHED_SUBXIDS));
204 * Initialize the shared PGPROC array during postmaster startup.
207 CreateSharedProcArray(void)
211 /* Create or attach to the ProcArray shared structure */
212 procArray = (ProcArrayStruct *)
213 ShmemInitStruct("Proc Array",
214 add_size(offsetof(ProcArrayStruct, procs),
215 mul_size(sizeof(PGPROC *),
216 PROCARRAY_MAXPROCS)),
222 * We're the first - initialize.
224 procArray->numProcs = 0;
225 procArray->maxProcs = PROCARRAY_MAXPROCS;
226 procArray->maxKnownAssignedXids = TOTAL_MAX_CACHED_SUBXIDS;
227 procArray->numKnownAssignedXids = 0;
228 procArray->tailKnownAssignedXids = 0;
229 procArray->headKnownAssignedXids = 0;
230 SpinLockInit(&procArray->known_assigned_xids_lck);
231 procArray->lastOverflowedXid = InvalidTransactionId;
234 /* Create or attach to the KnownAssignedXids arrays too, if needed */
235 if (EnableHotStandby)
237 KnownAssignedXids = (TransactionId *)
238 ShmemInitStruct("KnownAssignedXids",
239 mul_size(sizeof(TransactionId),
240 TOTAL_MAX_CACHED_SUBXIDS),
242 KnownAssignedXidsValid = (bool *)
243 ShmemInitStruct("KnownAssignedXidsValid",
244 mul_size(sizeof(bool), TOTAL_MAX_CACHED_SUBXIDS),
250 * Add the specified PGPROC to the shared array.
253 ProcArrayAdd(PGPROC *proc)
255 ProcArrayStruct *arrayP = procArray;
257 LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);
259 if (arrayP->numProcs >= arrayP->maxProcs)
262 * Ooops, no room. (This really shouldn't happen, since there is a
263 * fixed supply of PGPROC structs too, and so we should have failed
266 LWLockRelease(ProcArrayLock);
268 (errcode(ERRCODE_TOO_MANY_CONNECTIONS),
269 errmsg("sorry, too many clients already")));
272 arrayP->procs[arrayP->numProcs] = proc;
275 LWLockRelease(ProcArrayLock);
279 * Remove the specified PGPROC from the shared array.
281 * When latestXid is a valid XID, we are removing a live 2PC gxact from the
282 * array, and thus causing it to appear as "not running" anymore. In this
283 * case we must advance latestCompletedXid. (This is essentially the same
284 * as ProcArrayEndTransaction followed by removal of the PGPROC, but we take
285 * the ProcArrayLock only once, and don't damage the content of the PGPROC;
286 * twophase.c depends on the latter.)
289 ProcArrayRemove(PGPROC *proc, TransactionId latestXid)
291 ProcArrayStruct *arrayP = procArray;
294 #ifdef XIDCACHE_DEBUG
295 /* dump stats at backend shutdown, but not prepared-xact end */
300 LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);
302 if (TransactionIdIsValid(latestXid))
304 Assert(TransactionIdIsValid(proc->xid));
306 /* Advance global latestCompletedXid while holding the lock */
307 if (TransactionIdPrecedes(ShmemVariableCache->latestCompletedXid,
309 ShmemVariableCache->latestCompletedXid = latestXid;
313 /* Shouldn't be trying to remove a live transaction here */
314 Assert(!TransactionIdIsValid(proc->xid));
317 for (index = 0; index < arrayP->numProcs; index++)
319 if (arrayP->procs[index] == proc)
321 arrayP->procs[index] = arrayP->procs[arrayP->numProcs - 1];
322 arrayP->procs[arrayP->numProcs - 1] = NULL; /* for debugging */
324 LWLockRelease(ProcArrayLock);
330 LWLockRelease(ProcArrayLock);
332 elog(LOG, "failed to find proc %p in ProcArray", proc);
337 * ProcArrayEndTransaction -- mark a transaction as no longer running
339 * This is used interchangeably for commit and abort cases. The transaction
340 * commit/abort must already be reported to WAL and pg_clog.
342 * proc is currently always MyProc, but we pass it explicitly for flexibility.
343 * latestXid is the latest Xid among the transaction's main XID and
344 * subtransactions, or InvalidTransactionId if it has no XID. (We must ask
345 * the caller to pass latestXid, instead of computing it from the PGPROC's
346 * contents, because the subxid information in the PGPROC might be
350 ProcArrayEndTransaction(PGPROC *proc, TransactionId latestXid)
352 if (TransactionIdIsValid(latestXid))
355 * We must lock ProcArrayLock while clearing proc->xid, so that we do
356 * not exit the set of "running" transactions while someone else is
357 * taking a snapshot. See discussion in
358 * src/backend/access/transam/README.
360 Assert(TransactionIdIsValid(proc->xid));
362 LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);
364 proc->xid = InvalidTransactionId;
365 proc->lxid = InvalidLocalTransactionId;
366 proc->xmin = InvalidTransactionId;
367 /* must be cleared with xid/xmin: */
368 proc->vacuumFlags &= ~PROC_VACUUM_STATE_MASK;
369 proc->inCommit = false; /* be sure this is cleared in abort */
370 proc->recoveryConflictPending = false;
372 /* Clear the subtransaction-XID cache too while holding the lock */
373 proc->subxids.nxids = 0;
374 proc->subxids.overflowed = false;
376 /* Also advance global latestCompletedXid while holding the lock */
377 if (TransactionIdPrecedes(ShmemVariableCache->latestCompletedXid,
379 ShmemVariableCache->latestCompletedXid = latestXid;
381 LWLockRelease(ProcArrayLock);
386 * If we have no XID, we don't need to lock, since we won't affect
387 * anyone else's calculation of a snapshot. We might change their
388 * estimate of global xmin, but that's OK.
390 Assert(!TransactionIdIsValid(proc->xid));
392 proc->lxid = InvalidLocalTransactionId;
393 proc->xmin = InvalidTransactionId;
394 /* must be cleared with xid/xmin: */
395 proc->vacuumFlags &= ~PROC_VACUUM_STATE_MASK;
396 proc->inCommit = false; /* be sure this is cleared in abort */
397 proc->recoveryConflictPending = false;
399 Assert(proc->subxids.nxids == 0);
400 Assert(proc->subxids.overflowed == false);
406 * ProcArrayClearTransaction -- clear the transaction fields
408 * This is used after successfully preparing a 2-phase transaction. We are
409 * not actually reporting the transaction's XID as no longer running --- it
410 * will still appear as running because the 2PC's gxact is in the ProcArray
411 * too. We just have to clear out our own PGPROC.
414 ProcArrayClearTransaction(PGPROC *proc)
417 * We can skip locking ProcArrayLock here, because this action does not
418 * actually change anyone's view of the set of running XIDs: our entry is
419 * duplicate with the gxact that has already been inserted into the
422 proc->xid = InvalidTransactionId;
423 proc->lxid = InvalidLocalTransactionId;
424 proc->xmin = InvalidTransactionId;
425 proc->recoveryConflictPending = false;
427 /* redundant, but just in case */
428 proc->vacuumFlags &= ~PROC_VACUUM_STATE_MASK;
429 proc->inCommit = false;
431 /* Clear the subtransaction-XID cache too */
432 proc->subxids.nxids = 0;
433 proc->subxids.overflowed = false;
437 * ProcArrayApplyRecoveryInfo -- apply recovery info about xids
439 * Takes us through 3 states: Initialized, Pending and Ready.
440 * Normal case is to go all the way to Ready straight away, though there
441 * are atypical cases where we need to take it in steps.
443 * Use the data about running transactions on master to create the initial
444 * state of KnownAssignedXids. We also use these records to regularly prune
445 * KnownAssignedXids because we know it is possible that some transactions
446 * with FATAL errors fail to write abort records, which could cause eventual
449 * See comments for LogStandbySnapshot().
452 ProcArrayApplyRecoveryInfo(RunningTransactions running)
456 TransactionId nextXid;
459 Assert(standbyState >= STANDBY_INITIALIZED);
460 Assert(TransactionIdIsValid(running->nextXid));
461 Assert(TransactionIdIsValid(running->oldestRunningXid));
462 Assert(TransactionIdIsNormal(running->latestCompletedXid));
465 * Remove stale transactions, if any.
467 ExpireOldKnownAssignedTransactionIds(running->oldestRunningXid);
468 StandbyReleaseOldLocks(running->oldestRunningXid);
471 * If our snapshot is already valid, nothing else to do...
473 if (standbyState == STANDBY_SNAPSHOT_READY)
477 * If our initial RunningTransactionsData had an overflowed snapshot then
478 * we knew we were missing some subxids from our snapshot. We can use this
479 * data as an initial snapshot, but we cannot yet mark it valid. We know
480 * that the missing subxids are equal to or earlier than nextXid. After we
481 * initialise we continue to apply changes during recovery, so once the
482 * oldestRunningXid is later than the nextXid from the initial snapshot we
483 * know that we no longer have missing information and can mark the
486 if (standbyState == STANDBY_SNAPSHOT_PENDING)
488 if (TransactionIdPrecedes(standbySnapshotPendingXmin,
489 running->oldestRunningXid))
491 standbyState = STANDBY_SNAPSHOT_READY;
492 elog(trace_recovery(DEBUG2),
493 "running xact data now proven complete");
494 elog(trace_recovery(DEBUG2),
495 "recovery snapshots are now enabled");
498 elog(trace_recovery(DEBUG2),
499 "recovery snapshot waiting for %u oldest active xid on standby is %u",
500 standbySnapshotPendingXmin,
501 running->oldestRunningXid);
505 Assert(standbyState == STANDBY_INITIALIZED);
508 * OK, we need to initialise from the RunningTransactionsData record
512 * Release any locks belonging to old transactions that are not running
513 * according to the running-xacts record.
515 StandbyReleaseOldLocks(running->nextXid);
518 * Nobody else is running yet, but take locks anyhow
520 LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);
523 * KnownAssignedXids is sorted so we cannot just add the xids, we have to
526 * Some of the new xids are top-level xids and some are subtransactions.
527 * We don't call SubtransSetParent because it doesn't matter yet. If we
528 * aren't overflowed then all xids will fit in snapshot and so we don't
529 * need subtrans. If we later overflow, an xid assignment record will add
530 * xids to subtrans. If RunningXacts is overflowed then we don't have
531 * enough information to correctly update subtrans anyway.
533 Assert(procArray->numKnownAssignedXids == 0);
536 * Allocate a temporary array to avoid modifying the array passed as
539 xids = palloc(sizeof(TransactionId) * running->xcnt);
542 * Add to the temp array any xids which have not already completed.
545 for (i = 0; i < running->xcnt; i++)
547 TransactionId xid = running->xids[i];
550 * The running-xacts snapshot can contain xids that were still visible
551 * in the procarray when the snapshot was taken, but were already
552 * WAL-logged as completed. They're not running anymore, so ignore
555 if (TransactionIdDidCommit(xid) || TransactionIdDidAbort(xid))
564 * Sort the array so that we can add them safely into
567 qsort(xids, nxids, sizeof(TransactionId), xidComparator);
570 * Add the sorted snapshot into KnownAssignedXids
572 for (i = 0; i < nxids; i++)
573 KnownAssignedXidsAdd(xids[i], xids[i], true);
575 KnownAssignedXidsDisplay(trace_recovery(DEBUG3));
581 * Now we've got the running xids we need to set the global values that
582 * are used to track snapshots as they evolve further.
584 * - latestCompletedXid which will be the xmax for snapshots -
585 * lastOverflowedXid which shows whether snapshots overflow - nextXid
587 * If the snapshot overflowed, then we still initialise with what we know,
588 * but the recovery snapshot isn't fully valid yet because we know there
589 * are some subxids missing. We don't know the specific subxids that are
590 * missing, so conservatively assume the last one is latestObservedXid.
592 latestObservedXid = running->nextXid;
593 TransactionIdRetreat(latestObservedXid);
595 if (running->subxid_overflow)
597 standbyState = STANDBY_SNAPSHOT_PENDING;
599 standbySnapshotPendingXmin = latestObservedXid;
600 procArray->lastOverflowedXid = latestObservedXid;
604 standbyState = STANDBY_SNAPSHOT_READY;
606 standbySnapshotPendingXmin = InvalidTransactionId;
607 procArray->lastOverflowedXid = InvalidTransactionId;
611 * If a transaction wrote a commit record in the gap between taking and
612 * logging the snapshot then latestCompletedXid may already be higher than
613 * the value from the snapshot, so check before we use the incoming value.
615 if (TransactionIdPrecedes(ShmemVariableCache->latestCompletedXid,
616 running->latestCompletedXid))
617 ShmemVariableCache->latestCompletedXid = running->latestCompletedXid;
619 /* nextXid must be beyond any observed xid */
620 nextXid = latestObservedXid;
621 TransactionIdAdvance(nextXid);
622 if (TransactionIdFollows(nextXid, ShmemVariableCache->nextXid))
623 ShmemVariableCache->nextXid = nextXid;
625 Assert(TransactionIdIsNormal(ShmemVariableCache->latestCompletedXid));
626 Assert(TransactionIdIsValid(ShmemVariableCache->nextXid));
628 LWLockRelease(ProcArrayLock);
630 elog(trace_recovery(DEBUG2), "running transaction data initialized");
631 KnownAssignedXidsDisplay(trace_recovery(DEBUG3));
632 if (standbyState == STANDBY_SNAPSHOT_READY)
633 elog(trace_recovery(DEBUG2), "recovery snapshots are now enabled");
636 (errmsg("consistent state delayed because recovery snapshot incomplete")));
640 * ProcArrayApplyXidAssignment
641 * Process an XLOG_XACT_ASSIGNMENT WAL record
644 ProcArrayApplyXidAssignment(TransactionId topxid,
645 int nsubxids, TransactionId *subxids)
647 TransactionId max_xid;
650 Assert(standbyState >= STANDBY_INITIALIZED);
652 max_xid = TransactionIdLatest(topxid, nsubxids, subxids);
655 * Mark all the subtransactions as observed.
657 * NOTE: This will fail if the subxid contains too many previously
658 * unobserved xids to fit into known-assigned-xids. That shouldn't happen
659 * as the code stands, because xid-assignment records should never contain
660 * more than PGPROC_MAX_CACHED_SUBXIDS entries.
662 RecordKnownAssignedTransactionIds(max_xid);
665 * Notice that we update pg_subtrans with the top-level xid, rather than
666 * the parent xid. This is a difference between normal processing and
667 * recovery, yet is still correct in all cases. The reason is that
668 * subtransaction commit is not marked in clog until commit processing, so
669 * all aborted subtransactions have already been clearly marked in clog.
670 * As a result we are able to refer directly to the top-level
671 * transaction's state rather than skipping through all the intermediate
672 * states in the subtransaction tree. This should be the first time we
673 * have attempted to SubTransSetParent().
675 for (i = 0; i < nsubxids; i++)
676 SubTransSetParent(subxids[i], topxid, false);
679 * Uses same locking as transaction commit
681 LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);
684 * Remove subxids from known-assigned-xacts.
686 KnownAssignedXidsRemoveTree(InvalidTransactionId, nsubxids, subxids);
689 * Advance lastOverflowedXid to be at least the last of these subxids.
691 if (TransactionIdPrecedes(procArray->lastOverflowedXid, max_xid))
692 procArray->lastOverflowedXid = max_xid;
694 LWLockRelease(ProcArrayLock);
698 * TransactionIdIsInProgress -- is given transaction running in some backend
700 * Aside from some shortcuts such as checking RecentXmin and our own Xid,
701 * there are four possibilities for finding a running transaction:
703 * 1. The given Xid is a main transaction Id. We will find this out cheaply
704 * by looking at the PGPROC struct for each backend.
706 * 2. The given Xid is one of the cached subxact Xids in the PGPROC array.
707 * We can find this out cheaply too.
709 * 3. In Hot Standby mode, we must search the KnownAssignedXids list to see
710 * if the Xid is running on the master.
712 * 4. Search the SubTrans tree to find the Xid's topmost parent, and then
713 * see if that is running according to PGPROC or KnownAssignedXids. This is
714 * the slowest way, but sadly it has to be done always if the others failed,
715 * unless we see that the cached subxact sets are complete (none have
718 * ProcArrayLock has to be held while we do 1, 2, 3. If we save the top Xids
719 * while doing 1 and 3, we can release the ProcArrayLock while we do 4.
720 * This buys back some concurrency (and we can't retrieve the main Xids from
721 * PGPROC again anyway; see GetNewTransactionId).
724 TransactionIdIsInProgress(TransactionId xid)
726 static TransactionId *xids = NULL;
728 ProcArrayStruct *arrayP = procArray;
729 TransactionId topxid;
734 * Don't bother checking a transaction older than RecentXmin; it could not
735 * possibly still be running. (Note: in particular, this guarantees that
736 * we reject InvalidTransactionId, FrozenTransactionId, etc as not
739 if (TransactionIdPrecedes(xid, RecentXmin))
741 xc_by_recent_xmin_inc();
746 * We may have just checked the status of this transaction, so if it is
747 * already known to be completed, we can fall out without any access to
750 if (TransactionIdIsKnownCompleted(xid))
752 xc_by_known_xact_inc();
757 * Also, we can handle our own transaction (and subtransactions) without
758 * any access to shared memory.
760 if (TransactionIdIsCurrentTransactionId(xid))
767 * If first time through, get workspace to remember main XIDs in. We
768 * malloc it permanently to avoid repeated palloc/pfree overhead.
773 * In hot standby mode, reserve enough space to hold all xids in the
774 * known-assigned list. If we later finish recovery, we no longer need
775 * the bigger array, but we don't bother to shrink it.
777 int maxxids = RecoveryInProgress() ? TOTAL_MAX_CACHED_SUBXIDS : arrayP->maxProcs;
779 xids = (TransactionId *) malloc(maxxids * sizeof(TransactionId));
782 (errcode(ERRCODE_OUT_OF_MEMORY),
783 errmsg("out of memory")));
786 LWLockAcquire(ProcArrayLock, LW_SHARED);
789 * Now that we have the lock, we can check latestCompletedXid; if the
790 * target Xid is after that, it's surely still running.
792 if (TransactionIdPrecedes(ShmemVariableCache->latestCompletedXid, xid))
794 LWLockRelease(ProcArrayLock);
795 xc_by_latest_xid_inc();
799 /* No shortcuts, gotta grovel through the array */
800 for (i = 0; i < arrayP->numProcs; i++)
802 volatile PGPROC *proc = arrayP->procs[i];
805 /* Ignore my own proc --- dealt with it above */
809 /* Fetch xid just once - see GetNewTransactionId */
812 if (!TransactionIdIsValid(pxid))
816 * Step 1: check the main Xid
818 if (TransactionIdEquals(pxid, xid))
820 LWLockRelease(ProcArrayLock);
821 xc_by_main_xid_inc();
826 * We can ignore main Xids that are younger than the target Xid, since
827 * the target could not possibly be their child.
829 if (TransactionIdPrecedes(xid, pxid))
833 * Step 2: check the cached child-Xids arrays
835 for (j = proc->subxids.nxids - 1; j >= 0; j--)
837 /* Fetch xid just once - see GetNewTransactionId */
838 TransactionId cxid = proc->subxids.xids[j];
840 if (TransactionIdEquals(cxid, xid))
842 LWLockRelease(ProcArrayLock);
843 xc_by_child_xid_inc();
849 * Save the main Xid for step 4. We only need to remember main Xids
850 * that have uncached children. (Note: there is no race condition
851 * here because the overflowed flag cannot be cleared, only set, while
852 * we hold ProcArrayLock. So we can't miss an Xid that we need to
855 if (proc->subxids.overflowed)
856 xids[nxids++] = pxid;
860 * Step 3: in hot standby mode, check the known-assigned-xids list. XIDs
861 * in the list must be treated as running.
863 if (RecoveryInProgress())
865 /* none of the PGPROC entries should have XIDs in hot standby mode */
868 if (KnownAssignedXidExists(xid))
870 LWLockRelease(ProcArrayLock);
871 xc_by_known_assigned_inc();
876 * If the KnownAssignedXids overflowed, we have to check pg_subtrans
877 * too. Fetch all xids from KnownAssignedXids that are lower than
878 * xid, since if xid is a subtransaction its parent will always have a
879 * lower value. Note we will collect both main and subXIDs here, but
880 * there's no help for it.
882 if (TransactionIdPrecedesOrEquals(xid, procArray->lastOverflowedXid))
883 nxids = KnownAssignedXidsGet(xids, xid);
886 LWLockRelease(ProcArrayLock);
889 * If none of the relevant caches overflowed, we know the Xid is not
890 * running without even looking at pg_subtrans.
894 xc_no_overflow_inc();
899 * Step 4: have to check pg_subtrans.
901 * At this point, we know it's either a subtransaction of one of the Xids
902 * in xids[], or it's not running. If it's an already-failed
903 * subtransaction, we want to say "not running" even though its parent may
904 * still be running. So first, check pg_clog to see if it's been aborted.
906 xc_slow_answer_inc();
908 if (TransactionIdDidAbort(xid))
912 * It isn't aborted, so check whether the transaction tree it belongs to
913 * is still running (or, more precisely, whether it was running when we
914 * held ProcArrayLock).
916 topxid = SubTransGetTopmostTransaction(xid);
917 Assert(TransactionIdIsValid(topxid));
918 if (!TransactionIdEquals(topxid, xid))
920 for (i = 0; i < nxids; i++)
922 if (TransactionIdEquals(xids[i], topxid))
931 * TransactionIdIsActive -- is xid the top-level XID of an active backend?
933 * This differs from TransactionIdIsInProgress in that it ignores prepared
934 * transactions, as well as transactions running on the master if we're in
935 * hot standby. Also, we ignore subtransactions since that's not needed
939 TransactionIdIsActive(TransactionId xid)
942 ProcArrayStruct *arrayP = procArray;
946 * Don't bother checking a transaction older than RecentXmin; it could not
947 * possibly still be running.
949 if (TransactionIdPrecedes(xid, RecentXmin))
952 LWLockAcquire(ProcArrayLock, LW_SHARED);
954 for (i = 0; i < arrayP->numProcs; i++)
956 volatile PGPROC *proc = arrayP->procs[i];
958 /* Fetch xid just once - see GetNewTransactionId */
959 TransactionId pxid = proc->xid;
961 if (!TransactionIdIsValid(pxid))
965 continue; /* ignore prepared transactions */
967 if (TransactionIdEquals(pxid, xid))
974 LWLockRelease(ProcArrayLock);
981 * GetOldestXmin -- returns oldest transaction that was running
982 * when any current transaction was started.
984 * If allDbs is TRUE then all backends are considered; if allDbs is FALSE
985 * then only backends running in my own database are considered.
987 * If ignoreVacuum is TRUE then backends with the PROC_IN_VACUUM flag set are
990 * This is used by VACUUM to decide which deleted tuples must be preserved
991 * in a table. allDbs = TRUE is needed for shared relations, but allDbs =
992 * FALSE is sufficient for non-shared relations, since only backends in my
993 * own database could ever see the tuples in them. Also, we can ignore
994 * concurrently running lazy VACUUMs because (a) they must be working on other
995 * tables, and (b) they don't need to do snapshot-based lookups.
997 * This is also used to determine where to truncate pg_subtrans. allDbs
998 * must be TRUE for that case, and ignoreVacuum FALSE.
1000 * Note: we include all currently running xids in the set of considered xids.
1001 * This ensures that if a just-started xact has not yet set its snapshot,
1002 * when it does set the snapshot it cannot set xmin less than what we compute.
1003 * See notes in src/backend/access/transam/README.
1005 * Note: despite the above, it's possible for the calculated value to move
1006 * backwards on repeated calls. The calculated value is conservative, so that
1007 * anything older is definitely not considered as running by anyone anymore,
1008 * but the exact value calculated depends on a number of things. For example,
1009 * if allDbs is FALSE and there are no transactions running in the current
1010 * database, GetOldestXmin() returns latestCompletedXid. If a transaction
1011 * begins after that, its xmin will include in-progress transactions in other
1012 * databases that started earlier, so another call will return a lower value.
1013 * Nonetheless it is safe to vacuum a table in the current database with the
1014 * first result. There are also replication-related effects: a walsender
1015 * process can set its xmin based on transactions that are no longer running
1016 * in the master but are still being replayed on the standby, thus possibly
1017 * making the GetOldestXmin reading go backwards. In this case there is a
1018 * possibility that we lose data that the standby would like to have, but
1019 * there is little we can do about that --- data is only protected if the
1020 * walsender runs continuously while queries are executed on the standby.
1021 * (The Hot Standby code deals with such cases by failing standby queries
1022 * that needed to access already-removed data, so there's no integrity bug.)
1023 * The return value is also adjusted with vacuum_defer_cleanup_age, so
1024 * increasing that setting on the fly is another easy way to make
1025 * GetOldestXmin() move backwards, with no consequences for data integrity.
1028 GetOldestXmin(bool allDbs, bool ignoreVacuum)
1030 ProcArrayStruct *arrayP = procArray;
1031 TransactionId result;
1034 /* Cannot look for individual databases during recovery */
1035 Assert(allDbs || !RecoveryInProgress());
1037 LWLockAcquire(ProcArrayLock, LW_SHARED);
1040 * We initialize the MIN() calculation with latestCompletedXid + 1. This
1041 * is a lower bound for the XIDs that might appear in the ProcArray later,
1042 * and so protects us against overestimating the result due to future
1045 result = ShmemVariableCache->latestCompletedXid;
1046 Assert(TransactionIdIsNormal(result));
1047 TransactionIdAdvance(result);
1049 for (index = 0; index < arrayP->numProcs; index++)
1051 volatile PGPROC *proc = arrayP->procs[index];
1053 if (ignoreVacuum && (proc->vacuumFlags & PROC_IN_VACUUM))
1057 proc->databaseId == MyDatabaseId ||
1058 proc->databaseId == 0) /* always include WalSender */
1060 /* Fetch xid just once - see GetNewTransactionId */
1061 TransactionId xid = proc->xid;
1063 /* First consider the transaction's own Xid, if any */
1064 if (TransactionIdIsNormal(xid) &&
1065 TransactionIdPrecedes(xid, result))
1069 * Also consider the transaction's Xmin, if set.
1071 * We must check both Xid and Xmin because a transaction might
1072 * have an Xmin but not (yet) an Xid; conversely, if it has an
1073 * Xid, that could determine some not-yet-set Xmin.
1075 xid = proc->xmin; /* Fetch just once */
1076 if (TransactionIdIsNormal(xid) &&
1077 TransactionIdPrecedes(xid, result))
1082 if (RecoveryInProgress())
1085 * Check to see whether KnownAssignedXids contains an xid value older
1086 * than the main procarray.
1088 TransactionId kaxmin = KnownAssignedXidsGetOldestXmin();
1090 LWLockRelease(ProcArrayLock);
1092 if (TransactionIdIsNormal(kaxmin) &&
1093 TransactionIdPrecedes(kaxmin, result))
1099 * No other information needed, so release the lock immediately.
1101 LWLockRelease(ProcArrayLock);
1104 * Compute the cutoff XID by subtracting vacuum_defer_cleanup_age,
1105 * being careful not to generate a "permanent" XID.
1107 * vacuum_defer_cleanup_age provides some additional "slop" for the
1108 * benefit of hot standby queries on slave servers. This is quick and
1109 * dirty, and perhaps not all that useful unless the master has a
1110 * predictable transaction rate, but it offers some protection when
1111 * there's no walsender connection. Note that we are assuming
1112 * vacuum_defer_cleanup_age isn't large enough to cause wraparound ---
1113 * so guc.c should limit it to no more than the xidStopLimit threshold
1114 * in varsup.c. Also note that we intentionally don't apply
1115 * vacuum_defer_cleanup_age on standby servers.
1117 result -= vacuum_defer_cleanup_age;
1118 if (!TransactionIdIsNormal(result))
1119 result = FirstNormalTransactionId;
1126 * GetSnapshotData -- returns information about running transactions.
1128 * The returned snapshot includes xmin (lowest still-running xact ID),
1129 * xmax (highest completed xact ID + 1), and a list of running xact IDs
1130 * in the range xmin <= xid < xmax. It is used as follows:
1131 * All xact IDs < xmin are considered finished.
1132 * All xact IDs >= xmax are considered still running.
1133 * For an xact ID xmin <= xid < xmax, consult list to see whether
1134 * it is considered running or not.
1135 * This ensures that the set of transactions seen as "running" by the
1136 * current xact will not change after it takes the snapshot.
1138 * All running top-level XIDs are included in the snapshot, except for lazy
1139 * VACUUM processes. We also try to include running subtransaction XIDs,
1140 * but since PGPROC has only a limited cache area for subxact XIDs, full
1141 * information may not be available. If we find any overflowed subxid arrays,
1142 * we have to mark the snapshot's subxid data as overflowed, and extra work
1143 * *may* need to be done to determine what's running (see XidInMVCCSnapshot()
1146 * We also update the following backend-global variables:
1147 * TransactionXmin: the oldest xmin of any snapshot in use in the
1148 * current transaction (this is the same as MyProc->xmin).
1149 * RecentXmin: the xmin computed for the most recent snapshot. XIDs
1150 * older than this are known not running any more.
1151 * RecentGlobalXmin: the global xmin (oldest TransactionXmin across all
1152 * running transactions, except those running LAZY VACUUM). This is
1153 * the same computation done by GetOldestXmin(true, true).
1155 * Note: this function should probably not be called with an argument that's
1156 * not statically allocated (see xip allocation below).
1159 GetSnapshotData(Snapshot snapshot)
1161 ProcArrayStruct *arrayP = procArray;
1164 TransactionId globalxmin;
1168 bool suboverflowed = false;
1170 Assert(snapshot != NULL);
1173 * Allocating space for maxProcs xids is usually overkill; numProcs would
1174 * be sufficient. But it seems better to do the malloc while not holding
1175 * the lock, so we can't look at numProcs. Likewise, we allocate much
1176 * more subxip storage than is probably needed.
1178 * This does open a possibility for avoiding repeated malloc/free: since
1179 * maxProcs does not change at runtime, we can simply reuse the previous
1180 * xip arrays if any. (This relies on the fact that all callers pass
1181 * static SnapshotData structs.)
1183 if (snapshot->xip == NULL)
1186 * First call for this snapshot. Snapshot is same size whether or not
1187 * we are in recovery, see later comments.
1189 snapshot->xip = (TransactionId *)
1190 malloc(arrayP->maxProcs * sizeof(TransactionId));
1191 if (snapshot->xip == NULL)
1193 (errcode(ERRCODE_OUT_OF_MEMORY),
1194 errmsg("out of memory")));
1195 Assert(snapshot->subxip == NULL);
1196 snapshot->subxip = (TransactionId *)
1197 malloc(TOTAL_MAX_CACHED_SUBXIDS * sizeof(TransactionId));
1198 if (snapshot->subxip == NULL)
1200 (errcode(ERRCODE_OUT_OF_MEMORY),
1201 errmsg("out of memory")));
1205 * It is sufficient to get shared lock on ProcArrayLock, even if we are
1206 * going to set MyProc->xmin.
1208 LWLockAcquire(ProcArrayLock, LW_SHARED);
1210 /* xmax is always latestCompletedXid + 1 */
1211 xmax = ShmemVariableCache->latestCompletedXid;
1212 Assert(TransactionIdIsNormal(xmax));
1213 TransactionIdAdvance(xmax);
1215 /* initialize xmin calculation with xmax */
1216 globalxmin = xmin = xmax;
1219 * If we're in recovery then snapshot data comes from a different place,
1220 * so decide which route we take before grab the lock. It is possible for
1221 * recovery to end before we finish taking snapshot, and for newly
1222 * assigned transaction ids to be added to the procarray. Xmax cannot
1223 * change while we hold ProcArrayLock, so those newly added transaction
1224 * ids would be filtered away, so we need not be concerned about them.
1226 snapshot->takenDuringRecovery = RecoveryInProgress();
1228 if (!snapshot->takenDuringRecovery)
1231 * Spin over procArray checking xid, xmin, and subxids. The goal is
1232 * to gather all active xids, find the lowest xmin, and try to record
1233 * subxids. During recovery no xids will be assigned, so all normal
1234 * backends can be ignored, nor are there any VACUUMs running. All
1235 * prepared transaction xids are held in KnownAssignedXids, so these
1236 * will be seen without needing to loop through procs here.
1238 for (index = 0; index < arrayP->numProcs; index++)
1240 volatile PGPROC *proc = arrayP->procs[index];
1243 /* Ignore procs running LAZY VACUUM */
1244 if (proc->vacuumFlags & PROC_IN_VACUUM)
1247 /* Update globalxmin to be the smallest valid xmin */
1248 xid = proc->xmin; /* fetch just once */
1249 if (TransactionIdIsNormal(xid) &&
1250 TransactionIdPrecedes(xid, globalxmin))
1253 /* Fetch xid just once - see GetNewTransactionId */
1257 * If the transaction has been assigned an xid < xmax we add it to
1258 * the snapshot, and update xmin if necessary. There's no need to
1259 * store XIDs >= xmax, since we'll treat them as running anyway.
1260 * We don't bother to examine their subxids either.
1262 * We don't include our own XID (if any) in the snapshot, but we
1263 * must include it into xmin.
1265 if (TransactionIdIsNormal(xid))
1267 if (TransactionIdFollowsOrEquals(xid, xmax))
1270 snapshot->xip[count++] = xid;
1271 if (TransactionIdPrecedes(xid, xmin))
1276 * Save subtransaction XIDs if possible (if we've already
1277 * overflowed, there's no point). Note that the subxact XIDs must
1278 * be later than their parent, so no need to check them against
1279 * xmin. We could filter against xmax, but it seems better not to
1280 * do that much work while holding the ProcArrayLock.
1282 * The other backend can add more subxids concurrently, but cannot
1283 * remove any. Hence it's important to fetch nxids just once.
1284 * Should be safe to use memcpy, though. (We needn't worry about
1285 * missing any xids added concurrently, because they must postdate
1288 * Again, our own XIDs are not included in the snapshot.
1290 if (!suboverflowed && proc != MyProc)
1292 if (proc->subxids.overflowed)
1293 suboverflowed = true;
1296 int nxids = proc->subxids.nxids;
1300 memcpy(snapshot->subxip + subcount,
1301 (void *) proc->subxids.xids,
1302 nxids * sizeof(TransactionId));
1312 * We're in hot standby, so get XIDs from KnownAssignedXids.
1314 * We store all xids directly into subxip[]. Here's why:
1316 * In recovery we don't know which xids are top-level and which are
1317 * subxacts, a design choice that greatly simplifies xid processing.
1319 * It seems like we would want to try to put xids into xip[] only, but
1320 * that is fairly small. We would either need to make that bigger or
1321 * to increase the rate at which we WAL-log xid assignment; neither is
1322 * an appealing choice.
1324 * We could try to store xids into xip[] first and then into subxip[]
1325 * if there are too many xids. That only works if the snapshot doesn't
1326 * overflow because we do not search subxip[] in that case. A simpler
1327 * way is to just store all xids in the subxact array because this is
1328 * by far the bigger array. We just leave the xip array empty.
1330 * Either way we need to change the way XidInMVCCSnapshot() works
1331 * depending upon when the snapshot was taken, or change normal
1332 * snapshot processing so it matches.
1334 subcount = KnownAssignedXidsGetAndSetXmin(snapshot->subxip, &xmin,
1337 if (TransactionIdPrecedesOrEquals(xmin, procArray->lastOverflowedXid))
1338 suboverflowed = true;
1341 if (!TransactionIdIsValid(MyProc->xmin))
1342 MyProc->xmin = TransactionXmin = xmin;
1344 LWLockRelease(ProcArrayLock);
1347 * Update globalxmin to include actual process xids. This is a slightly
1348 * different way of computing it than GetOldestXmin uses, but should give
1351 if (TransactionIdPrecedes(xmin, globalxmin))
1354 /* Update global variables too */
1355 RecentGlobalXmin = globalxmin - vacuum_defer_cleanup_age;
1356 if (!TransactionIdIsNormal(RecentGlobalXmin))
1357 RecentGlobalXmin = FirstNormalTransactionId;
1360 snapshot->xmin = xmin;
1361 snapshot->xmax = xmax;
1362 snapshot->xcnt = count;
1363 snapshot->subxcnt = subcount;
1364 snapshot->suboverflowed = suboverflowed;
1366 snapshot->curcid = GetCurrentCommandId(false);
1369 * This is a new snapshot, so set both refcounts are zero, and mark it as
1370 * not copied in persistent memory.
1372 snapshot->active_count = 0;
1373 snapshot->regd_count = 0;
1374 snapshot->copied = false;
1380 * GetRunningTransactionData -- returns information about running transactions.
1382 * Similar to GetSnapshotData but returns more information. We include
1383 * all PGPROCs with an assigned TransactionId, even VACUUM processes.
1385 * We acquire XidGenLock, but the caller is responsible for releasing it.
1386 * This ensures that no new XIDs enter the proc array until the caller has
1387 * WAL-logged this snapshot, and releases the lock.
1389 * The returned data structure is statically allocated; caller should not
1390 * modify it, and must not assume it is valid past the next call.
1392 * This is never executed during recovery so there is no need to look at
1393 * KnownAssignedXids.
1395 * We don't worry about updating other counters, we want to keep this as
1396 * simple as possible and leave GetSnapshotData() as the primary code for
1400 GetRunningTransactionData(void)
1402 /* result workspace */
1403 static RunningTransactionsData CurrentRunningXactsData;
1405 ProcArrayStruct *arrayP = procArray;
1406 RunningTransactions CurrentRunningXacts = &CurrentRunningXactsData;
1407 TransactionId latestCompletedXid;
1408 TransactionId oldestRunningXid;
1409 TransactionId *xids;
1415 Assert(!RecoveryInProgress());
1418 * Allocating space for maxProcs xids is usually overkill; numProcs would
1419 * be sufficient. But it seems better to do the malloc while not holding
1420 * the lock, so we can't look at numProcs. Likewise, we allocate much
1421 * more subxip storage than is probably needed.
1423 * Should only be allocated in bgwriter, since only ever executed during
1426 if (CurrentRunningXacts->xids == NULL)
1431 CurrentRunningXacts->xids = (TransactionId *)
1432 malloc(TOTAL_MAX_CACHED_SUBXIDS * sizeof(TransactionId));
1433 if (CurrentRunningXacts->xids == NULL)
1435 (errcode(ERRCODE_OUT_OF_MEMORY),
1436 errmsg("out of memory")));
1439 xids = CurrentRunningXacts->xids;
1441 count = subcount = 0;
1442 suboverflowed = false;
1445 * Ensure that no xids enter or leave the procarray while we obtain
1448 LWLockAcquire(ProcArrayLock, LW_SHARED);
1449 LWLockAcquire(XidGenLock, LW_SHARED);
1451 latestCompletedXid = ShmemVariableCache->latestCompletedXid;
1453 oldestRunningXid = ShmemVariableCache->nextXid;
1456 * Spin over procArray collecting all xids and subxids.
1458 for (index = 0; index < arrayP->numProcs; index++)
1460 volatile PGPROC *proc = arrayP->procs[index];
1464 /* Fetch xid just once - see GetNewTransactionId */
1468 * We don't need to store transactions that don't have a TransactionId
1469 * yet because they will not show as running on a standby server.
1471 if (!TransactionIdIsValid(xid))
1474 xids[count++] = xid;
1476 if (TransactionIdPrecedes(xid, oldestRunningXid))
1477 oldestRunningXid = xid;
1480 * Save subtransaction XIDs. Other backends can't add or remove
1481 * entries while we're holding XidGenLock.
1483 nxids = proc->subxids.nxids;
1486 memcpy(&xids[count], (void *) proc->subxids.xids,
1487 nxids * sizeof(TransactionId));
1491 if (proc->subxids.overflowed)
1492 suboverflowed = true;
1495 * Top-level XID of a transaction is always less than any of its
1496 * subxids, so we don't need to check if any of the subxids are
1497 * smaller than oldestRunningXid
1502 CurrentRunningXacts->xcnt = count;
1503 CurrentRunningXacts->subxid_overflow = suboverflowed;
1504 CurrentRunningXacts->nextXid = ShmemVariableCache->nextXid;
1505 CurrentRunningXacts->oldestRunningXid = oldestRunningXid;
1506 CurrentRunningXacts->latestCompletedXid = latestCompletedXid;
1508 /* We don't release XidGenLock here, the caller is responsible for that */
1509 LWLockRelease(ProcArrayLock);
1511 Assert(TransactionIdIsValid(CurrentRunningXacts->nextXid));
1512 Assert(TransactionIdIsValid(CurrentRunningXacts->oldestRunningXid));
1513 Assert(TransactionIdIsNormal(CurrentRunningXacts->latestCompletedXid));
1515 return CurrentRunningXacts;
1519 * GetTransactionsInCommit -- Get the XIDs of transactions that are committing
1521 * Constructs an array of XIDs of transactions that are currently in commit
1522 * critical sections, as shown by having inCommit set in their PGPROC entries.
1524 * *xids_p is set to a palloc'd array that should be freed by the caller.
1525 * The return value is the number of valid entries.
1527 * Note that because backends set or clear inCommit without holding any lock,
1528 * the result is somewhat indeterminate, but we don't really care. Even in
1529 * a multiprocessor with delayed writes to shared memory, it should be certain
1530 * that setting of inCommit will propagate to shared memory when the backend
1531 * takes the WALInsertLock, so we cannot fail to see an xact as inCommit if
1532 * it's already inserted its commit record. Whether it takes a little while
1533 * for clearing of inCommit to propagate is unimportant for correctness.
1536 GetTransactionsInCommit(TransactionId **xids_p)
1538 ProcArrayStruct *arrayP = procArray;
1539 TransactionId *xids;
1543 xids = (TransactionId *) palloc(arrayP->maxProcs * sizeof(TransactionId));
1546 LWLockAcquire(ProcArrayLock, LW_SHARED);
1548 for (index = 0; index < arrayP->numProcs; index++)
1550 volatile PGPROC *proc = arrayP->procs[index];
1552 /* Fetch xid just once - see GetNewTransactionId */
1553 TransactionId pxid = proc->xid;
1555 if (proc->inCommit && TransactionIdIsValid(pxid))
1556 xids[nxids++] = pxid;
1559 LWLockRelease(ProcArrayLock);
1566 * HaveTransactionsInCommit -- Are any of the specified XIDs in commit?
1568 * This is used with the results of GetTransactionsInCommit to see if any
1569 * of the specified XIDs are still in their commit critical sections.
1571 * Note: this is O(N^2) in the number of xacts that are/were in commit, but
1572 * those numbers should be small enough for it not to be a problem.
1575 HaveTransactionsInCommit(TransactionId *xids, int nxids)
1577 bool result = false;
1578 ProcArrayStruct *arrayP = procArray;
1581 LWLockAcquire(ProcArrayLock, LW_SHARED);
1583 for (index = 0; index < arrayP->numProcs; index++)
1585 volatile PGPROC *proc = arrayP->procs[index];
1587 /* Fetch xid just once - see GetNewTransactionId */
1588 TransactionId pxid = proc->xid;
1590 if (proc->inCommit && TransactionIdIsValid(pxid))
1594 for (i = 0; i < nxids; i++)
1596 if (xids[i] == pxid)
1607 LWLockRelease(ProcArrayLock);
1613 * BackendPidGetProc -- get a backend's PGPROC given its PID
1615 * Returns NULL if not found. Note that it is up to the caller to be
1616 * sure that the question remains meaningful for long enough for the
1617 * answer to be used ...
1620 BackendPidGetProc(int pid)
1622 PGPROC *result = NULL;
1623 ProcArrayStruct *arrayP = procArray;
1626 if (pid == 0) /* never match dummy PGPROCs */
1629 LWLockAcquire(ProcArrayLock, LW_SHARED);
1631 for (index = 0; index < arrayP->numProcs; index++)
1633 PGPROC *proc = arrayP->procs[index];
1635 if (proc->pid == pid)
1642 LWLockRelease(ProcArrayLock);
1648 * BackendXidGetPid -- get a backend's pid given its XID
1650 * Returns 0 if not found or it's a prepared transaction. Note that
1651 * it is up to the caller to be sure that the question remains
1652 * meaningful for long enough for the answer to be used ...
1654 * Only main transaction Ids are considered. This function is mainly
1655 * useful for determining what backend owns a lock.
1657 * Beware that not every xact has an XID assigned. However, as long as you
1658 * only call this using an XID found on disk, you're safe.
1661 BackendXidGetPid(TransactionId xid)
1664 ProcArrayStruct *arrayP = procArray;
1667 if (xid == InvalidTransactionId) /* never match invalid xid */
1670 LWLockAcquire(ProcArrayLock, LW_SHARED);
1672 for (index = 0; index < arrayP->numProcs; index++)
1674 volatile PGPROC *proc = arrayP->procs[index];
1676 if (proc->xid == xid)
1683 LWLockRelease(ProcArrayLock);
1689 * IsBackendPid -- is a given pid a running backend
1692 IsBackendPid(int pid)
1694 return (BackendPidGetProc(pid) != NULL);
1699 * GetCurrentVirtualXIDs -- returns an array of currently active VXIDs.
1701 * The array is palloc'd. The number of valid entries is returned into *nvxids.
1703 * The arguments allow filtering the set of VXIDs returned. Our own process
1704 * is always skipped. In addition:
1705 * If limitXmin is not InvalidTransactionId, skip processes with
1707 * If excludeXmin0 is true, skip processes with xmin = 0.
1708 * If allDbs is false, skip processes attached to other databases.
1709 * If excludeVacuum isn't zero, skip processes for which
1710 * (vacuumFlags & excludeVacuum) is not zero.
1712 * Note: the purpose of the limitXmin and excludeXmin0 parameters is to
1713 * allow skipping backends whose oldest live snapshot is no older than
1714 * some snapshot we have. Since we examine the procarray with only shared
1715 * lock, there are race conditions: a backend could set its xmin just after
1716 * we look. Indeed, on multiprocessors with weak memory ordering, the
1717 * other backend could have set its xmin *before* we look. We know however
1718 * that such a backend must have held shared ProcArrayLock overlapping our
1719 * own hold of ProcArrayLock, else we would see its xmin update. Therefore,
1720 * any snapshot the other backend is taking concurrently with our scan cannot
1721 * consider any transactions as still running that we think are committed
1722 * (since backends must hold ProcArrayLock exclusive to commit).
1724 VirtualTransactionId *
1725 GetCurrentVirtualXIDs(TransactionId limitXmin, bool excludeXmin0,
1726 bool allDbs, int excludeVacuum,
1729 VirtualTransactionId *vxids;
1730 ProcArrayStruct *arrayP = procArray;
1734 /* allocate what's certainly enough result space */
1735 vxids = (VirtualTransactionId *)
1736 palloc(sizeof(VirtualTransactionId) * arrayP->maxProcs);
1738 LWLockAcquire(ProcArrayLock, LW_SHARED);
1740 for (index = 0; index < arrayP->numProcs; index++)
1742 volatile PGPROC *proc = arrayP->procs[index];
1747 if (excludeVacuum & proc->vacuumFlags)
1750 if (allDbs || proc->databaseId == MyDatabaseId)
1752 /* Fetch xmin just once - might change on us */
1753 TransactionId pxmin = proc->xmin;
1755 if (excludeXmin0 && !TransactionIdIsValid(pxmin))
1759 * InvalidTransactionId precedes all other XIDs, so a proc that
1760 * hasn't set xmin yet will not be rejected by this test.
1762 if (!TransactionIdIsValid(limitXmin) ||
1763 TransactionIdPrecedesOrEquals(pxmin, limitXmin))
1765 VirtualTransactionId vxid;
1767 GET_VXID_FROM_PGPROC(vxid, *proc);
1768 if (VirtualTransactionIdIsValid(vxid))
1769 vxids[count++] = vxid;
1774 LWLockRelease(ProcArrayLock);
1781 * GetConflictingVirtualXIDs -- returns an array of currently active VXIDs.
1783 * Usage is limited to conflict resolution during recovery on standby servers.
1784 * limitXmin is supplied as either latestRemovedXid, or InvalidTransactionId
1785 * in cases where we cannot accurately determine a value for latestRemovedXid.
1787 * If limitXmin is InvalidTransactionId then we want to kill everybody,
1788 * so we're not worried if they have a snapshot or not, nor does it really
1789 * matter what type of lock we hold.
1791 * All callers that are checking xmins always now supply a valid and useful
1792 * value for limitXmin. The limitXmin is always lower than the lowest
1793 * numbered KnownAssignedXid that is not already a FATAL error. This is
1794 * because we only care about cleanup records that are cleaning up tuple
1795 * versions from committed transactions. In that case they will only occur
1796 * at the point where the record is less than the lowest running xid. That
1797 * allows us to say that if any backend takes a snapshot concurrently with
1798 * us then the conflict assessment made here would never include the snapshot
1799 * that is being derived. So we take LW_SHARED on the ProcArray and allow
1800 * concurrent snapshots when limitXmin is valid. We might think about adding
1801 * Assert(limitXmin < lowest(KnownAssignedXids))
1802 * but that would not be true in the case of FATAL errors lagging in array,
1803 * but we already know those are bogus anyway, so we skip that test.
1805 * If dbOid is valid we skip backends attached to other databases.
1807 * Be careful to *not* pfree the result from this function. We reuse
1808 * this array sufficiently often that we use malloc for the result.
1810 VirtualTransactionId *
1811 GetConflictingVirtualXIDs(TransactionId limitXmin, Oid dbOid)
1813 static VirtualTransactionId *vxids;
1814 ProcArrayStruct *arrayP = procArray;
1819 * If first time through, get workspace to remember main XIDs in. We
1820 * malloc it permanently to avoid repeated palloc/pfree overhead. Allow
1821 * result space, remembering room for a terminator.
1825 vxids = (VirtualTransactionId *)
1826 malloc(sizeof(VirtualTransactionId) * (arrayP->maxProcs + 1));
1829 (errcode(ERRCODE_OUT_OF_MEMORY),
1830 errmsg("out of memory")));
1833 LWLockAcquire(ProcArrayLock, LW_SHARED);
1835 for (index = 0; index < arrayP->numProcs; index++)
1837 volatile PGPROC *proc = arrayP->procs[index];
1839 /* Exclude prepared transactions */
1843 if (!OidIsValid(dbOid) ||
1844 proc->databaseId == dbOid)
1846 /* Fetch xmin just once - can't change on us, but good coding */
1847 TransactionId pxmin = proc->xmin;
1850 * We ignore an invalid pxmin because this means that backend has
1851 * no snapshot and cannot get another one while we hold exclusive
1854 if (!TransactionIdIsValid(limitXmin) ||
1855 (TransactionIdIsValid(pxmin) && !TransactionIdFollows(pxmin, limitXmin)))
1857 VirtualTransactionId vxid;
1859 GET_VXID_FROM_PGPROC(vxid, *proc);
1860 if (VirtualTransactionIdIsValid(vxid))
1861 vxids[count++] = vxid;
1866 LWLockRelease(ProcArrayLock);
1868 /* add the terminator */
1869 vxids[count].backendId = InvalidBackendId;
1870 vxids[count].localTransactionId = InvalidLocalTransactionId;
1876 * CancelVirtualTransaction - used in recovery conflict processing
1878 * Returns pid of the process signaled, or 0 if not found.
1881 CancelVirtualTransaction(VirtualTransactionId vxid, ProcSignalReason sigmode)
1883 ProcArrayStruct *arrayP = procArray;
1887 LWLockAcquire(ProcArrayLock, LW_SHARED);
1889 for (index = 0; index < arrayP->numProcs; index++)
1891 VirtualTransactionId procvxid;
1892 PGPROC *proc = arrayP->procs[index];
1894 GET_VXID_FROM_PGPROC(procvxid, *proc);
1896 if (procvxid.backendId == vxid.backendId &&
1897 procvxid.localTransactionId == vxid.localTransactionId)
1899 proc->recoveryConflictPending = true;
1904 * Kill the pid if it's still here. If not, that's what we
1905 * wanted so ignore any errors.
1907 (void) SendProcSignal(pid, sigmode, vxid.backendId);
1913 LWLockRelease(ProcArrayLock);
1919 * MinimumActiveBackends --- count backends (other than myself) that are
1920 * in active transactions. Return true if the count exceeds the
1921 * minimum threshold passed. This is used as a heuristic to decide if
1922 * a pre-XLOG-flush delay is worthwhile during commit.
1924 * Do not count backends that are blocked waiting for locks, since they are
1925 * not going to get to run until someone else commits.
1928 MinimumActiveBackends(int min)
1930 ProcArrayStruct *arrayP = procArray;
1934 /* Quick short-circuit if no minimum is specified */
1939 * Note: for speed, we don't acquire ProcArrayLock. This is a little bit
1940 * bogus, but since we are only testing fields for zero or nonzero, it
1941 * should be OK. The result is only used for heuristic purposes anyway...
1943 for (index = 0; index < arrayP->numProcs; index++)
1945 volatile PGPROC *proc = arrayP->procs[index];
1948 * Since we're not holding a lock, need to check that the pointer is
1949 * valid. Someone holding the lock could have incremented numProcs
1950 * already, but not yet inserted a valid pointer to the array.
1952 * If someone just decremented numProcs, 'proc' could also point to a
1953 * PGPROC entry that's no longer in the array. It still points to a
1954 * PGPROC struct, though, because freed PGPPROC entries just go to the
1955 * free list and are recycled. Its contents are nonsense in that case,
1956 * but that's acceptable for this function.
1962 continue; /* do not count myself */
1964 continue; /* do not count prepared xacts */
1965 if (proc->xid == InvalidTransactionId)
1966 continue; /* do not count if no XID assigned */
1967 if (proc->waitLock != NULL)
1968 continue; /* do not count if blocked on a lock */
1974 return count >= min;
1978 * CountDBBackends --- count backends that are using specified database
1981 CountDBBackends(Oid databaseid)
1983 ProcArrayStruct *arrayP = procArray;
1987 LWLockAcquire(ProcArrayLock, LW_SHARED);
1989 for (index = 0; index < arrayP->numProcs; index++)
1991 volatile PGPROC *proc = arrayP->procs[index];
1994 continue; /* do not count prepared xacts */
1995 if (!OidIsValid(databaseid) ||
1996 proc->databaseId == databaseid)
2000 LWLockRelease(ProcArrayLock);
2006 * CancelDBBackends --- cancel backends that are using specified database
2009 CancelDBBackends(Oid databaseid, ProcSignalReason sigmode, bool conflictPending)
2011 ProcArrayStruct *arrayP = procArray;
2015 /* tell all backends to die */
2016 LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);
2018 for (index = 0; index < arrayP->numProcs; index++)
2020 volatile PGPROC *proc = arrayP->procs[index];
2022 if (databaseid == InvalidOid || proc->databaseId == databaseid)
2024 VirtualTransactionId procvxid;
2026 GET_VXID_FROM_PGPROC(procvxid, *proc);
2028 proc->recoveryConflictPending = conflictPending;
2033 * Kill the pid if it's still here. If not, that's what we
2034 * wanted so ignore any errors.
2036 (void) SendProcSignal(pid, sigmode, procvxid.backendId);
2041 LWLockRelease(ProcArrayLock);
2045 * CountUserBackends --- count backends that are used by specified user
2048 CountUserBackends(Oid roleid)
2050 ProcArrayStruct *arrayP = procArray;
2054 LWLockAcquire(ProcArrayLock, LW_SHARED);
2056 for (index = 0; index < arrayP->numProcs; index++)
2058 volatile PGPROC *proc = arrayP->procs[index];
2061 continue; /* do not count prepared xacts */
2062 if (proc->roleId == roleid)
2066 LWLockRelease(ProcArrayLock);
2072 * CountOtherDBBackends -- check for other backends running in the given DB
2074 * If there are other backends in the DB, we will wait a maximum of 5 seconds
2075 * for them to exit. Autovacuum backends are encouraged to exit early by
2076 * sending them SIGTERM, but normal user backends are just waited for.
2078 * The current backend is always ignored; it is caller's responsibility to
2079 * check whether the current backend uses the given DB, if it's important.
2081 * Returns TRUE if there are (still) other backends in the DB, FALSE if not.
2082 * Also, *nbackends and *nprepared are set to the number of other backends
2083 * and prepared transactions in the DB, respectively.
2085 * This function is used to interlock DROP DATABASE and related commands
2086 * against there being any active backends in the target DB --- dropping the
2087 * DB while active backends remain would be a Bad Thing. Note that we cannot
2088 * detect here the possibility of a newly-started backend that is trying to
2089 * connect to the doomed database, so additional interlocking is needed during
2090 * backend startup. The caller should normally hold an exclusive lock on the
2091 * target DB before calling this, which is one reason we mustn't wait
2095 CountOtherDBBackends(Oid databaseId, int *nbackends, int *nprepared)
2097 ProcArrayStruct *arrayP = procArray;
2099 #define MAXAUTOVACPIDS 10 /* max autovacs to SIGTERM per iteration */
2100 int autovac_pids[MAXAUTOVACPIDS];
2103 /* 50 tries with 100ms sleep between tries makes 5 sec total wait */
2104 for (tries = 0; tries < 50; tries++)
2110 CHECK_FOR_INTERRUPTS();
2112 *nbackends = *nprepared = 0;
2114 LWLockAcquire(ProcArrayLock, LW_SHARED);
2116 for (index = 0; index < arrayP->numProcs; index++)
2118 volatile PGPROC *proc = arrayP->procs[index];
2120 if (proc->databaseId != databaseId)
2132 if ((proc->vacuumFlags & PROC_IS_AUTOVACUUM) &&
2133 nautovacs < MAXAUTOVACPIDS)
2134 autovac_pids[nautovacs++] = proc->pid;
2138 LWLockRelease(ProcArrayLock);
2141 return false; /* no conflicting backends, so done */
2144 * Send SIGTERM to any conflicting autovacuums before sleeping. We
2145 * postpone this step until after the loop because we don't want to
2146 * hold ProcArrayLock while issuing kill(). We have no idea what might
2147 * block kill() inside the kernel...
2149 for (index = 0; index < nautovacs; index++)
2150 (void) kill(autovac_pids[index], SIGTERM); /* ignore any error */
2152 /* sleep, then try again */
2153 pg_usleep(100 * 1000L); /* 100ms */
2156 return true; /* timed out, still conflicts */
2160 #define XidCacheRemove(i) \
2162 MyProc->subxids.xids[i] = MyProc->subxids.xids[MyProc->subxids.nxids - 1]; \
2163 MyProc->subxids.nxids--; \
2167 * XidCacheRemoveRunningXids
2169 * Remove a bunch of TransactionIds from the list of known-running
2170 * subtransactions for my backend. Both the specified xid and those in
2171 * the xids[] array (of length nxids) are removed from the subxids cache.
2172 * latestXid must be the latest XID among the group.
2175 XidCacheRemoveRunningXids(TransactionId xid,
2176 int nxids, const TransactionId *xids,
2177 TransactionId latestXid)
2182 Assert(TransactionIdIsValid(xid));
2185 * We must hold ProcArrayLock exclusively in order to remove transactions
2186 * from the PGPROC array. (See src/backend/access/transam/README.) It's
2187 * possible this could be relaxed since we know this routine is only used
2188 * to abort subtransactions, but pending closer analysis we'd best be
2191 LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);
2194 * Under normal circumstances xid and xids[] will be in increasing order,
2195 * as will be the entries in subxids. Scan backwards to avoid O(N^2)
2196 * behavior when removing a lot of xids.
2198 for (i = nxids - 1; i >= 0; i--)
2200 TransactionId anxid = xids[i];
2202 for (j = MyProc->subxids.nxids - 1; j >= 0; j--)
2204 if (TransactionIdEquals(MyProc->subxids.xids[j], anxid))
2212 * Ordinarily we should have found it, unless the cache has
2213 * overflowed. However it's also possible for this routine to be
2214 * invoked multiple times for the same subtransaction, in case of an
2215 * error during AbortSubTransaction. So instead of Assert, emit a
2218 if (j < 0 && !MyProc->subxids.overflowed)
2219 elog(WARNING, "did not find subXID %u in MyProc", anxid);
2222 for (j = MyProc->subxids.nxids - 1; j >= 0; j--)
2224 if (TransactionIdEquals(MyProc->subxids.xids[j], xid))
2230 /* Ordinarily we should have found it, unless the cache has overflowed */
2231 if (j < 0 && !MyProc->subxids.overflowed)
2232 elog(WARNING, "did not find subXID %u in MyProc", xid);
2234 /* Also advance global latestCompletedXid while holding the lock */
2235 if (TransactionIdPrecedes(ShmemVariableCache->latestCompletedXid,
2237 ShmemVariableCache->latestCompletedXid = latestXid;
2239 LWLockRelease(ProcArrayLock);
2242 #ifdef XIDCACHE_DEBUG
2245 * Print stats about effectiveness of XID cache
2248 DisplayXidCache(void)
2251 "XidCache: xmin: %ld, known: %ld, myxact: %ld, latest: %ld, mainxid: %ld, childxid: %ld, knownassigned: %ld, nooflo: %ld, slow: %ld\n",
2258 xc_by_known_assigned,
2262 #endif /* XIDCACHE_DEBUG */
2265 /* ----------------------------------------------
2266 * KnownAssignedTransactions sub-module
2267 * ----------------------------------------------
2271 * In Hot Standby mode, we maintain a list of transactions that are (or were)
2272 * running in the master at the current point in WAL. These XIDs must be
2273 * treated as running by standby transactions, even though they are not in
2274 * the standby server's PGPROC array.
2276 * We record all XIDs that we know have been assigned. That includes all the
2277 * XIDs seen in WAL records, plus all unobserved XIDs that we can deduce have
2278 * been assigned. We can deduce the existence of unobserved XIDs because we
2279 * know XIDs are assigned in sequence, with no gaps. The KnownAssignedXids
2280 * list expands as new XIDs are observed or inferred, and contracts when
2281 * transaction completion records arrive.
2283 * During hot standby we do not fret too much about the distinction between
2284 * top-level XIDs and subtransaction XIDs. We store both together in the
2285 * KnownAssignedXids list. In backends, this is copied into snapshots in
2286 * GetSnapshotData(), taking advantage of the fact that XidInMVCCSnapshot()
2287 * doesn't care about the distinction either. Subtransaction XIDs are
2288 * effectively treated as top-level XIDs and in the typical case pg_subtrans
2289 * links are *not* maintained (which does not affect visibility).
2291 * We have room in KnownAssignedXids and in snapshots to hold maxProcs *
2292 * (1 + PGPROC_MAX_CACHED_SUBXIDS) XIDs, so every master transaction must
2293 * report its subtransaction XIDs in a WAL XLOG_XACT_ASSIGNMENT record at
2294 * least every PGPROC_MAX_CACHED_SUBXIDS. When we receive one of these
2295 * records, we mark the subXIDs as children of the top XID in pg_subtrans,
2296 * and then remove them from KnownAssignedXids. This prevents overflow of
2297 * KnownAssignedXids and snapshots, at the cost that status checks for these
2298 * subXIDs will take a slower path through TransactionIdIsInProgress().
2299 * This means that KnownAssignedXids is not necessarily complete for subXIDs,
2300 * though it should be complete for top-level XIDs; this is the same situation
2301 * that holds with respect to the PGPROC entries in normal running.
2303 * When we throw away subXIDs from KnownAssignedXids, we need to keep track of
2304 * that, similarly to tracking overflow of a PGPROC's subxids array. We do
2305 * that by remembering the lastOverflowedXID, ie the last thrown-away subXID.
2306 * As long as that is within the range of interesting XIDs, we have to assume
2307 * that subXIDs are missing from snapshots. (Note that subXID overflow occurs
2308 * on primary when 65th subXID arrives, whereas on standby it occurs when 64th
2309 * subXID arrives - that is not an error.)
2311 * Should a backend on primary somehow disappear before it can write an abort
2312 * record, then we just leave those XIDs in KnownAssignedXids. They actually
2313 * aborted but we think they were running; the distinction is irrelevant
2314 * because either way any changes done by the transaction are not visible to
2315 * backends in the standby. We prune KnownAssignedXids when
2316 * XLOG_RUNNING_XACTS arrives, to forestall possible overflow of the
2317 * array due to such dead XIDs.
2321 * RecordKnownAssignedTransactionIds
2322 * Record the given XID in KnownAssignedXids, as well as any preceding
2325 * RecordKnownAssignedTransactionIds() should be run for *every* WAL record
2326 * associated with a transaction. Must be called for each record after we
2327 * have executed StartupCLOG() et al, since we must ExtendCLOG() etc..
2329 * Called during recovery in analogy with and in place of GetNewTransactionId()
2332 RecordKnownAssignedTransactionIds(TransactionId xid)
2334 Assert(standbyState >= STANDBY_INITIALIZED);
2335 Assert(TransactionIdIsValid(xid));
2337 elog(trace_recovery(DEBUG4), "record known xact %u latestObservedXid %u",
2338 xid, latestObservedXid);
2341 * If the KnownAssignedXids machinery isn't up yet, do nothing.
2343 if (standbyState <= STANDBY_INITIALIZED)
2346 Assert(TransactionIdIsValid(latestObservedXid));
2349 * When a newly observed xid arrives, it is frequently the case that it is
2350 * *not* the next xid in sequence. When this occurs, we must treat the
2351 * intervening xids as running also.
2353 if (TransactionIdFollows(xid, latestObservedXid))
2355 TransactionId next_expected_xid;
2358 * Extend clog and subtrans like we do in GetNewTransactionId() during
2359 * normal operation using individual extend steps. Typical case
2360 * requires almost no activity.
2362 next_expected_xid = latestObservedXid;
2363 TransactionIdAdvance(next_expected_xid);
2364 while (TransactionIdPrecedesOrEquals(next_expected_xid, xid))
2366 ExtendCLOG(next_expected_xid);
2367 ExtendSUBTRANS(next_expected_xid);
2369 TransactionIdAdvance(next_expected_xid);
2373 * Add the new xids onto the KnownAssignedXids array.
2375 next_expected_xid = latestObservedXid;
2376 TransactionIdAdvance(next_expected_xid);
2377 KnownAssignedXidsAdd(next_expected_xid, xid, false);
2380 * Now we can advance latestObservedXid
2382 latestObservedXid = xid;
2384 /* ShmemVariableCache->nextXid must be beyond any observed xid */
2385 next_expected_xid = latestObservedXid;
2386 TransactionIdAdvance(next_expected_xid);
2387 ShmemVariableCache->nextXid = next_expected_xid;
2392 * ExpireTreeKnownAssignedTransactionIds
2393 * Remove the given XIDs from KnownAssignedXids.
2395 * Called during recovery in analogy with and in place of ProcArrayEndTransaction()
2398 ExpireTreeKnownAssignedTransactionIds(TransactionId xid, int nsubxids,
2399 TransactionId *subxids, TransactionId max_xid)
2401 Assert(standbyState >= STANDBY_INITIALIZED);
2404 * Uses same locking as transaction commit
2406 LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);
2408 KnownAssignedXidsRemoveTree(xid, nsubxids, subxids);
2410 /* As in ProcArrayEndTransaction, advance latestCompletedXid */
2411 if (TransactionIdPrecedes(ShmemVariableCache->latestCompletedXid,
2413 ShmemVariableCache->latestCompletedXid = max_xid;
2415 LWLockRelease(ProcArrayLock);
2419 * ExpireAllKnownAssignedTransactionIds
2420 * Remove all entries in KnownAssignedXids
2423 ExpireAllKnownAssignedTransactionIds(void)
2425 LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);
2426 KnownAssignedXidsRemovePreceding(InvalidTransactionId);
2427 LWLockRelease(ProcArrayLock);
2431 * ExpireOldKnownAssignedTransactionIds
2432 * Remove KnownAssignedXids entries preceding the given XID
2435 ExpireOldKnownAssignedTransactionIds(TransactionId xid)
2437 LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);
2438 KnownAssignedXidsRemovePreceding(xid);
2439 LWLockRelease(ProcArrayLock);
2444 * Private module functions to manipulate KnownAssignedXids
2446 * There are 5 main uses of the KnownAssignedXids data structure:
2448 * * backends taking snapshots - all valid XIDs need to be copied out
2449 * * backends seeking to determine presence of a specific XID
2450 * * startup process adding new known-assigned XIDs
2451 * * startup process removing specific XIDs as transactions end
2452 * * startup process pruning array when special WAL records arrive
2454 * This data structure is known to be a hot spot during Hot Standby, so we
2455 * go to some lengths to make these operations as efficient and as concurrent
2458 * The XIDs are stored in an array in sorted order --- TransactionIdPrecedes
2459 * order, to be exact --- to allow binary search for specific XIDs. Note:
2460 * in general TransactionIdPrecedes would not provide a total order, but
2461 * we know that the entries present at any instant should not extend across
2462 * a large enough fraction of XID space to wrap around (the master would
2463 * shut down for fear of XID wrap long before that happens). So it's OK to
2464 * use TransactionIdPrecedes as a binary-search comparator.
2466 * It's cheap to maintain the sortedness during insertions, since new known
2467 * XIDs are always reported in XID order; we just append them at the right.
2469 * To keep individual deletions cheap, we need to allow gaps in the array.
2470 * This is implemented by marking array elements as valid or invalid using
2471 * the parallel boolean array KnownAssignedXidsValid[]. A deletion is done
2472 * by setting KnownAssignedXidsValid[i] to false, *without* clearing the
2473 * XID entry itself. This preserves the property that the XID entries are
2474 * sorted, so we can do binary searches easily. Periodically we compress
2475 * out the unused entries; that's much cheaper than having to compress the
2476 * array immediately on every deletion.
2478 * The actually valid items in KnownAssignedXids[] and KnownAssignedXidsValid[]
2479 * are those with indexes tail <= i < head; items outside this subscript range
2480 * have unspecified contents. When head reaches the end of the array, we
2481 * force compression of unused entries rather than wrapping around, since
2482 * allowing wraparound would greatly complicate the search logic. We maintain
2483 * an explicit tail pointer so that pruning of old XIDs can be done without
2484 * immediately moving the array contents. In most cases only a small fraction
2485 * of the array contains valid entries at any instant.
2487 * Although only the startup process can ever change the KnownAssignedXids
2488 * data structure, we still need interlocking so that standby backends will
2489 * not observe invalid intermediate states. The convention is that backends
2490 * must hold shared ProcArrayLock to examine the array. To remove XIDs from
2491 * the array, the startup process must hold ProcArrayLock exclusively, for
2492 * the usual transactional reasons (compare commit/abort of a transaction
2493 * during normal running). Compressing unused entries out of the array
2494 * likewise requires exclusive lock. To add XIDs to the array, we just insert
2495 * them into slots to the right of the head pointer and then advance the head
2496 * pointer. This wouldn't require any lock at all, except that on machines
2497 * with weak memory ordering we need to be careful that other processors
2498 * see the array element changes before they see the head pointer change.
2499 * We handle this by using a spinlock to protect reads and writes of the
2500 * head/tail pointers. (We could dispense with the spinlock if we were to
2501 * create suitable memory access barrier primitives and use those instead.)
2502 * The spinlock must be taken to read or write the head/tail pointers unless
2503 * the caller holds ProcArrayLock exclusively.
2505 * Algorithmic analysis:
2507 * If we have a maximum of M slots, with N XIDs currently spread across
2508 * S elements then we have N <= S <= M always.
2510 * * Adding a new XID is O(1) and needs little locking (unless compression
2512 * * Compressing the array is O(S) and requires exclusive lock
2513 * * Removing an XID is O(logS) and requires exclusive lock
2514 * * Taking a snapshot is O(S) and requires shared lock
2515 * * Checking for an XID is O(logS) and requires shared lock
2517 * In comparison, using a hash table for KnownAssignedXids would mean that
2518 * taking snapshots would be O(M). If we can maintain S << M then the
2519 * sorted array technique will deliver significantly faster snapshots.
2520 * If we try to keep S too small then we will spend too much time compressing,
2521 * so there is an optimal point for any workload mix. We use a heuristic to
2522 * decide when to compress the array, though trimming also helps reduce
2523 * frequency of compressing. The heuristic requires us to track the number of
2524 * currently valid XIDs in the array.
2529 * Compress KnownAssignedXids by shifting valid data down to the start of the
2530 * array, removing any gaps.
2532 * A compression step is forced if "force" is true, otherwise we do it
2533 * only if a heuristic indicates it's a good time to do it.
2535 * Caller must hold ProcArrayLock in exclusive mode.
2538 KnownAssignedXidsCompress(bool force)
2540 /* use volatile pointer to prevent code rearrangement */
2541 volatile ProcArrayStruct *pArray = procArray;
2547 /* no spinlock required since we hold ProcArrayLock exclusively */
2548 head = pArray->headKnownAssignedXids;
2549 tail = pArray->tailKnownAssignedXids;
2554 * If we can choose how much to compress, use a heuristic to avoid
2555 * compressing too often or not often enough.
2557 * Heuristic is if we have a large enough current spread and less than
2558 * 50% of the elements are currently in use, then compress. This
2559 * should ensure we compress fairly infrequently. We could compress
2560 * less often though the virtual array would spread out more and
2561 * snapshots would become more expensive.
2563 int nelements = head - tail;
2565 if (nelements < 4 * PROCARRAY_MAXPROCS ||
2566 nelements < 2 * pArray->numKnownAssignedXids)
2571 * We compress the array by reading the valid values from tail to head,
2572 * re-aligning data to 0th element.
2575 for (i = tail; i < head; i++)
2577 if (KnownAssignedXidsValid[i])
2579 KnownAssignedXids[compress_index] = KnownAssignedXids[i];
2580 KnownAssignedXidsValid[compress_index] = true;
2585 pArray->tailKnownAssignedXids = 0;
2586 pArray->headKnownAssignedXids = compress_index;
2590 * Add xids into KnownAssignedXids at the head of the array.
2592 * xids from from_xid to to_xid, inclusive, are added to the array.
2594 * If exclusive_lock is true then caller already holds ProcArrayLock in
2595 * exclusive mode, so we need no extra locking here. Else caller holds no
2596 * lock, so we need to be sure we maintain sufficient interlocks against
2597 * concurrent readers. (Only the startup process ever calls this, so no need
2598 * to worry about concurrent writers.)
2601 KnownAssignedXidsAdd(TransactionId from_xid, TransactionId to_xid,
2602 bool exclusive_lock)
2604 /* use volatile pointer to prevent code rearrangement */
2605 volatile ProcArrayStruct *pArray = procArray;
2606 TransactionId next_xid;
2612 Assert(TransactionIdPrecedesOrEquals(from_xid, to_xid));
2615 * Calculate how many array slots we'll need. Normally this is cheap; in
2616 * the unusual case where the XIDs cross the wrap point, we do it the hard
2619 if (to_xid >= from_xid)
2620 nxids = to_xid - from_xid + 1;
2624 next_xid = from_xid;
2625 while (TransactionIdPrecedes(next_xid, to_xid))
2628 TransactionIdAdvance(next_xid);
2633 * Since only the startup process modifies the head/tail pointers, we
2634 * don't need a lock to read them here.
2636 head = pArray->headKnownAssignedXids;
2637 tail = pArray->tailKnownAssignedXids;
2639 Assert(head >= 0 && head <= pArray->maxKnownAssignedXids);
2640 Assert(tail >= 0 && tail < pArray->maxKnownAssignedXids);
2643 * Verify that insertions occur in TransactionId sequence. Note that even
2644 * if the last existing element is marked invalid, it must still have a
2645 * correctly sequenced XID value.
2648 TransactionIdFollowsOrEquals(KnownAssignedXids[head - 1], from_xid))
2650 KnownAssignedXidsDisplay(LOG);
2651 elog(ERROR, "out-of-order XID insertion in KnownAssignedXids");
2655 * If our xids won't fit in the remaining space, compress out free space
2657 if (head + nxids > pArray->maxKnownAssignedXids)
2659 /* must hold lock to compress */
2660 if (!exclusive_lock)
2661 LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);
2663 KnownAssignedXidsCompress(true);
2665 head = pArray->headKnownAssignedXids;
2666 /* note: we no longer care about the tail pointer */
2668 if (!exclusive_lock)
2669 LWLockRelease(ProcArrayLock);
2672 * If it still won't fit then we're out of memory
2674 if (head + nxids > pArray->maxKnownAssignedXids)
2675 elog(ERROR, "too many KnownAssignedXids");
2678 /* Now we can insert the xids into the space starting at head */
2679 next_xid = from_xid;
2680 for (i = 0; i < nxids; i++)
2682 KnownAssignedXids[head] = next_xid;
2683 KnownAssignedXidsValid[head] = true;
2684 TransactionIdAdvance(next_xid);
2688 /* Adjust count of number of valid entries */
2689 pArray->numKnownAssignedXids += nxids;
2692 * Now update the head pointer. We use a spinlock to protect this
2693 * pointer, not because the update is likely to be non-atomic, but to
2694 * ensure that other processors see the above array updates before they
2695 * see the head pointer change.
2697 * If we're holding ProcArrayLock exclusively, there's no need to take the
2701 pArray->headKnownAssignedXids = head;
2704 SpinLockAcquire(&pArray->known_assigned_xids_lck);
2705 pArray->headKnownAssignedXids = head;
2706 SpinLockRelease(&pArray->known_assigned_xids_lck);
2711 * KnownAssignedXidsSearch
2713 * Searches KnownAssignedXids for a specific xid and optionally removes it.
2714 * Returns true if it was found, false if not.
2716 * Caller must hold ProcArrayLock in shared or exclusive mode.
2717 * Exclusive lock must be held for remove = true.
2720 KnownAssignedXidsSearch(TransactionId xid, bool remove)
2722 /* use volatile pointer to prevent code rearrangement */
2723 volatile ProcArrayStruct *pArray = procArray;
2728 int result_index = -1;
2732 /* we hold ProcArrayLock exclusively, so no need for spinlock */
2733 tail = pArray->tailKnownAssignedXids;
2734 head = pArray->headKnownAssignedXids;
2738 /* take spinlock to ensure we see up-to-date array contents */
2739 SpinLockAcquire(&pArray->known_assigned_xids_lck);
2740 tail = pArray->tailKnownAssignedXids;
2741 head = pArray->headKnownAssignedXids;
2742 SpinLockRelease(&pArray->known_assigned_xids_lck);
2746 * Standard binary search. Note we can ignore the KnownAssignedXidsValid
2747 * array here, since even invalid entries will contain sorted XIDs.
2751 while (first <= last)
2754 TransactionId mid_xid;
2756 mid_index = (first + last) / 2;
2757 mid_xid = KnownAssignedXids[mid_index];
2761 result_index = mid_index;
2764 else if (TransactionIdPrecedes(xid, mid_xid))
2765 last = mid_index - 1;
2767 first = mid_index + 1;
2770 if (result_index < 0)
2771 return false; /* not in array */
2773 if (!KnownAssignedXidsValid[result_index])
2774 return false; /* in array, but invalid */
2778 KnownAssignedXidsValid[result_index] = false;
2780 pArray->numKnownAssignedXids--;
2781 Assert(pArray->numKnownAssignedXids >= 0);
2784 * If we're removing the tail element then advance tail pointer over
2785 * any invalid elements. This will speed future searches.
2787 if (result_index == tail)
2790 while (tail < head && !KnownAssignedXidsValid[tail])
2794 /* Array is empty, so we can reset both pointers */
2795 pArray->headKnownAssignedXids = 0;
2796 pArray->tailKnownAssignedXids = 0;
2800 pArray->tailKnownAssignedXids = tail;
2809 * Is the specified XID present in KnownAssignedXids[]?
2811 * Caller must hold ProcArrayLock in shared or exclusive mode.
2814 KnownAssignedXidExists(TransactionId xid)
2816 Assert(TransactionIdIsValid(xid));
2818 return KnownAssignedXidsSearch(xid, false);
2822 * Remove the specified XID from KnownAssignedXids[].
2824 * Caller must hold ProcArrayLock in exclusive mode.
2827 KnownAssignedXidsRemove(TransactionId xid)
2829 Assert(TransactionIdIsValid(xid));
2831 elog(trace_recovery(DEBUG4), "remove KnownAssignedXid %u", xid);
2834 * Note: we cannot consider it an error to remove an XID that's not
2835 * present. We intentionally remove subxact IDs while processing
2836 * XLOG_XACT_ASSIGNMENT, to avoid array overflow. Then those XIDs will be
2837 * removed again when the top-level xact commits or aborts.
2839 * It might be possible to track such XIDs to distinguish this case from
2840 * actual errors, but it would be complicated and probably not worth it.
2841 * So, just ignore the search result.
2843 (void) KnownAssignedXidsSearch(xid, true);
2847 * KnownAssignedXidsRemoveTree
2848 * Remove xid (if it's not InvalidTransactionId) and all the subxids.
2850 * Caller must hold ProcArrayLock in exclusive mode.
2853 KnownAssignedXidsRemoveTree(TransactionId xid, int nsubxids,
2854 TransactionId *subxids)
2858 if (TransactionIdIsValid(xid))
2859 KnownAssignedXidsRemove(xid);
2861 for (i = 0; i < nsubxids; i++)
2862 KnownAssignedXidsRemove(subxids[i]);
2864 /* Opportunistically compress the array */
2865 KnownAssignedXidsCompress(false);
2869 * Prune KnownAssignedXids up to, but *not* including xid. If xid is invalid
2870 * then clear the whole table.
2872 * Caller must hold ProcArrayLock in exclusive mode.
2875 KnownAssignedXidsRemovePreceding(TransactionId removeXid)
2877 /* use volatile pointer to prevent code rearrangement */
2878 volatile ProcArrayStruct *pArray = procArray;
2884 if (!TransactionIdIsValid(removeXid))
2886 elog(trace_recovery(DEBUG4), "removing all KnownAssignedXids");
2887 pArray->numKnownAssignedXids = 0;
2888 pArray->headKnownAssignedXids = pArray->tailKnownAssignedXids = 0;
2892 elog(trace_recovery(DEBUG4), "prune KnownAssignedXids to %u", removeXid);
2895 * Mark entries invalid starting at the tail. Since array is sorted, we
2896 * can stop as soon as we reach a entry >= removeXid.
2898 tail = pArray->tailKnownAssignedXids;
2899 head = pArray->headKnownAssignedXids;
2901 for (i = tail; i < head; i++)
2903 if (KnownAssignedXidsValid[i])
2905 TransactionId knownXid = KnownAssignedXids[i];
2907 if (TransactionIdFollowsOrEquals(knownXid, removeXid))
2910 if (!StandbyTransactionIdIsPrepared(knownXid))
2912 KnownAssignedXidsValid[i] = false;
2918 pArray->numKnownAssignedXids -= count;
2919 Assert(pArray->numKnownAssignedXids >= 0);
2922 * Advance the tail pointer if we've marked the tail item invalid.
2924 for (i = tail; i < head; i++)
2926 if (KnownAssignedXidsValid[i])
2931 /* Array is empty, so we can reset both pointers */
2932 pArray->headKnownAssignedXids = 0;
2933 pArray->tailKnownAssignedXids = 0;
2937 pArray->tailKnownAssignedXids = i;
2940 /* Opportunistically compress the array */
2941 KnownAssignedXidsCompress(false);
2945 * KnownAssignedXidsGet - Get an array of xids by scanning KnownAssignedXids.
2946 * We filter out anything >= xmax.
2948 * Returns the number of XIDs stored into xarray[]. Caller is responsible
2949 * that array is large enough.
2951 * Caller must hold ProcArrayLock in (at least) shared mode.
2954 KnownAssignedXidsGet(TransactionId *xarray, TransactionId xmax)
2956 TransactionId xtmp = InvalidTransactionId;
2958 return KnownAssignedXidsGetAndSetXmin(xarray, &xtmp, xmax);
2962 * KnownAssignedXidsGetAndSetXmin - as KnownAssignedXidsGet, plus
2963 * we reduce *xmin to the lowest xid value seen if not already lower.
2965 * Caller must hold ProcArrayLock in (at least) shared mode.
2968 KnownAssignedXidsGetAndSetXmin(TransactionId *xarray, TransactionId *xmin,
2971 /* use volatile pointer to prevent code rearrangement */
2972 volatile ProcArrayStruct *pArray = procArray;
2979 * Fetch head just once, since it may change while we loop. We can stop
2980 * once we reach the initially seen head, since we are certain that an xid
2981 * cannot enter and then leave the array while we hold ProcArrayLock. We
2982 * might miss newly-added xids, but they should be >= xmax so irrelevant
2985 * Must take spinlock to ensure we see up-to-date array contents.
2987 SpinLockAcquire(&pArray->known_assigned_xids_lck);
2988 tail = pArray->tailKnownAssignedXids;
2989 head = pArray->headKnownAssignedXids;
2990 SpinLockRelease(&pArray->known_assigned_xids_lck);
2992 for (i = tail; i < head; i++)
2994 /* Skip any gaps in the array */
2995 if (KnownAssignedXidsValid[i])
2997 TransactionId knownXid = KnownAssignedXids[i];
3000 * Update xmin if required. Only the first XID need be checked,
3001 * since the array is sorted.
3004 TransactionIdPrecedes(knownXid, *xmin))
3008 * Filter out anything >= xmax, again relying on sorted property
3011 if (TransactionIdIsValid(xmax) &&
3012 TransactionIdFollowsOrEquals(knownXid, xmax))
3015 /* Add knownXid into output array */
3016 xarray[count++] = knownXid;
3024 * Get oldest XID in the KnownAssignedXids array, or InvalidTransactionId
3027 static TransactionId
3028 KnownAssignedXidsGetOldestXmin(void)
3030 /* use volatile pointer to prevent code rearrangement */
3031 volatile ProcArrayStruct *pArray = procArray;
3037 * Fetch head just once, since it may change while we loop.
3039 SpinLockAcquire(&pArray->known_assigned_xids_lck);
3040 tail = pArray->tailKnownAssignedXids;
3041 head = pArray->headKnownAssignedXids;
3042 SpinLockRelease(&pArray->known_assigned_xids_lck);
3044 for (i = tail; i < head; i++)
3046 /* Skip any gaps in the array */
3047 if (KnownAssignedXidsValid[i])
3048 return KnownAssignedXids[i];
3051 return InvalidTransactionId;
3055 * Display KnownAssignedXids to provide debug trail
3057 * Currently this is only called within startup process, so we need no
3060 * Note this is pretty expensive, and much of the expense will be incurred
3061 * even if the elog message will get discarded. It's not currently called
3062 * in any performance-critical places, however, so no need to be tenser.
3065 KnownAssignedXidsDisplay(int trace_level)
3067 /* use volatile pointer to prevent code rearrangement */
3068 volatile ProcArrayStruct *pArray = procArray;
3075 tail = pArray->tailKnownAssignedXids;
3076 head = pArray->headKnownAssignedXids;
3078 initStringInfo(&buf);
3080 for (i = tail; i < head; i++)
3082 if (KnownAssignedXidsValid[i])
3085 appendStringInfo(&buf, "[%d]=%u ", i, KnownAssignedXids[i]);
3089 elog(trace_level, "%d KnownAssignedXids (num=%d tail=%d head=%d) %s",
3091 pArray->numKnownAssignedXids,
3092 pArray->tailKnownAssignedXids,
3093 pArray->headKnownAssignedXids,