granicus.if.org Git - postgresql/blob - src/backend/storage/ipc/procarray.c

   1 /*-------------------------------------------------------------------------
   2  *
   3  * procarray.c
   4  *        POSTGRES process array code.
   5  *
   6  *
   7  * This module maintains an unsorted array of the PGPROC structures for all
   8  * active backends.  Although there are several uses for this, the principal
   9  * one is as a means of determining the set of currently running transactions.
  10  *
  11  * Because of various subtle race conditions it is critical that a backend
  12  * hold the correct locks while setting or clearing its MyProc->xid field.
  13  * See notes in src/backend/access/transam/README.
  14  *
  15  * The process array now also includes PGPROC structures representing
  16  * prepared transactions.  The xid and subxids fields of these are valid,
  17  * as are the myProcLocks lists.  They can be distinguished from regular
  18  * backend PGPROCs at need by checking for pid == 0.
  19  *
  20  * During hot standby, we also keep a list of XIDs representing transactions
  21  * that are known to be running in the master (or more precisely, were running
  22  * as of the current point in the WAL stream).  This list is kept in the
  23  * KnownAssignedXids array, and is updated by watching the sequence of
  24  * arriving XIDs.  This is necessary because if we leave those XIDs out of
  25  * snapshots taken for standby queries, then they will appear to be already
  26  * complete, leading to MVCC failures.  Note that in hot standby, the PGPROC
  27  * array represents standby processes, which by definition are not running
  28  * transactions that have XIDs.
  29  *
  30  * It is perhaps possible for a backend on the master to terminate without
  31  * writing an abort record for its transaction.  While that shouldn't really
  32  * happen, it would tie up KnownAssignedXids indefinitely, so we protect
  33  * ourselves by pruning the array when a valid list of running XIDs arrives.
  34  *
  35  * Portions Copyright (c) 1996-2011, PostgreSQL Global Development Group
  36  * Portions Copyright (c) 1994, Regents of the University of California
  37  *
  38  *
  39  * IDENTIFICATION
  40  *        src/backend/storage/ipc/procarray.c
  41  *
  42  *-------------------------------------------------------------------------
  43  */
  44 #include "postgres.h"
  45
  46 #include <signal.h>
  47
  48 #include "access/clog.h"
  49 #include "access/subtrans.h"
  50 #include "access/transam.h"
  51 #include "access/xact.h"
  52 #include "access/twophase.h"
  53 #include "miscadmin.h"
  54 #include "storage/procarray.h"
  55 #include "storage/spin.h"
  56 #include "utils/builtins.h"
  57 #include "utils/snapmgr.h"
  58
  59
  60 /* Our shared memory area */
  61 typedef struct ProcArrayStruct
  62 {
  63         int                     numProcs;               /* number of valid procs entries */
  64         int                     maxProcs;               /* allocated size of procs array */
  65
  66         /*
  67          * Known assigned XIDs handling
  68          */
  69         int                     maxKnownAssignedXids;   /* allocated size of array */
  70         int                     numKnownAssignedXids;   /* currrent # of valid entries */
  71         int                     tailKnownAssignedXids;  /* index of oldest valid element */
  72         int                     headKnownAssignedXids;  /* index of newest element, + 1 */
  73         slock_t         known_assigned_xids_lck;                /* protects head/tail pointers */
  74
  75         /*
  76          * Highest subxid that has been removed from KnownAssignedXids array to
  77          * prevent overflow; or InvalidTransactionId if none.  We track this for
  78          * similar reasons to tracking overflowing cached subxids in PGPROC
  79          * entries.  Must hold exclusive ProcArrayLock to change this, and shared
  80          * lock to read it.
  81          */
  82         TransactionId lastOverflowedXid;
  83
  84         /*
  85          * We declare procs[] as 1 entry because C wants a fixed-size array, but
  86          * actually it is maxProcs entries long.
  87          */
  88         PGPROC     *procs[1];           /* VARIABLE LENGTH ARRAY */
  89 } ProcArrayStruct;
  90
  91 static ProcArrayStruct *procArray;
  92
  93 /*
  94  * Bookkeeping for tracking emulated transactions in recovery
  95  */
  96 static TransactionId *KnownAssignedXids;
  97 static bool *KnownAssignedXidsValid;
  98 static TransactionId latestObservedXid = InvalidTransactionId;
  99
 100 /*
 101  * If we're in STANDBY_SNAPSHOT_PENDING state, standbySnapshotPendingXmin is
 102  * the highest xid that might still be running that we don't have in
 103  * KnownAssignedXids.
 104  */
 105 static TransactionId standbySnapshotPendingXmin;
 106
 107 #ifdef XIDCACHE_DEBUG
 108
 109 /* counters for XidCache measurement */
 110 static long xc_by_recent_xmin = 0;
 111 static long xc_by_known_xact = 0;
 112 static long xc_by_my_xact = 0;
 113 static long xc_by_latest_xid = 0;
 114 static long xc_by_main_xid = 0;
 115 static long xc_by_child_xid = 0;
 116 static long xc_by_known_assigned = 0;
 117 static long xc_no_overflow = 0;
 118 static long xc_slow_answer = 0;
 119
 120 #define xc_by_recent_xmin_inc()         (xc_by_recent_xmin++)
 121 #define xc_by_known_xact_inc()          (xc_by_known_xact++)
 122 #define xc_by_my_xact_inc()                     (xc_by_my_xact++)
 123 #define xc_by_latest_xid_inc()          (xc_by_latest_xid++)
 124 #define xc_by_main_xid_inc()            (xc_by_main_xid++)
 125 #define xc_by_child_xid_inc()           (xc_by_child_xid++)
 126 #define xc_by_known_assigned_inc()      (xc_by_known_assigned++)
 127 #define xc_no_overflow_inc()            (xc_no_overflow++)
 128 #define xc_slow_answer_inc()            (xc_slow_answer++)
 129
 130 static void DisplayXidCache(void);
 131 #else                                                   /* !XIDCACHE_DEBUG */
 132
 133 #define xc_by_recent_xmin_inc()         ((void) 0)
 134 #define xc_by_known_xact_inc()          ((void) 0)
 135 #define xc_by_my_xact_inc()                     ((void) 0)
 136 #define xc_by_latest_xid_inc()          ((void) 0)
 137 #define xc_by_main_xid_inc()            ((void) 0)
 138 #define xc_by_child_xid_inc()           ((void) 0)
 139 #define xc_by_known_assigned_inc()      ((void) 0)
 140 #define xc_no_overflow_inc()            ((void) 0)
 141 #define xc_slow_answer_inc()            ((void) 0)
 142 #endif   /* XIDCACHE_DEBUG */
 143
 144 /* Primitives for KnownAssignedXids array handling for standby */
 145 static void KnownAssignedXidsCompress(bool force);
 146 static void KnownAssignedXidsAdd(TransactionId from_xid, TransactionId to_xid,
 147                                          bool exclusive_lock);
 148 static bool KnownAssignedXidsSearch(TransactionId xid, bool remove);
 149 static bool KnownAssignedXidExists(TransactionId xid);
 150 static void KnownAssignedXidsRemove(TransactionId xid);
 151 static void KnownAssignedXidsRemoveTree(TransactionId xid, int nsubxids,
 152                                                         TransactionId *subxids);
 153 static void KnownAssignedXidsRemovePreceding(TransactionId xid);
 154 static int      KnownAssignedXidsGet(TransactionId *xarray, TransactionId xmax);
 155 static int KnownAssignedXidsGetAndSetXmin(TransactionId *xarray,
 156                                                            TransactionId *xmin,
 157                                                            TransactionId xmax);
 158 static TransactionId KnownAssignedXidsGetOldestXmin(void);
 159 static void KnownAssignedXidsDisplay(int trace_level);
 160
 161 /*
 162  * Report shared-memory space needed by CreateSharedProcArray.
 163  */
 164 Size
 165 ProcArrayShmemSize(void)
 166 {
 167         Size            size;
 168
 169         /* Size of the ProcArray structure itself */
 170 #define PROCARRAY_MAXPROCS      (MaxBackends + max_prepared_xacts)
 171
 172         size = offsetof(ProcArrayStruct, procs);
 173         size = add_size(size, mul_size(sizeof(PGPROC *), PROCARRAY_MAXPROCS));
 174
 175         /*
 176          * During Hot Standby processing we have a data structure called
 177          * KnownAssignedXids, created in shared memory. Local data structures are
 178          * also created in various backends during GetSnapshotData(),
 179          * TransactionIdIsInProgress() and GetRunningTransactionData(). All of the
 180          * main structures created in those functions must be identically sized,
 181          * since we may at times copy the whole of the data structures around. We
 182          * refer to this size as TOTAL_MAX_CACHED_SUBXIDS.
 183          *
 184          * Ideally we'd only create this structure if we were actually doing hot
 185          * standby in the current run, but we don't know that yet at the time
 186          * shared memory is being set up.
 187          */
 188 #define TOTAL_MAX_CACHED_SUBXIDS \
 189         ((PGPROC_MAX_CACHED_SUBXIDS + 1) * PROCARRAY_MAXPROCS)
 190
 191         if (EnableHotStandby)
 192         {
 193                 size = add_size(size,
 194                                                 mul_size(sizeof(TransactionId),
 195                                                                  TOTAL_MAX_CACHED_SUBXIDS));
 196                 size = add_size(size,
 197                                                 mul_size(sizeof(bool), TOTAL_MAX_CACHED_SUBXIDS));
 198         }
 199
 200         return size;
 201 }
 202
 203 /*
 204  * Initialize the shared PGPROC array during postmaster startup.
 205  */
 206 void
 207 CreateSharedProcArray(void)
 208 {
 209         bool            found;
 210
 211         /* Create or attach to the ProcArray shared structure */
 212         procArray = (ProcArrayStruct *)
 213                 ShmemInitStruct("Proc Array",
 214                                                 add_size(offsetof(ProcArrayStruct, procs),
 215                                                                  mul_size(sizeof(PGPROC *),
 216                                                                                   PROCARRAY_MAXPROCS)),
 217                                                 &found);
 218
 219         if (!found)
 220         {
 221                 /*
 222                  * We're the first - initialize.
 223                  */
 224                 procArray->numProcs = 0;
 225                 procArray->maxProcs = PROCARRAY_MAXPROCS;
 226                 procArray->maxKnownAssignedXids = TOTAL_MAX_CACHED_SUBXIDS;
 227                 procArray->numKnownAssignedXids = 0;
 228                 procArray->tailKnownAssignedXids = 0;
 229                 procArray->headKnownAssignedXids = 0;
 230                 SpinLockInit(&procArray->known_assigned_xids_lck);
 231                 procArray->lastOverflowedXid = InvalidTransactionId;
 232         }
 233
 234         /* Create or attach to the KnownAssignedXids arrays too, if needed */
 235         if (EnableHotStandby)
 236         {
 237                 KnownAssignedXids = (TransactionId *)
 238                         ShmemInitStruct("KnownAssignedXids",
 239                                                         mul_size(sizeof(TransactionId),
 240                                                                          TOTAL_MAX_CACHED_SUBXIDS),
 241                                                         &found);
 242                 KnownAssignedXidsValid = (bool *)
 243                         ShmemInitStruct("KnownAssignedXidsValid",
 244                                                         mul_size(sizeof(bool), TOTAL_MAX_CACHED_SUBXIDS),
 245                                                         &found);
 246         }
 247 }
 248
 249 /*
 250  * Add the specified PGPROC to the shared array.
 251  */
 252 void
 253 ProcArrayAdd(PGPROC *proc)
 254 {
 255         ProcArrayStruct *arrayP = procArray;
 256
 257         LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);
 258
 259         if (arrayP->numProcs >= arrayP->maxProcs)
 260         {
 261                 /*
 262                  * Ooops, no room.      (This really shouldn't happen, since there is a
 263                  * fixed supply of PGPROC structs too, and so we should have failed
 264                  * earlier.)
 265                  */
 266                 LWLockRelease(ProcArrayLock);
 267                 ereport(FATAL,
 268                                 (errcode(ERRCODE_TOO_MANY_CONNECTIONS),
 269                                  errmsg("sorry, too many clients already")));
 270         }
 271
 272         arrayP->procs[arrayP->numProcs] = proc;
 273         arrayP->numProcs++;
 274
 275         LWLockRelease(ProcArrayLock);
 276 }
 277
 278 /*
 279  * Remove the specified PGPROC from the shared array.
 280  *
 281  * When latestXid is a valid XID, we are removing a live 2PC gxact from the
 282  * array, and thus causing it to appear as "not running" anymore.  In this
 283  * case we must advance latestCompletedXid.  (This is essentially the same
 284  * as ProcArrayEndTransaction followed by removal of the PGPROC, but we take
 285  * the ProcArrayLock only once, and don't damage the content of the PGPROC;
 286  * twophase.c depends on the latter.)
 287  */
 288 void
 289 ProcArrayRemove(PGPROC *proc, TransactionId latestXid)
 290 {
 291         ProcArrayStruct *arrayP = procArray;
 292         int                     index;
 293
 294 #ifdef XIDCACHE_DEBUG
 295         /* dump stats at backend shutdown, but not prepared-xact end */
 296         if (proc->pid != 0)
 297                 DisplayXidCache();
 298 #endif
 299
 300         LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);
 301
 302         if (TransactionIdIsValid(latestXid))
 303         {
 304                 Assert(TransactionIdIsValid(proc->xid));
 305
 306                 /* Advance global latestCompletedXid while holding the lock */
 307                 if (TransactionIdPrecedes(ShmemVariableCache->latestCompletedXid,
 308                                                                   latestXid))
 309                         ShmemVariableCache->latestCompletedXid = latestXid;
 310         }
 311         else
 312         {
 313                 /* Shouldn't be trying to remove a live transaction here */
 314                 Assert(!TransactionIdIsValid(proc->xid));
 315         }
 316
 317         for (index = 0; index < arrayP->numProcs; index++)
 318         {
 319                 if (arrayP->procs[index] == proc)
 320                 {
 321                         arrayP->procs[index] = arrayP->procs[arrayP->numProcs - 1];
 322                         arrayP->procs[arrayP->numProcs - 1] = NULL; /* for debugging */
 323                         arrayP->numProcs--;
 324                         LWLockRelease(ProcArrayLock);
 325                         return;
 326                 }
 327         }
 328
 329         /* Ooops */
 330         LWLockRelease(ProcArrayLock);
 331
 332         elog(LOG, "failed to find proc %p in ProcArray", proc);
 333 }
 334
 335
 336 /*
 337  * ProcArrayEndTransaction -- mark a transaction as no longer running
 338  *
 339  * This is used interchangeably for commit and abort cases.  The transaction
 340  * commit/abort must already be reported to WAL and pg_clog.
 341  *
 342  * proc is currently always MyProc, but we pass it explicitly for flexibility.
 343  * latestXid is the latest Xid among the transaction's main XID and
 344  * subtransactions, or InvalidTransactionId if it has no XID.  (We must ask
 345  * the caller to pass latestXid, instead of computing it from the PGPROC's
 346  * contents, because the subxid information in the PGPROC might be
 347  * incomplete.)
 348  */
 349 void
 350 ProcArrayEndTransaction(PGPROC *proc, TransactionId latestXid)
 351 {
 352         if (TransactionIdIsValid(latestXid))
 353         {
 354                 /*
 355                  * We must lock ProcArrayLock while clearing proc->xid, so that we do
 356                  * not exit the set of "running" transactions while someone else is
 357                  * taking a snapshot.  See discussion in
 358                  * src/backend/access/transam/README.
 359                  */
 360                 Assert(TransactionIdIsValid(proc->xid));
 361
 362                 LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);
 363
 364                 proc->xid = InvalidTransactionId;
 365                 proc->lxid = InvalidLocalTransactionId;
 366                 proc->xmin = InvalidTransactionId;
 367                 /* must be cleared with xid/xmin: */
 368                 proc->vacuumFlags &= ~PROC_VACUUM_STATE_MASK;
 369                 proc->inCommit = false; /* be sure this is cleared in abort */
 370                 proc->recoveryConflictPending = false;
 371
 372                 /* Clear the subtransaction-XID cache too while holding the lock */
 373                 proc->subxids.nxids = 0;
 374                 proc->subxids.overflowed = false;
 375
 376                 /* Also advance global latestCompletedXid while holding the lock */
 377                 if (TransactionIdPrecedes(ShmemVariableCache->latestCompletedXid,
 378                                                                   latestXid))
 379                         ShmemVariableCache->latestCompletedXid = latestXid;
 380
 381                 LWLockRelease(ProcArrayLock);
 382         }
 383         else
 384         {
 385                 /*
 386                  * If we have no XID, we don't need to lock, since we won't affect
 387                  * anyone else's calculation of a snapshot.  We might change their
 388                  * estimate of global xmin, but that's OK.
 389                  */
 390                 Assert(!TransactionIdIsValid(proc->xid));
 391
 392                 proc->lxid = InvalidLocalTransactionId;
 393                 proc->xmin = InvalidTransactionId;
 394                 /* must be cleared with xid/xmin: */
 395                 proc->vacuumFlags &= ~PROC_VACUUM_STATE_MASK;
 396                 proc->inCommit = false; /* be sure this is cleared in abort */
 397                 proc->recoveryConflictPending = false;
 398
 399                 Assert(proc->subxids.nxids == 0);
 400                 Assert(proc->subxids.overflowed == false);
 401         }
 402 }
 403
 404
 405 /*
 406  * ProcArrayClearTransaction -- clear the transaction fields
 407  *
 408  * This is used after successfully preparing a 2-phase transaction.  We are
 409  * not actually reporting the transaction's XID as no longer running --- it
 410  * will still appear as running because the 2PC's gxact is in the ProcArray
 411  * too.  We just have to clear out our own PGPROC.
 412  */
 413 void
 414 ProcArrayClearTransaction(PGPROC *proc)
 415 {
 416         /*
 417          * We can skip locking ProcArrayLock here, because this action does not
 418          * actually change anyone's view of the set of running XIDs: our entry is
 419          * duplicate with the gxact that has already been inserted into the
 420          * ProcArray.
 421          */
 422         proc->xid = InvalidTransactionId;
 423         proc->lxid = InvalidLocalTransactionId;
 424         proc->xmin = InvalidTransactionId;
 425         proc->recoveryConflictPending = false;
 426
 427         /* redundant, but just in case */
 428         proc->vacuumFlags &= ~PROC_VACUUM_STATE_MASK;
 429         proc->inCommit = false;
 430
 431         /* Clear the subtransaction-XID cache too */
 432         proc->subxids.nxids = 0;
 433         proc->subxids.overflowed = false;
 434 }
 435
 436 /*
 437  * ProcArrayApplyRecoveryInfo -- apply recovery info about xids
 438  *
 439  * Takes us through 3 states: Initialized, Pending and Ready.
 440  * Normal case is to go all the way to Ready straight away, though there
 441  * are atypical cases where we need to take it in steps.
 442  *
 443  * Use the data about running transactions on master to create the initial
 444  * state of KnownAssignedXids. We also use these records to regularly prune
 445  * KnownAssignedXids because we know it is possible that some transactions
 446  * with FATAL errors fail to write abort records, which could cause eventual
 447  * overflow.
 448  *
 449  * See comments for LogStandbySnapshot().
 450  */
 451 void
 452 ProcArrayApplyRecoveryInfo(RunningTransactions running)
 453 {
 454         TransactionId *xids;
 455         int                     nxids;
 456         TransactionId nextXid;
 457         int                     i;
 458
 459         Assert(standbyState >= STANDBY_INITIALIZED);
 460         Assert(TransactionIdIsValid(running->nextXid));
 461         Assert(TransactionIdIsValid(running->oldestRunningXid));
 462         Assert(TransactionIdIsNormal(running->latestCompletedXid));
 463
 464         /*
 465          * Remove stale transactions, if any.
 466          */
 467         ExpireOldKnownAssignedTransactionIds(running->oldestRunningXid);
 468         StandbyReleaseOldLocks(running->oldestRunningXid);
 469
 470         /*
 471          * If our snapshot is already valid, nothing else to do...
 472          */
 473         if (standbyState == STANDBY_SNAPSHOT_READY)
 474                 return;
 475
 476         /*
 477          * If our initial RunningTransactionsData had an overflowed snapshot then
 478          * we knew we were missing some subxids from our snapshot. We can use this
 479          * data as an initial snapshot, but we cannot yet mark it valid. We know
 480          * that the missing subxids are equal to or earlier than nextXid. After we
 481          * initialise we continue to apply changes during recovery, so once the
 482          * oldestRunningXid is later than the nextXid from the initial snapshot we
 483          * know that we no longer have missing information and can mark the
 484          * snapshot as valid.
 485          */
 486         if (standbyState == STANDBY_SNAPSHOT_PENDING)
 487         {
 488                 if (TransactionIdPrecedes(standbySnapshotPendingXmin,
 489                                                                   running->oldestRunningXid))
 490                 {
 491                         standbyState = STANDBY_SNAPSHOT_READY;
 492                         elog(trace_recovery(DEBUG2),
 493                                  "running xact data now proven complete");
 494                         elog(trace_recovery(DEBUG2),
 495                                  "recovery snapshots are now enabled");
 496                 }
 497                 else
 498                         elog(trace_recovery(DEBUG2),
 499                                  "recovery snapshot waiting for %u oldest active xid on standby is %u",
 500                                  standbySnapshotPendingXmin,
 501                                  running->oldestRunningXid);
 502                 return;
 503         }
 504
 505         Assert(standbyState == STANDBY_INITIALIZED);
 506
 507         /*
 508          * OK, we need to initialise from the RunningTransactionsData record
 509          */
 510
 511         /*
 512          * Release any locks belonging to old transactions that are not running
 513          * according to the running-xacts record.
 514          */
 515         StandbyReleaseOldLocks(running->nextXid);
 516
 517         /*
 518          * Nobody else is running yet, but take locks anyhow
 519          */
 520         LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);
 521
 522         /*
 523          * KnownAssignedXids is sorted so we cannot just add the xids, we have to
 524          * sort them first.
 525          *
 526          * Some of the new xids are top-level xids and some are subtransactions.
 527          * We don't call SubtransSetParent because it doesn't matter yet. If we
 528          * aren't overflowed then all xids will fit in snapshot and so we don't
 529          * need subtrans. If we later overflow, an xid assignment record will add
 530          * xids to subtrans. If RunningXacts is overflowed then we don't have
 531          * enough information to correctly update subtrans anyway.
 532          */
 533         Assert(procArray->numKnownAssignedXids == 0);
 534
 535         /*
 536          * Allocate a temporary array to avoid modifying the array passed as
 537          * argument.
 538          */
 539         xids = palloc(sizeof(TransactionId) * running->xcnt);
 540
 541         /*
 542          * Add to the temp array any xids which have not already completed.
 543          */
 544         nxids = 0;
 545         for (i = 0; i < running->xcnt; i++)
 546         {
 547                 TransactionId xid = running->xids[i];
 548
 549                 /*
 550                  * The running-xacts snapshot can contain xids that were still visible
 551                  * in the procarray when the snapshot was taken, but were already
 552                  * WAL-logged as completed. They're not running anymore, so ignore
 553                  * them.
 554                  */
 555                 if (TransactionIdDidCommit(xid) || TransactionIdDidAbort(xid))
 556                         continue;
 557
 558                 xids[nxids++] = xid;
 559         }
 560
 561         if (nxids > 0)
 562         {
 563                 /*
 564                  * Sort the array so that we can add them safely into
 565                  * KnownAssignedXids.
 566                  */
 567                 qsort(xids, nxids, sizeof(TransactionId), xidComparator);
 568
 569                 /*
 570                  * Add the sorted snapshot into KnownAssignedXids
 571                  */
 572                 for (i = 0; i < nxids; i++)
 573                         KnownAssignedXidsAdd(xids[i], xids[i], true);
 574
 575                 KnownAssignedXidsDisplay(trace_recovery(DEBUG3));
 576         }
 577
 578         pfree(xids);
 579
 580         /*
 581          * Now we've got the running xids we need to set the global values that
 582          * are used to track snapshots as they evolve further.
 583          *
 584          * - latestCompletedXid which will be the xmax for snapshots -
 585          * lastOverflowedXid which shows whether snapshots overflow - nextXid
 586          *
 587          * If the snapshot overflowed, then we still initialise with what we know,
 588          * but the recovery snapshot isn't fully valid yet because we know there
 589          * are some subxids missing. We don't know the specific subxids that are
 590          * missing, so conservatively assume the last one is latestObservedXid.
 591          */
 592         latestObservedXid = running->nextXid;
 593         TransactionIdRetreat(latestObservedXid);
 594
 595         if (running->subxid_overflow)
 596         {
 597                 standbyState = STANDBY_SNAPSHOT_PENDING;
 598
 599                 standbySnapshotPendingXmin = latestObservedXid;
 600                 procArray->lastOverflowedXid = latestObservedXid;
 601         }
 602         else
 603         {
 604                 standbyState = STANDBY_SNAPSHOT_READY;
 605
 606                 standbySnapshotPendingXmin = InvalidTransactionId;
 607                 procArray->lastOverflowedXid = InvalidTransactionId;
 608         }
 609
 610         /*
 611          * If a transaction wrote a commit record in the gap between taking and
 612          * logging the snapshot then latestCompletedXid may already be higher than
 613          * the value from the snapshot, so check before we use the incoming value.
 614          */
 615         if (TransactionIdPrecedes(ShmemVariableCache->latestCompletedXid,
 616                                                           running->latestCompletedXid))
 617                 ShmemVariableCache->latestCompletedXid = running->latestCompletedXid;
 618
 619         /* nextXid must be beyond any observed xid */
 620         nextXid = latestObservedXid;
 621         TransactionIdAdvance(nextXid);
 622         if (TransactionIdFollows(nextXid, ShmemVariableCache->nextXid))
 623                 ShmemVariableCache->nextXid = nextXid;
 624
 625         Assert(TransactionIdIsNormal(ShmemVariableCache->latestCompletedXid));
 626         Assert(TransactionIdIsValid(ShmemVariableCache->nextXid));
 627
 628         LWLockRelease(ProcArrayLock);
 629
 630         elog(trace_recovery(DEBUG2), "running transaction data initialized");
 631         KnownAssignedXidsDisplay(trace_recovery(DEBUG3));
 632         if (standbyState == STANDBY_SNAPSHOT_READY)
 633                 elog(trace_recovery(DEBUG2), "recovery snapshots are now enabled");
 634         else
 635                 ereport(LOG,
 636                                 (errmsg("consistent state delayed because recovery snapshot incomplete")));
 637 }
 638
 639 /*
 640  * ProcArrayApplyXidAssignment
 641  *              Process an XLOG_XACT_ASSIGNMENT WAL record
 642  */
 643 void
 644 ProcArrayApplyXidAssignment(TransactionId topxid,
 645                                                         int nsubxids, TransactionId *subxids)
 646 {
 647         TransactionId max_xid;
 648         int                     i;
 649
 650         Assert(standbyState >= STANDBY_INITIALIZED);
 651
 652         max_xid = TransactionIdLatest(topxid, nsubxids, subxids);
 653
 654         /*
 655          * Mark all the subtransactions as observed.
 656          *
 657          * NOTE: This will fail if the subxid contains too many previously
 658          * unobserved xids to fit into known-assigned-xids. That shouldn't happen
 659          * as the code stands, because xid-assignment records should never contain
 660          * more than PGPROC_MAX_CACHED_SUBXIDS entries.
 661          */
 662         RecordKnownAssignedTransactionIds(max_xid);
 663
 664         /*
 665          * Notice that we update pg_subtrans with the top-level xid, rather than
 666          * the parent xid. This is a difference between normal processing and
 667          * recovery, yet is still correct in all cases. The reason is that
 668          * subtransaction commit is not marked in clog until commit processing, so
 669          * all aborted subtransactions have already been clearly marked in clog.
 670          * As a result we are able to refer directly to the top-level
 671          * transaction's state rather than skipping through all the intermediate
 672          * states in the subtransaction tree. This should be the first time we
 673          * have attempted to SubTransSetParent().
 674          */
 675         for (i = 0; i < nsubxids; i++)
 676                 SubTransSetParent(subxids[i], topxid, false);
 677
 678         /*
 679          * Uses same locking as transaction commit
 680          */
 681         LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);
 682
 683         /*
 684          * Remove subxids from known-assigned-xacts.
 685          */
 686         KnownAssignedXidsRemoveTree(InvalidTransactionId, nsubxids, subxids);
 687
 688         /*
 689          * Advance lastOverflowedXid to be at least the last of these subxids.
 690          */
 691         if (TransactionIdPrecedes(procArray->lastOverflowedXid, max_xid))
 692                 procArray->lastOverflowedXid = max_xid;
 693
 694         LWLockRelease(ProcArrayLock);
 695 }
 696
 697 /*
 698  * TransactionIdIsInProgress -- is given transaction running in some backend
 699  *
 700  * Aside from some shortcuts such as checking RecentXmin and our own Xid,
 701  * there are four possibilities for finding a running transaction:
 702  *
 703  * 1. The given Xid is a main transaction Id.  We will find this out cheaply
 704  * by looking at the PGPROC struct for each backend.
 705  *
 706  * 2. The given Xid is one of the cached subxact Xids in the PGPROC array.
 707  * We can find this out cheaply too.
 708  *
 709  * 3. In Hot Standby mode, we must search the KnownAssignedXids list to see
 710  * if the Xid is running on the master.
 711  *
 712  * 4. Search the SubTrans tree to find the Xid's topmost parent, and then
 713  * see if that is running according to PGPROC or KnownAssignedXids.  This is
 714  * the slowest way, but sadly it has to be done always if the others failed,
 715  * unless we see that the cached subxact sets are complete (none have
 716  * overflowed).
 717  *
 718  * ProcArrayLock has to be held while we do 1, 2, 3.  If we save the top Xids
 719  * while doing 1 and 3, we can release the ProcArrayLock while we do 4.
 720  * This buys back some concurrency (and we can't retrieve the main Xids from
 721  * PGPROC again anyway; see GetNewTransactionId).
 722  */
 723 bool
 724 TransactionIdIsInProgress(TransactionId xid)
 725 {
 726         static TransactionId *xids = NULL;
 727         int                     nxids = 0;
 728         ProcArrayStruct *arrayP = procArray;
 729         TransactionId topxid;
 730         int                     i,
 731                                 j;
 732
 733         /*
 734          * Don't bother checking a transaction older than RecentXmin; it could not
 735          * possibly still be running.  (Note: in particular, this guarantees that
 736          * we reject InvalidTransactionId, FrozenTransactionId, etc as not
 737          * running.)
 738          */
 739         if (TransactionIdPrecedes(xid, RecentXmin))
 740         {
 741                 xc_by_recent_xmin_inc();
 742                 return false;
 743         }
 744
 745         /*
 746          * We may have just checked the status of this transaction, so if it is
 747          * already known to be completed, we can fall out without any access to
 748          * shared memory.
 749          */
 750         if (TransactionIdIsKnownCompleted(xid))
 751         {
 752                 xc_by_known_xact_inc();
 753                 return false;
 754         }
 755
 756         /*
 757          * Also, we can handle our own transaction (and subtransactions) without
 758          * any access to shared memory.
 759          */
 760         if (TransactionIdIsCurrentTransactionId(xid))
 761         {
 762                 xc_by_my_xact_inc();
 763                 return true;
 764         }
 765
 766         /*
 767          * If first time through, get workspace to remember main XIDs in. We
 768          * malloc it permanently to avoid repeated palloc/pfree overhead.
 769          */
 770         if (xids == NULL)
 771         {
 772                 /*
 773                  * In hot standby mode, reserve enough space to hold all xids in the
 774                  * known-assigned list. If we later finish recovery, we no longer need
 775                  * the bigger array, but we don't bother to shrink it.
 776                  */
 777                 int                     maxxids = RecoveryInProgress() ? TOTAL_MAX_CACHED_SUBXIDS : arrayP->maxProcs;
 778
 779                 xids = (TransactionId *) malloc(maxxids * sizeof(TransactionId));
 780                 if (xids == NULL)
 781                         ereport(ERROR,
 782                                         (errcode(ERRCODE_OUT_OF_MEMORY),
 783                                          errmsg("out of memory")));
 784         }
 785
 786         LWLockAcquire(ProcArrayLock, LW_SHARED);
 787
 788         /*
 789          * Now that we have the lock, we can check latestCompletedXid; if the
 790          * target Xid is after that, it's surely still running.
 791          */
 792         if (TransactionIdPrecedes(ShmemVariableCache->latestCompletedXid, xid))
 793         {
 794                 LWLockRelease(ProcArrayLock);
 795                 xc_by_latest_xid_inc();
 796                 return true;
 797         }
 798
 799         /* No shortcuts, gotta grovel through the array */
 800         for (i = 0; i < arrayP->numProcs; i++)
 801         {
 802                 volatile PGPROC *proc = arrayP->procs[i];
 803                 TransactionId pxid;
 804
 805                 /* Ignore my own proc --- dealt with it above */
 806                 if (proc == MyProc)
 807                         continue;
 808
 809                 /* Fetch xid just once - see GetNewTransactionId */
 810                 pxid = proc->xid;
 811
 812                 if (!TransactionIdIsValid(pxid))
 813                         continue;
 814
 815                 /*
 816                  * Step 1: check the main Xid
 817                  */
 818                 if (TransactionIdEquals(pxid, xid))
 819                 {
 820                         LWLockRelease(ProcArrayLock);
 821                         xc_by_main_xid_inc();
 822                         return true;
 823                 }
 824
 825                 /*
 826                  * We can ignore main Xids that are younger than the target Xid, since
 827                  * the target could not possibly be their child.
 828                  */
 829                 if (TransactionIdPrecedes(xid, pxid))
 830                         continue;
 831
 832                 /*
 833                  * Step 2: check the cached child-Xids arrays
 834                  */
 835                 for (j = proc->subxids.nxids - 1; j >= 0; j--)
 836                 {
 837                         /* Fetch xid just once - see GetNewTransactionId */
 838                         TransactionId cxid = proc->subxids.xids[j];
 839
 840                         if (TransactionIdEquals(cxid, xid))
 841                         {
 842                                 LWLockRelease(ProcArrayLock);
 843                                 xc_by_child_xid_inc();
 844                                 return true;
 845                         }
 846                 }
 847
 848                 /*
 849                  * Save the main Xid for step 4.  We only need to remember main Xids
 850                  * that have uncached children.  (Note: there is no race condition
 851                  * here because the overflowed flag cannot be cleared, only set, while
 852                  * we hold ProcArrayLock.  So we can't miss an Xid that we need to
 853                  * worry about.)
 854                  */
 855                 if (proc->subxids.overflowed)
 856                         xids[nxids++] = pxid;
 857         }
 858
 859         /*
 860          * Step 3: in hot standby mode, check the known-assigned-xids list.  XIDs
 861          * in the list must be treated as running.
 862          */
 863         if (RecoveryInProgress())
 864         {
 865                 /* none of the PGPROC entries should have XIDs in hot standby mode */
 866                 Assert(nxids == 0);
 867
 868                 if (KnownAssignedXidExists(xid))
 869                 {
 870                         LWLockRelease(ProcArrayLock);
 871                         xc_by_known_assigned_inc();
 872                         return true;
 873                 }
 874
 875                 /*
 876                  * If the KnownAssignedXids overflowed, we have to check pg_subtrans
 877                  * too.  Fetch all xids from KnownAssignedXids that are lower than
 878                  * xid, since if xid is a subtransaction its parent will always have a
 879                  * lower value.  Note we will collect both main and subXIDs here, but
 880                  * there's no help for it.
 881                  */
 882                 if (TransactionIdPrecedesOrEquals(xid, procArray->lastOverflowedXid))
 883                         nxids = KnownAssignedXidsGet(xids, xid);
 884         }
 885
 886         LWLockRelease(ProcArrayLock);
 887
 888         /*
 889          * If none of the relevant caches overflowed, we know the Xid is not
 890          * running without even looking at pg_subtrans.
 891          */
 892         if (nxids == 0)
 893         {
 894                 xc_no_overflow_inc();
 895                 return false;
 896         }
 897
 898         /*
 899          * Step 4: have to check pg_subtrans.
 900          *
 901          * At this point, we know it's either a subtransaction of one of the Xids
 902          * in xids[], or it's not running.  If it's an already-failed
 903          * subtransaction, we want to say "not running" even though its parent may
 904          * still be running.  So first, check pg_clog to see if it's been aborted.
 905          */
 906         xc_slow_answer_inc();
 907
 908         if (TransactionIdDidAbort(xid))
 909                 return false;
 910
 911         /*
 912          * It isn't aborted, so check whether the transaction tree it belongs to
 913          * is still running (or, more precisely, whether it was running when we
 914          * held ProcArrayLock).
 915          */
 916         topxid = SubTransGetTopmostTransaction(xid);
 917         Assert(TransactionIdIsValid(topxid));
 918         if (!TransactionIdEquals(topxid, xid))
 919         {
 920                 for (i = 0; i < nxids; i++)
 921                 {
 922                         if (TransactionIdEquals(xids[i], topxid))
 923                                 return true;
 924                 }
 925         }
 926
 927         return false;
 928 }
 929
 930 /*
 931  * TransactionIdIsActive -- is xid the top-level XID of an active backend?
 932  *
 933  * This differs from TransactionIdIsInProgress in that it ignores prepared
 934  * transactions, as well as transactions running on the master if we're in
 935  * hot standby.  Also, we ignore subtransactions since that's not needed
 936  * for current uses.
 937  */
 938 bool
 939 TransactionIdIsActive(TransactionId xid)
 940 {
 941         bool            result = false;
 942         ProcArrayStruct *arrayP = procArray;
 943         int                     i;
 944
 945         /*
 946          * Don't bother checking a transaction older than RecentXmin; it could not
 947          * possibly still be running.
 948          */
 949         if (TransactionIdPrecedes(xid, RecentXmin))
 950                 return false;
 951
 952         LWLockAcquire(ProcArrayLock, LW_SHARED);
 953
 954         for (i = 0; i < arrayP->numProcs; i++)
 955         {
 956                 volatile PGPROC *proc = arrayP->procs[i];
 957
 958                 /* Fetch xid just once - see GetNewTransactionId */
 959                 TransactionId pxid = proc->xid;
 960
 961                 if (!TransactionIdIsValid(pxid))
 962                         continue;
 963
 964                 if (proc->pid == 0)
 965                         continue;                       /* ignore prepared transactions */
 966
 967                 if (TransactionIdEquals(pxid, xid))
 968                 {
 969                         result = true;
 970                         break;
 971                 }
 972         }
 973
 974         LWLockRelease(ProcArrayLock);
 975
 976         return result;
 977 }
 978
 979
 980 /*
 981  * GetOldestXmin -- returns oldest transaction that was running
 982  *                                      when any current transaction was started.
 983  *
 984  * If allDbs is TRUE then all backends are considered; if allDbs is FALSE
 985  * then only backends running in my own database are considered.
 986  *
 987  * If ignoreVacuum is TRUE then backends with the PROC_IN_VACUUM flag set are
 988  * ignored.
 989  *
 990  * This is used by VACUUM to decide which deleted tuples must be preserved
 991  * in a table.  allDbs = TRUE is needed for shared relations, but allDbs =
 992  * FALSE is sufficient for non-shared relations, since only backends in my
 993  * own database could ever see the tuples in them.      Also, we can ignore
 994  * concurrently running lazy VACUUMs because (a) they must be working on other
 995  * tables, and (b) they don't need to do snapshot-based lookups.
 996  *
 997  * This is also used to determine where to truncate pg_subtrans.  allDbs
 998  * must be TRUE for that case, and ignoreVacuum FALSE.
 999  *
1000  * Note: it's possible for the calculated value to move backwards on repeated
1001  * calls. The calculated value is conservative, so that anything older is
1002  * definitely not considered as running by anyone anymore, but the exact
1003  * value calculated depends on a number of things. For example, if allDbs is
1004  * TRUE and there are no transactions running in the current database,
1005  * GetOldestXmin() returns latestCompletedXid. If a transaction begins after
1006  * that, its xmin will include in-progress transactions in other databases
1007  * that started earlier, so another call will return an lower value. The
1008  * return value is also adjusted with vacuum_defer_cleanup_age, so increasing
1009  * that setting on the fly is an easy way to have GetOldestXmin() move
1010  * backwards.
1011  *
1012  * Note: we include all currently running xids in the set of considered xids.
1013  * This ensures that if a just-started xact has not yet set its snapshot,
1014  * when it does set the snapshot it cannot set xmin less than what we compute.
1015  * See notes in src/backend/access/transam/README.
1016  */
1017 TransactionId
1018 GetOldestXmin(bool allDbs, bool ignoreVacuum)
1019 {
1020         ProcArrayStruct *arrayP = procArray;
1021         TransactionId result;
1022         int                     index;
1023
1024         /* Cannot look for individual databases during recovery */
1025         Assert(allDbs || !RecoveryInProgress());
1026
1027         LWLockAcquire(ProcArrayLock, LW_SHARED);
1028
1029         /*
1030          * We initialize the MIN() calculation with latestCompletedXid + 1. This
1031          * is a lower bound for the XIDs that might appear in the ProcArray later,
1032          * and so protects us against overestimating the result due to future
1033          * additions.
1034          */
1035         result = ShmemVariableCache->latestCompletedXid;
1036         Assert(TransactionIdIsNormal(result));
1037         TransactionIdAdvance(result);
1038
1039         for (index = 0; index < arrayP->numProcs; index++)
1040         {
1041                 volatile PGPROC *proc = arrayP->procs[index];
1042
1043                 if (ignoreVacuum && (proc->vacuumFlags & PROC_IN_VACUUM))
1044                         continue;
1045
1046                 if (allDbs ||
1047                         proc->databaseId == MyDatabaseId ||
1048                         proc->databaseId == 0)          /* include WalSender */
1049                 {
1050                         /* Fetch xid just once - see GetNewTransactionId */
1051                         TransactionId xid = proc->xid;
1052
1053                         /* First consider the transaction's own Xid, if any */
1054                         if (TransactionIdIsNormal(xid) &&
1055                                 TransactionIdPrecedes(xid, result))
1056                                 result = xid;
1057
1058                         /*
1059                          * Also consider the transaction's Xmin, if set.
1060                          *
1061                          * We must check both Xid and Xmin because a transaction might
1062                          * have an Xmin but not (yet) an Xid; conversely, if it has an
1063                          * Xid, that could determine some not-yet-set Xmin.
1064                          */
1065                         xid = proc->xmin;       /* Fetch just once */
1066                         if (TransactionIdIsNormal(xid) &&
1067                                 TransactionIdPrecedes(xid, result))
1068                                 result = xid;
1069                 }
1070         }
1071
1072         if (RecoveryInProgress())
1073         {
1074                 /*
1075                  * Check to see whether KnownAssignedXids contains an xid value older
1076                  * than the main procarray.
1077                  */
1078                 TransactionId kaxmin = KnownAssignedXidsGetOldestXmin();
1079
1080                 LWLockRelease(ProcArrayLock);
1081
1082                 if (TransactionIdIsNormal(kaxmin) &&
1083                         TransactionIdPrecedes(kaxmin, result))
1084                         result = kaxmin;
1085         }
1086         else
1087         {
1088                 /*
1089                  * No other information needed, so release the lock immediately.
1090                  */
1091                 LWLockRelease(ProcArrayLock);
1092
1093                 /*
1094                  * Compute the cutoff XID, being careful not to generate a "permanent"
1095                  * XID. We need do this only on the primary, never on standby.
1096                  *
1097                  * vacuum_defer_cleanup_age provides some additional "slop" for the
1098                  * benefit of hot standby queries on slave servers.  This is quick and
1099                  * dirty, and perhaps not all that useful unless the master has a
1100                  * predictable transaction rate, but it's what we've got.  Note that
1101                  * we are assuming vacuum_defer_cleanup_age isn't large enough to
1102                  * cause wraparound --- so guc.c should limit it to no more than the
1103                  * xidStopLimit threshold in varsup.c.
1104                  */
1105                 result -= vacuum_defer_cleanup_age;
1106                 if (!TransactionIdIsNormal(result))
1107                         result = FirstNormalTransactionId;
1108         }
1109
1110         return result;
1111 }
1112
1113 /*
1114  * GetSnapshotData -- returns information about running transactions.
1115  *
1116  * The returned snapshot includes xmin (lowest still-running xact ID),
1117  * xmax (highest completed xact ID + 1), and a list of running xact IDs
1118  * in the range xmin <= xid < xmax.  It is used as follows:
1119  *              All xact IDs < xmin are considered finished.
1120  *              All xact IDs >= xmax are considered still running.
1121  *              For an xact ID xmin <= xid < xmax, consult list to see whether
1122  *              it is considered running or not.
1123  * This ensures that the set of transactions seen as "running" by the
1124  * current xact will not change after it takes the snapshot.
1125  *
1126  * All running top-level XIDs are included in the snapshot, except for lazy
1127  * VACUUM processes.  We also try to include running subtransaction XIDs,
1128  * but since PGPROC has only a limited cache area for subxact XIDs, full
1129  * information may not be available.  If we find any overflowed subxid arrays,
1130  * we have to mark the snapshot's subxid data as overflowed, and extra work
1131  * *may* need to be done to determine what's running (see XidInMVCCSnapshot()
1132  * in tqual.c).
1133  *
1134  * We also update the following backend-global variables:
1135  *              TransactionXmin: the oldest xmin of any snapshot in use in the
1136  *                      current transaction (this is the same as MyProc->xmin).
1137  *              RecentXmin: the xmin computed for the most recent snapshot.  XIDs
1138  *                      older than this are known not running any more.
1139  *              RecentGlobalXmin: the global xmin (oldest TransactionXmin across all
1140  *                      running transactions, except those running LAZY VACUUM).  This is
1141  *                      the same computation done by GetOldestXmin(true, true).
1142  *
1143  * Note: this function should probably not be called with an argument that's
1144  * not statically allocated (see xip allocation below).
1145  */
1146 Snapshot
1147 GetSnapshotData(Snapshot snapshot)
1148 {
1149         ProcArrayStruct *arrayP = procArray;
1150         TransactionId xmin;
1151         TransactionId xmax;
1152         TransactionId globalxmin;
1153         int                     index;
1154         int                     count = 0;
1155         int                     subcount = 0;
1156         bool            suboverflowed = false;
1157
1158         Assert(snapshot != NULL);
1159
1160         /*
1161          * Allocating space for maxProcs xids is usually overkill; numProcs would
1162          * be sufficient.  But it seems better to do the malloc while not holding
1163          * the lock, so we can't look at numProcs.  Likewise, we allocate much
1164          * more subxip storage than is probably needed.
1165          *
1166          * This does open a possibility for avoiding repeated malloc/free: since
1167          * maxProcs does not change at runtime, we can simply reuse the previous
1168          * xip arrays if any.  (This relies on the fact that all callers pass
1169          * static SnapshotData structs.)
1170          */
1171         if (snapshot->xip == NULL)
1172         {
1173                 /*
1174                  * First call for this snapshot. Snapshot is same size whether or not
1175                  * we are in recovery, see later comments.
1176                  */
1177                 snapshot->xip = (TransactionId *)
1178                         malloc(arrayP->maxProcs * sizeof(TransactionId));
1179                 if (snapshot->xip == NULL)
1180                         ereport(ERROR,
1181                                         (errcode(ERRCODE_OUT_OF_MEMORY),
1182                                          errmsg("out of memory")));
1183                 Assert(snapshot->subxip == NULL);
1184                 snapshot->subxip = (TransactionId *)
1185                         malloc(TOTAL_MAX_CACHED_SUBXIDS * sizeof(TransactionId));
1186                 if (snapshot->subxip == NULL)
1187                         ereport(ERROR,
1188                                         (errcode(ERRCODE_OUT_OF_MEMORY),
1189                                          errmsg("out of memory")));
1190         }
1191
1192         /*
1193          * It is sufficient to get shared lock on ProcArrayLock, even if we are
1194          * going to set MyProc->xmin.
1195          */
1196         LWLockAcquire(ProcArrayLock, LW_SHARED);
1197
1198         /* xmax is always latestCompletedXid + 1 */
1199         xmax = ShmemVariableCache->latestCompletedXid;
1200         Assert(TransactionIdIsNormal(xmax));
1201         TransactionIdAdvance(xmax);
1202
1203         /* initialize xmin calculation with xmax */
1204         globalxmin = xmin = xmax;
1205
1206         /*
1207          * If we're in recovery then snapshot data comes from a different place,
1208          * so decide which route we take before grab the lock. It is possible for
1209          * recovery to end before we finish taking snapshot, and for newly
1210          * assigned transaction ids to be added to the procarray. Xmax cannot
1211          * change while we hold ProcArrayLock, so those newly added transaction
1212          * ids would be filtered away, so we need not be concerned about them.
1213          */
1214         snapshot->takenDuringRecovery = RecoveryInProgress();
1215
1216         if (!snapshot->takenDuringRecovery)
1217         {
1218                 /*
1219                  * Spin over procArray checking xid, xmin, and subxids.  The goal is
1220                  * to gather all active xids, find the lowest xmin, and try to record
1221                  * subxids. During recovery no xids will be assigned, so all normal
1222                  * backends can be ignored, nor are there any VACUUMs running. All
1223                  * prepared transaction xids are held in KnownAssignedXids, so these
1224                  * will be seen without needing to loop through procs here.
1225                  */
1226                 for (index = 0; index < arrayP->numProcs; index++)
1227                 {
1228                         volatile PGPROC *proc = arrayP->procs[index];
1229                         TransactionId xid;
1230
1231                         /* Ignore procs running LAZY VACUUM */
1232                         if (proc->vacuumFlags & PROC_IN_VACUUM)
1233                                 continue;
1234
1235                         /* Update globalxmin to be the smallest valid xmin */
1236                         xid = proc->xmin;       /* fetch just once */
1237                         if (TransactionIdIsNormal(xid) &&
1238                                 TransactionIdPrecedes(xid, globalxmin))
1239                                 globalxmin = xid;
1240
1241                         /* Fetch xid just once - see GetNewTransactionId */
1242                         xid = proc->xid;
1243
1244                         /*
1245                          * If the transaction has been assigned an xid < xmax we add it to
1246                          * the snapshot, and update xmin if necessary.  There's no need to
1247                          * store XIDs >= xmax, since we'll treat them as running anyway.
1248                          * We don't bother to examine their subxids either.
1249                          *
1250                          * We don't include our own XID (if any) in the snapshot, but we
1251                          * must include it into xmin.
1252                          */
1253                         if (TransactionIdIsNormal(xid))
1254                         {
1255                                 if (TransactionIdFollowsOrEquals(xid, xmax))
1256                                         continue;
1257                                 if (proc != MyProc)
1258                                         snapshot->xip[count++] = xid;
1259                                 if (TransactionIdPrecedes(xid, xmin))
1260                                         xmin = xid;
1261                         }
1262
1263                         /*
1264                          * Save subtransaction XIDs if possible (if we've already
1265                          * overflowed, there's no point).  Note that the subxact XIDs must
1266                          * be later than their parent, so no need to check them against
1267                          * xmin.  We could filter against xmax, but it seems better not to
1268                          * do that much work while holding the ProcArrayLock.
1269                          *
1270                          * The other backend can add more subxids concurrently, but cannot
1271                          * remove any.  Hence it's important to fetch nxids just once.
1272                          * Should be safe to use memcpy, though.  (We needn't worry about
1273                          * missing any xids added concurrently, because they must postdate
1274                          * xmax.)
1275                          *
1276                          * Again, our own XIDs are not included in the snapshot.
1277                          */
1278                         if (!suboverflowed && proc != MyProc)
1279                         {
1280                                 if (proc->subxids.overflowed)
1281                                         suboverflowed = true;
1282                                 else
1283                                 {
1284                                         int                     nxids = proc->subxids.nxids;
1285
1286                                         if (nxids > 0)
1287                                         {
1288                                                 memcpy(snapshot->subxip + subcount,
1289                                                            (void *) proc->subxids.xids,
1290                                                            nxids * sizeof(TransactionId));
1291                                                 subcount += nxids;
1292                                         }
1293                                 }
1294                         }
1295                 }
1296         }
1297         else
1298         {
1299                 /*
1300                  * We're in hot standby, so get XIDs from KnownAssignedXids.
1301                  *
1302                  * We store all xids directly into subxip[]. Here's why:
1303                  *
1304                  * In recovery we don't know which xids are top-level and which are
1305                  * subxacts, a design choice that greatly simplifies xid processing.
1306                  *
1307                  * It seems like we would want to try to put xids into xip[] only, but
1308                  * that is fairly small. We would either need to make that bigger or
1309                  * to increase the rate at which we WAL-log xid assignment; neither is
1310                  * an appealing choice.
1311                  *
1312                  * We could try to store xids into xip[] first and then into subxip[]
1313                  * if there are too many xids. That only works if the snapshot doesn't
1314                  * overflow because we do not search subxip[] in that case. A simpler
1315                  * way is to just store all xids in the subxact array because this is
1316                  * by far the bigger array. We just leave the xip array empty.
1317                  *
1318                  * Either way we need to change the way XidInMVCCSnapshot() works
1319                  * depending upon when the snapshot was taken, or change normal
1320                  * snapshot processing so it matches.
1321                  */
1322                 subcount = KnownAssignedXidsGetAndSetXmin(snapshot->subxip, &xmin,
1323                                                                                                   xmax);
1324
1325                 if (TransactionIdPrecedesOrEquals(xmin, procArray->lastOverflowedXid))
1326                         suboverflowed = true;
1327         }
1328
1329         if (!TransactionIdIsValid(MyProc->xmin))
1330                 MyProc->xmin = TransactionXmin = xmin;
1331
1332         LWLockRelease(ProcArrayLock);
1333
1334         /*
1335          * Update globalxmin to include actual process xids.  This is a slightly
1336          * different way of computing it than GetOldestXmin uses, but should give
1337          * the same result.
1338          */
1339         if (TransactionIdPrecedes(xmin, globalxmin))
1340                 globalxmin = xmin;
1341
1342         /* Update global variables too */
1343         RecentGlobalXmin = globalxmin - vacuum_defer_cleanup_age;
1344         if (!TransactionIdIsNormal(RecentGlobalXmin))
1345                 RecentGlobalXmin = FirstNormalTransactionId;
1346         RecentXmin = xmin;
1347
1348         snapshot->xmin = xmin;
1349         snapshot->xmax = xmax;
1350         snapshot->xcnt = count;
1351         snapshot->subxcnt = subcount;
1352         snapshot->suboverflowed = suboverflowed;
1353
1354         snapshot->curcid = GetCurrentCommandId(false);
1355
1356         /*
1357          * This is a new snapshot, so set both refcounts are zero, and mark it as
1358          * not copied in persistent memory.
1359          */
1360         snapshot->active_count = 0;
1361         snapshot->regd_count = 0;
1362         snapshot->copied = false;
1363
1364         return snapshot;
1365 }
1366
1367 /*
1368  * GetRunningTransactionData -- returns information about running transactions.
1369  *
1370  * Similar to GetSnapshotData but returns more information. We include
1371  * all PGPROCs with an assigned TransactionId, even VACUUM processes.
1372  *
1373  * We acquire XidGenLock, but the caller is responsible for releasing it.
1374  * This ensures that no new XIDs enter the proc array until the caller has
1375  * WAL-logged this snapshot, and releases the lock.
1376  *
1377  * The returned data structure is statically allocated; caller should not
1378  * modify it, and must not assume it is valid past the next call.
1379  *
1380  * This is never executed during recovery so there is no need to look at
1381  * KnownAssignedXids.
1382  *
1383  * We don't worry about updating other counters, we want to keep this as
1384  * simple as possible and leave GetSnapshotData() as the primary code for
1385  * that bookkeeping.
1386  */
1387 RunningTransactions
1388 GetRunningTransactionData(void)
1389 {
1390         /* result workspace */
1391         static RunningTransactionsData CurrentRunningXactsData;
1392
1393         ProcArrayStruct *arrayP = procArray;
1394         RunningTransactions CurrentRunningXacts = &CurrentRunningXactsData;
1395         TransactionId latestCompletedXid;
1396         TransactionId oldestRunningXid;
1397         TransactionId *xids;
1398         int                     index;
1399         int                     count;
1400         int                     subcount;
1401         bool            suboverflowed;
1402
1403         Assert(!RecoveryInProgress());
1404
1405         /*
1406          * Allocating space for maxProcs xids is usually overkill; numProcs would
1407          * be sufficient.  But it seems better to do the malloc while not holding
1408          * the lock, so we can't look at numProcs.  Likewise, we allocate much
1409          * more subxip storage than is probably needed.
1410          *
1411          * Should only be allocated in bgwriter, since only ever executed during
1412          * checkpoints.
1413          */
1414         if (CurrentRunningXacts->xids == NULL)
1415         {
1416                 /*
1417                  * First call
1418                  */
1419                 CurrentRunningXacts->xids = (TransactionId *)
1420                         malloc(TOTAL_MAX_CACHED_SUBXIDS * sizeof(TransactionId));
1421                 if (CurrentRunningXacts->xids == NULL)
1422                         ereport(ERROR,
1423                                         (errcode(ERRCODE_OUT_OF_MEMORY),
1424                                          errmsg("out of memory")));
1425         }
1426
1427         xids = CurrentRunningXacts->xids;
1428
1429         count = subcount = 0;
1430         suboverflowed = false;
1431
1432         /*
1433          * Ensure that no xids enter or leave the procarray while we obtain
1434          * snapshot.
1435          */
1436         LWLockAcquire(ProcArrayLock, LW_SHARED);
1437         LWLockAcquire(XidGenLock, LW_SHARED);
1438
1439         latestCompletedXid = ShmemVariableCache->latestCompletedXid;
1440
1441         oldestRunningXid = ShmemVariableCache->nextXid;
1442
1443         /*
1444          * Spin over procArray collecting all xids and subxids.
1445          */
1446         for (index = 0; index < arrayP->numProcs; index++)
1447         {
1448                 volatile PGPROC *proc = arrayP->procs[index];
1449                 TransactionId xid;
1450                 int                     nxids;
1451
1452                 /* Fetch xid just once - see GetNewTransactionId */
1453                 xid = proc->xid;
1454
1455                 /*
1456                  * We don't need to store transactions that don't have a TransactionId
1457                  * yet because they will not show as running on a standby server.
1458                  */
1459                 if (!TransactionIdIsValid(xid))
1460                         continue;
1461
1462                 xids[count++] = xid;
1463
1464                 if (TransactionIdPrecedes(xid, oldestRunningXid))
1465                         oldestRunningXid = xid;
1466
1467                 /*
1468                  * Save subtransaction XIDs. Other backends can't add or remove
1469                  * entries while we're holding XidGenLock.
1470                  */
1471                 nxids = proc->subxids.nxids;
1472                 if (nxids > 0)
1473                 {
1474                         memcpy(&xids[count], (void *) proc->subxids.xids,
1475                                    nxids * sizeof(TransactionId));
1476                         count += nxids;
1477                         subcount += nxids;
1478
1479                         if (proc->subxids.overflowed)
1480                                 suboverflowed = true;
1481
1482                         /*
1483                          * Top-level XID of a transaction is always less than any of its
1484                          * subxids, so we don't need to check if any of the subxids are
1485                          * smaller than oldestRunningXid
1486                          */
1487                 }
1488         }
1489
1490         CurrentRunningXacts->xcnt = count;
1491         CurrentRunningXacts->subxid_overflow = suboverflowed;
1492         CurrentRunningXacts->nextXid = ShmemVariableCache->nextXid;
1493         CurrentRunningXacts->oldestRunningXid = oldestRunningXid;
1494         CurrentRunningXacts->latestCompletedXid = latestCompletedXid;
1495
1496         /* We don't release XidGenLock here, the caller is responsible for that */
1497         LWLockRelease(ProcArrayLock);
1498
1499         Assert(TransactionIdIsValid(CurrentRunningXacts->nextXid));
1500         Assert(TransactionIdIsValid(CurrentRunningXacts->oldestRunningXid));
1501         Assert(TransactionIdIsNormal(CurrentRunningXacts->latestCompletedXid));
1502
1503         return CurrentRunningXacts;
1504 }
1505
1506 /*
1507  * GetTransactionsInCommit -- Get the XIDs of transactions that are committing
1508  *
1509  * Constructs an array of XIDs of transactions that are currently in commit
1510  * critical sections, as shown by having inCommit set in their PGPROC entries.
1511  *
1512  * *xids_p is set to a palloc'd array that should be freed by the caller.
1513  * The return value is the number of valid entries.
1514  *
1515  * Note that because backends set or clear inCommit without holding any lock,
1516  * the result is somewhat indeterminate, but we don't really care.  Even in
1517  * a multiprocessor with delayed writes to shared memory, it should be certain
1518  * that setting of inCommit will propagate to shared memory when the backend
1519  * takes the WALInsertLock, so we cannot fail to see an xact as inCommit if
1520  * it's already inserted its commit record.  Whether it takes a little while
1521  * for clearing of inCommit to propagate is unimportant for correctness.
1522  */
1523 int
1524 GetTransactionsInCommit(TransactionId **xids_p)
1525 {
1526         ProcArrayStruct *arrayP = procArray;
1527         TransactionId *xids;
1528         int                     nxids;
1529         int                     index;
1530
1531         xids = (TransactionId *) palloc(arrayP->maxProcs * sizeof(TransactionId));
1532         nxids = 0;
1533
1534         LWLockAcquire(ProcArrayLock, LW_SHARED);
1535
1536         for (index = 0; index < arrayP->numProcs; index++)
1537         {
1538                 volatile PGPROC *proc = arrayP->procs[index];
1539
1540                 /* Fetch xid just once - see GetNewTransactionId */
1541                 TransactionId pxid = proc->xid;
1542
1543                 if (proc->inCommit && TransactionIdIsValid(pxid))
1544                         xids[nxids++] = pxid;
1545         }
1546
1547         LWLockRelease(ProcArrayLock);
1548
1549         *xids_p = xids;
1550         return nxids;
1551 }
1552
1553 /*
1554  * HaveTransactionsInCommit -- Are any of the specified XIDs in commit?
1555  *
1556  * This is used with the results of GetTransactionsInCommit to see if any
1557  * of the specified XIDs are still in their commit critical sections.
1558  *
1559  * Note: this is O(N^2) in the number of xacts that are/were in commit, but
1560  * those numbers should be small enough for it not to be a problem.
1561  */
1562 bool
1563 HaveTransactionsInCommit(TransactionId *xids, int nxids)
1564 {
1565         bool            result = false;
1566         ProcArrayStruct *arrayP = procArray;
1567         int                     index;
1568
1569         LWLockAcquire(ProcArrayLock, LW_SHARED);
1570
1571         for (index = 0; index < arrayP->numProcs; index++)
1572         {
1573                 volatile PGPROC *proc = arrayP->procs[index];
1574
1575                 /* Fetch xid just once - see GetNewTransactionId */
1576                 TransactionId pxid = proc->xid;
1577
1578                 if (proc->inCommit && TransactionIdIsValid(pxid))
1579                 {
1580                         int                     i;
1581
1582                         for (i = 0; i < nxids; i++)
1583                         {
1584                                 if (xids[i] == pxid)
1585                                 {
1586                                         result = true;
1587                                         break;
1588                                 }
1589                         }
1590                         if (result)
1591                                 break;
1592                 }
1593         }
1594
1595         LWLockRelease(ProcArrayLock);
1596
1597         return result;
1598 }
1599
1600 /*
1601  * BackendPidGetProc -- get a backend's PGPROC given its PID
1602  *
1603  * Returns NULL if not found.  Note that it is up to the caller to be
1604  * sure that the question remains meaningful for long enough for the
1605  * answer to be used ...
1606  */
1607 PGPROC *
1608 BackendPidGetProc(int pid)
1609 {
1610         PGPROC     *result = NULL;
1611         ProcArrayStruct *arrayP = procArray;
1612         int                     index;
1613
1614         if (pid == 0)                           /* never match dummy PGPROCs */
1615                 return NULL;
1616
1617         LWLockAcquire(ProcArrayLock, LW_SHARED);
1618
1619         for (index = 0; index < arrayP->numProcs; index++)
1620         {
1621                 PGPROC     *proc = arrayP->procs[index];
1622
1623                 if (proc->pid == pid)
1624                 {
1625                         result = proc;
1626                         break;
1627                 }
1628         }
1629
1630         LWLockRelease(ProcArrayLock);
1631
1632         return result;
1633 }
1634
1635 /*
1636  * BackendXidGetPid -- get a backend's pid given its XID
1637  *
1638  * Returns 0 if not found or it's a prepared transaction.  Note that
1639  * it is up to the caller to be sure that the question remains
1640  * meaningful for long enough for the answer to be used ...
1641  *
1642  * Only main transaction Ids are considered.  This function is mainly
1643  * useful for determining what backend owns a lock.
1644  *
1645  * Beware that not every xact has an XID assigned.      However, as long as you
1646  * only call this using an XID found on disk, you're safe.
1647  */
1648 int
1649 BackendXidGetPid(TransactionId xid)
1650 {
1651         int                     result = 0;
1652         ProcArrayStruct *arrayP = procArray;
1653         int                     index;
1654
1655         if (xid == InvalidTransactionId)        /* never match invalid xid */
1656                 return 0;
1657
1658         LWLockAcquire(ProcArrayLock, LW_SHARED);
1659
1660         for (index = 0; index < arrayP->numProcs; index++)
1661         {
1662                 volatile PGPROC *proc = arrayP->procs[index];
1663
1664                 if (proc->xid == xid)
1665                 {
1666                         result = proc->pid;
1667                         break;
1668                 }
1669         }
1670
1671         LWLockRelease(ProcArrayLock);
1672
1673         return result;
1674 }
1675
1676 /*
1677  * IsBackendPid -- is a given pid a running backend
1678  */
1679 bool
1680 IsBackendPid(int pid)
1681 {
1682         return (BackendPidGetProc(pid) != NULL);
1683 }
1684
1685
1686 /*
1687  * GetCurrentVirtualXIDs -- returns an array of currently active VXIDs.
1688  *
1689  * The array is palloc'd. The number of valid entries is returned into *nvxids.
1690  *
1691  * The arguments allow filtering the set of VXIDs returned.  Our own process
1692  * is always skipped.  In addition:
1693  *      If limitXmin is not InvalidTransactionId, skip processes with
1694  *              xmin > limitXmin.
1695  *      If excludeXmin0 is true, skip processes with xmin = 0.
1696  *      If allDbs is false, skip processes attached to other databases.
1697  *      If excludeVacuum isn't zero, skip processes for which
1698  *              (vacuumFlags & excludeVacuum) is not zero.
1699  *
1700  * Note: the purpose of the limitXmin and excludeXmin0 parameters is to
1701  * allow skipping backends whose oldest live snapshot is no older than
1702  * some snapshot we have.  Since we examine the procarray with only shared
1703  * lock, there are race conditions: a backend could set its xmin just after
1704  * we look.  Indeed, on multiprocessors with weak memory ordering, the
1705  * other backend could have set its xmin *before* we look.      We know however
1706  * that such a backend must have held shared ProcArrayLock overlapping our
1707  * own hold of ProcArrayLock, else we would see its xmin update.  Therefore,
1708  * any snapshot the other backend is taking concurrently with our scan cannot
1709  * consider any transactions as still running that we think are committed
1710  * (since backends must hold ProcArrayLock exclusive to commit).
1711  */
1712 VirtualTransactionId *
1713 GetCurrentVirtualXIDs(TransactionId limitXmin, bool excludeXmin0,
1714                                           bool allDbs, int excludeVacuum,
1715                                           int *nvxids)
1716 {
1717         VirtualTransactionId *vxids;
1718         ProcArrayStruct *arrayP = procArray;
1719         int                     count = 0;
1720         int                     index;
1721
1722         /* allocate what's certainly enough result space */
1723         vxids = (VirtualTransactionId *)
1724                 palloc(sizeof(VirtualTransactionId) * arrayP->maxProcs);
1725
1726         LWLockAcquire(ProcArrayLock, LW_SHARED);
1727
1728         for (index = 0; index < arrayP->numProcs; index++)
1729         {
1730                 volatile PGPROC *proc = arrayP->procs[index];
1731
1732                 if (proc == MyProc)
1733                         continue;
1734
1735                 if (excludeVacuum & proc->vacuumFlags)
1736                         continue;
1737
1738                 if (allDbs || proc->databaseId == MyDatabaseId)
1739                 {
1740                         /* Fetch xmin just once - might change on us */
1741                         TransactionId pxmin = proc->xmin;
1742
1743                         if (excludeXmin0 && !TransactionIdIsValid(pxmin))
1744                                 continue;
1745
1746                         /*
1747                          * InvalidTransactionId precedes all other XIDs, so a proc that
1748                          * hasn't set xmin yet will not be rejected by this test.
1749                          */
1750                         if (!TransactionIdIsValid(limitXmin) ||
1751                                 TransactionIdPrecedesOrEquals(pxmin, limitXmin))
1752                         {
1753                                 VirtualTransactionId vxid;
1754
1755                                 GET_VXID_FROM_PGPROC(vxid, *proc);
1756                                 if (VirtualTransactionIdIsValid(vxid))
1757                                         vxids[count++] = vxid;
1758                         }
1759                 }
1760         }
1761
1762         LWLockRelease(ProcArrayLock);
1763
1764         *nvxids = count;
1765         return vxids;
1766 }
1767
1768 /*
1769  * GetConflictingVirtualXIDs -- returns an array of currently active VXIDs.
1770  *
1771  * Usage is limited to conflict resolution during recovery on standby servers.
1772  * limitXmin is supplied as either latestRemovedXid, or InvalidTransactionId
1773  * in cases where we cannot accurately determine a value for latestRemovedXid.
1774  *
1775  * If limitXmin is InvalidTransactionId then we want to kill everybody,
1776  * so we're not worried if they have a snapshot or not, nor does it really
1777  * matter what type of lock we hold.
1778  *
1779  * All callers that are checking xmins always now supply a valid and useful
1780  * value for limitXmin. The limitXmin is always lower than the lowest
1781  * numbered KnownAssignedXid that is not already a FATAL error. This is
1782  * because we only care about cleanup records that are cleaning up tuple
1783  * versions from committed transactions. In that case they will only occur
1784  * at the point where the record is less than the lowest running xid. That
1785  * allows us to say that if any backend takes a snapshot concurrently with
1786  * us then the conflict assessment made here would never include the snapshot
1787  * that is being derived. So we take LW_SHARED on the ProcArray and allow
1788  * concurrent snapshots when limitXmin is valid. We might think about adding
1789  *       Assert(limitXmin < lowest(KnownAssignedXids))
1790  * but that would not be true in the case of FATAL errors lagging in array,
1791  * but we already know those are bogus anyway, so we skip that test.
1792  *
1793  * If dbOid is valid we skip backends attached to other databases.
1794  *
1795  * Be careful to *not* pfree the result from this function. We reuse
1796  * this array sufficiently often that we use malloc for the result.
1797  */
1798 VirtualTransactionId *
1799 GetConflictingVirtualXIDs(TransactionId limitXmin, Oid dbOid)
1800 {
1801         static VirtualTransactionId *vxids;
1802         ProcArrayStruct *arrayP = procArray;
1803         int                     count = 0;
1804         int                     index;
1805
1806         /*
1807          * If first time through, get workspace to remember main XIDs in. We
1808          * malloc it permanently to avoid repeated palloc/pfree overhead. Allow
1809          * result space, remembering room for a terminator.
1810          */
1811         if (vxids == NULL)
1812         {
1813                 vxids = (VirtualTransactionId *)
1814                         malloc(sizeof(VirtualTransactionId) * (arrayP->maxProcs + 1));
1815                 if (vxids == NULL)
1816                         ereport(ERROR,
1817                                         (errcode(ERRCODE_OUT_OF_MEMORY),
1818                                          errmsg("out of memory")));
1819         }
1820
1821         LWLockAcquire(ProcArrayLock, LW_SHARED);
1822
1823         for (index = 0; index < arrayP->numProcs; index++)
1824         {
1825                 volatile PGPROC *proc = arrayP->procs[index];
1826
1827                 /* Exclude prepared transactions */
1828                 if (proc->pid == 0)
1829                         continue;
1830
1831                 if (!OidIsValid(dbOid) ||
1832                         proc->databaseId == dbOid)
1833                 {
1834                         /* Fetch xmin just once - can't change on us, but good coding */
1835                         TransactionId pxmin = proc->xmin;
1836
1837                         /*
1838                          * We ignore an invalid pxmin because this means that backend has
1839                          * no snapshot and cannot get another one while we hold exclusive
1840                          * lock.
1841                          */
1842                         if (!TransactionIdIsValid(limitXmin) ||
1843                                 (TransactionIdIsValid(pxmin) && !TransactionIdFollows(pxmin, limitXmin)))
1844                         {
1845                                 VirtualTransactionId vxid;
1846
1847                                 GET_VXID_FROM_PGPROC(vxid, *proc);
1848                                 if (VirtualTransactionIdIsValid(vxid))
1849                                         vxids[count++] = vxid;
1850                         }
1851                 }
1852         }
1853
1854         LWLockRelease(ProcArrayLock);
1855
1856         /* add the terminator */
1857         vxids[count].backendId = InvalidBackendId;
1858         vxids[count].localTransactionId = InvalidLocalTransactionId;
1859
1860         return vxids;
1861 }
1862
1863 /*
1864  * CancelVirtualTransaction - used in recovery conflict processing
1865  *
1866  * Returns pid of the process signaled, or 0 if not found.
1867  */
1868 pid_t
1869 CancelVirtualTransaction(VirtualTransactionId vxid, ProcSignalReason sigmode)
1870 {
1871         ProcArrayStruct *arrayP = procArray;
1872         int                     index;
1873         pid_t           pid = 0;
1874
1875         LWLockAcquire(ProcArrayLock, LW_SHARED);
1876
1877         for (index = 0; index < arrayP->numProcs; index++)
1878         {
1879                 VirtualTransactionId procvxid;
1880                 PGPROC     *proc = arrayP->procs[index];
1881
1882                 GET_VXID_FROM_PGPROC(procvxid, *proc);
1883
1884                 if (procvxid.backendId == vxid.backendId &&
1885                         procvxid.localTransactionId == vxid.localTransactionId)
1886                 {
1887                         proc->recoveryConflictPending = true;
1888                         pid = proc->pid;
1889                         if (pid != 0)
1890                         {
1891                                 /*
1892                                  * Kill the pid if it's still here. If not, that's what we
1893                                  * wanted so ignore any errors.
1894                                  */
1895                                 (void) SendProcSignal(pid, sigmode, vxid.backendId);
1896                         }
1897                         break;
1898                 }
1899         }
1900
1901         LWLockRelease(ProcArrayLock);
1902
1903         return pid;
1904 }
1905
1906 /*
1907  * MinimumActiveBackends --- count backends (other than myself) that are
1908  *              in active transactions.  Return true if the count exceeds the
1909  *              minimum threshold passed.  This is used as a heuristic to decide if
1910  *              a pre-XLOG-flush delay is worthwhile during commit.
1911  *
1912  * Do not count backends that are blocked waiting for locks, since they are
1913  * not going to get to run until someone else commits.
1914  */
1915 bool
1916 MinimumActiveBackends(int min)
1917 {
1918         ProcArrayStruct *arrayP = procArray;
1919         int                     count = 0;
1920         int                     index;
1921
1922         /* Quick short-circuit if no minimum is specified */
1923         if (min == 0)
1924                 return true;
1925
1926         /*
1927          * Note: for speed, we don't acquire ProcArrayLock.  This is a little bit
1928          * bogus, but since we are only testing fields for zero or nonzero, it
1929          * should be OK.  The result is only used for heuristic purposes anyway...
1930          */
1931         for (index = 0; index < arrayP->numProcs; index++)
1932         {
1933                 volatile PGPROC *proc = arrayP->procs[index];
1934
1935                 /*
1936                  * Since we're not holding a lock, need to check that the pointer is
1937                  * valid. Someone holding the lock could have incremented numProcs
1938                  * already, but not yet inserted a valid pointer to the array.
1939                  *
1940                  * If someone just decremented numProcs, 'proc' could also point to a
1941                  * PGPROC entry that's no longer in the array. It still points to a
1942                  * PGPROC struct, though, because freed PGPPROC entries just go to the
1943                  * free list and are recycled. Its contents are nonsense in that case,
1944                  * but that's acceptable for this function.
1945                  */
1946                 if (proc == NULL)
1947                         continue;
1948
1949                 if (proc == MyProc)
1950                         continue;                       /* do not count myself */
1951                 if (proc->pid == 0)
1952                         continue;                       /* do not count prepared xacts */
1953                 if (proc->xid == InvalidTransactionId)
1954                         continue;                       /* do not count if no XID assigned */
1955                 if (proc->waitLock != NULL)
1956                         continue;                       /* do not count if blocked on a lock */
1957                 count++;
1958                 if (count >= min)
1959                         break;
1960         }
1961
1962         return count >= min;
1963 }
1964
1965 /*
1966  * CountDBBackends --- count backends that are using specified database
1967  */
1968 int
1969 CountDBBackends(Oid databaseid)
1970 {
1971         ProcArrayStruct *arrayP = procArray;
1972         int                     count = 0;
1973         int                     index;
1974
1975         LWLockAcquire(ProcArrayLock, LW_SHARED);
1976
1977         for (index = 0; index < arrayP->numProcs; index++)
1978         {
1979                 volatile PGPROC *proc = arrayP->procs[index];
1980
1981                 if (proc->pid == 0)
1982                         continue;                       /* do not count prepared xacts */
1983                 if (!OidIsValid(databaseid) ||
1984                         proc->databaseId == databaseid)
1985                         count++;
1986         }
1987
1988         LWLockRelease(ProcArrayLock);
1989
1990         return count;
1991 }
1992
1993 /*
1994  * CancelDBBackends --- cancel backends that are using specified database
1995  */
1996 void
1997 CancelDBBackends(Oid databaseid, ProcSignalReason sigmode, bool conflictPending)
1998 {
1999         ProcArrayStruct *arrayP = procArray;
2000         int                     index;
2001         pid_t           pid = 0;
2002
2003         /* tell all backends to die */
2004         LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);
2005
2006         for (index = 0; index < arrayP->numProcs; index++)
2007         {
2008                 volatile PGPROC *proc = arrayP->procs[index];
2009
2010                 if (databaseid == InvalidOid || proc->databaseId == databaseid)
2011                 {
2012                         VirtualTransactionId procvxid;
2013
2014                         GET_VXID_FROM_PGPROC(procvxid, *proc);
2015
2016                         proc->recoveryConflictPending = conflictPending;
2017                         pid = proc->pid;
2018                         if (pid != 0)
2019                         {
2020                                 /*
2021                                  * Kill the pid if it's still here. If not, that's what we
2022                                  * wanted so ignore any errors.
2023                                  */
2024                                 (void) SendProcSignal(pid, sigmode, procvxid.backendId);
2025                         }
2026                 }
2027         }
2028
2029         LWLockRelease(ProcArrayLock);
2030 }
2031
2032 /*
2033  * CountUserBackends --- count backends that are used by specified user
2034  */
2035 int
2036 CountUserBackends(Oid roleid)
2037 {
2038         ProcArrayStruct *arrayP = procArray;
2039         int                     count = 0;
2040         int                     index;
2041
2042         LWLockAcquire(ProcArrayLock, LW_SHARED);
2043
2044         for (index = 0; index < arrayP->numProcs; index++)
2045         {
2046                 volatile PGPROC *proc = arrayP->procs[index];
2047
2048                 if (proc->pid == 0)
2049                         continue;                       /* do not count prepared xacts */
2050                 if (proc->roleId == roleid)
2051                         count++;
2052         }
2053
2054         LWLockRelease(ProcArrayLock);
2055
2056         return count;
2057 }
2058
2059 /*
2060  * CountOtherDBBackends -- check for other backends running in the given DB
2061  *
2062  * If there are other backends in the DB, we will wait a maximum of 5 seconds
2063  * for them to exit.  Autovacuum backends are encouraged to exit early by
2064  * sending them SIGTERM, but normal user backends are just waited for.
2065  *
2066  * The current backend is always ignored; it is caller's responsibility to
2067  * check whether the current backend uses the given DB, if it's important.
2068  *
2069  * Returns TRUE if there are (still) other backends in the DB, FALSE if not.
2070  * Also, *nbackends and *nprepared are set to the number of other backends
2071  * and prepared transactions in the DB, respectively.
2072  *
2073  * This function is used to interlock DROP DATABASE and related commands
2074  * against there being any active backends in the target DB --- dropping the
2075  * DB while active backends remain would be a Bad Thing.  Note that we cannot
2076  * detect here the possibility of a newly-started backend that is trying to
2077  * connect to the doomed database, so additional interlocking is needed during
2078  * backend startup.  The caller should normally hold an exclusive lock on the
2079  * target DB before calling this, which is one reason we mustn't wait
2080  * indefinitely.
2081  */
2082 bool
2083 CountOtherDBBackends(Oid databaseId, int *nbackends, int *nprepared)
2084 {
2085         ProcArrayStruct *arrayP = procArray;
2086
2087 #define MAXAUTOVACPIDS  10              /* max autovacs to SIGTERM per iteration */
2088         int                     autovac_pids[MAXAUTOVACPIDS];
2089         int                     tries;
2090
2091         /* 50 tries with 100ms sleep between tries makes 5 sec total wait */
2092         for (tries = 0; tries < 50; tries++)
2093         {
2094                 int                     nautovacs = 0;
2095                 bool            found = false;
2096                 int                     index;
2097
2098                 CHECK_FOR_INTERRUPTS();
2099
2100                 *nbackends = *nprepared = 0;
2101
2102                 LWLockAcquire(ProcArrayLock, LW_SHARED);
2103
2104                 for (index = 0; index < arrayP->numProcs; index++)
2105                 {
2106                         volatile PGPROC *proc = arrayP->procs[index];
2107
2108                         if (proc->databaseId != databaseId)
2109                                 continue;
2110                         if (proc == MyProc)
2111                                 continue;
2112
2113                         found = true;
2114
2115                         if (proc->pid == 0)
2116                                 (*nprepared)++;
2117                         else
2118                         {
2119                                 (*nbackends)++;
2120                                 if ((proc->vacuumFlags & PROC_IS_AUTOVACUUM) &&
2121                                         nautovacs < MAXAUTOVACPIDS)
2122                                         autovac_pids[nautovacs++] = proc->pid;
2123                         }
2124                 }
2125
2126                 LWLockRelease(ProcArrayLock);
2127
2128                 if (!found)
2129                         return false;           /* no conflicting backends, so done */
2130
2131                 /*
2132                  * Send SIGTERM to any conflicting autovacuums before sleeping. We
2133                  * postpone this step until after the loop because we don't want to
2134                  * hold ProcArrayLock while issuing kill(). We have no idea what might
2135                  * block kill() inside the kernel...
2136                  */
2137                 for (index = 0; index < nautovacs; index++)
2138                         (void) kill(autovac_pids[index], SIGTERM);      /* ignore any error */
2139
2140                 /* sleep, then try again */
2141                 pg_usleep(100 * 1000L); /* 100ms */
2142         }
2143
2144         return true;                            /* timed out, still conflicts */
2145 }
2146
2147
2148 #define XidCacheRemove(i) \
2149         do { \
2150                 MyProc->subxids.xids[i] = MyProc->subxids.xids[MyProc->subxids.nxids - 1]; \
2151                 MyProc->subxids.nxids--; \
2152         } while (0)
2153
2154 /*
2155  * XidCacheRemoveRunningXids
2156  *
2157  * Remove a bunch of TransactionIds from the list of known-running
2158  * subtransactions for my backend.      Both the specified xid and those in
2159  * the xids[] array (of length nxids) are removed from the subxids cache.
2160  * latestXid must be the latest XID among the group.
2161  */
2162 void
2163 XidCacheRemoveRunningXids(TransactionId xid,
2164                                                   int nxids, const TransactionId *xids,
2165                                                   TransactionId latestXid)
2166 {
2167         int                     i,
2168                                 j;
2169
2170         Assert(TransactionIdIsValid(xid));
2171
2172         /*
2173          * We must hold ProcArrayLock exclusively in order to remove transactions
2174          * from the PGPROC array.  (See src/backend/access/transam/README.)  It's
2175          * possible this could be relaxed since we know this routine is only used
2176          * to abort subtransactions, but pending closer analysis we'd best be
2177          * conservative.
2178          */
2179         LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);
2180
2181         /*
2182          * Under normal circumstances xid and xids[] will be in increasing order,
2183          * as will be the entries in subxids.  Scan backwards to avoid O(N^2)
2184          * behavior when removing a lot of xids.
2185          */
2186         for (i = nxids - 1; i >= 0; i--)
2187         {
2188                 TransactionId anxid = xids[i];
2189
2190                 for (j = MyProc->subxids.nxids - 1; j >= 0; j--)
2191                 {
2192                         if (TransactionIdEquals(MyProc->subxids.xids[j], anxid))
2193                         {
2194                                 XidCacheRemove(j);
2195                                 break;
2196                         }
2197                 }
2198
2199                 /*
2200                  * Ordinarily we should have found it, unless the cache has
2201                  * overflowed. However it's also possible for this routine to be
2202                  * invoked multiple times for the same subtransaction, in case of an
2203                  * error during AbortSubTransaction.  So instead of Assert, emit a
2204                  * debug warning.
2205                  */
2206                 if (j < 0 && !MyProc->subxids.overflowed)
2207                         elog(WARNING, "did not find subXID %u in MyProc", anxid);
2208         }
2209
2210         for (j = MyProc->subxids.nxids - 1; j >= 0; j--)
2211         {
2212                 if (TransactionIdEquals(MyProc->subxids.xids[j], xid))
2213                 {
2214                         XidCacheRemove(j);
2215                         break;
2216                 }
2217         }
2218         /* Ordinarily we should have found it, unless the cache has overflowed */
2219         if (j < 0 && !MyProc->subxids.overflowed)
2220                 elog(WARNING, "did not find subXID %u in MyProc", xid);
2221
2222         /* Also advance global latestCompletedXid while holding the lock */
2223         if (TransactionIdPrecedes(ShmemVariableCache->latestCompletedXid,
2224                                                           latestXid))
2225                 ShmemVariableCache->latestCompletedXid = latestXid;
2226
2227         LWLockRelease(ProcArrayLock);
2228 }
2229
2230 #ifdef XIDCACHE_DEBUG
2231
2232 /*
2233  * Print stats about effectiveness of XID cache
2234  */
2235 static void
2236 DisplayXidCache(void)
2237 {
2238         fprintf(stderr,
2239                         "XidCache: xmin: %ld, known: %ld, myxact: %ld, latest: %ld, mainxid: %ld, childxid: %ld, knownassigned: %ld, nooflo: %ld, slow: %ld\n",
2240                         xc_by_recent_xmin,
2241                         xc_by_known_xact,
2242                         xc_by_my_xact,
2243                         xc_by_latest_xid,
2244                         xc_by_main_xid,
2245                         xc_by_child_xid,
2246                         xc_by_known_assigned,
2247                         xc_no_overflow,
2248                         xc_slow_answer);
2249 }
2250 #endif   /* XIDCACHE_DEBUG */
2251
2252
2253 /* ----------------------------------------------
2254  *              KnownAssignedTransactions sub-module
2255  * ----------------------------------------------
2256  */
2257
2258 /*
2259  * In Hot Standby mode, we maintain a list of transactions that are (or were)
2260  * running in the master at the current point in WAL.  These XIDs must be
2261  * treated as running by standby transactions, even though they are not in
2262  * the standby server's PGPROC array.
2263  *
2264  * We record all XIDs that we know have been assigned.  That includes all the
2265  * XIDs seen in WAL records, plus all unobserved XIDs that we can deduce have
2266  * been assigned.  We can deduce the existence of unobserved XIDs because we
2267  * know XIDs are assigned in sequence, with no gaps.  The KnownAssignedXids
2268  * list expands as new XIDs are observed or inferred, and contracts when
2269  * transaction completion records arrive.
2270  *
2271  * During hot standby we do not fret too much about the distinction between
2272  * top-level XIDs and subtransaction XIDs. We store both together in the
2273  * KnownAssignedXids list.      In backends, this is copied into snapshots in
2274  * GetSnapshotData(), taking advantage of the fact that XidInMVCCSnapshot()
2275  * doesn't care about the distinction either.  Subtransaction XIDs are
2276  * effectively treated as top-level XIDs and in the typical case pg_subtrans
2277  * links are *not* maintained (which does not affect visibility).
2278  *
2279  * We have room in KnownAssignedXids and in snapshots to hold maxProcs *
2280  * (1 + PGPROC_MAX_CACHED_SUBXIDS) XIDs, so every master transaction must
2281  * report its subtransaction XIDs in a WAL XLOG_XACT_ASSIGNMENT record at
2282  * least every PGPROC_MAX_CACHED_SUBXIDS.  When we receive one of these
2283  * records, we mark the subXIDs as children of the top XID in pg_subtrans,
2284  * and then remove them from KnownAssignedXids.  This prevents overflow of
2285  * KnownAssignedXids and snapshots, at the cost that status checks for these
2286  * subXIDs will take a slower path through TransactionIdIsInProgress().
2287  * This means that KnownAssignedXids is not necessarily complete for subXIDs,
2288  * though it should be complete for top-level XIDs; this is the same situation
2289  * that holds with respect to the PGPROC entries in normal running.
2290  *
2291  * When we throw away subXIDs from KnownAssignedXids, we need to keep track of
2292  * that, similarly to tracking overflow of a PGPROC's subxids array.  We do
2293  * that by remembering the lastOverflowedXID, ie the last thrown-away subXID.
2294  * As long as that is within the range of interesting XIDs, we have to assume
2295  * that subXIDs are missing from snapshots.  (Note that subXID overflow occurs
2296  * on primary when 65th subXID arrives, whereas on standby it occurs when 64th
2297  * subXID arrives - that is not an error.)
2298  *
2299  * Should a backend on primary somehow disappear before it can write an abort
2300  * record, then we just leave those XIDs in KnownAssignedXids. They actually
2301  * aborted but we think they were running; the distinction is irrelevant
2302  * because either way any changes done by the transaction are not visible to
2303  * backends in the standby.  We prune KnownAssignedXids when
2304  * XLOG_RUNNING_XACTS arrives, to forestall possible overflow of the
2305  * array due to such dead XIDs.
2306  */
2307
2308 /*
2309  * RecordKnownAssignedTransactionIds
2310  *              Record the given XID in KnownAssignedXids, as well as any preceding
2311  *              unobserved XIDs.
2312  *
2313  * RecordKnownAssignedTransactionIds() should be run for *every* WAL record
2314  * associated with a transaction. Must be called for each record after we
2315  * have executed StartupCLOG() et al, since we must ExtendCLOG() etc..
2316  *
2317  * Called during recovery in analogy with and in place of GetNewTransactionId()
2318  */
2319 void
2320 RecordKnownAssignedTransactionIds(TransactionId xid)
2321 {
2322         Assert(standbyState >= STANDBY_INITIALIZED);
2323         Assert(TransactionIdIsValid(xid));
2324
2325         elog(trace_recovery(DEBUG4), "record known xact %u latestObservedXid %u",
2326                  xid, latestObservedXid);
2327
2328         /*
2329          * If the KnownAssignedXids machinery isn't up yet, do nothing.
2330          */
2331         if (standbyState <= STANDBY_INITIALIZED)
2332                 return;
2333
2334         Assert(TransactionIdIsValid(latestObservedXid));
2335
2336         /*
2337          * When a newly observed xid arrives, it is frequently the case that it is
2338          * *not* the next xid in sequence. When this occurs, we must treat the
2339          * intervening xids as running also.
2340          */
2341         if (TransactionIdFollows(xid, latestObservedXid))
2342         {
2343                 TransactionId next_expected_xid;
2344
2345                 /*
2346                  * Extend clog and subtrans like we do in GetNewTransactionId() during
2347                  * normal operation using individual extend steps. Typical case
2348                  * requires almost no activity.
2349                  */
2350                 next_expected_xid = latestObservedXid;
2351                 TransactionIdAdvance(next_expected_xid);
2352                 while (TransactionIdPrecedesOrEquals(next_expected_xid, xid))
2353                 {
2354                         ExtendCLOG(next_expected_xid);
2355                         ExtendSUBTRANS(next_expected_xid);
2356
2357                         TransactionIdAdvance(next_expected_xid);
2358                 }
2359
2360                 /*
2361                  * Add the new xids onto the KnownAssignedXids array.
2362                  */
2363                 next_expected_xid = latestObservedXid;
2364                 TransactionIdAdvance(next_expected_xid);
2365                 KnownAssignedXidsAdd(next_expected_xid, xid, false);
2366
2367                 /*
2368                  * Now we can advance latestObservedXid
2369                  */
2370                 latestObservedXid = xid;
2371
2372                 /* ShmemVariableCache->nextXid must be beyond any observed xid */
2373                 next_expected_xid = latestObservedXid;
2374                 TransactionIdAdvance(next_expected_xid);
2375                 ShmemVariableCache->nextXid = next_expected_xid;
2376         }
2377 }
2378
2379 /*
2380  * ExpireTreeKnownAssignedTransactionIds
2381  *              Remove the given XIDs from KnownAssignedXids.
2382  *
2383  * Called during recovery in analogy with and in place of ProcArrayEndTransaction()
2384  */
2385 void
2386 ExpireTreeKnownAssignedTransactionIds(TransactionId xid, int nsubxids,
2387                                                            TransactionId *subxids, TransactionId max_xid)
2388 {
2389         Assert(standbyState >= STANDBY_INITIALIZED);
2390
2391         /*
2392          * Uses same locking as transaction commit
2393          */
2394         LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);
2395
2396         KnownAssignedXidsRemoveTree(xid, nsubxids, subxids);
2397
2398         /* As in ProcArrayEndTransaction, advance latestCompletedXid */
2399         if (TransactionIdPrecedes(ShmemVariableCache->latestCompletedXid,
2400                                                           max_xid))
2401                 ShmemVariableCache->latestCompletedXid = max_xid;
2402
2403         LWLockRelease(ProcArrayLock);
2404 }
2405
2406 /*
2407  * ExpireAllKnownAssignedTransactionIds
2408  *              Remove all entries in KnownAssignedXids
2409  */
2410 void
2411 ExpireAllKnownAssignedTransactionIds(void)
2412 {
2413         LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);
2414         KnownAssignedXidsRemovePreceding(InvalidTransactionId);
2415         LWLockRelease(ProcArrayLock);
2416 }
2417
2418 /*
2419  * ExpireOldKnownAssignedTransactionIds
2420  *              Remove KnownAssignedXids entries preceding the given XID
2421  */
2422 void
2423 ExpireOldKnownAssignedTransactionIds(TransactionId xid)
2424 {
2425         LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);
2426         KnownAssignedXidsRemovePreceding(xid);
2427         LWLockRelease(ProcArrayLock);
2428 }
2429
2430
2431 /*
2432  * Private module functions to manipulate KnownAssignedXids
2433  *
2434  * There are 5 main uses of the KnownAssignedXids data structure:
2435  *
2436  *      * backends taking snapshots - all valid XIDs need to be copied out
2437  *      * backends seeking to determine presence of a specific XID
2438  *      * startup process adding new known-assigned XIDs
2439  *      * startup process removing specific XIDs as transactions end
2440  *      * startup process pruning array when special WAL records arrive
2441  *
2442  * This data structure is known to be a hot spot during Hot Standby, so we
2443  * go to some lengths to make these operations as efficient and as concurrent
2444  * as possible.
2445  *
2446  * The XIDs are stored in an array in sorted order --- TransactionIdPrecedes
2447  * order, to be exact --- to allow binary search for specific XIDs.  Note:
2448  * in general TransactionIdPrecedes would not provide a total order, but
2449  * we know that the entries present at any instant should not extend across
2450  * a large enough fraction of XID space to wrap around (the master would
2451  * shut down for fear of XID wrap long before that happens).  So it's OK to
2452  * use TransactionIdPrecedes as a binary-search comparator.
2453  *
2454  * It's cheap to maintain the sortedness during insertions, since new known
2455  * XIDs are always reported in XID order; we just append them at the right.
2456  *
2457  * To keep individual deletions cheap, we need to allow gaps in the array.
2458  * This is implemented by marking array elements as valid or invalid using
2459  * the parallel boolean array KnownAssignedXidsValid[].  A deletion is done
2460  * by setting KnownAssignedXidsValid[i] to false, *without* clearing the
2461  * XID entry itself.  This preserves the property that the XID entries are
2462  * sorted, so we can do binary searches easily.  Periodically we compress
2463  * out the unused entries; that's much cheaper than having to compress the
2464  * array immediately on every deletion.
2465  *
2466  * The actually valid items in KnownAssignedXids[] and KnownAssignedXidsValid[]
2467  * are those with indexes tail <= i < head; items outside this subscript range
2468  * have unspecified contents.  When head reaches the end of the array, we
2469  * force compression of unused entries rather than wrapping around, since
2470  * allowing wraparound would greatly complicate the search logic.  We maintain
2471  * an explicit tail pointer so that pruning of old XIDs can be done without
2472  * immediately moving the array contents.  In most cases only a small fraction
2473  * of the array contains valid entries at any instant.
2474  *
2475  * Although only the startup process can ever change the KnownAssignedXids
2476  * data structure, we still need interlocking so that standby backends will
2477  * not observe invalid intermediate states.  The convention is that backends
2478  * must hold shared ProcArrayLock to examine the array.  To remove XIDs from
2479  * the array, the startup process must hold ProcArrayLock exclusively, for
2480  * the usual transactional reasons (compare commit/abort of a transaction
2481  * during normal running).      Compressing unused entries out of the array
2482  * likewise requires exclusive lock.  To add XIDs to the array, we just insert
2483  * them into slots to the right of the head pointer and then advance the head
2484  * pointer.  This wouldn't require any lock at all, except that on machines
2485  * with weak memory ordering we need to be careful that other processors
2486  * see the array element changes before they see the head pointer change.
2487  * We handle this by using a spinlock to protect reads and writes of the
2488  * head/tail pointers.  (We could dispense with the spinlock if we were to
2489  * create suitable memory access barrier primitives and use those instead.)
2490  * The spinlock must be taken to read or write the head/tail pointers unless
2491  * the caller holds ProcArrayLock exclusively.
2492  *
2493  * Algorithmic analysis:
2494  *
2495  * If we have a maximum of M slots, with N XIDs currently spread across
2496  * S elements then we have N <= S <= M always.
2497  *
2498  *      * Adding a new XID is O(1) and needs little locking (unless compression
2499  *              must happen)
2500  *      * Compressing the array is O(S) and requires exclusive lock
2501  *      * Removing an XID is O(logS) and requires exclusive lock
2502  *      * Taking a snapshot is O(S) and requires shared lock
2503  *      * Checking for an XID is O(logS) and requires shared lock
2504  *
2505  * In comparison, using a hash table for KnownAssignedXids would mean that
2506  * taking snapshots would be O(M). If we can maintain S << M then the
2507  * sorted array technique will deliver significantly faster snapshots.
2508  * If we try to keep S too small then we will spend too much time compressing,
2509  * so there is an optimal point for any workload mix. We use a heuristic to
2510  * decide when to compress the array, though trimming also helps reduce
2511  * frequency of compressing. The heuristic requires us to track the number of
2512  * currently valid XIDs in the array.
2513  */
2514
2515
2516 /*
2517  * Compress KnownAssignedXids by shifting valid data down to the start of the
2518  * array, removing any gaps.
2519  *
2520  * A compression step is forced if "force" is true, otherwise we do it
2521  * only if a heuristic indicates it's a good time to do it.
2522  *
2523  * Caller must hold ProcArrayLock in exclusive mode.
2524  */
2525 static void
2526 KnownAssignedXidsCompress(bool force)
2527 {
2528         /* use volatile pointer to prevent code rearrangement */
2529         volatile ProcArrayStruct *pArray = procArray;
2530         int                     head,
2531                                 tail;
2532         int                     compress_index;
2533         int                     i;
2534
2535         /* no spinlock required since we hold ProcArrayLock exclusively */
2536         head = pArray->headKnownAssignedXids;
2537         tail = pArray->tailKnownAssignedXids;
2538
2539         if (!force)
2540         {
2541                 /*
2542                  * If we can choose how much to compress, use a heuristic to avoid
2543                  * compressing too often or not often enough.
2544                  *
2545                  * Heuristic is if we have a large enough current spread and less than
2546                  * 50% of the elements are currently in use, then compress. This
2547                  * should ensure we compress fairly infrequently. We could compress
2548                  * less often though the virtual array would spread out more and
2549                  * snapshots would become more expensive.
2550                  */
2551                 int                     nelements = head - tail;
2552
2553                 if (nelements < 4 * PROCARRAY_MAXPROCS ||
2554                         nelements < 2 * pArray->numKnownAssignedXids)
2555                         return;
2556         }
2557
2558         /*
2559          * We compress the array by reading the valid values from tail to head,
2560          * re-aligning data to 0th element.
2561          */
2562         compress_index = 0;
2563         for (i = tail; i < head; i++)
2564         {
2565                 if (KnownAssignedXidsValid[i])
2566                 {
2567                         KnownAssignedXids[compress_index] = KnownAssignedXids[i];
2568                         KnownAssignedXidsValid[compress_index] = true;
2569                         compress_index++;
2570                 }
2571         }
2572
2573         pArray->tailKnownAssignedXids = 0;
2574         pArray->headKnownAssignedXids = compress_index;
2575 }
2576
2577 /*
2578  * Add xids into KnownAssignedXids at the head of the array.
2579  *
2580  * xids from from_xid to to_xid, inclusive, are added to the array.
2581  *
2582  * If exclusive_lock is true then caller already holds ProcArrayLock in
2583  * exclusive mode, so we need no extra locking here.  Else caller holds no
2584  * lock, so we need to be sure we maintain sufficient interlocks against
2585  * concurrent readers.  (Only the startup process ever calls this, so no need
2586  * to worry about concurrent writers.)
2587  */
2588 static void
2589 KnownAssignedXidsAdd(TransactionId from_xid, TransactionId to_xid,
2590                                          bool exclusive_lock)
2591 {
2592         /* use volatile pointer to prevent code rearrangement */
2593         volatile ProcArrayStruct *pArray = procArray;
2594         TransactionId next_xid;
2595         int                     head,
2596                                 tail;
2597         int                     nxids;
2598         int                     i;
2599
2600         Assert(TransactionIdPrecedesOrEquals(from_xid, to_xid));
2601
2602         /*
2603          * Calculate how many array slots we'll need.  Normally this is cheap; in
2604          * the unusual case where the XIDs cross the wrap point, we do it the hard
2605          * way.
2606          */
2607         if (to_xid >= from_xid)
2608                 nxids = to_xid - from_xid + 1;
2609         else
2610         {
2611                 nxids = 1;
2612                 next_xid = from_xid;
2613                 while (TransactionIdPrecedes(next_xid, to_xid))
2614                 {
2615                         nxids++;
2616                         TransactionIdAdvance(next_xid);
2617                 }
2618         }
2619
2620         /*
2621          * Since only the startup process modifies the head/tail pointers, we
2622          * don't need a lock to read them here.
2623          */
2624         head = pArray->headKnownAssignedXids;
2625         tail = pArray->tailKnownAssignedXids;
2626
2627         Assert(head >= 0 && head <= pArray->maxKnownAssignedXids);
2628         Assert(tail >= 0 && tail < pArray->maxKnownAssignedXids);
2629
2630         /*
2631          * Verify that insertions occur in TransactionId sequence.      Note that even
2632          * if the last existing element is marked invalid, it must still have a
2633          * correctly sequenced XID value.
2634          */
2635         if (head > tail &&
2636                 TransactionIdFollowsOrEquals(KnownAssignedXids[head - 1], from_xid))
2637         {
2638                 KnownAssignedXidsDisplay(LOG);
2639                 elog(ERROR, "out-of-order XID insertion in KnownAssignedXids");
2640         }
2641
2642         /*
2643          * If our xids won't fit in the remaining space, compress out free space
2644          */
2645         if (head + nxids > pArray->maxKnownAssignedXids)
2646         {
2647                 /* must hold lock to compress */
2648                 if (!exclusive_lock)
2649                         LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);
2650
2651                 KnownAssignedXidsCompress(true);
2652
2653                 head = pArray->headKnownAssignedXids;
2654                 /* note: we no longer care about the tail pointer */
2655
2656                 if (!exclusive_lock)
2657                         LWLockRelease(ProcArrayLock);
2658
2659                 /*
2660                  * If it still won't fit then we're out of memory
2661                  */
2662                 if (head + nxids > pArray->maxKnownAssignedXids)
2663                         elog(ERROR, "too many KnownAssignedXids");
2664         }
2665
2666         /* Now we can insert the xids into the space starting at head */
2667         next_xid = from_xid;
2668         for (i = 0; i < nxids; i++)
2669         {
2670                 KnownAssignedXids[head] = next_xid;
2671                 KnownAssignedXidsValid[head] = true;
2672                 TransactionIdAdvance(next_xid);
2673                 head++;
2674         }
2675
2676         /* Adjust count of number of valid entries */
2677         pArray->numKnownAssignedXids += nxids;
2678
2679         /*
2680          * Now update the head pointer.  We use a spinlock to protect this
2681          * pointer, not because the update is likely to be non-atomic, but to
2682          * ensure that other processors see the above array updates before they
2683          * see the head pointer change.
2684          *
2685          * If we're holding ProcArrayLock exclusively, there's no need to take the
2686          * spinlock.
2687          */
2688         if (exclusive_lock)
2689                 pArray->headKnownAssignedXids = head;
2690         else
2691         {
2692                 SpinLockAcquire(&pArray->known_assigned_xids_lck);
2693                 pArray->headKnownAssignedXids = head;
2694                 SpinLockRelease(&pArray->known_assigned_xids_lck);
2695         }
2696 }
2697
2698 /*
2699  * KnownAssignedXidsSearch
2700  *
2701  * Searches KnownAssignedXids for a specific xid and optionally removes it.
2702  * Returns true if it was found, false if not.
2703  *
2704  * Caller must hold ProcArrayLock in shared or exclusive mode.
2705  * Exclusive lock must be held for remove = true.
2706  */
2707 static bool
2708 KnownAssignedXidsSearch(TransactionId xid, bool remove)
2709 {
2710         /* use volatile pointer to prevent code rearrangement */
2711         volatile ProcArrayStruct *pArray = procArray;
2712         int                     first,
2713                                 last;
2714         int                     head;
2715         int                     tail;
2716         int                     result_index = -1;
2717
2718         if (remove)
2719         {
2720                 /* we hold ProcArrayLock exclusively, so no need for spinlock */
2721                 tail = pArray->tailKnownAssignedXids;
2722                 head = pArray->headKnownAssignedXids;
2723         }
2724         else
2725         {
2726                 /* take spinlock to ensure we see up-to-date array contents */
2727                 SpinLockAcquire(&pArray->known_assigned_xids_lck);
2728                 tail = pArray->tailKnownAssignedXids;
2729                 head = pArray->headKnownAssignedXids;
2730                 SpinLockRelease(&pArray->known_assigned_xids_lck);
2731         }
2732
2733         /*
2734          * Standard binary search.      Note we can ignore the KnownAssignedXidsValid
2735          * array here, since even invalid entries will contain sorted XIDs.
2736          */
2737         first = tail;
2738         last = head - 1;
2739         while (first <= last)
2740         {
2741                 int                     mid_index;
2742                 TransactionId mid_xid;
2743
2744                 mid_index = (first + last) / 2;
2745                 mid_xid = KnownAssignedXids[mid_index];
2746
2747                 if (xid == mid_xid)
2748                 {
2749                         result_index = mid_index;
2750                         break;
2751                 }
2752                 else if (TransactionIdPrecedes(xid, mid_xid))
2753                         last = mid_index - 1;
2754                 else
2755                         first = mid_index + 1;
2756         }
2757
2758         if (result_index < 0)
2759                 return false;                   /* not in array */
2760
2761         if (!KnownAssignedXidsValid[result_index])
2762                 return false;                   /* in array, but invalid */
2763
2764         if (remove)
2765         {
2766                 KnownAssignedXidsValid[result_index] = false;
2767
2768                 pArray->numKnownAssignedXids--;
2769                 Assert(pArray->numKnownAssignedXids >= 0);
2770
2771                 /*
2772                  * If we're removing the tail element then advance tail pointer over
2773                  * any invalid elements.  This will speed future searches.
2774                  */
2775                 if (result_index == tail)
2776                 {
2777                         tail++;
2778                         while (tail < head && !KnownAssignedXidsValid[tail])
2779                                 tail++;
2780                         if (tail >= head)
2781                         {
2782                                 /* Array is empty, so we can reset both pointers */
2783                                 pArray->headKnownAssignedXids = 0;
2784                                 pArray->tailKnownAssignedXids = 0;
2785                         }
2786                         else
2787                         {
2788                                 pArray->tailKnownAssignedXids = tail;
2789                         }
2790                 }
2791         }
2792
2793         return true;
2794 }
2795
2796 /*
2797  * Is the specified XID present in KnownAssignedXids[]?
2798  *
2799  * Caller must hold ProcArrayLock in shared or exclusive mode.
2800  */
2801 static bool
2802 KnownAssignedXidExists(TransactionId xid)
2803 {
2804         Assert(TransactionIdIsValid(xid));
2805
2806         return KnownAssignedXidsSearch(xid, false);
2807 }
2808
2809 /*
2810  * Remove the specified XID from KnownAssignedXids[].
2811  *
2812  * Caller must hold ProcArrayLock in exclusive mode.
2813  */
2814 static void
2815 KnownAssignedXidsRemove(TransactionId xid)
2816 {
2817         Assert(TransactionIdIsValid(xid));
2818
2819         elog(trace_recovery(DEBUG4), "remove KnownAssignedXid %u", xid);
2820
2821         /*
2822          * Note: we cannot consider it an error to remove an XID that's not
2823          * present.  We intentionally remove subxact IDs while processing
2824          * XLOG_XACT_ASSIGNMENT, to avoid array overflow.  Then those XIDs will be
2825          * removed again when the top-level xact commits or aborts.
2826          *
2827          * It might be possible to track such XIDs to distinguish this case from
2828          * actual errors, but it would be complicated and probably not worth it.
2829          * So, just ignore the search result.
2830          */
2831         (void) KnownAssignedXidsSearch(xid, true);
2832 }
2833
2834 /*
2835  * KnownAssignedXidsRemoveTree
2836  *              Remove xid (if it's not InvalidTransactionId) and all the subxids.
2837  *
2838  * Caller must hold ProcArrayLock in exclusive mode.
2839  */
2840 static void
2841 KnownAssignedXidsRemoveTree(TransactionId xid, int nsubxids,
2842                                                         TransactionId *subxids)
2843 {
2844         int                     i;
2845
2846         if (TransactionIdIsValid(xid))
2847                 KnownAssignedXidsRemove(xid);
2848
2849         for (i = 0; i < nsubxids; i++)
2850                 KnownAssignedXidsRemove(subxids[i]);
2851
2852         /* Opportunistically compress the array */
2853         KnownAssignedXidsCompress(false);
2854 }
2855
2856 /*
2857  * Prune KnownAssignedXids up to, but *not* including xid. If xid is invalid
2858  * then clear the whole table.
2859  *
2860  * Caller must hold ProcArrayLock in exclusive mode.
2861  */
2862 static void
2863 KnownAssignedXidsRemovePreceding(TransactionId removeXid)
2864 {
2865         /* use volatile pointer to prevent code rearrangement */
2866         volatile ProcArrayStruct *pArray = procArray;
2867         int                     count = 0;
2868         int                     head,
2869                                 tail,
2870                                 i;
2871
2872         if (!TransactionIdIsValid(removeXid))
2873         {
2874                 elog(trace_recovery(DEBUG4), "removing all KnownAssignedXids");
2875                 pArray->numKnownAssignedXids = 0;
2876                 pArray->headKnownAssignedXids = pArray->tailKnownAssignedXids = 0;
2877                 return;
2878         }
2879
2880         elog(trace_recovery(DEBUG4), "prune KnownAssignedXids to %u", removeXid);
2881
2882         /*
2883          * Mark entries invalid starting at the tail.  Since array is sorted, we
2884          * can stop as soon as we reach a entry >= removeXid.
2885          */
2886         tail = pArray->tailKnownAssignedXids;
2887         head = pArray->headKnownAssignedXids;
2888
2889         for (i = tail; i < head; i++)
2890         {
2891                 if (KnownAssignedXidsValid[i])
2892                 {
2893                         TransactionId knownXid = KnownAssignedXids[i];
2894
2895                         if (TransactionIdFollowsOrEquals(knownXid, removeXid))
2896                                 break;
2897
2898                         if (!StandbyTransactionIdIsPrepared(knownXid))
2899                         {
2900                                 KnownAssignedXidsValid[i] = false;
2901                                 count++;
2902                         }
2903                 }
2904         }
2905
2906         pArray->numKnownAssignedXids -= count;
2907         Assert(pArray->numKnownAssignedXids >= 0);
2908
2909         /*
2910          * Advance the tail pointer if we've marked the tail item invalid.
2911          */
2912         for (i = tail; i < head; i++)
2913         {
2914                 if (KnownAssignedXidsValid[i])
2915                         break;
2916         }
2917         if (i >= head)
2918         {
2919                 /* Array is empty, so we can reset both pointers */
2920                 pArray->headKnownAssignedXids = 0;
2921                 pArray->tailKnownAssignedXids = 0;
2922         }
2923         else
2924         {
2925                 pArray->tailKnownAssignedXids = i;
2926         }
2927
2928         /* Opportunistically compress the array */
2929         KnownAssignedXidsCompress(false);
2930 }
2931
2932 /*
2933  * KnownAssignedXidsGet - Get an array of xids by scanning KnownAssignedXids.
2934  * We filter out anything >= xmax.
2935  *
2936  * Returns the number of XIDs stored into xarray[].  Caller is responsible
2937  * that array is large enough.
2938  *
2939  * Caller must hold ProcArrayLock in (at least) shared mode.
2940  */
2941 static int
2942 KnownAssignedXidsGet(TransactionId *xarray, TransactionId xmax)
2943 {
2944         TransactionId xtmp = InvalidTransactionId;
2945
2946         return KnownAssignedXidsGetAndSetXmin(xarray, &xtmp, xmax);
2947 }
2948
2949 /*
2950  * KnownAssignedXidsGetAndSetXmin - as KnownAssignedXidsGet, plus
2951  * we reduce *xmin to the lowest xid value seen if not already lower.
2952  *
2953  * Caller must hold ProcArrayLock in (at least) shared mode.
2954  */
2955 static int
2956 KnownAssignedXidsGetAndSetXmin(TransactionId *xarray, TransactionId *xmin,
2957                                                            TransactionId xmax)
2958 {
2959         /* use volatile pointer to prevent code rearrangement */
2960         volatile ProcArrayStruct *pArray = procArray;
2961         int                     count = 0;
2962         int                     head,
2963                                 tail;
2964         int                     i;
2965
2966         /*
2967          * Fetch head just once, since it may change while we loop. We can stop
2968          * once we reach the initially seen head, since we are certain that an xid
2969          * cannot enter and then leave the array while we hold ProcArrayLock.  We
2970          * might miss newly-added xids, but they should be >= xmax so irrelevant
2971          * anyway.
2972          *
2973          * Must take spinlock to ensure we see up-to-date array contents.
2974          */
2975         SpinLockAcquire(&pArray->known_assigned_xids_lck);
2976         tail = pArray->tailKnownAssignedXids;
2977         head = pArray->headKnownAssignedXids;
2978         SpinLockRelease(&pArray->known_assigned_xids_lck);
2979
2980         for (i = tail; i < head; i++)
2981         {
2982                 /* Skip any gaps in the array */
2983                 if (KnownAssignedXidsValid[i])
2984                 {
2985                         TransactionId knownXid = KnownAssignedXids[i];
2986
2987                         /*
2988                          * Update xmin if required.  Only the first XID need be checked,
2989                          * since the array is sorted.
2990                          */
2991                         if (count == 0 &&
2992                                 TransactionIdPrecedes(knownXid, *xmin))
2993                                 *xmin = knownXid;
2994
2995                         /*
2996                          * Filter out anything >= xmax, again relying on sorted property
2997                          * of array.
2998                          */
2999                         if (TransactionIdIsValid(xmax) &&
3000                                 TransactionIdFollowsOrEquals(knownXid, xmax))
3001                                 break;
3002
3003                         /* Add knownXid into output array */
3004                         xarray[count++] = knownXid;
3005                 }
3006         }
3007
3008         return count;
3009 }
3010
3011 /*
3012  * Get oldest XID in the KnownAssignedXids array, or InvalidTransactionId
3013  * if nothing there.
3014  */
3015 static TransactionId
3016 KnownAssignedXidsGetOldestXmin(void)
3017 {
3018         /* use volatile pointer to prevent code rearrangement */
3019         volatile ProcArrayStruct *pArray = procArray;
3020         int                     head,
3021                                 tail;
3022         int                     i;
3023
3024         /*
3025          * Fetch head just once, since it may change while we loop.
3026          */
3027         SpinLockAcquire(&pArray->known_assigned_xids_lck);
3028         tail = pArray->tailKnownAssignedXids;
3029         head = pArray->headKnownAssignedXids;
3030         SpinLockRelease(&pArray->known_assigned_xids_lck);
3031
3032         for (i = tail; i < head; i++)
3033         {
3034                 /* Skip any gaps in the array */
3035                 if (KnownAssignedXidsValid[i])
3036                         return KnownAssignedXids[i];
3037         }
3038
3039         return InvalidTransactionId;
3040 }
3041
3042 /*
3043  * Display KnownAssignedXids to provide debug trail
3044  *
3045  * Currently this is only called within startup process, so we need no
3046  * special locking.
3047  *
3048  * Note this is pretty expensive, and much of the expense will be incurred
3049  * even if the elog message will get discarded.  It's not currently called
3050  * in any performance-critical places, however, so no need to be tenser.
3051  */
3052 static void
3053 KnownAssignedXidsDisplay(int trace_level)
3054 {
3055         /* use volatile pointer to prevent code rearrangement */
3056         volatile ProcArrayStruct *pArray = procArray;
3057         StringInfoData buf;
3058         int                     head,
3059                                 tail,
3060                                 i;
3061         int                     nxids = 0;
3062
3063         tail = pArray->tailKnownAssignedXids;
3064         head = pArray->headKnownAssignedXids;
3065
3066         initStringInfo(&buf);
3067
3068         for (i = tail; i < head; i++)
3069         {
3070                 if (KnownAssignedXidsValid[i])
3071                 {
3072                         nxids++;
3073                         appendStringInfo(&buf, "[%d]=%u ", i, KnownAssignedXids[i]);
3074                 }
3075         }
3076
3077         elog(trace_level, "%d KnownAssignedXids (num=%d tail=%d head=%d) %s",
3078                  nxids,
3079                  pArray->numKnownAssignedXids,
3080                  pArray->tailKnownAssignedXids,
3081                  pArray->headKnownAssignedXids,
3082                  buf.data);
3083
3084         pfree(buf.data);
3085 }