1 /*-------------------------------------------------------------------------
5 * Infrastructure for building historic catalog snapshots based on contents
6 * of the WAL, for the purpose of decoding heapam.c style values in the
11 * We build snapshots which can *only* be used to read catalog contents and we
12 * do so by reading and interpreting the WAL stream. The aim is to build a
13 * snapshot that behaves the same as a freshly taken MVCC snapshot would have
14 * at the time the XLogRecord was generated.
16 * To build the snapshots we reuse the infrastructure built for Hot
17 * Standby. The in-memory snapshots we build look different than HS' because
18 * we have different needs. To successfully decode data from the WAL we only
19 * need to access catalog tables and (sys|rel|cat)cache, not the actual user
20 * tables since the data we decode is wholly contained in the WAL
21 * records. Also, our snapshots need to be different in comparison to normal
22 * MVCC ones because in contrast to those we cannot fully rely on the clog and
23 * pg_subtrans for information about committed transactions because they might
24 * commit in the future from the POV of the WAL entry we're currently
25 * decoding. This definition has the advantage that we only need to prevent
26 * removal of catalog rows, while normal table's rows can still be
27 * removed. This is achieved by using the replication slot mechanism.
29 * As the percentage of transactions modifying the catalog normally is fairly
30 * small in comparison to ones only manipulating user data, we keep track of
31 * the committed catalog modifying ones inside [xmin, xmax) instead of keeping
32 * track of all running transactions like it's done in a normal snapshot. Note
33 * that we're generally only looking at transactions that have acquired an
34 * xid. That is we keep a list of transactions between snapshot->(xmin, xmax)
35 * that we consider committed, everything else is considered aborted/in
36 * progress. That also allows us not to care about subtransactions before they
37 * have committed which means this module, in contrast to HS, doesn't have to
38 * care about suboverflowed subtransactions and similar.
40 * One complexity of doing this is that to e.g. handle mixed DDL/DML
41 * transactions we need Snapshots that see intermediate versions of the
42 * catalog in a transaction. During normal operation this is achieved by using
43 * CommandIds/cmin/cmax. The problem with that however is that for space
44 * efficiency reasons only one value of that is stored
45 * (c.f. combocid.c). Since ComboCids are only available in memory we log
46 * additional information which allows us to get the original (cmin, cmax)
47 * pair during visibility checks. Check the reorderbuffer.c's comment above
48 * ResolveCminCmaxDuringDecoding() for details.
50 * To facilitate all this we need our own visibility routine, as the normal
51 * ones are optimized for different usecases.
53 * To replace the normal catalog snapshots with decoding ones use the
54 * SetupHistoricSnapshot() and TeardownHistoricSnapshot() functions.
58 * The snapbuild machinery is starting up in several stages, as illustrated
59 * by the following graph:
60 * +-------------------------+
61 * +----|SNAPBUILD_START |-------------+
62 * | +-------------------------+ |
65 * | running_xacts with running xacts |
69 * | +-------------------------+ v
70 * | |SNAPBUILD_FULL_SNAPSHOT |------------>|
71 * | +-------------------------+ |
72 * running_xacts | saved snapshot
73 * with zero xacts | at running_xacts's lsn
75 * | all running toplevel TXNs finished |
78 * | +-------------------------+ |
79 * +--->|SNAPBUILD_CONSISTENT |<------------+
80 * +-------------------------+
82 * Initially the machinery is in the START stage. When an xl_running_xacts
83 * record is read that is sufficiently new (above the safe xmin horizon),
84 * there's a state transition. If there were no running xacts when the
85 * running_xacts record was generated, we'll directly go into CONSISTENT
86 * state, otherwise we'll switch to the FULL_SNAPSHOT state. Having a full
87 * snapshot means that all transactions that start henceforth can be decoded
88 * in their entirety, but transactions that started previously can't. In
89 * FULL_SNAPSHOT we'll switch into CONSISTENT once all those previously
90 * running transactions have committed or aborted.
92 * Only transactions that commit after CONSISTENT state has been reached will
93 * be replayed, even though they might have started while still in
94 * FULL_SNAPSHOT. That ensures that we'll reach a point where no previous
95 * changes have been exported, but all the following ones will be. That point
96 * is a convenient point to initialize replication from, which is why we
97 * export a snapshot at that point, which *can* be used to read normal data.
99 * Copyright (c) 2012-2016, PostgreSQL Global Development Group
102 * src/backend/replication/snapbuild.c
104 *-------------------------------------------------------------------------
107 #include "postgres.h"
109 #include <sys/stat.h>
110 #include <sys/types.h>
113 #include "miscadmin.h"
115 #include "access/heapam_xlog.h"
116 #include "access/transam.h"
117 #include "access/xact.h"
119 #include "replication/logical.h"
120 #include "replication/reorderbuffer.h"
121 #include "replication/snapbuild.h"
123 #include "utils/builtins.h"
124 #include "utils/memutils.h"
125 #include "utils/snapshot.h"
126 #include "utils/snapmgr.h"
127 #include "utils/tqual.h"
129 #include "storage/block.h" /* debugging output */
130 #include "storage/fd.h"
131 #include "storage/lmgr.h"
132 #include "storage/proc.h"
133 #include "storage/procarray.h"
134 #include "storage/standby.h"
137 * This struct contains the current state of the snapshot building
138 * machinery. Besides a forward declaration in the header, it is not exposed
139 * to the public, so we can easily change its contents.
143 /* how far are we along building our first full snapshot */
144 SnapBuildState state;
146 /* private memory context used to allocate memory for this module. */
147 MemoryContext context;
149 /* all transactions < than this have committed/aborted */
152 /* all transactions >= than this are uncommitted */
156 * Don't replay commits from an LSN < this LSN. This can be set externally
157 * but it will also be advanced (never retreat) from within snapbuild.c.
159 XLogRecPtr start_decoding_at;
162 * Don't start decoding WAL until the "xl_running_xacts" information
163 * indicates there are no running xids with an xid smaller than this.
165 TransactionId initial_xmin_horizon;
168 * Snapshot that's valid to see the catalog state seen at this moment.
173 * LSN of the last location we are sure a snapshot has been serialized to.
175 XLogRecPtr last_serialized_snapshot;
178 * The reorderbuffer we need to update with usable snapshots et al.
180 ReorderBuffer *reorder;
183 * Information about initially running transactions
185 * When we start building a snapshot there already may be transactions in
186 * progress. Those are stored in running.xip. We don't have enough
187 * information about those to decode their contents, so until they are
188 * finished (xcnt=0) we cannot switch to a CONSISTENT state.
193 * As long as running.xcnt is nonzero, all XIDs between running.xmin and
194 * running.xmax have to be checked whether they still are running.
199 size_t xcnt; /* number of used xip entries */
200 size_t xcnt_space; /* allocated size of xip */
201 TransactionId *xip; /* running xacts array, xidComparator-sorted */
205 * Array of transactions which could have catalog changes that committed
206 * between xmin and xmax.
210 /* number of committed transactions */
213 /* available space for committed transactions */
217 * Until we reach a CONSISTENT state, we record commits of all
218 * transactions, not just the catalog changing ones. Record when that
219 * changes so we know we cannot export a snapshot safely anymore.
221 bool includes_all_transactions;
224 * Array of committed transactions that have modified the catalog.
226 * As this array is frequently modified we do *not* keep it in
227 * xidComparator order. Instead we sort the array when building &
228 * distributing a snapshot.
230 * TODO: It's unclear whether that reasoning has much merit. Every
231 * time we add something here after becoming consistent will also
232 * require distributing a snapshot. Storing them sorted would
233 * potentially also make it easier to purge (but more complicated wrt
234 * wraparound?). Should be improved if sorting while building the
235 * snapshot shows up in profiles.
242 * Starting a transaction -- which we need to do while exporting a snapshot --
243 * removes knowledge about the previously used resowner, so we save it here.
245 static ResourceOwner SavedResourceOwnerDuringExport = NULL;
246 static bool ExportInProgress = false;
248 /* transaction state manipulation functions */
249 static void SnapBuildEndTxn(SnapBuild *builder, XLogRecPtr lsn, TransactionId xid);
251 /* ->running manipulation */
252 static bool SnapBuildTxnIsRunning(SnapBuild *builder, TransactionId xid);
254 /* ->committed manipulation */
255 static void SnapBuildPurgeCommittedTxn(SnapBuild *builder);
257 /* snapshot building/manipulation/distribution functions */
258 static Snapshot SnapBuildBuildSnapshot(SnapBuild *builder, TransactionId xid);
260 static void SnapBuildFreeSnapshot(Snapshot snap);
262 static void SnapBuildSnapIncRefcount(Snapshot snap);
264 static void SnapBuildDistributeNewCatalogSnapshot(SnapBuild *builder, XLogRecPtr lsn);
266 /* xlog reading helper functions for SnapBuildProcessRecord */
267 static bool SnapBuildFindSnapshot(SnapBuild *builder, XLogRecPtr lsn, xl_running_xacts *running);
269 /* serialization functions */
270 static void SnapBuildSerialize(SnapBuild *builder, XLogRecPtr lsn);
271 static bool SnapBuildRestore(SnapBuild *builder, XLogRecPtr lsn);
275 * Allocate a new snapshot builder.
277 * xmin_horizon is the xid >= which we can be sure no catalog rows have been
278 * removed, start_lsn is the LSN >= which we want to replay commits.
281 AllocateSnapshotBuilder(ReorderBuffer *reorder,
282 TransactionId xmin_horizon,
283 XLogRecPtr start_lsn)
285 MemoryContext context;
286 MemoryContext oldcontext;
289 /* allocate memory in own context, to have better accountability */
290 context = AllocSetContextCreate(CurrentMemoryContext,
291 "snapshot builder context",
292 ALLOCSET_DEFAULT_SIZES);
293 oldcontext = MemoryContextSwitchTo(context);
295 builder = palloc0(sizeof(SnapBuild));
297 builder->state = SNAPBUILD_START;
298 builder->context = context;
299 builder->reorder = reorder;
300 /* Other struct members initialized by zeroing via palloc0 above */
302 builder->committed.xcnt = 0;
303 builder->committed.xcnt_space = 128; /* arbitrary number */
304 builder->committed.xip =
305 palloc0(builder->committed.xcnt_space * sizeof(TransactionId));
306 builder->committed.includes_all_transactions = true;
308 builder->initial_xmin_horizon = xmin_horizon;
309 builder->start_decoding_at = start_lsn;
311 MemoryContextSwitchTo(oldcontext);
317 * Free a snapshot builder.
320 FreeSnapshotBuilder(SnapBuild *builder)
322 MemoryContext context = builder->context;
324 /* free snapshot explicitly, that contains some error checking */
325 if (builder->snapshot != NULL)
327 SnapBuildSnapDecRefcount(builder->snapshot);
328 builder->snapshot = NULL;
331 /* other resources are deallocated via memory context deletion */
332 MemoryContextDelete(context);
336 * Free an unreferenced snapshot that has previously been built by us.
339 SnapBuildFreeSnapshot(Snapshot snap)
341 /* make sure we don't get passed an external snapshot */
342 Assert(snap->satisfies == HeapTupleSatisfiesHistoricMVCC);
344 /* make sure nobody modified our snapshot */
345 Assert(snap->curcid == FirstCommandId);
346 Assert(!snap->suboverflowed);
347 Assert(!snap->takenDuringRecovery);
348 Assert(snap->regd_count == 0);
350 /* slightly more likely, so it's checked even without c-asserts */
352 elog(ERROR, "cannot free a copied snapshot");
354 if (snap->active_count)
355 elog(ERROR, "cannot free an active snapshot");
361 * In which state of snapshot building are we?
364 SnapBuildCurrentState(SnapBuild *builder)
366 return builder->state;
370 * Should the contents of transaction ending at 'ptr' be decoded?
373 SnapBuildXactNeedsSkip(SnapBuild *builder, XLogRecPtr ptr)
375 return ptr < builder->start_decoding_at;
379 * Increase refcount of a snapshot.
381 * This is used when handing out a snapshot to some external resource or when
382 * adding a Snapshot as builder->snapshot.
385 SnapBuildSnapIncRefcount(Snapshot snap)
387 snap->active_count++;
391 * Decrease refcount of a snapshot and free if the refcount reaches zero.
393 * Externally visible, so that external resources that have been handed an
394 * IncRef'ed Snapshot can adjust its refcount easily.
397 SnapBuildSnapDecRefcount(Snapshot snap)
399 /* make sure we don't get passed an external snapshot */
400 Assert(snap->satisfies == HeapTupleSatisfiesHistoricMVCC);
402 /* make sure nobody modified our snapshot */
403 Assert(snap->curcid == FirstCommandId);
404 Assert(!snap->suboverflowed);
405 Assert(!snap->takenDuringRecovery);
407 Assert(snap->regd_count == 0);
409 Assert(snap->active_count > 0);
411 /* slightly more likely, so it's checked even without casserts */
413 elog(ERROR, "cannot free a copied snapshot");
415 snap->active_count--;
416 if (snap->active_count == 0)
417 SnapBuildFreeSnapshot(snap);
421 * Build a new snapshot, based on currently committed catalog-modifying
424 * In-progress transactions with catalog access are *not* allowed to modify
425 * these snapshots; they have to copy them and fill in appropriate ->curcid
426 * and ->subxip/subxcnt values.
429 SnapBuildBuildSnapshot(SnapBuild *builder, TransactionId xid)
434 Assert(builder->state >= SNAPBUILD_FULL_SNAPSHOT);
436 ssize = sizeof(SnapshotData)
437 + sizeof(TransactionId) * builder->committed.xcnt
438 + sizeof(TransactionId) * 1 /* toplevel xid */ ;
440 snapshot = MemoryContextAllocZero(builder->context, ssize);
442 snapshot->satisfies = HeapTupleSatisfiesHistoricMVCC;
445 * We misuse the original meaning of SnapshotData's xip and subxip fields
446 * to make them more fitting for our needs.
448 * In the 'xip' array we store transactions that have to be treated as
449 * committed. Since we will only ever look at tuples from transactions
450 * that have modified the catalog it's more efficient to store those few
451 * that exist between xmin and xmax (frequently there are none).
453 * Snapshots that are used in transactions that have modified the catalog
454 * also use the 'subxip' array to store their toplevel xid and all the
455 * subtransaction xids so we can recognize when we need to treat rows as
456 * visible that are not in xip but still need to be visible. Subxip only
457 * gets filled when the transaction is copied into the context of a
458 * catalog modifying transaction since we otherwise share a snapshot
459 * between transactions. As long as a txn hasn't modified the catalog it
460 * doesn't need to treat any uncommitted rows as visible, so there is no
461 * need for those xids.
463 * Both arrays are qsort'ed so that we can use bsearch() on them.
465 Assert(TransactionIdIsNormal(builder->xmin));
466 Assert(TransactionIdIsNormal(builder->xmax));
468 snapshot->xmin = builder->xmin;
469 snapshot->xmax = builder->xmax;
471 /* store all transactions to be treated as committed by this snapshot */
473 (TransactionId *) ((char *) snapshot + sizeof(SnapshotData));
474 snapshot->xcnt = builder->committed.xcnt;
475 memcpy(snapshot->xip,
476 builder->committed.xip,
477 builder->committed.xcnt * sizeof(TransactionId));
479 /* sort so we can bsearch() */
480 qsort(snapshot->xip, snapshot->xcnt, sizeof(TransactionId), xidComparator);
483 * Initially, subxip is empty, i.e. it's a snapshot to be used by
484 * transactions that don't modify the catalog. Will be filled by
485 * ReorderBufferCopySnap() if necessary.
487 snapshot->subxcnt = 0;
488 snapshot->subxip = NULL;
490 snapshot->suboverflowed = false;
491 snapshot->takenDuringRecovery = false;
492 snapshot->copied = false;
493 snapshot->curcid = FirstCommandId;
494 snapshot->active_count = 0;
495 snapshot->regd_count = 0;
501 * Export a snapshot so it can be set in another session with SET TRANSACTION
504 * For that we need to start a transaction in the current backend as the
505 * importing side checks whether the source transaction is still open to make
506 * sure the xmin horizon hasn't advanced since then.
508 * After that we convert a locally built snapshot into the normal variant
509 * understood by HeapTupleSatisfiesMVCC et al.
512 SnapBuildExportSnapshot(SnapBuild *builder)
517 TransactionId *newxip;
520 if (builder->state != SNAPBUILD_CONSISTENT)
521 elog(ERROR, "cannot export a snapshot before reaching a consistent state");
523 if (!builder->committed.includes_all_transactions)
524 elog(ERROR, "cannot export a snapshot, not all transactions are monitored anymore");
526 /* so we don't overwrite the existing value */
527 if (TransactionIdIsValid(MyPgXact->xmin))
528 elog(ERROR, "cannot export a snapshot when MyPgXact->xmin already is valid");
530 if (IsTransactionOrTransactionBlock())
531 elog(ERROR, "cannot export a snapshot from within a transaction");
533 if (SavedResourceOwnerDuringExport)
534 elog(ERROR, "can only export one snapshot at a time");
536 SavedResourceOwnerDuringExport = CurrentResourceOwner;
537 ExportInProgress = true;
539 StartTransactionCommand();
541 Assert(!FirstSnapshotSet);
543 /* There doesn't seem to be a nice API to set these */
544 XactIsoLevel = XACT_REPEATABLE_READ;
547 snap = SnapBuildBuildSnapshot(builder, GetTopTransactionId());
550 * We know that snap->xmin is alive, enforced by the logical xmin
551 * mechanism. Due to that we can do this without locks, we're only
552 * changing our own value.
554 MyPgXact->xmin = snap->xmin;
556 /* allocate in transaction context */
557 newxip = (TransactionId *)
558 palloc(sizeof(TransactionId) * GetMaxSnapshotXidCount());
561 * snapbuild.c builds transactions in an "inverted" manner, which means it
562 * stores committed transactions in ->xip, not ones in progress. Build a
563 * classical snapshot by marking all non-committed transactions as
564 * in-progress. This can be expensive.
566 for (xid = snap->xmin; NormalTransactionIdPrecedes(xid, snap->xmax);)
571 * Check whether transaction committed using the decoding snapshot
574 test = bsearch(&xid, snap->xip, snap->xcnt,
575 sizeof(TransactionId), xidComparator);
579 if (newxcnt >= GetMaxSnapshotXidCount())
580 elog(ERROR, "snapshot too large");
582 newxip[newxcnt++] = xid;
585 TransactionIdAdvance(xid);
588 snap->xcnt = newxcnt;
592 * now that we've built a plain snapshot, use the normal mechanisms for
595 snapname = ExportSnapshot(snap);
598 (errmsg_plural("exported logical decoding snapshot: \"%s\" with %u transaction ID",
599 "exported logical decoding snapshot: \"%s\" with %u transaction IDs",
601 snapname, snap->xcnt)));
606 * Ensure there is a snapshot and if not build one for current transaction.
609 SnapBuildGetOrBuildSnapshot(SnapBuild *builder, TransactionId xid)
611 Assert(builder->state == SNAPBUILD_CONSISTENT);
613 /* only build a new snapshot if we don't have a prebuilt one */
614 if (builder->snapshot == NULL)
616 builder->snapshot = SnapBuildBuildSnapshot(builder, xid);
617 /* increase refcount for the snapshot builder */
618 SnapBuildSnapIncRefcount(builder->snapshot);
621 return builder->snapshot;
625 * Reset a previously SnapBuildExportSnapshot()'ed snapshot if there is
626 * any. Aborts the previously started transaction and resets the resource
627 * owner back to its original value.
630 SnapBuildClearExportedSnapshot(void)
632 /* nothing exported, that is the usual case */
633 if (!ExportInProgress)
636 if (!IsTransactionState())
637 elog(ERROR, "clearing exported snapshot in wrong transaction state");
639 /* make sure nothing could have ever happened */
640 AbortCurrentTransaction();
642 CurrentResourceOwner = SavedResourceOwnerDuringExport;
643 SavedResourceOwnerDuringExport = NULL;
644 ExportInProgress = false;
648 * Handle the effects of a single heap change, appropriate to the current state
649 * of the snapshot builder and returns whether changes made at (xid, lsn) can
653 SnapBuildProcessChange(SnapBuild *builder, TransactionId xid, XLogRecPtr lsn)
656 * We can't handle data in transactions if we haven't built a snapshot
657 * yet, so don't store them.
659 if (builder->state < SNAPBUILD_FULL_SNAPSHOT)
663 * No point in keeping track of changes in transactions that we don't have
664 * enough information about to decode. This means that they started before
665 * we got into the SNAPBUILD_FULL_SNAPSHOT state.
667 if (builder->state < SNAPBUILD_CONSISTENT &&
668 SnapBuildTxnIsRunning(builder, xid))
672 * If the reorderbuffer doesn't yet have a snapshot, add one now, it will
673 * be needed to decode the change we're currently processing.
675 if (!ReorderBufferXidHasBaseSnapshot(builder->reorder, xid))
677 /* only build a new snapshot if we don't have a prebuilt one */
678 if (builder->snapshot == NULL)
680 builder->snapshot = SnapBuildBuildSnapshot(builder, xid);
681 /* increase refcount for the snapshot builder */
682 SnapBuildSnapIncRefcount(builder->snapshot);
686 * Increase refcount for the transaction we're handing the snapshot
689 SnapBuildSnapIncRefcount(builder->snapshot);
690 ReorderBufferSetBaseSnapshot(builder->reorder, xid, lsn,
698 * Do CommandId/ComboCid handling after reading an xl_heap_new_cid record.
699 * This implies that a transaction has done some form of write to system
703 SnapBuildProcessNewCid(SnapBuild *builder, TransactionId xid,
704 XLogRecPtr lsn, xl_heap_new_cid *xlrec)
709 * we only log new_cid's if a catalog tuple was modified, so mark the
710 * transaction as containing catalog modifications
712 ReorderBufferXidSetCatalogChanges(builder->reorder, xid, lsn);
714 ReorderBufferAddNewTupleCids(builder->reorder, xlrec->top_xid, lsn,
715 xlrec->target_node, xlrec->target_tid,
716 xlrec->cmin, xlrec->cmax,
719 /* figure out new command id */
720 if (xlrec->cmin != InvalidCommandId &&
721 xlrec->cmax != InvalidCommandId)
722 cid = Max(xlrec->cmin, xlrec->cmax);
723 else if (xlrec->cmax != InvalidCommandId)
725 else if (xlrec->cmin != InvalidCommandId)
729 cid = InvalidCommandId; /* silence compiler */
730 elog(ERROR, "xl_heap_new_cid record without a valid CommandId");
733 ReorderBufferAddNewCommandId(builder->reorder, xid, lsn, cid + 1);
737 * Check whether `xid` is currently 'running'.
739 * Running transactions in our parlance are transactions which we didn't
740 * observe from the start so we can't properly decode their contents. They
741 * only exist after we freshly started from an < CONSISTENT snapshot.
744 SnapBuildTxnIsRunning(SnapBuild *builder, TransactionId xid)
746 Assert(builder->state < SNAPBUILD_CONSISTENT);
747 Assert(TransactionIdIsNormal(builder->running.xmin));
748 Assert(TransactionIdIsNormal(builder->running.xmax));
750 if (builder->running.xcnt &&
751 NormalTransactionIdFollows(xid, builder->running.xmin) &&
752 NormalTransactionIdPrecedes(xid, builder->running.xmax))
754 TransactionId *search =
755 bsearch(&xid, builder->running.xip, builder->running.xcnt_space,
756 sizeof(TransactionId), xidComparator);
760 Assert(*search == xid);
769 * Add a new Snapshot to all transactions we're decoding that currently are
770 * in-progress so they can see new catalog contents made by the transaction
771 * that just committed. This is necessary because those in-progress
772 * transactions will use the new catalog's contents from here on (at the very
773 * least everything they do needs to be compatible with newer catalog
777 SnapBuildDistributeNewCatalogSnapshot(SnapBuild *builder, XLogRecPtr lsn)
780 ReorderBufferTXN *txn;
783 * Iterate through all toplevel transactions. This can include
784 * subtransactions which we just don't yet know to be that, but that's
785 * fine, they will just get an unnecessary snapshot queued.
787 dlist_foreach(txn_i, &builder->reorder->toplevel_by_lsn)
789 txn = dlist_container(ReorderBufferTXN, node, txn_i.cur);
791 Assert(TransactionIdIsValid(txn->xid));
794 * If we don't have a base snapshot yet, there are no changes in this
795 * transaction which in turn implies we don't yet need a snapshot at
796 * all. We'll add a snapshot when the first change gets queued.
798 * NB: This works correctly even for subtransactions because
799 * ReorderBufferCommitChild() takes care to pass the parent the base
800 * snapshot, and while iterating the changequeue we'll get the change
803 if (!ReorderBufferXidHasBaseSnapshot(builder->reorder, txn->xid))
806 elog(DEBUG2, "adding a new snapshot to %u at %X/%X",
807 txn->xid, (uint32) (lsn >> 32), (uint32) lsn);
810 * increase the snapshot's refcount for the transaction we are handing
813 SnapBuildSnapIncRefcount(builder->snapshot);
814 ReorderBufferAddSnapshot(builder->reorder, txn->xid, lsn,
820 * Keep track of a new catalog changing transaction that has committed.
823 SnapBuildAddCommittedTxn(SnapBuild *builder, TransactionId xid)
825 Assert(TransactionIdIsValid(xid));
827 if (builder->committed.xcnt == builder->committed.xcnt_space)
829 builder->committed.xcnt_space = builder->committed.xcnt_space * 2 + 1;
831 elog(DEBUG1, "increasing space for committed transactions to %u",
832 (uint32) builder->committed.xcnt_space);
834 builder->committed.xip = repalloc(builder->committed.xip,
835 builder->committed.xcnt_space * sizeof(TransactionId));
839 * TODO: It might make sense to keep the array sorted here instead of
840 * doing it every time we build a new snapshot. On the other hand this
841 * gets called repeatedly when a transaction with subtransactions commits.
843 builder->committed.xip[builder->committed.xcnt++] = xid;
847 * Remove knowledge about transactions we treat as committed that are smaller
848 * than ->xmin. Those won't ever get checked via the ->committed array but via
849 * the clog machinery, so we don't need to waste memory on them.
852 SnapBuildPurgeCommittedTxn(SnapBuild *builder)
855 TransactionId *workspace;
856 int surviving_xids = 0;
859 if (!TransactionIdIsNormal(builder->xmin))
862 /* TODO: Neater algorithm than just copying and iterating? */
864 MemoryContextAlloc(builder->context,
865 builder->committed.xcnt * sizeof(TransactionId));
867 /* copy xids that still are interesting to workspace */
868 for (off = 0; off < builder->committed.xcnt; off++)
870 if (NormalTransactionIdPrecedes(builder->committed.xip[off],
874 workspace[surviving_xids++] = builder->committed.xip[off];
877 /* copy workspace back to persistent state */
878 memcpy(builder->committed.xip, workspace,
879 surviving_xids * sizeof(TransactionId));
881 elog(DEBUG3, "purged committed transactions from %u to %u, xmin: %u, xmax: %u",
882 (uint32) builder->committed.xcnt, (uint32) surviving_xids,
883 builder->xmin, builder->xmax);
884 builder->committed.xcnt = surviving_xids;
890 * Common logic for SnapBuildAbortTxn and SnapBuildCommitTxn dealing with
891 * keeping track of the amount of running transactions.
894 SnapBuildEndTxn(SnapBuild *builder, XLogRecPtr lsn, TransactionId xid)
896 if (builder->state == SNAPBUILD_CONSISTENT)
900 * NB: This handles subtransactions correctly even if we started from
901 * suboverflowed xl_running_xacts because we only keep track of toplevel
902 * transactions. Since the latter are always allocated before their
903 * subxids and since they end at the same time it's sufficient to deal
906 if (SnapBuildTxnIsRunning(builder, xid))
908 Assert(builder->running.xcnt > 0);
910 if (!--builder->running.xcnt)
913 * None of the originally running transactions is running anymore,
914 * so our incrementally built snapshot now is consistent.
917 (errmsg("logical decoding found consistent point at %X/%X",
918 (uint32) (lsn >> 32), (uint32) lsn),
919 errdetail("Transaction ID %u finished; no more running transactions.",
921 builder->state = SNAPBUILD_CONSISTENT;
927 * Abort a transaction, throw away all state we kept.
930 SnapBuildAbortTxn(SnapBuild *builder, XLogRecPtr lsn,
932 int nsubxacts, TransactionId *subxacts)
936 for (i = 0; i < nsubxacts; i++)
938 TransactionId subxid = subxacts[i];
940 SnapBuildEndTxn(builder, lsn, subxid);
943 SnapBuildEndTxn(builder, lsn, xid);
947 * Handle everything that needs to be done when a transaction commits
950 SnapBuildCommitTxn(SnapBuild *builder, XLogRecPtr lsn, TransactionId xid,
951 int nsubxacts, TransactionId *subxacts)
955 bool forced_timetravel = false;
956 bool sub_needs_timetravel = false;
957 bool top_needs_timetravel = false;
959 TransactionId xmax = xid;
962 * If we couldn't observe every change of a transaction because it was
963 * already running at the point we started to observe we have to assume it
964 * made catalog changes.
966 * This has the positive benefit that we afterwards have enough
967 * information to build an exportable snapshot that's usable by pg_dump et
970 if (builder->state < SNAPBUILD_CONSISTENT)
972 /* ensure that only commits after this are getting replayed */
973 if (builder->start_decoding_at <= lsn)
974 builder->start_decoding_at = lsn + 1;
977 * We could avoid treating !SnapBuildTxnIsRunning transactions as
978 * timetravel ones, but we want to be able to export a snapshot when
979 * we reached consistency.
981 forced_timetravel = true;
982 elog(DEBUG1, "forced to assume catalog changes for xid %u because it was running too early", xid);
985 for (nxact = 0; nxact < nsubxacts; nxact++)
987 TransactionId subxid = subxacts[nxact];
990 * make sure txn is not tracked in running txn's anymore, switch state
992 SnapBuildEndTxn(builder, lsn, subxid);
995 * If we're forcing timetravel we also need visibility information
996 * about subtransaction, so keep track of subtransaction's state.
998 if (forced_timetravel)
1000 SnapBuildAddCommittedTxn(builder, subxid);
1001 if (NormalTransactionIdFollows(subxid, xmax))
1006 * Add subtransaction to base snapshot if it did DDL, we don't distinguish
1007 * to toplevel transactions there.
1009 else if (ReorderBufferXidHasCatalogChanges(builder->reorder, subxid))
1011 sub_needs_timetravel = true;
1013 elog(DEBUG1, "found subtransaction %u:%u with catalog changes.",
1016 SnapBuildAddCommittedTxn(builder, subxid);
1018 if (NormalTransactionIdFollows(subxid, xmax))
1024 * Make sure toplevel txn is not tracked in running txn's anymore, switch
1025 * state to consistent if possible.
1027 SnapBuildEndTxn(builder, lsn, xid);
1029 if (forced_timetravel)
1031 elog(DEBUG2, "forced transaction %u to do timetravel.", xid);
1033 SnapBuildAddCommittedTxn(builder, xid);
1035 /* add toplevel transaction to base snapshot */
1036 else if (ReorderBufferXidHasCatalogChanges(builder->reorder, xid))
1038 elog(DEBUG2, "found top level transaction %u, with catalog changes!",
1041 top_needs_timetravel = true;
1042 SnapBuildAddCommittedTxn(builder, xid);
1044 else if (sub_needs_timetravel)
1046 /* mark toplevel txn as timetravel as well */
1047 SnapBuildAddCommittedTxn(builder, xid);
1050 /* if there's any reason to build a historic snapshot, do so now */
1051 if (forced_timetravel || top_needs_timetravel || sub_needs_timetravel)
1054 * Adjust xmax of the snapshot builder, we only do that for committed,
1055 * catalog modifying, transactions, everything else isn't interesting
1056 * for us since we'll never look at the respective rows.
1058 if (!TransactionIdIsValid(builder->xmax) ||
1059 TransactionIdFollowsOrEquals(xmax, builder->xmax))
1061 builder->xmax = xmax;
1062 TransactionIdAdvance(builder->xmax);
1066 * If we haven't built a complete snapshot yet there's no need to hand
1067 * it out, it wouldn't (and couldn't) be used anyway.
1069 if (builder->state < SNAPBUILD_FULL_SNAPSHOT)
1073 * Decrease the snapshot builder's refcount of the old snapshot, note
1074 * that it still will be used if it has been handed out to the
1075 * reorderbuffer earlier.
1077 if (builder->snapshot)
1078 SnapBuildSnapDecRefcount(builder->snapshot);
1080 builder->snapshot = SnapBuildBuildSnapshot(builder, xid);
1082 /* we might need to execute invalidations, add snapshot */
1083 if (!ReorderBufferXidHasBaseSnapshot(builder->reorder, xid))
1085 SnapBuildSnapIncRefcount(builder->snapshot);
1086 ReorderBufferSetBaseSnapshot(builder->reorder, xid, lsn,
1090 /* refcount of the snapshot builder for the new snapshot */
1091 SnapBuildSnapIncRefcount(builder->snapshot);
1093 /* add a new Snapshot to all currently running transactions */
1094 SnapBuildDistributeNewCatalogSnapshot(builder, lsn);
1098 /* record that we cannot export a general snapshot anymore */
1099 builder->committed.includes_all_transactions = false;
1104 /* -----------------------------------
1105 * Snapshot building functions dealing with xlog records
1106 * -----------------------------------
1110 * Process a running xacts record, and use its information to first build a
1111 * historic snapshot and later to release resources that aren't needed
1115 SnapBuildProcessRunningXacts(SnapBuild *builder, XLogRecPtr lsn, xl_running_xacts *running)
1117 ReorderBufferTXN *txn;
1120 * If we're not consistent yet, inspect the record to see whether it
1121 * allows to get closer to being consistent. If we are consistent, dump
1122 * our snapshot so others or we, after a restart, can use it.
1124 if (builder->state < SNAPBUILD_CONSISTENT)
1126 /* returns false if there's no point in performing cleanup just yet */
1127 if (!SnapBuildFindSnapshot(builder, lsn, running))
1131 SnapBuildSerialize(builder, lsn);
1134 * Update range of interesting xids based on the running xacts
1135 * information. We don't increase ->xmax using it, because once we are in
1136 * a consistent state we can do that ourselves and much more efficiently
1137 * so, because we only need to do it for catalog transactions since we
1138 * only ever look at those.
1140 * NB: Because of that xmax can be lower than xmin, because we only
1141 * increase xmax when a catalog modifying transaction commits. While odd
1142 * looking, it's correct and actually more efficient this way since we hit
1143 * fast paths in tqual.c.
1145 builder->xmin = running->oldestRunningXid;
1147 /* Remove transactions we don't need to keep track of anymore */
1148 SnapBuildPurgeCommittedTxn(builder);
1150 elog(DEBUG3, "xmin: %u, xmax: %u, oldestrunning: %u",
1151 builder->xmin, builder->xmax,
1152 running->oldestRunningXid);
1155 * Increase shared memory limits, so vacuum can work on tuples we prevented
1156 * from being pruned till now.
1158 LogicalIncreaseXminForSlot(lsn, running->oldestRunningXid);
1161 * Also tell the slot where we can restart decoding from. We don't want to
1162 * do that after every commit because changing that implies an fsync of
1163 * the logical slot's state file, so we only do it every time we see a
1164 * running xacts record.
1166 * Do so by looking for the oldest in progress transaction (determined by
1167 * the first LSN of any of its relevant records). Every transaction
1168 * remembers the last location we stored the snapshot to disk before its
1169 * beginning. That point is where we can restart from.
1173 * Can't know about a serialized snapshot's location if we're not
1176 if (builder->state < SNAPBUILD_CONSISTENT)
1179 txn = ReorderBufferGetOldestTXN(builder->reorder);
1182 * oldest ongoing txn might have started when we didn't yet serialize
1183 * anything because we hadn't reached a consistent state yet.
1185 if (txn != NULL && txn->restart_decoding_lsn != InvalidXLogRecPtr)
1186 LogicalIncreaseRestartDecodingForSlot(lsn, txn->restart_decoding_lsn);
1189 * No in-progress transaction, can reuse the last serialized snapshot if
1192 else if (txn == NULL &&
1193 builder->reorder->current_restart_decoding_lsn != InvalidXLogRecPtr &&
1194 builder->last_serialized_snapshot != InvalidXLogRecPtr)
1195 LogicalIncreaseRestartDecodingForSlot(lsn,
1196 builder->last_serialized_snapshot);
1201 * Build the start of a snapshot that's capable of decoding the catalog.
1203 * Helper function for SnapBuildProcessRunningXacts() while we're not yet
1206 * Returns true if there is a point in performing internal maintenance/cleanup
1207 * using the xl_running_xacts record.
1210 SnapBuildFindSnapshot(SnapBuild *builder, XLogRecPtr lsn, xl_running_xacts *running)
1213 * Build catalog decoding snapshot incrementally using information about
1214 * the currently running transactions. There are several ways to do that:
1216 * a) There were no running transactions when the xl_running_xacts record
1217 * was inserted, jump to CONSISTENT immediately. We might find such a
1218 * state while we were waiting for b) and c).
1220 * b) Wait for all toplevel transactions that were running to end. We
1221 * simply track the number of in-progress toplevel transactions and
1222 * lower it whenever one commits or aborts. When that number
1223 * (builder->running.xcnt) reaches zero, we can go from FULL_SNAPSHOT
1225 * NB: We need to search running.xip when seeing a transaction's end to
1226 * make sure it's a toplevel transaction and it's been one of the
1227 * initially running ones.
1228 * Interestingly, in contrast to HS, this allows us not to care about
1229 * subtransactions - and by extension suboverflowed xl_running_xacts -
1232 * c) This (in a previous run) or another decoding slot serialized a
1233 * snapshot to disk that we can use.
1238 * xl_running_xact record is older than what we can use, we might not have
1239 * all necessary catalog rows anymore.
1241 if (TransactionIdIsNormal(builder->initial_xmin_horizon) &&
1242 NormalTransactionIdPrecedes(running->oldestRunningXid,
1243 builder->initial_xmin_horizon))
1246 (errmsg_internal("skipping snapshot at %X/%X while building logical decoding snapshot, xmin horizon too low",
1247 (uint32) (lsn >> 32), (uint32) lsn),
1248 errdetail_internal("initial xmin horizon of %u vs the snapshot's %u",
1249 builder->initial_xmin_horizon, running->oldestRunningXid)));
1254 * a) No transactions were running, we can jump to consistent.
1256 * NB: We might have already started to incrementally assemble a snapshot,
1257 * so we need to be careful to deal with that.
1259 if (running->xcnt == 0)
1261 if (builder->start_decoding_at == InvalidXLogRecPtr ||
1262 builder->start_decoding_at <= lsn)
1263 /* can decode everything after this */
1264 builder->start_decoding_at = lsn + 1;
1266 /* As no transactions were running xmin/xmax can be trivially set. */
1267 builder->xmin = running->nextXid; /* < are finished */
1268 builder->xmax = running->nextXid; /* >= are running */
1270 /* so we can safely use the faster comparisons */
1271 Assert(TransactionIdIsNormal(builder->xmin));
1272 Assert(TransactionIdIsNormal(builder->xmax));
1274 /* no transactions running now */
1275 builder->running.xcnt = 0;
1276 builder->running.xmin = InvalidTransactionId;
1277 builder->running.xmax = InvalidTransactionId;
1279 builder->state = SNAPBUILD_CONSISTENT;
1282 (errmsg("logical decoding found consistent point at %X/%X",
1283 (uint32) (lsn >> 32), (uint32) lsn),
1284 errdetail("There are no running transactions.")));
1288 /* c) valid on disk state */
1289 else if (SnapBuildRestore(builder, lsn))
1291 /* there won't be any state to cleanup */
1296 * b) first encounter of a useable xl_running_xacts record. If we had
1297 * found one earlier we would either track running transactions (i.e.
1298 * builder->running.xcnt != 0) or be consistent (this function wouldn't
1301 else if (!builder->running.xcnt)
1306 * We only care about toplevel xids as those are the ones we
1307 * definitely see in the wal stream. As snapbuild.c tracks committed
1308 * instead of running transactions we don't need to know anything
1309 * about uncommitted subtransactions.
1313 * Start with an xmin/xmax that's correct for future, when all the
1314 * currently running transactions have finished. We'll update both
1315 * while waiting for the pending transactions to finish.
1317 builder->xmin = running->nextXid; /* < are finished */
1318 builder->xmax = running->nextXid; /* >= are running */
1320 /* so we can safely use the faster comparisons */
1321 Assert(TransactionIdIsNormal(builder->xmin));
1322 Assert(TransactionIdIsNormal(builder->xmax));
1324 builder->running.xcnt = running->xcnt;
1325 builder->running.xcnt_space = running->xcnt;
1326 builder->running.xip =
1327 MemoryContextAlloc(builder->context,
1328 builder->running.xcnt * sizeof(TransactionId));
1329 memcpy(builder->running.xip, running->xids,
1330 builder->running.xcnt * sizeof(TransactionId));
1332 /* sort so we can do a binary search */
1333 qsort(builder->running.xip, builder->running.xcnt,
1334 sizeof(TransactionId), xidComparator);
1336 builder->running.xmin = builder->running.xip[0];
1337 builder->running.xmax = builder->running.xip[running->xcnt - 1];
1339 /* makes comparisons cheaper later */
1340 TransactionIdRetreat(builder->running.xmin);
1341 TransactionIdAdvance(builder->running.xmax);
1343 builder->state = SNAPBUILD_FULL_SNAPSHOT;
1346 (errmsg("logical decoding found initial starting point at %X/%X",
1347 (uint32) (lsn >> 32), (uint32) lsn),
1348 errdetail_plural("%u transaction needs to finish.",
1349 "%u transactions need to finish.",
1350 builder->running.xcnt,
1351 (uint32) builder->running.xcnt)));
1354 * Iterate through all xids, wait for them to finish.
1356 * This isn't required for the correctness of decoding, but to allow
1357 * isolationtester to notice that we're currently waiting for
1360 for (off = 0; off < builder->running.xcnt; off++)
1362 TransactionId xid = builder->running.xip[off];
1365 * Upper layers should prevent that we ever need to wait on
1366 * ourselves. Check anyway, since failing to do so would either
1367 * result in an endless wait or an Assert() failure.
1369 if (TransactionIdIsCurrentTransactionId(xid))
1370 elog(ERROR, "waiting for ourselves");
1372 XactLockTableWait(xid, NULL, NULL, XLTW_None);
1375 /* nothing could have built up so far, so don't perform cleanup */
1380 * We already started to track running xacts and need to wait for all
1381 * in-progress ones to finish. We fall through to the normal processing of
1382 * records so incremental cleanup can be performed.
1388 /* -----------------------------------
1389 * Snapshot serialization support
1390 * -----------------------------------
1394 * We store current state of struct SnapBuild on disk in the following manner:
1396 * struct SnapBuildOnDisk;
1397 * TransactionId * running.xcnt_space;
1398 * TransactionId * committed.xcnt; (*not xcnt_space*)
1401 typedef struct SnapBuildOnDisk
1403 /* first part of this struct needs to be version independent */
1405 /* data not covered by checksum */
1409 /* data covered by checksum */
1411 /* version, in case we want to support pg_upgrade */
1413 /* how large is the on disk data, excluding the constant sized part */
1416 /* version dependent part */
1419 /* variable amount of TransactionIds follows */
1422 #define SnapBuildOnDiskConstantSize \
1423 offsetof(SnapBuildOnDisk, builder)
1424 #define SnapBuildOnDiskNotChecksummedSize \
1425 offsetof(SnapBuildOnDisk, version)
1427 #define SNAPBUILD_MAGIC 0x51A1E001
1428 #define SNAPBUILD_VERSION 2
1431 * Store/Load a snapshot from disk, depending on the snapshot builder's state.
1433 * Supposed to be used by external (i.e. not snapbuild.c) code that just read
1434 * a record that's a potential location for a serialized snapshot.
1437 SnapBuildSerializationPoint(SnapBuild *builder, XLogRecPtr lsn)
1439 if (builder->state < SNAPBUILD_CONSISTENT)
1440 SnapBuildRestore(builder, lsn);
1442 SnapBuildSerialize(builder, lsn);
1446 * Serialize the snapshot 'builder' at the location 'lsn' if it hasn't already
1447 * been done by another decoding process.
1450 SnapBuildSerialize(SnapBuild *builder, XLogRecPtr lsn)
1453 SnapBuildOnDisk *ondisk;
1456 char tmppath[MAXPGPATH];
1457 char path[MAXPGPATH];
1459 struct stat stat_buf;
1462 Assert(lsn != InvalidXLogRecPtr);
1463 Assert(builder->last_serialized_snapshot == InvalidXLogRecPtr ||
1464 builder->last_serialized_snapshot <= lsn);
1467 * no point in serializing if we cannot continue to work immediately after
1468 * restoring the snapshot
1470 if (builder->state < SNAPBUILD_CONSISTENT)
1474 * We identify snapshots by the LSN they are valid for. We don't need to
1475 * include timelines in the name as each LSN maps to exactly one timeline
1476 * unless the user used pg_resetxlog or similar. If a user did so, there's
1477 * no hope continuing to decode anyway.
1479 sprintf(path, "pg_logical/snapshots/%X-%X.snap",
1480 (uint32) (lsn >> 32), (uint32) lsn);
1483 * first check whether some other backend already has written the snapshot
1484 * for this LSN. It's perfectly fine if there's none, so we accept ENOENT
1485 * as a valid state. Everything else is an unexpected error.
1487 ret = stat(path, &stat_buf);
1489 if (ret != 0 && errno != ENOENT)
1491 (errmsg("could not stat file \"%s\": %m", path)));
1496 * somebody else has already serialized to this point, don't overwrite
1497 * but remember location, so we don't need to read old data again.
1499 * To be sure it has been synced to disk after the rename() from the
1500 * tempfile filename to the real filename, we just repeat the fsync.
1501 * That ought to be cheap because in most scenarios it should already
1502 * be safely on disk.
1504 fsync_fname(path, false);
1505 fsync_fname("pg_logical/snapshots", true);
1507 builder->last_serialized_snapshot = lsn;
1512 * there is an obvious race condition here between the time we stat(2) the
1513 * file and us writing the file. But we rename the file into place
1514 * atomically and all files created need to contain the same data anyway,
1515 * so this is perfectly fine, although a bit of a resource waste. Locking
1516 * seems like pointless complication.
1518 elog(DEBUG1, "serializing snapshot to %s", path);
1520 /* to make sure only we will write to this tempfile, include pid */
1521 sprintf(tmppath, "pg_logical/snapshots/%X-%X.snap.%u.tmp",
1522 (uint32) (lsn >> 32), (uint32) lsn, MyProcPid);
1525 * Unlink temporary file if it already exists, needs to have been before a
1526 * crash/error since we won't enter this function twice from within a
1527 * single decoding slot/backend and the temporary file contains the pid of
1528 * the current process.
1530 if (unlink(tmppath) != 0 && errno != ENOENT)
1532 (errcode_for_file_access(),
1533 errmsg("could not remove file \"%s\": %m", path)));
1535 needed_length = sizeof(SnapBuildOnDisk) +
1536 sizeof(TransactionId) * builder->running.xcnt_space +
1537 sizeof(TransactionId) * builder->committed.xcnt;
1539 ondisk_c = MemoryContextAllocZero(builder->context, needed_length);
1540 ondisk = (SnapBuildOnDisk *) ondisk_c;
1541 ondisk->magic = SNAPBUILD_MAGIC;
1542 ondisk->version = SNAPBUILD_VERSION;
1543 ondisk->length = needed_length;
1544 INIT_CRC32C(ondisk->checksum);
1545 COMP_CRC32C(ondisk->checksum,
1546 ((char *) ondisk) + SnapBuildOnDiskNotChecksummedSize,
1547 SnapBuildOnDiskConstantSize - SnapBuildOnDiskNotChecksummedSize);
1548 ondisk_c += sizeof(SnapBuildOnDisk);
1550 memcpy(&ondisk->builder, builder, sizeof(SnapBuild));
1551 /* NULL-ify memory-only data */
1552 ondisk->builder.context = NULL;
1553 ondisk->builder.snapshot = NULL;
1554 ondisk->builder.reorder = NULL;
1555 ondisk->builder.running.xip = NULL;
1556 ondisk->builder.committed.xip = NULL;
1558 COMP_CRC32C(ondisk->checksum,
1562 /* copy running xacts */
1563 sz = sizeof(TransactionId) * builder->running.xcnt_space;
1564 memcpy(ondisk_c, builder->running.xip, sz);
1565 COMP_CRC32C(ondisk->checksum, ondisk_c, sz);
1568 /* copy committed xacts */
1569 sz = sizeof(TransactionId) * builder->committed.xcnt;
1570 memcpy(ondisk_c, builder->committed.xip, sz);
1571 COMP_CRC32C(ondisk->checksum, ondisk_c, sz);
1574 FIN_CRC32C(ondisk->checksum);
1576 /* we have valid data now, open tempfile and write it there */
1577 fd = OpenTransientFile(tmppath,
1578 O_CREAT | O_EXCL | O_WRONLY | PG_BINARY,
1582 (errmsg("could not open file \"%s\": %m", path)));
1584 if ((write(fd, ondisk, needed_length)) != needed_length)
1586 CloseTransientFile(fd);
1588 (errcode_for_file_access(),
1589 errmsg("could not write to file \"%s\": %m", tmppath)));
1593 * fsync the file before renaming so that even if we crash after this we
1594 * have either a fully valid file or nothing.
1596 * TODO: Do the fsync() via checkpoints/restartpoints, doing it here has
1597 * some noticeable overhead since it's performed synchronously during
1600 if (pg_fsync(fd) != 0)
1602 CloseTransientFile(fd);
1604 (errcode_for_file_access(),
1605 errmsg("could not fsync file \"%s\": %m", tmppath)));
1607 CloseTransientFile(fd);
1609 fsync_fname("pg_logical/snapshots", true);
1612 * We may overwrite the work from some other backend, but that's ok, our
1613 * snapshot is valid as well, we'll just have done some superfluous work.
1615 if (rename(tmppath, path) != 0)
1618 (errcode_for_file_access(),
1619 errmsg("could not rename file \"%s\" to \"%s\": %m",
1623 /* make sure we persist */
1624 fsync_fname(path, false);
1625 fsync_fname("pg_logical/snapshots", true);
1628 * Now there's no way we can loose the dumped state anymore, remember this
1629 * as a serialization point.
1631 builder->last_serialized_snapshot = lsn;
1634 ReorderBufferSetRestartPoint(builder->reorder,
1635 builder->last_serialized_snapshot);
1639 * Restore a snapshot into 'builder' if previously one has been stored at the
1640 * location indicated by 'lsn'. Returns true if successful, false otherwise.
1643 SnapBuildRestore(SnapBuild *builder, XLogRecPtr lsn)
1645 SnapBuildOnDisk ondisk;
1647 char path[MAXPGPATH];
1652 /* no point in loading a snapshot if we're already there */
1653 if (builder->state == SNAPBUILD_CONSISTENT)
1656 sprintf(path, "pg_logical/snapshots/%X-%X.snap",
1657 (uint32) (lsn >> 32), (uint32) lsn);
1659 fd = OpenTransientFile(path, O_RDONLY | PG_BINARY, 0);
1661 if (fd < 0 && errno == ENOENT)
1665 (errcode_for_file_access(),
1666 errmsg("could not open file \"%s\": %m", path)));
1669 * Make sure the snapshot had been stored safely to disk, that's normally
1671 * Note that we do not need PANIC here, nobody will be able to use the
1672 * slot without fsyncing, and saving it won't succeed without an fsync()
1676 fsync_fname(path, false);
1677 fsync_fname("pg_logical/snapshots", true);
1680 /* read statically sized portion of snapshot */
1681 readBytes = read(fd, &ondisk, SnapBuildOnDiskConstantSize);
1682 if (readBytes != SnapBuildOnDiskConstantSize)
1684 CloseTransientFile(fd);
1686 (errcode_for_file_access(),
1687 errmsg("could not read file \"%s\", read %d of %d: %m",
1688 path, readBytes, (int) SnapBuildOnDiskConstantSize)));
1691 if (ondisk.magic != SNAPBUILD_MAGIC)
1693 (errmsg("snapbuild state file \"%s\" has wrong magic number: %u instead of %u",
1694 path, ondisk.magic, SNAPBUILD_MAGIC)));
1696 if (ondisk.version != SNAPBUILD_VERSION)
1698 (errmsg("snapbuild state file \"%s\" has unsupported version: %u instead of %u",
1699 path, ondisk.version, SNAPBUILD_VERSION)));
1701 INIT_CRC32C(checksum);
1702 COMP_CRC32C(checksum,
1703 ((char *) &ondisk) + SnapBuildOnDiskNotChecksummedSize,
1704 SnapBuildOnDiskConstantSize - SnapBuildOnDiskNotChecksummedSize);
1706 /* read SnapBuild */
1707 readBytes = read(fd, &ondisk.builder, sizeof(SnapBuild));
1708 if (readBytes != sizeof(SnapBuild))
1710 CloseTransientFile(fd);
1712 (errcode_for_file_access(),
1713 errmsg("could not read file \"%s\", read %d of %d: %m",
1714 path, readBytes, (int) sizeof(SnapBuild))));
1716 COMP_CRC32C(checksum, &ondisk.builder, sizeof(SnapBuild));
1718 /* restore running xacts information */
1719 sz = sizeof(TransactionId) * ondisk.builder.running.xcnt_space;
1720 ondisk.builder.running.xip = MemoryContextAllocZero(builder->context, sz);
1721 readBytes = read(fd, ondisk.builder.running.xip, sz);
1722 if (readBytes != sz)
1724 CloseTransientFile(fd);
1726 (errcode_for_file_access(),
1727 errmsg("could not read file \"%s\", read %d of %d: %m",
1728 path, readBytes, (int) sz)));
1730 COMP_CRC32C(checksum, ondisk.builder.running.xip, sz);
1732 /* restore committed xacts information */
1733 sz = sizeof(TransactionId) * ondisk.builder.committed.xcnt;
1734 ondisk.builder.committed.xip = MemoryContextAllocZero(builder->context, sz);
1735 readBytes = read(fd, ondisk.builder.committed.xip, sz);
1736 if (readBytes != sz)
1738 CloseTransientFile(fd);
1740 (errcode_for_file_access(),
1741 errmsg("could not read file \"%s\", read %d of %d: %m",
1742 path, readBytes, (int) sz)));
1744 COMP_CRC32C(checksum, ondisk.builder.committed.xip, sz);
1746 CloseTransientFile(fd);
1748 FIN_CRC32C(checksum);
1750 /* verify checksum of what we've read */
1751 if (!EQ_CRC32C(checksum, ondisk.checksum))
1753 (errcode_for_file_access(),
1754 errmsg("checksum mismatch for snapbuild state file \"%s\": is %u, should be %u",
1755 path, checksum, ondisk.checksum)));
1758 * ok, we now have a sensible snapshot here, figure out if it has more
1759 * information than we have.
1763 * We are only interested in consistent snapshots for now, comparing
1764 * whether one incomplete snapshot is more "advanced" seems to be
1765 * unnecessarily complex.
1767 if (ondisk.builder.state < SNAPBUILD_CONSISTENT)
1768 goto snapshot_not_interesting;
1771 * Don't use a snapshot that requires an xmin that we cannot guarantee to
1774 if (TransactionIdPrecedes(ondisk.builder.xmin, builder->initial_xmin_horizon))
1775 goto snapshot_not_interesting;
1778 /* ok, we think the snapshot is sensible, copy over everything important */
1779 builder->xmin = ondisk.builder.xmin;
1780 builder->xmax = ondisk.builder.xmax;
1781 builder->state = ondisk.builder.state;
1783 builder->committed.xcnt = ondisk.builder.committed.xcnt;
1784 /* We only allocated/stored xcnt, not xcnt_space xids ! */
1785 /* don't overwrite preallocated xip, if we don't have anything here */
1786 if (builder->committed.xcnt > 0)
1788 pfree(builder->committed.xip);
1789 builder->committed.xcnt_space = ondisk.builder.committed.xcnt;
1790 builder->committed.xip = ondisk.builder.committed.xip;
1792 ondisk.builder.committed.xip = NULL;
1794 builder->running.xcnt = ondisk.builder.running.xcnt;
1795 if (builder->running.xip)
1796 pfree(builder->running.xip);
1797 builder->running.xcnt_space = ondisk.builder.running.xcnt_space;
1798 builder->running.xip = ondisk.builder.running.xip;
1800 /* our snapshot is not interesting anymore, build a new one */
1801 if (builder->snapshot != NULL)
1803 SnapBuildSnapDecRefcount(builder->snapshot);
1805 builder->snapshot = SnapBuildBuildSnapshot(builder, InvalidTransactionId);
1806 SnapBuildSnapIncRefcount(builder->snapshot);
1808 ReorderBufferSetRestartPoint(builder->reorder, lsn);
1810 Assert(builder->state == SNAPBUILD_CONSISTENT);
1813 (errmsg("logical decoding found consistent point at %X/%X",
1814 (uint32) (lsn >> 32), (uint32) lsn),
1815 errdetail("Logical decoding will begin using saved snapshot.")));
1818 snapshot_not_interesting:
1819 if (ondisk.builder.running.xip != NULL)
1820 pfree(ondisk.builder.running.xip);
1821 if (ondisk.builder.committed.xip != NULL)
1822 pfree(ondisk.builder.committed.xip);
1827 * Remove all serialized snapshots that are not required anymore because no
1828 * slot can need them. This doesn't actually have to run during a checkpoint,
1829 * but it's a convenient point to schedule this.
1831 * NB: We run this during checkpoints even if logical decoding is disabled so
1832 * we cleanup old slots at some point after it got disabled.
1835 CheckPointSnapBuild(void)
1840 struct dirent *snap_de;
1841 char path[MAXPGPATH];
1844 * We start off with a minimum of the last redo pointer. No new replication
1845 * slot will start before that, so that's a safe upper bound for removal.
1847 redo = GetRedoRecPtr();
1849 /* now check for the restart ptrs from existing slots */
1850 cutoff = ReplicationSlotsComputeLogicalRestartLSN();
1852 /* don't start earlier than the restart lsn */
1856 snap_dir = AllocateDir("pg_logical/snapshots");
1857 while ((snap_de = ReadDir(snap_dir, "pg_logical/snapshots")) != NULL)
1862 struct stat statbuf;
1864 if (strcmp(snap_de->d_name, ".") == 0 ||
1865 strcmp(snap_de->d_name, "..") == 0)
1868 snprintf(path, MAXPGPATH, "pg_logical/snapshots/%s", snap_de->d_name);
1870 if (lstat(path, &statbuf) == 0 && !S_ISREG(statbuf.st_mode))
1872 elog(DEBUG1, "only regular files expected: %s", path);
1877 * temporary filenames from SnapBuildSerialize() include the LSN and
1878 * everything but are postfixed by .$pid.tmp. We can just remove them
1879 * the same as other files because there can be none that are
1880 * currently being written that are older than cutoff.
1882 * We just log a message if a file doesn't fit the pattern, it's
1883 * probably some editors lock/state file or similar...
1885 if (sscanf(snap_de->d_name, "%X-%X.snap", &hi, &lo) != 2)
1888 (errmsg("could not parse file name \"%s\"", path)));
1892 lsn = ((uint64) hi) << 32 | lo;
1894 /* check whether we still need it */
1895 if (lsn < cutoff || cutoff == InvalidXLogRecPtr)
1897 elog(DEBUG1, "removing snapbuild snapshot %s", path);
1900 * It's not particularly harmful, though strange, if we can't
1901 * remove the file here. Don't prevent the checkpoint from
1902 * completing, that'd be a cure worse than the disease.
1904 if (unlink(path) < 0)
1907 (errcode_for_file_access(),
1908 errmsg("could not remove file \"%s\": %m",