From 7c4f52409a8c7d85ed169bbbc1f6092274d03920 Mon Sep 17 00:00:00 2001 From: Peter Eisentraut Date: Thu, 23 Mar 2017 08:36:36 -0400 Subject: [PATCH] Logical replication support for initial data copy Add functionality for a new subscription to copy the initial data in the tables and then sync with the ongoing apply process. For the copying, add a new internal COPY option to have the COPY source data provided by a callback function. The initial data copy works on the subscriber by receiving COPY data from the publisher and then providing it locally into a COPY that writes to the destination table. A WAL receiver can now execute full SQL commands. This is used here to obtain information about tables and publications. Several new options were added to CREATE and ALTER SUBSCRIPTION to control whether and when initial table syncing happens. Change pg_dump option --no-create-subscription-slots to --no-subscription-connect and use the new CREATE SUBSCRIPTION ... NOCONNECT option for that. Author: Petr Jelinek Tested-by: Erik Rijkers --- contrib/file_fdw/file_fdw.c | 5 +- doc/src/sgml/catalogs.sgml | 78 ++ doc/src/sgml/config.sgml | 25 + doc/src/sgml/logical-replication.sgml | 55 +- doc/src/sgml/monitoring.sgml | 9 +- doc/src/sgml/protocol.sgml | 9 +- doc/src/sgml/ref/alter_subscription.sgml | 50 +- doc/src/sgml/ref/create_subscription.sgml | 38 + doc/src/sgml/ref/pg_dump.sgml | 15 +- src/backend/catalog/Makefile | 1 + src/backend/catalog/heap.c | 6 + src/backend/catalog/pg_publication.c | 4 +- src/backend/catalog/pg_subscription.c | 282 ++++++ src/backend/catalog/system_views.sql | 1 + src/backend/commands/copy.c | 23 +- src/backend/commands/subscriptioncmds.c | 468 ++++++++-- src/backend/parser/gram.y | 39 +- src/backend/postmaster/pgstat.c | 6 + .../libpqwalreceiver/libpqwalreceiver.c | 203 ++++- src/backend/replication/logical/Makefile | 2 +- src/backend/replication/logical/launcher.c | 130 ++- src/backend/replication/logical/relation.c | 7 + src/backend/replication/logical/snapbuild.c | 85 +- src/backend/replication/logical/tablesync.c | 840 ++++++++++++++++++ src/backend/replication/logical/worker.c | 203 ++++- src/backend/replication/repl_gram.y | 32 +- src/backend/replication/repl_scanner.l | 5 +- src/backend/replication/walsender.c | 104 ++- src/backend/tcop/postgres.c | 5 +- src/backend/utils/adt/misc.c | 20 + src/backend/utils/cache/syscache.c | 14 +- src/backend/utils/misc/guc.c | 12 + src/bin/pg_dump/pg_backup.h | 2 +- src/bin/pg_dump/pg_dump.c | 9 +- src/bin/pg_dump/t/002_pg_dump.pl | 2 +- src/include/catalog/catversion.h | 2 +- src/include/catalog/indexing.h | 7 +- src/include/catalog/pg_proc.h | 5 +- src/include/catalog/pg_subscription_rel.h | 78 ++ src/include/commands/copy.h | 5 +- src/include/nodes/nodes.h | 1 + src/include/nodes/parsenodes.h | 13 + src/include/nodes/replnodes.h | 9 + src/include/parser/kwlist.h | 1 + src/include/pgstat.h | 4 +- src/include/replication/logical.h | 13 +- src/include/replication/logicallauncher.h | 1 + src/include/replication/snapbuild.h | 1 + src/include/replication/walreceiver.h | 67 +- src/include/replication/walsender.h | 12 +- src/include/replication/worker_internal.h | 30 +- src/include/utils/syscache.h | 1 + src/test/regress/expected/object_address.out | 3 +- src/test/regress/expected/rules.out | 3 +- src/test/regress/expected/sanity_check.out | 1 + src/test/regress/expected/subscription.out | 45 +- src/test/regress/sql/object_address.sql | 2 +- src/test/regress/sql/subscription.sql | 11 +- src/test/subscription/t/001_rep_changes.pl | 36 +- 
src/test/subscription/t/002_types.pl | 6 + src/test/subscription/t/003_constraints.pl | 2 +- src/test/subscription/t/004_sync.pl | 159 ++++ 62 files changed, 2966 insertions(+), 341 deletions(-) create mode 100644 src/backend/replication/logical/tablesync.c create mode 100644 src/include/catalog/pg_subscription_rel.h create mode 100644 src/test/subscription/t/004_sync.pl diff --git a/contrib/file_fdw/file_fdw.c b/contrib/file_fdw/file_fdw.c index 735b79484c..277639f6e9 100644 --- a/contrib/file_fdw/file_fdw.c +++ b/contrib/file_fdw/file_fdw.c @@ -662,6 +662,7 @@ fileBeginForeignScan(ForeignScanState *node, int eflags) node->ss.ss_currentRelation, filename, is_program, + NULL, NIL, options); @@ -737,6 +738,7 @@ fileReScanForeignScan(ForeignScanState *node) node->ss.ss_currentRelation, festate->filename, festate->is_program, + NULL, NIL, festate->options); } @@ -1100,7 +1102,8 @@ file_acquire_sample_rows(Relation onerel, int elevel, /* * Create CopyState from FDW options. */ - cstate = BeginCopyFrom(NULL, onerel, filename, is_program, NIL, options); + cstate = BeginCopyFrom(NULL, onerel, filename, is_program, NULL, NIL, + options); /* * Use per-tuple memory context to prevent leak of memory used to read diff --git a/doc/src/sgml/catalogs.sgml b/doc/src/sgml/catalogs.sgml index df0435c3f0..228ec78031 100644 --- a/doc/src/sgml/catalogs.sgml +++ b/doc/src/sgml/catalogs.sgml @@ -300,6 +300,11 @@ logical replication subscriptions + + pg_subscription_rel + relation state for subscriptions + + pg_tablespace tablespaces within this database cluster @@ -6418,6 +6423,79 @@ + + <structname>pg_subscription_rel</structname> + + + pg_subscription_rel + + + + The catalog pg_subscription_rel contains the + state for each replicated relation in each subscription. This is a + many-to-many mapping. + + + + This catalog only contains tables known to the subscription after running + either CREATE SUBSCRIPTION or + ALTER SUBSCRIPTION ... REFRESH. + + + + <structname>pg_subscription_rel</structname> Columns + + + + + Name + Type + References + Description + + + + + + srsubid + oid + pg_subscription.oid + Reference to subscription + + + + srrelid + oid + pg_class.oid + Reference to relation + + + + srsubstate + char + + + State code: + i = initialize, + d = data is being copied, + s = synchronized, + r = ready (normal replication) + + + + + srsublsn + pg_lsn + + + End LSN for s and r states. + + + + +
+
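+
+   As an illustration of the catalog just described (the column names are
+   those listed above; pg_subscription.subname and
+   pg_class.relname supply readable labels), the per-table
+   synchronization state can be inspected with a query along these lines:
+
+     SELECT s.subname, c.relname, sr.srsubstate, sr.srsublsn
+       FROM pg_subscription_rel sr
+       JOIN pg_subscription s ON s.oid = sr.srsubid
+       JOIN pg_class c ON c.oid = sr.srrelid;
+
+   A table appears here with state i or d while its
+   initial copy is in progress, and r once it has reached
+   normal replication.
+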
+ <structname>pg_tablespace</structname> diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml index b379b67b30..2de3540def 100644 --- a/doc/src/sgml/config.sgml +++ b/doc/src/sgml/config.sgml @@ -3449,6 +3449,31 @@ ANY num_sync ( + max_sync_workers_per_subscription (integer) + + max_sync_workers_per_subscription configuration parameter + + + + + Maximum number of synchronization workers per subscription. This + parameter controls the amount of paralelism of the initial data copy + during the subscription initialization or when new tables are added. + + + Currently, there can be only one synchronization worker per table. + + + The synchronization workers are taken from the pool defined by + max_logical_replication_workers. + + + The default value is 2. + + + + diff --git a/doc/src/sgml/logical-replication.sgml b/doc/src/sgml/logical-replication.sgml index 44cd78563d..48db9cd08b 100644 --- a/doc/src/sgml/logical-replication.sgml +++ b/doc/src/sgml/logical-replication.sgml @@ -24,9 +24,11 @@ - Logical replication sends changes on the publisher to the subscriber as - they occur in real-time. The subscriber applies the data in the same order - as the publisher so that transactional consistency is guaranteed for + Logical replication of a table typically starts with a taking a snapshot + of the data on the publisher database and copying that to the subscriber. + Once that is done, the changes on the publisher are sent to the subscriber + as they occur in real-time. The subscriber applies the data in the same + order as the publisher so that transactional consistency is guaranteed for publications within a single subscription. This method of data replication is sometimes referred to as transactional replication. @@ -159,7 +161,9 @@ Each subscription will receive changes via one replication slot (see - ). + ). Additional temporary + replication slots may be required for the initial data synchronization + of pre-existing table data. @@ -264,9 +268,25 @@ to replica, which produces the usual effects on triggers and constraints. + + + Initial Snapshot + + The initial data in existing subscribed tables are snapshotted and + copied in a parallel instance of a special kind of apply process. + This process will create its own temporary replication slot and + copy the existing data. Once existing data is copied, the worker + enters synchronization mode, which ensures that the table is brought + up to a synchronized state with the main apply process by streaming + any changes that happened during the initial data copy using standard + logical replication. Once the synchronization is done, the control + of the replication of the table is given back to the main apply + process where the replication continues as normal. + + - + Monitoring @@ -287,7 +307,9 @@ Normally, there is a single apply process running for an enabled subscription. A disabled subscription or a crashed subscription will have - zero rows in this view. + zero rows in this view. If the initial data synchronization of any + table is in progress, there will be additional workers for the tables + being synchronized. @@ -337,10 +359,11 @@ On the publisher side, wal_level must be set to logical, and max_replication_slots - must be set to at least the number of subscriptions expected to connect. - And max_wal_senders should be set to at least the same - as max_replication_slots plus the number of physical replicas - that are connected at the same time. 
+ must be set to at least the number of subscriptions expected to connect, + plus some reserve for table synchronization. And + max_wal_senders should be set to at least the same as + max_replication_slots plus the number of physical + replicas that are connected at the same time. @@ -348,9 +371,9 @@ to be set. In this case it should be set to at least the number of subscriptions that will be added to the subscriber. max_logical_replication_workers must be set to at - least the number of subscriptions. Additionally the - max_worker_processes may need to be adjusted to - accommodate for replication workers, at least + least the number of subscriptions, again plus some reserve for the table + synchronization. Additionally the max_worker_processes + may need to be adjusted to accommodate for replication workers, at least (max_logical_replication_workers + 1). Note that some extensions and parallel queries also take worker slots from max_worker_processes. @@ -393,8 +416,10 @@ CREATE SUBSCRIPTION mysub CONNECTION 'dbname=foo host=bar user=repuser' PUBLICAT - The above will start the replication process of changes to - users and departments tables. + The above will start the replication process, which synchronizes the + initial table contents of the tables users and + departments and then starts replicating + incremental changes to those tables. diff --git a/doc/src/sgml/monitoring.sgml b/doc/src/sgml/monitoring.sgml index dcb2d3303c..eb6f486677 100644 --- a/doc/src/sgml/monitoring.sgml +++ b/doc/src/sgml/monitoring.sgml @@ -1863,6 +1863,12 @@ SELECT pid, wait_event_type, wait_event FROM pg_stat_activity WHERE wait_event i integer Process ID of the subscription worker process
+ + relid + Oid + OID of the relation that the worker is synchronizing; null for the + main apply worker + received_lsn pg_lsn @@ -1899,7 +1905,8 @@ SELECT pid, wait_event_type, wait_event FROM pg_stat_activity WHERE wait_event i The pg_stat_subscription view will contain one row per subscription for main worker (with null PID if the worker is - not running). + not running), and additional rows for workers handling the initial data + copy of the subscribed tables. diff --git a/doc/src/sgml/protocol.sgml b/doc/src/sgml/protocol.sgml index 244e381de9..48ca414031 100644 --- a/doc/src/sgml/protocol.sgml +++ b/doc/src/sgml/protocol.sgml @@ -1487,7 +1487,7 @@ The commands accepted in walsender mode are: - CREATE_REPLICATION_SLOT slot_name [ TEMPORARY ] { PHYSICAL [ RESERVE_WAL ] | LOGICAL output_plugin [ EXPORT_SNAPSHOT | NOEXPORT_SNAPSHOT ] } + CREATE_REPLICATION_SLOT slot_name [ TEMPORARY ] { PHYSICAL [ RESERVE_WAL ] | LOGICAL output_plugin [ EXPORT_SNAPSHOT | NOEXPORT_SNAPSHOT | USE_SNAPSHOT ] } CREATE_REPLICATION_SLOT @@ -1542,12 +1542,17 @@ The commands accepted in walsender mode are: EXPORT_SNAPSHOT NOEXPORT_SNAPSHOT + USE_SNAPSHOT Decides what to do with the snapshot created during logical slot initialization. EXPORT_SNAPSHOT, which is the default, will export the snapshot for use in other sessions. This option can't - be used inside a transaction. NOEXPORT_SNAPSHOT will + be used inside a transaction. USE_SNAPSHOT will use the + snapshot for the current transaction executing the command. This + option must be used in a transaction, and + CREATE_REPLICATION_SLOT must be the first command + run in that transaction. Finally, NOEXPORT_SNAPSHOT will just use the snapshot for logical decoding as normal but won't do anything else with it. diff --git a/doc/src/sgml/ref/alter_subscription.sgml b/doc/src/sgml/ref/alter_subscription.sgml index 5e18e2ff6c..6f94247b92 100644 --- a/doc/src/sgml/ref/alter_subscription.sgml +++ b/doc/src/sgml/ref/alter_subscription.sgml @@ -21,15 +21,21 @@ PostgreSQL documentation -ALTER SUBSCRIPTION name WITH ( option [, ... ] ) ] +ALTER SUBSCRIPTION name WITH ( suboption [, ... ] ) ] -where option can be: +where suboption can be: - SLOT NAME = slot_name + SLOT NAME = slot_name + +ALTER SUBSCRIPTION name SET PUBLICATION publication_name [, ...] { REFRESH WITH ( puboption [, ... ] ) | NOREFRESH } +ALTER SUBSCRIPTION name REFRESH PUBLICATION WITH ( puboption [, ... ] ) + +where puboption can be: + + COPY DATA | NOCOPY DATA ALTER SUBSCRIPTION name OWNER TO { new_owner | CURRENT_USER | SESSION_USER } ALTER SUBSCRIPTION name CONNECTION 'conninfo' -ALTER SUBSCRIPTION name SET PUBLICATION publication_name [, ...] ALTER SUBSCRIPTION name ENABLE ALTER SUBSCRIPTION name DISABLE @@ -65,7 +71,6 @@ ALTER SUBSCRIPTION name DISABLE CONNECTION 'conninfo' - SET PUBLICATION publication_name SLOT NAME = slot_name @@ -76,6 +81,40 @@ ALTER SUBSCRIPTION name DISABLE + + SET PUBLICATION publication_name + + + Changes list of subscribed publications. See + for more information. + + + When REFRESH is specified, this command will also + act like REFRESH PUBLICATION. When + NOREFRESH is specified, the comamnd will not try to + refresh table information. + + + + + + REFRESH PUBLICATION + + + Fetch missing table information from publisher. This will start + replication of tables that were added to the subscribed-to publications + since the last invocation of REFRESH PUBLICATION or + since CREATE SUBSCRIPTION. 
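+
+     For example, assuming a subscription named mysub,
+     tables newly added to its publications can be picked up, and their
+     existing data copied, with:
+
+       ALTER SUBSCRIPTION mysub REFRESH PUBLICATION WITH (COPY DATA);
+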
+ + + The COPY DATA and NOCOPY DATA + options specify if the existing data in the publications that are being + subscribed to should be copied. COPY DATA is the + default. + + + + ENABLE @@ -95,6 +134,7 @@ ALTER SUBSCRIPTION name DISABLE + diff --git a/doc/src/sgml/ref/create_subscription.sgml b/doc/src/sgml/ref/create_subscription.sgml index e200076700..8f3c30b9b0 100644 --- a/doc/src/sgml/ref/create_subscription.sgml +++ b/doc/src/sgml/ref/create_subscription.sgml @@ -31,6 +31,8 @@ CREATE SUBSCRIPTION subscription_nameslot_name + | COPY DATA | NOCOPY DATA + | NOCONNECT @@ -132,6 +134,42 @@ CREATE SUBSCRIPTION subscription_name + + + COPY DATA + NOCOPY DATA + + + Specifies if the existing data in the publications that are being + subscribed to should be copied once the replication starts. + COPY DATA is the default. + + + + + + NOCONNECT + + + Instructs CREATE SUBSCRIPTION to skip the initial + connection to the provider. This will change default values of other + options to DISABLED, + NOCREATE SLOT, and NOCOPY DATA. + + + It's not allowed to combine NOCONNECT and + ENABLED, CREATE SLOT, or + COPY DATA. + + + Since no connection is made when this option is specified, the tables + are not subscribed, so after you enable the subscription nothing will + be replicated. It is required to run + ALTER SUBSCRIPTION ... REFRESH PUBLICATION in order for + tables to be subscribed. + + + diff --git a/doc/src/sgml/ref/pg_dump.sgml b/doc/src/sgml/ref/pg_dump.sgml index bb32fb12e0..4f19b89232 100644 --- a/doc/src/sgml/ref/pg_dump.sgml +++ b/doc/src/sgml/ref/pg_dump.sgml @@ -799,22 +799,23 @@ PostgreSQL documentation - + - When dumping logical replication subscriptions, - generate CREATE SUBSCRIPTION commands that do not - create the remote replication slot. That way, the dump can be - restored without requiring network access to the remote servers. + Do not dump security labels. - + - Do not dump security labels. + When dumping logical replication subscriptions, + generate CREATE SUBSCRIPTION commands that do not + make remote connections for creating replication slot or initial table + copy. That way, the dump can be restored without requiring network + access to the remote servers. diff --git a/src/backend/catalog/Makefile b/src/backend/catalog/Makefile index 31368585d2..159cab5c18 100644 --- a/src/backend/catalog/Makefile +++ b/src/backend/catalog/Makefile @@ -44,6 +44,7 @@ POSTGRES_BKI_SRCS = $(addprefix $(top_srcdir)/src/include/catalog/,\ pg_default_acl.h pg_init_privs.h pg_seclabel.h pg_shseclabel.h \ pg_collation.h pg_partitioned_table.h pg_range.h pg_transform.h \ pg_sequence.h pg_publication.h pg_publication_rel.h pg_subscription.h \ + pg_subscription_rel.h toasting.h indexing.h \ toasting.h indexing.h \ ) diff --git a/src/backend/catalog/heap.c b/src/backend/catalog/heap.c index 41c0056556..d49dcdc015 100644 --- a/src/backend/catalog/heap.c +++ b/src/backend/catalog/heap.c @@ -52,6 +52,7 @@ #include "catalog/pg_opclass.h" #include "catalog/pg_partitioned_table.h" #include "catalog/pg_statistic.h" +#include "catalog/pg_subscription_rel.h" #include "catalog/pg_tablespace.h" #include "catalog/pg_type.h" #include "catalog/pg_type_fn.h" @@ -1831,6 +1832,11 @@ heap_drop_with_catalog(Oid relid) */ relation_close(rel, NoLock); + /* + * Remove any associated relation synchronization states. 
+ */ + RemoveSubscriptionRel(InvalidOid, relid); + /* * Forget any ON COMMIT action for the rel */ diff --git a/src/backend/catalog/pg_publication.c b/src/backend/catalog/pg_publication.c index 0f784690ce..9330e2380a 100644 --- a/src/backend/catalog/pg_publication.c +++ b/src/backend/catalog/pg_publication.c @@ -221,8 +221,8 @@ GetPublicationRelations(Oid pubid) BTEqualStrategyNumber, F_OIDEQ, ObjectIdGetDatum(pubid)); - scan = systable_beginscan(pubrelsrel, PublicationRelMapIndexId, true, - NULL, 1, &scankey); + scan = systable_beginscan(pubrelsrel, PublicationRelPrrelidPrpubidIndexId, + true, NULL, 1, &scankey); result = NIL; while (HeapTupleIsValid(tup = systable_getnext(scan))) diff --git a/src/backend/catalog/pg_subscription.c b/src/backend/catalog/pg_subscription.c index 20fdd6a54f..e420ec14d2 100644 --- a/src/backend/catalog/pg_subscription.c +++ b/src/backend/catalog/pg_subscription.c @@ -19,15 +19,20 @@ #include "access/genam.h" #include "access/heapam.h" #include "access/htup_details.h" +#include "access/xact.h" +#include "catalog/indexing.h" #include "catalog/pg_type.h" #include "catalog/pg_subscription.h" +#include "catalog/pg_subscription_rel.h" #include "nodes/makefuncs.h" #include "utils/array.h" #include "utils/builtins.h" #include "utils/fmgroids.h" +#include "utils/pg_lsn.h" +#include "utils/rel.h" #include "utils/syscache.h" @@ -206,3 +211,280 @@ textarray_to_stringlist(ArrayType *textarray) return res; } + +/* + * Set the state of a subscription table. + */ +Oid +SetSubscriptionRelState(Oid subid, Oid relid, char state, + XLogRecPtr sublsn) +{ + Relation rel; + HeapTuple tup; + Oid subrelid; + bool nulls[Natts_pg_subscription_rel]; + Datum values[Natts_pg_subscription_rel]; + + /* Prevent concurrent changes. */ + rel = heap_open(SubscriptionRelRelationId, ShareRowExclusiveLock); + + /* Try finding existing mapping. */ + tup = SearchSysCacheCopy2(SUBSCRIPTIONRELMAP, + ObjectIdGetDatum(relid), + ObjectIdGetDatum(subid)); + + /* + * If the record for given table does not exist yet create new + * record, otherwise update the existing one. + */ + if (!HeapTupleIsValid(tup)) + { + /* Form the tuple. */ + memset(values, 0, sizeof(values)); + memset(nulls, false, sizeof(nulls)); + values[Anum_pg_subscription_rel_srsubid - 1] = ObjectIdGetDatum(subid); + values[Anum_pg_subscription_rel_srrelid - 1] = ObjectIdGetDatum(relid); + values[Anum_pg_subscription_rel_srsubstate - 1] = CharGetDatum(state); + if (sublsn != InvalidXLogRecPtr) + values[Anum_pg_subscription_rel_srsublsn - 1] = LSNGetDatum(sublsn); + else + nulls[Anum_pg_subscription_rel_srsublsn - 1] = true; + + tup = heap_form_tuple(RelationGetDescr(rel), values, nulls); + + /* Insert tuple into catalog. */ + subrelid = CatalogTupleInsert(rel, tup); + + heap_freetuple(tup); + } + else + { + bool replaces[Natts_pg_subscription_rel]; + + /* Update the tuple. */ + memset(values, 0, sizeof(values)); + memset(nulls, false, sizeof(nulls)); + memset(replaces, false, sizeof(replaces)); + + replaces[Anum_pg_subscription_rel_srsubstate - 1] = true; + values[Anum_pg_subscription_rel_srsubstate - 1] = CharGetDatum(state); + + replaces[Anum_pg_subscription_rel_srsublsn - 1] = true; + if (sublsn != InvalidXLogRecPtr) + values[Anum_pg_subscription_rel_srsublsn - 1] = LSNGetDatum(sublsn); + else + nulls[Anum_pg_subscription_rel_srsublsn - 1] = true; + + tup = heap_modify_tuple(tup, RelationGetDescr(rel), values, nulls, + replaces); + + /* Update the catalog. 
*/ + CatalogTupleUpdate(rel, &tup->t_self, tup); + + subrelid = HeapTupleGetOid(tup); + } + + /* Cleanup. */ + heap_close(rel, NoLock); + + return subrelid; +} + +/* + * Get state of subscription table. + * + * Returns SUBREL_STATE_UNKNOWN when not found and missing_ok is true. + */ +char +GetSubscriptionRelState(Oid subid, Oid relid, XLogRecPtr *sublsn, + bool missing_ok) +{ + Relation rel; + HeapTuple tup; + char substate; + bool isnull; + Datum d; + + rel = heap_open(SubscriptionRelRelationId, AccessShareLock); + + /* Try finding the mapping. */ + tup = SearchSysCache2(SUBSCRIPTIONRELMAP, + ObjectIdGetDatum(relid), + ObjectIdGetDatum(subid)); + + if (!HeapTupleIsValid(tup)) + { + if (missing_ok) + { + heap_close(rel, AccessShareLock); + *sublsn = InvalidXLogRecPtr; + return SUBREL_STATE_UNKNOWN; + } + + elog(ERROR, "subscription table %u in subscription %u does not exist", + relid, subid); + } + + /* Get the state. */ + d = SysCacheGetAttr(SUBSCRIPTIONRELMAP, tup, + Anum_pg_subscription_rel_srsubstate, &isnull); + Assert(!isnull); + substate = DatumGetChar(d); + d = SysCacheGetAttr(SUBSCRIPTIONRELMAP, tup, + Anum_pg_subscription_rel_srsublsn, &isnull); + if (isnull) + *sublsn = InvalidXLogRecPtr; + else + *sublsn = DatumGetLSN(d); + + /* Cleanup */ + ReleaseSysCache(tup); + heap_close(rel, AccessShareLock); + + return substate; +} + +/* + * Drop subscription relation mapping. These can be for a particular + * subscription, or for a particular relation, or both. + */ +void +RemoveSubscriptionRel(Oid subid, Oid relid) +{ + Relation rel; + HeapScanDesc scan; + ScanKeyData skey[2]; + HeapTuple tup; + int nkeys = 0; + + /* Prevent concurrent changes (see SetSubscriptionRelState()). */ + rel = heap_open(SubscriptionRelRelationId, ShareRowExclusiveLock); + + if (OidIsValid(subid)) + { + ScanKeyInit(&skey[nkeys++], + Anum_pg_subscription_rel_srsubid, + BTEqualStrategyNumber, + F_OIDEQ, + ObjectIdGetDatum(subid)); + } + + if (OidIsValid(relid)) + { + ScanKeyInit(&skey[nkeys++], + Anum_pg_subscription_rel_srrelid, + BTEqualStrategyNumber, + F_OIDEQ, + ObjectIdGetDatum(relid)); + } + + /* Do the search and delete what we found. */ + scan = heap_beginscan_catalog(rel, nkeys, skey); + while (HeapTupleIsValid(tup = heap_getnext(scan, ForwardScanDirection))) + { + simple_heap_delete(rel, &tup->t_self); + } + heap_endscan(scan); + + heap_close(rel, ShareRowExclusiveLock); +} + + +/* + * Get all relations for subscription. + * + * Returned list is palloced in current memory context. + */ +List * +GetSubscriptionRelations(Oid subid) +{ + List *res = NIL; + Relation rel; + HeapTuple tup; + int nkeys = 0; + ScanKeyData skey[2]; + SysScanDesc scan; + + rel = heap_open(SubscriptionRelRelationId, AccessShareLock); + + ScanKeyInit(&skey[nkeys++], + Anum_pg_subscription_rel_srsubid, + BTEqualStrategyNumber, F_OIDEQ, + ObjectIdGetDatum(subid)); + + scan = systable_beginscan(rel, InvalidOid, false, + NULL, nkeys, skey); + + while (HeapTupleIsValid(tup = systable_getnext(scan))) + { + Form_pg_subscription_rel subrel; + SubscriptionRelState *relstate; + + subrel = (Form_pg_subscription_rel) GETSTRUCT(tup); + + relstate = (SubscriptionRelState *)palloc(sizeof(SubscriptionRelState)); + relstate->relid = subrel->srrelid; + relstate->state = subrel->srsubstate; + relstate->lsn = subrel->srsublsn; + + res = lappend(res, relstate); + } + + /* Cleanup */ + systable_endscan(scan); + heap_close(rel, AccessShareLock); + + return res; +} + +/* + * Get all relations for subscription that are not in a ready state. 
+ * + * Returned list is palloced in current memory context. + */ +List * +GetSubscriptionNotReadyRelations(Oid subid) +{ + List *res = NIL; + Relation rel; + HeapTuple tup; + int nkeys = 0; + ScanKeyData skey[2]; + SysScanDesc scan; + + rel = heap_open(SubscriptionRelRelationId, AccessShareLock); + + ScanKeyInit(&skey[nkeys++], + Anum_pg_subscription_rel_srsubid, + BTEqualStrategyNumber, F_OIDEQ, + ObjectIdGetDatum(subid)); + + ScanKeyInit(&skey[nkeys++], + Anum_pg_subscription_rel_srsubstate, + BTEqualStrategyNumber, F_CHARNE, + CharGetDatum(SUBREL_STATE_READY)); + + scan = systable_beginscan(rel, InvalidOid, false, + NULL, nkeys, skey); + + while (HeapTupleIsValid(tup = systable_getnext(scan))) + { + Form_pg_subscription_rel subrel; + SubscriptionRelState *relstate; + + subrel = (Form_pg_subscription_rel) GETSTRUCT(tup); + + relstate = (SubscriptionRelState *)palloc(sizeof(SubscriptionRelState)); + relstate->relid = subrel->srrelid; + relstate->state = subrel->srsubstate; + relstate->lsn = subrel->srsublsn; + + res = lappend(res, relstate); + } + + /* Cleanup */ + systable_endscan(scan); + heap_close(rel, AccessShareLock); + + return res; +} diff --git a/src/backend/catalog/system_views.sql b/src/backend/catalog/system_views.sql index c2b0bedc1d..5723714fb9 100644 --- a/src/backend/catalog/system_views.sql +++ b/src/backend/catalog/system_views.sql @@ -733,6 +733,7 @@ CREATE VIEW pg_stat_subscription AS su.oid AS subid, su.subname, st.pid, + st.relid, st.received_lsn, st.last_msg_send_time, st.last_msg_receipt_time, diff --git a/src/backend/commands/copy.c b/src/backend/commands/copy.c index ba89b292d1..b0fd09f458 100644 --- a/src/backend/commands/copy.c +++ b/src/backend/commands/copy.c @@ -60,7 +60,8 @@ typedef enum CopyDest { COPY_FILE, /* to/from file (or a piped program) */ COPY_OLD_FE, /* to/from frontend (2.0 protocol) */ - COPY_NEW_FE /* to/from frontend (3.0 protocol) */ + COPY_NEW_FE, /* to/from frontend (3.0 protocol) */ + COPY_CALLBACK /* to/from callback function */ } CopyDest; /* @@ -109,6 +110,7 @@ typedef struct CopyStateData List *attnumlist; /* integer list of attnums to copy */ char *filename; /* filename, or NULL for STDIN/STDOUT */ bool is_program; /* is 'filename' a program to popen? */ + copy_data_source_cb data_source_cb; /* function for reading data*/ bool binary; /* binary format? */ bool oids; /* include OIDs? */ bool freeze; /* freeze rows on loading? */ @@ -299,7 +301,6 @@ static uint64 DoCopyTo(CopyState cstate); static uint64 CopyTo(CopyState cstate); static void CopyOneRowTo(CopyState cstate, Oid tupleOid, Datum *values, bool *nulls); -static uint64 CopyFrom(CopyState cstate); static void CopyFromInsertBatch(CopyState cstate, EState *estate, CommandId mycid, int hi_options, ResultRelInfo *resultRelInfo, TupleTableSlot *myslot, @@ -529,6 +530,9 @@ CopySendEndOfRow(CopyState cstate) /* Dump the accumulated row as one CopyData message */ (void) pq_putmessage('d', fe_msgbuf->data, fe_msgbuf->len); break; + case COPY_CALLBACK: + Assert(false); /* Not yet supported. 
*/ + break; } resetStringInfo(fe_msgbuf); @@ -643,6 +647,9 @@ CopyGetData(CopyState cstate, void *databuf, int minread, int maxread) bytesread += avail; } break; + case COPY_CALLBACK: + bytesread = cstate->data_source_cb(databuf, minread, maxread); + break; } return bytesread; @@ -969,7 +976,7 @@ DoCopy(ParseState *pstate, const CopyStmt *stmt, PreventCommandIfParallelMode("COPY FROM"); cstate = BeginCopyFrom(pstate, rel, stmt->filename, stmt->is_program, - stmt->attlist, stmt->options); + NULL, stmt->attlist, stmt->options); cstate->range_table = range_table; *processed = CopyFrom(cstate); /* copy from file to database */ EndCopyFrom(cstate); @@ -2286,7 +2293,7 @@ limit_printout_length(const char *str) /* * Copy FROM file to relation. */ -static uint64 +uint64 CopyFrom(CopyState cstate) { HeapTuple tuple; @@ -2878,6 +2885,7 @@ BeginCopyFrom(ParseState *pstate, Relation rel, const char *filename, bool is_program, + copy_data_source_cb data_source_cb, List *attnamelist, List *options) { @@ -2992,7 +3000,12 @@ BeginCopyFrom(ParseState *pstate, cstate->num_defaults = num_defaults; cstate->is_program = is_program; - if (pipe) + if (data_source_cb) + { + cstate->copy_dest = COPY_CALLBACK; + cstate->data_source_cb = data_source_cb; + } + else if (pipe) { Assert(!is_program); /* the grammar does not allow this */ if (whereToSendOutput == DestRemote) diff --git a/src/backend/commands/subscriptioncmds.c b/src/backend/commands/subscriptioncmds.c index 0198e6d75b..0784ca7951 100644 --- a/src/backend/commands/subscriptioncmds.c +++ b/src/backend/commands/subscriptioncmds.c @@ -20,27 +20,36 @@ #include "access/htup_details.h" #include "access/xact.h" +#include "catalog/dependency.h" #include "catalog/indexing.h" +#include "catalog/namespace.h" #include "catalog/objectaccess.h" #include "catalog/objectaddress.h" #include "catalog/pg_type.h" #include "catalog/pg_subscription.h" +#include "catalog/pg_subscription_rel.h" #include "commands/defrem.h" #include "commands/event_trigger.h" #include "commands/subscriptioncmds.h" +#include "nodes/makefuncs.h" + #include "replication/logicallauncher.h" #include "replication/origin.h" #include "replication/walreceiver.h" +#include "replication/walsender.h" #include "replication/worker_internal.h" #include "storage/lmgr.h" #include "utils/builtins.h" +#include "utils/lsyscache.h" #include "utils/memutils.h" #include "utils/syscache.h" +static List *fetch_table_list(WalReceiverConn *wrconn, List *publications); + /* * Common option parsing function for CREATE and ALTER SUBSCRIPTION commands. * @@ -49,17 +58,17 @@ * accomodate that. 
*/ static void -parse_subscription_options(List *options, char **conninfo, - List **publications, bool *enabled_given, - bool *enabled, bool *create_slot, char **slot_name) +parse_subscription_options(List *options, bool *connect, bool *enabled_given, + bool *enabled, bool *create_slot, char **slot_name, + bool *copy_data) { ListCell *lc; + bool connect_given = false; bool create_slot_given = false; + bool copy_data_given = false; - if (conninfo) - *conninfo = NULL; - if (publications) - *publications = NIL; + if (connect) + *connect = true; if (enabled) { *enabled_given = false; @@ -69,29 +78,23 @@ parse_subscription_options(List *options, char **conninfo, *create_slot = true; if (slot_name) *slot_name = NULL; + if (copy_data) + *copy_data = true; /* Parse options */ foreach (lc, options) { DefElem *defel = (DefElem *) lfirst(lc); - if (strcmp(defel->defname, "conninfo") == 0 && conninfo) - { - if (*conninfo) - ereport(ERROR, - (errcode(ERRCODE_SYNTAX_ERROR), - errmsg("conflicting or redundant options"))); - - *conninfo = defGetString(defel); - } - else if (strcmp(defel->defname, "publication") == 0 && publications) + if (strcmp(defel->defname, "noconnect") == 0 && connect) { - if (*publications) + if (connect_given) ereport(ERROR, (errcode(ERRCODE_SYNTAX_ERROR), errmsg("conflicting or redundant options"))); - *publications = defGetStringList(defel); + connect_given = true; + *connect = !defGetBoolean(defel); } else if (strcmp(defel->defname, "enabled") == 0 && enabled) { @@ -142,9 +145,57 @@ parse_subscription_options(List *options, char **conninfo, *slot_name = defGetString(defel); } + else if (strcmp(defel->defname, "copy data") == 0 && copy_data) + { + if (copy_data_given) + ereport(ERROR, + (errcode(ERRCODE_SYNTAX_ERROR), + errmsg("conflicting or redundant options"))); + + copy_data_given = true; + *copy_data = defGetBoolean(defel); + } + else if (strcmp(defel->defname, "nocopy data") == 0 && copy_data) + { + if (copy_data_given) + ereport(ERROR, + (errcode(ERRCODE_SYNTAX_ERROR), + errmsg("conflicting or redundant options"))); + + copy_data_given = true; + *copy_data = !defGetBoolean(defel); + } else elog(ERROR, "unrecognized option: %s", defel->defname); } + + /* + * We've been explicitly asked to not connect, that requires some + * additional processing. + */ + if (connect && !*connect) + { + /* Check for incompatible options from the user. */ + if (*enabled_given && *enabled) + ereport(ERROR, + (errcode(ERRCODE_SYNTAX_ERROR), + errmsg("noconnect and enabled are mutually exclusive options"))); + + if (create_slot_given && *create_slot) + ereport(ERROR, + (errcode(ERRCODE_SYNTAX_ERROR), + errmsg("noconnect and create slot are mutually exclusive options"))); + + if (copy_data_given && *copy_data) + ereport(ERROR, + (errcode(ERRCODE_SYNTAX_ERROR), + errmsg("noconnect and copy data are mutually exclusive options"))); + + /* Change the defaults of other options. */ + *enabled = false; + *create_slot = false; + *copy_data = false; + } } /* @@ -214,8 +265,10 @@ CreateSubscription(CreateSubscriptionStmt *stmt, bool isTopLevel) Datum values[Natts_pg_subscription]; Oid owner = GetUserId(); HeapTuple tup; + bool connect; bool enabled_given; bool enabled; + bool copy_data; char *conninfo; char *slotname; char originname[NAMEDATALEN]; @@ -226,9 +279,8 @@ CreateSubscription(CreateSubscriptionStmt *stmt, bool isTopLevel) * Parse and check options. * Connection and publication should not be specified here. 
*/ - parse_subscription_options(stmt->options, NULL, NULL, - &enabled_given, &enabled, - &create_slot, &slotname); + parse_subscription_options(stmt->options, &connect, &enabled_given, + &enabled, &create_slot, &slotname, ©_data); /* * Since creating a replication slot is not transactional, rolling back @@ -297,14 +349,17 @@ CreateSubscription(CreateSubscriptionStmt *stmt, bool isTopLevel) replorigin_create(originname); /* - * If requested, create the replication slot on remote side for our - * newly created subscription. + * Connect to remote side to execute requested commands and fetch table + * info. */ - if (create_slot) + if (connect) { XLogRecPtr lsn; char *err; WalReceiverConn *wrconn; + List *tables; + ListCell *lc; + char table_state; /* Try to connect to the publisher. */ wrconn = walrcv_connect(conninfo, true, stmt->subname, &err); @@ -315,13 +370,43 @@ CreateSubscription(CreateSubscriptionStmt *stmt, bool isTopLevel) PG_TRY(); { /* - * Create permanent slot for the subscription. We won't use the - * initial snapshot for anything, so no need to export it. + * If requested, create permanent slot for the subscription. + * We won't use the initial snapshot for anything, so no need + * to export it. + */ + if (create_slot) + { + walrcv_create_slot(wrconn, slotname, false, + CRS_NOEXPORT_SNAPSHOT, &lsn); + ereport(NOTICE, + (errmsg("created replication slot \"%s\" on publisher", + slotname))); + } + + /* + * Set sync state based on if we were asked to do data copy or + * not. */ - walrcv_create_slot(wrconn, slotname, false, false, &lsn); + table_state = copy_data ? SUBREL_STATE_INIT : SUBREL_STATE_READY; + + /* + * Get the table list from publisher and build local table status + * info. + */ + tables = fetch_table_list(wrconn, publications); + foreach (lc, tables) + { + RangeVar *rv = (RangeVar *) lfirst(lc); + Oid relid; + + relid = RangeVarGetRelid(rv, AccessShareLock, true); + + SetSubscriptionRelState(subid, relid, table_state, + InvalidXLogRecPtr); + } + ereport(NOTICE, - (errmsg("created replication slot \"%s\" on publisher", - slotname))); + (errmsg("synchronized table states"))); } PG_CATCH(); { @@ -334,6 +419,11 @@ CreateSubscription(CreateSubscriptionStmt *stmt, bool isTopLevel) /* And we are done with the remote side. */ walrcv_disconnect(wrconn); } + else + ereport(WARNING, + (errmsg("tables were not subscribed, you will have to run " + "ALTER SUBSCRIPTION ... REFRESH PUBLICATION to " + "subscribe the tables"))); heap_close(rel, RowExclusiveLock); @@ -346,6 +436,108 @@ CreateSubscription(CreateSubscriptionStmt *stmt, bool isTopLevel) return myself; } +static void +AlterSubscription_refresh(Subscription *sub, bool copy_data) +{ + char *err; + List *pubrel_names; + List *subrel_states; + Oid *subrel_local_oids; + Oid *pubrel_local_oids; + ListCell *lc; + int off; + + /* Load the library providing us libpq calls. */ + load_file("libpqwalreceiver", false); + + /* Try to connect to the publisher. */ + wrconn = walrcv_connect(sub->conninfo, true, sub->name, &err); + if (!wrconn) + ereport(ERROR, + (errmsg("could not connect to the publisher: %s", err))); + + /* Get the table list from publisher. */ + pubrel_names = fetch_table_list(wrconn, sub->publications); + + /* We are done with the remote side, close connection. */ + walrcv_disconnect(wrconn); + + /* Get local table list. */ + subrel_states = GetSubscriptionRelations(sub->oid); + + /* + * Build qsorted array of local table oids for faster lookup. 
+ * This can potentially contain all tables in the database so + * speed of lookup is important. + */ + subrel_local_oids = palloc(list_length(subrel_states) * sizeof(Oid)); + off = 0; + foreach(lc, subrel_states) + { + SubscriptionRelState *relstate = (SubscriptionRelState *) lfirst(lc); + subrel_local_oids[off++] = relstate->relid; + } + qsort(subrel_local_oids, list_length(subrel_states), + sizeof(Oid), oid_cmp); + + /* + * Walk over the remote tables and try to match them to locally + * known tables. If the table is not known locally create a new state + * for it. + * + * Also builds array of local oids of remote tables for the next step. + */ + off = 0; + pubrel_local_oids = palloc(list_length(pubrel_names) * sizeof(Oid)); + + foreach (lc, pubrel_names) + { + RangeVar *rv = (RangeVar *) lfirst(lc); + Oid relid; + + relid = RangeVarGetRelid(rv, AccessShareLock, false); + pubrel_local_oids[off++] = relid; + + if (!bsearch(&relid, subrel_local_oids, + list_length(subrel_states), sizeof(Oid), oid_cmp)) + { + SetSubscriptionRelState(sub->oid, relid, + copy_data ? SUBREL_STATE_INIT : SUBREL_STATE_READY, + InvalidXLogRecPtr); + ereport(NOTICE, + (errmsg("added subscription for table %s.%s", + quote_identifier(rv->schemaname), + quote_identifier(rv->relname)))); + } + } + + /* + * Next remove state for tables we should not care about anymore using + * the data we collected above + */ + qsort(pubrel_local_oids, list_length(pubrel_names), + sizeof(Oid), oid_cmp); + + for (off = 0; off < list_length(subrel_states); off++) + { + Oid relid = subrel_local_oids[off]; + + if (!bsearch(&relid, pubrel_local_oids, + list_length(pubrel_names), sizeof(Oid), oid_cmp)) + { + char *namespace; + + RemoveSubscriptionRel(sub->oid, relid); + + namespace = get_namespace_name(get_rel_namespace(relid)); + ereport(NOTICE, + (errmsg("removed subscription for table %s.%s", + quote_identifier(namespace), + quote_identifier(get_rel_name(relid))))); + } + } +} + /* * Alter the existing subscription. */ @@ -359,11 +551,7 @@ AlterSubscription(AlterSubscriptionStmt *stmt) Datum values[Natts_pg_subscription]; HeapTuple tup; Oid subid; - bool enabled_given; - bool enabled; - char *conninfo; - char *slot_name; - List *publications; + bool update_tuple = false; rel = heap_open(SubscriptionRelationId, RowExclusiveLock); @@ -384,52 +572,113 @@ AlterSubscription(AlterSubscriptionStmt *stmt) subid = HeapTupleGetOid(tup); - /* Parse options. */ - parse_subscription_options(stmt->options, &conninfo, &publications, - &enabled_given, &enabled, - NULL, &slot_name); - /* Form a new tuple. 
*/ memset(values, 0, sizeof(values)); memset(nulls, false, sizeof(nulls)); memset(replaces, false, sizeof(replaces)); - if (enabled_given) - { - values[Anum_pg_subscription_subenabled - 1] = BoolGetDatum(enabled); - replaces[Anum_pg_subscription_subenabled - 1] = true; - } - if (conninfo) - { - values[Anum_pg_subscription_subconninfo - 1] = - CStringGetTextDatum(conninfo); - replaces[Anum_pg_subscription_subconninfo - 1] = true; - } - if (slot_name) - { - values[Anum_pg_subscription_subslotname - 1] = - DirectFunctionCall1(namein, CStringGetDatum(slot_name)); - replaces[Anum_pg_subscription_subslotname - 1] = true; - } - if (publications != NIL) + switch (stmt->kind) { - values[Anum_pg_subscription_subpublications - 1] = - publicationListToArray(publications); - replaces[Anum_pg_subscription_subpublications - 1] = true; + case ALTER_SUBSCRIPTION_OPTIONS: + { + char *slot_name; + + parse_subscription_options(stmt->options, NULL, NULL, NULL, + NULL, &slot_name, NULL); + + values[Anum_pg_subscription_subslotname - 1] = + DirectFunctionCall1(namein, CStringGetDatum(slot_name)); + replaces[Anum_pg_subscription_subslotname - 1] = true; + + update_tuple = true; + break; + } + + case ALTER_SUBSCRIPTION_ENABLED: + { + bool enabled, + enabled_given; + + parse_subscription_options(stmt->options, NULL, + &enabled_given, &enabled, NULL, + NULL, NULL); + Assert(enabled_given); + + values[Anum_pg_subscription_subenabled - 1] = + BoolGetDatum(enabled); + replaces[Anum_pg_subscription_subenabled - 1] = true; + + update_tuple = true; + break; + } + + case ALTER_SUBSCRIPTION_CONNECTION: + values[Anum_pg_subscription_subconninfo - 1] = + CStringGetTextDatum(stmt->conninfo); + replaces[Anum_pg_subscription_subconninfo - 1] = true; + update_tuple = true; + break; + + case ALTER_SUBSCRIPTION_PUBLICATION: + case ALTER_SUBSCRIPTION_PUBLICATION_REFRESH: + { + bool copy_data; + Subscription *sub = GetSubscription(subid, false); + + parse_subscription_options(stmt->options, NULL, NULL, NULL, + NULL, NULL, ©_data); + + values[Anum_pg_subscription_subpublications - 1] = + publicationListToArray(stmt->publication); + replaces[Anum_pg_subscription_subpublications - 1] = true; + + update_tuple = true; + + /* Refresh if user asked us to. */ + if (stmt->kind == ALTER_SUBSCRIPTION_PUBLICATION_REFRESH) + { + /* Make sure refresh sees the new list of publications. */ + sub->publications = stmt->publication; + + AlterSubscription_refresh(sub, copy_data); + } + + break; + } + + case ALTER_SUBSCRIPTION_REFRESH: + { + bool copy_data; + Subscription *sub = GetSubscription(subid, false); + + parse_subscription_options(stmt->options, NULL, NULL, NULL, + NULL, NULL, ©_data); + + AlterSubscription_refresh(sub, copy_data); + + break; + } + + default: + elog(ERROR, "unrecognized ALTER SUBSCRIPTION kind %d", + stmt->kind); } - tup = heap_modify_tuple(tup, RelationGetDescr(rel), values, nulls, - replaces); + /* Update the catalog if needed. */ + if (update_tuple) + { + tup = heap_modify_tuple(tup, RelationGetDescr(rel), values, nulls, + replaces); - /* Update the catalog. */ - CatalogTupleUpdate(rel, &tup->t_self, tup); + CatalogTupleUpdate(rel, &tup->t_self, tup); - ObjectAddressSet(myself, SubscriptionRelationId, subid); + heap_freetuple(tup); + } - /* Cleanup. 
*/ - heap_freetuple(tup); heap_close(rel, RowExclusiveLock); + ObjectAddressSet(myself, SubscriptionRelationId, subid); + InvokeObjectPostAlterHook(SubscriptionRelationId, subid, 0); return myself; @@ -537,8 +786,11 @@ DropSubscription(DropSubscriptionStmt *stmt, bool isTopLevel) /* Clean up dependencies */ deleteSharedDependencyRecordsFor(SubscriptionRelationId, subid, 0); + /* Remove any associated relation synchronization states. */ + RemoveSubscriptionRel(subid, InvalidOid); + /* Kill the apply worker so that the slot becomes accessible. */ - logicalrep_worker_stop(subid); + logicalrep_worker_stop(subid, InvalidOid); /* Remove the origin tracking if exists. */ snprintf(originname, sizeof(originname), "pg_%u", subid); @@ -571,15 +823,20 @@ DropSubscription(DropSubscriptionStmt *stmt, bool isTopLevel) PG_TRY(); { - if (!walrcv_command(wrconn, cmd.data, &err)) + WalRcvExecResult *res; + res = walrcv_exec(wrconn, cmd.data, 0, NULL); + + if (res->status != WALRCV_OK_COMMAND) ereport(ERROR, (errmsg("could not drop the replication slot \"%s\" on publisher", slotname), - errdetail("The error was: %s", err))); + errdetail("The error was: %s", res->err))); else ereport(NOTICE, (errmsg("dropped replication slot \"%s\" on publisher", slotname))); + + walrcv_clear_result(res); } PG_CATCH(); { @@ -691,3 +948,72 @@ AlterSubscriptionOwner_oid(Oid subid, Oid newOwnerId) heap_close(rel, RowExclusiveLock); } + +/* + * Get the list of tables which belong to specified publications on the + * publisher connection. + */ +static List * +fetch_table_list(WalReceiverConn *wrconn, List *publications) +{ + WalRcvExecResult *res; + StringInfoData cmd; + TupleTableSlot *slot; + Oid tableRow[2] = {TEXTOID, TEXTOID}; + ListCell *lc; + bool first; + List *tablelist = NIL; + + Assert(list_length(publications) > 0); + + initStringInfo(&cmd); + appendStringInfo(&cmd, "SELECT DISTINCT t.schemaname, t.tablename\n" + " FROM pg_catalog.pg_publication_tables t\n" + " WHERE t.pubname IN ("); + first = true; + foreach (lc, publications) + { + char *pubname = strVal(lfirst(lc)); + + if (first) + first = false; + else + appendStringInfoString(&cmd, ", "); + + appendStringInfo(&cmd, "%s", quote_literal_cstr(pubname)); + } + appendStringInfoString(&cmd, ")"); + + res = walrcv_exec(wrconn, cmd.data, 2, tableRow); + pfree(cmd.data); + + if (res->status != WALRCV_OK_TUPLES) + ereport(ERROR, + (errmsg("could not receive list of replicated tables from the publisher: %s", + res->err))); + + /* Process tables. 
*/ + slot = MakeSingleTupleTableSlot(res->tupledesc); + while (tuplestore_gettupleslot(res->tuplestore, true, false, slot)) + { + char *nspname; + char *relname; + bool isnull; + RangeVar *rv; + + nspname = TextDatumGetCString(slot_getattr(slot, 1, &isnull)); + Assert(!isnull); + relname = TextDatumGetCString(slot_getattr(slot, 2, &isnull)); + Assert(!isnull); + + rv = makeRangeVar(pstrdup(nspname), pstrdup(relname), -1); + tablelist = lappend(tablelist, rv); + + ExecClearTuple(slot); + } + ExecDropSingleTupleTableSlot(slot); + + walrcv_clear_result(res); + + return tablelist; +} diff --git a/src/backend/parser/gram.y b/src/backend/parser/gram.y index d0d45a557b..50126baacf 100644 --- a/src/backend/parser/gram.y +++ b/src/backend/parser/gram.y @@ -651,7 +651,7 @@ static Node *makeRecursiveViewSelect(char *relname, List *aliases, Node *query); MAPPING MATCH MATERIALIZED MAXVALUE METHOD MINUTE_P MINVALUE MODE MONTH_P MOVE NAME_P NAMES NATIONAL NATURAL NCHAR NEW NEXT NO NONE - NOT NOTHING NOTIFY NOTNULL NOWAIT NULL_P NULLIF + NOREFRESH NOT NOTHING NOTIFY NOTNULL NOWAIT NULL_P NULLIF NULLS_P NUMERIC OBJECT_P OF OFF OFFSET OIDS OLD ON ONLY OPERATOR OPTION OPTIONS OR @@ -9095,6 +9095,7 @@ AlterSubscriptionStmt: { AlterSubscriptionStmt *n = makeNode(AlterSubscriptionStmt); + n->kind = ALTER_SUBSCRIPTION_OPTIONS; n->subname = $3; n->options = $5; $$ = (Node *)n; @@ -9103,24 +9104,45 @@ AlterSubscriptionStmt: { AlterSubscriptionStmt *n = makeNode(AlterSubscriptionStmt); + n->kind = ALTER_SUBSCRIPTION_CONNECTION; n->subname = $3; - n->options = list_make1(makeDefElem("conninfo", - (Node *)makeString($5), @1)); + n->conninfo = $5; + $$ = (Node *)n; + } + | ALTER SUBSCRIPTION name REFRESH PUBLICATION opt_definition + { + AlterSubscriptionStmt *n = + makeNode(AlterSubscriptionStmt); + n->kind = ALTER_SUBSCRIPTION_REFRESH; + n->subname = $3; + n->options = $6; + $$ = (Node *)n; + } + | ALTER SUBSCRIPTION name SET PUBLICATION publication_name_list REFRESH opt_definition + { + AlterSubscriptionStmt *n = + makeNode(AlterSubscriptionStmt); + n->kind = ALTER_SUBSCRIPTION_PUBLICATION_REFRESH; + n->subname = $3; + n->publication = $6; + n->options = $8; $$ = (Node *)n; } - | ALTER SUBSCRIPTION name SET PUBLICATION publication_name_list + | ALTER SUBSCRIPTION name SET PUBLICATION publication_name_list NOREFRESH { AlterSubscriptionStmt *n = makeNode(AlterSubscriptionStmt); + n->kind = ALTER_SUBSCRIPTION_PUBLICATION; n->subname = $3; - n->options = list_make1(makeDefElem("publication", - (Node *)$6, @1)); + n->publication = $6; + n->options = NIL; $$ = (Node *)n; } | ALTER SUBSCRIPTION name ENABLE_P { AlterSubscriptionStmt *n = makeNode(AlterSubscriptionStmt); + n->kind = ALTER_SUBSCRIPTION_ENABLED; n->subname = $3; n->options = list_make1(makeDefElem("enabled", (Node *)makeInteger(TRUE), @1)); @@ -9130,11 +9152,13 @@ AlterSubscriptionStmt: { AlterSubscriptionStmt *n = makeNode(AlterSubscriptionStmt); + n->kind = ALTER_SUBSCRIPTION_ENABLED; n->subname = $3; n->options = list_make1(makeDefElem("enabled", (Node *)makeInteger(FALSE), @1)); $$ = (Node *)n; - } ; + } + ; /***************************************************************************** * @@ -14548,6 +14572,7 @@ unreserved_keyword: | NEW | NEXT | NO + | NOREFRESH | NOTHING | NOTIFY | NOWAIT diff --git a/src/backend/postmaster/pgstat.c b/src/backend/postmaster/pgstat.c index 3a50488db3..b704788eb5 100644 --- a/src/backend/postmaster/pgstat.c +++ b/src/backend/postmaster/pgstat.c @@ -3415,6 +3415,12 @@ pgstat_get_wait_ipc(WaitEventIPC w) case 
WAIT_EVENT_SYNC_REP: event_name = "SyncRep"; break; + case WAIT_EVENT_LOGICAL_SYNC_DATA: + event_name = "LogicalSyncData"; + break; + case WAIT_EVENT_LOGICAL_SYNC_STATE_CHANGE: + event_name = "LogicalSyncStateChange"; + break; /* no default case, so that compiler will warn */ } diff --git a/src/backend/replication/libpqwalreceiver/libpqwalreceiver.c b/src/backend/replication/libpqwalreceiver/libpqwalreceiver.c index 65a9e6c81c..4dd8eef1f9 100644 --- a/src/backend/replication/libpqwalreceiver/libpqwalreceiver.c +++ b/src/backend/replication/libpqwalreceiver/libpqwalreceiver.c @@ -22,14 +22,16 @@ #include "libpq-fe.h" #include "pqexpbuffer.h" #include "access/xlog.h" +#include "catalog/pg_type.h" +#include "funcapi.h" #include "mb/pg_wchar.h" #include "miscadmin.h" #include "pgstat.h" -#include "replication/logicalproto.h" #include "replication/walreceiver.h" -#include "storage/proc.h" #include "utils/builtins.h" +#include "utils/memutils.h" #include "utils/pg_lsn.h" +#include "utils/tuplestore.h" PG_MODULE_MAGIC; @@ -68,10 +70,12 @@ static void libpqrcv_send(WalReceiverConn *conn, const char *buffer, static char *libpqrcv_create_slot(WalReceiverConn *conn, const char *slotname, bool temporary, - bool export_snapshot, + CRSSnapshotAction snapshot_action, XLogRecPtr *lsn); -static bool libpqrcv_command(WalReceiverConn *conn, - const char *cmd, char **err); +static WalRcvExecResult *libpqrcv_exec(WalReceiverConn *conn, + const char *query, + const int nRetTypes, + const Oid *retTypes); static void libpqrcv_disconnect(WalReceiverConn *conn); static WalReceiverFunctionsType PQWalReceiverFunctions = { @@ -85,7 +89,7 @@ static WalReceiverFunctionsType PQWalReceiverFunctions = { libpqrcv_receive, libpqrcv_send, libpqrcv_create_slot, - libpqrcv_command, + libpqrcv_exec, libpqrcv_disconnect }; @@ -431,10 +435,8 @@ libpqrcv_endstreaming(WalReceiverConn *conn, TimeLineID *next_tli) * next timeline's ID, or just CommandComplete if the server was shut * down. * - * If we had not yet received CopyDone from the backend, PGRES_COPY_IN - * would also be possible. However, at the moment this function is only - * called after receiving CopyDone from the backend - the walreceiver - * never terminates replication on its own initiative. + * If we had not yet received CopyDone from the backend, PGRES_COPY_OUT + * is also possible in case we aborted the copy in mid-stream. */ res = PQgetResult(conn->streamConn); if (PQresultStatus(res) == PGRES_TUPLES_OK) @@ -531,7 +533,7 @@ libpqrcv_readtimelinehistoryfile(WalReceiverConn *conn, * Windows. * * The function is modeled on PQexec() in libpq, but only implements - * those parts that are in use in the walreceiver. + * those parts that are in use in the walreceiver api. * * Queries are always executed on the connection in streamConn. */ @@ -543,8 +545,9 @@ libpqrcv_PQexec(PGconn *streamConn, const char *query) /* * PQexec() silently discards any prior query results on the connection. - * This is not required for walreceiver since it's expected that walsender - * won't generate any such junk results. + * This is not required for this function as it's expected that the + * caller (which is this library in all cases) will behave correctly and + * we don't have to be backwards compatible with old libpq. */ /* @@ -593,8 +596,7 @@ libpqrcv_PQexec(PGconn *streamConn, const char *query) /* * Emulate the PQexec()'s behavior of returning the last result when - * there are many. Since walsender will never generate multiple - * results, we skip the concatenation of error messages. 
+ * there are many. We are fine with returning just last error message. */ result = PQgetResult(streamConn); if (result == NULL) @@ -675,8 +677,19 @@ libpqrcv_receive(WalReceiverConn *conn, char **buffer, PGresult *res; res = PQgetResult(conn->streamConn); - if (PQresultStatus(res) == PGRES_COMMAND_OK || - PQresultStatus(res) == PGRES_COPY_IN) + if (PQresultStatus(res) == PGRES_COMMAND_OK) + { + PQclear(res); + + /* Verify that there are no more results */ + res = PQgetResult(conn->streamConn); + if (res != NULL) + ereport(ERROR, + (errmsg("unexpected result after CommandComplete: %s", + PQerrorMessage(conn->streamConn)))); + return -1; + } + else if (PQresultStatus(res) == PGRES_COPY_IN) { PQclear(res); return -1; @@ -721,7 +734,8 @@ libpqrcv_send(WalReceiverConn *conn, const char *buffer, int nbytes) */ static char * libpqrcv_create_slot(WalReceiverConn *conn, const char *slotname, - bool temporary, bool export_snapshot, XLogRecPtr *lsn) + bool temporary, CRSSnapshotAction snapshot_action, + XLogRecPtr *lsn) { PGresult *res; StringInfoData cmd; @@ -737,10 +751,18 @@ libpqrcv_create_slot(WalReceiverConn *conn, const char *slotname, if (conn->logical) { appendStringInfo(&cmd, " LOGICAL pgoutput"); - if (export_snapshot) - appendStringInfo(&cmd, " EXPORT_SNAPSHOT"); - else - appendStringInfo(&cmd, " NOEXPORT_SNAPSHOT"); + switch (snapshot_action) + { + case CRS_EXPORT_SNAPSHOT: + appendStringInfo(&cmd, " EXPORT_SNAPSHOT"); + break; + case CRS_NOEXPORT_SNAPSHOT: + appendStringInfo(&cmd, " NOEXPORT_SNAPSHOT"); + break; + case CRS_USE_SNAPSHOT: + appendStringInfo(&cmd, " USE_SNAPSHOT"); + break; + } } res = libpqrcv_PQexec(conn->streamConn, cmd.data); @@ -767,28 +789,139 @@ libpqrcv_create_slot(WalReceiverConn *conn, const char *slotname, } /* - * Run command. + * Convert tuple query result to tuplestore. + */ +static void +libpqrcv_processTuples(PGresult *pgres, WalRcvExecResult *walres, + const int nRetTypes, const Oid *retTypes) +{ + int tupn; + int coln; + int nfields = PQnfields(pgres); + HeapTuple tuple; + AttInMetadata *attinmeta; + MemoryContext rowcontext; + MemoryContext oldcontext; + + /* No point in doing anything here if there were no tuples returned. */ + if (PQntuples(pgres) == 0) + return; + + /* Make sure we got expected number of fields. */ + if (nfields != nRetTypes) + ereport(ERROR, + (errmsg("invalid query responser"), + errdetail("Expected %d fields, got %d fields.", + nRetTypes, nfields))); + + + walres->tuplestore = tuplestore_begin_heap(true, false, work_mem); + + /* Create tuple descriptor corresponding to expected result. */ + walres->tupledesc = CreateTemplateTupleDesc(nRetTypes, false); + for (coln = 0; coln < nRetTypes; coln++) + TupleDescInitEntry(walres->tupledesc, (AttrNumber) coln + 1, + PQfname(pgres, coln), retTypes[coln], -1, 0); + attinmeta = TupleDescGetAttInMetadata(walres->tupledesc); + + /* Create temporary context for local allocations. */ + rowcontext = AllocSetContextCreate(CurrentMemoryContext, + "libpqrcv query result context", + ALLOCSET_DEFAULT_SIZES); + + /* Process returned rows. */ + for (tupn = 0; tupn < PQntuples(pgres); tupn++) + { + char *cstrs[MaxTupleAttributeNumber]; + + CHECK_FOR_INTERRUPTS(); + + /* Do the allocations in temporary context. */ + oldcontext = MemoryContextSwitchTo(rowcontext); + + /* + * Fill cstrs with null-terminated strings of column values. 
+ */ + for (coln = 0; coln < nfields; coln++) + { + if (PQgetisnull(pgres, tupn, coln)) + cstrs[coln] = NULL; + else + cstrs[coln] = PQgetvalue(pgres, tupn, coln); + } + + /* Convert row to a tuple, and add it to the tuplestore */ + tuple = BuildTupleFromCStrings(attinmeta, cstrs); + tuplestore_puttuple(walres->tuplestore, tuple); + + /* Clean up */ + MemoryContextSwitchTo(oldcontext); + MemoryContextReset(rowcontext); + } + + MemoryContextDelete(rowcontext); +} + +/* + * Public interface for sending generic queries (and commands). * - * Returns if the command has succeeded and fills the err with palloced - * error message if not. + * This can only be called from process connected to database. */ -static bool -libpqrcv_command(WalReceiverConn *conn, const char *cmd, char **err) +static WalRcvExecResult * +libpqrcv_exec(WalReceiverConn *conn, const char *query, + const int nRetTypes, const Oid *retTypes) { - PGresult *res; + PGresult *pgres = NULL; + WalRcvExecResult *walres = palloc0(sizeof(WalRcvExecResult)); - res = libpqrcv_PQexec(conn->streamConn, cmd); + if (MyDatabaseId == InvalidOid) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("the query interface requires a database connection"))); - if (PQresultStatus(res) != PGRES_COMMAND_OK) + pgres = libpqrcv_PQexec(conn->streamConn, query); + + switch (PQresultStatus(pgres)) { - PQclear(res); - *err = pchomp(PQerrorMessage(conn->streamConn)); - return false; + case PGRES_SINGLE_TUPLE: + case PGRES_TUPLES_OK: + walres->status = WALRCV_OK_TUPLES; + libpqrcv_processTuples(pgres, walres, nRetTypes, retTypes); + break; + + case PGRES_COPY_IN: + walres->status = WALRCV_OK_COPY_IN; + break; + + case PGRES_COPY_OUT: + walres->status = WALRCV_OK_COPY_OUT; + break; + + case PGRES_COPY_BOTH: + walres->status = WALRCV_OK_COPY_BOTH; + break; + + case PGRES_COMMAND_OK: + walres->status = WALRCV_OK_COMMAND; + break; + + /* Empty query is considered error. 
*/ + case PGRES_EMPTY_QUERY: + walres->status = WALRCV_ERROR; + walres->err = _("empty query"); + break; + + case PGRES_NONFATAL_ERROR: + case PGRES_FATAL_ERROR: + case PGRES_BAD_RESPONSE: + walres->status = WALRCV_ERROR; + walres->err = pchomp(PQerrorMessage(conn->streamConn)); + break; } - PQclear(res); + PQclear(pgres); - return true; + return walres; } /* diff --git a/src/backend/replication/logical/Makefile b/src/backend/replication/logical/Makefile index 259befa4e6..bb417b042e 100644 --- a/src/backend/replication/logical/Makefile +++ b/src/backend/replication/logical/Makefile @@ -15,6 +15,6 @@ include $(top_builddir)/src/Makefile.global override CPPFLAGS := -I$(srcdir) $(CPPFLAGS) OBJS = decode.o launcher.o logical.o logicalfuncs.o message.o origin.o \ - proto.o relation.o reorderbuffer.o snapbuild.o worker.o + proto.o relation.o reorderbuffer.o snapbuild.o tablesync.o worker.o include $(top_srcdir)/src/backend/common.mk diff --git a/src/backend/replication/logical/launcher.c b/src/backend/replication/logical/launcher.c index 20b43626dd..255b22597b 100644 --- a/src/backend/replication/logical/launcher.c +++ b/src/backend/replication/logical/launcher.c @@ -27,6 +27,7 @@ #include "access/xact.h" #include "catalog/pg_subscription.h" +#include "catalog/pg_subscription_rel.h" #include "libpq/pqsignal.h" @@ -56,6 +57,8 @@ #define DEFAULT_NAPTIME_PER_CYCLE 180000L int max_logical_replication_workers = 4; +int max_sync_workers_per_subscription = 2; + LogicalRepWorker *MyLogicalRepWorker = NULL; typedef struct LogicalRepCtxStruct @@ -198,20 +201,22 @@ WaitForReplicationWorkerAttach(LogicalRepWorker *worker, /* * Walks the workers array and searches for one that matches given - * subscription id. + * subscription id and relid. */ LogicalRepWorker * -logicalrep_worker_find(Oid subid) +logicalrep_worker_find(Oid subid, Oid relid, bool only_running) { int i; LogicalRepWorker *res = NULL; Assert(LWLockHeldByMe(LogicalRepWorkerLock)); + /* Search for attached worker for a given subscription id. */ for (i = 0; i < max_logical_replication_workers; i++) { LogicalRepWorker *w = &LogicalRepCtx->workers[i]; - if (w->subid == subid && w->proc && IsBackendPid(w->proc->pid)) + if (w->subid == subid && w->relid == relid && + (!only_running || (w->proc && IsBackendPid(w->proc->pid)))) { res = w; break; @@ -225,7 +230,8 @@ logicalrep_worker_find(Oid subid) * Start new apply background worker. */ void -logicalrep_worker_launch(Oid dbid, Oid subid, const char *subname, Oid userid) +logicalrep_worker_launch(Oid dbid, Oid subid, const char *subname, Oid userid, + Oid relid) { BackgroundWorker bgw; BackgroundWorkerHandle *bgw_handle; @@ -270,10 +276,18 @@ logicalrep_worker_launch(Oid dbid, Oid subid, const char *subname, Oid userid) } /* Prepare the worker info. 
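* (The slot's spinlock is initialized once, when the launcher shared memory is created, and is not re-initialized here.)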
*/ - memset(worker, 0, sizeof(LogicalRepWorker)); + worker->proc = NULL; worker->dbid = dbid; worker->userid = userid; worker->subid = subid; + worker->relid = relid; + worker->relstate = SUBREL_STATE_UNKNOWN; + worker->relstate_lsn = InvalidXLogRecPtr; + worker->last_lsn = InvalidXLogRecPtr; + TIMESTAMP_NOBEGIN(worker->last_send_time); + TIMESTAMP_NOBEGIN(worker->last_recv_time); + worker->reply_lsn = InvalidXLogRecPtr; + TIMESTAMP_NOBEGIN(worker->reply_time); LWLockRelease(LogicalRepWorkerLock); @@ -282,8 +296,12 @@ logicalrep_worker_launch(Oid dbid, Oid subid, const char *subname, Oid userid) BGWORKER_BACKEND_DATABASE_CONNECTION; bgw.bgw_start_time = BgWorkerStart_RecoveryFinished; bgw.bgw_main = ApplyWorkerMain; - snprintf(bgw.bgw_name, BGW_MAXLEN, - "logical replication worker for subscription %u", subid); + if (OidIsValid(relid)) + snprintf(bgw.bgw_name, BGW_MAXLEN, + "logical replication worker for subscription %u sync %u", subid, relid); + else + snprintf(bgw.bgw_name, BGW_MAXLEN, + "logical replication worker for subscription %u", subid); bgw.bgw_restart_time = BGW_NEVER_RESTART; bgw.bgw_notify_pid = MyProcPid; @@ -307,13 +325,13 @@ logicalrep_worker_launch(Oid dbid, Oid subid, const char *subname, Oid userid) * slot. */ void -logicalrep_worker_stop(Oid subid) +logicalrep_worker_stop(Oid subid, Oid relid) { LogicalRepWorker *worker; LWLockAcquire(LogicalRepWorkerLock, LW_SHARED); - worker = logicalrep_worker_find(subid); + worker = logicalrep_worker_find(subid, relid, false); /* No worker, nothing to do. */ if (!worker) @@ -395,6 +413,31 @@ logicalrep_worker_stop(Oid subid) } } +/* + * Wake up (using latch) the logical replication worker. + */ +void +logicalrep_worker_wakeup(Oid subid, Oid relid) +{ + LogicalRepWorker *worker; + + LWLockAcquire(LogicalRepWorkerLock, LW_SHARED); + worker = logicalrep_worker_find(subid, relid, true); + LWLockRelease(LogicalRepWorkerLock); + + if (worker) + logicalrep_worker_wakeup_ptr(worker); +} + +/* + * Wake up (using latch) the logical replication worker. + */ +void +logicalrep_worker_wakeup_ptr(LogicalRepWorker *worker) +{ + SetLatch(&worker->proc->procLatch); +} + /* * Attach to a slot. */ @@ -457,6 +500,29 @@ logicalrep_worker_sigterm(SIGNAL_ARGS) SetLatch(MyLatch); } +/* + * Count the number of registered (not necessarily running) sync workers + * for a subscription. + */ +int +logicalrep_sync_worker_count(Oid subid) +{ + int i; + int res = 0; + + Assert(LWLockHeldByMe(LogicalRepWorkerLock)); + + /* Search for attached worker for a given subscription id. */ + for (i = 0; i < max_logical_replication_workers; i++) + { + LogicalRepWorker *w = &LogicalRepCtx->workers[i]; + if (w->subid == subid && OidIsValid(w->relid)) + res++; + } + + return res; +} + /* * ApplyLauncherShmemSize * Compute space needed for replication launcher shared memory @@ -512,7 +578,20 @@ ApplyLauncherShmemInit(void) &found); if (!found) + { + int slot; + memset(LogicalRepCtx, 0, ApplyLauncherShmemSize()); + + /* Initialize memory and spin locks for each worker slot. 
*/ + for (slot = 0; slot < max_logical_replication_workers; slot++) + { + LogicalRepWorker *worker = &LogicalRepCtx->workers[slot]; + + memset(worker, 0, sizeof(LogicalRepWorker)); + SpinLockInit(&worker->relmutex); + } + } } /* @@ -607,12 +686,13 @@ ApplyLauncherMain(Datum main_arg) LogicalRepWorker *w; LWLockAcquire(LogicalRepWorkerLock, LW_SHARED); - w = logicalrep_worker_find(sub->oid); + w = logicalrep_worker_find(sub->oid, InvalidOid, false); LWLockRelease(LogicalRepWorkerLock); if (sub->enabled && w == NULL) { - logicalrep_worker_launch(sub->dbid, sub->oid, sub->name, sub->owner); + logicalrep_worker_launch(sub->dbid, sub->oid, sub->name, + sub->owner, InvalidOid); last_start_time = now; wait_time = wal_retrieve_retry_interval; /* Limit to one worker per mainloop cycle. */ @@ -664,7 +744,7 @@ ApplyLauncherMain(Datum main_arg) Datum pg_stat_get_subscription(PG_FUNCTION_ARGS) { -#define PG_STAT_GET_SUBSCRIPTION_COLS 7 +#define PG_STAT_GET_SUBSCRIPTION_COLS 8 Oid subid = PG_ARGISNULL(0) ? InvalidOid : PG_GETARG_OID(0); int i; ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo; @@ -723,27 +803,31 @@ pg_stat_get_subscription(PG_FUNCTION_ARGS) MemSet(nulls, 0, sizeof(nulls)); values[0] = ObjectIdGetDatum(worker.subid); - values[1] = Int32GetDatum(worker_pid); + if (OidIsValid(worker.relid)) + values[1] = ObjectIdGetDatum(worker.relid); + else + nulls[1] = true; + values[2] = Int32GetDatum(worker_pid); if (XLogRecPtrIsInvalid(worker.last_lsn)) - nulls[2] = true; + nulls[3] = true; else - values[2] = LSNGetDatum(worker.last_lsn); + values[3] = LSNGetDatum(worker.last_lsn); if (worker.last_send_time == 0) - nulls[3] = true; + nulls[4] = true; else - values[3] = TimestampTzGetDatum(worker.last_send_time); + values[4] = TimestampTzGetDatum(worker.last_send_time); if (worker.last_recv_time == 0) - nulls[4] = true; + nulls[5] = true; else - values[4] = TimestampTzGetDatum(worker.last_recv_time); + values[5] = TimestampTzGetDatum(worker.last_recv_time); if (XLogRecPtrIsInvalid(worker.reply_lsn)) - nulls[5] = true; + nulls[6] = true; else - values[5] = LSNGetDatum(worker.reply_lsn); + values[6] = LSNGetDatum(worker.reply_lsn); if (worker.reply_time == 0) - nulls[6] = true; + nulls[7] = true; else - values[6] = TimestampTzGetDatum(worker.reply_time); + values[7] = TimestampTzGetDatum(worker.reply_time); tuplestore_putvalues(tupstore, tupdesc, values, nulls); diff --git a/src/backend/replication/logical/relation.c b/src/backend/replication/logical/relation.c index d8dc0c7194..875a08185a 100644 --- a/src/backend/replication/logical/relation.c +++ b/src/backend/replication/logical/relation.c @@ -19,6 +19,7 @@ #include "access/heapam.h" #include "access/sysattr.h" #include "catalog/namespace.h" +#include "catalog/pg_subscription_rel.h" #include "nodes/makefuncs.h" #include "replication/logicalrelation.h" #include "replication/worker_internal.h" @@ -357,6 +358,12 @@ logicalrep_rel_open(LogicalRepRelId remoteid, LOCKMODE lockmode) else entry->localrel = heap_open(entry->localreloid, lockmode); + if (entry->state != SUBREL_STATE_READY) + entry->state = GetSubscriptionRelState(MySubscription->oid, + entry->localreloid, + &entry->statelsn, + true); + return entry; } diff --git a/src/backend/replication/logical/snapbuild.c b/src/backend/replication/logical/snapbuild.c index 3f242a8ed7..a73a7b98f9 100644 --- a/src/backend/replication/logical/snapbuild.c +++ b/src/backend/replication/logical/snapbuild.c @@ -499,51 +499,32 @@ SnapBuildBuildSnapshot(SnapBuild *builder, TransactionId xid) } /* - * 
Export a snapshot so it can be set in another session with SET TRANSACTION - * SNAPSHOT. - * - * For that we need to start a transaction in the current backend as the - * importing side checks whether the source transaction is still open to make - * sure the xmin horizon hasn't advanced since then. + * Build the initial slot snapshot and convert it to a normal snapshot that + * is understood by HeapTupleSatisfiesMVCC. * - * After that we convert a locally built snapshot into the normal variant - * understood by HeapTupleSatisfiesMVCC et al. + * The snapshot will be usable directly in the current transaction or exported + * for loading in a different transaction. */ -const char * -SnapBuildExportSnapshot(SnapBuild *builder) +Snapshot +SnapBuildInitalSnapshot(SnapBuild *builder) { Snapshot snap; - char *snapname; TransactionId xid; TransactionId *newxip; int newxcnt = 0; + Assert(!FirstSnapshotSet); + Assert(XactIsoLevel == XACT_REPEATABLE_READ); + if (builder->state != SNAPBUILD_CONSISTENT) - elog(ERROR, "cannot export a snapshot before reaching a consistent state"); + elog(ERROR, "cannot build an initial slot snapshot before reaching a consistent state"); if (!builder->committed.includes_all_transactions) - elog(ERROR, "cannot export a snapshot, not all transactions are monitored anymore"); + elog(ERROR, "cannot build an initial slot snapshot, not all transactions are monitored anymore"); /* so we don't overwrite the existing value */ if (TransactionIdIsValid(MyPgXact->xmin)) - elog(ERROR, "cannot export a snapshot when MyPgXact->xmin already is valid"); - - if (IsTransactionOrTransactionBlock()) - elog(ERROR, "cannot export a snapshot from within a transaction"); - - if (SavedResourceOwnerDuringExport) - elog(ERROR, "can only export one snapshot at a time"); - - SavedResourceOwnerDuringExport = CurrentResourceOwner; - ExportInProgress = true; - - StartTransactionCommand(); - - Assert(!FirstSnapshotSet); - - /* There doesn't seem to a nice API to set these */ - XactIsoLevel = XACT_REPEATABLE_READ; - XactReadOnly = true; + elog(ERROR, "cannot build an initial slot snapshot when MyPgXact->xmin already is valid"); snap = SnapBuildBuildSnapshot(builder, GetTopTransactionId()); @@ -578,7 +559,9 @@ SnapBuildExportSnapshot(SnapBuild *builder) if (test == NULL) { if (newxcnt >= GetMaxSnapshotXidCount()) - elog(ERROR, "snapshot too large"); + ereport(ERROR, + (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE), + errmsg("initial slot snapshot too large"))); newxip[newxcnt++] = xid; } @@ -589,9 +572,43 @@ SnapBuildExportSnapshot(SnapBuild *builder) snap->xcnt = newxcnt; snap->xip = newxip; + return snap; +} + +/* + * Export a snapshot so it can be set in another session with SET TRANSACTION + * SNAPSHOT. + * + * For that we need to start a transaction in the current backend as the + * importing side checks whether the source transaction is still open to make + * sure the xmin horizon hasn't advanced since then.
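+ * The snapshot itself is now built by SnapBuildInitalSnapshot(); this function only wraps that in a new read-only transaction and exports the result.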
+ */ +const char * +SnapBuildExportSnapshot(SnapBuild *builder) +{ + Snapshot snap; + char *snapname; + + if (IsTransactionOrTransactionBlock()) + elog(ERROR, "cannot export a snapshot from within a transaction"); + + if (SavedResourceOwnerDuringExport) + elog(ERROR, "can only export one snapshot at a time"); + + SavedResourceOwnerDuringExport = CurrentResourceOwner; + ExportInProgress = true; + + StartTransactionCommand(); + + /* There doesn't seem to be a nice API to set these */ + XactIsoLevel = XACT_REPEATABLE_READ; + XactReadOnly = true; + + snap = SnapBuildInitalSnapshot(builder); + /* - * now that we've built a plain snapshot, use the normal mechanisms for - * exporting it + * now that we've built a plain snapshot, make it active and use the + * normal mechanisms for exporting it */ snapname = ExportSnapshot(snap); diff --git a/src/backend/replication/logical/tablesync.c b/src/backend/replication/logical/tablesync.c new file mode 100644 index 0000000000..3e16b0d576 --- /dev/null +++ b/src/backend/replication/logical/tablesync.c @@ -0,0 +1,840 @@ +/*------------------------------------------------------------------------- + * tablesync.c + * PostgreSQL logical replication + * + * Copyright (c) 2012-2016, PostgreSQL Global Development Group + * + * IDENTIFICATION + * src/backend/replication/logical/tablesync.c + * + * NOTES + * This file contains code for initial table data synchronization for + * logical replication. + * + * The initial data synchronization is done separately for each table, + * in a separate apply worker that only fetches the initial snapshot data + * from the publisher and then synchronizes the position in the stream with + * the main apply worker. + * + * There are several reasons for doing the synchronization this way: + * - It allows us to parallelize the initial data synchronization + * which lowers the time needed for it to happen. + * - The initial synchronization does not have to hold the xid and LSN + * for the time it takes to copy data of all tables, causing less + * bloat and lower disk consumption compared to doing the + * synchronization in a single process for the whole database. + * - It allows us to synchronize the tables added after the initial + * synchronization has finished. + * + * The stream position synchronization works in multiple steps. + * - Sync finishes copy and sets table state as SYNCWAIT and waits + * for state to change in a loop. + * - Apply periodically checks tables that are synchronizing for SYNCWAIT. + * When the desired state appears it will compare its position in the + * stream with the SYNCWAIT position and based on that changes the + * state according to the following rules: + * - if the apply is in front of the sync in the wal stream the new + * state is set to CATCHUP and apply loops until the sync process + * catches up to the same LSN as apply + * - if the sync is in front of the apply in the wal stream the new + * state is set to SYNCDONE + * - if both apply and sync are at the same position in the wal stream + * the state of the table is set to READY + * - If the state was set to CATCHUP sync will read the stream and + * apply changes until it catches up to the specified stream + * position and then sets state to READY and signals apply that it + * can stop waiting and exits; if the state was set to something + * other than CATCHUP the sync process will simply end.
+ * - If the state was set to SYNCDONE by apply, the apply will + * continue tracking the table until it reaches the SYNCDONE stream + * position at which point it sets state to READY and stops tracking. + * + * The catalog pg_subscription_rel is used to keep information about + * subscribed tables and their state and some transient state during + * data synchronization is kept in shared memory. + * + * Example flows look like this: + * - Apply is in front: + * sync:8 + * -> set SYNCWAIT + * apply:10 + * -> set CATCHUP + * -> enter wait-loop + * sync:10 + * -> set READY + * -> exit + * apply:10 + * -> exit wait-loop + * -> continue rep + * - Sync in front: + * sync:10 + * -> set SYNCWAIT + * apply:8 + * -> set SYNCDONE + * -> continue per-table filtering + * sync:10 + * -> exit + * apply:10 + * -> set READY + * -> stop per-table filtering + * -> continue rep + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "miscadmin.h" +#include "pgstat.h" + +#include "access/xact.h" + +#include "catalog/pg_subscription_rel.h" +#include "catalog/pg_type.h" + +#include "commands/copy.h" + +#include "replication/logicallauncher.h" +#include "replication/logicalrelation.h" +#include "replication/walreceiver.h" +#include "replication/worker_internal.h" + +#include "storage/ipc.h" + +#include "utils/builtins.h" +#include "utils/lsyscache.h" +#include "utils/memutils.h" + +static bool table_states_valid = false; + +StringInfo copybuf = NULL; + +/* + * Exit routine for synchronization worker. + */ +static void pg_attribute_noreturn() +finish_sync_worker(void) +{ + /* Commit any outstanding transaction. */ + if (IsTransactionState()) + CommitTransactionCommand(); + + /* And flush all writes. */ + XLogFlush(GetXLogWriteRecPtr()); + + /* Find the main apply worker and signal it. */ + logicalrep_worker_wakeup(MyLogicalRepWorker->subid, InvalidOid); + + ereport(LOG, + (errmsg("logical replication synchronization worker finished processing"))); + + /* Stop gracefully */ + walrcv_disconnect(wrconn); + proc_exit(0); +} + +/* + * Wait until the table synchronization change. + * + * Returns false if the relation subscription state disappeared. + */ +static bool +wait_for_sync_status_change(Oid relid, char origstate) +{ + int rc; + char state = origstate; + + while (!got_SIGTERM) + { + LogicalRepWorker *worker; + + LWLockAcquire(LogicalRepWorkerLock, LW_SHARED); + worker = logicalrep_worker_find(MyLogicalRepWorker->subid, + relid, false); + if (!worker) + { + LWLockRelease(LogicalRepWorkerLock); + return false; + } + state = worker->relstate; + LWLockRelease(LogicalRepWorkerLock); + + if (state == SUBREL_STATE_UNKNOWN) + return false; + + if (state != origstate) + return true; + + rc = WaitLatch(&MyProc->procLatch, + WL_LATCH_SET | WL_TIMEOUT | WL_POSTMASTER_DEATH, + 10000L, WAIT_EVENT_LOGICAL_SYNC_STATE_CHANGE); + + /* emergency bailout if postmaster has died */ + if (rc & WL_POSTMASTER_DEATH) + proc_exit(1); + + ResetLatch(&MyProc->procLatch); + } + + return false; +} + +/* + * Callback from syscache invalidation. + */ +void +invalidate_syncing_table_states(Datum arg, int cacheid, uint32 hashvalue) +{ + table_states_valid = false; +} + +/* + * Handle table synchronization cooperation from the synchronization + * worker. + * + * If the sync worker is in catch up mode and reached the predetermined + * synchronization point in the WAL stream, mark the table as READY and + * finish. If it caught up too far, set to SYNCDONE and finish. 
Things will + * then proceed in the "sync in front" scenario. + */ +static void +process_syncing_tables_for_sync(XLogRecPtr current_lsn) +{ + Assert(IsTransactionState()); + + SpinLockAcquire(&MyLogicalRepWorker->relmutex); + + if (MyLogicalRepWorker->relstate == SUBREL_STATE_CATCHUP && + current_lsn >= MyLogicalRepWorker->relstate_lsn) + { + TimeLineID tli; + + MyLogicalRepWorker->relstate = + (current_lsn == MyLogicalRepWorker->relstate_lsn) + ? SUBREL_STATE_READY + : SUBREL_STATE_SYNCDONE; + MyLogicalRepWorker->relstate_lsn = current_lsn; + + SpinLockRelease(&MyLogicalRepWorker->relmutex); + + SetSubscriptionRelState(MyLogicalRepWorker->subid, + MyLogicalRepWorker->relid, + MyLogicalRepWorker->relstate, + MyLogicalRepWorker->relstate_lsn); + + walrcv_endstreaming(wrconn, &tli); + finish_sync_worker(); + } + else + SpinLockRelease(&MyLogicalRepWorker->relmutex); +} + +/* + * Handle table synchronization cooperation from the apply worker. + * + * Walk over all subscription tables that are individually tracked by the + * apply process (currently, all that have state other than + * SUBREL_STATE_READY) and manage synchronization for them. + * + * If there are tables that need synchronizing and are not being synchronized + * yet, start sync workers for them (if there are free slots for sync + * workers). + * + * For tables that are being synchronized already, check if sync workers + * either need action from the apply worker or have finished. + * + * The usual scenario is that the apply got ahead of the sync while the sync + * ran, and then the action needed by apply is to mark a table for CATCHUP and + * wait for the catchup to happen. In the less common case that sync worker + * got in front of the apply worker, the table is marked as SYNCDONE but not + * ready yet, as it needs to be tracked until apply reaches the same position + * to which it was synced. + * + * If the synchronization position is reached, then the table can be marked as + * READY and is no longer tracked. + */ +static void +process_syncing_tables_for_apply(XLogRecPtr current_lsn) +{ + static List *table_states = NIL; + ListCell *lc; + + Assert(!IsTransactionState()); + + /* We need up to date sync state info for subscription tables here. */ + if (!table_states_valid) + { + MemoryContext oldctx; + List *rstates; + ListCell *lc; + SubscriptionRelState *rstate; + + /* Clean the old list. */ + list_free_deep(table_states); + table_states = NIL; + + StartTransactionCommand(); + + /* Fetch all non-ready tables. */ + rstates = GetSubscriptionNotReadyRelations(MySubscription->oid); + + /* Allocate the tracking info in a permanent memory context. */ + oldctx = MemoryContextSwitchTo(CacheMemoryContext); + foreach(lc, rstates) + { + rstate = palloc(sizeof(SubscriptionRelState)); + memcpy(rstate, lfirst(lc), sizeof(SubscriptionRelState)); + table_states = lappend(table_states, rstate); + } + MemoryContextSwitchTo(oldctx); + + CommitTransactionCommand(); + + table_states_valid = true; + } + + /* Process all tables that are being synchronized. */ + foreach(lc, table_states) + { + SubscriptionRelState *rstate = (SubscriptionRelState *)lfirst(lc); + + if (rstate->state == SUBREL_STATE_SYNCDONE) + { + /* + * Apply has caught up to the position where the table sync + * has finished. Time to mark the table as ready so that + * apply will just continue to replicate it normally. 
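+ * The catalog update below runs in a transaction of its own, since the apply worker is not inside a transaction at this point.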
+ */ + if (current_lsn >= rstate->lsn) + { + rstate->state = SUBREL_STATE_READY; + rstate->lsn = current_lsn; + StartTransactionCommand(); + SetSubscriptionRelState(MyLogicalRepWorker->subid, + rstate->relid, rstate->state, + rstate->lsn); + CommitTransactionCommand(); + } + } + else + { + LogicalRepWorker *syncworker; + int nsyncworkers = 0; + + LWLockAcquire(LogicalRepWorkerLock, LW_SHARED); + syncworker = logicalrep_worker_find(MyLogicalRepWorker->subid, + rstate->relid, false); + if (syncworker) + { + SpinLockAcquire(&syncworker->relmutex); + rstate->state = syncworker->relstate; + rstate->lsn = syncworker->relstate_lsn; + SpinLockRelease(&syncworker->relmutex); + } + else + /* + * If there is no sync worker for this table yet, count running + * sync workers for this subscription, while we have the lock, + * for later use. + */ + nsyncworkers = logicalrep_sync_worker_count(MyLogicalRepWorker->subid); + LWLockRelease(LogicalRepWorkerLock); + + /* + * There is a worker synchronizing the relation and waiting for + * apply to do something. + */ + if (syncworker && rstate->state == SUBREL_STATE_SYNCWAIT) + { + /* + * There are three possible synchronization situations here. + * + * a) Apply is in front of the table sync: We tell the table + * sync to CATCHUP. + * + * b) Apply is behind the table sync: We tell the table sync + * to mark the table as SYNCDONE and finish. + * + * c) Apply and table sync are at the same position: We tell + * table sync to mark the table as READY and finish. + * + * In any case we'll need to wait for the table sync to change + * the state in the catalog and only then continue ourselves. + */ + if (current_lsn > rstate->lsn) + { + rstate->state = SUBREL_STATE_CATCHUP; + rstate->lsn = current_lsn; + } + else if (current_lsn == rstate->lsn) + { + rstate->state = SUBREL_STATE_READY; + rstate->lsn = current_lsn; + } + else + rstate->state = SUBREL_STATE_SYNCDONE; + + SpinLockAcquire(&syncworker->relmutex); + syncworker->relstate = rstate->state; + syncworker->relstate_lsn = rstate->lsn; + SpinLockRelease(&syncworker->relmutex); + + /* Signal the sync worker, as it may be waiting for us. */ + logicalrep_worker_wakeup_ptr(syncworker); + + /* + * Enter busy loop and wait for synchronization status + * change. + */ + wait_for_sync_status_change(rstate->relid, rstate->state); + } + + /* + * If there is no sync worker registered for the table and + * there is some free sync worker slot, start a new sync worker + * for the table. + */ + else if (!syncworker && nsyncworkers < max_sync_workers_per_subscription) + { + logicalrep_worker_launch(MyLogicalRepWorker->dbid, + MySubscription->oid, + MySubscription->name, + MyLogicalRepWorker->userid, + rstate->relid); + } + } + } +} + +/* + * Process possible state change(s) of tables that are being synchronized. + */ +void +process_syncing_tables(XLogRecPtr current_lsn) +{ + if (am_tablesync_worker()) + process_syncing_tables_for_sync(current_lsn); + else + process_syncing_tables_for_apply(current_lsn); +} + +/* + * Create list of columns for COPY based on logical relation mapping. + */ +static List * +make_copy_attnamelist(LogicalRepRelMapEntry *rel) +{ + List *attnamelist = NIL; + TupleDesc desc = RelationGetDescr(rel->localrel); + int i; + + for (i = 0; i < desc->natts; i++) + { + int remoteattnum = rel->attrmap[i]; + + /* Skip dropped attributes. */ + if (desc->attrs[i]->attisdropped) + continue; + + /* Skip attributes that are missing on remote side.
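+ * Such columns are simply left out of the column list, so the local COPY fills them with their default values.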
*/ + if (remoteattnum < 0) + continue; + + attnamelist = lappend(attnamelist, + makeString(rel->remoterel.attnames[remoteattnum])); + } + + return attnamelist; +} + +/* + * Data source callback for the COPY FROM, which reads from the remote + * connection and passes the data back to our local COPY. + */ +static int +copy_read_data(void *outbuf, int minread, int maxread) +{ + int bytesread = 0; + int avail; + + /* If there is some leftover data from the previous read, use it. */ + avail = copybuf->len - copybuf->cursor; + if (avail) + { + if (avail > maxread) + avail = maxread; + memcpy(outbuf, &copybuf->data[copybuf->cursor], avail); + copybuf->cursor += avail; + maxread -= avail; + bytesread += avail; + } + + while (!got_SIGTERM && maxread > 0 && bytesread < minread) + { + pgsocket fd = PGINVALID_SOCKET; + int rc; + int len; + char *buf = NULL; + + for (;;) + { + /* Try to read the data. */ + len = walrcv_receive(wrconn, &buf, &fd); + + CHECK_FOR_INTERRUPTS(); + + if (len == 0) + break; + else if (len < 0) + return bytesread; + else + { + /* Process the data */ + copybuf->data = buf; + copybuf->len = len; + copybuf->cursor = 0; + + avail = copybuf->len - copybuf->cursor; + if (avail > maxread) + avail = maxread; + memcpy(outbuf, &copybuf->data[copybuf->cursor], avail); + outbuf = (void *) ((char *) outbuf + avail); + copybuf->cursor += avail; + maxread -= avail; + bytesread += avail; + } + + if (maxread <= 0 || bytesread >= minread) + return bytesread; + } + + /* + * Wait for more data or latch. + */ + rc = WaitLatchOrSocket(&MyProc->procLatch, + WL_SOCKET_READABLE | WL_LATCH_SET | + WL_TIMEOUT | WL_POSTMASTER_DEATH, + fd, 1000L, WAIT_EVENT_LOGICAL_SYNC_DATA); + + /* Emergency bailout if postmaster has died */ + if (rc & WL_POSTMASTER_DEATH) + proc_exit(1); + + ResetLatch(&MyProc->procLatch); + } + + /* Check for exit condition. */ + if (got_SIGTERM) + proc_exit(0); + + return bytesread; +} + + +/* + * Get information about a remote relation, in a similar fashion to what the + * RELATION message provides during replication. + */ +static void +fetch_remote_table_info(char *nspname, char *relname, + LogicalRepRelation *lrel) +{ + WalRcvExecResult *res; + StringInfoData cmd; + TupleTableSlot *slot; + Oid tableRow[2] = {OIDOID, CHAROID}; + Oid attrRow[4] = {TEXTOID, OIDOID, INT4OID, BOOLOID}; + bool isnull; + int natt; + + lrel->nspname = nspname; + lrel->relname = relname; + + /* First fetch Oid and replica identity. */ + initStringInfo(&cmd); + appendStringInfo(&cmd, "SELECT c.oid, c.relreplident" + " FROM pg_catalog.pg_class c," + " pg_catalog.pg_namespace n" + " WHERE n.nspname = %s" + " AND c.relname = %s" + " AND c.relkind = 'r'", + quote_literal_cstr(nspname), + quote_literal_cstr(relname)); + res = walrcv_exec(wrconn, cmd.data, 2, tableRow); + + if (res->status != WALRCV_OK_TUPLES) + ereport(ERROR, + (errmsg("could not fetch table info for table \"%s.%s\" from publisher: %s", + nspname, relname, res->err))); + + slot = MakeSingleTupleTableSlot(res->tupledesc); + if (!tuplestore_gettupleslot(res->tuplestore, true, false, slot)) + ereport(ERROR, + (errmsg("table \"%s.%s\" not found on publisher", + nspname, relname))); + + lrel->remoteid = DatumGetObjectId(slot_getattr(slot, 1, &isnull)); + Assert(!isnull); + lrel->replident = DatumGetChar(slot_getattr(slot, 2, &isnull)); + Assert(!isnull); + + ExecDropSingleTupleTableSlot(slot); + walrcv_clear_result(res); + + /* Now fetch columns.
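+ * Besides name, type and typmod, the query below also tells us whether each column is part of the replica identity index; that is used to populate lrel->attkeys.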
*/ + resetStringInfo(&cmd); + appendStringInfo(&cmd, + "SELECT a.attname," + " a.atttypid," + " a.atttypmod," + " a.attnum = ANY(i.indkey)" + " FROM pg_catalog.pg_attribute a" + " LEFT JOIN pg_catalog.pg_index i" + " ON (i.indexrelid = pg_get_replica_identity_index(%u))" + " WHERE a.attnum > 0::pg_catalog.int2" + " AND NOT a.attisdropped" + " AND a.attrelid = %u" + " ORDER BY a.attnum", + lrel->remoteid, lrel->remoteid); + res = walrcv_exec(wrconn, cmd.data, 4, attrRow); + + if (res->status != WALRCV_OK_TUPLES) + ereport(ERROR, + (errmsg("could not fetch table info for table \"%s.%s\": %s", + nspname, relname, res->err))); + + /* We don't know number of rows coming, so allocate enough space. */ + lrel->attnames = palloc0(MaxTupleAttributeNumber * sizeof(char *)); + lrel->atttyps = palloc0(MaxTupleAttributeNumber * sizeof(Oid)); + lrel->attkeys = NULL; + + natt = 0; + slot = MakeSingleTupleTableSlot(res->tupledesc); + while (tuplestore_gettupleslot(res->tuplestore, true, false, slot)) + { + lrel->attnames[natt] = + pstrdup(TextDatumGetCString(slot_getattr(slot, 1, &isnull))); + Assert(!isnull); + lrel->atttyps[natt] = DatumGetObjectId(slot_getattr(slot, 2, &isnull)); + Assert(!isnull); + if (DatumGetBool(slot_getattr(slot, 4, &isnull))) + lrel->attkeys = bms_add_member(lrel->attkeys, natt); + + /* Should never happen. */ + if (++natt >= MaxTupleAttributeNumber) + elog(ERROR, "too many columns in remote table \"%s.%s\"", + nspname, relname); + + ExecClearTuple(slot); + } + ExecDropSingleTupleTableSlot(slot); + + lrel->natts = natt; + + walrcv_clear_result(res); + pfree(cmd.data); +} + +/* + * Copy existing data of a table from publisher. + * + * Caller is responsible for locking the local relation. + */ +static void +copy_table(Relation rel) +{ + LogicalRepRelMapEntry *relmapentry; + LogicalRepRelation lrel; + WalRcvExecResult *res; + StringInfoData cmd; + CopyState cstate; + List *attnamelist; + + /* Get the publisher relation info. */ + fetch_remote_table_info(get_namespace_name(RelationGetNamespace(rel)), + RelationGetRelationName(rel), &lrel); + + /* Put the relation into relmap. */ + logicalrep_relmap_update(&lrel); + + /* Map the publisher relation to local one. */ + relmapentry = logicalrep_rel_open(lrel.remoteid, NoLock); + Assert(rel == relmapentry->localrel); + + /* Start copy on the publisher. */ + initStringInfo(&cmd); + appendStringInfo(&cmd, "COPY %s TO STDOUT", + quote_qualified_identifier(lrel.nspname, lrel.relname)); + res = walrcv_exec(wrconn, cmd.data, 0, NULL); + pfree(cmd.data); + if (res->status != WALRCV_OK_COPY_OUT) + ereport(ERROR, + (errmsg("could not start initial contents copy for table \"%s.%s\": %s", + lrel.nspname, lrel.relname, res->err))); + walrcv_clear_result(res); + + copybuf = makeStringInfo(); + + /* Create CopyState for ingestion of the data from publisher. */ + attnamelist = make_copy_attnamelist(relmapentry); + cstate = BeginCopyFrom(NULL, rel, NULL, false, copy_read_data, attnamelist, NIL); + + /* Do the copy */ + (void) CopyFrom(cstate); + + logicalrep_rel_close(relmapentry, NoLock); +} + +/* + * Start syncing the table in the sync worker. + * + * The returned slot name is palloced in current memory context. + */ +char * +LogicalRepSyncTableStart(XLogRecPtr *origin_startpos) +{ + char *slotname; + char *err; + + /* Check the state of the table synchronization. 
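+ * The state is read from pg_subscription_rel inside a transaction and copied into shared memory under the spinlock so that the apply worker can see it.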
*/ + StartTransactionCommand(); + SpinLockAcquire(&MyLogicalRepWorker->relmutex); + MyLogicalRepWorker->relstate = + GetSubscriptionRelState(MyLogicalRepWorker->subid, + MyLogicalRepWorker->relid, + &MyLogicalRepWorker->relstate_lsn, + false); + SpinLockRelease(&MyLogicalRepWorker->relmutex); + CommitTransactionCommand(); + + /* + * To build a slot name for the sync work, we are limited to NAMEDATALEN - + * 1 characters. We cut the original slot name to NAMEDATALEN - 28 chars + * and append _%u_sync_%u (1 + 10 + 6 + 10 + '\0'). (It's actually the + * NAMEDATALEN on the remote that matters, but this scheme will also work + * reasonably if that is different.) + */ + StaticAssertStmt(NAMEDATALEN >= 32, "NAMEDATALEN too small"); /* for sanity */ + slotname = psprintf("%.*s_%u_sync_%u", + NAMEDATALEN - 28, + MySubscription->slotname, + MySubscription->oid, + MyLogicalRepWorker->relid); + + wrconn = walrcv_connect(MySubscription->conninfo, true, slotname, &err); + if (wrconn == NULL) + ereport(ERROR, + (errmsg("could not connect to the publisher: %s", err))); + + switch (MyLogicalRepWorker->relstate) + { + case SUBREL_STATE_INIT: + case SUBREL_STATE_DATASYNC: + { + Relation rel; + WalRcvExecResult *res; + + SpinLockAcquire(&MyLogicalRepWorker->relmutex); + MyLogicalRepWorker->relstate = SUBREL_STATE_DATASYNC; + MyLogicalRepWorker->relstate_lsn = InvalidXLogRecPtr; + SpinLockRelease(&MyLogicalRepWorker->relmutex); + + /* Update the state and make it visible to others. */ + StartTransactionCommand(); + SetSubscriptionRelState(MyLogicalRepWorker->subid, + MyLogicalRepWorker->relid, + MyLogicalRepWorker->relstate, + MyLogicalRepWorker->relstate_lsn); + CommitTransactionCommand(); + + /* + * We want to do the table data sync in single + * transaction. + */ + StartTransactionCommand(); + + /* + * Use standard write lock here. It might be better to + * disallow access to table while it's being synchronized. + * But we don't want to block the main apply process from + * working and it has to open relation in RowExclusiveLock + * when remapping remote relation id to local one. + */ + rel = heap_open(MyLogicalRepWorker->relid, RowExclusiveLock); + + /* + * Create temporary slot for the sync process. + * We do this inside transaction so that we can use the + * snapshot made by the slot to get existing data. + */ + res = walrcv_exec(wrconn, + "BEGIN READ ONLY ISOLATION LEVEL " + "REPEATABLE READ", 0, NULL); + if (res->status != WALRCV_OK_COMMAND) + ereport(ERROR, + (errmsg("table copy could not start transaction on publisher"), + errdetail("The error was: %s", res->err))); + walrcv_clear_result(res); + + /* + * Create new temporary logical decoding slot. + * + * We'll use slot for data copy so make sure the snapshot + * is used for the transaction, that way the COPY will get + * data that is consistent with the lsn used by the slot + * to start decoding. + */ + walrcv_create_slot(wrconn, slotname, true, + CRS_USE_SNAPSHOT, origin_startpos); + + copy_table(rel); + + res = walrcv_exec(wrconn, "COMMIT", 0, NULL); + if (res->status != WALRCV_OK_COMMAND) + ereport(ERROR, + (errmsg("table copy could not finish transaction on publisher"), + errdetail("The error was: %s", res->err))); + walrcv_clear_result(res); + + heap_close(rel, NoLock); + + /* Make the copy visible. */ + CommandCounterIncrement(); + + /* + * We are done with the initial data synchronization, + * update the state. 
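+ * Only the shared-memory state is changed (to SYNCWAIT) here; the catalog is updated later, once the apply worker has told us how to proceed.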
+ */ + SpinLockAcquire(&MyLogicalRepWorker->relmutex); + MyLogicalRepWorker->relstate = SUBREL_STATE_SYNCWAIT; + MyLogicalRepWorker->relstate_lsn = *origin_startpos; + SpinLockRelease(&MyLogicalRepWorker->relmutex); + + /* + * Wait for main apply worker to either tell us to + * catchup or that we are done. + */ + wait_for_sync_status_change(MyLogicalRepWorker->relid, + MyLogicalRepWorker->relstate); + if (MyLogicalRepWorker->relstate != SUBREL_STATE_CATCHUP) + { + /* Update the new state. */ + SetSubscriptionRelState(MyLogicalRepWorker->subid, + MyLogicalRepWorker->relid, + MyLogicalRepWorker->relstate, + MyLogicalRepWorker->relstate_lsn); + finish_sync_worker(); + } + break; + } + case SUBREL_STATE_SYNCDONE: + case SUBREL_STATE_READY: + /* Nothing to do here but finish. */ + finish_sync_worker(); + break; + default: + elog(ERROR, "unknown relation state \"%c\"", + MyLogicalRepWorker->relstate); + } + + return slotname; +} diff --git a/src/backend/replication/logical/worker.c b/src/backend/replication/logical/worker.c index c3e54af259..bbf3506be0 100644 --- a/src/backend/replication/logical/worker.c +++ b/src/backend/replication/logical/worker.c @@ -32,6 +32,7 @@ #include "catalog/namespace.h" #include "catalog/pg_subscription.h" +#include "catalog/pg_subscription_rel.h" #include "commands/trigger.h" @@ -101,7 +102,7 @@ typedef struct SlotErrCallbackArg } SlotErrCallbackArg; static MemoryContext ApplyContext = NULL; -static MemoryContext ApplyCacheContext = NULL; +MemoryContext ApplyCacheContext = NULL; WalReceiverConn *wrconn = NULL; @@ -109,6 +110,7 @@ Subscription *MySubscription = NULL; bool MySubscriptionValid = false; bool in_remote_transaction = false; +static XLogRecPtr remote_final_lsn = InvalidXLogRecPtr; static void send_feedback(XLogRecPtr recvpos, bool force, bool requestReply); @@ -116,6 +118,30 @@ static void store_flush_position(XLogRecPtr remote_lsn); static void reread_subscription(void); +/* + * Should this worker apply changes for given relation. + * + * This is mainly needed for initial relation data sync as that runs in + * separate worker process running in parallel and we need some way to skip + * changes coming to the main apply worker during the sync of a table. + * + * Note we need to do smaller or equals comparison for SYNCDONE state because + * it might hold position of end of intitial slot consistent point WAL + * record + 1 (ie start of next record) and next record can be COMMIT of + * transaction we are now processing (which is what we set remote_final_lsn + * to in apply_handle_begin). + */ +static bool +should_apply_changes_for_rel(LogicalRepRelMapEntry *rel) +{ + if (am_tablesync_worker()) + return MyLogicalRepWorker->relid == rel->localreloid; + else + return (rel->state == SUBREL_STATE_READY || + (rel->state == SUBREL_STATE_SYNCDONE && + rel->statelsn <= remote_final_lsn)); +} + /* * Make sure that we started local transaction. * @@ -398,6 +424,8 @@ apply_handle_begin(StringInfo s) replorigin_session_origin_timestamp = begin_data.committime; replorigin_session_origin_lsn = begin_data.final_lsn; + remote_final_lsn = begin_data.final_lsn; + in_remote_transaction = true; pgstat_report_activity(STATE_RUNNING, NULL); @@ -418,7 +446,10 @@ apply_handle_commit(StringInfo s) Assert(commit_data.commit_lsn == replorigin_session_origin_lsn); Assert(commit_data.committime == replorigin_session_origin_timestamp); - if (IsTransactionState()) + Assert(commit_data.commit_lsn == remote_final_lsn); + + /* The synchronization worker runs in single transaction. 
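+ * Its transaction is committed only once, in finish_sync_worker(), so the commit below is skipped for it.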
*/ + if (IsTransactionState() && !am_tablesync_worker()) { CommitTransactionCommand(); @@ -427,6 +458,9 @@ apply_handle_commit(StringInfo s) in_remote_transaction = false; + /* Process any tables that are being synchronized in parallel. */ + process_syncing_tables(commit_data.end_lsn); + pgstat_report_activity(STATE_IDLE, NULL); } @@ -442,7 +476,8 @@ apply_handle_origin(StringInfo s) * ORIGIN message can only come inside remote transaction and before * any actual writes. */ - if (!in_remote_transaction || IsTransactionState()) + if (!in_remote_transaction || + (IsTransactionState() && !am_tablesync_worker())) ereport(ERROR, (errcode(ERRCODE_PROTOCOL_VIOLATION), errmsg("ORIGIN message sent out of order"))); @@ -515,6 +550,15 @@ apply_handle_insert(StringInfo s) relid = logicalrep_read_insert(s, &newtup); rel = logicalrep_rel_open(relid, RowExclusiveLock); + if (!should_apply_changes_for_rel(rel)) + { + /* + * The relation can't become interesting in the middle of the + * transaction so it's safe to unlock it. + */ + logicalrep_rel_close(rel, RowExclusiveLock); + return; + } /* Initialize the executor state. */ estate = create_estate_for_relation(rel); @@ -607,6 +651,15 @@ apply_handle_update(StringInfo s) relid = logicalrep_read_update(s, &has_oldtup, &oldtup, &newtup); rel = logicalrep_rel_open(relid, RowExclusiveLock); + if (!should_apply_changes_for_rel(rel)) + { + /* + * The relation can't become interesting in the middle of the + * transaction so it's safe to unlock it. + */ + logicalrep_rel_close(rel, RowExclusiveLock); + return; + } /* Check if we can do the update. */ check_relation_updatable(rel); @@ -716,6 +769,15 @@ apply_handle_delete(StringInfo s) relid = logicalrep_read_delete(s, &oldtup); rel = logicalrep_rel_open(relid, RowExclusiveLock); + if (!should_apply_changes_for_rel(rel)) + { + /* + * The relation can't become interesting in the middle of the + * transaction so it's safe to unlock it. + */ + logicalrep_rel_close(rel, RowExclusiveLock); + return; + } /* Check if we can do the delete. */ check_relation_updatable(rel); @@ -927,10 +989,8 @@ UpdateWorkerStats(XLogRecPtr last_lsn, TimestampTz send_time, bool reply) * Apply main loop. */ static void -ApplyLoop(void) +LogicalRepApplyLoop(XLogRecPtr last_received) { - XLogRecPtr last_received = InvalidXLogRecPtr; - /* Init the ApplyContext which we use for easier cleanup. */ ApplyContext = AllocSetContextCreate(TopMemoryContext, "ApplyContext", @@ -1014,15 +1074,18 @@ ApplyLoop(void) } else if (c == 'k') { - XLogRecPtr endpos; + XLogRecPtr end_lsn; TimestampTz timestamp; bool reply_requested; - endpos = pq_getmsgint64(&s); + end_lsn = pq_getmsgint64(&s); timestamp = pq_getmsgint64(&s); reply_requested = pq_getmsgbyte(&s); - send_feedback(endpos, reply_requested, false); + if (last_received < end_lsn) + last_received = end_lsn; + + send_feedback(last_received, reply_requested, false); UpdateWorkerStats(last_received, timestamp, true); } /* other message types are purposefully ignored */ @@ -1030,6 +1093,9 @@ ApplyLoop(void) len = walrcv_receive(wrconn, &buf, &fd); } + + /* confirm all writes at once */ + send_feedback(last_received, false, false); } if (!in_remote_transaction) @@ -1038,15 +1104,13 @@ ApplyLoop(void) * If we didn't get any transactions for a while there might be * unconsumed invalidation messages in the queue, consume them now. 
*/ - StartTransactionCommand(); - /* Check for subscription change */ + AcceptInvalidationMessages(); if (!MySubscriptionValid) reread_subscription(); - CommitTransactionCommand(); - } - /* confirm all writes at once */ - send_feedback(last_received, false, false); + /* Process any table synchronization changes. */ + process_syncing_tables(last_received); + } /* Cleanup the memory. */ MemoryContextResetAndDeleteChildren(ApplyContext); @@ -1054,7 +1118,11 @@ ApplyLoop(void) /* Check if we need to exit the streaming loop. */ if (endofstream) + { + TimeLineID tli; + walrcv_endstreaming(wrconn, &tli); break; + } /* * Wait for more data or latch. @@ -1222,6 +1290,14 @@ reread_subscription(void) { MemoryContext oldctx; Subscription *newsub; + bool started_tx = false; + + /* This function might be called inside or outside of transaction. */ + if (!IsTransactionState()) + { + StartTransactionCommand(); + started_tx = true; + } /* Ensure allocations in permanent context. */ oldctx = MemoryContextSwitchTo(ApplyCacheContext); @@ -1319,6 +1395,9 @@ reread_subscription(void) MemoryContextSwitchTo(oldctx); + if (started_tx) + CommitTransactionCommand(); + MySubscriptionValid = true; } @@ -1339,11 +1418,8 @@ ApplyWorkerMain(Datum main_arg) int worker_slot = DatumGetObjectId(main_arg); MemoryContext oldctx; char originname[NAMEDATALEN]; - RepOriginId originid; XLogRecPtr origin_startpos; - char *err; - int server_version; - TimeLineID startpointTLI; + char *myslotname; WalRcvStreamOptions options; /* Attach to slot */ @@ -1402,49 +1478,90 @@ ApplyWorkerMain(Datum main_arg) subscription_change_cb, (Datum) 0); - ereport(LOG, - (errmsg("logical replication apply for subscription \"%s\" has started", - MySubscription->name))); - - /* Setup replication origin tracking. */ - snprintf(originname, sizeof(originname), "pg_%u", MySubscription->oid); - originid = replorigin_by_name(originname, true); - if (!OidIsValid(originid)) - originid = replorigin_create(originname); - replorigin_session_setup(originid); - replorigin_session_origin = originid; - origin_startpos = replorigin_session_get_progress(false); + if (am_tablesync_worker()) + elog(LOG, "logical replication sync for subscription %s, table %s started", + MySubscription->name, get_rel_name(MyLogicalRepWorker->relid)); + else + elog(LOG, "logical replication apply for subscription %s started", + MySubscription->name); CommitTransactionCommand(); /* Connect to the origin and start the replication. */ elog(DEBUG1, "connecting to publisher using connection string \"%s\"", MySubscription->conninfo); - wrconn = walrcv_connect(MySubscription->conninfo, true, - MySubscription->name, &err); - if (wrconn == NULL) - ereport(ERROR, - (errmsg("could not connect to the publisher: %s", err))); + + if (am_tablesync_worker()) + { + char *syncslotname; + + /* This is table synchroniation worker, call initial sync. */ + syncslotname = LogicalRepSyncTableStart(&origin_startpos); + + /* The slot name needs to be allocated in permanent memory context. */ + oldctx = MemoryContextSwitchTo(ApplyCacheContext); + myslotname = pstrdup(syncslotname); + MemoryContextSwitchTo(oldctx); + + pfree(syncslotname); + } + else + { + /* This is main apply worker */ + RepOriginId originid; + TimeLineID startpointTLI; + char *err; + int server_version; + + myslotname = MySubscription->slotname; + + /* Setup replication origin tracking. 
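+ * replorigin_by_name() and replorigin_create() access the system catalogs, hence the surrounding transaction.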
*/ + StartTransactionCommand(); + snprintf(originname, sizeof(originname), "pg_%u", MySubscription->oid); + originid = replorigin_by_name(originname, true); + if (!OidIsValid(originid)) + originid = replorigin_create(originname); + replorigin_session_setup(originid); + replorigin_session_origin = originid; + origin_startpos = replorigin_session_get_progress(false); + CommitTransactionCommand(); + + wrconn = walrcv_connect(MySubscription->conninfo, true, myslotname, + &err); + if (wrconn == NULL) + ereport(ERROR, + (errmsg("could not connect to the publisher: %s", err))); + + /* + * We don't really use the output identify_system for anything + * but it does some initializations on the upstream so let's still + * call it. + */ + (void) walrcv_identify_system(wrconn, &startpointTLI, + &server_version); + + } /* - * We don't really use the output identify_system for anything - * but it does some initializations on the upstream so let's still - * call it. + * Setup callback for syscache so that we know when something + * changes in the subscription relation state. */ - (void) walrcv_identify_system(wrconn, &startpointTLI, &server_version); + CacheRegisterSyscacheCallback(SUBSCRIPTIONRELMAP, + invalidate_syncing_table_states, + (Datum) 0); /* Build logical replication streaming options. */ options.logical = true; options.startpoint = origin_startpos; - options.slotname = MySubscription->slotname; + options.slotname = myslotname; options.proto.logical.proto_version = LOGICALREP_PROTO_VERSION_NUM; options.proto.logical.publication_names = MySubscription->publications; - /* Start streaming from the slot. */ + /* Start normal logical streaming replication. */ walrcv_startstreaming(wrconn, &options); /* Run the main loop. */ - ApplyLoop(); + LogicalRepApplyLoop(origin_startpos); walrcv_disconnect(wrconn); diff --git a/src/backend/replication/repl_gram.y b/src/backend/replication/repl_gram.y index f1e43bc9f3..ec047c827c 100644 --- a/src/backend/replication/repl_gram.y +++ b/src/backend/replication/repl_gram.y @@ -25,6 +25,8 @@ /* Result of the parsing is returned here */ Node *replication_parse_result; +static SQLCmd *make_sqlcmd(void); + /* * Bison doesn't allocate anything that needs to live across parser calls, @@ -57,6 +59,7 @@ Node *replication_parse_result; %token SCONST IDENT %token UCONST %token RECPTR +%token T_WORD /* Keyword tokens. */ %token K_BASE_BACKUP @@ -81,11 +84,12 @@ Node *replication_parse_result; %token K_TEMPORARY %token K_EXPORT_SNAPSHOT %token K_NOEXPORT_SNAPSHOT +%token K_USE_SNAPSHOT %type command %type base_backup start_replication start_logical_replication create_replication_slot drop_replication_slot identify_system - timeline_history show + timeline_history show sql_cmd %type base_backup_opt_list %type base_backup_opt %type opt_timeline @@ -118,6 +122,7 @@ command: | drop_replication_slot | timeline_history | show + | sql_cmd ; /* @@ -248,6 +253,11 @@ create_slot_opt: $$ = makeDefElem("export_snapshot", (Node *)makeInteger(FALSE), -1); } + | K_USE_SNAPSHOT + { + $$ = makeDefElem("use_snapshot", + (Node *)makeInteger(TRUE), -1); + } | K_RESERVE_WAL { $$ = makeDefElem("reserve_wal", @@ -373,6 +383,26 @@ plugin_opt_arg: SCONST { $$ = (Node *) makeString($1); } | /* EMPTY */ { $$ = NULL; } ; + +sql_cmd: + IDENT { $$ = (Node *) make_sqlcmd(); } + ; %% +static SQLCmd * +make_sqlcmd(void) +{ + SQLCmd *cmd = makeNode(SQLCmd); + int tok; + + /* Just move lexer to the end of command. 
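+ * The tokens themselves are discarded; exec_replication_command() returns false for a SQLCmd and the whole string is then run through exec_simple_query().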
*/ + for (;;) + { + tok = yylex(); + if (tok == ';' || tok == 0) + break; + } + return cmd; +} + #include "repl_scanner.c" diff --git a/src/backend/replication/repl_scanner.l b/src/backend/replication/repl_scanner.l index f56d41d59c..52ae7b343f 100644 --- a/src/backend/replication/repl_scanner.l +++ b/src/backend/replication/repl_scanner.l @@ -102,6 +102,7 @@ SLOT { return K_SLOT; } TEMPORARY { return K_TEMPORARY; } EXPORT_SNAPSHOT { return K_EXPORT_SNAPSHOT; } NOEXPORT_SNAPSHOT { return K_NOEXPORT_SNAPSHOT; } +USE_SNAPSHOT { return K_USE_SNAPSHOT; } "," { return ','; } ";" { return ';'; } @@ -180,9 +181,7 @@ NOEXPORT_SNAPSHOT { return K_NOEXPORT_SNAPSHOT; } } . { - ereport(ERROR, - (errcode(ERRCODE_SYNTAX_ERROR), - errmsg("syntax error: unexpected character \"%s\"", yytext))); + return T_WORD; } %% diff --git a/src/backend/replication/walsender.c b/src/backend/replication/walsender.c index 75617709ec..c6ba916c49 100644 --- a/src/backend/replication/walsender.c +++ b/src/backend/replication/walsender.c @@ -753,7 +753,7 @@ logical_read_xlog_page(XLogReaderState *state, XLogRecPtr targetPagePtr, int req static void parseCreateReplSlotOptions(CreateReplicationSlotCmd *cmd, bool *reserve_wal, - bool *export_snapshot) + CRSSnapshotAction *snapshot_action) { ListCell *lc; bool snapshot_action_given = false; @@ -772,7 +772,18 @@ parseCreateReplSlotOptions(CreateReplicationSlotCmd *cmd, errmsg("conflicting or redundant options"))); snapshot_action_given = true; - *export_snapshot = defGetBoolean(defel); + *snapshot_action = defGetBoolean(defel) ? CRS_EXPORT_SNAPSHOT : + CRS_NOEXPORT_SNAPSHOT; + } + else if (strcmp(defel->defname, "use_snapshot") == 0) + { + if (snapshot_action_given || cmd->kind != REPLICATION_KIND_LOGICAL) + ereport(ERROR, + (errcode(ERRCODE_SYNTAX_ERROR), + errmsg("conflicting or redundant options"))); + + snapshot_action_given = true; + *snapshot_action = CRS_USE_SNAPSHOT; } else if (strcmp(defel->defname, "reserve_wal") == 0) { @@ -799,7 +810,7 @@ CreateReplicationSlot(CreateReplicationSlotCmd *cmd) char xpos[MAXFNAMELEN]; char *slot_name; bool reserve_wal = false; - bool export_snapshot = true; + CRSSnapshotAction snapshot_action = CRS_EXPORT_SNAPSHOT; DestReceiver *dest; TupOutputState *tstate; TupleDesc tupdesc; @@ -808,7 +819,7 @@ CreateReplicationSlot(CreateReplicationSlotCmd *cmd) Assert(!MyReplicationSlot); - parseCreateReplSlotOptions(cmd, &reserve_wal, &export_snapshot); + parseCreateReplSlotOptions(cmd, &reserve_wal, &snapshot_action); /* setup state for XLogReadPage */ sendTimeLineIsHistoric = false; @@ -838,6 +849,40 @@ CreateReplicationSlot(CreateReplicationSlotCmd *cmd) { LogicalDecodingContext *ctx; + /* + * Do options check early so that we can bail before calling the + * DecodingContextFindStartpoint which can take long time. + */ + if (snapshot_action == CRS_EXPORT_SNAPSHOT) + { + if (IsTransactionBlock()) + ereport(ERROR, + (errmsg("CREATE_REPLICATION_SLOT ... EXPORT_SNAPSHOT " + "must not be called inside a transaction"))); + } + else if (snapshot_action == CRS_USE_SNAPSHOT) + { + if (!IsTransactionBlock()) + ereport(ERROR, + (errmsg("CREATE_REPLICATION_SLOT ... USE_SNAPSHOT " + "must be called inside a transaction"))); + + if (XactIsoLevel != XACT_REPEATABLE_READ) + ereport(ERROR, + (errmsg("CREATE_REPLICATION_SLOT ... USE_SNAPSHOT " + "must be called in REPEATABLE READ isolation mode transaction"))); + + if (FirstSnapshotSet) + ereport(ERROR, + (errmsg("CREATE_REPLICATION_SLOT ... 
USE_SNAPSHOT " + "must be called before any query"))); + + if (IsSubTransaction()) + ereport(ERROR, + (errmsg("CREATE_REPLICATION_SLOT ... USE_SNAPSHOT " + "must not be called in a subtransaction"))); + } + ctx = CreateInitDecodingContext(cmd->plugin, NIL, logical_read_xlog_page, WalSndPrepareWrite, WalSndWriteData); @@ -855,13 +900,22 @@ CreateReplicationSlot(CreateReplicationSlotCmd *cmd) DecodingContextFindStartpoint(ctx); /* - * Export the snapshot if we've been asked to do so. + * Export or use the snapshot if we've been asked to do so. * * NB. We will convert the snapbuild.c kind of snapshot to normal * snapshot when doing this. */ - if (export_snapshot) + if (snapshot_action == CRS_EXPORT_SNAPSHOT) + { snapshot_name = SnapBuildExportSnapshot(ctx->snapshot_builder); + } + else if (snapshot_action == CRS_USE_SNAPSHOT) + { + Snapshot snap; + + snap = SnapBuildInitalSnapshot(ctx->snapshot_builder); + RestoreTransactionSnapshot(snap, MyProc); + } /* don't need the decoding context anymore */ FreeDecodingContext(ctx); @@ -1277,8 +1331,11 @@ WalSndWaitForWal(XLogRecPtr loc) /* * Execute an incoming replication command. + * + * Returns true if the cmd_string was recognized as WalSender command, false + * if not. */ -void +bool exec_replication_command(const char *cmd_string) { int parse_rc; @@ -1317,6 +1374,25 @@ exec_replication_command(const char *cmd_string) cmd_node = replication_parse_result; + /* + * CREATE_REPLICATION_SLOT ... LOGICAL exports a snapshot. If it was + * called outside of transaction the snapshot should be cleared here. + */ + if (!IsTransactionBlock()) + SnapBuildClearExportedSnapshot(); + + /* + * For aborted transactions, don't allow anything except pure SQL, + * the exec_simple_query() will handle it correctly. + */ + if (IsAbortedTransactionBlockState() && !IsA(cmd_node, SQLCmd)) + ereport(ERROR, + (errcode(ERRCODE_IN_FAILED_SQL_TRANSACTION), + errmsg("current transaction is aborted, " + "commands ignored until end of transaction block"))); + + CHECK_FOR_INTERRUPTS(); + /* * Allocate buffers that will be used for each outgoing and incoming * message. We do this just once per command to reduce palloc overhead. @@ -1332,6 +1408,7 @@ exec_replication_command(const char *cmd_string) break; case T_BaseBackupCmd: + PreventTransactionChain(true, "BASE_BACKUP"); SendBaseBackup((BaseBackupCmd *) cmd_node); break; @@ -1347,6 +1424,8 @@ exec_replication_command(const char *cmd_string) { StartReplicationCmd *cmd = (StartReplicationCmd *) cmd_node; + PreventTransactionChain(true, "START_REPLICATION"); + if (cmd->kind == REPLICATION_KIND_PHYSICAL) StartReplication(cmd); else @@ -1355,6 +1434,7 @@ exec_replication_command(const char *cmd_string) } case T_TimeLineHistoryCmd: + PreventTransactionChain(true, "TIMELINE_HISTORY"); SendTimeLineHistory((TimeLineHistoryCmd *) cmd_node); break; @@ -1367,6 +1447,14 @@ exec_replication_command(const char *cmd_string) } break; + case T_SQLCmd: + if (MyDatabaseId == InvalidOid) + ereport(ERROR, + (errmsg("not connected to database"))); + + /* Tell the caller that this wasn't a WalSender command. 
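+ * PostgresMain will then run the same string through exec_simple_query().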
*/ + return false; + default: elog(ERROR, "unrecognized replication command node tag: %u", cmd_node->type); @@ -1378,6 +1466,8 @@ exec_replication_command(const char *cmd_string) /* Send CommandComplete message */ EndCommand("SELECT", DestRemote); + + return true; } /* diff --git a/src/backend/tcop/postgres.c b/src/backend/tcop/postgres.c index b07d6c6cb9..ba41f90712 100644 --- a/src/backend/tcop/postgres.c +++ b/src/backend/tcop/postgres.c @@ -4061,7 +4061,10 @@ PostgresMain(int argc, char *argv[], pq_getmsgend(&input_message); if (am_walsender) - exec_replication_command(query_string); + { + if (!exec_replication_command(query_string)) + exec_simple_query(query_string); + } else exec_simple_query(query_string); diff --git a/src/backend/utils/adt/misc.c b/src/backend/utils/adt/misc.c index 1ec7f32470..7dcecb2f0f 100644 --- a/src/backend/utils/adt/misc.c +++ b/src/backend/utils/adt/misc.c @@ -982,3 +982,23 @@ pg_current_logfile_1arg(PG_FUNCTION_ARGS) { return pg_current_logfile(fcinfo); } + +/* + * SQL wrapper around RelationGetReplicaIndex(). + */ +Datum +pg_get_replica_identity_index(PG_FUNCTION_ARGS) +{ + Oid reloid = PG_GETARG_OID(0); + Oid idxoid; + Relation rel; + + rel = heap_open(reloid, AccessShareLock); + idxoid = RelationGetReplicaIndex(rel); + heap_close(rel, AccessShareLock); + + if (OidIsValid(idxoid)) + PG_RETURN_OID(idxoid); + else + PG_RETURN_NULL(); +} diff --git a/src/backend/utils/cache/syscache.c b/src/backend/utils/cache/syscache.c index b1c0b4b1be..d5a376406f 100644 --- a/src/backend/utils/cache/syscache.c +++ b/src/backend/utils/cache/syscache.c @@ -62,6 +62,7 @@ #include "catalog/pg_replication_origin.h" #include "catalog/pg_statistic.h" #include "catalog/pg_subscription.h" +#include "catalog/pg_subscription_rel.h" #include "catalog/pg_tablespace.h" #include "catalog/pg_transform.h" #include "catalog/pg_ts_config.h" @@ -693,7 +694,7 @@ static const struct cachedesc cacheinfo[] = { 64 }, {PublicationRelRelationId, /* PUBLICATIONRELMAP */ - PublicationRelMapIndexId, + PublicationRelPrrelidPrpubidIndexId, 2, { Anum_pg_publication_rel_prrelid, @@ -758,6 +759,17 @@ static const struct cachedesc cacheinfo[] = { }, 4 }, + {SubscriptionRelRelationId, /* SUBSCRIPTIONRELMAP */ + SubscriptionRelSrrelidSrsubidIndexId, + 2, + { + Anum_pg_subscription_rel_srrelid, + Anum_pg_subscription_rel_srsubid, + 0, + 0 + }, + 64 + }, {TableSpaceRelationId, /* TABLESPACEOID */ TablespaceOidIndexId, 1, diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c index 4feb26aa7a..291bf7631d 100644 --- a/src/backend/utils/misc/guc.c +++ b/src/backend/utils/misc/guc.c @@ -2497,6 +2497,18 @@ static struct config_int ConfigureNamesInt[] = NULL, NULL, NULL }, + { + {"max_sync_workers_per_subscription", + PGC_SIGHUP, + RESOURCES_ASYNCHRONOUS, + gettext_noop("Maximum number of table synchronization workers per subscription."), + NULL, + }, + &max_sync_workers_per_subscription, + 2, 0, MAX_BACKENDS, + NULL, NULL, NULL + }, + { {"log_rotation_age", PGC_SIGHUP, LOGGING_WHERE, gettext_noop("Automatic log file rotation will occur after N minutes."), diff --git a/src/bin/pg_dump/pg_backup.h b/src/bin/pg_dump/pg_backup.h index 610bed531c..98bc1a586a 100644 --- a/src/bin/pg_dump/pg_backup.h +++ b/src/bin/pg_dump/pg_backup.h @@ -155,7 +155,7 @@ typedef struct _dumpOptions int use_setsessauth; int enable_row_security; int include_subscriptions; - int no_create_subscription_slots; + int no_subscription_connect; /* default, if no "inclusion" switches appear, is to dump everything */ bool 
include_everything; diff --git a/src/bin/pg_dump/pg_dump.c b/src/bin/pg_dump/pg_dump.c index 2b5a52656c..a98747d89a 100644 --- a/src/bin/pg_dump/pg_dump.c +++ b/src/bin/pg_dump/pg_dump.c @@ -351,8 +351,8 @@ main(int argc, char **argv) {"snapshot", required_argument, NULL, 6}, {"strict-names", no_argument, &strict_names, 1}, {"use-set-session-authorization", no_argument, &dopt.use_setsessauth, 1}, - {"no-create-subscription-slots", no_argument, &dopt.no_create_subscription_slots, 1}, {"no-security-labels", no_argument, &dopt.no_security_labels, 1}, + {"no-subscription-connect", no_argument, &dopt.no_subscription_connect, 1}, {"no-synchronized-snapshots", no_argument, &dopt.no_synchronized_snapshots, 1}, {"no-unlogged-table-data", no_argument, &dopt.no_unlogged_table_data, 1}, {"no-sync", no_argument, NULL, 7}, @@ -951,9 +951,8 @@ help(const char *progname) printf(_(" --if-exists use IF EXISTS when dropping objects\n")); printf(_(" --include-subscriptions dump logical replication subscriptions\n")); printf(_(" --inserts dump data as INSERT commands, rather than COPY\n")); - printf(_(" --no-create-subscription-slots\n" - " do not create replication slots for subscriptions\n")); printf(_(" --no-security-labels do not dump security label assignments\n")); + printf(_(" --no-subscription-connect dump subscriptions so they don't connect on restore\n")); printf(_(" --no-synchronized-snapshots do not use synchronized snapshots in parallel jobs\n")); printf(_(" --no-tablespaces do not dump tablespace assignments\n")); printf(_(" --no-unlogged-table-data do not dump unlogged table data\n")); @@ -3774,8 +3773,8 @@ dumpSubscription(Archive *fout, SubscriptionInfo *subinfo) appendPQExpBufferStr(query, ", SLOT NAME = "); appendStringLiteralAH(query, subinfo->subslotname, fout); - if (dopt->no_create_subscription_slots) - appendPQExpBufferStr(query, ", NOCREATE SLOT"); + if (dopt->no_subscription_connect) + appendPQExpBufferStr(query, ", NOCONNECT"); appendPQExpBufferStr(query, ");\n"); diff --git a/src/bin/pg_dump/t/002_pg_dump.pl b/src/bin/pg_dump/t/002_pg_dump.pl index a46dcdbcd7..021f4bf081 100644 --- a/src/bin/pg_dump/t/002_pg_dump.pl +++ b/src/bin/pg_dump/t/002_pg_dump.pl @@ -4224,7 +4224,7 @@ qr/CREATE TRANSFORM FOR integer LANGUAGE sql \(FROM SQL WITH FUNCTION pg_catalog create_order => 50, create_sql => 'CREATE SUBSCRIPTION sub1 CONNECTION \'dbname=doesnotexist\' PUBLICATION pub1 - WITH (DISABLED, NOCREATE SLOT);', + WITH (DISABLED, NOCONNECT);', regexp => qr/^ \QCREATE SUBSCRIPTION sub1 CONNECTION 'dbname=doesnotexist' PUBLICATION pub1 WITH (DISABLED, SLOT NAME = 'sub1');\E /xm, diff --git a/src/include/catalog/catversion.h b/src/include/catalog/catversion.h index 315f155b64..d8679f5f59 100644 --- a/src/include/catalog/catversion.h +++ b/src/include/catalog/catversion.h @@ -53,6 +53,6 @@ */ /* yyyymmddN */ -#define CATALOG_VERSION_NO 201703221 +#define CATALOG_VERSION_NO 201703231 #endif diff --git a/src/include/catalog/indexing.h b/src/include/catalog/indexing.h index 6bce7328a2..5d4190c05e 100644 --- a/src/include/catalog/indexing.h +++ b/src/include/catalog/indexing.h @@ -340,8 +340,8 @@ DECLARE_UNIQUE_INDEX(pg_publication_pubname_index, 6111, on pg_publication using DECLARE_UNIQUE_INDEX(pg_publication_rel_oid_index, 6112, on pg_publication_rel using btree(oid oid_ops)); #define PublicationRelObjectIndexId 6112 -DECLARE_UNIQUE_INDEX(pg_publication_rel_map_index, 6113, on pg_publication_rel using btree(prrelid oid_ops, prpubid oid_ops)); -#define PublicationRelMapIndexId 6113 
+DECLARE_UNIQUE_INDEX(pg_publication_rel_prrelid_prpubid_index, 6113, on pg_publication_rel using btree(prrelid oid_ops, prpubid oid_ops)); +#define PublicationRelPrrelidPrpubidIndexId 6113 DECLARE_UNIQUE_INDEX(pg_subscription_oid_index, 6114, on pg_subscription using btree(oid oid_ops)); #define SubscriptionObjectIndexId 6114 @@ -349,6 +349,9 @@ DECLARE_UNIQUE_INDEX(pg_subscription_oid_index, 6114, on pg_subscription using b DECLARE_UNIQUE_INDEX(pg_subscription_subname_index, 6115, on pg_subscription using btree(subdbid oid_ops, subname name_ops)); #define SubscriptionNameIndexId 6115 +DECLARE_UNIQUE_INDEX(pg_subscription_rel_srrelid_srsubid_index, 6117, on pg_subscription_rel using btree(srrelid oid_ops, srsubid oid_ops)); +#define SubscriptionRelSrrelidSrsubidIndexId 6117 + /* last step of initialization script: build the indexes declared above */ BUILD_INDICES diff --git a/src/include/catalog/pg_proc.h b/src/include/catalog/pg_proc.h index 22635655f5..78c23e3f5d 100644 --- a/src/include/catalog/pg_proc.h +++ b/src/include/catalog/pg_proc.h @@ -2021,6 +2021,9 @@ DESCR("is a relation insertable/updatable/deletable"); DATA(insert OID = 3843 ( pg_column_is_updatable PGNSP PGUID 12 10 0 0 0 f f f f t f s s 3 0 16 "2205 21 16" _null_ _null_ _null_ _null_ _null_ pg_column_is_updatable _null_ _null_ _null_ )); DESCR("is a column updatable"); +DATA(insert OID = 6120 ( pg_get_replica_identity_index PGNSP PGUID 12 10 0 0 0 f f f f t f s s 1 0 2205 "2205" _null_ _null_ _null_ _null_ _null_ pg_get_replica_identity_index _null_ _null_ _null_ )); +DESCR("oid of replica identity index if any"); + /* Deferrable unique constraint trigger */ DATA(insert OID = 1250 ( unique_key_recheck PGNSP PGUID 12 1 0 0 0 f f f f t f v s 0 0 2279 "" _null_ _null_ _null_ _null_ _null_ unique_key_recheck _null_ _null_ _null_ )); DESCR("deferred UNIQUE constraint check"); @@ -2805,7 +2808,7 @@ DATA(insert OID = 3099 ( pg_stat_get_wal_senders PGNSP PGUID 12 1 10 0 0 f f f DESCR("statistics: information about currently active replication"); DATA(insert OID = 3317 ( pg_stat_get_wal_receiver PGNSP PGUID 12 1 0 0 0 f f f f f f s r 0 0 2249 "" "{23,25,3220,23,3220,23,1184,1184,3220,1184,25,25}" "{o,o,o,o,o,o,o,o,o,o,o,o}" "{pid,status,receive_start_lsn,receive_start_tli,received_lsn,received_tli,last_msg_send_time,last_msg_receipt_time,latest_end_lsn,latest_end_time,slot_name,conninfo}" _null_ _null_ pg_stat_get_wal_receiver _null_ _null_ _null_ )); DESCR("statistics: information about WAL receiver"); -DATA(insert OID = 6118 ( pg_stat_get_subscription PGNSP PGUID 12 1 0 0 0 f f f f f f s r 1 0 2249 "26" "{26,26,23,3220,1184,1184,3220,1184}" "{i,o,o,o,o,o,o,o}" "{subid,subid,pid,received_lsn,last_msg_send_time,last_msg_receipt_time,latest_end_lsn,latest_end_time}" _null_ _null_ pg_stat_get_subscription _null_ _null_ _null_ )); +DATA(insert OID = 6118 ( pg_stat_get_subscription PGNSP PGUID 12 1 0 0 0 f f f f f f s r 1 0 2249 "26" "{26,26,26,23,3220,1184,1184,3220,1184}" "{i,o,o,o,o,o,o,o,o}" "{subid,subid,relid,pid,received_lsn,last_msg_send_time,last_msg_receipt_time,latest_end_lsn,latest_end_time}" _null_ _null_ pg_stat_get_subscription _null_ _null_ _null_ )); DESCR("statistics: information about subscription"); DATA(insert OID = 2026 ( pg_backend_pid PGNSP PGUID 12 1 0 0 0 f f f f t f s r 0 0 23 "" _null_ _null_ _null_ _null_ _null_ pg_backend_pid _null_ _null_ _null_ )); DESCR("statistics: current backend PID"); diff --git a/src/include/catalog/pg_subscription_rel.h b/src/include/catalog/pg_subscription_rel.h new file mode 
100644
index 0000000000..129aa99f29
--- /dev/null
+++ b/src/include/catalog/pg_subscription_rel.h
@@ -0,0 +1,78 @@
+/* -------------------------------------------------------------------------
+ *
+ * pg_subscription_rel.h
+ *		Local info about tables that come from the publisher of a
+ *		subscription (pg_subscription_rel).
+ *
+ * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * -------------------------------------------------------------------------
+ */
+#ifndef PG_SUBSCRIPTION_REL_H
+#define PG_SUBSCRIPTION_REL_H
+
+#include "catalog/genbki.h"
+
+/* ----------------
+ *		pg_subscription_rel definition.  cpp turns this into
+ *		typedef struct FormData_pg_subscription_rel
+ * ----------------
+ */
+#define SubscriptionRelRelationId		6102
+
+/* Workaround for genbki not knowing about XLogRecPtr */
+#define pg_lsn XLogRecPtr
+
+CATALOG(pg_subscription_rel,6102) BKI_WITHOUT_OIDS
+{
+	Oid			srsubid;		/* Oid of subscription */
+	Oid			srrelid;		/* Oid of relation */
+	char		srsubstate;		/* state of the relation in subscription */
+	pg_lsn		srsublsn;		/* remote lsn of the state change
+								 * used for synchronization coordination */
+} FormData_pg_subscription_rel;
+
+typedef FormData_pg_subscription_rel *Form_pg_subscription_rel;
+
+/* ----------------
+ *		compiler constants for pg_subscription_rel
+ * ----------------
+ */
+#define Natts_pg_subscription_rel			4
+#define Anum_pg_subscription_rel_srsubid	1
+#define Anum_pg_subscription_rel_srrelid	2
+#define Anum_pg_subscription_rel_srsubstate	3
+#define Anum_pg_subscription_rel_srsublsn	4
+
+/* ----------------
+ *		substate constants
+ * ----------------
+ */
+#define SUBREL_STATE_INIT		'i'		/* initializing (sublsn NULL) */
+#define SUBREL_STATE_DATASYNC	'd'		/* data is being synchronized (sublsn NULL) */
+#define SUBREL_STATE_SYNCDONE	's'		/* synchronization finished in front of apply (sublsn set) */
+#define SUBREL_STATE_READY		'r'		/* ready (sublsn set) */
+
+/* These are never stored in the catalog; we only use them for IPC.
*/ +#define SUBREL_STATE_UNKNOWN '\0' /* unknown state */ +#define SUBREL_STATE_SYNCWAIT 'w' /* waiting for sync */ +#define SUBREL_STATE_CATCHUP 'c' /* catching up with apply */ + +typedef struct SubscriptionRelState +{ + Oid relid; + XLogRecPtr lsn; + char state; +} SubscriptionRelState; + +extern Oid SetSubscriptionRelState(Oid subid, Oid relid, char state, + XLogRecPtr sublsn); +extern char GetSubscriptionRelState(Oid subid, Oid relid, + XLogRecPtr *sublsn, bool missing_ok); +extern void RemoveSubscriptionRel(Oid subid, Oid relid); + +extern List *GetSubscriptionRelations(Oid subid); +extern List *GetSubscriptionNotReadyRelations(Oid subid); + +#endif /* PG_SUBSCRIPTION_REL_H */ diff --git a/src/include/commands/copy.h b/src/include/commands/copy.h index d63ca0f5e9..f081f2219f 100644 --- a/src/include/commands/copy.h +++ b/src/include/commands/copy.h @@ -21,6 +21,7 @@ /* CopyStateData is private in commands/copy.c */ typedef struct CopyStateData *CopyState; +typedef int (*copy_data_source_cb) (void *outbuf, int minread, int maxread); extern void DoCopy(ParseState *state, const CopyStmt *stmt, int stmt_location, int stmt_len, @@ -28,7 +29,7 @@ extern void DoCopy(ParseState *state, const CopyStmt *stmt, extern void ProcessCopyOptions(ParseState *pstate, CopyState cstate, bool is_from, List *options); extern CopyState BeginCopyFrom(ParseState *pstate, Relation rel, const char *filename, - bool is_program, List *attnamelist, List *options); + bool is_program, copy_data_source_cb data_source_cb, List *attnamelist, List *options); extern void EndCopyFrom(CopyState cstate); extern bool NextCopyFrom(CopyState cstate, ExprContext *econtext, Datum *values, bool *nulls, Oid *tupleOid); @@ -36,6 +37,8 @@ extern bool NextCopyFromRawFields(CopyState cstate, char ***fields, int *nfields); extern void CopyFromErrorCallback(void *arg); +extern uint64 CopyFrom(CopyState cstate); + extern DestReceiver *CreateCopyDestReceiver(void); #endif /* COPY_H */ diff --git a/src/include/nodes/nodes.h b/src/include/nodes/nodes.h index 2cbd6d77b8..9a4221a9e7 100644 --- a/src/include/nodes/nodes.h +++ b/src/include/nodes/nodes.h @@ -488,6 +488,7 @@ typedef enum NodeTag T_DropReplicationSlotCmd, T_StartReplicationCmd, T_TimeLineHistoryCmd, + T_SQLCmd, /* * TAGS FOR RANDOM OTHER STUFF diff --git a/src/include/nodes/parsenodes.h b/src/include/nodes/parsenodes.h index a15df229a4..582e0e0ebe 100644 --- a/src/include/nodes/parsenodes.h +++ b/src/include/nodes/parsenodes.h @@ -3319,10 +3319,23 @@ typedef struct CreateSubscriptionStmt List *options; /* List of DefElem nodes */ } CreateSubscriptionStmt; +typedef enum AlterSubscriptionType +{ + ALTER_SUBSCRIPTION_OPTIONS, + ALTER_SUBSCRIPTION_CONNECTION, + ALTER_SUBSCRIPTION_PUBLICATION, + ALTER_SUBSCRIPTION_PUBLICATION_REFRESH, + ALTER_SUBSCRIPTION_REFRESH, + ALTER_SUBSCRIPTION_ENABLED +} AlterSubscriptionType; + typedef struct AlterSubscriptionStmt { NodeTag type; + AlterSubscriptionType kind; /* ALTER_SUBSCRIPTION_OPTIONS, etc */ char *subname; /* Name of of the subscription */ + char *conninfo; /* Connection string to publisher */ + List *publication; /* One or more publication to subscribe to */ List *options; /* List of DefElem nodes */ } AlterSubscriptionStmt; diff --git a/src/include/nodes/replnodes.h b/src/include/nodes/replnodes.h index 996da3c02e..92ada41b6d 100644 --- a/src/include/nodes/replnodes.h +++ b/src/include/nodes/replnodes.h @@ -96,4 +96,13 @@ typedef struct TimeLineHistoryCmd TimeLineID timeline; } TimeLineHistoryCmd; +/* ---------------------- + * SQL 
commands + * ---------------------- + */ +typedef struct SQLCmd +{ + NodeTag type; +} SQLCmd; + #endif /* REPLNODES_H */ diff --git a/src/include/parser/kwlist.h b/src/include/parser/kwlist.h index 28c4dab258..6cd36c7fe3 100644 --- a/src/include/parser/kwlist.h +++ b/src/include/parser/kwlist.h @@ -258,6 +258,7 @@ PG_KEYWORD("new", NEW, UNRESERVED_KEYWORD) PG_KEYWORD("next", NEXT, UNRESERVED_KEYWORD) PG_KEYWORD("no", NO, UNRESERVED_KEYWORD) PG_KEYWORD("none", NONE, COL_NAME_KEYWORD) +PG_KEYWORD("norefresh", NOREFRESH, UNRESERVED_KEYWORD) PG_KEYWORD("not", NOT, RESERVED_KEYWORD) PG_KEYWORD("nothing", NOTHING, UNRESERVED_KEYWORD) PG_KEYWORD("notify", NOTIFY, UNRESERVED_KEYWORD) diff --git a/src/include/pgstat.h b/src/include/pgstat.h index f2daf32e1a..a675242971 100644 --- a/src/include/pgstat.h +++ b/src/include/pgstat.h @@ -790,7 +790,9 @@ typedef enum WAIT_EVENT_PARALLEL_FINISH, WAIT_EVENT_PARALLEL_BITMAP_SCAN, WAIT_EVENT_SAFE_SNAPSHOT, - WAIT_EVENT_SYNC_REP + WAIT_EVENT_SYNC_REP, + WAIT_EVENT_LOGICAL_SYNC_DATA, + WAIT_EVENT_LOGICAL_SYNC_STATE_CHANGE } WaitEventIPC; /* ---------- diff --git a/src/include/replication/logical.h b/src/include/replication/logical.h index fd34964bad..d10dd2c90a 100644 --- a/src/include/replication/logical.h +++ b/src/include/replication/logical.h @@ -31,9 +31,11 @@ typedef struct LogicalDecodingContext /* memory context this is all allocated in */ MemoryContext context; - /* infrastructure pieces */ - XLogReaderState *reader; + /* The associated replication slot */ ReplicationSlot *slot; + + /* infrastructure pieces for decoding */ + XLogReaderState *reader; struct ReorderBuffer *reorder; struct SnapBuild *snapshot_builder; @@ -75,6 +77,7 @@ typedef struct LogicalDecodingContext TransactionId write_xid; } LogicalDecodingContext; + extern void CheckLogicalDecodingRequirements(void); extern LogicalDecodingContext *CreateInitDecodingContext(char *plugin, @@ -92,6 +95,12 @@ extern void DecodingContextFindStartpoint(LogicalDecodingContext *ctx); extern bool DecodingContextReady(LogicalDecodingContext *ctx); extern void FreeDecodingContext(LogicalDecodingContext *ctx); +extern LogicalDecodingContext *CreateCopyDecodingContext( + List *output_plugin_options, + LogicalOutputPluginWriterPrepareWrite prepare_write, + LogicalOutputPluginWriterWrite do_write); +extern List *DecodingContextGetTableList(LogicalDecodingContext *ctx); + extern void LogicalIncreaseXminForSlot(XLogRecPtr lsn, TransactionId xmin); extern void LogicalIncreaseRestartDecodingForSlot(XLogRecPtr current_lsn, XLogRecPtr restart_lsn); diff --git a/src/include/replication/logicallauncher.h b/src/include/replication/logicallauncher.h index cfe3db10dd..060946a096 100644 --- a/src/include/replication/logicallauncher.h +++ b/src/include/replication/logicallauncher.h @@ -13,6 +13,7 @@ #define LOGICALLAUNCHER_H extern int max_logical_replication_workers; +extern int max_sync_workers_per_subscription; extern void ApplyLauncherRegister(void); extern void ApplyLauncherMain(Datum main_arg); diff --git a/src/include/replication/snapbuild.h b/src/include/replication/snapbuild.h index 5e824ae6fc..091a9f91e3 100644 --- a/src/include/replication/snapbuild.h +++ b/src/include/replication/snapbuild.h @@ -59,6 +59,7 @@ extern void FreeSnapshotBuilder(SnapBuild *cache); extern void SnapBuildSnapDecRefcount(Snapshot snap); +extern Snapshot SnapBuildInitalSnapshot(SnapBuild *builder); extern const char *SnapBuildExportSnapshot(SnapBuild *snapstate); extern void SnapBuildClearExportedSnapshot(void); diff --git 
a/src/include/replication/walreceiver.h b/src/include/replication/walreceiver.h index 78e577c89b..fb55c30fa1 100644 --- a/src/include/replication/walreceiver.h +++ b/src/include/replication/walreceiver.h @@ -15,9 +15,12 @@ #include "access/xlog.h" #include "access/xlogdefs.h" #include "fmgr.h" +#include "replication/logicalproto.h" +#include "replication/walsender.h" #include "storage/latch.h" #include "storage/spin.h" #include "pgtime.h" +#include "utils/tuplestore.h" /* user-settable parameters */ extern int wal_receiver_status_interval; @@ -160,6 +163,33 @@ typedef struct struct WalReceiverConn; typedef struct WalReceiverConn WalReceiverConn; +/* + * Status of walreceiver query execution. + * + * We only define statuses that are currently used. + */ +typedef enum +{ + WALRCV_ERROR, /* There was error when executing the query. */ + WALRCV_OK_COMMAND, /* Query executed utility or replication command. */ + WALRCV_OK_TUPLES, /* Query returned tuples. */ + WALRCV_OK_COPY_IN, /* Query started COPY FROM. */ + WALRCV_OK_COPY_OUT, /* Query started COPY TO. */ + WALRCV_OK_COPY_BOTH, /* Query started COPY BOTH replication protocol. */ +} WalRcvExecStatus; + +/* + * Return value for walrcv_query, returns the status of the execution and + * tuples if any. + */ +typedef struct WalRcvExecResult +{ + WalRcvExecStatus status; + char *err; + Tuplestorestate *tuplestore; + TupleDesc tupledesc; +} WalRcvExecResult; + /* libpqwalreceiver hooks */ typedef WalReceiverConn *(*walrcv_connect_fn) (const char *conninfo, bool logical, const char *appname, @@ -183,9 +213,12 @@ typedef void (*walrcv_send_fn) (WalReceiverConn *conn, const char *buffer, int nbytes); typedef char *(*walrcv_create_slot_fn) (WalReceiverConn *conn, const char *slotname, bool temporary, - bool export_snapshot, XLogRecPtr *lsn); -typedef bool (*walrcv_command_fn) (WalReceiverConn *conn, const char *cmd, - char **err); + CRSSnapshotAction snapshot_action, + XLogRecPtr *lsn); +typedef WalRcvExecResult *(*walrcv_exec_fn) (WalReceiverConn *conn, + const char *query, + const int nRetTypes, + const Oid *retTypes); typedef void (*walrcv_disconnect_fn) (WalReceiverConn *conn); typedef struct WalReceiverFunctionsType @@ -200,7 +233,7 @@ typedef struct WalReceiverFunctionsType walrcv_receive_fn walrcv_receive; walrcv_send_fn walrcv_send; walrcv_create_slot_fn walrcv_create_slot; - walrcv_command_fn walrcv_command; + walrcv_exec_fn walrcv_exec; walrcv_disconnect_fn walrcv_disconnect; } WalReceiverFunctionsType; @@ -224,13 +257,31 @@ extern PGDLLIMPORT WalReceiverFunctionsType *WalReceiverFunctions; WalReceiverFunctions->walrcv_receive(conn, buffer, wait_fd) #define walrcv_send(conn, buffer, nbytes) \ WalReceiverFunctions->walrcv_send(conn, buffer, nbytes) -#define walrcv_create_slot(conn, slotname, temporary, export_snapshot, lsn) \ - WalReceiverFunctions->walrcv_create_slot(conn, slotname, temporary, export_snapshot, lsn) -#define walrcv_command(conn, cmd, err) \ - WalReceiverFunctions->walrcv_command(conn, cmd, err) +#define walrcv_create_slot(conn, slotname, temporary, snapshot_action, lsn) \ + WalReceiverFunctions->walrcv_create_slot(conn, slotname, temporary, snapshot_action, lsn) +#define walrcv_exec(conn, exec, nRetTypes, retTypes) \ + WalReceiverFunctions->walrcv_exec(conn, exec, nRetTypes, retTypes) #define walrcv_disconnect(conn) \ WalReceiverFunctions->walrcv_disconnect(conn) +static inline void +walrcv_clear_result(WalRcvExecResult *walres) +{ + if (!walres) + return; + + if (walres->err) + pfree(walres->err); + + if (walres->tuplestore) + 
tuplestore_end(walres->tuplestore); + + if (walres->tupledesc) + FreeTupleDesc(walres->tupledesc); + + pfree(walres); +} + /* prototypes for functions in walreceiver.c */ extern void WalReceiverMain(void) pg_attribute_noreturn(); diff --git a/src/include/replication/walsender.h b/src/include/replication/walsender.h index fe23f6619f..2ca903872e 100644 --- a/src/include/replication/walsender.h +++ b/src/include/replication/walsender.h @@ -16,6 +16,16 @@ #include "fmgr.h" +/* + * What to do with a snapshot in create replication slot command. + */ +typedef enum +{ + CRS_EXPORT_SNAPSHOT, + CRS_NOEXPORT_SNAPSHOT, + CRS_USE_SNAPSHOT +} CRSSnapshotAction; + /* global state */ extern bool am_walsender; extern bool am_cascading_walsender; @@ -28,7 +38,7 @@ extern int wal_sender_timeout; extern bool log_replication_commands; extern void InitWalSender(void); -extern void exec_replication_command(const char *query_string); +extern bool exec_replication_command(const char *query_string); extern void WalSndErrorCleanup(void); extern void WalSndSignals(void); extern Size WalSndShmemSize(void); diff --git a/src/include/replication/worker_internal.h b/src/include/replication/worker_internal.h index 8cbf2687a9..bf96d340ca 100644 --- a/src/include/replication/worker_internal.h +++ b/src/include/replication/worker_internal.h @@ -33,6 +33,9 @@ typedef struct LogicalRepWorker /* Used for initial table synchronization. */ Oid relid; + char relstate; + XLogRecPtr relstate_lsn; + slock_t relmutex; /* Stats. */ XLogRecPtr last_lsn; @@ -42,6 +45,9 @@ typedef struct LogicalRepWorker TimestampTz reply_time; } LogicalRepWorker; +/* Memory context for cached variables in apply worker. */ +MemoryContext ApplyCacheContext; + /* libpqreceiver connection */ extern struct WalReceiverConn *wrconn; @@ -53,12 +59,26 @@ extern bool in_remote_transaction; extern bool got_SIGTERM; extern void logicalrep_worker_attach(int slot); -extern LogicalRepWorker *logicalrep_worker_find(Oid subid); -extern int logicalrep_worker_count(Oid subid); -extern void logicalrep_worker_launch(Oid dbid, Oid subid, const char *subname, Oid userid); -extern void logicalrep_worker_stop(Oid subid); -extern void logicalrep_worker_wakeup(Oid subid); +extern LogicalRepWorker *logicalrep_worker_find(Oid subid, Oid relid, + bool only_running); +extern void logicalrep_worker_launch(Oid dbid, Oid subid, const char *subname, + Oid userid, Oid relid); +extern void logicalrep_worker_stop(Oid subid, Oid relid); +extern void logicalrep_worker_wakeup(Oid subid, Oid relid); +extern void logicalrep_worker_wakeup_ptr(LogicalRepWorker *worker); + +extern int logicalrep_sync_worker_count(Oid subid); extern void logicalrep_worker_sigterm(SIGNAL_ARGS); +extern char *LogicalRepSyncTableStart(XLogRecPtr *origin_startpos); +void process_syncing_tables(XLogRecPtr current_lsn); +void invalidate_syncing_table_states(Datum arg, int cacheid, + uint32 hashvalue); + +static inline bool +am_tablesync_worker(void) +{ + return OidIsValid(MyLogicalRepWorker->relid); +} #endif /* WORKER_INTERNAL_H */ diff --git a/src/include/utils/syscache.h b/src/include/utils/syscache.h index 66f60d271e..b35faf81b9 100644 --- a/src/include/utils/syscache.h +++ b/src/include/utils/syscache.h @@ -89,6 +89,7 @@ enum SysCacheIdentifier STATRELATTINH, SUBSCRIPTIONOID, SUBSCRIPTIONNAME, + SUBSCRIPTIONRELMAP, TABLESPACEOID, TRFOID, TRFTYPELANG, diff --git a/src/test/regress/expected/object_address.out b/src/test/regress/expected/object_address.out index 90c4ba4608..978d9a9a0f 100644 --- 
a/src/test/regress/expected/object_address.out +++ b/src/test/regress/expected/object_address.out @@ -37,7 +37,8 @@ CREATE TRANSFORM FOR int LANGUAGE SQL ( FROM SQL WITH FUNCTION varchar_transform(internal), TO SQL WITH FUNCTION int4recv(internal)); CREATE PUBLICATION addr_pub FOR TABLE addr_nsp.gentable; -CREATE SUBSCRIPTION addr_sub CONNECTION '' PUBLICATION bar WITH (DISABLED, NOCREATE SLOT); +CREATE SUBSCRIPTION addr_sub CONNECTION '' PUBLICATION bar WITH (DISABLED, NOCONNECT); +WARNING: tables were not subscribed, you will have to run ALTER SUBSCRIPTION ... REFRESH PUBLICATION to subscribe the tables -- test some error cases SELECT pg_get_object_address('stone', '{}', '{}'); ERROR: unrecognized object type "stone" diff --git a/src/test/regress/expected/rules.out b/src/test/regress/expected/rules.out index bd13ae6010..f7c3a637b5 100644 --- a/src/test/regress/expected/rules.out +++ b/src/test/regress/expected/rules.out @@ -1847,13 +1847,14 @@ pg_stat_ssl| SELECT s.pid, pg_stat_subscription| SELECT su.oid AS subid, su.subname, st.pid, + st.relid, st.received_lsn, st.last_msg_send_time, st.last_msg_receipt_time, st.latest_end_lsn, st.latest_end_time FROM (pg_subscription su - LEFT JOIN pg_stat_get_subscription(NULL::oid) st(subid, pid, received_lsn, last_msg_send_time, last_msg_receipt_time, latest_end_lsn, latest_end_time) ON ((st.subid = su.oid))); + LEFT JOIN pg_stat_get_subscription(NULL::oid) st(subid, relid, pid, received_lsn, last_msg_send_time, last_msg_receipt_time, latest_end_lsn, latest_end_time) ON ((st.subid = su.oid))); pg_stat_sys_indexes| SELECT pg_stat_all_indexes.relid, pg_stat_all_indexes.indexrelid, pg_stat_all_indexes.schemaname, diff --git a/src/test/regress/expected/sanity_check.out b/src/test/regress/expected/sanity_check.out index 88b4c973a1..8e3028edaa 100644 --- a/src/test/regress/expected/sanity_check.out +++ b/src/test/regress/expected/sanity_check.out @@ -143,6 +143,7 @@ pg_shdescription|t pg_shseclabel|t pg_statistic|t pg_subscription|t +pg_subscription_rel|t pg_tablespace|t pg_transform|t pg_trigger|t diff --git a/src/test/regress/expected/subscription.out b/src/test/regress/expected/subscription.out index 3471d88ca7..0912bef657 100644 --- a/src/test/regress/expected/subscription.out +++ b/src/test/regress/expected/subscription.out @@ -14,7 +14,6 @@ CREATE SUBSCRIPTION testsub PUBLICATION foo; ERROR: syntax error at or near "PUBLICATION" LINE 1: CREATE SUBSCRIPTION testsub PUBLICATION foo; ^ -set client_min_messages to error; -- fail - cannot do CREATE SUBSCRIPTION CREATE SLOT inside transaction block BEGIN; CREATE SUBSCRIPTION testsub CONNECTION 'testconn' PUBLICATION testpub WITH (CREATE SLOT); @@ -23,8 +22,8 @@ COMMIT; CREATE SUBSCRIPTION testsub CONNECTION 'testconn' PUBLICATION testpub; ERROR: invalid connection string syntax: missing "=" after "testconn" in connection info string -CREATE SUBSCRIPTION testsub CONNECTION 'dbname=doesnotexist' PUBLICATION testpub WITH (DISABLED, NOCREATE SLOT); -reset client_min_messages; +CREATE SUBSCRIPTION testsub CONNECTION 'dbname=doesnotexist' PUBLICATION testpub WITH (NOCONNECT); +WARNING: tables were not subscribed, you will have to run ALTER SUBSCRIPTION ... 
REFRESH PUBLICATION to subscribe the tables \dRs+ List of subscriptions Name | Owner | Enabled | Publication | Conninfo @@ -32,38 +31,30 @@ reset client_min_messages; testsub | regress_subscription_user | f | {testpub} | dbname=doesnotexist (1 row) -ALTER SUBSCRIPTION testsub SET PUBLICATION testpub2, testpub3; -\dRs - List of subscriptions - Name | Owner | Enabled | Publication ----------+---------------------------+---------+--------------------- - testsub | regress_subscription_user | f | {testpub2,testpub3} -(1 row) - +ALTER SUBSCRIPTION testsub SET PUBLICATION testpub2, testpub3 NOREFRESH; ALTER SUBSCRIPTION testsub CONNECTION 'dbname=doesnotexist2'; -ALTER SUBSCRIPTION testsub SET PUBLICATION testpub, testpub1; \dRs+ List of subscriptions - Name | Owner | Enabled | Publication | Conninfo ----------+---------------------------+---------+--------------------+---------------------- - testsub | regress_subscription_user | f | {testpub,testpub1} | dbname=doesnotexist2 + Name | Owner | Enabled | Publication | Conninfo +---------+---------------------------+---------+---------------------+---------------------- + testsub | regress_subscription_user | f | {testpub2,testpub3} | dbname=doesnotexist2 (1 row) BEGIN; ALTER SUBSCRIPTION testsub ENABLE; \dRs - List of subscriptions - Name | Owner | Enabled | Publication ----------+---------------------------+---------+-------------------- - testsub | regress_subscription_user | t | {testpub,testpub1} + List of subscriptions + Name | Owner | Enabled | Publication +---------+---------------------------+---------+--------------------- + testsub | regress_subscription_user | t | {testpub2,testpub3} (1 row) ALTER SUBSCRIPTION testsub DISABLE; \dRs - List of subscriptions - Name | Owner | Enabled | Publication ----------+---------------------------+---------+-------------------- - testsub | regress_subscription_user | f | {testpub,testpub1} + List of subscriptions + Name | Owner | Enabled | Publication +---------+---------------------------+---------+--------------------- + testsub | regress_subscription_user | f | {testpub2,testpub3} (1 row) COMMIT; @@ -74,10 +65,10 @@ ERROR: must be owner of subscription testsub RESET ROLE; ALTER SUBSCRIPTION testsub RENAME TO testsub_foo; \dRs - List of subscriptions - Name | Owner | Enabled | Publication --------------+---------------------------+---------+-------------------- - testsub_foo | regress_subscription_user | f | {testpub,testpub1} + List of subscriptions + Name | Owner | Enabled | Publication +-------------+---------------------------+---------+--------------------- + testsub_foo | regress_subscription_user | f | {testpub2,testpub3} (1 row) -- rename back to keep the rest simple diff --git a/src/test/regress/sql/object_address.sql b/src/test/regress/sql/object_address.sql index 6b85fe2949..28476daff1 100644 --- a/src/test/regress/sql/object_address.sql +++ b/src/test/regress/sql/object_address.sql @@ -40,7 +40,7 @@ CREATE TRANSFORM FOR int LANGUAGE SQL ( FROM SQL WITH FUNCTION varchar_transform(internal), TO SQL WITH FUNCTION int4recv(internal)); CREATE PUBLICATION addr_pub FOR TABLE addr_nsp.gentable; -CREATE SUBSCRIPTION addr_sub CONNECTION '' PUBLICATION bar WITH (DISABLED, NOCREATE SLOT); +CREATE SUBSCRIPTION addr_sub CONNECTION '' PUBLICATION bar WITH (DISABLED, NOCONNECT); -- test some error cases SELECT pg_get_object_address('stone', '{}', '{}'); diff --git a/src/test/regress/sql/subscription.sql b/src/test/regress/sql/subscription.sql index 5c05b14f9e..c1199ee629 100644 --- 
a/src/test/regress/sql/subscription.sql +++ b/src/test/regress/sql/subscription.sql @@ -12,24 +12,19 @@ CREATE SUBSCRIPTION testsub CONNECTION 'foo'; -- fail - no connection CREATE SUBSCRIPTION testsub PUBLICATION foo; -set client_min_messages to error; -- fail - cannot do CREATE SUBSCRIPTION CREATE SLOT inside transaction block BEGIN; CREATE SUBSCRIPTION testsub CONNECTION 'testconn' PUBLICATION testpub WITH (CREATE SLOT); COMMIT; CREATE SUBSCRIPTION testsub CONNECTION 'testconn' PUBLICATION testpub; -CREATE SUBSCRIPTION testsub CONNECTION 'dbname=doesnotexist' PUBLICATION testpub WITH (DISABLED, NOCREATE SLOT); -reset client_min_messages; -\dRs+ - -ALTER SUBSCRIPTION testsub SET PUBLICATION testpub2, testpub3; +CREATE SUBSCRIPTION testsub CONNECTION 'dbname=doesnotexist' PUBLICATION testpub WITH (NOCONNECT); -\dRs +\dRs+ +ALTER SUBSCRIPTION testsub SET PUBLICATION testpub2, testpub3 NOREFRESH; ALTER SUBSCRIPTION testsub CONNECTION 'dbname=doesnotexist2'; -ALTER SUBSCRIPTION testsub SET PUBLICATION testpub, testpub1; \dRs+ diff --git a/src/test/subscription/t/001_rep_changes.pl b/src/test/subscription/t/001_rep_changes.pl index b81028aed1..d1817f57da 100644 --- a/src/test/subscription/t/001_rep_changes.pl +++ b/src/test/subscription/t/001_rep_changes.pl @@ -3,7 +3,7 @@ use strict; use warnings; use PostgresNode; use TestLib; -use Test::More tests => 11; +use Test::More tests => 14; # Initialize publisher node my $node_publisher = get_new_node('publisher'); @@ -19,7 +19,7 @@ $node_subscriber->start; $node_publisher->safe_psql('postgres', "CREATE TABLE tab_notrep AS SELECT generate_series(1,10) AS a"); $node_publisher->safe_psql('postgres', - "CREATE TABLE tab_ins (a int)"); + "CREATE TABLE tab_ins AS SELECT generate_series(1,1002) AS a"); $node_publisher->safe_psql('postgres', "CREATE TABLE tab_full AS SELECT generate_series(1,10) AS a"); $node_publisher->safe_psql('postgres', @@ -56,10 +56,20 @@ my $caughtup_query = $node_publisher->poll_query_until('postgres', $caughtup_query) or die "Timed out while waiting for subscriber to catch up"; +# Also wait for initial table sync to finish +my $synced_query = +"SELECT count(1) = 0 FROM pg_subscription_rel WHERE srsubstate NOT IN ('r', 's');"; +$node_subscriber->poll_query_until('postgres', $synced_query) + or die "Timed out while waiting for subscriber to synchronize data"; + my $result = $node_subscriber->safe_psql('postgres', "SELECT count(*) FROM tab_notrep"); is($result, qq(0), 'check non-replicated table is empty on subscriber'); +$result = + $node_subscriber->safe_psql('postgres', "SELECT count(*) FROM tab_ins"); +is($result, qq(1002), 'check initial data was copied to subscriber'); + $node_publisher->safe_psql('postgres', "INSERT INTO tab_ins SELECT generate_series(1,50)"); $node_publisher->safe_psql('postgres', @@ -79,7 +89,7 @@ $node_publisher->poll_query_until('postgres', $caughtup_query) $result = $node_subscriber->safe_psql('postgres', "SELECT count(*), min(a), max(a) FROM tab_ins"); -is($result, qq(50|1|50), 'check replicated inserts on subscriber'); +is($result, qq(1052|1|1002), 'check replicated inserts on subscriber'); $result = $node_subscriber->safe_psql('postgres', "SELECT count(*), min(a), max(a) FROM tab_rep"); @@ -109,7 +119,7 @@ $node_publisher->poll_query_until('postgres', $caughtup_query) $result = $node_subscriber->safe_psql('postgres', "SELECT count(*), min(a), max(a) FROM tab_full"); -is($result, qq(10|1|100), 'update works with REPLICA IDENTITY FULL and duplicate tuples'); +is($result, qq(20|1|100), 'update works 
with REPLICA IDENTITY FULL and duplicate tuples'); # check that change of connection string and/or publication list causes # restart of subscription workers. Not all of these are registered as tests @@ -126,7 +136,7 @@ $node_publisher->poll_query_until('postgres', $oldpid = $node_publisher->safe_psql('postgres', "SELECT pid FROM pg_stat_replication WHERE application_name = '$appname';"); $node_subscriber->safe_psql('postgres', - "ALTER SUBSCRIPTION tap_sub SET PUBLICATION tap_pub_ins_only"); + "ALTER SUBSCRIPTION tap_sub SET PUBLICATION tap_pub_ins_only REFRESH WITH (NOCOPY DATA)"); $node_publisher->poll_query_until('postgres', "SELECT pid != $oldpid FROM pg_stat_replication WHERE application_name = '$appname';") or die "Timed out while waiting for apply to restart"; @@ -141,7 +151,7 @@ $node_publisher->poll_query_until('postgres', $caughtup_query) $result = $node_subscriber->safe_psql('postgres', "SELECT count(*), min(a), max(a) FROM tab_ins"); -is($result, qq(150|1|1100), 'check replicated inserts after subscription publication change'); +is($result, qq(1152|1|1100), 'check replicated inserts after subscription publication change'); $result = $node_subscriber->safe_psql('postgres', "SELECT count(*), min(a), max(a) FROM tab_rep"); @@ -154,6 +164,8 @@ $node_publisher->safe_psql('postgres', "ALTER PUBLICATION tap_pub_ins_only ADD TABLE tab_full"); $node_publisher->safe_psql('postgres', "DELETE FROM tab_ins WHERE a > 0"); +$node_subscriber->safe_psql('postgres', + "ALTER SUBSCRIPTION tap_sub REFRESH PUBLICATION WITH (NOCOPY DATA)"); $node_publisher->safe_psql('postgres', "INSERT INTO tab_full VALUES(0)"); @@ -163,11 +175,11 @@ $node_publisher->poll_query_until('postgres', $caughtup_query) # note that data are different on provider and subscriber $result = $node_subscriber->safe_psql('postgres', "SELECT count(*), min(a), max(a) FROM tab_ins"); -is($result, qq(50|1|50), 'check replicated deletes after alter publication'); +is($result, qq(1052|1|1002), 'check replicated deletes after alter publication'); $result = $node_subscriber->safe_psql('postgres', "SELECT count(*), min(a), max(a) FROM tab_full"); -is($result, qq(11|0|100), 'check replicated insert after alter publication'); +is($result, qq(21|0|100), 'check replicated insert after alter publication'); # check restart on rename $oldpid = $node_publisher->safe_psql('postgres', @@ -189,6 +201,14 @@ $result = $node_publisher->safe_psql('postgres', "SELECT count(*) FROM pg_replication_slots"); is($result, qq(0), 'check replication slot was dropped on publisher'); +$result = + $node_subscriber->safe_psql('postgres', "SELECT count(*) FROM pg_subscription_rel"); +is($result, qq(0), 'check subscription relation status was dropped on subscriber'); + +$result = + $node_publisher->safe_psql('postgres', "SELECT count(*) FROM pg_replication_slots"); +is($result, qq(0), 'check replication slot was dropped on publisher'); + $result = $node_subscriber->safe_psql('postgres', "SELECT count(*) FROM pg_replication_origin"); is($result, qq(0), 'check replication origin was dropped on subscriber'); diff --git a/src/test/subscription/t/002_types.pl b/src/test/subscription/t/002_types.pl index f44e1e671d..ad15e85c0c 100644 --- a/src/test/subscription/t/002_types.pl +++ b/src/test/subscription/t/002_types.pl @@ -111,6 +111,12 @@ my $caughtup_query = $node_publisher->poll_query_until('postgres', $caughtup_query) or die "Timed out while waiting for subscriber to catch up"; +# Wait for initial sync to finish as well +my $synced_query = +"SELECT count(1) = 0 FROM 
pg_subscription_rel WHERE srsubstate NOT IN ('s', 'r');"; +$node_subscriber->poll_query_until('postgres', $synced_query) + or die "Timed out while waiting for subscriber to synchronize data"; + # Insert initial test data $node_publisher->safe_psql('postgres', qq( -- test_tbl_one_array_col diff --git a/src/test/subscription/t/003_constraints.pl b/src/test/subscription/t/003_constraints.pl index b785132f5b..11b8254155 100644 --- a/src/test/subscription/t/003_constraints.pl +++ b/src/test/subscription/t/003_constraints.pl @@ -34,7 +34,7 @@ $node_publisher->safe_psql('postgres', my $appname = 'tap_sub'; $node_subscriber->safe_psql('postgres', - "CREATE SUBSCRIPTION tap_sub CONNECTION '$publisher_connstr application_name=$appname' PUBLICATION tap_pub;"); + "CREATE SUBSCRIPTION tap_sub CONNECTION '$publisher_connstr application_name=$appname' PUBLICATION tap_pub WITH (NOCOPY DATA)"); # Wait for subscriber to finish initialization my $caughtup_query = diff --git a/src/test/subscription/t/004_sync.pl b/src/test/subscription/t/004_sync.pl new file mode 100644 index 0000000000..87541a0e6e --- /dev/null +++ b/src/test/subscription/t/004_sync.pl @@ -0,0 +1,159 @@ +# Tests for logical replication table syncing +use strict; +use warnings; +use PostgresNode; +use TestLib; +use Test::More tests => 7; + +# Initialize publisher node +my $node_publisher = get_new_node('publisher'); +$node_publisher->init(allows_streaming => 'logical'); +$node_publisher->start; + +# Create subscriber node +my $node_subscriber = get_new_node('subscriber'); +$node_subscriber->init(allows_streaming => 'logical'); +$node_subscriber->start; + +# Create some preexisting content on publisher +$node_publisher->safe_psql('postgres', + "CREATE TABLE tab_rep (a int primary key)"); +$node_publisher->safe_psql('postgres', + "INSERT INTO tab_rep SELECT generate_series(1,10)"); + +# Setup structure on subscriber +$node_subscriber->safe_psql('postgres', + "CREATE TABLE tab_rep (a int primary key)"); + +# Setup logical replication +my $publisher_connstr = $node_publisher->connstr . 
' dbname=postgres'; +$node_publisher->safe_psql('postgres', + "CREATE PUBLICATION tap_pub FOR ALL TABLES"); + +my $appname = 'tap_sub'; +$node_subscriber->safe_psql('postgres', + "CREATE SUBSCRIPTION tap_sub CONNECTION '$publisher_connstr application_name=$appname' PUBLICATION tap_pub"); + +# Wait for subscriber to finish initialization +my $caughtup_query = +"SELECT pg_current_wal_location() <= replay_location FROM pg_stat_replication WHERE application_name = '$appname';"; +$node_publisher->poll_query_until('postgres', $caughtup_query) + or die "Timed out while waiting for subscriber to catch up"; + +# Also wait for initial table sync to finish +my $synced_query = +"SELECT count(1) = 0 FROM pg_subscription_rel WHERE srsubstate NOT IN ('r', 's');"; +$node_subscriber->poll_query_until('postgres', $synced_query) + or die "Timed out while waiting for subscriber to synchronize data"; + +my $result = + $node_subscriber->safe_psql('postgres', "SELECT count(*) FROM tab_rep"); +is($result, qq(10), 'initial data synced for first sub'); + +# drop subscription so that there is unreplicated data +$node_subscriber->safe_psql('postgres', "DROP SUBSCRIPTION tap_sub"); + +$node_publisher->safe_psql('postgres', + "INSERT INTO tab_rep SELECT generate_series(11,20)"); + +# recreate the subscription, it will try to do initial copy +$node_subscriber->safe_psql('postgres', + "CREATE SUBSCRIPTION tap_sub CONNECTION '$publisher_connstr application_name=$appname' PUBLICATION tap_pub"); + +# but it will be stuck on data copy as it will fail on constraint +my $started_query = +"SELECT srsubstate = 'd' FROM pg_subscription_rel;"; +$node_subscriber->poll_query_until('postgres', $started_query) + or die "Timed out while waiting for subscriber to start sync"; + +# remove the conflicting data +$node_subscriber->safe_psql('postgres', + "DELETE FROM tab_rep;"); + +# wait for sync to finish this time +$node_subscriber->poll_query_until('postgres', $synced_query) + or die "Timed out while waiting for subscriber to synchronize data"; + +# check that all data is synced +$result = + $node_subscriber->safe_psql('postgres', "SELECT count(*) FROM tab_rep"); +is($result, qq(20), 'initial data synced for second sub'); + +# now check another subscription for the same node pair +$node_subscriber->safe_psql('postgres', + "CREATE SUBSCRIPTION tap_sub2 CONNECTION '$publisher_connstr application_name=$appname' PUBLICATION tap_pub WITH (NOCOPY DATA)"); + +# wait for it to start +$node_subscriber->poll_query_until('postgres', "SELECT pid IS NOT NULL FROM pg_stat_subscription WHERE subname = 'tap_sub2' AND relid IS NULL") + or die "Timed out while waiting for subscriber to start"; + +# and drop both subscriptions +$node_subscriber->safe_psql('postgres', "DROP SUBSCRIPTION tap_sub"); +$node_subscriber->safe_psql('postgres', "DROP SUBSCRIPTION tap_sub2"); + +# check subscriptions are removed +$result = + $node_subscriber->safe_psql('postgres', "SELECT count(*) FROM pg_subscription"); +is($result, qq(0), 'second and third sub are dropped'); + +# remove the conflicting data +$node_subscriber->safe_psql('postgres', + "DELETE FROM tab_rep;"); + +# recreate the subscription again +$node_subscriber->safe_psql('postgres', + "CREATE SUBSCRIPTION tap_sub CONNECTION '$publisher_connstr application_name=$appname' PUBLICATION tap_pub"); + +# and wait for data sync to finish again +$node_subscriber->poll_query_until('postgres', $synced_query) + or die "Timed out while waiting for subscriber to synchronize data"; + +# check that all data is synced +$result 
=
+  $node_subscriber->safe_psql('postgres', "SELECT count(*) FROM tab_rep");
+is($result, qq(20), 'initial data synced for fourth sub');
+
+# add new table on subscriber
+$node_subscriber->safe_psql('postgres',
+	"CREATE TABLE tab_rep_next (a int)");
+
+# setup structure with existing data on publisher
+$node_publisher->safe_psql('postgres',
+	"CREATE TABLE tab_rep_next (a) AS SELECT generate_series(1,10)");
+
+# Wait for subscription to catch up
+$node_publisher->poll_query_until('postgres', $caughtup_query)
+  or die "Timed out while waiting for subscriber to catch up";
+
+$result =
+  $node_subscriber->safe_psql('postgres', "SELECT count(*) FROM tab_rep_next");
+is($result, qq(0), 'no data for table added after subscription initialized');
+
+# ask for data sync
+$node_subscriber->safe_psql('postgres',
+	"ALTER SUBSCRIPTION tap_sub REFRESH PUBLICATION");
+
+# wait for sync to finish
+$node_subscriber->poll_query_until('postgres', $synced_query)
+  or die "Timed out while waiting for subscriber to synchronize data";
+
+$result =
+  $node_subscriber->safe_psql('postgres', "SELECT count(*) FROM tab_rep_next");
+is($result, qq(10), 'data for table added after subscription initialized are now synced');
+
+# Add some data
+$node_publisher->safe_psql('postgres',
+	"INSERT INTO tab_rep_next SELECT generate_series(1,10)");
+
+# Wait for subscription to catch up
+$node_publisher->poll_query_until('postgres', $caughtup_query)
+  or die "Timed out while waiting for subscriber to catch up";
+
+$result =
+  $node_subscriber->safe_psql('postgres', "SELECT count(*) FROM tab_rep_next");
+is($result, qq(20), 'changes for table added after subscription initialized replicated');
+
+$node_subscriber->safe_psql('postgres', "DROP SUBSCRIPTION tap_sub");
+
+$node_subscriber->stop('fast');
+$node_publisher->stop('fast');
-- 
2.40.0
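
A usage note on the copy.h hunk above (this note and the sketch below are not part of the patch): the new copy_data_source_cb hook lets BeginCopyFrom() pull COPY text from an arbitrary callback instead of a file or program, and the newly exported CopyFrom() then writes those rows into the target table. The following is a minimal, hypothetical sketch of how a caller could wire that up; read_from_buffer_cb, copy_from_callback, and the pending_data buffer are invented names for illustration, while copy_data_source_cb, BeginCopyFrom(), CopyFrom(), and EndCopyFrom() are the interfaces declared in the modified src/include/commands/copy.h.

#include "postgres.h"

#include "commands/copy.h"
#include "nodes/pg_list.h"
#include "utils/rel.h"

/* Pretend this buffer holds COPY text that arrived from somewhere else. */
static const char *pending_data = "1\n2\n3\n";
static int	pending_off = 0;

/*
 * copy_data_source_cb implementation: place up to maxread bytes into outbuf
 * and return how many bytes were written; 0 is used here to mean "no more
 * data".  (minread, the minimum the caller wants, is ignored in this toy.)
 */
static int
read_from_buffer_cb(void *outbuf, int minread, int maxread)
{
	int			avail = (int) strlen(pending_data) - pending_off;
	int			nbytes = Min(avail, maxread);

	if (nbytes <= 0)
		return 0;

	memcpy(outbuf, pending_data + pending_off, nbytes);
	pending_off += nbytes;
	return nbytes;
}

/*
 * Feed the callback's data into an already opened and locked relation via
 * the regular COPY FROM machinery (transaction and snapshot setup omitted).
 */
static void
copy_from_callback(Relation rel)
{
	CopyState	cstate;

	/* NULL filename plus a callback selects the callback data source. */
	cstate = BeginCopyFrom(NULL, rel, NULL, false,
						   read_from_buffer_cb, NIL, NIL);
	(void) CopyFrom(cstate);
	EndCopyFrom(cstate);
}

Driving CopyFrom() through a callback keeps the existing COPY fast path intact while allowing the data to originate from a network stream rather than a local file.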