From: Robert Haas Date: Mon, 3 Mar 2014 21:32:18 +0000 (-0500) Subject: Introduce logical decoding. X-Git-Tag: REL9_4_BETA1~397 X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=b89e151;p=postgresql Introduce logical decoding. This feature, building on previous commits, allows the write-ahead log stream to be decoded into a series of logical changes; that is, inserts, updates, and deletes and the transactions which contain them. It is capable of handling decoding even across changes to the schema of the effected tables. The output format is controlled by a so-called "output plugin"; an example is included. To make use of this in a real replication system, the output plugin will need to be modified to produce output in the format appropriate to that system, and to perform filtering. Currently, information can be extracted from the logical decoding system only via SQL; future commits will add the ability to stream changes via walsender. Andres Freund, with review and other contributions from many other people, including Álvaro Herrera, Abhijit Menon-Sen, Peter Gheogegan, Kevin Grittner, Robert Haas, Heikki Linnakangas, Fujii Masao, Abhijit Menon-Sen, Michael Paquier, Simon Riggs, Craig Ringer, and Steve Singer. --- diff --git a/contrib/Makefile b/contrib/Makefile index c90fe29222..8dc40f7de0 100644 --- a/contrib/Makefile +++ b/contrib/Makefile @@ -50,6 +50,7 @@ SUBDIRS = \ spi \ tablefunc \ tcn \ + test_decoding \ test_parser \ test_shm_mq \ tsearch2 \ diff --git a/contrib/test_decoding/.gitignore b/contrib/test_decoding/.gitignore new file mode 100644 index 0000000000..5dcb3ff972 --- /dev/null +++ b/contrib/test_decoding/.gitignore @@ -0,0 +1,4 @@ +# Generated subdirectories +/log/ +/results/ +/tmp_check/ diff --git a/contrib/test_decoding/Makefile b/contrib/test_decoding/Makefile new file mode 100644 index 0000000000..c193f73786 --- /dev/null +++ b/contrib/test_decoding/Makefile @@ -0,0 +1,69 @@ +# contrib/test_decoding/Makefile + +MODULES = test_decoding +OBJS = test_decoding.o + +# Note: because we don't tell the Makefile there are any regression tests, +# we have to clean those result files explicitly +EXTRA_CLEAN = -r $(pg_regress_clean_files) + +ifdef USE_PGXS +PG_CONFIG = pg_config +PGXS := $(shell $(PG_CONFIG) --pgxs) +include $(PGXS) +else +subdir = contrib/test_decoding +top_builddir = ../.. +include $(top_builddir)/src/Makefile.global +include $(top_srcdir)/contrib/contrib-global.mk +endif + +# Disabled because these tests require "wal_level=logical", which +# typical installcheck users do not have (e.g. buildfarm clients). +installcheck:; + +# But it can nonetheless be very helpful to run tests on preexisting +# installation, allow to do so, but only if requested explicitly. +installcheck-force: regresscheck-install-force isolationcheck-install-force + +check: regresscheck isolationcheck + +submake-regress: + $(MAKE) -C $(top_builddir)/src/test/regress all + +submake-isolation: + $(MAKE) -C $(top_builddir)/src/test/isolation all + +submake-test_decoding: + $(MAKE) -C $(top_builddir)/contrib/test_decoding + +REGRESSCHECKS=ddl rewrite toast permissions decoding_in_xact binary + +regresscheck: all | submake-regress submake-test_decoding + $(pg_regress_check) \ + --temp-config $(top_srcdir)/contrib/test_decoding/logical.conf \ + --temp-install=./tmp_check \ + --extra-install=contrib/test_decoding \ + $(REGRESSCHECKS) + +regresscheck-install-force: | submake-regress submake-test_decoding + $(pg_regress_installcheck) \ + --extra-install=contrib/test_decoding \ + $(REGRESSCHECKS) + +ISOLATIONCHECKS=mxact delayed_startup concurrent_ddl_dml + +isolationcheck: all | submake-isolation submake-test_decoding + $(pg_isolation_regress_check) \ + --temp-config $(top_srcdir)/contrib/test_decoding/logical.conf \ + --extra-install=contrib/test_decoding \ + $(ISOLATIONCHECKS) + +isolationcheck-install-force: all | submake-isolation submake-test_decoding + $(pg_isolation_regress_installcheck) \ + --extra-install=contrib/test_decoding \ + $(ISOLATIONCHECKS) + +PHONY: submake-test_decoding submake-regress check \ + regresscheck regresscheck-install-force \ + isolationcheck isolationcheck-install-force diff --git a/contrib/test_decoding/expected/binary.out b/contrib/test_decoding/expected/binary.out new file mode 100644 index 0000000000..3409d9d426 --- /dev/null +++ b/contrib/test_decoding/expected/binary.out @@ -0,0 +1,35 @@ +-- predictability +SET synchronous_commit = on; +SELECT 'init' FROM pg_create_logical_replication_slot('regression_slot', 'test_decoding'); + ?column? +---------- + init +(1 row) + +-- succeeds, textual plugin, textual consumer +SELECT data FROM pg_logical_slot_get_changes('regression_slot', NULL, NULL, 'force-binary', '0'); + data +------ +(0 rows) + +-- fails, binary plugin, textual consumer +SELECT data FROM pg_logical_slot_get_changes('regression_slot', NULL, NULL, 'force-binary', '1'); +ERROR: output plugin cannot produce text output +-- succeeds, textual plugin, binary consumer +SELECT data FROM pg_logical_slot_get_binary_changes('regression_slot', NULL, NULL, 'force-binary', '0'); + data +------ +(0 rows) + +-- succeeds, binary plugin, binary consumer +SELECT data FROM pg_logical_slot_get_binary_changes('regression_slot', NULL, NULL, 'force-binary', '1'); + data +------ +(0 rows) + +SELECT 'init' FROM pg_drop_replication_slot('regression_slot'); + ?column? +---------- + init +(1 row) + diff --git a/contrib/test_decoding/expected/concurrent_ddl_dml.out b/contrib/test_decoding/expected/concurrent_ddl_dml.out new file mode 100644 index 0000000000..cc9165655f --- /dev/null +++ b/contrib/test_decoding/expected/concurrent_ddl_dml.out @@ -0,0 +1,733 @@ +Parsed test spec with 2 sessions + +starting permutation: s1_init s1_begin s1_insert_tbl1 s2_alter_tbl2_float s1_insert_tbl2 s1_commit s2_get_changes +step s1_init: SELECT 'init' FROM pg_create_logical_replication_slot('isolation_slot', 'test_decoding'); +?column? + +init +step s1_begin: BEGIN; +step s1_insert_tbl1: INSERT INTO tbl1 (val1, val2) VALUES (1, 1); +step s2_alter_tbl2_float: ALTER TABLE tbl2 ALTER COLUMN val2 TYPE float; +step s1_insert_tbl2: INSERT INTO tbl2 (val1, val2) VALUES (1, 1); +step s1_commit: COMMIT; +step s2_get_changes: SELECT regexp_replace(data, 'temp_\d+', 'temp') AS data FROM pg_logical_slot_get_changes('isolation_slot', NULL, NULL, 'include-xids', '0'); +data + +BEGIN +COMMIT +BEGIN +table public.tbl1: INSERT: val1[integer]:1 val2[integer]:1 +table public.tbl2: INSERT: val1[integer]:1 val2[double precision]:1 +COMMIT +?column? + +stop + +starting permutation: s1_init s1_begin s1_insert_tbl1 s2_alter_tbl1_float s1_insert_tbl2 s1_commit s2_get_changes +step s1_init: SELECT 'init' FROM pg_create_logical_replication_slot('isolation_slot', 'test_decoding'); +?column? + +init +step s1_begin: BEGIN; +step s1_insert_tbl1: INSERT INTO tbl1 (val1, val2) VALUES (1, 1); +step s2_alter_tbl1_float: ALTER TABLE tbl1 ALTER COLUMN val2 TYPE float; +step s1_insert_tbl2: INSERT INTO tbl2 (val1, val2) VALUES (1, 1); +step s1_commit: COMMIT; +step s2_alter_tbl1_float: <... completed> +step s2_get_changes: SELECT regexp_replace(data, 'temp_\d+', 'temp') AS data FROM pg_logical_slot_get_changes('isolation_slot', NULL, NULL, 'include-xids', '0'); +data + +BEGIN +table public.tbl1: INSERT: val1[integer]:1 val2[integer]:1 +table public.tbl2: INSERT: val1[integer]:1 val2[integer]:1 +COMMIT +BEGIN +table public.pg_temp: INSERT: val1[integer]:1 val2[double precision]:1 +COMMIT +?column? + +stop + +starting permutation: s1_init s1_begin s1_insert_tbl1 s2_alter_tbl2_char s1_insert_tbl2 s1_commit s2_get_changes +step s1_init: SELECT 'init' FROM pg_create_logical_replication_slot('isolation_slot', 'test_decoding'); +?column? + +init +step s1_begin: BEGIN; +step s1_insert_tbl1: INSERT INTO tbl1 (val1, val2) VALUES (1, 1); +step s2_alter_tbl2_char: ALTER TABLE tbl2 ALTER COLUMN val2 TYPE character varying; +step s1_insert_tbl2: INSERT INTO tbl2 (val1, val2) VALUES (1, 1); +step s1_commit: COMMIT; +step s2_get_changes: SELECT regexp_replace(data, 'temp_\d+', 'temp') AS data FROM pg_logical_slot_get_changes('isolation_slot', NULL, NULL, 'include-xids', '0'); +data + +BEGIN +COMMIT +BEGIN +table public.tbl1: INSERT: val1[integer]:1 val2[integer]:1 +table public.tbl2: INSERT: val1[integer]:1 val2[character varying]:'1' +COMMIT +?column? + +stop + +starting permutation: s1_init s1_begin s1_insert_tbl1 s2_alter_tbl1_char s1_insert_tbl2 s1_commit s2_get_changes +step s1_init: SELECT 'init' FROM pg_create_logical_replication_slot('isolation_slot', 'test_decoding'); +?column? + +init +step s1_begin: BEGIN; +step s1_insert_tbl1: INSERT INTO tbl1 (val1, val2) VALUES (1, 1); +step s2_alter_tbl1_char: ALTER TABLE tbl1 ALTER COLUMN val2 TYPE character varying; +step s1_insert_tbl2: INSERT INTO tbl2 (val1, val2) VALUES (1, 1); +step s1_commit: COMMIT; +step s2_alter_tbl1_char: <... completed> +step s2_get_changes: SELECT regexp_replace(data, 'temp_\d+', 'temp') AS data FROM pg_logical_slot_get_changes('isolation_slot', NULL, NULL, 'include-xids', '0'); +data + +BEGIN +table public.tbl1: INSERT: val1[integer]:1 val2[integer]:1 +table public.tbl2: INSERT: val1[integer]:1 val2[integer]:1 +COMMIT +BEGIN +table public.pg_temp: INSERT: val1[integer]:1 val2[character varying]:'1' +COMMIT +?column? + +stop + +starting permutation: s1_init s1_begin s1_insert_tbl1 s1_insert_tbl2 s2_alter_tbl1_float s1_commit s2_get_changes +step s1_init: SELECT 'init' FROM pg_create_logical_replication_slot('isolation_slot', 'test_decoding'); +?column? + +init +step s1_begin: BEGIN; +step s1_insert_tbl1: INSERT INTO tbl1 (val1, val2) VALUES (1, 1); +step s1_insert_tbl2: INSERT INTO tbl2 (val1, val2) VALUES (1, 1); +step s2_alter_tbl1_float: ALTER TABLE tbl1 ALTER COLUMN val2 TYPE float; +step s1_commit: COMMIT; +step s2_alter_tbl1_float: <... completed> +step s2_get_changes: SELECT regexp_replace(data, 'temp_\d+', 'temp') AS data FROM pg_logical_slot_get_changes('isolation_slot', NULL, NULL, 'include-xids', '0'); +data + +BEGIN +table public.tbl1: INSERT: val1[integer]:1 val2[integer]:1 +table public.tbl2: INSERT: val1[integer]:1 val2[integer]:1 +COMMIT +BEGIN +table public.pg_temp: INSERT: val1[integer]:1 val2[double precision]:1 +COMMIT +?column? + +stop + +starting permutation: s1_init s1_begin s1_insert_tbl1 s1_insert_tbl2 s2_alter_tbl1_char s1_commit s2_get_changes +step s1_init: SELECT 'init' FROM pg_create_logical_replication_slot('isolation_slot', 'test_decoding'); +?column? + +init +step s1_begin: BEGIN; +step s1_insert_tbl1: INSERT INTO tbl1 (val1, val2) VALUES (1, 1); +step s1_insert_tbl2: INSERT INTO tbl2 (val1, val2) VALUES (1, 1); +step s2_alter_tbl1_char: ALTER TABLE tbl1 ALTER COLUMN val2 TYPE character varying; +step s1_commit: COMMIT; +step s2_alter_tbl1_char: <... completed> +step s2_get_changes: SELECT regexp_replace(data, 'temp_\d+', 'temp') AS data FROM pg_logical_slot_get_changes('isolation_slot', NULL, NULL, 'include-xids', '0'); +data + +BEGIN +table public.tbl1: INSERT: val1[integer]:1 val2[integer]:1 +table public.tbl2: INSERT: val1[integer]:1 val2[integer]:1 +COMMIT +BEGIN +table public.pg_temp: INSERT: val1[integer]:1 val2[character varying]:'1' +COMMIT +?column? + +stop + +starting permutation: s1_init s1_begin s1_insert_tbl1 s2_alter_tbl2_float s1_insert_tbl2 s2_alter_tbl1_float s1_commit s2_get_changes +step s1_init: SELECT 'init' FROM pg_create_logical_replication_slot('isolation_slot', 'test_decoding'); +?column? + +init +step s1_begin: BEGIN; +step s1_insert_tbl1: INSERT INTO tbl1 (val1, val2) VALUES (1, 1); +step s2_alter_tbl2_float: ALTER TABLE tbl2 ALTER COLUMN val2 TYPE float; +step s1_insert_tbl2: INSERT INTO tbl2 (val1, val2) VALUES (1, 1); +step s2_alter_tbl1_float: ALTER TABLE tbl1 ALTER COLUMN val2 TYPE float; +step s1_commit: COMMIT; +step s2_alter_tbl1_float: <... completed> +step s2_get_changes: SELECT regexp_replace(data, 'temp_\d+', 'temp') AS data FROM pg_logical_slot_get_changes('isolation_slot', NULL, NULL, 'include-xids', '0'); +data + +BEGIN +COMMIT +BEGIN +table public.tbl1: INSERT: val1[integer]:1 val2[integer]:1 +table public.tbl2: INSERT: val1[integer]:1 val2[double precision]:1 +COMMIT +BEGIN +table public.pg_temp: INSERT: val1[integer]:1 val2[double precision]:1 +COMMIT +?column? + +stop + +starting permutation: s1_init s1_begin s1_insert_tbl1 s2_alter_tbl2_char s1_insert_tbl2 s2_alter_tbl1_char s1_commit s2_get_changes +step s1_init: SELECT 'init' FROM pg_create_logical_replication_slot('isolation_slot', 'test_decoding'); +?column? + +init +step s1_begin: BEGIN; +step s1_insert_tbl1: INSERT INTO tbl1 (val1, val2) VALUES (1, 1); +step s2_alter_tbl2_char: ALTER TABLE tbl2 ALTER COLUMN val2 TYPE character varying; +step s1_insert_tbl2: INSERT INTO tbl2 (val1, val2) VALUES (1, 1); +step s2_alter_tbl1_char: ALTER TABLE tbl1 ALTER COLUMN val2 TYPE character varying; +step s1_commit: COMMIT; +step s2_alter_tbl1_char: <... completed> +step s2_get_changes: SELECT regexp_replace(data, 'temp_\d+', 'temp') AS data FROM pg_logical_slot_get_changes('isolation_slot', NULL, NULL, 'include-xids', '0'); +data + +BEGIN +COMMIT +BEGIN +table public.tbl1: INSERT: val1[integer]:1 val2[integer]:1 +table public.tbl2: INSERT: val1[integer]:1 val2[character varying]:'1' +COMMIT +BEGIN +table public.pg_temp: INSERT: val1[integer]:1 val2[character varying]:'1' +COMMIT +?column? + +stop + +starting permutation: s1_init s2_alter_tbl2_char s1_begin s1_insert_tbl1 s2_alter_tbl2_text s1_insert_tbl2 s1_commit s2_get_changes +step s1_init: SELECT 'init' FROM pg_create_logical_replication_slot('isolation_slot', 'test_decoding'); +?column? + +init +step s2_alter_tbl2_char: ALTER TABLE tbl2 ALTER COLUMN val2 TYPE character varying; +step s1_begin: BEGIN; +step s1_insert_tbl1: INSERT INTO tbl1 (val1, val2) VALUES (1, 1); +step s2_alter_tbl2_text: ALTER TABLE tbl2 ALTER COLUMN val2 TYPE text; +step s1_insert_tbl2: INSERT INTO tbl2 (val1, val2) VALUES (1, 1); +step s1_commit: COMMIT; +step s2_get_changes: SELECT regexp_replace(data, 'temp_\d+', 'temp') AS data FROM pg_logical_slot_get_changes('isolation_slot', NULL, NULL, 'include-xids', '0'); +data + +BEGIN +COMMIT +BEGIN +COMMIT +BEGIN +table public.tbl1: INSERT: val1[integer]:1 val2[integer]:1 +table public.tbl2: INSERT: val1[integer]:1 val2[text]:'1' +COMMIT +?column? + +stop + +starting permutation: s1_init s2_alter_tbl2_char s1_begin s1_insert_tbl1 s2_alter_tbl2_text s1_insert_tbl2 s2_alter_tbl1_char s1_commit s2_get_changes +step s1_init: SELECT 'init' FROM pg_create_logical_replication_slot('isolation_slot', 'test_decoding'); +?column? + +init +step s2_alter_tbl2_char: ALTER TABLE tbl2 ALTER COLUMN val2 TYPE character varying; +step s1_begin: BEGIN; +step s1_insert_tbl1: INSERT INTO tbl1 (val1, val2) VALUES (1, 1); +step s2_alter_tbl2_text: ALTER TABLE tbl2 ALTER COLUMN val2 TYPE text; +step s1_insert_tbl2: INSERT INTO tbl2 (val1, val2) VALUES (1, 1); +step s2_alter_tbl1_char: ALTER TABLE tbl1 ALTER COLUMN val2 TYPE character varying; +step s1_commit: COMMIT; +step s2_alter_tbl1_char: <... completed> +step s2_get_changes: SELECT regexp_replace(data, 'temp_\d+', 'temp') AS data FROM pg_logical_slot_get_changes('isolation_slot', NULL, NULL, 'include-xids', '0'); +data + +BEGIN +COMMIT +BEGIN +COMMIT +BEGIN +table public.tbl1: INSERT: val1[integer]:1 val2[integer]:1 +table public.tbl2: INSERT: val1[integer]:1 val2[text]:'1' +COMMIT +BEGIN +table public.pg_temp: INSERT: val1[integer]:1 val2[character varying]:'1' +COMMIT +?column? + +stop + +starting permutation: s1_init s1_begin s1_insert_tbl1 s2_alter_tbl2_boolean s1_insert_tbl2 s1_commit s2_get_changes +step s1_init: SELECT 'init' FROM pg_create_logical_replication_slot('isolation_slot', 'test_decoding'); +?column? + +init +step s1_begin: BEGIN; +step s1_insert_tbl1: INSERT INTO tbl1 (val1, val2) VALUES (1, 1); +step s2_alter_tbl2_boolean: ALTER TABLE tbl2 ALTER COLUMN val2 TYPE boolean; +ERROR: column "val2" cannot be cast automatically to type boolean +step s1_insert_tbl2: INSERT INTO tbl2 (val1, val2) VALUES (1, 1); +step s1_commit: COMMIT; +step s2_get_changes: SELECT regexp_replace(data, 'temp_\d+', 'temp') AS data FROM pg_logical_slot_get_changes('isolation_slot', NULL, NULL, 'include-xids', '0'); +data + +BEGIN +table public.tbl1: INSERT: val1[integer]:1 val2[integer]:1 +table public.tbl2: INSERT: val1[integer]:1 val2[integer]:1 +COMMIT +?column? + +stop + +starting permutation: s1_init s1_begin s1_insert_tbl1 s2_alter_tbl2_boolean s1_insert_tbl2 s2_alter_tbl1_boolean s1_commit s2_get_changes +step s1_init: SELECT 'init' FROM pg_create_logical_replication_slot('isolation_slot', 'test_decoding'); +?column? + +init +step s1_begin: BEGIN; +step s1_insert_tbl1: INSERT INTO tbl1 (val1, val2) VALUES (1, 1); +step s2_alter_tbl2_boolean: ALTER TABLE tbl2 ALTER COLUMN val2 TYPE boolean; +ERROR: column "val2" cannot be cast automatically to type boolean +step s1_insert_tbl2: INSERT INTO tbl2 (val1, val2) VALUES (1, 1); +step s2_alter_tbl1_boolean: ALTER TABLE tbl1 ALTER COLUMN val2 TYPE boolean; +step s1_commit: COMMIT; +step s2_alter_tbl1_boolean: <... completed> +error in steps s1_commit s2_alter_tbl1_boolean: ERROR: column "val2" cannot be cast automatically to type boolean +step s2_get_changes: SELECT regexp_replace(data, 'temp_\d+', 'temp') AS data FROM pg_logical_slot_get_changes('isolation_slot', NULL, NULL, 'include-xids', '0'); +data + +BEGIN +table public.tbl1: INSERT: val1[integer]:1 val2[integer]:1 +table public.tbl2: INSERT: val1[integer]:1 val2[integer]:1 +COMMIT +?column? + +stop + +starting permutation: s1_init s1_begin s1_insert_tbl1 s2_alter_tbl2_add_int s1_insert_tbl2_3col s1_commit s2_get_changes +step s1_init: SELECT 'init' FROM pg_create_logical_replication_slot('isolation_slot', 'test_decoding'); +?column? + +init +step s1_begin: BEGIN; +step s1_insert_tbl1: INSERT INTO tbl1 (val1, val2) VALUES (1, 1); +step s2_alter_tbl2_add_int: ALTER TABLE tbl2 ADD COLUMN val3 INTEGER; +step s1_insert_tbl2_3col: INSERT INTO tbl2 (val1, val2, val3) VALUES (1, 1, 1); +step s1_commit: COMMIT; +step s2_get_changes: SELECT regexp_replace(data, 'temp_\d+', 'temp') AS data FROM pg_logical_slot_get_changes('isolation_slot', NULL, NULL, 'include-xids', '0'); +data + +BEGIN +COMMIT +BEGIN +table public.tbl1: INSERT: val1[integer]:1 val2[integer]:1 +table public.tbl2: INSERT: val1[integer]:1 val2[integer]:1 val3[integer]:1 +COMMIT +?column? + +stop + +starting permutation: s1_init s1_begin s1_insert_tbl1 s1_insert_tbl2 s1_commit s1_begin s2_alter_tbl2_add_int s1_insert_tbl2_3col s1_commit s2_get_changes +step s1_init: SELECT 'init' FROM pg_create_logical_replication_slot('isolation_slot', 'test_decoding'); +?column? + +init +step s1_begin: BEGIN; +step s1_insert_tbl1: INSERT INTO tbl1 (val1, val2) VALUES (1, 1); +step s1_insert_tbl2: INSERT INTO tbl2 (val1, val2) VALUES (1, 1); +step s1_commit: COMMIT; +step s1_begin: BEGIN; +step s2_alter_tbl2_add_int: ALTER TABLE tbl2 ADD COLUMN val3 INTEGER; +step s1_insert_tbl2_3col: INSERT INTO tbl2 (val1, val2, val3) VALUES (1, 1, 1); +step s1_commit: COMMIT; +step s2_get_changes: SELECT regexp_replace(data, 'temp_\d+', 'temp') AS data FROM pg_logical_slot_get_changes('isolation_slot', NULL, NULL, 'include-xids', '0'); +data + +BEGIN +table public.tbl1: INSERT: val1[integer]:1 val2[integer]:1 +table public.tbl2: INSERT: val1[integer]:1 val2[integer]:1 +COMMIT +BEGIN +COMMIT +BEGIN +table public.tbl2: INSERT: val1[integer]:1 val2[integer]:1 val3[integer]:1 +COMMIT +?column? + +stop + +starting permutation: s1_init s1_begin s1_insert_tbl1 s2_alter_tbl2_add_float s1_insert_tbl2_3col s1_commit s2_get_changes +step s1_init: SELECT 'init' FROM pg_create_logical_replication_slot('isolation_slot', 'test_decoding'); +?column? + +init +step s1_begin: BEGIN; +step s1_insert_tbl1: INSERT INTO tbl1 (val1, val2) VALUES (1, 1); +step s2_alter_tbl2_add_float: ALTER TABLE tbl2 ADD COLUMN val3 FLOAT; +step s1_insert_tbl2_3col: INSERT INTO tbl2 (val1, val2, val3) VALUES (1, 1, 1); +step s1_commit: COMMIT; +step s2_get_changes: SELECT regexp_replace(data, 'temp_\d+', 'temp') AS data FROM pg_logical_slot_get_changes('isolation_slot', NULL, NULL, 'include-xids', '0'); +data + +BEGIN +COMMIT +BEGIN +table public.tbl1: INSERT: val1[integer]:1 val2[integer]:1 +table public.tbl2: INSERT: val1[integer]:1 val2[integer]:1 val3[double precision]:1 +COMMIT +?column? + +stop + +starting permutation: s1_init s1_begin s1_insert_tbl1 s1_insert_tbl2 s1_commit s1_begin s2_alter_tbl2_add_float s1_insert_tbl2_3col s1_commit s2_get_changes +step s1_init: SELECT 'init' FROM pg_create_logical_replication_slot('isolation_slot', 'test_decoding'); +?column? + +init +step s1_begin: BEGIN; +step s1_insert_tbl1: INSERT INTO tbl1 (val1, val2) VALUES (1, 1); +step s1_insert_tbl2: INSERT INTO tbl2 (val1, val2) VALUES (1, 1); +step s1_commit: COMMIT; +step s1_begin: BEGIN; +step s2_alter_tbl2_add_float: ALTER TABLE tbl2 ADD COLUMN val3 FLOAT; +step s1_insert_tbl2_3col: INSERT INTO tbl2 (val1, val2, val3) VALUES (1, 1, 1); +step s1_commit: COMMIT; +step s2_get_changes: SELECT regexp_replace(data, 'temp_\d+', 'temp') AS data FROM pg_logical_slot_get_changes('isolation_slot', NULL, NULL, 'include-xids', '0'); +data + +BEGIN +table public.tbl1: INSERT: val1[integer]:1 val2[integer]:1 +table public.tbl2: INSERT: val1[integer]:1 val2[integer]:1 +COMMIT +BEGIN +COMMIT +BEGIN +table public.tbl2: INSERT: val1[integer]:1 val2[integer]:1 val3[double precision]:1 +COMMIT +?column? + +stop + +starting permutation: s1_init s1_begin s1_insert_tbl1 s2_alter_tbl2_add_char s1_insert_tbl2_3col s1_commit s2_get_changes +step s1_init: SELECT 'init' FROM pg_create_logical_replication_slot('isolation_slot', 'test_decoding'); +?column? + +init +step s1_begin: BEGIN; +step s1_insert_tbl1: INSERT INTO tbl1 (val1, val2) VALUES (1, 1); +step s2_alter_tbl2_add_char: ALTER TABLE tbl2 ADD COLUMN val3 character varying; +step s1_insert_tbl2_3col: INSERT INTO tbl2 (val1, val2, val3) VALUES (1, 1, 1); +step s1_commit: COMMIT; +step s2_get_changes: SELECT regexp_replace(data, 'temp_\d+', 'temp') AS data FROM pg_logical_slot_get_changes('isolation_slot', NULL, NULL, 'include-xids', '0'); +data + +BEGIN +COMMIT +BEGIN +table public.tbl1: INSERT: val1[integer]:1 val2[integer]:1 +table public.tbl2: INSERT: val1[integer]:1 val2[integer]:1 val3[character varying]:'1' +COMMIT +?column? + +stop + +starting permutation: s1_init s1_begin s1_insert_tbl1 s1_insert_tbl2 s1_commit s1_begin s2_alter_tbl2_add_char s1_insert_tbl2_3col s1_commit s2_get_changes +step s1_init: SELECT 'init' FROM pg_create_logical_replication_slot('isolation_slot', 'test_decoding'); +?column? + +init +step s1_begin: BEGIN; +step s1_insert_tbl1: INSERT INTO tbl1 (val1, val2) VALUES (1, 1); +step s1_insert_tbl2: INSERT INTO tbl2 (val1, val2) VALUES (1, 1); +step s1_commit: COMMIT; +step s1_begin: BEGIN; +step s2_alter_tbl2_add_char: ALTER TABLE tbl2 ADD COLUMN val3 character varying; +step s1_insert_tbl2_3col: INSERT INTO tbl2 (val1, val2, val3) VALUES (1, 1, 1); +step s1_commit: COMMIT; +step s2_get_changes: SELECT regexp_replace(data, 'temp_\d+', 'temp') AS data FROM pg_logical_slot_get_changes('isolation_slot', NULL, NULL, 'include-xids', '0'); +data + +BEGIN +table public.tbl1: INSERT: val1[integer]:1 val2[integer]:1 +table public.tbl2: INSERT: val1[integer]:1 val2[integer]:1 +COMMIT +BEGIN +COMMIT +BEGIN +table public.tbl2: INSERT: val1[integer]:1 val2[integer]:1 val3[character varying]:'1' +COMMIT +?column? + +stop + +starting permutation: s1_init s2_alter_tbl2_add_int s1_begin s1_insert_tbl2_3col s2_alter_tbl2_drop_3rd_col s1_commit s2_get_changes +step s1_init: SELECT 'init' FROM pg_create_logical_replication_slot('isolation_slot', 'test_decoding'); +?column? + +init +step s2_alter_tbl2_add_int: ALTER TABLE tbl2 ADD COLUMN val3 INTEGER; +step s1_begin: BEGIN; +step s1_insert_tbl2_3col: INSERT INTO tbl2 (val1, val2, val3) VALUES (1, 1, 1); +step s2_alter_tbl2_drop_3rd_col: ALTER TABLE tbl2 DROP COLUMN val3; +step s1_commit: COMMIT; +step s2_alter_tbl2_drop_3rd_col: <... completed> +step s2_get_changes: SELECT regexp_replace(data, 'temp_\d+', 'temp') AS data FROM pg_logical_slot_get_changes('isolation_slot', NULL, NULL, 'include-xids', '0'); +data + +BEGIN +COMMIT +BEGIN +table public.tbl2: INSERT: val1[integer]:1 val2[integer]:1 val3[integer]:1 +COMMIT +BEGIN +COMMIT +?column? + +stop + +starting permutation: s1_init s2_alter_tbl2_add_int s1_begin s1_insert_tbl2_3col s2_alter_tbl2_drop_3rd_col s1_insert_tbl2 s1_commit s1_insert_tbl2 s2_get_changes +step s1_init: SELECT 'init' FROM pg_create_logical_replication_slot('isolation_slot', 'test_decoding'); +?column? + +init +step s2_alter_tbl2_add_int: ALTER TABLE tbl2 ADD COLUMN val3 INTEGER; +step s1_begin: BEGIN; +step s1_insert_tbl2_3col: INSERT INTO tbl2 (val1, val2, val3) VALUES (1, 1, 1); +step s2_alter_tbl2_drop_3rd_col: ALTER TABLE tbl2 DROP COLUMN val3; +step s1_insert_tbl2: INSERT INTO tbl2 (val1, val2) VALUES (1, 1); +step s1_commit: COMMIT; +step s2_alter_tbl2_drop_3rd_col: <... completed> +step s1_insert_tbl2: INSERT INTO tbl2 (val1, val2) VALUES (1, 1); +step s2_get_changes: SELECT regexp_replace(data, 'temp_\d+', 'temp') AS data FROM pg_logical_slot_get_changes('isolation_slot', NULL, NULL, 'include-xids', '0'); +data + +BEGIN +COMMIT +BEGIN +table public.tbl2: INSERT: val1[integer]:1 val2[integer]:1 val3[integer]:1 +table public.tbl2: INSERT: val1[integer]:1 val2[integer]:1 val3[integer]:null +COMMIT +BEGIN +COMMIT +BEGIN +table public.tbl2: INSERT: val1[integer]:1 val2[integer]:1 +COMMIT +?column? + +stop + +starting permutation: s1_init s2_alter_tbl2_add_int s1_begin s1_insert_tbl2_3col s2_alter_tbl2_drop_3rd_col s1_commit s2_get_changes s2_alter_tbl2_add_text s1_begin s1_insert_tbl2_3col s2_alter_tbl2_3rd_char s1_insert_tbl2_3col s1_commit s2_get_changes s2_alter_tbl2_3rd_int s1_insert_tbl2_3col s2_get_changes +step s1_init: SELECT 'init' FROM pg_create_logical_replication_slot('isolation_slot', 'test_decoding'); +?column? + +init +step s2_alter_tbl2_add_int: ALTER TABLE tbl2 ADD COLUMN val3 INTEGER; +step s1_begin: BEGIN; +step s1_insert_tbl2_3col: INSERT INTO tbl2 (val1, val2, val3) VALUES (1, 1, 1); +step s2_alter_tbl2_drop_3rd_col: ALTER TABLE tbl2 DROP COLUMN val3; +step s1_commit: COMMIT; +step s2_alter_tbl2_drop_3rd_col: <... completed> +step s2_get_changes: SELECT regexp_replace(data, 'temp_\d+', 'temp') AS data FROM pg_logical_slot_get_changes('isolation_slot', NULL, NULL, 'include-xids', '0'); +data + +BEGIN +COMMIT +BEGIN +table public.tbl2: INSERT: val1[integer]:1 val2[integer]:1 val3[integer]:1 +COMMIT +BEGIN +COMMIT +step s2_alter_tbl2_add_text: ALTER TABLE tbl2 ADD COLUMN val3 TEXT; +step s1_begin: BEGIN; +step s1_insert_tbl2_3col: INSERT INTO tbl2 (val1, val2, val3) VALUES (1, 1, 1); +step s2_alter_tbl2_3rd_char: ALTER TABLE tbl2 ALTER COLUMN val3 TYPE character varying; +step s1_insert_tbl2_3col: INSERT INTO tbl2 (val1, val2, val3) VALUES (1, 1, 1); +step s1_commit: COMMIT; +step s2_alter_tbl2_3rd_char: <... completed> +step s2_get_changes: SELECT regexp_replace(data, 'temp_\d+', 'temp') AS data FROM pg_logical_slot_get_changes('isolation_slot', NULL, NULL, 'include-xids', '0'); +data + +BEGIN +COMMIT +BEGIN +table public.tbl2: INSERT: val1[integer]:1 val2[integer]:1 val3[text]:'1' +table public.tbl2: INSERT: val1[integer]:1 val2[integer]:1 val3[text]:'1' +COMMIT +BEGIN +COMMIT +step s2_alter_tbl2_3rd_int: ALTER TABLE tbl2 ALTER COLUMN val3 TYPE int USING val3::integer; +step s1_insert_tbl2_3col: INSERT INTO tbl2 (val1, val2, val3) VALUES (1, 1, 1); +step s2_get_changes: SELECT regexp_replace(data, 'temp_\d+', 'temp') AS data FROM pg_logical_slot_get_changes('isolation_slot', NULL, NULL, 'include-xids', '0'); +data + +BEGIN +table public.pg_temp: INSERT: val1[integer]:1 val2[integer]:1 val3[integer]:null +table public.pg_temp: INSERT: val1[integer]:1 val2[integer]:1 val3[integer]:1 +table public.pg_temp: INSERT: val1[integer]:1 val2[integer]:1 val3[integer]:1 +COMMIT +BEGIN +table public.tbl2: INSERT: val1[integer]:1 val2[integer]:1 val3[integer]:1 +COMMIT +?column? + +stop + +starting permutation: s1_init s2_alter_tbl2_add_char s1_begin s1_insert_tbl1 s1_insert_tbl2_3col s2_alter_tbl2_3rd_text s1_insert_tbl2_3col s1_commit s1_insert_tbl2_3col s2_get_changes +step s1_init: SELECT 'init' FROM pg_create_logical_replication_slot('isolation_slot', 'test_decoding'); +?column? + +init +step s2_alter_tbl2_add_char: ALTER TABLE tbl2 ADD COLUMN val3 character varying; +step s1_begin: BEGIN; +step s1_insert_tbl1: INSERT INTO tbl1 (val1, val2) VALUES (1, 1); +step s1_insert_tbl2_3col: INSERT INTO tbl2 (val1, val2, val3) VALUES (1, 1, 1); +step s2_alter_tbl2_3rd_text: ALTER TABLE tbl2 ALTER COLUMN val3 TYPE text; +step s1_insert_tbl2_3col: INSERT INTO tbl2 (val1, val2, val3) VALUES (1, 1, 1); +step s1_commit: COMMIT; +step s2_alter_tbl2_3rd_text: <... completed> +step s1_insert_tbl2_3col: INSERT INTO tbl2 (val1, val2, val3) VALUES (1, 1, 1); +step s2_get_changes: SELECT regexp_replace(data, 'temp_\d+', 'temp') AS data FROM pg_logical_slot_get_changes('isolation_slot', NULL, NULL, 'include-xids', '0'); +data + +BEGIN +COMMIT +BEGIN +table public.tbl1: INSERT: val1[integer]:1 val2[integer]:1 +table public.tbl2: INSERT: val1[integer]:1 val2[integer]:1 val3[character varying]:'1' +table public.tbl2: INSERT: val1[integer]:1 val2[integer]:1 val3[character varying]:'1' +COMMIT +BEGIN +COMMIT +BEGIN +table public.tbl2: INSERT: val1[integer]:1 val2[integer]:1 val3[text]:'1' +COMMIT +?column? + +stop + +starting permutation: s1_init s2_alter_tbl2_add_text s1_begin s1_insert_tbl1 s1_insert_tbl2_3col s2_alter_tbl2_3rd_char s1_insert_tbl2_3col s1_commit s1_insert_tbl2_3col s2_get_changes +step s1_init: SELECT 'init' FROM pg_create_logical_replication_slot('isolation_slot', 'test_decoding'); +?column? + +init +step s2_alter_tbl2_add_text: ALTER TABLE tbl2 ADD COLUMN val3 TEXT; +step s1_begin: BEGIN; +step s1_insert_tbl1: INSERT INTO tbl1 (val1, val2) VALUES (1, 1); +step s1_insert_tbl2_3col: INSERT INTO tbl2 (val1, val2, val3) VALUES (1, 1, 1); +step s2_alter_tbl2_3rd_char: ALTER TABLE tbl2 ALTER COLUMN val3 TYPE character varying; +step s1_insert_tbl2_3col: INSERT INTO tbl2 (val1, val2, val3) VALUES (1, 1, 1); +step s1_commit: COMMIT; +step s2_alter_tbl2_3rd_char: <... completed> +step s1_insert_tbl2_3col: INSERT INTO tbl2 (val1, val2, val3) VALUES (1, 1, 1); +step s2_get_changes: SELECT regexp_replace(data, 'temp_\d+', 'temp') AS data FROM pg_logical_slot_get_changes('isolation_slot', NULL, NULL, 'include-xids', '0'); +data + +BEGIN +COMMIT +BEGIN +table public.tbl1: INSERT: val1[integer]:1 val2[integer]:1 +table public.tbl2: INSERT: val1[integer]:1 val2[integer]:1 val3[text]:'1' +table public.tbl2: INSERT: val1[integer]:1 val2[integer]:1 val3[text]:'1' +COMMIT +BEGIN +COMMIT +BEGIN +table public.tbl2: INSERT: val1[integer]:1 val2[integer]:1 val3[character varying]:'1' +COMMIT +?column? + +stop + +starting permutation: s1_init s2_alter_tbl2_add_char s1_begin s1_insert_tbl1 s2_alter_tbl2_3rd_text s1_insert_tbl2_3col s1_commit s2_alter_tbl2_drop_3rd_col s1_insert_tbl2 s2_get_changes +step s1_init: SELECT 'init' FROM pg_create_logical_replication_slot('isolation_slot', 'test_decoding'); +?column? + +init +step s2_alter_tbl2_add_char: ALTER TABLE tbl2 ADD COLUMN val3 character varying; +step s1_begin: BEGIN; +step s1_insert_tbl1: INSERT INTO tbl1 (val1, val2) VALUES (1, 1); +step s2_alter_tbl2_3rd_text: ALTER TABLE tbl2 ALTER COLUMN val3 TYPE text; +step s1_insert_tbl2_3col: INSERT INTO tbl2 (val1, val2, val3) VALUES (1, 1, 1); +step s1_commit: COMMIT; +step s2_alter_tbl2_drop_3rd_col: ALTER TABLE tbl2 DROP COLUMN val3; +step s1_insert_tbl2: INSERT INTO tbl2 (val1, val2) VALUES (1, 1); +step s2_get_changes: SELECT regexp_replace(data, 'temp_\d+', 'temp') AS data FROM pg_logical_slot_get_changes('isolation_slot', NULL, NULL, 'include-xids', '0'); +data + +BEGIN +COMMIT +BEGIN +COMMIT +BEGIN +table public.tbl1: INSERT: val1[integer]:1 val2[integer]:1 +table public.tbl2: INSERT: val1[integer]:1 val2[integer]:1 val3[text]:'1' +COMMIT +BEGIN +COMMIT +BEGIN +table public.tbl2: INSERT: val1[integer]:1 val2[integer]:1 +COMMIT +?column? + +stop + +starting permutation: s1_init s2_alter_tbl2_add_text s1_begin s1_insert_tbl1 s2_alter_tbl2_3rd_char s1_insert_tbl2_3col s1_commit s2_alter_tbl2_drop_3rd_col s1_insert_tbl2 s2_get_changes +step s1_init: SELECT 'init' FROM pg_create_logical_replication_slot('isolation_slot', 'test_decoding'); +?column? + +init +step s2_alter_tbl2_add_text: ALTER TABLE tbl2 ADD COLUMN val3 TEXT; +step s1_begin: BEGIN; +step s1_insert_tbl1: INSERT INTO tbl1 (val1, val2) VALUES (1, 1); +step s2_alter_tbl2_3rd_char: ALTER TABLE tbl2 ALTER COLUMN val3 TYPE character varying; +step s1_insert_tbl2_3col: INSERT INTO tbl2 (val1, val2, val3) VALUES (1, 1, 1); +step s1_commit: COMMIT; +step s2_alter_tbl2_drop_3rd_col: ALTER TABLE tbl2 DROP COLUMN val3; +step s1_insert_tbl2: INSERT INTO tbl2 (val1, val2) VALUES (1, 1); +step s2_get_changes: SELECT regexp_replace(data, 'temp_\d+', 'temp') AS data FROM pg_logical_slot_get_changes('isolation_slot', NULL, NULL, 'include-xids', '0'); +data + +BEGIN +COMMIT +BEGIN +COMMIT +BEGIN +table public.tbl1: INSERT: val1[integer]:1 val2[integer]:1 +table public.tbl2: INSERT: val1[integer]:1 val2[integer]:1 val3[character varying]:'1' +COMMIT +BEGIN +COMMIT +BEGIN +table public.tbl2: INSERT: val1[integer]:1 val2[integer]:1 +COMMIT +?column? + +stop + +starting permutation: s1_init s2_alter_tbl2_add_char s1_begin s1_insert_tbl1 s2_alter_tbl2_drop_3rd_col s1_insert_tbl1 s1_commit s2_get_changes +step s1_init: SELECT 'init' FROM pg_create_logical_replication_slot('isolation_slot', 'test_decoding'); +?column? + +init +step s2_alter_tbl2_add_char: ALTER TABLE tbl2 ADD COLUMN val3 character varying; +step s1_begin: BEGIN; +step s1_insert_tbl1: INSERT INTO tbl1 (val1, val2) VALUES (1, 1); +step s2_alter_tbl2_drop_3rd_col: ALTER TABLE tbl2 DROP COLUMN val3; +step s1_insert_tbl1: INSERT INTO tbl1 (val1, val2) VALUES (1, 1); +step s1_commit: COMMIT; +step s2_get_changes: SELECT regexp_replace(data, 'temp_\d+', 'temp') AS data FROM pg_logical_slot_get_changes('isolation_slot', NULL, NULL, 'include-xids', '0'); +data + +BEGIN +COMMIT +BEGIN +COMMIT +BEGIN +table public.tbl1: INSERT: val1[integer]:1 val2[integer]:1 +table public.tbl1: INSERT: val1[integer]:1 val2[integer]:1 +COMMIT +?column? + +stop diff --git a/contrib/test_decoding/expected/ddl.out b/contrib/test_decoding/expected/ddl.out new file mode 100644 index 0000000000..986f504eff --- /dev/null +++ b/contrib/test_decoding/expected/ddl.out @@ -0,0 +1,651 @@ +-- predictability +SET synchronous_commit = on; +SELECT 'init' FROM pg_create_logical_replication_slot('regression_slot', 'test_decoding'); + ?column? +---------- + init +(1 row) + +-- fail because of an already existing slot +SELECT 'init' FROM pg_create_logical_replication_slot('regression_slot', 'test_decoding'); +ERROR: replication slot "regression_slot" already exists +-- fail because of an invalid name +SELECT 'init' FROM pg_create_logical_replication_slot('Invalid Name', 'test_decoding'); +ERROR: replication slot name "Invalid Name" contains invalid character +HINT: Replication slot names may only contain letters, numbers and the underscore character. +-- fail twice because of an invalid parameter values +SELECT 'init' FROM pg_logical_slot_get_changes('regression_slot', NULL, NULL, 'include-xids', 'frakbar'); +ERROR: could not parse value "frakbar" for parameter "include-xids" +CONTEXT: slot "regression_slot", output plugin "test_decoding", in the startup callback +SELECT 'init' FROM pg_logical_slot_get_changes('regression_slot', NULL, NULL, 'nonexistant-option', 'frakbar'); +ERROR: option "nonexistant-option" = "frakbar" is unknown +CONTEXT: slot "regression_slot", output plugin "test_decoding", in the startup callback +SELECT 'init' FROM pg_logical_slot_get_changes('regression_slot', NULL, NULL, 'include-xids', 'frakbar'); +ERROR: could not parse value "frakbar" for parameter "include-xids" +CONTEXT: slot "regression_slot", output plugin "test_decoding", in the startup callback +-- succeed once +SELECT pg_drop_replication_slot('regression_slot'); + pg_drop_replication_slot +-------------------------- + +(1 row) + +-- fail +SELECT pg_drop_replication_slot('regression_slot'); +ERROR: replication slot "regression_slot" does not exist +-- check that we're detecting a streaming rep slot used for logical decoding +SELECT 'init' FROM pg_create_physical_replication_slot('repl'); + ?column? +---------- + init +(1 row) + +SELECT data FROM pg_logical_slot_get_changes('repl', NULL, NULL, 'include-xids', '0'); +ERROR: cannot use physical replication slot for logical decoding +SELECT pg_drop_replication_slot('repl'); + pg_drop_replication_slot +-------------------------- + +(1 row) + +SELECT 'init' FROM pg_create_logical_replication_slot('regression_slot', 'test_decoding'); + ?column? +---------- + init +(1 row) + +/* check whether status function reports us, only reproduceable columns */ +SELECT slot_name, plugin, slot_type, active, + NOT catalog_xmin IS NULL AS catalog_xmin_set, + xmin IS NULl AS data_xmin_not_set, + pg_xlog_location_diff(restart_lsn, '0/01000000') > 0 AS some_wal +FROM pg_replication_slots; + slot_name | plugin | slot_type | active | catalog_xmin_set | data_xmin_not_set | some_wal +-----------------+---------------+-----------+--------+------------------+-------------------+---------- + regression_slot | test_decoding | logical | f | t | t | t +(1 row) + +/* + * Check that changes are handled correctly when interleaved with ddl + */ +CREATE TABLE replication_example(id SERIAL PRIMARY KEY, somedata int, text varchar(120)); +BEGIN; +INSERT INTO replication_example(somedata, text) VALUES (1, 1); +INSERT INTO replication_example(somedata, text) VALUES (1, 2); +COMMIT; +ALTER TABLE replication_example ADD COLUMN bar int; +INSERT INTO replication_example(somedata, text, bar) VALUES (2, 1, 4); +BEGIN; +INSERT INTO replication_example(somedata, text, bar) VALUES (2, 2, 4); +INSERT INTO replication_example(somedata, text, bar) VALUES (2, 3, 4); +INSERT INTO replication_example(somedata, text, bar) VALUES (2, 4, NULL); +COMMIT; +ALTER TABLE replication_example DROP COLUMN bar; +INSERT INTO replication_example(somedata, text) VALUES (3, 1); +BEGIN; +INSERT INTO replication_example(somedata, text) VALUES (3, 2); +INSERT INTO replication_example(somedata, text) VALUES (3, 3); +COMMIT; +ALTER TABLE replication_example RENAME COLUMN text TO somenum; +INSERT INTO replication_example(somedata, somenum) VALUES (4, 1); +-- collect all changes +SELECT data FROM pg_logical_slot_get_changes('regression_slot', NULL, NULL, 'include-xids', '0'); + data +--------------------------------------------------------------------------------------------------------------------------- + BEGIN + COMMIT + BEGIN + table public.replication_example: INSERT: id[integer]:1 somedata[integer]:1 text[character varying]:'1' + table public.replication_example: INSERT: id[integer]:2 somedata[integer]:1 text[character varying]:'2' + COMMIT + BEGIN + COMMIT + BEGIN + table public.replication_example: INSERT: id[integer]:3 somedata[integer]:2 text[character varying]:'1' bar[integer]:4 + COMMIT + BEGIN + table public.replication_example: INSERT: id[integer]:4 somedata[integer]:2 text[character varying]:'2' bar[integer]:4 + table public.replication_example: INSERT: id[integer]:5 somedata[integer]:2 text[character varying]:'3' bar[integer]:4 + table public.replication_example: INSERT: id[integer]:6 somedata[integer]:2 text[character varying]:'4' bar[integer]:null + COMMIT + BEGIN + COMMIT + BEGIN + table public.replication_example: INSERT: id[integer]:7 somedata[integer]:3 text[character varying]:'1' + COMMIT + BEGIN + table public.replication_example: INSERT: id[integer]:8 somedata[integer]:3 text[character varying]:'2' + table public.replication_example: INSERT: id[integer]:9 somedata[integer]:3 text[character varying]:'3' + COMMIT + BEGIN + COMMIT + BEGIN + table public.replication_example: INSERT: id[integer]:10 somedata[integer]:4 somenum[character varying]:'1' + COMMIT +(30 rows) + +ALTER TABLE replication_example ALTER COLUMN somenum TYPE int4 USING (somenum::int4); +-- throw away changes, they contain oids +SELECT count(data) FROM pg_logical_slot_get_changes('regression_slot', NULL, NULL, 'include-xids', '0'); + count +------- + 12 +(1 row) + +INSERT INTO replication_example(somedata, somenum) VALUES (5, 1); +BEGIN; +INSERT INTO replication_example(somedata, somenum) VALUES (6, 1); +ALTER TABLE replication_example ADD COLUMN zaphod1 int; +INSERT INTO replication_example(somedata, somenum, zaphod1) VALUES (6, 2, 1); +ALTER TABLE replication_example ADD COLUMN zaphod2 int; +INSERT INTO replication_example(somedata, somenum, zaphod2) VALUES (6, 3, 1); +INSERT INTO replication_example(somedata, somenum, zaphod1) VALUES (6, 4, 2); +COMMIT; +-- show changes +SELECT data FROM pg_logical_slot_get_changes('regression_slot', NULL, NULL, 'include-xids', '0'); + data +------------------------------------------------------------------------------------------------------------------------------------------ + BEGIN + table public.replication_example: INSERT: id[integer]:11 somedata[integer]:5 somenum[integer]:1 + COMMIT + BEGIN + table public.replication_example: INSERT: id[integer]:12 somedata[integer]:6 somenum[integer]:1 + table public.replication_example: INSERT: id[integer]:13 somedata[integer]:6 somenum[integer]:2 zaphod1[integer]:1 + table public.replication_example: INSERT: id[integer]:14 somedata[integer]:6 somenum[integer]:3 zaphod1[integer]:null zaphod2[integer]:1 + table public.replication_example: INSERT: id[integer]:15 somedata[integer]:6 somenum[integer]:4 zaphod1[integer]:2 zaphod2[integer]:null + COMMIT +(9 rows) + +-- hide changes bc of oid visible in full table rewrites +CREATE TABLE tr_unique(id2 serial unique NOT NULL, data int); +INSERT INTO tr_unique(data) VALUES(10); +ALTER TABLE tr_unique RENAME TO tr_pkey; +ALTER TABLE tr_pkey ADD COLUMN id serial primary key; +SELECT count(data) FROM pg_logical_slot_get_changes('regression_slot', NULL, NULL, 'include-xids', '0'); + count +------- + 10 +(1 row) + +INSERT INTO tr_pkey(data) VALUES(1); +--show deletion with primary key +DELETE FROM tr_pkey; +/* display results */ +SELECT data FROM pg_logical_slot_get_changes('regression_slot', NULL, NULL, 'include-xids', '0'); + data +---------------------------------------------------------------------------- + BEGIN + table public.tr_pkey: INSERT: id2[integer]:2 data[integer]:1 id[integer]:2 + COMMIT + BEGIN + table public.tr_pkey: DELETE: id[integer]:1 + table public.tr_pkey: DELETE: id[integer]:2 + COMMIT +(7 rows) + +/* + * check that disk spooling works + */ +BEGIN; +CREATE TABLE tr_etoomuch (id serial primary key, data int); +INSERT INTO tr_etoomuch(data) SELECT g.i FROM generate_series(1, 10234) g(i); +DELETE FROM tr_etoomuch WHERE id < 5000; +UPDATE tr_etoomuch SET data = - data WHERE id > 5000; +COMMIT; +/* display results, but hide most of the output */ +SELECT count(*), min(data), max(data) +FROM pg_logical_slot_get_changes('regression_slot', NULL, NULL, 'include-xids', '0') +GROUP BY substring(data, 1, 24) +ORDER BY 1; + count | min | max +-------+-------------------------------------------------+------------------------------------------------------------------------ + 1 | COMMIT | COMMIT + 1 | BEGIN | BEGIN + 20467 | table public.tr_etoomuch: DELETE: id[integer]:1 | table public.tr_etoomuch: UPDATE: id[integer]:9999 data[integer]:-9999 +(3 rows) + +/* + * check whether we decode subtransactions correctly in relation with each + * other + */ +CREATE TABLE tr_sub (id serial primary key, path text); +-- toplevel, subtxn, toplevel, subtxn, subtxn +BEGIN; +INSERT INTO tr_sub(path) VALUES ('1-top-#1'); +SAVEPOINT a; +INSERT INTO tr_sub(path) VALUES ('1-top-1-#1'); +INSERT INTO tr_sub(path) VALUES ('1-top-1-#2'); +RELEASE SAVEPOINT a; +SAVEPOINT b; +SAVEPOINT c; +INSERT INTO tr_sub(path) VALUES ('1-top-2-1-#1'); +INSERT INTO tr_sub(path) VALUES ('1-top-2-1-#2'); +RELEASE SAVEPOINT c; +INSERT INTO tr_sub(path) VALUES ('1-top-2-#1'); +RELEASE SAVEPOINT b; +COMMIT; +SELECT data FROM pg_logical_slot_get_changes('regression_slot', NULL, NULL, 'include-xids', '0'); + data +---------------------------------------------------------------------- + BEGIN + COMMIT + BEGIN + table public.tr_sub: INSERT: id[integer]:1 path[text]:'1-top-#1' + table public.tr_sub: INSERT: id[integer]:2 path[text]:'1-top-1-#1' + table public.tr_sub: INSERT: id[integer]:3 path[text]:'1-top-1-#2' + table public.tr_sub: INSERT: id[integer]:4 path[text]:'1-top-2-1-#1' + table public.tr_sub: INSERT: id[integer]:5 path[text]:'1-top-2-1-#2' + table public.tr_sub: INSERT: id[integer]:6 path[text]:'1-top-2-#1' + COMMIT +(10 rows) + +-- check that we handle xlog assignments correctly +BEGIN; +-- nest 80 subtxns +SAVEPOINT subtop;SAVEPOINT a;SAVEPOINT a;SAVEPOINT a;SAVEPOINT a; +SAVEPOINT a;SAVEPOINT a;SAVEPOINT a;SAVEPOINT a;SAVEPOINT a; +SAVEPOINT a;SAVEPOINT a;SAVEPOINT a;SAVEPOINT a;SAVEPOINT a; +SAVEPOINT a;SAVEPOINT a;SAVEPOINT a;SAVEPOINT a;SAVEPOINT a; +SAVEPOINT a;SAVEPOINT a;SAVEPOINT a;SAVEPOINT a;SAVEPOINT a; +SAVEPOINT a;SAVEPOINT a;SAVEPOINT a;SAVEPOINT a;SAVEPOINT a; +SAVEPOINT a;SAVEPOINT a;SAVEPOINT a;SAVEPOINT a;SAVEPOINT a; +SAVEPOINT a;SAVEPOINT a;SAVEPOINT a;SAVEPOINT a;SAVEPOINT a; +SAVEPOINT a;SAVEPOINT a;SAVEPOINT a;SAVEPOINT a;SAVEPOINT a; +SAVEPOINT a;SAVEPOINT a;SAVEPOINT a;SAVEPOINT a;SAVEPOINT a; +SAVEPOINT a;SAVEPOINT a;SAVEPOINT a;SAVEPOINT a;SAVEPOINT a; +SAVEPOINT a;SAVEPOINT a;SAVEPOINT a;SAVEPOINT a;SAVEPOINT a; +SAVEPOINT a;SAVEPOINT a;SAVEPOINT a;SAVEPOINT a;SAVEPOINT a; +SAVEPOINT a;SAVEPOINT a;SAVEPOINT a;SAVEPOINT a;SAVEPOINT a; +SAVEPOINT a;SAVEPOINT a;SAVEPOINT a;SAVEPOINT a;SAVEPOINT a; +SAVEPOINT a;SAVEPOINT a;SAVEPOINT a;SAVEPOINT a;SAVEPOINT a; +-- assign xid by inserting +INSERT INTO tr_sub(path) VALUES ('2-top-1...--#1'); +INSERT INTO tr_sub(path) VALUES ('2-top-1...--#2'); +INSERT INTO tr_sub(path) VALUES ('2-top-1...--#3'); +RELEASE SAVEPOINT subtop; +INSERT INTO tr_sub(path) VALUES ('2-top-#1'); +COMMIT; +SELECT data FROM pg_logical_slot_get_changes('regression_slot', NULL, NULL, 'include-xids', '0'); + data +------------------------------------------------------------------------ + BEGIN + table public.tr_sub: INSERT: id[integer]:7 path[text]:'2-top-1...--#1' + table public.tr_sub: INSERT: id[integer]:8 path[text]:'2-top-1...--#2' + table public.tr_sub: INSERT: id[integer]:9 path[text]:'2-top-1...--#3' + table public.tr_sub: INSERT: id[integer]:10 path[text]:'2-top-#1' + COMMIT +(6 rows) + +-- make sure rollbacked subtransactions aren't decoded +BEGIN; +INSERT INTO tr_sub(path) VALUES ('3-top-2-#1'); +SAVEPOINT a; +INSERT INTO tr_sub(path) VALUES ('3-top-2-1-#1'); +SAVEPOINT b; +INSERT INTO tr_sub(path) VALUES ('3-top-2-2-#1'); +ROLLBACK TO SAVEPOINT b; +INSERT INTO tr_sub(path) VALUES ('3-top-2-#2'); +COMMIT; +SELECT data FROM pg_logical_slot_get_changes('regression_slot', NULL, NULL, 'include-xids', '0'); + data +----------------------------------------------------------------------- + BEGIN + table public.tr_sub: INSERT: id[integer]:11 path[text]:'3-top-2-#1' + table public.tr_sub: INSERT: id[integer]:12 path[text]:'3-top-2-1-#1' + table public.tr_sub: INSERT: id[integer]:14 path[text]:'3-top-2-#2' + COMMIT +(5 rows) + +-- test whether a known, but not yet logged toplevel xact, followed by a +-- subxact commit is handled correctly +BEGIN; +SELECT txid_current() != 0; -- so no fixed xid apears in the outfile + ?column? +---------- + t +(1 row) + +SAVEPOINT a; +INSERT INTO tr_sub(path) VALUES ('4-top-1-#1'); +RELEASE SAVEPOINT a; +COMMIT; +-- test whether a change in a subtransaction, in an unknown toplevel +-- xact is handled correctly. +BEGIN; +SAVEPOINT a; +INSERT INTO tr_sub(path) VALUES ('5-top-1-#1'); +COMMIT; +SELECT data FROM pg_logical_slot_get_changes('regression_slot', NULL, NULL, 'include-xids', '0'); + data +--------------------------------------------------------------------- + BEGIN + table public.tr_sub: INSERT: id[integer]:15 path[text]:'4-top-1-#1' + COMMIT + BEGIN + table public.tr_sub: INSERT: id[integer]:16 path[text]:'5-top-1-#1' + COMMIT +(6 rows) + +/* + * Check whether treating a table as a catalog table works somewhat + */ +CREATE TABLE replication_metadata ( + id serial primary key, + relation name NOT NULL, + options text[] +) +WITH (user_catalog_table = true) +; +\d+ replication_metadata + Table "public.replication_metadata" + Column | Type | Modifiers | Storage | Stats target | Description +----------+---------+-------------------------------------------------------------------+----------+--------------+------------- + id | integer | not null default nextval('replication_metadata_id_seq'::regclass) | plain | | + relation | name | not null | plain | | + options | text[] | | extended | | +Indexes: + "replication_metadata_pkey" PRIMARY KEY, btree (id) +Has OIDs: no +Options: user_catalog_table=true + +INSERT INTO replication_metadata(relation, options) +VALUES ('foo', ARRAY['a', 'b']); +ALTER TABLE replication_metadata RESET (user_catalog_table); +\d+ replication_metadata + Table "public.replication_metadata" + Column | Type | Modifiers | Storage | Stats target | Description +----------+---------+-------------------------------------------------------------------+----------+--------------+------------- + id | integer | not null default nextval('replication_metadata_id_seq'::regclass) | plain | | + relation | name | not null | plain | | + options | text[] | | extended | | +Indexes: + "replication_metadata_pkey" PRIMARY KEY, btree (id) +Has OIDs: no + +INSERT INTO replication_metadata(relation, options) +VALUES ('bar', ARRAY['a', 'b']); +ALTER TABLE replication_metadata SET (user_catalog_table = true); +\d+ replication_metadata + Table "public.replication_metadata" + Column | Type | Modifiers | Storage | Stats target | Description +----------+---------+-------------------------------------------------------------------+----------+--------------+------------- + id | integer | not null default nextval('replication_metadata_id_seq'::regclass) | plain | | + relation | name | not null | plain | | + options | text[] | | extended | | +Indexes: + "replication_metadata_pkey" PRIMARY KEY, btree (id) +Has OIDs: no +Options: user_catalog_table=true + +INSERT INTO replication_metadata(relation, options) +VALUES ('blub', NULL); +-- make sure rewrites don't work +ALTER TABLE replication_metadata ADD COLUMN rewritemeornot int; +ALTER TABLE replication_metadata ALTER COLUMN rewritemeornot TYPE text; +ERROR: cannot rewrite table "replication_metadata" used as a catalog table +ALTER TABLE replication_metadata SET (user_catalog_table = false); +\d+ replication_metadata + Table "public.replication_metadata" + Column | Type | Modifiers | Storage | Stats target | Description +----------------+---------+-------------------------------------------------------------------+----------+--------------+------------- + id | integer | not null default nextval('replication_metadata_id_seq'::regclass) | plain | | + relation | name | not null | plain | | + options | text[] | | extended | | + rewritemeornot | integer | | plain | | +Indexes: + "replication_metadata_pkey" PRIMARY KEY, btree (id) +Has OIDs: no +Options: user_catalog_table=false + +INSERT INTO replication_metadata(relation, options) +VALUES ('zaphod', NULL); +SELECT data FROM pg_logical_slot_get_changes('regression_slot', NULL, NULL, 'include-xids', '0'); + data +------------------------------------------------------------------------------------------------------------------------------------ + BEGIN + COMMIT + BEGIN + table public.replication_metadata: INSERT: id[integer]:1 relation[name]:'foo' options[text[]]:'{a,b}' + COMMIT + BEGIN + COMMIT + BEGIN + table public.replication_metadata: INSERT: id[integer]:2 relation[name]:'bar' options[text[]]:'{a,b}' + COMMIT + BEGIN + COMMIT + BEGIN + table public.replication_metadata: INSERT: id[integer]:3 relation[name]:'blub' options[text[]]:null + COMMIT + BEGIN + COMMIT + BEGIN + COMMIT + BEGIN + table public.replication_metadata: INSERT: id[integer]:4 relation[name]:'zaphod' options[text[]]:null rewritemeornot[integer]:null + COMMIT +(22 rows) + +/* + * check whether we handle updates/deletes correct with & without a pkey + */ +/* we should handle the case without a key at all more gracefully */ +CREATE TABLE table_without_key(id serial, data int); +INSERT INTO table_without_key(data) VALUES(1),(2); +DELETE FROM table_without_key WHERE data = 1; +-- won't log old keys +UPDATE table_without_key SET data = 3 WHERE data = 2; +UPDATE table_without_key SET id = -id; +UPDATE table_without_key SET id = -id; +-- should log the full old row now +ALTER TABLE table_without_key REPLICA IDENTITY FULL; +UPDATE table_without_key SET data = 3 WHERE data = 2; +UPDATE table_without_key SET id = -id; +UPDATE table_without_key SET id = -id; +DELETE FROM table_without_key WHERE data = 3; +CREATE TABLE table_with_pkey(id serial primary key, data int); +INSERT INTO table_with_pkey(data) VALUES(1), (2); +DELETE FROM table_with_pkey WHERE data = 1; +-- should log the old pkey +UPDATE table_with_pkey SET data = 3 WHERE data = 2; +UPDATE table_with_pkey SET id = -id; +UPDATE table_with_pkey SET id = -id; +-- check that we log nothing despite having a pkey +ALTER TABLE table_without_key REPLICA IDENTITY NOTHING; +UPDATE table_with_pkey SET id = -id; +-- check that we log everything despite having a pkey +ALTER TABLE table_without_key REPLICA IDENTITY FULL; +UPDATE table_with_pkey SET id = -id; +DELETE FROM table_with_pkey WHERE data = 3; +CREATE TABLE table_with_unique_not_null(id serial unique, data int); +ALTER TABLE table_with_unique_not_null ALTER COLUMN id SET NOT NULL; --already set +-- won't log anything, replica identity not setup +INSERT INTO table_with_unique_not_null(data) VALUES(1), (2); +DELETE FROM table_with_unique_not_null WHERE data = 1; +UPDATE table_with_unique_not_null SET data = 3 WHERE data = 2; +UPDATE table_with_unique_not_null SET id = -id; +UPDATE table_with_unique_not_null SET id = -id; +DELETE FROM table_with_unique_not_null WHERE data = 3; +-- should log old key +ALTER TABLE table_with_unique_not_null REPLICA IDENTITY USING INDEX table_with_unique_not_null_id_key; +INSERT INTO table_with_unique_not_null(data) VALUES(1), (2); +DELETE FROM table_with_unique_not_null WHERE data = 1; +UPDATE table_with_unique_not_null SET data = 3 WHERE data = 2; +UPDATE table_with_unique_not_null SET id = -id; +UPDATE table_with_unique_not_null SET id = -id; +DELETE FROM table_with_unique_not_null WHERE data = 3; +-- check toast support +BEGIN; +CREATE SEQUENCE toasttable_rand_seq START 79 INCREMENT 1499; -- portable "random" +CREATE TABLE toasttable( + id serial primary key, + toasted_col1 text, + rand1 float8 DEFAULT nextval('toasttable_rand_seq'), + toasted_col2 text, + rand2 float8 DEFAULT nextval('toasttable_rand_seq') + ); +COMMIT; +-- uncompressed external toast data +INSERT INTO toasttable(toasted_col1) SELECT string_agg(g.i::text, '') FROM generate_series(1, 2000) g(i); +-- compressed external toast data +INSERT INTO toasttable(toasted_col2) SELECT repeat(string_agg(to_char(g.i, 'FM0000'), ''), 50) FROM generate_series(1, 500) g(i); +-- update of existing column +UPDATE toasttable + SET toasted_col1 = (SELECT string_agg(g.i::text, '') FROM generate_series(1, 2000) g(i)) +WHERE id = 1; +SELECT data FROM pg_logical_slot_get_changes('regression_slot', NULL, NULL, 'include-xids', '0'); + data +------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ + BEGIN + COMMIT + BEGIN + table public.table_without_key: INSERT: id[integer]:1 data[integer]:1 + table public.table_without_key: INSERT: id[integer]:2 data[integer]:2 + COMMIT + BEGIN + table public.table_without_key: DELETE: (no-tuple-data) + COMMIT + BEGIN + table public.table_without_key: UPDATE: id[integer]:2 data[integer]:3 + COMMIT + BEGIN + table public.table_without_key: UPDATE: id[integer]:-2 data[integer]:3 + COMMIT + BEGIN + table public.table_without_key: UPDATE: id[integer]:2 data[integer]:3 + COMMIT + BEGIN + COMMIT + BEGIN + table public.table_without_key: UPDATE: old-key: id[integer]:2 data[integer]:3 new-tuple: id[integer]:-2 data[integer]:3 + COMMIT + BEGIN + table public.table_without_key: UPDATE: old-key: id[integer]:-2 data[integer]:3 new-tuple: id[integer]:2 data[integer]:3 + COMMIT + BEGIN + table public.table_without_key: DELETE: id[integer]:2 data[integer]:3 + COMMIT + BEGIN + COMMIT + BEGIN + table public.table_with_pkey: INSERT: id[integer]:1 data[integer]:1 + table public.table_with_pkey: INSERT: id[integer]:2 data[integer]:2 + COMMIT + BEGIN + table public.table_with_pkey: DELETE: id[integer]:1 + COMMIT + BEGIN + table public.table_with_pkey: UPDATE: id[integer]:2 data[integer]:3 + COMMIT + BEGIN + table public.table_with_pkey: UPDATE: old-key: id[integer]:2 new-tuple: id[integer]:-2 data[integer]:3 + COMMIT + BEGIN + table public.table_with_pkey: UPDATE: old-key: id[integer]:-2 new-tuple: id[integer]:2 data[integer]:3 + COMMIT + BEGIN + COMMIT + BEGIN + table public.table_with_pkey: UPDATE: old-key: id[integer]:2 new-tuple: id[integer]:-2 data[integer]:3 + COMMIT + BEGIN + COMMIT + BEGIN + table public.table_with_pkey: UPDATE: old-key: id[integer]:-2 new-tuple: id[integer]:2 data[integer]:3 + COMMIT + BEGIN + table public.table_with_pkey: DELETE: id[integer]:2 + COMMIT + BEGIN + COMMIT + BEGIN + table public.table_with_unique_not_null: INSERT: id[integer]:1 data[integer]:1 + table public.table_with_unique_not_null: INSERT: id[integer]:2 data[integer]:2 + COMMIT + BEGIN + table public.table_with_unique_not_null: DELETE: (no-tuple-data) + COMMIT + BEGIN + table public.table_with_unique_not_null: UPDATE: id[integer]:2 data[integer]:3 + COMMIT + BEGIN + table public.table_with_unique_not_null: UPDATE: id[integer]:-2 data[integer]:3 + COMMIT + BEGIN + table public.table_with_unique_not_null: UPDATE: id[integer]:2 data[integer]:3 + COMMIT + BEGIN + table public.table_with_unique_not_null: DELETE: (no-tuple-data) + COMMIT + BEGIN + COMMIT + BEGIN + table public.table_with_unique_not_null: INSERT: id[integer]:3 data[integer]:1 + table public.table_with_unique_not_null: INSERT: id[integer]:4 data[integer]:2 + COMMIT + BEGIN + table public.table_with_unique_not_null: DELETE: id[integer]:3 + COMMIT + BEGIN + table public.table_with_unique_not_null: UPDATE: id[integer]:4 data[integer]:3 + COMMIT + BEGIN + table public.table_with_unique_not_null: UPDATE: old-key: id[integer]:4 new-tuple: id[integer]:-4 data[integer]:3 + COMMIT + BEGIN + table public.table_with_unique_not_null: UPDATE: old-key: id[integer]:-4 new-tuple: id[integer]:4 data[integer]:3 + COMMIT + BEGIN + table public.table_with_unique_not_null: DELETE: id[integer]:4 + COMMIT + BEGIN + COMMIT + BEGIN + table public.toasttable: INSERT: id[integer]:1 toasted_col1[text]:'12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076107710781079108010811082108310841085108610871088108910901091109210931094109510961097109810991100110111021103110411051106110711081109111011111112111311141115111611171118111911201121112211231124112511261127112811291130113111321133113411351136113711381139114011411142114311441145114611471148114911501151115211531154115511561157115811591160116111621163116411651166116711681169117011711172117311741175117611771178117911801181118211831184118511861187118811891190119111921193119411951196119711981199120012011202120312041205120612071208120912101211121212131214121512161217121812191220122112221223122412251226122712281229123012311232123312341235123612371238123912401241124212431244124512461247124812491250125112521253125412551256125712581259126012611262126312641265126612671268126912701271127212731274127512761277127812791280128112821283128412851286128712881289129012911292129312941295129612971298129913001301130213031304130513061307130813091310131113121313131413151316131713181319132013211322132313241325132613271328132913301331133213331334133513361337133813391340134113421343134413451346134713481349135013511352135313541355135613571358135913601361136213631364136513661367136813691370137113721373137413751376137713781379138013811382138313841385138613871388138913901391139213931394139513961397139813991400140114021403140414051406140714081409141014111412141314141415141614171418141914201421142214231424142514261427142814291430143114321433143414351436143714381439144014411442144314441445144614471448144914501451145214531454145514561457145814591460146114621463146414651466146714681469147014711472147314741475147614771478147914801481148214831484148514861487148814891490149114921493149414951496149714981499150015011502150315041505150615071508150915101511151215131514151515161517151815191520152115221523152415251526152715281529153015311532153315341535153615371538153915401541154215431544154515461547154815491550155115521553155415551556155715581559156015611562156315641565156615671568156915701571157215731574157515761577157815791580158115821583158415851586158715881589159015911592159315941595159615971598159916001601160216031604160516061607160816091610161116121613161416151616161716181619162016211622162316241625162616271628162916301631163216331634163516361637163816391640164116421643164416451646164716481649165016511652165316541655165616571658165916601661166216631664166516661667166816691670167116721673167416751676167716781679168016811682168316841685168616871688168916901691169216931694169516961697169816991700170117021703170417051706170717081709171017111712171317141715171617171718171917201721172217231724172517261727172817291730173117321733173417351736173717381739174017411742174317441745174617471748174917501751175217531754175517561757175817591760176117621763176417651766176717681769177017711772177317741775177617771778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000' rand1[double precision]:79 toasted_col2[text]:null rand2[double precision]:1578 + COMMIT + BEGIN + table public.toasttable: INSERT: id[integer]:2 toasted_col1[text]:null rand1[double precision]:3077 toasted_col2[text]:'0001000200030004000500060007000800090010001100120013001400150016001700180019002000210022002300240025002600270028002900300031003200330034003500360037003800390040004100420043004400450046004700480049005000510052005300540055005600570058005900600061006200630064006500660067006800690070007100720073007400750076007700780079008000810082008300840085008600870088008900900091009200930094009500960097009800990100010101020103010401050106010701080109011001110112011301140115011601170118011901200121012201230124012501260127012801290130013101320133013401350136013701380139014001410142014301440145014601470148014901500151015201530154015501560157015801590160016101620163016401650166016701680169017001710172017301740175017601770178017901800181018201830184018501860187018801890190019101920193019401950196019701980199020002010202020302040205020602070208020902100211021202130214021502160217021802190220022102220223022402250226022702280229023002310232023302340235023602370238023902400241024202430244024502460247024802490250025102520253025402550256025702580259026002610262026302640265026602670268026902700271027202730274027502760277027802790280028102820283028402850286028702880289029002910292029302940295029602970298029903000301030203030304030503060307030803090310031103120313031403150316031703180319032003210322032303240325032603270328032903300331033203330334033503360337033803390340034103420343034403450346034703480349035003510352035303540355035603570358035903600361036203630364036503660367036803690370037103720373037403750376037703780379038003810382038303840385038603870388038903900391039203930394039503960397039803990400040104020403040404050406040704080409041004110412041304140415041604170418041904200421042204230424042504260427042804290430043104320433043404350436043704380439044004410442044304440445044604470448044904500451045204530454045504560457045804590460046104620463046404650466046704680469047004710472047304740475047604770478047904800481048204830484048504860487048804890490049104920493049404950496049704980499050000010002000300040005000600070008000900100011001200130014001500160017001800190020002100220023002400250026002700280029003000310032003300340035003600370038003900400041004200430044004500460047004800490050005100520053005400550056005700580059006000610062006300640065006600670068006900700071007200730074007500760077007800790080008100820083008400850086008700880089009000910092009300940095009600970098009901000101010201030104010501060107010801090110011101120113011401150116011701180119012001210122012301240125012601270128012901300131013201330134013501360137013801390140014101420143014401450146014701480149015001510152015301540155015601570158015901600161016201630164016501660167016801690170017101720173017401750176017701780179018001810182018301840185018601870188018901900191019201930194019501960197019801990200020102020203020402050206020702080209021002110212021302140215021602170218021902200221022202230224022502260227022802290230023102320233023402350236023702380239024002410242024302440245024602470248024902500251025202530254025502560257025802590260026102620263026402650266026702680269027002710272027302740275027602770278027902800281028202830284028502860287028802890290029102920293029402950296029702980299030003010302030303040305030603070308030903100311031203130314031503160317031803190320032103220323032403250326032703280329033003310332033303340335033603370338033903400341034203430344034503460347034803490350035103520353035403550356035703580359036003610362036303640365036603670368036903700371037203730374037503760377037803790380038103820383038403850386038703880389039003910392039303940395039603970398039904000401040204030404040504060407040804090410041104120413041404150416041704180419042004210422042304240425042604270428042904300431043204330434043504360437043804390440044104420443044404450446044704480449045004510452045304540455045604570458045904600461046204630464046504660467046804690470047104720473047404750476047704780479048004810482048304840485048604870488048904900491049204930494049504960497049804990500000100020003000400050006000700080009001000110012001300140015001600170018001900200021002200230024002500260027002800290030003100320033003400350036003700380039004000410042004300440045004600470048004900500051005200530054005500560057005800590060006100620063006400650066006700680069007000710072007300740075007600770078007900800081008200830084008500860087008800890090009100920093009400950096009700980099010001010102010301040105010601070108010901100111011201130114011501160117011801190120012101220123012401250126012701280129013001310132013301340135013601370138013901400141014201430144014501460147014801490150015101520153015401550156015701580159016001610162016301640165016601670168016901700171017201730174017501760177017801790180018101820183018401850186018701880189019001910192019301940195019601970198019902000201020202030204020502060207020802090210021102120213021402150216021702180219022002210222022302240225022602270228022902300231023202330234023502360237023802390240024102420243024402450246024702480249025002510252025302540255025602570258025902600261026202630264026502660267026802690270027102720273027402750276027702780279028002810282028302840285028602870288028902900291029202930294029502960297029802990300030103020303030403050306030703080309031003110312031303140315031603170318031903200321032203230324032503260327032803290330033103320333033403350336033703380339034003410342034303440345034603470348034903500351035203530354035503560357035803590360036103620363036403650366036703680369037003710372037303740375037603770378037903800381038203830384038503860387038803890390039103920393039403950396039703980399040004010402040304040405040604070408040904100411041204130414041504160417041804190420042104220423042404250426042704280429043004310432043304340435043604370438043904400441044204430444044504460447044804490450045104520453045404550456045704580459046004610462046304640465046604670468046904700471047204730474047504760477047804790480048104820483048404850486048704880489049004910492049304940495049604970498049905000001000200030004000500060007000800090010001100120013001400150016001700180019002000210022002300240025002600270028002900300031003200330034003500360037003800390040004100420043004400450046004700480049005000510052005300540055005600570058005900600061006200630064006500660067006800690070007100720073007400750076007700780079008000810082008300840085008600870088008900900091009200930094009500960097009800990100010101020103010401050106010701080109011001110112011301140115011601170118011901200121012201230124012501260127012801290130013101320133013401350136013701380139014001410142014301440145014601470148014901500151015201530154015501560157015801590160016101620163016401650166016701680169017001710172017301740175017601770178017901800181018201830184018501860187018801890190019101920193019401950196019701980199020002010202020302040205020602070208020902100211021202130214021502160217021802190220022102220223022402250226022702280229023002310232023302340235023602370238023902400241024202430244024502460247024802490250025102520253025402550256025702580259026002610262026302640265026602670268026902700271027202730274027502760277027802790280028102820283028402850286028702880289029002910292029302940295029602970298029903000301030203030304030503060307030803090310031103120313031403150316031703180319032003210322032303240325032603270328032903300331033203330334033503360337033803390340034103420343034403450346034703480349035003510352035303540355035603570358035903600361036203630364036503660367036803690370037103720373037403750376037703780379038003810382038303840385038603870388038903900391039203930394039503960397039803990400040104020403040404050406040704080409041004110412041304140415041604170418041904200421042204230424042504260427042804290430043104320433043404350436043704380439044004410442044304440445044604470448044904500451045204530454045504560457045804590460046104620463046404650466046704680469047004710472047304740475047604770478047904800481048204830484048504860487048804890490049104920493049404950496049704980499050000010002000300040005000600070008000900100011001200130014001500160017001800190020002100220023002400250026002700280029003000310032003300340035003600370038003900400041004200430044004500460047004800490050005100520053005400550056005700580059006000610062006300640065006600670068006900700071007200730074007500760077007800790080008100820083008400850086008700880089009000910092009300940095009600970098009901000101010201030104010501060107010801090110011101120113011401150116011701180119012001210122012301240125012601270128012901300131013201330134013501360137013801390140014101420143014401450146014701480149015001510152015301540155015601570158015901600161016201630164016501660167016801690170017101720173017401750176017701780179018001810182018301840185018601870188018901900191019201930194019501960197019801990200020102020203020402050206020702080209021002110212021302140215021602170218021902200221022202230224022502260227022802290230023102320233023402350236023702380239024002410242024302440245024602470248024902500251025202530254025502560257025802590260026102620263026402650266026702680269027002710272027302740275027602770278027902800281028202830284028502860287028802890290029102920293029402950296029702980299030003010302030303040305030603070308030903100311031203130314031503160317031803190320032103220323032403250326032703280329033003310332033303340335033603370338033903400341034203430344034503460347034803490350035103520353035403550356035703580359036003610362036303640365036603670368036903700371037203730374037503760377037803790380038103820383038403850386038703880389039003910392039303940395039603970398039904000401040204030404040504060407040804090410041104120413041404150416041704180419042004210422042304240425042604270428042904300431043204330434043504360437043804390440044104420443044404450446044704480449045004510452045304540455045604570458045904600461046204630464046504660467046804690470047104720473047404750476047704780479048004810482048304840485048604870488048904900491049204930494049504960497049804990500000100020003000400050006000700080009001000110012001300140015001600170018001900200021002200230024002500260027002800290030003100320033003400350036003700380039004000410042004300440045004600470048004900500051005200530054005500560057005800590060006100620063006400650066006700680069007000710072007300740075007600770078007900800081008200830084008500860087008800890090009100920093009400950096009700980099010001010102010301040105010601070108010901100111011201130114011501160117011801190120012101220123012401250126012701280129013001310132013301340135013601370138013901400141014201430144014501460147014801490150015101520153015401550156015701580159016001610162016301640165016601670168016901700171017201730174017501760177017801790180018101820183018401850186018701880189019001910192019301940195019601970198019902000201020202030204020502060207020802090210021102120213021402150216021702180219022002210222022302240225022602270228022902300231023202330234023502360237023802390240024102420243024402450246024702480249025002510252025302540255025602570258025902600261026202630264026502660267026802690270027102720273027402750276027702780279028002810282028302840285028602870288028902900291029202930294029502960297029802990300030103020303030403050306030703080309031003110312031303140315031603170318031903200321032203230324032503260327032803290330033103320333033403350336033703380339034003410342034303440345034603470348034903500351035203530354035503560357035803590360036103620363036403650366036703680369037003710372037303740375037603770378037903800381038203830384038503860387038803890390039103920393039403950396039703980399040004010402040304040405040604070408040904100411041204130414041504160417041804190420042104220423042404250426042704280429043004310432043304340435043604370438043904400441044204430444044504460447044804490450045104520453045404550456045704580459046004610462046304640465046604670468046904700471047204730474047504760477047804790480048104820483048404850486048704880489049004910492049304940495049604970498049905000001000200030004000500060007000800090010001100120013001400150016001700180019002000210022002300240025002600270028002900300031003200330034003500360037003800390040004100420043004400450046004700480049005000510052005300540055005600570058005900600061006200630064006500660067006800690070007100720073007400750076007700780079008000810082008300840085008600870088008900900091009200930094009500960097009800990100010101020103010401050106010701080109011001110112011301140115011601170118011901200121012201230124012501260127012801290130013101320133013401350136013701380139014001410142014301440145014601470148014901500151015201530154015501560157015801590160016101620163016401650166016701680169017001710172017301740175017601770178017901800181018201830184018501860187018801890190019101920193019401950196019701980199020002010202020302040205020602070208020902100211021202130214021502160217021802190220022102220223022402250226022702280229023002310232023302340235023602370238023902400241024202430244024502460247024802490250025102520253025402550256025702580259026002610262026302640265026602670268026902700271027202730274027502760277027802790280028102820283028402850286028702880289029002910292029302940295029602970298029903000301030203030304030503060307030803090310031103120313031403150316031703180319032003210322032303240325032603270328032903300331033203330334033503360337033803390340034103420343034403450346034703480349035003510352035303540355035603570358035903600361036203630364036503660367036803690370037103720373037403750376037703780379038003810382038303840385038603870388038903900391039203930394039503960397039803990400040104020403040404050406040704080409041004110412041304140415041604170418041904200421042204230424042504260427042804290430043104320433043404350436043704380439044004410442044304440445044604470448044904500451045204530454045504560457045804590460046104620463046404650466046704680469047004710472047304740475047604770478047904800481048204830484048504860487048804890490049104920493049404950496049704980499050000010002000300040005000600070008000900100011001200130014001500160017001800190020002100220023002400250026002700280029003000310032003300340035003600370038003900400041004200430044004500460047004800490050005100520053005400550056005700580059006000610062006300640065006600670068006900700071007200730074007500760077007800790080008100820083008400850086008700880089009000910092009300940095009600970098009901000101010201030104010501060107010801090110011101120113011401150116011701180119012001210122012301240125012601270128012901300131013201330134013501360137013801390140014101420143014401450146014701480149015001510152015301540155015601570158015901600161016201630164016501660167016801690170017101720173017401750176017701780179018001810182018301840185018601870188018901900191019201930194019501960197019801990200020102020203020402050206020702080209021002110212021302140215021602170218021902200221022202230224022502260227022802290230023102320233023402350236023702380239024002410242024302440245024602470248024902500251025202530254025502560257025802590260026102620263026402650266026702680269027002710272027302740275027602770278027902800281028202830284028502860287028802890290029102920293029402950296029702980299030003010302030303040305030603070308030903100311031203130314031503160317031803190320032103220323032403250326032703280329033003310332033303340335033603370338033903400341034203430344034503460347034803490350035103520353035403550356035703580359036003610362036303640365036603670368036903700371037203730374037503760377037803790380038103820383038403850386038703880389039003910392039303940395039603970398039904000401040204030404040504060407040804090410041104120413041404150416041704180419042004210422042304240425042604270428042904300431043204330434043504360437043804390440044104420443044404450446044704480449045004510452045304540455045604570458045904600461046204630464046504660467046804690470047104720473047404750476047704780479048004810482048304840485048604870488048904900491049204930494049504960497049804990500000100020003000400050006000700080009001000110012001300140015001600170018001900200021002200230024002500260027002800290030003100320033003400350036003700380039004000410042004300440045004600470048004900500051005200530054005500560057005800590060006100620063006400650066006700680069007000710072007300740075007600770078007900800081008200830084008500860087008800890090009100920093009400950096009700980099010001010102010301040105010601070108010901100111011201130114011501160117011801190120012101220123012401250126012701280129013001310132013301340135013601370138013901400141014201430144014501460147014801490150015101520153015401550156015701580159016001610162016301640165016601670168016901700171017201730174017501760177017801790180018101820183018401850186018701880189019001910192019301940195019601970198019902000201020202030204020502060207020802090210021102120213021402150216021702180219022002210222022302240225022602270228022902300231023202330234023502360237023802390240024102420243024402450246024702480249025002510252025302540255025602570258025902600261026202630264026502660267026802690270027102720273027402750276027702780279028002810282028302840285028602870288028902900291029202930294029502960297029802990300030103020303030403050306030703080309031003110312031303140315031603170318031903200321032203230324032503260327032803290330033103320333033403350336033703380339034003410342034303440345034603470348034903500351035203530354035503560357035803590360036103620363036403650366036703680369037003710372037303740375037603770378037903800381038203830384038503860387038803890390039103920393039403950396039703980399040004010402040304040405040604070408040904100411041204130414041504160417041804190420042104220423042404250426042704280429043004310432043304340435043604370438043904400441044204430444044504460447044804490450045104520453045404550456045704580459046004610462046304640465046604670468046904700471047204730474047504760477047804790480048104820483048404850486048704880489049004910492049304940495049604970498049905000001000200030004000500060007000800090010001100120013001400150016001700180019002000210022002300240025002600270028002900300031003200330034003500360037003800390040004100420043004400450046004700480049005000510052005300540055005600570058005900600061006200630064006500660067006800690070007100720073007400750076007700780079008000810082008300840085008600870088008900900091009200930094009500960097009800990100010101020103010401050106010701080109011001110112011301140115011601170118011901200121012201230124012501260127012801290130013101320133013401350136013701380139014001410142014301440145014601470148014901500151015201530154015501560157015801590160016101620163016401650166016701680169017001710172017301740175017601770178017901800181018201830184018501860187018801890190019101920193019401950196019701980199020002010202020302040205020602070208020902100211021202130214021502160217021802190220022102220223022402250226022702280229023002310232023302340235023602370238023902400241024202430244024502460247024802490250025102520253025402550256025702580259026002610262026302640265026602670268026902700271027202730274027502760277027802790280028102820283028402850286028702880289029002910292029302940295029602970298029903000301030203030304030503060307030803090310031103120313031403150316031703180319032003210322032303240325032603270328032903300331033203330334033503360337033803390340034103420343034403450346034703480349035003510352035303540355035603570358035903600361036203630364036503660367036803690370037103720373037403750376037703780379038003810382038303840385038603870388038903900391039203930394039503960397039803990400040104020403040404050406040704080409041004110412041304140415041604170418041904200421042204230424042504260427042804290430043104320433043404350436043704380439044004410442044304440445044604470448044904500451045204530454045504560457045804590460046104620463046404650466046704680469047004710472047304740475047604770478047904800481048204830484048504860487048804890490049104920493049404950496049704980499050000010002000300040005000600070008000900100011001200130014001500160017001800190020002100220023002400250026002700280029003000310032003300340035003600370038003900400041004200430044004500460047004800490050005100520053005400550056005700580059006000610062006300640065006600670068006900700071007200730074007500760077007800790080008100820083008400850086008700880089009000910092009300940095009600970098009901000101010201030104010501060107010801090110011101120113011401150116011701180119012001210122012301240125012601270128012901300131013201330134013501360137013801390140014101420143014401450146014701480149015001510152015301540155015601570158015901600161016201630164016501660167016801690170017101720173017401750176017701780179018001810182018301840185018601870188018901900191019201930194019501960197019801990200020102020203020402050206020702080209021002110212021302140215021602170218021902200221022202230224022502260227022802290230023102320233023402350236023702380239024002410242024302440245024602470248024902500251025202530254025502560257025802590260026102620263026402650266026702680269027002710272027302740275027602770278027902800281028202830284028502860287028802890290029102920293029402950296029702980299030003010302030303040305030603070308030903100311031203130314031503160317031803190320032103220323032403250326032703280329033003310332033303340335033603370338033903400341034203430344034503460347034803490350035103520353035403550356035703580359036003610362036303640365036603670368036903700371037203730374037503760377037803790380038103820383038403850386038703880389039003910392039303940395039603970398039904000401040204030404040504060407040804090410041104120413041404150416041704180419042004210422042304240425042604270428042904300431043204330434043504360437043804390440044104420443044404450446044704480449045004510452045304540455045604570458045904600461046204630464046504660467046804690470047104720473047404750476047704780479048004810482048304840485048604870488048904900491049204930494049504960497049804990500000100020003000400050006000700080009001000110012001300140015001600170018001900200021002200230024002500260027002800290030003100320033003400350036003700380039004000410042004300440045004600470048004900500051005200530054005500560057005800590060006100620063006400650066006700680069007000710072007300740075007600770078007900800081008200830084008500860087008800890090009100920093009400950096009700980099010001010102010301040105010601070108010901100111011201130114011501160117011801190120012101220123012401250126012701280129013001310132013301340135013601370138013901400141014201430144014501460147014801490150015101520153015401550156015701580159016001610162016301640165016601670168016901700171017201730174017501760177017801790180018101820183018401850186018701880189019001910192019301940195019601970198019902000201020202030204020502060207020802090210021102120213021402150216021702180219022002210222022302240225022602270228022902300231023202330234023502360237023802390240024102420243024402450246024702480249025002510252025302540255025602570258025902600261026202630264026502660267026802690270027102720273027402750276027702780279028002810282028302840285028602870288028902900291029202930294029502960297029802990300030103020303030403050306030703080309031003110312031303140315031603170318031903200321032203230324032503260327032803290330033103320333033403350336033703380339034003410342034303440345034603470348034903500351035203530354035503560357035803590360036103620363036403650366036703680369037003710372037303740375037603770378037903800381038203830384038503860387038803890390039103920393039403950396039703980399040004010402040304040405040604070408040904100411041204130414041504160417041804190420042104220423042404250426042704280429043004310432043304340435043604370438043904400441044204430444044504460447044804490450045104520453045404550456045704580459046004610462046304640465046604670468046904700471047204730474047504760477047804790480048104820483048404850486048704880489049004910492049304940495049604970498049905000001000200030004000500060007000800090010001100120013001400150016001700180019002000210022002300240025002600270028002900300031003200330034003500360037003800390040004100420043004400450046004700480049005000510052005300540055005600570058005900600061006200630064006500660067006800690070007100720073007400750076007700780079008000810082008300840085008600870088008900900091009200930094009500960097009800990100010101020103010401050106010701080109011001110112011301140115011601170118011901200121012201230124012501260127012801290130013101320133013401350136013701380139014001410142014301440145014601470148014901500151015201530154015501560157015801590160016101620163016401650166016701680169017001710172017301740175017601770178017901800181018201830184018501860187018801890190019101920193019401950196019701980199020002010202020302040205020602070208020902100211021202130214021502160217021802190220022102220223022402250226022702280229023002310232023302340235023602370238023902400241024202430244024502460247024802490250025102520253025402550256025702580259026002610262026302640265026602670268026902700271027202730274027502760277027802790280028102820283028402850286028702880289029002910292029302940295029602970298029903000301030203030304030503060307030803090310031103120313031403150316031703180319032003210322032303240325032603270328032903300331033203330334033503360337033803390340034103420343034403450346034703480349035003510352035303540355035603570358035903600361036203630364036503660367036803690370037103720373037403750376037703780379038003810382038303840385038603870388038903900391039203930394039503960397039803990400040104020403040404050406040704080409041004110412041304140415041604170418041904200421042204230424042504260427042804290430043104320433043404350436043704380439044004410442044304440445044604470448044904500451045204530454045504560457045804590460046104620463046404650466046704680469047004710472047304740475047604770478047904800481048204830484048504860487048804890490049104920493049404950496049704980499050000010002000300040005000600070008000900100011001200130014001500160017001800190020002100220023002400250026002700280029003000310032003300340035003600370038003900400041004200430044004500460047004800490050005100520053005400550056005700580059006000610062006300640065006600670068006900700071007200730074007500760077007800790080008100820083008400850086008700880089009000910092009300940095009600970098009901000101010201030104010501060107010801090110011101120113011401150116011701180119012001210122012301240125012601270128012901300131013201330134013501360137013801390140014101420143014401450146014701480149015001510152015301540155015601570158015901600161016201630164016501660167016801690170017101720173017401750176017701780179018001810182018301840185018601870188018901900191019201930194019501960197019801990200020102020203020402050206020702080209021002110212021302140215021602170218021902200221022202230224022502260227022802290230023102320233023402350236023702380239024002410242024302440245024602470248024902500251025202530254025502560257025802590260026102620263026402650266026702680269027002710272027302740275027602770278027902800281028202830284028502860287028802890290029102920293029402950296029702980299030003010302030303040305030603070308030903100311031203130314031503160317031803190320032103220323032403250326032703280329033003310332033303340335033603370338033903400341034203430344034503460347034803490350035103520353035403550356035703580359036003610362036303640365036603670368036903700371037203730374037503760377037803790380038103820383038403850386038703880389039003910392039303940395039603970398039904000401040204030404040504060407040804090410041104120413041404150416041704180419042004210422042304240425042604270428042904300431043204330434043504360437043804390440044104420443044404450446044704480449045004510452045304540455045604570458045904600461046204630464046504660467046804690470047104720473047404750476047704780479048004810482048304840485048604870488048904900491049204930494049504960497049804990500000100020003000400050006000700080009001000110012001300140015001600170018001900200021002200230024002500260027002800290030003100320033003400350036003700380039004000410042004300440045004600470048004900500051005200530054005500560057005800590060006100620063006400650066006700680069007000710072007300740075007600770078007900800081008200830084008500860087008800890090009100920093009400950096009700980099010001010102010301040105010601070108010901100111011201130114011501160117011801190120012101220123012401250126012701280129013001310132013301340135013601370138013901400141014201430144014501460147014801490150015101520153015401550156015701580159016001610162016301640165016601670168016901700171017201730174017501760177017801790180018101820183018401850186018701880189019001910192019301940195019601970198019902000201020202030204020502060207020802090210021102120213021402150216021702180219022002210222022302240225022602270228022902300231023202330234023502360237023802390240024102420243024402450246024702480249025002510252025302540255025602570258025902600261026202630264026502660267026802690270027102720273027402750276027702780279028002810282028302840285028602870288028902900291029202930294029502960297029802990300030103020303030403050306030703080309031003110312031303140315031603170318031903200321032203230324032503260327032803290330033103320333033403350336033703380339034003410342034303440345034603470348034903500351035203530354035503560357035803590360036103620363036403650366036703680369037003710372037303740375037603770378037903800381038203830384038503860387038803890390039103920393039403950396039703980399040004010402040304040405040604070408040904100411041204130414041504160417041804190420042104220423042404250426042704280429043004310432043304340435043604370438043904400441044204430444044504460447044804490450045104520453045404550456045704580459046004610462046304640465046604670468046904700471047204730474047504760477047804790480048104820483048404850486048704880489049004910492049304940495049604970498049905000001000200030004000500060007000800090010001100120013001400150016001700180019002000210022002300240025002600270028002900300031003200330034003500360037003800390040004100420043004400450046004700480049005000510052005300540055005600570058005900600061006200630064006500660067006800690070007100720073007400750076007700780079008000810082008300840085008600870088008900900091009200930094009500960097009800990100010101020103010401050106010701080109011001110112011301140115011601170118011901200121012201230124012501260127012801290130013101320133013401350136013701380139014001410142014301440145014601470148014901500151015201530154015501560157015801590160016101620163016401650166016701680169017001710172017301740175017601770178017901800181018201830184018501860187018801890190019101920193019401950196019701980199020002010202020302040205020602070208020902100211021202130214021502160217021802190220022102220223022402250226022702280229023002310232023302340235023602370238023902400241024202430244024502460247024802490250025102520253025402550256025702580259026002610262026302640265026602670268026902700271027202730274027502760277027802790280028102820283028402850286028702880289029002910292029302940295029602970298029903000301030203030304030503060307030803090310031103120313031403150316031703180319032003210322032303240325032603270328032903300331033203330334033503360337033803390340034103420343034403450346034703480349035003510352035303540355035603570358035903600361036203630364036503660367036803690370037103720373037403750376037703780379038003810382038303840385038603870388038903900391039203930394039503960397039803990400040104020403040404050406040704080409041004110412041304140415041604170418041904200421042204230424042504260427042804290430043104320433043404350436043704380439044004410442044304440445044604470448044904500451045204530454045504560457045804590460046104620463046404650466046704680469047004710472047304740475047604770478047904800481048204830484048504860487048804890490049104920493049404950496049704980499050000010002000300040005000600070008000900100011001200130014001500160017001800190020002100220023002400250026002700280029003000310032003300340035003600370038003900400041004200430044004500460047004800490050005100520053005400550056005700580059006000610062006300640065006600670068006900700071007200730074007500760077007800790080008100820083008400850086008700880089009000910092009300940095009600970098009901000101010201030104010501060107010801090110011101120113011401150116011701180119012001210122012301240125012601270128012901300131013201330134013501360137013801390140014101420143014401450146014701480149015001510152015301540155015601570158015901600161016201630164016501660167016801690170017101720173017401750176017701780179018001810182018301840185018601870188018901900191019201930194019501960197019801990200020102020203020402050206020702080209021002110212021302140215021602170218021902200221022202230224022502260227022802290230023102320233023402350236023702380239024002410242024302440245024602470248024902500251025202530254025502560257025802590260026102620263026402650266026702680269027002710272027302740275027602770278027902800281028202830284028502860287028802890290029102920293029402950296029702980299030003010302030303040305030603070308030903100311031203130314031503160317031803190320032103220323032403250326032703280329033003310332033303340335033603370338033903400341034203430344034503460347034803490350035103520353035403550356035703580359036003610362036303640365036603670368036903700371037203730374037503760377037803790380038103820383038403850386038703880389039003910392039303940395039603970398039904000401040204030404040504060407040804090410041104120413041404150416041704180419042004210422042304240425042604270428042904300431043204330434043504360437043804390440044104420443044404450446044704480449045004510452045304540455045604570458045904600461046204630464046504660467046804690470047104720473047404750476047704780479048004810482048304840485048604870488048904900491049204930494049504960497049804990500000100020003000400050006000700080009001000110012001300140015001600170018001900200021002200230024002500260027002800290030003100320033003400350036003700380039004000410042004300440045004600470048004900500051005200530054005500560057005800590060006100620063006400650066006700680069007000710072007300740075007600770078007900800081008200830084008500860087008800890090009100920093009400950096009700980099010001010102010301040105010601070108010901100111011201130114011501160117011801190120012101220123012401250126012701280129013001310132013301340135013601370138013901400141014201430144014501460147014801490150015101520153015401550156015701580159016001610162016301640165016601670168016901700171017201730174017501760177017801790180018101820183018401850186018701880189019001910192019301940195019601970198019902000201020202030204020502060207020802090210021102120213021402150216021702180219022002210222022302240225022602270228022902300231023202330234023502360237023802390240024102420243024402450246024702480249025002510252025302540255025602570258025902600261026202630264026502660267026802690270027102720273027402750276027702780279028002810282028302840285028602870288028902900291029202930294029502960297029802990300030103020303030403050306030703080309031003110312031303140315031603170318031903200321032203230324032503260327032803290330033103320333033403350336033703380339034003410342034303440345034603470348034903500351035203530354035503560357035803590360036103620363036403650366036703680369037003710372037303740375037603770378037903800381038203830384038503860387038803890390039103920393039403950396039703980399040004010402040304040405040604070408040904100411041204130414041504160417041804190420042104220423042404250426042704280429043004310432043304340435043604370438043904400441044204430444044504460447044804490450045104520453045404550456045704580459046004610462046304640465046604670468046904700471047204730474047504760477047804790480048104820483048404850486048704880489049004910492049304940495049604970498049905000001000200030004000500060007000800090010001100120013001400150016001700180019002000210022002300240025002600270028002900300031003200330034003500360037003800390040004100420043004400450046004700480049005000510052005300540055005600570058005900600061006200630064006500660067006800690070007100720073007400750076007700780079008000810082008300840085008600870088008900900091009200930094009500960097009800990100010101020103010401050106010701080109011001110112011301140115011601170118011901200121012201230124012501260127012801290130013101320133013401350136013701380139014001410142014301440145014601470148014901500151015201530154015501560157015801590160016101620163016401650166016701680169017001710172017301740175017601770178017901800181018201830184018501860187018801890190019101920193019401950196019701980199020002010202020302040205020602070208020902100211021202130214021502160217021802190220022102220223022402250226022702280229023002310232023302340235023602370238023902400241024202430244024502460247024802490250025102520253025402550256025702580259026002610262026302640265026602670268026902700271027202730274027502760277027802790280028102820283028402850286028702880289029002910292029302940295029602970298029903000301030203030304030503060307030803090310031103120313031403150316031703180319032003210322032303240325032603270328032903300331033203330334033503360337033803390340034103420343034403450346034703480349035003510352035303540355035603570358035903600361036203630364036503660367036803690370037103720373037403750376037703780379038003810382038303840385038603870388038903900391039203930394039503960397039803990400040104020403040404050406040704080409041004110412041304140415041604170418041904200421042204230424042504260427042804290430043104320433043404350436043704380439044004410442044304440445044604470448044904500451045204530454045504560457045804590460046104620463046404650466046704680469047004710472047304740475047604770478047904800481048204830484048504860487048804890490049104920493049404950496049704980499050000010002000300040005000600070008000900100011001200130014001500160017001800190020002100220023002400250026002700280029003000310032003300340035003600370038003900400041004200430044004500460047004800490050005100520053005400550056005700580059006000610062006300640065006600670068006900700071007200730074007500760077007800790080008100820083008400850086008700880089009000910092009300940095009600970098009901000101010201030104010501060107010801090110011101120113011401150116011701180119012001210122012301240125012601270128012901300131013201330134013501360137013801390140014101420143014401450146014701480149015001510152015301540155015601570158015901600161016201630164016501660167016801690170017101720173017401750176017701780179018001810182018301840185018601870188018901900191019201930194019501960197019801990200020102020203020402050206020702080209021002110212021302140215021602170218021902200221022202230224022502260227022802290230023102320233023402350236023702380239024002410242024302440245024602470248024902500251025202530254025502560257025802590260026102620263026402650266026702680269027002710272027302740275027602770278027902800281028202830284028502860287028802890290029102920293029402950296029702980299030003010302030303040305030603070308030903100311031203130314031503160317031803190320032103220323032403250326032703280329033003310332033303340335033603370338033903400341034203430344034503460347034803490350035103520353035403550356035703580359036003610362036303640365036603670368036903700371037203730374037503760377037803790380038103820383038403850386038703880389039003910392039303940395039603970398039904000401040204030404040504060407040804090410041104120413041404150416041704180419042004210422042304240425042604270428042904300431043204330434043504360437043804390440044104420443044404450446044704480449045004510452045304540455045604570458045904600461046204630464046504660467046804690470047104720473047404750476047704780479048004810482048304840485048604870488048904900491049204930494049504960497049804990500000100020003000400050006000700080009001000110012001300140015001600170018001900200021002200230024002500260027002800290030003100320033003400350036003700380039004000410042004300440045004600470048004900500051005200530054005500560057005800590060006100620063006400650066006700680069007000710072007300740075007600770078007900800081008200830084008500860087008800890090009100920093009400950096009700980099010001010102010301040105010601070108010901100111011201130114011501160117011801190120012101220123012401250126012701280129013001310132013301340135013601370138013901400141014201430144014501460147014801490150015101520153015401550156015701580159016001610162016301640165016601670168016901700171017201730174017501760177017801790180018101820183018401850186018701880189019001910192019301940195019601970198019902000201020202030204020502060207020802090210021102120213021402150216021702180219022002210222022302240225022602270228022902300231023202330234023502360237023802390240024102420243024402450246024702480249025002510252025302540255025602570258025902600261026202630264026502660267026802690270027102720273027402750276027702780279028002810282028302840285028602870288028902900291029202930294029502960297029802990300030103020303030403050306030703080309031003110312031303140315031603170318031903200321032203230324032503260327032803290330033103320333033403350336033703380339034003410342034303440345034603470348034903500351035203530354035503560357035803590360036103620363036403650366036703680369037003710372037303740375037603770378037903800381038203830384038503860387038803890390039103920393039403950396039703980399040004010402040304040405040604070408040904100411041204130414041504160417041804190420042104220423042404250426042704280429043004310432043304340435043604370438043904400441044204430444044504460447044804490450045104520453045404550456045704580459046004610462046304640465046604670468046904700471047204730474047504760477047804790480048104820483048404850486048704880489049004910492049304940495049604970498049905000001000200030004000500060007000800090010001100120013001400150016001700180019002000210022002300240025002600270028002900300031003200330034003500360037003800390040004100420043004400450046004700480049005000510052005300540055005600570058005900600061006200630064006500660067006800690070007100720073007400750076007700780079008000810082008300840085008600870088008900900091009200930094009500960097009800990100010101020103010401050106010701080109011001110112011301140115011601170118011901200121012201230124012501260127012801290130013101320133013401350136013701380139014001410142014301440145014601470148014901500151015201530154015501560157015801590160016101620163016401650166016701680169017001710172017301740175017601770178017901800181018201830184018501860187018801890190019101920193019401950196019701980199020002010202020302040205020602070208020902100211021202130214021502160217021802190220022102220223022402250226022702280229023002310232023302340235023602370238023902400241024202430244024502460247024802490250025102520253025402550256025702580259026002610262026302640265026602670268026902700271027202730274027502760277027802790280028102820283028402850286028702880289029002910292029302940295029602970298029903000301030203030304030503060307030803090310031103120313031403150316031703180319032003210322032303240325032603270328032903300331033203330334033503360337033803390340034103420343034403450346034703480349035003510352035303540355035603570358035903600361036203630364036503660367036803690370037103720373037403750376037703780379038003810382038303840385038603870388038903900391039203930394039503960397039803990400040104020403040404050406040704080409041004110412041304140415041604170418041904200421042204230424042504260427042804290430043104320433043404350436043704380439044004410442044304440445044604470448044904500451045204530454045504560457045804590460046104620463046404650466046704680469047004710472047304740475047604770478047904800481048204830484048504860487048804890490049104920493049404950496049704980499050000010002000300040005000600070008000900100011001200130014001500160017001800190020002100220023002400250026002700280029003000310032003300340035003600370038003900400041004200430044004500460047004800490050005100520053005400550056005700580059006000610062006300640065006600670068006900700071007200730074007500760077007800790080008100820083008400850086008700880089009000910092009300940095009600970098009901000101010201030104010501060107010801090110011101120113011401150116011701180119012001210122012301240125012601270128012901300131013201330134013501360137013801390140014101420143014401450146014701480149015001510152015301540155015601570158015901600161016201630164016501660167016801690170017101720173017401750176017701780179018001810182018301840185018601870188018901900191019201930194019501960197019801990200020102020203020402050206020702080209021002110212021302140215021602170218021902200221022202230224022502260227022802290230023102320233023402350236023702380239024002410242024302440245024602470248024902500251025202530254025502560257025802590260026102620263026402650266026702680269027002710272027302740275027602770278027902800281028202830284028502860287028802890290029102920293029402950296029702980299030003010302030303040305030603070308030903100311031203130314031503160317031803190320032103220323032403250326032703280329033003310332033303340335033603370338033903400341034203430344034503460347034803490350035103520353035403550356035703580359036003610362036303640365036603670368036903700371037203730374037503760377037803790380038103820383038403850386038703880389039003910392039303940395039603970398039904000401040204030404040504060407040804090410041104120413041404150416041704180419042004210422042304240425042604270428042904300431043204330434043504360437043804390440044104420443044404450446044704480449045004510452045304540455045604570458045904600461046204630464046504660467046804690470047104720473047404750476047704780479048004810482048304840485048604870488048904900491049204930494049504960497049804990500000100020003000400050006000700080009001000110012001300140015001600170018001900200021002200230024002500260027002800290030003100320033003400350036003700380039004000410042004300440045004600470048004900500051005200530054005500560057005800590060006100620063006400650066006700680069007000710072007300740075007600770078007900800081008200830084008500860087008800890090009100920093009400950096009700980099010001010102010301040105010601070108010901100111011201130114011501160117011801190120012101220123012401250126012701280129013001310132013301340135013601370138013901400141014201430144014501460147014801490150015101520153015401550156015701580159016001610162016301640165016601670168016901700171017201730174017501760177017801790180018101820183018401850186018701880189019001910192019301940195019601970198019902000201020202030204020502060207020802090210021102120213021402150216021702180219022002210222022302240225022602270228022902300231023202330234023502360237023802390240024102420243024402450246024702480249025002510252025302540255025602570258025902600261026202630264026502660267026802690270027102720273027402750276027702780279028002810282028302840285028602870288028902900291029202930294029502960297029802990300030103020303030403050306030703080309031003110312031303140315031603170318031903200321032203230324032503260327032803290330033103320333033403350336033703380339034003410342034303440345034603470348034903500351035203530354035503560357035803590360036103620363036403650366036703680369037003710372037303740375037603770378037903800381038203830384038503860387038803890390039103920393039403950396039703980399040004010402040304040405040604070408040904100411041204130414041504160417041804190420042104220423042404250426042704280429043004310432043304340435043604370438043904400441044204430444044504460447044804490450045104520453045404550456045704580459046004610462046304640465046604670468046904700471047204730474047504760477047804790480048104820483048404850486048704880489049004910492049304940495049604970498049905000001000200030004000500060007000800090010001100120013001400150016001700180019002000210022002300240025002600270028002900300031003200330034003500360037003800390040004100420043004400450046004700480049005000510052005300540055005600570058005900600061006200630064006500660067006800690070007100720073007400750076007700780079008000810082008300840085008600870088008900900091009200930094009500960097009800990100010101020103010401050106010701080109011001110112011301140115011601170118011901200121012201230124012501260127012801290130013101320133013401350136013701380139014001410142014301440145014601470148014901500151015201530154015501560157015801590160016101620163016401650166016701680169017001710172017301740175017601770178017901800181018201830184018501860187018801890190019101920193019401950196019701980199020002010202020302040205020602070208020902100211021202130214021502160217021802190220022102220223022402250226022702280229023002310232023302340235023602370238023902400241024202430244024502460247024802490250025102520253025402550256025702580259026002610262026302640265026602670268026902700271027202730274027502760277027802790280028102820283028402850286028702880289029002910292029302940295029602970298029903000301030203030304030503060307030803090310031103120313031403150316031703180319032003210322032303240325032603270328032903300331033203330334033503360337033803390340034103420343034403450346034703480349035003510352035303540355035603570358035903600361036203630364036503660367036803690370037103720373037403750376037703780379038003810382038303840385038603870388038903900391039203930394039503960397039803990400040104020403040404050406040704080409041004110412041304140415041604170418041904200421042204230424042504260427042804290430043104320433043404350436043704380439044004410442044304440445044604470448044904500451045204530454045504560457045804590460046104620463046404650466046704680469047004710472047304740475047604770478047904800481048204830484048504860487048804890490049104920493049404950496049704980499050000010002000300040005000600070008000900100011001200130014001500160017001800190020002100220023002400250026002700280029003000310032003300340035003600370038003900400041004200430044004500460047004800490050005100520053005400550056005700580059006000610062006300640065006600670068006900700071007200730074007500760077007800790080008100820083008400850086008700880089009000910092009300940095009600970098009901000101010201030104010501060107010801090110011101120113011401150116011701180119012001210122012301240125012601270128012901300131013201330134013501360137013801390140014101420143014401450146014701480149015001510152015301540155015601570158015901600161016201630164016501660167016801690170017101720173017401750176017701780179018001810182018301840185018601870188018901900191019201930194019501960197019801990200020102020203020402050206020702080209021002110212021302140215021602170218021902200221022202230224022502260227022802290230023102320233023402350236023702380239024002410242024302440245024602470248024902500251025202530254025502560257025802590260026102620263026402650266026702680269027002710272027302740275027602770278027902800281028202830284028502860287028802890290029102920293029402950296029702980299030003010302030303040305030603070308030903100311031203130314031503160317031803190320032103220323032403250326032703280329033003310332033303340335033603370338033903400341034203430344034503460347034803490350035103520353035403550356035703580359036003610362036303640365036603670368036903700371037203730374037503760377037803790380038103820383038403850386038703880389039003910392039303940395039603970398039904000401040204030404040504060407040804090410041104120413041404150416041704180419042004210422042304240425042604270428042904300431043204330434043504360437043804390440044104420443044404450446044704480449045004510452045304540455045604570458045904600461046204630464046504660467046804690470047104720473047404750476047704780479048004810482048304840485048604870488048904900491049204930494049504960497049804990500000100020003000400050006000700080009001000110012001300140015001600170018001900200021002200230024002500260027002800290030003100320033003400350036003700380039004000410042004300440045004600470048004900500051005200530054005500560057005800590060006100620063006400650066006700680069007000710072007300740075007600770078007900800081008200830084008500860087008800890090009100920093009400950096009700980099010001010102010301040105010601070108010901100111011201130114011501160117011801190120012101220123012401250126012701280129013001310132013301340135013601370138013901400141014201430144014501460147014801490150015101520153015401550156015701580159016001610162016301640165016601670168016901700171017201730174017501760177017801790180018101820183018401850186018701880189019001910192019301940195019601970198019902000201020202030204020502060207020802090210021102120213021402150216021702180219022002210222022302240225022602270228022902300231023202330234023502360237023802390240024102420243024402450246024702480249025002510252025302540255025602570258025902600261026202630264026502660267026802690270027102720273027402750276027702780279028002810282028302840285028602870288028902900291029202930294029502960297029802990300030103020303030403050306030703080309031003110312031303140315031603170318031903200321032203230324032503260327032803290330033103320333033403350336033703380339034003410342034303440345034603470348034903500351035203530354035503560357035803590360036103620363036403650366036703680369037003710372037303740375037603770378037903800381038203830384038503860387038803890390039103920393039403950396039703980399040004010402040304040405040604070408040904100411041204130414041504160417041804190420042104220423042404250426042704280429043004310432043304340435043604370438043904400441044204430444044504460447044804490450045104520453045404550456045704580459046004610462046304640465046604670468046904700471047204730474047504760477047804790480048104820483048404850486048704880489049004910492049304940495049604970498049905000001000200030004000500060007000800090010001100120013001400150016001700180019002000210022002300240025002600270028002900300031003200330034003500360037003800390040004100420043004400450046004700480049005000510052005300540055005600570058005900600061006200630064006500660067006800690070007100720073007400750076007700780079008000810082008300840085008600870088008900900091009200930094009500960097009800990100010101020103010401050106010701080109011001110112011301140115011601170118011901200121012201230124012501260127012801290130013101320133013401350136013701380139014001410142014301440145014601470148014901500151015201530154015501560157015801590160016101620163016401650166016701680169017001710172017301740175017601770178017901800181018201830184018501860187018801890190019101920193019401950196019701980199020002010202020302040205020602070208020902100211021202130214021502160217021802190220022102220223022402250226022702280229023002310232023302340235023602370238023902400241024202430244024502460247024802490250025102520253025402550256025702580259026002610262026302640265026602670268026902700271027202730274027502760277027802790280028102820283028402850286028702880289029002910292029302940295029602970298029903000301030203030304030503060307030803090310031103120313031403150316031703180319032003210322032303240325032603270328032903300331033203330334033503360337033803390340034103420343034403450346034703480349035003510352035303540355035603570358035903600361036203630364036503660367036803690370037103720373037403750376037703780379038003810382038303840385038603870388038903900391039203930394039503960397039803990400040104020403040404050406040704080409041004110412041304140415041604170418041904200421042204230424042504260427042804290430043104320433043404350436043704380439044004410442044304440445044604470448044904500451045204530454045504560457045804590460046104620463046404650466046704680469047004710472047304740475047604770478047904800481048204830484048504860487048804890490049104920493049404950496049704980499050000010002000300040005000600070008000900100011001200130014001500160017001800190020002100220023002400250026002700280029003000310032003300340035003600370038003900400041004200430044004500460047004800490050005100520053005400550056005700580059006000610062006300640065006600670068006900700071007200730074007500760077007800790080008100820083008400850086008700880089009000910092009300940095009600970098009901000101010201030104010501060107010801090110011101120113011401150116011701180119012001210122012301240125012601270128012901300131013201330134013501360137013801390140014101420143014401450146014701480149015001510152015301540155015601570158015901600161016201630164016501660167016801690170017101720173017401750176017701780179018001810182018301840185018601870188018901900191019201930194019501960197019801990200020102020203020402050206020702080209021002110212021302140215021602170218021902200221022202230224022502260227022802290230023102320233023402350236023702380239024002410242024302440245024602470248024902500251025202530254025502560257025802590260026102620263026402650266026702680269027002710272027302740275027602770278027902800281028202830284028502860287028802890290029102920293029402950296029702980299030003010302030303040305030603070308030903100311031203130314031503160317031803190320032103220323032403250326032703280329033003310332033303340335033603370338033903400341034203430344034503460347034803490350035103520353035403550356035703580359036003610362036303640365036603670368036903700371037203730374037503760377037803790380038103820383038403850386038703880389039003910392039303940395039603970398039904000401040204030404040504060407040804090410041104120413041404150416041704180419042004210422042304240425042604270428042904300431043204330434043504360437043804390440044104420443044404450446044704480449045004510452045304540455045604570458045904600461046204630464046504660467046804690470047104720473047404750476047704780479048004810482048304840485048604870488048904900491049204930494049504960497049804990500000100020003000400050006000700080009001000110012001300140015001600170018001900200021002200230024002500260027002800290030003100320033003400350036003700380039004000410042004300440045004600470048004900500051005200530054005500560057005800590060006100620063006400650066006700680069007000710072007300740075007600770078007900800081008200830084008500860087008800890090009100920093009400950096009700980099010001010102010301040105010601070108010901100111011201130114011501160117011801190120012101220123012401250126012701280129013001310132013301340135013601370138013901400141014201430144014501460147014801490150015101520153015401550156015701580159016001610162016301640165016601670168016901700171017201730174017501760177017801790180018101820183018401850186018701880189019001910192019301940195019601970198019902000201020202030204020502060207020802090210021102120213021402150216021702180219022002210222022302240225022602270228022902300231023202330234023502360237023802390240024102420243024402450246024702480249025002510252025302540255025602570258025902600261026202630264026502660267026802690270027102720273027402750276027702780279028002810282028302840285028602870288028902900291029202930294029502960297029802990300030103020303030403050306030703080309031003110312031303140315031603170318031903200321032203230324032503260327032803290330033103320333033403350336033703380339034003410342034303440345034603470348034903500351035203530354035503560357035803590360036103620363036403650366036703680369037003710372037303740375037603770378037903800381038203830384038503860387038803890390039103920393039403950396039703980399040004010402040304040405040604070408040904100411041204130414041504160417041804190420042104220423042404250426042704280429043004310432043304340435043604370438043904400441044204430444044504460447044804490450045104520453045404550456045704580459046004610462046304640465046604670468046904700471047204730474047504760477047804790480048104820483048404850486048704880489049004910492049304940495049604970498049905000001000200030004000500060007000800090010001100120013001400150016001700180019002000210022002300240025002600270028002900300031003200330034003500360037003800390040004100420043004400450046004700480049005000510052005300540055005600570058005900600061006200630064006500660067006800690070007100720073007400750076007700780079008000810082008300840085008600870088008900900091009200930094009500960097009800990100010101020103010401050106010701080109011001110112011301140115011601170118011901200121012201230124012501260127012801290130013101320133013401350136013701380139014001410142014301440145014601470148014901500151015201530154015501560157015801590160016101620163016401650166016701680169017001710172017301740175017601770178017901800181018201830184018501860187018801890190019101920193019401950196019701980199020002010202020302040205020602070208020902100211021202130214021502160217021802190220022102220223022402250226022702280229023002310232023302340235023602370238023902400241024202430244024502460247024802490250025102520253025402550256025702580259026002610262026302640265026602670268026902700271027202730274027502760277027802790280028102820283028402850286028702880289029002910292029302940295029602970298029903000301030203030304030503060307030803090310031103120313031403150316031703180319032003210322032303240325032603270328032903300331033203330334033503360337033803390340034103420343034403450346034703480349035003510352035303540355035603570358035903600361036203630364036503660367036803690370037103720373037403750376037703780379038003810382038303840385038603870388038903900391039203930394039503960397039803990400040104020403040404050406040704080409041004110412041304140415041604170418041904200421042204230424042504260427042804290430043104320433043404350436043704380439044004410442044304440445044604470448044904500451045204530454045504560457045804590460046104620463046404650466046704680469047004710472047304740475047604770478047904800481048204830484048504860487048804890490049104920493049404950496049704980499050000010002000300040005000600070008000900100011001200130014001500160017001800190020002100220023002400250026002700280029003000310032003300340035003600370038003900400041004200430044004500460047004800490050005100520053005400550056005700580059006000610062006300640065006600670068006900700071007200730074007500760077007800790080008100820083008400850086008700880089009000910092009300940095009600970098009901000101010201030104010501060107010801090110011101120113011401150116011701180119012001210122012301240125012601270128012901300131013201330134013501360137013801390140014101420143014401450146014701480149015001510152015301540155015601570158015901600161016201630164016501660167016801690170017101720173017401750176017701780179018001810182018301840185018601870188018901900191019201930194019501960197019801990200020102020203020402050206020702080209021002110212021302140215021602170218021902200221022202230224022502260227022802290230023102320233023402350236023702380239024002410242024302440245024602470248024902500251025202530254025502560257025802590260026102620263026402650266026702680269027002710272027302740275027602770278027902800281028202830284028502860287028802890290029102920293029402950296029702980299030003010302030303040305030603070308030903100311031203130314031503160317031803190320032103220323032403250326032703280329033003310332033303340335033603370338033903400341034203430344034503460347034803490350035103520353035403550356035703580359036003610362036303640365036603670368036903700371037203730374037503760377037803790380038103820383038403850386038703880389039003910392039303940395039603970398039904000401040204030404040504060407040804090410041104120413041404150416041704180419042004210422042304240425042604270428042904300431043204330434043504360437043804390440044104420443044404450446044704480449045004510452045304540455045604570458045904600461046204630464046504660467046804690470047104720473047404750476047704780479048004810482048304840485048604870488048904900491049204930494049504960497049804990500000100020003000400050006000700080009001000110012001300140015001600170018001900200021002200230024002500260027002800290030003100320033003400350036003700380039004000410042004300440045004600470048004900500051005200530054005500560057005800590060006100620063006400650066006700680069007000710072007300740075007600770078007900800081008200830084008500860087008800890090009100920093009400950096009700980099010001010102010301040105010601070108010901100111011201130114011501160117011801190120012101220123012401250126012701280129013001310132013301340135013601370138013901400141014201430144014501460147014801490150015101520153015401550156015701580159016001610162016301640165016601670168016901700171017201730174017501760177017801790180018101820183018401850186018701880189019001910192019301940195019601970198019902000201020202030204020502060207020802090210021102120213021402150216021702180219022002210222022302240225022602270228022902300231023202330234023502360237023802390240024102420243024402450246024702480249025002510252025302540255025602570258025902600261026202630264026502660267026802690270027102720273027402750276027702780279028002810282028302840285028602870288028902900291029202930294029502960297029802990300030103020303030403050306030703080309031003110312031303140315031603170318031903200321032203230324032503260327032803290330033103320333033403350336033703380339034003410342034303440345034603470348034903500351035203530354035503560357035803590360036103620363036403650366036703680369037003710372037303740375037603770378037903800381038203830384038503860387038803890390039103920393039403950396039703980399040004010402040304040405040604070408040904100411041204130414041504160417041804190420042104220423042404250426042704280429043004310432043304340435043604370438043904400441044204430444044504460447044804490450045104520453045404550456045704580459046004610462046304640465046604670468046904700471047204730474047504760477047804790480048104820483048404850486048704880489049004910492049304940495049604970498049905000001000200030004000500060007000800090010001100120013001400150016001700180019002000210022002300240025002600270028002900300031003200330034003500360037003800390040004100420043004400450046004700480049005000510052005300540055005600570058005900600061006200630064006500660067006800690070007100720073007400750076007700780079008000810082008300840085008600870088008900900091009200930094009500960097009800990100010101020103010401050106010701080109011001110112011301140115011601170118011901200121012201230124012501260127012801290130013101320133013401350136013701380139014001410142014301440145014601470148014901500151015201530154015501560157015801590160016101620163016401650166016701680169017001710172017301740175017601770178017901800181018201830184018501860187018801890190019101920193019401950196019701980199020002010202020302040205020602070208020902100211021202130214021502160217021802190220022102220223022402250226022702280229023002310232023302340235023602370238023902400241024202430244024502460247024802490250025102520253025402550256025702580259026002610262026302640265026602670268026902700271027202730274027502760277027802790280028102820283028402850286028702880289029002910292029302940295029602970298029903000301030203030304030503060307030803090310031103120313031403150316031703180319032003210322032303240325032603270328032903300331033203330334033503360337033803390340034103420343034403450346034703480349035003510352035303540355035603570358035903600361036203630364036503660367036803690370037103720373037403750376037703780379038003810382038303840385038603870388038903900391039203930394039503960397039803990400040104020403040404050406040704080409041004110412041304140415041604170418041904200421042204230424042504260427042804290430043104320433043404350436043704380439044004410442044304440445044604470448044904500451045204530454045504560457045804590460046104620463046404650466046704680469047004710472047304740475047604770478047904800481048204830484048504860487048804890490049104920493049404950496049704980499050000010002000300040005000600070008000900100011001200130014001500160017001800190020002100220023002400250026002700280029003000310032003300340035003600370038003900400041004200430044004500460047004800490050005100520053005400550056005700580059006000610062006300640065006600670068006900700071007200730074007500760077007800790080008100820083008400850086008700880089009000910092009300940095009600970098009901000101010201030104010501060107010801090110011101120113011401150116011701180119012001210122012301240125012601270128012901300131013201330134013501360137013801390140014101420143014401450146014701480149015001510152015301540155015601570158015901600161016201630164016501660167016801690170017101720173017401750176017701780179018001810182018301840185018601870188018901900191019201930194019501960197019801990200020102020203020402050206020702080209021002110212021302140215021602170218021902200221022202230224022502260227022802290230023102320233023402350236023702380239024002410242024302440245024602470248024902500251025202530254025502560257025802590260026102620263026402650266026702680269027002710272027302740275027602770278027902800281028202830284028502860287028802890290029102920293029402950296029702980299030003010302030303040305030603070308030903100311031203130314031503160317031803190320032103220323032403250326032703280329033003310332033303340335033603370338033903400341034203430344034503460347034803490350035103520353035403550356035703580359036003610362036303640365036603670368036903700371037203730374037503760377037803790380038103820383038403850386038703880389039003910392039303940395039603970398039904000401040204030404040504060407040804090410041104120413041404150416041704180419042004210422042304240425042604270428042904300431043204330434043504360437043804390440044104420443044404450446044704480449045004510452045304540455045604570458045904600461046204630464046504660467046804690470047104720473047404750476047704780479048004810482048304840485048604870488048904900491049204930494049504960497049804990500000100020003000400050006000700080009001000110012001300140015001600170018001900200021002200230024002500260027002800290030003100320033003400350036003700380039004000410042004300440045004600470048004900500051005200530054005500560057005800590060006100620063006400650066006700680069007000710072007300740075007600770078007900800081008200830084008500860087008800890090009100920093009400950096009700980099010001010102010301040105010601070108010901100111011201130114011501160117011801190120012101220123012401250126012701280129013001310132013301340135013601370138013901400141014201430144014501460147014801490150015101520153015401550156015701580159016001610162016301640165016601670168016901700171017201730174017501760177017801790180018101820183018401850186018701880189019001910192019301940195019601970198019902000201020202030204020502060207020802090210021102120213021402150216021702180219022002210222022302240225022602270228022902300231023202330234023502360237023802390240024102420243024402450246024702480249025002510252025302540255025602570258025902600261026202630264026502660267026802690270027102720273027402750276027702780279028002810282028302840285028602870288028902900291029202930294029502960297029802990300030103020303030403050306030703080309031003110312031303140315031603170318031903200321032203230324032503260327032803290330033103320333033403350336033703380339034003410342034303440345034603470348034903500351035203530354035503560357035803590360036103620363036403650366036703680369037003710372037303740375037603770378037903800381038203830384038503860387038803890390039103920393039403950396039703980399040004010402040304040405040604070408040904100411041204130414041504160417041804190420042104220423042404250426042704280429043004310432043304340435043604370438043904400441044204430444044504460447044804490450045104520453045404550456045704580459046004610462046304640465046604670468046904700471047204730474047504760477047804790480048104820483048404850486048704880489049004910492049304940495049604970498049905000001000200030004000500060007000800090010001100120013001400150016001700180019002000210022002300240025002600270028002900300031003200330034003500360037003800390040004100420043004400450046004700480049005000510052005300540055005600570058005900600061006200630064006500660067006800690070007100720073007400750076007700780079008000810082008300840085008600870088008900900091009200930094009500960097009800990100010101020103010401050106010701080109011001110112011301140115011601170118011901200121012201230124012501260127012801290130013101320133013401350136013701380139014001410142014301440145014601470148014901500151015201530154015501560157015801590160016101620163016401650166016701680169017001710172017301740175017601770178017901800181018201830184018501860187018801890190019101920193019401950196019701980199020002010202020302040205020602070208020902100211021202130214021502160217021802190220022102220223022402250226022702280229023002310232023302340235023602370238023902400241024202430244024502460247024802490250025102520253025402550256025702580259026002610262026302640265026602670268026902700271027202730274027502760277027802790280028102820283028402850286028702880289029002910292029302940295029602970298029903000301030203030304030503060307030803090310031103120313031403150316031703180319032003210322032303240325032603270328032903300331033203330334033503360337033803390340034103420343034403450346034703480349035003510352035303540355035603570358035903600361036203630364036503660367036803690370037103720373037403750376037703780379038003810382038303840385038603870388038903900391039203930394039503960397039803990400040104020403040404050406040704080409041004110412041304140415041604170418041904200421042204230424042504260427042804290430043104320433043404350436043704380439044004410442044304440445044604470448044904500451045204530454045504560457045804590460046104620463046404650466046704680469047004710472047304740475047604770478047904800481048204830484048504860487048804890490049104920493049404950496049704980499050000010002000300040005000600070008000900100011001200130014001500160017001800190020002100220023002400250026002700280029003000310032003300340035003600370038003900400041004200430044004500460047004800490050005100520053005400550056005700580059006000610062006300640065006600670068006900700071007200730074007500760077007800790080008100820083008400850086008700880089009000910092009300940095009600970098009901000101010201030104010501060107010801090110011101120113011401150116011701180119012001210122012301240125012601270128012901300131013201330134013501360137013801390140014101420143014401450146014701480149015001510152015301540155015601570158015901600161016201630164016501660167016801690170017101720173017401750176017701780179018001810182018301840185018601870188018901900191019201930194019501960197019801990200020102020203020402050206020702080209021002110212021302140215021602170218021902200221022202230224022502260227022802290230023102320233023402350236023702380239024002410242024302440245024602470248024902500251025202530254025502560257025802590260026102620263026402650266026702680269027002710272027302740275027602770278027902800281028202830284028502860287028802890290029102920293029402950296029702980299030003010302030303040305030603070308030903100311031203130314031503160317031803190320032103220323032403250326032703280329033003310332033303340335033603370338033903400341034203430344034503460347034803490350035103520353035403550356035703580359036003610362036303640365036603670368036903700371037203730374037503760377037803790380038103820383038403850386038703880389039003910392039303940395039603970398039904000401040204030404040504060407040804090410041104120413041404150416041704180419042004210422042304240425042604270428042904300431043204330434043504360437043804390440044104420443044404450446044704480449045004510452045304540455045604570458045904600461046204630464046504660467046804690470047104720473047404750476047704780479048004810482048304840485048604870488048904900491049204930494049504960497049804990500000100020003000400050006000700080009001000110012001300140015001600170018001900200021002200230024002500260027002800290030003100320033003400350036003700380039004000410042004300440045004600470048004900500051005200530054005500560057005800590060006100620063006400650066006700680069007000710072007300740075007600770078007900800081008200830084008500860087008800890090009100920093009400950096009700980099010001010102010301040105010601070108010901100111011201130114011501160117011801190120012101220123012401250126012701280129013001310132013301340135013601370138013901400141014201430144014501460147014801490150015101520153015401550156015701580159016001610162016301640165016601670168016901700171017201730174017501760177017801790180018101820183018401850186018701880189019001910192019301940195019601970198019902000201020202030204020502060207020802090210021102120213021402150216021702180219022002210222022302240225022602270228022902300231023202330234023502360237023802390240024102420243024402450246024702480249025002510252025302540255025602570258025902600261026202630264026502660267026802690270027102720273027402750276027702780279028002810282028302840285028602870288028902900291029202930294029502960297029802990300030103020303030403050306030703080309031003110312031303140315031603170318031903200321032203230324032503260327032803290330033103320333033403350336033703380339034003410342034303440345034603470348034903500351035203530354035503560357035803590360036103620363036403650366036703680369037003710372037303740375037603770378037903800381038203830384038503860387038803890390039103920393039403950396039703980399040004010402040304040405040604070408040904100411041204130414041504160417041804190420042104220423042404250426042704280429043004310432043304340435043604370438043904400441044204430444044504460447044804490450045104520453045404550456045704580459046004610462046304640465046604670468046904700471047204730474047504760477047804790480048104820483048404850486048704880489049004910492049304940495049604970498049905000001000200030004000500060007000800090010001100120013001400150016001700180019002000210022002300240025002600270028002900300031003200330034003500360037003800390040004100420043004400450046004700480049005000510052005300540055005600570058005900600061006200630064006500660067006800690070007100720073007400750076007700780079008000810082008300840085008600870088008900900091009200930094009500960097009800990100010101020103010401050106010701080109011001110112011301140115011601170118011901200121012201230124012501260127012801290130013101320133013401350136013701380139014001410142014301440145014601470148014901500151015201530154015501560157015801590160016101620163016401650166016701680169017001710172017301740175017601770178017901800181018201830184018501860187018801890190019101920193019401950196019701980199020002010202020302040205020602070208020902100211021202130214021502160217021802190220022102220223022402250226022702280229023002310232023302340235023602370238023902400241024202430244024502460247024802490250025102520253025402550256025702580259026002610262026302640265026602670268026902700271027202730274027502760277027802790280028102820283028402850286028702880289029002910292029302940295029602970298029903000301030203030304030503060307030803090310031103120313031403150316031703180319032003210322032303240325032603270328032903300331033203330334033503360337033803390340034103420343034403450346034703480349035003510352035303540355035603570358035903600361036203630364036503660367036803690370037103720373037403750376037703780379038003810382038303840385038603870388038903900391039203930394039503960397039803990400040104020403040404050406040704080409041004110412041304140415041604170418041904200421042204230424042504260427042804290430043104320433043404350436043704380439044004410442044304440445044604470448044904500451045204530454045504560457045804590460046104620463046404650466046704680469047004710472047304740475047604770478047904800481048204830484048504860487048804890490049104920493049404950496049704980499050000010002000300040005000600070008000900100011001200130014001500160017001800190020002100220023002400250026002700280029003000310032003300340035003600370038003900400041004200430044004500460047004800490050005100520053005400550056005700580059006000610062006300640065006600670068006900700071007200730074007500760077007800790080008100820083008400850086008700880089009000910092009300940095009600970098009901000101010201030104010501060107010801090110011101120113011401150116011701180119012001210122012301240125012601270128012901300131013201330134013501360137013801390140014101420143014401450146014701480149015001510152015301540155015601570158015901600161016201630164016501660167016801690170017101720173017401750176017701780179018001810182018301840185018601870188018901900191019201930194019501960197019801990200020102020203020402050206020702080209021002110212021302140215021602170218021902200221022202230224022502260227022802290230023102320233023402350236023702380239024002410242024302440245024602470248024902500251025202530254025502560257025802590260026102620263026402650266026702680269027002710272027302740275027602770278027902800281028202830284028502860287028802890290029102920293029402950296029702980299030003010302030303040305030603070308030903100311031203130314031503160317031803190320032103220323032403250326032703280329033003310332033303340335033603370338033903400341034203430344034503460347034803490350035103520353035403550356035703580359036003610362036303640365036603670368036903700371037203730374037503760377037803790380038103820383038403850386038703880389039003910392039303940395039603970398039904000401040204030404040504060407040804090410041104120413041404150416041704180419042004210422042304240425042604270428042904300431043204330434043504360437043804390440044104420443044404450446044704480449045004510452045304540455045604570458045904600461046204630464046504660467046804690470047104720473047404750476047704780479048004810482048304840485048604870488048904900491049204930494049504960497049804990500000100020003000400050006000700080009001000110012001300140015001600170018001900200021002200230024002500260027002800290030003100320033003400350036003700380039004000410042004300440045004600470048004900500051005200530054005500560057005800590060006100620063006400650066006700680069007000710072007300740075007600770078007900800081008200830084008500860087008800890090009100920093009400950096009700980099010001010102010301040105010601070108010901100111011201130114011501160117011801190120012101220123012401250126012701280129013001310132013301340135013601370138013901400141014201430144014501460147014801490150015101520153015401550156015701580159016001610162016301640165016601670168016901700171017201730174017501760177017801790180018101820183018401850186018701880189019001910192019301940195019601970198019902000201020202030204020502060207020802090210021102120213021402150216021702180219022002210222022302240225022602270228022902300231023202330234023502360237023802390240024102420243024402450246024702480249025002510252025302540255025602570258025902600261026202630264026502660267026802690270027102720273027402750276027702780279028002810282028302840285028602870288028902900291029202930294029502960297029802990300030103020303030403050306030703080309031003110312031303140315031603170318031903200321032203230324032503260327032803290330033103320333033403350336033703380339034003410342034303440345034603470348034903500351035203530354035503560357035803590360036103620363036403650366036703680369037003710372037303740375037603770378037903800381038203830384038503860387038803890390039103920393039403950396039703980399040004010402040304040405040604070408040904100411041204130414041504160417041804190420042104220423042404250426042704280429043004310432043304340435043604370438043904400441044204430444044504460447044804490450045104520453045404550456045704580459046004610462046304640465046604670468046904700471047204730474047504760477047804790480048104820483048404850486048704880489049004910492049304940495049604970498049905000001000200030004000500060007000800090010001100120013001400150016001700180019002000210022002300240025002600270028002900300031003200330034003500360037003800390040004100420043004400450046004700480049005000510052005300540055005600570058005900600061006200630064006500660067006800690070007100720073007400750076007700780079008000810082008300840085008600870088008900900091009200930094009500960097009800990100010101020103010401050106010701080109011001110112011301140115011601170118011901200121012201230124012501260127012801290130013101320133013401350136013701380139014001410142014301440145014601470148014901500151015201530154015501560157015801590160016101620163016401650166016701680169017001710172017301740175017601770178017901800181018201830184018501860187018801890190019101920193019401950196019701980199020002010202020302040205020602070208020902100211021202130214021502160217021802190220022102220223022402250226022702280229023002310232023302340235023602370238023902400241024202430244024502460247024802490250025102520253025402550256025702580259026002610262026302640265026602670268026902700271027202730274027502760277027802790280028102820283028402850286028702880289029002910292029302940295029602970298029903000301030203030304030503060307030803090310031103120313031403150316031703180319032003210322032303240325032603270328032903300331033203330334033503360337033803390340034103420343034403450346034703480349035003510352035303540355035603570358035903600361036203630364036503660367036803690370037103720373037403750376037703780379038003810382038303840385038603870388038903900391039203930394039503960397039803990400040104020403040404050406040704080409041004110412041304140415041604170418041904200421042204230424042504260427042804290430043104320433043404350436043704380439044004410442044304440445044604470448044904500451045204530454045504560457045804590460046104620463046404650466046704680469047004710472047304740475047604770478047904800481048204830484048504860487048804890490049104920493049404950496049704980499050000010002000300040005000600070008000900100011001200130014001500160017001800190020002100220023002400250026002700280029003000310032003300340035003600370038003900400041004200430044004500460047004800490050005100520053005400550056005700580059006000610062006300640065006600670068006900700071007200730074007500760077007800790080008100820083008400850086008700880089009000910092009300940095009600970098009901000101010201030104010501060107010801090110011101120113011401150116011701180119012001210122012301240125012601270128012901300131013201330134013501360137013801390140014101420143014401450146014701480149015001510152015301540155015601570158015901600161016201630164016501660167016801690170017101720173017401750176017701780179018001810182018301840185018601870188018901900191019201930194019501960197019801990200020102020203020402050206020702080209021002110212021302140215021602170218021902200221022202230224022502260227022802290230023102320233023402350236023702380239024002410242024302440245024602470248024902500251025202530254025502560257025802590260026102620263026402650266026702680269027002710272027302740275027602770278027902800281028202830284028502860287028802890290029102920293029402950296029702980299030003010302030303040305030603070308030903100311031203130314031503160317031803190320032103220323032403250326032703280329033003310332033303340335033603370338033903400341034203430344034503460347034803490350035103520353035403550356035703580359036003610362036303640365036603670368036903700371037203730374037503760377037803790380038103820383038403850386038703880389039003910392039303940395039603970398039904000401040204030404040504060407040804090410041104120413041404150416041704180419042004210422042304240425042604270428042904300431043204330434043504360437043804390440044104420443044404450446044704480449045004510452045304540455045604570458045904600461046204630464046504660467046804690470047104720473047404750476047704780479048004810482048304840485048604870488048904900491049204930494049504960497049804990500000100020003000400050006000700080009001000110012001300140015001600170018001900200021002200230024002500260027002800290030003100320033003400350036003700380039004000410042004300440045004600470048004900500051005200530054005500560057005800590060006100620063006400650066006700680069007000710072007300740075007600770078007900800081008200830084008500860087008800890090009100920093009400950096009700980099010001010102010301040105010601070108010901100111011201130114011501160117011801190120012101220123012401250126012701280129013001310132013301340135013601370138013901400141014201430144014501460147014801490150015101520153015401550156015701580159016001610162016301640165016601670168016901700171017201730174017501760177017801790180018101820183018401850186018701880189019001910192019301940195019601970198019902000201020202030204020502060207020802090210021102120213021402150216021702180219022002210222022302240225022602270228022902300231023202330234023502360237023802390240024102420243024402450246024702480249025002510252025302540255025602570258025902600261026202630264026502660267026802690270027102720273027402750276027702780279028002810282028302840285028602870288028902900291029202930294029502960297029802990300030103020303030403050306030703080309031003110312031303140315031603170318031903200321032203230324032503260327032803290330033103320333033403350336033703380339034003410342034303440345034603470348034903500351035203530354035503560357035803590360036103620363036403650366036703680369037003710372037303740375037603770378037903800381038203830384038503860387038803890390039103920393039403950396039703980399040004010402040304040405040604070408040904100411041204130414041504160417041804190420042104220423042404250426042704280429043004310432043304340435043604370438043904400441044204430444044504460447044804490450045104520453045404550456045704580459046004610462046304640465046604670468046904700471047204730474047504760477047804790480048104820483048404850486048704880489049004910492049304940495049604970498049905000001000200030004000500060007000800090010001100120013001400150016001700180019002000210022002300240025002600270028002900300031003200330034003500360037003800390040004100420043004400450046004700480049005000510052005300540055005600570058005900600061006200630064006500660067006800690070007100720073007400750076007700780079008000810082008300840085008600870088008900900091009200930094009500960097009800990100010101020103010401050106010701080109011001110112011301140115011601170118011901200121012201230124012501260127012801290130013101320133013401350136013701380139014001410142014301440145014601470148014901500151015201530154015501560157015801590160016101620163016401650166016701680169017001710172017301740175017601770178017901800181018201830184018501860187018801890190019101920193019401950196019701980199020002010202020302040205020602070208020902100211021202130214021502160217021802190220022102220223022402250226022702280229023002310232023302340235023602370238023902400241024202430244024502460247024802490250025102520253025402550256025702580259026002610262026302640265026602670268026902700271027202730274027502760277027802790280028102820283028402850286028702880289029002910292029302940295029602970298029903000301030203030304030503060307030803090310031103120313031403150316031703180319032003210322032303240325032603270328032903300331033203330334033503360337033803390340034103420343034403450346034703480349035003510352035303540355035603570358035903600361036203630364036503660367036803690370037103720373037403750376037703780379038003810382038303840385038603870388038903900391039203930394039503960397039803990400040104020403040404050406040704080409041004110412041304140415041604170418041904200421042204230424042504260427042804290430043104320433043404350436043704380439044004410442044304440445044604470448044904500451045204530454045504560457045804590460046104620463046404650466046704680469047004710472047304740475047604770478047904800481048204830484048504860487048804890490049104920493049404950496049704980499050000010002000300040005000600070008000900100011001200130014001500160017001800190020002100220023002400250026002700280029003000310032003300340035003600370038003900400041004200430044004500460047004800490050005100520053005400550056005700580059006000610062006300640065006600670068006900700071007200730074007500760077007800790080008100820083008400850086008700880089009000910092009300940095009600970098009901000101010201030104010501060107010801090110011101120113011401150116011701180119012001210122012301240125012601270128012901300131013201330134013501360137013801390140014101420143014401450146014701480149015001510152015301540155015601570158015901600161016201630164016501660167016801690170017101720173017401750176017701780179018001810182018301840185018601870188018901900191019201930194019501960197019801990200020102020203020402050206020702080209021002110212021302140215021602170218021902200221022202230224022502260227022802290230023102320233023402350236023702380239024002410242024302440245024602470248024902500251025202530254025502560257025802590260026102620263026402650266026702680269027002710272027302740275027602770278027902800281028202830284028502860287028802890290029102920293029402950296029702980299030003010302030303040305030603070308030903100311031203130314031503160317031803190320032103220323032403250326032703280329033003310332033303340335033603370338033903400341034203430344034503460347034803490350035103520353035403550356035703580359036003610362036303640365036603670368036903700371037203730374037503760377037803790380038103820383038403850386038703880389039003910392039303940395039603970398039904000401040204030404040504060407040804090410041104120413041404150416041704180419042004210422042304240425042604270428042904300431043204330434043504360437043804390440044104420443044404450446044704480449045004510452045304540455045604570458045904600461046204630464046504660467046804690470047104720473047404750476047704780479048004810482048304840485048604870488048904900491049204930494049504960497049804990500000100020003000400050006000700080009001000110012001300140015001600170018001900200021002200230024002500260027002800290030003100320033003400350036003700380039004000410042004300440045004600470048004900500051005200530054005500560057005800590060006100620063006400650066006700680069007000710072007300740075007600770078007900800081008200830084008500860087008800890090009100920093009400950096009700980099010001010102010301040105010601070108010901100111011201130114011501160117011801190120012101220123012401250126012701280129013001310132013301340135013601370138013901400141014201430144014501460147014801490150015101520153015401550156015701580159016001610162016301640165016601670168016901700171017201730174017501760177017801790180018101820183018401850186018701880189019001910192019301940195019601970198019902000201020202030204020502060207020802090210021102120213021402150216021702180219022002210222022302240225022602270228022902300231023202330234023502360237023802390240024102420243024402450246024702480249025002510252025302540255025602570258025902600261026202630264026502660267026802690270027102720273027402750276027702780279028002810282028302840285028602870288028902900291029202930294029502960297029802990300030103020303030403050306030703080309031003110312031303140315031603170318031903200321032203230324032503260327032803290330033103320333033403350336033703380339034003410342034303440345034603470348034903500351035203530354035503560357035803590360036103620363036403650366036703680369037003710372037303740375037603770378037903800381038203830384038503860387038803890390039103920393039403950396039703980399040004010402040304040405040604070408040904100411041204130414041504160417041804190420042104220423042404250426042704280429043004310432043304340435043604370438043904400441044204430444044504460447044804490450045104520453045404550456045704580459046004610462046304640465046604670468046904700471047204730474047504760477047804790480048104820483048404850486048704880489049004910492049304940495049604970498049905000001000200030004000500060007000800090010001100120013001400150016001700180019002000210022002300240025002600270028002900300031003200330034003500360037003800390040004100420043004400450046004700480049005000510052005300540055005600570058005900600061006200630064006500660067006800690070007100720073007400750076007700780079008000810082008300840085008600870088008900900091009200930094009500960097009800990100010101020103010401050106010701080109011001110112011301140115011601170118011901200121012201230124012501260127012801290130013101320133013401350136013701380139014001410142014301440145014601470148014901500151015201530154015501560157015801590160016101620163016401650166016701680169017001710172017301740175017601770178017901800181018201830184018501860187018801890190019101920193019401950196019701980199020002010202020302040205020602070208020902100211021202130214021502160217021802190220022102220223022402250226022702280229023002310232023302340235023602370238023902400241024202430244024502460247024802490250025102520253025402550256025702580259026002610262026302640265026602670268026902700271027202730274027502760277027802790280028102820283028402850286028702880289029002910292029302940295029602970298029903000301030203030304030503060307030803090310031103120313031403150316031703180319032003210322032303240325032603270328032903300331033203330334033503360337033803390340034103420343034403450346034703480349035003510352035303540355035603570358035903600361036203630364036503660367036803690370037103720373037403750376037703780379038003810382038303840385038603870388038903900391039203930394039503960397039803990400040104020403040404050406040704080409041004110412041304140415041604170418041904200421042204230424042504260427042804290430043104320433043404350436043704380439044004410442044304440445044604470448044904500451045204530454045504560457045804590460046104620463046404650466046704680469047004710472047304740475047604770478047904800481048204830484048504860487048804890490049104920493049404950496049704980499050000010002000300040005000600070008000900100011001200130014001500160017001800190020002100220023002400250026002700280029003000310032003300340035003600370038003900400041004200430044004500460047004800490050005100520053005400550056005700580059006000610062006300640065006600670068006900700071007200730074007500760077007800790080008100820083008400850086008700880089009000910092009300940095009600970098009901000101010201030104010501060107010801090110011101120113011401150116011701180119012001210122012301240125012601270128012901300131013201330134013501360137013801390140014101420143014401450146014701480149015001510152015301540155015601570158015901600161016201630164016501660167016801690170017101720173017401750176017701780179018001810182018301840185018601870188018901900191019201930194019501960197019801990200020102020203020402050206020702080209021002110212021302140215021602170218021902200221022202230224022502260227022802290230023102320233023402350236023702380239024002410242024302440245024602470248024902500251025202530254025502560257025802590260026102620263026402650266026702680269027002710272027302740275027602770278027902800281028202830284028502860287028802890290029102920293029402950296029702980299030003010302030303040305030603070308030903100311031203130314031503160317031803190320032103220323032403250326032703280329033003310332033303340335033603370338033903400341034203430344034503460347034803490350035103520353035403550356035703580359036003610362036303640365036603670368036903700371037203730374037503760377037803790380038103820383038403850386038703880389039003910392039303940395039603970398039904000401040204030404040504060407040804090410041104120413041404150416041704180419042004210422042304240425042604270428042904300431043204330434043504360437043804390440044104420443044404450446044704480449045004510452045304540455045604570458045904600461046204630464046504660467046804690470047104720473047404750476047704780479048004810482048304840485048604870488048904900491049204930494049504960497049804990500' rand2[double precision]:4576 + COMMIT + BEGIN + table public.toasttable: UPDATE: id[integer]:1 toasted_col1[text]:'12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076107710781079108010811082108310841085108610871088108910901091109210931094109510961097109810991100110111021103110411051106110711081109111011111112111311141115111611171118111911201121112211231124112511261127112811291130113111321133113411351136113711381139114011411142114311441145114611471148114911501151115211531154115511561157115811591160116111621163116411651166116711681169117011711172117311741175117611771178117911801181118211831184118511861187118811891190119111921193119411951196119711981199120012011202120312041205120612071208120912101211121212131214121512161217121812191220122112221223122412251226122712281229123012311232123312341235123612371238123912401241124212431244124512461247124812491250125112521253125412551256125712581259126012611262126312641265126612671268126912701271127212731274127512761277127812791280128112821283128412851286128712881289129012911292129312941295129612971298129913001301130213031304130513061307130813091310131113121313131413151316131713181319132013211322132313241325132613271328132913301331133213331334133513361337133813391340134113421343134413451346134713481349135013511352135313541355135613571358135913601361136213631364136513661367136813691370137113721373137413751376137713781379138013811382138313841385138613871388138913901391139213931394139513961397139813991400140114021403140414051406140714081409141014111412141314141415141614171418141914201421142214231424142514261427142814291430143114321433143414351436143714381439144014411442144314441445144614471448144914501451145214531454145514561457145814591460146114621463146414651466146714681469147014711472147314741475147614771478147914801481148214831484148514861487148814891490149114921493149414951496149714981499150015011502150315041505150615071508150915101511151215131514151515161517151815191520152115221523152415251526152715281529153015311532153315341535153615371538153915401541154215431544154515461547154815491550155115521553155415551556155715581559156015611562156315641565156615671568156915701571157215731574157515761577157815791580158115821583158415851586158715881589159015911592159315941595159615971598159916001601160216031604160516061607160816091610161116121613161416151616161716181619162016211622162316241625162616271628162916301631163216331634163516361637163816391640164116421643164416451646164716481649165016511652165316541655165616571658165916601661166216631664166516661667166816691670167116721673167416751676167716781679168016811682168316841685168616871688168916901691169216931694169516961697169816991700170117021703170417051706170717081709171017111712171317141715171617171718171917201721172217231724172517261727172817291730173117321733173417351736173717381739174017411742174317441745174617471748174917501751175217531754175517561757175817591760176117621763176417651766176717681769177017711772177317741775177617771778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000' rand1[double precision]:79 toasted_col2[text]:null rand2[double precision]:1578 + COMMIT +(113 rows) + +INSERT INTO toasttable(toasted_col1) SELECT string_agg(g.i::text, '') FROM generate_series(1, 2000) g(i); +-- update of second column, first column unchanged +UPDATE toasttable + SET toasted_col2 = (SELECT string_agg(g.i::text, '') FROM generate_series(1, 2000) g(i)) +WHERE id = 1; +-- make sure we decode correctly even if the toast table is gone +DROP TABLE toasttable; +SELECT data FROM pg_logical_slot_get_changes('regression_slot', NULL, NULL, 'include-xids', '0'); + data +---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- + BEGIN + table public.toasttable: INSERT: id[integer]:3 toasted_col1[text]:'12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076107710781079108010811082108310841085108610871088108910901091109210931094109510961097109810991100110111021103110411051106110711081109111011111112111311141115111611171118111911201121112211231124112511261127112811291130113111321133113411351136113711381139114011411142114311441145114611471148114911501151115211531154115511561157115811591160116111621163116411651166116711681169117011711172117311741175117611771178117911801181118211831184118511861187118811891190119111921193119411951196119711981199120012011202120312041205120612071208120912101211121212131214121512161217121812191220122112221223122412251226122712281229123012311232123312341235123612371238123912401241124212431244124512461247124812491250125112521253125412551256125712581259126012611262126312641265126612671268126912701271127212731274127512761277127812791280128112821283128412851286128712881289129012911292129312941295129612971298129913001301130213031304130513061307130813091310131113121313131413151316131713181319132013211322132313241325132613271328132913301331133213331334133513361337133813391340134113421343134413451346134713481349135013511352135313541355135613571358135913601361136213631364136513661367136813691370137113721373137413751376137713781379138013811382138313841385138613871388138913901391139213931394139513961397139813991400140114021403140414051406140714081409141014111412141314141415141614171418141914201421142214231424142514261427142814291430143114321433143414351436143714381439144014411442144314441445144614471448144914501451145214531454145514561457145814591460146114621463146414651466146714681469147014711472147314741475147614771478147914801481148214831484148514861487148814891490149114921493149414951496149714981499150015011502150315041505150615071508150915101511151215131514151515161517151815191520152115221523152415251526152715281529153015311532153315341535153615371538153915401541154215431544154515461547154815491550155115521553155415551556155715581559156015611562156315641565156615671568156915701571157215731574157515761577157815791580158115821583158415851586158715881589159015911592159315941595159615971598159916001601160216031604160516061607160816091610161116121613161416151616161716181619162016211622162316241625162616271628162916301631163216331634163516361637163816391640164116421643164416451646164716481649165016511652165316541655165616571658165916601661166216631664166516661667166816691670167116721673167416751676167716781679168016811682168316841685168616871688168916901691169216931694169516961697169816991700170117021703170417051706170717081709171017111712171317141715171617171718171917201721172217231724172517261727172817291730173117321733173417351736173717381739174017411742174317441745174617471748174917501751175217531754175517561757175817591760176117621763176417651766176717681769177017711772177317741775177617771778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000' rand1[double precision]:6075 toasted_col2[text]:null rand2[double precision]:7574 + COMMIT + BEGIN + table public.toasttable: UPDATE: id[integer]:1 toasted_col1[text]:unchanged-toast-datum rand1[double precision]:79 toasted_col2[text]:'12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076107710781079108010811082108310841085108610871088108910901091109210931094109510961097109810991100110111021103110411051106110711081109111011111112111311141115111611171118111911201121112211231124112511261127112811291130113111321133113411351136113711381139114011411142114311441145114611471148114911501151115211531154115511561157115811591160116111621163116411651166116711681169117011711172117311741175117611771178117911801181118211831184118511861187118811891190119111921193119411951196119711981199120012011202120312041205120612071208120912101211121212131214121512161217121812191220122112221223122412251226122712281229123012311232123312341235123612371238123912401241124212431244124512461247124812491250125112521253125412551256125712581259126012611262126312641265126612671268126912701271127212731274127512761277127812791280128112821283128412851286128712881289129012911292129312941295129612971298129913001301130213031304130513061307130813091310131113121313131413151316131713181319132013211322132313241325132613271328132913301331133213331334133513361337133813391340134113421343134413451346134713481349135013511352135313541355135613571358135913601361136213631364136513661367136813691370137113721373137413751376137713781379138013811382138313841385138613871388138913901391139213931394139513961397139813991400140114021403140414051406140714081409141014111412141314141415141614171418141914201421142214231424142514261427142814291430143114321433143414351436143714381439144014411442144314441445144614471448144914501451145214531454145514561457145814591460146114621463146414651466146714681469147014711472147314741475147614771478147914801481148214831484148514861487148814891490149114921493149414951496149714981499150015011502150315041505150615071508150915101511151215131514151515161517151815191520152115221523152415251526152715281529153015311532153315341535153615371538153915401541154215431544154515461547154815491550155115521553155415551556155715581559156015611562156315641565156615671568156915701571157215731574157515761577157815791580158115821583158415851586158715881589159015911592159315941595159615971598159916001601160216031604160516061607160816091610161116121613161416151616161716181619162016211622162316241625162616271628162916301631163216331634163516361637163816391640164116421643164416451646164716481649165016511652165316541655165616571658165916601661166216631664166516661667166816691670167116721673167416751676167716781679168016811682168316841685168616871688168916901691169216931694169516961697169816991700170117021703170417051706170717081709171017111712171317141715171617171718171917201721172217231724172517261727172817291730173117321733173417351736173717381739174017411742174317441745174617471748174917501751175217531754175517561757175817591760176117621763176417651766176717681769177017711772177317741775177617771778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000' rand2[double precision]:1578 + COMMIT + BEGIN + COMMIT +(8 rows) + +-- done, free logical replication slot +SELECT data FROM pg_logical_slot_get_changes('regression_slot', NULL, NULL, 'include-xids', '0'); + data +------ +(0 rows) + +SELECT pg_drop_replication_slot('regression_slot'); + pg_drop_replication_slot +-------------------------- + +(1 row) + +/* check that we aren't visible anymore now */ +SELECT * FROM pg_stat_replication; + pid | usesysid | usename | application_name | client_addr | client_hostname | client_port | backend_start | backend_xmin | state | sent_location | write_location | flush_location | replay_location | sync_priority | sync_state +-----+----------+---------+------------------+-------------+-----------------+-------------+---------------+--------------+-------+---------------+----------------+----------------+-----------------+---------------+------------ +(0 rows) + diff --git a/contrib/test_decoding/expected/decoding_in_xact.out b/contrib/test_decoding/expected/decoding_in_xact.out new file mode 100644 index 0000000000..d15b0b542b --- /dev/null +++ b/contrib/test_decoding/expected/decoding_in_xact.out @@ -0,0 +1,89 @@ +-- predictability +SET synchronous_commit = on; +-- fail because we're creating a slot while in an xact with xid +BEGIN; +SELECT txid_current() = 0; + ?column? +---------- + f +(1 row) + +SELECT 'init' FROM pg_create_logical_replication_slot('regression_slot', 'test_decoding'); +ERROR: cannot create logical replication slot in transaction that has performed writes +ROLLBACK; +-- fail because we're creating a slot while in an subxact whose topxact has a xid +BEGIN; +SELECT txid_current() = 0; + ?column? +---------- + f +(1 row) + +SAVEPOINT barf; +SELECT 'init' FROM pg_create_logical_replication_slot('regression_slot', 'test_decoding'); +ERROR: cannot create logical replication slot in transaction that has performed writes +ROLLBACK TO SAVEPOINT barf; +ROLLBACK; +-- succeed, outside tx. +SELECT 'init' FROM pg_create_logical_replication_slot('regression_slot', 'test_decoding'); + ?column? +---------- + init +(1 row) + +SELECT 'stop' FROM pg_drop_replication_slot('regression_slot'); + ?column? +---------- + stop +(1 row) + +-- succeed, in tx without xid. +BEGIN; +SELECT 'init' FROM pg_create_logical_replication_slot('regression_slot', 'test_decoding'); + ?column? +---------- + init +(1 row) + +COMMIT; +CREATE TABLE nobarf(id serial primary key, data text); +INSERT INTO nobarf(data) VALUES('1'); +-- decoding works in transaction with xid +BEGIN; +SELECT txid_current() = 0; + ?column? +---------- + f +(1 row) + +-- don't show yet, haven't committed +INSERT INTO nobarf(data) VALUES('2'); +SELECT data FROM pg_logical_slot_get_changes('regression_slot', NULL, NULL, 'include-xids', '0'); + data +----------------------------------------------------------- + BEGIN + COMMIT + BEGIN + table public.nobarf: INSERT: id[integer]:1 data[text]:'1' + COMMIT +(5 rows) + +COMMIT; +INSERT INTO nobarf(data) VALUES('3'); +SELECT data FROM pg_logical_slot_get_changes('regression_slot', NULL, NULL, 'include-xids', '0'); + data +----------------------------------------------------------- + BEGIN + table public.nobarf: INSERT: id[integer]:2 data[text]:'2' + COMMIT + BEGIN + table public.nobarf: INSERT: id[integer]:3 data[text]:'3' + COMMIT +(6 rows) + +SELECT 'stop' FROM pg_drop_replication_slot('regression_slot'); + ?column? +---------- + stop +(1 row) + diff --git a/contrib/test_decoding/expected/delayed_startup.out b/contrib/test_decoding/expected/delayed_startup.out new file mode 100644 index 0000000000..db8c525ac4 --- /dev/null +++ b/contrib/test_decoding/expected/delayed_startup.out @@ -0,0 +1,38 @@ +Parsed test spec with 2 sessions + +starting permutation: s1b s1w s2init s1c s2start s1b s1w s1c s2start s1b s1w s2start s1c s2start +step s1b: BEGIN ISOLATION LEVEL SERIALIZABLE; +step s1w: INSERT INTO do_write DEFAULT VALUES; +step s2init: SELECT 'init' FROM pg_create_logical_replication_slot('isolation_slot', 'test_decoding'); +step s1c: COMMIT; +step s2init: <... completed> +?column? + +init +step s2start: SELECT data FROM pg_logical_slot_get_changes('isolation_slot', NULL, NULL, 'include-xids', 'false'); +data + +step s1b: BEGIN ISOLATION LEVEL SERIALIZABLE; +step s1w: INSERT INTO do_write DEFAULT VALUES; +step s1c: COMMIT; +step s2start: SELECT data FROM pg_logical_slot_get_changes('isolation_slot', NULL, NULL, 'include-xids', 'false'); +data + +BEGIN +table public.do_write: INSERT: id[integer]:2 +COMMIT +step s1b: BEGIN ISOLATION LEVEL SERIALIZABLE; +step s1w: INSERT INTO do_write DEFAULT VALUES; +step s2start: SELECT data FROM pg_logical_slot_get_changes('isolation_slot', NULL, NULL, 'include-xids', 'false'); +data + +step s1c: COMMIT; +step s2start: SELECT data FROM pg_logical_slot_get_changes('isolation_slot', NULL, NULL, 'include-xids', 'false'); +data + +BEGIN +table public.do_write: INSERT: id[integer]:3 +COMMIT +?column? + +stop diff --git a/contrib/test_decoding/expected/mxact.out b/contrib/test_decoding/expected/mxact.out new file mode 100644 index 0000000000..f0d96cc67d --- /dev/null +++ b/contrib/test_decoding/expected/mxact.out @@ -0,0 +1,66 @@ +Parsed test spec with 3 sessions + +starting permutation: s0init s0start s1begin s1sharepgclass s2begin s2sharepgclass s0w s0start s2commit s1commit +step s0init: SELECT 'init' FROM pg_create_logical_replication_slot('isolation_slot', 'test_decoding'); +?column? + +init +step s0start: SELECT data FROM pg_logical_slot_get_changes('isolation_slot', NULL, NULL, 'include-xids', 'false'); +data + +step s1begin: BEGIN; +step s1sharepgclass: SELECT count(*) > 1 FROM (SELECT * FROM pg_class FOR SHARE) s; +?column? + +t +step s2begin: BEGIN; +step s2sharepgclass: SELECT count(*) > 1 FROM (SELECT * FROM pg_class FOR SHARE) s; +?column? + +t +step s0w: INSERT INTO do_write DEFAULT VALUES; +step s0start: SELECT data FROM pg_logical_slot_get_changes('isolation_slot', NULL, NULL, 'include-xids', 'false'); +data + +BEGIN +table public.do_write: INSERT: id[integer]:1 +COMMIT +step s2commit: COMMIT; +step s1commit: COMMIT; +?column? + +stop + +starting permutation: s0init s0start s1begin s1keysharepgclass s2begin s2keysharepgclass s0alter s0w s0start s2commit s1commit +step s0init: SELECT 'init' FROM pg_create_logical_replication_slot('isolation_slot', 'test_decoding'); +?column? + +init +step s0start: SELECT data FROM pg_logical_slot_get_changes('isolation_slot', NULL, NULL, 'include-xids', 'false'); +data + +step s1begin: BEGIN; +step s1keysharepgclass: SELECT count(*) > 1 FROM (SELECT * FROM pg_class FOR KEY SHARE) s; +?column? + +t +step s2begin: BEGIN; +step s2keysharepgclass: SELECT count(*) > 1 FROM (SELECT * FROM pg_class FOR KEY SHARE) s; +?column? + +t +step s0alter: ALTER TABLE do_write ADD column ts timestamptz; +step s0w: INSERT INTO do_write DEFAULT VALUES; +step s0start: SELECT data FROM pg_logical_slot_get_changes('isolation_slot', NULL, NULL, 'include-xids', 'false'); +data + +BEGIN +COMMIT +BEGIN +table public.do_write: INSERT: id[integer]:1 ts[timestamp with time zone]:null +COMMIT +step s2commit: COMMIT; +step s1commit: COMMIT; +?column? + +stop diff --git a/contrib/test_decoding/expected/permissions.out b/contrib/test_decoding/expected/permissions.out new file mode 100644 index 0000000000..85b7f5d625 --- /dev/null +++ b/contrib/test_decoding/expected/permissions.out @@ -0,0 +1,130 @@ +-- predictability +SET synchronous_commit = on; +-- setup +CREATE ROLE lr_normal; +CREATE ROLE lr_superuser SUPERUSER; +CREATE ROLE lr_replication REPLICATION; +CREATE TABLE lr_test(data text); +-- superuser can control replication +SET ROLE lr_superuser; +SELECT 'init' FROM pg_create_logical_replication_slot('regression_slot', 'test_decoding'); + ?column? +---------- + init +(1 row) + +INSERT INTO lr_test VALUES('lr_superuser_init'); +SELECT data FROM pg_logical_slot_get_changes('regression_slot', NULL, NULL, 'include-xids', '0'); + data +-------------------------------------------------------------- + BEGIN + table public.lr_test: INSERT: data[text]:'lr_superuser_init' + COMMIT +(3 rows) + +SELECT pg_drop_replication_slot('regression_slot'); + pg_drop_replication_slot +-------------------------- + +(1 row) + +RESET ROLE; +-- replication user can control replication +SET ROLE lr_replication; +SELECT 'init' FROM pg_create_logical_replication_slot('regression_slot', 'test_decoding'); + ?column? +---------- + init +(1 row) + +INSERT INTO lr_test VALUES('lr_superuser_init'); +ERROR: permission denied for relation lr_test +SELECT data FROM pg_logical_slot_get_changes('regression_slot', NULL, NULL, 'include-xids', '0'); + data +------ +(0 rows) + +SELECT pg_drop_replication_slot('regression_slot'); + pg_drop_replication_slot +-------------------------- + +(1 row) + +RESET ROLE; +-- plain user *can't* can control replication +SET ROLE lr_normal; +SELECT 'init' FROM pg_create_logical_replication_slot('regression_slot', 'test_decoding'); +ERROR: must be superuser or replication role to use replication slots +INSERT INTO lr_test VALUES('lr_superuser_init'); +ERROR: permission denied for relation lr_test +SELECT data FROM pg_logical_slot_get_changes('regression_slot', NULL, NULL, 'include-xids', '0'); +ERROR: must be superuser or replication role to use replication slots +SELECT pg_drop_replication_slot('regression_slot'); +ERROR: must be superuser or replication role to use replication slots +RESET ROLE; +-- replication users can drop superuser created slots +SET ROLE lr_superuser; +SELECT 'init' FROM pg_create_logical_replication_slot('regression_slot', 'test_decoding'); + ?column? +---------- + init +(1 row) + +RESET ROLE; +SET ROLE lr_replication; +SELECT pg_drop_replication_slot('regression_slot'); + pg_drop_replication_slot +-------------------------- + +(1 row) + +RESET ROLE; +-- normal users can't drop existing slots +SET ROLE lr_superuser; +SELECT 'init' FROM pg_create_logical_replication_slot('regression_slot', 'test_decoding'); + ?column? +---------- + init +(1 row) + +RESET ROLE; +SET ROLE lr_normal; +SELECT pg_drop_replication_slot('regression_slot'); +ERROR: must be superuser or replication role to use replication slots +RESET ROLE; +-- all users can see existing slots +SET ROLE lr_superuser; +SELECT slot_name, plugin FROM pg_replication_slots; + slot_name | plugin +-----------------+--------------- + regression_slot | test_decoding +(1 row) + +RESET ROLE; +SET ROLE lr_replication; +SELECT slot_name, plugin FROM pg_replication_slots; + slot_name | plugin +-----------------+--------------- + regression_slot | test_decoding +(1 row) + +RESET ROLE; +SET ROLE lr_normal; +SELECT slot_name, plugin FROM pg_replication_slots; + slot_name | plugin +-----------------+--------------- + regression_slot | test_decoding +(1 row) + +RESET ROLE; +-- cleanup +SELECT pg_drop_replication_slot('regression_slot'); + pg_drop_replication_slot +-------------------------- + +(1 row) + +DROP ROLE lr_normal; +DROP ROLE lr_superuser; +DROP ROLE lr_replication; +DROP TABLE lr_test; diff --git a/contrib/test_decoding/expected/rewrite.out b/contrib/test_decoding/expected/rewrite.out new file mode 100644 index 0000000000..ec23ab9024 --- /dev/null +++ b/contrib/test_decoding/expected/rewrite.out @@ -0,0 +1,107 @@ +-- predictability +SET synchronous_commit = on; +DROP TABLE IF EXISTS replication_example; +SELECT 'init' FROM pg_create_logical_replication_slot('regression_slot', 'test_decoding'); + ?column? +---------- + init +(1 row) + +CREATE TABLE replication_example(id SERIAL PRIMARY KEY, somedata int, text varchar(120)); +INSERT INTO replication_example(somedata) VALUES (1); +SELECT data FROM pg_logical_slot_get_changes('regression_slot', NULL, NULL, 'include-xids', '0'); + data +---------------------------------------------------------------------------------------------------------- + BEGIN + COMMIT + BEGIN + table public.replication_example: INSERT: id[integer]:1 somedata[integer]:1 text[character varying]:null + COMMIT +(5 rows) + +BEGIN; +INSERT INTO replication_example(somedata) VALUES (2); +ALTER TABLE replication_example ADD COLUMN testcolumn1 int; +INSERT INTO replication_example(somedata, testcolumn1) VALUES (3, 1); +COMMIT; +BEGIN; +INSERT INTO replication_example(somedata) VALUES (3); +ALTER TABLE replication_example ADD COLUMN testcolumn2 int; +INSERT INTO replication_example(somedata, testcolumn1, testcolumn2) VALUES (4, 2, 1); +COMMIT; +VACUUM FULL pg_am; +VACUUM FULL pg_amop; +VACUUM FULL pg_proc; +VACUUM FULL pg_opclass; +VACUUM FULL pg_type; +VACUUM FULL pg_index; +VACUUM FULL pg_database; +-- repeated rewrites that fail +BEGIN; +CLUSTER pg_class USING pg_class_oid_index; +CLUSTER pg_class USING pg_class_oid_index; +ROLLBACK; +-- repeated rewrites that succeed +BEGIN; +CLUSTER pg_class USING pg_class_oid_index; +CLUSTER pg_class USING pg_class_oid_index; +CLUSTER pg_class USING pg_class_oid_index; +COMMIT; + -- repeated rewrites in different transactions +VACUUM FULL pg_class; +VACUUM FULL pg_class; +INSERT INTO replication_example(somedata, testcolumn1) VALUES (5, 3); +BEGIN; +INSERT INTO replication_example(somedata, testcolumn1) VALUES (6, 4); +ALTER TABLE replication_example ADD COLUMN testcolumn3 int; +INSERT INTO replication_example(somedata, testcolumn1, testcolumn3) VALUES (7, 5, 1); +COMMIT; +-- make old files go away +CHECKPOINT; +SELECT data FROM pg_logical_slot_get_changes('regression_slot', NULL, NULL, 'include-xids', '0'); + data +---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- + BEGIN + table public.replication_example: INSERT: id[integer]:2 somedata[integer]:2 text[character varying]:null + table public.replication_example: INSERT: id[integer]:3 somedata[integer]:3 text[character varying]:null testcolumn1[integer]:1 + COMMIT + BEGIN + table public.replication_example: INSERT: id[integer]:4 somedata[integer]:3 text[character varying]:null testcolumn1[integer]:null + table public.replication_example: INSERT: id[integer]:5 somedata[integer]:4 text[character varying]:null testcolumn1[integer]:2 testcolumn2[integer]:1 + COMMIT + BEGIN + COMMIT + BEGIN + COMMIT + BEGIN + COMMIT + BEGIN + COMMIT + BEGIN + COMMIT + BEGIN + COMMIT + BEGIN + COMMIT + BEGIN + COMMIT + BEGIN + COMMIT + BEGIN + COMMIT + BEGIN + table public.replication_example: INSERT: id[integer]:6 somedata[integer]:5 text[character varying]:null testcolumn1[integer]:3 testcolumn2[integer]:null + COMMIT + BEGIN + table public.replication_example: INSERT: id[integer]:7 somedata[integer]:6 text[character varying]:null testcolumn1[integer]:4 testcolumn2[integer]:null + table public.replication_example: INSERT: id[integer]:8 somedata[integer]:7 text[character varying]:null testcolumn1[integer]:5 testcolumn2[integer]:null testcolumn3[integer]:1 + COMMIT +(35 rows) + +SELECT pg_drop_replication_slot('regression_slot'); + pg_drop_replication_slot +-------------------------- + +(1 row) + +DROP TABLE IF EXISTS replication_example; diff --git a/contrib/test_decoding/expected/toast.out b/contrib/test_decoding/expected/toast.out new file mode 100644 index 0000000000..6adef83f02 --- /dev/null +++ b/contrib/test_decoding/expected/toast.out @@ -0,0 +1,90 @@ +-- predictability +SET synchronous_commit = on; +DROP TABLE IF EXISTS xpto; +NOTICE: table "xpto" does not exist, skipping +SELECT 'init' FROM pg_create_logical_replication_slot('regression_slot', 'test_decoding'); + ?column? +---------- + init +(1 row) + +CREATE SEQUENCE xpto_rand_seq START 79 INCREMENT 1499; -- portable "random" +CREATE TABLE xpto ( + id serial primary key, + toasted_col1 text, + rand1 float8 DEFAULT nextval('xpto_rand_seq'), + toasted_col2 text, + rand2 float8 DEFAULT nextval('xpto_rand_seq') +); +-- uncompressed external toast data +INSERT INTO xpto (toasted_col1, toasted_col2) SELECT string_agg(g.i::text, ''), string_agg((g.i*2)::text, '') FROM generate_series(1, 2000) g(i); +-- compressed external toast data +INSERT INTO xpto (toasted_col2) SELECT repeat(string_agg(to_char(g.i, 'FM0000'), ''), 50) FROM generate_series(1, 500) g(i); +-- update of existing column +UPDATE xpto SET toasted_col1 = (SELECT string_agg(g.i::text, '') FROM generate_series(1, 2000) g(i)) WHERE id = 1; +UPDATE xpto SET rand1 = 123.456 WHERE id = 1; +DELETE FROM xpto WHERE id = 1; +DROP TABLE IF EXISTS toasted_key; +NOTICE: table "toasted_key" does not exist, skipping +CREATE TABLE toasted_key ( + id serial, + toasted_key text PRIMARY KEY, + toasted_col1 text, + toasted_col2 text +); +ALTER TABLE toasted_key ALTER COLUMN toasted_key SET STORAGE EXTERNAL; +ALTER TABLE toasted_key ALTER COLUMN toasted_col1 SET STORAGE EXTERNAL; +INSERT INTO toasted_key(toasted_key, toasted_col1) VALUES(repeat('1234567890', 200), repeat('9876543210', 200)); +-- test update of a toasted key without changing it +UPDATE toasted_key SET toasted_col2 = toasted_col1; +-- test update of a toasted key, changing it +UPDATE toasted_key SET toasted_key = toasted_key || '1'; +DELETE FROM toasted_key; +SELECT substr(data, 1, 200) FROM pg_logical_slot_get_changes('regression_slot', NULL, NULL, 'include-xids', '0'); + substr +---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- + BEGIN + COMMIT + BEGIN + COMMIT + BEGIN + table public.xpto: INSERT: id[integer]:1 toasted_col1[text]:'1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374 + COMMIT + BEGIN + table public.xpto: INSERT: id[integer]:2 toasted_col1[text]:null rand1[double precision]:3077 toasted_col2[text]:'00010002000300040005000600070008000900100011001200130014001500160017001800190020002100 + COMMIT + BEGIN + table public.xpto: UPDATE: id[integer]:1 toasted_col1[text]:'1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374 + COMMIT + BEGIN + table public.xpto: UPDATE: id[integer]:1 toasted_col1[text]:unchanged-toast-datum rand1[double precision]:123.456 toasted_col2[text]:unchanged-toast-datum rand2[double precision]:1578 + COMMIT + BEGIN + table public.xpto: DELETE: id[integer]:1 + COMMIT + BEGIN + COMMIT + BEGIN + COMMIT + BEGIN + COMMIT + BEGIN + table public.toasted_key: INSERT: id[integer]:1 toasted_key[text]:'1234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123 + COMMIT + BEGIN + table public.toasted_key: UPDATE: id[integer]:1 toasted_key[text]:unchanged-toast-datum toasted_col1[text]:unchanged-toast-datum toasted_col2[text]:'987654321098765432109876543210987654321098765432109 + COMMIT + BEGIN + table public.toasted_key: UPDATE: old-key: toasted_key[text]:'123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678 + COMMIT + BEGIN + table public.toasted_key: DELETE: toasted_key[text]:'123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567 + COMMIT +(37 rows) + +SELECT pg_drop_replication_slot('regression_slot'); + pg_drop_replication_slot +-------------------------- + +(1 row) + diff --git a/contrib/test_decoding/logical.conf b/contrib/test_decoding/logical.conf new file mode 100644 index 0000000000..367f706651 --- /dev/null +++ b/contrib/test_decoding/logical.conf @@ -0,0 +1,2 @@ +wal_level = logical +max_replication_slots = 4 diff --git a/contrib/test_decoding/specs/concurrent_ddl_dml.spec b/contrib/test_decoding/specs/concurrent_ddl_dml.spec new file mode 100644 index 0000000000..7c8a7c7977 --- /dev/null +++ b/contrib/test_decoding/specs/concurrent_ddl_dml.spec @@ -0,0 +1,94 @@ +setup +{ + DROP TABLE IF EXISTS tbl1; + DROP TABLE IF EXISTS tbl2; + CREATE TABLE tbl1(val1 integer, val2 integer); + CREATE TABLE tbl2(val1 integer, val2 integer); +} + +teardown +{ + DROP TABLE tbl1; + DROP TABLE tbl2; + SELECT 'stop' FROM pg_drop_replication_slot('isolation_slot'); +} + +session "s1" +step "s1_init" { SELECT 'init' FROM pg_create_logical_replication_slot('isolation_slot', 'test_decoding'); } +step "s1_begin" { BEGIN; } +step "s1_insert_tbl1" { INSERT INTO tbl1 (val1, val2) VALUES (1, 1); } +step "s1_insert_tbl1_3col" { INSERT INTO tbl1 (val1, val2, val3) VALUES (1, 1, 1); } +step "s1_insert_tbl2" { INSERT INTO tbl2 (val1, val2) VALUES (1, 1); } +step "s1_insert_tbl2_3col" { INSERT INTO tbl2 (val1, val2, val3) VALUES (1, 1, 1); } +step "s1_commit" { COMMIT; } + +session "s2" +step "s2_alter_tbl1_float" { ALTER TABLE tbl1 ALTER COLUMN val2 TYPE float; } +step "s2_alter_tbl1_char" { ALTER TABLE tbl1 ALTER COLUMN val2 TYPE character varying; } +step "s2_alter_tbl1_text" { ALTER TABLE tbl1 ALTER COLUMN val2 TYPE text; } +step "s2_alter_tbl1_boolean" { ALTER TABLE tbl1 ALTER COLUMN val2 TYPE boolean; } + +step "s2_alter_tbl1_add_int" { ALTER TABLE tbl1 ADD COLUMN val3 INTEGER; } +step "s2_alter_tbl1_add_float" { ALTER TABLE tbl1 ADD COLUMN val3 FLOAT; } +step "s2_alter_tbl1_add_char" { ALTER TABLE tbl1 ADD COLUMN val3 character varying; } +step "s2_alter_tbl1_add_boolean" { ALTER TABLE tbl1 ADD COLUMN val3 BOOLEAN; } +step "s2_alter_tbl1_add_text" { ALTER TABLE tbl1 ADD COLUMN val3 TEXT; } + +step "s2_alter_tbl2_float" { ALTER TABLE tbl2 ALTER COLUMN val2 TYPE float; } +step "s2_alter_tbl2_char" { ALTER TABLE tbl2 ALTER COLUMN val2 TYPE character varying; } +step "s2_alter_tbl2_text" { ALTER TABLE tbl2 ALTER COLUMN val2 TYPE text; } +step "s2_alter_tbl2_boolean" { ALTER TABLE tbl2 ALTER COLUMN val2 TYPE boolean; } +step "s2_alter_tbl2_text" { ALTER TABLE tbl2 ALTER COLUMN val2 TYPE boolean; } + +step "s2_alter_tbl2_add_int" { ALTER TABLE tbl2 ADD COLUMN val3 INTEGER; } +step "s2_alter_tbl2_add_float" { ALTER TABLE tbl2 ADD COLUMN val3 FLOAT; } +step "s2_alter_tbl2_add_char" { ALTER TABLE tbl2 ADD COLUMN val3 character varying; } +step "s2_alter_tbl2_add_boolean" { ALTER TABLE tbl2 ADD COLUMN val3 BOOLEAN; } +step "s2_alter_tbl2_add_text" { ALTER TABLE tbl2 ADD COLUMN val3 TEXT; } +step "s2_alter_tbl2_drop_3rd_col" { ALTER TABLE tbl2 DROP COLUMN val3; } +step "s2_alter_tbl2_3rd_char" { ALTER TABLE tbl2 ALTER COLUMN val3 TYPE character varying; } +step "s2_alter_tbl2_3rd_text" { ALTER TABLE tbl2 ALTER COLUMN val3 TYPE text; } +step "s2_alter_tbl2_3rd_int" { ALTER TABLE tbl2 ALTER COLUMN val3 TYPE int USING val3::integer; } + +step "s2_get_changes" { SELECT regexp_replace(data, 'temp_\d+', 'temp') AS data FROM pg_logical_slot_get_changes('isolation_slot', NULL, NULL, 'include-xids', '0'); } + + + +permutation "s1_init" "s1_begin" "s1_insert_tbl1" "s2_alter_tbl2_float" "s1_insert_tbl2" "s1_commit" "s2_get_changes" +permutation "s1_init" "s1_begin" "s1_insert_tbl1" "s2_alter_tbl1_float" "s1_insert_tbl2" "s1_commit" "s2_get_changes" +permutation "s1_init" "s1_begin" "s1_insert_tbl1" "s2_alter_tbl2_char" "s1_insert_tbl2" "s1_commit" "s2_get_changes" +permutation "s1_init" "s1_begin" "s1_insert_tbl1" "s2_alter_tbl1_char" "s1_insert_tbl2" "s1_commit" "s2_get_changes" + +permutation "s1_init" "s1_begin" "s1_insert_tbl1" "s1_insert_tbl2" "s2_alter_tbl1_float" "s1_commit" "s2_get_changes" +permutation "s1_init" "s1_begin" "s1_insert_tbl1" "s1_insert_tbl2" "s2_alter_tbl1_char" "s1_commit" "s2_get_changes" + +permutation "s1_init" "s1_begin" "s1_insert_tbl1" "s2_alter_tbl2_float" "s1_insert_tbl2" "s2_alter_tbl1_float" "s1_commit" "s2_get_changes" +permutation "s1_init" "s1_begin" "s1_insert_tbl1" "s2_alter_tbl2_char" "s1_insert_tbl2" "s2_alter_tbl1_char" "s1_commit" "s2_get_changes" + +permutation "s1_init" "s2_alter_tbl2_char" "s1_begin" "s1_insert_tbl1" "s2_alter_tbl2_text" "s1_insert_tbl2" "s1_commit" "s2_get_changes" +permutation "s1_init" "s2_alter_tbl2_char" "s1_begin" "s1_insert_tbl1" "s2_alter_tbl2_text" "s1_insert_tbl2" "s2_alter_tbl1_char" "s1_commit" "s2_get_changes" + +permutation "s1_init" "s1_begin" "s1_insert_tbl1" "s2_alter_tbl2_boolean" "s1_insert_tbl2" "s1_commit" "s2_get_changes" +permutation "s1_init" "s1_begin" "s1_insert_tbl1" "s2_alter_tbl2_boolean" "s1_insert_tbl2" "s2_alter_tbl1_boolean" "s1_commit" "s2_get_changes" + +permutation "s1_init" "s1_begin" "s1_insert_tbl1" "s2_alter_tbl2_add_int" "s1_insert_tbl2_3col" "s1_commit" "s2_get_changes" +permutation "s1_init" "s1_begin" "s1_insert_tbl1" "s1_insert_tbl2" "s1_commit" "s1_begin" "s2_alter_tbl2_add_int" "s1_insert_tbl2_3col" "s1_commit" "s2_get_changes" + +permutation "s1_init" "s1_begin" "s1_insert_tbl1" "s2_alter_tbl2_add_float" "s1_insert_tbl2_3col" "s1_commit" "s2_get_changes" +permutation "s1_init" "s1_begin" "s1_insert_tbl1" "s1_insert_tbl2" "s1_commit" "s1_begin" "s2_alter_tbl2_add_float" "s1_insert_tbl2_3col" "s1_commit" "s2_get_changes" + +permutation "s1_init" "s1_begin" "s1_insert_tbl1" "s2_alter_tbl2_add_char" "s1_insert_tbl2_3col" "s1_commit" "s2_get_changes" +permutation "s1_init" "s1_begin" "s1_insert_tbl1" "s1_insert_tbl2" "s1_commit" "s1_begin" "s2_alter_tbl2_add_char" "s1_insert_tbl2_3col" "s1_commit" "s2_get_changes" + +permutation "s1_init" "s2_alter_tbl2_add_int" "s1_begin" "s1_insert_tbl2_3col" "s2_alter_tbl2_drop_3rd_col" "s1_commit" "s2_get_changes" +permutation "s1_init" "s2_alter_tbl2_add_int" "s1_begin" "s1_insert_tbl2_3col" "s2_alter_tbl2_drop_3rd_col" "s1_insert_tbl2" "s1_commit" "s1_insert_tbl2" "s2_get_changes" + +permutation "s1_init" "s2_alter_tbl2_add_int" "s1_begin" "s1_insert_tbl2_3col" "s2_alter_tbl2_drop_3rd_col" "s1_commit" "s2_get_changes" "s2_alter_tbl2_add_text" "s1_begin" "s1_insert_tbl2_3col" "s2_alter_tbl2_3rd_char" "s1_insert_tbl2_3col" "s1_commit" "s2_get_changes" "s2_alter_tbl2_3rd_int" "s1_insert_tbl2_3col" "s2_get_changes" + +permutation "s1_init" "s2_alter_tbl2_add_char" "s1_begin" "s1_insert_tbl1" "s1_insert_tbl2_3col" "s2_alter_tbl2_3rd_text" "s1_insert_tbl2_3col" "s1_commit" "s1_insert_tbl2_3col" "s2_get_changes" +permutation "s1_init" "s2_alter_tbl2_add_text" "s1_begin" "s1_insert_tbl1" "s1_insert_tbl2_3col" "s2_alter_tbl2_3rd_char" "s1_insert_tbl2_3col" "s1_commit" "s1_insert_tbl2_3col" "s2_get_changes" + +permutation "s1_init" "s2_alter_tbl2_add_char" "s1_begin" "s1_insert_tbl1" "s2_alter_tbl2_3rd_text" "s1_insert_tbl2_3col" "s1_commit" "s2_alter_tbl2_drop_3rd_col" "s1_insert_tbl2" "s2_get_changes" +permutation "s1_init" "s2_alter_tbl2_add_text" "s1_begin" "s1_insert_tbl1" "s2_alter_tbl2_3rd_char" "s1_insert_tbl2_3col" "s1_commit" "s2_alter_tbl2_drop_3rd_col" "s1_insert_tbl2" "s2_get_changes" + +permutation "s1_init" "s2_alter_tbl2_add_char" "s1_begin" "s1_insert_tbl1" "s2_alter_tbl2_drop_3rd_col" "s1_insert_tbl1" "s1_commit" "s2_get_changes" diff --git a/contrib/test_decoding/specs/delayed_startup.spec b/contrib/test_decoding/specs/delayed_startup.spec new file mode 100644 index 0000000000..b7fe8148ce --- /dev/null +++ b/contrib/test_decoding/specs/delayed_startup.spec @@ -0,0 +1,24 @@ +setup +{ + DROP TABLE IF EXISTS do_write; + CREATE TABLE do_write(id serial primary key); +} + +teardown +{ + DROP TABLE do_write; + SELECT 'stop' FROM pg_drop_replication_slot('isolation_slot'); +} + +session "s1" +setup { SET synchronous_commit=on; } +step "s1b" { BEGIN ISOLATION LEVEL SERIALIZABLE; } +step "s1w" { INSERT INTO do_write DEFAULT VALUES; } +step "s1c" { COMMIT; } +session "s2" +setup { SET synchronous_commit=on; } +step "s2init" {SELECT 'init' FROM pg_create_logical_replication_slot('isolation_slot', 'test_decoding');} +step "s2start" {SELECT data FROM pg_logical_slot_get_changes('isolation_slot', NULL, NULL, 'include-xids', 'false');} + + +permutation "s1b" "s1w" "s2init" "s1c" "s2start" "s1b" "s1w" "s1c" "s2start" "s1b" "s1w" "s2start" "s1c" "s2start" diff --git a/contrib/test_decoding/specs/mxact.spec b/contrib/test_decoding/specs/mxact.spec new file mode 100644 index 0000000000..ea5b1aa2d6 --- /dev/null +++ b/contrib/test_decoding/specs/mxact.spec @@ -0,0 +1,38 @@ +setup +{ + DROP TABLE IF EXISTS do_write; + CREATE TABLE do_write(id serial primary key); +} + +teardown +{ + DROP TABLE IF EXISTS do_write; + SELECT 'stop' FROM pg_drop_replication_slot('isolation_slot'); +} + +session "s0" +setup { SET synchronous_commit=on; } +step "s0init" {SELECT 'init' FROM pg_create_logical_replication_slot('isolation_slot', 'test_decoding');} +step "s0start" {SELECT data FROM pg_logical_slot_get_changes('isolation_slot', NULL, NULL, 'include-xids', 'false');} +step "s0alter" {ALTER TABLE do_write ADD column ts timestamptz; } +step "s0w" { INSERT INTO do_write DEFAULT VALUES; } + +session "s1" +setup { SET synchronous_commit=on; } +step "s1begin" {BEGIN;} +step "s1sharepgclass" { SELECT count(*) > 1 FROM (SELECT * FROM pg_class FOR SHARE) s; } +step "s1keysharepgclass" { SELECT count(*) > 1 FROM (SELECT * FROM pg_class FOR KEY SHARE) s; } +step "s1commit" {COMMIT;} + +session "s2" +setup { SET synchronous_commit=on; } +step "s2begin" {BEGIN;} +step "s2sharepgclass" { SELECT count(*) > 1 FROM (SELECT * FROM pg_class FOR SHARE) s; } +step "s2keysharepgclass" { SELECT count(*) > 1 FROM (SELECT * FROM pg_class FOR KEY SHARE) s; } +step "s2commit" {COMMIT;} + +# test that we're handling an update-only mxact xmax correctly +permutation "s0init" "s0start" "s1begin" "s1sharepgclass" "s2begin" "s2sharepgclass" "s0w" "s0start" "s2commit" "s1commit" + +# test that we're handling an update-only mxact xmax correctly +permutation "s0init" "s0start" "s1begin" "s1keysharepgclass" "s2begin" "s2keysharepgclass" "s0alter" "s0w" "s0start" "s2commit" "s1commit" diff --git a/contrib/test_decoding/sql/binary.sql b/contrib/test_decoding/sql/binary.sql new file mode 100644 index 0000000000..619f00b3bc --- /dev/null +++ b/contrib/test_decoding/sql/binary.sql @@ -0,0 +1,14 @@ +-- predictability +SET synchronous_commit = on; + +SELECT 'init' FROM pg_create_logical_replication_slot('regression_slot', 'test_decoding'); +-- succeeds, textual plugin, textual consumer +SELECT data FROM pg_logical_slot_get_changes('regression_slot', NULL, NULL, 'force-binary', '0'); +-- fails, binary plugin, textual consumer +SELECT data FROM pg_logical_slot_get_changes('regression_slot', NULL, NULL, 'force-binary', '1'); +-- succeeds, textual plugin, binary consumer +SELECT data FROM pg_logical_slot_get_binary_changes('regression_slot', NULL, NULL, 'force-binary', '0'); +-- succeeds, binary plugin, binary consumer +SELECT data FROM pg_logical_slot_get_binary_changes('regression_slot', NULL, NULL, 'force-binary', '1'); + +SELECT 'init' FROM pg_drop_replication_slot('regression_slot'); diff --git a/contrib/test_decoding/sql/ddl.sql b/contrib/test_decoding/sql/ddl.sql new file mode 100644 index 0000000000..b4807e991b --- /dev/null +++ b/contrib/test_decoding/sql/ddl.sql @@ -0,0 +1,337 @@ +-- predictability +SET synchronous_commit = on; + +SELECT 'init' FROM pg_create_logical_replication_slot('regression_slot', 'test_decoding'); +-- fail because of an already existing slot +SELECT 'init' FROM pg_create_logical_replication_slot('regression_slot', 'test_decoding'); +-- fail because of an invalid name +SELECT 'init' FROM pg_create_logical_replication_slot('Invalid Name', 'test_decoding'); + +-- fail twice because of an invalid parameter values +SELECT 'init' FROM pg_logical_slot_get_changes('regression_slot', NULL, NULL, 'include-xids', 'frakbar'); +SELECT 'init' FROM pg_logical_slot_get_changes('regression_slot', NULL, NULL, 'nonexistant-option', 'frakbar'); +SELECT 'init' FROM pg_logical_slot_get_changes('regression_slot', NULL, NULL, 'include-xids', 'frakbar'); + +-- succeed once +SELECT pg_drop_replication_slot('regression_slot'); +-- fail +SELECT pg_drop_replication_slot('regression_slot'); + +-- check that we're detecting a streaming rep slot used for logical decoding +SELECT 'init' FROM pg_create_physical_replication_slot('repl'); +SELECT data FROM pg_logical_slot_get_changes('repl', NULL, NULL, 'include-xids', '0'); +SELECT pg_drop_replication_slot('repl'); + + +SELECT 'init' FROM pg_create_logical_replication_slot('regression_slot', 'test_decoding'); + +/* check whether status function reports us, only reproduceable columns */ +SELECT slot_name, plugin, slot_type, active, + NOT catalog_xmin IS NULL AS catalog_xmin_set, + xmin IS NULl AS data_xmin_not_set, + pg_xlog_location_diff(restart_lsn, '0/01000000') > 0 AS some_wal +FROM pg_replication_slots; + +/* + * Check that changes are handled correctly when interleaved with ddl + */ +CREATE TABLE replication_example(id SERIAL PRIMARY KEY, somedata int, text varchar(120)); +BEGIN; +INSERT INTO replication_example(somedata, text) VALUES (1, 1); +INSERT INTO replication_example(somedata, text) VALUES (1, 2); +COMMIT; + +ALTER TABLE replication_example ADD COLUMN bar int; + +INSERT INTO replication_example(somedata, text, bar) VALUES (2, 1, 4); + +BEGIN; +INSERT INTO replication_example(somedata, text, bar) VALUES (2, 2, 4); +INSERT INTO replication_example(somedata, text, bar) VALUES (2, 3, 4); +INSERT INTO replication_example(somedata, text, bar) VALUES (2, 4, NULL); +COMMIT; + +ALTER TABLE replication_example DROP COLUMN bar; +INSERT INTO replication_example(somedata, text) VALUES (3, 1); + +BEGIN; +INSERT INTO replication_example(somedata, text) VALUES (3, 2); +INSERT INTO replication_example(somedata, text) VALUES (3, 3); +COMMIT; + +ALTER TABLE replication_example RENAME COLUMN text TO somenum; + +INSERT INTO replication_example(somedata, somenum) VALUES (4, 1); + +-- collect all changes +SELECT data FROM pg_logical_slot_get_changes('regression_slot', NULL, NULL, 'include-xids', '0'); + +ALTER TABLE replication_example ALTER COLUMN somenum TYPE int4 USING (somenum::int4); +-- throw away changes, they contain oids +SELECT count(data) FROM pg_logical_slot_get_changes('regression_slot', NULL, NULL, 'include-xids', '0'); + +INSERT INTO replication_example(somedata, somenum) VALUES (5, 1); + +BEGIN; +INSERT INTO replication_example(somedata, somenum) VALUES (6, 1); +ALTER TABLE replication_example ADD COLUMN zaphod1 int; +INSERT INTO replication_example(somedata, somenum, zaphod1) VALUES (6, 2, 1); +ALTER TABLE replication_example ADD COLUMN zaphod2 int; +INSERT INTO replication_example(somedata, somenum, zaphod2) VALUES (6, 3, 1); +INSERT INTO replication_example(somedata, somenum, zaphod1) VALUES (6, 4, 2); +COMMIT; + +-- show changes +SELECT data FROM pg_logical_slot_get_changes('regression_slot', NULL, NULL, 'include-xids', '0'); + +-- hide changes bc of oid visible in full table rewrites +CREATE TABLE tr_unique(id2 serial unique NOT NULL, data int); +INSERT INTO tr_unique(data) VALUES(10); +ALTER TABLE tr_unique RENAME TO tr_pkey; +ALTER TABLE tr_pkey ADD COLUMN id serial primary key; +SELECT count(data) FROM pg_logical_slot_get_changes('regression_slot', NULL, NULL, 'include-xids', '0'); + +INSERT INTO tr_pkey(data) VALUES(1); +--show deletion with primary key +DELETE FROM tr_pkey; + +/* display results */ +SELECT data FROM pg_logical_slot_get_changes('regression_slot', NULL, NULL, 'include-xids', '0'); + +/* + * check that disk spooling works + */ +BEGIN; +CREATE TABLE tr_etoomuch (id serial primary key, data int); +INSERT INTO tr_etoomuch(data) SELECT g.i FROM generate_series(1, 10234) g(i); +DELETE FROM tr_etoomuch WHERE id < 5000; +UPDATE tr_etoomuch SET data = - data WHERE id > 5000; +COMMIT; + +/* display results, but hide most of the output */ +SELECT count(*), min(data), max(data) +FROM pg_logical_slot_get_changes('regression_slot', NULL, NULL, 'include-xids', '0') +GROUP BY substring(data, 1, 24) +ORDER BY 1; + +/* + * check whether we decode subtransactions correctly in relation with each + * other + */ +CREATE TABLE tr_sub (id serial primary key, path text); + +-- toplevel, subtxn, toplevel, subtxn, subtxn +BEGIN; +INSERT INTO tr_sub(path) VALUES ('1-top-#1'); + +SAVEPOINT a; +INSERT INTO tr_sub(path) VALUES ('1-top-1-#1'); +INSERT INTO tr_sub(path) VALUES ('1-top-1-#2'); +RELEASE SAVEPOINT a; + +SAVEPOINT b; +SAVEPOINT c; +INSERT INTO tr_sub(path) VALUES ('1-top-2-1-#1'); +INSERT INTO tr_sub(path) VALUES ('1-top-2-1-#2'); +RELEASE SAVEPOINT c; +INSERT INTO tr_sub(path) VALUES ('1-top-2-#1'); +RELEASE SAVEPOINT b; +COMMIT; + +SELECT data FROM pg_logical_slot_get_changes('regression_slot', NULL, NULL, 'include-xids', '0'); + +-- check that we handle xlog assignments correctly +BEGIN; +-- nest 80 subtxns +SAVEPOINT subtop;SAVEPOINT a;SAVEPOINT a;SAVEPOINT a;SAVEPOINT a; +SAVEPOINT a;SAVEPOINT a;SAVEPOINT a;SAVEPOINT a;SAVEPOINT a; +SAVEPOINT a;SAVEPOINT a;SAVEPOINT a;SAVEPOINT a;SAVEPOINT a; +SAVEPOINT a;SAVEPOINT a;SAVEPOINT a;SAVEPOINT a;SAVEPOINT a; +SAVEPOINT a;SAVEPOINT a;SAVEPOINT a;SAVEPOINT a;SAVEPOINT a; +SAVEPOINT a;SAVEPOINT a;SAVEPOINT a;SAVEPOINT a;SAVEPOINT a; +SAVEPOINT a;SAVEPOINT a;SAVEPOINT a;SAVEPOINT a;SAVEPOINT a; +SAVEPOINT a;SAVEPOINT a;SAVEPOINT a;SAVEPOINT a;SAVEPOINT a; +SAVEPOINT a;SAVEPOINT a;SAVEPOINT a;SAVEPOINT a;SAVEPOINT a; +SAVEPOINT a;SAVEPOINT a;SAVEPOINT a;SAVEPOINT a;SAVEPOINT a; +SAVEPOINT a;SAVEPOINT a;SAVEPOINT a;SAVEPOINT a;SAVEPOINT a; +SAVEPOINT a;SAVEPOINT a;SAVEPOINT a;SAVEPOINT a;SAVEPOINT a; +SAVEPOINT a;SAVEPOINT a;SAVEPOINT a;SAVEPOINT a;SAVEPOINT a; +SAVEPOINT a;SAVEPOINT a;SAVEPOINT a;SAVEPOINT a;SAVEPOINT a; +SAVEPOINT a;SAVEPOINT a;SAVEPOINT a;SAVEPOINT a;SAVEPOINT a; +SAVEPOINT a;SAVEPOINT a;SAVEPOINT a;SAVEPOINT a;SAVEPOINT a; +-- assign xid by inserting +INSERT INTO tr_sub(path) VALUES ('2-top-1...--#1'); +INSERT INTO tr_sub(path) VALUES ('2-top-1...--#2'); +INSERT INTO tr_sub(path) VALUES ('2-top-1...--#3'); +RELEASE SAVEPOINT subtop; +INSERT INTO tr_sub(path) VALUES ('2-top-#1'); +COMMIT; + +SELECT data FROM pg_logical_slot_get_changes('regression_slot', NULL, NULL, 'include-xids', '0'); + +-- make sure rollbacked subtransactions aren't decoded +BEGIN; +INSERT INTO tr_sub(path) VALUES ('3-top-2-#1'); +SAVEPOINT a; +INSERT INTO tr_sub(path) VALUES ('3-top-2-1-#1'); +SAVEPOINT b; +INSERT INTO tr_sub(path) VALUES ('3-top-2-2-#1'); +ROLLBACK TO SAVEPOINT b; +INSERT INTO tr_sub(path) VALUES ('3-top-2-#2'); +COMMIT; + +SELECT data FROM pg_logical_slot_get_changes('regression_slot', NULL, NULL, 'include-xids', '0'); + +-- test whether a known, but not yet logged toplevel xact, followed by a +-- subxact commit is handled correctly +BEGIN; +SELECT txid_current() != 0; -- so no fixed xid apears in the outfile +SAVEPOINT a; +INSERT INTO tr_sub(path) VALUES ('4-top-1-#1'); +RELEASE SAVEPOINT a; +COMMIT; + +-- test whether a change in a subtransaction, in an unknown toplevel +-- xact is handled correctly. +BEGIN; +SAVEPOINT a; +INSERT INTO tr_sub(path) VALUES ('5-top-1-#1'); +COMMIT; + + +SELECT data FROM pg_logical_slot_get_changes('regression_slot', NULL, NULL, 'include-xids', '0'); + + +/* + * Check whether treating a table as a catalog table works somewhat + */ +CREATE TABLE replication_metadata ( + id serial primary key, + relation name NOT NULL, + options text[] +) +WITH (user_catalog_table = true) +; +\d+ replication_metadata + +INSERT INTO replication_metadata(relation, options) +VALUES ('foo', ARRAY['a', 'b']); + +ALTER TABLE replication_metadata RESET (user_catalog_table); +\d+ replication_metadata + +INSERT INTO replication_metadata(relation, options) +VALUES ('bar', ARRAY['a', 'b']); + +ALTER TABLE replication_metadata SET (user_catalog_table = true); +\d+ replication_metadata + +INSERT INTO replication_metadata(relation, options) +VALUES ('blub', NULL); + +-- make sure rewrites don't work +ALTER TABLE replication_metadata ADD COLUMN rewritemeornot int; +ALTER TABLE replication_metadata ALTER COLUMN rewritemeornot TYPE text; + +ALTER TABLE replication_metadata SET (user_catalog_table = false); +\d+ replication_metadata + +INSERT INTO replication_metadata(relation, options) +VALUES ('zaphod', NULL); + +SELECT data FROM pg_logical_slot_get_changes('regression_slot', NULL, NULL, 'include-xids', '0'); + +/* + * check whether we handle updates/deletes correct with & without a pkey + */ + +/* we should handle the case without a key at all more gracefully */ +CREATE TABLE table_without_key(id serial, data int); +INSERT INTO table_without_key(data) VALUES(1),(2); +DELETE FROM table_without_key WHERE data = 1; +-- won't log old keys +UPDATE table_without_key SET data = 3 WHERE data = 2; +UPDATE table_without_key SET id = -id; +UPDATE table_without_key SET id = -id; +-- should log the full old row now +ALTER TABLE table_without_key REPLICA IDENTITY FULL; +UPDATE table_without_key SET data = 3 WHERE data = 2; +UPDATE table_without_key SET id = -id; +UPDATE table_without_key SET id = -id; +DELETE FROM table_without_key WHERE data = 3; + +CREATE TABLE table_with_pkey(id serial primary key, data int); +INSERT INTO table_with_pkey(data) VALUES(1), (2); +DELETE FROM table_with_pkey WHERE data = 1; +-- should log the old pkey +UPDATE table_with_pkey SET data = 3 WHERE data = 2; +UPDATE table_with_pkey SET id = -id; +UPDATE table_with_pkey SET id = -id; +-- check that we log nothing despite having a pkey +ALTER TABLE table_without_key REPLICA IDENTITY NOTHING; +UPDATE table_with_pkey SET id = -id; +-- check that we log everything despite having a pkey +ALTER TABLE table_without_key REPLICA IDENTITY FULL; +UPDATE table_with_pkey SET id = -id; +DELETE FROM table_with_pkey WHERE data = 3; + +CREATE TABLE table_with_unique_not_null(id serial unique, data int); +ALTER TABLE table_with_unique_not_null ALTER COLUMN id SET NOT NULL; --already set +-- won't log anything, replica identity not setup +INSERT INTO table_with_unique_not_null(data) VALUES(1), (2); +DELETE FROM table_with_unique_not_null WHERE data = 1; +UPDATE table_with_unique_not_null SET data = 3 WHERE data = 2; +UPDATE table_with_unique_not_null SET id = -id; +UPDATE table_with_unique_not_null SET id = -id; +DELETE FROM table_with_unique_not_null WHERE data = 3; +-- should log old key +ALTER TABLE table_with_unique_not_null REPLICA IDENTITY USING INDEX table_with_unique_not_null_id_key; +INSERT INTO table_with_unique_not_null(data) VALUES(1), (2); +DELETE FROM table_with_unique_not_null WHERE data = 1; +UPDATE table_with_unique_not_null SET data = 3 WHERE data = 2; +UPDATE table_with_unique_not_null SET id = -id; +UPDATE table_with_unique_not_null SET id = -id; +DELETE FROM table_with_unique_not_null WHERE data = 3; + +-- check toast support +BEGIN; +CREATE SEQUENCE toasttable_rand_seq START 79 INCREMENT 1499; -- portable "random" +CREATE TABLE toasttable( + id serial primary key, + toasted_col1 text, + rand1 float8 DEFAULT nextval('toasttable_rand_seq'), + toasted_col2 text, + rand2 float8 DEFAULT nextval('toasttable_rand_seq') + ); +COMMIT; +-- uncompressed external toast data +INSERT INTO toasttable(toasted_col1) SELECT string_agg(g.i::text, '') FROM generate_series(1, 2000) g(i); + +-- compressed external toast data +INSERT INTO toasttable(toasted_col2) SELECT repeat(string_agg(to_char(g.i, 'FM0000'), ''), 50) FROM generate_series(1, 500) g(i); + +-- update of existing column +UPDATE toasttable + SET toasted_col1 = (SELECT string_agg(g.i::text, '') FROM generate_series(1, 2000) g(i)) +WHERE id = 1; + +SELECT data FROM pg_logical_slot_get_changes('regression_slot', NULL, NULL, 'include-xids', '0'); + +INSERT INTO toasttable(toasted_col1) SELECT string_agg(g.i::text, '') FROM generate_series(1, 2000) g(i); + +-- update of second column, first column unchanged +UPDATE toasttable + SET toasted_col2 = (SELECT string_agg(g.i::text, '') FROM generate_series(1, 2000) g(i)) +WHERE id = 1; + +-- make sure we decode correctly even if the toast table is gone +DROP TABLE toasttable; + +SELECT data FROM pg_logical_slot_get_changes('regression_slot', NULL, NULL, 'include-xids', '0'); + +-- done, free logical replication slot +SELECT data FROM pg_logical_slot_get_changes('regression_slot', NULL, NULL, 'include-xids', '0'); +SELECT pg_drop_replication_slot('regression_slot'); + +/* check that we aren't visible anymore now */ +SELECT * FROM pg_stat_replication; diff --git a/contrib/test_decoding/sql/decoding_in_xact.sql b/contrib/test_decoding/sql/decoding_in_xact.sql new file mode 100644 index 0000000000..2771afee7a --- /dev/null +++ b/contrib/test_decoding/sql/decoding_in_xact.sql @@ -0,0 +1,41 @@ +-- predictability +SET synchronous_commit = on; + +-- fail because we're creating a slot while in an xact with xid +BEGIN; +SELECT txid_current() = 0; +SELECT 'init' FROM pg_create_logical_replication_slot('regression_slot', 'test_decoding'); +ROLLBACK; + +-- fail because we're creating a slot while in an subxact whose topxact has a xid +BEGIN; +SELECT txid_current() = 0; +SAVEPOINT barf; +SELECT 'init' FROM pg_create_logical_replication_slot('regression_slot', 'test_decoding'); +ROLLBACK TO SAVEPOINT barf; +ROLLBACK; + +-- succeed, outside tx. +SELECT 'init' FROM pg_create_logical_replication_slot('regression_slot', 'test_decoding'); +SELECT 'stop' FROM pg_drop_replication_slot('regression_slot'); + +-- succeed, in tx without xid. +BEGIN; +SELECT 'init' FROM pg_create_logical_replication_slot('regression_slot', 'test_decoding'); +COMMIT; + +CREATE TABLE nobarf(id serial primary key, data text); +INSERT INTO nobarf(data) VALUES('1'); + +-- decoding works in transaction with xid +BEGIN; +SELECT txid_current() = 0; +-- don't show yet, haven't committed +INSERT INTO nobarf(data) VALUES('2'); +SELECT data FROM pg_logical_slot_get_changes('regression_slot', NULL, NULL, 'include-xids', '0'); +COMMIT; + +INSERT INTO nobarf(data) VALUES('3'); +SELECT data FROM pg_logical_slot_get_changes('regression_slot', NULL, NULL, 'include-xids', '0'); + +SELECT 'stop' FROM pg_drop_replication_slot('regression_slot'); diff --git a/contrib/test_decoding/sql/permissions.sql b/contrib/test_decoding/sql/permissions.sql new file mode 100644 index 0000000000..39d70b56b0 --- /dev/null +++ b/contrib/test_decoding/sql/permissions.sql @@ -0,0 +1,69 @@ +-- predictability +SET synchronous_commit = on; + +-- setup +CREATE ROLE lr_normal; +CREATE ROLE lr_superuser SUPERUSER; +CREATE ROLE lr_replication REPLICATION; +CREATE TABLE lr_test(data text); + +-- superuser can control replication +SET ROLE lr_superuser; +SELECT 'init' FROM pg_create_logical_replication_slot('regression_slot', 'test_decoding'); +INSERT INTO lr_test VALUES('lr_superuser_init'); +SELECT data FROM pg_logical_slot_get_changes('regression_slot', NULL, NULL, 'include-xids', '0'); +SELECT pg_drop_replication_slot('regression_slot'); +RESET ROLE; + +-- replication user can control replication +SET ROLE lr_replication; +SELECT 'init' FROM pg_create_logical_replication_slot('regression_slot', 'test_decoding'); +INSERT INTO lr_test VALUES('lr_superuser_init'); +SELECT data FROM pg_logical_slot_get_changes('regression_slot', NULL, NULL, 'include-xids', '0'); +SELECT pg_drop_replication_slot('regression_slot'); +RESET ROLE; + +-- plain user *can't* can control replication +SET ROLE lr_normal; +SELECT 'init' FROM pg_create_logical_replication_slot('regression_slot', 'test_decoding'); +INSERT INTO lr_test VALUES('lr_superuser_init'); +SELECT data FROM pg_logical_slot_get_changes('regression_slot', NULL, NULL, 'include-xids', '0'); +SELECT pg_drop_replication_slot('regression_slot'); +RESET ROLE; + +-- replication users can drop superuser created slots +SET ROLE lr_superuser; +SELECT 'init' FROM pg_create_logical_replication_slot('regression_slot', 'test_decoding'); +RESET ROLE; +SET ROLE lr_replication; +SELECT pg_drop_replication_slot('regression_slot'); +RESET ROLE; + +-- normal users can't drop existing slots +SET ROLE lr_superuser; +SELECT 'init' FROM pg_create_logical_replication_slot('regression_slot', 'test_decoding'); +RESET ROLE; +SET ROLE lr_normal; +SELECT pg_drop_replication_slot('regression_slot'); +RESET ROLE; + +-- all users can see existing slots +SET ROLE lr_superuser; +SELECT slot_name, plugin FROM pg_replication_slots; +RESET ROLE; + +SET ROLE lr_replication; +SELECT slot_name, plugin FROM pg_replication_slots; +RESET ROLE; + +SET ROLE lr_normal; +SELECT slot_name, plugin FROM pg_replication_slots; +RESET ROLE; + +-- cleanup +SELECT pg_drop_replication_slot('regression_slot'); + +DROP ROLE lr_normal; +DROP ROLE lr_superuser; +DROP ROLE lr_replication; +DROP TABLE lr_test; diff --git a/contrib/test_decoding/sql/rewrite.sql b/contrib/test_decoding/sql/rewrite.sql new file mode 100644 index 0000000000..9a3dcbf857 --- /dev/null +++ b/contrib/test_decoding/sql/rewrite.sql @@ -0,0 +1,62 @@ +-- predictability +SET synchronous_commit = on; + +DROP TABLE IF EXISTS replication_example; + +SELECT 'init' FROM pg_create_logical_replication_slot('regression_slot', 'test_decoding'); +CREATE TABLE replication_example(id SERIAL PRIMARY KEY, somedata int, text varchar(120)); +INSERT INTO replication_example(somedata) VALUES (1); +SELECT data FROM pg_logical_slot_get_changes('regression_slot', NULL, NULL, 'include-xids', '0'); + +BEGIN; +INSERT INTO replication_example(somedata) VALUES (2); +ALTER TABLE replication_example ADD COLUMN testcolumn1 int; +INSERT INTO replication_example(somedata, testcolumn1) VALUES (3, 1); +COMMIT; + +BEGIN; +INSERT INTO replication_example(somedata) VALUES (3); +ALTER TABLE replication_example ADD COLUMN testcolumn2 int; +INSERT INTO replication_example(somedata, testcolumn1, testcolumn2) VALUES (4, 2, 1); +COMMIT; + +VACUUM FULL pg_am; +VACUUM FULL pg_amop; +VACUUM FULL pg_proc; +VACUUM FULL pg_opclass; +VACUUM FULL pg_type; +VACUUM FULL pg_index; +VACUUM FULL pg_database; + +-- repeated rewrites that fail +BEGIN; +CLUSTER pg_class USING pg_class_oid_index; +CLUSTER pg_class USING pg_class_oid_index; +ROLLBACK; + +-- repeated rewrites that succeed +BEGIN; +CLUSTER pg_class USING pg_class_oid_index; +CLUSTER pg_class USING pg_class_oid_index; +CLUSTER pg_class USING pg_class_oid_index; +COMMIT; + + -- repeated rewrites in different transactions +VACUUM FULL pg_class; +VACUUM FULL pg_class; + +INSERT INTO replication_example(somedata, testcolumn1) VALUES (5, 3); + +BEGIN; +INSERT INTO replication_example(somedata, testcolumn1) VALUES (6, 4); +ALTER TABLE replication_example ADD COLUMN testcolumn3 int; +INSERT INTO replication_example(somedata, testcolumn1, testcolumn3) VALUES (7, 5, 1); +COMMIT; + +-- make old files go away +CHECKPOINT; + +SELECT data FROM pg_logical_slot_get_changes('regression_slot', NULL, NULL, 'include-xids', '0'); +SELECT pg_drop_replication_slot('regression_slot'); + +DROP TABLE IF EXISTS replication_example; diff --git a/contrib/test_decoding/sql/toast.sql b/contrib/test_decoding/sql/toast.sql new file mode 100644 index 0000000000..943db9d2ee --- /dev/null +++ b/contrib/test_decoding/sql/toast.sql @@ -0,0 +1,51 @@ +-- predictability +SET synchronous_commit = on; + +DROP TABLE IF EXISTS xpto; + +SELECT 'init' FROM pg_create_logical_replication_slot('regression_slot', 'test_decoding'); + +CREATE SEQUENCE xpto_rand_seq START 79 INCREMENT 1499; -- portable "random" +CREATE TABLE xpto ( + id serial primary key, + toasted_col1 text, + rand1 float8 DEFAULT nextval('xpto_rand_seq'), + toasted_col2 text, + rand2 float8 DEFAULT nextval('xpto_rand_seq') +); + +-- uncompressed external toast data +INSERT INTO xpto (toasted_col1, toasted_col2) SELECT string_agg(g.i::text, ''), string_agg((g.i*2)::text, '') FROM generate_series(1, 2000) g(i); + +-- compressed external toast data +INSERT INTO xpto (toasted_col2) SELECT repeat(string_agg(to_char(g.i, 'FM0000'), ''), 50) FROM generate_series(1, 500) g(i); + +-- update of existing column +UPDATE xpto SET toasted_col1 = (SELECT string_agg(g.i::text, '') FROM generate_series(1, 2000) g(i)) WHERE id = 1; + +UPDATE xpto SET rand1 = 123.456 WHERE id = 1; + +DELETE FROM xpto WHERE id = 1; + +DROP TABLE IF EXISTS toasted_key; +CREATE TABLE toasted_key ( + id serial, + toasted_key text PRIMARY KEY, + toasted_col1 text, + toasted_col2 text +); + +ALTER TABLE toasted_key ALTER COLUMN toasted_key SET STORAGE EXTERNAL; +ALTER TABLE toasted_key ALTER COLUMN toasted_col1 SET STORAGE EXTERNAL; + +INSERT INTO toasted_key(toasted_key, toasted_col1) VALUES(repeat('1234567890', 200), repeat('9876543210', 200)); + +-- test update of a toasted key without changing it +UPDATE toasted_key SET toasted_col2 = toasted_col1; +-- test update of a toasted key, changing it +UPDATE toasted_key SET toasted_key = toasted_key || '1'; + +DELETE FROM toasted_key; + +SELECT substr(data, 1, 200) FROM pg_logical_slot_get_changes('regression_slot', NULL, NULL, 'include-xids', '0'); +SELECT pg_drop_replication_slot('regression_slot'); diff --git a/contrib/test_decoding/test_decoding.c b/contrib/test_decoding/test_decoding.c new file mode 100644 index 0000000000..ea463fb2e7 --- /dev/null +++ b/contrib/test_decoding/test_decoding.c @@ -0,0 +1,404 @@ +/*------------------------------------------------------------------------- + * + * test_decoding.c + * example logical decoding output plugin + * + * Copyright (c) 2012-2014, PostgreSQL Global Development Group + * + * IDENTIFICATION + * contrib/test_decoding/test_decoding.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/sysattr.h" + +#include "catalog/pg_class.h" +#include "catalog/pg_type.h" + +#include "nodes/parsenodes.h" + +#include "replication/output_plugin.h" +#include "replication/logical.h" + +#include "utils/builtins.h" +#include "utils/lsyscache.h" +#include "utils/memutils.h" +#include "utils/rel.h" +#include "utils/relcache.h" +#include "utils/syscache.h" +#include "utils/typcache.h" + + +PG_MODULE_MAGIC; + +extern void _PG_init(void); +extern void _PG_output_plugin_init(OutputPluginCallbacks *cb); + +typedef struct +{ + MemoryContext context; + bool include_xids; + bool include_timestamp; +} TestDecodingData; + +/* These must be available to pg_dlsym() */ +static void pg_decode_startup(LogicalDecodingContext *ctx, OutputPluginOptions *opt, + bool is_init); +static void pg_decode_shutdown(LogicalDecodingContext *ctx); +static void pg_decode_begin_txn(LogicalDecodingContext *ctx, + ReorderBufferTXN *txn); +static void pg_decode_commit_txn(LogicalDecodingContext *ctx, + ReorderBufferTXN *txn, XLogRecPtr commit_lsn); +static void pg_decode_change(LogicalDecodingContext *ctx, + ReorderBufferTXN *txn, Relation rel, + ReorderBufferChange *change); + +void +_PG_init(void) +{ + /* other plugins can perform things here */ +} + +/* specify output plugin callbacks */ +void +_PG_output_plugin_init(OutputPluginCallbacks *cb) +{ + AssertVariableIsOfType(&_PG_output_plugin_init, LogicalOutputPluginInit); + + cb->startup_cb = pg_decode_startup; + cb->begin_cb = pg_decode_begin_txn; + cb->change_cb = pg_decode_change; + cb->commit_cb = pg_decode_commit_txn; + cb->shutdown_cb = pg_decode_shutdown; +} + + +/* initialize this plugin */ +static void +pg_decode_startup(LogicalDecodingContext *ctx, OutputPluginOptions *opt, + bool is_init) +{ + ListCell *option; + TestDecodingData *data; + + data = palloc(sizeof(TestDecodingData)); + data->context = AllocSetContextCreate(ctx->context, + "text conversion context", + ALLOCSET_DEFAULT_MINSIZE, + ALLOCSET_DEFAULT_INITSIZE, + ALLOCSET_DEFAULT_MAXSIZE); + data->include_xids = true; + data->include_timestamp = false; + + ctx->output_plugin_private = data; + + opt->output_type = OUTPUT_PLUGIN_TEXTUAL_OUTPUT; + + foreach(option, ctx->output_plugin_options) + { + DefElem *elem = lfirst(option); + + Assert(elem->arg == NULL || IsA(elem->arg, String)); + + if (strcmp(elem->defname, "include-xids") == 0) + { + /* if option does not provide a value, it means its value is true */ + if (elem->arg == NULL) + data->include_xids = true; + else if (!parse_bool(strVal(elem->arg), &data->include_xids)) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("could not parse value \"%s\" for parameter \"%s\"", + strVal(elem->arg), elem->defname))); + } + else if (strcmp(elem->defname, "include-timestamp") == 0) + { + if (elem->arg == NULL) + data->include_timestamp = true; + else if (!parse_bool(strVal(elem->arg), &data->include_timestamp)) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("could not parse value \"%s\" for parameter \"%s\"", + strVal(elem->arg), elem->defname))); + } + else if (strcmp(elem->defname, "force-binary") == 0) + { + bool force_binary; + + if (elem->arg == NULL) + continue; + else if (!parse_bool(strVal(elem->arg), &force_binary)) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("could not parse value \"%s\" for parameter \"%s\"", + strVal(elem->arg), elem->defname))); + + if (force_binary) + opt->output_type = OUTPUT_PLUGIN_BINARY_OUTPUT; + } + else + { + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("option \"%s\" = \"%s\" is unknown", + elem->defname, + elem->arg ? strVal(elem->arg) : "(null)"))); + } + } +} + +/* cleanup this plugin's resources */ +static void +pg_decode_shutdown(LogicalDecodingContext *ctx) +{ + TestDecodingData *data = ctx->output_plugin_private; + + /* cleanup our own resources via memory context reset */ + MemoryContextDelete(data->context); +} + +/* BEGIN callback */ +static void +pg_decode_begin_txn(LogicalDecodingContext *ctx, ReorderBufferTXN *txn) +{ + TestDecodingData *data = ctx->output_plugin_private; + + OutputPluginPrepareWrite(ctx, true); + if (data->include_xids) + appendStringInfo(ctx->out, "BEGIN %u", txn->xid); + else + appendStringInfoString(ctx->out, "BEGIN"); + OutputPluginWrite(ctx, true); +} + +/* COMMIT callback */ +static void +pg_decode_commit_txn(LogicalDecodingContext *ctx, ReorderBufferTXN *txn, + XLogRecPtr commit_lsn) +{ + TestDecodingData *data = ctx->output_plugin_private; + + OutputPluginPrepareWrite(ctx, true); + if (data->include_xids) + appendStringInfo(ctx->out, "COMMIT %u", txn->xid); + else + appendStringInfoString(ctx->out, "COMMIT"); + + if (data->include_timestamp) + appendStringInfo(ctx->out, " (at %s)", + timestamptz_to_str(txn->commit_time)); + + OutputPluginWrite(ctx, true); +} + +/* + * Print literal `outputstr' already represented as string of type `typid' + * into stringbuf `s'. + * + * Some builtin types aren't quoted, the rest is quoted. Escaping is done as + * if standard_conforming_strings were enabled. + */ +static void +print_literal(StringInfo s, Oid typid, char *outputstr) +{ + const char *valptr; + + switch (typid) + { + case INT2OID: + case INT4OID: + case INT8OID: + case OIDOID: + case FLOAT4OID: + case FLOAT8OID: + case NUMERICOID: + /* NB: We don't care about Inf, NaN et al. */ + appendStringInfoString(s, outputstr); + break; + + case BITOID: + case VARBITOID: + appendStringInfo(s, "B'%s'", outputstr); + break; + + case BOOLOID: + if (strcmp(outputstr, "t") == 0) + appendStringInfoString(s, "true"); + else + appendStringInfoString(s, "false"); + break; + + default: + appendStringInfoChar(s, '\''); + for (valptr = outputstr; *valptr; valptr++) + { + char ch = *valptr; + + if (SQL_STR_DOUBLE(ch, false)) + appendStringInfoChar(s, ch); + appendStringInfoChar(s, ch); + } + appendStringInfoChar(s, '\''); + break; + } +} + +/* print the tuple 'tuple' into the StringInfo s */ +static void +tuple_to_stringinfo(StringInfo s, TupleDesc tupdesc, HeapTuple tuple, bool skip_nulls) +{ + int natt; + Oid oid; + + /* print oid of tuple, it's not included in the TupleDesc */ + if ((oid = HeapTupleHeaderGetOid(tuple->t_data)) != InvalidOid) + { + appendStringInfo(s, " oid[oid]:%u", oid); + } + + /* print all columns individually */ + for (natt = 0; natt < tupdesc->natts; natt++) + { + Form_pg_attribute attr; /* the attribute itself */ + Oid typid; /* type of current attribute */ + Oid typoutput; /* output function */ + bool typisvarlena; + Datum origval; /* possibly toasted Datum */ + bool isnull; /* column is null? */ + + attr = tupdesc->attrs[natt]; + + /* + * don't print dropped columns, we can't be sure everything is + * available for them + */ + if (attr->attisdropped) + continue; + + /* + * Don't print system columns, oid will already have been printed if + * present. + */ + if (attr->attnum < 0) + continue; + + typid = attr->atttypid; + + /* get Datum from tuple */ + origval = fastgetattr(tuple, natt + 1, tupdesc, &isnull); + + if (isnull && skip_nulls) + continue; + + /* print attribute name */ + appendStringInfoChar(s, ' '); + appendStringInfoString(s, quote_identifier(NameStr(attr->attname))); + + /* print attribute type */ + appendStringInfoChar(s, '['); + appendStringInfoString(s, format_type_be(typid)); + appendStringInfoChar(s, ']'); + + /* query output function */ + getTypeOutputInfo(typid, + &typoutput, &typisvarlena); + + /* print separator */ + appendStringInfoChar(s, ':'); + + /* print data */ + if (isnull) + appendStringInfoString(s, "null"); + else if (typisvarlena && VARATT_IS_EXTERNAL_ONDISK(origval)) + appendStringInfoString(s, "unchanged-toast-datum"); + else if (!typisvarlena) + print_literal(s, typid, + OidOutputFunctionCall(typoutput, origval)); + else + { + Datum val; /* definitely detoasted Datum */ + val = PointerGetDatum(PG_DETOAST_DATUM(origval)); + print_literal(s, typid, OidOutputFunctionCall(typoutput, val)); + } + } +} + +/* + * callback for individual changed tuples + */ +static void +pg_decode_change(LogicalDecodingContext *ctx, ReorderBufferTXN *txn, + Relation relation, ReorderBufferChange *change) +{ + TestDecodingData *data; + Form_pg_class class_form; + TupleDesc tupdesc; + MemoryContext old; + + data = ctx->output_plugin_private; + class_form = RelationGetForm(relation); + tupdesc = RelationGetDescr(relation); + + /* Avoid leaking memory by using and resetting our own context */ + old = MemoryContextSwitchTo(data->context); + + OutputPluginPrepareWrite(ctx, true); + + appendStringInfoString(ctx->out, "table "); + appendStringInfoString(ctx->out, + quote_qualified_identifier( + get_namespace_name( + get_rel_namespace(RelationGetRelid(relation))), + NameStr(class_form->relname))); + appendStringInfoString(ctx->out, ":"); + + switch (change->action) + { + case REORDER_BUFFER_CHANGE_INSERT: + appendStringInfoString(ctx->out, " INSERT:"); + if (change->tp.newtuple == NULL) + appendStringInfoString(ctx->out, " (no-tuple-data)"); + else + tuple_to_stringinfo(ctx->out, tupdesc, + &change->tp.newtuple->tuple, + false); + break; + case REORDER_BUFFER_CHANGE_UPDATE: + appendStringInfoString(ctx->out, " UPDATE:"); + if (change->tp.oldtuple != NULL) + { + appendStringInfoString(ctx->out, " old-key:"); + tuple_to_stringinfo(ctx->out, tupdesc, + &change->tp.oldtuple->tuple, + true); + appendStringInfoString(ctx->out, " new-tuple:"); + } + + if (change->tp.newtuple == NULL) + appendStringInfoString(ctx->out, " (no-tuple-data)"); + else + tuple_to_stringinfo(ctx->out, tupdesc, + &change->tp.newtuple->tuple, + false); + break; + case REORDER_BUFFER_CHANGE_DELETE: + appendStringInfoString(ctx->out, " DELETE:"); + + /* if there was no PK, we only know that a delete happened */ + if (change->tp.oldtuple == NULL) + appendStringInfoString(ctx->out, " (no-tuple-data)"); + /* In DELETE, only the replica identity is present; display that */ + else + tuple_to_stringinfo(ctx->out, tupdesc, + &change->tp.oldtuple->tuple, + true); + break; + } + + MemoryContextSwitchTo(old); + MemoryContextReset(data->context); + + OutputPluginWrite(ctx, true); +} diff --git a/doc/src/sgml/contrib.sgml b/doc/src/sgml/contrib.sgml index 336ba0c564..ec68f10b65 100644 --- a/doc/src/sgml/contrib.sgml +++ b/doc/src/sgml/contrib.sgml @@ -140,6 +140,7 @@ CREATE EXTENSION module_name FROM unpackaged; &sslinfo; &tablefunc; &tcn; + &test-decoding; &test-parser; &test-shm-mq; &tsearch2; diff --git a/doc/src/sgml/filelist.sgml b/doc/src/sgml/filelist.sgml index 09de4bd051..0e863ee064 100644 --- a/doc/src/sgml/filelist.sgml +++ b/doc/src/sgml/filelist.sgml @@ -143,6 +143,7 @@ + diff --git a/doc/src/sgml/test-decoding.sgml b/doc/src/sgml/test-decoding.sgml new file mode 100644 index 0000000000..0fe3d0a45a --- /dev/null +++ b/doc/src/sgml/test-decoding.sgml @@ -0,0 +1,42 @@ + + + + test_decoding + + + test_decoding + + + + test_decoding is an example of a logical decoding + output plugin. It doesn't do anything especially useful, but can serve as + a starting point for developing your own decoder. + + + + test_decoding receives WAL through the logical decoding + mechanism and decodes it into text representations of the operations + performed. + + + + Typical output from this plugin, used over the SQL logical decoding + interface, might be: + + +postgres=# SELECT * FROM pg_logical_slot_get_changes('test_slot', 'now', 'include-xids', '0'); + location | xid | data +-----------+-----+-------------------------------------------------- + 0/16D30F8 | 691 | BEGIN + 0/16D32A0 | 691 | table public.data: INSERT: id[int4]:2 data[text]:'arg' + 0/16D32A0 | 691 | table public.data: INSERT: id[int4]:3 data[text]:'demo' + 0/16D32A0 | 691 | COMMIT + 0/16D32D8 | 692 | BEGIN + 0/16D3398 | 692 | table public.data: DELETE: id[int4]:2 + 0/16D3398 | 692 | table public.data: DELETE: id[int4]:3 + 0/16D3398 | 692 | COMMIT +(8 rows) + + + + diff --git a/src/Makefile.global.in b/src/Makefile.global.in index 209d1bdf4d..cdddf492f4 100644 --- a/src/Makefile.global.in +++ b/src/Makefile.global.in @@ -468,6 +468,8 @@ pg_regress_installcheck = $(top_builddir)/src/test/regress/pg_regress --inputdir pg_regress_clean_files = results/ regression.diffs regression.out tmp_check/ log/ +pg_isolation_regress_check = $(top_builddir)/src/test/isolation/pg_isolation_regress --inputdir=$(srcdir) --temp-install=./tmp_check --top-builddir=$(top_builddir) $(pg_regress_locale_flags) +pg_isolation_regress_installcheck = $(top_builddir)/src/test/isolation/pg_isolation_regress --inputdir=$(srcdir) --top-builddir=$(top_builddir) $(pg_regress_locale_flags) ########################################################################## # diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c index de4befa93f..71ec74015c 100644 --- a/src/backend/access/heap/heapam.c +++ b/src/backend/access/heap/heapam.c @@ -347,8 +347,7 @@ heapgetpage(HeapScanDesc scan, BlockNumber page) /* * Prune and repair fragmentation for the whole page, if possible. */ - Assert(TransactionIdIsValid(RecentGlobalXmin)); - heap_page_prune_opt(scan->rs_rd, buffer, RecentGlobalXmin); + heap_page_prune_opt(scan->rs_rd, buffer); /* * We must hold share lock on the buffer content while examining tuple @@ -1750,10 +1749,22 @@ heap_hot_search_buffer(ItemPointer tid, Relation relation, Buffer buffer, */ if (!skip) { + /* + * For the benefit of logical decoding, have t_self point at the + * element of the HOT chain we're currently investigating instead + * of the root tuple of the HOT chain. This is important because + * the *Satisfies routine for historical mvcc snapshots needs the + * correct tid to decide about the visibility in some cases. + */ + ItemPointerSet(&(heapTuple->t_self), BufferGetBlockNumber(buffer), offnum); + /* If it's visible per the snapshot, we must return it */ valid = HeapTupleSatisfiesVisibility(heapTuple, snapshot, buffer); CheckForSerializableConflictOut(valid, relation, heapTuple, buffer, snapshot); + /* reset to original, non-redirected, tid */ + heapTuple->t_self = *tid; + if (valid) { ItemPointerSetOffsetNumber(tid, offnum); @@ -8207,6 +8218,9 @@ heap2_redo(XLogRecPtr lsn, XLogRecord *record) * decoding. */ break; + case XLOG_HEAP2_REWRITE: + heap_xlog_logical_rewrite(lsn, record); + break; default: elog(PANIC, "heap2_redo: unknown op code %u", info); } diff --git a/src/backend/access/heap/pruneheap.c b/src/backend/access/heap/pruneheap.c index 27cbac8525..3c69e1bada 100644 --- a/src/backend/access/heap/pruneheap.c +++ b/src/backend/access/heap/pruneheap.c @@ -18,13 +18,14 @@ #include "access/heapam_xlog.h" #include "access/transam.h" #include "access/htup_details.h" +#include "catalog/catalog.h" #include "miscadmin.h" #include "pgstat.h" #include "storage/bufmgr.h" +#include "utils/snapmgr.h" #include "utils/rel.h" #include "utils/tqual.h" - /* Working data for heap_page_prune and subroutines */ typedef struct { @@ -70,10 +71,34 @@ static void heap_prune_record_unused(PruneState *prstate, OffsetNumber offnum); * or RECENTLY_DEAD (see HeapTupleSatisfiesVacuum). */ void -heap_page_prune_opt(Relation relation, Buffer buffer, TransactionId OldestXmin) +heap_page_prune_opt(Relation relation, Buffer buffer) { Page page = BufferGetPage(buffer); Size minfree; + TransactionId OldestXmin; + + /* + * We can't write WAL in recovery mode, so there's no point trying to + * clean the page. The master will likely issue a cleaning WAL record soon + * anyway, so this is no particular loss. + */ + if (RecoveryInProgress()) + return; + + /* + * Use the appropriate xmin horizon for this relation. If it's a proper + * catalog relation or a user defined, additional, catalog relation, we + * need to use the horizon that includes slots, otherwise the data-only + * horizon can be used. Note that the toast relation of user defined + * relations are *not* considered catalog relations. + */ + if (IsCatalogRelation(relation) || + RelationIsAccessibleInLogicalDecoding(relation)) + OldestXmin = RecentGlobalXmin; + else + OldestXmin = RecentGlobalDataXmin; + + Assert(TransactionIdIsValid(OldestXmin)); /* * Let's see if we really need pruning. @@ -84,14 +109,6 @@ heap_page_prune_opt(Relation relation, Buffer buffer, TransactionId OldestXmin) if (!PageIsPrunable(page, OldestXmin)) return; - /* - * We can't write WAL in recovery mode, so there's no point trying to - * clean the page. The master will likely issue a cleaning WAL record soon - * anyway, so this is no particular loss. - */ - if (RecoveryInProgress()) - return; - /* * We prune when a previous UPDATE failed to find enough space on the page * for a new tuple version, or when free space falls below the relation's diff --git a/src/backend/access/heap/rewriteheap.c b/src/backend/access/heap/rewriteheap.c index c34ab9865f..239c7dad0c 100644 --- a/src/backend/access/heap/rewriteheap.c +++ b/src/backend/access/heap/rewriteheap.c @@ -102,17 +102,34 @@ */ #include "postgres.h" +#include +#include + +#include "miscadmin.h" + #include "access/heapam.h" #include "access/heapam_xlog.h" #include "access/rewriteheap.h" #include "access/transam.h" #include "access/tuptoaster.h" +#include "access/xact.h" + +#include "catalog/catalog.h" + +#include "lib/ilist.h" + +#include "replication/logical.h" +#include "replication/slot.h" + #include "storage/bufmgr.h" +#include "storage/fd.h" #include "storage/smgr.h" + #include "utils/memutils.h" #include "utils/rel.h" #include "utils/tqual.h" +#include "storage/procarray.h" /* * State associated with a rewrite operation. This is opaque to the user @@ -120,21 +137,28 @@ */ typedef struct RewriteStateData { + Relation rs_old_rel; /* source heap */ Relation rs_new_rel; /* destination heap */ Page rs_buffer; /* page currently being built */ BlockNumber rs_blockno; /* block where page will go */ bool rs_buffer_valid; /* T if any tuples in buffer */ bool rs_use_wal; /* must we WAL-log inserts? */ + bool rs_logical_rewrite; /* do we need to do logical rewriting */ TransactionId rs_oldest_xmin; /* oldest xmin used by caller to * determine tuple visibility */ TransactionId rs_freeze_xid;/* Xid that will be used as freeze cutoff * point */ + TransactionId rs_logical_xmin; /* Xid that will be used as cutoff + * point for logical rewrites */ MultiXactId rs_cutoff_multi;/* MultiXactId that will be used as cutoff * point for multixacts */ MemoryContext rs_cxt; /* for hash tables and entries and tuples in * them */ + XLogRecPtr rs_begin_lsn; /* XLogInsertLsn when starting the rewrite */ HTAB *rs_unresolved_tups; /* unmatched A tuples */ HTAB *rs_old_new_tid_map; /* unmatched B tuples */ + HTAB *rs_logical_mappings; /* logical remapping files */ + uint32 rs_num_rewrite_mappings; /* # in memory mappings */ } RewriteStateData; /* @@ -169,14 +193,45 @@ typedef struct typedef OldToNewMappingData *OldToNewMapping; +/* + * In-Memory data for a xid that might need logical remapping entries + * to be logged. + */ +typedef struct RewriteMappingFile +{ + TransactionId xid; /* xid that might need to see the row */ + int vfd; /* fd of mappings file */ + off_t off; /* how far have we written yet */ + uint32 num_mappings; /* number of in-memory mappings */ + dlist_head mappings; /* list of in-memory mappings */ + char path[MAXPGPATH]; /* path, for error messages */ +} RewriteMappingFile; + +/* + * A single In-Memeory logical rewrite mapping, hanging of + * RewriteMappingFile->mappings. + */ +typedef struct RewriteMappingDataEntry +{ + LogicalRewriteMappingData map; /* map between old and new location of + * the tuple */ + dlist_node node; +} RewriteMappingDataEntry; + /* prototypes for internal functions */ static void raw_heap_insert(RewriteState state, HeapTuple tup); +/* internal logical remapping prototypes */ +static void logical_begin_heap_rewrite(RewriteState state); +static void logical_rewrite_heap_tuple(RewriteState state, ItemPointerData old_tid, HeapTuple new_tuple); +static void logical_end_heap_rewrite(RewriteState state); + /* * Begin a rewrite of a table * + * old_heap old, locked heap relation tuples will be read from * new_heap new, locked heap relation to insert tuples to * oldest_xmin xid used by the caller to determine which tuples are dead * freeze_xid xid before which tuples will be frozen @@ -187,7 +242,7 @@ static void raw_heap_insert(RewriteState state, HeapTuple tup); * to be used in subsequent calls to the other functions. */ RewriteState -begin_heap_rewrite(Relation new_heap, TransactionId oldest_xmin, +begin_heap_rewrite(Relation old_heap, Relation new_heap, TransactionId oldest_xmin, TransactionId freeze_xid, MultiXactId cutoff_multi, bool use_wal) { @@ -210,6 +265,7 @@ begin_heap_rewrite(Relation new_heap, TransactionId oldest_xmin, /* Create and fill in the state struct */ state = palloc0(sizeof(RewriteStateData)); + state->rs_old_rel = old_heap; state->rs_new_rel = new_heap; state->rs_buffer = (Page) palloc(BLCKSZ); /* new_heap needn't be empty, just locked */ @@ -244,6 +300,8 @@ begin_heap_rewrite(Relation new_heap, TransactionId oldest_xmin, MemoryContextSwitchTo(old_cxt); + logical_begin_heap_rewrite(state); + return state; } @@ -301,6 +359,8 @@ end_heap_rewrite(RewriteState state) if (RelationNeedsWAL(state->rs_new_rel)) heap_sync(state->rs_new_rel); + logical_end_heap_rewrite(state); + /* Deleting the context frees everything */ MemoryContextDelete(state->rs_cxt); } @@ -429,6 +489,8 @@ rewrite_heap_tuple(RewriteState state, raw_heap_insert(state, new_tuple); new_tid = new_tuple->t_self; + logical_rewrite_heap_tuple(state, old_tid, new_tuple); + /* * If the tuple is the updated version of a row, and the prior version * wouldn't be DEAD yet, then we need to either resolve the prior @@ -678,3 +740,545 @@ raw_heap_insert(RewriteState state, HeapTuple tup) if (heaptup != tup) heap_freetuple(heaptup); } + +/* ------------------------------------------------------------------------ + * Logical rewrite support + * + * When doing logical decoding - which relies on using cmin/cmax of catalog + * tuples, via xl_heap_new_cid records - heap rewrites have to log enough + * information to allow the decoding backend to updates its internal mapping + * of (relfilenode,ctid) => (cmin, cmax) to be correct for the rewritten heap. + * + * For that, every time we find a tuple that's been modified in a catalog + * relation within the xmin horizon of any decoding slot, we log a mapping + * from the old to the new location. + * + * To deal with rewrites that abort the filename of a mapping file contains + * the xid of the transaction performing the rewrite, which then can be + * checked before being read in. + * + * For efficiency we don't immediately spill every single map mapping for a + * row to disk but only do so in batches when we've collected several of them + * in memory or when end_heap_rewrite() has been called. + * + * Crash-Safety: This module diverts from the usual patterns of doing WAL + * since it cannot rely on checkpoint flushing out all buffers and thus + * waiting for exlusive locks on buffers. Usually the XLogInsert() covering + * buffer modifications is performed while the buffer(s) that are being + * modified are exlusively locked guaranteeing that both the WAL record and + * the modified heap are on either side of the checkpoint. But since the + * mapping files we log aren't in shared_buffers that interlock doesn't work. + * + * Instead we simply write the mapping files out to disk, *before* the + * XLogInsert() is performed. That guarantees that either the XLogInsert() is + * inserted after the checkpoint's redo pointer or that the checkpoint (via + * LogicalRewriteHeapCheckpoint()) has flushed the (partial) mapping file to + * disk. That leaves the tail end that has not yet been flushed open to + * corruption, which is solved by including the current offset in the + * xl_heap_rewrite_mapping records and truncating the mapping file to it + * during replay. Every time a rewrite is finished all generated mapping files + * are synced to disk. + * + * Note that if we were only concerned about crash safety we wouldn't have to + * deal with WAL logging at all - an fsync() at the end of a rewrite would be + * sufficient for crash safety. Any mapping that hasn't been safely flushed to + * disk has to be by an aborted (explicitly or via a crash) transaction and is + * ignored by virtue of the xid in it's name being subject to a + * TransactionDidCommit() check. But we want to support having standbys via + * physical replication, both for availability and to to do logical decoding + * there. + * ------------------------------------------------------------------------ + */ + +/* + * Do preparations for logging logical mappings during a rewrite if + * necessary. If we detect that we don't need to log anything we'll prevent + * any further action by the various logical rewrite functions. + */ +static void +logical_begin_heap_rewrite(RewriteState state) +{ + HASHCTL hash_ctl; + TransactionId logical_xmin; + + /* + * We only need to persist these mappings if the rewritten table can be + * accessed during logical decoding, if not, we can skip doing any + * additional work. + */ + state->rs_logical_rewrite = + RelationIsAccessibleInLogicalDecoding(state->rs_old_rel); + + if (!state->rs_logical_rewrite) + return; + + Assert(ReplicationSlotCtl != NULL); + + ProcArrayGetReplicationSlotXmin(NULL, &logical_xmin); + + /* + * If there are no logical slots in progress we don't need to do anything, + * there cannot be any remappings for relevant rows yet. The relation's + * lock protects us against races. + */ + if (logical_xmin == InvalidTransactionId) + { + state->rs_logical_rewrite = false; + return; + } + + state->rs_logical_xmin = logical_xmin; + state->rs_begin_lsn = GetXLogInsertRecPtr(); + state->rs_num_rewrite_mappings = 0; + + memset(&hash_ctl, 0, sizeof(hash_ctl)); + hash_ctl.keysize = sizeof(TransactionId); + hash_ctl.entrysize = sizeof(RewriteMappingFile); + hash_ctl.hcxt = state->rs_cxt; + hash_ctl.hash = tag_hash; + + state->rs_logical_mappings = + hash_create("Logical rewrite mapping", + 128, /* arbitrary initial size */ + &hash_ctl, + HASH_ELEM | HASH_FUNCTION | HASH_CONTEXT); +} + +/* + * Flush all logical in-memory mappings to disk, but don't fsync them yet. + */ +static void +logical_heap_rewrite_flush_mappings(RewriteState state) +{ + HASH_SEQ_STATUS seq_status; + RewriteMappingFile *src; + dlist_mutable_iter iter; + + Assert(state->rs_logical_rewrite); + + /* no logical rewrite in progress, no need to iterate over mappings */ + if (state->rs_num_rewrite_mappings == 0) + return; + + elog(DEBUG1, "flushing %u logical rewrite mapping entries", + state->rs_num_rewrite_mappings); + + hash_seq_init(&seq_status, state->rs_logical_mappings); + while ((src = (RewriteMappingFile *) hash_seq_search(&seq_status)) != NULL) + { + XLogRecData rdata[2]; + char *waldata; + char *waldata_start; + xl_heap_rewrite_mapping xlrec; + Oid dboid; + uint32 len; + int written; + + /* this file hasn't got any new mappings */ + if (src->num_mappings == 0) + continue; + + if (state->rs_old_rel->rd_rel->relisshared) + dboid = InvalidOid; + else + dboid = MyDatabaseId; + + xlrec.num_mappings = src->num_mappings; + xlrec.mapped_rel = RelationGetRelid(state->rs_old_rel); + xlrec.mapped_xid = src->xid; + xlrec.mapped_db = dboid; + xlrec.offset = src->off; + xlrec.start_lsn = state->rs_begin_lsn; + + rdata[0].data = (char *) (&xlrec); + rdata[0].len = sizeof(xlrec); + rdata[0].buffer = InvalidBuffer; + rdata[0].next = &(rdata[1]); + + /* write all mappings consecutively */ + len = src->num_mappings * sizeof(LogicalRewriteMappingData); + waldata = palloc(len); + waldata_start = waldata; + + /* + * collect data we need to write out, but don't modify ondisk data yet + */ + dlist_foreach_modify(iter, &src->mappings) + { + RewriteMappingDataEntry *pmap; + + pmap = dlist_container(RewriteMappingDataEntry, node, iter.cur); + + memcpy(waldata, &pmap->map, sizeof(pmap->map)); + waldata += sizeof(pmap->map); + + /* remove from the list and free */ + dlist_delete(&pmap->node); + pfree(pmap); + + /* update bookkeeping */ + state->rs_num_rewrite_mappings--; + src->num_mappings--; + } + + /* + * Note that we deviate from the usual WAL coding practices here, + * check the above "Logical rewrite support" comment for reasoning. + */ + written = FileWrite(src->vfd, waldata_start, len); + if (written != len) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not write to file \"%s\", wrote %d of %d: %m", src->path, + written, len))); + src->off += len; + + Assert(src->num_mappings == 0); + + rdata[1].data = waldata_start; + rdata[1].len = len; + rdata[1].buffer = InvalidBuffer; + rdata[1].next = NULL; + + /* write xlog record */ + XLogInsert(RM_HEAP2_ID, XLOG_HEAP2_REWRITE, rdata); + + } + Assert(state->rs_num_rewrite_mappings == 0); +} + +/* + * Logical remapping part of end_heap_rewrite(). + */ +static void +logical_end_heap_rewrite(RewriteState state) +{ + HASH_SEQ_STATUS seq_status; + RewriteMappingFile *src; + + /* done, no logical rewrite in progress */ + if (!state->rs_logical_rewrite) + return; + + /* writeout remaining in-memory entries */ + if (state->rs_num_rewrite_mappings > 0 ) + logical_heap_rewrite_flush_mappings(state); + + /* Iterate over all mappings we have written and fsync the files. */ + hash_seq_init(&seq_status, state->rs_logical_mappings); + while ((src = (RewriteMappingFile *) hash_seq_search(&seq_status)) != NULL) + { + if(FileSync(src->vfd) != 0) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not fsync file \"%s\": %m", src->path))); + FileClose(src->vfd); + } + /* memory context cleanup will deal with the rest */ +} + +/* + * Log a single (old->new) mapping for 'xid'. + */ +static void +logical_rewrite_log_mapping(RewriteState state, TransactionId xid, + LogicalRewriteMappingData *map) +{ + RewriteMappingFile *src; + RewriteMappingDataEntry *pmap; + Oid relid; + bool found; + + relid = RelationGetRelid(state->rs_old_rel); + + /* look for existing mappings for this 'mapped' xid */ + src = hash_search(state->rs_logical_mappings, &xid, + HASH_ENTER, &found); + + /* + * We haven't yet had the need to map anything for this xid, create + * per-xid data structures. + */ + if (!found) + { + char path[MAXPGPATH]; + Oid dboid; + + if (state->rs_old_rel->rd_rel->relisshared) + dboid = InvalidOid; + else + dboid = MyDatabaseId; + + snprintf(path, MAXPGPATH, + "pg_llog/mappings/" LOGICAL_REWRITE_FORMAT, + dboid, relid, + (uint32) (state->rs_begin_lsn >> 32), + (uint32) state->rs_begin_lsn, + xid, GetCurrentTransactionId()); + + dlist_init(&src->mappings); + src->num_mappings = 0; + src->off = 0; + memcpy(src->path, path, sizeof(path)); + src->vfd = PathNameOpenFile(path, + O_CREAT | O_EXCL | O_WRONLY | PG_BINARY, + S_IRUSR | S_IWUSR); + if (src->vfd < 0) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not create file \"%s\": %m", path))); + } + + pmap = MemoryContextAlloc(state->rs_cxt, + sizeof(RewriteMappingDataEntry)); + memcpy(&pmap->map, map, sizeof(LogicalRewriteMappingData)); + dlist_push_tail(&src->mappings, &pmap->node); + src->num_mappings++; + state->rs_num_rewrite_mappings++; + + /* + * Write out buffer every time we've too many in-memory entries across all + * mapping files. + */ + if (state->rs_num_rewrite_mappings >= 1000 /* arbitrary number */) + logical_heap_rewrite_flush_mappings(state); +} + +/* + * Perform logical remapping for a tuple that's mapped from old_tid to + * new_tuple->t_self by rewrite_heap_tuple() iff necessary for the tuple. + */ +static void +logical_rewrite_heap_tuple(RewriteState state, ItemPointerData old_tid, + HeapTuple new_tuple) +{ + ItemPointerData new_tid = new_tuple->t_self; + TransactionId cutoff = state->rs_logical_xmin; + TransactionId xmin; + TransactionId xmax; + bool do_log_xmin = false; + bool do_log_xmax = false; + LogicalRewriteMappingData map; + + /* no logical rewrite in progress, we don't need to log anything */ + if (!state->rs_logical_rewrite) + return; + + xmin = HeapTupleHeaderGetXmin(new_tuple->t_data); + /* use *GetUpdateXid to correctly deal with multixacts */ + xmax = HeapTupleHeaderGetUpdateXid(new_tuple->t_data); + + /* + * Log the mapping iff the tuple has been created recently. + */ + if (TransactionIdIsNormal(xmin) && !TransactionIdPrecedes(xmin, cutoff)) + do_log_xmin = true; + + if (!TransactionIdIsNormal(xmax)) + { + /* + * no xmax is set, can't have any permanent ones, so this check is + * sufficient + */ + } + else if (HEAP_XMAX_IS_LOCKED_ONLY(new_tuple->t_data->t_infomask)) + { + /* only locked, we don't care */ + } + else if (!TransactionIdPrecedes(xmax, cutoff)) + { + /* tuple has been deleted recently, log */ + do_log_xmax = true; + } + + /* if neither needs to be logged, we're done */ + if (!do_log_xmin && !do_log_xmax) + return; + + /* fill out mapping information */ + map.old_node = state->rs_old_rel->rd_node; + map.old_tid = old_tid; + map.new_node = state->rs_new_rel->rd_node; + map.new_tid = new_tid; + + /* --- + * Now persist the mapping for the individual xids that are affected. We + * need to log for both xmin and xmax if they aren't the same transaction + * since the mapping files are per "affected" xid. + * We don't muster all that much effort detecting whether xmin and xmax + * are actually the same transaction, we just check whether the xid is the + * same disregarding subtransactions. Logging too much is relatively + * harmless and we could never do the check fully since subtransaction + * data is thrown away during restarts. + * --- + */ + if (do_log_xmin) + logical_rewrite_log_mapping(state, xmin, &map); + /* separately log mapping for xmax unless it'd be redundant */ + if (do_log_xmax && !TransactionIdEquals(xmin, xmax)) + logical_rewrite_log_mapping(state, xmax, &map); +} + +/* + * Replay XLOG_HEAP2_REWRITE records + */ +void +heap_xlog_logical_rewrite(XLogRecPtr lsn, XLogRecord *r) +{ + char path[MAXPGPATH]; + int fd; + xl_heap_rewrite_mapping *xlrec; + uint32 len; + char *data; + + xlrec = (xl_heap_rewrite_mapping *) XLogRecGetData(r); + + snprintf(path, MAXPGPATH, + "pg_llog/mappings/" LOGICAL_REWRITE_FORMAT, + xlrec->mapped_db, xlrec->mapped_rel, + (uint32) (xlrec->start_lsn >> 32), + (uint32) xlrec->start_lsn, + xlrec->mapped_xid, r->xl_xid); + + fd = OpenTransientFile(path, + O_CREAT | O_WRONLY | PG_BINARY, + S_IRUSR | S_IWUSR); + if (fd < 0) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not create file \"%s\": %m", path))); + /* + * Truncate all data that's not guaranteed to have been safely fsynced (by + * previous record or by the last checkpoint). + */ + if (ftruncate(fd, xlrec->offset) != 0) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not truncate file \"%s\" to %u: %m", + path, (uint32) xlrec->offset))); + + /* now seek to the position we want to write our data to */ + if (lseek(fd, xlrec->offset, SEEK_SET) != xlrec->offset) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not seek to the end of file \"%s\": %m", + path))); + + data = XLogRecGetData(r) + sizeof(*xlrec); + + len = xlrec->num_mappings * sizeof(LogicalRewriteMappingData); + + /* write out tail end of mapping file (again) */ + if (write(fd, data, len) != len) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not write to file \"%s\": %m", path))); + /* + * Now fsync all previously written data. We could improve things and only + * do this for the last write to a file, but the required bookkeeping + * doesn't seem worth the trouble. + */ + if (pg_fsync(fd) != 0) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not fsync file \"%s\": %m", path))); + + CloseTransientFile(fd); +} + +/* --- + * Perform a checkpoint for logical rewrite mappings + * + * This serves two tasks: + * 1) Remove all mappings not needed anymore based on the logical restart LSN + * 2) Flush all remaining mappings to disk, so that replay after a checkpoint + * only has to deal with the parts of a mapping that have been written out + * after the checkpoint started. + * --- + */ +void +CheckPointLogicalRewriteHeap(void) +{ + XLogRecPtr cutoff; + XLogRecPtr redo; + DIR *mappings_dir; + struct dirent *mapping_de; + char path[MAXPGPATH]; + + /* + * We start of with a minimum of the last redo pointer. No new decoding + * slot will start before that, so that's a safe upper bound for removal. + */ + redo = GetRedoRecPtr(); + + /* now check for the restart ptrs from existing slots */ + cutoff = ReplicationSlotsComputeLogicalRestartLSN(); + + /* don't start earlier than the restart lsn */ + if (cutoff != InvalidXLogRecPtr && redo < cutoff) + cutoff = redo; + + mappings_dir = AllocateDir("pg_llog/mappings"); + while ((mapping_de = ReadDir(mappings_dir, "pg_llog/mappings")) != NULL) + { + struct stat statbuf; + Oid dboid; + Oid relid; + XLogRecPtr lsn; + TransactionId rewrite_xid; + TransactionId create_xid; + uint32 hi, lo; + + if (strcmp(mapping_de->d_name, ".") == 0 || + strcmp(mapping_de->d_name, "..") == 0) + continue; + + snprintf(path, MAXPGPATH, "pg_llog/mappings/%s", mapping_de->d_name); + if (lstat(path, &statbuf) == 0 && !S_ISREG(statbuf.st_mode)) + continue; + + /* Skip over files that cannot be ours. */ + if (strncmp(mapping_de->d_name, "map-", 4) != 0) + continue; + + if (sscanf(mapping_de->d_name, LOGICAL_REWRITE_FORMAT, + &dboid, &relid, &hi, &lo, &rewrite_xid, &create_xid) != 6) + elog(ERROR,"could not parse filename \"%s\"", mapping_de->d_name); + + lsn = ((uint64) hi) << 32 | lo; + + if (lsn < cutoff || cutoff == InvalidXLogRecPtr) + { + elog(DEBUG1, "removing logical rewrite file \"%s\"", path); + if (unlink(path) < 0) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not unlink file \"%s\": %m", path))); + } + else + { + int fd = OpenTransientFile(path, O_RDONLY | PG_BINARY, 0); + + /* + * The file cannot vanish due to concurrency since this function + * is the only one removing logical mappings and it's run while + * CheckpointLock is held exclusively. + */ + if (fd < 0) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not open file \"%s\": %m", path))); + /* + * We could try to avoid fsyncing files that either haven't + * changed or have only been created since the checkpoint's start, + * but it's currently not deemed worth the effort. + */ + else if (pg_fsync(fd) != 0) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not fsync file \"%s\": %m", path))); + CloseTransientFile(fd); + } + } + FreeDir(mappings_dir); +} diff --git a/src/backend/access/heap/tuptoaster.c b/src/backend/access/heap/tuptoaster.c index 97c9f238a7..9a821d3e1c 100644 --- a/src/backend/access/heap/tuptoaster.c +++ b/src/backend/access/heap/tuptoaster.c @@ -44,32 +44,6 @@ #undef TOAST_DEBUG -/* - * Testing whether an externally-stored value is compressed now requires - * comparing extsize (the actual length of the external data) to rawsize - * (the original uncompressed datum's size). The latter includes VARHDRSZ - * overhead, the former doesn't. We never use compression unless it actually - * saves space, so we expect either equality or less-than. - */ -#define VARATT_EXTERNAL_IS_COMPRESSED(toast_pointer) \ - ((toast_pointer).va_extsize < (toast_pointer).va_rawsize - VARHDRSZ) - -/* - * Macro to fetch the possibly-unaligned contents of an EXTERNAL datum - * into a local "struct varatt_external" toast pointer. This should be - * just a memcpy, but some versions of gcc seem to produce broken code - * that assumes the datum contents are aligned. Introducing an explicit - * intermediate "varattrib_1b_e *" variable seems to fix it. - */ -#define VARATT_EXTERNAL_GET_POINTER(toast_pointer, attr) \ -do { \ - varattrib_1b_e *attre = (varattrib_1b_e *) (attr); \ - Assert(VARATT_IS_EXTERNAL(attre)); \ - Assert(VARSIZE_EXTERNAL(attre) == sizeof(toast_pointer) + VARHDRSZ_EXTERNAL); \ - memcpy(&(toast_pointer), VARDATA_EXTERNAL(attre), sizeof(toast_pointer)); \ -} while (0) - - static void toast_delete_datum(Relation rel, Datum value); static Datum toast_save_datum(Relation rel, Datum value, struct varlena * oldexternal, int options); diff --git a/src/backend/access/index/indexam.c b/src/backend/access/index/indexam.c index 1aba2f04cc..a4b5f3d698 100644 --- a/src/backend/access/index/indexam.c +++ b/src/backend/access/index/indexam.c @@ -67,7 +67,10 @@ #include "access/relscan.h" #include "access/transam.h" +#include "access/xlog.h" + #include "catalog/index.h" +#include "catalog/catalog.h" #include "pgstat.h" #include "storage/bufmgr.h" #include "storage/lmgr.h" @@ -520,8 +523,7 @@ index_fetch_heap(IndexScanDesc scan) * Prune page, but only if we weren't already on this page */ if (prev_buf != scan->xs_cbuf) - heap_page_prune_opt(scan->heapRelation, scan->xs_cbuf, - RecentGlobalXmin); + heap_page_prune_opt(scan->heapRelation, scan->xs_cbuf); } /* Obtain share-lock on the buffer so we can examine visibility */ diff --git a/src/backend/access/rmgrdesc/heapdesc.c b/src/backend/access/rmgrdesc/heapdesc.c index 89ba09a206..c8a61669dd 100644 --- a/src/backend/access/rmgrdesc/heapdesc.c +++ b/src/backend/access/rmgrdesc/heapdesc.c @@ -149,6 +149,10 @@ heap2_desc(StringInfo buf, uint8 xl_info, char *rec) xlrec->node.relNode, xlrec->block, xlrec->cutoff_xid, xlrec->ntuples); } + else if (info == XLOG_HEAP2_REWRITE) + { + appendStringInfoString(buf, "heap rewrite:"); + } else if (info == XLOG_HEAP2_CLEANUP_INFO) { xl_heap_cleanup_info *xlrec = (xl_heap_cleanup_info *) rec; diff --git a/src/backend/access/transam/xact.c b/src/backend/access/transam/xact.c index 0487be17df..b20d9732e7 100644 --- a/src/backend/access/transam/xact.c +++ b/src/backend/access/transam/xact.c @@ -1074,8 +1074,16 @@ RecordTransactionCommit(void) /* * Do we need the long commit record? If not, use the compact format. + * + * For now always use the non-compact version if wal_level=logical, so + * we can hide commits from other databases. TODO: In the future we + * should merge compact and non-compact commits and use a flags + * variable to determine if it contains subxacts, relations or + * invalidation messages, that's more extensible and degrades more + * gracefully. Till then, it's just 20 bytes of overhead. */ - if (nrels > 0 || nmsgs > 0 || RelcacheInitFileInval || forceSyncCommit) + if (nrels > 0 || nmsgs > 0 || RelcacheInitFileInval || forceSyncCommit || + XLogLogicalInfoActive()) { XLogRecData rdata[4]; int lastrdata = 0; diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index ad46eb0ceb..53a20b1e60 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -23,6 +23,7 @@ #include "access/clog.h" #include "access/multixact.h" +#include "access/rewriteheap.h" #include "access/subtrans.h" #include "access/timeline.h" #include "access/transam.h" @@ -39,7 +40,9 @@ #include "pgstat.h" #include "postmaster/bgwriter.h" #include "postmaster/startup.h" +#include "replication/logical.h" #include "replication/slot.h" +#include "replication/snapbuild.h" #include "replication/walreceiver.h" #include "replication/walsender.h" #include "storage/barrier.h" @@ -4015,6 +4018,27 @@ CheckXLogRemoved(XLogSegNo segno, TimeLineID tli) } } +/* + * Return the last WAL segment removed, or 0 if no segment has been removed + * since startup. + * + * NB: the result can be out of date arbitrarily fast, the caller has to deal + * with that. + */ +XLogSegNo +XLogGetLastRemovedSegno(void) +{ + /* use volatile pointer to prevent code rearrangement */ + volatile XLogCtlData *xlogctl = XLogCtl; + XLogSegNo lastRemovedSegNo; + + SpinLockAcquire(&xlogctl->info_lck); + lastRemovedSegNo = xlogctl->lastRemovedSegNo; + SpinLockRelease(&xlogctl->info_lck); + + return lastRemovedSegNo; +} + /* * Update the last removed segno pointer in shared memory, to reflect * that the given XLOG file has been removed. @@ -6558,6 +6582,12 @@ StartupXLOG(void) */ StartupReplicationSlots(checkPoint.redo); + /* + * Startup logical state, needs to be setup now so we have proper data + * during crash recovery. + */ + StartupReorderBuffer(); + /* * Startup MultiXact. We need to do this early for two reasons: one * is that we might try to access multixacts when we do tuple freezing, @@ -8589,7 +8619,7 @@ CreateCheckPoint(int flags) * StartupSUBTRANS hasn't been called yet. */ if (!RecoveryInProgress()) - TruncateSUBTRANS(GetOldestXmin(true, false)); + TruncateSUBTRANS(GetOldestXmin(NULL, false)); /* Real work is done, but log and update stats before releasing lock. */ LogCheckpointEnd(false); @@ -8674,6 +8704,8 @@ CheckPointGuts(XLogRecPtr checkPointRedo, int flags) CheckPointPredicate(); CheckPointRelationMap(); CheckPointReplicationSlots(); + CheckPointSnapBuild(); + CheckPointLogicalRewriteHeap(); CheckPointBuffers(flags); /* performs all required fsyncs */ /* We deliberately delay 2PC checkpointing as long as possible */ CheckPointTwoPhase(checkPointRedo); @@ -8965,7 +8997,7 @@ CreateRestartPoint(int flags) * this because StartupSUBTRANS hasn't been called yet. */ if (EnableHotStandby) - TruncateSUBTRANS(GetOldestXmin(true, false)); + TruncateSUBTRANS(GetOldestXmin(NULL, false)); /* Real work is done, but log and update before releasing lock. */ LogCheckpointEnd(true); diff --git a/src/backend/catalog/index.c b/src/backend/catalog/index.c index cebca95ac8..877d7678f7 100644 --- a/src/backend/catalog/index.c +++ b/src/backend/catalog/index.c @@ -2156,7 +2156,7 @@ IndexBuildHeapScan(Relation heapRelation, { snapshot = SnapshotAny; /* okay to ignore lazy VACUUMs here */ - OldestXmin = GetOldestXmin(heapRelation->rd_rel->relisshared, true); + OldestXmin = GetOldestXmin(heapRelation, true); } scan = heap_beginscan_strat(heapRelation, /* relation */ diff --git a/src/backend/catalog/system_views.sql b/src/backend/catalog/system_views.sql index 04dfbb0ee5..0500a73e1b 100644 --- a/src/backend/catalog/system_views.sql +++ b/src/backend/catalog/system_views.sql @@ -619,11 +619,13 @@ CREATE VIEW pg_stat_replication AS CREATE VIEW pg_replication_slots AS SELECT L.slot_name, + L.plugin, L.slot_type, L.datoid, D.datname AS database, L.active, L.xmin, + L.catalog_xmin, L.restart_lsn FROM pg_get_replication_slots() AS L LEFT JOIN pg_database D ON (L.datoid = D.oid); @@ -822,3 +824,35 @@ CREATE OR REPLACE FUNCTION CREATE OR REPLACE FUNCTION json_populate_recordset(base anyelement, from_json json, use_json_as_text boolean DEFAULT false) RETURNS SETOF anyelement LANGUAGE internal STABLE ROWS 100 AS 'json_populate_recordset'; + +CREATE OR REPLACE FUNCTION pg_logical_slot_get_changes( + IN slotname name, IN upto_lsn pg_lsn, IN upto_nchanges int, VARIADIC options text[] DEFAULT '{}', + OUT location pg_lsn, OUT xid xid, OUT data text) +RETURNS SETOF RECORD +LANGUAGE INTERNAL +VOLATILE ROWS 1000 COST 1000 +AS 'pg_logical_slot_get_changes'; + +CREATE OR REPLACE FUNCTION pg_logical_slot_peek_changes( + IN slotname name, IN upto_lsn pg_lsn, IN upto_nchanges int, VARIADIC options text[] DEFAULT '{}', + OUT location pg_lsn, OUT xid xid, OUT data text) +RETURNS SETOF RECORD +LANGUAGE INTERNAL +VOLATILE ROWS 1000 COST 1000 +AS 'pg_logical_slot_peek_changes'; + +CREATE OR REPLACE FUNCTION pg_logical_slot_get_binary_changes( + IN slotname name, IN upto_lsn pg_lsn, IN upto_nchanges int, VARIADIC options text[] DEFAULT '{}', + OUT location pg_lsn, OUT xid xid, OUT data bytea) +RETURNS SETOF RECORD +LANGUAGE INTERNAL +VOLATILE ROWS 1000 COST 1000 +AS 'pg_logical_slot_get_binary_changes'; + +CREATE OR REPLACE FUNCTION pg_logical_slot_peek_binary_changes( + IN slotname name, IN upto_lsn pg_lsn, IN upto_nchanges int, VARIADIC options text[] DEFAULT '{}', + OUT location pg_lsn, OUT xid xid, OUT data bytea) +RETURNS SETOF RECORD +LANGUAGE INTERNAL +VOLATILE ROWS 1000 COST 1000 +AS 'pg_logical_slot_peek_binary_changes'; diff --git a/src/backend/commands/analyze.c b/src/backend/commands/analyze.c index e7fcb55868..a04adeaac7 100644 --- a/src/backend/commands/analyze.c +++ b/src/backend/commands/analyze.c @@ -22,6 +22,7 @@ #include "access/tuptoaster.h" #include "access/visibilitymap.h" #include "access/xact.h" +#include "catalog/catalog.h" #include "catalog/index.h" #include "catalog/indexing.h" #include "catalog/pg_collation.h" @@ -1081,7 +1082,7 @@ acquire_sample_rows(Relation onerel, int elevel, totalblocks = RelationGetNumberOfBlocks(onerel); /* Need a cutoff xmin for HeapTupleSatisfiesVacuum */ - OldestXmin = GetOldestXmin(onerel->rd_rel->relisshared, true); + OldestXmin = GetOldestXmin(onerel, true); /* Prepare for sampling block numbers */ BlockSampler_Init(&bs, totalblocks, targrows); diff --git a/src/backend/commands/cluster.c b/src/backend/commands/cluster.c index 8b18e4acb7..b6b40e724e 100644 --- a/src/backend/commands/cluster.c +++ b/src/backend/commands/cluster.c @@ -850,7 +850,7 @@ copy_heap_data(Oid OIDNewHeap, Oid OIDOldHeap, Oid OIDOldIndex, bool verbose, * Since we're going to rewrite the whole table anyway, there's no reason * not to be aggressive about this. */ - vacuum_set_xid_limits(0, 0, 0, 0, OldHeap->rd_rel->relisshared, + vacuum_set_xid_limits(OldHeap, 0, 0, 0, 0, &OldestXmin, &FreezeXid, NULL, &MultiXactCutoff, NULL); @@ -869,7 +869,7 @@ copy_heap_data(Oid OIDNewHeap, Oid OIDOldHeap, Oid OIDOldIndex, bool verbose, is_system_catalog = IsSystemRelation(OldHeap); /* Initialize the rewrite operation */ - rwstate = begin_heap_rewrite(NewHeap, OldestXmin, FreezeXid, + rwstate = begin_heap_rewrite(OldHeap, NewHeap, OldestXmin, FreezeXid, MultiXactCutoff, use_wal); /* diff --git a/src/backend/commands/dbcommands.c b/src/backend/commands/dbcommands.c index 5d540aa3a0..4996a2e7cd 100644 --- a/src/backend/commands/dbcommands.c +++ b/src/backend/commands/dbcommands.c @@ -45,6 +45,7 @@ #include "miscadmin.h" #include "pgstat.h" #include "postmaster/bgwriter.h" +#include "replication/slot.h" #include "storage/copydir.h" #include "storage/fd.h" #include "storage/lmgr.h" @@ -750,6 +751,7 @@ dropdb(const char *dbname, bool missing_ok) HeapTuple tup; int notherbackends; int npreparedxacts; + int nslots, nslots_active; /* * Look up the target database's OID, and get exclusive lock on it. We @@ -806,6 +808,19 @@ dropdb(const char *dbname, bool missing_ok) (errcode(ERRCODE_OBJECT_IN_USE), errmsg("cannot drop the currently open database"))); + /* + * Check whether there are, possibly unconnected, logical slots that refer + * to the to-be-dropped database. The database lock we are holding + * prevents the creation of new slots using the database. + */ + if (ReplicationSlotsCountDBSlots(db_id, &nslots, &nslots_active)) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_IN_USE), + errmsg("database \"%s\" is used by a logical decoding slot", + dbname), + errdetail("There are %d slot(s), %d of them active", + nslots, nslots_active))); + /* * Check for other backends in the target database. (Because we hold the * database lock, no new ones can start after this.) diff --git a/src/backend/commands/vacuum.c b/src/backend/commands/vacuum.c index 5ae7763534..ded1841dc6 100644 --- a/src/backend/commands/vacuum.c +++ b/src/backend/commands/vacuum.c @@ -398,11 +398,11 @@ get_rel_oids(Oid relid, const RangeVar *vacrel) * not interested. */ void -vacuum_set_xid_limits(int freeze_min_age, +vacuum_set_xid_limits(Relation rel, + int freeze_min_age, int freeze_table_age, int multixact_freeze_min_age, int multixact_freeze_table_age, - bool sharedRel, TransactionId *oldestXmin, TransactionId *freezeLimit, TransactionId *xidFullScanLimit, @@ -425,7 +425,7 @@ vacuum_set_xid_limits(int freeze_min_age, * working on a particular table at any time, and that each vacuum is * always an independent transaction. */ - *oldestXmin = GetOldestXmin(sharedRel, true); + *oldestXmin = GetOldestXmin(rel, true); Assert(TransactionIdIsNormal(*oldestXmin)); @@ -795,7 +795,7 @@ vac_update_datfrozenxid(void) * committed pg_class entries for new tables; see AddNewRelationTuple(). * So we cannot produce a wrong minimum by starting with this. */ - newFrozenXid = GetOldestXmin(true, true); + newFrozenXid = GetOldestXmin(NULL, true); /* * Similarly, initialize the MultiXact "min" with the value that would be diff --git a/src/backend/commands/vacuumlazy.c b/src/backend/commands/vacuumlazy.c index d77892ee7f..d5db917d97 100644 --- a/src/backend/commands/vacuumlazy.c +++ b/src/backend/commands/vacuumlazy.c @@ -44,6 +44,7 @@ #include "access/multixact.h" #include "access/transam.h" #include "access/visibilitymap.h" +#include "catalog/catalog.h" #include "catalog/storage.h" #include "commands/dbcommands.h" #include "commands/vacuum.h" @@ -204,10 +205,10 @@ lazy_vacuum_rel(Relation onerel, VacuumStmt *vacstmt, vac_strategy = bstrategy; - vacuum_set_xid_limits(vacstmt->freeze_min_age, vacstmt->freeze_table_age, + vacuum_set_xid_limits(onerel, + vacstmt->freeze_min_age, vacstmt->freeze_table_age, vacstmt->multixact_freeze_min_age, vacstmt->multixact_freeze_table_age, - onerel->rd_rel->relisshared, &OldestXmin, &FreezeLimit, &xidFullScanLimit, &MultiXactCutoff, &mxactFullScanLimit); diff --git a/src/backend/executor/nodeBitmapHeapscan.c b/src/backend/executor/nodeBitmapHeapscan.c index 1a8d4e5143..7d8a3f2c24 100644 --- a/src/backend/executor/nodeBitmapHeapscan.c +++ b/src/backend/executor/nodeBitmapHeapscan.c @@ -336,8 +336,7 @@ bitgetpage(HeapScanDesc scan, TBMIterateResult *tbmres) /* * Prune and repair fragmentation for the whole page, if possible. */ - Assert(TransactionIdIsValid(RecentGlobalXmin)); - heap_page_prune_opt(scan->rs_rd, buffer, RecentGlobalXmin); + heap_page_prune_opt(scan->rs_rd, buffer); /* * We must hold share lock on the buffer content while examining tuple diff --git a/src/backend/replication/Makefile b/src/backend/replication/Makefile index 7941cb8d5e..6f17b08a6a 100644 --- a/src/backend/replication/Makefile +++ b/src/backend/replication/Makefile @@ -17,6 +17,8 @@ override CPPFLAGS := -I$(srcdir) $(CPPFLAGS) OBJS = walsender.o walreceiverfuncs.o walreceiver.o basebackup.o \ repl_gram.o slot.o slotfuncs.o syncrep.o +SUBDIRS = logical + include $(top_srcdir)/src/backend/common.mk # repl_scanner is compiled as part of repl_gram diff --git a/src/backend/replication/logical/Makefile b/src/backend/replication/logical/Makefile new file mode 100644 index 0000000000..310a45c5c0 --- /dev/null +++ b/src/backend/replication/logical/Makefile @@ -0,0 +1,19 @@ +#------------------------------------------------------------------------- +# +# Makefile-- +# Makefile for src/backend/replication/logical +# +# IDENTIFICATION +# src/backend/replication/logical/Makefile +# +#------------------------------------------------------------------------- + +subdir = src/backend/replication/logical +top_builddir = ../../../.. +include $(top_builddir)/src/Makefile.global + +override CPPFLAGS := -I$(srcdir) $(CPPFLAGS) + +OBJS = decode.o logical.o logicalfuncs.o reorderbuffer.o snapbuild.o + +include $(top_srcdir)/src/backend/common.mk diff --git a/src/backend/replication/logical/decode.c b/src/backend/replication/logical/decode.c new file mode 100644 index 0000000000..e8949aab32 --- /dev/null +++ b/src/backend/replication/logical/decode.c @@ -0,0 +1,826 @@ +/* ------------------------------------------------------------------------- + * + * decode.c + * This module decodes WAL records read using xlogreader.h's APIs for the + * purpose of logical decoding by passing information to the + * reorderbuffer module (containing the actual changes) and to the + * snapbuild module to build a fitting catalog snapshot (to be able to + * properly decode the changes in the reorderbuffer). + * + * NOTE: + * This basically tries to handle all low level xlog stuff for + * reorderbuffer.c and snapbuild.c. There's some minor leakage where a + * specific record's struct is used to pass data along, but those just + * happen to contain the right amount of data in a convenient + * format. There isn't and shouldn't be much intelligence about the + * contents of records in here except turning them into a more usable + * format. + * + * Portions Copyright (c) 1996-2014, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/backend/replication/logical/decode.c + * + * ------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/heapam.h" +#include "access/heapam_xlog.h" +#include "access/transam.h" +#include "access/xact.h" +#include "access/xlog_internal.h" +#include "access/xlogreader.h" + +#include "catalog/pg_control.h" + +#include "replication/decode.h" +#include "replication/logical.h" +#include "replication/reorderbuffer.h" +#include "replication/snapbuild.h" + +#include "storage/standby.h" + +typedef struct XLogRecordBuffer +{ + XLogRecPtr origptr; + XLogRecPtr endptr; + XLogRecord record; + char *record_data; +} XLogRecordBuffer; + +/* RMGR Handlers */ +static void DecodeXLogOp(LogicalDecodingContext *ctx, XLogRecordBuffer *buf); +static void DecodeHeapOp(LogicalDecodingContext *ctx, XLogRecordBuffer *buf); +static void DecodeHeap2Op(LogicalDecodingContext *ctx, XLogRecordBuffer *buf); +static void DecodeXactOp(LogicalDecodingContext *ctx, XLogRecordBuffer *buf); +static void DecodeStandbyOp(LogicalDecodingContext *ctx, XLogRecordBuffer *buf); + +/* individual record(group)'s handlers */ +static void DecodeInsert(LogicalDecodingContext *ctx, XLogRecordBuffer *buf); +static void DecodeUpdate(LogicalDecodingContext *ctx, XLogRecordBuffer *buf); +static void DecodeDelete(LogicalDecodingContext *ctx, XLogRecordBuffer *buf); +static void DecodeMultiInsert(LogicalDecodingContext *ctx, XLogRecordBuffer *buf); +static void DecodeCommit(LogicalDecodingContext *ctx, XLogRecordBuffer *buf, + TransactionId xid, Oid dboid, + TimestampTz commit_time, + int nsubxacts, TransactionId *sub_xids, + int ninval_msgs, SharedInvalidationMessage *msg); +static void DecodeAbort(LogicalDecodingContext *ctx, XLogRecPtr lsn, + TransactionId xid, TransactionId *sub_xids, int nsubxacts); + +/* common function to decode tuples */ +static void DecodeXLogTuple(char *data, Size len, ReorderBufferTupleBuf *tup); + +/* + * Take every XLogReadRecord()ed record and perform the actions required to + * decode it using the output plugin already setup in the logical decoding + * context. + */ +void +LogicalDecodingProcessRecord(LogicalDecodingContext *ctx, XLogRecord *record) +{ + XLogRecordBuffer buf; + + buf.origptr = ctx->reader->ReadRecPtr; + buf.endptr = ctx->reader->EndRecPtr; + buf.record = *record; + buf.record_data = XLogRecGetData(record); + + /* cast so we get a warning when new rmgrs are added */ + switch ((RmgrIds) buf.record.xl_rmid) + { + /* + * Rmgrs we care about for logical decoding. Add new rmgrs in + * rmgrlist.h's order. + */ + case RM_XLOG_ID: + DecodeXLogOp(ctx, &buf); + break; + + case RM_XACT_ID: + DecodeXactOp(ctx, &buf); + break; + + case RM_STANDBY_ID: + DecodeStandbyOp(ctx, &buf); + break; + + case RM_HEAP2_ID: + DecodeHeap2Op(ctx, &buf); + break; + + case RM_HEAP_ID: + DecodeHeapOp(ctx, &buf); + break; + + /* + * Rmgrs irrelevant for logical decoding; they describe stuff not + * represented in logical decoding. Add new rmgrs in rmgrlist.h's + * order. + */ + case RM_SMGR_ID: + case RM_CLOG_ID: + case RM_DBASE_ID: + case RM_TBLSPC_ID: + case RM_MULTIXACT_ID: + case RM_RELMAP_ID: + case RM_BTREE_ID: + case RM_HASH_ID: + case RM_GIN_ID: + case RM_GIST_ID: + case RM_SEQ_ID: + case RM_SPGIST_ID: + break; + case RM_NEXT_ID: + elog(ERROR, "unexpected RM_NEXT_ID rmgr_id: %u", (RmgrIds) buf.record.xl_rmid); + } +} + +/* + * Handle rmgr XLOG_ID records for DecodeRecordIntoReorderBuffer(). + */ +static void +DecodeXLogOp(LogicalDecodingContext *ctx, XLogRecordBuffer *buf) +{ + SnapBuild *builder = ctx->snapshot_builder; + uint8 info = buf->record.xl_info & ~XLR_INFO_MASK; + + switch (info) + { + /* this is also used in END_OF_RECOVERY checkpoints */ + case XLOG_CHECKPOINT_SHUTDOWN: + case XLOG_END_OF_RECOVERY: + SnapBuildSerializationPoint(builder, buf->origptr); + + break; + case XLOG_CHECKPOINT_ONLINE: + /* + * a RUNNING_XACTS record will have been logged near to this, we + * can restart from there. + */ + break; + case XLOG_NOOP: + case XLOG_NEXTOID: + case XLOG_SWITCH: + case XLOG_BACKUP_END: + case XLOG_PARAMETER_CHANGE: + case XLOG_RESTORE_POINT: + case XLOG_FPW_CHANGE: + case XLOG_FPI: + break; + default: + elog(ERROR, "unexpected RM_XLOG_ID record type: %u", info); + } +} + +/* + * Handle rmgr XACT_ID records for DecodeRecordIntoReorderBuffer(). + */ +static void +DecodeXactOp(LogicalDecodingContext *ctx, XLogRecordBuffer *buf) +{ + SnapBuild *builder = ctx->snapshot_builder; + ReorderBuffer *reorder = ctx->reorder; + XLogRecord *r = &buf->record; + uint8 info = r->xl_info & ~XLR_INFO_MASK; + + /* no point in doing anything yet, data could not be decoded anyway */ + if (SnapBuildCurrentState(builder) < SNAPBUILD_FULL_SNAPSHOT) + return; + + switch (info) + { + case XLOG_XACT_COMMIT: + { + xl_xact_commit *xlrec; + TransactionId *subxacts = NULL; + SharedInvalidationMessage *invals = NULL; + + xlrec = (xl_xact_commit *) buf->record_data; + + subxacts = (TransactionId *) &(xlrec->xnodes[xlrec->nrels]); + invals = (SharedInvalidationMessage *) &(subxacts[xlrec->nsubxacts]); + + DecodeCommit(ctx, buf, r->xl_xid, xlrec->dbId, + xlrec->xact_time, + xlrec->nsubxacts, subxacts, + xlrec->nmsgs, invals); + + break; + } + case XLOG_XACT_COMMIT_PREPARED: + { + xl_xact_commit_prepared *prec; + xl_xact_commit *xlrec; + TransactionId *subxacts; + SharedInvalidationMessage *invals = NULL; + + /* Prepared commits contain a normal commit record... */ + prec = (xl_xact_commit_prepared *) buf->record_data; + xlrec = &prec->crec; + + subxacts = (TransactionId *) &(xlrec->xnodes[xlrec->nrels]); + invals = (SharedInvalidationMessage *) &(subxacts[xlrec->nsubxacts]); + + DecodeCommit(ctx, buf, r->xl_xid, xlrec->dbId, + xlrec->xact_time, + xlrec->nsubxacts, subxacts, + xlrec->nmsgs, invals); + + break; + } + case XLOG_XACT_COMMIT_COMPACT: + { + xl_xact_commit_compact *xlrec; + + xlrec = (xl_xact_commit_compact *) buf->record_data; + + DecodeCommit(ctx, buf, r->xl_xid, InvalidOid, + xlrec->xact_time, + xlrec->nsubxacts, xlrec->subxacts, + 0, NULL); + break; + } + case XLOG_XACT_ABORT: + { + xl_xact_abort *xlrec; + TransactionId *sub_xids; + + xlrec = (xl_xact_abort *) buf->record_data; + + sub_xids = (TransactionId *) &(xlrec->xnodes[xlrec->nrels]); + + DecodeAbort(ctx, buf->origptr, r->xl_xid, + sub_xids, xlrec->nsubxacts); + break; + } + case XLOG_XACT_ABORT_PREPARED: + { + xl_xact_abort_prepared *prec; + xl_xact_abort *xlrec; + TransactionId *sub_xids; + + /* prepared abort contain a normal commit abort... */ + prec = (xl_xact_abort_prepared *) buf->record_data; + xlrec = &prec->arec; + + sub_xids = (TransactionId *) &(xlrec->xnodes[xlrec->nrels]); + + /* r->xl_xid is committed in a separate record */ + DecodeAbort(ctx, buf->origptr, prec->xid, + sub_xids, xlrec->nsubxacts); + break; + } + + case XLOG_XACT_ASSIGNMENT: + { + xl_xact_assignment *xlrec; + int i; + TransactionId *sub_xid; + + xlrec = (xl_xact_assignment *) buf->record_data; + + sub_xid = &xlrec->xsub[0]; + + for (i = 0; i < xlrec->nsubxacts; i++) + { + ReorderBufferAssignChild(reorder, xlrec->xtop, + *(sub_xid++), buf->origptr); + } + break; + } + case XLOG_XACT_PREPARE: + /* + * Currently decoding ignores PREPARE TRANSACTION and will just + * decode the transaction when the COMMIT PREPARED is sent or + * throw away the transaction's contents when a ROLLBACK PREPARED + * is received. In the future we could add code to expose prepared + * transactions in the changestream allowing for a kind of + * distributed 2PC. + */ + break; + default: + elog(ERROR, "unexpected RM_XACT_ID record type: %u", info); + } +} + +/* + * Handle rmgr STANDBY_ID records for DecodeRecordIntoReorderBuffer(). + */ +static void +DecodeStandbyOp(LogicalDecodingContext *ctx, XLogRecordBuffer *buf) +{ + SnapBuild *builder = ctx->snapshot_builder; + XLogRecord *r = &buf->record; + uint8 info = r->xl_info & ~XLR_INFO_MASK; + + switch (info) + { + case XLOG_RUNNING_XACTS: + { + xl_running_xacts *running = (xl_running_xacts *) buf->record_data; + SnapBuildProcessRunningXacts(builder, buf->origptr, running); + /* + * Abort all transactions that we keep track of, that are + * older than the record's oldestRunningXid. This is the most + * convenient spot for doing so since, in contrast to shutdown + * or end-of-recovery checkpoints, we have information about + * all running transactions which includes prepared ones, + * while shutdown checkpoints just know that no non-prepared + * transactions are in progress. + */ + ReorderBufferAbortOld(ctx->reorder, running->oldestRunningXid); + } + break; + case XLOG_STANDBY_LOCK: + break; + default: + elog(ERROR, "unexpected RM_STANDBY_ID record type: %u", info); + } +} + +/* + * Handle rmgr HEAP2_ID records for DecodeRecordIntoReorderBuffer(). + */ +static void +DecodeHeap2Op(LogicalDecodingContext *ctx, XLogRecordBuffer *buf) +{ + uint8 info = buf->record.xl_info & XLOG_HEAP_OPMASK; + TransactionId xid = buf->record.xl_xid; + SnapBuild *builder = ctx->snapshot_builder; + + /* no point in doing anything yet */ + if (SnapBuildCurrentState(builder) < SNAPBUILD_FULL_SNAPSHOT) + return; + + switch (info) + { + case XLOG_HEAP2_MULTI_INSERT: + if (SnapBuildProcessChange(builder, xid, buf->origptr)) + DecodeMultiInsert(ctx, buf); + break; + case XLOG_HEAP2_NEW_CID: + { + xl_heap_new_cid *xlrec; + xlrec = (xl_heap_new_cid *) buf->record_data; + SnapBuildProcessNewCid(builder, xid, buf->origptr, xlrec); + + break; + } + case XLOG_HEAP2_REWRITE: + /* + * Although these records only exist to serve the needs of logical + * decoding, all the work happens as part of crash or archive + * recovery, so we don't need to do anything here. + */ + break; + /* + * Everything else here is just low level physical stuff we're + * not interested in. + */ + case XLOG_HEAP2_FREEZE_PAGE: + case XLOG_HEAP2_CLEAN: + case XLOG_HEAP2_CLEANUP_INFO: + case XLOG_HEAP2_VISIBLE: + case XLOG_HEAP2_LOCK_UPDATED: + break; + default: + elog(ERROR, "unexpected RM_HEAP2_ID record type: %u", info); + } +} + +/* + * Handle rmgr HEAP_ID records for DecodeRecordIntoReorderBuffer(). + */ +static void +DecodeHeapOp(LogicalDecodingContext *ctx, XLogRecordBuffer *buf) +{ + uint8 info = buf->record.xl_info & XLOG_HEAP_OPMASK; + TransactionId xid = buf->record.xl_xid; + SnapBuild *builder = ctx->snapshot_builder; + + /* no point in doing anything yet */ + if (SnapBuildCurrentState(builder) < SNAPBUILD_FULL_SNAPSHOT) + return; + + switch (info) + { + case XLOG_HEAP_INSERT: + if (SnapBuildProcessChange(builder, xid, buf->origptr)) + DecodeInsert(ctx, buf); + break; + + /* + * Treat HOT update as normal updates. There is no useful + * information in the fact that we could make it a HOT update + * locally and the WAL layout is compatible. + */ + case XLOG_HEAP_HOT_UPDATE: + case XLOG_HEAP_UPDATE: + if (SnapBuildProcessChange(builder, xid, buf->origptr)) + DecodeUpdate(ctx, buf); + break; + + case XLOG_HEAP_DELETE: + if (SnapBuildProcessChange(builder, xid, buf->origptr)) + DecodeDelete(ctx, buf); + break; + + case XLOG_HEAP_NEWPAGE: + /* + * This is only used in places like indexams and CLUSTER which + * don't contain changes relevant for logical replication. + */ + break; + + case XLOG_HEAP_INPLACE: + /* + * Inplace updates are only ever performed on catalog tuples and + * can, per definition, not change tuple visibility. Since we + * don't decode catalog tuples, we're not interested in the + * record's contents. + * + * In-place updates can be used either by XID-bearing transactions + * (e.g. in CREATE INDEX CONCURRENTLY) or by XID-less + * transactions (e.g. VACUUM). In the former case, the commit + * record will include cache invalidations, so we mark the + * transaction as catalog modifying here. Currently that's + * redundant because the commit will do that as well, but once we + * support decoding in-progress relations, this will be important. + */ + if (!TransactionIdIsValid(xid)) + break; + + SnapBuildProcessChange(builder, xid, buf->origptr); + ReorderBufferXidSetCatalogChanges(ctx->reorder, xid, buf->origptr); + break; + + case XLOG_HEAP_LOCK: + /* we don't care about row level locks for now */ + break; + + default: + elog(ERROR, "unexpected RM_HEAP_ID record type: %u", info); + break; + } +} + +/* + * Consolidated commit record handling between the different form of commit + * records. + */ +static void +DecodeCommit(LogicalDecodingContext *ctx, XLogRecordBuffer *buf, + TransactionId xid, Oid dboid, + TimestampTz commit_time, + int nsubxacts, TransactionId *sub_xids, + int ninval_msgs, SharedInvalidationMessage *msgs) +{ + int i; + + /* + * Process invalidation messages, even if we're not interested in the + * transaction's contents, since the various caches need to always be + * consistent. + */ + if (ninval_msgs > 0) + { + ReorderBufferAddInvalidations(ctx->reorder, xid, buf->origptr, + ninval_msgs, msgs); + ReorderBufferXidSetCatalogChanges(ctx->reorder, xid, buf->origptr); + } + + SnapBuildCommitTxn(ctx->snapshot_builder, buf->origptr, xid, + nsubxacts, sub_xids); + + /* ---- + * Check whether we are interested in this specific transaction, and tell + * the the reorderbuffer to forget the content of the (sub-)transactions + * if not. + * + * There basically two reasons we might not be interested in this + * transaction: + * 1) We might not be interested in decoding transactions up to this + * LSN. This can happen because we previously decoded it and now just + * are restarting or if we haven't assembled a consistent snapshot yet. + * 2) The transaction happened in another database. + * + * We can't just use ReorderBufferAbort() here, because we need to execute + * the transaction's invalidations. This currently won't be needed if + * we're just skipping over the transaction because currently we only do + * so during startup, to get to the first transaction the client needs. As + * we have reset the catalog caches before starting to read WAL, and we + * haven't yet touched any catalogs, there can't be anything to invalidate. + * But if we're "forgetting" this commit because it's it happened in + * another database, the invalidations might be important, because they + * could be for shared catalogs and we might have loaded data into the + * relevant syscaches. + * --- + */ + if (SnapBuildXactNeedsSkip(ctx->snapshot_builder, buf->origptr) || + (dboid != InvalidOid && dboid != ctx->slot->data.database)) + { + for (i = 0; i < nsubxacts; i++) + { + ReorderBufferForget(ctx->reorder, *sub_xids, buf->origptr); + sub_xids++; + } + ReorderBufferForget(ctx->reorder, xid, buf->origptr); + + return; + } + + /* tell the reorderbuffer about the surviving subtransactions */ + for (i = 0; i < nsubxacts; i++) + { + ReorderBufferCommitChild(ctx->reorder, xid, *sub_xids, + buf->origptr, buf->endptr); + sub_xids++; + } + + /* replay actions of all transaction + subtransactions in order */ + ReorderBufferCommit(ctx->reorder, xid, buf->origptr, buf->endptr, + commit_time); +} + +/* + * Get the data from the various forms of abort records and pass it on to + * snapbuild.c and reorderbuffer.c + */ +static void +DecodeAbort(LogicalDecodingContext *ctx, XLogRecPtr lsn, TransactionId xid, + TransactionId *sub_xids, int nsubxacts) +{ + int i; + + SnapBuildAbortTxn(ctx->snapshot_builder, lsn, xid, nsubxacts, sub_xids); + + for (i = 0; i < nsubxacts; i++) + { + ReorderBufferAbort(ctx->reorder, *sub_xids, lsn); + sub_xids++; + } + + ReorderBufferAbort(ctx->reorder, xid, lsn); +} + +/* + * Parse XLOG_HEAP_INSERT (not MULTI_INSERT!) records into tuplebufs. + * + * Deletes can contain the new tuple. + */ +static void +DecodeInsert(LogicalDecodingContext *ctx, XLogRecordBuffer *buf) +{ + XLogRecord *r = &buf->record; + xl_heap_insert *xlrec; + ReorderBufferChange *change; + + xlrec = (xl_heap_insert *) buf->record_data; + + /* only interested in our database */ + if (xlrec->target.node.dbNode != ctx->slot->data.database) + return; + + change = ReorderBufferGetChange(ctx->reorder); + change->action = REORDER_BUFFER_CHANGE_INSERT; + memcpy(&change->tp.relnode, &xlrec->target.node, sizeof(RelFileNode)); + + if (xlrec->flags & XLOG_HEAP_CONTAINS_NEW_TUPLE) + { + Assert(r->xl_len > (SizeOfHeapInsert + SizeOfHeapHeader)); + + change->tp.newtuple = ReorderBufferGetTupleBuf(ctx->reorder); + + DecodeXLogTuple((char *) xlrec + SizeOfHeapInsert, + r->xl_len - SizeOfHeapInsert, + change->tp.newtuple); + } + + ReorderBufferQueueChange(ctx->reorder, r->xl_xid, buf->origptr, change); +} + +/* + * Parse XLOG_HEAP_UPDATE and XLOG_HEAP_HOT_UPDATE, which have the same layout + * in the record, from wal into proper tuplebufs. + * + * Updates can possibly contain a new tuple and the old primary key. + */ +static void +DecodeUpdate(LogicalDecodingContext *ctx, XLogRecordBuffer *buf) +{ + XLogRecord *r = &buf->record; + xl_heap_update *xlrec; + xl_heap_header_len *xlhdr; + ReorderBufferChange *change; + char *data; + + xlrec = (xl_heap_update *) buf->record_data; + xlhdr = (xl_heap_header_len *) (buf->record_data + SizeOfHeapUpdate); + + /* only interested in our database */ + if (xlrec->target.node.dbNode != ctx->slot->data.database) + return; + + change = ReorderBufferGetChange(ctx->reorder); + change->action = REORDER_BUFFER_CHANGE_UPDATE; + memcpy(&change->tp.relnode, &xlrec->target.node, sizeof(RelFileNode)); + + data = (char *) &xlhdr->header; + + if (xlrec->flags & XLOG_HEAP_CONTAINS_NEW_TUPLE) + { + Assert(r->xl_len > (SizeOfHeapUpdate + SizeOfHeapHeaderLen)); + + change->tp.newtuple = ReorderBufferGetTupleBuf(ctx->reorder); + + DecodeXLogTuple(data, + xlhdr->t_len + SizeOfHeapHeader, + change->tp.newtuple); + /* skip over the rest of the tuple header */ + data += SizeOfHeapHeader; + /* skip over the tuple data */ + data += xlhdr->t_len; + } + + if (xlrec->flags & XLOG_HEAP_CONTAINS_OLD) + { + xlhdr = (xl_heap_header_len *) data; + change->tp.oldtuple = ReorderBufferGetTupleBuf(ctx->reorder); + DecodeXLogTuple((char *) &xlhdr->header, + xlhdr->t_len + SizeOfHeapHeader, + change->tp.oldtuple); + data = (char *) &xlhdr->header; + data += SizeOfHeapHeader; + data += xlhdr->t_len; + } + + ReorderBufferQueueChange(ctx->reorder, r->xl_xid, buf->origptr, change); +} + +/* + * Parse XLOG_HEAP_DELETE from wal into proper tuplebufs. + * + * Deletes can possibly contain the old primary key. + */ +static void +DecodeDelete(LogicalDecodingContext *ctx, XLogRecordBuffer *buf) +{ + XLogRecord *r = &buf->record; + xl_heap_delete *xlrec; + ReorderBufferChange *change; + + xlrec = (xl_heap_delete *) buf->record_data; + + /* only interested in our database */ + if (xlrec->target.node.dbNode != ctx->slot->data.database) + return; + + change = ReorderBufferGetChange(ctx->reorder); + change->action = REORDER_BUFFER_CHANGE_DELETE; + + memcpy(&change->tp.relnode, &xlrec->target.node, sizeof(RelFileNode)); + + /* old primary key stored */ + if (xlrec->flags & XLOG_HEAP_CONTAINS_OLD) + { + Assert(r->xl_len > (SizeOfHeapDelete + SizeOfHeapHeader)); + + change->tp.oldtuple = ReorderBufferGetTupleBuf(ctx->reorder); + + DecodeXLogTuple((char *) xlrec + SizeOfHeapDelete, + r->xl_len - SizeOfHeapDelete, + change->tp.oldtuple); + } + ReorderBufferQueueChange(ctx->reorder, r->xl_xid, buf->origptr, change); +} + +/* + * Decode XLOG_HEAP2_MULTI_INSERT_insert record into multiple tuplebufs. + * + * Currently MULTI_INSERT will always contain the full tuples. + */ +static void +DecodeMultiInsert(LogicalDecodingContext *ctx, XLogRecordBuffer *buf) +{ + XLogRecord *r = &buf->record; + xl_heap_multi_insert *xlrec; + int i; + char *data; + bool isinit = (r->xl_info & XLOG_HEAP_INIT_PAGE) != 0; + + xlrec = (xl_heap_multi_insert *) buf->record_data; + + /* only interested in our database */ + if (xlrec->node.dbNode != ctx->slot->data.database) + return; + + data = buf->record_data + SizeOfHeapMultiInsert; + + /* + * OffsetNumbers (which are not of interest to us) are stored when + * XLOG_HEAP_INIT_PAGE is not set -- skip over them. + */ + if (!isinit) + data += sizeof(OffsetNumber) * xlrec->ntuples; + + for (i = 0; i < xlrec->ntuples; i++) + { + ReorderBufferChange *change; + xl_multi_insert_tuple *xlhdr; + int datalen; + ReorderBufferTupleBuf *tuple; + + change = ReorderBufferGetChange(ctx->reorder); + change->action = REORDER_BUFFER_CHANGE_INSERT; + memcpy(&change->tp.relnode, &xlrec->node, sizeof(RelFileNode)); + + /* + * CONTAINS_NEW_TUPLE will always be set currently as multi_insert + * isn't used for catalogs, but better be future proof. + * + * We decode the tuple in pretty much the same way as DecodeXLogTuple, + * but since the layout is slightly different, we can't use it here. + */ + if (xlrec->flags & XLOG_HEAP_CONTAINS_NEW_TUPLE) + { + change->tp.newtuple = ReorderBufferGetTupleBuf(ctx->reorder); + + tuple = change->tp.newtuple; + + /* not a disk based tuple */ + ItemPointerSetInvalid(&tuple->tuple.t_self); + + xlhdr = (xl_multi_insert_tuple *) SHORTALIGN(data); + data = ((char *) xlhdr) + SizeOfMultiInsertTuple; + datalen = xlhdr->datalen; + + /* + * We can only figure this out after reassembling the + * transactions. + */ + tuple->tuple.t_tableOid = InvalidOid; + tuple->tuple.t_data = &tuple->header; + tuple->tuple.t_len = datalen + + offsetof(HeapTupleHeaderData, t_bits); + + memset(&tuple->header, 0, sizeof(HeapTupleHeaderData)); + + memcpy((char *) &tuple->header + + offsetof(HeapTupleHeaderData, t_bits), + (char *) data, + datalen); + data += datalen; + + tuple->header.t_infomask = xlhdr->t_infomask; + tuple->header.t_infomask2 = xlhdr->t_infomask2; + tuple->header.t_hoff = xlhdr->t_hoff; + } + + ReorderBufferQueueChange(ctx->reorder, r->xl_xid, + buf->origptr, change); + } +} + +/* + * Read a HeapTuple as WAL logged by heap_insert, heap_update and heap_delete + * (but not by heap_multi_insert) into a tuplebuf. + * + * The size 'len' and the pointer 'data' in the record need to be + * computed outside as they are record specific. + */ +static void +DecodeXLogTuple(char *data, Size len, ReorderBufferTupleBuf *tuple) +{ + xl_heap_header xlhdr; + int datalen = len - SizeOfHeapHeader; + + Assert(datalen >= 0); + Assert(datalen <= MaxHeapTupleSize); + + tuple->tuple.t_len = datalen + offsetof(HeapTupleHeaderData, t_bits); + + /* not a disk based tuple */ + ItemPointerSetInvalid(&tuple->tuple.t_self); + + /* we can only figure this out after reassembling the transactions */ + tuple->tuple.t_tableOid = InvalidOid; + tuple->tuple.t_data = &tuple->header; + + /* data is not stored aligned, copy to aligned storage */ + memcpy((char *) &xlhdr, + data, + SizeOfHeapHeader); + + memset(&tuple->header, 0, sizeof(HeapTupleHeaderData)); + + memcpy((char *) &tuple->header + offsetof(HeapTupleHeaderData, t_bits), + data + SizeOfHeapHeader, + datalen); + + tuple->header.t_infomask = xlhdr.t_infomask; + tuple->header.t_infomask2 = xlhdr.t_infomask2; + tuple->header.t_hoff = xlhdr.t_hoff; +} diff --git a/src/backend/replication/logical/logical.c b/src/backend/replication/logical/logical.c new file mode 100644 index 0000000000..4fb0974f29 --- /dev/null +++ b/src/backend/replication/logical/logical.c @@ -0,0 +1,920 @@ +/*------------------------------------------------------------------------- + * logical.c + * PostgreSQL logical decoding coordination + * + * Copyright (c) 2012-2014, PostgreSQL Global Development Group + * + * IDENTIFICATION + * src/backend/replication/logical/logical.c + * + * NOTES + * This file coordinates interaction between the various modules that + * together providethe logical decoding, primarily by providing so + * called LogicalDecodingContexts. The goal is to encapsulate most of the + * internal complexity for consumers of logical decoding, so they can + * create and consume a changestream with a low amount of code. + * + * The idea is that a consumer provides three callbacks, one to read WAL, + * one to prepare a data write, and a final one for actually writing since + * their implementation depends on the type of consumer. Check + * logicalfunc.c for an example implementations of a fairly simple consumer + * and a implementation of a WAL reading callback that's suitable for + * simpler consumers. + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include +#include + +#include "miscadmin.h" + +#include "access/xact.h" + +#include "replication/decode.h" +#include "replication/logical.h" +#include "replication/reorderbuffer.h" +#include "replication/snapbuild.h" + +#include "storage/proc.h" +#include "storage/procarray.h" + +#include "utils/memutils.h" + +/* data for errcontext callback */ +typedef struct LogicalErrorCallbackState +{ + LogicalDecodingContext *ctx; + const char *callback_name; + XLogRecPtr report_location; +} LogicalErrorCallbackState; + +/* wrappers around output plugin callbacks */ +static void output_plugin_error_callback(void *arg); +static void startup_cb_wrapper(LogicalDecodingContext *ctx, OutputPluginOptions *opt, + bool is_init); +static void shutdown_cb_wrapper(LogicalDecodingContext *ctx); +static void begin_cb_wrapper(ReorderBuffer *cache, ReorderBufferTXN *txn); +static void commit_cb_wrapper(ReorderBuffer *cache, ReorderBufferTXN *txn, + XLogRecPtr commit_lsn); +static void change_cb_wrapper(ReorderBuffer *cache, ReorderBufferTXN *txn, + Relation relation, ReorderBufferChange *change); + +static void LoadOutputPlugin(OutputPluginCallbacks *callbacks, char *plugin); + +/* + * Make sure the current settings & environment are capable of doing logical + * decoding. + */ +void +CheckLogicalDecodingRequirements(void) +{ + CheckSlotRequirements(); + + if (wal_level < WAL_LEVEL_LOGICAL) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("logical decoding requires wal_level >= logical"))); + + if (MyDatabaseId == InvalidOid) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("logical decoding requires a database connection"))); + + /* ---- + * TODO: We got to change that someday soon... + * + * There's basically three things missing to allow this: + * 1) We need to be able to correctly and quickly identify the timeline a + * LSN belongs to + * 2) We need to force hot_standby_feedback to be enabled at all times so + * the primary cannot remove rows we need. + * 3) support dropping replication slots referring to a database, in + * dbase_redo. There can't be any active ones due to HS recovery + * conflicts, so that should be relatively easy. + * ---- + */ + if (RecoveryInProgress()) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("logical decoding cannot be used while in recovery"))); +} + +/* + * Helper function for CreateInitialDecodingContext() and + * CreateDecodingContext() performing common tasks. + */ +static LogicalDecodingContext * +StartupDecodingContext(List *output_plugin_options, + XLogRecPtr start_lsn, + TransactionId xmin_horizon, + XLogPageReadCB read_page, + LogicalOutputPluginWriterPrepareWrite prepare_write, + LogicalOutputPluginWriterWrite do_write) +{ + ReplicationSlot *slot; + MemoryContext context, old_context; + LogicalDecodingContext *ctx; + + /* shorter lines... */ + slot = MyReplicationSlot; + + context = AllocSetContextCreate(CurrentMemoryContext, + "Changeset Extraction Context", + ALLOCSET_DEFAULT_MINSIZE, + ALLOCSET_DEFAULT_INITSIZE, + ALLOCSET_DEFAULT_MAXSIZE); + old_context = MemoryContextSwitchTo(context); + ctx = palloc0(sizeof(LogicalDecodingContext)); + + ctx->context = context; + + /* (re-)load output plugins, so we detect a bad (removed) output plugin now. */ + LoadOutputPlugin(&ctx->callbacks, NameStr(slot->data.plugin)); + + /* + * Now that the slot's xmin has been set, we can announce ourselves as a + * logical decoding backend which doesn't need to be checked individually + * when computing the xmin horizon because the xmin is enforced via + * replication slots. + */ + LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE); + MyPgXact->vacuumFlags |= PROC_IN_LOGICAL_DECODING; + LWLockRelease(ProcArrayLock); + + ctx->slot = slot; + + ctx->reader = XLogReaderAllocate(read_page, ctx); + ctx->reader->private_data = ctx; + + ctx->reorder = ReorderBufferAllocate(); + ctx->snapshot_builder = + AllocateSnapshotBuilder(ctx->reorder, xmin_horizon, start_lsn); + + ctx->reorder->private_data = ctx; + + /* wrap output plugin callbacks, so we can add error context information */ + ctx->reorder->begin = begin_cb_wrapper; + ctx->reorder->apply_change = change_cb_wrapper; + ctx->reorder->commit = commit_cb_wrapper; + + ctx->out = makeStringInfo(); + ctx->prepare_write = prepare_write; + ctx->write = do_write; + + ctx->output_plugin_options = output_plugin_options; + + MemoryContextSwitchTo(old_context); + + return ctx; +} + +/* + * Create a new decoding context, for a new logical slot. + * + * plugin contains the name of the output plugin + * output_plugin_options contains options passed to the output plugin + * read_page, prepare_write, do_write are callbacks that have to be filled to + * perform the use-case dependent, actual, work. + * + * Needs to be called while in a memory context that's at least as long lived + * as the the decoding context because further memory contexts will be created + * inside it. + * + * Returns an initialized decoding context after calling the output plugin's + * startup function. + */ +LogicalDecodingContext * +CreateInitDecodingContext(char *plugin, + List *output_plugin_options, + XLogPageReadCB read_page, + LogicalOutputPluginWriterPrepareWrite prepare_write, + LogicalOutputPluginWriterWrite do_write) +{ + TransactionId xmin_horizon = InvalidTransactionId; + ReplicationSlot *slot; + LogicalDecodingContext *ctx; + MemoryContext old_context; + + /* shorter lines... */ + slot = MyReplicationSlot; + + /* first some sanity checks that are unlikely to be violated */ + if (slot == NULL) + elog(ERROR, "cannot perform logical decoding without a acquired slot"); + + if (plugin == NULL) + elog(ERROR, "cannot initialize logical decoding without a specified plugin"); + + /* Make sure the passed slot is suitable. These are user facing errors. */ + if (slot->data.database == InvalidOid) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("cannot use physical replication slot created for logical decoding"))); + + if (slot->data.database != MyDatabaseId) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("replication slot \"%s\" was not created in this database", + NameStr(slot->data.name)))); + + if (IsTransactionState() && + GetTopTransactionIdIfAny() != InvalidTransactionId) + ereport(ERROR, + (errcode(ERRCODE_ACTIVE_SQL_TRANSACTION), + errmsg("cannot create logical replication slot in transaction that has performed writes"))); + + /* register output plugin name with slot */ + SpinLockAcquire(&slot->mutex); + strncpy(NameStr(slot->data.plugin), plugin, + NAMEDATALEN); + NameStr(slot->data.plugin)[NAMEDATALEN - 1] = '\0'; + SpinLockRelease(&slot->mutex); + + /* + * The replication slot mechanism is used to prevent removal of required + * WAL. As there is no interlock between this and checkpoints required WAL + * could be removed before ReplicationSlotsComputeRequiredLSN() has been + * called to prevent that. In the very unlikely case that this happens + * we'll just retry. + */ + while (true) + { + XLogSegNo segno; + + /* + * Let's start with enough information if we can, so log a standby + * snapshot and start decoding at exactly that position. + */ + if (!RecoveryInProgress()) + { + XLogRecPtr flushptr; + + /* start at current insert position*/ + slot->data.restart_lsn = GetXLogInsertRecPtr(); + + /* make sure we have enough information to start */ + flushptr = LogStandbySnapshot(); + + /* and make sure it's fsynced to disk */ + XLogFlush(flushptr); + } + else + slot->data.restart_lsn = GetRedoRecPtr(); + + /* prevent WAL removal as fast as possible */ + ReplicationSlotsComputeRequiredLSN(); + + /* + * If all required WAL is still there, great, otherwise retry. The + * slot should prevent further removal of WAL, unless there's a + * concurrent ReplicationSlotsComputeRequiredLSN() after we've written + * the new restart_lsn above, so normally we should never need to loop + * more than twice. + */ + XLByteToSeg(slot->data.restart_lsn, segno); + if (XLogGetLastRemovedSegno() < segno) + break; + } + + + /* ---- + * This is a bit tricky: We need to determine a safe xmin horizon to start + * decoding from, to avoid starting from a running xacts record referring + * to xids whose rows have been vacuumed or pruned + * already. GetOldestSafeDecodingTransactionId() returns such a value, but + * without further interlock it's return value might immediately be out of + * date. + * + * So we have to acquire the ProcArrayLock to prevent computation of new + * xmin horizons by other backends, get the safe decoding xid, and inform + * the slot machinery about the new limit. Once that's done the + * ProcArrayLock can be be released as the slot machinery now is + * protecting against vacuum. + * ---- + */ + LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE); + + slot->effective_catalog_xmin = GetOldestSafeDecodingTransactionId(); + slot->data.catalog_xmin = slot->effective_catalog_xmin; + + ReplicationSlotsComputeRequiredXmin(true); + + LWLockRelease(ProcArrayLock); + + /* + * tell the snapshot builder to only assemble snapshot once reaching + * the a running_xact's record with the respective xmin. + */ + xmin_horizon = slot->data.catalog_xmin; + + ReplicationSlotMarkDirty(); + ReplicationSlotSave(); + + ctx = StartupDecodingContext(NIL, InvalidXLogRecPtr, xmin_horizon, + read_page, prepare_write, do_write); + + /* call output plugin initialization callback */ + old_context = MemoryContextSwitchTo(ctx->context); + if (ctx->callbacks.startup_cb != NULL) + startup_cb_wrapper(ctx, &ctx->options, true); + MemoryContextSwitchTo(old_context); + + return ctx; +} + +/* + * Create a new decoding context, for a logical slot that has previously been + * used already. + * + * start_lsn contains the LSN of the last received data or InvalidXLogRecPtr + * output_plugin_options contains options passed to the output plugin + * read_page, prepare_write, do_write are callbacks that have to be filled to + * perform the use-case dependent, actual, work. + * + * Needs to be called while in a memory context that's at least as long lived + * as the the decoding context because further memory contexts will be created + * inside it. + * + * Returns an initialized decoding context after calling the output plugin's + * startup function. + */ +LogicalDecodingContext * +CreateDecodingContext(XLogRecPtr start_lsn, + List *output_plugin_options, + XLogPageReadCB read_page, + LogicalOutputPluginWriterPrepareWrite prepare_write, + LogicalOutputPluginWriterWrite do_write) +{ + LogicalDecodingContext *ctx; + ReplicationSlot *slot; + MemoryContext old_context; + + /* shorter lines... */ + slot = MyReplicationSlot; + + /* first some sanity checks that are unlikely to be violated */ + if (slot == NULL) + elog(ERROR, "cannot perform logical decoding without a acquired slot"); + + /* make sure the passed slot is suitable, these are user facing errors */ + if (slot->data.database == InvalidOid) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + (errmsg("cannot use physical replication slot for logical decoding")))); + + if (slot->data.database != MyDatabaseId) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + (errmsg("replication slot \"%s\" was not created in this database", + NameStr(slot->data.name))))); + + if (start_lsn == InvalidXLogRecPtr) + { + /* continue from last position */ + start_lsn = slot->data.confirmed_flush; + } + else if (start_lsn < slot->data.confirmed_flush) + { + /* + * It might seem like we should error out in this case, but it's + * pretty common for a client to acknowledge a LSN it doesn't have to + * do anything for, and thus didn't store persistently, because the + * xlog records didn't result in anything relevant for logical + * decoding. Clients have to be able to do that to support + * synchronous replication. + */ + start_lsn = slot->data.confirmed_flush; + elog(DEBUG1, "cannot stream from %X/%X, minimum is %X/%X, forwarding", + (uint32)(start_lsn >> 32), (uint32)start_lsn, + (uint32)(slot->data.confirmed_flush >> 32), + (uint32)slot->data.confirmed_flush); + } + + ctx = StartupDecodingContext(output_plugin_options, + start_lsn, InvalidTransactionId, + read_page, prepare_write, do_write); + + /* call output plugin initialization callback */ + old_context = MemoryContextSwitchTo(ctx->context); + if (ctx->callbacks.startup_cb != NULL) + startup_cb_wrapper(ctx, &ctx->options, true); + MemoryContextSwitchTo(old_context); + + ereport(LOG, + (errmsg("starting logical decoding for slot %s", + NameStr(slot->data.name)), + errdetail("streaming transactions committing after %X/%X, reading WAL from %X/%X", + (uint32)(slot->data.confirmed_flush >> 32), + (uint32)slot->data.confirmed_flush, + (uint32)(slot->data.restart_lsn >> 32), + (uint32)slot->data.restart_lsn))); + + return ctx; +} + +/* + * Returns true if an consistent initial decoding snapshot has been built. + */ +bool +DecodingContextReady(LogicalDecodingContext *ctx) +{ + return SnapBuildCurrentState(ctx->snapshot_builder) == SNAPBUILD_CONSISTENT; +} + +/* + * Read from the decoding slot, until it is ready to start extracting changes. + */ +void +DecodingContextFindStartpoint(LogicalDecodingContext *ctx) +{ + XLogRecPtr startptr; + + /* Initialize from where to start reading WAL. */ + startptr = ctx->slot->data.restart_lsn; + + elog(DEBUG1, "searching for logical decoding starting point, starting at %X/%X", + (uint32)(ctx->slot->data.restart_lsn >> 32), + (uint32)ctx->slot->data.restart_lsn); + + /* Wait for a consistent starting point */ + for (;;) + { + XLogRecord *record; + char *err = NULL; + + /* + * If the caller requires that interrupts be checked, the read_page + * callback should do so, as those will often wait. + */ + + /* the read_page callback waits for new WAL */ + record = XLogReadRecord(ctx->reader, startptr, &err); + if (err) + elog(ERROR, "%s", err); + + Assert(record); + + startptr = InvalidXLogRecPtr; + + LogicalDecodingProcessRecord(ctx, record); + + /* only continue till we found a consistent spot */ + if (DecodingContextReady(ctx)) + break; + } + + ctx->slot->data.confirmed_flush = ctx->reader->EndRecPtr; +} + +/* + * Free a previously allocated decoding context, invoking the shutdown + * callback if necessary. + */ +void +FreeDecodingContext(LogicalDecodingContext *ctx) +{ + if (ctx->callbacks.shutdown_cb != NULL) + shutdown_cb_wrapper(ctx); + + ReorderBufferFree(ctx->reorder); + FreeSnapshotBuilder(ctx->snapshot_builder); + XLogReaderFree(ctx->reader); + MemoryContextDelete(ctx->context); +} + +/* + * Prepare a write using the context's output routine. + */ +void +OutputPluginPrepareWrite(struct LogicalDecodingContext *ctx, bool last_write) +{ + if (!ctx->accept_writes) + elog(ERROR, "writes are only accepted in commit, begin and change callbacks"); + + ctx->prepare_write(ctx, ctx->write_location, ctx->write_xid, last_write); + ctx->prepared_write = true; +} + +/* + * Perform a write using the context's output routine. + */ +void +OutputPluginWrite(struct LogicalDecodingContext *ctx, bool last_write) +{ + if (!ctx->prepared_write) + elog(ERROR, "OutputPluginPrepareWrite needs to be called before OutputPluginWrite"); + + ctx->write(ctx, ctx->write_location, ctx->write_xid, last_write); + ctx->prepared_write = false; +} + +/* + * Load the output plugin, lookup its output plugin init function, and check + * that it provides the required callbacks. + */ +static void +LoadOutputPlugin(OutputPluginCallbacks *callbacks, char *plugin) +{ + LogicalOutputPluginInit plugin_init; + + plugin_init = (LogicalOutputPluginInit) + load_external_function(plugin, "_PG_output_plugin_init", false, NULL); + + if (plugin_init == NULL) + elog(ERROR, "output plugins have to declare the _PG_output_plugin_init symbol"); + + /* ask the output plugin to fill the callback struct */ + plugin_init(callbacks); + + if (callbacks->begin_cb == NULL) + elog(ERROR, "output plugins have to register a begin callback"); + if (callbacks->change_cb == NULL) + elog(ERROR, "output plugins have to register a change callback"); + if (callbacks->commit_cb == NULL) + elog(ERROR, "output plugins have to register a commit callback"); +} + +static void +output_plugin_error_callback(void *arg) +{ + LogicalErrorCallbackState *state = (LogicalErrorCallbackState *) arg; + /* not all callbacks have an associated LSN */ + if (state->report_location != InvalidXLogRecPtr) + errcontext("slot \"%s\", output plugin \"%s\", in the %s callback, associated LSN %X/%X", + NameStr(state->ctx->slot->data.name), + NameStr(state->ctx->slot->data.plugin), + state->callback_name, + (uint32)(state->report_location >> 32), + (uint32)state->report_location); + else + errcontext("slot \"%s\", output plugin \"%s\", in the %s callback", + NameStr(state->ctx->slot->data.name), + NameStr(state->ctx->slot->data.plugin), + state->callback_name); +} + +static void +startup_cb_wrapper(LogicalDecodingContext *ctx, OutputPluginOptions *opt, bool is_init) +{ + LogicalErrorCallbackState state; + ErrorContextCallback errcallback; + + /* Push callback + info on the error context stack */ + state.ctx = ctx; + state.callback_name = "startup"; + state.report_location = InvalidXLogRecPtr; + errcallback.callback = output_plugin_error_callback; + errcallback.arg = (void *) &state; + errcallback.previous = error_context_stack; + error_context_stack = &errcallback; + + /* set output state */ + ctx->accept_writes = false; + + /* do the actual work: call callback */ + ctx->callbacks.startup_cb(ctx, opt, is_init); + + /* Pop the error context stack */ + error_context_stack = errcallback.previous; +} + +static void +shutdown_cb_wrapper(LogicalDecodingContext *ctx) +{ + LogicalErrorCallbackState state; + ErrorContextCallback errcallback; + + /* Push callback + info on the error context stack */ + state.ctx = ctx; + state.callback_name = "shutdown"; + state.report_location = InvalidXLogRecPtr; + errcallback.callback = output_plugin_error_callback; + errcallback.arg = (void *) &state; + errcallback.previous = error_context_stack; + error_context_stack = &errcallback; + + /* set output state */ + ctx->accept_writes = false; + + /* do the actual work: call callback */ + ctx->callbacks.shutdown_cb(ctx); + + /* Pop the error context stack */ + error_context_stack = errcallback.previous; +} + + +/* + * Callbacks for ReorderBuffer which add in some more information and then call + * output_plugin.h plugins. + */ +static void +begin_cb_wrapper(ReorderBuffer *cache, ReorderBufferTXN *txn) +{ + LogicalDecodingContext *ctx = cache->private_data; + LogicalErrorCallbackState state; + ErrorContextCallback errcallback; + + /* Push callback + info on the error context stack */ + state.ctx = ctx; + state.callback_name = "begin"; + state.report_location = txn->first_lsn; + errcallback.callback = output_plugin_error_callback; + errcallback.arg = (void *) &state; + errcallback.previous = error_context_stack; + error_context_stack = &errcallback; + + /* set output state */ + ctx->accept_writes = true; + ctx->write_xid = txn->xid; + ctx->write_location = txn->first_lsn; + + /* do the actual work: call callback */ + ctx->callbacks.begin_cb(ctx, txn); + + /* Pop the error context stack */ + error_context_stack = errcallback.previous; +} + +static void +commit_cb_wrapper(ReorderBuffer *cache, ReorderBufferTXN *txn, + XLogRecPtr commit_lsn) +{ + LogicalDecodingContext *ctx = cache->private_data; + LogicalErrorCallbackState state; + ErrorContextCallback errcallback; + + /* Push callback + info on the error context stack */ + state.ctx = ctx; + state.callback_name = "commit"; + state.report_location = txn->final_lsn; /* beginning of commit record */ + errcallback.callback = output_plugin_error_callback; + errcallback.arg = (void *) &state; + errcallback.previous = error_context_stack; + error_context_stack = &errcallback; + + /* set output state */ + ctx->accept_writes = true; + ctx->write_xid = txn->xid; + ctx->write_location = txn->end_lsn; /* points to the end of the record */ + + /* do the actual work: call callback */ + ctx->callbacks.commit_cb(ctx, txn, commit_lsn); + + /* Pop the error context stack */ + error_context_stack = errcallback.previous; +} + +static void +change_cb_wrapper(ReorderBuffer *cache, ReorderBufferTXN *txn, + Relation relation, ReorderBufferChange *change) +{ + LogicalDecodingContext *ctx = cache->private_data; + LogicalErrorCallbackState state; + ErrorContextCallback errcallback; + + /* Push callback + info on the error context stack */ + state.ctx = ctx; + state.callback_name = "change"; + state.report_location = change->lsn; + errcallback.callback = output_plugin_error_callback; + errcallback.arg = (void *) &state; + errcallback.previous = error_context_stack; + error_context_stack = &errcallback; + + /* set output state */ + ctx->accept_writes = true; + ctx->write_xid = txn->xid; + /* + * report this change's lsn so replies from clients can give an up2date + * answer. This won't ever be enough (and shouldn't be!) to confirm + * receipt of this transaction, but it might allow another transaction's + * commit to be confirmed with one message. + */ + ctx->write_location = change->lsn; + + ctx->callbacks.change_cb(ctx, txn, relation, change); + + /* Pop the error context stack */ + error_context_stack = errcallback.previous; +} + +/* + * Set the required catalog xmin horizon for historic snapshots in the current + * replication slot. + * + * Note that in the most cases, we won't be able to immediately use the xmin + * to increase the xmin horizon, we need to wait till the client has confirmed + * receiving current_lsn with LogicalConfirmReceivedLocation(). + */ +void +LogicalIncreaseXminForSlot(XLogRecPtr current_lsn, TransactionId xmin) +{ + bool updated_xmin = false; + ReplicationSlot *slot; + + slot = MyReplicationSlot; + + Assert(slot != NULL); + + SpinLockAcquire(&slot->mutex); + + /* + * don't overwrite if we already have a newer xmin. This can + * happen if we restart decoding in a slot. + */ + if (TransactionIdPrecedesOrEquals(xmin, slot->data.catalog_xmin)) + { + } + /* + * If the client has already confirmed up to this lsn, we directly + * can mark this as accepted. This can happen if we restart + * decoding in a slot. + */ + else if (current_lsn <= slot->data.confirmed_flush) + { + slot->candidate_catalog_xmin = xmin; + slot->candidate_xmin_lsn = current_lsn; + + /* our candidate can directly be used */ + updated_xmin = true; + } + /* + * Only increase if the previous values have been applied, otherwise we + * might never end up updating if the receiver acks too slowly. + */ + else if (slot->candidate_xmin_lsn == InvalidXLogRecPtr) + { + slot->candidate_catalog_xmin = xmin; + slot->candidate_xmin_lsn = current_lsn; + } + SpinLockRelease(&slot->mutex); + + /* candidate already valid with the current flush position, apply */ + if (updated_xmin) + LogicalConfirmReceivedLocation(slot->data.confirmed_flush); +} + +/* + * Mark the minimal LSN (restart_lsn) we need to read to replay all + * transactions that have not yet committed at current_lsn. + * + * Just like IncreaseRestartDecodingForSlot this nly takes effect when the + * client has confirmed to have received current_lsn. + */ +void +LogicalIncreaseRestartDecodingForSlot(XLogRecPtr current_lsn, XLogRecPtr restart_lsn) +{ + bool updated_lsn = false; + ReplicationSlot *slot; + + slot = MyReplicationSlot; + + Assert(slot != NULL); + Assert(restart_lsn != InvalidXLogRecPtr); + Assert(current_lsn != InvalidXLogRecPtr); + + SpinLockAcquire(&slot->mutex); + + /* don't overwrite if have a newer restart lsn*/ + if (restart_lsn <= slot->data.restart_lsn) + { + } + /* + * We might have already flushed far enough to directly accept this lsn, in + * this case there is no need to check for existing candidate LSNs + */ + else if (current_lsn <= slot->data.confirmed_flush) + { + slot->candidate_restart_valid = current_lsn; + slot->candidate_restart_lsn = restart_lsn; + + /* our candidate can directly be used */ + updated_lsn = true; + } + /* + * Only increase if the previous values have been applied, otherwise we + * might never end up updating if the receiver acks too slowly. A missed + * value here will just cause some extra effort after reconnecting. + */ + if (slot->candidate_restart_valid == InvalidXLogRecPtr) + { + slot->candidate_restart_valid = current_lsn; + slot->candidate_restart_lsn = restart_lsn; + + elog(DEBUG1, "got new restart lsn %X/%X at %X/%X", + (uint32) (restart_lsn >> 32), (uint32) restart_lsn, + (uint32) (current_lsn >> 32), (uint32) current_lsn); + } + else + { + elog(DEBUG1, "failed to increase restart lsn: proposed %X/%X, after %X/%X, current candidate %X/%X, current after %X/%X, flushed up to %X/%X", + (uint32) (restart_lsn >> 32), (uint32) restart_lsn, + (uint32) (current_lsn >> 32), (uint32) current_lsn, + (uint32) (slot->candidate_restart_lsn >> 32), + (uint32) slot->candidate_restart_lsn, + (uint32) (slot->candidate_restart_valid >> 32), + (uint32) slot->candidate_restart_valid, + (uint32) (slot->data.confirmed_flush >> 32), + (uint32) slot->data.confirmed_flush + ); + } + SpinLockRelease(&slot->mutex); + + /* candidates are already valid with the current flush position, apply */ + if (updated_lsn) + LogicalConfirmReceivedLocation(slot->data.confirmed_flush); +} + +/* + * Handle a consumer's conformation having received all changes up to lsn. + */ +void +LogicalConfirmReceivedLocation(XLogRecPtr lsn) +{ + Assert(lsn != InvalidXLogRecPtr); + + /* Do an unlocked check for candidate_lsn first. */ + if (MyReplicationSlot->candidate_xmin_lsn != InvalidXLogRecPtr || + MyReplicationSlot->candidate_restart_valid != InvalidXLogRecPtr) + { + bool updated_xmin = false; + bool updated_restart = false; + + /* use volatile pointer to prevent code rearrangement */ + volatile ReplicationSlot *slot = MyReplicationSlot; + + SpinLockAcquire(&slot->mutex); + + slot->data.confirmed_flush = lsn; + + /* if were past the location required for bumping xmin, do so */ + if (slot->candidate_xmin_lsn != InvalidXLogRecPtr && + slot->candidate_xmin_lsn <= lsn) + { + /* + * We have to write the changed xmin to disk *before* we change + * the in-memory value, otherwise after a crash we wouldn't know + * that some catalog tuples might have been removed already. + * + * Ensure that by first writing to ->xmin and only update + * ->effective_xmin once the new state is synced to disk. After a + * crash ->effective_xmin is set to ->xmin. + */ + if (TransactionIdIsValid(slot->candidate_catalog_xmin) && + slot->data.catalog_xmin != slot->candidate_catalog_xmin) + { + slot->data.catalog_xmin = slot->candidate_catalog_xmin; + slot->candidate_catalog_xmin = InvalidTransactionId; + slot->candidate_xmin_lsn = InvalidXLogRecPtr; + updated_xmin = true; + } + } + + if (slot->candidate_restart_valid != InvalidXLogRecPtr && + slot->candidate_restart_valid <= lsn) + { + Assert(slot->candidate_restart_lsn != InvalidXLogRecPtr); + + slot->data.restart_lsn = slot->candidate_restart_lsn; + slot->candidate_restart_lsn = InvalidXLogRecPtr; + slot->candidate_restart_valid = InvalidXLogRecPtr; + updated_restart = true; + } + + SpinLockRelease(&slot->mutex); + + /* first write new xmin to disk, so we know whats up after a crash */ + if (updated_xmin || updated_restart) + { + ReplicationSlotMarkDirty(); + ReplicationSlotSave(); + elog(DEBUG1, "updated xmin: %u restart: %u", updated_xmin, updated_restart); + } + /* + * Now the new xmin is safely on disk, we can let the global value + * advance. We do not take ProcArrayLock or similar since we only + * advance xmin here and there's not much harm done by a concurrent + * computation missing that. + */ + if (updated_xmin) + { + SpinLockAcquire(&slot->mutex); + slot->effective_catalog_xmin = slot->data.catalog_xmin; + SpinLockRelease(&slot->mutex); + + ReplicationSlotsComputeRequiredXmin(false); + ReplicationSlotsComputeRequiredLSN(); + } + } + else + { + volatile ReplicationSlot *slot = MyReplicationSlot; + + SpinLockAcquire(&slot->mutex); + slot->data.confirmed_flush = lsn; + SpinLockRelease(&slot->mutex); + } +} diff --git a/src/backend/replication/logical/logicalfuncs.c b/src/backend/replication/logical/logicalfuncs.c new file mode 100644 index 0000000000..3b8ae3853b --- /dev/null +++ b/src/backend/replication/logical/logicalfuncs.c @@ -0,0 +1,509 @@ +/*------------------------------------------------------------------------- + * + * logicalfuncs.c + * + * Support functions for using logical decoding and managemnt of + * logical replication slots via SQL. + * + * + * Copyright (c) 2012-2014, PostgreSQL Global Development Group + * + * IDENTIFICATION + * src/backend/replication/logicalfuncs.c + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include + +#include "fmgr.h" +#include "funcapi.h" +#include "miscadmin.h" + +#include "catalog/pg_type.h" + +#include "nodes/makefuncs.h" + +#include "mb/pg_wchar.h" + +#include "utils/array.h" +#include "utils/builtins.h" +#include "utils/inval.h" +#include "utils/memutils.h" +#include "utils/pg_lsn.h" +#include "utils/resowner.h" +#include "utils/lsyscache.h" + +#include "replication/decode.h" +#include "replication/logical.h" +#include "replication/logicalfuncs.h" + +#include "storage/fd.h" + +/* private date for writing out data */ +typedef struct DecodingOutputState { + Tuplestorestate *tupstore; + TupleDesc tupdesc; + bool binary_output; + int64 returned_rows; +} DecodingOutputState; + +/* + * Prepare for a output plugin write. + */ +static void +LogicalOutputPrepareWrite(LogicalDecodingContext *ctx, XLogRecPtr lsn, TransactionId xid, + bool last_write) +{ + resetStringInfo(ctx->out); +} + +/* + * Perform output plugin write into tuplestore. + */ +static void +LogicalOutputWrite(LogicalDecodingContext *ctx, XLogRecPtr lsn, TransactionId xid, + bool last_write) +{ + Datum values[3]; + bool nulls[3]; + DecodingOutputState *p; + + /* SQL Datums can only be of a limited length... */ + if (ctx->out->len > MaxAllocSize - VARHDRSZ) + elog(ERROR, "too much output for sql interface"); + + p = (DecodingOutputState *) ctx->output_writer_private; + + memset(nulls, 0, sizeof(nulls)); + values[0] = LSNGetDatum(lsn); + values[1] = TransactionIdGetDatum(xid); + + /* + * Assert ctx->out is in database encoding when we're writing textual + * output. + */ + if (!p->binary_output) + Assert(pg_verify_mbstr(GetDatabaseEncoding(), + ctx->out->data, ctx->out->len, + false)); + + /* ick, but cstring_to_text_with_len works for bytea perfectly fine */ + values[2] = PointerGetDatum( + cstring_to_text_with_len(ctx->out->data, ctx->out->len)); + + tuplestore_putvalues(p->tupstore, p->tupdesc, values, nulls); + p->returned_rows++; +} + +/* + * TODO: This is duplicate code with pg_xlogdump, similar to walsender.c, but + * we currently don't have the infrastructure (elog!) to share it. + */ +static void +XLogRead(char *buf, TimeLineID tli, XLogRecPtr startptr, Size count) +{ + char *p; + XLogRecPtr recptr; + Size nbytes; + + static int sendFile = -1; + static XLogSegNo sendSegNo = 0; + static uint32 sendOff = 0; + + p = buf; + recptr = startptr; + nbytes = count; + + while (nbytes > 0) + { + uint32 startoff; + int segbytes; + int readbytes; + + startoff = recptr % XLogSegSize; + + if (sendFile < 0 || !XLByteInSeg(recptr, sendSegNo)) + { + char path[MAXPGPATH]; + + /* Switch to another logfile segment */ + if (sendFile >= 0) + close(sendFile); + + XLByteToSeg(recptr, sendSegNo); + + XLogFilePath(path, tli, sendSegNo); + + sendFile = BasicOpenFile(path, O_RDONLY | PG_BINARY, 0); + + if (sendFile < 0) + { + if (errno == ENOENT) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("requested WAL segment %s has already been removed", + path))); + else + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not open file \"%s\": %m", + path))); + } + sendOff = 0; + } + + /* Need to seek in the file? */ + if (sendOff != startoff) + { + if (lseek(sendFile, (off_t) startoff, SEEK_SET) < 0) + { + char path[MAXPGPATH]; + + XLogFilePath(path, tli, sendSegNo); + + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not seek in log segment %s to offset %u: %m", + path, startoff))); + } + sendOff = startoff; + } + + /* How many bytes are within this segment? */ + if (nbytes > (XLogSegSize - startoff)) + segbytes = XLogSegSize - startoff; + else + segbytes = nbytes; + + readbytes = read(sendFile, p, segbytes); + if (readbytes <= 0) + { + char path[MAXPGPATH]; + + XLogFilePath(path, tli, sendSegNo); + + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not read from log segment %s, offset %u, length %lu: %m", + path, sendOff, (unsigned long) segbytes))); + } + + /* Update state for read */ + recptr += readbytes; + + sendOff += readbytes; + nbytes -= readbytes; + p += readbytes; + } +} + +static void +check_permissions(void) +{ + if (!superuser() && !has_rolreplication(GetUserId())) + ereport(ERROR, + (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), + (errmsg("must be superuser or replication role to use replication slots")))); +} + +/* + * read_page callback for logical decoding contexts. + * + * Public because it would likely be very helpful for someone writing another + * output method outside walsender, e.g. in a bgworker. + * + * TODO: The walsender has it's own version of this, but it relies on the + * walsender's latch being set whenever WAL is flushed. No such infrastructure + * exists for normal backends, so we have to do a check/sleep/repeat style of + * loop for now. + */ +int +logical_read_local_xlog_page(XLogReaderState *state, XLogRecPtr targetPagePtr, + int reqLen, XLogRecPtr targetRecPtr, char *cur_page, TimeLineID *pageTLI) +{ + XLogRecPtr flushptr, + loc; + int count; + + loc = targetPagePtr + reqLen; + while (1) + { + /* + * TODO: we're going to have to do something more intelligent about + * timelines on standbys. Use readTimeLineHistory() and + * tliOfPointInHistory() to get the proper LSN? For now we'll catch + * that case earlier, but the code and TODO is left in here for when + * that changes. + */ + if (!RecoveryInProgress()) + { + *pageTLI = ThisTimeLineID; + flushptr = GetFlushRecPtr(); + } + else + flushptr = GetXLogReplayRecPtr(pageTLI); + + if (loc <= flushptr) + break; + + CHECK_FOR_INTERRUPTS(); + pg_usleep(1000L); + } + + /* more than one block available */ + if (targetPagePtr + XLOG_BLCKSZ <= flushptr) + count = XLOG_BLCKSZ; + /* not enough data there */ + else if (targetPagePtr + reqLen > flushptr) + return -1; + /* part of the page available */ + else + count = flushptr - targetPagePtr; + + XLogRead(cur_page, *pageTLI, targetPagePtr, XLOG_BLCKSZ); + + return count; +} + +/* + * Helper function for the various SQL callable logical decoding functions. + */ +static Datum +pg_logical_slot_get_changes_guts(FunctionCallInfo fcinfo, bool confirm, bool binary) +{ + Name name = PG_GETARG_NAME(0); + XLogRecPtr upto_lsn; + int32 upto_nchanges; + + ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo; + MemoryContext per_query_ctx; + MemoryContext oldcontext; + + XLogRecPtr end_of_wal; + XLogRecPtr startptr; + + LogicalDecodingContext *ctx; + + ResourceOwner old_resowner = CurrentResourceOwner; + ArrayType *arr; + Size ndim; + List *options = NIL; + DecodingOutputState *p; + + if (PG_ARGISNULL(1)) + upto_lsn = InvalidXLogRecPtr; + else + upto_lsn = PG_GETARG_LSN(1); + + if (PG_ARGISNULL(2)) + upto_nchanges = InvalidXLogRecPtr; + else + upto_nchanges = PG_GETARG_INT32(2); + + /* check to see if caller supports us returning a tuplestore */ + if (rsinfo == NULL || !IsA(rsinfo, ReturnSetInfo)) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("set-valued function called in context that cannot accept a set"))); + if (!(rsinfo->allowedModes & SFRM_Materialize)) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("materialize mode required, but it is not allowed in this context"))); + + /* state to write output to */ + p = palloc0(sizeof(DecodingOutputState)); + + p->binary_output = binary; + + /* Build a tuple descriptor for our result type */ + if (get_call_result_type(fcinfo, NULL, &p->tupdesc) != TYPEFUNC_COMPOSITE) + elog(ERROR, "return type must be a row type"); + + check_permissions(); + + CheckLogicalDecodingRequirements(); + + arr = PG_GETARG_ARRAYTYPE_P(3); + ndim = ARR_NDIM(arr); + + per_query_ctx = rsinfo->econtext->ecxt_per_query_memory; + oldcontext = MemoryContextSwitchTo(per_query_ctx); + + if (ndim > 1) + { + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("array must be one-dimensional"))); + } + else if (array_contains_nulls(arr)) + { + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("array must not contain nulls"))); + } + else if (ndim == 1) + { + int nelems; + Datum *datum_opts; + int i; + + Assert(ARR_ELEMTYPE(arr) == TEXTOID); + + deconstruct_array(arr, TEXTOID, -1, false, 'i', + &datum_opts, NULL, &nelems); + + if (nelems % 2 != 0) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("array must have even number of elements"))); + + for (i = 0; i < nelems; i += 2) + { + char *name = TextDatumGetCString(datum_opts[i]); + char *opt = TextDatumGetCString(datum_opts[i + 1]); + + options = lappend(options, makeDefElem(name, (Node *) makeString(opt))); + } + } + + p->tupstore = tuplestore_begin_heap(true, false, work_mem); + rsinfo->returnMode = SFRM_Materialize; + rsinfo->setResult = p->tupstore; + rsinfo->setDesc = p->tupdesc; + + /* compute the current end-of-wal */ + if (!RecoveryInProgress()) + end_of_wal = GetFlushRecPtr(); + else + end_of_wal = GetXLogReplayRecPtr(NULL); + + CheckLogicalDecodingRequirements(); + ReplicationSlotAcquire(NameStr(*name)); + + PG_TRY(); + { + ctx = CreateDecodingContext(InvalidXLogRecPtr, + options, + logical_read_local_xlog_page, + LogicalOutputPrepareWrite, + LogicalOutputWrite); + + MemoryContextSwitchTo(oldcontext); + + /* + * Check whether the output pluggin writes textual output if that's + * what we need. + */ + if (!binary && + ctx->options.output_type != OUTPUT_PLUGIN_TEXTUAL_OUTPUT) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("output plugin cannot produce text output"))); + + ctx->output_writer_private = p; + + startptr = MyReplicationSlot->data.restart_lsn; + + CurrentResourceOwner = ResourceOwnerCreate(CurrentResourceOwner, "logical decoding"); + + /* invalidate non-timetravel entries */ + InvalidateSystemCaches(); + + while ((startptr != InvalidXLogRecPtr && startptr < end_of_wal) || + (ctx->reader->EndRecPtr && ctx->reader->EndRecPtr < end_of_wal)) + { + XLogRecord *record; + char *errm = NULL; + + record = XLogReadRecord(ctx->reader, startptr, &errm); + if (errm) + elog(ERROR, "%s", errm); + + startptr = InvalidXLogRecPtr; + + /* + * The {begin_txn,change,commit_txn}_wrapper callbacks above will + * store the description into our tuplestore. + */ + if (record != NULL) + LogicalDecodingProcessRecord(ctx, record); + + /* check limits */ + if (upto_lsn != InvalidXLogRecPtr && + upto_lsn <= ctx->reader->EndRecPtr) + break; + if (upto_nchanges != 0 && + upto_nchanges <= p->returned_rows) + break; + } + } + PG_CATCH(); + { + /* clear all timetravel entries */ + InvalidateSystemCaches(); + + PG_RE_THROW(); + } + PG_END_TRY(); + + tuplestore_donestoring(tupstore); + + CurrentResourceOwner = old_resowner; + + /* + * Next time, start where we left off. (Hunting things, the family + * business..) + */ + if (ctx->reader->EndRecPtr != InvalidXLogRecPtr && confirm) + LogicalConfirmReceivedLocation(ctx->reader->EndRecPtr); + + /* free context, call shutdown callback */ + FreeDecodingContext(ctx); + + ReplicationSlotRelease(); + InvalidateSystemCaches(); + + return (Datum) 0; +} + +/* + * SQL function returning the changestream as text, consuming the data. + */ +Datum +pg_logical_slot_get_changes(PG_FUNCTION_ARGS) +{ + Datum ret = pg_logical_slot_get_changes_guts(fcinfo, true, false); + return ret; +} + +/* + * SQL function returning the changestream as text, only peeking ahead. + */ +Datum +pg_logical_slot_peek_changes(PG_FUNCTION_ARGS) +{ + Datum ret = pg_logical_slot_get_changes_guts(fcinfo, false, false); + return ret; +} + +/* + * SQL function returning the changestream in binary, consuming the data. + */ +Datum +pg_logical_slot_get_binary_changes(PG_FUNCTION_ARGS) +{ + Datum ret = pg_logical_slot_get_changes_guts(fcinfo, true, true); + return ret; +} + +/* + * SQL function returning the changestream in binary, only peeking ahead. + */ +Datum +pg_logical_slot_peek_binary_changes(PG_FUNCTION_ARGS) +{ + Datum ret = pg_logical_slot_get_changes_guts(fcinfo, false, true); + return ret; +} diff --git a/src/backend/replication/logical/reorderbuffer.c b/src/backend/replication/logical/reorderbuffer.c new file mode 100644 index 0000000000..e7182338b8 --- /dev/null +++ b/src/backend/replication/logical/reorderbuffer.c @@ -0,0 +1,3059 @@ +/*------------------------------------------------------------------------- + * + * reorderbuffer.c + * PostgreSQL logical replay/reorder buffer management + * + * + * Copyright (c) 2012-2014, PostgreSQL Global Development Group + * + * + * IDENTIFICATION + * src/backend/replication/reorderbuffer.c + * + * NOTES + * This module gets handed individual pieces of transactions in the order + * they are written to the WAL and is responsible to reassemble them into + * toplevel transaction sized pieces. When a transaction is completely + * reassembled - signalled by reading the transaction commit record - it + * will then call the output plugin (c.f. ReorderBufferCommit()) with the + * individual changes. The output plugins rely on snapshots built by + * snapbuild.c which hands them to us. + * + * Transactions and subtransactions/savepoints in postgres are not + * immediately linked to each other from outside the performing + * backend. Only at commit/abort (or special xact_assignment records) they + * are linked together. Which means that we will have to splice together a + * toplevel transaction from its subtransactions. To do that efficiently we + * build a binary heap indexed by the smallest current lsn of the individual + * subtransactions' changestreams. As the individual streams are inherently + * ordered by LSN - since that is where we build them from - the transaction + * can easily be reassembled by always using the subtransaction with the + * smallest current LSN from the heap. + * + * In order to cope with large transactions - which can be several times as + * big as the available memory - this module supports spooling the contents + * of a large transactions to disk. When the transaction is replayed the + * contents of individual (sub-)transactions will be read from disk in + * chunks. + * + * This module also has to deal with reassembling toast records from the + * individual chunks stored in WAL. When a new (or initial) version of a + * tuple is stored in WAL it will always be preceded by the toast chunks + * emitted for the columns stored out of line. Within a single toplevel + * transaction there will be no other data carrying records between a row's + * toast chunks and the row data itself. See ReorderBufferToast* for + * details. + * ------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include +#include + +#include "miscadmin.h" + +#include "access/rewriteheap.h" +#include "access/transam.h" +#include "access/tuptoaster.h" +#include "access/xact.h" + +#include "catalog/catalog.h" + +#include "common/relpath.h" + +#include "lib/binaryheap.h" + +#include "replication/logical.h" +#include "replication/reorderbuffer.h" +#include "replication/slot.h" +#include "replication/snapbuild.h" /* just for SnapBuildSnapDecRefcount */ + +#include "storage/bufmgr.h" +#include "storage/fd.h" +#include "storage/sinval.h" + +#include "utils/builtins.h" +#include "utils/combocid.h" +#include "utils/memdebug.h" +#include "utils/memutils.h" +#include "utils/relcache.h" +#include "utils/relfilenodemap.h" +#include "utils/tqual.h" + +/* + * For efficiency and simplicity reasons we want to keep Snapshots, CommandIds + * and ComboCids in the same list with the user visible INSERT/UPDATE/DELETE + * changes. We don't want to leak those internal values to external users + * though (they would just use switch()...default:) because that would make it + * harder to add to new user visible values. + * + * This needs to be synchronized with ReorderBufferChangeType! Adjust the + * StaticAssertExpr's in ReorderBufferAllocate if you add anything! + */ +typedef enum +{ + REORDER_BUFFER_CHANGE_INTERNAL_INSERT, + REORDER_BUFFER_CHANGE_INTERNAL_UPDATE, + REORDER_BUFFER_CHANGE_INTERNAL_DELETE, + REORDER_BUFFER_CHANGE_INTERNAL_SNAPSHOT, + REORDER_BUFFER_CHANGE_INTERNAL_COMMAND_ID, + REORDER_BUFFER_CHANGE_INTERNAL_TUPLECID +} ReorderBufferChangeTypeInternal; + +/* entry for a hash table we use to map from xid to our transaction state */ +typedef struct ReorderBufferTXNByIdEnt +{ + TransactionId xid; + ReorderBufferTXN *txn; +} ReorderBufferTXNByIdEnt; + +/* data structures for (relfilenode, ctid) => (cmin, cmax) mapping */ +typedef struct ReorderBufferTupleCidKey +{ + RelFileNode relnode; + ItemPointerData tid; +} ReorderBufferTupleCidKey; + +typedef struct ReorderBufferTupleCidEnt +{ + ReorderBufferTupleCidKey key; + CommandId cmin; + CommandId cmax; + CommandId combocid; /* just for debugging */ +} ReorderBufferTupleCidEnt; + +/* k-way in-order change iteration support structures */ +typedef struct ReorderBufferIterTXNEntry +{ + XLogRecPtr lsn; + ReorderBufferChange *change; + ReorderBufferTXN *txn; + int fd; + XLogSegNo segno; +} ReorderBufferIterTXNEntry; + +typedef struct ReorderBufferIterTXNState +{ + binaryheap *heap; + Size nr_txns; + dlist_head old_change; + ReorderBufferIterTXNEntry entries[FLEXIBLE_ARRAY_MEMBER]; +} ReorderBufferIterTXNState; + +/* toast datastructures */ +typedef struct ReorderBufferToastEnt +{ + Oid chunk_id; /* toast_table.chunk_id */ + int32 last_chunk_seq; /* toast_table.chunk_seq of the last chunk we + * have seen */ + Size num_chunks; /* number of chunks we've already seen */ + Size size; /* combined size of chunks seen */ + dlist_head chunks; /* linked list of chunks */ + struct varlena *reconstructed; /* reconstructed varlena now pointed + * to in main tup */ +} ReorderBufferToastEnt; + +/* Disk serialization support datastructures */ +typedef struct ReorderBufferDiskChange +{ + Size size; + ReorderBufferChange change; + /* data follows */ +} ReorderBufferDiskChange; + +/* + * Maximum number of changes kept in memory, per transaction. After that, + * changes are spooled to disk. + * + * The current value should be sufficient to decode the entire transaction + * without hitting disk in OLTP workloads, while starting to spool to disk in + * other workloads reasonably fast. + * + * At some point in the future it probaly makes sense to have a more elaborate + * resource management here, but it's not entirely clear what that would look + * like. + */ +static const Size max_changes_in_memory = 4096; + +/* + * We use a very simple form of a slab allocator for frequently allocated + * objects, simply keeping a fixed number in a linked list when unused, + * instead pfree()ing them. Without that in many workloads aset.c becomes a + * major bottleneck, especially when spilling to disk while decoding batch + * workloads. + */ +static const Size max_cached_changes = 4096 * 2; +static const Size max_cached_tuplebufs = 4096 * 2; /* ~8MB */ +static const Size max_cached_transactions = 512; + + +/* --------------------------------------- + * primary reorderbuffer support routines + * --------------------------------------- + */ +static ReorderBufferTXN *ReorderBufferGetTXN(ReorderBuffer *rb); +static void ReorderBufferReturnTXN(ReorderBuffer *rb, ReorderBufferTXN *txn); +static ReorderBufferTXN *ReorderBufferTXNByXid(ReorderBuffer *rb, + TransactionId xid, bool create, bool *is_new, + XLogRecPtr lsn, bool create_as_top); + +static void AssertTXNLsnOrder(ReorderBuffer *rb); + +/* --------------------------------------- + * support functions for lsn-order iterating over the ->changes of a + * transaction and its subtransactions + * + * used for iteration over the k-way heap merge of a transaction and its + * subtransactions + * --------------------------------------- + */ +static ReorderBufferIterTXNState *ReorderBufferIterTXNInit(ReorderBuffer *rb, ReorderBufferTXN *txn); +static ReorderBufferChange * + ReorderBufferIterTXNNext(ReorderBuffer *rb, ReorderBufferIterTXNState *state); +static void ReorderBufferIterTXNFinish(ReorderBuffer *rb, + ReorderBufferIterTXNState *state); +static void ReorderBufferExecuteInvalidations(ReorderBuffer *rb, ReorderBufferTXN *txn); + +/* + * --------------------------------------- + * Disk serialization support functions + * --------------------------------------- + */ +static void ReorderBufferCheckSerializeTXN(ReorderBuffer *rb, ReorderBufferTXN *txn); +static void ReorderBufferSerializeTXN(ReorderBuffer *rb, ReorderBufferTXN *txn); +static void ReorderBufferSerializeChange(ReorderBuffer *rb, ReorderBufferTXN *txn, + int fd, ReorderBufferChange *change); +static Size ReorderBufferRestoreChanges(ReorderBuffer *rb, ReorderBufferTXN *txn, + int *fd, XLogSegNo *segno); +static void ReorderBufferRestoreChange(ReorderBuffer *rb, ReorderBufferTXN *txn, + char *change); +static void ReorderBufferRestoreCleanup(ReorderBuffer *rb, ReorderBufferTXN *txn); + +static void ReorderBufferFreeSnap(ReorderBuffer *rb, Snapshot snap); +static Snapshot ReorderBufferCopySnap(ReorderBuffer *rb, Snapshot orig_snap, + ReorderBufferTXN *txn, CommandId cid); + +/* --------------------------------------- + * toast reassembly support + * --------------------------------------- + */ +static void ReorderBufferToastInitHash(ReorderBuffer *rb, ReorderBufferTXN *txn); +static void ReorderBufferToastReset(ReorderBuffer *rb, ReorderBufferTXN *txn); +static void ReorderBufferToastReplace(ReorderBuffer *rb, ReorderBufferTXN *txn, + Relation relation, ReorderBufferChange *change); +static void ReorderBufferToastAppendChunk(ReorderBuffer *rb, ReorderBufferTXN *txn, + Relation relation, ReorderBufferChange *change); + + +/* + * Allocate a new ReorderBuffer + */ +ReorderBuffer * +ReorderBufferAllocate(void) +{ + ReorderBuffer *buffer; + HASHCTL hash_ctl; + MemoryContext new_ctx; + + StaticAssertExpr((int) REORDER_BUFFER_CHANGE_INTERNAL_INSERT == (int) REORDER_BUFFER_CHANGE_INSERT, "out of sync enums"); + StaticAssertExpr((int) REORDER_BUFFER_CHANGE_INTERNAL_UPDATE == (int) REORDER_BUFFER_CHANGE_UPDATE, "out of sync enums"); + StaticAssertExpr((int) REORDER_BUFFER_CHANGE_INTERNAL_DELETE == (int) REORDER_BUFFER_CHANGE_DELETE, "out of sync enums"); + + /* allocate memory in own context, to have better accountability */ + new_ctx = AllocSetContextCreate(CurrentMemoryContext, + "ReorderBuffer", + ALLOCSET_DEFAULT_MINSIZE, + ALLOCSET_DEFAULT_INITSIZE, + ALLOCSET_DEFAULT_MAXSIZE); + + buffer = + (ReorderBuffer *) MemoryContextAlloc(new_ctx, sizeof(ReorderBuffer)); + + memset(&hash_ctl, 0, sizeof(hash_ctl)); + + buffer->context = new_ctx; + + hash_ctl.keysize = sizeof(TransactionId); + hash_ctl.entrysize = sizeof(ReorderBufferTXNByIdEnt); + hash_ctl.hash = tag_hash; + hash_ctl.hcxt = buffer->context; + + buffer->by_txn = hash_create("ReorderBufferByXid", 1000, &hash_ctl, + HASH_ELEM | HASH_FUNCTION | HASH_CONTEXT); + + buffer->by_txn_last_xid = InvalidTransactionId; + buffer->by_txn_last_txn = NULL; + + buffer->nr_cached_transactions = 0; + buffer->nr_cached_changes = 0; + buffer->nr_cached_tuplebufs = 0; + + buffer->outbuf = NULL; + buffer->outbufsize = 0; + + buffer->current_restart_decoding_lsn = InvalidXLogRecPtr; + + dlist_init(&buffer->toplevel_by_lsn); + dlist_init(&buffer->cached_transactions); + dlist_init(&buffer->cached_changes); + slist_init(&buffer->cached_tuplebufs); + + return buffer; +} + +/* + * Free a ReorderBuffer + */ +void +ReorderBufferFree(ReorderBuffer *rb) +{ + MemoryContext context = rb->context; + + /* + * We free separately allocated data by entirely scrapping reorderbuffer's + * memory context. + */ + MemoryContextDelete(context); +} + +/* + * Get a unused, possibly preallocated, ReorderBufferTXN. + */ +static ReorderBufferTXN * +ReorderBufferGetTXN(ReorderBuffer *rb) +{ + ReorderBufferTXN *txn; + + /* check the slab cache */ + if (rb->nr_cached_transactions > 0) + { + rb->nr_cached_transactions--; + txn = (ReorderBufferTXN *) + dlist_container(ReorderBufferTXN, node, + dlist_pop_head_node(&rb->cached_transactions)); + } + else + { + txn = (ReorderBufferTXN *) + MemoryContextAlloc(rb->context, sizeof(ReorderBufferTXN)); + } + + memset(txn, 0, sizeof(ReorderBufferTXN)); + + dlist_init(&txn->changes); + dlist_init(&txn->tuplecids); + dlist_init(&txn->subtxns); + + return txn; +} + +/* + * Free a ReorderBufferTXN. + * + * Deallocation might be delayed for efficiency purposes, for details check + * the comments above max_cached_changes's definition. + */ +void +ReorderBufferReturnTXN(ReorderBuffer *rb, ReorderBufferTXN *txn) +{ + /* clean the lookup cache if we were cached (quite likely) */ + if (rb->by_txn_last_xid == txn->xid) + { + rb->by_txn_last_xid = InvalidTransactionId; + rb->by_txn_last_txn = NULL; + } + + /* free data that's contained */ + + if (txn->tuplecid_hash != NULL) + { + hash_destroy(txn->tuplecid_hash); + txn->tuplecid_hash = NULL; + } + + if (txn->invalidations) + { + pfree(txn->invalidations); + txn->invalidations = NULL; + } + + /* check whether to put into the slab cache */ + if (rb->nr_cached_transactions < max_cached_transactions) + { + rb->nr_cached_transactions++; + dlist_push_head(&rb->cached_transactions, &txn->node); + VALGRIND_MAKE_MEM_UNDEFINED(txn, sizeof(ReorderBufferTXN)); + VALGRIND_MAKE_MEM_DEFINED(&txn->node, sizeof(txn->node)); + } + else + { + pfree(txn); + } +} + +/* + * Get a unused, possibly preallocated, ReorderBufferChange. + */ +ReorderBufferChange * +ReorderBufferGetChange(ReorderBuffer *rb) +{ + ReorderBufferChange *change; + + /* check the slab cache */ + if (rb->nr_cached_changes) + { + rb->nr_cached_changes--; + change = (ReorderBufferChange *) + dlist_container(ReorderBufferChange, node, + dlist_pop_head_node(&rb->cached_changes)); + } + else + { + change = (ReorderBufferChange *) + MemoryContextAlloc(rb->context, sizeof(ReorderBufferChange)); + } + + memset(change, 0, sizeof(ReorderBufferChange)); + return change; +} + +/* + * Free an ReorderBufferChange. + * + * Deallocation might be delayed for efficiency purposes, for details check + * the comments above max_cached_changes's definition. + */ +void +ReorderBufferReturnChange(ReorderBuffer *rb, ReorderBufferChange *change) +{ + /* free contained data */ + switch ((ReorderBufferChangeTypeInternal) change->action_internal) + { + case REORDER_BUFFER_CHANGE_INTERNAL_INSERT: + case REORDER_BUFFER_CHANGE_INTERNAL_UPDATE: + case REORDER_BUFFER_CHANGE_INTERNAL_DELETE: + if (change->tp.newtuple) + { + ReorderBufferReturnTupleBuf(rb, change->tp.newtuple); + change->tp.newtuple = NULL; + } + + if (change->tp.oldtuple) + { + ReorderBufferReturnTupleBuf(rb, change->tp.oldtuple); + change->tp.oldtuple = NULL; + } + break; + case REORDER_BUFFER_CHANGE_INTERNAL_SNAPSHOT: + if (change->snapshot) + { + ReorderBufferFreeSnap(rb, change->snapshot); + change->snapshot = NULL; + } + break; + case REORDER_BUFFER_CHANGE_INTERNAL_COMMAND_ID: + break; + case REORDER_BUFFER_CHANGE_INTERNAL_TUPLECID: + break; + } + + /* check whether to put into the slab cache */ + if (rb->nr_cached_changes < max_cached_changes) + { + rb->nr_cached_changes++; + dlist_push_head(&rb->cached_changes, &change->node); + VALGRIND_MAKE_MEM_UNDEFINED(change, sizeof(ReorderBufferChange)); + VALGRIND_MAKE_MEM_DEFINED(&change->node, sizeof(change->node)); + } + else + { + pfree(change); + } +} + + +/* + * Get a unused, possibly preallocated, ReorderBufferTupleBuf + */ +ReorderBufferTupleBuf * +ReorderBufferGetTupleBuf(ReorderBuffer *rb) +{ + ReorderBufferTupleBuf *tuple; + + /* check the slab cache */ + if (rb->nr_cached_tuplebufs) + { + rb->nr_cached_tuplebufs--; + tuple = slist_container(ReorderBufferTupleBuf, node, + slist_pop_head_node(&rb->cached_tuplebufs)); +#ifdef USE_ASSERT_CHECKING + memset(tuple, 0xdeadbeef, sizeof(ReorderBufferTupleBuf)); +#endif + } + else + { + tuple = (ReorderBufferTupleBuf *) + MemoryContextAlloc(rb->context, sizeof(ReorderBufferTupleBuf)); + } + + return tuple; +} + +/* + * Free an ReorderBufferTupleBuf. + * + * Deallocation might be delayed for efficiency purposes, for details check + * the comments above max_cached_changes's definition. + */ +void +ReorderBufferReturnTupleBuf(ReorderBuffer *rb, ReorderBufferTupleBuf *tuple) +{ + /* check whether to put into the slab cache */ + if (rb->nr_cached_tuplebufs < max_cached_tuplebufs) + { + rb->nr_cached_tuplebufs++; + slist_push_head(&rb->cached_tuplebufs, &tuple->node); + VALGRIND_MAKE_MEM_UNDEFINED(tuple, sizeof(ReorderBufferTupleBuf)); + VALGRIND_MAKE_MEM_DEFINED(&tuple->node, sizeof(tuple->node)); + } + else + { + pfree(tuple); + } +} + +/* + * Return the ReorderBufferTXN from the given buffer, specified by Xid. + * If create is true, and a transaction doesn't already exist, create it + * (with the given LSN, and as top transaction if that's specified); + * when this happens, is_new is set to true. + */ +static ReorderBufferTXN * +ReorderBufferTXNByXid(ReorderBuffer *rb, TransactionId xid, bool create, + bool *is_new, XLogRecPtr lsn, bool create_as_top) +{ + ReorderBufferTXN *txn; + ReorderBufferTXNByIdEnt *ent; + bool found; + + Assert(TransactionIdIsValid(xid)); + Assert(!create || lsn != InvalidXLogRecPtr); + + /* + * Check the one-entry lookup cache first + */ + if (TransactionIdIsValid(rb->by_txn_last_xid) && + rb->by_txn_last_xid == xid) + { + txn = rb->by_txn_last_txn; + + if (txn != NULL) + { + /* found it, and it's valid */ + if (is_new) + *is_new = false; + return txn; + } + + /* + * cached as non-existant, and asked not to create? Then nothing else + * to do. + */ + if (!create) + return NULL; + /* otherwise fall through to create it */ + } + + /* + * If the cache wasn't hit or it yielded an "does-not-exist" and we want + * to create an entry. + */ + + /* search the lookup table */ + ent = (ReorderBufferTXNByIdEnt *) + hash_search(rb->by_txn, + (void *) &xid, + create ? HASH_ENTER : HASH_FIND, + &found); + if (found) + txn = ent->txn; + else if (create) + { + /* initialize the new entry, if creation was requested */ + Assert(ent != NULL); + + ent->txn = ReorderBufferGetTXN(rb); + ent->txn->xid = xid; + txn = ent->txn; + txn->first_lsn = lsn; + txn->restart_decoding_lsn = rb->current_restart_decoding_lsn; + + if (create_as_top) + { + dlist_push_tail(&rb->toplevel_by_lsn, &txn->node); + AssertTXNLsnOrder(rb); + } + } + else + txn = NULL; /* not found and not asked to create */ + + /* update cache */ + rb->by_txn_last_xid = xid; + rb->by_txn_last_txn = txn; + + if (is_new) + *is_new = !found; + + Assert(!create || !!txn); + return txn; +} + +/* + * Queue a change into a transaction so it can be replayed upon commit. + */ +void +ReorderBufferQueueChange(ReorderBuffer *rb, TransactionId xid, XLogRecPtr lsn, + ReorderBufferChange *change) +{ + ReorderBufferTXN *txn; + + txn = ReorderBufferTXNByXid(rb, xid, true, NULL, lsn, true); + + change->lsn = lsn; + Assert(InvalidXLogRecPtr != lsn); + dlist_push_tail(&txn->changes, &change->node); + txn->nentries++; + txn->nentries_mem++; + + ReorderBufferCheckSerializeTXN(rb, txn); +} + +static void +AssertTXNLsnOrder(ReorderBuffer *rb) +{ +#ifdef USE_ASSERT_CHECKING + dlist_iter iter; + XLogRecPtr prev_first_lsn = InvalidXLogRecPtr; + + dlist_foreach(iter, &rb->toplevel_by_lsn) + { + ReorderBufferTXN *cur_txn; + + cur_txn = dlist_container(ReorderBufferTXN, node, iter.cur); + Assert(cur_txn->first_lsn != InvalidXLogRecPtr); + + if (cur_txn->end_lsn != InvalidXLogRecPtr) + Assert(cur_txn->first_lsn <= cur_txn->end_lsn); + + if (prev_first_lsn != InvalidXLogRecPtr) + Assert(prev_first_lsn < cur_txn->first_lsn); + + Assert(!cur_txn->is_known_as_subxact); + prev_first_lsn = cur_txn->first_lsn; + } +#endif +} + +ReorderBufferTXN * +ReorderBufferGetOldestTXN(ReorderBuffer *rb) +{ + ReorderBufferTXN *txn; + + if (dlist_is_empty(&rb->toplevel_by_lsn)) + return NULL; + + AssertTXNLsnOrder(rb); + + txn = dlist_head_element(ReorderBufferTXN, node, &rb->toplevel_by_lsn); + + Assert(!txn->is_known_as_subxact); + Assert(txn->first_lsn != InvalidXLogRecPtr); + return txn; +} + +void +ReorderBufferSetRestartPoint(ReorderBuffer *rb, XLogRecPtr ptr) +{ + rb->current_restart_decoding_lsn = ptr; +} + +void +ReorderBufferAssignChild(ReorderBuffer *rb, TransactionId xid, + TransactionId subxid, XLogRecPtr lsn) +{ + ReorderBufferTXN *txn; + ReorderBufferTXN *subtxn; + bool new_top; + bool new_sub; + + txn = ReorderBufferTXNByXid(rb, xid, true, &new_top, lsn, true); + subtxn = ReorderBufferTXNByXid(rb, subxid, true, &new_sub, lsn, false); + + if (new_sub) + { + /* + * we assign subtransactions to top level transaction even if we don't + * have data for it yet, assignment records frequently reference xids + * that have not yet produced any records. Knowing those aren't top + * level xids allows us to make processing cheaper in some places. + */ + dlist_push_tail(&txn->subtxns, &subtxn->node); + txn->nsubtxns++; + } + else if (!subtxn->is_known_as_subxact) + { + subtxn->is_known_as_subxact = true; + Assert(subtxn->nsubtxns == 0); + + /* remove from lsn order list of top-level transactions */ + dlist_delete(&subtxn->node); + + /* add to toplevel transaction */ + dlist_push_tail(&txn->subtxns, &subtxn->node); + txn->nsubtxns++; + } + else if (new_top) + { + elog(ERROR, "existing subxact assigned to unknown toplevel xact"); + } +} + +/* + * Associate a subtransaction with its toplevel transaction at commit + * time. There may be no further changes added after this. + */ +void +ReorderBufferCommitChild(ReorderBuffer *rb, TransactionId xid, + TransactionId subxid, XLogRecPtr commit_lsn, + XLogRecPtr end_lsn) +{ + ReorderBufferTXN *txn; + ReorderBufferTXN *subtxn; + + subtxn = ReorderBufferTXNByXid(rb, subxid, false, NULL, + InvalidXLogRecPtr, false); + + /* + * No need to do anything if that subtxn didn't contain any changes + */ + if (!subtxn) + return; + + txn = ReorderBufferTXNByXid(rb, xid, false, NULL, commit_lsn, true); + + if (txn == NULL) + elog(ERROR, "subxact logged without previous toplevel record"); + + /* + * Pass the our base snapshot to the parent transaction if it doesn't have + * one, or ours is older. That can happen if there are no changes in the + * toplevel transaction but in one of the child transactions. This allows + * the parent to simply use it's base snapshot initially. + */ + if (txn->base_snapshot == NULL || + txn->base_snapshot_lsn > subtxn->base_snapshot_lsn) + { + txn->base_snapshot = subtxn->base_snapshot; + txn->base_snapshot_lsn = subtxn->base_snapshot_lsn; + subtxn->base_snapshot = NULL; + subtxn->base_snapshot_lsn = InvalidXLogRecPtr; + } + + subtxn->final_lsn = commit_lsn; + subtxn->end_lsn = end_lsn; + + if (!subtxn->is_known_as_subxact) + { + subtxn->is_known_as_subxact = true; + Assert(subtxn->nsubtxns == 0); + + /* remove from lsn order list of top-level transactions */ + dlist_delete(&subtxn->node); + + /* add to subtransaction list */ + dlist_push_tail(&txn->subtxns, &subtxn->node); + txn->nsubtxns++; + } +} + + +/* + * Support for efficiently iterating over a transaction's and its + * subtransactions' changes. + * + * We do by doing a k-way merge between transactions/subtransactions. For that + * we model the current heads of the different transactions as a binary heap + * so we easily know which (sub-)transaction has the change with the smallest + * lsn next. + * + * We assume the changes in individual transactions are already sorted by LSN. + */ + +/* + * Binary heap comparison function. + */ +static int +ReorderBufferIterCompare(Datum a, Datum b, void *arg) +{ + ReorderBufferIterTXNState *state = (ReorderBufferIterTXNState *) arg; + XLogRecPtr pos_a = state->entries[DatumGetInt32(a)].lsn; + XLogRecPtr pos_b = state->entries[DatumGetInt32(b)].lsn; + + if (pos_a < pos_b) + return 1; + else if (pos_a == pos_b) + return 0; + return -1; +} + +/* + * Allocate & initialize an iterator which iterates in lsn order over a + * transaction and all its subtransactions. + */ +static ReorderBufferIterTXNState * +ReorderBufferIterTXNInit(ReorderBuffer *rb, ReorderBufferTXN *txn) +{ + Size nr_txns = 0; + ReorderBufferIterTXNState *state; + dlist_iter cur_txn_i; + int32 off; + + /* + * Calculate the size of our heap: one element for every transaction that + * contains changes. (Besides the transactions already in the reorder + * buffer, we count the one we were directly passed.) + */ + if (txn->nentries > 0) + nr_txns++; + + dlist_foreach(cur_txn_i, &txn->subtxns) + { + ReorderBufferTXN *cur_txn; + + cur_txn = dlist_container(ReorderBufferTXN, node, cur_txn_i.cur); + + if (cur_txn->nentries > 0) + nr_txns++; + } + + /* + * TODO: Consider adding fastpath for the rather common nr_txns=1 case, no + * need to allocate/build a heap then. + */ + + /* allocate iteration state */ + state = (ReorderBufferIterTXNState *) + MemoryContextAllocZero(rb->context, + sizeof(ReorderBufferIterTXNState) + + sizeof(ReorderBufferIterTXNEntry) * nr_txns); + + state->nr_txns = nr_txns; + dlist_init(&state->old_change); + + for (off = 0; off < state->nr_txns; off++) + { + state->entries[off].fd = -1; + state->entries[off].segno = 0; + } + + /* allocate heap */ + state->heap = binaryheap_allocate(state->nr_txns, + ReorderBufferIterCompare, + state); + + /* + * Now insert items into the binary heap, in an unordered fashion. (We + * will run a heap assembly step at the end; this is more efficient.) + */ + + off = 0; + + /* add toplevel transaction if it contains changes */ + if (txn->nentries > 0) + { + ReorderBufferChange *cur_change; + + if (txn->nentries != txn->nentries_mem) + ReorderBufferRestoreChanges(rb, txn, &state->entries[off].fd, + &state->entries[off].segno); + + cur_change = dlist_head_element(ReorderBufferChange, node, + &txn->changes); + + state->entries[off].lsn = cur_change->lsn; + state->entries[off].change = cur_change; + state->entries[off].txn = txn; + + binaryheap_add_unordered(state->heap, Int32GetDatum(off++)); + } + + /* add subtransactions if they contain changes */ + dlist_foreach(cur_txn_i, &txn->subtxns) + { + ReorderBufferTXN *cur_txn; + + cur_txn = dlist_container(ReorderBufferTXN, node, cur_txn_i.cur); + + if (cur_txn->nentries > 0) + { + ReorderBufferChange *cur_change; + + if (txn->nentries != txn->nentries_mem) + ReorderBufferRestoreChanges(rb, cur_txn, + &state->entries[off].fd, + &state->entries[off].segno); + + cur_change = dlist_head_element(ReorderBufferChange, node, + &cur_txn->changes); + + state->entries[off].lsn = cur_change->lsn; + state->entries[off].change = cur_change; + state->entries[off].txn = cur_txn; + + binaryheap_add_unordered(state->heap, Int32GetDatum(off++)); + } + } + + /* assemble a valid binary heap */ + binaryheap_build(state->heap); + + return state; +} + +/* + * Return the next change when iterating over a transaction and its + * subtransactions. + * + * Returns NULL when no further changes exist. + */ +static ReorderBufferChange * +ReorderBufferIterTXNNext(ReorderBuffer *rb, ReorderBufferIterTXNState *state) +{ + ReorderBufferChange *change; + ReorderBufferIterTXNEntry *entry; + int32 off; + + /* nothing there anymore */ + if (state->heap->bh_size == 0) + return NULL; + + off = DatumGetInt32(binaryheap_first(state->heap)); + entry = &state->entries[off]; + + /* free memory we might have "leaked" in the previous *Next call */ + if (!dlist_is_empty(&state->old_change)) + { + change = dlist_container(ReorderBufferChange, node, + dlist_pop_head_node(&state->old_change)); + ReorderBufferReturnChange(rb, change); + Assert(dlist_is_empty(&state->old_change)); + } + + change = entry->change; + + /* + * update heap with information about which transaction has the next + * relevant change in LSN order + */ + + /* there are in-memory changes */ + if (dlist_has_next(&entry->txn->changes, &entry->change->node)) + { + dlist_node *next = dlist_next_node(&entry->txn->changes, &change->node); + ReorderBufferChange *next_change = + dlist_container(ReorderBufferChange, node, next); + + /* txn stays the same */ + state->entries[off].lsn = next_change->lsn; + state->entries[off].change = next_change; + + binaryheap_replace_first(state->heap, Int32GetDatum(off)); + return change; + } + + /* try to load changes from disk */ + if (entry->txn->nentries != entry->txn->nentries_mem) + { + /* + * Ugly: restoring changes will reuse *Change records, thus delete the + * current one from the per-tx list and only free in the next call. + */ + dlist_delete(&change->node); + dlist_push_tail(&state->old_change, &change->node); + + if (ReorderBufferRestoreChanges(rb, entry->txn, &entry->fd, + &state->entries[off].segno)) + { + /* successfully restored changes from disk */ + ReorderBufferChange *next_change = + dlist_head_element(ReorderBufferChange, node, + &entry->txn->changes); + + elog(DEBUG2, "restored %u/%u changes from disk", + (uint32) entry->txn->nentries_mem, + (uint32) entry->txn->nentries); + + Assert(entry->txn->nentries_mem); + /* txn stays the same */ + state->entries[off].lsn = next_change->lsn; + state->entries[off].change = next_change; + binaryheap_replace_first(state->heap, Int32GetDatum(off)); + + return change; + } + } + + /* ok, no changes there anymore, remove */ + binaryheap_remove_first(state->heap); + + return change; +} + +/* + * Deallocate the iterator + */ +static void +ReorderBufferIterTXNFinish(ReorderBuffer *rb, + ReorderBufferIterTXNState *state) +{ + int32 off; + + for (off = 0; off < state->nr_txns; off++) + { + if (state->entries[off].fd != -1) + CloseTransientFile(state->entries[off].fd); + } + + /* free memory we might have "leaked" in the last *Next call */ + if (!dlist_is_empty(&state->old_change)) + { + ReorderBufferChange *change; + + change = dlist_container(ReorderBufferChange, node, + dlist_pop_head_node(&state->old_change)); + ReorderBufferReturnChange(rb, change); + Assert(dlist_is_empty(&state->old_change)); + } + + binaryheap_free(state->heap); + pfree(state); +} + +/* + * Cleanup the contents of a transaction, usually after the transaction + * committed or aborted. + */ +static void +ReorderBufferCleanupTXN(ReorderBuffer *rb, ReorderBufferTXN *txn) +{ + bool found; + dlist_mutable_iter iter; + + /* cleanup subtransactions & their changes */ + dlist_foreach_modify(iter, &txn->subtxns) + { + ReorderBufferTXN *subtxn; + + subtxn = dlist_container(ReorderBufferTXN, node, iter.cur); + + /* + * Subtransactions are always associated to the toplevel TXN, even if + * they originally were happening inside another subtxn, so we won't + * ever recurse more than one level deep here. + */ + Assert(subtxn->is_known_as_subxact); + Assert(subtxn->nsubtxns == 0); + + ReorderBufferCleanupTXN(rb, subtxn); + } + + /* cleanup changes in the toplevel txn */ + dlist_foreach_modify(iter, &txn->changes) + { + ReorderBufferChange *change; + + change = dlist_container(ReorderBufferChange, node, iter.cur); + + ReorderBufferReturnChange(rb, change); + } + + /* + * Cleanup the tuplecids we stored for decoding catalog snapshot + * access. They are always stored in the toplevel transaction. + */ + dlist_foreach_modify(iter, &txn->tuplecids) + { + ReorderBufferChange *change; + + change = dlist_container(ReorderBufferChange, node, iter.cur); + Assert(change->action_internal == REORDER_BUFFER_CHANGE_INTERNAL_TUPLECID); + ReorderBufferReturnChange(rb, change); + } + + if (txn->base_snapshot != NULL) + { + SnapBuildSnapDecRefcount(txn->base_snapshot); + txn->base_snapshot = NULL; + txn->base_snapshot_lsn = InvalidXLogRecPtr; + } + + /* delete from list of known subxacts */ + if (txn->is_known_as_subxact) + { + /* NB: nsubxacts count of parent will be too high now */ + dlist_delete(&txn->node); + } + /* delete from LSN ordered list of toplevel TXNs */ + else + { + dlist_delete(&txn->node); + } + + /* now remove reference from buffer */ + hash_search(rb->by_txn, + (void *) &txn->xid, + HASH_REMOVE, + &found); + Assert(found); + + /* remove entries spilled to disk */ + if (txn->nentries != txn->nentries_mem) + ReorderBufferRestoreCleanup(rb, txn); + + /* deallocate */ + ReorderBufferReturnTXN(rb, txn); +} + +/* + * Build a hash with a (relfilenode, ctid) -> (cmin, cmax) mapping for use by + * tqual.c's HeapTupleSatisfiesHistoricMVCC. + */ +static void +ReorderBufferBuildTupleCidHash(ReorderBuffer *rb, ReorderBufferTXN *txn) +{ + dlist_iter iter; + HASHCTL hash_ctl; + + if (!txn->has_catalog_changes || dlist_is_empty(&txn->tuplecids)) + return; + + memset(&hash_ctl, 0, sizeof(hash_ctl)); + + hash_ctl.keysize = sizeof(ReorderBufferTupleCidKey); + hash_ctl.entrysize = sizeof(ReorderBufferTupleCidEnt); + hash_ctl.hash = tag_hash; + hash_ctl.hcxt = rb->context; + + /* + * create the hash with the exact number of to-be-stored tuplecids from + * the start + */ + txn->tuplecid_hash = + hash_create("ReorderBufferTupleCid", txn->ntuplecids, &hash_ctl, + HASH_ELEM | HASH_FUNCTION | HASH_CONTEXT); + + dlist_foreach(iter, &txn->tuplecids) + { + ReorderBufferTupleCidKey key; + ReorderBufferTupleCidEnt *ent; + bool found; + ReorderBufferChange *change; + + change = dlist_container(ReorderBufferChange, node, iter.cur); + + Assert(change->action_internal == REORDER_BUFFER_CHANGE_INTERNAL_TUPLECID); + + /* be careful about padding */ + memset(&key, 0, sizeof(ReorderBufferTupleCidKey)); + + key.relnode = change->tuplecid.node; + + ItemPointerCopy(&change->tuplecid.tid, + &key.tid); + + ent = (ReorderBufferTupleCidEnt *) + hash_search(txn->tuplecid_hash, + (void *) &key, + HASH_ENTER | HASH_FIND, + &found); + if (!found) + { + ent->cmin = change->tuplecid.cmin; + ent->cmax = change->tuplecid.cmax; + ent->combocid = change->tuplecid.combocid; + } + else + { + Assert(ent->cmin == change->tuplecid.cmin); + Assert(ent->cmax == InvalidCommandId || + ent->cmax == change->tuplecid.cmax); + + /* + * if the tuple got valid in this transaction and now got deleted + * we already have a valid cmin stored. The cmax will be + * InvalidCommandId though. + */ + ent->cmax = change->tuplecid.cmax; + } + } +} + +/* + * Copy a provided snapshot so we can modify it privately. This is needed so + * that catalog modifying transactions can look into intermediate catalog + * states. + */ +static Snapshot +ReorderBufferCopySnap(ReorderBuffer *rb, Snapshot orig_snap, + ReorderBufferTXN *txn, CommandId cid) +{ + Snapshot snap; + dlist_iter iter; + int i = 0; + Size size; + + size = sizeof(SnapshotData) + + sizeof(TransactionId) * orig_snap->xcnt + + sizeof(TransactionId) * (txn->nsubtxns + 1); + + snap = MemoryContextAllocZero(rb->context, size); + memcpy(snap, orig_snap, sizeof(SnapshotData)); + + snap->copied = true; + snap->active_count = 0; + snap->regd_count = 1; + snap->xip = (TransactionId *) (snap + 1); + + memcpy(snap->xip, orig_snap->xip, sizeof(TransactionId) * snap->xcnt); + + /* + * snap->subxip contains all txids that belong to our transaction which we + * need to check via cmin/cmax. Thats why we store the toplevel + * transaction in there as well. + */ + snap->subxip = snap->xip + snap->xcnt; + snap->subxip[i++] = txn->xid; + + /* + * nsubxcnt isn't decreased when subtransactions abort, so count + * manually. Since it's an upper boundary it is safe to use it for the + * allocation above. + */ + snap->subxcnt = 1; + + dlist_foreach(iter, &txn->subtxns) + { + ReorderBufferTXN *sub_txn; + + sub_txn = dlist_container(ReorderBufferTXN, node, iter.cur); + snap->subxip[i++] = sub_txn->xid; + snap->subxcnt++; + } + + /* sort so we can bsearch() later */ + qsort(snap->subxip, snap->subxcnt, sizeof(TransactionId), xidComparator); + + /* store the specified current CommandId */ + snap->curcid = cid; + + return snap; +} + +/* + * Free a previously ReorderBufferCopySnap'ed snapshot + */ +static void +ReorderBufferFreeSnap(ReorderBuffer *rb, Snapshot snap) +{ + if (snap->copied) + pfree(snap); + else + SnapBuildSnapDecRefcount(snap); +} + +/* + * Perform the replay of a transaction and it's non-aborted subtransactions. + * + * Subtransactions previously have to be processed by + * ReorderBufferCommitChild(), even if previously assigned to the toplevel + * transaction with ReorderBufferAssignChild. + * + * We currently can only decode a transaction's contents in when their commit + * record is read because that's currently the only place where we know about + * cache invalidations. Thus, once a toplevel commit is read, we iterate over + * the top and subtransactions (using a k-way merge) and replay the changes in + * lsn order. + */ +void +ReorderBufferCommit(ReorderBuffer *rb, TransactionId xid, + XLogRecPtr commit_lsn, XLogRecPtr end_lsn, + TimestampTz commit_time) +{ + ReorderBufferTXN *txn; + ReorderBufferIterTXNState *iterstate = NULL; + ReorderBufferChange *change; + + volatile CommandId command_id = FirstCommandId; + volatile Snapshot snapshot_now = NULL; + volatile bool txn_started = false; + volatile bool subtxn_started = false; + + txn = ReorderBufferTXNByXid(rb, xid, false, NULL, InvalidXLogRecPtr, + false); + + /* unknown transaction, nothing to replay */ + if (txn == NULL) + return; + + txn->final_lsn = commit_lsn; + txn->end_lsn = end_lsn; + txn->commit_time = commit_time; + + /* serialize the last bunch of changes if we need start earlier anyway */ + if (txn->nentries_mem != txn->nentries) + ReorderBufferSerializeTXN(rb, txn); + + /* + * If this transaction didn't have any real changes in our database, it's + * OK not to have a snapshot. Note that ReorderBufferCommitChild will have + * transferred its snapshot to this transaction if it had one and the + * toplevel tx didn't. + */ + if (txn->base_snapshot == NULL) + { + Assert(txn->ninvalidations == 0); + ReorderBufferCleanupTXN(rb, txn); + return; + } + + snapshot_now = txn->base_snapshot; + + /* build data to be able to lookup the CommandIds of catalog tuples */ + ReorderBufferBuildTupleCidHash(rb, txn); + + /* setup the initial snapshot */ + SetupHistoricSnapshot(snapshot_now, txn->tuplecid_hash); + + PG_TRY(); + { + txn_started = false; + + /* + * Decoding needs access to syscaches et al., which in turn use + * heavyweight locks and such. Thus we need to have enough state around + * to keep track of those. The easiest way is to simply use a + * transaction internally. That also allows us to easily enforce that + * nothing writes to the database by checking for xid assignments. + * + * When we're called via the SQL SRF there's already a transaction + * started, so start an explicit subtransaction there. + */ + if (IsTransactionOrTransactionBlock()) + { + BeginInternalSubTransaction("replay"); + subtxn_started = true; + } + else + { + StartTransactionCommand(); + txn_started = true; + } + + rb->begin(rb, txn); + + iterstate = ReorderBufferIterTXNInit(rb, txn); + while ((change = ReorderBufferIterTXNNext(rb, iterstate))) + { + Relation relation = NULL; + Oid reloid; + + switch ((ReorderBufferChangeTypeInternal) change->action_internal) + { + case REORDER_BUFFER_CHANGE_INTERNAL_INSERT: + case REORDER_BUFFER_CHANGE_INTERNAL_UPDATE: + case REORDER_BUFFER_CHANGE_INTERNAL_DELETE: + Assert(snapshot_now); + + reloid = RelidByRelfilenode(change->tp.relnode.spcNode, + change->tp.relnode.relNode); + + /* + * Catalog tuple without data, emitted while catalog was + * in the process of being rewritten. + */ + if (reloid == InvalidOid && + change->tp.newtuple == NULL && + change->tp.oldtuple == NULL) + continue; + else if (reloid == InvalidOid) + elog(ERROR, "could not lookup relation %s", + relpathperm(change->tp.relnode, MAIN_FORKNUM)); + + relation = RelationIdGetRelation(reloid); + + if (relation == NULL) + elog(ERROR, "could open relation descriptor %s", + relpathperm(change->tp.relnode, MAIN_FORKNUM)); + + if (RelationIsLogicallyLogged(relation)) + { + /* + * For now ignore sequence changes entirely. Most of + * the time they don't log changes using records we + * understand, so it doesn't make sense to handle the + * few cases we do. + */ + if (relation->rd_rel->relkind == RELKIND_SEQUENCE) + { + } + /* user-triggered change */ + else if (!IsToastRelation(relation)) + { + ReorderBufferToastReplace(rb, txn, relation, change); + rb->apply_change(rb, txn, relation, change); + ReorderBufferToastReset(rb, txn); + } + /* we're not interested in toast deletions */ + else if (change->action == REORDER_BUFFER_CHANGE_INSERT) + { + /* + * Need to reassemble the full toasted Datum in + * memory, to ensure the chunks don't get reused + * till we're done remove it from the list of this + * transaction's changes. Otherwise it will get + * freed/reused while restoring spooled data from + * disk. + */ + dlist_delete(&change->node); + ReorderBufferToastAppendChunk(rb, txn, relation, + change); + } + + } + RelationClose(relation); + break; + case REORDER_BUFFER_CHANGE_INTERNAL_SNAPSHOT: + /* get rid of the old */ + TeardownHistoricSnapshot(false); + + if (snapshot_now->copied) + { + ReorderBufferFreeSnap(rb, snapshot_now); + snapshot_now = + ReorderBufferCopySnap(rb, change->snapshot, + txn, command_id); + } + /* + * Restored from disk, need to be careful not to double + * free. We could introduce refcounting for that, but for + * now this seems infrequent enough not to care. + */ + else if (change->snapshot->copied) + { + snapshot_now = + ReorderBufferCopySnap(rb, change->snapshot, + txn, command_id); + } + else + { + snapshot_now = change->snapshot; + } + + + /* and continue with the new one */ + SetupHistoricSnapshot(snapshot_now, txn->tuplecid_hash); + break; + + case REORDER_BUFFER_CHANGE_INTERNAL_COMMAND_ID: + Assert(change->command_id != InvalidCommandId); + + if (command_id < change->command_id) + { + command_id = change->command_id; + + if (!snapshot_now->copied) + { + /* we don't use the global one anymore */ + snapshot_now = ReorderBufferCopySnap(rb, snapshot_now, + txn, command_id); + } + + snapshot_now->curcid = command_id; + + TeardownHistoricSnapshot(false); + SetupHistoricSnapshot(snapshot_now, txn->tuplecid_hash); + + /* + * Every time the CommandId is incremented, we could + * see new catalog contents, so execute all + * invalidations. + */ + ReorderBufferExecuteInvalidations(rb, txn); + } + + break; + + case REORDER_BUFFER_CHANGE_INTERNAL_TUPLECID: + elog(ERROR, "tuplecid value in changequeue"); + break; + } + } + + ReorderBufferIterTXNFinish(rb, iterstate); + + /* call commit callback */ + rb->commit(rb, txn, commit_lsn); + + /* this is just a sanity check against bad output plugin behaviour */ + if (GetCurrentTransactionIdIfAny() != InvalidTransactionId) + elog(ERROR, "output plugin used xid %u", + GetCurrentTransactionId()); + + /* make sure there's no cache pollution */ + ReorderBufferExecuteInvalidations(rb, txn); + + /* cleanup */ + TeardownHistoricSnapshot(false); + + /* + * Abort subtransaction or the transaction as a whole has the right + * semantics. We want all locks acquired in here to be released, not + * reassigned to the parent and we do not want any database access + * have persistent effects. + */ + if (subtxn_started) + RollbackAndReleaseCurrentSubTransaction(); + else if (txn_started) + AbortCurrentTransaction(); + + if (snapshot_now->copied) + ReorderBufferFreeSnap(rb, snapshot_now); + + /* remove potential on-disk data, and deallocate */ + ReorderBufferCleanupTXN(rb, txn); + } + PG_CATCH(); + { + /* TODO: Encapsulate cleanup from the PG_TRY and PG_CATCH blocks */ + if (iterstate) + ReorderBufferIterTXNFinish(rb, iterstate); + + TeardownHistoricSnapshot(true); + + if (snapshot_now->copied) + ReorderBufferFreeSnap(rb, snapshot_now); + + if (subtxn_started) + RollbackAndReleaseCurrentSubTransaction(); + else if (txn_started) + AbortCurrentTransaction(); + + /* + * Invalidations in an aborted transactions aren't allowed to do + * catalog access, so we don't need to still have the snapshot setup. + */ + ReorderBufferExecuteInvalidations(rb, txn); + + /* remove potential on-disk data, and deallocate */ + ReorderBufferCleanupTXN(rb, txn); + + PG_RE_THROW(); + } + PG_END_TRY(); +} + +/* + * Abort a transaction that possibly has previous changes. Needs to be first + * called for subtransactions and then for the toplevel xid. + * + * NB: Transactions handled here have to have actively aborted (i.e. have + * produced an abort record). Implicitly aborted transactions are handled via + * ReorderBufferAbortOld(); transactions we're just not interesteded in, but + * which have committed are handled in ReorderBufferForget(). + * + * This function purges this transaction and its contents from memory and + * disk. + */ +void +ReorderBufferAbort(ReorderBuffer *rb, TransactionId xid, XLogRecPtr lsn) +{ + ReorderBufferTXN *txn; + + txn = ReorderBufferTXNByXid(rb, xid, false, NULL, InvalidXLogRecPtr, + false); + + /* unknown, nothing to remove */ + if (txn == NULL) + return; + + /* cosmetic... */ + txn->final_lsn = lsn; + + /* remove potential on-disk data, and deallocate */ + ReorderBufferCleanupTXN(rb, txn); +} + +/* + * Abort all transactions that aren't actually running anymore because the + * server restarted. + * + * NB: These really have to be transactions that have aborted due to a server + * crash/immediate restart, as we don't deal with invalidations here. + */ +void +ReorderBufferAbortOld(ReorderBuffer *rb, TransactionId oldestRunningXid) +{ + dlist_mutable_iter it; + + /* + * Iterate through all (potential) toplevel TXNs and abort all that are + * older than what possibly can be running. Once we've found the first + * that is alive we stop, there might be some that acquired an xid earlier + * but started writing later, but it's unlikely and they will cleaned up + * in a later call to ReorderBufferAbortOld(). + */ + dlist_foreach_modify(it, &rb->toplevel_by_lsn) + { + ReorderBufferTXN * txn; + + txn = dlist_container(ReorderBufferTXN, node, it.cur); + + if (TransactionIdPrecedes(txn->xid, oldestRunningXid)) + { + elog(DEBUG1, "aborting old transaction %u", txn->xid); + + /* remove potential on-disk data, and deallocate this tx */ + ReorderBufferCleanupTXN(rb, txn); + } + else + return; + } +} + +/* + * Forget the contents of a transaction if we aren't interested in it's + * contents. Needs to be first called for subtransactions and then for the + * toplevel xid. + * + * This is significantly different to ReorderBufferAbort() because + * transactions that have committed need to be treated differenly from aborted + * ones since they may have modified the catalog. + * + * Note that this is only allowed to be called in the moment a transaction + * commit has just been read, not earlier; otherwise later records referring + * to this xid might re-create the transaction incompletely. + */ +void +ReorderBufferForget(ReorderBuffer *rb, TransactionId xid, XLogRecPtr lsn) +{ + ReorderBufferTXN *txn; + + txn = ReorderBufferTXNByXid(rb, xid, false, NULL, InvalidXLogRecPtr, + false); + + /* unknown, nothing to forget */ + if (txn == NULL) + return; + + /* cosmetic... */ + txn->final_lsn = lsn; + + /* + * Proccess cache invalidation messages if there are any. Even if we're + * not interested in the transaction's contents, it could have manipulated + * the catalog and we need to update the caches according to that. + */ + if (txn->base_snapshot != NULL && txn->ninvalidations > 0) + { + /* setup snapshot to perform the invalidations in */ + SetupHistoricSnapshot(txn->base_snapshot, txn->tuplecid_hash); + PG_TRY(); + { + ReorderBufferExecuteInvalidations(rb, txn); + TeardownHistoricSnapshot(false); + } + PG_CATCH(); + { + /* cleanup */ + TeardownHistoricSnapshot(true); + PG_RE_THROW(); + } + PG_END_TRY(); + } + else + Assert(txn->ninvalidations == 0); + + /* remove potential on-disk data, and deallocate */ + ReorderBufferCleanupTXN(rb, txn); +} + + +/* + * Check whether a transaction is already known in this module.xs + */ +bool +ReorderBufferIsXidKnown(ReorderBuffer *rb, TransactionId xid) +{ + ReorderBufferTXN *txn; + + txn = ReorderBufferTXNByXid(rb, xid, false, NULL, InvalidXLogRecPtr, + false); + return txn != NULL; +} + +/* + * Add a new snapshot to this transaction that may only used after lsn 'lsn' + * because the previous snapshot doesn't describe the catalog correctly for + * following rows. + */ +void +ReorderBufferAddSnapshot(ReorderBuffer *rb, TransactionId xid, + XLogRecPtr lsn, Snapshot snap) +{ + ReorderBufferChange *change = ReorderBufferGetChange(rb); + + change->snapshot = snap; + change->action_internal = REORDER_BUFFER_CHANGE_INTERNAL_SNAPSHOT; + + ReorderBufferQueueChange(rb, xid, lsn, change); +} + +/* + * Setup the base snapshot of a transaction. The base snapshot is the snapshot + * that is used to decode all changes until either this transaction modifies + * the catalog or another catalog modifying transaction commits. + * + * Needs to be called before any changes are added with + * ReorderBufferQueueChange(). + */ +void +ReorderBufferSetBaseSnapshot(ReorderBuffer *rb, TransactionId xid, + XLogRecPtr lsn, Snapshot snap) +{ + ReorderBufferTXN *txn; + bool is_new; + + txn = ReorderBufferTXNByXid(rb, xid, true, &is_new, lsn, true); + Assert(txn->base_snapshot == NULL); + Assert(snap != NULL); + + txn->base_snapshot = snap; + txn->base_snapshot_lsn = lsn; +} + +/* + * Access the catalog with this CommandId at this point in the changestream. + * + * May only be called for command ids > 1 + */ +void +ReorderBufferAddNewCommandId(ReorderBuffer *rb, TransactionId xid, + XLogRecPtr lsn, CommandId cid) +{ + ReorderBufferChange *change = ReorderBufferGetChange(rb); + + change->command_id = cid; + change->action_internal = REORDER_BUFFER_CHANGE_INTERNAL_COMMAND_ID; + + ReorderBufferQueueChange(rb, xid, lsn, change); +} + + +/* + * Add new (relfilenode, tid) -> (cmin, cmax) mappings. + */ +void +ReorderBufferAddNewTupleCids(ReorderBuffer *rb, TransactionId xid, + XLogRecPtr lsn, RelFileNode node, + ItemPointerData tid, CommandId cmin, + CommandId cmax, CommandId combocid) +{ + ReorderBufferChange *change = ReorderBufferGetChange(rb); + ReorderBufferTXN *txn; + + txn = ReorderBufferTXNByXid(rb, xid, true, NULL, lsn, true); + + change->tuplecid.node = node; + change->tuplecid.tid = tid; + change->tuplecid.cmin = cmin; + change->tuplecid.cmax = cmax; + change->tuplecid.combocid = combocid; + change->lsn = lsn; + change->action_internal = REORDER_BUFFER_CHANGE_INTERNAL_TUPLECID; + + dlist_push_tail(&txn->tuplecids, &change->node); + txn->ntuplecids++; +} + +/* + * Setup the invalidation of the toplevel transaction. + * + * This needs to be done before ReorderBufferCommit is called! + */ +void +ReorderBufferAddInvalidations(ReorderBuffer *rb, TransactionId xid, + XLogRecPtr lsn, Size nmsgs, + SharedInvalidationMessage *msgs) +{ + ReorderBufferTXN *txn; + + txn = ReorderBufferTXNByXid(rb, xid, true, NULL, lsn, true); + + if (txn->ninvalidations != 0) + elog(ERROR, "only ever add one set of invalidations"); + + Assert(nmsgs > 0); + + txn->ninvalidations = nmsgs; + txn->invalidations = (SharedInvalidationMessage *) + MemoryContextAlloc(rb->context, + sizeof(SharedInvalidationMessage) * nmsgs); + memcpy(txn->invalidations, msgs, + sizeof(SharedInvalidationMessage) * nmsgs); +} + +/* + * Apply all invalidations we know. Possibly we only need parts at this point + * in the changestream but we don't know which those are. + */ +static void +ReorderBufferExecuteInvalidations(ReorderBuffer *rb, ReorderBufferTXN *txn) +{ + int i; + + for (i = 0; i < txn->ninvalidations; i++) + LocalExecuteInvalidationMessage(&txn->invalidations[i]); +} + +/* + * Mark a transaction as containing catalog changes + */ +void +ReorderBufferXidSetCatalogChanges(ReorderBuffer *rb, TransactionId xid, + XLogRecPtr lsn) +{ + ReorderBufferTXN *txn; + + txn = ReorderBufferTXNByXid(rb, xid, true, NULL, lsn, true); + + txn->has_catalog_changes = true; +} + +/* + * Query whether a transaction is already *known* to contain catalog + * changes. This can be wrong until directly before the commit! + */ +bool +ReorderBufferXidHasCatalogChanges(ReorderBuffer *rb, TransactionId xid) +{ + ReorderBufferTXN *txn; + + txn = ReorderBufferTXNByXid(rb, xid, false, NULL, InvalidXLogRecPtr, + false); + if (txn == NULL) + return false; + + return txn->has_catalog_changes; +} + +/* + * Have we already added the first snapshot? + */ +bool +ReorderBufferXidHasBaseSnapshot(ReorderBuffer *rb, TransactionId xid) +{ + ReorderBufferTXN *txn; + + txn = ReorderBufferTXNByXid(rb, xid, false, NULL, InvalidXLogRecPtr, + false); + + /* transaction isn't known yet, ergo no snapshot */ + if (txn == NULL) + return false; + + /* + * TODO: It would be a nice improvement if we would check the toplevel + * transaction in subtransactions, but we'd need to keep track of a bit + * more state. + */ + return txn->base_snapshot != NULL; +} + + +/* + * --------------------------------------- + * Disk serialization support + * --------------------------------------- + */ + +/* + * Ensure the IO buffer is >= sz. + */ +static void +ReorderBufferSerializeReserve(ReorderBuffer *rb, Size sz) +{ + if (!rb->outbufsize) + { + rb->outbuf = MemoryContextAlloc(rb->context, sz); + rb->outbufsize = sz; + } + else if (rb->outbufsize < sz) + { + rb->outbuf = repalloc(rb->outbuf, sz); + rb->outbufsize = sz; + } +} + +/* + * Check whether the transaction tx should spill its data to disk. + */ +static void +ReorderBufferCheckSerializeTXN(ReorderBuffer *rb, ReorderBufferTXN *txn) +{ + /* + * TODO: improve accounting so we cheaply can take subtransactions into + * account here. + */ + if (txn->nentries_mem >= max_changes_in_memory) + { + ReorderBufferSerializeTXN(rb, txn); + Assert(txn->nentries_mem == 0); + } +} + +/* + * Spill data of a large transaction (and its subtransactions) to disk. + */ +static void +ReorderBufferSerializeTXN(ReorderBuffer *rb, ReorderBufferTXN *txn) +{ + dlist_iter subtxn_i; + dlist_mutable_iter change_i; + int fd = -1; + XLogSegNo curOpenSegNo = 0; + Size spilled = 0; + char path[MAXPGPATH]; + + elog(DEBUG2, "spill %u changes in tx %u to disk", + (uint32) txn->nentries_mem, txn->xid); + + /* do the same to all child TXs */ + dlist_foreach(subtxn_i, &txn->subtxns) + { + ReorderBufferTXN *subtxn; + + subtxn = dlist_container(ReorderBufferTXN, node, subtxn_i.cur); + ReorderBufferSerializeTXN(rb, subtxn); + } + + /* serialize changestream */ + dlist_foreach_modify(change_i, &txn->changes) + { + ReorderBufferChange *change; + + change = dlist_container(ReorderBufferChange, node, change_i.cur); + + /* + * store in segment in which it belongs by start lsn, don't split over + * multiple segments tho + */ + if (fd == -1 || XLByteInSeg(change->lsn, curOpenSegNo)) + { + XLogRecPtr recptr; + + if (fd != -1) + CloseTransientFile(fd); + + XLByteToSeg(change->lsn, curOpenSegNo); + XLogSegNoOffsetToRecPtr(curOpenSegNo, 0, recptr); + + /* + * No need to care about TLIs here, only used during a single run, + * so each LSN only maps to a specific WAL record. + */ + sprintf(path, "pg_replslot/%s/xid-%u-lsn-%X-%X.snap", + NameStr(MyReplicationSlot->data.name), txn->xid, + (uint32) (recptr >> 32), (uint32) recptr); + + /* open segment, create it if necessary */ + fd = OpenTransientFile(path, + O_CREAT | O_WRONLY | O_APPEND | PG_BINARY, + S_IRUSR | S_IWUSR); + + if (fd < 0) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not open file \"%s\": %m", + path))); + } + + ReorderBufferSerializeChange(rb, txn, fd, change); + dlist_delete(&change->node); + ReorderBufferReturnChange(rb, change); + + spilled++; + } + + Assert(spilled == txn->nentries_mem); + Assert(dlist_is_empty(&txn->changes)); + txn->nentries_mem = 0; + + if (fd != -1) + CloseTransientFile(fd); +} + +/* + * Serialize individual change to disk. + */ +static void +ReorderBufferSerializeChange(ReorderBuffer *rb, ReorderBufferTXN *txn, + int fd, ReorderBufferChange *change) +{ + ReorderBufferDiskChange *ondisk; + Size sz = sizeof(ReorderBufferDiskChange); + + ReorderBufferSerializeReserve(rb, sz); + + ondisk = (ReorderBufferDiskChange *) rb->outbuf; + memcpy(&ondisk->change, change, sizeof(ReorderBufferChange)); + + switch ((ReorderBufferChangeTypeInternal) change->action_internal) + { + case REORDER_BUFFER_CHANGE_INTERNAL_INSERT: + /* fall through */ + case REORDER_BUFFER_CHANGE_INTERNAL_UPDATE: + /* fall through */ + case REORDER_BUFFER_CHANGE_INTERNAL_DELETE: + { + char *data; + Size oldlen = 0; + Size newlen = 0; + + if (change->tp.oldtuple) + oldlen = offsetof(ReorderBufferTupleBuf, data) + + change->tp.oldtuple->tuple.t_len + - offsetof(HeapTupleHeaderData, t_bits); + + if (change->tp.newtuple) + newlen = offsetof(ReorderBufferTupleBuf, data) + + change->tp.newtuple->tuple.t_len + - offsetof(HeapTupleHeaderData, t_bits); + + sz += oldlen; + sz += newlen; + + /* make sure we have enough space */ + ReorderBufferSerializeReserve(rb, sz); + + data = ((char *) rb->outbuf) + sizeof(ReorderBufferDiskChange); + /* might have been reallocated above */ + ondisk = (ReorderBufferDiskChange *) rb->outbuf; + + if (oldlen) + { + memcpy(data, change->tp.oldtuple, oldlen); + data += oldlen; + Assert(&change->tp.oldtuple->header == change->tp.oldtuple->tuple.t_data); + } + + if (newlen) + { + memcpy(data, change->tp.newtuple, newlen); + data += newlen; + Assert(&change->tp.newtuple->header == change->tp.newtuple->tuple.t_data); + } + break; + } + case REORDER_BUFFER_CHANGE_INTERNAL_SNAPSHOT: + { + char *data; + + sz += sizeof(SnapshotData) + + sizeof(TransactionId) * change->snapshot->xcnt + + sizeof(TransactionId) * change->snapshot->subxcnt + ; + + /* make sure we have enough space */ + ReorderBufferSerializeReserve(rb, sz); + data = ((char *) rb->outbuf) + sizeof(ReorderBufferDiskChange); + /* might have been reallocated above */ + ondisk = (ReorderBufferDiskChange *) rb->outbuf; + + memcpy(data, change->snapshot, sizeof(SnapshotData)); + data += sizeof(SnapshotData); + + if (change->snapshot->xcnt) + { + memcpy(data, change->snapshot->xip, + sizeof(TransactionId) + change->snapshot->xcnt); + data += sizeof(TransactionId) + change->snapshot->xcnt; + } + + if (change->snapshot->subxcnt) + { + memcpy(data, change->snapshot->subxip, + sizeof(TransactionId) + change->snapshot->subxcnt); + data += sizeof(TransactionId) + change->snapshot->subxcnt; + } + break; + } + case REORDER_BUFFER_CHANGE_INTERNAL_COMMAND_ID: + /* ReorderBufferChange contains everything important */ + break; + case REORDER_BUFFER_CHANGE_INTERNAL_TUPLECID: + /* ReorderBufferChange contains everything important */ + break; + } + + ondisk->size = sz; + + if (write(fd, rb->outbuf, ondisk->size) != ondisk->size) + { + CloseTransientFile(fd); + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not write to xid %u's data file: %m", + txn->xid))); + } + + Assert(ondisk->change.action_internal == change->action_internal); +} + +/* + * Restore a number of changes spilled to disk back into memory. + */ +static Size +ReorderBufferRestoreChanges(ReorderBuffer *rb, ReorderBufferTXN *txn, + int *fd, XLogSegNo *segno) +{ + Size restored = 0; + XLogSegNo last_segno; + dlist_mutable_iter cleanup_iter; + + Assert(txn->first_lsn != InvalidXLogRecPtr); + Assert(txn->final_lsn != InvalidXLogRecPtr); + + /* free current entries, so we have memory for more */ + dlist_foreach_modify(cleanup_iter, &txn->changes) + { + ReorderBufferChange *cleanup = + dlist_container(ReorderBufferChange, node, cleanup_iter.cur); + + dlist_delete(&cleanup->node); + ReorderBufferReturnChange(rb, cleanup); + } + txn->nentries_mem = 0; + Assert(dlist_is_empty(&txn->changes)); + + XLByteToSeg(txn->final_lsn, last_segno); + + while (restored < max_changes_in_memory && *segno <= last_segno) + { + int readBytes; + ReorderBufferDiskChange *ondisk; + + if (*fd == -1) + { + XLogRecPtr recptr; + char path[MAXPGPATH]; + + /* first time in */ + if (*segno == 0) + { + XLByteToSeg(txn->first_lsn, *segno); + } + + Assert(*segno != 0 || dlist_is_empty(&txn->changes)); + XLogSegNoOffsetToRecPtr(*segno, 0, recptr); + + /* + * No need to care about TLIs here, only used during a single run, + * so each LSN only maps to a specific WAL record. + */ + sprintf(path, "pg_replslot/%s/xid-%u-lsn-%X-%X.snap", + NameStr(MyReplicationSlot->data.name), txn->xid, + (uint32) (recptr >> 32), (uint32) recptr); + + *fd = OpenTransientFile(path, O_RDONLY | PG_BINARY, 0); + if (*fd < 0 && errno == ENOENT) + { + *fd = -1; + (*segno)++; + continue; + } + else if (*fd < 0) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not open file \"%s\": %m", + path))); + + } + + ReorderBufferSerializeReserve(rb, sizeof(ReorderBufferDiskChange)); + + + /* + * Read the statically sized part of a change which has information + * about the total size. If we couldn't read a record, we're at the + * end of this file. + */ + + readBytes = read(*fd, rb->outbuf, sizeof(ReorderBufferDiskChange)); + + /* eof */ + if (readBytes == 0) + { + CloseTransientFile(*fd); + *fd = -1; + (*segno)++; + continue; + } + else if (readBytes < 0) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not read from reorderbuffer spill file: %m"))); + else if (readBytes != sizeof(ReorderBufferDiskChange)) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("incomplete read from reorderbuffer spill file: read %d instead of %u", + readBytes, + (uint32) sizeof(ReorderBufferDiskChange)))); + + ondisk = (ReorderBufferDiskChange *) rb->outbuf; + + ReorderBufferSerializeReserve(rb, + sizeof(ReorderBufferDiskChange) + ondisk->size); + ondisk = (ReorderBufferDiskChange *) rb->outbuf; + + readBytes = read(*fd, rb->outbuf + sizeof(ReorderBufferDiskChange), + ondisk->size - sizeof(ReorderBufferDiskChange)); + + if (readBytes < 0) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not read from reorderbuffer spill file: %m"))); + else if (readBytes != ondisk->size - sizeof(ReorderBufferDiskChange)) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not read from reorderbuffer spill file: read %d instead of %u", + readBytes, + (uint32) (ondisk->size - sizeof(ReorderBufferDiskChange))))); + + /* + * ok, read a full change from disk, now restore it into proper + * in-memory format + */ + ReorderBufferRestoreChange(rb, txn, rb->outbuf); + restored++; + } + + return restored; +} + +/* + * Convert change from its on-disk format to in-memory format and queue it onto + * the TXN's ->changes list. + */ +static void +ReorderBufferRestoreChange(ReorderBuffer *rb, ReorderBufferTXN *txn, + char *data) +{ + ReorderBufferDiskChange *ondisk; + ReorderBufferChange *change; + + ondisk = (ReorderBufferDiskChange *) data; + + change = ReorderBufferGetChange(rb); + + /* copy static part */ + memcpy(change, &ondisk->change, sizeof(ReorderBufferChange)); + + data += sizeof(ReorderBufferDiskChange); + + /* restore individual stuff */ + switch ((ReorderBufferChangeTypeInternal) change->action_internal) + { + case REORDER_BUFFER_CHANGE_INTERNAL_INSERT: + /* fall through */ + case REORDER_BUFFER_CHANGE_INTERNAL_UPDATE: + /* fall through */ + case REORDER_BUFFER_CHANGE_INTERNAL_DELETE: + if (change->tp.newtuple) + { + Size len = offsetof(ReorderBufferTupleBuf, data) + +((ReorderBufferTupleBuf *) data)->tuple.t_len + - offsetof(HeapTupleHeaderData, t_bits); + + change->tp.newtuple = ReorderBufferGetTupleBuf(rb); + memcpy(change->tp.newtuple, data, len); + change->tp.newtuple->tuple.t_data = &change->tp.newtuple->header; + + data += len; + } + + if (change->tp.oldtuple) + { + Size len = offsetof(ReorderBufferTupleBuf, data) + +((ReorderBufferTupleBuf *) data)->tuple.t_len + - offsetof(HeapTupleHeaderData, t_bits); + + change->tp.oldtuple = ReorderBufferGetTupleBuf(rb); + memcpy(change->tp.oldtuple, data, len); + change->tp.oldtuple->tuple.t_data = &change->tp.oldtuple->header; + data += len; + } + break; + case REORDER_BUFFER_CHANGE_INTERNAL_SNAPSHOT: + { + Snapshot oldsnap = (Snapshot) data; + Size size = sizeof(SnapshotData) + + sizeof(TransactionId) * oldsnap->xcnt + + sizeof(TransactionId) * (oldsnap->subxcnt + 0) + ; + + Assert(change->snapshot != NULL); + + change->snapshot = MemoryContextAllocZero(rb->context, size); + + memcpy(change->snapshot, data, size); + change->snapshot->xip = (TransactionId *) + (((char *) change->snapshot) + sizeof(SnapshotData)); + change->snapshot->subxip = + change->snapshot->xip + change->snapshot->xcnt + 0; + change->snapshot->copied = true; + break; + } + /* the base struct contains all the data, easy peasy */ + case REORDER_BUFFER_CHANGE_INTERNAL_COMMAND_ID: + case REORDER_BUFFER_CHANGE_INTERNAL_TUPLECID: + break; + } + + dlist_push_tail(&txn->changes, &change->node); + txn->nentries_mem++; +} + +/* + * Remove all on-disk stored for the passed in transaction. + */ +static void +ReorderBufferRestoreCleanup(ReorderBuffer *rb, ReorderBufferTXN *txn) +{ + XLogSegNo first; + XLogSegNo cur; + XLogSegNo last; + + Assert(txn->first_lsn != InvalidXLogRecPtr); + Assert(txn->final_lsn != InvalidXLogRecPtr); + + XLByteToSeg(txn->first_lsn, first); + XLByteToSeg(txn->final_lsn, last); + + /* iterate over all possible filenames, and delete them */ + for (cur = first; cur <= last; cur++) + { + char path[MAXPGPATH]; + XLogRecPtr recptr; + + XLogSegNoOffsetToRecPtr(cur, 0, recptr); + + sprintf(path, "pg_replslot/%s/xid-%u-lsn-%X-%X.snap", + NameStr(MyReplicationSlot->data.name), txn->xid, + (uint32) (recptr >> 32), (uint32) recptr); + if (unlink(path) != 0 && errno != ENOENT) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not unlink file \"%s\": %m", path))); + } +} + +/* + * Delete all data spilled to disk after we've restarted/crashed. It will be + * recreated when the respective slots are reused. + */ +void +StartupReorderBuffer(void) +{ + DIR *logical_dir; + struct dirent *logical_de; + + DIR *spill_dir; + struct dirent *spill_de; + + logical_dir = AllocateDir("pg_replslot"); + while ((logical_de = ReadDir(logical_dir, "pg_replslot")) != NULL) + { + struct stat statbuf; + char path[MAXPGPATH]; + + if (strcmp(logical_de->d_name, ".") == 0 || + strcmp(logical_de->d_name, "..") == 0) + continue; + + /* if it cannot be a slot, skip the directory */ + if (!ReplicationSlotValidateName(logical_de->d_name, DEBUG2)) + continue; + + /* + * ok, has to be a surviving logical slot, iterate and delete + * everythign starting with xid-* + */ + sprintf(path, "pg_replslot/%s", logical_de->d_name); + + /* we're only creating directories here, skip if it's not our's */ + if (lstat(path, &statbuf) == 0 && !S_ISDIR(statbuf.st_mode)) + continue; + + spill_dir = AllocateDir(path); + while ((spill_de = ReadDir(spill_dir, path)) != NULL) + { + if (strcmp(spill_de->d_name, ".") == 0 || + strcmp(spill_de->d_name, "..") == 0) + continue; + + /* only look at names that can be ours */ + if (strncmp(spill_de->d_name, "xid", 3) == 0) + { + sprintf(path, "pg_replslot/%s/%s", logical_de->d_name, + spill_de->d_name); + + if (unlink(path) != 0) + ereport(PANIC, + (errcode_for_file_access(), + errmsg("could not unlink file \"%s\": %m", + path))); + } + } + FreeDir(spill_dir); + } + FreeDir(logical_dir); +} + +/* --------------------------------------- + * toast reassembly support + * --------------------------------------- + */ + +/* + * Initialize per tuple toast reconstruction support. + */ +static void +ReorderBufferToastInitHash(ReorderBuffer *rb, ReorderBufferTXN *txn) +{ + HASHCTL hash_ctl; + + Assert(txn->toast_hash == NULL); + + memset(&hash_ctl, 0, sizeof(hash_ctl)); + hash_ctl.keysize = sizeof(Oid); + hash_ctl.entrysize = sizeof(ReorderBufferToastEnt); + hash_ctl.hash = tag_hash; + hash_ctl.hcxt = rb->context; + txn->toast_hash = hash_create("ReorderBufferToastHash", 5, &hash_ctl, + HASH_ELEM | HASH_FUNCTION | HASH_CONTEXT); +} + +/* + * Per toast-chunk handling for toast reconstruction + * + * Appends a toast chunk so we can reconstruct it when the tuple "owning" the + * toasted Datum comes along. + */ +static void +ReorderBufferToastAppendChunk(ReorderBuffer *rb, ReorderBufferTXN *txn, + Relation relation, ReorderBufferChange *change) +{ + ReorderBufferToastEnt *ent; + bool found; + int32 chunksize; + bool isnull; + Pointer chunk; + TupleDesc desc = RelationGetDescr(relation); + Oid chunk_id; + Oid chunk_seq; + + if (txn->toast_hash == NULL) + ReorderBufferToastInitHash(rb, txn); + + Assert(IsToastRelation(relation)); + + chunk_id = DatumGetObjectId(fastgetattr(&change->tp.newtuple->tuple, 1, desc, &isnull)); + Assert(!isnull); + chunk_seq = DatumGetInt32(fastgetattr(&change->tp.newtuple->tuple, 2, desc, &isnull)); + Assert(!isnull); + + ent = (ReorderBufferToastEnt *) + hash_search(txn->toast_hash, + (void *) &chunk_id, + HASH_ENTER, + &found); + + if (!found) + { + Assert(ent->chunk_id == chunk_id); + ent->num_chunks = 0; + ent->last_chunk_seq = 0; + ent->size = 0; + ent->reconstructed = NULL; + dlist_init(&ent->chunks); + + if (chunk_seq != 0) + elog(ERROR, "got sequence entry %d for toast chunk %u instead of seq 0", + chunk_seq, chunk_id); + } + else if (found && chunk_seq != ent->last_chunk_seq + 1) + elog(ERROR, "got sequence entry %d for toast chunk %u instead of seq %d", + chunk_seq, chunk_id, ent->last_chunk_seq + 1); + + chunk = DatumGetPointer(fastgetattr(&change->tp.newtuple->tuple, 3, desc, &isnull)); + Assert(!isnull); + + /* calculate size so we can allocate the right size at once later */ + if (!VARATT_IS_EXTENDED(chunk)) + chunksize = VARSIZE(chunk) - VARHDRSZ; + else if (VARATT_IS_SHORT(chunk)) + /* could happen due to heap_form_tuple doing its thing */ + chunksize = VARSIZE_SHORT(chunk) - VARHDRSZ_SHORT; + else + elog(ERROR, "unexpected type of toast chunk"); + + ent->size += chunksize; + ent->last_chunk_seq = chunk_seq; + ent->num_chunks++; + dlist_push_tail(&ent->chunks, &change->node); +} + +/* + * Rejigger change->newtuple to point to in-memory toast tuples instead to + * on-disk toast tuples that may not longer exist (think DROP TABLE or VACUUM). + * + * We cannot replace unchanged toast tuples though, so those will still point + * to on-disk toast data. + */ +static void +ReorderBufferToastReplace(ReorderBuffer *rb, ReorderBufferTXN *txn, + Relation relation, ReorderBufferChange *change) +{ + TupleDesc desc; + int natt; + Datum *attrs; + bool *isnull; + bool *free; + HeapTuple newtup; + Relation toast_rel; + TupleDesc toast_desc; + MemoryContext oldcontext; + + /* no toast tuples changed */ + if (txn->toast_hash == NULL) + return; + + oldcontext = MemoryContextSwitchTo(rb->context); + + /* we should only have toast tuples in an INSERT or UPDATE */ + Assert(change->tp.newtuple); + + desc = RelationGetDescr(relation); + + toast_rel = RelationIdGetRelation(relation->rd_rel->reltoastrelid); + toast_desc = RelationGetDescr(toast_rel); + + /* should we allocate from stack instead? */ + attrs = palloc0(sizeof(Datum) * desc->natts); + isnull = palloc0(sizeof(bool) * desc->natts); + free = palloc0(sizeof(bool) * desc->natts); + + heap_deform_tuple(&change->tp.newtuple->tuple, desc, + attrs, isnull); + + for (natt = 0; natt < desc->natts; natt++) + { + Form_pg_attribute attr = desc->attrs[natt]; + ReorderBufferToastEnt *ent; + struct varlena *varlena; + + /* va_rawsize is the size of the original datum -- including header */ + struct varatt_external toast_pointer; + struct varatt_indirect redirect_pointer; + struct varlena *new_datum = NULL; + struct varlena *reconstructed; + dlist_iter it; + Size data_done = 0; + + /* system columns aren't toasted */ + if (attr->attnum < 0) + continue; + + if (attr->attisdropped) + continue; + + /* not a varlena datatype */ + if (attr->attlen != -1) + continue; + + /* no data */ + if (isnull[natt]) + continue; + + /* ok, we know we have a toast datum */ + varlena = (struct varlena *) DatumGetPointer(attrs[natt]); + + /* no need to do anything if the tuple isn't external */ + if (!VARATT_IS_EXTERNAL(varlena)) + continue; + + VARATT_EXTERNAL_GET_POINTER(toast_pointer, varlena); + + /* + * Check whether the toast tuple changed, replace if so. + */ + ent = (ReorderBufferToastEnt *) + hash_search(txn->toast_hash, + (void *) &toast_pointer.va_valueid, + HASH_FIND, + NULL); + if (ent == NULL) + continue; + + new_datum = + (struct varlena *) palloc0(INDIRECT_POINTER_SIZE); + + free[natt] = true; + + reconstructed = palloc0(toast_pointer.va_rawsize); + + ent->reconstructed = reconstructed; + + /* stitch toast tuple back together from its parts */ + dlist_foreach(it, &ent->chunks) + { + bool isnull; + ReorderBufferTupleBuf *tup = + dlist_container(ReorderBufferChange, node, it.cur)->tp.newtuple; + Pointer chunk = + DatumGetPointer(fastgetattr(&tup->tuple, 3, toast_desc, &isnull)); + + Assert(!isnull); + Assert(!VARATT_IS_EXTERNAL(chunk)); + Assert(!VARATT_IS_SHORT(chunk)); + + memcpy(VARDATA(reconstructed) + data_done, + VARDATA(chunk), + VARSIZE(chunk) - VARHDRSZ); + data_done += VARSIZE(chunk) - VARHDRSZ; + } + Assert(data_done == toast_pointer.va_extsize); + + /* make sure its marked as compressed or not */ + if (VARATT_EXTERNAL_IS_COMPRESSED(toast_pointer)) + SET_VARSIZE_COMPRESSED(reconstructed, data_done + VARHDRSZ); + else + SET_VARSIZE(reconstructed, data_done + VARHDRSZ); + + memset(&redirect_pointer, 0, sizeof(redirect_pointer)); + redirect_pointer.pointer = reconstructed; + + SET_VARTAG_EXTERNAL(new_datum, VARTAG_INDIRECT); + memcpy(VARDATA_EXTERNAL(new_datum), &redirect_pointer, + sizeof(redirect_pointer)); + + attrs[natt] = PointerGetDatum(new_datum); + } + + /* + * Build tuple in separate memory & copy tuple back into the tuplebuf + * passed to the output plugin. We can't directly heap_fill_tuple() into + * the tuplebuf because attrs[] will point back into the current content. + */ + newtup = heap_form_tuple(desc, attrs, isnull); + Assert(change->tp.newtuple->tuple.t_len <= MaxHeapTupleSize); + Assert(&change->tp.newtuple->header == change->tp.newtuple->tuple.t_data); + + memcpy(change->tp.newtuple->tuple.t_data, + newtup->t_data, + newtup->t_len); + change->tp.newtuple->tuple.t_len = newtup->t_len; + + /* + * free resources we won't further need, more persistent stuff will be + * free'd in ReorderBufferToastReset(). + */ + RelationClose(toast_rel); + pfree(newtup); + for (natt = 0; natt < desc->natts; natt++) + { + if (free[natt]) + pfree(DatumGetPointer(attrs[natt])); + } + pfree(attrs); + pfree(free); + pfree(isnull); + + MemoryContextSwitchTo(oldcontext); +} + +/* + * Free all resources allocated for toast reconstruction. + */ +static void +ReorderBufferToastReset(ReorderBuffer *rb, ReorderBufferTXN *txn) +{ + HASH_SEQ_STATUS hstat; + ReorderBufferToastEnt *ent; + + if (txn->toast_hash == NULL) + return; + + /* sequentially walk over the hash and free everything */ + hash_seq_init(&hstat, txn->toast_hash); + while ((ent = (ReorderBufferToastEnt *) hash_seq_search(&hstat)) != NULL) + { + dlist_mutable_iter it; + + if (ent->reconstructed != NULL) + pfree(ent->reconstructed); + + dlist_foreach_modify(it, &ent->chunks) + { + ReorderBufferChange *change = + dlist_container(ReorderBufferChange, node, it.cur); + + dlist_delete(&change->node); + ReorderBufferReturnChange(rb, change); + } + } + + hash_destroy(txn->toast_hash); + txn->toast_hash = NULL; +} + + +/* --------------------------------------- + * Visibility support for logical decoding + * + * + * Lookup actual cmin/cmax values when using decoding snapshot. We can't + * always rely on stored cmin/cmax values because of two scenarios: + * + * * A tuple got changed multiple times during a single transaction and thus + * has got a combocid. Combocid's are only valid for the duration of a + * single transaction. + * * A tuple with a cmin but no cmax (and thus no combocid) got + * deleted/updated in another transaction than the one which created it + * which we are looking at right now. As only one of cmin, cmax or combocid + * is actually stored in the heap we don't have access to the the value we + * need anymore. + * + * To resolve those problems we have a per-transaction hash of (cmin, + * cmax) tuples keyed by (relfilenode, ctid) which contains the actual + * (cmin, cmax) values. That also takes care of combocids by simply + * not caring about them at all. As we have the real cmin/cmax values + * combocids aren't interesting. + * + * As we only care about catalog tuples here the overhead of this + * hashtable should be acceptable. + * + * Heap rewrites complicate this a bit, check rewriteheap.c for + * details. + * ------------------------------------------------------------------------- + */ + +/* struct for qsort()ing mapping files by lsn somewhat efficiently */ +typedef struct RewriteMappingFile +{ + XLogRecPtr lsn; + char fname[MAXPGPATH]; +} RewriteMappingFile; + +#if NOT_USED +static void +DisplayMapping(HTAB *tuplecid_data) +{ + HASH_SEQ_STATUS hstat; + ReorderBufferTupleCidEnt *ent; + + hash_seq_init(&hstat, tuplecid_data); + while ((ent = (ReorderBufferTupleCidEnt *) hash_seq_search(&hstat)) != NULL) + { + elog(DEBUG3, "mapping: node: %u/%u/%u tid: %u/%u cmin: %u, cmax: %u", + ent->key.relnode.dbNode, + ent->key.relnode.spcNode, + ent->key.relnode.relNode, + BlockIdGetBlockNumber(&ent->key.tid.ip_blkid), + ent->key.tid.ip_posid, + ent->cmin, + ent->cmax + ); + } +} +#endif + +/* + * Apply a single mapping file to tuplecid_data. + * + * The mapping file has to have been verified to be a) committed b) for our + * transaction c) applied in LSN order. + */ +static void +ApplyLogicalMappingFile(HTAB *tuplecid_data, Oid relid, const char *fname) +{ + char path[MAXPGPATH]; + int fd; + int readBytes; + LogicalRewriteMappingData map; + + sprintf(path, "pg_llog/mappings/%s", fname); + fd = OpenTransientFile(path, O_RDONLY | PG_BINARY, 0); + if (fd < 0) + ereport(ERROR, + (errmsg("could not open file \"%s\": %m", path))); + + while (true) + { + ReorderBufferTupleCidKey key; + ReorderBufferTupleCidEnt *ent; + ReorderBufferTupleCidEnt *new_ent; + bool found; + + /* be careful about padding */ + memset(&key, 0, sizeof(ReorderBufferTupleCidKey)); + + /* read all mappings till the end of the file */ + readBytes = read(fd, &map, sizeof(LogicalRewriteMappingData)); + + if (readBytes < 0) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not read file \"%s\": %m", + path))); + else if (readBytes == 0) /* EOF */ + break; + else if (readBytes != sizeof(LogicalRewriteMappingData)) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not read file \"%s\", read %d instead of %d", + path, readBytes, + (int32) sizeof(LogicalRewriteMappingData)))); + + key.relnode = map.old_node; + ItemPointerCopy(&map.old_tid, + &key.tid); + + + ent = (ReorderBufferTupleCidEnt *) + hash_search(tuplecid_data, + (void *) &key, + HASH_FIND, + NULL); + + /* no existing mapping, no need to update */ + if (!ent) + continue; + + key.relnode = map.new_node; + ItemPointerCopy(&map.new_tid, + &key.tid); + + new_ent = (ReorderBufferTupleCidEnt *) + hash_search(tuplecid_data, + (void *) &key, + HASH_ENTER, + &found); + + if (found) + { + /* + * Make sure the existing mapping makes sense. We sometime update + * old records that did not yet have a cmax (e.g. pg_class' own + * entry while rewriting it) during rewrites, so allow that. + */ + Assert(ent->cmin == InvalidCommandId || ent->cmin == new_ent->cmin); + Assert(ent->cmax == InvalidCommandId || ent->cmax == new_ent->cmax); + } + else + { + /* update mapping */ + new_ent->cmin = ent->cmin; + new_ent->cmax = ent->cmax; + new_ent->combocid = ent->combocid; + } + } +} + + +/* + * Check whether the TransactionOId 'xid' is in the pre-sorted array 'xip'. + */ +static bool +TransactionIdInArray(TransactionId xid, TransactionId *xip, Size num) +{ + return bsearch(&xid, xip, num, + sizeof(TransactionId), xidComparator) != NULL; +} + +/* + * qsort() comparator for sorting RewriteMappingFiles in LSN order. + */ +static int +file_sort_by_lsn(const void *a_p, const void *b_p) +{ + RewriteMappingFile *a = *(RewriteMappingFile **)a_p; + RewriteMappingFile *b = *(RewriteMappingFile **)b_p; + + if (a->lsn < b->lsn) + return -1; + else if (a->lsn > b->lsn) + return 1; + return 0; +} + +/* + * Apply any existing logical remapping files if there are any targeted at our + * transaction for relid. + */ +static void +UpdateLogicalMappings(HTAB *tuplecid_data, Oid relid, Snapshot snapshot) +{ + DIR *mapping_dir; + struct dirent *mapping_de; + List *files = NIL; + ListCell *file; + RewriteMappingFile **files_a; + size_t off; + Oid dboid = IsSharedRelation(relid) ? InvalidOid : MyDatabaseId; + + mapping_dir = AllocateDir("pg_llog/mappings"); + while ((mapping_de = ReadDir(mapping_dir, "pg_llog/mappings")) != NULL) + { + Oid f_dboid; + Oid f_relid; + TransactionId f_mapped_xid; + TransactionId f_create_xid; + XLogRecPtr f_lsn; + uint32 f_hi, f_lo; + RewriteMappingFile *f; + + if (strcmp(mapping_de->d_name, ".") == 0 || + strcmp(mapping_de->d_name, "..") == 0) + continue; + + /* Ignore files that aren't ours*/ + if (strncmp(mapping_de->d_name, "map-", 4) != 0) + continue; + + if (sscanf(mapping_de->d_name, LOGICAL_REWRITE_FORMAT, + &f_dboid, &f_relid, &f_hi, &f_lo, + &f_mapped_xid, &f_create_xid) != 6) + elog(ERROR, "could not parse fname %s", mapping_de->d_name); + + f_lsn = ((uint64) f_hi) << 32 | f_lo; + + /* mapping for another database */ + if (f_dboid != dboid) + continue; + + /* mapping for another relation */ + if (f_relid != relid) + continue; + + /* did the creating transaction abort? */ + if (!TransactionIdDidCommit(f_create_xid)) + continue; + + /* not for our transaction */ + if (!TransactionIdInArray(f_mapped_xid, snapshot->subxip, snapshot->subxcnt)) + continue; + + /* ok, relevant, queue for apply */ + f = palloc(sizeof(RewriteMappingFile)); + f->lsn = f_lsn; + strcpy(f->fname, mapping_de->d_name); + files = lappend(files, f); + } + FreeDir(mapping_dir); + + /* build array we can easily sort */ + files_a = palloc(list_length(files) * sizeof(RewriteMappingFile *)); + off = 0; + foreach(file, files) + { + files_a[off++] = lfirst(file); + } + + /* sort files so we apply them in LSN order */ + qsort(files_a, list_length(files), sizeof(RewriteMappingFile *), + file_sort_by_lsn); + + for(off = 0; off < list_length(files); off++) + { + RewriteMappingFile *f = files_a[off]; + elog(DEBUG1, "applying mapping: %s in %u", f->fname, + snapshot->subxip[0]); + ApplyLogicalMappingFile(tuplecid_data, relid, f->fname); + pfree(f); + } +} + +/* + * Lookup cmin/cmax of a tuple, during logical decoding where we can't rely on + * combocids. + */ +bool +ResolveCminCmaxDuringDecoding(HTAB *tuplecid_data, + Snapshot snapshot, + HeapTuple htup, Buffer buffer, + CommandId *cmin, CommandId *cmax) +{ + ReorderBufferTupleCidKey key; + ReorderBufferTupleCidEnt *ent; + ForkNumber forkno; + BlockNumber blockno; + bool updated_mapping = false; + + /* be careful about padding */ + memset(&key, 0, sizeof(key)); + + Assert(!BufferIsLocal(buffer)); + + /* + * get relfilenode from the buffer, no convenient way to access it other + * than that. + */ + BufferGetTag(buffer, &key.relnode, &forkno, &blockno); + + /* tuples can only be in the main fork */ + Assert(forkno == MAIN_FORKNUM); + Assert(blockno == ItemPointerGetBlockNumber(&htup->t_self)); + + ItemPointerCopy(&htup->t_self, + &key.tid); + +restart: + ent = (ReorderBufferTupleCidEnt *) + hash_search(tuplecid_data, + (void *) &key, + HASH_FIND, + NULL); + + /* + * failed to find a mapping, check whether the table was rewritten and + * apply mapping if so, but only do that once - there can be no new + * mappings while we are in here since we have to hold a lock on the + * relation. + */ + if (ent == NULL && !updated_mapping) + { + UpdateLogicalMappings(tuplecid_data, htup->t_tableOid, snapshot); + /* now check but don't update for a mapping again */ + updated_mapping = true; + goto restart; + } + else if (ent == NULL) + return false; + + if (cmin) + *cmin = ent->cmin; + if (cmax) + *cmax = ent->cmax; + return true; +} diff --git a/src/backend/replication/logical/snapbuild.c b/src/backend/replication/logical/snapbuild.c new file mode 100644 index 0000000000..28f9a8a1a6 --- /dev/null +++ b/src/backend/replication/logical/snapbuild.c @@ -0,0 +1,1885 @@ +/*------------------------------------------------------------------------- + * + * snapbuild.c + * + * Infrastructure for building historic catalog snapshots based on contents + * of the WAL, for the purpose of decoding heapam.c style values in the + * WAL. + * + * NOTES: + * + * We build snapshots which can *only* be used to read catalog contents and we + * do so by reading and interpreting the WAL stream. The aim is to build a + * snapshot that behaves the same as a freshly taken MVCC snapshot would have + * at the time the XLogRecord was generated. + * + * To build the snapshots we reuse the infrastructure built for Hot + * Standby. The in-memory snapshots we build look different than HS' because + * we have different needs. To successfully decode data from the WAL we only + * need to access catalog tables and (sys|rel|cat)cache, not the actual user + * tables since the data we decode is wholly contained in the WAL + * records. Also, our snapshots need to be different in comparison to normal + * MVCC ones because in contrast to those we cannot fully rely on the clog and + * pg_subtrans for information about committed transactions because they might + * commit in the future from the POV of the WAL entry we're currently + * decoding. This definition has the advantage that we only need to prevent + * removal of catalog rows, while normal table's rows can still be + * removed. This is achieved by using the replication slot mechanism. + * + * As the percentage of transactions modifying the catalog normally is fairly + * small in comparisons to ones only manipulating user data, we keep track of + * the committed catalog modifying ones inside (xmin, xmax) instead of keeping + * track of all running transactions like its done in a normal snapshot. Note + * that we're generally only looking at transactions that have acquired an + * xid. That is we keep a list of transactions between snapshot->(xmin, xmax) + * that we consider committed, everything else is considered aborted/in + * progress. That also allows us not to care about subtransactions before they + * have committed which means this modules, in contrast to HS, doesn't have to + * care about suboverflowed subtransactions and similar. + * + * One complexity of doing this is that to e.g. handle mixed DDL/DML + * transactions we need Snapshots that see intermediate versions of the + * catalog in a transaction. During normal operation this is achieved by using + * CommandIds/cmin/cmax. The problem with that however is that for space + * efficiency reasons only one value of that is stored + * (c.f. combocid.c). Since ComboCids are only available in memory we log + * additional information which allows us to get the original (cmin, cmax) + * pair during visibility checks. Check the reorderbuffer.c's comment above + * ResolveCminCmaxDuringDecoding() for details. + * + * To facilitate all this we need our own visibility routine, as the normal + * ones are optimized for different usecases. + * + * To replace the normal catalog snapshots with decoding ones use the + * SetupHistoricSnapshot() and TeardownHistoricSnapshot() functions. + * + * + * + * The snapbuild machinery is starting up in in several stages, as illustrated + * by the following graph: + * +-------------------------+ + * +----|SNAPBUILD_START |-------------+ + * | +-------------------------+ | + * | | | + * | | | + * | running_xacts with running xacts | + * | | | + * | | | + * | v | + * | +-------------------------+ v + * | |SNAPBUILD_FULL_SNAPSHOT |------------>| + * | +-------------------------+ | + * running_xacts | saved snapshot + * with zero xacts | at running_xacts's lsn + * | | | + * | all running toplevel TXNs finished | + * | | | + * | v | + * | +-------------------------+ | + * +--->|SNAPBUILD_CONSISTENT |<------------+ + * +-------------------------+ + * + * Initially the machinery is in the START stage. When a xl_running_xacts + * record is read that is sufficiently new (above the safe xmin horizon), + * there's a state transation. If there were no running xacts when the + * runnign_xacts record was generated, we'll directly go into CONSISTENT + * state, otherwise we'll switch to the FULL_SNAPSHOT state. Having a full + * snapshot means that all transactions that start henceforth can be decoded + * in their entirety, but transactions that started previously can't. In + * FULL_SNAPSHOT we'll switch into CONSISTENT once all those previously + * running transactions have committed or aborted. + * + * Only transactions that commit after CONSISTENT state has been reached will + * be replayed, even though they might have started while still in + * FULL_SNAPSHOT. That ensures that we'll reach a point where no previous + * changes has been exported, but all the following ones will be. That point + * is a convenient point to initialize replication from, which is why we + * export a snapshot at that point, which *can* be used to read normal data. + * + * Copyright (c) 2012-2014, PostgreSQL Global Development Group + * + * IDENTIFICATION + * src/backend/replication/snapbuild.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include +#include +#include + +#include "miscadmin.h" + +#include "access/heapam_xlog.h" +#include "access/transam.h" +#include "access/xact.h" + +#include "replication/logical.h" +#include "replication/reorderbuffer.h" +#include "replication/snapbuild.h" + +#include "utils/builtins.h" +#include "utils/memutils.h" +#include "utils/snapshot.h" +#include "utils/snapmgr.h" +#include "utils/tqual.h" + +#include "storage/block.h" /* debugging output */ +#include "storage/fd.h" +#include "storage/lmgr.h" +#include "storage/proc.h" +#include "storage/procarray.h" +#include "storage/standby.h" + +/* + * This struct contains the current state of the snapshot building + * machinery. Besides a forward declaration in the header, it is not exposed + * to the public, so we can easily change it's contents. + */ +struct SnapBuild +{ + /* how far are we along building our first full snapshot */ + SnapBuildState state; + + /* private memory context used to allocate memory for this module. */ + MemoryContext context; + + /* all transactions < than this have committed/aborted */ + TransactionId xmin; + + /* all transactions >= than this are uncommitted */ + TransactionId xmax; + + /* + * Don't replay commits from an LSN <= this LSN. This can be set + * externally but it will also be advanced (never retreat) from within + * snapbuild.c. + */ + XLogRecPtr transactions_after; + + /* + * Don't start decoding WAL until the "xl_running_xacts" information + * indicates there are no running xids with a xid smaller than this. + */ + TransactionId initial_xmin_horizon; + + /* + * Snapshot that's valid to see the catalog state seen at this moment. + */ + Snapshot snapshot; + + /* + * LSN of the last location we are sure a snapshot has been serialized to. + */ + XLogRecPtr last_serialized_snapshot; + + /* + * The reorderbuffer we need to update with usable snapshots et al. + */ + ReorderBuffer *reorder; + + /* + * Information about initially running transactions + * + * When we start building a snapshot there already may be transactions in + * progress. Those are stored in running.xip. We don't have enough + * information about those to decode their contents, so until they are + * finished (xcnt=0) we cannot switch to a CONSISTENT state. + */ + struct + { + /* + * As long as running.xcnt all XIDs < running.xmin and > running.xmax + * have to be checked whether they still are running. + */ + TransactionId xmin; + TransactionId xmax; + + size_t xcnt; /* number of used xip entries */ + size_t xcnt_space; /* allocated size of xip */ + TransactionId *xip; /* running xacts array, xidComparator-sorted */ + } running; + + /* + * Array of transactions which could have catalog changes that committed + * between xmin and xmax. + */ + struct + { + /* number of committed transactions */ + size_t xcnt; + + /* available space for committed transactions */ + size_t xcnt_space; + + /* + * Until we reach a CONSISTENT state, we record commits of all + * transactions, not just the catalog changing ones. Record when that + * changes so we know we cannot export a snapshot safely anymore. + */ + bool includes_all_transactions; + + /* + * Array of committed transactions that have modified the catalog. + * + * As this array is frequently modified we do *not* keep it in + * xidComparator order. Instead we sort the array when building & + * distributing a snapshot. + * + * TODO: It's unclear whether that reasoning has much merit. Every + * time we add something here after becoming consistent will also + * require distributing a snapshot. Storing them sorted would + * potentially also make it easier to purge (but more complicated wrt + * wraparound?). Should be improved if sorting while building the + * snapshot shows up in profiles. + */ + TransactionId *xip; + } committed; +}; + +/* + * Starting a transaction -- which we need to do while exporting a snapshot -- + * removes knowledge about the previously used resowner, so we save it here. + */ +ResourceOwner SavedResourceOwnerDuringExport = NULL; +bool ExportInProgress = false; + +/* transaction state manipulation functions */ +static void SnapBuildEndTxn(SnapBuild *builder, XLogRecPtr lsn, TransactionId xid); + +/* ->running manipulation */ +static bool SnapBuildTxnIsRunning(SnapBuild *builder, TransactionId xid); + +/* ->committed manipulation */ +static void SnapBuildPurgeCommittedTxn(SnapBuild *builder); + +/* snapshot building/manipulation/distribution functions */ +static Snapshot SnapBuildBuildSnapshot(SnapBuild *builder, TransactionId xid); + +static void SnapBuildFreeSnapshot(Snapshot snap); + +static void SnapBuildSnapIncRefcount(Snapshot snap); + +static void SnapBuildDistributeNewCatalogSnapshot(SnapBuild *builder, XLogRecPtr lsn); + +/* xlog reading helper functions for SnapBuildProcessRecord */ +static bool SnapBuildFindSnapshot(SnapBuild *builder, XLogRecPtr lsn, xl_running_xacts *running); + +/* serialization functions */ +static void SnapBuildSerialize(SnapBuild *builder, XLogRecPtr lsn); +static bool SnapBuildRestore(SnapBuild *builder, XLogRecPtr lsn); + + +/* + * Allocate a new snapshot builder. + * + * xmin_horizon is the xid >=which we can be sure no catalog rows have been + * removed, start_lsn is the LSN >= we want to replay commits. + */ +SnapBuild * +AllocateSnapshotBuilder(ReorderBuffer *reorder, + TransactionId xmin_horizon, + XLogRecPtr start_lsn) +{ + MemoryContext context; + MemoryContext oldcontext; + SnapBuild *builder; + + /* allocate memory in own context, to have better accountability */ + context = AllocSetContextCreate(CurrentMemoryContext, + "snapshot builder context", + ALLOCSET_DEFAULT_MINSIZE, + ALLOCSET_DEFAULT_INITSIZE, + ALLOCSET_DEFAULT_MAXSIZE); + oldcontext = MemoryContextSwitchTo(context); + + builder = palloc0(sizeof(SnapBuild)); + + builder->state = SNAPBUILD_START; + builder->context = context; + builder->reorder = reorder; + /* Other struct members initialized by zeroing via palloc0 above */ + + builder->committed.xcnt = 0; + builder->committed.xcnt_space = 128; /* arbitrary number */ + builder->committed.xip = + palloc0(builder->committed.xcnt_space * sizeof(TransactionId)); + builder->committed.includes_all_transactions = true; + builder->committed.xip = + palloc0(builder->committed.xcnt_space * sizeof(TransactionId)); + builder->initial_xmin_horizon = xmin_horizon; + builder->transactions_after = start_lsn; + + MemoryContextSwitchTo(oldcontext); + + return builder; +} + +/* + * Free a snapshot builder. + */ +void +FreeSnapshotBuilder(SnapBuild *builder) +{ + MemoryContext context = builder->context; + + /* free snapshot explicitly, that contains some error checking */ + if (builder->snapshot != NULL) + { + SnapBuildSnapDecRefcount(builder->snapshot); + builder->snapshot = NULL; + } + + /* other resources are deallocated via memory context reset */ + MemoryContextDelete(context); +} + +/* + * Free an unreferenced snapshot that has previously been built by us. + */ +static void +SnapBuildFreeSnapshot(Snapshot snap) +{ + /* make sure we don't get passed an external snapshot */ + Assert(snap->satisfies == HeapTupleSatisfiesHistoricMVCC); + + /* make sure nobody modified our snapshot */ + Assert(snap->curcid == FirstCommandId); + Assert(!snap->suboverflowed); + Assert(!snap->takenDuringRecovery); + Assert(snap->regd_count == 1); + + /* slightly more likely, so it's checked even without c-asserts */ + if (snap->copied) + elog(ERROR, "cannot free a copied snapshot"); + + if (snap->active_count) + elog(ERROR, "cannot free an active snapshot"); + + pfree(snap); +} + +/* + * In which state of snapshot building are we? + */ +SnapBuildState +SnapBuildCurrentState(SnapBuild *builder) +{ + return builder->state; +} + +/* + * Should the contents of transaction ending at 'ptr' be decoded? + */ +bool +SnapBuildXactNeedsSkip(SnapBuild *builder, XLogRecPtr ptr) +{ + return ptr <= builder->transactions_after; +} + +/* + * Increase refcount of a snapshot. + * + * This is used when handing out a snapshot to some external resource or when + * adding a Snapshot as builder->snapshot. + */ +static void +SnapBuildSnapIncRefcount(Snapshot snap) +{ + snap->active_count++; +} + +/* + * Decrease refcount of a snapshot and free if the refcount reaches zero. + * + * Externally visible, so that external resources that have been handed an + * IncRef'ed Snapshot can adjust its refcount easily. + */ +void +SnapBuildSnapDecRefcount(Snapshot snap) +{ + /* make sure we don't get passed an external snapshot */ + Assert(snap->satisfies == HeapTupleSatisfiesHistoricMVCC); + + /* make sure nobody modified our snapshot */ + Assert(snap->curcid == FirstCommandId); + Assert(!snap->suboverflowed); + Assert(!snap->takenDuringRecovery); + + Assert(snap->regd_count == 1); + + Assert(snap->active_count); + + /* slightly more likely, so its checked even without casserts */ + if (snap->copied) + elog(ERROR, "cannot free a copied snapshot"); + + snap->active_count--; + if (!snap->active_count) + SnapBuildFreeSnapshot(snap); +} + +/* + * Build a new snapshot, based on currently committed catalog-modifying + * transactions. + * + * In-progress transactions with catalog access are *not* allowed to modify + * these snapshots; they have to copy them and fill in appropriate ->curcid + * and ->subxip/subxcnt values. + */ +static Snapshot +SnapBuildBuildSnapshot(SnapBuild *builder, TransactionId xid) +{ + Snapshot snapshot; + Size ssize; + + Assert(builder->state >= SNAPBUILD_FULL_SNAPSHOT); + + ssize = sizeof(SnapshotData) + + sizeof(TransactionId) * builder->committed.xcnt + + sizeof(TransactionId) * 1 /* toplevel xid */ ; + + snapshot = MemoryContextAllocZero(builder->context, ssize); + + snapshot->satisfies = HeapTupleSatisfiesHistoricMVCC; + + /* + * We misuse the original meaning of SnapshotData's xip and subxip fields + * to make the more fitting for our needs. + * + * In the 'xip' array we store transactions that have to be treated as + * committed. Since we will only ever look at tuples from transactions + * that have modified the catalog its more efficient to store those few + * that exist between xmin and xmax (frequently there are none). + * + * Snapshots that are used in transactions that have modified the catalog + * also use the 'subxip' array to store their toplevel xid and all the + * subtransaction xids so we can recognize when we need to treat rows as + * visible that are not in xip but still need to be visible. Subxip only + * gets filled when the transaction is copied into the context of a + * catalog modifying transaction since we otherwise share a snapshot + * between transactions. As long as a txn hasn't modified the catalog it + * doesn't need to treat any uncommitted rows as visible, so there is no + * need for those xids. + * + * Both arrays are qsort'ed so that we can use bsearch() on them. + */ + Assert(TransactionIdIsNormal(builder->xmin)); + Assert(TransactionIdIsNormal(builder->xmax)); + + snapshot->xmin = builder->xmin; + snapshot->xmax = builder->xmax; + + /* store all transactions to be treated as committed by this snapshot */ + snapshot->xip = + (TransactionId *) ((char *) snapshot + sizeof(SnapshotData)); + snapshot->xcnt = builder->committed.xcnt; + memcpy(snapshot->xip, + builder->committed.xip, + builder->committed.xcnt * sizeof(TransactionId)); + + /* sort so we can bsearch() */ + qsort(snapshot->xip, snapshot->xcnt, sizeof(TransactionId), xidComparator); + + /* + * Initially, subxip is empty, i.e. it's a snapshot to be used by + * transactions that don't modify the catalog. Will be filled by + * ReorderBufferCopySnap() if necessary. + */ + snapshot->subxcnt = 0; + snapshot->subxip = NULL; + + snapshot->suboverflowed = false; + snapshot->takenDuringRecovery = false; + snapshot->copied = false; + snapshot->curcid = FirstCommandId; + snapshot->active_count = 0; + snapshot->regd_count = 1; /* mark as registered so nobody frees it */ + + return snapshot; +} + +/* + * Export a snapshot so it can be set in another session with SET TRANSACTION + * SNAPSHOT. + * + * For that we need to start a transaction in the current backend as the + * importing side checks whether the source transaction is still open to make + * sure the xmin horizon hasn't advanced since then. + * + * After that we convert a locally built snapshot into the normal variant + * understood by HeapTupleSatisfiesMVCC et al. + */ +const char * +SnapBuildExportSnapshot(SnapBuild *builder) +{ + Snapshot snap; + char *snapname; + TransactionId xid; + TransactionId *newxip; + int newxcnt = 0; + + if (builder->state != SNAPBUILD_CONSISTENT) + elog(ERROR, "cannot export a snapshot before reaching a consistent state"); + + if (!builder->committed.includes_all_transactions) + elog(ERROR, "cannot export a snapshot, not all transactions are monitored anymore"); + + /* so we don't overwrite the existing value */ + if (TransactionIdIsValid(MyPgXact->xmin)) + elog(ERROR, "cannot export a snapshot when MyPgXact->xmin already is valid"); + + if (IsTransactionOrTransactionBlock()) + elog(ERROR, "cannot export a snapshot from within a transaction"); + + if (SavedResourceOwnerDuringExport) + elog(ERROR, "can only export one snapshot at a time"); + + SavedResourceOwnerDuringExport = CurrentResourceOwner; + ExportInProgress = true; + + StartTransactionCommand(); + + Assert(!FirstSnapshotSet); + + /* There doesn't seem to a nice API to set these */ + XactIsoLevel = XACT_REPEATABLE_READ; + XactReadOnly = true; + + snap = SnapBuildBuildSnapshot(builder, GetTopTransactionId()); + + /* + * We know that snap->xmin is alive, enforced by the logical xmin + * mechanism. Due to that we can do this without locks, we're only + * changing our own value. + */ + MyPgXact->xmin = snap->xmin; + + /* allocate in transaction context */ + newxip = (TransactionId *) + palloc(sizeof(TransactionId) * GetMaxSnapshotXidCount()); + + /* + * snapbuild.c builds transactions in an "inverted" manner, which means it + * stores committed transactions in ->xip, not ones in progress. Build a + * classical snapshot by marking all non-committed transactions as + * in-progress. This can be expensive. + */ + for (xid = snap->xmin; NormalTransactionIdPrecedes(xid, snap->xmax);) + { + void *test; + + /* + * Check whether transaction committed using the decoding snapshot + * meaning of ->xip. + */ + test = bsearch(&xid, snap->xip, snap->xcnt, + sizeof(TransactionId), xidComparator); + + if (test == NULL) + { + if (newxcnt >= GetMaxSnapshotXidCount()) + elog(ERROR, "snapshot too large"); + + newxip[newxcnt++] = xid; + } + + TransactionIdAdvance(xid); + } + + snap->xcnt = newxcnt; + snap->xip = newxip; + + /* + * now that we've built a plain snapshot, use the normal mechanisms for + * exporting it + */ + snapname = ExportSnapshot(snap); + + ereport(LOG, + (errmsg("exported logical decoding snapshot: \"%s\" with %u xids", + snapname, snap->xcnt))); + return snapname; +} + +/* + * Reset a previously SnapBuildExportSnapshot()'ed snapshot if there is + * any. Aborts the previously started transaction and resets the resource + * owner back to it's original value. + */ +void +SnapBuildClearExportedSnapshot() +{ + /* nothing exported, thats the usual case */ + if (!ExportInProgress) + return; + + if (!IsTransactionState()) + elog(ERROR, "clearing exported snapshot in wrong transaction state"); + + /* make sure nothing could have ever happened */ + AbortCurrentTransaction(); + + CurrentResourceOwner = SavedResourceOwnerDuringExport; + SavedResourceOwnerDuringExport = NULL; + ExportInProgress = false; +} + +/* + * Handle the effects of a single heap change, appropriate to the current state + * of the snapshot builder and returns whether changes made at (xid, lsn) can + * be decoded. + */ +bool +SnapBuildProcessChange(SnapBuild *builder, TransactionId xid, XLogRecPtr lsn) +{ + bool is_old_tx; + + /* + * We can't handle data in transactions if we haven't built a snapshot + * yet, so don't store them. + */ + if (builder->state < SNAPBUILD_FULL_SNAPSHOT) + return false; + + /* + * No point in keeping track of changes in transactions that we don't have + * enough information about to decode. This means that they started before + * we got into the SNAPBUILD_FULL_SNAPSHOT state. + */ + if (builder->state < SNAPBUILD_CONSISTENT && + SnapBuildTxnIsRunning(builder, xid)) + return false; + + /* + * If the reorderbuffer doesn't yet have a snapshot, add one now, it will + * be needed to decode the change we're currently processing. + */ + is_old_tx = ReorderBufferIsXidKnown(builder->reorder, xid); + + if (!is_old_tx || !ReorderBufferXidHasBaseSnapshot(builder->reorder, xid)) + { + /* only build a new snapshot if we don't have a prebuilt one */ + if (builder->snapshot == NULL) + { + builder->snapshot = SnapBuildBuildSnapshot(builder, xid); + /* inrease refcount for the snapshot builder */ + SnapBuildSnapIncRefcount(builder->snapshot); + } + + /* + * Increase refcount for the transaction we're handing the snapshot + * out to. + */ + SnapBuildSnapIncRefcount(builder->snapshot); + ReorderBufferSetBaseSnapshot(builder->reorder, xid, lsn, + builder->snapshot); + } + + return true; +} + +/* + * Do CommandId/ComboCid handling after reading a xl_heap_new_cid record. This + * implies that a transaction has done some form of write to system catalogs. + */ +void +SnapBuildProcessNewCid(SnapBuild *builder, TransactionId xid, + XLogRecPtr lsn, xl_heap_new_cid *xlrec) +{ + CommandId cid; + + /* + * we only log new_cid's if a catalog tuple was modified, so mark + * the transaction as containing catalog modifications + */ + ReorderBufferXidSetCatalogChanges(builder->reorder, xid,lsn); + + ReorderBufferAddNewTupleCids(builder->reorder, xlrec->top_xid, lsn, + xlrec->target.node, xlrec->target.tid, + xlrec->cmin, xlrec->cmax, + xlrec->combocid); + + /* figure out new command id */ + if (xlrec->cmin != InvalidCommandId && + xlrec->cmax != InvalidCommandId) + cid = Max(xlrec->cmin, xlrec->cmax); + else if (xlrec->cmax != InvalidCommandId) + cid = xlrec->cmax; + else if (xlrec->cmin != InvalidCommandId) + cid = xlrec->cmin; + else + { + cid = InvalidCommandId; /* silence compiler */ + elog(ERROR, "xl_heap_new_cid record without a valid CommandId"); + } + + ReorderBufferAddNewCommandId(builder->reorder, xid, lsn, cid + 1); +} + +/* + * Check whether `xid` is currently 'running'. + * + * Running transactions in our parlance are transactions which we didn't + * observe from the start so we can't properly decode their contents. They + * only exist after we freshly started from an < CONSISTENT snapshot. + */ +static bool +SnapBuildTxnIsRunning(SnapBuild *builder, TransactionId xid) +{ + Assert(builder->state < SNAPBUILD_CONSISTENT); + Assert(TransactionIdIsNormal(builder->running.xmin)); + Assert(TransactionIdIsNormal(builder->running.xmax)); + + if (builder->running.xcnt && + NormalTransactionIdFollows(xid, builder->running.xmin) && + NormalTransactionIdPrecedes(xid, builder->running.xmax)) + { + TransactionId *search = + bsearch(&xid, builder->running.xip, builder->running.xcnt_space, + sizeof(TransactionId), xidComparator); + + if (search != NULL) + { + Assert(*search == xid); + return true; + } + } + + return false; +} + +/* + * Add a new Snapshot to all transactions we're decoding that currently are + * in-progress so they can see new catalog contents made by the transaction + * that just committed. This is necessary because those in-progress + * transactions will use the new catalog's contents from here on (at the very + * least everything they do needs to be compatible with newer catalog + * contents). + */ +static void +SnapBuildDistributeNewCatalogSnapshot(SnapBuild *builder, XLogRecPtr lsn) +{ + dlist_iter txn_i; + ReorderBufferTXN *txn; + + /* + * Iterate through all toplevel transactions. This can include + * subtransactions which we just don't yet know to be that, but that's + * fine, they will just get an unneccesary snapshot queued. + */ + dlist_foreach(txn_i, &builder->reorder->toplevel_by_lsn) + { + txn = dlist_container(ReorderBufferTXN, node, txn_i.cur); + + Assert(TransactionIdIsValid(txn->xid)); + + /* + * If we don't have a base snapshot yet, there are no changes in this + * transaction which in turn implies we don't yet need a snapshot at + * all. We'll add add a snapshot when the first change gets queued. + * + * NB: This works correctly even for subtransactions because + * ReorderBufferCommitChild() takes care to pass the parent the base + * snapshot, and while iterating the changequeue we'll get the change + * from the subtxn. + */ + if (!ReorderBufferXidHasBaseSnapshot(builder->reorder, txn->xid)) + continue; + + elog(DEBUG2, "adding a new snapshot to %u at %X/%X", + txn->xid, (uint32) (lsn >> 32), (uint32) lsn); + + /* + * increase the snapshot's refcount for the transaction we are handing + * it out to + */ + SnapBuildSnapIncRefcount(builder->snapshot); + ReorderBufferAddSnapshot(builder->reorder, txn->xid, lsn, + builder->snapshot); + } +} + +/* + * Keep track of a new catalog changing transaction that has committed. + */ +static void +SnapBuildAddCommittedTxn(SnapBuild *builder, TransactionId xid) +{ + Assert(TransactionIdIsValid(xid)); + + if (builder->committed.xcnt == builder->committed.xcnt_space) + { + builder->committed.xcnt_space = builder->committed.xcnt_space * 2 + 1; + + elog(DEBUG1, "increasing space for committed transactions to %u", + (uint32) builder->committed.xcnt_space); + + builder->committed.xip = repalloc(builder->committed.xip, + builder->committed.xcnt_space * sizeof(TransactionId)); + } + + /* + * TODO: It might make sense to keep the array sorted here instead of + * doing it every time we build a new snapshot. On the other hand this + * gets called repeatedly when a transaction with subtransactions commits. + */ + builder->committed.xip[builder->committed.xcnt++] = xid; +} + +/* + * Remove knowledge about transactions we treat as committed that are smaller + * than ->xmin. Those won't ever get checked via the ->commited array but via + * the clog machinery, so we don't need to waste memory on them. + */ +static void +SnapBuildPurgeCommittedTxn(SnapBuild *builder) +{ + int off; + TransactionId *workspace; + int surviving_xids = 0; + + /* not ready yet */ + if (!TransactionIdIsNormal(builder->xmin)) + return; + + /* TODO: Neater algorithm than just copying and iterating? */ + workspace = + MemoryContextAlloc(builder->context, + builder->committed.xcnt * sizeof(TransactionId)); + + /* copy xids that still are interesting to workspace */ + for (off = 0; off < builder->committed.xcnt; off++) + { + if (NormalTransactionIdPrecedes(builder->committed.xip[off], + builder->xmin)) + ; /* remove */ + else + workspace[surviving_xids++] = builder->committed.xip[off]; + } + + /* copy workspace back to persistent state */ + memcpy(builder->committed.xip, workspace, + surviving_xids * sizeof(TransactionId)); + + elog(DEBUG3, "purged committed transactions from %u to %u, xmin: %u, xmax: %u", + (uint32) builder->committed.xcnt, (uint32) surviving_xids, + builder->xmin, builder->xmax); + builder->committed.xcnt = surviving_xids; + + pfree(workspace); +} + +/* + * Common logic for SnapBuildAbortTxn and SnapBuildCommitTxn dealing with + * keeping track of the amount of running transactions. + */ +static void +SnapBuildEndTxn(SnapBuild *builder, XLogRecPtr lsn, TransactionId xid) +{ + if (builder->state == SNAPBUILD_CONSISTENT) + return; + + /* + * NB: This handles subtransactions correctly even if we started from + * suboverflowed xl_running_xacts because we only keep track of toplevel + * transactions. Since the latter are always are allocated before their + * subxids and since they end at the same time it's sufficient to deal + * with them here. + */ + if (SnapBuildTxnIsRunning(builder, xid)) + { + Assert(builder->running.xcnt > 0); + + if (!--builder->running.xcnt) + { + /* + * None of the originally running transaction is running anymore, + * so our incrementaly built snapshot now is consistent. + */ + ereport(LOG, + (errmsg("logical decoding found consistent point at %X/%X", + (uint32)(lsn >> 32), (uint32)lsn), + errdetail("xid %u finished, no running transactions anymore", + xid))); + builder->state = SNAPBUILD_CONSISTENT; + } + } +} + +/* + * Abort a transaction, throw away all state we kept. + */ +void +SnapBuildAbortTxn(SnapBuild *builder, XLogRecPtr lsn, + TransactionId xid, + int nsubxacts, TransactionId *subxacts) +{ + int i; + + for (i = 0; i < nsubxacts; i++) + { + TransactionId subxid = subxacts[i]; + + SnapBuildEndTxn(builder, lsn, subxid); + } + + SnapBuildEndTxn(builder, lsn, xid); +} + +/* + * Handle everything that needs to be done when a transaction commits + */ +void +SnapBuildCommitTxn(SnapBuild *builder, XLogRecPtr lsn, TransactionId xid, + int nsubxacts, TransactionId *subxacts) +{ + int nxact; + + bool forced_timetravel = false; + bool sub_needs_timetravel = false; + bool top_needs_timetravel = false; + + TransactionId xmax = xid; + + /* + * If we couldn't observe every change of a transaction because it was + * already running at the point we started to observe we have to assume it + * made catalog changes. + * + * This has the positive benefit that we afterwards have enough + * information to build an exportable snapshot that's usable by pg_dump et + * al. + */ + if (builder->state < SNAPBUILD_CONSISTENT) + { + /* ensure that only commits after this are getting replayed */ + if (builder->transactions_after < lsn) + builder->transactions_after = lsn; + + /* + * We could avoid treating !SnapBuildTxnIsRunning transactions as + * timetravel ones, but we want to be able to export a snapshot when + * we reached consistency. + */ + forced_timetravel = true; + elog(DEBUG1, "forced to assume catalog changes for xid %u because it was running to early", xid); + } + + for (nxact = 0; nxact < nsubxacts; nxact++) + { + TransactionId subxid = subxacts[nxact]; + + /* + * make sure txn is not tracked in running txn's anymore, switch state + */ + SnapBuildEndTxn(builder, lsn, subxid); + + /* + * If we're forcing timetravel we also need visibility information + * about subtransaction, so keep track of subtransaction's state. + */ + if (forced_timetravel) + { + SnapBuildAddCommittedTxn(builder, subxid); + if (NormalTransactionIdFollows(subxid, xmax)) + xmax = subxid; + } + + /* + * Add subtransaction to base snapshot if it DDL, we don't distinguish + * to toplevel transactions there. + */ + else if (ReorderBufferXidHasCatalogChanges(builder->reorder, subxid)) + { + sub_needs_timetravel = true; + + elog(DEBUG1, "found subtransaction %u:%u with catalog changes.", + xid, subxid); + + SnapBuildAddCommittedTxn(builder, subxid); + + if (NormalTransactionIdFollows(subxid, xmax)) + xmax = subxid; + } + } + + /* + * Make sure toplevel txn is not tracked in running txn's anymore, switch + * state to consistent if possible. + */ + SnapBuildEndTxn(builder, lsn, xid); + + if (forced_timetravel) + { + elog(DEBUG2, "forced transaction %u to do timetravel.", xid); + + SnapBuildAddCommittedTxn(builder, xid); + } + /* add toplevel transaction to base snapshot */ + else if (ReorderBufferXidHasCatalogChanges(builder->reorder, xid)) + { + elog(DEBUG2, "found top level transaction %u, with catalog changes!", + xid); + + top_needs_timetravel = true; + SnapBuildAddCommittedTxn(builder, xid); + } + else if (sub_needs_timetravel) + { + /* mark toplevel txn as timetravel as well */ + SnapBuildAddCommittedTxn(builder, xid); + } + + /* if there's any reason to build a historic snapshot, to so now */ + if (forced_timetravel || top_needs_timetravel || sub_needs_timetravel) + { + /* + * Adjust xmax of the snapshot builder, we only do that for committed, + * catalog modifying, transactions, everything else isn't interesting + * for us since we'll never look at the respective rows. + */ + if (!TransactionIdIsValid(builder->xmax) || + TransactionIdFollowsOrEquals(xmax, builder->xmax)) + { + builder->xmax = xmax; + TransactionIdAdvance(builder->xmax); + } + + /* + * If we haven't built a complete snapshot yet there's no need to hand + * it out, it wouldn't (and couldn't) be used anyway. + */ + if (builder->state < SNAPBUILD_FULL_SNAPSHOT) + return; + + /* + * Decrease the snapshot builder's refcount of the old snapshot, note + * that it still will be used if it has been handed out to the + * reorderbuffer earlier. + */ + if (builder->snapshot) + SnapBuildSnapDecRefcount(builder->snapshot); + + builder->snapshot = SnapBuildBuildSnapshot(builder, xid); + + /* we might need to execute invalidations, add snapshot */ + if (!ReorderBufferXidHasBaseSnapshot(builder->reorder, xid)) + { + SnapBuildSnapIncRefcount(builder->snapshot); + ReorderBufferSetBaseSnapshot(builder->reorder, xid, lsn, + builder->snapshot); + } + + /* refcount of the snapshot builder for the new snapshot */ + SnapBuildSnapIncRefcount(builder->snapshot); + + /* add a new SnapshotNow to all currently running transactions */ + SnapBuildDistributeNewCatalogSnapshot(builder, lsn); + } + else + { + /* record that we cannot export a general snapshot anymore */ + builder->committed.includes_all_transactions = false; + } +} + + +/* ----------------------------------- + * Snapshot building functions dealing with xlog records + * ----------------------------------- + */ + +/* + * Process a running xacts record, and use it's information to first build a + * historic snapshot and later to release resources that aren't needed + * anymore. + */ +void +SnapBuildProcessRunningXacts(SnapBuild *builder, XLogRecPtr lsn, xl_running_xacts *running) +{ + ReorderBufferTXN *txn; + + /* + * If we're not consistent yet, inspect the record to see whether it + * allows to get closer to being consistent. If we are consistent, dump + * our snapshot so others or we, after a restart, can use it. + */ + if (builder->state < SNAPBUILD_CONSISTENT) + { + /* returns false if there's no point in performing cleanup just yet */ + if (!SnapBuildFindSnapshot(builder, lsn, running)) + return; + } + else + SnapBuildSerialize(builder, lsn); + + /* + * Update range of interesting xids base don the running xacts + * information. We don't increase ->xmax using it, because once we are in + * a consistent state we can do that ourselves and much more efficiently + * so, because we only need to do it for catalog transactions since we + * only ever look at those. + * + * NB: Because of that xmax can be lower than xmin, because we only + * increase xmax when a catalog modifying transaction commits. While odd + * looking, its correct and actually more efficient this way since we hit + * fast paths in tqual.c. + */ + builder->xmin = running->oldestRunningXid; + + /* Remove transactions we don't need to keep track off anymore */ + SnapBuildPurgeCommittedTxn(builder); + + elog(DEBUG3, "xmin: %u, xmax: %u, oldestrunning: %u", + builder->xmin, builder->xmax, + running->oldestRunningXid); + + /* + * Inrease shared memory limits, so vacuum can work on tuples we prevented + * from being pruned till now. + */ + LogicalIncreaseXminForSlot(lsn, running->oldestRunningXid); + + /* + * Also tell the slot where we can restart decoding from. We don't want to + * do that after every commit because changing that implies an fsync of + * the logical slot's state file, so we only do it every time we see a + * running xacts record. + * + * Do so by looking for the oldest in progress transaction (determined by + * the first LSN of any of its relevant records). Every transaction + * remembers the last location we stored the snapshot to disk before its + * beginning. That point is where we can restart from. + */ + + /* + * Can't know about a serialized snapshot's location if we're not + * consistent. + */ + if (builder->state < SNAPBUILD_CONSISTENT) + return; + + txn = ReorderBufferGetOldestTXN(builder->reorder); + + /* + * oldest ongoing txn might have started when we didn't yet serialize + * anything because we hadn't reached a consistent state yet. + */ + if (txn != NULL && txn->restart_decoding_lsn != InvalidXLogRecPtr) + LogicalIncreaseRestartDecodingForSlot(lsn, txn->restart_decoding_lsn); + /* + * No in-progress transaction, can reuse the last serialized snapshot if + * we have one. + */ + else if (txn == NULL && + builder->reorder->current_restart_decoding_lsn != InvalidXLogRecPtr && + builder->last_serialized_snapshot != InvalidXLogRecPtr) + LogicalIncreaseRestartDecodingForSlot(lsn, + builder->last_serialized_snapshot); +} + + +/* + * Build the start of a snapshot that's capable of decoding the catalog. + * + * Helper function for SnapBuildProcessRunningXacts() while we're not yet + * consistent. + * + * Returns true if there is a point in performing internal maintenance/cleanup + * using the xl_running_xacts record. + */ +static bool +SnapBuildFindSnapshot(SnapBuild *builder, XLogRecPtr lsn, xl_running_xacts *running) +{ + /* --- + * Build catalog decoding snapshot incrementally using information about + * the currently running transactions. There are several ways to do that: + * + * a) There were no running transactions when the xl_running_xacts record + * was inserted, jump to CONSISTENT immediately. We might find such a + * state we were waiting for b) and c). + * + * b) Wait for all toplevel transactions that were running to end. We + * simply track the number of in-progress toplevel transactions and + * lower it whenever one commits or aborts. When that number + * (builder->running.xcnt) reaches zero, we can go from FULL_SNAPSHOT + * to CONSISTENT. + * NB: We need to search running.xip when seeing a transaction's end to + * make sure it's a toplevel transaction and it's been one of the + * intially running ones. + * Interestingly, in contrast to HS, this allows us not to care about + * subtransactions - and by extension suboverflowed xl_running_xacts - + * at all. + * + * c) This (in a previous run) or another decoding slot serialized a + * snapshot to disk that we can use. + * --- + */ + + /* + * xl_running_xact record is older than what we can use, we might not have + * all necessary catalog rows anymore. + */ + if (TransactionIdIsNormal(builder->initial_xmin_horizon) && + NormalTransactionIdPrecedes(running->oldestRunningXid, + builder->initial_xmin_horizon)) + { + ereport(DEBUG1, + (errmsg("skipping snapshot at %X/%X while building logical decoding snapshot, xmin horizon too low", + (uint32) (lsn >> 32), (uint32) lsn), + errdetail("initial xmin horizon of %u vs the snapshot's %u", + builder->initial_xmin_horizon, running->oldestRunningXid))); + return true; + } + + /* + * a) No transaction were running, we can jump to consistent. + * + * NB: We might have already started to incrementally assemble a snapshot, + * so we need to be careful to deal with that. + */ + if (running->xcnt == 0) + { + if (builder->transactions_after == InvalidXLogRecPtr || + builder->transactions_after < lsn) + builder->transactions_after = lsn; + + builder->xmin = running->oldestRunningXid; + builder->xmax = running->latestCompletedXid; + TransactionIdAdvance(builder->xmax); + + Assert(TransactionIdIsNormal(builder->xmin)); + Assert(TransactionIdIsNormal(builder->xmax)); + + /* no transactions running now */ + builder->running.xcnt = 0; + builder->running.xmin = InvalidTransactionId; + builder->running.xmax = InvalidTransactionId; + + builder->state = SNAPBUILD_CONSISTENT; + + ereport(LOG, + (errmsg("logical decoding found consistent point at %X/%X", + (uint32)(lsn >> 32), (uint32)lsn), + errdetail("running xacts with xcnt == 0"))); + + return false; + } + /* c) valid on disk state */ + else if (SnapBuildRestore(builder, lsn)) + { + /* there won't be any state to cleanup */ + return false; + } + /* + * b) first encounter of a useable xl_running_xacts record. If we had + * found one earlier we would either track running transactions + * (i.e. builder->running.xcnt != 0) or be consistent (this function + * wouldn't get called). + */ + else if (!builder->running.xcnt) + { + int off; + + /* + * We only care about toplevel xids as those are the ones we + * definitely see in the wal stream. As snapbuild.c tracks committed + * instead of running transactions we don't need to know anything + * about uncommitted subtransactions. + */ + builder->xmin = running->oldestRunningXid; + builder->xmax = running->latestCompletedXid; + TransactionIdAdvance(builder->xmax); + + /* so we can safely use the faster comparisons */ + Assert(TransactionIdIsNormal(builder->xmin)); + Assert(TransactionIdIsNormal(builder->xmax)); + + builder->running.xcnt = running->xcnt; + builder->running.xcnt_space = running->xcnt; + builder->running.xip = + MemoryContextAlloc(builder->context, + builder->running.xcnt * sizeof(TransactionId)); + memcpy(builder->running.xip, running->xids, + builder->running.xcnt * sizeof(TransactionId)); + + /* sort so we can do a binary search */ + qsort(builder->running.xip, builder->running.xcnt, + sizeof(TransactionId), xidComparator); + + builder->running.xmin = builder->running.xip[0]; + builder->running.xmax = builder->running.xip[running->xcnt - 1]; + + /* makes comparisons cheaper later */ + TransactionIdRetreat(builder->running.xmin); + TransactionIdAdvance(builder->running.xmax); + + builder->state = SNAPBUILD_FULL_SNAPSHOT; + + ereport(LOG, + (errmsg("logical decoding found initial starting point at %X/%X", + (uint32)(lsn >> 32), (uint32)lsn), + errdetail("%u xacts need to finish", (uint32) builder->running.xcnt))); + + /* + * Iterate through all xids, wait for them to finish. + * + * This isn't required for the correctness of decoding, but to allow + * isolationtester to notice that we're currently waiting for + * something. + */ + for(off = 0; off < builder->running.xcnt; off++) + { + TransactionId xid = builder->running.xip[off]; + + /* + * Upper layers should prevent that we ever need to wait on + * ourselves. Check anyway, since failing to do so would either + * result in an endless wait or an Assert() failure. + */ + if (TransactionIdIsCurrentTransactionId(xid)) + elog(ERROR, "waiting for ourselves"); + + XactLockTableWait(xid); + } + + /* nothing could have built up so far, so don't perform cleanup */ + return false; + } + + /* + * We already started to track running xacts and need to wait for all + * in-progress ones to finish. We fall through to the normal processing of + * records so incremental cleanup can be performed. + */ + return true; +} + + +/* ----------------------------------- + * Snapshot serialization support + * ----------------------------------- + */ + +/* + * We store current state of struct SnapBuild on disk in the following manner: + * + * struct SnapBuildOnDisk; + * TransactionId * running.xcnt_space; + * TransactionId * committed.xcnt; (*not xcnt_space*) + * + */ +typedef struct SnapBuildOnDisk +{ + /* first part of this struct needs to be version independent */ + + /* data not covered by checksum */ + uint32 magic; + pg_crc32 checksum; + + /* data covered by checksum */ + + /* version, in case we want to support pg_upgrade */ + uint32 version; + /* how large is the on disk data, excluding the constant sized part */ + uint32 length; + + /* version dependent part */ + SnapBuild builder; + + /* variable amount of TransactionIds follows */ +} SnapBuildOnDisk; + +#define SnapBuildOnDiskConstantSize \ + offsetof(SnapBuildOnDisk, builder) +#define SnapBuildOnDiskNotChecksummedSize \ + offsetof(SnapBuildOnDisk, version) + +#define SNAPBUILD_MAGIC 0x51A1E001 +#define SNAPBUILD_VERSION 1 + +/* + * Store/Load a snapshot from disk, depending on the snapshot builder's state. + * + * Supposed to be used by external (i.e. not snapbuild.c) code that just reada + * record that's a potential location for a serialized snapshot. + */ +void +SnapBuildSerializationPoint(SnapBuild *builder, XLogRecPtr lsn) +{ + if (builder->state < SNAPBUILD_CONSISTENT) + SnapBuildRestore(builder, lsn); + else + SnapBuildSerialize(builder, lsn); +} + +/* + * Serialize the snapshot 'builder' at the location 'lsn' if it hasn't already + * been done by another decoding process. + */ +static void +SnapBuildSerialize(SnapBuild *builder, XLogRecPtr lsn) +{ + Size needed_length; + SnapBuildOnDisk *ondisk; + char *ondisk_c; + int fd; + char tmppath[MAXPGPATH]; + char path[MAXPGPATH]; + int ret; + struct stat stat_buf; + uint32 sz; + + Assert(lsn != InvalidXLogRecPtr); + Assert(builder->last_serialized_snapshot == InvalidXLogRecPtr || + builder->last_serialized_snapshot <= lsn); + + /* + * no point in serializing if we cannot continue to work immediately after + * restoring the snapshot + */ + if (builder->state < SNAPBUILD_CONSISTENT) + return; + + /* + * We identify snapshots by the LSN they are valid for. We don't need to + * include timelines in the name as each LSN maps to exactly one timeline + * unless the user used pg_resetxlog or similar. If a user did so, there's + * no hope continuing to decode anyway. + */ + sprintf(path, "pg_llog/snapshots/%X-%X.snap", + (uint32) (lsn >> 32), (uint32) lsn); + + /* + * first check whether some other backend already has written the snapshot + * for this LSN. It's perfectly fine if there's none, so we accept ENOENT + * as a valid state. Everything else is an unexpected error. + */ + ret = stat(path, &stat_buf); + + if (ret != 0 && errno != ENOENT) + ereport(ERROR, + (errmsg("could not stat file \"%s\": %m", path))); + + else if (ret == 0) + { + /* + * somebody else has already serialized to this point, don't overwrite + * but remember location, so we don't need to read old data again. + * + * To be sure it has been synced to disk after the rename() from the + * tempfile filename to the real filename, we just repeat the + * fsync. That ought to be cheap because in most scenarios it should + * already be safely on disk. + */ + fsync_fname(path, false); + fsync_fname("pg_llog/snapshots", true); + + builder->last_serialized_snapshot = lsn; + goto out; + } + + /* + * there is an obvious race condition here between the time we stat(2) the + * file and us writing the file. But we rename the file into place + * atomically and all files created need to contain the same data anyway, + * so this is perfectly fine, although a bit of a resource waste. Locking + * seems like pointless complication. + */ + elog(DEBUG1, "serializing snapshot to %s", path); + + /* to make sure only we will write to this tempfile, include pid */ + sprintf(tmppath, "pg_llog/snapshots/%X-%X.snap.%u.tmp", + (uint32) (lsn >> 32), (uint32) lsn, MyProcPid); + + /* + * Unlink temporary file if it already exists, needs to have been before a + * crash/error since we won't enter this function twice from within a + * single decoding slot/backend and the temporary file contains the pid of + * the current process. + */ + if (unlink(tmppath) != 0 && errno != ENOENT) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not unlink file \"%s\": %m", path))); + + needed_length = sizeof(SnapBuildOnDisk) + + sizeof(TransactionId) * builder->running.xcnt_space + + sizeof(TransactionId) * builder->committed.xcnt; + + ondisk_c = MemoryContextAllocZero(builder->context, needed_length); + ondisk = (SnapBuildOnDisk *) ondisk_c; + ondisk->magic = SNAPBUILD_MAGIC; + ondisk->version = SNAPBUILD_VERSION; + ondisk->length = needed_length; + INIT_CRC32(ondisk->checksum); + COMP_CRC32(ondisk->checksum, + ((char *) ondisk) + SnapBuildOnDiskNotChecksummedSize, + SnapBuildOnDiskConstantSize - SnapBuildOnDiskNotChecksummedSize); + ondisk_c += sizeof(SnapBuildOnDisk); + + memcpy(&ondisk->builder, builder, sizeof(SnapBuild)); + /* NULL-ify memory-only data */ + ondisk->builder.context = NULL; + ondisk->builder.snapshot = NULL; + ondisk->builder.reorder = NULL; + ondisk->builder.running.xip = NULL; + ondisk->builder.committed.xip = NULL; + + COMP_CRC32(ondisk->checksum, + &ondisk->builder, + sizeof(SnapBuild)); + + /* copy running xacts */ + sz = sizeof(TransactionId) * builder->running.xcnt_space; + memcpy(ondisk_c, builder->running.xip, sz); + COMP_CRC32(ondisk->checksum, ondisk_c, sz); + ondisk_c += sz; + + /* copy committed xacts */ + sz = sizeof(TransactionId) * builder->committed.xcnt; + memcpy(ondisk_c, builder->committed.xip, sz); + COMP_CRC32(ondisk->checksum, ondisk_c, sz); + ondisk_c += sz; + + /* we have valid data now, open tempfile and write it there */ + fd = OpenTransientFile(tmppath, + O_CREAT | O_EXCL | O_WRONLY | PG_BINARY, + S_IRUSR | S_IWUSR); + if (fd < 0) + ereport(ERROR, + (errmsg("could not open file \"%s\": %m", path))); + + if ((write(fd, ondisk, needed_length)) != needed_length) + { + CloseTransientFile(fd); + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not write to file \"%s\": %m", tmppath))); + } + + /* + * fsync the file before renaming so that even if we crash after this we + * have either a fully valid file or nothing. + * + * TODO: Do the fsync() via checkpoints/restartpoints, doing it here has + * some noticeable overhead since it's performed synchronously during + * decoding? + */ + if (pg_fsync(fd) != 0) + { + CloseTransientFile(fd); + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not fsync file \"%s\": %m", tmppath))); + } + CloseTransientFile(fd); + + fsync_fname("pg_llog/snapshots", true); + + /* + * We may overwrite the work from some other backend, but that's ok, our + * snapshot is valid as well, we'll just have done some superflous work. + */ + if (rename(tmppath, path) != 0) + { + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not rename file \"%s\" to \"%s\": %m", + tmppath, path))); + } + + /* make sure we persist */ + fsync_fname(path, false); + fsync_fname("pg_llog/snapshots", true); + + /* + * Now there's no way we can loose the dumped state anymore, remember + * this as a serialization point. + */ + builder->last_serialized_snapshot = lsn; + +out: + ReorderBufferSetRestartPoint(builder->reorder, + builder->last_serialized_snapshot); +} + +/* + * Restore a snapshot into 'builder' if previously one has been stored at the + * location indicated by 'lsn'. Returns true if successful, false otherwise. + */ +static bool +SnapBuildRestore(SnapBuild *builder, XLogRecPtr lsn) +{ + SnapBuildOnDisk ondisk; + int fd; + char path[MAXPGPATH]; + Size sz; + int readBytes; + pg_crc32 checksum; + + /* no point in loading a snapshot if we're already there */ + if (builder->state == SNAPBUILD_CONSISTENT) + return false; + + sprintf(path, "pg_llog/snapshots/%X-%X.snap", + (uint32) (lsn >> 32), (uint32) lsn); + + fd = OpenTransientFile(path, O_RDONLY | PG_BINARY, 0); + + if (fd < 0 && errno == ENOENT) + return false; + else if (fd < 0) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not open file \"%s\": %m", path))); + + /* ---- + * Make sure the snapshot had been stored safely to disk, that's normally + * cheap. + * Note that we do not need PANIC here, nobody will be able to use the + * slot without fsyncing, and saving it won't suceed without an fsync() + * either... + * ---- + */ + fsync_fname(path, false); + fsync_fname("pg_llog/snapshots", true); + + + /* read statically sized portion of snapshot */ + readBytes = read(fd, &ondisk, SnapBuildOnDiskConstantSize); + if (readBytes != SnapBuildOnDiskConstantSize) + { + CloseTransientFile(fd); + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not read file \"%s\", read %d of %d: %m", + path, readBytes, (int) SnapBuildOnDiskConstantSize))); + } + + if (ondisk.magic != SNAPBUILD_MAGIC) + ereport(ERROR, + (errmsg("snapbuild state file \"%s\" has wrong magic %u instead of %u", + path, ondisk.magic, SNAPBUILD_MAGIC))); + + if (ondisk.version != SNAPBUILD_VERSION) + ereport(ERROR, + (errmsg("snapbuild state file \"%s\" has unsupported version %u instead of %u", + path, ondisk.version, SNAPBUILD_VERSION))); + + INIT_CRC32(checksum); + COMP_CRC32(checksum, + ((char *) &ondisk) + SnapBuildOnDiskNotChecksummedSize, + SnapBuildOnDiskConstantSize - SnapBuildOnDiskNotChecksummedSize); + + /* read SnapBuild */ + readBytes = read(fd, &ondisk.builder, sizeof(SnapBuild)); + if (readBytes != sizeof(SnapBuild)) + { + CloseTransientFile(fd); + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not read file \"%s\", read %d of %d: %m", + path, readBytes, (int) sizeof(SnapBuild)))); + } + COMP_CRC32(checksum, &ondisk.builder, sizeof(SnapBuild)); + + /* restore running xacts information */ + sz = sizeof(TransactionId) * ondisk.builder.running.xcnt_space; + ondisk.builder.running.xip = MemoryContextAlloc(builder->context, sz); + readBytes = read(fd, ondisk.builder.running.xip, sz); + if (readBytes != sz) + { + CloseTransientFile(fd); + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not read file \"%s\", read %d of %d: %m", + path, readBytes, (int) sz))); + } + COMP_CRC32(checksum, ondisk.builder.running.xip, sz); + + /* restore committed xacts information */ + sz = sizeof(TransactionId) * ondisk.builder.committed.xcnt; + ondisk.builder.committed.xip = MemoryContextAlloc(builder->context, sz); + readBytes = read(fd, ondisk.builder.committed.xip, sz); + if (readBytes != sz) + { + CloseTransientFile(fd); + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not read file \"%s\", read %d of %d: %m", + path, readBytes, (int) sz))); + } + COMP_CRC32(checksum, ondisk.builder.committed.xip, sz); + + CloseTransientFile(fd); + + /* verify checksum of what we've read */ + if (!EQ_CRC32(checksum, ondisk.checksum)) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("snapbuild state file %s: checksum mismatch, is %u, should be %u", + path, checksum, ondisk.checksum))); + + /* + * ok, we now have a sensible snapshot here, figure out if it has more + * information than we have. + */ + + /* + * We are only interested in consistent snapshots for now, comparing + * whether one imcomplete snapshot is more "advanced" seems to be + * unnecessarily complex. + */ + if (ondisk.builder.state < SNAPBUILD_CONSISTENT) + goto snapshot_not_interesting; + + /* + * Don't use a snapshot that requires an xmin that we cannot guarantee to + * be available. + */ + if (TransactionIdPrecedes(ondisk.builder.xmin, builder->initial_xmin_horizon)) + goto snapshot_not_interesting; + + + /* ok, we think the snapshot is sensible, copy over everything important */ + builder->xmin = ondisk.builder.xmin; + builder->xmax = ondisk.builder.xmax; + builder->state = ondisk.builder.state; + + builder->committed.xcnt = ondisk.builder.committed.xcnt; + /* We only allocated/stored xcnt, not xcnt_space xids ! */ + /* don't overwrite preallocated xip, if we don't have anything here */ + if (builder->committed.xcnt > 0) + { + pfree(builder->committed.xip); + builder->committed.xcnt_space = ondisk.builder.committed.xcnt; + builder->committed.xip = ondisk.builder.committed.xip; + } + ondisk.builder.committed.xip = NULL; + + builder->running.xcnt = ondisk.builder.committed.xcnt; + if (builder->running.xip) + pfree(builder->running.xip); + builder->running.xcnt_space = ondisk.builder.committed.xcnt_space; + builder->running.xip = ondisk.builder.running.xip; + + /* our snapshot is not interesting anymore, build a new one */ + if (builder->snapshot != NULL) + { + SnapBuildSnapDecRefcount(builder->snapshot); + } + builder->snapshot = SnapBuildBuildSnapshot(builder, InvalidTransactionId); + SnapBuildSnapIncRefcount(builder->snapshot); + + ReorderBufferSetRestartPoint(builder->reorder, lsn); + + Assert(builder->state == SNAPBUILD_CONSISTENT); + + ereport(LOG, + (errmsg("logical decoding found consistent point at %X/%X", + (uint32)(lsn >> 32), (uint32)lsn), + errdetail("found initial snapshot in snapbuild file"))); + return true; + +snapshot_not_interesting: + if (ondisk.builder.running.xip != NULL) + pfree(ondisk.builder.running.xip); + if (ondisk.builder.committed.xip != NULL) + pfree(ondisk.builder.committed.xip); + return false; +} + +/* + * Remove all serialized snapshots that are not required anymore because no + * slot can need them. This doesn't actually have to run during a checkpoint, + * but it's a convenient point to schedule this. + * + * NB: We run this during checkpoints even if logical decoding is disabled so + * we cleanup old slots at some point after it got disabled. + */ +void +CheckPointSnapBuild(void) +{ + XLogRecPtr cutoff; + XLogRecPtr redo; + DIR *snap_dir; + struct dirent *snap_de; + char path[MAXPGPATH]; + + /* + * We start of with a minimum of the last redo pointer. No new replication + * slot will start before that, so that's a safe upper bound for removal. + */ + redo = GetRedoRecPtr(); + + /* now check for the restart ptrs from existing slots */ + cutoff = ReplicationSlotsComputeLogicalRestartLSN(); + + /* don't start earlier than the restart lsn */ + if (redo < cutoff) + cutoff = redo; + + snap_dir = AllocateDir("pg_llog/snapshots"); + while ((snap_de = ReadDir(snap_dir, "pg_llog/snapshots")) != NULL) + { + uint32 hi; + uint32 lo; + XLogRecPtr lsn; + struct stat statbuf; + + if (strcmp(snap_de->d_name, ".") == 0 || + strcmp(snap_de->d_name, "..") == 0) + continue; + + snprintf(path, MAXPGPATH, "pg_llog/snapshots/%s", snap_de->d_name); + + if (lstat(path, &statbuf) == 0 && !S_ISREG(statbuf.st_mode)) + { + elog(DEBUG1, "only regular files expected: %s", path); + continue; + } + + /* + * temporary filenames from SnapBuildSerialize() include the LSN and + * everything but are postfixed by .$pid.tmp. We can just remove them + * the same as other files because there can be none that are currently + * being written that are older than cutoff. + * + * We just log a message if a file doesn't fit the pattern, it's + * probably some editors lock/state file or similar... + */ + if (sscanf(snap_de->d_name, "%X-%X.snap", &hi, &lo) != 2) + { + ereport(LOG, + (errmsg("could not parse filename \"%s\"", path))); + continue; + } + + lsn = ((uint64) hi) << 32 | lo; + + /* check whether we still need it */ + if (lsn < cutoff || cutoff == InvalidXLogRecPtr) + { + elog(DEBUG1, "removing snapbuild snapshot %s", path); + + /* + * It's not particularly harmful, though strange, if we can't + * remove the file here. Don't prevent the checkpoint from + * completing, that'd be cure worse than the disease. + */ + if (unlink(path) < 0) + { + ereport(LOG, + (errcode_for_file_access(), + errmsg("could not unlink file \"%s\": %m", + path))); + continue; + } + } + } + FreeDir(snap_dir); +} diff --git a/src/backend/replication/slot.c b/src/backend/replication/slot.c index 826c7f027e..45ed7e40e8 100644 --- a/src/backend/replication/slot.c +++ b/src/backend/replication/slot.c @@ -43,6 +43,7 @@ #include "miscadmin.h" #include "replication/slot.h" #include "storage/fd.h" +#include "storage/proc.h" #include "storage/procarray.h" /* @@ -82,6 +83,8 @@ ReplicationSlot *MyReplicationSlot = NULL; /* GUCs */ int max_replication_slots = 0; /* the maximum number of replication slots */ +static void ReplicationSlotDropAcquired(void); + /* internal persistency functions */ static void RestoreSlotFromDisk(const char *name); static void CreateSlotOnDisk(ReplicationSlot *slot); @@ -190,11 +193,12 @@ ReplicationSlotValidateName(const char *name, int elevel) * Create a new replication slot and mark it as used by this backend. * * name: Name of the slot - * db_specific: changeset extraction is db specific, if the slot is going to + * db_specific: logical decoding is db specific; if the slot is going to * be used for that pass true, otherwise false. */ void -ReplicationSlotCreate(const char *name, bool db_specific) +ReplicationSlotCreate(const char *name, bool db_specific, + ReplicationSlotPersistency persistency) { ReplicationSlot *slot = NULL; int i; @@ -246,6 +250,7 @@ ReplicationSlotCreate(const char *name, bool db_specific) */ Assert(!slot->in_use); Assert(!slot->active); + slot->data.persistency = persistency; slot->data.xmin = InvalidTransactionId; slot->effective_xmin = InvalidTransactionId; strncpy(NameStr(slot->data.name), name, NAMEDATALEN); @@ -348,14 +353,30 @@ ReplicationSlotRelease(void) Assert(slot != NULL && slot->active); - /* Mark slot inactive. We're not freeing it, just disconnecting. */ + if (slot->data.persistency == RS_EPHEMERAL) + { + /* + * Delete the slot. There is no !PANIC case where this is allowed to + * fail, all that may happen is an incomplete cleanup of the on-disk + * data. + */ + ReplicationSlotDropAcquired(); + } + else { + /* Mark slot inactive. We're not freeing it, just disconnecting. */ volatile ReplicationSlot *vslot = slot; SpinLockAcquire(&slot->mutex); vslot->active = false; SpinLockRelease(&slot->mutex); - MyReplicationSlot = NULL; } + + MyReplicationSlot = NULL; + + /* might not have been set when we've been a plain slot */ + LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE); + MyPgXact->vacuumFlags &= ~PROC_IN_LOGICAL_DECODING; + LWLockRelease(ProcArrayLock); } /* @@ -364,52 +385,36 @@ ReplicationSlotRelease(void) void ReplicationSlotDrop(const char *name) { - ReplicationSlot *slot = NULL; - int i; - bool active; + Assert(MyReplicationSlot == NULL); + + ReplicationSlotAcquire(name); + + ReplicationSlotDropAcquired(); +} + +/* + * Permanently drop the currently acquired replication slot which will be + * released by the point this function returns. + */ +static void +ReplicationSlotDropAcquired(void) +{ char path[MAXPGPATH]; char tmppath[MAXPGPATH]; + ReplicationSlot *slot = MyReplicationSlot; - ReplicationSlotValidateName(name, ERROR); + Assert(MyReplicationSlot != NULL); + + /* slot isn't acquired anymore */ + MyReplicationSlot = NULL; /* - * If some other backend ran this code currently with us, we might both - * try to free the same slot at the same time. Or we might try to delete - * a slot with a certain name while someone else was trying to create a - * slot with the same name. + * If some other backend ran this code concurrently with us, we might try + * to delete a slot with a certain name while someone else was trying to + * create a slot with the same name. */ LWLockAcquire(ReplicationSlotAllocationLock, LW_EXCLUSIVE); - /* Search for the named slot and mark it active if we find it. */ - LWLockAcquire(ReplicationSlotControlLock, LW_SHARED); - for (i = 0; i < max_replication_slots; i++) - { - ReplicationSlot *s = &ReplicationSlotCtl->replication_slots[i]; - - if (s->in_use && strcmp(name, NameStr(s->data.name)) == 0) - { - volatile ReplicationSlot *vslot = s; - - SpinLockAcquire(&s->mutex); - active = vslot->active; - vslot->active = true; - SpinLockRelease(&s->mutex); - slot = s; - break; - } - } - LWLockRelease(ReplicationSlotControlLock); - - /* If we did not find the slot or it was already active, error out. */ - if (slot == NULL) - ereport(ERROR, - (errcode(ERRCODE_UNDEFINED_OBJECT), - errmsg("replication slot \"%s\" does not exist", name))); - if (active) - ereport(ERROR, - (errcode(ERRCODE_OBJECT_IN_USE), - errmsg("replication slot \"%s\" is already active", name))); - /* Generate pathnames. */ sprintf(path, "pg_replslot/%s", NameStr(slot->data.name)); sprintf(tmppath, "pg_replslot/%s.tmp", NameStr(slot->data.name)); @@ -417,34 +422,40 @@ ReplicationSlotDrop(const char *name) /* * Rename the slot directory on disk, so that we'll no longer recognize * this as a valid slot. Note that if this fails, we've got to mark the - * slot inactive again before bailing out. + * slot inactive before bailing out. If we're dropping a ephemeral slot, + * we better never fail hard as the caller won't expect the slot to + * survive and this might get called during error handling. */ - if (rename(path, tmppath) != 0) + if (rename(path, tmppath) == 0) + { + /* + * We need to fsync() the directory we just renamed and its parent to + * make sure that our changes are on disk in a crash-safe fashion. If + * fsync() fails, we can't be sure whether the changes are on disk or + * not. For now, we handle that by panicking; + * StartupReplicationSlots() will try to straighten it out after + * restart. + */ + START_CRIT_SECTION(); + fsync_fname(tmppath, true); + fsync_fname("pg_replslot", true); + END_CRIT_SECTION(); + } + else { volatile ReplicationSlot *vslot = slot; + bool fail_softly = slot->data.persistency == RS_EPHEMERAL; SpinLockAcquire(&slot->mutex); vslot->active = false; SpinLockRelease(&slot->mutex); - ereport(ERROR, + ereport(fail_softly ? WARNING : ERROR, (errcode_for_file_access(), errmsg("could not rename \"%s\" to \"%s\": %m", path, tmppath))); } - /* - * We need to fsync() the directory we just renamed and its parent to make - * sure that our changes are on disk in a crash-safe fashion. If fsync() - * fails, we can't be sure whether the changes are on disk or not. For - * now, we handle that by panicking; StartupReplicationSlots() will - * try to straighten it out after restart. - */ - START_CRIT_SECTION(); - fsync_fname(tmppath, true); - fsync_fname("pg_replslot", true); - END_CRIT_SECTION(); - /* * The slot is definitely gone. Lock out concurrent scans of the array * long enough to kill it. It's OK to clear the active flag here without @@ -461,7 +472,7 @@ ReplicationSlotDrop(const char *name) * Slot is dead and doesn't prevent resource removal anymore, recompute * limits. */ - ReplicationSlotsComputeRequiredXmin(); + ReplicationSlotsComputeRequiredXmin(false); ReplicationSlotsComputeRequiredLSN(); /* @@ -518,22 +529,50 @@ ReplicationSlotMarkDirty(void) } } +/* + * Convert a slot that's marked as RS_DROP_ON_ERROR to a RS_PERSISTENT slot, + * guaranteeing it will be there after a eventual crash. + */ +void +ReplicationSlotPersist(void) +{ + ReplicationSlot *slot = MyReplicationSlot; + + Assert(slot != NULL); + Assert(slot->data.persistency != RS_PERSISTENT); + + { + volatile ReplicationSlot *vslot = slot; + + SpinLockAcquire(&slot->mutex); + vslot->data.persistency = RS_PERSISTENT; + SpinLockRelease(&slot->mutex); + } + + ReplicationSlotMarkDirty(); + ReplicationSlotSave(); +} + /* * Compute the oldest xmin across all slots and store it in the ProcArray. */ void -ReplicationSlotsComputeRequiredXmin(void) +ReplicationSlotsComputeRequiredXmin(bool already_locked) { int i; TransactionId agg_xmin = InvalidTransactionId; + TransactionId agg_catalog_xmin = InvalidTransactionId; Assert(ReplicationSlotCtl != NULL); - LWLockAcquire(ReplicationSlotControlLock, LW_SHARED); + if (!already_locked) + LWLockAcquire(ReplicationSlotControlLock, LW_SHARED); + for (i = 0; i < max_replication_slots; i++) { ReplicationSlot *s = &ReplicationSlotCtl->replication_slots[i]; TransactionId effective_xmin; + TransactionId effective_catalog_xmin; if (!s->in_use) continue; @@ -543,6 +582,7 @@ ReplicationSlotsComputeRequiredXmin(void) SpinLockAcquire(&s->mutex); effective_xmin = vslot->effective_xmin; + effective_catalog_xmin = vslot->effective_catalog_xmin; SpinLockRelease(&s->mutex); } @@ -551,10 +591,18 @@ ReplicationSlotsComputeRequiredXmin(void) (!TransactionIdIsValid(agg_xmin) || TransactionIdPrecedes(effective_xmin, agg_xmin))) agg_xmin = effective_xmin; + + /* check the catalog xmin */ + if (TransactionIdIsValid(effective_catalog_xmin) && + (!TransactionIdIsValid(agg_catalog_xmin) || + TransactionIdPrecedes(effective_catalog_xmin, agg_catalog_xmin))) + agg_catalog_xmin = effective_catalog_xmin; } - LWLockRelease(ReplicationSlotControlLock); - ProcArraySetReplicationSlotXmin(agg_xmin); + if (!already_locked) + LWLockRelease(ReplicationSlotControlLock); + + ProcArraySetReplicationSlotXmin(agg_xmin, agg_catalog_xmin, already_locked); } /* @@ -595,6 +643,110 @@ ReplicationSlotsComputeRequiredLSN(void) XLogSetReplicationSlotMinimumLSN(min_required); } +/* + * Compute the oldest WAL LSN required by *logical* decoding slots.. + * + * Returns InvalidXLogRecPtr if logical decoding is disabled or no logicals + * slots exist. + * + * NB: this returns a value >= ReplicationSlotsComputeRequiredLSN(), since it + * ignores physical replication slots. + * + * The results aren't required frequently, so we don't maintain a precomputed + * value like we do for ComputeRequiredLSN() and ComputeRequiredXmin(). + */ +XLogRecPtr +ReplicationSlotsComputeLogicalRestartLSN(void) +{ + XLogRecPtr result = InvalidXLogRecPtr; + int i; + + if (max_replication_slots <= 0) + return InvalidXLogRecPtr; + + LWLockAcquire(ReplicationSlotControlLock, LW_SHARED); + + for (i = 0; i < max_replication_slots; i++) + { + volatile ReplicationSlot *s; + XLogRecPtr restart_lsn; + + s = &ReplicationSlotCtl->replication_slots[i]; + + /* cannot change while ReplicationSlotCtlLock is held */ + if (!s->in_use) + continue; + + /* we're only interested in logical slots */ + if (s->data.database == InvalidOid) + continue; + + /* read once, it's ok if it increases while we're checking */ + SpinLockAcquire(&s->mutex); + restart_lsn = s->data.restart_lsn; + SpinLockRelease(&s->mutex); + + if (result == InvalidXLogRecPtr || + restart_lsn < result) + result = restart_lsn; + } + + LWLockRelease(ReplicationSlotControlLock); + + return result; +} + +/* + * ReplicationSlotsCountDBSlots -- count the number of slots that refer to the + * passed database oid. + * + * Returns true if there are any slots referencing the database. *nslots will + * be set to the absolute number of slots in the database, *nactive to ones + * currently active. + */ +bool +ReplicationSlotsCountDBSlots(Oid dboid, int *nslots, int *nactive) +{ + int i; + + *nslots = *nactive = 0; + + if (max_replication_slots <= 0) + return false; + + LWLockAcquire(ReplicationSlotControlLock, LW_SHARED); + for (i = 0; i < max_replication_slots; i++) + { + volatile ReplicationSlot *s; + + s = &ReplicationSlotCtl->replication_slots[i]; + + /* cannot change while ReplicationSlotCtlLock is held */ + if (!s->in_use) + continue; + + /* not database specific, skip */ + if (s->data.database == InvalidOid) + + /* not our database, skip */ + if (s->data.database != dboid) + continue; + + /* count slots with spinlock held */ + SpinLockAcquire(&s->mutex); + (*nslots)++; + if (s->active) + (*nactive)++; + SpinLockRelease(&s->mutex); + } + LWLockRelease(ReplicationSlotControlLock); + + if (*nslots > 0) + return true; + return false; +} + + /* * Check whether the server's configuration supports using replication * slots. @@ -723,7 +875,7 @@ StartupReplicationSlots(XLogRecPtr checkPointRedo) return; /* Now that we have recovered all the data, compute replication xmin */ - ReplicationSlotsComputeRequiredXmin(); + ReplicationSlotsComputeRequiredXmin(false); ReplicationSlotsComputeRequiredLSN(); } @@ -1050,8 +1202,19 @@ RestoreSlotFromDisk(const char *name) memcpy(&slot->data, &cp.slotdata, sizeof(ReplicationSlotPersistentData)); + /* Don't restore the slot if it's not parked as persistent. */ + if (slot->data.persistency != RS_PERSISTENT) + return; + /* initialize in memory state */ slot->effective_xmin = cp.slotdata.xmin; + slot->effective_catalog_xmin = cp.slotdata.catalog_xmin; + + slot->candidate_catalog_xmin = InvalidTransactionId; + slot->candidate_xmin_lsn = InvalidXLogRecPtr; + slot->candidate_restart_lsn = InvalidXLogRecPtr; + slot->candidate_restart_valid = InvalidXLogRecPtr; + slot->in_use = true; slot->active = false; diff --git a/src/backend/replication/slotfuncs.c b/src/backend/replication/slotfuncs.c index 5acd2bae19..c9416b03ee 100644 --- a/src/backend/replication/slotfuncs.c +++ b/src/backend/replication/slotfuncs.c @@ -15,13 +15,13 @@ #include "funcapi.h" #include "miscadmin.h" + #include "access/htup_details.h" +#include "replication/slot.h" +#include "replication/logical.h" +#include "replication/logicalfuncs.h" #include "utils/builtins.h" #include "utils/pg_lsn.h" -#include "replication/slot.h" - -Datum pg_create_physical_replication_slot(PG_FUNCTION_ARGS); -Datum pg_drop_replication_slot(PG_FUNCTION_ARGS); static void check_permissions(void) @@ -54,7 +54,7 @@ pg_create_physical_replication_slot(PG_FUNCTION_ARGS) elog(ERROR, "return type must be a row type"); /* acquire replication slot, this will check for conflicting names*/ - ReplicationSlotCreate(NameStr(*name), false); + ReplicationSlotCreate(NameStr(*name), false, RS_PERSISTENT); values[0] = NameGetDatum(&MyReplicationSlot->data.name); @@ -69,6 +69,68 @@ pg_create_physical_replication_slot(PG_FUNCTION_ARGS) PG_RETURN_DATUM(result); } + +/* + * SQL function for creating a new logical replication slot. + */ +Datum +pg_create_logical_replication_slot(PG_FUNCTION_ARGS) +{ + Name name = PG_GETARG_NAME(0); + Name plugin = PG_GETARG_NAME(1); + + LogicalDecodingContext *ctx = NULL; + + TupleDesc tupdesc; + HeapTuple tuple; + Datum result; + Datum values[2]; + bool nulls[2]; + + if (get_call_result_type(fcinfo, NULL, &tupdesc) != TYPEFUNC_COMPOSITE) + elog(ERROR, "return type must be a row type"); + + check_permissions(); + + CheckLogicalDecodingRequirements(); + + Assert(!MyReplicationSlot); + + /* + * Acquire a logical decoding slot, this will check for conflicting + * names. + */ + ReplicationSlotCreate(NameStr(*name), true, RS_EPHEMERAL); + + /* + * Create logical decoding context, to build the initial snapshot. + */ + ctx = CreateInitDecodingContext( + NameStr(*plugin), NIL, + logical_read_local_xlog_page, NULL, NULL); + + /* build initial snapshot, might take a while */ + DecodingContextFindStartpoint(ctx); + + values[0] = CStringGetTextDatum(NameStr(MyReplicationSlot->data.name)); + values[1] = LSNGetDatum(MyReplicationSlot->data.confirmed_flush); + + /* don't need the decoding context anymore */ + FreeDecodingContext(ctx); + + memset(nulls, 0, sizeof(nulls)); + + tuple = heap_form_tuple(tupdesc, values, nulls); + result = HeapTupleGetDatum(tuple); + + /* ok, slot is now fully created, mark it as persistent */ + ReplicationSlotPersist(); + ReplicationSlotRelease(); + + PG_RETURN_DATUM(result); +} + + /* * SQL function for dropping a replication slot. */ @@ -92,7 +154,7 @@ pg_drop_replication_slot(PG_FUNCTION_ARGS) Datum pg_get_replication_slots(PG_FUNCTION_ARGS) { -#define PG_STAT_GET_REPLICATION_SLOTS_COLS 6 +#define PG_GET_REPLICATION_SLOTS_COLS 8 ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo; TupleDesc tupdesc; Tuplestorestate *tupstore; @@ -134,15 +196,16 @@ pg_get_replication_slots(PG_FUNCTION_ARGS) for (slotno = 0; slotno < max_replication_slots; slotno++) { ReplicationSlot *slot = &ReplicationSlotCtl->replication_slots[slotno]; - Datum values[PG_STAT_GET_REPLICATION_SLOTS_COLS]; - bool nulls[PG_STAT_GET_REPLICATION_SLOTS_COLS]; + Datum values[PG_GET_REPLICATION_SLOTS_COLS]; + bool nulls[PG_GET_REPLICATION_SLOTS_COLS]; TransactionId xmin; + TransactionId catalog_xmin; XLogRecPtr restart_lsn; bool active; Oid database; NameData slot_name; - + NameData plugin; int i; SpinLockAcquire(&slot->mutex); @@ -154,9 +217,11 @@ pg_get_replication_slots(PG_FUNCTION_ARGS) else { xmin = slot->data.xmin; + catalog_xmin = slot->data.catalog_xmin; database = slot->data.database; restart_lsn = slot->data.restart_lsn; namecpy(&slot_name, &slot->data.name); + namecpy(&plugin, &slot->data.plugin); active = slot->active; } @@ -166,19 +231,34 @@ pg_get_replication_slots(PG_FUNCTION_ARGS) i = 0; values[i++] = NameGetDatum(&slot_name); + + if (database == InvalidOid) + nulls[i++] = true; + else + values[i++] = NameGetDatum(&plugin); + if (database == InvalidOid) values[i++] = CStringGetTextDatum("physical"); else values[i++] = CStringGetTextDatum("logical"); + if (database == InvalidOid) nulls[i++] = true; else values[i++] = database; + values[i++] = BoolGetDatum(active); + if (xmin != InvalidTransactionId) values[i++] = TransactionIdGetDatum(xmin); else nulls[i++] = true; + + if (catalog_xmin != InvalidTransactionId) + values[i++] = TransactionIdGetDatum(catalog_xmin); + else + nulls[i++] = true; + if (restart_lsn != InvalidTransactionId) values[i++] = LSNGetDatum(restart_lsn); else diff --git a/src/backend/replication/walreceiver.c b/src/backend/replication/walreceiver.c index e31977eee0..43db10851c 100644 --- a/src/backend/replication/walreceiver.c +++ b/src/backend/replication/walreceiver.c @@ -1147,7 +1147,7 @@ XLogWalRcvSendHSFeedback(bool immed) * everything else has been checked. */ if (hot_standby_feedback) - xmin = GetOldestXmin(true, false); + xmin = GetOldestXmin(NULL, false); else xmin = InvalidTransactionId; diff --git a/src/backend/replication/walsender.c b/src/backend/replication/walsender.c index 048367af29..5227eab414 100644 --- a/src/backend/replication/walsender.c +++ b/src/backend/replication/walsender.c @@ -55,6 +55,7 @@ #include "replication/basebackup.h" #include "replication/slot.h" #include "replication/syncrep.h" +#include "replication/slot.h" #include "replication/walreceiver.h" #include "replication/walsender.h" #include "replication/walsender_private.h" @@ -434,7 +435,7 @@ StartReplication(StartReplicationCmd *cmd) if (MyReplicationSlot->data.database != InvalidOid) ereport(ERROR, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), - (errmsg("cannot use a replication slot created for changeset extraction for streaming replication")))); + (errmsg("cannot use a logical replication slot for physical replication")))); } /* @@ -656,7 +657,9 @@ CreateReplicationSlot(CreateReplicationSlotCmd *cmd) sendTimeLineIsHistoric = false; sendTimeLine = ThisTimeLineID; - ReplicationSlotCreate(cmd->slotname, cmd->kind == REPLICATION_KIND_LOGICAL); + ReplicationSlotCreate(cmd->slotname, + cmd->kind == REPLICATION_KIND_LOGICAL, + RS_PERSISTENT); initStringInfo(&output_message); @@ -766,7 +769,7 @@ exec_replication_command(const char *cmd_string) if (cmd->kind == REPLICATION_KIND_PHYSICAL) StartReplication(cmd); else - elog(ERROR, "cannot handle changeset extraction yet"); + elog(ERROR, "cannot handle logical decoding yet"); break; } @@ -1017,7 +1020,7 @@ ProcessStandbyReplyMessage(void) if (MyReplicationSlot && flushPtr != InvalidXLogRecPtr) { if (MyReplicationSlot->data.database != InvalidOid) - elog(ERROR, "cannot handle changeset extraction yet"); + elog(ERROR, "cannot handle logical decoding yet"); else PhysicalConfirmReceivedLocation(flushPtr); } @@ -1050,7 +1053,7 @@ PhysicalReplicationSlotNewXmin(TransactionId feedbackXmin) if (changed) { ReplicationSlotMarkDirty(); - ReplicationSlotsComputeRequiredXmin(); + ReplicationSlotsComputeRequiredXmin(false); } } diff --git a/src/backend/storage/ipc/procarray.c b/src/backend/storage/ipc/procarray.c index eac418442d..3376a353a4 100644 --- a/src/backend/storage/ipc/procarray.c +++ b/src/backend/storage/ipc/procarray.c @@ -50,11 +50,13 @@ #include "access/transam.h" #include "access/xact.h" #include "access/twophase.h" +#include "catalog/catalog.h" #include "miscadmin.h" #include "storage/proc.h" #include "storage/procarray.h" #include "storage/spin.h" #include "utils/builtins.h" +#include "utils/rel.h" #include "utils/snapmgr.h" @@ -84,6 +86,8 @@ typedef struct ProcArrayStruct /* oldest xmin of any replication slot */ TransactionId replication_slot_xmin; + /* oldest catalog xmin of any replication slot */ + TransactionId replication_slot_catalog_xmin; /* * We declare pgprocnos[] as 1 entry because C wants a fixed-size array, @@ -1108,21 +1112,22 @@ TransactionIdIsActive(TransactionId xid) * GetOldestXmin -- returns oldest transaction that was running * when any current transaction was started. * - * If allDbs is TRUE then all backends are considered; if allDbs is FALSE - * then only backends running in my own database are considered. + * If rel is NULL or a shared relation, all backends are considered, otherwise + * only backends running in this database are considered. * * If ignoreVacuum is TRUE then backends with the PROC_IN_VACUUM flag set are * ignored. * - * This is used by VACUUM to decide which deleted tuples must be preserved - * in a table. allDbs = TRUE is needed for shared relations, but allDbs = - * FALSE is sufficient for non-shared relations, since only backends in my - * own database could ever see the tuples in them. Also, we can ignore - * concurrently running lazy VACUUMs because (a) they must be working on other - * tables, and (b) they don't need to do snapshot-based lookups. + * This is used by VACUUM to decide which deleted tuples must be preserved in + * the passed in table. For shared relations backends in all databases must be + * considered, but for non-shared relations that's not required, since only + * backends in my own database could ever see the tuples in them. Also, we can + * ignore concurrently running lazy VACUUMs because (a) they must be working + * on other tables, and (b) they don't need to do snapshot-based lookups. * - * This is also used to determine where to truncate pg_subtrans. allDbs - * must be TRUE for that case, and ignoreVacuum FALSE. + * This is also used to determine where to truncate pg_subtrans. For that + * backends in all databases have to be considered, so rel = NULL has to be + * passed in. * * Note: we include all currently running xids in the set of considered xids. * This ensures that if a just-started xact has not yet set its snapshot, @@ -1133,7 +1138,7 @@ TransactionIdIsActive(TransactionId xid) * backwards on repeated calls. The calculated value is conservative, so that * anything older is definitely not considered as running by anyone anymore, * but the exact value calculated depends on a number of things. For example, - * if allDbs is FALSE and there are no transactions running in the current + * if rel = NULL and there are no transactions running in the current * database, GetOldestXmin() returns latestCompletedXid. If a transaction * begins after that, its xmin will include in-progress transactions in other * databases that started earlier, so another call will return a lower value. @@ -1152,12 +1157,22 @@ TransactionIdIsActive(TransactionId xid) * GetOldestXmin() move backwards, with no consequences for data integrity. */ TransactionId -GetOldestXmin(bool allDbs, bool ignoreVacuum) +GetOldestXmin(Relation rel, bool ignoreVacuum) { ProcArrayStruct *arrayP = procArray; TransactionId result; int index; + bool allDbs; + volatile TransactionId replication_slot_xmin = InvalidTransactionId; + volatile TransactionId replication_slot_catalog_xmin = InvalidTransactionId; + + /* + * If we're not computing a relation specific limit, or if a shared + * relation has been passed in, backends in all databases have to be + * considered. + */ + allDbs = rel == NULL || rel->rd_rel->relisshared; /* Cannot look for individual databases during recovery */ Assert(allDbs || !RecoveryInProgress()); @@ -1180,6 +1195,13 @@ GetOldestXmin(bool allDbs, bool ignoreVacuum) volatile PGPROC *proc = &allProcs[pgprocno]; volatile PGXACT *pgxact = &allPgXact[pgprocno]; + /* + * Backend is doing logical decoding which manages xmin separately, + * check below. + */ + if (pgxact->vacuumFlags & PROC_IN_LOGICAL_DECODING) + continue; + if (ignoreVacuum && (pgxact->vacuumFlags & PROC_IN_VACUUM)) continue; @@ -1211,6 +1233,7 @@ GetOldestXmin(bool allDbs, bool ignoreVacuum) /* fetch into volatile var while ProcArrayLock is held */ replication_slot_xmin = procArray->replication_slot_xmin; + replication_slot_catalog_xmin = procArray->replication_slot_catalog_xmin; if (RecoveryInProgress()) { @@ -1259,6 +1282,18 @@ GetOldestXmin(bool allDbs, bool ignoreVacuum) NormalTransactionIdPrecedes(replication_slot_xmin, result)) result = replication_slot_xmin; + /* + * After locks have been released and defer_cleanup_age has been applied, + * check whether we need to back up further to make logical decoding + * possible. We need to do so if we're computing the global limit (rel = + * NULL) or if the passed relation is a catalog relation of some kind. + */ + if ((rel == NULL || + RelationIsAccessibleInLogicalDecoding(rel)) && + TransactionIdIsValid(replication_slot_catalog_xmin) && + NormalTransactionIdPrecedes(replication_slot_catalog_xmin, result)) + result = replication_slot_catalog_xmin; + return result; } @@ -1313,6 +1348,8 @@ GetMaxSnapshotSubxidCount(void) * RecentGlobalXmin: the global xmin (oldest TransactionXmin across all * running transactions, except those running LAZY VACUUM). This is * the same computation done by GetOldestXmin(true, true). + * RecentGlobalDataXmin: the global xmin for non-catalog tables + * >= RecentGlobalXmin * * Note: this function should probably not be called with an argument that's * not statically allocated (see xip allocation below). @@ -1329,6 +1366,7 @@ GetSnapshotData(Snapshot snapshot) int subcount = 0; bool suboverflowed = false; volatile TransactionId replication_slot_xmin = InvalidTransactionId; + volatile TransactionId replication_slot_catalog_xmin = InvalidTransactionId; Assert(snapshot != NULL); @@ -1397,6 +1435,13 @@ GetSnapshotData(Snapshot snapshot) volatile PGXACT *pgxact = &allPgXact[pgprocno]; TransactionId xid; + /* + * Backend is doing logical decoding which manages xmin + * separately, check below. + */ + if (pgxact->vacuumFlags & PROC_IN_LOGICAL_DECODING) + continue; + /* Ignore procs running LAZY VACUUM */ if (pgxact->vacuumFlags & PROC_IN_VACUUM) continue; @@ -1509,6 +1554,7 @@ GetSnapshotData(Snapshot snapshot) /* fetch into volatile var while ProcArrayLock is held */ replication_slot_xmin = procArray->replication_slot_xmin; + replication_slot_catalog_xmin = procArray->replication_slot_catalog_xmin; if (!TransactionIdIsValid(MyPgXact->xmin)) MyPgXact->xmin = TransactionXmin = xmin; @@ -1533,6 +1579,17 @@ GetSnapshotData(Snapshot snapshot) NormalTransactionIdPrecedes(replication_slot_xmin, RecentGlobalXmin)) RecentGlobalXmin = replication_slot_xmin; + /* Non-catalog tables can be vacuumed if older than this xid */ + RecentGlobalDataXmin = RecentGlobalXmin; + + /* + * Check whether there's a replication slot requiring an older catalog + * xmin. + */ + if (TransactionIdIsNormal(replication_slot_catalog_xmin) && + NormalTransactionIdPrecedes(replication_slot_catalog_xmin, RecentGlobalXmin)) + RecentGlobalXmin = replication_slot_catalog_xmin; + RecentXmin = xmin; snapshot->xmin = xmin; @@ -1633,9 +1690,11 @@ ProcArrayInstallImportedXmin(TransactionId xmin, TransactionId sourcexid) * Similar to GetSnapshotData but returns more information. We include * all PGXACTs with an assigned TransactionId, even VACUUM processes. * - * We acquire XidGenLock, but the caller is responsible for releasing it. - * This ensures that no new XIDs enter the proc array until the caller has - * WAL-logged this snapshot, and releases the lock. + * We acquire XidGenLock and ProcArrayLock, but the caller is responsible for + * releasing them. Acquiring XidGenLock ensures that no new XIDs enter the proc + * array until the caller has WAL-logged this snapshot, and releases the + * lock. Acquiring ProcArrayLock ensures that no transactions commit until the + * lock is released. * * The returned data structure is statically allocated; caller should not * modify it, and must not assume it is valid past the next call. @@ -1770,6 +1829,15 @@ GetRunningTransactionData(void) } } + /* + * It's important *not* to include the limits set by slots here because + * snapbuild.c uses oldestRunningXid to manage its xmin horizon. If those + * were to be included here the initial value could never increase because + * of a circular dependency where slots only increase their limits when + * running xacts increases oldestRunningXid and running xacts only + * increases if slots do. + */ + CurrentRunningXacts->xcnt = count - subcount; CurrentRunningXacts->subxcnt = subcount; CurrentRunningXacts->subxid_overflow = suboverflowed; @@ -1777,13 +1845,12 @@ GetRunningTransactionData(void) CurrentRunningXacts->oldestRunningXid = oldestRunningXid; CurrentRunningXacts->latestCompletedXid = latestCompletedXid; - /* We don't release XidGenLock here, the caller is responsible for that */ - LWLockRelease(ProcArrayLock); - Assert(TransactionIdIsValid(CurrentRunningXacts->nextXid)); Assert(TransactionIdIsValid(CurrentRunningXacts->oldestRunningXid)); Assert(TransactionIdIsNormal(CurrentRunningXacts->latestCompletedXid)); + /* We don't release the locks here, the caller is responsible for that */ + return CurrentRunningXacts; } @@ -1852,6 +1919,92 @@ GetOldestActiveTransactionId(void) return oldestRunningXid; } +/* + * GetOldestSafeDecodingTransactionId -- lowest xid not affected by vacuum + * + * Returns the oldest xid that we can guarantee not to have been affected by + * vacuum, i.e. no rows >= that xid have been vacuumed away unless the + * transaction aborted. Note that the value can (and most of the time will) be + * much more conservative than what really has been affected by vacuum, but we + * currently don't have better data available. + * + * This is useful to initalize the cutoff xid after which a new changeset + * extraction replication slot can start decoding changes. + * + * Must be called with ProcArrayLock held either shared or exclusively, + * although most callers will want to use exclusive mode since it is expected + * that the caller will immediately use the xid to peg the xmin horizon. + */ +TransactionId +GetOldestSafeDecodingTransactionId(void) +{ + ProcArrayStruct *arrayP = procArray; + TransactionId oldestSafeXid; + int index; + bool recovery_in_progress = RecoveryInProgress(); + + Assert(LWLockHeldByMe(ProcArrayLock)); + + /* + * Acquire XidGenLock, so no transactions can acquire an xid while we're + * running. If no transaction with xid were running concurrently a new xid + * could influence the the RecentXmin et al. + * + * We initialize the computation to nextXid since that's guaranteed to be + * a safe, albeit pessimal, value. + */ + LWLockAcquire(XidGenLock, LW_SHARED); + oldestSafeXid = ShmemVariableCache->nextXid; + + /* + * If there's already a slot pegging the xmin horizon, we can start with + * that value, it's guaranteed to be safe since it's computed by this + * routine initally and has been enforced since. + */ + if (TransactionIdIsValid(procArray->replication_slot_catalog_xmin) && + TransactionIdPrecedes(procArray->replication_slot_catalog_xmin, + oldestSafeXid)) + oldestSafeXid = procArray->replication_slot_catalog_xmin; + + /* + * If we're not in recovery, we walk over the procarray and collect the + * lowest xid. Since we're called with ProcArrayLock held and have + * acquired XidGenLock, no entries can vanish concurrently, since + * PGXACT->xid is only set with XidGenLock held and only cleared with + * ProcArrayLock held. + * + * In recovery we can't lower the safe value besides what we've computed + * above, so we'll have to wait a bit longer there. We unfortunately can + * *not* use KnownAssignedXidsGetOldestXmin() since the KnownAssignedXids + * machinery can miss values and return an older value than is safe. + */ + if (!recovery_in_progress) + { + /* + * Spin over procArray collecting all min(PGXACT->xid) + */ + for (index = 0; index < arrayP->numProcs; index++) + { + int pgprocno = arrayP->pgprocnos[index]; + volatile PGXACT *pgxact = &allPgXact[pgprocno]; + TransactionId xid; + + /* Fetch xid just once - see GetNewTransactionId */ + xid = pgxact->xid; + + if (!TransactionIdIsNormal(xid)) + continue; + + if (TransactionIdPrecedes(xid, oldestSafeXid)) + oldestSafeXid = xid; + } + } + + LWLockRelease(XidGenLock); + + return oldestSafeXid; +} + /* * GetVirtualXIDsDelayingChkpt -- Get the VXIDs of transactions that are * delaying checkpoint because they have critical actions in progress. @@ -2523,10 +2676,39 @@ CountOtherDBBackends(Oid databaseId, int *nbackends, int *nprepared) * replicaton slots. */ void -ProcArraySetReplicationSlotXmin(TransactionId xmin) +ProcArraySetReplicationSlotXmin(TransactionId xmin, TransactionId catalog_xmin, + bool already_locked) { - LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE); + Assert(!already_locked || LWLockHeldByMe(ProcArrayLock)); + + if (!already_locked) + LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE); + procArray->replication_slot_xmin = xmin; + procArray->replication_slot_catalog_xmin = catalog_xmin; + + if (!already_locked) + LWLockRelease(ProcArrayLock); +} + +/* + * ProcArrayGetReplicationSlotXmin + * + * Return the current slot xmin limits. That's useful to be able to remove + * data that's older than those limits. + */ +void +ProcArrayGetReplicationSlotXmin(TransactionId *xmin, + TransactionId *catalog_xmin) +{ + LWLockAcquire(ProcArrayLock, LW_SHARED); + + if (xmin != NULL) + *xmin = procArray->replication_slot_xmin; + + if (catalog_xmin != NULL) + *catalog_xmin = procArray->replication_slot_catalog_xmin; + LWLockRelease(ProcArrayLock); } diff --git a/src/backend/storage/ipc/standby.c b/src/backend/storage/ipc/standby.c index fb5f18edfc..aa8bea5538 100644 --- a/src/backend/storage/ipc/standby.c +++ b/src/backend/storage/ipc/standby.c @@ -800,7 +800,9 @@ standby_redo(XLogRecPtr lsn, XLogRecord *record) /* * Log details of the current snapshot to WAL. This allows the snapshot state - * to be reconstructed on the standby. + * to be reconstructed on the standby and for logical decoding. + * + * This is used for Hot Standby as follows: * * We can move directly to STANDBY_SNAPSHOT_READY at startup if we * start from a shutdown checkpoint because we know nothing was running @@ -854,6 +856,12 @@ standby_redo(XLogRecPtr lsn, XLogRecord *record) * Zero xids should no longer be possible, but we may be replaying WAL * from a time when they were possible. * + * For logical decoding only the running xacts information is needed; + * there's no need to look at the locking information, but it's logged anyway, + * as there's no independent knob to just enable logical decoding. For + * details of how this is used, check snapbuild.c's introductory comment. + * + * * Returns the RecPtr of the last inserted record. */ XLogRecPtr @@ -879,8 +887,28 @@ LogStandbySnapshot(void) * record we write, because standby will open up when it sees this. */ running = GetRunningTransactionData(); + + /* + * GetRunningTransactionData() acquired ProcArrayLock, we must release + * it. For Hot Standby this can be done before inserting the WAL record + * because ProcArrayApplyRecoveryInfo() rechecks the commit status using + * the clog. For logical decoding, though, the lock can't be released + * early becuase the clog might be "in the future" from the POV of the + * historic snapshot. This would allow for situations where we're waiting + * for the end of a transaction listed in the xl_running_xacts record + * which, according to the WAL, have commit before the xl_running_xacts + * record. Fortunately this routine isn't executed frequently, and it's + * only a shared lock. + */ + if (wal_level < WAL_LEVEL_LOGICAL) + LWLockRelease(ProcArrayLock); + recptr = LogCurrentRunningXacts(running); + /* Release lock if we kept it longer ... */ + if (wal_level >= WAL_LEVEL_LOGICAL) + LWLockRelease(ProcArrayLock); + /* GetRunningTransactionData() acquired XidGenLock, we must release it */ LWLockRelease(XidGenLock); diff --git a/src/backend/storage/lmgr/proc.c b/src/backend/storage/lmgr/proc.c index fa460ca82e..f595a0747c 100644 --- a/src/backend/storage/lmgr/proc.c +++ b/src/backend/storage/lmgr/proc.c @@ -781,10 +781,6 @@ ProcKill(int code, Datum arg) /* Make sure we're out of the sync rep lists */ SyncRepCleanupAtProcExit(); - /* Make sure active replication slots are released */ - if (MyReplicationSlot != NULL) - ReplicationSlotRelease(); - #ifdef USE_ASSERT_CHECKING if (assert_enabled) { @@ -803,6 +799,10 @@ ProcKill(int code, Datum arg) */ LWLockReleaseAll(); + /* Make sure active replication slots are released */ + if (MyReplicationSlot != NULL) + ReplicationSlotRelease(); + /* * Clear MyProc first; then disown the process latch. This is so that * signal handlers won't try to clear the process latch after it's no diff --git a/src/backend/tcop/postgres.c b/src/backend/tcop/postgres.c index a230d7eda6..be961017d6 100644 --- a/src/backend/tcop/postgres.c +++ b/src/backend/tcop/postgres.c @@ -55,6 +55,7 @@ #include "pg_getopt.h" #include "postmaster/autovacuum.h" #include "postmaster/postmaster.h" +#include "replication/slot.h" #include "replication/walsender.h" #include "rewrite/rewriteHandler.h" #include "storage/bufmgr.h" @@ -3853,6 +3854,16 @@ PostgresMain(int argc, char *argv[], if (am_walsender) WalSndErrorCleanup(); + /* + * We can't release replication slots inside AbortTransaction() as we + * need to be able to start and abort transactions while having a slot + * acquired. But we never need to hold them across top level errors, + * so releasing here is fine. There's another cleanup in ProcKill() + * ensuring we'll correctly cleanup on FATAL errors as well. + */ + if (MyReplicationSlot != NULL) + ReplicationSlotRelease(); + /* * Now return to normal top-level context and clear ErrorContext for * next time. diff --git a/src/backend/utils/cache/inval.c b/src/backend/utils/cache/inval.c index 4423fe01bd..115bcac5d2 100644 --- a/src/backend/utils/cache/inval.c +++ b/src/backend/utils/cache/inval.c @@ -512,7 +512,7 @@ RegisterSnapshotInvalidation(Oid dbId, Oid relId) * Only the local caches are flushed; this does not transmit the message * to other backends. */ -static void +void LocalExecuteInvalidationMessage(SharedInvalidationMessage *msg) { if (msg->id >= 0) @@ -596,7 +596,7 @@ LocalExecuteInvalidationMessage(SharedInvalidationMessage *msg) * since that tells us we've lost some shared-inval messages and hence * don't know what needs to be invalidated. */ -static void +void InvalidateSystemCaches(void) { int i; diff --git a/src/backend/utils/cache/relcache.c b/src/backend/utils/cache/relcache.c index 2810b35eea..32313244ad 100644 --- a/src/backend/utils/cache/relcache.c +++ b/src/backend/utils/cache/relcache.c @@ -73,6 +73,7 @@ #include "utils/memutils.h" #include "utils/relmapper.h" #include "utils/resowner_private.h" +#include "utils/snapmgr.h" #include "utils/syscache.h" #include "utils/tqual.h" @@ -235,7 +236,7 @@ static void formrdesc(const char *relationName, Oid relationReltype, bool isshared, bool hasoids, int natts, const FormData_pg_attribute *attrs); -static HeapTuple ScanPgRelation(Oid targetRelId, bool indexOK); +static HeapTuple ScanPgRelation(Oid targetRelId, bool indexOK, bool force_non_historic); static Relation AllocateRelationDesc(Form_pg_class relp); static void RelationParseRelOptions(Relation relation, HeapTuple tuple); static void RelationBuildTupleDesc(Relation relation); @@ -274,12 +275,13 @@ static void unlink_initfile(const char *initfilename); * and must eventually be freed with heap_freetuple. */ static HeapTuple -ScanPgRelation(Oid targetRelId, bool indexOK) +ScanPgRelation(Oid targetRelId, bool indexOK, bool force_non_historic) { HeapTuple pg_class_tuple; Relation pg_class_desc; SysScanDesc pg_class_scan; ScanKeyData key[1]; + Snapshot snapshot; /* * If something goes wrong during backend startup, we might find ourselves @@ -305,9 +307,20 @@ ScanPgRelation(Oid targetRelId, bool indexOK) * scan by setting indexOK == false. */ pg_class_desc = heap_open(RelationRelationId, AccessShareLock); + + /* + * The caller might need a tuple that's newer than the one the historic + * snapshot; currently the only case requiring to do so is looking up the + * relfilenode of non mapped system relations during decoding. + */ + if (force_non_historic) + snapshot = GetNonHistoricCatalogSnapshot(RelationRelationId); + else + snapshot = GetCatalogSnapshot(RelationRelationId); + pg_class_scan = systable_beginscan(pg_class_desc, ClassOidIndexId, indexOK && criticalRelcachesBuilt, - NULL, + snapshot, 1, key); pg_class_tuple = systable_getnext(pg_class_scan); @@ -836,7 +849,7 @@ RelationBuildDesc(Oid targetRelId, bool insertIt) /* * find the tuple in pg_class corresponding to the given relation id */ - pg_class_tuple = ScanPgRelation(targetRelId, true); + pg_class_tuple = ScanPgRelation(targetRelId, true, false); /* * if no such tuple exists, return NULL @@ -989,8 +1002,42 @@ RelationInitPhysicalAddr(Relation relation) relation->rd_node.dbNode = InvalidOid; else relation->rd_node.dbNode = MyDatabaseId; + if (relation->rd_rel->relfilenode) + { + /* + * Even if we are using a decoding snapshot that doesn't represent + * the current state of the catalog we need to make sure the + * filenode points to the current file since the older file will + * be gone (or truncated). The new file will still contain older + * rows so lookups in them will work correctly. This wouldn't work + * correctly if rewrites were allowed to change the schema in a + * noncompatible way, but those are prevented both on catalog + * tables and on user tables declared as additional catalog + * tables. + */ + if (HistoricSnapshotActive() + && RelationIsAccessibleInLogicalDecoding(relation) + && IsTransactionState()) + { + HeapTuple phys_tuple; + Form_pg_class physrel; + + phys_tuple = ScanPgRelation(RelationGetRelid(relation), + RelationGetRelid(relation) != ClassOidIndexId, + true); + if (!HeapTupleIsValid(phys_tuple)) + elog(ERROR, "could not find pg_class entry for %u", + RelationGetRelid(relation)); + physrel = (Form_pg_class) GETSTRUCT(phys_tuple); + + relation->rd_rel->reltablespace = physrel->reltablespace; + relation->rd_rel->relfilenode = physrel->relfilenode; + heap_freetuple(phys_tuple); + } + relation->rd_node.relNode = relation->rd_rel->relfilenode; + } else { /* Consult the relation mapper */ @@ -1742,7 +1789,7 @@ RelationReloadIndexInfo(Relation relation) * for pg_class_oid_index ... */ indexOK = (RelationGetRelid(relation) != ClassOidIndexId); - pg_class_tuple = ScanPgRelation(RelationGetRelid(relation), indexOK); + pg_class_tuple = ScanPgRelation(RelationGetRelid(relation), indexOK, false); if (!HeapTupleIsValid(pg_class_tuple)) elog(ERROR, "could not find pg_class tuple for index %u", RelationGetRelid(relation)); diff --git a/src/backend/utils/time/snapmgr.c b/src/backend/utils/time/snapmgr.c index 4c0e0accc1..4146527d2f 100644 --- a/src/backend/utils/time/snapmgr.c +++ b/src/backend/utils/time/snapmgr.c @@ -19,6 +19,10 @@ * have regd_count = 1 and are counted in RegisteredSnapshots, but are not * tracked by any resource owner. * + * The same is true for historic snapshots used during logical decoding, + * their lifetime is managed separately (as they life longer as one xact.c + * transaction). + * * These arrangements let us reset MyPgXact->xmin when there are no snapshots * referenced by this transaction. (One possible improvement would be to be * able to advance Xmin when the snapshot with the earliest Xmin is no longer @@ -69,12 +73,13 @@ */ static SnapshotData CurrentSnapshotData = {HeapTupleSatisfiesMVCC}; static SnapshotData SecondarySnapshotData = {HeapTupleSatisfiesMVCC}; -static SnapshotData CatalogSnapshotData = {HeapTupleSatisfiesMVCC}; +SnapshotData CatalogSnapshotData = {HeapTupleSatisfiesMVCC}; /* Pointers to valid snapshots */ static Snapshot CurrentSnapshot = NULL; static Snapshot SecondarySnapshot = NULL; static Snapshot CatalogSnapshot = NULL; +static Snapshot HistoricSnapshot = NULL; /* * Staleness detection for CatalogSnapshot. @@ -86,13 +91,18 @@ static bool CatalogSnapshotStale = true; * for the convenience of TransactionIdIsInProgress: even in bootstrap * mode, we don't want it to say that BootstrapTransactionId is in progress. * - * RecentGlobalXmin is initialized to InvalidTransactionId, to ensure that no - * one tries to use a stale value. Readers should ensure that it has been set - * to something else before using it. + * RecentGlobalXmin and RecentGlobalDataXmin are initialized to + * InvalidTransactionId, to ensure that no one tries to use a stale + * value. Readers should ensure that it has been set to something else + * before using it. */ TransactionId TransactionXmin = FirstNormalTransactionId; TransactionId RecentXmin = FirstNormalTransactionId; TransactionId RecentGlobalXmin = InvalidTransactionId; +TransactionId RecentGlobalDataXmin = InvalidTransactionId; + +/* (table, ctid) => (cmin, cmax) mapping during timetravel */ +static HTAB *tuplecid_data = NULL; /* * Elements of the active snapshot stack. @@ -158,6 +168,18 @@ static void SnapshotResetXmin(void); Snapshot GetTransactionSnapshot(void) { + /* + * Return historic snapshot if doing logical decoding. We'll never + * need a non-historic transaction snapshot in this (sub-)transaction, so + * there's no need to be careful to set one up for later calls to + * GetTransactionSnapshot(). + */ + if (HistoricSnapshotActive()) + { + Assert(!FirstSnapshotSet); + return HistoricSnapshot; + } + /* First call in transaction? */ if (!FirstSnapshotSet) { @@ -214,6 +236,13 @@ GetTransactionSnapshot(void) Snapshot GetLatestSnapshot(void) { + /* + * So far there are no cases requiring support for GetLatestSnapshot() + * during logical decoding, but it wouldn't be hard to add if + * required. + */ + Assert(!HistoricSnapshotActive()); + /* If first call in transaction, go ahead and set the xact snapshot */ if (!FirstSnapshotSet) return GetTransactionSnapshot(); @@ -230,6 +259,26 @@ GetLatestSnapshot(void) */ Snapshot GetCatalogSnapshot(Oid relid) +{ + /* + * Return historic snapshot if we're doing logical decoding, but + * return a non-historic, snapshot if we temporarily are doing up2date + * lookups. + */ + if (HistoricSnapshotActive()) + return HistoricSnapshot; + + return GetNonHistoricCatalogSnapshot(relid); +} + +/* + * GetNonHistoricCatalogSnapshot + * Get a snapshot that is sufficiently up-to-date for scan of the system + * catalog with the specified OID, even while historic snapshots are set + * up. + */ +Snapshot +GetNonHistoricCatalogSnapshot(Oid relid) { /* * If the caller is trying to scan a relation that has no syscache, @@ -303,6 +352,7 @@ SetTransactionSnapshot(Snapshot sourcesnap, TransactionId sourcexid) Assert(RegisteredSnapshots == 0); Assert(FirstXactSnapshot == NULL); + Assert(HistoricSnapshotActive()); /* * Even though we are not going to use the snapshot it computes, we must @@ -796,7 +846,7 @@ AtEOXact_Snapshot(bool isCommit) * Returns the token (the file name) that can be used to import this * snapshot. */ -static char * +char * ExportSnapshot(Snapshot snapshot) { TransactionId topXid; @@ -1258,3 +1308,45 @@ ThereAreNoPriorRegisteredSnapshots(void) return false; } + +/* + * Setup a snapshot that replaces normal catalog snapshots that allows catalog + * access to behave just like it did at a certain point in the past. + * + * Needed for logical decoding. + */ +void +SetupHistoricSnapshot(Snapshot historic_snapshot, HTAB *tuplecids) +{ + Assert(historic_snapshot != NULL); + + /* setup the timetravel snapshot */ + HistoricSnapshot = historic_snapshot; + + /* setup (cmin, cmax) lookup hash */ + tuplecid_data = tuplecids; +} + + +/* + * Make catalog snapshots behave normally again. + */ +void +TeardownHistoricSnapshot(bool is_error) +{ + HistoricSnapshot = NULL; + tuplecid_data = NULL; +} + +bool +HistoricSnapshotActive(void) +{ + return HistoricSnapshot != NULL; +} + +HTAB * +HistoricSnapshotGetTupleCids(void) +{ + Assert(HistoricSnapshotActive()); + return tuplecid_data; +} diff --git a/src/backend/utils/time/tqual.c b/src/backend/utils/time/tqual.c index f626755257..c4732ed311 100644 --- a/src/backend/utils/time/tqual.c +++ b/src/backend/utils/time/tqual.c @@ -62,6 +62,9 @@ #include "access/xact.h" #include "storage/bufmgr.h" #include "storage/procarray.h" +#include "utils/builtins.h" +#include "utils/combocid.h" +#include "utils/snapmgr.h" #include "utils/tqual.h" @@ -73,7 +76,6 @@ SnapshotData SnapshotToastData = {HeapTupleSatisfiesToast}; /* local functions */ static bool XidInMVCCSnapshot(TransactionId xid, Snapshot snapshot); - /* * SetHintBits() * @@ -1545,3 +1547,163 @@ HeapTupleHeaderIsOnlyLocked(HeapTupleHeader tuple) */ return true; } + +/* + * check whether the transaciont id 'xid' in in the pre-sorted array 'xip'. + */ +static bool +TransactionIdInArray(TransactionId xid, TransactionId *xip, Size num) +{ + return bsearch(&xid, xip, num, + sizeof(TransactionId), xidComparator) != NULL; +} + +/* + * See the comments for HeapTupleSatisfiesMVCC for the semantics this function + * obeys. + * + * Only usable on tuples from catalog tables! + * + * We don't need to support HEAP_MOVED_(IN|OFF) for now because we only support + * reading catalog pages which couldn't have been created in an older version. + * + * We don't set any hint bits in here as it seems unlikely to be beneficial as + * those should already be set by normal access and it seems to be too + * dangerous to do so as the semantics of doing so during timetravel are more + * complicated than when dealing "only" with the present. + */ +bool +HeapTupleSatisfiesHistoricMVCC(HeapTuple htup, Snapshot snapshot, + Buffer buffer) +{ + HeapTupleHeader tuple = htup->t_data; + TransactionId xmin = HeapTupleHeaderGetXmin(tuple); + TransactionId xmax = HeapTupleHeaderGetRawXmax(tuple); + + Assert(ItemPointerIsValid(&htup->t_self)); + Assert(htup->t_tableOid != InvalidOid); + + /* inserting transaction aborted */ + if (HeapTupleHeaderXminInvalid(tuple)) + { + Assert(!TransactionIdDidCommit(xmin)); + return false; + } + /* check if its one of our txids, toplevel is also in there */ + else if (TransactionIdInArray(xmin, snapshot->subxip, snapshot->subxcnt)) + { + bool resolved; + CommandId cmin = HeapTupleHeaderGetRawCommandId(tuple); + CommandId cmax = InvalidCommandId; + + /* + * another transaction might have (tried to) delete this tuple or + * cmin/cmax was stored in a combocid. S we need to to lookup the + * actual values externally. + */ + resolved = ResolveCminCmaxDuringDecoding(HistoricSnapshotGetTupleCids(), snapshot, + htup, buffer, + &cmin, &cmax); + + if (!resolved) + elog(ERROR, "could not resolve cmin/cmax of catalog tuple"); + + Assert(cmin != InvalidCommandId); + + if (cmin >= snapshot->curcid) + return false; /* inserted after scan started */ + /* fall through */ + } + /* committed before our xmin horizon. Do a normal visibility check. */ + else if (TransactionIdPrecedes(xmin, snapshot->xmin)) + { + Assert(!(HeapTupleHeaderXminCommitted(tuple) && + !TransactionIdDidCommit(xmin))); + + /* check for hint bit first, consult clog afterwards */ + if (!HeapTupleHeaderXminCommitted(tuple) && + !TransactionIdDidCommit(xmin)) + return false; + /* fall through */ + } + /* beyond our xmax horizon, i.e. invisible */ + else if (TransactionIdFollowsOrEquals(xmin, snapshot->xmax)) + { + return false; + } + /* check if it's a committed transaction in [xmin, xmax) */ + else if(TransactionIdInArray(xmin, snapshot->xip, snapshot->xcnt)) + { + /* fall through */ + } + /* + * none of the above, i.e. between [xmin, xmax) but hasn't + * committed. I.e. invisible. + */ + else + { + return false; + } + + /* at this point we know xmin is visible, go on to check xmax */ + + /* xid invalid or aborted */ + if (tuple->t_infomask & HEAP_XMAX_INVALID) + return true; + /* locked tuples are always visible */ + else if (HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask)) + return true; + /* + * We can see multis here if we're looking at user tables or if + * somebody SELECT ... FOR SHARE/UPDATE a system table. + */ + else if (tuple->t_infomask & HEAP_XMAX_IS_MULTI) + { + xmax = HeapTupleGetUpdateXid(tuple); + } + + /* check if its one of our txids, toplevel is also in there */ + if (TransactionIdInArray(xmax, snapshot->subxip, snapshot->subxcnt)) + { + bool resolved; + CommandId cmin; + CommandId cmax = HeapTupleHeaderGetRawCommandId(tuple); + + /* Lookup actual cmin/cmax values */ + resolved = ResolveCminCmaxDuringDecoding(HistoricSnapshotGetTupleCids(), snapshot, + htup, buffer, + &cmin, &cmax); + + if (!resolved) + elog(ERROR, "could not resolve combocid to cmax"); + + Assert(cmax != InvalidCommandId); + + if (cmax >= snapshot->curcid) + return true; /* deleted after scan started */ + else + return false; /* deleted before scan started */ + } + /* below xmin horizon, normal transaction state is valid */ + else if (TransactionIdPrecedes(xmax, snapshot->xmin)) + { + Assert(!(tuple->t_infomask & HEAP_XMAX_COMMITTED && + !TransactionIdDidCommit(xmax))); + + /* check hint bit first */ + if (tuple->t_infomask & HEAP_XMAX_COMMITTED) + return false; + + /* check clog */ + return !TransactionIdDidCommit(xmax); + } + /* above xmax horizon, we cannot possibly see the deleting transaction */ + else if (TransactionIdFollowsOrEquals(xmax, snapshot->xmax)) + return true; + /* xmax is between [xmin, xmax), check known committed array */ + else if (TransactionIdInArray(xmax, snapshot->xip, snapshot->xcnt)) + return false; + /* xmax is between [xmin, xmax), but known not to have committed yet */ + else + return true; +} diff --git a/src/bin/initdb/initdb.c b/src/bin/initdb/initdb.c index 71248ee1bc..a7d5f7a153 100644 --- a/src/bin/initdb/initdb.c +++ b/src/bin/initdb/initdb.c @@ -198,7 +198,10 @@ static const char *subdirs[] = { "pg_replslot", "pg_tblspc", "pg_stat", - "pg_stat_tmp" + "pg_stat_tmp", + "pg_llog", + "pg_llog/snapshots", + "pg_llog/mappings" }; diff --git a/src/include/access/heapam.h b/src/include/access/heapam.h index bfdadc3d5b..0f802577c7 100644 --- a/src/include/access/heapam.h +++ b/src/include/access/heapam.h @@ -164,8 +164,7 @@ extern void heap_restrpos(HeapScanDesc scan); extern void heap_sync(Relation relation); /* in heap/pruneheap.c */ -extern void heap_page_prune_opt(Relation relation, Buffer buffer, - TransactionId OldestXmin); +extern void heap_page_prune_opt(Relation relation, Buffer buffer); extern int heap_page_prune(Relation relation, Buffer buffer, TransactionId OldestXmin, bool report_stats, TransactionId *latestRemovedXid); diff --git a/src/include/access/heapam_xlog.h b/src/include/access/heapam_xlog.h index d4383ab2cb..194635952c 100644 --- a/src/include/access/heapam_xlog.h +++ b/src/include/access/heapam_xlog.h @@ -48,7 +48,7 @@ * the ones above associated with RM_HEAP_ID. XLOG_HEAP_OPMASK applies to * these, too. */ -/* 0x00 is free, was XLOG_HEAP2_FREEZE */ +#define XLOG_HEAP2_REWRITE 0x00 #define XLOG_HEAP2_CLEAN 0x10 #define XLOG_HEAP2_FREEZE_PAGE 0x20 #define XLOG_HEAP2_CLEANUP_INFO 0x30 @@ -332,6 +332,17 @@ typedef struct xl_heap_new_cid xl_heaptid target; } xl_heap_new_cid; +/* logical rewrite xlog record header */ +typedef struct xl_heap_rewrite_mapping +{ + TransactionId mapped_xid; /* xid that might need to see the row */ + Oid mapped_db; /* DbOid or InvalidOid for shared rels */ + Oid mapped_rel; /* Oid of the mapped relation */ + off_t offset; /* How far have we written so far */ + uint32 num_mappings; /* Number of in-memory mappings */ + XLogRecPtr start_lsn; /* Insert LSN at begin of rewrite */ +} xl_heap_rewrite_mapping; + #define SizeOfHeapNewCid (offsetof(xl_heap_new_cid, target) + SizeOfHeapTid) extern void HeapTupleHeaderAdvanceLatestRemovedXid(HeapTupleHeader tuple, @@ -341,6 +352,7 @@ extern void heap_redo(XLogRecPtr lsn, XLogRecord *rptr); extern void heap_desc(StringInfo buf, uint8 xl_info, char *rec); extern void heap2_redo(XLogRecPtr lsn, XLogRecord *rptr); extern void heap2_desc(StringInfo buf, uint8 xl_info, char *rec); +extern void heap_xlog_logical_rewrite(XLogRecPtr lsn, XLogRecord *r); extern XLogRecPtr log_heap_cleanup_info(RelFileNode rnode, TransactionId latestRemovedXid); diff --git a/src/include/access/rewriteheap.h b/src/include/access/rewriteheap.h index d098a0b171..07df3b4f2b 100644 --- a/src/include/access/rewriteheap.h +++ b/src/include/access/rewriteheap.h @@ -14,12 +14,14 @@ #define REWRITE_HEAP_H #include "access/htup.h" +#include "storage/itemptr.h" +#include "storage/relfilenode.h" #include "utils/relcache.h" /* struct definition is private to rewriteheap.c */ typedef struct RewriteStateData *RewriteState; -extern RewriteState begin_heap_rewrite(Relation NewHeap, +extern RewriteState begin_heap_rewrite(Relation OldHeap, Relation NewHeap, TransactionId OldestXmin, TransactionId FreezeXid, MultiXactId MultiXactCutoff, bool use_wal); extern void end_heap_rewrite(RewriteState state); @@ -27,4 +29,29 @@ extern void rewrite_heap_tuple(RewriteState state, HeapTuple oldTuple, HeapTuple newTuple); extern bool rewrite_heap_dead_tuple(RewriteState state, HeapTuple oldTuple); +/* + * On-Disk data format for an individual logical rewrite mapping. + */ +typedef struct LogicalRewriteMappingData +{ + RelFileNode old_node; + RelFileNode new_node; + ItemPointerData old_tid; + ItemPointerData new_tid; +} LogicalRewriteMappingData; + +/* --- + * The filename consists out of the following, dash separated, + * components: + * 1) database oid or InvalidOid for shared relations + * 2) the oid of the relation + * 3) xid we are mapping for + * 4) upper 32bit of the LSN at which a rewrite started + * 5) lower 32bit of the LSN at which a rewrite started + * 6) xid of the xact performing the mapping + * --- + */ +#define LOGICAL_REWRITE_FORMAT "map-%x-%x-%X_%X-%x-%x" +void CheckPointLogicalRewriteHeap(void); + #endif /* REWRITE_HEAP_H */ diff --git a/src/include/access/transam.h b/src/include/access/transam.h index 8376dfd669..a9774e9f59 100644 --- a/src/include/access/transam.h +++ b/src/include/access/transam.h @@ -63,6 +63,11 @@ (AssertMacro(TransactionIdIsNormal(id1) && TransactionIdIsNormal(id2)), \ (int32) ((id1) - (id2)) < 0) +/* compare two XIDs already known to be normal; this is a macro for speed */ +#define NormalTransactionIdFollows(id1, id2) \ + (AssertMacro(TransactionIdIsNormal(id1) && TransactionIdIsNormal(id2)), \ + (int32) ((id1) - (id2)) > 0) + /* ---------- * Object ID (OID) zero is InvalidOid. * diff --git a/src/include/access/tuptoaster.h b/src/include/access/tuptoaster.h index 5adf4f2816..296d016c9f 100644 --- a/src/include/access/tuptoaster.h +++ b/src/include/access/tuptoaster.h @@ -98,9 +98,34 @@ /* Size of an EXTERNAL datum that contains a standard TOAST pointer */ #define TOAST_POINTER_SIZE (VARHDRSZ_EXTERNAL + sizeof(struct varatt_external)) -/* Size of an indirect datum that contains an indirect TOAST pointer */ +/* Size of an indirect datum that contains a standard TOAST pointer */ #define INDIRECT_POINTER_SIZE (VARHDRSZ_EXTERNAL + sizeof(struct varatt_indirect)) +/* + * Testing whether an externally-stored value is compressed now requires + * comparing extsize (the actual length of the external data) to rawsize + * (the original uncompressed datum's size). The latter includes VARHDRSZ + * overhead, the former doesn't. We never use compression unless it actually + * saves space, so we expect either equality or less-than. + */ +#define VARATT_EXTERNAL_IS_COMPRESSED(toast_pointer) \ + ((toast_pointer).va_extsize < (toast_pointer).va_rawsize - VARHDRSZ) + +/* + * Macro to fetch the possibly-unaligned contents of an EXTERNAL datum + * into a local "struct varatt_external" toast pointer. This should be + * just a memcpy, but some versions of gcc seem to produce broken code + * that assumes the datum contents are aligned. Introducing an explicit + * intermediate "varattrib_1b_e *" variable seems to fix it. + */ +#define VARATT_EXTERNAL_GET_POINTER(toast_pointer, attr) \ +do { \ + varattrib_1b_e *attre = (varattrib_1b_e *) (attr); \ + Assert(VARATT_IS_EXTERNAL(attre)); \ + Assert(VARSIZE_EXTERNAL(attre) == sizeof(toast_pointer) + VARHDRSZ_EXTERNAL); \ + memcpy(&(toast_pointer), VARDATA_EXTERNAL(attre), sizeof(toast_pointer)); \ +} while (0) + /* ---------- * toast_insert_or_update - * diff --git a/src/include/access/xlog.h b/src/include/access/xlog.h index 11ab277199..a238292b76 100644 --- a/src/include/access/xlog.h +++ b/src/include/access/xlog.h @@ -288,6 +288,7 @@ extern int XLogFileOpen(XLogSegNo segno); extern XLogRecPtr XLogSaveBufferForHint(Buffer buffer, bool buffer_std); extern void CheckXLogRemoved(XLogSegNo segno, TimeLineID tli); +extern XLogSegNo XLogGetLastRemovedSegno(void); extern void XLogSetAsyncXactLSN(XLogRecPtr record); extern void XLogSetReplicationSlotMinimumLSN(XLogRecPtr lsn); diff --git a/src/include/catalog/catversion.h b/src/include/catalog/catversion.h index 80560574bf..22dd0fc58e 100644 --- a/src/include/catalog/catversion.h +++ b/src/include/catalog/catversion.h @@ -53,6 +53,6 @@ */ /* yyyymmddN */ -#define CATALOG_VERSION_NO 201403031 +#define CATALOG_VERSION_NO 201403032 #endif diff --git a/src/include/catalog/pg_proc.h b/src/include/catalog/pg_proc.h index 7a11721ba4..c570651872 100644 --- a/src/include/catalog/pg_proc.h +++ b/src/include/catalog/pg_proc.h @@ -4804,8 +4804,18 @@ DATA(insert OID = 3779 ( pg_create_physical_replication_slot PGNSP PGUID 12 1 0 DESCR("create a physical replication slot"); DATA(insert OID = 3780 ( pg_drop_replication_slot PGNSP PGUID 12 1 0 0 0 f f f f f f v 1 0 2278 "19" _null_ _null_ _null_ _null_ pg_drop_replication_slot _null_ _null_ _null_ )); DESCR("drop a replication slot"); -DATA(insert OID = 3781 ( pg_get_replication_slots PGNSP PGUID 12 1 10 0 0 f f f f f t s 0 0 2249 "" "{19,25,26,16,28,3220}" "{o,o,o,o,o,o}" "{slot_name,slot_type,datoid,active,xmin,restart_lsn}" _null_ pg_get_replication_slots _null_ _null_ _null_ )); +DATA(insert OID = 3781 ( pg_get_replication_slots PGNSP PGUID 12 1 10 0 0 f f f f f t s 0 0 2249 "" "{19,19,25,26,16,28,28,3220}" "{o,o,o,o,o,o,o,o}" "{slot_name,plugin,slot_type,datoid,active,xmin,catalog_xmin,restart_lsn}" _null_ pg_get_replication_slots _null_ _null_ _null_ )); DESCR("information about replication slots currently in use"); +DATA(insert OID = 3786 ( pg_create_logical_replication_slot PGNSP PGUID 12 1 0 0 0 f f f f f f v 2 0 2249 "19 19" "{19,19,25,3220}" "{i,i,o,o}" "{slotname,plugin,slotname,xlog_position}" _null_ pg_create_logical_replication_slot _null_ _null_ _null_ )); +DESCR("set up a logical replication slot"); +DATA(insert OID = 3782 ( pg_logical_slot_get_changes PGNSP PGUID 12 1000 1000 25 0 f f f f f t v 4 0 2249 "19 3220 23 1009" "{19,3220,23,1009,3220,28,25}" "{i,i,i,v,o,o,o}" "{slotname,upto_lsn,upto_nchanges,options,location,xid,data}" _null_ pg_logical_slot_get_changes _null_ _null_ _null_ )); +DESCR("get changes from replication slot"); +DATA(insert OID = 3783 ( pg_logical_slot_get_binary_changes PGNSP PGUID 12 1000 1000 25 0 f f f f f t v 4 0 2249 "19 3220 23 1009" "{19,3220,23,1009,3220,28,17}" "{i,i,i,v,o,o,o}" "{slotname,upto_lsn,upto_nchanges,options,location,xid,data}" _null_ pg_logical_slot_get_binary_changes _null_ _null_ _null_ )); +DESCR("get binary changes from replication slot"); +DATA(insert OID = 3784 ( pg_logical_slot_peek_changes PGNSP PGUID 12 1000 1000 25 0 f f f f f t v 4 0 2249 "19 3220 23 1009" "{19,3220,23,1009,3220,28,25}" "{i,i,i,v,o,o,o}" "{slotname,upto_lsn,upto_nchanges,options,location,xid,data}" _null_ pg_logical_slot_peek_changes _null_ _null_ _null_ )); +DESCR("peek at changes from replication slot"); +DATA(insert OID = 3785 ( pg_logical_slot_peek_binary_changes PGNSP PGUID 12 1000 1000 25 0 f f f f f t v 4 0 2249 "19 3220 23 1009" "{19,3220,23,1009,3220,28,17}" "{i,i,i,v,o,o,o}" "{slotname,upto_lsn,upto_nchanges,options,location,xid,data}" _null_ pg_logical_slot_peek_binary_changes _null_ _null_ _null_ )); +DESCR("peek at binary changes from replication slot"); /* event triggers */ DATA(insert OID = 3566 ( pg_event_trigger_dropped_objects PGNSP PGUID 12 10 100 0 0 f f f f t t s 0 0 2249 "" "{26,26,23,25,25,25,25}" "{o,o,o,o,o,o,o}" "{classid, objid, objsubid, object_type, schema_name, object_name, object_identity}" _null_ pg_event_trigger_dropped_objects _null_ _null_ _null_ )); diff --git a/src/include/commands/vacuum.h b/src/include/commands/vacuum.h index 70350e02cb..058dc5f667 100644 --- a/src/include/commands/vacuum.h +++ b/src/include/commands/vacuum.h @@ -157,10 +157,10 @@ extern void vac_update_relstats(Relation relation, bool hasindex, TransactionId frozenxid, MultiXactId minmulti); -extern void vacuum_set_xid_limits(int freeze_min_age, int freeze_table_age, +extern void vacuum_set_xid_limits(Relation rel, + int freeze_min_age, int freeze_table_age, int multixact_freeze_min_age, int multixact_freeze_table_age, - bool sharedRel, TransactionId *oldestXmin, TransactionId *freezeLimit, TransactionId *xidFullScanLimit, diff --git a/src/include/replication/decode.h b/src/include/replication/decode.h new file mode 100644 index 0000000000..7f55d789a2 --- /dev/null +++ b/src/include/replication/decode.h @@ -0,0 +1,19 @@ +/*------------------------------------------------------------------------- + * decode.h + * PostgreSQL WAL to logical transformation + * + * Portions Copyright (c) 2012-2014, PostgreSQL Global Development Group + * + *------------------------------------------------------------------------- + */ +#ifndef DECODE_H +#define DECODE_H + +#include "access/xlogreader.h" +#include "replication/reorderbuffer.h" +#include "replication/logical.h" + +void LogicalDecodingProcessRecord(LogicalDecodingContext *ctx, + XLogRecord *record); + +#endif diff --git a/src/include/replication/logical.h b/src/include/replication/logical.h new file mode 100644 index 0000000000..e65c8b8075 --- /dev/null +++ b/src/include/replication/logical.h @@ -0,0 +1,100 @@ +/*------------------------------------------------------------------------- + * logical.h + * PostgreSQL logical decoding coordination + * + * Copyright (c) 2012-2014, PostgreSQL Global Development Group + * + *------------------------------------------------------------------------- + */ +#ifndef LOGICAL_H +#define LOGICAL_H + +#include "replication/slot.h" + +#include "access/xlog.h" +#include "access/xlogreader.h" +#include "replication/output_plugin.h" + +struct LogicalDecodingContext; + +typedef void (*LogicalOutputPluginWriterWrite) ( + struct LogicalDecodingContext *lr, + XLogRecPtr Ptr, + TransactionId xid, + bool last_write +); + +typedef LogicalOutputPluginWriterWrite LogicalOutputPluginWriterPrepareWrite; + +typedef struct LogicalDecodingContext +{ + /* memory context this is all allocated in */ + MemoryContext context; + + /* infrastructure pieces */ + XLogReaderState *reader; + ReplicationSlot *slot; + struct ReorderBuffer *reorder; + struct SnapBuild *snapshot_builder; + + OutputPluginCallbacks callbacks; + OutputPluginOptions options; + + /* + * User specified options + */ + List *output_plugin_options; + + /* + * User-Provided callback for writing/streaming out data. + */ + LogicalOutputPluginWriterPrepareWrite prepare_write; + LogicalOutputPluginWriterWrite write; + + /* + * Output buffer. + */ + StringInfo out; + + /* + * Private data pointer of the output plugin. + */ + void *output_plugin_private; + + /* + * Private data pointer for the data writer. + */ + void *output_writer_private; + + /* + * State for writing output. + */ + bool accept_writes; + bool prepared_write; + XLogRecPtr write_location; + TransactionId write_xid; +} LogicalDecodingContext; + +extern void CheckLogicalDecodingRequirements(void); + +extern LogicalDecodingContext *CreateInitDecodingContext(char *plugin, + List *output_plugin_options, + XLogPageReadCB read_page, + LogicalOutputPluginWriterPrepareWrite prepare_write, + LogicalOutputPluginWriterWrite do_write); +extern LogicalDecodingContext *CreateDecodingContext( + XLogRecPtr start_lsn, + List *output_plugin_options, + XLogPageReadCB read_page, + LogicalOutputPluginWriterPrepareWrite prepare_write, + LogicalOutputPluginWriterWrite do_write); +extern void DecodingContextFindStartpoint(LogicalDecodingContext *ctx); +extern bool DecodingContextReady(LogicalDecodingContext *ctx); +extern void FreeDecodingContext(LogicalDecodingContext *ctx); + +extern void LogicalIncreaseXminForSlot(XLogRecPtr lsn, TransactionId xmin); +extern void LogicalIncreaseRestartDecodingForSlot(XLogRecPtr current_lsn, + XLogRecPtr restart_lsn); +extern void LogicalConfirmReceivedLocation(XLogRecPtr lsn); + +#endif diff --git a/src/include/replication/logicalfuncs.h b/src/include/replication/logicalfuncs.h new file mode 100644 index 0000000000..21bf44ec4b --- /dev/null +++ b/src/include/replication/logicalfuncs.h @@ -0,0 +1,24 @@ +/*------------------------------------------------------------------------- + * logicalfuncs.h + * PostgreSQL WAL to logical transformation support functions + * + * Copyright (c) 2012-2014, PostgreSQL Global Development Group + * + *------------------------------------------------------------------------- + */ +#ifndef LOGICALFUNCS_H +#define LOGICALFUNCS_H + +#include "replication/logical.h" + +extern int logical_read_local_xlog_page(XLogReaderState *state, + XLogRecPtr targetPagePtr, + int reqLen, XLogRecPtr targetRecPtr, + char *cur_page, TimeLineID *pageTLI); + +extern Datum pg_logical_slot_get_changes(PG_FUNCTION_ARGS); +extern Datum pg_logical_slot_get_binary_changes(PG_FUNCTION_ARGS); +extern Datum pg_logical_slot_peek_changes(PG_FUNCTION_ARGS); +extern Datum pg_logical_slot_peek_binary_changes(PG_FUNCTION_ARGS); + +#endif diff --git a/src/include/replication/output_plugin.h b/src/include/replication/output_plugin.h new file mode 100644 index 0000000000..c47c24c8db --- /dev/null +++ b/src/include/replication/output_plugin.h @@ -0,0 +1,98 @@ +/*------------------------------------------------------------------------- + * output_plugin.h + * PostgreSQL Logical Decode Plugin Interface + * + * Copyright (c) 2012-2014, PostgreSQL Global Development Group + * + *------------------------------------------------------------------------- + */ +#ifndef OUTPUT_PLUGIN_H +#define OUTPUT_PLUGIN_H + +#include "replication/reorderbuffer.h" + +struct LogicalDecodingContext; +struct OutputPluginCallbacks; + +typedef enum OutputPluginOutputType +{ + OUTPUT_PLUGIN_BINARY_OUTPUT, + OUTPUT_PLUGIN_TEXTUAL_OUTPUT +} OutputPluginOutputType; + +/* + * Options set by the output plugin, in the startup callback. + */ +typedef struct OutputPluginOptions +{ + OutputPluginOutputType output_type; +} OutputPluginOptions; + +/* + * Type of the shared library symbol _PG_output_plugin_init that is looked up + * when loading an output plugin shared library. + */ +typedef void (*LogicalOutputPluginInit)(struct OutputPluginCallbacks *cb); + +/* + * Callback that gets called in a user-defined plugin. ctx->private_data can + * be set to some private data. + * + * "is_init" will be set to "true" if the decoding slot just got defined. When + * the same slot is used from there one, it will be "false". + */ +typedef void (*LogicalDecodeStartupCB) ( + struct LogicalDecodingContext *ctx, + OutputPluginOptions *options, + bool is_init +); + +/* + * Callback called for every (explicit or implicit) BEGIN of a successful + * transaction. + */ +typedef void (*LogicalDecodeBeginCB) ( + struct LogicalDecodingContext *, + ReorderBufferTXN *txn); + +/* + * Callback for every individual change in a successful transaction. + */ +typedef void (*LogicalDecodeChangeCB) ( + struct LogicalDecodingContext *, + ReorderBufferTXN *txn, + Relation relation, + ReorderBufferChange *change +); + +/* + * Called for every (explicit or implicit) COMMIT of a successful transaction. + */ +typedef void (*LogicalDecodeCommitCB) ( + struct LogicalDecodingContext *, + ReorderBufferTXN *txn, + XLogRecPtr commit_lsn); + +/* + * Called to shutdown an output plugin. + */ +typedef void (*LogicalDecodeShutdownCB) ( + struct LogicalDecodingContext * +); + +/* + * Output plugin callbacks + */ +typedef struct OutputPluginCallbacks +{ + LogicalDecodeStartupCB startup_cb; + LogicalDecodeBeginCB begin_cb; + LogicalDecodeChangeCB change_cb; + LogicalDecodeCommitCB commit_cb; + LogicalDecodeShutdownCB shutdown_cb; +} OutputPluginCallbacks; + +void OutputPluginPrepareWrite(struct LogicalDecodingContext *ctx, bool last_write); +void OutputPluginWrite(struct LogicalDecodingContext *ctx, bool last_write); + +#endif /* OUTPUT_PLUGIN_H */ diff --git a/src/include/replication/reorderbuffer.h b/src/include/replication/reorderbuffer.h new file mode 100644 index 0000000000..01eabfb7be --- /dev/null +++ b/src/include/replication/reorderbuffer.h @@ -0,0 +1,351 @@ +/* + * reorderbuffer.h + * PostgreSQL logical replay/reorder buffer management. + * + * Copyright (c) 2012-2014, PostgreSQL Global Development Group + * + * src/include/replication/reorderbuffer.h + */ +#ifndef REORDERBUFFER_H +#define REORDERBUFFER_H + +#include "access/htup_details.h" + +#include "lib/ilist.h" + +#include "storage/sinval.h" + +#include "utils/hsearch.h" +#include "utils/rel.h" +#include "utils/snapshot.h" +#include "utils/timestamp.h" + +/* an individual tuple, stored in one chunk of memory */ +typedef struct ReorderBufferTupleBuf +{ + /* position in preallocated list */ + slist_node node; + + /* tuple, stored sequentially */ + HeapTupleData tuple; + HeapTupleHeaderData header; + char data[MaxHeapTupleSize]; +} ReorderBufferTupleBuf; + +/* types of the change passed to a 'change' callback */ +enum ReorderBufferChangeType +{ + REORDER_BUFFER_CHANGE_INSERT, + REORDER_BUFFER_CHANGE_UPDATE, + REORDER_BUFFER_CHANGE_DELETE +}; + +/* + * a single 'change', can be an insert (with one tuple), an update (old, new), + * or a delete (old). + * + * The same struct is also used internally for other purposes but that should + * never be visible outside reorderbuffer.c. + */ +typedef struct ReorderBufferChange +{ + XLogRecPtr lsn; + + /* type of change */ + union + { + enum ReorderBufferChangeType action; + /* do not leak internal enum values to the outside */ + int action_internal; + }; + + /* + * Context data for the change, which part of the union is valid depends + * on action/action_internal. + */ + union + { + /* old, new tuples when action == *_INSERT|UPDATE|DELETE */ + struct + { + /* relation that has been changed */ + RelFileNode relnode; + /* valid for DELETE || UPDATE */ + ReorderBufferTupleBuf *oldtuple; + /* valid for INSERT || UPDATE */ + ReorderBufferTupleBuf *newtuple; + } tp; + + /* new snapshot */ + Snapshot snapshot; + + /* new command id for existing snapshot in a catalog changing tx */ + CommandId command_id; + + /* new cid mapping for catalog changing transaction */ + struct + { + RelFileNode node; + ItemPointerData tid; + CommandId cmin; + CommandId cmax; + CommandId combocid; + } tuplecid; + }; + + /* + * While in use this is how a change is linked into a transactions, + * otherwise it's the preallocated list. + */ + dlist_node node; +} ReorderBufferChange; + +typedef struct ReorderBufferTXN +{ + /* + * The transactions transaction id, can be a toplevel or sub xid. + */ + TransactionId xid; + + /* did the TX have catalog changes */ + bool has_catalog_changes; + + /* + * Do we know this is a subxact? + */ + bool is_known_as_subxact; + + /* + * LSN of the first data carrying, WAL record with knowledge about this + * xid. This is allowed to *not* be first record adorned with this xid, if + * the previous records aren't relevant for logical decoding. + */ + XLogRecPtr first_lsn; + + /* ---- + * LSN of the record that lead to this xact to be committed or + * aborted. This can be a + * * plain commit record + * * plain commit record, of a parent transaction + * * prepared transaction commit + * * plain abort record + * * prepared transaction abort + * * error during decoding + * ---- + */ + XLogRecPtr final_lsn; + + /* + * LSN pointing to the end of the commit record + 1. + */ + XLogRecPtr end_lsn; + + /* + * LSN of the last lsn at which snapshot information reside, so we can + * restart decoding from there and fully recover this transaction from + * WAL. + */ + XLogRecPtr restart_decoding_lsn; + + /* + * Commit time, only known when we read the actual commit record. + */ + TimestampTz commit_time; + + /* + * Base snapshot or NULL. + */ + Snapshot base_snapshot; + XLogRecPtr base_snapshot_lsn; + + /* + * How many ReorderBufferChange's do we have in this txn. + * + * Changes in subtransactions are *not* included but tracked separately. + */ + uint64 nentries; + + /* + * How many of the above entries are stored in memory in contrast to being + * spilled to disk. + */ + uint64 nentries_mem; + + /* + * List of ReorderBufferChange structs, including new Snapshots and new + * CommandIds + */ + dlist_head changes; + + /* + * List of (relation, ctid) => (cmin, cmax) mappings for catalog tuples. + * Those are always assigned to the toplevel transaction. (Keep track of + * #entries to create a hash of the right size) + */ + dlist_head tuplecids; + uint64 ntuplecids; + + /* + * On-demand built hash for looking up the above values. + */ + HTAB *tuplecid_hash; + + /* + * Hash containing (potentially partial) toast entries. NULL if no toast + * tuples have been found for the current change. + */ + HTAB *toast_hash; + + /* + * non-hierarchical list of subtransactions that are *not* aborted. Only + * used in toplevel transactions. + */ + dlist_head subtxns; + uint32 nsubtxns; + + /* + * Stored cache invalidations. This is not a linked list because we get + * all the invalidations at once. + */ + uint32 ninvalidations; + SharedInvalidationMessage *invalidations; + + /* --- + * Position in one of three lists: + * * list of subtransactions if we are *known* to be subxact + * * list of toplevel xacts (can be a as-yet unknown subxact) + * * list of preallocated ReorderBufferTXNs + * --- + */ + dlist_node node; + +} ReorderBufferTXN; + +/* so we can define the callbacks used inside struct ReorderBuffer itself */ +typedef struct ReorderBuffer ReorderBuffer; + +/* change callback signature */ +typedef void (*ReorderBufferApplyChangeCB) ( + ReorderBuffer *rb, + ReorderBufferTXN *txn, + Relation relation, + ReorderBufferChange *change); + +/* begin callback signature */ +typedef void (*ReorderBufferBeginCB) ( + ReorderBuffer *rb, + ReorderBufferTXN *txn); + +/* commit callback signature */ +typedef void (*ReorderBufferCommitCB) ( + ReorderBuffer *rb, + ReorderBufferTXN *txn, + XLogRecPtr commit_lsn); + +struct ReorderBuffer +{ + /* + * xid => ReorderBufferTXN lookup table + */ + HTAB *by_txn; + + /* + * Transactions that could be a toplevel xact, ordered by LSN of the first + * record bearing that xid.. + */ + dlist_head toplevel_by_lsn; + + /* + * one-entry sized cache for by_txn. Very frequently the same txn gets + * looked up over and over again. + */ + TransactionId by_txn_last_xid; + ReorderBufferTXN *by_txn_last_txn; + + /* + * Callacks to be called when a transactions commits. + */ + ReorderBufferBeginCB begin; + ReorderBufferApplyChangeCB apply_change; + ReorderBufferCommitCB commit; + + /* + * Pointer that will be passed untouched to the callbacks. + */ + void *private_data; + + /* + * Private memory context. + */ + MemoryContext context; + + /* + * Data structure slab cache. + * + * We allocate/deallocate some structures very frequently, to avoid bigger + * overhead we cache some unused ones here. + * + * The maximum number of cached entries is controlled by const variables + * ontop of reorderbuffer.c + */ + + /* cached ReorderBufferTXNs */ + dlist_head cached_transactions; + Size nr_cached_transactions; + + /* cached ReorderBufferChanges */ + dlist_head cached_changes; + Size nr_cached_changes; + + /* cached ReorderBufferTupleBufs */ + slist_head cached_tuplebufs; + Size nr_cached_tuplebufs; + + XLogRecPtr current_restart_decoding_lsn; + + /* buffer for disk<->memory conversions */ + char *outbuf; + Size outbufsize; +}; + + +ReorderBuffer *ReorderBufferAllocate(void); +void ReorderBufferFree(ReorderBuffer *); + +ReorderBufferTupleBuf *ReorderBufferGetTupleBuf(ReorderBuffer *); +void ReorderBufferReturnTupleBuf(ReorderBuffer *, ReorderBufferTupleBuf *tuple); +ReorderBufferChange *ReorderBufferGetChange(ReorderBuffer *); +void ReorderBufferReturnChange(ReorderBuffer *, ReorderBufferChange *); + +void ReorderBufferQueueChange(ReorderBuffer *, TransactionId, XLogRecPtr lsn, ReorderBufferChange *); +void ReorderBufferCommit(ReorderBuffer *, TransactionId, + XLogRecPtr commit_lsn, XLogRecPtr end_lsn, + TimestampTz commit_time); +void ReorderBufferAssignChild(ReorderBuffer *, TransactionId, TransactionId, XLogRecPtr commit_lsn); +void ReorderBufferCommitChild(ReorderBuffer *, TransactionId, TransactionId, + XLogRecPtr commit_lsn, XLogRecPtr end_lsn); +void ReorderBufferAbort(ReorderBuffer *, TransactionId, XLogRecPtr lsn); +void ReorderBufferAbortOld(ReorderBuffer *, TransactionId xid); +void ReorderBufferForget(ReorderBuffer *, TransactionId, XLogRecPtr lsn); + +void ReorderBufferSetBaseSnapshot(ReorderBuffer *, TransactionId, XLogRecPtr lsn, struct SnapshotData *snap); +void ReorderBufferAddSnapshot(ReorderBuffer *, TransactionId, XLogRecPtr lsn, struct SnapshotData *snap); +void ReorderBufferAddNewCommandId(ReorderBuffer *, TransactionId, XLogRecPtr lsn, + CommandId cid); +void ReorderBufferAddNewTupleCids(ReorderBuffer *, TransactionId, XLogRecPtr lsn, + RelFileNode node, ItemPointerData pt, + CommandId cmin, CommandId cmax, CommandId combocid); +void ReorderBufferAddInvalidations(ReorderBuffer *, TransactionId, XLogRecPtr lsn, + Size nmsgs, SharedInvalidationMessage *msgs); +bool ReorderBufferIsXidKnown(ReorderBuffer *, TransactionId xid); +void ReorderBufferXidSetCatalogChanges(ReorderBuffer *, TransactionId xid, XLogRecPtr lsn); +bool ReorderBufferXidHasCatalogChanges(ReorderBuffer *, TransactionId xid); +bool ReorderBufferXidHasBaseSnapshot(ReorderBuffer *, TransactionId xid); + +ReorderBufferTXN *ReorderBufferGetOldestTXN(ReorderBuffer *); + +void ReorderBufferSetRestartPoint(ReorderBuffer *, XLogRecPtr ptr); + +void StartupReorderBuffer(void); + +#endif diff --git a/src/include/replication/slot.h b/src/include/replication/slot.h index 089b0f4b70..c354c9133b 100644 --- a/src/include/replication/slot.h +++ b/src/include/replication/slot.h @@ -16,6 +16,24 @@ #include "storage/shmem.h" #include "storage/spin.h" +/* + * Behaviour of replication slots, upon release or crash. + * + * Slots marked as PERSISTENT are crashsafe and will not be dropped when + * released. Slots marked as EPHEMERAL will be dropped when released or after + * restarts. + * + * EPHEMERAL slots can be made PERSISTENT by calling ReplicationSlotPersist(). + */ +typedef enum ReplicationSlotPersistency +{ + RS_PERSISTENT, + RS_EPHEMERAL +} ReplicationSlotPersistency; + +/* + * On-Disk data of a replication slot, preserved across restarts. + */ typedef struct ReplicationSlotPersistentData { /* The slot's identifier */ @@ -24,6 +42,11 @@ typedef struct ReplicationSlotPersistentData /* database the slot is active on */ Oid database; + /* + * The slot's behaviour when being dropped (or restored after a crash). + */ + ReplicationSlotPersistency persistency; + /* * xmin horizon for data * @@ -32,9 +55,22 @@ typedef struct ReplicationSlotPersistentData */ TransactionId xmin; + /* + * xmin horizon for catalog tuples + * + * NB: This may represent a value that hasn't been written to disk yet; + * see notes for effective_xmin, below. + */ + TransactionId catalog_xmin; + /* oldest LSN that might be required by this replication slot */ XLogRecPtr restart_lsn; + /* oldest LSN that the client has acked receipt for */ + XLogRecPtr confirmed_flush; + + /* plugin name */ + NameData plugin; } ReplicationSlotPersistentData; /* @@ -67,12 +103,26 @@ typedef struct ReplicationSlot * same as the persistent value (data.xmin). */ TransactionId effective_xmin; + TransactionId effective_catalog_xmin; /* data surviving shutdowns and crashes */ ReplicationSlotPersistentData data; /* is somebody performing io on this slot? */ LWLock *io_in_progress_lock; + + /* all the remaining data is only used for logical slots */ + + /* ---- + * When the client has confirmed flushes >= candidate_xmin_lsn we can + * advance the catalog xmin, when restart_valid has been passed, + * restart_lsn can be increased. + * ---- + */ + TransactionId candidate_catalog_xmin; + XLogRecPtr candidate_xmin_lsn; + XLogRecPtr candidate_restart_valid; + XLogRecPtr candidate_restart_lsn; } ReplicationSlot; /* @@ -97,8 +147,11 @@ extern Size ReplicationSlotsShmemSize(void); extern void ReplicationSlotsShmemInit(void); /* management of individual slots */ -extern void ReplicationSlotCreate(const char *name, bool db_specific); +extern void ReplicationSlotCreate(const char *name, bool db_specific, + ReplicationSlotPersistency p); +extern void ReplicationSlotPersist(void); extern void ReplicationSlotDrop(const char *name); + extern void ReplicationSlotAcquire(const char *name); extern void ReplicationSlotRelease(void); extern void ReplicationSlotSave(void); @@ -106,15 +159,20 @@ extern void ReplicationSlotMarkDirty(void); /* misc stuff */ extern bool ReplicationSlotValidateName(const char *name, int elevel); -extern void ReplicationSlotsComputeRequiredXmin(void); +extern void ReplicationSlotsComputeRequiredXmin(bool already_locked); extern void ReplicationSlotsComputeRequiredLSN(void); +extern XLogRecPtr ReplicationSlotsComputeLogicalRestartLSN(void); +extern bool ReplicationSlotsCountDBSlots(Oid dboid, int *nslots, int *nactive); + extern void StartupReplicationSlots(XLogRecPtr checkPointRedo); extern void CheckPointReplicationSlots(void); extern void CheckSlotRequirements(void); -extern void ReplicationSlotAtProcExit(void); /* SQL callable functions */ +extern Datum pg_create_physical_replication_slot(PG_FUNCTION_ARGS); +extern Datum pg_create_logical_replication_slot(PG_FUNCTION_ARGS); +extern Datum pg_drop_replication_slot(PG_FUNCTION_ARGS); extern Datum pg_get_replication_slots(PG_FUNCTION_ARGS); #endif /* SLOT_H */ diff --git a/src/include/replication/snapbuild.h b/src/include/replication/snapbuild.h new file mode 100644 index 0000000000..087c0e510d --- /dev/null +++ b/src/include/replication/snapbuild.h @@ -0,0 +1,83 @@ +/*------------------------------------------------------------------------- + * + * snapbuild.h + * Exports from replication/logical/snapbuild.c. + * + * Copyright (c) 2012-2014, PostgreSQL Global Development Group + * + * src/include/replication/snapbuild.h + * + *------------------------------------------------------------------------- + */ +#ifndef SNAPBUILD_H +#define SNAPBUILD_H + +#include "access/xlogdefs.h" +#include "utils/snapmgr.h" + +typedef enum +{ + /* + * Initial state, we can't do much yet. + */ + SNAPBUILD_START, + + /* + * We have collected enough information to decode tuples in transactions + * that started after this. + * + * Once we reached this we start to collect changes. We cannot apply them + * yet because the might be based on transactions that were still running + * when we reached them yet. + */ + SNAPBUILD_FULL_SNAPSHOT, + + /* + * Found a point after hitting built_full_snapshot where all transactions + * that were running at that point finished. Till we reach that we hold + * off calling any commit callbacks. + */ + SNAPBUILD_CONSISTENT +} SnapBuildState; + +/* forward declare so we don't have to expose the struct to the public */ +struct SnapBuild; +typedef struct SnapBuild SnapBuild; + +/* forward declare so we don't have to include reorderbuffer.h */ +struct ReorderBuffer; + +/* forward declare so we don't have to include heapam_xlog.h */ +struct xl_heap_new_cid; +struct xl_running_xacts; + +extern void CheckPointSnapBuild(void); + +extern SnapBuild *AllocateSnapshotBuilder(struct ReorderBuffer *cache, + TransactionId xmin_horizon, XLogRecPtr start_lsn); +extern void FreeSnapshotBuilder(SnapBuild *cache); + +extern void SnapBuildSnapDecRefcount(Snapshot snap); + +extern const char *SnapBuildExportSnapshot(SnapBuild *snapstate); +extern void SnapBuildClearExportedSnapshot(void); + +extern SnapBuildState SnapBuildCurrentState(SnapBuild *snapstate); + +extern bool SnapBuildXactNeedsSkip(SnapBuild *snapstate, XLogRecPtr ptr); + +extern void SnapBuildCommitTxn(SnapBuild *builder, XLogRecPtr lsn, + TransactionId xid, int nsubxacts, + TransactionId *subxacts); +extern void SnapBuildAbortTxn(SnapBuild *builder, XLogRecPtr lsn, + TransactionId xid, int nsubxacts, + TransactionId *subxacts); +extern bool SnapBuildProcessChange(SnapBuild *builder, TransactionId xid, + XLogRecPtr lsn); +extern void SnapBuildProcessNewCid(SnapBuild *builder, TransactionId xid, + XLogRecPtr lsn, struct xl_heap_new_cid *cid); +extern void SnapBuildProcessRunningXacts(SnapBuild *builder, XLogRecPtr lsn, + struct xl_running_xacts *running); +extern void SnapBuildSerializationPoint(SnapBuild *builder, XLogRecPtr lsn); + +#endif /* SNAPBUILD_H */ diff --git a/src/include/storage/itemptr.h b/src/include/storage/itemptr.h index 67bbdbb988..0b81d53f5f 100644 --- a/src/include/storage/itemptr.h +++ b/src/include/storage/itemptr.h @@ -116,6 +116,9 @@ typedef ItemPointerData *ItemPointer; /* * ItemPointerCopy * Copies the contents of one disk item pointer to another. + * + * Should there ever be padding in an ItemPointer this would need to be handled + * differently as it's used as hash key. */ #define ItemPointerCopy(fromPointer, toPointer) \ ( \ diff --git a/src/include/storage/proc.h b/src/include/storage/proc.h index a3cadd9a01..5218b448cd 100644 --- a/src/include/storage/proc.h +++ b/src/include/storage/proc.h @@ -41,10 +41,12 @@ struct XidCache #define PROC_IS_AUTOVACUUM 0x01 /* is it an autovac worker? */ #define PROC_IN_VACUUM 0x02 /* currently running lazy vacuum */ #define PROC_IN_ANALYZE 0x04 /* currently running analyze */ -#define PROC_VACUUM_FOR_WRAPAROUND 0x08 /* set by autovac only */ +#define PROC_VACUUM_FOR_WRAPAROUND 0x08 /* set by autovac only */ +#define PROC_IN_LOGICAL_DECODING 0x10 /* currently doing logical decoding */ /* flags reset at EOXact */ -#define PROC_VACUUM_STATE_MASK (0x0E) +#define PROC_VACUUM_STATE_MASK \ + (PROC_IN_VACUUM | PROC_IN_ANALYZE | PROC_VACUUM_FOR_WRAPAROUND) /* * We allow a small number of "weak" relation locks (AccesShareLock, diff --git a/src/include/storage/procarray.h b/src/include/storage/procarray.h index d1a58a3661..d0b4103a09 100644 --- a/src/include/storage/procarray.h +++ b/src/include/storage/procarray.h @@ -15,6 +15,7 @@ #define PROCARRAY_H #include "storage/standby.h" +#include "utils/relcache.h" #include "utils/snapshot.h" @@ -50,8 +51,9 @@ extern RunningTransactions GetRunningTransactionData(void); extern bool TransactionIdIsInProgress(TransactionId xid); extern bool TransactionIdIsActive(TransactionId xid); -extern TransactionId GetOldestXmin(bool allDbs, bool ignoreVacuum); +extern TransactionId GetOldestXmin(Relation rel, bool ignoreVacuum); extern TransactionId GetOldestActiveTransactionId(void); +extern TransactionId GetOldestSafeDecodingTransactionId(void); extern VirtualTransactionId *GetVirtualXIDsDelayingChkpt(int *nvxids); extern bool HaveVirtualXIDsDelayingChkpt(VirtualTransactionId *vxids, int nvxids); @@ -77,6 +79,10 @@ extern void XidCacheRemoveRunningXids(TransactionId xid, int nxids, const TransactionId *xids, TransactionId latestXid); -extern void ProcArraySetReplicationSlotXmin(TransactionId xmin); +extern void ProcArraySetReplicationSlotXmin(TransactionId xmin, + TransactionId catalog_xmin, bool already_locked); + +extern void ProcArrayGetReplicationSlotXmin(TransactionId *xmin, + TransactionId *catalog_xmin); #endif /* PROCARRAY_H */ diff --git a/src/include/storage/sinval.h b/src/include/storage/sinval.h index 0cae810f49..d5bb850337 100644 --- a/src/include/storage/sinval.h +++ b/src/include/storage/sinval.h @@ -147,4 +147,6 @@ extern void ProcessCommittedInvalidationMessages(SharedInvalidationMessage *msgs int nmsgs, bool RelcacheInitFileInval, Oid dbid, Oid tsid); +extern void LocalExecuteInvalidationMessage(SharedInvalidationMessage *msg); + #endif /* SINVAL_H */ diff --git a/src/include/utils/inval.h b/src/include/utils/inval.h index c1409c863d..6156e0219d 100644 --- a/src/include/utils/inval.h +++ b/src/include/utils/inval.h @@ -64,4 +64,5 @@ extern void CacheRegisterRelcacheCallback(RelcacheCallbackFunction func, extern void CallSyscacheCallbacks(int cacheid, uint32 hashvalue); +extern void InvalidateSystemCaches(void); #endif /* INVAL_H */ diff --git a/src/include/utils/snapmgr.h b/src/include/utils/snapmgr.h index c601770ec9..abe7016d04 100644 --- a/src/include/utils/snapmgr.h +++ b/src/include/utils/snapmgr.h @@ -23,12 +23,14 @@ extern bool FirstSnapshotSet; extern TransactionId TransactionXmin; extern TransactionId RecentXmin; extern TransactionId RecentGlobalXmin; +extern TransactionId RecentGlobalDataXmin; extern Snapshot GetTransactionSnapshot(void); extern Snapshot GetLatestSnapshot(void); extern void SnapshotSetCommandId(CommandId curcid); extern Snapshot GetCatalogSnapshot(Oid relid); +extern Snapshot GetNonHistoricCatalogSnapshot(Oid relid); extern void InvalidateCatalogSnapshot(void); extern void PushActiveSnapshot(Snapshot snapshot); @@ -53,4 +55,13 @@ extern bool XactHasExportedSnapshots(void); extern void DeleteAllExportedSnapshotFiles(void); extern bool ThereAreNoPriorRegisteredSnapshots(void); +extern char *ExportSnapshot(Snapshot snapshot); + +/* Support for catalog timetravel for logical decoding */ +struct HTAB; +extern struct HTAB *HistoricSnapshotGetTupleCids(void); +extern void SetupHistoricSnapshot(Snapshot snapshot_now, struct HTAB *tuplecids); +extern void TeardownHistoricSnapshot(bool is_error); +extern bool HistoricSnapshotActive(void); + #endif /* SNAPMGR_H */ diff --git a/src/include/utils/snapshot.h b/src/include/utils/snapshot.h index c542238825..4b256074b0 100644 --- a/src/include/utils/snapshot.h +++ b/src/include/utils/snapshot.h @@ -30,6 +30,22 @@ typedef struct SnapshotData *Snapshot; typedef bool (*SnapshotSatisfiesFunc) (HeapTuple htup, Snapshot snapshot, Buffer buffer); +/* + * Struct representing all kind of possible snapshots. + * + * There are several different kinds of snapshots: + * * Normal MVCC snapshots + * * MVCC snapshots taken during recovery (in Hot-Standby mode) + * * Historic MVCC snapshots used during logical decoding + * * snapshots passed to HeapTupleSatisfiesDirty() + * * snapshots used for SatisfiesAny, Toast, Self where no members are + * accessed. + * + * TODO: It's probably a good idea to split this struct using a NodeTag + * similar to how parser and executor nodes are handled, with one type for + * each different kind of snapshot to avoid overloading the meaning of + * individual fields. + */ typedef struct SnapshotData { SnapshotSatisfiesFunc satisfies; /* tuple test function */ @@ -46,11 +62,23 @@ typedef struct SnapshotData */ TransactionId xmin; /* all XID < xmin are visible to me */ TransactionId xmax; /* all XID >= xmax are invisible to me */ - TransactionId *xip; /* array of xact IDs in progress */ + /* + * For normal MVCC snapshot this contains the all xact IDs that are in + * progress, unless the snapshot was taken during recovery in which case + * it's empty. For historic MVCC snapshots, the meaning is inverted, + * i.e. it contains *committed* transactions between xmin and xmax. + */ + TransactionId *xip; uint32 xcnt; /* # of xact ids in xip[] */ /* note: all ids in xip[] satisfy xmin <= xip[i] < xmax */ int32 subxcnt; /* # of xact ids in subxip[] */ - TransactionId *subxip; /* array of subxact IDs in progress */ + /* + * For non-historic MVCC snapshots, this contains subxact IDs that are in + * progress (and other transactions that are in progress if taken during + * recovery). For historic snapshot it contains *all* xids assigned to the + * replayed transaction, including the toplevel xid. + */ + TransactionId *subxip; bool suboverflowed; /* has the subxip array overflowed? */ bool takenDuringRecovery; /* recovery-shaped snapshot? */ bool copied; /* false if it's a static snapshot */ diff --git a/src/include/utils/tqual.h b/src/include/utils/tqual.h index e34c28a4f7..48abe62983 100644 --- a/src/include/utils/tqual.h +++ b/src/include/utils/tqual.h @@ -22,6 +22,7 @@ extern PGDLLIMPORT SnapshotData SnapshotSelfData; extern PGDLLIMPORT SnapshotData SnapshotAnyData; extern PGDLLIMPORT SnapshotData SnapshotToastData; +extern PGDLLIMPORT SnapshotData CatalogSnapshotData; #define SnapshotSelf (&SnapshotSelfData) #define SnapshotAny (&SnapshotAnyData) @@ -37,7 +38,8 @@ extern PGDLLIMPORT SnapshotData SnapshotToastData; /* This macro encodes the knowledge of which snapshots are MVCC-safe */ #define IsMVCCSnapshot(snapshot) \ - ((snapshot)->satisfies == HeapTupleSatisfiesMVCC) + ((snapshot)->satisfies == HeapTupleSatisfiesMVCC || \ + (snapshot)->satisfies == HeapTupleSatisfiesHistoricMVCC) /* * HeapTupleSatisfiesVisibility @@ -73,6 +75,8 @@ extern bool HeapTupleSatisfiesToast(HeapTuple htup, Snapshot snapshot, Buffer buffer); extern bool HeapTupleSatisfiesDirty(HeapTuple htup, Snapshot snapshot, Buffer buffer); +extern bool HeapTupleSatisfiesHistoricMVCC(HeapTuple htup, + Snapshot snapshot, Buffer buffer); /* Special "satisfies" routines with different APIs */ extern HTSU_Result HeapTupleSatisfiesUpdate(HeapTuple htup, @@ -86,4 +90,13 @@ extern void HeapTupleSetHintBits(HeapTupleHeader tuple, Buffer buffer, uint16 infomask, TransactionId xid); extern bool HeapTupleHeaderIsOnlyLocked(HeapTupleHeader tuple); +/* + * To avoid leaking to much knowledge about reorderbuffer implementation + * details this is implemented in reorderbuffer.c not tqual.c. + */ +extern bool ResolveCminCmaxDuringDecoding(struct HTAB *tuplecid_data, + Snapshot snapshot, + HeapTuple htup, + Buffer buffer, + CommandId *cmin, CommandId *cmax); #endif /* TQUAL_H */ diff --git a/src/test/regress/expected/rules.out b/src/test/regress/expected/rules.out index ef50f4da21..b0b6e27d8a 100644 --- a/src/test/regress/expected/rules.out +++ b/src/test/regress/expected/rules.out @@ -1368,13 +1368,15 @@ pg_prepared_xacts| SELECT p.transaction, LEFT JOIN pg_authid u ON ((p.ownerid = u.oid))) LEFT JOIN pg_database d ON ((p.dbid = d.oid))); pg_replication_slots| SELECT l.slot_name, + l.plugin, l.slot_type, l.datoid, d.datname AS database, l.active, l.xmin, + l.catalog_xmin, l.restart_lsn - FROM (pg_get_replication_slots() l(slot_name, slot_type, datoid, active, xmin, restart_lsn) + FROM (pg_get_replication_slots() l(slot_name, plugin, slot_type, datoid, active, xmin, catalog_xmin, restart_lsn) LEFT JOIN pg_database d ON ((l.datoid = d.oid))); pg_roles| SELECT pg_authid.rolname, pg_authid.rolsuper, diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list index 3b7f61ef20..f9604541c7 100644 --- a/src/tools/pgindent/typedefs.list +++ b/src/tools/pgindent/typedefs.list @@ -940,6 +940,17 @@ LockTupleMode LockingClause LogOpts LogStmtLevel +LogicalDecodeBeginCB +LogicalDecodeChangeCB +LogicalDecodeCleanupCB +LogicalDecodeCommitCB +LogicalDecodeInitCB +LogicalDecodingCheckpointData +LogicalDecodingContext +LogicalDecodingCtlData +LogicalDecodingSlot +LogicalOutputPluginWriterPrepareWrite +LogicalOutputPluginWriterWrite LogicalTape LogicalTapeSet MAGIC @@ -1053,6 +1064,7 @@ OprInfo OprProofCacheEntry OprProofCacheKey OutputContext +OutputPluginCallbacks OverrideSearchPath OverrideStackEntry PACE_HEADER @@ -1468,6 +1480,21 @@ Relids RelocationBufferInfo RenameStmt ReopenPtr +ReorderBuffer +ReorderBufferApplyChangeCB +ReorderBufferBeginCB +ReorderBufferChange +ReorderBufferChangeTypeInternal +ReorderBufferCommitCB +ReorderBufferDiskChange +ReorderBufferIterTXNEntry +ReorderBufferIterTXNState +ReorderBufferToastEnt +ReorderBufferTupleBuf +ReorderBufferTupleCidEnt +ReorderBufferTupleCidKey +ReorderBufferTXN +ReorderBufferTXNByIdEnt ReplaceVarsFromTargetList_context ReplaceVarsNoMatchOption ResTarget @@ -1522,6 +1549,8 @@ SID_NAME_USE SISeg SMgrRelation SMgrRelationData +SnapBuildAction +SnapBuildState SOCKADDR SOCKET SPELL @@ -1613,6 +1642,8 @@ SlruSharedData Snapshot SnapshotData SnapshotSatisfiesFunc +Snapstate +SnapstateOnDisk SockAddr Sort SortBy @@ -1929,6 +1960,7 @@ XLogReaderState XLogRecData XLogRecPtr XLogRecord +XLogRecordBuffer XLogSegNo XLogSource XLogwrtResult @@ -2351,6 +2383,7 @@ symbol tablespaceinfo teReqs teSection +TestDecodingData temp_tablespaces_extra text timeKEY