Fix WAL-logging of FSM and VM truncation.

author Heikki Linnakangas <heikki.linnakangas@iki.fi>

Wed, 19 Oct 2016 11:43:34 +0000 (14:43 +0300)

committer Heikki Linnakangas <heikki.linnakangas@iki.fi>

Wed, 19 Oct 2016 11:43:34 +0000 (14:43 +0300)
author Heikki Linnakangas <heikki.linnakangas@iki.fi>
Wed, 19 Oct 2016 11:43:34 +0000 (14:43 +0300)
committer Heikki Linnakangas <heikki.linnakangas@iki.fi>
Wed, 19 Oct 2016 11:43:34 +0000 (14:43 +0300)
diff --git a/src/backend/access/heap/visibilitymap.c b/src/backend/access/heap/visibilitymap.c

index 3ad4a9f5870ddf68120eb1566ec00fa400d1b2be..f020737f800f8f07b2c7663d624b165581bec32d 100644 (file)
--- a/src/backend/access/heap/visibilitymap.c
+++ b/src/backend/access/heap/visibilitymap.c
@@ -508,6 +508,9 @@ visibilitymap_truncate(Relation rel, BlockNumber nheapblocks)
  
                 LockBuffer(mapBuffer, BUFFER_LOCK_EXCLUSIVE);
  
+               /* NO EREPORT(ERROR) from here till changes are logged */
+               START_CRIT_SECTION();
+
                 /* Clear out the unwanted bytes. */
                 MemSet(&map[truncByte + 1], 0, MAPSIZE - (truncByte + 1));
  
@@ -523,7 +526,20 @@ visibilitymap_truncate(Relation rel, BlockNumber nheapblocks)
                  */
                 map[truncByte] &= (1 << truncOffset) - 1;
  
+               /*
+                * Truncation of a relation is WAL-logged at a higher level, and we
+                * will be called at WAL replay. But if checksums are enabled, we still
+                * need to write a WAL record to protect against a torn page, if the
+                * page is flushed to disk before the truncation WAL record. We cannot
+                * use MarkBufferDirtyHint here, because that will not dirty the page
+                * during recovery.
+                */
                 MarkBufferDirty(mapBuffer);
+               if (!InRecovery && RelationNeedsWAL(rel) && XLogHintBitIsNeeded())
+                       log_newpage_buffer(mapBuffer, false);
+
+               END_CRIT_SECTION();
+
                 UnlockReleaseBuffer(mapBuffer);
         }
         else
diff --git a/src/backend/storage/freespace/freespace.c b/src/backend/storage/freespace/freespace.c

index bbd90c911aaced8d85091e35382a7f697e16fb1b..4138b04839a66da21e830b820bf18418574f7f4d 100644 (file)
--- a/src/backend/storage/freespace/freespace.c
+++ b/src/backend/storage/freespace/freespace.c
@@ -327,8 +327,26 @@ FreeSpaceMapTruncateRel(Relation rel, BlockNumber nblocks)
                 if (!BufferIsValid(buf))
                         return;                         /* nothing to do; the FSM was already smaller */
                 LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
+
+               /* NO EREPORT(ERROR) from here till changes are logged */
+               START_CRIT_SECTION();
+
                 fsm_truncate_avail(BufferGetPage(buf), first_removed_slot);
-               MarkBufferDirtyHint(buf, false);
+
+               /*
+                * Truncation of a relation is WAL-logged at a higher level, and we
+                * will be called at WAL replay. But if checksums are enabled, we still
+                * need to write a WAL record to protect against a torn page, if the
+                * page is flushed to disk before the truncation WAL record. We cannot
+                * use MarkBufferDirtyHint here, because that will not dirty the page
+                * during recovery.
+                */
+               MarkBufferDirty(buf);
+               if (!InRecovery && RelationNeedsWAL(rel) && XLogHintBitIsNeeded())
+                       log_newpage_buffer(buf, false);
+
+               END_CRIT_SECTION();
+
                 UnlockReleaseBuffer(buf);
  
                 new_nfsmblocks = fsm_logical_to_physical(first_removed_address) + 1;
diff --git a/src/test/recovery/t/008_fsm_truncation.pl b/src/test/recovery/t/008_fsm_truncation.pl

new file mode 100644 (file)

index 0000000..9f6bdb0
--- /dev/null
+++ b/src/test/recovery/t/008_fsm_truncation.pl
@@ -0,0 +1,93 @@
+# Test WAL replay of FSM changes.
+#
+# FSM changes don't normally need to be WAL-logged, except for truncation.
+# The FSM mustn't return a page that doesn't exist (anymore).
+use strict;
+use warnings;
+
+use PostgresNode;
+use TestLib;
+use Test::More tests => 1;
+
+my $node_master = get_new_node('master');
+$node_master->init(allows_streaming => 1);
+
+$node_master->append_conf('postgresql.conf', qq{
+fsync = on
+wal_level = replica
+wal_log_hints = on
+max_prepared_transactions = 5
+autovacuum = off
+});
+
+# Create a master node and its standby, initializing both with some data
+# at the same time.
+$node_master->start;
+
+$node_master->backup('master_backup');
+my $node_standby = get_new_node('standby');
+$node_standby->init_from_backup($node_master, 'master_backup',
+       has_streaming => 1);
+$node_standby->start;
+
+$node_master->psql('postgres', qq{
+create table testtab (a int, b char(100));
+insert into testtab select generate_series(1,1000), 'foo';
+insert into testtab select generate_series(1,1000), 'foo';
+delete from testtab where ctid > '(8,0)';
+});
+
+# Take a lock on the table to prevent the following vacuum from truncating it
+$node_master->psql('postgres', qq{
+begin;
+lock table testtab in row share mode;
+prepare transaction 'p1';
+});
+
+# Vacuum, update FSM without truncation
+$node_master->psql('postgres', 'vacuum verbose testtab');
+
+# Force a checkpoint
+$node_master->psql('postgres', 'checkpoint');
+
+# Now do some more inserts and deletes, then another vacuum, to ensure
+# full-page writes are done
+$node_master->psql('postgres', qq{
+insert into testtab select generate_series(1,1000), 'foo';
+delete from testtab where ctid > '(8,0)';
+vacuum verbose testtab;
+});
+
+# Ensure all buffers are now clean on the standby
+$node_standby->psql('postgres', 'checkpoint');
+
+# Release the lock and vacuum again, which should lead to truncation
+$node_master->psql('postgres', qq{
+rollback prepared 'p1';
+vacuum verbose testtab;
+});
+
+$node_master->psql('postgres', 'checkpoint');
+my $until_lsn =
+       $node_master->safe_psql('postgres', "SELECT pg_current_xlog_location();");
+
+# Wait long enough for standby to receive and apply all WAL
+my $caughtup_query =
+       "SELECT '$until_lsn'::pg_lsn <= pg_last_xlog_replay_location()";
+$node_standby->poll_query_until('postgres', $caughtup_query)
+       or die "Timed out while waiting for standby to catch up";
+
+# Promote the standby
+$node_standby->promote;
+$node_standby->poll_query_until('postgres',
+       "SELECT NOT pg_is_in_recovery()")
+  or die "Timed out while waiting for promotion of standby";
+$node_standby->psql('postgres', 'checkpoint');
+
+# Restart to discard in-memory copy of FSM
+$node_standby->restart;
+
+# Insert should work on standby
+is($node_standby->psql('postgres',
+   qq{insert into testtab select generate_series(1,1000), 'foo';}),
+   0, 'INSERT succeeds with truncated relation FSM');
author	Heikki Linnakangas <heikki.linnakangas@iki.fi>
	Wed, 19 Oct 2016 11:43:34 +0000 (14:43 +0300)
committer	Heikki Linnakangas <heikki.linnakangas@iki.fi>
	Wed, 19 Oct 2016 11:43:34 +0000 (14:43 +0300)
src/backend/access/heap/visibilitymap.c		patch \| blob \| history
src/backend/storage/freespace/freespace.c		patch \| blob \| history
src/test/recovery/t/008_fsm_truncation.pl	[new file with mode: 0644]	patch \| blob