granicus.if.org Git - postgresql/commitdiff
Fix WAL-logging of FSM and VM truncation.
author Heikki Linnakangas <heikki.linnakangas@iki.fi>
Wed, 19 Oct 2016 11:43:34 +0000 (14:43 +0300)
committer Heikki Linnakangas <heikki.linnakangas@iki.fi>
Wed, 19 Oct 2016 11:43:34 +0000 (14:43 +0300)
When a relation is truncated, it is important that the FSM is truncated as
well. Otherwise, after recovery, the FSM can return a page that has been
truncated away, leading to errors like:

ERROR:  could not read block 28991 in file "base/16390/572026": read only 0
of 8192 bytes

We were using MarkBufferDirtyHint() to dirty the buffer holding the last
remaining page of the FSM, but during recovery, that might in fact not
dirty the page, and the FSM update might be lost.

To fix, use the stronger MarkBufferDirty() function. MarkBufferDirty()
requires us to do WAL-logging ourselves, to protect from a torn page, if
checksumming is enabled.

Also fix an oversight in visibilitymap_truncate: it also needs to WAL-log
when checksumming is enabled.

Analysis by Pavan Deolasee.

Discussion: <CABOikdNr5vKucqyZH9s1Mh0XebLs_jRhKv6eJfNnD2wxTn=_9A@mail.gmail.com>

Backpatch to 9.3, where we got data checksums.

src/backend/access/heap/visibilitymap.c
src/backend/storage/freespace/freespace.c
src/test/recovery/t/008_fsm_truncation.pl [new file with mode: 0644]

index 3ad4a9f5870ddf68120eb1566ec00fa400d1b2be..f020737f800f8f07b2c7663d624b165581bec32d 100644 (file)
@@ -508,6 +508,9 @@ visibilitymap_truncate(Relation rel, BlockNumber nheapblocks)
 
                LockBuffer(mapBuffer, BUFFER_LOCK_EXCLUSIVE);
 
+               /* NO EREPORT(ERROR) from here till changes are logged */
+               START_CRIT_SECTION();
+
                /* Clear out the unwanted bytes. */
                MemSet(&map[truncByte + 1], 0, MAPSIZE - (truncByte + 1));
 
@@ -523,7 +526,20 @@ visibilitymap_truncate(Relation rel, BlockNumber nheapblocks)
                 */
                map[truncByte] &= (1 << truncOffset) - 1;
 
+               /*
+                * Truncation of a relation is WAL-logged at a higher-level, and we
+                * will be called at WAL replay. But if checksums are enabled, we need
+                * to still write a WAL record to protect against a torn page, if the
+                * page is flushed to disk before the truncation WAL record. We cannot
+                * use MarkBufferDirtyHint here, because that will not dirty the page
+                * during recovery.
+                */
                MarkBufferDirty(mapBuffer);
+               if (!InRecovery && RelationNeedsWAL(rel) && XLogHintBitIsNeeded())
+                       log_newpage_buffer(mapBuffer, false);
+
+               END_CRIT_SECTION();
+
                UnlockReleaseBuffer(mapBuffer);
        }
        else
index bbd90c911aaced8d85091e35382a7f697e16fb1b..4138b04839a66da21e830b820bf18418574f7f4d 100644 (file)
@@ -327,8 +327,26 @@ FreeSpaceMapTruncateRel(Relation rel, BlockNumber nblocks)
                if (!BufferIsValid(buf))
                        return;                         /* nothing to do; the FSM was already smaller */
                LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
+
+               /* NO EREPORT(ERROR) from here till changes are logged */
+               START_CRIT_SECTION();
+
                fsm_truncate_avail(BufferGetPage(buf), first_removed_slot);
-               MarkBufferDirtyHint(buf, false);
+
+               /*
+                * Truncation of a relation is WAL-logged at a higher-level, and we
+                * will be called at WAL replay. But if checksums are enabled, we need
+                * to still write a WAL record to protect against a torn page, if the
+                * page is flushed to disk before the truncation WAL record. We cannot
+                * use MarkBufferDirtyHint here, because that will not dirty the page
+                * during recovery.
+                */
+               MarkBufferDirty(buf);
+               if (!InRecovery && RelationNeedsWAL(rel) && XLogHintBitIsNeeded())
+                       log_newpage_buffer(buf, false);
+
+               END_CRIT_SECTION();
+
                UnlockReleaseBuffer(buf);
 
                new_nfsmblocks = fsm_logical_to_physical(first_removed_address) + 1;
diff --git a/src/test/recovery/t/008_fsm_truncation.pl b/src/test/recovery/t/008_fsm_truncation.pl
new file mode 100644 (file)
index 0000000..9f6bdb0
--- /dev/null
@@ -0,0 +1,93 @@
+# Test WAL replay of FSM changes.
+#
+# FSM changes don't normally need to be WAL-logged, except for truncation.
+# The FSM mustn't return a page that doesn't exist (anymore).
+use strict;
+use warnings;
+
+use PostgresNode;
+use TestLib;
+use Test::More tests => 1;
+
+my $node_master = get_new_node('master');
+$node_master->init(allows_streaming => 1);
+
+$node_master->append_conf('postgresql.conf', qq{
+fsync = on
+wal_level = replica
+wal_log_hints = on
+max_prepared_transactions = 5
+autovacuum = off
+});
+
+# Start the master and create a streaming standby from a base backup of it;
+# the test table is created on the master only after the standby is running.
+$node_master->start;
+
+$node_master->backup('master_backup');
+my $node_standby = get_new_node('standby');
+$node_standby->init_from_backup($node_master, 'master_backup',
+       has_streaming => 1);
+$node_standby->start;
+
+$node_master->psql('postgres', qq{
+create table testtab (a int, b char(100));
+insert into testtab select generate_series(1,1000), 'foo';
+insert into testtab select generate_series(1,1000), 'foo';
+delete from testtab where ctid > '(8,0)';
+});
+
+# Lock the table in a prepared transaction so that vacuum cannot truncate it
+$node_master->psql('postgres', qq{
+begin;
+lock table testtab in row share mode;
+prepare transaction 'p1';
+});
+
+# Vacuum, update FSM without truncation
+$node_master->psql('postgres', 'vacuum verbose testtab');
+
+# Force a checkpoint
+$node_master->psql('postgres', 'checkpoint');
+
+# Now do some more inserts/deletes and another vacuum, to ensure full-page
+# writes are done
+$node_master->psql('postgres', qq{
+insert into testtab select generate_series(1,1000), 'foo';
+delete from testtab where ctid > '(8,0)';
+vacuum verbose testtab;
+});
+
+# Ensure all buffers are now clean on the standby
+$node_standby->psql('postgres', 'checkpoint');
+
+# Release the lock and vacuum again, which should lead to truncation
+$node_master->psql('postgres', qq{
+rollback prepared 'p1';
+vacuum verbose testtab;
+});
+
+$node_master->psql('postgres', 'checkpoint');
+my $until_lsn =
+       $node_master->safe_psql('postgres', "SELECT pg_current_xlog_location();");
+
+# Wait until the standby has replayed WAL up to the master's current location
+my $caughtup_query =
+       "SELECT '$until_lsn'::pg_lsn <= pg_last_xlog_replay_location()";
+$node_standby->poll_query_until('postgres', $caughtup_query)
+       or die "Timed out while waiting for standby to catch up";
+
+# Promote the standby
+$node_standby->promote;
+$node_standby->poll_query_until('postgres',
+       "SELECT NOT pg_is_in_recovery()")
+  or die "Timed out while waiting for promotion of standby";
+$node_standby->psql('postgres', 'checkpoint');
+
+# Restart to discard in-memory copy of FSM
+$node_standby->restart;
+
+# Insert should work on the promoted standby
+is($node_standby->psql('postgres',
+   qq{insert into testtab select generate_series(1,1000), 'foo';}),
+   0, 'INSERT succeeds with truncated relation FSM');