From 5c995139752f5d57c560a0708286b99f9ca2bddd Mon Sep 17 00:00:00 2001 From: Michael Paquier Date: Fri, 30 Nov 2018 10:34:45 +0900 Subject: [PATCH] Fix various checksum check problems for pg_verify_checksums and base backups Three issues are fixed in this patch: - Base backups forgot to ignore files specific to EXEC_BACKEND, leading to spurious warnings when checksums are enabled, per analysis from me. - pg_verify_checksums forgot about files specific to EXEC_BACKEND, leading to failures of the tool on any such build, particularly Windows. This error was originally found by newly-introduced TAP tests in various buildfarm members using EXEC_BACKEND. - pg_verify_checksums forgot to account for temporary files and temporary paths, which could be valid relation files, without checksums, per report from Andres Freund. More tests are added to cover this case. A new test case which emulates corruption for a file in a different tablespace is added, coming from Michael Banck, while I have coded the main code and refactored the test code. Author: Michael Banck, Michael Paquier Reviewed-by: Stephen Frost, David Steele Discussion: https://postgr.es/m/20181021134206.GA14282@paquier.xyz --- src/backend/replication/basebackup.c | 7 + .../pg_verify_checksums/pg_verify_checksums.c | 33 ++++- src/bin/pg_verify_checksums/t/002_actions.pl | 127 ++++++++++++------ 3 files changed, 121 insertions(+), 46 deletions(-) diff --git a/src/backend/replication/basebackup.c b/src/backend/replication/basebackup.c index a7e3db2783..78ed6cf797 100644 --- a/src/backend/replication/basebackup.c +++ b/src/backend/replication/basebackup.c @@ -189,12 +189,19 @@ static const char *excludeFiles[] = /* * List of files excluded from checksum validation. + * + * Note: this list should be kept in sync with what pg_verify_checksums.c + * includes. 
*/ static const char *const noChecksumFiles[] = { "pg_control", "pg_filenode.map", "pg_internal.init", "PG_VERSION", +#ifdef EXEC_BACKEND + "config_exec_params", + "config_exec_params.new", +#endif NULL, }; diff --git a/src/bin/pg_verify_checksums/pg_verify_checksums.c b/src/bin/pg_verify_checksums/pg_verify_checksums.c index 1bc020ab6c..6444fc9ca4 100644 --- a/src/bin/pg_verify_checksums/pg_verify_checksums.c +++ b/src/bin/pg_verify_checksums/pg_verify_checksums.c @@ -20,6 +20,7 @@ #include "storage/bufpage.h" #include "storage/checksum.h" #include "storage/checksum_impl.h" +#include "storage/fd.h" static int64 files = 0; @@ -49,11 +50,20 @@ usage(void) printf(_("Report bugs to .\n")); } +/* + * List of files excluded from checksum validation. + * + * Note: this list should be kept in sync with what basebackup.c includes. + */ static const char *const skip[] = { "pg_control", "pg_filenode.map", "pg_internal.init", "PG_VERSION", +#ifdef EXEC_BACKEND + "config_exec_params", + "config_exec_params.new", +#endif NULL, }; @@ -62,13 +72,10 @@ skipfile(const char *fn) { const char *const *f; - if (strcmp(fn, ".") == 0 || - strcmp(fn, "..") == 0) - return true; - for (f = skip; *f; f++) if (strcmp(*f, fn) == 0) return true; + return false; } @@ -146,9 +153,22 @@ scan_directory(const char *basedir, const char *subdir) char fn[MAXPGPATH]; struct stat st; - if (skipfile(de->d_name)) + if (strcmp(de->d_name, ".") == 0 || + strcmp(de->d_name, "..") == 0) continue; + /* Skip temporary files */ + if (strncmp(de->d_name, + PG_TEMP_FILE_PREFIX, + strlen(PG_TEMP_FILE_PREFIX)) == 0) + continue; + + /* Skip temporary folders */ + if (strncmp(de->d_name, + PG_TEMP_FILES_DIR, + strlen(PG_TEMP_FILES_DIR)) == 0) + return; + snprintf(fn, sizeof(fn), "%s/%s", path, de->d_name); if (lstat(fn, &st) < 0) { @@ -163,6 +183,9 @@ scan_directory(const char *basedir, const char *subdir) *segmentpath; BlockNumber segmentno = 0; + if (skipfile(de->d_name)) + continue; + /* * Cut off at the segment 
boundary (".") to get the segment number * in order to mix it into the checksum. Then also cut off at the diff --git a/src/bin/pg_verify_checksums/t/002_actions.pl b/src/bin/pg_verify_checksums/t/002_actions.pl index c640cce260..12cca604e6 100644 --- a/src/bin/pg_verify_checksums/t/002_actions.pl +++ b/src/bin/pg_verify_checksums/t/002_actions.pl @@ -5,7 +5,74 @@ use strict; use warnings; use PostgresNode; use TestLib; -use Test::More tests => 36; +use Test::More tests => 45; + + +# Utility routine to create and check a table with corrupted checksums +# on a wanted tablespace. Note that this stops and starts the node +# multiple times to perform the checks, leaving the node started +# at the end. +sub check_relation_corruption +{ + my $node = shift; + my $table = shift; + my $tablespace = shift; + my $pgdata = $node->data_dir; + + $node->safe_psql('postgres', + "SELECT a INTO $table FROM generate_series(1,10000) AS a; + ALTER TABLE $table SET (autovacuum_enabled=false);"); + + $node->safe_psql('postgres', + "ALTER TABLE ".$table." SET TABLESPACE ".$tablespace.";"); + + my $file_corrupted = $node->safe_psql('postgres', + "SELECT pg_relation_filepath('$table');"); + my $relfilenode_corrupted = $node->safe_psql('postgres', + "SELECT relfilenode FROM pg_class WHERE relname = '$table';"); + + # Set page header and block size + my $pageheader_size = 24; + my $block_size = $node->safe_psql('postgres', 'SHOW block_size;'); + $node->stop; + + # Checksums are correct for single relfilenode as the table is not + # corrupted yet. 
+ command_ok(['pg_verify_checksums', '-D', $pgdata, + '-r', $relfilenode_corrupted], + "succeeds for single relfilenode on tablespace $tablespace with offline cluster"); + + # Time to create some corruption + open my $file, '+<', "$pgdata/$file_corrupted"; + seek($file, $pageheader_size, 0); + syswrite($file, '\0\0\0\0\0\0\0\0\0'); + close $file; + + # Checksum checks on single relfilenode fail + $node->command_checks_all([ 'pg_verify_checksums', '-D', $pgdata, '-r', + $relfilenode_corrupted], + 1, + [qr/Bad checksums:.*1/], + [qr/checksum verification failed/], + "fails with corrupted data for single relfilenode on tablespace $tablespace"); + + # Global checksum checks fail as well + $node->command_checks_all([ 'pg_verify_checksums', '-D', $pgdata], + 1, + [qr/Bad checksums:.*1/], + [qr/checksum verification failed/], + "fails with corrupted data on tablespace $tablespace"); + + # Drop corrupted table again and make sure there is no more corruption. + $node->start; + $node->safe_psql('postgres', "DROP TABLE $table;"); + $node->stop; + $node->command_ok(['pg_verify_checksums', '-D', $pgdata], + "succeeds again after table drop on tablespace $tablespace"); + + $node->start; + return; +} # Initialize node with checksums enabled. my $node = get_new_node('node_checksum'); @@ -27,6 +94,12 @@ append_to_file "$pgdata/global/99999_init.123", ""; append_to_file "$pgdata/global/99999_fsm.123", ""; append_to_file "$pgdata/global/99999_vm.123", ""; +# These are temporary files and folders with dummy contents, which +# should be ignored by the scan. 
+append_to_file "$pgdata/global/pgsql_tmp_123", "foo"; +mkdir "$pgdata/global/pgsql_tmp"; +append_to_file "$pgdata/global/pgsql_tmp/1.1", "foo"; + # Checksums pass on a newly-created cluster command_ok(['pg_verify_checksums', '-D', $pgdata], "succeeds with offline cluster"); @@ -36,47 +109,16 @@ $node->start; command_fails(['pg_verify_checksums', '-D', $pgdata], "fails with online cluster"); -# Create table to corrupt and get its relfilenode -$node->safe_psql('postgres', - "SELECT a INTO corrupt1 FROM generate_series(1,10000) AS a; - ALTER TABLE corrupt1 SET (autovacuum_enabled=false);"); - -my $file_corrupted = $node->safe_psql('postgres', - "SELECT pg_relation_filepath('corrupt1')"); -my $relfilenode_corrupted = $node->safe_psql('postgres', - "SELECT relfilenode FROM pg_class WHERE relname = 'corrupt1';"); - -# Set page header and block size -my $pageheader_size = 24; -my $block_size = $node->safe_psql('postgres', 'SHOW block_size;'); -$node->stop; - -# Checksums are correct for single relfilenode as the table is not -# corrupted yet. -command_ok(['pg_verify_checksums', '-D', $pgdata, - '-r', $relfilenode_corrupted], - "succeeds for single relfilenode with offline cluster"); - -# Time to create some corruption -open my $file, '+<', "$pgdata/$file_corrupted"; -seek($file, $pageheader_size, 0); -syswrite($file, '\0\0\0\0\0\0\0\0\0'); -close $file; +# Check corruption of table on default tablespace. 
+check_relation_corruption($node, 'corrupt1', 'pg_default'); -# Global checksum checks fail -$node->command_checks_all([ 'pg_verify_checksums', '-D', $pgdata], - 1, - [qr/Bad checksums:.*1/], - [qr/checksum verification failed/], - 'fails with corrupted data'); - -# Checksum checks on single relfilenode fail -$node->command_checks_all([ 'pg_verify_checksums', '-D', $pgdata, '-r', - $relfilenode_corrupted], - 1, - [qr/Bad checksums:.*1/], - [qr/checksum verification failed/], - 'fails for corrupted data on single relfilenode'); +# Create tablespace to check corruptions in a non-default tablespace. +my $basedir = $node->basedir; +my $tablespace_dir = "$basedir/ts_corrupt_dir"; +mkdir ($tablespace_dir); +$node->safe_psql('postgres', + "CREATE TABLESPACE ts_corrupt LOCATION '$tablespace_dir';"); +check_relation_corruption($node, 'corrupt2', 'ts_corrupt'); # Utility routine to check that pg_verify_checksums is able to detect # correctly-named relation files filled with some corrupted data. @@ -101,6 +143,9 @@ sub fail_corrupt return; } +# Stop instance for the follow-up checks. +$node->stop; + # Authorized relation files filled with corrupted data cause the # checksum checks to fail. Make sure to use file names different # than the previous ones. -- 2.40.0