From 7d68f2281a4b56834c8e5648fc7da0b73b674c45 Mon Sep 17 00:00:00 2001 From: Tom Lane Date: Sat, 22 Apr 2017 18:18:25 -0400 Subject: [PATCH] Make PostgresNode.pm check server status more carefully. PostgresNode blithely ignored the exit status of pg_ctl, and in general made no effort to be sure that the server was running when it should be. This caused it to miss server crashes, which is a serious shortcoming in a test scaffold. Make it complain if pg_ctl fails, and modify the start and stop logic to complain if the server doesn't start, or doesn't stop, when expected. Also, have it turn off the "restart_after_crash" configuration parameter in created clusters, as bitter experience has shown that leaving that on can mask crashes too. We might at some point need variant functions that allow for, eg, server start failure to be expected. But no existing test case appears to want that, and it surely shouldn't be the default behavior. Note that this *will* break the buildfarm, as it will expose known bugs that the previous testing failed to. I'm committing it despite that, to verify that we get the expected failures in the buildfarm not just in manual testing. Back-patch into 9.6 where PostgresNode was introduced. (The 9.6 branch is not expected to show any failures.) Discussion: https://postgr.es/m/21432.1492886428@sss.pgh.pa.us --- src/test/perl/PostgresNode.pm | 41 +++++++++++++++++++++-------------- 1 file changed, 25 insertions(+), 16 deletions(-) diff --git a/src/test/perl/PostgresNode.pm b/src/test/perl/PostgresNode.pm index 6ee412ed1d..e42eb88896 100644 --- a/src/test/perl/PostgresNode.pm +++ b/src/test/perl/PostgresNode.pm @@ -402,6 +402,7 @@ sub init open my $conf, '>>', "$pgdata/postgresql.conf"; print $conf "\n# Added by PostgresNode.pm\n"; print $conf "fsync = off\n"; + print $conf "restart_after_crash = off\n"; print $conf "log_line_prefix = '%m [%p] %q%a '\n"; print $conf "log_statement = all\n"; print $conf "port = $port\n"; @@ -644,18 +645,19 @@ sub start my $port = $self->port; my $pgdata = $self->data_dir; my $name = $self->name; + BAIL_OUT("node \"$name\" is already running") if defined $self->{_pid}; print("### Starting node \"$name\"\n"); my $ret = TestLib::system_log('pg_ctl', '-D', $self->data_dir, '-l', $self->logfile, 'start'); if ($ret != 0) { - print "# pg_ctl failed; logfile:\n"; + print "# pg_ctl start failed; logfile:\n"; print TestLib::slurp_file($self->logfile); - BAIL_OUT("pg_ctl failed"); + BAIL_OUT("pg_ctl start failed"); } - $self->_update_pid; + $self->_update_pid(1); } =pod @@ -664,6 +666,10 @@ sub start Stop the node using pg_ctl -m $mode and wait for it to stop. +Note: if the node is already known stopped, this does nothing. +However, if we think it's running and it's not, it's important for +this to fail. Otherwise, tests might fail to detect server crashes. + =cut sub stop @@ -675,9 +681,8 @@ sub stop $mode = 'fast' unless defined $mode; return unless defined $self->{_pid}; print "### Stopping node \"$name\" using mode $mode\n"; - TestLib::system_log('pg_ctl', '-D', $pgdata, '-m', $mode, 'stop'); - $self->{_pid} = undef; - $self->_update_pid; + TestLib::system_or_bail('pg_ctl', '-D', $pgdata, '-m', $mode, 'stop'); + $self->_update_pid(0); } =pod @@ -695,7 +700,7 @@ sub reload my $pgdata = $self->data_dir; my $name = $self->name; print "### Reloading node \"$name\"\n"; - TestLib::system_log('pg_ctl', '-D', $pgdata, 'reload'); + TestLib::system_or_bail('pg_ctl', '-D', $pgdata, 'reload'); } =pod @@ -714,9 +719,9 @@ sub restart my $logfile = $self->logfile; my $name = $self->name; print "### Restarting node \"$name\"\n"; - TestLib::system_log('pg_ctl', '-D', $pgdata, '-l', $logfile, - 'restart'); - $self->_update_pid; + TestLib::system_or_bail('pg_ctl', '-D', $pgdata, '-l', $logfile, + 'restart'); + $self->_update_pid(1); } =pod @@ -735,8 +740,8 @@ sub promote my $logfile = $self->logfile; my $name = $self->name; print "### Promoting node \"$name\"\n"; - TestLib::system_log('pg_ctl', '-D', $pgdata, '-l', $logfile, - 'promote'); + TestLib::system_or_bail('pg_ctl', '-D', $pgdata, '-l', $logfile, + 'promote'); } # Internal routine to enable streaming replication on a standby node. @@ -814,22 +819,26 @@ archive_command = '$copy_command' # Internal method sub _update_pid { - my $self = shift; + my ($self, $is_running) = @_; my $name = $self->name; # If we can open the PID file, read its first line and that's the PID we - # want. If the file cannot be opened, presumably the server is not - # running; don't be noisy in that case. + # want. if (open my $pidfile, '<', $self->data_dir . "/postmaster.pid") { chomp($self->{_pid} = <$pidfile>); print "# Postmaster PID for node \"$name\" is $self->{_pid}\n"; close $pidfile; + + # If we found a pidfile when there shouldn't be one, complain. + BAIL_OUT("postmaster.pid unexpectedly present") unless $is_running; return; } $self->{_pid} = undef; - print "# No postmaster PID\n"; + print "# No postmaster PID for node \"$name\"\n"; + # Complain if we expected to find a pidfile. + BAIL_OUT("postmaster.pid unexpectedly not present") if $is_running; } =pod -- 2.40.0