From c61559ec3a41ad72a13c18d95b1eee8251de94c6 Mon Sep 17 00:00:00 2001 From: Tom Lane Date: Mon, 26 Jun 2017 15:13:23 -0400 Subject: [PATCH] Reduce pg_ctl's reaction time when waiting for postmaster start/stop. pg_ctl has traditionally waited one second between probes for whether the start or stop request has completed. That behavior was embodied in the original shell script written in 1999 (commit 5b912b089) and I doubt anyone's questioned it since. Nowadays, machines are a lot faster, and the shell script is long since replaced by C code, so it's fair to reconsider how long we ought to wait. This patch adjusts the coding so that the wait time can be any even divisor of 1 second, and sets the actual probe rate to 10 per second. That's based on experimentation with the src/test/recovery TAP tests, which include a lot of postmaster starts and stops. This patch alone reduces the (non-parallelized) runtime of those tests from ~4m30s to ~3m5s on my machine. Increasing the probe rate further doesn't help much, so this seems like a good number. In the real world this probably won't have much impact, since people don't start/stop production postmasters often, and the shutdown checkpoint usually takes nontrivial time too. But it makes development work and testing noticeably snappier, and that's good enough reason for me. Also, by reducing the dead time in postmaster restart sequences, this change has made it easier to reproduce some bugs that have been lurking for awhile. Patches for those will follow. Discussion: https://postgr.es/m/18444.1498428798@sss.pgh.pa.us --- src/bin/pg_ctl/pg_ctl.c | 63 ++++++++++++++++++++++++----------------- 1 file changed, 37 insertions(+), 26 deletions(-) diff --git a/src/bin/pg_ctl/pg_ctl.c b/src/bin/pg_ctl/pg_ctl.c index 82ac62d5f4..ad2a16f755 100644 --- a/src/bin/pg_ctl/pg_ctl.c +++ b/src/bin/pg_ctl/pg_ctl.c @@ -68,6 +68,10 @@ typedef enum #define DEFAULT_WAIT 60 +#define USEC_PER_SEC 1000000 + +#define WAITS_PER_SEC 10 /* should divide USEC_PER_SEC evenly */ + static bool do_wait = true; static int wait_seconds = DEFAULT_WAIT; static bool wait_seconds_arg = false; @@ -531,7 +535,7 @@ test_postmaster_connection(pgpid_t pm_pid, bool do_checkpoint) connstr[0] = '\0'; - for (i = 0; i < wait_seconds; i++) + for (i = 0; i < wait_seconds * WAITS_PER_SEC; i++) { /* Do we need a connection string? */ if (connstr[0] == '\0') @@ -701,24 +705,28 @@ test_postmaster_connection(pgpid_t pm_pid, bool do_checkpoint) #endif /* No response, or startup still in process; wait */ -#ifdef WIN32 - if (do_checkpoint) + if (i % WAITS_PER_SEC == 0) { - /* - * Increment the wait hint by 6 secs (connection timeout + sleep) - * We must do this to indicate to the SCM that our startup time is - * changing, otherwise it'll usually send a stop signal after 20 - * seconds, despite incrementing the checkpoint counter. - */ - status.dwWaitHint += 6000; - status.dwCheckPoint++; - SetServiceStatus(hStatus, (LPSERVICE_STATUS) &status); - } - else +#ifdef WIN32 + if (do_checkpoint) + { + /* + * Increment the wait hint by 6 secs (connection timeout + + * sleep). We must do this to indicate to the SCM that our + * startup time is changing, otherwise it'll usually send a + * stop signal after 20 seconds, despite incrementing the + * checkpoint counter. + */ + status.dwWaitHint += 6000; + status.dwCheckPoint++; + SetServiceStatus(hStatus, (LPSERVICE_STATUS) &status); + } + else #endif - print_msg("."); + print_msg("."); + } - pg_usleep(1000000); /* 1 sec */ + pg_usleep(USEC_PER_SEC / WAITS_PER_SEC); } /* return result of last call to PQping */ @@ -998,12 +1006,13 @@ do_stop(void) print_msg(_("waiting for server to shut down...")); - for (cnt = 0; cnt < wait_seconds; cnt++) + for (cnt = 0; cnt < wait_seconds * WAITS_PER_SEC; cnt++) { if ((pid = get_pgpid(false)) != 0) { - print_msg("."); - pg_usleep(1000000); /* 1 sec */ + if (cnt % WAITS_PER_SEC == 0) + print_msg("."); + pg_usleep(USEC_PER_SEC / WAITS_PER_SEC); } else break; @@ -1088,12 +1097,13 @@ do_restart(void) /* always wait for restart */ - for (cnt = 0; cnt < wait_seconds; cnt++) + for (cnt = 0; cnt < wait_seconds * WAITS_PER_SEC; cnt++) { if ((pid = get_pgpid(false)) != 0) { - print_msg("."); - pg_usleep(1000000); /* 1 sec */ + if (cnt % WAITS_PER_SEC == 0) + print_msg("."); + pg_usleep(USEC_PER_SEC / WAITS_PER_SEC); } else break; @@ -1225,17 +1235,18 @@ do_promote(void) if (do_wait) { DBState state = DB_STARTUP; + int cnt; print_msg(_("waiting for server to promote...")); - while (wait_seconds > 0) + for (cnt = 0; cnt < wait_seconds * WAITS_PER_SEC; cnt++) { state = get_control_dbstate(); if (state == DB_IN_PRODUCTION) break; - print_msg("."); - pg_usleep(1000000); /* 1 sec */ - wait_seconds--; + if (cnt % WAITS_PER_SEC == 0) + print_msg("."); + pg_usleep(USEC_PER_SEC / WAITS_PER_SEC); } if (state == DB_IN_PRODUCTION) { -- 2.40.0