]> granicus.if.org Git - postgresql/commitdiff
Reduce pg_ctl's reaction time when waiting for postmaster start/stop.
authorTom Lane <tgl@sss.pgh.pa.us>
Mon, 26 Jun 2017 19:13:23 +0000 (15:13 -0400)
committerTom Lane <tgl@sss.pgh.pa.us>
Mon, 26 Jun 2017 19:13:23 +0000 (15:13 -0400)
pg_ctl has traditionally waited one second between probes for whether
the start or stop request has completed.  That behavior was embodied
in the original shell script written in 1999 (commit 5b912b089) and
I doubt anyone's questioned it since.  Nowadays, machines are a lot
faster, and the shell script is long since replaced by C code, so it's
fair to reconsider how long we ought to wait.

This patch adjusts the coding so that the wait time can be any even
divisor of 1 second, and sets the actual probe rate to 10 per second.
That's based on experimentation with the src/test/recovery TAP tests,
which include a lot of postmaster starts and stops.  This patch alone
reduces the (non-parallelized) runtime of those tests from ~4m30s to
~3m5s on my machine.  Increasing the probe rate further doesn't help
much, so this seems like a good number.

In the real world this probably won't have much impact, since people
don't start/stop production postmasters often, and the shutdown checkpoint
usually takes nontrivial time too.  But it makes development work and
testing noticeably snappier, and that's good enough reason for me.

Also, by reducing the dead time in postmaster restart sequences, this
change has made it easier to reproduce some bugs that have been lurking
for awhile.  Patches for those will follow.

Discussion: https://postgr.es/m/18444.1498428798@sss.pgh.pa.us

src/bin/pg_ctl/pg_ctl.c

index 82ac62d5f482ae4c60fd57811629baad84d30925..ad2a16f7558b0f74054643d2906d8f56bd867d4c 100644 (file)
@@ -68,6 +68,10 @@ typedef enum
 
 #define DEFAULT_WAIT   60
 
+#define USEC_PER_SEC   1000000
+
+#define WAITS_PER_SEC  10              /* should divide USEC_PER_SEC evenly */
+
 static bool do_wait = true;
 static int     wait_seconds = DEFAULT_WAIT;
 static bool wait_seconds_arg = false;
@@ -531,7 +535,7 @@ test_postmaster_connection(pgpid_t pm_pid, bool do_checkpoint)
 
        connstr[0] = '\0';
 
-       for (i = 0; i < wait_seconds; i++)
+       for (i = 0; i < wait_seconds * WAITS_PER_SEC; i++)
        {
                /* Do we need a connection string? */
                if (connstr[0] == '\0')
@@ -701,24 +705,28 @@ test_postmaster_connection(pgpid_t pm_pid, bool do_checkpoint)
 #endif
 
                /* No response, or startup still in process; wait */
-#ifdef WIN32
-               if (do_checkpoint)
+               if (i % WAITS_PER_SEC == 0)
                {
-                       /*
-                        * Increment the wait hint by 6 secs (connection timeout + sleep)
-                        * We must do this to indicate to the SCM that our startup time is
-                        * changing, otherwise it'll usually send a stop signal after 20
-                        * seconds, despite incrementing the checkpoint counter.
-                        */
-                       status.dwWaitHint += 6000;
-                       status.dwCheckPoint++;
-                       SetServiceStatus(hStatus, (LPSERVICE_STATUS) &status);
-               }
-               else
+#ifdef WIN32
+                       if (do_checkpoint)
+                       {
+                               /*
+                                * Increment the wait hint by 6 secs (connection timeout +
+                                * sleep).  We must do this to indicate to the SCM that our
+                                * startup time is changing, otherwise it'll usually send a
+                                * stop signal after 20 seconds, despite incrementing the
+                                * checkpoint counter.
+                                */
+                               status.dwWaitHint += 6000;
+                               status.dwCheckPoint++;
+                               SetServiceStatus(hStatus, (LPSERVICE_STATUS) &status);
+                       }
+                       else
 #endif
-                       print_msg(".");
+                               print_msg(".");
+               }
 
-               pg_usleep(1000000);             /* 1 sec */
+               pg_usleep(USEC_PER_SEC / WAITS_PER_SEC);
        }
 
        /* return result of last call to PQping */
@@ -998,12 +1006,13 @@ do_stop(void)
 
                print_msg(_("waiting for server to shut down..."));
 
-               for (cnt = 0; cnt < wait_seconds; cnt++)
+               for (cnt = 0; cnt < wait_seconds * WAITS_PER_SEC; cnt++)
                {
                        if ((pid = get_pgpid(false)) != 0)
                        {
-                               print_msg(".");
-                               pg_usleep(1000000); /* 1 sec */
+                               if (cnt % WAITS_PER_SEC == 0)
+                                       print_msg(".");
+                               pg_usleep(USEC_PER_SEC / WAITS_PER_SEC);
                        }
                        else
                                break;
@@ -1088,12 +1097,13 @@ do_restart(void)
 
                /* always wait for restart */
 
-               for (cnt = 0; cnt < wait_seconds; cnt++)
+               for (cnt = 0; cnt < wait_seconds * WAITS_PER_SEC; cnt++)
                {
                        if ((pid = get_pgpid(false)) != 0)
                        {
-                               print_msg(".");
-                               pg_usleep(1000000); /* 1 sec */
+                               if (cnt % WAITS_PER_SEC == 0)
+                                       print_msg(".");
+                               pg_usleep(USEC_PER_SEC / WAITS_PER_SEC);
                        }
                        else
                                break;
@@ -1225,17 +1235,18 @@ do_promote(void)
        if (do_wait)
        {
                DBState         state = DB_STARTUP;
+               int                     cnt;
 
                print_msg(_("waiting for server to promote..."));
-               while (wait_seconds > 0)
+               for (cnt = 0; cnt < wait_seconds * WAITS_PER_SEC; cnt++)
                {
                        state = get_control_dbstate();
                        if (state == DB_IN_PRODUCTION)
                                break;
 
-                       print_msg(".");
-                       pg_usleep(1000000); /* 1 sec */
-                       wait_seconds--;
+                       if (cnt % WAITS_PER_SEC == 0)
+                               print_msg(".");
+                       pg_usleep(USEC_PER_SEC / WAITS_PER_SEC);
                }
                if (state == DB_IN_PRODUCTION)
                {