]> granicus.if.org Git - postgresql/commitdiff
Shorten timeouts while waiting for logicalrep worker slot attach/detach.
authorTom Lane <tgl@sss.pgh.pa.us>
Sat, 1 Jul 2017 15:59:44 +0000 (11:59 -0400)
committerTom Lane <tgl@sss.pgh.pa.us>
Sat, 1 Jul 2017 15:59:44 +0000 (11:59 -0400)
When waiting for a logical replication worker process to start or stop,
we have to busy-wait until we see it add or remove itself from the
LogicalRepWorker slot in shared memory.  Those loops were using a
one-second delay between checks, but on any reasonably modern machine, it
doesn't take more than a couple of msec for a worker to spawn or shut down.
Reduce the loop delays to 10ms to avoid wasting quite so much time in the
related regression tests.

In principle, a better solution would be to fix things so that the waiting
process can be awakened via its latch at the right time.  But that seems
considerably more invasive, which is undesirable for a post-beta fix.
Worker start/stop performance likely isn't of huge interest anyway for
production purposes, so we might not ever get around to it.

In passing, rearrange the second wait loop in logicalrep_worker_stop()
so that the lock is held at the top of the loop, thus saving one lock
acquisition/release per call, and making it look more like the other loop.

Discussion: https://postgr.es/m/30864.1498861103@sss.pgh.pa.us

src/backend/replication/logical/launcher.c

index 961110c94be38e3edc0a0d515f0234aaf44d57da..d165d518e1b35d9630683641b2f1e67f27e3b932 100644 (file)
@@ -201,11 +201,11 @@ WaitForReplicationWorkerAttach(LogicalRepWorker *worker,
 
                /*
                 * We need timeout because we generally don't get notified via latch
-                * about the worker attach.
+                * about the worker attach.  But we don't expect to have to wait long.
                 */
                rc = WaitLatch(MyLatch,
                                           WL_LATCH_SET | WL_TIMEOUT | WL_POSTMASTER_DEATH,
-                                          1000L, WAIT_EVENT_BGWORKER_STARTUP);
+                                          10L, WAIT_EVENT_BGWORKER_STARTUP);
 
                /* emergency bailout if postmaster has died */
                if (rc & WL_POSTMASTER_DEATH)
@@ -408,8 +408,8 @@ retry:
 }
 
 /*
- * Stop the logical replication worker and wait until it detaches from the
- * slot.
+ * Stop the logical replication worker for subid/relid, if any, and wait until
+ * it detaches from the slot.
  */
 void
 logicalrep_worker_stop(Oid subid, Oid relid)
@@ -435,8 +435,8 @@ logicalrep_worker_stop(Oid subid, Oid relid)
        generation = worker->generation;
 
        /*
-        * If we found worker but it does not have proc set it is starting up,
-        * wait for it to finish and then kill it.
+        * If we found a worker but it does not have proc set then it is still
+        * starting up; wait for it to finish starting and then kill it.
         */
        while (worker->in_use && !worker->proc)
        {
@@ -444,10 +444,10 @@ logicalrep_worker_stop(Oid subid, Oid relid)
 
                LWLockRelease(LogicalRepWorkerLock);
 
-               /* Wait for signal. */
+               /* Wait a bit --- we don't expect to have to wait long. */
                rc = WaitLatch(MyLatch,
                                           WL_LATCH_SET | WL_TIMEOUT | WL_POSTMASTER_DEATH,
-                                          1000L, WAIT_EVENT_BGWORKER_STARTUP);
+                                          10L, WAIT_EVENT_BGWORKER_STARTUP);
 
                /* emergency bailout if postmaster has died */
                if (rc & WL_POSTMASTER_DEATH)
@@ -459,7 +459,7 @@ logicalrep_worker_stop(Oid subid, Oid relid)
                        CHECK_FOR_INTERRUPTS();
                }
 
-               /* Check worker status. */
+               /* Recheck worker status. */
                LWLockAcquire(LogicalRepWorkerLock, LW_SHARED);
 
                /*
@@ -480,27 +480,22 @@ logicalrep_worker_stop(Oid subid, Oid relid)
 
        /* Now terminate the worker ... */
        kill(worker->proc->pid, SIGTERM);
-       LWLockRelease(LogicalRepWorkerLock);
 
        /* ... and wait for it to die. */
        for (;;)
        {
                int                     rc;
 
-               LWLockAcquire(LogicalRepWorkerLock, LW_SHARED);
+               /* is it gone? */
                if (!worker->proc || worker->generation != generation)
-               {
-                       LWLockRelease(LogicalRepWorkerLock);
                        break;
-               }
-               LWLockRelease(LogicalRepWorkerLock);
 
-               CHECK_FOR_INTERRUPTS();
+               LWLockRelease(LogicalRepWorkerLock);
 
-               /* Wait for more work. */
+               /* Wait a bit --- we don't expect to have to wait long. */
                rc = WaitLatch(MyLatch,
                                           WL_LATCH_SET | WL_TIMEOUT | WL_POSTMASTER_DEATH,
-                                          1000L, WAIT_EVENT_BGWORKER_SHUTDOWN);
+                                          10L, WAIT_EVENT_BGWORKER_SHUTDOWN);
 
                /* emergency bailout if postmaster has died */
                if (rc & WL_POSTMASTER_DEATH)
@@ -511,7 +506,11 @@ logicalrep_worker_stop(Oid subid, Oid relid)
                        ResetLatch(MyLatch);
                        CHECK_FOR_INTERRUPTS();
                }
+
+               LWLockAcquire(LogicalRepWorkerLock, LW_SHARED);
        }
+
+       LWLockRelease(LogicalRepWorkerLock);
 }
 
 /*