]> granicus.if.org Git - postgresql/commitdiff
Don't CHECK_FOR_INTERRUPTS between WaitLatch and ResetLatch.
authorTom Lane <tgl@sss.pgh.pa.us>
Mon, 1 Aug 2016 19:13:53 +0000 (15:13 -0400)
committerTom Lane <tgl@sss.pgh.pa.us>
Mon, 1 Aug 2016 19:13:53 +0000 (15:13 -0400)
This coding pattern creates a race condition, because if an interesting
interrupt happens after we've checked InterruptPending but before we reset
our latch, the latch-setting done by the signal handler would get lost,
and then we might block at WaitLatch in the next iteration without ever
noticing the interrupt condition.  You can put the CHECK_FOR_INTERRUPTS
before WaitLatch or after ResetLatch, but not between them.

Aside from fixing the bugs, add some explanatory comments to latch.h
to perhaps forestall the next person from making the same mistake.

In HEAD, also replace gather_readnext's direct call of
HandleParallelMessages with CHECK_FOR_INTERRUPTS.  It does not seem clean
or useful for this one caller to bypass ProcessInterrupts and go straight
to HandleParallelMessages; not least because that fails to consider the
InterruptPending flag, resulting in useless work both here
(if InterruptPending isn't set) and in the next CHECK_FOR_INTERRUPTS call
(if it is).

This thinko seems to have been introduced in the initial coding of
storage/ipc/shm_mq.c (commit ec9037df2), and then blindly copied into all
the subsequent parallel-query support logic.  Back-patch relevant hunks
to 9.4 to extirpate the error everywhere.

Discussion: <1661.1469996911@sss.pgh.pa.us>

src/backend/executor/nodeGather.c
src/backend/libpq/pqmq.c
src/backend/storage/ipc/shm_mq.c
src/include/storage/latch.h
src/test/modules/test_shm_mq/setup.c
src/test/modules/test_shm_mq/test.c

index 93a566ba629dbb2510344ca5148f0c695751b22d..438d1b24fc2e4be8d1b56206a337269e23c0a8bf 100644 (file)
@@ -330,8 +330,8 @@ gather_readnext(GatherState *gatherstate)
                HeapTuple       tup;
                bool            readerdone;
 
-               /* Make sure we've read all messages from workers. */
-               HandleParallelMessages();
+               /* Check for async events, particularly messages from workers. */
+               CHECK_FOR_INTERRUPTS();
 
                /* Attempt to read a tuple, but don't block if none is available. */
                reader = gatherstate->reader[gatherstate->nextreader];
@@ -388,7 +388,6 @@ gather_readnext(GatherState *gatherstate)
 
                        /* Nothing to do except wait for developments. */
                        WaitLatch(MyLatch, WL_LATCH_SET, 0);
-                       CHECK_FOR_INTERRUPTS();
                        ResetLatch(MyLatch);
                        nvisited = 0;
                }
index 0dcdee03db53fdd8584e5b4594afac5746b46c30..921242fbc4e1dbe449e5f25ea2260442964d808e 100644 (file)
@@ -172,8 +172,8 @@ mq_putmessage(char msgtype, const char *s, size_t len)
                        break;
 
                WaitLatch(&MyProc->procLatch, WL_LATCH_SET, 0);
-               CHECK_FOR_INTERRUPTS();
                ResetLatch(&MyProc->procLatch);
+               CHECK_FOR_INTERRUPTS();
        }
 
        pq_mq_busy = false;
index 44ede336162669b4dcf97ceae1baee85c62cfb28..5b32782022b7265c7c2d3c5f840749e37ae95852 100644 (file)
@@ -896,11 +896,11 @@ shm_mq_send_bytes(shm_mq_handle *mqh, Size nbytes, const void *data,
                         */
                        WaitLatch(MyLatch, WL_LATCH_SET, 0);
 
-                       /* An interrupt may have occurred while we were waiting. */
-                       CHECK_FOR_INTERRUPTS();
-
                        /* Reset the latch so we don't spin. */
                        ResetLatch(MyLatch);
+
+                       /* An interrupt may have occurred while we were waiting. */
+                       CHECK_FOR_INTERRUPTS();
                }
                else
                {
@@ -993,11 +993,11 @@ shm_mq_receive_bytes(shm_mq *mq, Size bytes_needed, bool nowait,
                 */
                WaitLatch(MyLatch, WL_LATCH_SET, 0);
 
-               /* An interrupt may have occurred while we were waiting. */
-               CHECK_FOR_INTERRUPTS();
-
                /* Reset the latch so we don't spin. */
                ResetLatch(MyLatch);
+
+               /* An interrupt may have occurred while we were waiting. */
+               CHECK_FOR_INTERRUPTS();
        }
 }
 
@@ -1092,11 +1092,11 @@ shm_mq_wait_internal(volatile shm_mq *mq, PGPROC *volatile * ptr,
                /* Wait to be signalled. */
                WaitLatch(MyLatch, WL_LATCH_SET, 0);
 
-               /* An interrupt may have occurred while we were waiting. */
-               CHECK_FOR_INTERRUPTS();
-
                /* Reset the latch so we don't spin. */
                ResetLatch(MyLatch);
+
+               /* An interrupt may have occurred while we were waiting. */
+               CHECK_FOR_INTERRUPTS();
        }
 
        return result;
index 85d211c0e1cd1f19c700a42d5ff965a48b5d835c..5179ecc0dbda376801eacd565150781254ed1360 100644 (file)
  * do. Otherwise, if someone sets the latch between the check and the
  * ResetLatch call, you will miss it and Wait will incorrectly block.
  *
+ * Another valid coding pattern looks like:
+ *
+ * for (;;)
+ * {
+ *        if (work to do)
+ *                Do Stuff(); // in particular, exit loop if some condition satisfied
+ *        WaitLatch();
+ *        ResetLatch();
+ * }
+ *
+ * This is useful to reduce latch traffic if it's expected that the loop's
+ * termination condition will often be satisfied in the first iteration;
+ * the cost is an extra loop iteration before blocking when it is not.
+ * What must be avoided is placing any checks for asynchronous events after
+ * WaitLatch and before ResetLatch, as that creates a race condition.
+ *
  * To wake up the waiter, you must first set a global flag or something
  * else that the wait loop tests in the "if (work to do)" part, and call
  * SetLatch *after* that. SetLatch is designed to return quickly if the
index 5bd282078ccb8500a1a2b513592b3fafca5d8b12..143df4eb651b521eb039d08e410c2656af552620 100644 (file)
@@ -281,11 +281,11 @@ wait_for_workers_to_become_ready(worker_state *wstate,
                /* Wait to be signalled. */
                WaitLatch(MyLatch, WL_LATCH_SET, 0);
 
-               /* An interrupt may have occurred while we were waiting. */
-               CHECK_FOR_INTERRUPTS();
-
                /* Reset the latch so we don't spin. */
                ResetLatch(MyLatch);
+
+               /* An interrupt may have occurred while we were waiting. */
+               CHECK_FOR_INTERRUPTS();
        }
 
        if (!result)
index 6948e208996ca3273ab97f7584d4d5e2e4511d9d..dd34bc7e7f07ec19df121f520cbf36f1e5cdad61 100644 (file)
@@ -231,8 +231,8 @@ test_shm_mq_pipelined(PG_FUNCTION_ARGS)
                         * for us to do.
                         */
                        WaitLatch(MyLatch, WL_LATCH_SET, 0);
-                       CHECK_FOR_INTERRUPTS();
                        ResetLatch(MyLatch);
+                       CHECK_FOR_INTERRUPTS();
                }
        }