WaitForParallelWorkersToFinish(pcxt);
WaitForParallelWorkersToExit(pcxt);
pcxt->nworkers_launched = 0;
- if (pcxt->any_message_received)
+ if (pcxt->known_attached_workers)
{
- pfree(pcxt->any_message_received);
- pcxt->any_message_received = NULL;
+ pfree(pcxt->known_attached_workers);
+ pcxt->known_attached_workers = NULL;
+ pcxt->nknown_attached_workers = 0;
}
}
/*
* Now that nworkers_launched has taken its final value, we can initialize
- * any_message_received.
+ * known_attached_workers.
*/
if (pcxt->nworkers_launched > 0)
- pcxt->any_message_received =
+ {
+ pcxt->known_attached_workers =
palloc0(sizeof(bool) * pcxt->nworkers_launched);
+ pcxt->nknown_attached_workers = 0;
+ }
/* Restore previous memory context. */
MemoryContextSwitchTo(oldcontext);
}
+/*
+ * Wait for all workers to attach to their error queues, and throw an error if
+ * any worker fails to do this.
+ *
+ * Callers can assume that if this function returns successfully, then the
+ * number of workers given by pcxt->nworkers_launched have initialized and
+ * attached to their error queues. Whether or not these workers are guaranteed
+ * to still be running depends on what code the caller asked them to run;
+ * this function does not guarantee that they have not exited. However, it
+ * does guarantee that any workers which exited must have done so cleanly and
+ * after successfully performing the work with which they were tasked.
+ *
+ * If this function is not called, then some of the workers that were launched
+ * may not have been started due to a fork() failure, or may have exited during
+ * early startup prior to attaching to the error queue, so nworkers_launched
+ * cannot be viewed as completely reliable. It will never be less than the
+ * number of workers which actually started, but it might be more. Any workers
+ * that failed to start will still be discovered by
+ * WaitForParallelWorkersToFinish and an error will be thrown at that time,
+ * provided that function is eventually reached.
+ *
+ * In general, the leader process should do as much work as possible before
+ * calling this function. fork() failures and other early-startup failures
+ * are very uncommon, and having the leader sit idle when it could be doing
+ * useful work is undesirable. However, if the leader needs to wait for
+ * all of its workers or for a specific worker, it may want to call this
+ * function before doing so. If not, it must make some other provision for
+ * the failure-to-start case, lest it wait forever. On the other hand, a
+ * leader which never waits for a worker that might not be started yet, or
+ * at least never does so prior to WaitForParallelWorkersToFinish(), need not
+ * call this function at all.
+ */
+void
+WaitForParallelWorkersToAttach(ParallelContext *pcxt)
+{
+ int i;
+
+ /* Skip this if we have no launched workers. */
+ if (pcxt->nworkers_launched == 0)
+ return;
+
+ for (;;)
+ {
+ /*
+ * This will process any parallel messages that are pending and it may
+ * also throw an error propagated from a worker.
+ */
+ CHECK_FOR_INTERRUPTS();
+
+ for (i = 0; i < pcxt->nworkers_launched; ++i)
+ {
+ BgwHandleStatus status;
+ shm_mq *mq;
+ int rc;
+ pid_t pid;
+
+ if (pcxt->known_attached_workers[i])
+ continue;
+
+ /*
+ * If error_mqh is NULL, then the worker has already exited
+ * cleanly.
+ */
+ if (pcxt->worker[i].error_mqh == NULL)
+ {
+ pcxt->known_attached_workers[i] = true;
+ ++pcxt->nknown_attached_workers;
+ continue;
+ }
+
+ status = GetBackgroundWorkerPid(pcxt->worker[i].bgwhandle, &pid);
+ if (status == BGWH_STARTED)
+ {
+ /* Has the worker attached to the error queue? */
+ mq = shm_mq_get_queue(pcxt->worker[i].error_mqh);
+ if (shm_mq_get_sender(mq) != NULL)
+ {
+ /* Yes, so it is known to be attached. */
+ pcxt->known_attached_workers[i] = true;
+ ++pcxt->nknown_attached_workers;
+ }
+ }
+ else if (status == BGWH_STOPPED)
+ {
+ /*
+ * If the worker stopped without attaching to the error queue,
+ * throw an error.
+ */
+ mq = shm_mq_get_queue(pcxt->worker[i].error_mqh);
+ if (shm_mq_get_sender(mq) == NULL)
+ ereport(ERROR,
+ (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+ errmsg("parallel worker failed to initialize"),
+ errhint("More details may be available in the server log.")));
+
+ pcxt->known_attached_workers[i] = true;
+ ++pcxt->nknown_attached_workers;
+ }
+ else
+ {
+ /*
+ * Worker not yet started, so we must wait. The postmaster
+ * will notify us if the worker's state changes. Our latch
+ * might also get set for some other reason, but if so we'll
+ * just end up waiting for the same worker again.
+ */
+ rc = WaitLatch(MyLatch,
+ WL_LATCH_SET | WL_POSTMASTER_DEATH,
+ -1, WAIT_EVENT_BGWORKER_STARTUP);
+
+ /* emergency bailout if postmaster has died */
+ if (rc & WL_POSTMASTER_DEATH)
+ proc_exit(1);
+
+ if (rc & WL_LATCH_SET)
+ ResetLatch(MyLatch);
+ }
+ }
+
+ /* If all workers are known to have started, we're done. */
+ if (pcxt->nknown_attached_workers >= pcxt->nworkers_launched)
+ {
+ Assert(pcxt->nknown_attached_workers == pcxt->nworkers_launched);
+ break;
+ }
+ }
+}
+
/*
* Wait for all workers to finish computing.
*
*/
if (pcxt->worker[i].error_mqh == NULL)
++nfinished;
- else if (pcxt->any_message_received[i])
+ else if (pcxt->known_attached_workers[i])
{
anyone_alive = true;
break;
{
char msgtype;
- if (pcxt->any_message_received != NULL)
- pcxt->any_message_received[i] = true;
+ if (pcxt->known_attached_workers != NULL &&
+ !pcxt->known_attached_workers[i])
+ {
+ pcxt->known_attached_workers[i] = true;
+ pcxt->nknown_attached_workers++;
+ }
msgtype = pq_getmsgbyte(msg);