Introduce group locking to prevent parallel processes from deadlocking.
/*-------------------------------------------------------------------------
 *
 * parallel.c
 *        Infrastructure for launching parallel workers
 *
 * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * IDENTIFICATION
 *        src/backend/access/transam/parallel.c
 *
 *-------------------------------------------------------------------------
 */

#include "postgres.h"

#include "access/xact.h"
#include "access/xlog.h"
#include "access/parallel.h"
#include "commands/async.h"
#include "libpq/libpq.h"
#include "libpq/pqformat.h"
#include "libpq/pqmq.h"
#include "miscadmin.h"
#include "storage/ipc.h"
#include "storage/sinval.h"
#include "storage/spin.h"
#include "tcop/tcopprot.h"
#include "utils/combocid.h"
#include "utils/guc.h"
#include "utils/inval.h"
#include "utils/memutils.h"
#include "utils/resowner.h"
#include "utils/snapmgr.h"

/*
 * We don't want to waste a lot of memory on an error queue which, most of
 * the time, will process only a handful of small messages.  However, it is
 * desirable to make it large enough that a typical ErrorResponse can be sent
 * without blocking.  That way, a worker that errors out can write the whole
 * message into the queue and terminate without waiting for the user backend.
 */
#define PARALLEL_ERROR_QUEUE_SIZE                       16384

/* Magic number for parallel context TOC. */
#define PARALLEL_MAGIC                                          0x50477c7c

/*
 * Magic numbers for parallel state sharing.  Higher-level code should use
 * smaller values, leaving these very large ones for use by this module.
 */
#define PARALLEL_KEY_FIXED                                      UINT64CONST(0xFFFFFFFFFFFF0001)
#define PARALLEL_KEY_ERROR_QUEUE                        UINT64CONST(0xFFFFFFFFFFFF0002)
#define PARALLEL_KEY_LIBRARY                            UINT64CONST(0xFFFFFFFFFFFF0003)
#define PARALLEL_KEY_GUC                                        UINT64CONST(0xFFFFFFFFFFFF0004)
#define PARALLEL_KEY_COMBO_CID                          UINT64CONST(0xFFFFFFFFFFFF0005)
#define PARALLEL_KEY_TRANSACTION_SNAPSHOT       UINT64CONST(0xFFFFFFFFFFFF0006)
#define PARALLEL_KEY_ACTIVE_SNAPSHOT            UINT64CONST(0xFFFFFFFFFFFF0007)
#define PARALLEL_KEY_TRANSACTION_STATE          UINT64CONST(0xFFFFFFFFFFFF0008)
#define PARALLEL_KEY_EXTENSION_TRAMPOLINE       UINT64CONST(0xFFFFFFFFFFFF0009)
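
/*
 * Higher-level code reserves its own, smaller key values; a hypothetical
 * sketch of what a caller might define (these keys are not defined here):
 *
 *              #define MYCODE_KEY_PLAN_DATA    UINT64CONST(0x0000000000000001)
 *              #define MYCODE_KEY_TUPLE_QUEUE  UINT64CONST(0x0000000000000002)
 */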

/* Fixed-size parallel state. */
typedef struct FixedParallelState
{
        /* Fixed-size state that workers must restore. */
        Oid                     database_id;
        Oid                     authenticated_user_id;
        Oid                     current_user_id;
        int                     sec_context;
        PGPROC     *parallel_master_pgproc;
        pid_t           parallel_master_pid;
        BackendId       parallel_master_backend_id;

        /* Entrypoint for parallel workers. */
        parallel_worker_main_type entrypoint;

        /* Mutex protects remaining fields. */
        slock_t         mutex;

        /* Maximum XactLastRecEnd of any worker. */
        XLogRecPtr      last_xlog_end;
} FixedParallelState;

/*
 * Our parallel worker number.  We initialize this to -1, meaning that we are
 * not a parallel worker.  In parallel workers, it will be set to a value >= 0
 * and < the number of workers before any user code is invoked; each parallel
 * worker will get a different parallel worker number.
 */
int                     ParallelWorkerNumber = -1;
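
/*
 * An entrypoint can use this, for example, to claim a per-worker slice of
 * state stored in the shared memory segment (a hypothetical sketch; the
 * names below are placeholders):
 *
 *              mystate = &shared_state->per_worker[ParallelWorkerNumber];
 */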

/* Is there a parallel message pending which we need to receive? */
bool            ParallelMessagePending = false;

/* Are we initializing a parallel worker? */
bool            InitializingParallelWorker = false;

/* Pointer to our fixed parallel state. */
static FixedParallelState *MyFixedParallelState;

/* List of active parallel contexts. */
static dlist_head pcxt_list = DLIST_STATIC_INIT(pcxt_list);

/* Private functions. */
static void HandleParallelMessage(ParallelContext *, int, StringInfo msg);
static void ParallelErrorContext(void *arg);
static void ParallelExtensionTrampoline(dsm_segment *seg, shm_toc *toc);
static void ParallelWorkerMain(Datum main_arg);
static void WaitForParallelWorkersToExit(ParallelContext *pcxt);

/*
 * Establish a new parallel context.  This should be done after entering
 * parallel mode, and (unless there is an error) the context should be
 * destroyed before exiting the current subtransaction.
 */
ParallelContext *
CreateParallelContext(parallel_worker_main_type entrypoint, int nworkers)
{
        MemoryContext oldcontext;
        ParallelContext *pcxt;

        /* It is unsafe to create a parallel context if not in parallel mode. */
        Assert(IsInParallelMode());

        /* Number of workers should be non-negative. */
        Assert(nworkers >= 0);

        /*
         * If dynamic shared memory is not available, we won't be able to use
         * background workers.
         */
        if (dynamic_shared_memory_type == DSM_IMPL_NONE)
                nworkers = 0;

        /*
         * If we are running under serializable isolation, we can't use
         * parallel workers, at least not until somebody enhances that mechanism
         * to be parallel-aware.
         */
        if (IsolationIsSerializable())
                nworkers = 0;

        /* We might be running in a short-lived memory context. */
        oldcontext = MemoryContextSwitchTo(TopTransactionContext);

        /* Initialize a new ParallelContext. */
        pcxt = palloc0(sizeof(ParallelContext));
        pcxt->subid = GetCurrentSubTransactionId();
        pcxt->nworkers = nworkers;
        pcxt->entrypoint = entrypoint;
        pcxt->error_context_stack = error_context_stack;
        shm_toc_initialize_estimator(&pcxt->estimator);
        dlist_push_head(&pcxt_list, &pcxt->node);

        /* Restore previous memory context. */
        MemoryContextSwitchTo(oldcontext);

        return pcxt;
}
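
/*
 * A minimal sketch of the overall calling sequence (compare
 * src/backend/access/transam/README.parallel); my_worker_main is a
 * placeholder entrypoint, and error handling is omitted:
 *
 *              EnterParallelMode();
 *              pcxt = CreateParallelContext(my_worker_main, nworkers);
 *              InitializeParallelDSM(pcxt);
 *              LaunchParallelWorkers(pcxt);
 *              ... wait for the workers to do their thing ...
 *              WaitForParallelWorkersToFinish(pcxt);
 *              DestroyParallelContext(pcxt);
 *              ExitParallelMode();
 */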

/*
 * Establish a new parallel context that calls a function provided by an
 * extension.  This works around the fact that the library might get mapped
 * at a different address in each backend.
 */
ParallelContext *
CreateParallelContextForExternalFunction(char *library_name,
                                                                                 char *function_name,
                                                                                 int nworkers)
{
        MemoryContext oldcontext;
        ParallelContext *pcxt;

        /* We might be running in a very short-lived memory context. */
        oldcontext = MemoryContextSwitchTo(TopTransactionContext);

        /* Create the context. */
        pcxt = CreateParallelContext(ParallelExtensionTrampoline, nworkers);
        pcxt->library_name = pstrdup(library_name);
        pcxt->function_name = pstrdup(function_name);

        /* Restore previous memory context. */
        MemoryContextSwitchTo(oldcontext);

        return pcxt;
}

/*
 * Establish the dynamic shared memory segment for a parallel context and
 * copy state and other bookkeeping information that will be needed by
 * parallel workers into it.
 */
void
InitializeParallelDSM(ParallelContext *pcxt)
{
        MemoryContext oldcontext;
        Size            library_len = 0;
        Size            guc_len = 0;
        Size            combocidlen = 0;
        Size            tsnaplen = 0;
        Size            asnaplen = 0;
        Size            tstatelen = 0;
        Size            segsize = 0;
        int                     i;
        FixedParallelState *fps;
        Snapshot        transaction_snapshot = GetTransactionSnapshot();
        Snapshot        active_snapshot = GetActiveSnapshot();

        /* We might be running in a very short-lived memory context. */
        oldcontext = MemoryContextSwitchTo(TopTransactionContext);

        /* Allow space to store the fixed-size parallel state. */
        shm_toc_estimate_chunk(&pcxt->estimator, sizeof(FixedParallelState));
        shm_toc_estimate_keys(&pcxt->estimator, 1);

        /*
         * Normally, the user will have requested at least one worker process, but
         * if by chance they have not, we can skip a bunch of things here.
         */
        if (pcxt->nworkers > 0)
        {
                /* Estimate space for various kinds of state sharing. */
                library_len = EstimateLibraryStateSpace();
                shm_toc_estimate_chunk(&pcxt->estimator, library_len);
                guc_len = EstimateGUCStateSpace();
                shm_toc_estimate_chunk(&pcxt->estimator, guc_len);
                combocidlen = EstimateComboCIDStateSpace();
                shm_toc_estimate_chunk(&pcxt->estimator, combocidlen);
                tsnaplen = EstimateSnapshotSpace(transaction_snapshot);
                shm_toc_estimate_chunk(&pcxt->estimator, tsnaplen);
                asnaplen = EstimateSnapshotSpace(active_snapshot);
                shm_toc_estimate_chunk(&pcxt->estimator, asnaplen);
                tstatelen = EstimateTransactionStateSpace();
                shm_toc_estimate_chunk(&pcxt->estimator, tstatelen);
                /* If you add more chunks here, you probably need to add keys. */
                shm_toc_estimate_keys(&pcxt->estimator, 6);

                /* Estimate space needed for error queues. */
                StaticAssertStmt(BUFFERALIGN(PARALLEL_ERROR_QUEUE_SIZE) ==
                                                 PARALLEL_ERROR_QUEUE_SIZE,
                                                 "parallel error queue size not buffer-aligned");
                shm_toc_estimate_chunk(&pcxt->estimator,
                                                           PARALLEL_ERROR_QUEUE_SIZE * pcxt->nworkers);
                shm_toc_estimate_keys(&pcxt->estimator, 1);

                /* Estimate how much we'll need for extension entrypoint info. */
                if (pcxt->library_name != NULL)
                {
                        Assert(pcxt->entrypoint == ParallelExtensionTrampoline);
                        Assert(pcxt->function_name != NULL);
                        shm_toc_estimate_chunk(&pcxt->estimator, strlen(pcxt->library_name)
                                                                   + strlen(pcxt->function_name) + 2);
                        shm_toc_estimate_keys(&pcxt->estimator, 1);
                }
        }

        /*
         * Create DSM and initialize with new table of contents.  But if the user
         * didn't request any workers, then don't bother creating a dynamic shared
         * memory segment; instead, just use backend-private memory.
         *
         * Also, if we can't create a dynamic shared memory segment because the
         * maximum number of segments have already been created, then fall back to
         * backend-private memory, and plan not to use any workers.  We hope this
         * won't happen very often, but it's better to abandon the use of
         * parallelism than to fail outright.
         */
        segsize = shm_toc_estimate(&pcxt->estimator);
        if (pcxt->nworkers != 0)
                pcxt->seg = dsm_create(segsize, DSM_CREATE_NULL_IF_MAXSEGMENTS);
        if (pcxt->seg != NULL)
                pcxt->toc = shm_toc_create(PARALLEL_MAGIC,
                                                                   dsm_segment_address(pcxt->seg),
                                                                   segsize);
        else
        {
                pcxt->nworkers = 0;
                pcxt->private_memory = MemoryContextAlloc(TopMemoryContext, segsize);
                pcxt->toc = shm_toc_create(PARALLEL_MAGIC, pcxt->private_memory,
                                                                   segsize);
        }

        /* Initialize fixed-size state in shared memory. */
        fps = (FixedParallelState *)
                shm_toc_allocate(pcxt->toc, sizeof(FixedParallelState));
        fps->database_id = MyDatabaseId;
        fps->authenticated_user_id = GetAuthenticatedUserId();
        GetUserIdAndSecContext(&fps->current_user_id, &fps->sec_context);
        fps->parallel_master_pgproc = MyProc;
        fps->parallel_master_pid = MyProcPid;
        fps->parallel_master_backend_id = MyBackendId;
        fps->entrypoint = pcxt->entrypoint;
        SpinLockInit(&fps->mutex);
        fps->last_xlog_end = 0;
        shm_toc_insert(pcxt->toc, PARALLEL_KEY_FIXED, fps);

        /* We can skip the rest of this if we're not budgeting for any workers. */
        if (pcxt->nworkers > 0)
        {
                char       *libraryspace;
                char       *gucspace;
                char       *combocidspace;
                char       *tsnapspace;
                char       *asnapspace;
                char       *tstatespace;
                char       *error_queue_space;

                /* Serialize shared libraries we have loaded. */
                libraryspace = shm_toc_allocate(pcxt->toc, library_len);
                SerializeLibraryState(library_len, libraryspace);
                shm_toc_insert(pcxt->toc, PARALLEL_KEY_LIBRARY, libraryspace);

                /* Serialize GUC settings. */
                gucspace = shm_toc_allocate(pcxt->toc, guc_len);
                SerializeGUCState(guc_len, gucspace);
                shm_toc_insert(pcxt->toc, PARALLEL_KEY_GUC, gucspace);

                /* Serialize combo CID state. */
                combocidspace = shm_toc_allocate(pcxt->toc, combocidlen);
                SerializeComboCIDState(combocidlen, combocidspace);
                shm_toc_insert(pcxt->toc, PARALLEL_KEY_COMBO_CID, combocidspace);

                /* Serialize transaction snapshot and active snapshot. */
                tsnapspace = shm_toc_allocate(pcxt->toc, tsnaplen);
                SerializeSnapshot(transaction_snapshot, tsnapspace);
                shm_toc_insert(pcxt->toc, PARALLEL_KEY_TRANSACTION_SNAPSHOT,
                                           tsnapspace);
                asnapspace = shm_toc_allocate(pcxt->toc, asnaplen);
                SerializeSnapshot(active_snapshot, asnapspace);
                shm_toc_insert(pcxt->toc, PARALLEL_KEY_ACTIVE_SNAPSHOT, asnapspace);

                /* Serialize transaction state. */
                tstatespace = shm_toc_allocate(pcxt->toc, tstatelen);
                SerializeTransactionState(tstatelen, tstatespace);
                shm_toc_insert(pcxt->toc, PARALLEL_KEY_TRANSACTION_STATE, tstatespace);

                /* Allocate space for worker information. */
                pcxt->worker = palloc0(sizeof(ParallelWorkerInfo) * pcxt->nworkers);

                /*
                 * Establish error queues in dynamic shared memory.
                 *
                 * These queues should be used only for transmitting ErrorResponse,
                 * NoticeResponse, and NotifyResponse protocol messages.  Tuple data
                 * should be transmitted via separate (possibly larger?) queues.
                 */
                error_queue_space =
                        shm_toc_allocate(pcxt->toc,
                                                         PARALLEL_ERROR_QUEUE_SIZE * pcxt->nworkers);
                for (i = 0; i < pcxt->nworkers; ++i)
                {
                        char       *start;
                        shm_mq     *mq;

                        start = error_queue_space + i * PARALLEL_ERROR_QUEUE_SIZE;
                        mq = shm_mq_create(start, PARALLEL_ERROR_QUEUE_SIZE);
                        shm_mq_set_receiver(mq, MyProc);
                        pcxt->worker[i].error_mqh = shm_mq_attach(mq, pcxt->seg, NULL);
                }
                shm_toc_insert(pcxt->toc, PARALLEL_KEY_ERROR_QUEUE, error_queue_space);

                /* Serialize extension entrypoint information. */
                if (pcxt->library_name != NULL)
                {
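                        /*
                         * The serialized form is simply the two names stored as
                         * consecutive NUL-terminated strings, e.g. "mylib\0myfunc\0";
                         * ParallelExtensionTrampoline() unpacks this layout on the
                         * worker side.
                         */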
                        Size            lnamelen = strlen(pcxt->library_name);
                        char       *extensionstate;

                        extensionstate = shm_toc_allocate(pcxt->toc, lnamelen
                                                                                  + strlen(pcxt->function_name) + 2);
                        strcpy(extensionstate, pcxt->library_name);
                        strcpy(extensionstate + lnamelen + 1, pcxt->function_name);
                        shm_toc_insert(pcxt->toc, PARALLEL_KEY_EXTENSION_TRAMPOLINE,
                                                   extensionstate);
                }
        }

        /* Restore previous memory context. */
        MemoryContextSwitchTo(oldcontext);
}

/*
 * Reinitialize the dynamic shared memory segment for a parallel context so
 * that we can launch workers for it again.
 */
void
ReinitializeParallelDSM(ParallelContext *pcxt)
{
        FixedParallelState *fps;
        char       *error_queue_space;
        int                     i;

        if (pcxt->nworkers_launched == 0)
                return;

        WaitForParallelWorkersToFinish(pcxt);
        WaitForParallelWorkersToExit(pcxt);

        /* Reset a few bits of fixed parallel state to a clean state. */
        fps = shm_toc_lookup(pcxt->toc, PARALLEL_KEY_FIXED);
        fps->last_xlog_end = 0;

        /* Recreate error queues. */
        error_queue_space =
                shm_toc_lookup(pcxt->toc, PARALLEL_KEY_ERROR_QUEUE);
        for (i = 0; i < pcxt->nworkers; ++i)
        {
                char       *start;
                shm_mq     *mq;

                start = error_queue_space + i * PARALLEL_ERROR_QUEUE_SIZE;
                mq = shm_mq_create(start, PARALLEL_ERROR_QUEUE_SIZE);
                shm_mq_set_receiver(mq, MyProc);
                pcxt->worker[i].error_mqh = shm_mq_attach(mq, pcxt->seg, NULL);
        }

        /* Reset number of workers launched. */
        pcxt->nworkers_launched = 0;
}

/*
 * Launch parallel workers.
 */
void
LaunchParallelWorkers(ParallelContext *pcxt)
{
        MemoryContext oldcontext;
        BackgroundWorker worker;
        int                     i;
        bool            any_registrations_failed = false;

        /* Skip this if we have no workers. */
        if (pcxt->nworkers == 0)
                return;

        /* We need to be a lock group leader. */
        BecomeLockGroupLeader();

        /* If we do have workers, we'd better have a DSM segment. */
        Assert(pcxt->seg != NULL);

        /* We might be running in a short-lived memory context. */
        oldcontext = MemoryContextSwitchTo(TopTransactionContext);

        /* Configure a worker. */
        snprintf(worker.bgw_name, BGW_MAXLEN, "parallel worker for PID %d",
                         MyProcPid);
        worker.bgw_flags =
                BGWORKER_SHMEM_ACCESS | BGWORKER_BACKEND_DATABASE_CONNECTION;
        worker.bgw_start_time = BgWorkerStart_ConsistentState;
        worker.bgw_restart_time = BGW_NEVER_RESTART;
        worker.bgw_main = ParallelWorkerMain;
        worker.bgw_main_arg = UInt32GetDatum(dsm_segment_handle(pcxt->seg));
        worker.bgw_notify_pid = MyProcPid;
        memset(&worker.bgw_extra, 0, BGW_EXTRALEN);

        /*
         * Start workers.
         *
         * The caller must be able to tolerate ending up with fewer workers than
         * expected, so there is no need to throw an error here if registration
         * fails.  It wouldn't help much anyway, because registering the worker in
         * no way guarantees that it will start up and initialize successfully.
         */
        for (i = 0; i < pcxt->nworkers; ++i)
        {
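                /*
                 * Pass this worker's number through bgw_extra;
                 * ParallelWorkerMain() copies it back out to set
                 * ParallelWorkerNumber on the worker side.
                 */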
                memcpy(worker.bgw_extra, &i, sizeof(int));
                if (!any_registrations_failed &&
                        RegisterDynamicBackgroundWorker(&worker,
                                                                                        &pcxt->worker[i].bgwhandle))
                {
                        shm_mq_set_handle(pcxt->worker[i].error_mqh,
                                                          pcxt->worker[i].bgwhandle);
                        pcxt->nworkers_launched++;
                }
                else
                {
                        /*
                         * If we weren't able to register the worker, then we've bumped up
                         * against the max_worker_processes limit, and future
                         * registrations will probably fail too, so arrange to skip them.
                         * But we still have to execute this code for the remaining slots
                         * to make sure that we forget about the error queues we budgeted
                         * for those workers.  Otherwise, we'll wait for them to start,
                         * but they never will.
                         */
                        any_registrations_failed = true;
                        pcxt->worker[i].bgwhandle = NULL;
                        pcxt->worker[i].error_mqh = NULL;
                }
        }

        /* Restore previous memory context. */
        MemoryContextSwitchTo(oldcontext);
}

/*
 * Wait for all workers to finish computing.
 *
 * Even if the parallel operation seems to have completed successfully, it's
 * important to call this function afterwards.  We must not miss any errors
 * the workers may have thrown during the parallel operation, or any that they
 * may yet throw while shutting down.
 *
 * Also, we want to update our notion of XactLastRecEnd based on worker
 * feedback.
 */
void
WaitForParallelWorkersToFinish(ParallelContext *pcxt)
{
        for (;;)
        {
                bool            anyone_alive = false;
                int                     i;

                /*
                 * This will process any parallel messages that are pending, which may
                 * change the outcome of the loop that follows.  It may also throw an
                 * error propagated from a worker.
                 */
                CHECK_FOR_INTERRUPTS();

                for (i = 0; i < pcxt->nworkers; ++i)
                {
                        if (pcxt->worker[i].error_mqh != NULL)
                        {
                                anyone_alive = true;
                                break;
                        }
                }

                if (!anyone_alive)
                        break;

                WaitLatch(&MyProc->procLatch, WL_LATCH_SET, -1);
                ResetLatch(&MyProc->procLatch);
        }

        if (pcxt->toc != NULL)
        {
                FixedParallelState *fps;

                fps = shm_toc_lookup(pcxt->toc, PARALLEL_KEY_FIXED);
                if (fps->last_xlog_end > XactLastRecEnd)
                        XactLastRecEnd = fps->last_xlog_end;
        }
}

/*
 * Wait for all workers to exit.
 *
 * This function ensures that the workers have been completely shut down.
 * The difference between WaitForParallelWorkersToFinish and this function
 * is that the former only ensures that the last message sent by a worker
 * backend has been received by the master backend, whereas this one ensures
 * that the workers have actually exited.
 */
static void
WaitForParallelWorkersToExit(ParallelContext *pcxt)
{
        int                     i;

        /* Wait until the workers actually die. */
        for (i = 0; i < pcxt->nworkers; ++i)
        {
                BgwHandleStatus status;

                if (pcxt->worker == NULL || pcxt->worker[i].bgwhandle == NULL)
                        continue;

                status = WaitForBackgroundWorkerShutdown(pcxt->worker[i].bgwhandle);

                /*
                 * If the postmaster kicked the bucket, we have no chance of cleaning
                 * up safely -- we won't be able to tell when our workers are actually
                 * dead.  This doesn't necessitate a PANIC since they will all abort
                 * eventually, but we can't safely continue this session.
                 */
                if (status == BGWH_POSTMASTER_DIED)
                        ereport(FATAL,
                                        (errcode(ERRCODE_ADMIN_SHUTDOWN),
                                 errmsg("postmaster exited during a parallel transaction")));

                /* Release memory. */
                pfree(pcxt->worker[i].bgwhandle);
                pcxt->worker[i].bgwhandle = NULL;
        }
}

/*
 * Destroy a parallel context.
 *
 * If expecting a clean exit, you should use WaitForParallelWorkersToFinish()
 * first, before calling this function.  When this function is invoked, any
 * remaining workers are forcibly killed; the dynamic shared memory segment
 * is unmapped; and we then wait (uninterruptibly) for the workers to exit.
 */
void
DestroyParallelContext(ParallelContext *pcxt)
{
        int                     i;

        /*
         * Be careful about order of operations here!  We remove the parallel
         * context from the list before we do anything else; otherwise, if an
         * error occurs during a subsequent step, we might try to nuke it again
         * from AtEOXact_Parallel or AtEOSubXact_Parallel.
         */
        dlist_delete(&pcxt->node);

        /* Kill each worker in turn, and forget their error queues. */
        if (pcxt->worker != NULL)
        {
                for (i = 0; i < pcxt->nworkers; ++i)
                {
                        if (pcxt->worker[i].error_mqh != NULL)
                        {
                                TerminateBackgroundWorker(pcxt->worker[i].bgwhandle);

                                pfree(pcxt->worker[i].error_mqh);
                                pcxt->worker[i].error_mqh = NULL;
                        }
                }
        }

        /*
         * If we have allocated a shared memory segment, detach it.  This will
         * implicitly detach the error queues, and any other shared memory queues,
         * stored there.
         */
        if (pcxt->seg != NULL)
        {
                dsm_detach(pcxt->seg);
                pcxt->seg = NULL;
        }

        /*
         * If this parallel context is actually in backend-private memory rather
         * than shared memory, free that memory instead.
         */
        if (pcxt->private_memory != NULL)
        {
                pfree(pcxt->private_memory);
                pcxt->private_memory = NULL;
        }

        /*
         * We can't finish transaction commit or abort until all of the
         * workers have exited.  This means, in particular, that we can't respond
         * to interrupts at this stage.
         */
        HOLD_INTERRUPTS();
        WaitForParallelWorkersToExit(pcxt);
        RESUME_INTERRUPTS();

        /* Free the worker array itself. */
        if (pcxt->worker != NULL)
        {
                pfree(pcxt->worker);
                pcxt->worker = NULL;
        }

        /* Free memory. */
        pfree(pcxt);
}

/*
 * Are there any parallel contexts currently active?
 */
bool
ParallelContextActive(void)
{
        return !dlist_is_empty(&pcxt_list);
}

/*
 * Handle receipt of an interrupt indicating a parallel worker message.
 */
void
HandleParallelMessageInterrupt(void)
{
        int                     save_errno = errno;

        InterruptPending = true;
        ParallelMessagePending = true;
        SetLatch(MyLatch);

        errno = save_errno;
}

/*
 * Handle any queued protocol messages received from parallel workers.
 */
void
HandleParallelMessages(void)
{
        dlist_iter      iter;

        ParallelMessagePending = false;

        dlist_foreach(iter, &pcxt_list)
        {
                ParallelContext *pcxt;
                int                     i;
                Size            nbytes;
                void       *data;

                pcxt = dlist_container(ParallelContext, node, iter.cur);
                if (pcxt->worker == NULL)
                        continue;

                for (i = 0; i < pcxt->nworkers; ++i)
                {
                        /*
                         * Read as many messages as we can from each worker, but stop when
                         * either (1) the error queue goes away, which can happen if we
                         * receive a Terminate message from the worker; or (2) no more
                         * messages can be read from the worker without blocking.
                         */
                        while (pcxt->worker[i].error_mqh != NULL)
                        {
                                shm_mq_result res;

                                res = shm_mq_receive(pcxt->worker[i].error_mqh, &nbytes,
                                                                         &data, true);
                                if (res == SHM_MQ_WOULD_BLOCK)
                                        break;
                                else if (res == SHM_MQ_SUCCESS)
                                {
                                        StringInfoData msg;

                                        initStringInfo(&msg);
                                        appendBinaryStringInfo(&msg, data, nbytes);
                                        HandleParallelMessage(pcxt, i, &msg);
                                        pfree(msg.data);
                                }
                                else
                                        ereport(ERROR,
                                                        (errcode(ERRCODE_INTERNAL_ERROR),       /* XXX: wrong errcode? */
                                                         errmsg("lost connection to parallel worker")));

                                /* This might make the error queue go away. */
                                CHECK_FOR_INTERRUPTS();
                        }
                }
        }
}

/*
 * Handle a single protocol message received from a single parallel worker.
 */
static void
HandleParallelMessage(ParallelContext *pcxt, int i, StringInfo msg)
{
        char            msgtype;

        msgtype = pq_getmsgbyte(msg);

        switch (msgtype)
        {
                case 'K':                               /* BackendKeyData */
                        {
                                int32           pid = pq_getmsgint(msg, 4);

                                (void) pq_getmsgint(msg, 4);    /* discard cancel key */
                                (void) pq_getmsgend(msg);
                                pcxt->worker[i].pid = pid;
                                break;
                        }

                case 'E':                               /* ErrorResponse */
                case 'N':                               /* NoticeResponse */
                        {
                                ErrorData       edata;
                                ErrorContextCallback errctx;
                                ErrorContextCallback *save_error_context_stack;

                                /*
                                 * Rethrow the error using the error context callbacks that
                                 * were in effect when the context was created, not the
                                 * current ones.
                                 */
                                save_error_context_stack = error_context_stack;
                                errctx.callback = ParallelErrorContext;
                                errctx.arg = &pcxt->worker[i].pid;
                                errctx.previous = pcxt->error_context_stack;
                                error_context_stack = &errctx;

                                /* Parse ErrorResponse or NoticeResponse. */
                                pq_parse_errornotice(msg, &edata);

                                /* Death of a worker isn't enough justification for suicide. */
                                edata.elevel = Min(edata.elevel, ERROR);

                                /* Rethrow error or notice. */
                                ThrowErrorData(&edata);

                                /* Restore previous context. */
                                error_context_stack = save_error_context_stack;

                                break;
                        }

                case 'A':                               /* NotifyResponse */
                        {
                                /* Propagate NotifyResponse. */
                                pq_putmessage(msg->data[0], &msg->data[1], msg->len - 1);
                                break;
                        }

                case 'X':                               /* Terminate, indicating clean exit */
                        {
                                pfree(pcxt->worker[i].error_mqh);
                                pcxt->worker[i].error_mqh = NULL;
                                break;
                        }

                default:
                        {
                                elog(ERROR, "unknown message type: %c (%d bytes)",
                                         msgtype, msg->len);
                        }
        }
}

/*
 * End-of-subtransaction cleanup for parallel contexts.
 *
 * Currently, it's forbidden to enter or leave a subtransaction while
 * parallel mode is in effect, so we could just blow away everything.  But
 * we may want to relax that restriction in the future, so this code
 * contemplates that there may be multiple subtransaction IDs in pcxt_list.
 */
void
AtEOSubXact_Parallel(bool isCommit, SubTransactionId mySubId)
{
        while (!dlist_is_empty(&pcxt_list))
        {
                ParallelContext *pcxt;

                pcxt = dlist_head_element(ParallelContext, node, &pcxt_list);
                if (pcxt->subid != mySubId)
                        break;
                if (isCommit)
                        elog(WARNING, "leaked parallel context");
                DestroyParallelContext(pcxt);
        }
}

/*
 * End-of-transaction cleanup for parallel contexts.
 */
void
AtEOXact_Parallel(bool isCommit)
{
        while (!dlist_is_empty(&pcxt_list))
        {
                ParallelContext *pcxt;

                pcxt = dlist_head_element(ParallelContext, node, &pcxt_list);
                if (isCommit)
                        elog(WARNING, "leaked parallel context");
                DestroyParallelContext(pcxt);
        }
}

/*
 * Main entrypoint for parallel workers.
 */
static void
ParallelWorkerMain(Datum main_arg)
{
        dsm_segment *seg;
        shm_toc    *toc;
        FixedParallelState *fps;
        char       *error_queue_space;
        shm_mq     *mq;
        shm_mq_handle *mqh;
        char       *libraryspace;
        char       *gucspace;
        char       *combocidspace;
        char       *tsnapspace;
        char       *asnapspace;
        char       *tstatespace;
        StringInfoData msgbuf;

        /* Set flag to indicate that we're initializing a parallel worker. */
        InitializingParallelWorker = true;

        /* Establish signal handlers. */
        pqsignal(SIGTERM, die);
        BackgroundWorkerUnblockSignals();

        /* Determine and set our parallel worker number. */
        Assert(ParallelWorkerNumber == -1);
        memcpy(&ParallelWorkerNumber, MyBgworkerEntry->bgw_extra, sizeof(int));

        /* Set up a memory context and resource owner. */
        Assert(CurrentResourceOwner == NULL);
        CurrentResourceOwner = ResourceOwnerCreate(NULL, "parallel toplevel");
        CurrentMemoryContext = AllocSetContextCreate(TopMemoryContext,
                                                                                                 "parallel worker",
                                                                                                 ALLOCSET_DEFAULT_MINSIZE,
                                                                                                 ALLOCSET_DEFAULT_INITSIZE,
                                                                                                 ALLOCSET_DEFAULT_MAXSIZE);

        /*
         * Now that we have a resource owner, we can attach to the dynamic shared
         * memory segment and read the table of contents.
         */
        seg = dsm_attach(DatumGetUInt32(main_arg));
        if (seg == NULL)
                ereport(ERROR,
                                (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
                                 errmsg("could not map dynamic shared memory segment")));
        toc = shm_toc_attach(PARALLEL_MAGIC, dsm_segment_address(seg));
        if (toc == NULL)
                ereport(ERROR,
                                (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
                           errmsg("invalid magic number in dynamic shared memory segment")));

        /* Look up fixed parallel state. */
        fps = shm_toc_lookup(toc, PARALLEL_KEY_FIXED);
        Assert(fps != NULL);
        MyFixedParallelState = fps;

        /*
         * Now that we have a worker number, we can find and attach to the error
         * queue provided for us.  That's good, because until we do that, any
         * errors that happen here will not be reported back to the process that
         * requested that this worker be launched.
         */
        error_queue_space = shm_toc_lookup(toc, PARALLEL_KEY_ERROR_QUEUE);
        mq = (shm_mq *) (error_queue_space +
                                         ParallelWorkerNumber * PARALLEL_ERROR_QUEUE_SIZE);
        shm_mq_set_sender(mq, MyProc);
        mqh = shm_mq_attach(mq, seg, NULL);
        pq_redirect_to_shm_mq(seg, mqh);
        pq_set_parallel_master(fps->parallel_master_pid,
                                                   fps->parallel_master_backend_id);

        /*
         * Send a BackendKeyData message to the process that initiated parallelism
         * so that it has access to our PID before it receives any other messages
         * from us.  Our cancel key is sent, too, since that's the way the
         * protocol message is defined, but it won't actually be used for anything
         * in this case.
         */
        pq_beginmessage(&msgbuf, 'K');
        pq_sendint(&msgbuf, (int32) MyProcPid, sizeof(int32));
        pq_sendint(&msgbuf, (int32) MyCancelKey, sizeof(int32));
        pq_endmessage(&msgbuf);

        /*
         * Hooray! Primary initialization is complete.  Now, we need to set up our
         * backend-local state to match the original backend.
         */

        /*
         * Join locking group.  We must do this before anything that could try
         * to acquire a heavyweight lock, because any heavyweight locks acquired
         * to this point could block either directly against the parallel group
         * leader or against some process which in turn waits for a lock that
         * conflicts with the parallel group leader, causing an undetected
         * deadlock.  (If we can't join the lock group, the leader has gone away,
         * so just exit quietly.)
         */
        if (!BecomeLockGroupMember(fps->parallel_master_pgproc,
                                                           fps->parallel_master_pid))
                return;

        /*
         * Load libraries that were loaded by original backend.  We want to do
         * this before restoring GUCs, because the libraries might define custom
         * variables.
         */
        libraryspace = shm_toc_lookup(toc, PARALLEL_KEY_LIBRARY);
        Assert(libraryspace != NULL);
        RestoreLibraryState(libraryspace);

        /* Restore database connection. */
        BackgroundWorkerInitializeConnectionByOid(fps->database_id,
                                                                                          fps->authenticated_user_id);

        /* Restore GUC values from launching backend. */
        gucspace = shm_toc_lookup(toc, PARALLEL_KEY_GUC);
        Assert(gucspace != NULL);
        StartTransactionCommand();
        RestoreGUCState(gucspace);
        CommitTransactionCommand();

        /* Crank up a transaction state appropriate to a parallel worker. */
        tstatespace = shm_toc_lookup(toc, PARALLEL_KEY_TRANSACTION_STATE);
        StartParallelWorkerTransaction(tstatespace);

        /* Restore combo CID state. */
        combocidspace = shm_toc_lookup(toc, PARALLEL_KEY_COMBO_CID);
        Assert(combocidspace != NULL);
        RestoreComboCIDState(combocidspace);

        /* Restore transaction snapshot. */
        tsnapspace = shm_toc_lookup(toc, PARALLEL_KEY_TRANSACTION_SNAPSHOT);
        Assert(tsnapspace != NULL);
        RestoreTransactionSnapshot(RestoreSnapshot(tsnapspace),
                                                           fps->parallel_master_pgproc);

        /* Restore active snapshot. */
        asnapspace = shm_toc_lookup(toc, PARALLEL_KEY_ACTIVE_SNAPSHOT);
        Assert(asnapspace != NULL);
        PushActiveSnapshot(RestoreSnapshot(asnapspace));

        /*
         * We've changed which tuples we can see, and must therefore invalidate
         * system caches.
         */
        InvalidateSystemCaches();

        /* Restore user ID and security context. */
        SetUserIdAndSecContext(fps->current_user_id, fps->sec_context);

        /*
         * We've initialized all of our state now; nothing should change
         * hereafter.
         */
        InitializingParallelWorker = false;
        EnterParallelMode();

        /*
         * Time to do the real work: invoke the caller-supplied code.
         *
         * If you get a crash at this line, see the comments for
         * ParallelExtensionTrampoline.
         */
        fps->entrypoint(seg, toc);

        /* Must exit parallel mode to pop active snapshot. */
        ExitParallelMode();

        /* Must pop active snapshot so resowner.c doesn't complain. */
        PopActiveSnapshot();

        /* Shut down the parallel-worker transaction. */
        EndParallelWorkerTransaction();

        /* Report success. */
        pq_putmessage('X', NULL, 0);
}

/*
 * It's unsafe for the entrypoint invoked by ParallelWorkerMain to be a
 * function living in a dynamically loaded module, because the module might
 * not be loaded in every process, or might be loaded but not at the same
 * address.  To work around that problem,
 * CreateParallelContextForExternalFunction() arranges to call this function
 * rather than calling the extension-provided function directly; and this
 * function then looks up the real entrypoint and
 * calls it.
 */
static void
ParallelExtensionTrampoline(dsm_segment *seg, shm_toc *toc)
{
        char       *extensionstate;
        char       *library_name;
        char       *function_name;
        parallel_worker_main_type entrypt;

        extensionstate = shm_toc_lookup(toc, PARALLEL_KEY_EXTENSION_TRAMPOLINE);
        Assert(extensionstate != NULL);
        library_name = extensionstate;
        function_name = extensionstate + strlen(library_name) + 1;

        entrypt = (parallel_worker_main_type)
                load_external_function(library_name, function_name, true, NULL);
        entrypt(seg, toc);
}

/*
 * Give the user a hint that this is a message propagated from a parallel
 * worker.  Otherwise, it can sometimes be confusing to understand what
 * actually happened.
 */
static void
ParallelErrorContext(void *arg)
{
        errcontext("parallel worker, PID %d", *(int32 *) arg);
}

/*
 * Update shared memory with the ending location of the last WAL record we
 * wrote, if it's greater than the value already stored there.
 */
void
ParallelWorkerReportLastRecEnd(XLogRecPtr last_xlog_end)
{
        FixedParallelState *fps = MyFixedParallelState;

        Assert(fps != NULL);
        SpinLockAcquire(&fps->mutex);
        if (fps->last_xlog_end < last_xlog_end)
                fps->last_xlog_end = last_xlog_end;
        SpinLockRelease(&fps->mutex);
}