From 3887e9455f812035473eee1cba0cf9c237969998 Mon Sep 17 00:00:00 2001 From: Tom Lane Date: Mon, 7 Oct 2019 12:39:09 -0400 Subject: [PATCH] Check for too many postmaster children before spawning a bgworker. The postmaster's code path for spawning a bgworker neglected to check whether we already have the max number of live child processes. That's a bit hard to hit, since it would necessarily be a transient condition; but if we do, AssignPostmasterChildSlot() fails causing a postmaster crash, as seen in a report from Bhargav Kamineni. To fix, invoke canAcceptConnections() in the bgworker code path, as we do in the other code paths that spawn children. Since we don't want the same pmState tests in this case, add a child-process-type parameter to canAcceptConnections() so that it can know what to do. Back-patch to 9.5. In principle the same hazard exists in 9.4, but the code is enough different that this patch wouldn't quite fix it there. Given the tiny usage of bgworkers in that branch it doesn't seem worth creating a variant patch for it. Discussion: https://postgr.es/m/18733.1570382257@sss.pgh.pa.us --- src/backend/postmaster/postmaster.c | 46 +++++++++++++++++++++-------- 1 file changed, 33 insertions(+), 13 deletions(-) diff --git a/src/backend/postmaster/postmaster.c b/src/backend/postmaster/postmaster.c index eb9e0221f8..85f15a5666 100644 --- a/src/backend/postmaster/postmaster.c +++ b/src/backend/postmaster/postmaster.c @@ -409,7 +409,7 @@ static void SendNegotiateProtocolVersion(List *unrecognized_protocol_options); static void processCancelRequest(Port *port, void *pkt); static int initMasks(fd_set *rmask); static void report_fork_failure_to_client(Port *port, int errnum); -static CAC_state canAcceptConnections(void); +static CAC_state canAcceptConnections(int backend_type); static bool RandomCancelKey(int32 *cancel_key); static void signal_child(pid_t pid, int signal); static bool SignalSomeChildren(int signal, int targets); @@ -2398,16 +2398,21 @@ processCancelRequest(Port *port, void *pkt) } /* - * canAcceptConnections --- check to see if database state allows connections. + * canAcceptConnections --- check to see if database state allows connections + * of the specified type. backend_type can be BACKEND_TYPE_NORMAL, + * BACKEND_TYPE_AUTOVAC, or BACKEND_TYPE_BGWORKER. (Note that we don't yet + * know whether a NORMAL connection might turn into a walsender.) */ static CAC_state -canAcceptConnections(void) +canAcceptConnections(int backend_type) { CAC_state result = CAC_OK; /* * Can't start backends when in startup/shutdown/inconsistent recovery - * state. + * state. We treat autovac workers the same as user backends for this + * purpose. However, bgworkers are excluded from this test; we expect + * bgworker_should_start_now() decided whether the DB state allows them. * * In state PM_WAIT_BACKUP only superusers can connect (this must be * allowed so that a superuser can end online backup mode); we return @@ -2415,7 +2420,8 @@ canAcceptConnections(void) * that neither CAC_OK nor CAC_WAITBACKUP can safely be returned until we * have checked for too many children. */ - if (pmState != PM_RUN) + if (pmState != PM_RUN && + backend_type != BACKEND_TYPE_BGWORKER) { if (pmState == PM_WAIT_BACKUP) result = CAC_WAITBACKUP; /* allow superusers only */ @@ -2435,9 +2441,9 @@ canAcceptConnections(void) /* * Don't start too many children. * - * We allow more connections than we can have backends here because some + * We allow more connections here than we can have backends because some * might still be authenticating; they might fail auth, or some existing - * backend might exit before the auth cycle is completed. The exact + * backend might exit before the auth cycle is completed. The exact * MaxBackends limit is enforced when a new backend tries to join the * shared-inval backend array. * @@ -4114,7 +4120,7 @@ BackendStartup(Port *port) bn->cancel_key = MyCancelKey; /* Pass down canAcceptConnections state */ - port->canAcceptConnections = canAcceptConnections(); + port->canAcceptConnections = canAcceptConnections(BACKEND_TYPE_NORMAL); bn->dead_end = (port->canAcceptConnections != CAC_OK && port->canAcceptConnections != CAC_WAITBACKUP); @@ -5486,7 +5492,7 @@ StartAutovacuumWorker(void) * we have to check to avoid race-condition problems during DB state * changes. */ - if (canAcceptConnections() == CAC_OK) + if (canAcceptConnections(BACKEND_TYPE_AUTOVAC) == CAC_OK) { /* * Compute the cancel key that will be assigned to this session. We @@ -5731,12 +5737,13 @@ do_start_bgworker(RegisteredBgWorker *rw) /* * Allocate and assign the Backend element. Note we must do this before - * forking, so that we can handle out of memory properly. + * forking, so that we can handle failures (out of memory or child-process + * slots) cleanly. * * Treat failure as though the worker had crashed. That way, the - * postmaster will wait a bit before attempting to start it again; if it - * tried again right away, most likely it'd find itself repeating the - * out-of-memory or fork failure condition. + * postmaster will wait a bit before attempting to start it again; if we + * tried again right away, most likely we'd find ourselves hitting the + * same resource-exhaustion condition. */ if (!assign_backendlist_entry(rw)) { @@ -5862,6 +5869,19 @@ assign_backendlist_entry(RegisteredBgWorker *rw) { Backend *bn; + /* + * Check that database state allows another connection. Currently the + * only possible failure is CAC_TOOMANY, so we just log an error message + * based on that rather than checking the error code precisely. + */ + if (canAcceptConnections(BACKEND_TYPE_BGWORKER) != CAC_OK) + { + ereport(LOG, + (errcode(ERRCODE_CONFIGURATION_LIMIT_EXCEEDED), + errmsg("no slot available for new worker process"))); + return false; + } + /* * Compute the cancel key that will be assigned to this session. We * probably don't need cancel keys for background workers, but we'd better -- 2.40.0