From: Yann Ylavic
Date: Wed, 28 Sep 2016 22:02:48 +0000 (+0000)
Subject: event: avoid unnecessary listener/polling wake ups (context switches) by using
X-Git-Tag: 2.5.0-alpha~1119
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=a25b3473bac59f04bca35d692c0408638f13c9bf;p=apache

event: avoid unnecessary listener/polling wake ups (context switches) by using
apr_pollset_wakeup(), when implemented, to signal the listener according to the
next timer's or timeout queue's expiry (updated at insert and maintenance time).
A minimal standalone sketch of this wake-up pattern follows the diff below.

git-svn-id: https://svn.apache.org/repos/asf/httpd/httpd/trunk@1762718 13f79535-47bb-0310-9956-ffa450edef68
---

diff --git a/server/mpm/event/event.c b/server/mpm/event/event.c
index a13630239a..66b1689678 100644
--- a/server/mpm/event/event.c
+++ b/server/mpm/event/event.c
@@ -182,6 +182,7 @@ static int dying = 0;
 static int workers_may_exit = 0;
 static int start_thread_may_exit = 0;
 static int listener_may_exit = 0;
+static int listener_is_wakeable = 0;        /* Pollset supports APR_POLLSET_WAKEABLE */
 static int num_listensocks = 0;
 static apr_int32_t conns_this_child;        /* MaxConnectionsPerChild, only access
                                                in listener thread */
@@ -204,6 +205,19 @@ module AP_MODULE_DECLARE_DATA mpm_event_module;
 struct event_srv_cfg_s;
 typedef struct event_srv_cfg_s event_srv_cfg;
 
+static apr_pollfd_t *listener_pollfd;
+
+/*
+ * The pollset for sockets that are in any of the timeout queues. Currently
+ * we use the timeout_mutex to make sure that connections are added/removed
+ * atomically to/from both event_pollset and a timeout queue. Otherwise
+ * some confusion can happen under high load if timeout queues and pollset
+ * get out of sync.
+ * XXX: It should be possible to make the lock unnecessary in many or even all
+ * XXX: cases.
+ */
+static apr_pollset_t *event_pollset;
+
 struct event_conn_state_t {
     /** APR_RING of expiration timeouts */
     APR_RING_ENTRY(event_conn_state_t) timeout_list;
@@ -250,8 +264,13 @@ static struct timeout_queue *write_completion_q,
                             *keepalive_q,
                             *linger_q,
                             *short_linger_q;
+static volatile apr_time_t queues_next_expiry;
 
-static apr_pollfd_t *listener_pollfd;
+/* Prevent extra poll/wakeup calls for timeouts close in the future (queues
+ * have the granularity of a second anyway).
+ * XXX: Wouldn't 0.5s (instead of 0.1s) be "enough"?
+ */
+#define TIMEOUT_FUDGE_FACTOR apr_time_from_msec(100)
 
 /*
  * Macros for accessing struct timeout_queue.
  */
@@ -259,9 +278,26 @@ static apr_pollfd_t *listener_pollfd;
 static void TO_QUEUE_APPEND(struct timeout_queue *q, event_conn_state_t *el)
 {
+    apr_time_t q_expiry;
+    apr_time_t next_expiry;
+
     APR_RING_INSERT_TAIL(&q->head, el, event_conn_state_t, timeout_list);
     apr_atomic_inc32(q->total);
     ++q->count;
+
+    /* Cheaply update the overall queues' next expiry according to the
+     * first entry of this queue (oldest), if necessary.
+     */
+    el = APR_RING_FIRST(&q->head);
+    q_expiry = el->queue_timestamp + q->timeout;
+    next_expiry = queues_next_expiry;
+    if (!next_expiry || next_expiry > q_expiry + TIMEOUT_FUDGE_FACTOR) {
+        queues_next_expiry = q_expiry;
+        /* Unblock the poll()ing listener for it to update its timeout. */
+        if (listener_is_wakeable) {
+            apr_pollset_wakeup(event_pollset);
+        }
+    }
 }
 
 static void TO_QUEUE_REMOVE(struct timeout_queue *q, event_conn_state_t *el)
@@ -287,17 +323,6 @@ static struct timeout_queue *TO_QUEUE_MAKE(apr_pool_t *p, apr_time_t t,
 #define TO_QUEUE_ELEM_INIT(el) \
     APR_RING_ELEM_INIT((el), timeout_list)
 
-/*
- * The pollset for sockets that are in any of the timeout queues. Currently
- * we use the timeout_mutex to make sure that connections are added/removed
- * atomically to/from both event_pollset and a timeout queue. Otherwise
- * some confusion can happen under high load if timeout queues and pollset
- * get out of sync.
- * XXX: It should be possible to make the lock unnecessary in many or even all
- * XXX: cases.
- */
-static apr_pollset_t *event_pollset;
-
 #if HAVE_SERF
 typedef struct {
     apr_pollset_t *pollset;
@@ -496,6 +521,11 @@ static void wakeup_listener(void)
         return;
     }
 
+    /* Unblock the listener if it's poll()ing */
+    if (listener_is_wakeable) {
+        apr_pollset_wakeup(event_pollset);
+    }
+
     /* unblock the listener if it's waiting for a worker */
     ap_queue_info_term(worker_queue_info);
 
@@ -678,7 +708,11 @@ static apr_status_t decrement_connection_count(void *cs_)
         default:
             break;
     }
-    apr_atomic_dec32(&connection_count);
+    /* Unblock the listener if it's waiting for connection_count = 0 */
+    if (!apr_atomic_dec32(&connection_count)
+        && listener_is_wakeable && listener_may_exit) {
+        apr_pollset_wakeup(event_pollset);
+    }
     return APR_SUCCESS;
 }
 
@@ -1464,6 +1498,13 @@ static void get_worker(int *have_idle_worker_p, int blocking, int *all_busy)
 
 static APR_RING_HEAD(timer_free_ring_t, timer_event_t) timer_free_ring;
 static apr_skiplist *timer_skiplist;
+static volatile apr_time_t timers_next_expiry;
+
+/* Same goal as for TIMEOUT_FUDGE_FACTOR (avoid extra poll calls), but applied
+ * to timers. Since their timeouts are custom (user-defined), we can't be too
+ * approximate here (hence the 0.01s).
+ */
+#define EVENT_FUDGE_FACTOR apr_time_from_msec(10)
 
 /* The following compare function is used by apr_skiplist_insert() to keep the
  * elements (timers) sorted and provide O(log n) complexity (this is also true
@@ -1517,8 +1558,22 @@ static timer_event_t * event_get_timer_event(apr_time_t t,
     te->remove = remove;
 
     if (insert) {
+        apr_time_t next_expiry;
+
         /* Okay, add sorted by when.. */
         apr_skiplist_insert(timer_skiplist, te);
+
+        /* Cheaply update the overall timers' next expiry according to
+         * this event, if necessary.
+         */
+        next_expiry = timers_next_expiry;
+        if (!next_expiry || next_expiry > te->when + EVENT_FUDGE_FACTOR) {
+            timers_next_expiry = te->when;
+            /* Unblock the poll()ing listener for it to update its timeout. */
+            if (listener_is_wakeable) {
+                apr_pollset_wakeup(event_pollset);
+            }
+        }
     }
     apr_thread_mutex_unlock(g_timer_skiplist_mtx);
 
@@ -1681,20 +1736,32 @@ static void process_timeout_queue(struct timeout_queue *q,
         count = 0;
         cs = first = last = APR_RING_FIRST(&qp->head);
         while (cs != APR_RING_SENTINEL(&qp->head, event_conn_state_t,
-                                       timeout_list)
-               /* Trash the entry if:
-                * - no timeout_time was given (asked for all), or
-                * - it expired (according to the queue timeout), or
-                * - the system clock skewed in the past: no entry should be
-                *   registered above the given timeout_time (~now) + the queue
-                *   timeout, we won't keep any here (eg. for centuries).
-                * Stop otherwise, no following entry will match thanks to the
-                * single timeout per queue (entries are added to the end!).
-                * This allows maintenance in O(1).
-                */
-               && (!timeout_time
-                   || cs->queue_timestamp + qp->timeout < timeout_time
-                   || cs->queue_timestamp > timeout_time + qp->timeout)) {
+                                       timeout_list)) {
+            /* Trash the entry if:
+             * - no timeout_time was given (asked for all), or
+             * - it expired (according to the queue timeout), or
+             * - the system clock skewed in the past: no entry should be
+             *   registered above the given timeout_time (~now) + the queue
+             *   timeout, we won't keep any here (eg. for centuries).
+             *
+             * Otherwise stop, no following entry will match thanks to the
+             * single timeout per queue (entries are added to the end!).
+             * This allows maintenance in O(1).
+             */
+            if (timeout_time
+                && cs->queue_timestamp + qp->timeout > timeout_time
+                && cs->queue_timestamp < timeout_time + qp->timeout) {
+                /* Since this is the next entry of this queue to expire,
+                 * update the overall queues' next expiry if it's later
+                 * than this one.
+                 */
+                apr_time_t q_expiry = cs->queue_timestamp + qp->timeout;
+                apr_time_t next_expiry = queues_next_expiry;
+                if (!next_expiry || next_expiry > q_expiry) {
+                    queues_next_expiry = q_expiry;
+                }
+                break;
+            }
+
             last = cs;
             rv = apr_pollset_remove(event_pollset, &cs->pfd);
             if (rv != APR_SUCCESS && !APR_STATUS_IS_NOTFOUND(rv)) {
@@ -1736,21 +1803,13 @@ static void * APR_THREAD_FUNC listener_thread(apr_thread_t * thd, void *dummy)
     int process_slot = ti->pslot;
     struct process_score *ps = ap_get_scoreboard_process(process_slot);
     apr_pool_t *tpool = apr_thread_pool_get(thd);
-    apr_time_t timeout_time = 0, last_log;
     int closed = 0, listeners_disabled = 0;
     int have_idle_worker = 0;
+    apr_time_t last_log;
 
     last_log = apr_time_now();
     free(ti);
 
-    /* the following times out events that are really close in the future
-     *  to prevent extra poll calls
-     *
-     * current value is .1 second
-     */
-#define TIMEOUT_FUDGE_FACTOR 100000
-#define EVENT_FUDGE_FACTOR 10000
-
     rc = init_pollset(tpool);
     if (rc != APR_SUCCESS) {
         ap_log_error(APLOG_MARK, APLOG_ERR, rc, ap_server_conf,
@@ -1773,7 +1832,7 @@ static void * APR_THREAD_FUNC listener_thread(apr_thread_t * thd, void *dummy)
         apr_int32_t num = 0;
         apr_uint32_t c_count, l_count, i_count;
         apr_interval_time_t timeout_interval;
-        apr_time_t now;
+        apr_time_t now, timeout_time;
         int workers_were_busy = 0;
         int keepalives;
 
@@ -1790,7 +1849,7 @@ static void * APR_THREAD_FUNC listener_thread(apr_thread_t * thd, void *dummy)
         if (APLOGtrace6(ap_server_conf)) {
             now = apr_time_now();
             /* trace log status every second */
-            if (now - last_log > apr_time_from_msec(1000)) {
+            if (now - last_log > apr_time_from_sec(1)) {
                 last_log = now;
                 apr_thread_mutex_lock(timeout_mutex);
                 ap_log_error(APLOG_MARK, APLOG_TRACE6, 0, ap_server_conf,
@@ -1820,47 +1879,73 @@ static void * APR_THREAD_FUNC listener_thread(apr_thread_t * thd, void *dummy)
 #endif
 
         now = apr_time_now();
-        apr_thread_mutex_lock(g_timer_skiplist_mtx);
-        te = apr_skiplist_peek(timer_skiplist);
-        if (te) {
-            if (te->when > now) {
-                timeout_interval = te->when - now;
-            }
-            else {
-                timeout_interval = 1;
-            }
-        }
-        else {
-            timeout_interval = apr_time_from_msec(100);
-        }
-        while (te) {
-            if (te->when < now + EVENT_FUDGE_FACTOR) {
+
+        /* Start with an infinite poll() timeout and update it according to
+         * the next expiring timer or queue entry. If there are none, either
+         * the listener is wakeable and can poll() indefinitely until a wake
+         * up occurs, otherwise periodic checks (maintenance, shutdown, ...)
+         * must be performed.
+ */ + timeout_interval = -1; + + /* Push expired timers to a worker, the first remaining one determines + * the maximum time to poll() below, if any. + */ + timeout_time = timers_next_expiry; + if (timeout_time && timeout_time < now + EVENT_FUDGE_FACTOR) { + apr_thread_mutex_lock(g_timer_skiplist_mtx); + while ((te = apr_skiplist_peek(timer_skiplist))) { + if (te->when > now + EVENT_FUDGE_FACTOR) { + timers_next_expiry = te->when; + timeout_interval = te->when - now; + break; + } apr_skiplist_pop(timer_skiplist, NULL); if (!te->canceled) { if (te->remove) { int i; for (i = 0; i < te->remove->nelts; i++) { - apr_pollfd_t *pfd = (apr_pollfd_t *)te->remove->elts + i; + apr_pollfd_t *pfd; + pfd = (apr_pollfd_t *)te->remove->elts + i; apr_pollset_remove(event_pollset, pfd); } } push_timer2worker(te); } else { - APR_RING_INSERT_TAIL(&timer_free_ring, te, timer_event_t, - link); + APR_RING_INSERT_TAIL(&timer_free_ring, te, + timer_event_t, link); } } - else { - break; + if (!te) { + timers_next_expiry = 0; } - te = apr_skiplist_peek(timer_skiplist); + apr_thread_mutex_unlock(g_timer_skiplist_mtx); + } + + /* Same for queues, use their next expiry, if any. */ + timeout_time = queues_next_expiry; + if (timeout_time + && (timeout_interval < 0 + || timeout_time <= now + || timeout_interval > timeout_time - now)) { + timeout_interval = timeout_time > now ? timeout_time - now : 1; + } + + /* When non-wakeable, don't wait more than 100 ms, in any case. */ +#define NON_WAKEABLE_POLL_TIMEOUT apr_time_from_msec(100) + if (!listener_is_wakeable + && (timeout_interval < 0 + || timeout_interval > NON_WAKEABLE_POLL_TIMEOUT)) { + timeout_interval = NON_WAKEABLE_POLL_TIMEOUT; } - apr_thread_mutex_unlock(g_timer_skiplist_mtx); rc = apr_pollset_poll(event_pollset, timeout_interval, &num, &out_pfd); if (rc != APR_SUCCESS) { if (APR_STATUS_IS_EINTR(rc)) { + /* Woken up, either update timeouts or shutdown, + * both logics are above. + */ continue; } if (!APR_STATUS_IS_TIMEUP(rc)) { @@ -1870,6 +1955,7 @@ static void * APR_THREAD_FUNC listener_thread(apr_thread_t * thd, void *dummy) "shutdown process gracefully"); signal_threads(ST_GRACEFUL); } + num = 0; } if (listener_may_exit) { @@ -2082,37 +2168,25 @@ static void * APR_THREAD_FUNC listener_thread(apr_thread_t * thd, void *dummy) /* XXX possible optimization: stash the current time for use as * r->request_time for new requests */ - now = apr_time_now(); - /* We only do this once per 0.1s (TIMEOUT_FUDGE_FACTOR), or on a clock - * skew (if the system time is set back in the meantime, timeout_time - * will exceed now + TIMEOUT_FUDGE_FACTOR, can't happen otherwise). + /* We process the timeout queues here only when their overall next + * expiry (read once above) is over. This happens accurately since + * adding to the queues (in workers) can only decrease this expiry, + * while latest ones are only taken into account here (in listener) + * during queues' processing, with the lock held. This works both + * with and without wake-ability. */ - if (now > timeout_time || now + TIMEOUT_FUDGE_FACTOR < timeout_time ) { + if (timeout_time && timeout_time < (now = apr_time_now())) { timeout_time = now + TIMEOUT_FUDGE_FACTOR; /* handle timed out sockets */ apr_thread_mutex_lock(timeout_mutex); + /* Processing all the queues below will recompute this. */ + queues_next_expiry = 0; + /* Step 1: keepalive timeouts */ - /* If all workers are busy, we kill older keep-alive connections so that they - * may connect to another process. 
-                 */
-                if ((workers_were_busy || dying)
-                    && (keepalives = apr_atomic_read32(keepalive_q->total))) {
-                    /* If all workers are busy, we kill older keep-alive connections so
-                     * that they may connect to another process.
-                     */
-                    ap_log_error(APLOG_MARK, APLOG_TRACE1, 0, ap_server_conf,
-                                 "All workers are %s, will close %d keep-alive "
-                                 "connections", dying ? "dying" : "busy",
-                                 keepalives);
-                    process_timeout_queue(keepalive_q, 0,
-                                          start_lingering_close_nonblocking);
-                }
-                else {
-                    process_timeout_queue(keepalive_q, timeout_time,
-                                          start_lingering_close_nonblocking);
-                }
+                process_timeout_queue(keepalive_q, timeout_time,
+                                      start_lingering_close_nonblocking);
 
                 /* Step 2: write completion timeouts */
                 process_timeout_queue(write_completion_q, timeout_time,
                                       start_lingering_close_nonblocking);
@@ -2131,6 +2205,22 @@ static void * APR_THREAD_FUNC listener_thread(apr_thread_t * thd, void *dummy)
                 ps->suspended = apr_atomic_read32(&suspended_count);
                 ps->lingering_close = apr_atomic_read32(&lingering_count);
             }
+            else if ((workers_were_busy || dying)
+                     && (keepalives = apr_atomic_read32(keepalive_q->total))) {
+                /* If all workers are busy, we kill older keep-alive connections so
+                 * that they may connect to another process.
+                 */
+                ap_log_error(APLOG_MARK, APLOG_TRACE1, 0, ap_server_conf,
+                             "All workers are %s, will close %d keep-alive "
+                             "connections", dying ? "dying" : "busy",
+                             keepalives);
+                apr_thread_mutex_lock(timeout_mutex);
+                process_timeout_queue(keepalive_q, 0,
+                                      start_lingering_close_nonblocking);
+                apr_thread_mutex_unlock(timeout_mutex);
+                ps->keep_alive = apr_atomic_read32(keepalive_q->total);
+            }
+
             if (listeners_disabled && !workers_were_busy
                 && ((c_count = apr_atomic_read32(&connection_count))
                         >= (l_count = apr_atomic_read32(&lingering_count))
@@ -2343,6 +2433,8 @@ static void *APR_THREAD_FUNC start_threads(apr_thread_t * thd, void *dummy)
     int prev_threads_created;
     int max_recycled_pools = -1;
    int good_methods[] = {APR_POLLSET_KQUEUE, APR_POLLSET_PORT, APR_POLLSET_EPOLL};
+    /* XXX don't we need more to handle K-A or lingering close? */
+    const apr_uint32_t pollset_size = threads_per_child * 2;
 
     /* We must create the fd queues before we start up the listener
      * and worker threads. */
@@ -2382,24 +2474,24 @@ static void *APR_THREAD_FUNC start_threads(apr_thread_t * thd, void *dummy)
 
     /* Create the main pollset */
     for (i = 0; i < sizeof(good_methods) / sizeof(good_methods[0]); i++) {
-        rv = apr_pollset_create_ex(&event_pollset,
-                                   threads_per_child*2, /* XXX don't we need more, to handle
-                                                         * connections in K-A or lingering
-                                                         * close?
-                                                         */
-                                   pchild, APR_POLLSET_THREADSAFE | APR_POLLSET_NOCOPY | APR_POLLSET_NODEFAULT,
-                                   good_methods[i]);
+        apr_uint32_t flags = APR_POLLSET_THREADSAFE | APR_POLLSET_NOCOPY |
+                             APR_POLLSET_NODEFAULT | APR_POLLSET_WAKEABLE;
+
+        rv = apr_pollset_create_ex(&event_pollset, pollset_size, pchild, flags,
+                                   good_methods[i]);
+        if (rv == APR_SUCCESS) {
+            listener_is_wakeable = 1;
+            break;
+        }
+
+        flags &= ~APR_POLLSET_WAKEABLE;
+        rv = apr_pollset_create_ex(&event_pollset, pollset_size, pchild, flags,
+                                   good_methods[i]);
         if (rv == APR_SUCCESS) {
             break;
         }
     }
     if (rv != APR_SUCCESS) {
-        rv = apr_pollset_create(&event_pollset,
-                                threads_per_child*2, /* XXX don't we need more, to handle
-                                                      * connections in K-A or lingering
-                                                      * close?
-                                                      */
-                                pchild, APR_POLLSET_THREADSAFE | APR_POLLSET_NOCOPY);
+        rv = apr_pollset_create(&event_pollset, pollset_size, pchild,
+                                APR_POLLSET_THREADSAFE | APR_POLLSET_NOCOPY);
     }
     if (rv != APR_SUCCESS) {
         ap_log_error(APLOG_MARK, APLOG_ERR, rv, ap_server_conf, APLOGNO(03103)
@@ -2408,7 +2500,9 @@ static void *APR_THREAD_FUNC start_threads(apr_thread_t * thd, void *dummy)
     }
 
     ap_log_error(APLOG_MARK, APLOG_DEBUG, 0, ap_server_conf, APLOGNO(02471)
-                 "start_threads: Using %s", apr_pollset_method_name(event_pollset));
+                 "start_threads: Using %s (%swakeable)",
+                 apr_pollset_method_name(event_pollset),
+                 listener_is_wakeable ? "" : "not ");
 
     worker_sockets = apr_pcalloc(pchild, threads_per_child * sizeof(apr_socket_t *));
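
The wake-up mechanism this commit builds on can be exercised in isolation.
Below is a minimal standalone sketch (an illustration, not code from event.c
or from the patch) of the same pattern: create a pollset with
APR_POLLSET_WAKEABLE, fall back to a plain pollset when that flag is
unsupported (mirroring the start_threads() fallback above), and interrupt a
blocking apr_pollset_poll() from a second thread with apr_pollset_wakeup().
The create_pollset()/waker() helpers, the pollset size of 16, the one-second
sleep, and the build line are illustrative assumptions.

/* wakeup_sketch.c -- illustrative only, not part of the patch.
 * Build, roughly: gcc wakeup_sketch.c $(apr-1-config --includes --link-ld)
 */
#include <stdio.h>
#include <apr_general.h>
#include <apr_pools.h>
#include <apr_poll.h>
#include <apr_thread_proc.h>
#include <apr_time.h>

static apr_pollset_t *pollset;
static int wakeable = 0;

/* Hypothetical helper mirroring the commit's logic: prefer a wakeable
 * pollset, fall back to a plain one if APR_POLLSET_WAKEABLE fails. */
static apr_status_t create_pollset(apr_pool_t *p)
{
    apr_status_t rv = apr_pollset_create(&pollset, 16, p,
                                         APR_POLLSET_THREADSAFE |
                                         APR_POLLSET_WAKEABLE);
    if (rv == APR_SUCCESS) {
        wakeable = 1;
        return rv;
    }
    return apr_pollset_create(&pollset, 16, p, APR_POLLSET_THREADSAFE);
}

/* Stand-in for a worker that just shortened the next expiry: it wakes
 * the poll()ing thread so the latter can recompute its timeout. */
static void * APR_THREAD_FUNC waker(apr_thread_t *thd, void *data)
{
    apr_sleep(apr_time_from_sec(1));
    apr_pollset_wakeup(pollset);    /* poll() below returns APR_EINTR */
    return NULL;
}

int main(void)
{
    apr_pool_t *p;
    apr_thread_t *t;
    apr_status_t rv, trv;
    apr_int32_t num;
    const apr_pollfd_t *out;
    apr_interval_time_t timeout;

    apr_initialize();
    apr_pool_create(&p, NULL);
    create_pollset(p);
    apr_thread_create(&t, NULL, waker, NULL, p);

    /* Wakeable: block indefinitely, a wakeup (or real I/O) ends the wait.
     * Non-wakeable: cap the wait, like NON_WAKEABLE_POLL_TIMEOUT above. */
    timeout = wakeable ? -1 : apr_time_from_msec(100);
    rv = apr_pollset_poll(pollset, timeout, &num, &out);

    if (APR_STATUS_IS_EINTR(rv)) {
        puts("woken up via apr_pollset_wakeup()");
    }
    else if (APR_STATUS_IS_TIMEUP(rv)) {
        puts("poll() timed out (non-wakeable fallback)");
    }

    apr_thread_join(&trv, t);
    apr_terminate();
    return 0;
}

The point of the pattern shows up in the timeout choice: with a wakeable
pollset the poller can sleep with an infinite timeout and is only woken when
an earlier expiry actually arrives, whereas without it the 100 ms cap forces
the periodic wake-ups (and context switches) the commit message refers to.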