event: avoid unnecessary listener/polling wake ups (context switches) by using

author Yann Ylavic <ylavic@apache.org>

Wed, 28 Sep 2016 22:02:48 +0000 (22:02 +0000)

committer Yann Ylavic <ylavic@apache.org>

Wed, 28 Sep 2016 22:02:48 +0000 (22:02 +0000)
author Yann Ylavic <ylavic@apache.org>
Wed, 28 Sep 2016 22:02:48 +0000 (22:02 +0000)
committer Yann Ylavic <ylavic@apache.org>
Wed, 28 Sep 2016 22:02:48 +0000 (22:02 +0000)
diff --git a/server/mpm/event/event.c b/server/mpm/event/event.c

index a13630239a442bedfda218d2201fbdaab22d1fb3..66b1689678a33670182e562d9b892cfa864bab11 100644 (file)
--- a/server/mpm/event/event.c
+++ b/server/mpm/event/event.c
@@ -182,6 +182,7 @@ static int dying = 0;
  static int workers_may_exit = 0;
  static int start_thread_may_exit = 0;
  static int listener_may_exit = 0;
+static int listener_is_wakeable = 0;        /* Pollset supports APR_POLLSET_WAKEABLE */
  static int num_listensocks = 0;
  static apr_int32_t conns_this_child;        /* MaxConnectionsPerChild, only access
                                                 in listener thread */
@@ -204,6 +205,19 @@ module AP_MODULE_DECLARE_DATA mpm_event_module;
  struct event_srv_cfg_s;
  typedef struct event_srv_cfg_s event_srv_cfg;
  
+static apr_pollfd_t *listener_pollfd;
+
+/*
+ * The pollset for sockets that are in any of the timeout queues. Currently
+ * we use the timeout_mutex to make sure that connections are added/removed
+ * atomically to/from both event_pollset and a timeout queue. Otherwise
+ * some confusion can happen under high load if timeout queues and pollset
+ * get out of sync.
+ * XXX: It should be possible to make the lock unnecessary in many or even all
+ * XXX: cases.
+ */
+static apr_pollset_t *event_pollset;
+
  struct event_conn_state_t {
      /** APR_RING of expiration timeouts */
      APR_RING_ENTRY(event_conn_state_t) timeout_list;
@@ -250,8 +264,13 @@ static struct timeout_queue *write_completion_q,
                              *keepalive_q,
                              *linger_q,
                              *short_linger_q;
+static volatile apr_time_t  queues_next_expiry;
  
-static apr_pollfd_t *listener_pollfd;
+/* Prevent extra poll/wakeup calls for timeouts close in the future (queues
+ * have the granularity of a second anyway).
+ * XXX: Wouldn't 0.5s (instead of 0.1s) be "enough"?
+ */
+#define TIMEOUT_FUDGE_FACTOR apr_time_from_msec(100)
  
  /*
   * Macros for accessing struct timeout_queue.
@@ -259,9 +278,26 @@ static apr_pollfd_t *listener_pollfd;
   */
  static void TO_QUEUE_APPEND(struct timeout_queue *q, event_conn_state_t *el)
  {
+    apr_time_t q_expiry;
+    apr_time_t next_expiry;
+
      APR_RING_INSERT_TAIL(&q->head, el, event_conn_state_t, timeout_list);
      apr_atomic_inc32(q->total);
      ++q->count;
+
+    /* Cheaply update the overall queues' next expiry according to the
+     * first entry of this queue (oldest), if necessary.
+     */
+    el = APR_RING_FIRST(&q->head);
+    q_expiry = el->queue_timestamp + q->timeout;
+    next_expiry = queues_next_expiry;
+    if (!next_expiry || next_expiry > q_expiry + TIMEOUT_FUDGE_FACTOR) {
+        queues_next_expiry = q_expiry;
+        /* Unblock the poll()ing listener for it to update its timeout. */
+        if (listener_is_wakeable) {
+            apr_pollset_wakeup(event_pollset);
+        }
+    }
  }
  
  static void TO_QUEUE_REMOVE(struct timeout_queue *q, event_conn_state_t *el)
@@ -287,17 +323,6 @@ static struct timeout_queue *TO_QUEUE_MAKE(apr_pool_t *p, apr_time_t t,
  #define TO_QUEUE_ELEM_INIT(el) \
      APR_RING_ELEM_INIT((el), timeout_list)
  
-/*
- * The pollset for sockets that are in any of the timeout queues. Currently
- * we use the timeout_mutex to make sure that connections are added/removed
- * atomically to/from both event_pollset and a timeout queue. Otherwise
- * some confusion can happen under high load if timeout queues and pollset
- * get out of sync.
- * XXX: It should be possible to make the lock unnecessary in many or even all
- * XXX: cases.
- */
-static apr_pollset_t *event_pollset;
-
  #if HAVE_SERF
  typedef struct {
      apr_pollset_t *pollset;
@@ -496,6 +521,11 @@ static void wakeup_listener(void)
          return;
      }
  
+    /* Unblock the listener if it's poll()ing */
+    if (listener_is_wakeable) {
+        apr_pollset_wakeup(event_pollset);
+    }
+
      /* unblock the listener if it's waiting for a worker */
      ap_queue_info_term(worker_queue_info);
  
@@ -678,7 +708,11 @@ static apr_status_t decrement_connection_count(void *cs_)
          default:
              break;
      }
-    apr_atomic_dec32(&connection_count);
+    /* Unblock the listener if it's waiting for connection_count = 0 */
+    if (!apr_atomic_dec32(&connection_count)
+             && listener_is_wakeable && listener_may_exit) {
+        apr_pollset_wakeup(event_pollset);
+    }
      return APR_SUCCESS;
  }
  
@@ -1464,6 +1498,13 @@ static void get_worker(int *have_idle_worker_p, int blocking, int *all_busy)
  static APR_RING_HEAD(timer_free_ring_t, timer_event_t) timer_free_ring;
  
  static apr_skiplist *timer_skiplist;
+static volatile apr_time_t timers_next_expiry;
+
+/* Same goal as for TIMEOUT_FUDGE_FACTOR (avoid extra poll calls), but applied
+ * to timers. Since their timeouts are custom (user defined), we can't be too
+ * approximative here (hence using 0.01s).
+ */
+#define EVENT_FUDGE_FACTOR apr_time_from_msec(10)
  
  /* The following compare function is used by apr_skiplist_insert() to keep the
   * elements (timers) sorted and provide O(log n) complexity (this is also true
@@ -1517,8 +1558,22 @@ static timer_event_t * event_get_timer_event(apr_time_t t,
      te->remove = remove;
  
      if (insert) { 
+        apr_time_t next_expiry;
+
          /* Okay, add sorted by when.. */
          apr_skiplist_insert(timer_skiplist, te);
+
+        /* Cheaply update the overall timers' next expiry according to
+         * this event, if necessary.
+         */
+        next_expiry = timers_next_expiry;
+        if (!next_expiry || next_expiry > te->when + EVENT_FUDGE_FACTOR) {
+            timers_next_expiry = te->when;
+            /* Unblock the poll()ing listener for it to update its timeout. */
+            if (listener_is_wakeable) {
+                apr_pollset_wakeup(event_pollset);
+            }
+        }
      }
      apr_thread_mutex_unlock(g_timer_skiplist_mtx);
  
@@ -1681,20 +1736,32 @@ static void process_timeout_queue(struct timeout_queue *q,
          count = 0;
          cs = first = last = APR_RING_FIRST(&qp->head);
          while (cs != APR_RING_SENTINEL(&qp->head, event_conn_state_t,
-                                       timeout_list)
-               /* Trash the entry if:
-                * - no timeout_time was given (asked for all), or
-                * - it expired (according to the queue timeout), or
-                * - the system clock skewed in the past: no entry should be
-                *   registered above the given timeout_time (~now) + the queue
-                *   timeout, we won't keep any here (eg. for centuries).
-                * Stop otherwise, no following entry will match thanks to the
-                * single timeout per queue (entries are added to the end!).
-                * This allows maintenance in O(1).
-                */
-               && (!timeout_time
-                   || cs->queue_timestamp + qp->timeout < timeout_time
-                   || cs->queue_timestamp > timeout_time + qp->timeout)) {
+                                       timeout_list)) {
+            /* Trash the entry if:
+             * - no timeout_time was given (asked for all), or
+             * - it expired (according to the queue timeout), or
+             * - the system clock skewed in the past: no entry should be
+             *   registered above the given timeout_time (~now) + the queue
+             *   timeout, we won't keep any here (eg. for centuries).
+             *
+             * Otherwise stop, no following entry will match thanks to the
+             * single timeout per queue (entries are added to the end!).
+             * This allows maintenance in O(1).
+             */
+            if (timeout_time
+                    && cs->queue_timestamp + qp->timeout > timeout_time
+                    && cs->queue_timestamp < timeout_time + qp->timeout) {
+                /* Since this is the next expiring of this queue, update the
+                 * overall queues' next expiry if it's later than this one.
+                 */
+                apr_time_t q_expiry = cs->queue_timestamp + qp->timeout;
+                apr_time_t next_expiry = queues_next_expiry;
+                if (!next_expiry || next_expiry > q_expiry) {
+                    queues_next_expiry = q_expiry;
+                }
+                break;
+            }
+
              last = cs;
              rv = apr_pollset_remove(event_pollset, &cs->pfd);
              if (rv != APR_SUCCESS && !APR_STATUS_IS_NOTFOUND(rv)) {
@@ -1736,21 +1803,13 @@ static void * APR_THREAD_FUNC listener_thread(apr_thread_t * thd, void *dummy)
      int process_slot = ti->pslot;
      struct process_score *ps = ap_get_scoreboard_process(process_slot);
      apr_pool_t *tpool = apr_thread_pool_get(thd);
-    apr_time_t timeout_time = 0, last_log;
      int closed = 0, listeners_disabled = 0;
      int have_idle_worker = 0;
+    apr_time_t last_log;
  
      last_log = apr_time_now();
      free(ti);
  
-    /* the following times out events that are really close in the future
-     *   to prevent extra poll calls
-     *
-     * current value is .1 second
-     */
-#define TIMEOUT_FUDGE_FACTOR 100000
-#define EVENT_FUDGE_FACTOR 10000
-
      rc = init_pollset(tpool);
      if (rc != APR_SUCCESS) {
          ap_log_error(APLOG_MARK, APLOG_ERR, rc, ap_server_conf,
@@ -1773,7 +1832,7 @@ static void * APR_THREAD_FUNC listener_thread(apr_thread_t * thd, void *dummy)
          apr_int32_t num = 0;
          apr_uint32_t c_count, l_count, i_count;
          apr_interval_time_t timeout_interval;
-        apr_time_t now;
+        apr_time_t now, timeout_time;
          int workers_were_busy = 0;
          int keepalives;
  
@@ -1790,7 +1849,7 @@ static void * APR_THREAD_FUNC listener_thread(apr_thread_t * thd, void *dummy)
          if (APLOGtrace6(ap_server_conf)) {
              now = apr_time_now();
              /* trace log status every second */
-            if (now - last_log > apr_time_from_msec(1000)) {
+            if (now - last_log > apr_time_from_sec(1)) {
                  last_log = now;
                  apr_thread_mutex_lock(timeout_mutex);
                  ap_log_error(APLOG_MARK, APLOG_TRACE6, 0, ap_server_conf,
@@ -1820,47 +1879,73 @@ static void * APR_THREAD_FUNC listener_thread(apr_thread_t * thd, void *dummy)
  #endif
  
          now = apr_time_now();
-        apr_thread_mutex_lock(g_timer_skiplist_mtx);
-        te = apr_skiplist_peek(timer_skiplist);
-        if (te) {
-            if (te->when > now) {
-                timeout_interval = te->when - now;
-            }
-            else {
-                timeout_interval = 1;
-            }
-        }
-        else {
-            timeout_interval = apr_time_from_msec(100);
-        }
-        while (te) {
-            if (te->when < now + EVENT_FUDGE_FACTOR) {
+
+        /* Start with an infinite poll() timeout and update it according to
+         * the next expiring timer or queue entry. If there are none, either
+         * the listener is wakeable and it can poll() indefinitely until a wake
+         * up occurs, otherwise periodic checks (maintenance, shutdown, ...)
+         * must be performed.
+         */
+        timeout_interval = -1;
+
+        /* Push expired timers to a worker, the first remaining one determines
+         * the maximum time to poll() below, if any.
+         */
+        timeout_time = timers_next_expiry;
+        if (timeout_time && timeout_time < now + EVENT_FUDGE_FACTOR) {
+            apr_thread_mutex_lock(g_timer_skiplist_mtx);
+            while ((te = apr_skiplist_peek(timer_skiplist))) {
+                if (te->when > now + EVENT_FUDGE_FACTOR) {
+                    timers_next_expiry = te->when;
+                    timeout_interval = te->when - now;
+                    break;
+                }
                  apr_skiplist_pop(timer_skiplist, NULL);
                  if (!te->canceled) { 
                      if (te->remove) {
                          int i;
                          for (i = 0; i < te->remove->nelts; i++) {
-                            apr_pollfd_t *pfd = (apr_pollfd_t *)te->remove->elts + i;
+                            apr_pollfd_t *pfd;
+                            pfd = (apr_pollfd_t *)te->remove->elts + i;
                              apr_pollset_remove(event_pollset, pfd);
                          }
                      }
                      push_timer2worker(te);
                  }
                  else {
-                    APR_RING_INSERT_TAIL(&timer_free_ring, te, timer_event_t,
-                                         link);
+                    APR_RING_INSERT_TAIL(&timer_free_ring, te,
+                                         timer_event_t, link);
                  }
              }
-            else {
-                break;
+            if (!te) {
+                timers_next_expiry = 0;
              }
-            te = apr_skiplist_peek(timer_skiplist);
+            apr_thread_mutex_unlock(g_timer_skiplist_mtx);
+        }
+
+        /* Same for queues, use their next expiry, if any. */
+        timeout_time = queues_next_expiry;
+        if (timeout_time
+                && (timeout_interval < 0
+                    || timeout_time <= now
+                    || timeout_interval > timeout_time - now)) {
+            timeout_interval = timeout_time > now ? timeout_time - now : 1;
+        }
+
+        /* When non-wakeable, don't wait more than 100 ms, in any case. */
+#define NON_WAKEABLE_POLL_TIMEOUT apr_time_from_msec(100)
+        if (!listener_is_wakeable
+                && (timeout_interval < 0
+                    || timeout_interval > NON_WAKEABLE_POLL_TIMEOUT)) {
+            timeout_interval = NON_WAKEABLE_POLL_TIMEOUT;
          }
-        apr_thread_mutex_unlock(g_timer_skiplist_mtx);
  
          rc = apr_pollset_poll(event_pollset, timeout_interval, &num, &out_pfd);
          if (rc != APR_SUCCESS) {
              if (APR_STATUS_IS_EINTR(rc)) {
+                /* Woken up, either update timeouts or shutdown,
+                 * both logics are above.
+                 */
                  continue;
              }
              if (!APR_STATUS_IS_TIMEUP(rc)) {
@@ -1870,6 +1955,7 @@ static void * APR_THREAD_FUNC listener_thread(apr_thread_t * thd, void *dummy)
                               "shutdown process gracefully");
                  signal_threads(ST_GRACEFUL);
              }
+            num = 0;
          }
  
          if (listener_may_exit) {
@@ -2082,37 +2168,25 @@ static void * APR_THREAD_FUNC listener_thread(apr_thread_t * thd, void *dummy)
          /* XXX possible optimization: stash the current time for use as
           * r->request_time for new requests
           */
-        now = apr_time_now();
-        /* We only do this once per 0.1s (TIMEOUT_FUDGE_FACTOR), or on a clock
-         * skew (if the system time is set back in the meantime, timeout_time
-         * will exceed now + TIMEOUT_FUDGE_FACTOR, can't happen otherwise).
+        /* We process the timeout queues here only when their overall next
+         * expiry (read once above) is over. This happens accurately since
+         * adding to the queues (in workers) can only decrease this expiry,
+         * while latest ones are only taken into account here (in listener)
+         * during queues' processing, with the lock held. This works both
+         * with and without wake-ability.
           */
-        if (now > timeout_time || now + TIMEOUT_FUDGE_FACTOR < timeout_time ) {
+        if (timeout_time && timeout_time < (now = apr_time_now())) {
              timeout_time = now + TIMEOUT_FUDGE_FACTOR;
  
              /* handle timed out sockets */
              apr_thread_mutex_lock(timeout_mutex);
  
+            /* Processing all the queues below will recompute this. */
+            queues_next_expiry = 0;
+
              /* Step 1: keepalive timeouts */
-            /* If all workers are busy, we kill older keep-alive connections so that they
-             * may connect to another process.
-             */
-            if ((workers_were_busy || dying)
-                     && (keepalives = apr_atomic_read32(keepalive_q->total))) {
-                /* If all workers are busy, we kill older keep-alive connections so
-                 * that they may connect to another process.
-                 */
-                ap_log_error(APLOG_MARK, APLOG_TRACE1, 0, ap_server_conf,
-                             "All workers are %s, will close %d keep-alive "
-                             "connections", dying ? "dying" : "busy",
-                             keepalives);
-                process_timeout_queue(keepalive_q, 0,
-                                      start_lingering_close_nonblocking);
-            }
-            else {
-                process_timeout_queue(keepalive_q, timeout_time,
-                                      start_lingering_close_nonblocking);
-            }
+            process_timeout_queue(keepalive_q, timeout_time,
+                                  start_lingering_close_nonblocking);
              /* Step 2: write completion timeouts */
              process_timeout_queue(write_completion_q, timeout_time,
                                    start_lingering_close_nonblocking);
@@ -2131,6 +2205,22 @@ static void * APR_THREAD_FUNC listener_thread(apr_thread_t * thd, void *dummy)
              ps->suspended = apr_atomic_read32(&suspended_count);
              ps->lingering_close = apr_atomic_read32(&lingering_count);
          }
+        else if ((workers_were_busy || dying)
+                 && (keepalives = apr_atomic_read32(keepalive_q->total))) {
+            /* If all workers are busy, we kill older keep-alive connections so
+             * that they may connect to another process.
+             */
+            ap_log_error(APLOG_MARK, APLOG_TRACE1, 0, ap_server_conf,
+                         "All workers are %s, will close %d keep-alive "
+                         "connections", dying ? "dying" : "busy",
+                         keepalives);
+            apr_thread_mutex_lock(timeout_mutex);
+            process_timeout_queue(keepalive_q, 0,
+                                  start_lingering_close_nonblocking);
+            apr_thread_mutex_unlock(timeout_mutex);
+            ps->keep_alive = apr_atomic_read32(keepalive_q->total);
+        }
+
          if (listeners_disabled && !workers_were_busy
              && ((c_count = apr_atomic_read32(&connection_count))
                      >= (l_count = apr_atomic_read32(&lingering_count))
@@ -2343,6 +2433,8 @@ static void *APR_THREAD_FUNC start_threads(apr_thread_t * thd, void *dummy)
      int prev_threads_created;
      int max_recycled_pools = -1;
      int good_methods[] = {APR_POLLSET_KQUEUE, APR_POLLSET_PORT, APR_POLLSET_EPOLL};
+    /* XXX don't we need more to handle K-A or lingering close? */
+    const apr_uint32_t pollset_size = threads_per_child * 2;
  
      /* We must create the fd queues before we start up the listener
       * and worker threads. */
@@ -2382,24 +2474,24 @@ static void *APR_THREAD_FUNC start_threads(apr_thread_t * thd, void *dummy)
  
      /* Create the main pollset */
      for (i = 0; i < sizeof(good_methods) / sizeof(good_methods[0]); i++) {
-        rv = apr_pollset_create_ex(&event_pollset,
-                            threads_per_child*2, /* XXX don't we need more, to handle
-                                                * connections in K-A or lingering
-                                                * close?
-                                                */
-                            pchild, APR_POLLSET_THREADSAFE | APR_POLLSET_NOCOPY | APR_POLLSET_NODEFAULT,
-                            good_methods[i]);
+        apr_uint32_t flags = APR_POLLSET_THREADSAFE | APR_POLLSET_NOCOPY |
+                             APR_POLLSET_NODEFAULT | APR_POLLSET_WAKEABLE;
+        rv = apr_pollset_create_ex(&event_pollset, pollset_size, pchild, flags,
+                                   good_methods[i]);
+        if (rv == APR_SUCCESS) {
+            listener_is_wakeable = 1;
+            break;
+        }
+        flags &= ~APR_POLLSET_WAKEABLE;
+        rv = apr_pollset_create_ex(&event_pollset, pollset_size, pchild, flags,
+                                   good_methods[i]);
          if (rv == APR_SUCCESS) {
              break;
          }
      }
      if (rv != APR_SUCCESS) {
-        rv = apr_pollset_create(&event_pollset,
-                               threads_per_child*2, /* XXX don't we need more, to handle
-                                                     * connections in K-A or lingering
-                                                     * close?
-                                                     */
-                               pchild, APR_POLLSET_THREADSAFE | APR_POLLSET_NOCOPY);
+        rv = apr_pollset_create(&event_pollset, pollset_size, pchild,
+                                APR_POLLSET_THREADSAFE | APR_POLLSET_NOCOPY);
      }
      if (rv != APR_SUCCESS) {
          ap_log_error(APLOG_MARK, APLOG_ERR, rv, ap_server_conf, APLOGNO(03103)
@@ -2408,7 +2500,9 @@ static void *APR_THREAD_FUNC start_threads(apr_thread_t * thd, void *dummy)
      }
  
      ap_log_error(APLOG_MARK, APLOG_DEBUG, 0, ap_server_conf, APLOGNO(02471)
-                 "start_threads: Using %s", apr_pollset_method_name(event_pollset));
+                 "start_threads: Using %s (%swakeable)",
+                 apr_pollset_method_name(event_pollset),
+                 listener_is_wakeable ? "" : "not ");
      worker_sockets = apr_pcalloc(pchild, threads_per_child
                                   * sizeof(apr_socket_t *));
author	Yann Ylavic <ylavic@apache.org>
	Wed, 28 Sep 2016 22:02:48 +0000 (22:02 +0000)
committer	Yann Ylavic <ylavic@apache.org>
	Wed, 28 Sep 2016 22:02:48 +0000 (22:02 +0000)