From ec2b41049f7f576aaa772b326d083e5971212d33 Mon Sep 17 00:00:00 2001 From: Ned Bass Date: Tue, 17 Jan 2012 15:34:55 -0800 Subject: [PATCH] Taskq locking optimizations Testing has shown that tq->tq_lock can be highly contended when a large number of small work items are dispatched. The lock hold time is reduced by the following changes: 1) Use exclusive threads in the work_waitq When a single work item is dispatched we only need to wake a single thread to service it. The current implementation uses non-exclusive threads so all threads are woken when the dispatcher calls wake_up(). If a large number of threads are in the queue this overhead can become non-negligible. 2) Conditionally add/remove threads from work waitq outside of tq_lock Taskq threads need only add themselves to the work wait queue if there are no pending work items. Furthermore, the add and remove function calls can be made outside of the taskq lock since the wait queues are protected from concurrent access by their own spinlocks. 3) Call wake_up() outside of tq->tq_lock Again, the wait queues are protected by their own spinlock, so the dispatcher functions can drop tq->tq_lock before calling wake_up(). A new splat test taskq:contention was added in a prior commit to measure the impact of these changes. The following table summarizes the results using data from the kernel lock profiler. tq_lock time %diff Wall clock (s) %diff original: 39117614.10 0 41.72 0 exclusive threads: 31871483.61 18.5 34.2 18.0 unlocked add/rm waitq: 13794303.90 64.7 16.17 61.2 unlocked wake_up(): 1589172.08 95.9 16.61 60.2 Each row reflects the average result over 5 test runs. /proc/lock_stats was zeroed out before and collected after each run. Column 1 is the cumulative hold time in microseconds for tq->tq_lock. The tests are cumulative; each row reflects the code changes of the previous rows. %diff is calculated with respect to "original" as 100*(orig-new)/orig. Although calling wake_up() outside of the taskq lock dramatically reduced the taskq lock hold time, the test actually took slightly more wall clock time. This is because the point of contention shifts from the taskq lock to the wait queue lock. But the change still seems worthwhile since it removes our taskq implementation as a bottleneck, assuming the small increase in wall clock time to be statistical noise. Signed-off-by: Brian Behlendorf Closes #32 --- module/spl/spl-taskq.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/module/spl/spl-taskq.c b/module/spl/spl-taskq.c index ece99aa..b067766 100644 --- a/module/spl/spl-taskq.c +++ b/module/spl/spl-taskq.c @@ -286,10 +286,11 @@ __taskq_dispatch(taskq_t *tq, task_func_t func, void *arg, uint_t flags) ASSERT(!(t->tqent_flags & TQENT_FLAG_PREALLOC)); spin_unlock(&t->tqent_lock); - - wake_up(&tq->tq_work_waitq); out: spin_unlock_irqrestore(&tq->tq_lock, tq->tq_lock_flags); + if (rc > 0) + wake_up(&tq->tq_work_waitq); + SRETURN(rc); } EXPORT_SYMBOL(__taskq_dispatch); @@ -309,6 +310,7 @@ __taskq_dispatch_ent(taskq_t *tq, task_func_t func, void *arg, uint_t flags, /* Taskq being destroyed and all tasks drained */ if (!(tq->tq_flags & TQ_ACTIVE)) { t->tqent_id = 0; + spin_unlock_irqrestore(&tq->tq_lock, tq->tq_lock_flags); goto out; } @@ -332,10 +334,10 @@ __taskq_dispatch_ent(taskq_t *tq, task_func_t func, void *arg, uint_t flags, t->tqent_arg = arg; spin_unlock(&t->tqent_lock); + spin_unlock_irqrestore(&tq->tq_lock, tq->tq_lock_flags); wake_up(&tq->tq_work_waitq); out: - spin_unlock_irqrestore(&tq->tq_lock, tq->tq_lock_flags); SEXIT; } EXPORT_SYMBOL(__taskq_dispatch_ent); @@ -454,17 +456,17 @@ taskq_thread(void *args) while (!kthread_should_stop()) { - add_wait_queue(&tq->tq_work_waitq, &wait); if (list_empty(&tq->tq_pend_list) && list_empty(&tq->tq_prio_list)) { spin_unlock_irqrestore(&tq->tq_lock, tq->tq_lock_flags); + add_wait_queue_exclusive(&tq->tq_work_waitq, &wait); schedule(); + remove_wait_queue(&tq->tq_work_waitq, &wait); spin_lock_irqsave(&tq->tq_lock, tq->tq_lock_flags); } else { __set_current_state(TASK_RUNNING); } - remove_wait_queue(&tq->tq_work_waitq, &wait); if (!list_empty(&tq->tq_prio_list)) pend_list = &tq->tq_prio_list; -- 2.40.0