From ec2b41049f7f576aaa772b326d083e5971212d33 Mon Sep 17 00:00:00 2001
From: Ned Bass <bass6@llnl.gov>
Date: Tue, 17 Jan 2012 15:34:55 -0800
Subject: [PATCH] Taskq locking optimizations

Testing has shown that tq->tq_lock can be highly contended when a
large number of small work items are dispatched.  The lock hold time
is reduced by the following changes:

1) Use exclusive threads in the work_waitq

When a single work item is dispatched we only need to wake a single
thread to service it.  The current implementation uses non-exclusive
threads so all threads are woken when the dispatcher calls wake_up().
If a large number of threads are in the queue this overhead can become
non-negligible.

2) Conditionally add/remove threads from work waitq outside of tq_lock

Taskq threads need only add themselves to the work wait queue if there
are no pending work items.  Furthermore, the add and remove function
calls can be made outside of the taskq lock since the wait queues are
protected from concurrent access by their own spinlocks.

3) Call wake_up() outside of tq->tq_lock

Again, the wait queues are protected by their own spinlock, so the
dispatcher functions can drop tq->tq_lock before calling wake_up().

A new splat test taskq:contention was added in a prior commit to measure
the impact of these changes.  The following table summarizes the
results using data from the kernel lock profiler.

                        tq_lock time    %diff   Wall clock (s)  %diff
original:               39117614.10     0       41.72           0
exclusive threads:      31871483.61     18.5    34.2            18.0
unlocked add/rm waitq:  13794303.90     64.7    16.17           61.2
unlocked wake_up():     1589172.08      95.9    16.61           60.2

Each row reflects the average result over 5 test runs.
/proc/lock_stats was zeroed out before and collected after each run.
Column 1 is the cumulative hold time in microseconds for tq->tq_lock.
The tests are cumulative; each row includes the code changes of all
the rows above it.  %diff is calculated with respect to "original" as
100*(orig-new)/orig.

Although calling wake_up() outside of the taskq lock dramatically
reduced the taskq lock hold time, the test actually took slightly more
wall clock time.  This is likely because the point of contention shifts
from the taskq lock to the wait queue lock.  The change still seems
worthwhile since it removes our taskq implementation as a bottleneck,
and the small increase in wall clock time may simply be statistical
noise.

Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #32
---
 module/spl/spl-taskq.c | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/module/spl/spl-taskq.c b/module/spl/spl-taskq.c
index ece99aa..b067766 100644
--- a/module/spl/spl-taskq.c
+++ b/module/spl/spl-taskq.c
@@ -286,10 +286,11 @@ __taskq_dispatch(taskq_t *tq, task_func_t func, void *arg, uint_t flags)
 	ASSERT(!(t->tqent_flags & TQENT_FLAG_PREALLOC));
 
 	spin_unlock(&t->tqent_lock);
-
-	wake_up(&tq->tq_work_waitq);
 out:
 	spin_unlock_irqrestore(&tq->tq_lock, tq->tq_lock_flags);
+	if (rc > 0)
+		wake_up(&tq->tq_work_waitq);
+
 	SRETURN(rc);
 }
 EXPORT_SYMBOL(__taskq_dispatch);
@@ -309,6 +310,7 @@ __taskq_dispatch_ent(taskq_t *tq, task_func_t func, void *arg, uint_t flags,
 	/* Taskq being destroyed and all tasks drained */
 	if (!(tq->tq_flags & TQ_ACTIVE)) {
 		t->tqent_id = 0;
+		spin_unlock_irqrestore(&tq->tq_lock, tq->tq_lock_flags);
 		goto out;
 	}
 
@@ -332,10 +334,10 @@ __taskq_dispatch_ent(taskq_t *tq, task_func_t func, void *arg, uint_t flags,
 	t->tqent_arg = arg;
 
 	spin_unlock(&t->tqent_lock);
+	spin_unlock_irqrestore(&tq->tq_lock, tq->tq_lock_flags);
 
 	wake_up(&tq->tq_work_waitq);
 out:
-	spin_unlock_irqrestore(&tq->tq_lock, tq->tq_lock_flags);
 	SEXIT;
 }
 EXPORT_SYMBOL(__taskq_dispatch_ent);
@@ -454,17 +456,17 @@ taskq_thread(void *args)
 
         while (!kthread_should_stop()) {
 
-		add_wait_queue(&tq->tq_work_waitq, &wait);
 		if (list_empty(&tq->tq_pend_list) &&
 		    list_empty(&tq->tq_prio_list)) {
 			spin_unlock_irqrestore(&tq->tq_lock, tq->tq_lock_flags);
+			add_wait_queue_exclusive(&tq->tq_work_waitq, &wait);
 			schedule();
+			remove_wait_queue(&tq->tq_work_waitq, &wait);
 			spin_lock_irqsave(&tq->tq_lock, tq->tq_lock_flags);
 		} else {
 			__set_current_state(TASK_RUNNING);
 		}
 
-		remove_wait_queue(&tq->tq_work_waitq, &wait);
 
 		if (!list_empty(&tq->tq_prio_list))
 			pend_list = &tq->tq_prio_list;
-- 
2.40.0