From 26f7245c7cfa77f25aedf1a500db689343644ead Mon Sep 17 00:00:00 2001
From: "Ricardo M. Correia" <ricardo.correia@oracle.com>
Date: Mon, 2 Aug 2010 09:24:01 +0000
Subject: [PATCH] Fix taskq code to not drop tasks when TQ_SLEEP is used.

When TQ_SLEEP is used, taskq_dispatch() should always succeed even if the
number of pending tasks is above tq->tq_maxalloc. This semantic is similar
to KM_SLEEP in kmem allocations, which also always succeed.

However, we cannot block forever otherwise there is a risk of deadlock.
Therefore, we still allow the number of pending tasks to go above
tq->tq_maxalloc with TQ_SLEEP, but we may sleep up to 1 second per task
dispatch, thereby throttling the task dispatch rate.

One of the existing splat tests was also augmented to test for this scenario.
The test would fail with the previous implementation but now it succeeds.

Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
---
 module/spl/spl-taskq.c     | 47 +++++++++++++++++++-------------------
 module/splat/splat-taskq.c | 29 +++++++++++++++++++----
 2 files changed, 48 insertions(+), 28 deletions(-)

diff --git a/module/spl/spl-taskq.c b/module/spl/spl-taskq.c
index 201cb5949..5a17f1ccf 100644
--- a/module/spl/spl-taskq.c
+++ b/module/spl/spl-taskq.c
@@ -78,35 +78,36 @@ retry:
                 if (flags & TQ_NOSLEEP)
                         SRETURN(NULL);
 
-                /* Sleep periodically polling the free list for an available
-                 * spl_task_t.  If a full second passes and we have not found
-                 * one gives up and return a NULL to the caller. */
-                if (flags & TQ_SLEEP) {
-                        spin_unlock_irqrestore(&tq->tq_lock, tq->tq_lock_flags);
-                        schedule_timeout(HZ / 100);
-                        spin_lock_irqsave(&tq->tq_lock, tq->tq_lock_flags);
-                        if (count < 100)
-                                SGOTO(retry, count++);
-
-                        SRETURN(NULL);
-                }
-
-                /* Unreachable, Neither TQ_SLEEP or TQ_NOSLEEP set */
-                PANIC("Neither TQ_SLEEP or TQ_NOSLEEP set");
+                /*
+                 * Sleep periodically polling the free list for an available
+                 * spl_task_t. Dispatching with TQ_SLEEP should always succeed
+                 * but we cannot block forever waiting for an spl_taskq_t to
+                 * show up in the free list, otherwise a deadlock can happen.
+                 *
+                 * Therefore, we need to allocate a new task even if the number
+                 * of allocated tasks is above tq->tq_maxalloc, but we still
+                 * end up delaying the task allocation by one second, thereby
+                 * throttling the task dispatch rate.
+                 */
+                 spin_unlock_irqrestore(&tq->tq_lock, tq->tq_lock_flags);
+                 schedule_timeout(HZ / 100);
+                 spin_lock_irqsave(&tq->tq_lock, tq->tq_lock_flags);
+                 if (count < 100)
+                        SGOTO(retry, count++);
         }
 
-	spin_unlock_irqrestore(&tq->tq_lock, tq->tq_lock_flags);
+        spin_unlock_irqrestore(&tq->tq_lock, tq->tq_lock_flags);
         t = kmem_alloc(sizeof(spl_task_t), flags & (TQ_SLEEP | TQ_NOSLEEP));
         spin_lock_irqsave(&tq->tq_lock, tq->tq_lock_flags);
 
-	if (t) {
-		spin_lock_init(&t->t_lock);
+        if (t) {
+                spin_lock_init(&t->t_lock);
                 INIT_LIST_HEAD(&t->t_list);
-		t->t_id = 0;
-		t->t_func = NULL;
-		t->t_arg = NULL;
-		tq->tq_nalloc++;
-	}
+                t->t_id = 0;
+                t->t_func = NULL;
+                t->t_arg = NULL;
+                tq->tq_nalloc++;
+        }
 
         SRETURN(t);
 }
diff --git a/module/splat/splat-taskq.c b/module/splat/splat-taskq.c
index d4540f37a..a7c537236 100644
--- a/module/splat/splat-taskq.c
+++ b/module/splat/splat-taskq.c
@@ -296,6 +296,10 @@ splat_taskq_test3(struct file *file, void *arg)
  * Then use taskq_wait() to block until all the tasks complete, then
  * cross check that all the tasks ran by checking tg_arg->count which
  * is incremented in the task function.  Finally cleanup the taskq.
+ *
+ * First we try with a large 'maxalloc' value, then we try with a small one.
+ * We should not drop tasks when TQ_SLEEP is used in taskq_dispatch(), even
+ * if the number of pending tasks is above maxalloc.
  */
 static void
 splat_taskq_test4_func(void *arg)
@@ -307,16 +311,18 @@ splat_taskq_test4_func(void *arg)
 }
 
 static int
-splat_taskq_test4(struct file *file, void *arg)
+splat_taskq_test4_common(struct file *file, void *arg, int minalloc,
+                         int maxalloc, int nr_tasks)
 {
 	taskq_t *tq;
 	splat_taskq_arg_t tq_arg;
 	int i, j, rc = 0;
 
-	splat_vprint(file, SPLAT_TASKQ_TEST4_NAME, "Taskq '%s' creating\n",
-	             SPLAT_TASKQ_TEST4_NAME);
+	splat_vprint(file, SPLAT_TASKQ_TEST4_NAME, "Taskq '%s' creating "
+	             "(%d/%d/%d)\n", SPLAT_TASKQ_TEST4_NAME, minalloc, maxalloc,
+	             nr_tasks);
 	if ((tq = taskq_create(SPLAT_TASKQ_TEST4_NAME, 1, maxclsyspri,
-		               50, INT_MAX, TASKQ_PREPOPULATE)) == NULL) {
+		               minalloc, maxalloc, TASKQ_PREPOPULATE)) == NULL) {
 		splat_vprint(file, SPLAT_TASKQ_TEST4_NAME,
 		             "Taskq '%s' create failed\n",
 		             SPLAT_TASKQ_TEST4_NAME);
@@ -326,7 +332,7 @@ splat_taskq_test4(struct file *file, void *arg)
 	tq_arg.file = file;
 	tq_arg.name = SPLAT_TASKQ_TEST4_NAME;
 
-	for (i = 1; i <= 1024; i *= 2) {
+	for (i = 1; i <= nr_tasks; i *= 2) {
 		atomic_set(&tq_arg.count, 0);
 		splat_vprint(file, SPLAT_TASKQ_TEST4_NAME,
 		             "Taskq '%s' function '%s' dispatched %d times\n",
@@ -364,6 +370,19 @@ out:
 	return rc;
 }
 
+static int splat_taskq_test4(struct file *file, void *arg)
+{
+	int rc;
+
+	rc = splat_taskq_test4_common(file, arg, 50, INT_MAX, 1024);
+	if (rc)
+		return rc;
+
+	rc = splat_taskq_test4_common(file, arg, 1, 1, 32);
+
+	return rc;
+}
+
 /*
  * Create a taskq and dispatch a specific sequence of tasks carefully
  * crafted to validate the order in which tasks are processed.  When
-- 
2.40.0