From a10287e00d13c4c4dbbff14f42b00b03da363fcb Mon Sep 17 00:00:00 2001
From: Brian Behlendorf <behlendorf1@llnl.gov>
Date: Mon, 10 Dec 2012 10:53:46 -0800
Subject: [PATCH] kmem-cache: Use taskqs for ageing

Shift the cache and magazine ageing functionality over to the new
delayed taskq interfaces.  This allows us to abandon the kernel's
delayed work queue interface and all the compatibility code it
requires.

However, the delayed taskq interface does not allow us to schedule
a task for a specific cpu, so the ageing code was slightly reworked.
The magazine ageing delay has been directly linked to the cache
ageing function.  The spl_cache_age() function invokes on_each_cpu()
in order to run spl_magazine_age() on each cpu.  It then blocks
waiting for them to complete and promptly reclaims any free slabs.

While restructuring the code wasn't the primary goal, I think the
new code is far more understandable and maintainable.  It should also
help minimize magazine thrashing because free slabs are immediately
released after the magazine is aged.

Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
---
 include/sys/kmem.h    |  4 +-
 module/spl/spl-kmem.c | 91 ++++++++++++++++++++++++-------------------
 2 files changed, 52 insertions(+), 43 deletions(-)

diff --git a/include/sys/kmem.h b/include/sys/kmem.h
index 83adc8d2a..e189922ef 100644
--- a/include/sys/kmem.h
+++ b/include/sys/kmem.h
@@ -37,6 +37,7 @@
 #include <sys/types.h>
 #include <sys/vmsystm.h>
 #include <sys/kstat.h>
+#include <sys/taskq.h>
 
 /*
  * Memory allocation interfaces
@@ -406,7 +407,6 @@ typedef struct spl_kmem_magazine {
 	uint32_t		skm_size;	/* Magazine size */
 	uint32_t		skm_refill;	/* Batch refill size */
 	struct spl_kmem_cache	*skm_cache;	/* Owned by cache */
-	struct delayed_work	skm_work;	/* Magazine reclaim work */
 	unsigned long		skm_age;	/* Last cache access */
 	unsigned int		skm_cpu;	/* Owned by cpu */
 	void			*skm_objs[0];	/* Object pointers */
@@ -460,7 +460,7 @@ typedef struct spl_kmem_cache {
 	uint32_t		skc_delay;	/* Slab reclaim interval */
 	uint32_t		skc_reap;	/* Slab reclaim count */
 	atomic_t		skc_ref;	/* Ref count callers */
-	struct delayed_work	skc_work;	/* Slab reclaim work */
+	taskqid_t		skc_taskqid;	/* Slab reclaim task */
 	struct list_head	skc_list;	/* List of caches linkage */
 	struct list_head	skc_complete_list;/* Completely alloc'ed */
 	struct list_head	skc_partial_list; /* Partially alloc'ed */
diff --git a/module/spl/spl-kmem.c b/module/spl/spl-kmem.c
index f78f820aa..3900c9cf0 100644
--- a/module/spl/spl-kmem.c
+++ b/module/spl/spl-kmem.c
@@ -825,6 +825,7 @@ EXPORT_SYMBOL(vmem_free_debug);
 
 struct list_head spl_kmem_cache_list;   /* List of caches */
 struct rw_semaphore spl_kmem_cache_sem; /* Cache list lock */
+taskq_t *spl_kmem_cache_taskq;          /* Task queue for ageing / reclaim */
 
 static int spl_cache_flush(spl_kmem_cache_t *skc,
                            spl_kmem_magazine_t *skm, int flush);
@@ -1243,50 +1244,59 @@ spl_emergency_free(spl_kmem_cache_t *skc, void *obj)
 	SRETURN(0);
 }
 
-/*
- * Called regularly on all caches to age objects out of the magazines
- * which have not been access in skc->skc_delay seconds.  This prevents
- * idle magazines from holding memory which might be better used by
- * other caches or parts of the system.  The delay is present to
- * prevent thrashing the magazine.
- */
 static void
 spl_magazine_age(void *data)
 {
-	spl_kmem_magazine_t *skm =
-		spl_get_work_data(data, spl_kmem_magazine_t, skm_work.work);
-	spl_kmem_cache_t *skc = skm->skm_cache;
+	spl_kmem_cache_t *skc = (spl_kmem_cache_t *)data;
+	spl_kmem_magazine_t *skm = skc->skc_mag[smp_processor_id()];
 
 	ASSERT(skm->skm_magic == SKM_MAGIC);
-	ASSERT(skc->skc_magic == SKC_MAGIC);
-	ASSERT(skc->skc_mag[skm->skm_cpu] == skm);
+	ASSERT(skm->skm_cpu == smp_processor_id());
 
-	if (skm->skm_avail > 0 &&
-	    time_after(jiffies, skm->skm_age + skc->skc_delay * HZ))
-		(void)spl_cache_flush(skc, skm, skm->skm_refill);
-
-	if (!test_bit(KMC_BIT_DESTROY, &skc->skc_flags))
-		schedule_delayed_work_on(skm->skm_cpu, &skm->skm_work,
-					 skc->skc_delay / 3 * HZ);
+	if (skm->skm_avail > 0)
+		if (time_after(jiffies, skm->skm_age + skc->skc_delay * HZ))
+			(void) spl_cache_flush(skc, skm, skm->skm_refill);
 }
 
 /*
- * Called regularly to keep a downward pressure on the size of idle
- * magazines and to release free slabs from the cache.  This function
- * never calls the registered reclaim function, that only occurs
- * under memory pressure or with a direct call to spl_kmem_reap().
+ * Called regularly to keep a downward pressure on the cache.
+ *
+ * Objects older than skc->skc_delay seconds in the per-cpu magazines will
+ * be returned to the caches.  This is done to prevent idle magazines from
+ * holding memory which could be better used elsewhere.  The delay is
+ * present to prevent thrashing the magazine.
+ *
+ * The newly released objects may result in empty partial slabs.  Those
+ * slabs should be released to the system.  Otherwise moving the objects
+ * out of the magazines is just wasted work.
  */
 static void
 spl_cache_age(void *data)
 {
-	spl_kmem_cache_t *skc =
-		spl_get_work_data(data, spl_kmem_cache_t, skc_work.work);
+	spl_kmem_cache_t *skc = (spl_kmem_cache_t *)data;
+	taskqid_t id = 0;
 
 	ASSERT(skc->skc_magic == SKC_MAGIC);
+
+	atomic_inc(&skc->skc_ref);
+	spl_on_each_cpu(spl_magazine_age, skc, 1);
 	spl_slab_reclaim(skc, skc->skc_reap, 0);
 
-	if (!test_bit(KMC_BIT_DESTROY, &skc->skc_flags))
-		schedule_delayed_work(&skc->skc_work, skc->skc_delay / 3 * HZ);
+	while (!test_bit(KMC_BIT_DESTROY, &skc->skc_flags) && !id) {
+		id = taskq_dispatch_delay(
+		    spl_kmem_cache_taskq, spl_cache_age, skc, TQ_SLEEP,
+		    ddi_get_lbolt() + skc->skc_delay / 3 * HZ);
+
+		/* Destroy issued after dispatch immediately cancel it */
+		if (test_bit(KMC_BIT_DESTROY, &skc->skc_flags) && id)
+			taskq_cancel_id(spl_kmem_cache_taskq, id);
+	}
+
+	spin_lock(&skc->skc_lock);
+	skc->skc_taskqid = id;
+	spin_unlock(&skc->skc_lock);
+
+	atomic_dec(&skc->skc_ref);
 }
 
 /*
@@ -1380,7 +1390,6 @@ spl_magazine_alloc(spl_kmem_cache_t *skc, int cpu)
 		skm->skm_size = skc->skc_mag_size;
 		skm->skm_refill = skc->skc_mag_refill;
 		skm->skm_cache = skc;
-		spl_init_delayed_work(&skm->skm_work, spl_magazine_age, skm);
 		skm->skm_age = jiffies;
 		skm->skm_cpu = cpu;
 	}
@@ -1427,11 +1436,6 @@ spl_magazine_create(spl_kmem_cache_t *skc)
 		}
 	}
 
-	/* Only after everything is allocated schedule magazine work */
-	for_each_online_cpu(i)
-		schedule_delayed_work_on(i, &skc->skc_mag[i]->skm_work,
-				         skc->skc_delay / 3 * HZ);
-
 	SRETURN(0);
 }
 
@@ -1566,8 +1570,9 @@ spl_kmem_cache_create(char *name, size_t size, size_t align,
 	if (rc)
 		SGOTO(out, rc);
 
-	spl_init_delayed_work(&skc->skc_work, spl_cache_age, skc);
-	schedule_delayed_work(&skc->skc_work, skc->skc_delay / 3 * HZ);
+	skc->skc_taskqid = taskq_dispatch_delay(spl_kmem_cache_taskq,
+	    spl_cache_age, skc, TQ_SLEEP,
+	    ddi_get_lbolt() + skc->skc_delay / 3 * HZ);
 
 	down_write(&spl_kmem_cache_sem);
 	list_add_tail(&skc->skc_list, &spl_kmem_cache_list);
@@ -1600,7 +1605,7 @@ void
 spl_kmem_cache_destroy(spl_kmem_cache_t *skc)
 {
 	DECLARE_WAIT_QUEUE_HEAD(wq);
-	int i;
+	taskqid_t id;
 	SENTRY;
 
 	ASSERT(skc->skc_magic == SKC_MAGIC);
@@ -1609,13 +1614,14 @@ spl_kmem_cache_destroy(spl_kmem_cache_t *skc)
 	list_del_init(&skc->skc_list);
 	up_write(&spl_kmem_cache_sem);
 
-	/* Cancel any and wait for any pending delayed work */
+	/* Cancel any and wait for any pending delayed tasks */
 	VERIFY(!test_and_set_bit(KMC_BIT_DESTROY, &skc->skc_flags));
-	cancel_delayed_work_sync(&skc->skc_work);
-	for_each_online_cpu(i)
-		cancel_delayed_work_sync(&skc->skc_mag[i]->skm_work);
 
-	flush_scheduled_work();
+	spin_lock(&skc->skc_lock);
+	id = skc->skc_taskqid;
+	spin_unlock(&skc->skc_lock);
+
+	taskq_cancel_id(spl_kmem_cache_taskq, id);
 
 	/* Wait until all current callers complete, this is mainly
 	 * to catch the case where a low memory situation triggers a
@@ -2394,6 +2400,8 @@ spl_kmem_init(void)
 
 	init_rwsem(&spl_kmem_cache_sem);
 	INIT_LIST_HEAD(&spl_kmem_cache_list);
+	spl_kmem_cache_taskq = taskq_create("spl_kmem_cache",
+	    1, maxclsyspri, 1, 32, TASKQ_PREPOPULATE);
 
 	spl_register_shrinker(&spl_kmem_cache_shrinker);
 
@@ -2432,6 +2440,7 @@ spl_kmem_fini(void)
 	SENTRY;
 
 	spl_unregister_shrinker(&spl_kmem_cache_shrinker);
+	taskq_destroy(spl_kmem_cache_taskq);
 
 	SEXIT;
 }
-- 
2.40.0