]> granicus.if.org Git - zfs/commitdiff
kmem-cache: Use taskqs for ageing
author: Brian Behlendorf <behlendorf1@llnl.gov>
Mon, 10 Dec 2012 18:53:46 +0000 (10:53 -0800)
committer: Brian Behlendorf <behlendorf1@llnl.gov>
Wed, 12 Dec 2012 17:56:54 +0000 (09:56 -0800)
Shift the cache and magazine ageing functionality over to the new
delayed taskq interfaces.  This allows us to abandon the kernel's
delayed work queue interface and all the compatibility code it
requires.

However, the delayed taskq interface does not allow us to schedule
a task for a specific cpu, so the ageing code was slightly reworked.
The magazine ageing delay has been directly linked to the cache
ageing function.  The spl_cache_age() function invokes on_each_cpu()
in order to run spl_magazine_age() on each cpu.  It then blocks
waiting for them to complete and promptly reclaims any free slabs.

While restructuring the code wasn't the primary goal, I think the
new code is far more understandable and maintainable.  It also should
help minimize magazine thrashing because free slabs are immediately
released after the magazine is aged.

Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
include/sys/kmem.h
module/spl/spl-kmem.c

index 83adc8d2a36e875ce66fab709a6c462bd1eb4561..e189922efe7ae58d1c2fed783a8c423c0a5932c2 100644 (file)
@@ -37,6 +37,7 @@
 #include <sys/types.h>
 #include <sys/vmsystm.h>
 #include <sys/kstat.h>
+#include <sys/taskq.h>
 
 /*
  * Memory allocation interfaces
@@ -406,7 +407,6 @@ typedef struct spl_kmem_magazine {
        uint32_t                skm_size;       /* Magazine size */
        uint32_t                skm_refill;     /* Batch refill size */
        struct spl_kmem_cache   *skm_cache;     /* Owned by cache */
-       struct delayed_work     skm_work;       /* Magazine reclaim work */
        unsigned long           skm_age;        /* Last cache access */
        unsigned int            skm_cpu;        /* Owned by cpu */
        void                    *skm_objs[0];   /* Object pointers */
@@ -460,7 +460,7 @@ typedef struct spl_kmem_cache {
        uint32_t                skc_delay;      /* Slab reclaim interval */
        uint32_t                skc_reap;       /* Slab reclaim count */
        atomic_t                skc_ref;        /* Ref count callers */
-       struct delayed_work     skc_work;       /* Slab reclaim work */
+       taskqid_t               skc_taskqid;    /* Slab reclaim task */
        struct list_head        skc_list;       /* List of caches linkage */
        struct list_head        skc_complete_list;/* Completely alloc'ed */
        struct list_head        skc_partial_list; /* Partially alloc'ed */
index f78f820aa8713d5663741b6a26268c6a29c675fa..3900c9cf0568fe45f20b01cd3020efea28de0149 100644 (file)
@@ -825,6 +825,7 @@ EXPORT_SYMBOL(vmem_free_debug);
 
 struct list_head spl_kmem_cache_list;   /* List of caches */
 struct rw_semaphore spl_kmem_cache_sem; /* Cache list lock */
+taskq_t *spl_kmem_cache_taskq;          /* Task queue for ageing / reclaim */
 
 static int spl_cache_flush(spl_kmem_cache_t *skc,
                            spl_kmem_magazine_t *skm, int flush);
@@ -1243,50 +1244,59 @@ spl_emergency_free(spl_kmem_cache_t *skc, void *obj)
        SRETURN(0);
 }
 
-/*
- * Called regularly on all caches to age objects out of the magazines
- * which have not been access in skc->skc_delay seconds.  This prevents
- * idle magazines from holding memory which might be better used by
- * other caches or parts of the system.  The delay is present to
- * prevent thrashing the magazine.
- */
 static void
 spl_magazine_age(void *data)
 {
-       spl_kmem_magazine_t *skm =
-               spl_get_work_data(data, spl_kmem_magazine_t, skm_work.work);
-       spl_kmem_cache_t *skc = skm->skm_cache;
+       spl_kmem_cache_t *skc = (spl_kmem_cache_t *)data;
+       spl_kmem_magazine_t *skm = skc->skc_mag[smp_processor_id()];
 
        ASSERT(skm->skm_magic == SKM_MAGIC);
-       ASSERT(skc->skc_magic == SKC_MAGIC);
-       ASSERT(skc->skc_mag[skm->skm_cpu] == skm);
+       ASSERT(skm->skm_cpu == smp_processor_id());
 
-       if (skm->skm_avail > 0 &&
-           time_after(jiffies, skm->skm_age + skc->skc_delay * HZ))
-               (void)spl_cache_flush(skc, skm, skm->skm_refill);
-
-       if (!test_bit(KMC_BIT_DESTROY, &skc->skc_flags))
-               schedule_delayed_work_on(skm->skm_cpu, &skm->skm_work,
-                                        skc->skc_delay / 3 * HZ);
+       if (skm->skm_avail > 0)
+               if (time_after(jiffies, skm->skm_age + skc->skc_delay * HZ))
+                       (void) spl_cache_flush(skc, skm, skm->skm_refill);
 }
 
 /*
- * Called regularly to keep a downward pressure on the size of idle
- * magazines and to release free slabs from the cache.  This function
- * never calls the registered reclaim function, that only occurs
- * under memory pressure or with a direct call to spl_kmem_reap().
+ * Called regularly to keep a downward pressure on the cache.
+ *
+ * Objects older than skc->skc_delay seconds in the per-cpu magazines will
+ * be returned to the caches.  This is done to prevent idle magazines from
+ * holding memory which could be better used elsewhere.  The delay is
+ * present to prevent thrashing the magazine.
+ *
+ * The newly released objects may result in empty partial slabs.  Those
+ * slabs should be released to the system.  Otherwise moving the objects
+ * out of the magazines is just wasted work.
  */
 static void
 spl_cache_age(void *data)
 {
-       spl_kmem_cache_t *skc =
-               spl_get_work_data(data, spl_kmem_cache_t, skc_work.work);
+       spl_kmem_cache_t *skc = (spl_kmem_cache_t *)data;
+       taskqid_t id = 0;
 
        ASSERT(skc->skc_magic == SKC_MAGIC);
+
+       atomic_inc(&skc->skc_ref);
+       spl_on_each_cpu(spl_magazine_age, skc, 1);
        spl_slab_reclaim(skc, skc->skc_reap, 0);
 
-       if (!test_bit(KMC_BIT_DESTROY, &skc->skc_flags))
-               schedule_delayed_work(&skc->skc_work, skc->skc_delay / 3 * HZ);
+       while (!test_bit(KMC_BIT_DESTROY, &skc->skc_flags) && !id) {
+               id = taskq_dispatch_delay(
+                   spl_kmem_cache_taskq, spl_cache_age, skc, TQ_SLEEP,
+                   ddi_get_lbolt() + skc->skc_delay / 3 * HZ);
+
+               /* Destroy issued after dispatch immediately cancel it */
+               if (test_bit(KMC_BIT_DESTROY, &skc->skc_flags) && id)
+                       taskq_cancel_id(spl_kmem_cache_taskq, id);
+       }
+
+       spin_lock(&skc->skc_lock);
+       skc->skc_taskqid = id;
+       spin_unlock(&skc->skc_lock);
+
+       atomic_dec(&skc->skc_ref);
 }
 
 /*
@@ -1380,7 +1390,6 @@ spl_magazine_alloc(spl_kmem_cache_t *skc, int cpu)
                skm->skm_size = skc->skc_mag_size;
                skm->skm_refill = skc->skc_mag_refill;
                skm->skm_cache = skc;
-               spl_init_delayed_work(&skm->skm_work, spl_magazine_age, skm);
                skm->skm_age = jiffies;
                skm->skm_cpu = cpu;
        }
@@ -1427,11 +1436,6 @@ spl_magazine_create(spl_kmem_cache_t *skc)
                }
        }
 
-       /* Only after everything is allocated schedule magazine work */
-       for_each_online_cpu(i)
-               schedule_delayed_work_on(i, &skc->skc_mag[i]->skm_work,
-                                        skc->skc_delay / 3 * HZ);
-
        SRETURN(0);
 }
 
@@ -1566,8 +1570,9 @@ spl_kmem_cache_create(char *name, size_t size, size_t align,
        if (rc)
                SGOTO(out, rc);
 
-       spl_init_delayed_work(&skc->skc_work, spl_cache_age, skc);
-       schedule_delayed_work(&skc->skc_work, skc->skc_delay / 3 * HZ);
+       skc->skc_taskqid = taskq_dispatch_delay(spl_kmem_cache_taskq,
+           spl_cache_age, skc, TQ_SLEEP,
+           ddi_get_lbolt() + skc->skc_delay / 3 * HZ);
 
        down_write(&spl_kmem_cache_sem);
        list_add_tail(&skc->skc_list, &spl_kmem_cache_list);
@@ -1600,7 +1605,7 @@ void
 spl_kmem_cache_destroy(spl_kmem_cache_t *skc)
 {
        DECLARE_WAIT_QUEUE_HEAD(wq);
-       int i;
+       taskqid_t id;
        SENTRY;
 
        ASSERT(skc->skc_magic == SKC_MAGIC);
@@ -1609,13 +1614,14 @@ spl_kmem_cache_destroy(spl_kmem_cache_t *skc)
        list_del_init(&skc->skc_list);
        up_write(&spl_kmem_cache_sem);
 
-       /* Cancel any and wait for any pending delayed work */
+       /* Cancel any and wait for any pending delayed tasks */
        VERIFY(!test_and_set_bit(KMC_BIT_DESTROY, &skc->skc_flags));
-       cancel_delayed_work_sync(&skc->skc_work);
-       for_each_online_cpu(i)
-               cancel_delayed_work_sync(&skc->skc_mag[i]->skm_work);
 
-       flush_scheduled_work();
+       spin_lock(&skc->skc_lock);
+       id = skc->skc_taskqid;
+       spin_unlock(&skc->skc_lock);
+
+       taskq_cancel_id(spl_kmem_cache_taskq, id);
 
        /* Wait until all current callers complete, this is mainly
         * to catch the case where a low memory situation triggers a
@@ -2394,6 +2400,8 @@ spl_kmem_init(void)
 
        init_rwsem(&spl_kmem_cache_sem);
        INIT_LIST_HEAD(&spl_kmem_cache_list);
+       spl_kmem_cache_taskq = taskq_create("spl_kmem_cache",
+           1, maxclsyspri, 1, 32, TASKQ_PREPOPULATE);
 
        spl_register_shrinker(&spl_kmem_cache_shrinker);
 
@@ -2432,6 +2440,7 @@ spl_kmem_fini(void)
        SENTRY;
 
        spl_unregister_shrinker(&spl_kmem_cache_shrinker);
+       taskq_destroy(spl_kmem_cache_taskq);
 
        SEXIT;
 }