granicus.if.org Git - spl/commitdiff
kmem slab fixes
author Brian Behlendorf <behlendorf1@llnl.gov>
Thu, 12 Feb 2009 21:32:10 +0000 (13:32 -0800)
committer Brian Behlendorf <behlendorf1@llnl.gov>
Thu, 12 Feb 2009 21:32:10 +0000 (13:32 -0800)
- Default SPL_KMEM_CACHE_DELAY changed to 15 to match Solaris.
- Aged out slab checking occurs every SPL_KMEM_CACHE_DELAY / 3.
- skc->skc_reap tunable added which allows callers of
  spl_slab_reclaim() to cap the number of slabs reclaimed.
  On Solaris all eligible slabs are always reclaimed, and this
  is still the default behavior.  However, I suspect that is
  not always wise for reasons such as in the next comment.
- spl_slab_reclaim() added cond_resched() while walking the
  slab/object free lists.  Soft lockups were observed when
  freeing large numbers of vmalloc'd slabs/objects.
- spl_slab_reclaim() 'sks->sks_ref > 0' check changes from
  incorrect 'break' to 'continue' to ensure all slabs are
  checked.
- spl_cache_age() reworked to avoid a deadlock with
  do_flush_tlb_all() which occurred because we slept waiting
  for completion in spl_cache_age().  Waiting for magazine
  reclamation to finish is not required, so we no longer wait.
- spl_magazine_create() and spl_magazine_destroy() shifted
  back to using for_each_online_cpu() instead of the
  spl_on_each_cpu() approach which was of course a bad idea
  due to memory allocations which Ricardo pointed out.

include/sys/kmem.h
module/spl/spl-kmem.c

index 7281f1063df54bff3d9bef2bdda8795212a704e0..dc66a91536a413a52186e8287ddb3bb8ea698e3b 100644 (file)
@@ -239,7 +239,8 @@ extern struct rw_semaphore spl_kmem_cache_sem;
 #define SKS_MAGIC                      0x22222222
 #define SKC_MAGIC                      0x2c2c2c2c
 
-#define SPL_KMEM_CACHE_DELAY           5       /* Minimum slab release age */
+#define SPL_KMEM_CACHE_DELAY           15      /* Minimum slab release age */
+#define SPL_KMEM_CACHE_REAP            0       /* Default reap everything */
 #define SPL_KMEM_CACHE_OBJ_PER_SLAB    32      /* Target objects per slab */
 #define SPL_KMEM_CACHE_OBJ_PER_SLAB_MIN        8       /* Minimum objects per slab */
 #define SPL_KMEM_CACHE_ALIGN           8       /* Default object alignment */
@@ -292,6 +293,7 @@ typedef struct spl_kmem_cache {
        uint32_t                skc_slab_objs;  /* Objects per slab */
        uint32_t                skc_slab_size;  /* Slab size */
        uint32_t                skc_delay;      /* Slab reclaim interval */
+       uint32_t                skc_reap;       /* Slab reclaim count */
        atomic_t                skc_ref;        /* Ref count callers */
        struct delayed_work     skc_work;       /* Slab reclaim work */
         struct work_struct work;
index b5cd9fb1216a53faf98681a9473d887160fff772..d82d7b49fe785817e4ecc3f47a6edac18b99e1f7 100644 (file)
@@ -856,16 +856,19 @@ spl_slab_free(spl_kmem_slab_t *sks,
 /*
  * Traverses all the partial slabs attached to a cache and free those
  * which which are currently empty, and have not been touched for
- * skc_delay seconds.  This is to avoid thrashing.
+ * skc_delay seconds to  avoid thrashing.  The count argument is
+ * passed to optionally cap the number of slabs reclaimed, a count
+ * of zero means try and reclaim everything.  When flag is set we
+ * always free an available slab regardless of age.
  */
 static void
-spl_slab_reclaim(spl_kmem_cache_t *skc, int flag)
+spl_slab_reclaim(spl_kmem_cache_t *skc, int count, int flag)
 {
        spl_kmem_slab_t *sks, *m;
        spl_kmem_obj_t *sko, *n;
        LIST_HEAD(sks_list);
        LIST_HEAD(sko_list);
-       int size;
+       int size, i = 0;
        ENTRY;
 
        /*
@@ -878,11 +881,18 @@ spl_slab_reclaim(spl_kmem_cache_t *skc, int flag)
        spin_lock(&skc->skc_lock);
         list_for_each_entry_safe_reverse(sks, m, &skc->skc_partial_list,
                                         sks_list) {
+               /* Release at most count slabs */
+               if (count && i > count)
+                       break;
+
+               /* Skip active slabs */
                if (sks->sks_ref > 0)
-                      break;
+                       continue;
 
-               if (flag || time_after(jiffies,sks->sks_age+skc->skc_delay*HZ))
+               if (time_after(jiffies,sks->sks_age+skc->skc_delay*HZ)||flag) {
                        spl_slab_free(sks, &sks_list, &sko_list);
+                       i++;
+               }
        }
        spin_unlock(&skc->skc_lock);
 
@@ -896,12 +906,18 @@ spl_slab_reclaim(spl_kmem_cache_t *skc, int flag)
                size = P2ROUNDUP(skc->skc_obj_size, skc->skc_obj_align) +
                       P2ROUNDUP(sizeof(spl_kmem_obj_t), skc->skc_obj_align);
 
-               list_for_each_entry_safe(sko, n, &sko_list, sko_list)
+               /* To avoid soft lockups conditionally reschedule */
+               list_for_each_entry_safe(sko, n, &sko_list, sko_list) {
                        kv_free(skc, sko->sko_addr, size);
+                       cond_resched();
+               }
        }
 
-       list_for_each_entry_safe(sks, m, &sks_list, sks_list)
+       /* To avoid soft lockups conditionally reschedule */
+       list_for_each_entry_safe(sks, m, &sks_list, sks_list) {
                kv_free(skc, sks, skc->skc_slab_size);
+               cond_resched();
+       }
 
        EXIT;
 }
@@ -937,11 +953,11 @@ spl_cache_age(void *data)
                spl_get_work_data(data, spl_kmem_cache_t, skc_work.work);
 
        ASSERT(skc->skc_magic == SKC_MAGIC);
-       spl_on_each_cpu(spl_magazine_age, skc, 1);
-       spl_slab_reclaim(skc, 0);
+       spl_slab_reclaim(skc, skc->skc_reap, 0);
+       spl_on_each_cpu(spl_magazine_age, skc, 0);
 
        if (!test_bit(KMC_BIT_DESTROY, &skc->skc_flags))
-               schedule_delayed_work(&skc->skc_work, 2 * skc->skc_delay * HZ);
+               schedule_delayed_work(&skc->skc_work, skc->skc_delay / 3 * HZ);
 }
 
 /*
@@ -1057,39 +1073,29 @@ spl_magazine_free(spl_kmem_magazine_t *skm)
        EXIT;
 }
 
-static void
-__spl_magazine_create(void *data)
-{
-        spl_kmem_cache_t *skc = data;
-       int id = smp_processor_id();
-
-       skc->skc_mag[id] = spl_magazine_alloc(skc, cpu_to_node(id));
-       ASSERT(skc->skc_mag[id]);
-}
-
 /*
  * Create all pre-cpu magazines of reasonable sizes.
  */
 static int
 spl_magazine_create(spl_kmem_cache_t *skc)
 {
+       int i;
        ENTRY;
 
        skc->skc_mag_size = spl_magazine_size(skc);
        skc->skc_mag_refill = (skc->skc_mag_size + 1) / 2;
-       spl_on_each_cpu(__spl_magazine_create, skc, 1);
 
-       RETURN(0);
-}
+       for_each_online_cpu(i) {
+               skc->skc_mag[i] = spl_magazine_alloc(skc, cpu_to_node(i));
+               if (!skc->skc_mag[i]) {
+                       for (i--; i >= 0; i--)
+                               spl_magazine_free(skc->skc_mag[i]);
 
-static void
-__spl_magazine_destroy(void *data)
-{
-        spl_kmem_cache_t *skc = data;
-       spl_kmem_magazine_t *skm = skc->skc_mag[smp_processor_id()];
+                       RETURN(-ENOMEM);
+               }
+       }
 
-       (void)spl_cache_flush(skc, skm, skm->skm_avail);
-       spl_magazine_free(skm);
+       RETURN(0);
 }
 
 /*
@@ -1098,8 +1104,16 @@ __spl_magazine_destroy(void *data)
 static void
 spl_magazine_destroy(spl_kmem_cache_t *skc)
 {
+       spl_kmem_magazine_t *skm;
+       int i;
        ENTRY;
-       spl_on_each_cpu(__spl_magazine_destroy, skc, 1);
+
+        for_each_online_cpu(i) {
+               skm = skc->skc_mag[i];
+               (void)spl_cache_flush(skc, skm, skm->skm_avail);
+               spl_magazine_free(skm);
+        }
+
        EXIT;
 }
 
@@ -1168,6 +1182,7 @@ spl_kmem_cache_create(char *name, size_t size, size_t align,
        skc->skc_obj_size = size;
        skc->skc_obj_align = SPL_KMEM_CACHE_ALIGN;
        skc->skc_delay = SPL_KMEM_CACHE_DELAY;
+       skc->skc_reap = SPL_KMEM_CACHE_REAP;
        atomic_set(&skc->skc_ref, 0);
 
        INIT_LIST_HEAD(&skc->skc_list);
@@ -1209,7 +1224,7 @@ spl_kmem_cache_create(char *name, size_t size, size_t align,
                GOTO(out, rc);
 
        spl_init_delayed_work(&skc->skc_work, spl_cache_age, skc);
-       schedule_delayed_work(&skc->skc_work, 2 * skc->skc_delay * HZ);
+       schedule_delayed_work(&skc->skc_work, skc->skc_delay / 3 * HZ);
 
        down_write(&spl_kmem_cache_sem);
        list_add_tail(&skc->skc_list, &spl_kmem_cache_list);
@@ -1249,7 +1264,7 @@ spl_kmem_cache_destroy(spl_kmem_cache_t *skc)
        wait_event(wq, atomic_read(&skc->skc_ref) == 0);
 
        spl_magazine_destroy(skc);
-       spl_slab_reclaim(skc, 1);
+       spl_slab_reclaim(skc, 0, 1);
        spin_lock(&skc->skc_lock);
 
        /* Validate there are no objects in use and free all the
@@ -1654,7 +1669,7 @@ spl_kmem_cache_reap_now(spl_kmem_cache_t *skc)
        if (skc->skc_reclaim)
                skc->skc_reclaim(skc->skc_private);
 
-       spl_slab_reclaim(skc, 0);
+       spl_slab_reclaim(skc, skc->skc_reap, 0);
        clear_bit(KMC_BIT_REAPING, &skc->skc_flags);
        atomic_dec(&skc->skc_ref);