granicus.if.org Git - spl/commitdiff
Victory! I've reworked caches with large objects which are backed by vmalloc()'ed memory
author    behlendo <behlendo@7e1ea52c-4ff2-0310-8f11-9dd32ca42a1c>
Sat, 28 Jun 2008 05:04:46 +0000 (05:04 +0000)
committer behlendo <behlendo@7e1ea52c-4ff2-0310-8f11-9dd32ca42a1c>
Sat, 28 Jun 2008 05:04:46 +0000 (05:04 +0000)
Caches with large objects are now backed by vmalloc()'ed memory.  I
allocate one slab which is roughly 32 * spl_obj_size, and in this block
of memory I place the slab descriptor, the slab object descriptors, and
the objects themselves.  This greatly reduces vmalloc lock contention.

Some minor cleanup and fine tuning remain, but it's working pretty well.

git-svn-id: https://outreach.scidac.gov/svn/spl/trunk@139 7e1ea52c-4ff2-0310-8f11-9dd32ca42a1c
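
The single-allocation layout described above can be illustrated with a
minimal userspace sketch (not the actual SPL code): malloc() stands in for
vmem_alloc(), and the slab/obj_desc structs below are simplified stand-ins
for spl_kmem_slab_t and spl_kmem_obj_t, which spl_slab_alloc_vmem() in the
diff packs into one block the same way.

/* Sketch: carve one large allocation into the slab descriptor,
 * the object descriptors, and the object buffers. */
#include <stdio.h>
#include <stdlib.h>

#define OBJS_PER_SLAB 32                /* cf. SPL_KMEM_CACHE_OBJ_PER_SLAB */

struct obj_desc {                       /* simplified spl_kmem_obj_t */
        void *addr;                     /* object buffer inside the block */
};

struct slab {                           /* simplified spl_kmem_slab_t */
        struct obj_desc *descs;
        size_t objs;
};

static struct slab *slab_alloc(size_t obj_size)
{
        /* One allocation covers: slab header, N descriptors, N objects */
        size_t size = sizeof(struct slab) +
                      OBJS_PER_SLAB * (sizeof(struct obj_desc) + obj_size);
        char *base = malloc(size);      /* vmem_alloc() in the kernel */
        if (base == NULL)
                return NULL;

        struct slab *sks = (struct slab *)base;
        sks->objs  = OBJS_PER_SLAB;
        sks->descs = (struct obj_desc *)(base + sizeof(struct slab));

        char *obj_base = (char *)&sks->descs[OBJS_PER_SLAB];
        for (size_t i = 0; i < OBJS_PER_SLAB; i++)
                sks->descs[i].addr = obj_base + i * obj_size;

        return sks;                     /* whole slab freed with free(sks) */
}

int main(void)
{
        struct slab *sks = slab_alloc(8192);
        if (sks != NULL) {
                printf("slab %p, first object %p\n",
                       (void *)sks, sks->descs[0].addr);
                free(sks);
        }
        return 0;
}

Because everything lives in one region, the entire slab is released with a
single free() (vmem_free() in the real code), so building or tearing down a
slab takes one trip through the vmalloc lock instead of one per object.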

include/sys/kmem.h
modules/spl/spl-kmem.c
modules/splat/splat-kmem.c

index 47ac72e7727144a5bb20662627344783b179343d..3c17700526f84633981753e2922c7acb77c8293f 100644 (file)
@@ -485,7 +485,6 @@ typedef struct spl_kmem_magazine {
 
 typedef struct spl_kmem_obj {
         uint32_t               sko_magic;      /* Sanity magic */
-       uint32_t                sko_flags;      /* Per object flags */
        void                    *sko_addr;      /* Buffer address */
        struct spl_kmem_slab    *sko_slab;      /* Owned by slab */
        struct list_head        sko_list;       /* Free object list linkage */
index 0ee04a287267cc621a2f283ce28d990e2b83112a..be20c5b44f11dc2e2c4bbfb84551db57222d4534 100644 (file)
@@ -167,17 +167,9 @@ static struct shrinker spl_kmem_cache_shrinker = {
 };
 #endif
 
-static spl_kmem_slab_t *
-spl_slab_alloc(spl_kmem_cache_t *skc, int flags) {
-       spl_kmem_slab_t *sks;
-       spl_kmem_obj_t *sko, *n;
-       int i;
-       ENTRY;
-
-       sks = kmem_cache_alloc(spl_slab_cache, flags);
-       if (sks == NULL)
-               RETURN(sks);
-
+static void
+spl_slab_init(spl_kmem_cache_t *skc, spl_kmem_slab_t *sks)
+{
        sks->sks_magic = SKS_MAGIC;
        sks->sks_objs = SPL_KMEM_CACHE_OBJ_PER_SLAB;
        sks->sks_age = jiffies;
@@ -185,91 +177,201 @@ spl_slab_alloc(spl_kmem_cache_t *skc, int flags) {
        INIT_LIST_HEAD(&sks->sks_list);
        INIT_LIST_HEAD(&sks->sks_free_list);
        sks->sks_ref = 0;
+}
+
+static int
+spl_slab_alloc_kmem(spl_kmem_cache_t *skc, spl_kmem_slab_t *sks, int flags)
+{
+       spl_kmem_obj_t *sko, *n;
+       int i, rc = 0;
 
+       /* This is based on the linux slab cache for now simply because
+        * it means I get slab coloring, hardware cache alignment, etc
+        * for free.  There's no reason we can't do this ourselves.  And
+        * we probably should in the future.  For now I'll just
+        * leverage the existing linux slab here. */
        for (i = 0; i < sks->sks_objs; i++) {
                sko = kmem_cache_alloc(spl_obj_cache, flags);
                if (sko == NULL) {
-out_alloc:
-                       /* Unable to fully construct slab, objects,
-                        * and object data buffers unwind everything.
-                        */
-                       list_for_each_entry_safe(sko, n, &sks->sks_free_list,
-                                                sko_list) {
-                               ASSERT(sko->sko_magic == SKO_MAGIC);
-                               vmem_free(sko->sko_addr, skc->skc_obj_size);
-                               list_del(&sko->sko_list);
-                               kmem_cache_free(spl_obj_cache, sko);
-                       }
-
-                       kmem_cache_free(spl_slab_cache, sks);
-                       GOTO(out, sks = NULL);
+                       rc = -ENOMEM;
+                       break;
                }
 
-               /* Objects less than a page can use kmem_alloc() and avoid
-                * the locking overhead in __get_vm_area_node() when locking
-                * for a free address.  For objects over a page we use
-                * vmem_alloc() because it is usually worth paying this
-                * overhead to avoid the need to find contigeous pages.
-                * This should give us the best of both worlds. */
-               if (skc->skc_obj_size <= PAGE_SIZE)
-                       sko->sko_addr = kmem_alloc(skc->skc_obj_size, flags);
-               else
-                       sko->sko_addr = vmem_alloc(skc->skc_obj_size, flags);
-
+               sko->sko_addr = kmem_alloc(skc->skc_obj_size, flags);
                if (sko->sko_addr == NULL) {
                        kmem_cache_free(spl_obj_cache, sko);
-                       GOTO(out_alloc, sks = NULL);
+                       rc = -ENOMEM;
+                       break;
                }
 
                sko->sko_magic = SKO_MAGIC;
-               sko->sko_flags = 0;
                sko->sko_slab = sks;
                INIT_LIST_HEAD(&sko->sko_list);
                INIT_HLIST_NODE(&sko->sko_hlist);
                list_add(&sko->sko_list, &sks->sks_free_list);
        }
+
+       /* Unable to fully construct slab, unwind everything */
+       if (rc) {
+               list_for_each_entry_safe(sko, n, &sks->sks_free_list, sko_list) {
+                       ASSERT(sko->sko_magic == SKO_MAGIC);
+                       kmem_free(sko->sko_addr, skc->skc_obj_size);
+                       list_del(&sko->sko_list);
+                       kmem_cache_free(spl_obj_cache, sko);
+               }
+       }
+
+       RETURN(rc);
+}
+
+static spl_kmem_slab_t *
+spl_slab_alloc_vmem(spl_kmem_cache_t *skc, int flags)
+{
+       spl_kmem_slab_t *sks;
+       spl_kmem_obj_t *sko, *sko_base;
+       void *slab, *obj, *obj_base;
+       int i, size;
+
+       /* For large vmem_alloc'ed buffers it's important that we pack the
+        * spl_kmem_obj_t structure and the actual objects in to one large
+        * virtual address zone to minimize the number of calls to
+        * vmalloc().  Mapping the virtual address is done under a single
+        * global lock which walks a list of all virtual zones.  So doing
+        * lots of allocations simply results in lock contention and a
+        * longer list of mapped addresses.  It is far better to do a
+        * few large allocations and then subdivide it ourselves.  The
+        * large vmem_alloc'ed space is divided as follows:
+        *
+        * 1 slab struct: sizeof(spl_kmem_slab_t)
+        * N obj structs: sizeof(spl_kmem_obj_t) * skc->skc_objs
+        * N objects:     skc->skc_obj_size * skc->skc_objs
+        *
+        * XXX: It would probably be a good idea to more carefully
+        *      align the starts of these objects in memory.
+        */
+       size = sizeof(spl_kmem_slab_t) + SPL_KMEM_CACHE_OBJ_PER_SLAB *
+              (skc->skc_obj_size + sizeof(spl_kmem_obj_t));
+
+       slab = vmem_alloc(size, flags);
+       if (slab == NULL)
+               RETURN(NULL);
+
+       sks = (spl_kmem_slab_t *)slab;
+       spl_slab_init(skc, sks);
+
+       sko_base = (spl_kmem_obj_t *)(slab + sizeof(spl_kmem_slab_t));
+       obj_base = (void *)sko_base + sizeof(spl_kmem_obj_t) * sks->sks_objs;
+
+       for (i = 0; i < sks->sks_objs; i++) {
+               sko = &sko_base[i];
+               obj = obj_base + skc->skc_obj_size * i;
+               sko->sko_addr = obj;
+               sko->sko_magic = SKO_MAGIC;
+               sko->sko_slab = sks;
+               INIT_LIST_HEAD(&sko->sko_list);
+               INIT_HLIST_NODE(&sko->sko_hlist);
+               list_add_tail(&sko->sko_list, &sks->sks_free_list);
+       }
+
+       RETURN(sks);
+}
+
+static spl_kmem_slab_t *
+spl_slab_alloc(spl_kmem_cache_t *skc, int flags) {
+       spl_kmem_slab_t *sks;
+       spl_kmem_obj_t *sko;
+       int rc;
+       ENTRY;
+
+       /* Objects less than a page can use kmem_alloc() and avoid
+        * the locking overhead in __get_vm_area_node() when locking
+        * for a free address.  For objects over a page we use
+        * vmem_alloc() because it is usually worth paying this
+        * overhead to avoid the need to find contiguous pages.
+        * This should give us the best of both worlds. */
+       if (skc->skc_obj_size <= PAGE_SIZE) {
+               sks = kmem_cache_alloc(spl_slab_cache, flags);
+               if (sks == NULL)
+                       GOTO(out, sks = NULL);
+
+               spl_slab_init(skc, sks);
+
+               rc = spl_slab_alloc_kmem(skc, sks, flags);
+               if (rc) {
+                       kmem_cache_free(spl_slab_cache, sks);
+                       GOTO(out, sks = NULL);
+               }
+       } else {
+               sks = spl_slab_alloc_vmem(skc, flags);
+               if (sks == NULL)
+                       GOTO(out, sks = NULL);
+       }
+
+       ASSERT(sks);
+       list_for_each_entry(sko, &sks->sks_free_list, sko_list)
+               if (skc->skc_ctor)
+                       skc->skc_ctor(sko->sko_addr, skc->skc_private, flags);
 out:
        RETURN(sks);
 }
 
+static void
+spl_slab_free_kmem(spl_kmem_cache_t *skc, spl_kmem_slab_t *sks)
+{
+       spl_kmem_obj_t *sko, *n;
+
+       ASSERT(skc->skc_magic == SKC_MAGIC);
+       ASSERT(sks->sks_magic == SKS_MAGIC);
+
+       list_for_each_entry_safe(sko, n, &sks->sks_free_list, sko_list) {
+               ASSERT(sko->sko_magic == SKO_MAGIC);
+               kmem_free(sko->sko_addr, skc->skc_obj_size);
+               list_del(&sko->sko_list);
+               kmem_cache_free(spl_obj_cache, sko);
+       }
+
+       kmem_cache_free(spl_slab_cache, sks);
+}
+
+static void
+spl_slab_free_vmem(spl_kmem_cache_t *skc, spl_kmem_slab_t *sks)
+{
+       ASSERT(skc->skc_magic == SKC_MAGIC);
+       ASSERT(sks->sks_magic == SKS_MAGIC);
+
+       vmem_free(sks, sizeof(spl_kmem_slab_t) + SPL_KMEM_CACHE_OBJ_PER_SLAB *
+                 (skc->skc_obj_size + sizeof(spl_kmem_obj_t)));
+}
+
 /* Removes slab from complete or partial list, so it must
  * be called with the 'skc->skc_lock' held.
- *                         */
+ */
 static void
 spl_slab_free(spl_kmem_slab_t *sks) {
        spl_kmem_cache_t *skc;
        spl_kmem_obj_t *sko, *n;
-       int i = 0;
        ENTRY;
 
        ASSERT(sks->sks_magic == SKS_MAGIC);
        ASSERT(sks->sks_ref == 0);
-       skc = sks->sks_cache;
-       skc->skc_obj_total -= sks->sks_objs;
-       skc->skc_slab_total--;
 
+       skc = sks->sks_cache;
+       ASSERT(skc->skc_magic == SKC_MAGIC);
        ASSERT(spin_is_locked(&skc->skc_lock));
 
-       list_for_each_entry_safe(sko, n, &sks->sks_free_list, sko_list) {
-               ASSERT(sko->sko_magic == SKO_MAGIC);
+       skc->skc_obj_total -= sks->sks_objs;
+       skc->skc_slab_total--;
+       list_del(&sks->sks_list);
 
-               /* Run destructors for being freed */
+       /* Run destructors, the slab is being released */
+       list_for_each_entry_safe(sko, n, &sks->sks_free_list, sko_list)
                if (skc->skc_dtor)
                        skc->skc_dtor(sko->sko_addr, skc->skc_private);
 
-               if (skc->skc_obj_size <= PAGE_SIZE)
-                       kmem_free(sko->sko_addr, skc->skc_obj_size);
-               else
-                       vmem_free(sko->sko_addr, skc->skc_obj_size);
-
-               list_del(&sko->sko_list);
-               kmem_cache_free(spl_obj_cache, sko);
-               i++;
-       }
-
-       ASSERT(sks->sks_objs == i);
-       list_del(&sks->sks_list);
-       kmem_cache_free(spl_slab_cache, sks);
+       if (skc->skc_obj_size <= PAGE_SIZE)
+               spl_slab_free_kmem(skc, sks);
+       else
+               spl_slab_free_vmem(skc, sks);
 
        EXIT;
 }
@@ -629,14 +731,13 @@ static spl_kmem_slab_t *
 spl_cache_grow(spl_kmem_cache_t *skc, int flags)
 {
        spl_kmem_slab_t *sks;
-       spl_kmem_obj_t *sko;
        cycles_t start;
        ENTRY;
 
        ASSERT(skc->skc_magic == SKC_MAGIC);
 
        if (flags & __GFP_WAIT) {
-//             flags |= __GFP_NOFAIL; /* XXX: Solaris assumes this */
+               flags |= __GFP_NOFAIL;
                might_sleep();
                local_irq_enable();
        }
@@ -649,14 +750,6 @@ spl_cache_grow(spl_kmem_cache_t *skc, int flags)
                RETURN(NULL);
        }
 
-       /* Run all the constructors now that the slab is fully allocated */
-       list_for_each_entry(sko, &sks->sks_free_list, sko_list) {
-               ASSERT(sko->sko_magic == SKO_MAGIC);
-
-               if (skc->skc_ctor)
-                       skc->skc_ctor(sko->sko_addr, skc->skc_private, flags);
-       }
-
        if (flags & __GFP_WAIT)
                local_irq_disable();
 
@@ -697,7 +790,7 @@ spl_cache_refill(spl_kmem_cache_t *skc, spl_kmem_magazine_t *skm, int flags)
                if (list_empty(&skc->skc_partial_list)) {
                        spin_unlock(&skc->skc_lock);
 
-                       if (unlikely((get_cycles() - start) > skc->skc_lock_refill))
+                       if (unlikely((get_cycles()-start)>skc->skc_lock_refill))
                                skc->skc_lock_refill = get_cycles() - start;
 
                        sks = spl_cache_grow(skc, flags);
@@ -861,6 +954,7 @@ restart:
        }
 
        local_irq_restore(irq_flags);
+       ASSERT(obj);
 
        /* Pre-emptively migrate object to CPU L1 cache */
        prefetchw(obj);
index de9b36841ac98dc93617bff4c16bbde2538e69f0..49715152d8a4f11d93338ab8c1770f21bf01376e 100644 (file)
@@ -559,36 +559,36 @@ splat_kmem_test8_count(kmem_cache_priv_t *kcp, int threads)
  * eyeball the slab cache locking overhead to ensure it is reasonable.
  */
 static int
-splat_kmem_test8(struct file *file, void *arg)
+splat_kmem_test8_sc(struct file *file, void *arg, int size, int count)
 {
        kmem_cache_priv_t kcp;
        kthread_t *thr;
        struct timespec start, stop, delta;
-       char cache_name[16];
-       int alloc, i;
+       char cache_name[32];
+       int i, j, threads = 32;
 
        kcp.kcp_magic = SPLAT_KMEM_TEST_MAGIC;
        kcp.kcp_file = file;
 
-        splat_vprint(file, SPLAT_KMEM_TEST8_NAME, "%s",
+        splat_vprint(file, SPLAT_KMEM_TEST8_NAME, "%-22s  %s", "name",
                     "time (sec)\tslabs       \tobjs        \thash\n");
-        splat_vprint(file, SPLAT_KMEM_TEST8_NAME, "%s",
+        splat_vprint(file, SPLAT_KMEM_TEST8_NAME, "%-22s  %s", "",
                     "          \ttot/max/calc\ttot/max/calc\tsize/depth\n");
 
-       for (alloc = 1; alloc <= 4096; alloc *= 2) {
-               kcp.kcp_size = 256;
+       for (i = 1; i <= count; i *= 2) {
+               kcp.kcp_size = size;
                kcp.kcp_count = 0;
                kcp.kcp_threads = 0;
-               kcp.kcp_alloc = alloc;
+               kcp.kcp_alloc = i;
                kcp.kcp_rc = 0;
                spin_lock_init(&kcp.kcp_lock);
                init_waitqueue_head(&kcp.kcp_waitq);
 
-               sprintf(cache_name, "%s-%d", SPLAT_KMEM_CACHE_NAME, alloc);
+               sprintf(cache_name, "%s-%d-%d", SPLAT_KMEM_CACHE_NAME, size, i);
                kcp.kcp_cache = kmem_cache_create(cache_name, kcp.kcp_size, 0,
-                                                 splat_kmem_cache_test_constructor,
-                                                 splat_kmem_cache_test_destructor,
-                                                 NULL, &kcp, NULL, 0);
+                                         splat_kmem_cache_test_constructor,
+                                         splat_kmem_cache_test_destructor,
+                                         NULL, &kcp, NULL, 0);
                if (!kcp.kcp_cache) {
                        splat_vprint(file, SPLAT_KMEM_TEST8_NAME,
                                     "Unable to create '%s' cache\n",
@@ -598,7 +598,7 @@ splat_kmem_test8(struct file *file, void *arg)
 
                start = current_kernel_time();
 
-               for (i = 0; i < 32; i++) {
+               for (j = 0; j < threads; j++) {
                        thr = thread_create(NULL, 0, splat_kmem_test8_thread,
                                            &kcp, 0, &p0, TS_RUN, minclsyspri);
                        ASSERT(thr != NULL);
@@ -610,15 +610,17 @@ splat_kmem_test8(struct file *file, void *arg)
                stop = current_kernel_time();
                delta = timespec_sub(stop, start);
 
-               splat_vprint(file, SPLAT_KMEM_TEST8_NAME, "%2ld.%09ld\t"
+               splat_vprint(file, SPLAT_KMEM_TEST8_NAME, "%-22s %2ld.%09ld\t"
                             "%lu/%lu/%lu\t%lu/%lu/%lu\t%lu/%lu\n",
+                            kcp.kcp_cache->skc_name,
                             delta.tv_sec, delta.tv_nsec,
                             (unsigned long)kcp.kcp_cache->skc_slab_total,
                             (unsigned long)kcp.kcp_cache->skc_slab_max,
-                            (unsigned long)(kcp.kcp_alloc * 32 / SPL_KMEM_CACHE_OBJ_PER_SLAB),
+                            (unsigned long)(kcp.kcp_alloc * threads /
+                                           SPL_KMEM_CACHE_OBJ_PER_SLAB),
                             (unsigned long)kcp.kcp_cache->skc_obj_total,
                             (unsigned long)kcp.kcp_cache->skc_obj_max,
-                            (unsigned long)(kcp.kcp_alloc * 32),
+                            (unsigned long)(kcp.kcp_alloc * threads),
                             (unsigned long)kcp.kcp_cache->skc_hash_size,
                             (unsigned long)kcp.kcp_cache->skc_hash_depth);
 
@@ -631,6 +633,22 @@ splat_kmem_test8(struct file *file, void *arg)
        return kcp.kcp_rc;
 }
 
+static int
+splat_kmem_test8(struct file *file, void *arg)
+{
+       int i, rc = 0;
+
+       /* Run through the slab cache with object sizes from
+        * 16 bytes to 1MB in 4x multiples, 1024 objects each */
+       for (i = 16; i <= 1024*1024; i *= 4) {
+               rc = splat_kmem_test8_sc(file, arg, i, 1024);
+               if (rc)
+                       break;
+       }
+
+       return rc;
+}
+
 splat_subsystem_t *
 splat_kmem_init(void)
 {