granicus.if.org Git - zfs/commitdiff
Track emergency object in rbtree
author: Brian Behlendorf <behlendorf1@llnl.gov>
Tue, 30 Oct 2012 17:45:50 +0000 (10:45 -0700)
committer: Brian Behlendorf <behlendorf1@llnl.gov>
Tue, 6 Nov 2012 22:54:19 +0000 (14:54 -0800)
In the initial implementation emergency objects were tracked on a
per-cache list.  The assumption was that under normal operation we
would never allocate more than a handful of these objects.  So the
cost of walking the list during free was expected to be negligible.

However real world usage has shown that emergency objects tend to
be allocated in batches.  A deadlock will be detected and several
thousand emergency objects will be allocated before the original
blocked slab allocation can complete.

Therefore the original list has been replaced by a red black tree
which is sorted by the memory address of each allocated object.
This bounds the worst case insertion and removal time to O(log n),
which minimizes contention on the associated spin lock.

Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
include/sys/kmem.h
module/spl/spl-kmem.c

index b0f4208bdceb0f386368b1e4effebca51fe825cf..83adc8d2a36e875ce66fab709a6c462bd1eb4561 100644 (file)
@@ -31,6 +31,7 @@
 #include <linux/spinlock.h>
 #include <linux/rwsem.h>
 #include <linux/hash.h>
+#include <linux/rbtree.h>
 #include <linux/ctype.h>
 #include <asm/atomic.h>
 #include <sys/types.h>
@@ -435,8 +436,8 @@ typedef struct spl_kmem_alloc {
 } spl_kmem_alloc_t;
 
 typedef struct spl_kmem_emergency {
+       struct rb_node          ske_node;       /* Emergency tree linkage */
        void                    *ske_obj;       /* Buffer address */
-       struct list_head        ske_list;       /* Emergency list linkage */
 } spl_kmem_emergency_t;
 
 typedef struct spl_kmem_cache {
@@ -463,7 +464,7 @@ typedef struct spl_kmem_cache {
        struct list_head        skc_list;       /* List of caches linkage */
        struct list_head        skc_complete_list;/* Completely alloc'ed */
        struct list_head        skc_partial_list; /* Partially alloc'ed */
-       struct list_head        skc_emergency_list; /* Min sized objects */
+       struct rb_root          skc_emergency_tree; /* Min sized objects */
        spinlock_t              skc_lock;       /* Cache lock */
        wait_queue_head_t       skc_waitq;      /* Allocation waiters */
        uint64_t                skc_slab_fail;  /* Slab alloc failures */
index 045075cc033b533a9f6ae2cc844275305b623ab8..7e68522ad3ea87370a9ac2146162cbcb77b4b2ee 100644 (file)
@@ -1116,8 +1116,54 @@ spl_slab_reclaim(spl_kmem_cache_t *skc, int count, int flag)
        SEXIT;
 }
 
+static spl_kmem_emergency_t *
+spl_emergency_search(struct rb_root *root, void *obj)
+{      /* Find the entry whose ske_obj equals obj; NULL if none is found. */
+       struct rb_node *node = root->rb_node;
+       spl_kmem_emergency_t *ske;
+       unsigned long address = (unsigned long)obj;     /* Search key */
+
+       while (node) {
+               ske = container_of(node, spl_kmem_emergency_t, ske_node);
+
+               if (address < (unsigned long)ske->ske_obj)
+                       node = node->rb_left;   /* Key is lower: go left */
+               else if (address > (unsigned long)ske->ske_obj)
+                       node = node->rb_right;  /* Key is higher: go right */
+               else
+                       return ske;             /* Exact address match */
+       }
+
+       return NULL;    /* Ran out of nodes: no entry has this address */
+}
+
+static int
+spl_emergency_insert(struct rb_root *root, spl_kmem_emergency_t *ske)
+{      /* Link ske into the tree keyed by ske_obj; 1 if added, 0 if duplicate. */
+       struct rb_node **new = &(root->rb_node), *parent = NULL;
+       spl_kmem_emergency_t *ske_tmp;
+       unsigned long address = (unsigned long)ske->ske_obj;    /* Sort key */
+
+       while (*new) {  /* Descend until an empty child slot is reached */
+               ske_tmp = container_of(*new, spl_kmem_emergency_t, ske_node);
+
+               parent = *new;
+               if (address < (unsigned long)ske_tmp->ske_obj)
+                       new = &((*new)->rb_left);
+               else if (address > (unsigned long)ske_tmp->ske_obj)
+                       new = &((*new)->rb_right);
+               else
+                       return 0;       /* Address already present; nothing linked */
+       }
+
+       rb_link_node(&ske->ske_node, parent, new);      /* Attach at the empty slot */
+       rb_insert_color(&ske->ske_node, root);  /* Kernel rbtree rebalance step */
+
+       return 1;
+}
+
 /*
- * Allocate a single emergency object for use by the caller.
+ * Allocate a single emergency object and track it in a red black tree.
  */
 static int
 spl_emergency_alloc(spl_kmem_cache_t *skc, int flags, void **obj)
@@ -1143,48 +1189,49 @@ spl_emergency_alloc(spl_kmem_cache_t *skc, int flags, void **obj)
                SRETURN(-ENOMEM);
        }
 
-       if (skc->skc_ctor)
-               skc->skc_ctor(ske->ske_obj, skc->skc_private, flags);
-
        spin_lock(&skc->skc_lock);
-       skc->skc_obj_total++;
-       skc->skc_obj_emergency++;
-       if (skc->skc_obj_emergency > skc->skc_obj_emergency_max)
-               skc->skc_obj_emergency_max = skc->skc_obj_emergency;
-
-       list_add(&ske->ske_list, &skc->skc_emergency_list);
+       empty = spl_emergency_insert(&skc->skc_emergency_tree, ske);
+       if (likely(empty)) {
+               skc->skc_obj_total++;
+               skc->skc_obj_emergency++;
+               if (skc->skc_obj_emergency > skc->skc_obj_emergency_max)
+                       skc->skc_obj_emergency_max = skc->skc_obj_emergency;
+       }
        spin_unlock(&skc->skc_lock);
 
+       if (unlikely(!empty)) {
+               kfree(ske->ske_obj);
+               kfree(ske);
+               SRETURN(-EINVAL);
+       }
+
+       if (skc->skc_ctor)
+               skc->skc_ctor(ske->ske_obj, skc->skc_private, flags);
+
        *obj = ske->ske_obj;
 
        SRETURN(0);
 }
 
 /*
- * Free the passed object if it is an emergency object or a normal slab
- * object.  Currently this is done by walking what should be a short list of
- * emergency objects.  If this proves to be too inefficient we can replace
- * the simple list with a hash.
+ * Locate the passed object in the red black tree and free it.
  */
 static int
 spl_emergency_free(spl_kmem_cache_t *skc, void *obj)
 {
-       spl_kmem_emergency_t *m, *n, *ske = NULL;
+       spl_kmem_emergency_t *ske;
        SENTRY;
 
        spin_lock(&skc->skc_lock);
-       list_for_each_entry_safe(m, n, &skc->skc_emergency_list, ske_list) {
-               if (m->ske_obj == obj) {
-                       list_del(&m->ske_list);
-                       skc->skc_obj_emergency--;
-                       skc->skc_obj_total--;
-                       ske = m;
-                       break;
-               }
+       ske = spl_emergency_search(&skc->skc_emergency_tree, obj);
+       if (likely(ske)) {
+               rb_erase(&ske->ske_node, &skc->skc_emergency_tree);
+               skc->skc_obj_emergency--;
+               skc->skc_obj_total--;
        }
        spin_unlock(&skc->skc_lock);
 
-       if (ske == NULL)
+       if (unlikely(ske == NULL))
                SRETURN(-ENOENT);
 
        if (skc->skc_dtor)
@@ -1483,7 +1530,7 @@ spl_kmem_cache_create(char *name, size_t size, size_t align,
        INIT_LIST_HEAD(&skc->skc_list);
        INIT_LIST_HEAD(&skc->skc_complete_list);
        INIT_LIST_HEAD(&skc->skc_partial_list);
-       INIT_LIST_HEAD(&skc->skc_emergency_list);
+       skc->skc_emergency_tree = RB_ROOT;
        spin_lock_init(&skc->skc_lock);
        init_waitqueue_head(&skc->skc_waitq);
        skc->skc_slab_fail = 0;
@@ -1590,7 +1637,6 @@ spl_kmem_cache_destroy(spl_kmem_cache_t *skc)
        ASSERT3U(skc->skc_obj_total, ==, 0);
        ASSERT3U(skc->skc_obj_emergency, ==, 0);
        ASSERT(list_empty(&skc->skc_complete_list));
-       ASSERT(list_empty(&skc->skc_emergency_list));
 
        kmem_free(skc->skc_name, skc->skc_name_size);
        spin_unlock(&skc->skc_lock);