]> granicus.if.org Git - zfs/commitdiff
Illumos 4873 - zvol unmap calls can take a very long time for larger datasets
authorAlex Reece <alex@delphix.com>
Fri, 3 Apr 2015 03:14:28 +0000 (14:14 +1100)
committerBrian Behlendorf <behlendorf1@llnl.gov>
Tue, 28 Apr 2015 23:24:03 +0000 (16:24 -0700)
4873 zvol unmap calls can take a very long time for larger datasets
Author: Alex Reece <alex@delphix.com>
Reviewed by: George Wilson <george@delphix.com>
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed by: Paul Dagnelie <paul.dagnelie@delphix.com>
Reviewed by: Basil Crow <basil.crow@delphix.com>
Reviewed by: Dan McDonald <danmcd@omniti.com>
Approved by: Robert Mustacchi <rm@joyent.com>

References:
  https://www.illumos.org/issues/4873
  https://github.com/illumos/illumos-gate/commit/0f6d88a

Porting Notes:

dbuf_free_range():
  - reduce stack usage using kmem_alloc()
  - the sorted AVL tree will handle the spill block case correctly
    without all the special handling in the for() loop

Ported-by: Chris Dunlop <chris@onthe.net.au>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
include/sys/avl.h
include/sys/dbuf.h
include/sys/dnode.h
module/avl/avl.c
module/zfs/dbuf.c
module/zfs/dnode.c
module/zfs/dnode_sync.c

index ba305c9082392ee7b970b2032704c8ea4a74df9c..9bc2b4a32e0e21d5c62a9161281b9168d2df1288 100644 (file)
  * Use is subject to license terms.
  */
 
+/*
+ * Copyright (c) 2014 by Delphix. All rights reserved.
+ */
+
 #ifndef        _AVL_H
 #define        _AVL_H
 
@@ -259,6 +263,11 @@ extern boolean_t avl_update(avl_tree_t *, void *);
 extern boolean_t avl_update_lt(avl_tree_t *, void *);
 extern boolean_t avl_update_gt(avl_tree_t *, void *);
 
+/*
+ * Swaps the contents of the two trees.
+ */
+extern void avl_swap(avl_tree_t *tree1, avl_tree_t *tree2);
+
 /*
  * Return the number of nodes in the tree
  */
index 1eabfd7daccfbd37df7524691a7e21f885e05204..2f593bb4d666823986d227212586804461d00c5f 100644 (file)
@@ -20,7 +20,7 @@
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2013 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
  * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
  */
 
@@ -213,11 +213,14 @@ typedef struct dmu_buf_impl {
        /* pointer to most recent dirty record for this buffer */
        dbuf_dirty_record_t *db_last_dirty;
 
+       /* Creation time of dbuf (see comment in dbuf_compare). */
+       hrtime_t db_creation;
+
        /*
         * Our link on the owner dnodes's dn_dbufs list.
         * Protected by its dn_dbufs_mtx.
         */
-       list_node_t db_link;
+       avl_node_t db_link;
 
        /* Data which is unique to data (leaf) blocks: */
 
index b63549b4675320199e3644f87d2e6f2c081e49eb..8a72722eb2798cc756c7976bdb04aa25b51df56b 100644 (file)
@@ -233,7 +233,7 @@ typedef struct dnode {
        refcount_t dn_holds;
 
        kmutex_t dn_dbufs_mtx;
-       list_t dn_dbufs;                /* descendent dbufs */
+       avl_tree_t dn_dbufs;            /* descendent dbufs */
 
        /* protected by dn_struct_rwlock */
        struct dmu_buf_impl *dn_bonus;  /* bonus buffer dbuf */
index f9971da20a0fb50e1fdf98c513aa5ed91c187fd0..46bf49d8c279811301a156c51bcc0ead3b80690b 100644 (file)
  * Use is subject to license terms.
  */
 
+/*
+ * Copyright (c) 2014 by Delphix. All rights reserved.
+ */
+
 /*
  * AVL - generic AVL tree implementation for kernel use
  *
  *       is a modified "avl_node_t *".  The bottom bit (normally 0 for a
  *       pointer) is set to indicate if that the new node has a value greater
  *       than the value of the indicated "avl_node_t *".
+ *
+ * Note - in addition to userland (e.g. libavl and libutil) and the kernel
+ * (e.g. genunix), avl.c is compiled into ld.so and kmdb's genunix module,
+ * which each have their own compilation environments and subsequent
+ * requirements. Each of these environments must be considered when adding
+ * dependencies from avl.c.
  */
 
 #include <sys/types.h>
@@ -863,6 +873,24 @@ avl_update(avl_tree_t *t, void *obj)
        return (B_FALSE);
 }
 
+void
+avl_swap(avl_tree_t *tree1, avl_tree_t *tree2)
+{
+       avl_node_t *temp_node;
+       ulong_t temp_numnodes;
+
+       ASSERT3P(tree1->avl_compar, ==, tree2->avl_compar);
+       ASSERT3U(tree1->avl_offset, ==, tree2->avl_offset);
+       ASSERT3U(tree1->avl_size, ==, tree2->avl_size);
+
+       temp_node = tree1->avl_root;
+       temp_numnodes = tree1->avl_numnodes;
+       tree1->avl_root = tree2->avl_root;
+       tree1->avl_numnodes = tree2->avl_numnodes;
+       tree2->avl_root = temp_node;
+       tree2->avl_numnodes = temp_numnodes;
+}
+
 /*
  * initialize a new AVL tree
  */
@@ -1058,6 +1086,8 @@ EXPORT_SYMBOL(avl_first);
 EXPORT_SYMBOL(avl_last);
 EXPORT_SYMBOL(avl_nearest);
 EXPORT_SYMBOL(avl_add);
+EXPORT_SYMBOL(avl_swap);
+EXPORT_SYMBOL(avl_is_empty);
 EXPORT_SYMBOL(avl_remove);
 EXPORT_SYMBOL(avl_numnodes);
 EXPORT_SYMBOL(avl_destroy_nodes);
index f5327a34aa4432e0f3ac8ac0ac9f28b782e2aacc..e9c8580fc33a8c9e83a26481a9b57518837dd8e5 100644 (file)
@@ -93,7 +93,9 @@ dbuf_cons(void *vdb, void *unused, int kmflag)
        mutex_init(&db->db_mtx, NULL, MUTEX_DEFAULT, NULL);
        cv_init(&db->db_changed, NULL, CV_DEFAULT, NULL);
        refcount_create(&db->db_holds);
-       list_link_init(&db->db_link);
+
+       db->db_creation = gethrtime();
+
        return (0);
 }
 
@@ -386,7 +388,7 @@ dbuf_verify(dmu_buf_impl_t *db)
                ASSERT3U(db->db_level, <, dn->dn_nlevels);
                ASSERT(db->db_blkid == DMU_BONUS_BLKID ||
                    db->db_blkid == DMU_SPILL_BLKID ||
-                   !list_is_empty(&dn->dn_dbufs));
+                   !avl_is_empty(&dn->dn_dbufs));
        }
        if (db->db_blkid == DMU_BONUS_BLKID) {
                ASSERT(dn != NULL);
@@ -866,23 +868,34 @@ dbuf_unoverride(dbuf_dirty_record_t *dr)
  * receive; see comment below for details.
  */
 void
-dbuf_free_range(dnode_t *dn, uint64_t start, uint64_t end, dmu_tx_t *tx)
+dbuf_free_range(dnode_t *dn, uint64_t start_blkid, uint64_t end_blkid,
+    dmu_tx_t *tx)
 {
-       dmu_buf_impl_t *db, *db_next;
+       dmu_buf_impl_t *db, *db_next, *db_search;
        uint64_t txg = tx->tx_txg;
+       avl_index_t where;
        boolean_t freespill =
-           (start == DMU_SPILL_BLKID || end == DMU_SPILL_BLKID);
+           (start_blkid == DMU_SPILL_BLKID || end_blkid == DMU_SPILL_BLKID);
+
+       if (end_blkid > dn->dn_maxblkid && !freespill)
+               end_blkid = dn->dn_maxblkid;
+       dprintf_dnode(dn, "start=%llu end=%llu\n", start_blkid, end_blkid);
 
-       if (end > dn->dn_maxblkid && !freespill)
-               end = dn->dn_maxblkid;
-       dprintf_dnode(dn, "start=%llu end=%llu\n", start, end);
+       db_seach = kmem_alloc(sizeof (dmu_buf_impl_t), KM_SLEEP);
+       db_search->db_level = 0;
+       db_search->db_blkid = start_blkid;
+       db_search->db_creation = 0;
 
        mutex_enter(&dn->dn_dbufs_mtx);
-       if (start >= dn->dn_unlisted_l0_blkid * dn->dn_datablksz &&
-           !freespill) {
+       if (start_blkid >= dn->dn_unlisted_l0_blkid && !freespill) {
                /* There can't be any dbufs in this range; no need to search. */
-               mutex_exit(&dn->dn_dbufs_mtx);
-               return;
+#ifdef DEBUG
+               db = avl_find(&dn->dn_dbufs, db_search, &where);
+               ASSERT3P(db, ==, NULL);
+               db = avl_nearest(&dn->dn_dbufs, where, AVL_AFTER);
+               ASSERT(db == NULL || db->db_level > 0);
+#endif
+               goto out;
        } else if (dmu_objset_is_receiving(dn->dn_objset)) {
                /*
                 * If we are receiving, we expect there to be no dbufs in
@@ -894,19 +907,18 @@ dbuf_free_range(dnode_t *dn, uint64_t start, uint64_t end, dmu_tx_t *tx)
                atomic_inc_64(&zfs_free_range_recv_miss);
        }
 
-       for (db = list_head(&dn->dn_dbufs); db != NULL; db = db_next) {
-               db_next = list_next(&dn->dn_dbufs, db);
+       db = avl_find(&dn->dn_dbufs, db_search, &where);
+       ASSERT3P(db, ==, NULL);
+       db = avl_nearest(&dn->dn_dbufs, where, AVL_AFTER);
+
+       for (; db != NULL; db = db_next) {
+               db_next = AVL_NEXT(&dn->dn_dbufs, db);
                ASSERT(db->db_blkid != DMU_BONUS_BLKID);
 
-               /* Skip indirect blocks. */
-               if (db->db_level != 0)
-                       continue;
-               /* Skip direct blocks outside the range. */
-               if (!freespill && (db->db_blkid < start || db->db_blkid > end))
-                       continue;
-               /* Skip all direct blocks, only free spill blocks. */
-               if (freespill && (db->db_blkid != DMU_SPILL_BLKID))
-                       continue;
+               if (db->db_level != 0 || db->db_blkid > end_blkid) {
+                       break;
+               }
+               ASSERT3U(db->db_blkid, >=, start_blkid);
 
                /* found a level 0 buffer in the range */
                mutex_enter(&db->db_mtx);
@@ -968,6 +980,9 @@ dbuf_free_range(dnode_t *dn, uint64_t start, uint64_t end, dmu_tx_t *tx)
 
                mutex_exit(&db->db_mtx);
        }
+
+out:
+       kmem_free(db_search, sizeof (dmu_buf_impl_t));
        mutex_exit(&dn->dn_dbufs_mtx);
 }
 
@@ -1657,7 +1672,7 @@ dbuf_clear(dmu_buf_impl_t *db)
        dn = DB_DNODE(db);
        dndb = dn->dn_dbuf;
        if (db->db_blkid != DMU_BONUS_BLKID && MUTEX_HELD(&dn->dn_dbufs_mtx)) {
-               list_remove(&dn->dn_dbufs, db);
+               avl_remove(&dn->dn_dbufs, db);
                atomic_dec_32(&dn->dn_dbufs_count);
                membar_producer();
                DB_DNODE_EXIT(db);
@@ -1829,7 +1844,7 @@ dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid,
                mutex_exit(&dn->dn_dbufs_mtx);
                return (odb);
        }
-       list_insert_head(&dn->dn_dbufs, db);
+       avl_add(&dn->dn_dbufs, db);
        if (db->db_level == 0 && db->db_blkid >=
            dn->dn_unlisted_l0_blkid)
                dn->dn_unlisted_l0_blkid = db->db_blkid + 1;
@@ -1888,7 +1903,7 @@ dbuf_destroy(dmu_buf_impl_t *db)
                        DB_DNODE_ENTER(db);
                        dn = DB_DNODE(db);
                        mutex_enter(&dn->dn_dbufs_mtx);
-                       list_remove(&dn->dn_dbufs, db);
+                       avl_remove(&dn->dn_dbufs, db);
                        atomic_dec_32(&dn->dn_dbufs_count);
                        mutex_exit(&dn->dn_dbufs_mtx);
                        DB_DNODE_EXIT(db);
@@ -1906,7 +1921,6 @@ dbuf_destroy(dmu_buf_impl_t *db)
        db->db_parent = NULL;
        db->db_buf = NULL;
 
-       ASSERT(!list_link_active(&db->db_link));
        ASSERT(db->db.db_data == NULL);
        ASSERT(db->db_hash_next == NULL);
        ASSERT(db->db_blkptr == NULL);
index 815696f702180ffb4080c5617fee923cc9633150..41722f25c0d8d273fd057daede61b3ac1d5abf1d 100644 (file)
@@ -62,6 +62,43 @@ int zfs_default_ibs = DN_MAX_INDBLKSHIFT;
 static kmem_cbrc_t dnode_move(void *, void *, size_t, void *);
 #endif /* _KERNEL */
 
+static int
+dbuf_compare(const void *x1, const void *x2)
+{
+       const dmu_buf_impl_t *d1 = x1;
+       const dmu_buf_impl_t *d2 = x2;
+
+       if (d1->db_level < d2->db_level) {
+               return (-1);
+       } else if (d1->db_level > d2->db_level) {
+               return (1);
+       }
+
+       if (d1->db_blkid < d2->db_blkid) {
+               return (-1);
+       } else if (d1->db_blkid > d2->db_blkid) {
+               return (1);
+       }
+
+       /*
+        * If a dbuf is being evicted while dn_dbufs_mutex is not held, we set
+        * the db_state to DB_EVICTING but do not remove it from dn_dbufs. If
+        * another thread creates a dbuf of the same blkid before the dbuf is
+        * removed from dn_dbufs, we can reach a state where there are two
+        * dbufs of the same blkid and level in db_dbufs. To maintain the avl
+        * invariant that there cannot be duplicate items, we distinguish
+        * between these two dbufs based on the time they were created.
+        */
+       if (d1->db_creation < d2->db_creation) {
+               return (-1);
+       } else if (d1->db_creation > d2->db_creation) {
+               return (1);
+       } else {
+               ASSERT3P(d1, ==, d2);
+               return (0);
+       }
+}
+
 /* ARGSUSED */
 static int
 dnode_cons(void *arg, void *unused, int kmflag)
@@ -116,7 +153,7 @@ dnode_cons(void *arg, void *unused, int kmflag)
 
        dn->dn_dbufs_count = 0;
        dn->dn_unlisted_l0_blkid = 0;
-       list_create(&dn->dn_dbufs, sizeof (dmu_buf_impl_t),
+       avl_create(&dn->dn_dbufs, dbuf_compare, sizeof (dmu_buf_impl_t),
            offsetof(dmu_buf_impl_t, db_link));
 
        dn->dn_moved = 0;
@@ -169,7 +206,7 @@ dnode_dest(void *arg, void *unused)
 
        ASSERT0(dn->dn_dbufs_count);
        ASSERT0(dn->dn_unlisted_l0_blkid);
-       list_destroy(&dn->dn_dbufs);
+       avl_destroy(&dn->dn_dbufs);
 }
 
 void
@@ -503,7 +540,7 @@ dnode_allocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, int ibs,
        ASSERT0(dn->dn_assigned_txg);
        ASSERT(refcount_is_zero(&dn->dn_tx_holds));
        ASSERT3U(refcount_count(&dn->dn_holds), <=, 1);
-       ASSERT3P(list_head(&dn->dn_dbufs), ==, NULL);
+       ASSERT(avl_is_empty(&dn->dn_dbufs));
 
        for (i = 0; i < TXG_SIZE; i++) {
                ASSERT0(dn->dn_next_nblkptr[i]);
@@ -689,8 +726,8 @@ dnode_move_impl(dnode_t *odn, dnode_t *ndn)
        ndn->dn_dirtyctx_firstset = odn->dn_dirtyctx_firstset;
        ASSERT(refcount_count(&odn->dn_tx_holds) == 0);
        refcount_transfer(&ndn->dn_holds, &odn->dn_holds);
-       ASSERT(list_is_empty(&ndn->dn_dbufs));
-       list_move_tail(&ndn->dn_dbufs, &odn->dn_dbufs);
+       ASSERT(avl_is_empty(&ndn->dn_dbufs));
+       avl_swap(&ndn->dn_dbufs, &odn->dn_dbufs);
        ndn->dn_dbufs_count = odn->dn_dbufs_count;
        ndn->dn_unlisted_l0_blkid = odn->dn_unlisted_l0_blkid;
        ndn->dn_bonus = odn->dn_bonus;
@@ -724,7 +761,7 @@ dnode_move_impl(dnode_t *odn, dnode_t *ndn)
         */
        odn->dn_dbuf = NULL;
        odn->dn_handle = NULL;
-       list_create(&odn->dn_dbufs, sizeof (dmu_buf_impl_t),
+       avl_create(&odn->dn_dbufs, dbuf_compare, sizeof (dmu_buf_impl_t),
            offsetof(dmu_buf_impl_t, db_link));
        odn->dn_dbufs_count = 0;
        odn->dn_unlisted_l0_blkid = 0;
@@ -1238,7 +1275,8 @@ dnode_setdirty(dnode_t *dn, dmu_tx_t *tx)
                return;
        }
 
-       ASSERT(!refcount_is_zero(&dn->dn_holds) || list_head(&dn->dn_dbufs));
+       ASSERT(!refcount_is_zero(&dn->dn_holds) ||
+           !avl_is_empty(&dn->dn_dbufs));
        ASSERT(dn->dn_datablksz != 0);
        ASSERT0(dn->dn_next_bonuslen[txg&TXG_MASK]);
        ASSERT0(dn->dn_next_blksz[txg&TXG_MASK]);
@@ -1311,7 +1349,7 @@ dnode_free(dnode_t *dn, dmu_tx_t *tx)
 int
 dnode_set_blksz(dnode_t *dn, uint64_t size, int ibs, dmu_tx_t *tx)
 {
-       dmu_buf_impl_t *db, *db_next;
+       dmu_buf_impl_t *db;
        int err;
 
        if (size == 0)
@@ -1334,9 +1372,8 @@ dnode_set_blksz(dnode_t *dn, uint64_t size, int ibs, dmu_tx_t *tx)
                goto fail;
 
        mutex_enter(&dn->dn_dbufs_mtx);
-       for (db = list_head(&dn->dn_dbufs); db; db = db_next) {
-               db_next = list_next(&dn->dn_dbufs, db);
-
+       for (db = avl_first(&dn->dn_dbufs); db != NULL;
+           db = AVL_NEXT(&dn->dn_dbufs, db)) {
                if (db->db_blkid != 0 && db->db_blkid != DMU_BONUS_BLKID &&
                    db->db_blkid != DMU_SPILL_BLKID) {
                        mutex_exit(&dn->dn_dbufs_mtx);
index 9779c51ef324d5e6daf165462369d89aa5c08f07..4e613dd76e223b2783937248acce1fdd98127577 100644 (file)
@@ -406,16 +406,13 @@ dnode_evict_dbufs(dnode_t *dn)
        int pass = 0;
 
        do {
-               dmu_buf_impl_t *db, marker;
+               dmu_buf_impl_t *db, *db_next;
                int evicting = FALSE;
 
                progress = FALSE;
                mutex_enter(&dn->dn_dbufs_mtx);
-               list_insert_tail(&dn->dn_dbufs, &marker);
-               db = list_head(&dn->dn_dbufs);
-               for (; db != &marker; db = list_head(&dn->dn_dbufs)) {
-                       list_remove(&dn->dn_dbufs, db);
-                       list_insert_tail(&dn->dn_dbufs, db);
+               for (db = avl_first(&dn->dn_dbufs); db != NULL; db = db_next) {
+                       db_next = AVL_NEXT(&dn->dn_dbufs, db);
 #ifdef DEBUG
                        DB_DNODE_ENTER(db);
                        ASSERT3P(DB_DNODE(db), ==, dn);
@@ -435,7 +432,6 @@ dnode_evict_dbufs(dnode_t *dn)
                        }
 
                }
-               list_remove(&dn->dn_dbufs, &marker);
                /*
                 * NB: we need to drop dn_dbufs_mtx between passes so
                 * that any DB_EVICTING dbufs can make progress.
@@ -516,7 +512,7 @@ dnode_sync_free(dnode_t *dn, dmu_tx_t *tx)
 
        dnode_undirty_dbufs(&dn->dn_dirty_records[txgoff]);
        dnode_evict_dbufs(dn);
-       ASSERT3P(list_head(&dn->dn_dbufs), ==, NULL);
+       ASSERT(avl_is_empty(&dn->dn_dbufs));
        ASSERT3P(dn->dn_bonus, ==, NULL);
 
        /*