]> granicus.if.org Git - zfs/commitdiff
Refine use of zv_state_lock.
authorBoris Protopopov <boris.protopopov@actifio.com>
Tue, 13 Jun 2017 16:03:44 +0000 (12:03 -0400)
committerBrian Behlendorf <behlendorf1@llnl.gov>
Tue, 27 Jun 2017 16:51:44 +0000 (12:51 -0400)
Use zv_state_lock to protect all members of zvol_state structure, add
relevant ASSERT()s. Take zv_suspend_lock before zv_state_lock, do not
hold zv_state_lock across suspend/resume.

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Boris Protopopov <boris.protopopov@actifio.com>
Closes #6226

module/zfs/zvol.c

index 8890aaaf78a3aaa42aedd5d2f137dcc73e408e39..75ed06e03300c23499b4f59abd6518bd99200b67 100644 (file)
  * time comes to remove the structure from the list, it is not in use, and
  * therefore, it can be taken off zvol_state_list and freed.
  *
- * The minor operations are issued to the spa->spa_zvol_taskq quues, that are
+ * The zv_suspend_lock was introduced to allow for suspending I/O to a zvol,
+ * e.g. for the duration of receive and rollback operations. This lock can be
+ * held for significant periods of time. Given that it is undesirable to hold
+ * mutexes for long periods of time, the following lock ordering applies:
+ * - take zvol_state_lock if necessary, to protect zvol_state_list
+ * - take zv_suspend_lock if necessary, by the code path in question
+ * - take zv_state_lock to protect zvol_state_t
+ *
+ * The minor operations are issued to spa->spa_zvol_taskq queues, that are
  * single-threaded (to preserve order of minor operations), and are executed
  * through the zvol_task_cb that dispatches the specific operations. Therefore,
  * these operations are serialized per pool. Consequently, we can be certain
@@ -156,49 +164,87 @@ zvol_name_hash(const char *name)
 }
 
 /*
- * Find a zvol_state_t given the full major+minor dev_t.
+ * Find a zvol_state_t given the full major+minor dev_t. If found,
+ * return with zv_state_lock taken, otherwise, return (NULL) without
+ * taking zv_state_lock.
  */
 static zvol_state_t *
 zvol_find_by_dev(dev_t dev)
 {
        zvol_state_t *zv;
 
-       ASSERT(MUTEX_HELD(&zvol_state_lock));
+       mutex_enter(&zvol_state_lock);
        for (zv = list_head(&zvol_state_list); zv != NULL;
            zv = list_next(&zvol_state_list, zv)) {
-               if (zv->zv_dev == dev)
+               mutex_enter(&zv->zv_state_lock);
+               if (zv->zv_dev == dev) {
+                       mutex_exit(&zvol_state_lock);
                        return (zv);
+               }
+               mutex_exit(&zv->zv_state_lock);
        }
+       mutex_exit(&zvol_state_lock);
 
        return (NULL);
 }
 
 /*
  * Find a zvol_state_t given the name and hash generated by zvol_name_hash.
+ * If found, return with zv_suspend_lock and zv_state_lock taken, otherwise,
+ * return (NULL) without the taking locks. The zv_suspend_lock is always taken
+ * before zv_state_lock. The mode argument indicates the mode (including none)
+ * for zv_suspend_lock to be taken.
  */
 static zvol_state_t *
-zvol_find_by_name_hash(const char *name, uint64_t hash)
+zvol_find_by_name_hash(const char *name, uint64_t hash, int mode)
 {
        zvol_state_t *zv;
        struct hlist_node *p;
 
-       ASSERT(MUTEX_HELD(&zvol_state_lock));
+       mutex_enter(&zvol_state_lock);
        hlist_for_each(p, ZVOL_HT_HEAD(hash)) {
                zv = hlist_entry(p, zvol_state_t, zv_hlink);
+               mutex_enter(&zv->zv_state_lock);
                if (zv->zv_hash == hash &&
-                   strncmp(zv->zv_name, name, MAXNAMELEN) == 0)
+                   strncmp(zv->zv_name, name, MAXNAMELEN) == 0) {
+                       /*
+                        * this is the right zvol, take the locks in the
+                        * right order
+                        */
+                       if (mode != RW_NONE &&
+                           !rw_tryenter(&zv->zv_suspend_lock, mode)) {
+                               mutex_exit(&zv->zv_state_lock);
+                               rw_enter(&zv->zv_suspend_lock, mode);
+                               mutex_enter(&zv->zv_state_lock);
+                               /*
+                                * zvol cannot be renamed as we continue
+                                * to hold zvol_state_lock
+                                */
+                               ASSERT(zv->zv_hash == hash &&
+                                   strncmp(zv->zv_name, name, MAXNAMELEN)
+                                   == 0);
+                       }
+                       mutex_exit(&zvol_state_lock);
                        return (zv);
+               }
+               mutex_exit(&zv->zv_state_lock);
        }
+       mutex_exit(&zvol_state_lock);
+
        return (NULL);
 }
 
 /*
- * Find a zvol_state_t given the name provided at zvol_alloc() time.
+ * Find a zvol_state_t given the name.
+ * If found, return with zv_suspend_lock and zv_state_lock taken, otherwise,
+ * return (NULL) without the taking locks. The zv_suspend_lock is always taken
+ * before zv_state_lock. The mode argument indicates the mode (including none)
+ * for zv_suspend_lock to be taken.
  */
 static zvol_state_t *
-zvol_find_by_name(const char *name)
+zvol_find_by_name(const char *name, int mode)
 {
-       return (zvol_find_by_name_hash(name, zvol_name_hash(name)));
+       return (zvol_find_by_name_hash(name, zvol_name_hash(name), mode));
 }
 
 
@@ -295,9 +341,12 @@ zvol_size_changed(zvol_state_t *zv, uint64_t volsize)
 {
        struct block_device *bdev;
 
+       ASSERT(MUTEX_HELD(&zv->zv_state_lock));
+
        bdev = bdget_disk(zv->zv_disk, 0);
        if (bdev == NULL)
                return;
+
        set_capacity(zv->zv_disk, volsize >> 9);
        zv->zv_volsize = volsize;
        check_disk_size_change(zv->zv_disk, bdev);
@@ -391,13 +440,14 @@ zvol_set_volsize(const char *name, uint64_t volsize)
        if (readonly)
                return (SET_ERROR(EROFS));
 
-       mutex_enter(&zvol_state_lock);
-       zv = zvol_find_by_name(name);
-       if (zv != NULL)
-               mutex_enter(&zv->zv_state_lock);
-       mutex_exit(&zvol_state_lock);
+       zv = zvol_find_by_name(name, RW_READER);
+
+       ASSERT(zv == NULL || (MUTEX_HELD(&zv->zv_state_lock) &&
+           RW_READ_HELD(&zv->zv_suspend_lock)));
 
        if (zv == NULL || zv->zv_objset == NULL) {
+               if (zv != NULL)
+                       rw_exit(&zv->zv_suspend_lock);
                if ((error = dmu_objset_own(name, DMU_OST_ZVOL, B_FALSE,
                    FTAG, &os)) != 0) {
                        if (zv != NULL)
@@ -408,7 +458,6 @@ zvol_set_volsize(const char *name, uint64_t volsize)
                if (zv != NULL)
                        zv->zv_objset = os;
        } else {
-               rw_enter(&zv->zv_suspend_lock, RW_READER);
                os = zv->zv_objset;
        }
 
@@ -435,6 +484,7 @@ out:
 
        if (zv != NULL)
                mutex_exit(&zv->zv_state_lock);
+
        return (SET_ERROR(error));
 }
 
@@ -485,23 +535,20 @@ zvol_set_volblocksize(const char *name, uint64_t volblocksize)
        dmu_tx_t *tx;
        int error;
 
-       mutex_enter(&zvol_state_lock);
+       zv = zvol_find_by_name(name, RW_READER);
 
-       zv = zvol_find_by_name(name);
-       if (zv == NULL) {
-               mutex_exit(&zvol_state_lock);
+       if (zv == NULL)
                return (SET_ERROR(ENXIO));
-       }
-       mutex_enter(&zv->zv_state_lock);
-       mutex_exit(&zvol_state_lock);
+
+       ASSERT(MUTEX_HELD(&zv->zv_state_lock) &&
+           RW_READ_HELD(&zv->zv_suspend_lock));
 
        if (zv->zv_flags & ZVOL_RDONLY) {
                mutex_exit(&zv->zv_state_lock);
+               rw_exit(&zv->zv_suspend_lock);
                return (SET_ERROR(EROFS));
        }
 
-       rw_enter(&zv->zv_suspend_lock, RW_READER);
-
        tx = dmu_tx_create(zv->zv_objset);
        dmu_tx_hold_bonus(tx, ZVOL_OBJ);
        error = dmu_tx_assign(tx, TXG_WAIT);
@@ -516,9 +563,9 @@ zvol_set_volblocksize(const char *name, uint64_t volblocksize)
                if (error == 0)
                        zv->zv_volblocksize = volblocksize;
        }
-       rw_exit(&zv->zv_suspend_lock);
 
        mutex_exit(&zv->zv_state_lock);
+       rw_exit(&zv->zv_suspend_lock);
 
        return (SET_ERROR(error));
 }
@@ -1060,6 +1107,9 @@ zvol_setup_zv(zvol_state_t *zv)
        uint64_t ro;
        objset_t *os = zv->zv_objset;
 
+       ASSERT(MUTEX_HELD(&zv->zv_state_lock) &&
+           RW_LOCK_HELD(&zv->zv_suspend_lock));
+
        error = dsl_prop_get_integer(zv->zv_name, "readonly", &ro, NULL);
        if (error)
                return (SET_ERROR(error));
@@ -1094,6 +1144,9 @@ zvol_setup_zv(zvol_state_t *zv)
 static void
 zvol_shutdown_zv(zvol_state_t *zv)
 {
+       ASSERT(MUTEX_HELD(&zv->zv_state_lock) &&
+           RW_LOCK_HELD(&zv->zv_suspend_lock));
+
        zil_close(zv->zv_zilog);
        zv->zv_zilog = NULL;
 
@@ -1127,24 +1180,27 @@ zvol_suspend(const char *name)
 {
        zvol_state_t *zv;
 
-       mutex_enter(&zvol_state_lock);
-       zv = zvol_find_by_name(name);
-       if (zv == NULL) {
-               mutex_exit(&zvol_state_lock);
+       zv = zvol_find_by_name(name, RW_WRITER);
+
+       if (zv == NULL)
                return (NULL);
-       }
-       mutex_enter(&zv->zv_state_lock);
-       mutex_exit(&zvol_state_lock);
 
        /* block all I/O, release in zvol_resume. */
-       rw_enter(&zv->zv_suspend_lock, RW_WRITER);
+       ASSERT(MUTEX_HELD(&zv->zv_state_lock) &&
+           RW_WRITE_HELD(&zv->zv_suspend_lock));
 
        atomic_inc(&zv->zv_suspend_ref);
 
        if (zv->zv_open_count > 0)
                zvol_shutdown_zv(zv);
 
+       /*
+        * do not hold zv_state_lock across suspend/resume to
+        * avoid locking up zvol lookups
+        */
        mutex_exit(&zv->zv_state_lock);
+
+       /* zv_suspend_lock is released in zvol_resume() */
        return (zv);
 }
 
@@ -1155,11 +1211,8 @@ zvol_resume(zvol_state_t *zv)
 
        ASSERT(RW_WRITE_HELD(&zv->zv_suspend_lock));
 
-       /*
-        * Cannot take zv_state_lock here with zv_suspend_lock
-        * held; however, the latter is held in exclusive mode,
-        * so it is not necessary to do so
-        */
+       mutex_enter(&zv->zv_state_lock);
+
        if (zv->zv_open_count > 0) {
                VERIFY0(dmu_objset_hold(zv->zv_name, zv, &zv->zv_objset));
                VERIFY3P(zv->zv_objset->os_dsl_dataset->ds_owner, ==, zv);
@@ -1168,6 +1221,9 @@ zvol_resume(zvol_state_t *zv)
 
                error = zvol_setup_zv(zv);
        }
+
+       mutex_exit(&zv->zv_state_lock);
+
        rw_exit(&zv->zv_suspend_lock);
        /*
         * We need this because we don't hold zvol_state_lock while releasing
@@ -1186,6 +1242,9 @@ zvol_first_open(zvol_state_t *zv)
        objset_t *os;
        int error, locked = 0;
 
+       ASSERT(RW_READ_HELD(&zv->zv_suspend_lock));
+       ASSERT(MUTEX_HELD(&zv->zv_state_lock));
+
        /*
         * In all other cases the spa_namespace_lock is taken before the
         * bdev->bd_mutex lock.  But in this case the Linux __blkdev_get()
@@ -1233,6 +1292,9 @@ out_mutex:
 static void
 zvol_last_close(zvol_state_t *zv)
 {
+       ASSERT(RW_READ_HELD(&zv->zv_suspend_lock));
+       ASSERT(MUTEX_HELD(&zv->zv_state_lock));
+
        zvol_shutdown_zv(zv);
 
        dmu_objset_disown(zv->zv_objset, zv);
@@ -1243,14 +1305,15 @@ static int
 zvol_open(struct block_device *bdev, fmode_t flag)
 {
        zvol_state_t *zv;
-       int error = 0, drop_suspend = 0;
+       int error = 0;
+       boolean_t drop_suspend = B_FALSE;
 
        ASSERT(!mutex_owned(&zvol_state_lock));
 
        mutex_enter(&zvol_state_lock);
        /*
-        * Obtain a copy of private_data under the lock to make sure
-        * that either the result of zvol free code path setting
+        * Obtain a copy of private_data under the zvol_state_lock to make
+        * sure that either the result of zvol free code path setting
         * bdev->bd_disk->private_data to NULL is observed, or zvol_free()
         * is not called on this zv because of the positive zv_open_count.
         */
@@ -1259,14 +1322,25 @@ zvol_open(struct block_device *bdev, fmode_t flag)
                mutex_exit(&zvol_state_lock);
                return (SET_ERROR(-ENXIO));
        }
+
+       /* take zv_suspend_lock before zv_state_lock */
+       rw_enter(&zv->zv_suspend_lock, RW_READER);
+
        mutex_enter(&zv->zv_state_lock);
-       mutex_exit(&zvol_state_lock);
 
+       /*
+        * make sure zvol is not suspended during first open
+        * (hold zv_suspend_lock), otherwise, drop the lock
+        */
        if (zv->zv_open_count == 0) {
-               /* make sure zvol is not suspended when first open */
-               rw_enter(&zv->zv_suspend_lock, RW_READER);
-               drop_suspend = 1;
+               drop_suspend = B_TRUE;
+       } else {
+               rw_exit(&zv->zv_suspend_lock);
+       }
+
+       mutex_exit(&zvol_state_lock);
 
+       if (zv->zv_open_count == 0) {
                error = zvol_first_open(zv);
                if (error)
                        goto out_mutex;
@@ -1285,9 +1359,9 @@ out_open_count:
        if (zv->zv_open_count == 0)
                zvol_last_close(zv);
 out_mutex:
+       mutex_exit(&zv->zv_state_lock);
        if (drop_suspend)
                rw_exit(&zv->zv_suspend_lock);
-       mutex_exit(&zv->zv_state_lock);
 
        return (SET_ERROR(error));
 }
@@ -1300,27 +1374,38 @@ static int
 zvol_release(struct gendisk *disk, fmode_t mode)
 {
        zvol_state_t *zv;
+       boolean_t drop_suspend = B_FALSE;
 
        ASSERT(!mutex_owned(&zvol_state_lock));
 
        mutex_enter(&zvol_state_lock);
        zv = disk->private_data;
        ASSERT(zv && zv->zv_open_count > 0);
+
+       /* take zv_suspend_lock before zv_state_lock */
+       rw_enter(&zv->zv_suspend_lock, RW_READER);
+
        mutex_enter(&zv->zv_state_lock);
        mutex_exit(&zvol_state_lock);
 
-       /* make sure zvol is not suspended when last close */
+       /*
+        * make sure zvol is not suspended during last close
+        * (hold zv_suspend_lock), otherwise, drop the lock
+        */
        if (zv->zv_open_count == 1)
-               rw_enter(&zv->zv_suspend_lock, RW_READER);
+               drop_suspend = B_TRUE;
+       else
+               rw_exit(&zv->zv_suspend_lock);
 
        zv->zv_open_count--;
-       if (zv->zv_open_count == 0) {
+       if (zv->zv_open_count == 0)
                zvol_last_close(zv);
-               rw_exit(&zv->zv_suspend_lock);
-       }
 
        mutex_exit(&zv->zv_state_lock);
 
+       if (drop_suspend)
+               rw_exit(&zv->zv_suspend_lock);
+
 #ifndef HAVE_BLOCK_DEVICE_OPERATIONS_RELEASE_VOID
        return (0);
 #endif
@@ -1430,10 +1515,11 @@ zvol_probe(dev_t dev, int *part, void *arg)
        zvol_state_t *zv;
        struct kobject *kobj;
 
-       mutex_enter(&zvol_state_lock);
        zv = zvol_find_by_dev(dev);
        kobj = zv ? get_disk(zv->zv_disk) : NULL;
-       mutex_exit(&zvol_state_lock);
+       ASSERT(zv == NULL || MUTEX_HELD(&zv->zv_state_lock));
+       if (zv)
+               mutex_exit(&zv->zv_state_lock);
 
        return (kobj);
 }
@@ -1571,6 +1657,8 @@ zvol_free(void *arg)
        zvol_state_t *zv = arg;
 
        ASSERT(!MUTEX_HELD(&zvol_state_lock));
+       ASSERT(!RW_LOCK_HELD(&zv->zv_suspend_lock));
+       ASSERT(!MUTEX_HELD(&zv->zv_state_lock));
        ASSERT(zv->zv_open_count == 0);
        ASSERT(zv->zv_disk->private_data == NULL);
 
@@ -1611,17 +1699,14 @@ zvol_create_minor_impl(const char *name)
                return (SET_ERROR(-idx));
        minor = idx << ZVOL_MINOR_BITS;
 
-       mutex_enter(&zvol_state_lock);
-
-       zv = zvol_find_by_name_hash(name, hash);
+       zv = zvol_find_by_name_hash(name, hash, RW_NONE);
        if (zv) {
-               mutex_exit(&zvol_state_lock);
+               ASSERT(MUTEX_HELD(&zv->zv_state_lock));
+               mutex_exit(&zv->zv_state_lock);
                ida_simple_remove(&zvol_ida, idx);
                return (SET_ERROR(EEXIST));
        }
 
-       mutex_exit(&zvol_state_lock);
-
        doi = kmem_alloc(sizeof (dmu_object_info_t), KM_SLEEP);
 
        error = dmu_objset_own(name, DMU_OST_ZVOL, B_TRUE, FTAG, &os);
@@ -1715,10 +1800,9 @@ zvol_rename_minor(zvol_state_t *zv, const char *newname)
        int readonly = get_disk_ro(zv->zv_disk);
 
        ASSERT(MUTEX_HELD(&zvol_state_lock));
+       ASSERT(MUTEX_HELD(&zv->zv_state_lock));
 
-       rw_enter(&zv->zv_suspend_lock, RW_READER);
        strlcpy(zv->zv_name, newname, sizeof (zv->zv_name));
-       rw_exit(&zv->zv_suspend_lock);
 
        /* move to new hashtable entry  */
        zv->zv_hash = zvol_name_hash(zv->zv_name);
@@ -1960,15 +2044,15 @@ zvol_remove_minors_impl(const char *name)
        for (zv = list_head(&zvol_state_list); zv != NULL; zv = zv_next) {
                zv_next = list_next(&zvol_state_list, zv);
 
+               mutex_enter(&zv->zv_state_lock);
                if (name == NULL || strcmp(zv->zv_name, name) == 0 ||
                    (strncmp(zv->zv_name, name, namelen) == 0 &&
                    (zv->zv_name[namelen] == '/' ||
                    zv->zv_name[namelen] == '@'))) {
                        /*
-                        * By taking zv_state_lock here, we guarantee that no
+                        * By holding zv_state_lock here, we guarantee that no
                         * one is currently using this zv
                         */
-                       mutex_enter(&zv->zv_state_lock);
 
                        /* If in use, leave alone */
                        if (zv->zv_open_count > 0 ||
@@ -1979,7 +2063,10 @@ zvol_remove_minors_impl(const char *name)
 
                        zvol_remove(zv);
 
-                       /* clear this so zvol_open won't open it */
+                       /*
+                        * clear this while holding zvol_state_lock so
+                        * zvol_open won't open it
+                        */
                        zv->zv_disk->private_data = NULL;
 
                        /* Drop zv_state_lock before zvol_free() */
@@ -1992,6 +2079,8 @@ zvol_remove_minors_impl(const char *name)
                                list_insert_head(&free_list, zv);
                        else
                                tid = t;
+               } else {
+                       mutex_exit(&zv->zv_state_lock);
                }
        }
        mutex_exit(&zvol_state_lock);
@@ -2025,12 +2114,12 @@ zvol_remove_minor_impl(const char *name)
        for (zv = list_head(&zvol_state_list); zv != NULL; zv = zv_next) {
                zv_next = list_next(&zvol_state_list, zv);
 
+               mutex_enter(&zv->zv_state_lock);
                if (strcmp(zv->zv_name, name) == 0) {
                        /*
-                        * By taking zv_state_lock here, we guarantee that no
+                        * By holding zv_state_lock here, we guarantee that no
                         * one is currently using this zv
                         */
-                       mutex_enter(&zv->zv_state_lock);
 
                        /* If in use, leave alone */
                        if (zv->zv_open_count > 0 ||
@@ -2045,6 +2134,8 @@ zvol_remove_minor_impl(const char *name)
 
                        mutex_exit(&zv->zv_state_lock);
                        break;
+               } else {
+                       mutex_exit(&zv->zv_state_lock);
                }
        }
 
@@ -2077,9 +2168,13 @@ zvol_rename_minors_impl(const char *oldname, const char *newname)
        for (zv = list_head(&zvol_state_list); zv != NULL; zv = zv_next) {
                zv_next = list_next(&zvol_state_list, zv);
 
+               mutex_enter(&zv->zv_state_lock);
+
                /* If in use, leave alone */
-               if (zv->zv_open_count > 0)
+               if (zv->zv_open_count > 0) {
+                       mutex_exit(&zv->zv_state_lock);
                        continue;
+               }
 
                if (strcmp(zv->zv_name, oldname) == 0) {
                        zvol_rename_minor(zv, newname);
@@ -2091,6 +2186,8 @@ zvol_rename_minors_impl(const char *oldname, const char *newname)
                            zv->zv_name + oldnamelen + 1);
                        zvol_rename_minor(zv, name);
                }
+
+               mutex_exit(&zv->zv_state_lock);
        }
 
        mutex_exit(&zvol_state_lock);