granicus.if.org Git - zfs/commitdiff
Improve rate at which new zvols are processed
authorJohn Gallagher <john.gallagher@delphix.com>
Sat, 4 May 2019 23:39:10 +0000 (16:39 -0700)
committerBrian Behlendorf <behlendorf1@llnl.gov>
Sat, 4 May 2019 23:39:10 +0000 (16:39 -0700)
The kernel function which adds new zvols as disks to the system,
add_disk(), briefly opens and closes the zvol as part of its work.
Closing a zvol involves waiting for two txgs to sync. This, combined
with the fact that the taskq processing new zvols is single threaded,
makes the processing of new zvols slow.

Waiting for these txgs to sync is only necessary if the zvol has been
written to, which is not the case during add_disk(). This change adds
tracking of whether a zvol has been written to so that we can skip the
txg_wait_synced() calls when they are unnecessary.

This change also fixes the flags passed to blkdev_get_by_path() by
vdev_disk_open() to be FMODE_READ | FMODE_WRITE | FMODE_EXCL instead of
just FMODE_EXCL. The flags were being incorrectly calculated because
we were using the wrong version of vdev_bdev_mode().

Reviewed-by: George Wilson <george.wilson@delphix.com>
Reviewed-by: Serapheim Dimitropoulos <serapheim@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: John Gallagher <john.gallagher@delphix.com>
Closes #8526
Closes #8615

module/zfs/vdev_disk.c
module/zfs/zvol.c

index b329ef3c240fe7f0dee3d51f9419e1bf2033ef8e..1419ae6ad54aaab3918d23f8774c364e56e1f8d5 100644 (file)
@@ -23,7 +23,7 @@
  * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
  * Rewritten for Linux by Brian Behlendorf <behlendorf1@llnl.gov>.
  * LLNL-CODE-403049.
- * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2019 by Delphix. All rights reserved.
  */
 
 #include <sys/zfs_context.h>
@@ -56,7 +56,7 @@ typedef struct dio_request {
 } dio_request_t;
 
 
-#ifdef HAVE_OPEN_BDEV_EXCLUSIVE
+#if defined(HAVE_OPEN_BDEV_EXCLUSIVE) || defined(HAVE_BLKDEV_GET_BY_PATH)
 static fmode_t
 vdev_bdev_mode(int smode)
 {
index 9fd689fbd01baa493f3a48352e8f6b90d00ad49c..a77339d7f4c4865442376b237bc2221af77562f7 100644 (file)
@@ -36,7 +36,7 @@
  *
  * Copyright 2014 Nexenta Systems, Inc.  All rights reserved.
  * Copyright (c) 2016 Actifio, Inc. All rights reserved.
- * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2019 by Delphix. All rights reserved.
  */
 
 /*
@@ -155,6 +155,11 @@ typedef struct {
 } zvol_task_t;
 
 #define        ZVOL_RDONLY     0x1
+/*
+ * Whether the zvol has been written to (as opposed to ZVOL_RDONLY, which
+ * specifies whether or not the zvol _can_ be written to)
+ */
+#define        ZVOL_WRITTEN_TO 0x2
 
 static uint64_t
 zvol_name_hash(const char *name)
@@ -742,6 +747,7 @@ zvol_write(void *arg)
 
        zvol_state_t *zv = zvr->zv;
        ASSERT(zv && zv->zv_open_count > 0);
+       ASSERT(zv->zv_zilog != NULL);
 
        ssize_t start_resid = uio.uio_resid;
        unsigned long start_jif = jiffies;
@@ -832,6 +838,7 @@ zvol_discard(void *arg)
        unsigned long start_jif;
 
        ASSERT(zv && zv->zv_open_count > 0);
+       ASSERT(zv->zv_zilog != NULL);
 
        start_jif = jiffies;
        blk_generic_start_io_acct(zv->zv_queue, WRITE, bio_sectors(bio),
@@ -930,6 +937,86 @@ zvol_read(void *arg)
        kmem_free(zvr, sizeof (zv_request_t));
 }
 
+/* ARGSUSED */
+static void
+zvol_get_done(zgd_t *zgd, int error)
+{
+       if (zgd->zgd_db)
+               dmu_buf_rele(zgd->zgd_db, zgd);
+
+       rangelock_exit(zgd->zgd_lr);
+
+       kmem_free(zgd, sizeof (zgd_t));
+}
+
+/*
+ * Get data to generate a TX_WRITE intent log record.
+ */
+static int
+zvol_get_data(void *arg, lr_write_t *lr, char *buf, struct lwb *lwb, zio_t *zio)
+{
+       zvol_state_t *zv = arg;
+       uint64_t offset = lr->lr_offset;
+       uint64_t size = lr->lr_length;
+       dmu_buf_t *db;
+       zgd_t *zgd;
+       int error;
+
+       ASSERT3P(lwb, !=, NULL);
+       ASSERT3P(zio, !=, NULL);
+       ASSERT3U(size, !=, 0);
+
+       zgd = (zgd_t *)kmem_zalloc(sizeof (zgd_t), KM_SLEEP);
+       zgd->zgd_lwb = lwb;
+
+       /*
+        * Write records come in two flavors: immediate and indirect.
+        * For small writes it's cheaper to store the data with the
+        * log record (immediate); for large writes it's cheaper to
+        * sync the data and get a pointer to it (indirect) so that
+        * we don't have to write the data twice.
+        */
+       if (buf != NULL) { /* immediate write */
+               zgd->zgd_lr = rangelock_enter(&zv->zv_rangelock, offset, size,
+                   RL_READER);
+               error = dmu_read_by_dnode(zv->zv_dn, offset, size, buf,
+                   DMU_READ_NO_PREFETCH);
+       } else { /* indirect write */
+               /*
+                * Have to lock the whole block to ensure when it's written out
+                * and its checksum is being calculated that no one can change
+                * the data. Contrarily to zfs_get_data we need not re-check
+                * blocksize after we get the lock because it cannot be changed.
+                */
+               size = zv->zv_volblocksize;
+               offset = P2ALIGN_TYPED(offset, size, uint64_t);
+               zgd->zgd_lr = rangelock_enter(&zv->zv_rangelock, offset, size,
+                   RL_READER);
+               error = dmu_buf_hold_by_dnode(zv->zv_dn, offset, zgd, &db,
+                   DMU_READ_NO_PREFETCH);
+               if (error == 0) {
+                       blkptr_t *bp = &lr->lr_blkptr;
+
+                       zgd->zgd_db = db;
+                       zgd->zgd_bp = bp;
+
+                       ASSERT(db != NULL);
+                       ASSERT(db->db_offset == offset);
+                       ASSERT(db->db_size == size);
+
+                       error = dmu_sync(zio, lr->lr_common.lrc_txg,
+                           zvol_get_done, zgd);
+
+                       if (error == 0)
+                               return (0);
+               }
+       }
+
+       zvol_get_done(zgd, error);
+
+       return (SET_ERROR(error));
+}
+
 static MAKE_REQUEST_FN_RET
 zvol_request(struct request_queue *q, struct bio *bio)
 {
@@ -965,6 +1052,23 @@ zvol_request(struct request_queue *q, struct bio *bio)
                 */
                rw_enter(&zv->zv_suspend_lock, RW_READER);
 
+               /*
+                * Open a ZIL if this is the first time we have written to this
+                * zvol. We protect zv->zv_zilog with zv_suspend_lock rather
+                * than zv_state_lock so that we don't need to acquire an
+                * additional lock in this path.
+                */
+               if (zv->zv_zilog == NULL) {
+                       rw_exit(&zv->zv_suspend_lock);
+                       rw_enter(&zv->zv_suspend_lock, RW_WRITER);
+                       if (zv->zv_zilog == NULL) {
+                               zv->zv_zilog = zil_open(zv->zv_objset,
+                                   zvol_get_data);
+                               zv->zv_flags |= ZVOL_WRITTEN_TO;
+                       }
+                       rw_downgrade(&zv->zv_suspend_lock);
+               }
+
                /* bio marked as FLUSH need to flush before write */
                if (bio_is_flush(bio))
                        zil_commit(zv->zv_zilog, ZVOL_OBJ);
@@ -1040,86 +1144,6 @@ out:
 #endif
 }
 
-/* ARGSUSED */
-static void
-zvol_get_done(zgd_t *zgd, int error)
-{
-       if (zgd->zgd_db)
-               dmu_buf_rele(zgd->zgd_db, zgd);
-
-       rangelock_exit(zgd->zgd_lr);
-
-       kmem_free(zgd, sizeof (zgd_t));
-}
-
-/*
- * Get data to generate a TX_WRITE intent log record.
- */
-static int
-zvol_get_data(void *arg, lr_write_t *lr, char *buf, struct lwb *lwb, zio_t *zio)
-{
-       zvol_state_t *zv = arg;
-       uint64_t offset = lr->lr_offset;
-       uint64_t size = lr->lr_length;
-       dmu_buf_t *db;
-       zgd_t *zgd;
-       int error;
-
-       ASSERT3P(lwb, !=, NULL);
-       ASSERT3P(zio, !=, NULL);
-       ASSERT3U(size, !=, 0);
-
-       zgd = (zgd_t *)kmem_zalloc(sizeof (zgd_t), KM_SLEEP);
-       zgd->zgd_lwb = lwb;
-
-       /*
-        * Write records come in two flavors: immediate and indirect.
-        * For small writes it's cheaper to store the data with the
-        * log record (immediate); for large writes it's cheaper to
-        * sync the data and get a pointer to it (indirect) so that
-        * we don't have to write the data twice.
-        */
-       if (buf != NULL) { /* immediate write */
-               zgd->zgd_lr = rangelock_enter(&zv->zv_rangelock, offset, size,
-                   RL_READER);
-               error = dmu_read_by_dnode(zv->zv_dn, offset, size, buf,
-                   DMU_READ_NO_PREFETCH);
-       } else { /* indirect write */
-               /*
-                * Have to lock the whole block to ensure when it's written out
-                * and its checksum is being calculated that no one can change
-                * the data. Contrarily to zfs_get_data we need not re-check
-                * blocksize after we get the lock because it cannot be changed.
-                */
-               size = zv->zv_volblocksize;
-               offset = P2ALIGN_TYPED(offset, size, uint64_t);
-               zgd->zgd_lr = rangelock_enter(&zv->zv_rangelock, offset, size,
-                   RL_READER);
-               error = dmu_buf_hold_by_dnode(zv->zv_dn, offset, zgd, &db,
-                   DMU_READ_NO_PREFETCH);
-               if (error == 0) {
-                       blkptr_t *bp = &lr->lr_blkptr;
-
-                       zgd->zgd_db = db;
-                       zgd->zgd_bp = bp;
-
-                       ASSERT(db != NULL);
-                       ASSERT(db->db_offset == offset);
-                       ASSERT(db->db_size == size);
-
-                       error = dmu_sync(zio, lr->lr_common.lrc_txg,
-                           zvol_get_done, zgd);
-
-                       if (error == 0)
-                               return (0);
-               }
-       }
-
-       zvol_get_done(zgd, error);
-
-       return (SET_ERROR(error));
-}
-
 /*
  * The zvol_state_t's are inserted into zvol_state_list and zvol_htable.
  */
@@ -1157,6 +1181,9 @@ zvol_setup_zv(zvol_state_t *zv)
        ASSERT(MUTEX_HELD(&zv->zv_state_lock));
        ASSERT(RW_LOCK_HELD(&zv->zv_suspend_lock));
 
+       zv->zv_zilog = NULL;
+       zv->zv_flags &= ~ZVOL_WRITTEN_TO;
+
        error = dsl_prop_get_integer(zv->zv_name, "readonly", &ro, NULL);
        if (error)
                return (SET_ERROR(error));
@@ -1171,7 +1198,6 @@ zvol_setup_zv(zvol_state_t *zv)
 
        set_capacity(zv->zv_disk, volsize >> 9);
        zv->zv_volsize = volsize;
-       zv->zv_zilog = zil_open(os, zvol_get_data);
 
        if (ro || dmu_objset_is_snapshot(os) ||
            !spa_writeable(dmu_objset_spa(os))) {
@@ -1194,7 +1220,11 @@ zvol_shutdown_zv(zvol_state_t *zv)
        ASSERT(MUTEX_HELD(&zv->zv_state_lock) &&
            RW_LOCK_HELD(&zv->zv_suspend_lock));
 
-       zil_close(zv->zv_zilog);
+       if (zv->zv_flags & ZVOL_WRITTEN_TO) {
+               ASSERT(zv->zv_zilog != NULL);
+               zil_close(zv->zv_zilog);
+       }
+
        zv->zv_zilog = NULL;
 
        dnode_rele(zv->zv_dn, FTAG);
@@ -1204,7 +1234,7 @@ zvol_shutdown_zv(zvol_state_t *zv)
         * Evict cached data. We must write out any dirty data before
         * disowning the dataset.
         */
-       if (!(zv->zv_flags & ZVOL_RDONLY))
+       if (zv->zv_flags & ZVOL_WRITTEN_TO)
                txg_wait_synced(dmu_objset_pool(zv->zv_objset), 0);
        (void) dmu_objset_evict_dbufs(zv->zv_objset);
 }