]> granicus.if.org Git - zfs/commitdiff
Fix range locking in ZIL commit codepath
authorLOLi <loli10K@users.noreply.github.com>
Mon, 21 Aug 2017 15:59:48 +0000 (17:59 +0200)
committerBrian Behlendorf <behlendorf1@llnl.gov>
Mon, 21 Aug 2017 15:59:48 +0000 (08:59 -0700)
Since OpenZFS 7578 (1b7c1e5) if we have a ZVOL with logbias=throughput
we will force WR_INDIRECT itxs in zvol_log_write() setting itx->itx_lr
offset and length to the offset and length of the BIO from
zvol_write()->zvol_log_write(): these offset and length are later used
to take a range lock in zillog->zl_get_data function: zvol_get_data().

Now suppose we have a ZVOL with blocksize=8K and push 4K writes to
offset 0: we will only be range-locking 0-4096. This means the
ASSERTion we make in dbuf_unoverride() is no longer valid because now
dmu_sync() is called from zilog's get_data functions holding a partial
lock on the dbuf.

Fix this by taking a range lock on the whole block in zvol_get_data().

Reviewed-by: Chunwei Chen <tuxoko@gmail.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: loli10K <ezomori.nozomu@gmail.com>
Closes #6238
Closes #6315
Closes #6356
Closes #6477

cmd/ztest/ztest.c
module/zfs/dmu.c
module/zfs/zfs_vnops.c
module/zfs/zvol.c
tests/runfiles/linux.run
tests/zfs-tests/tests/functional/zvol/zvol_misc/Makefile.am
tests/zfs-tests/tests/functional/zvol/zvol_misc/zvol_misc_zil.ksh [new file with mode: 0755]

index 3acfaecafff4a56998f290b1a38b5cf6cef92c21..8ea263dea9c45e5647495f7bb2df8403d0fbf647 100644 (file)
@@ -2145,6 +2145,7 @@ ztest_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio)
        if (buf != NULL) {      /* immediate write */
                zgd_private->z_rl = ztest_range_lock(zd, object, offset, size,
                    RL_READER);
+               zgd->zgd_rl = zgd_private->z_rl->z_rl;
 
                error = dmu_read(os, object, offset, size, buf,
                    DMU_READ_NO_PREFETCH);
@@ -2160,6 +2161,7 @@ ztest_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio)
 
                zgd_private->z_rl = ztest_range_lock(zd, object, offset, size,
                    RL_READER);
+               zgd->zgd_rl = zgd_private->z_rl->z_rl;
 
                error = dmu_buf_hold(os, object, offset, zgd, &db,
                    DMU_READ_NO_PREFETCH);
index e098a4966b7dbe18ca5a360f47d1f722db123ed6..5b90818f4ee7930d2960ce74f106ed35cea7e59d 100644 (file)
@@ -49,6 +49,7 @@
 #include <sys/zfeature.h>
 #include <sys/abd.h>
 #include <sys/trace_dmu.h>
+#include <sys/zfs_rlock.h>
 #ifdef _KERNEL
 #include <sys/vmsystm.h>
 #include <sys/zfs_znode.h>
@@ -1815,6 +1816,11 @@ dmu_sync(zio_t *pio, uint64_t txg, dmu_sync_cb_t *done, zgd_t *zgd)
        ASSERT(pio != NULL);
        ASSERT(txg != 0);
 
+       /* dbuf is within the locked range */
+       ASSERT3U(db->db.db_offset, >=, zgd->zgd_rl->r_off);
+       ASSERT3U(db->db.db_offset + db->db.db_size, <=,
+           zgd->zgd_rl->r_off + zgd->zgd_rl->r_len);
+
        SET_BOOKMARK(&zb, ds->ds_object,
            db->db.db_object, db->db_level, db->db_blkid);
 
index 53c5e4f2384f175185d3f59ef6178b12e1ddf707..ffeac7730e97ba911adb824afe8b9828869510b2 100644 (file)
@@ -1050,7 +1050,7 @@ zfs_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio)
        } else { /* indirect write */
                /*
                 * Have to lock the whole block to ensure when it's
-                * written out and it's checksum is being calculated
+                * written out and its checksum is being calculated
                 * that no one can change the data. We need to re-check
                 * blocksize after we get the lock in case it's changed!
                 */
index 4d11b52ab37abb38b4c1fa9529684594229934bf..60fab5cc68bf31ec8bc40a72280ae3c3eefe4f82 100644 (file)
@@ -821,6 +821,7 @@ zvol_discard(void *arg)
        uint64_t start = BIO_BI_SECTOR(bio) << 9;
        uint64_t size = BIO_BI_SIZE(bio);
        uint64_t end = start + size;
+       boolean_t sync;
        int error = 0;
        dmu_tx_t *tx;
        unsigned long start_jif;
@@ -830,9 +831,11 @@ zvol_discard(void *arg)
        start_jif = jiffies;
        generic_start_io_acct(WRITE, bio_sectors(bio), &zv->zv_disk->part0);
 
+       sync = bio_is_fua(bio) || zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS;
+
        if (end > zv->zv_volsize) {
                error = SET_ERROR(EIO);
-               goto out;
+               goto unlock;
        }
 
        /*
@@ -848,7 +851,7 @@ zvol_discard(void *arg)
        }
 
        if (start >= end)
-               goto out;
+               goto unlock;
 
        tx = dmu_tx_create(zv->zv_objset);
        dmu_tx_mark_netfree(tx);
@@ -861,9 +864,11 @@ zvol_discard(void *arg)
                error = dmu_free_long_range(zv->zv_objset,
                    ZVOL_OBJ, start, size);
        }
-
-out:
+unlock:
        zfs_range_unlock(zvr->rl);
+       if (error == 0 && sync)
+               zil_commit(zv->zv_zilog, ZVOL_OBJ);
+
        rw_exit(&zv->zv_suspend_lock);
        generic_end_io_acct(WRITE, &zv->zv_disk->part0, start_jif);
        BIO_END_IO(bio, -error);
@@ -933,6 +938,8 @@ zvol_request(struct request_queue *q, struct bio *bio)
        }
 
        if (rw == WRITE) {
+               boolean_t need_sync = B_FALSE;
+
                if (unlikely(zv->zv_flags & ZVOL_RDONLY)) {
                        BIO_END_IO(bio, -SET_ERROR(EROFS));
                        goto out;
@@ -966,13 +973,24 @@ zvol_request(struct request_queue *q, struct bio *bio)
                 */
                zvr->rl = zfs_range_lock(&zv->zv_range_lock, offset, size,
                    RL_WRITER);
+               /*
+                * Sync writes and discards execute zil_commit() which may need
+                * to take a RL_READER lock on the whole block being modified
+                * via its zillog->zl_get_data(): to avoid circular dependency
+                * issues with taskq threads execute these requests
+                * synchronously here in zvol_request().
+                */
+               need_sync = bio_is_fua(bio) ||
+                   zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS;
                if (bio_is_discard(bio) || bio_is_secure_erase(bio)) {
-                       if (zvol_request_sync || taskq_dispatch(zvol_taskq,
-                           zvol_discard, zvr, TQ_SLEEP) == TASKQID_INVALID)
+                       if (zvol_request_sync || need_sync ||
+                           taskq_dispatch(zvol_taskq, zvol_discard, zvr,
+                           TQ_SLEEP) == TASKQID_INVALID)
                                zvol_discard(zvr);
                } else {
-                       if (zvol_request_sync || taskq_dispatch(zvol_taskq,
-                           zvol_write, zvr, TQ_SLEEP) == TASKQID_INVALID)
+                       if (zvol_request_sync || need_sync ||
+                           taskq_dispatch(zvol_taskq, zvol_write, zvr,
+                           TQ_SLEEP) == TASKQID_INVALID)
                                zvol_write(zvr);
                }
        } else {
@@ -1030,8 +1048,6 @@ zvol_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio)
 
        zgd = (zgd_t *)kmem_zalloc(sizeof (zgd_t), KM_SLEEP);
        zgd->zgd_zilog = zv->zv_zilog;
-       zgd->zgd_rl = zfs_range_lock(&zv->zv_range_lock, offset, size,
-           RL_READER);
 
        /*
         * Write records come in two flavors: immediate and indirect.
@@ -1041,11 +1057,21 @@ zvol_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio)
         * we don't have to write the data twice.
         */
        if (buf != NULL) { /* immediate write */
+               zgd->zgd_rl = zfs_range_lock(&zv->zv_range_lock, offset, size,
+                   RL_READER);
                error = dmu_read_by_dnode(zv->zv_dn, offset, size, buf,
                    DMU_READ_NO_PREFETCH);
-       } else {
+       } else { /* indirect write */
+               /*
+                * Have to lock the whole block to ensure when it's written out
+                * and its checksum is being calculated that no one can change
+                * the data. Contrarily to zfs_get_data we need not re-check
+                * blocksize after we get the lock because it cannot be changed.
+                */
                size = zv->zv_volblocksize;
                offset = P2ALIGN_TYPED(offset, size, uint64_t);
+               zgd->zgd_rl = zfs_range_lock(&zv->zv_range_lock, offset, size,
+                   RL_READER);
                error = dmu_buf_hold_by_dnode(zv->zv_dn, offset, zgd, &db,
                    DMU_READ_NO_PREFETCH);
                if (error == 0) {
index cb8e2841f6780202463574f0063054299e36672d..0cbbfa03e520a0216f0aae3bf21b93e1cc74743a 100644 (file)
@@ -594,7 +594,7 @@ tests = ['zvol_cli_001_pos', 'zvol_cli_002_pos', 'zvol_cli_003_neg']
 [tests/functional/zvol/zvol_misc]
 tests = ['zvol_misc_001_neg', 'zvol_misc_002_pos', 'zvol_misc_003_neg',
     'zvol_misc_004_pos', 'zvol_misc_005_neg', 'zvol_misc_006_pos',
-    'zvol_misc_snapdev', 'zvol_misc_volmode']
+    'zvol_misc_snapdev', 'zvol_misc_volmode', 'zvol_misc_zil']
 
 [tests/functional/zvol/zvol_swap]
 tests = ['zvol_swap_001_pos', 'zvol_swap_002_pos', 'zvol_swap_003_pos',
index 30ef9acbd0024e864271b0f1027013e3eca012ca..8859ff3598dba69035c0950f1aac05e5b95b3777 100644 (file)
@@ -10,4 +10,5 @@ dist_pkgdata_SCRIPTS = \
        zvol_misc_005_neg.ksh \
        zvol_misc_006_pos.ksh \
        zvol_misc_snapdev.ksh \
-       zvol_misc_volmode.ksh
+       zvol_misc_volmode.ksh \
+       zvol_misc_zil.ksh
diff --git a/tests/zfs-tests/tests/functional/zvol/zvol_misc/zvol_misc_zil.ksh b/tests/zfs-tests/tests/functional/zvol/zvol_misc/zvol_misc_zil.ksh
new file mode 100755 (executable)
index 0000000..b8989f4
--- /dev/null
@@ -0,0 +1,74 @@
+#!/bin/ksh -p
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright 2017, loli10K <ezomori.nozomu@gmail.com>. All rights reserved.
+#
+
+. $STF_SUITE/include/libtest.shlib
+. $STF_SUITE/tests/functional/zvol/zvol_common.shlib
+. $STF_SUITE/tests/functional/zvol/zvol_misc/zvol_misc_common.kshlib
+
+#
+# DESCRIPTION:
+# Verify ZIL functionality on ZVOLs
+#
+# STRATEGY:
+# 1. Create a ZVOLs with various combination of "logbias" and "sync" values
+# 2. Write data to ZVOL device node
+# 3. Verify we don't trigger any issue like the one reported in #6238
+#
+
+verify_runnable "global"
+
+function cleanup
+{
+       datasetexists $ZVOL && log_must_busy zfs destroy $ZVOL
+       udev_wait
+}
+
+log_assert "Verify ZIL functionality on ZVOLs"
+log_onexit cleanup
+
+ZVOL="$TESTPOOL/vol"
+ZDEV="$ZVOL_DEVDIR/$ZVOL"
+typeset -a logbias_prop_vals=('latency' 'throughput')
+typeset -a sync_prop_vals=('standard' 'always' 'disabled')
+
+for logbias in ${logbias_prop_vals[@]}; do
+       for sync in ${sync_prop_vals[@]}; do
+               # 1. Create a ZVOL with logbias=throughput and sync=always
+               log_must zfs create -V $VOLSIZE -b 128K -o sync=$sync \
+                   -o logbias=$logbias $ZVOL
+
+               # 2. Write data to its device node
+               for i in {1..50}; do
+                       dd if=/dev/zero of=$ZDEV bs=8k count=1 &
+               done
+
+               # 3. Verify we don't trigger any issue
+               log_must wait
+               log_must_busy zfs destroy $ZVOL
+       done
+done
+
+log_pass "ZIL functionality works on ZVOLs"