]> granicus.if.org Git - zfs/commitdiff
OpenZFS 8378 - crash due to bp in-memory modification of nopwrite block
authorMatthew Ahrens <mahrens@delphix.com>
Fri, 14 Apr 2017 19:59:18 +0000 (12:59 -0700)
committerBrian Behlendorf <behlendorf1@llnl.gov>
Tue, 4 Jul 2017 22:41:24 +0000 (15:41 -0700)
Authored by: Matthew Ahrens <mahrens@delphix.com>
Reviewed by: Prakash Surya <prakash.surya@delphix.com>
Reviewed by: George Wilson <george.wilson@delphix.com>
Approved by: Robert Mustacchi <rm@joyent.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Ported-by: Giuseppe Di Natale <dinatale2@llnl.gov>
The problem is that zfs_get_data() supplies a stale zgd_bp to
dmu_sync(), which we then nopwrite against.
zfs_get_data() doesn't hold any DMU-related locks, so after it
copies db_blkptr to zgd_bp, dbuf_write_ready() could change
db_blkptr, and dbuf_write_done() could remove the dirty record.
dmu_sync() then sees the stale BP and that the dbuf it not dirty,
so it is eligible for nop-writing.
The fix is for dmu_sync() to copy db_blkptr to zgd_bp after
acquiring the db_mtx. We could still see a stale db_blkptr,
but if it is stale then the dirty record will still exist and
thus we won't attempt to nopwrite.

OpenZFS-issue: https://www.illumos.org/issues/8378
OpenZFS-commit: https://github.com/openzfs/openzfs/commit/3127742
Closes #6293

cmd/ztest/ztest.c
module/zfs/dmu.c
module/zfs/zfs_vnops.c
module/zfs/zvol.c

index b9eef07e1cbe504bed30f6dccc557f5acbc30623..d698628ded436979c806f5197c01c300852f4c48 100644 (file)
@@ -2099,7 +2099,6 @@ ztest_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio)
        uint64_t object = lr->lr_foid;
        uint64_t offset = lr->lr_offset;
        uint64_t size = lr->lr_length;
-       blkptr_t *bp = &lr->lr_blkptr;
        uint64_t txg = lr->lr_common.lrc_txg;
        uint64_t crtxg;
        dmu_object_info_t doi;
@@ -2157,11 +2156,7 @@ ztest_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio)
                    DMU_READ_NO_PREFETCH);
 
                if (error == 0) {
-                       blkptr_t *obp = dmu_buf_get_blkptr(db);
-                       if (obp) {
-                               ASSERT(BP_IS_HOLE(bp));
-                               *bp = *obp;
-                       }
+                       blkptr_t *bp = &lr->lr_blkptr;
 
                        zgd->zgd_db = db;
                        zgd->zgd_bp = bp;
index 48e89eef4af31d8a22d5bdbea29910e9bae451d7..717bd121fc811af71150998f1e0ca9b57309df42 100644 (file)
@@ -1561,13 +1561,14 @@ dmu_sync_done(zio_t *zio, arc_buf_t *buf, void *varg)
        if (zio->io_error == 0) {
                dr->dt.dl.dr_nopwrite = !!(zio->io_flags & ZIO_FLAG_NOPWRITE);
                if (dr->dt.dl.dr_nopwrite) {
-                       ASSERTV(blkptr_t *bp = zio->io_bp);
-                       ASSERTV(blkptr_t *bp_orig = &zio->io_bp_orig);
-                       ASSERTV(uint8_t chksum = BP_GET_CHECKSUM(bp_orig));
+                       blkptr_t *bp = zio->io_bp;
+                       blkptr_t *bp_orig = &zio->io_bp_orig;
+                       uint8_t chksum = BP_GET_CHECKSUM(bp_orig);
 
                        ASSERT(BP_EQUAL(bp, bp_orig));
+                       VERIFY(BP_EQUAL(bp, db->db_blkptr));
                        ASSERT(zio->io_prop.zp_compress != ZIO_COMPRESS_OFF);
-                       ASSERT(zio_checksum_table[chksum].ci_flags &
+                       VERIFY(zio_checksum_table[chksum].ci_flags &
                            ZCHECKSUM_FLAG_NOPWRITE);
                }
                dr->dt.dl.dr_overridden_by = *zio->io_bp;
@@ -1606,19 +1607,11 @@ dmu_sync_late_arrival_done(zio_t *zio)
        ASSERTV(blkptr_t *bp_orig = &zio->io_bp_orig);
 
        if (zio->io_error == 0 && !BP_IS_HOLE(bp)) {
-               /*
-                * If we didn't allocate a new block (i.e. ZIO_FLAG_NOPWRITE)
-                * then there is nothing to do here. Otherwise, free the
-                * newly allocated block in this txg.
-                */
-               if (zio->io_flags & ZIO_FLAG_NOPWRITE) {
-                       ASSERT(BP_EQUAL(bp, bp_orig));
-               } else {
-                       ASSERT(BP_IS_HOLE(bp_orig) || !BP_EQUAL(bp, bp_orig));
-                       ASSERT(zio->io_bp->blk_birth == zio->io_txg);
-                       ASSERT(zio->io_txg > spa_syncing_txg(zio->io_spa));
-                       zio_free(zio->io_spa, zio->io_txg, zio->io_bp);
-               }
+               ASSERT(!(zio->io_flags & ZIO_FLAG_NOPWRITE));
+               ASSERT(BP_IS_HOLE(bp_orig) || !BP_EQUAL(bp, bp_orig));
+               ASSERT(zio->io_bp->blk_birth == zio->io_txg);
+               ASSERT(zio->io_txg > spa_syncing_txg(zio->io_spa));
+               zio_free(zio->io_spa, zio->io_txg, zio->io_bp);
        }
 
        dmu_tx_commit(dsa->dsa_tx);
@@ -1650,6 +1643,29 @@ dmu_sync_late_arrival(zio_t *pio, objset_t *os, dmu_sync_cb_t *done, zgd_t *zgd,
        dsa->dsa_zgd = zgd;
        dsa->dsa_tx = tx;
 
+       /*
+        * Since we are currently syncing this txg, it's nontrivial to
+        * determine what BP to nopwrite against, so we disable nopwrite.
+        *
+        * When syncing, the db_blkptr is initially the BP of the previous
+        * txg.  We can not nopwrite against it because it will be changed
+        * (this is similar to the non-late-arrival case where the dbuf is
+        * dirty in a future txg).
+        *
+        * Then dbuf_write_ready() sets bp_blkptr to the location we will write.
+        * We can not nopwrite against it because although the BP will not
+        * (typically) be changed, the data has not yet been persisted to this
+        * location.
+        *
+        * Finally, when dbuf_write_done() is called, it is theoretically
+        * possible to always nopwrite, because the data that was written in
+        * this txg is the same data that we are trying to write.  However we
+        * would need to check that this dbuf is not dirty in any future
+        * txg's (as we do in the normal dmu_sync() path). For simplicity, we
+        * don't nopwrite in this case.
+        */
+       zp->zp_nopwrite = B_FALSE;
+
        zio_nowait(zio_write(pio, os->os_spa, dmu_tx_get_txg(tx), zgd->zgd_bp,
            abd_get_from_buf(zgd->zgd_db->db_data, zgd->zgd_db->db_size),
            zgd->zgd_db->db_size, zgd->zgd_db->db_size, zp,
@@ -1687,7 +1703,6 @@ dmu_sync_late_arrival(zio_t *pio, objset_t *os, dmu_sync_cb_t *done, zgd_t *zgd,
 int
 dmu_sync(zio_t *pio, uint64_t txg, dmu_sync_cb_t *done, zgd_t *zgd)
 {
-       blkptr_t *bp = zgd->zgd_bp;
        dmu_buf_impl_t *db = (dmu_buf_impl_t *)zgd->zgd_db;
        objset_t *os = db->db_objset;
        dsl_dataset_t *ds = os->os_dsl_dataset;
@@ -1754,6 +1769,21 @@ dmu_sync(zio_t *pio, uint64_t txg, dmu_sync_cb_t *done, zgd_t *zgd)
 
        ASSERT(dr->dr_next == NULL || dr->dr_next->dr_txg < txg);
 
+       if (db->db_blkptr != NULL) {
+               /*
+                * We need to fill in zgd_bp with the current blkptr so that
+                * the nopwrite code can check if we're writing the same
+                * data that's already on disk.  We can only nopwrite if we
+                * are sure that after making the copy, db_blkptr will not
+                * change until our i/o completes.  We ensure this by
+                * holding the db_mtx, and only allowing nopwrite if the
+                * block is not already dirty (see below).  This is verified
+                * by dmu_sync_done(), which VERIFYs that the db_blkptr has
+                * not changed.
+                */
+               *zgd->zgd_bp = *db->db_blkptr;
+       }
+
        /*
         * Assume the on-disk data is X, the current syncing data (in
         * txg - 1) is Y, and the current in-memory data is Z (currently
@@ -1805,7 +1835,7 @@ dmu_sync(zio_t *pio, uint64_t txg, dmu_sync_cb_t *done, zgd_t *zgd)
        dsa->dsa_tx = NULL;
 
        zio_nowait(arc_write(pio, os->os_spa, txg,
-           bp, dr->dt.dl.dr_data, DBUF_IS_L2CACHEABLE(db),
+           zgd->zgd_bp, dr->dt.dl.dr_data, DBUF_IS_L2CACHEABLE(db),
            &zp, dmu_sync_ready, NULL, NULL, dmu_sync_done, dsa,
            ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL, &zb));
 
index c065c0c59b18023863ee869750c6647b769116cb..d415e80247be26357ba8c12d50207a82b6212fb1 100644 (file)
@@ -1002,7 +1002,6 @@ zfs_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio)
        uint64_t object = lr->lr_foid;
        uint64_t offset = lr->lr_offset;
        uint64_t size = lr->lr_length;
-       blkptr_t *bp = &lr->lr_blkptr;
        dmu_buf_t *db;
        zgd_t *zgd;
        int error = 0;
@@ -1079,11 +1078,7 @@ zfs_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio)
                            DMU_READ_NO_PREFETCH);
 
                if (error == 0) {
-                       blkptr_t *obp = dmu_buf_get_blkptr(db);
-                       if (obp) {
-                               ASSERT(BP_IS_HOLE(bp));
-                               *bp = *obp;
-                       }
+                       blkptr_t *bp = &lr->lr_blkptr;
 
                        zgd->zgd_db = db;
                        zgd->zgd_bp = bp;
index 528acbc227f334220ec1a34bad32cfa9d6b10c91..3e9f004ef3d78cd21a5f8b1debfa6ea26c60f329 100644 (file)
@@ -36,6 +36,7 @@
  *
  * Copyright 2014 Nexenta Systems, Inc.  All rights reserved.
  * Copyright (c) 2016 Actifio, Inc. All rights reserved.
+ * Copyright (c) 2012, 2017 by Delphix. All rights reserved.
  */
 
 /*
@@ -1018,7 +1019,6 @@ zvol_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio)
        zvol_state_t *zv = arg;
        uint64_t offset = lr->lr_offset;
        uint64_t size = lr->lr_length;
-       blkptr_t *bp = &lr->lr_blkptr;
        dmu_buf_t *db;
        zgd_t *zgd;
        int error;
@@ -1047,14 +1047,10 @@ zvol_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio)
                error = dmu_buf_hold_by_dnode(zv->zv_dn, offset, zgd, &db,
                    DMU_READ_NO_PREFETCH);
                if (error == 0) {
-                       blkptr_t *obp = dmu_buf_get_blkptr(db);
-                       if (obp) {
-                               ASSERT(BP_IS_HOLE(bp));
-                               *bp = *obp;
-                       }
+                       blkptr_t *bp = &lr->lr_blkptr;
 
                        zgd->zgd_db = db;
-                       zgd->zgd_bp = &lr->lr_blkptr;
+                       zgd->zgd_bp = bp;
 
                        ASSERT(db != NULL);
                        ASSERT(db->db_offset == offset);