granicus.if.org Git - zfs/commitdiff
Skip spurious resilver IO on raidz vdev
author Isaac Huang <he.huang@intel.com>
Sat, 13 May 2017 00:28:03 +0000 (18:28 -0600)
committer Brian Behlendorf <behlendorf1@llnl.gov>
Sat, 13 May 2017 00:28:03 +0000 (17:28 -0700)
On a raidz vdev, a block that does not span all child vdevs, excluding
its skip sectors if any, may not be affected by a child vdev outage or
failure. In such cases, the block does not need to be resilvered.
However, the current resilver algorithm simply resilvers all blocks on a
degraded raidz vdev. Such spurious IO is not only wasteful, but also
adds the risk of overwriting good data.

This patch eliminates such spurious IOs.

Reviewed-by: Gvozden Neskovic <neskovic@gmail.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Matthew Ahrens <mahrens@delphix.com>
Signed-off-by: Isaac Huang <he.huang@intel.com>
Closes #5316

include/sys/vdev.h
include/sys/vdev_impl.h
module/zfs/dsl_scan.c
module/zfs/vdev.c
module/zfs/vdev_disk.c
module/zfs/vdev_file.c
module/zfs/vdev_mirror.c
module/zfs/vdev_missing.c
module/zfs/vdev_raidz.c
module/zfs/vdev_root.c

index 4f54b1707c54383f6e6f34e16aa527a54861afa2..63b4904c5e1a8c0b4768eab16fa575417fb0bed1 100644 (file)
@@ -65,6 +65,7 @@ extern void vdev_dtl_dirty(vdev_t *vd, vdev_dtl_type_t d,
 extern boolean_t vdev_dtl_contains(vdev_t *vd, vdev_dtl_type_t d,
     uint64_t txg, uint64_t size);
 extern boolean_t vdev_dtl_empty(vdev_t *vd, vdev_dtl_type_t d);
+extern boolean_t vdev_dtl_need_resilver(vdev_t *vd, uint64_t off, size_t size);
 extern void vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg,
     int scrub_done);
 extern boolean_t vdev_dtl_required(vdev_t *vd);
index c9e9bede972811b5242fd8f8bfd697ccc80079eb..835d2dbbf24e4d7ef3cfc76b4230b8ea6f092189 100644 (file)
@@ -68,6 +68,7 @@ typedef uint64_t vdev_asize_func_t(vdev_t *vd, uint64_t psize);
 typedef void   vdev_io_start_func_t(zio_t *zio);
 typedef void   vdev_io_done_func_t(zio_t *zio);
 typedef void   vdev_state_change_func_t(vdev_t *vd, int, int);
+typedef boolean_t vdev_need_resilver_func_t(vdev_t *vd, uint64_t, size_t);
 typedef void   vdev_hold_func_t(vdev_t *vd);
 typedef void   vdev_rele_func_t(vdev_t *vd);
 
@@ -78,6 +79,7 @@ typedef const struct vdev_ops {
        vdev_io_start_func_t            *vdev_op_io_start;
        vdev_io_done_func_t             *vdev_op_io_done;
        vdev_state_change_func_t        *vdev_op_state_change;
+       vdev_need_resilver_func_t       *vdev_op_need_resilver;
        vdev_hold_func_t                *vdev_op_hold;
        vdev_rele_func_t                *vdev_op_rele;
        char                            vdev_op_type[16];
index f5ef2268d2fda109879d3d7f54fb55ff0832d926..5b52681d8d8e13d59c0f3c625b406b8793e562a2 100644 (file)
@@ -1836,12 +1836,51 @@ dsl_scan_scrub_done(zio_t *zio)
        mutex_exit(&spa->spa_scrub_lock);
 }
 
+static boolean_t
+dsl_scan_need_resilver(spa_t *spa, const dva_t *dva, size_t psize,
+    uint64_t phys_birth)
+{
+       vdev_t *vd;
+
+       if (DVA_GET_GANG(dva)) {
+               /*
+                * Gang members may be spread across multiple vdevs,
+                * so this DVA alone cannot rule the block out; the
+                * best estimate we have is the scrub range, which
+                * has already been checked.  Always request IO.
+                * XXX -- it would be better to change our allocation
+                * policy so all gang members reside on the same vdev.
+                */
+               return (B_TRUE);
+       }
+
+       vd = vdev_lookup_top(spa, DVA_GET_VDEV(dva));
+
+       /*
+        * Check if the block's physical birth txg falls within the
+        * top-level vdev's DTL_PARTIAL range; if not, skip this DVA.
+        */
+       if (!vdev_dtl_contains(vd, DTL_PARTIAL, phys_birth, 1))
+               return (B_FALSE);
+
+       /*
+        * Ask the top-level vdev whether this offset must be resilvered.
+        * When the offset does not intersect with a dirty leaf DTL
+        * then it may be possible to skip the resilver IO.  The psize
+        * is passed instead of asize to simplify the check for RAIDZ.
+        */
+       if (!vdev_dtl_need_resilver(vd, DVA_GET_OFFSET(dva), psize))
+               return (B_FALSE);
+
+       return (B_TRUE);
+}
+
 static int
 dsl_scan_scrub_cb(dsl_pool_t *dp,
     const blkptr_t *bp, const zbookmark_phys_t *zb)
 {
        dsl_scan_t *scn = dp->dp_scan;
-       size_t size = BP_GET_PSIZE(bp);
+       size_t psize = BP_GET_PSIZE(bp);
        spa_t *spa = dp->dp_spa;
        uint64_t phys_birth = BP_PHYSICAL_BIRTH(bp);
        boolean_t needs_io = B_FALSE;
@@ -1875,33 +1914,19 @@ dsl_scan_scrub_cb(dsl_pool_t *dp,
                zio_flags |= ZIO_FLAG_SPECULATIVE;
 
        for (d = 0; d < BP_GET_NDVAS(bp); d++) {
-               vdev_t *vd = vdev_lookup_top(spa,
-                   DVA_GET_VDEV(&bp->blk_dva[d]));
+               const dva_t *dva = &bp->blk_dva[d];
 
                /*
                 * Keep track of how much data we've examined so that
                 * zpool(1M) status can make useful progress reports.
                 */
-               scn->scn_phys.scn_examined += DVA_GET_ASIZE(&bp->blk_dva[d]);
-               spa->spa_scan_pass_exam += DVA_GET_ASIZE(&bp->blk_dva[d]);
+               scn->scn_phys.scn_examined += DVA_GET_ASIZE(dva);
+               spa->spa_scan_pass_exam += DVA_GET_ASIZE(dva);
 
                /* if it's a resilver, this may not be in the target range */
-               if (!needs_io) {
-                       if (DVA_GET_GANG(&bp->blk_dva[d])) {
-                               /*
-                                * Gang members may be spread across multiple
-                                * vdevs, so the best estimate we have is the
-                                * scrub range, which has already been checked.
-                                * XXX -- it would be better to change our
-                                * allocation policy to ensure that all
-                                * gang members reside on the same vdev.
-                                */
-                               needs_io = B_TRUE;
-                       } else {
-                               needs_io = vdev_dtl_contains(vd, DTL_PARTIAL,
-                                   phys_birth, 1);
-                       }
-               }
+               if (!needs_io)
+                       needs_io = dsl_scan_need_resilver(spa, dva, psize,
+                           phys_birth);
        }
 
        if (needs_io && !zfs_no_scrub_io) {
@@ -1922,8 +1947,9 @@ dsl_scan_scrub_cb(dsl_pool_t *dp,
                        delay(scan_delay);
 
                zio_nowait(zio_read(NULL, spa, bp,
-                   abd_alloc_for_io(size, B_FALSE), size, dsl_scan_scrub_done,
-                   NULL, ZIO_PRIORITY_SCRUB, zio_flags, zb));
+                   abd_alloc_for_io(psize, B_FALSE),
+                   psize, dsl_scan_scrub_done, NULL,
+                   ZIO_PRIORITY_SCRUB, zio_flags, zb));
        }
 
        /* do not relocate this block */
index a71e678bbea9e61777807b2db4c342fce53cd96e..f44d338ef87cc440ecca64981790e50e0fb1bf02 100644 (file)
@@ -1819,6 +1819,21 @@ vdev_dtl_empty(vdev_t *vd, vdev_dtl_type_t t)
        return (empty);
 }
 
+/*
+ * Returns B_FALSE only if vd's need_resilver op says the block can be skipped.
+ */
+boolean_t
+vdev_dtl_need_resilver(vdev_t *vd, uint64_t offset, size_t psize)
+{
+       ASSERT(vd != vd->vdev_spa->spa_root_vdev);
+
+       if (vd->vdev_ops->vdev_op_need_resilver == NULL ||
+           vd->vdev_ops->vdev_op_leaf)
+               return (B_TRUE);
+
+       return (vd->vdev_ops->vdev_op_need_resilver(vd, offset, psize));
+}
+
 /*
  * Returns the lowest txg in the DTL range.
  */
index 33b7f5d1550a77e9ba04e31e32169d8bfc95a638..3fdf5642b104ce54d20070c8fd6e37d365ec1e8d 100644 (file)
@@ -796,6 +796,7 @@ vdev_ops_t vdev_disk_ops = {
        vdev_disk_io_start,
        vdev_disk_io_done,
        NULL,
+       NULL,
        vdev_disk_hold,
        vdev_disk_rele,
        VDEV_TYPE_DISK,         /* name of this vdev type */
index c5e64520d3c181a0d9263510d52467cc66756f3b..13c32e0836f5b94ff7ff11802bf6bf1cd0203993 100644 (file)
@@ -250,6 +250,7 @@ vdev_ops_t vdev_file_ops = {
        vdev_file_io_start,
        vdev_file_io_done,
        NULL,
+       NULL,
        vdev_file_hold,
        vdev_file_rele,
        VDEV_TYPE_FILE,         /* name of this vdev type */
@@ -283,6 +284,7 @@ vdev_ops_t vdev_disk_ops = {
        vdev_file_io_start,
        vdev_file_io_done,
        NULL,
+       NULL,
        vdev_file_hold,
        vdev_file_rele,
        VDEV_TYPE_DISK,         /* name of this vdev type */
index 256431e6b334974924edec9a86057bf8c4278671..15d1f204ffed73d47a3536d10dffc53603720b41 100644 (file)
@@ -615,6 +615,7 @@ vdev_ops_t vdev_mirror_ops = {
        vdev_mirror_state_change,
        NULL,
        NULL,
+       NULL,
        VDEV_TYPE_MIRROR,       /* name of this vdev type */
        B_FALSE                 /* not a leaf vdev */
 };
@@ -628,6 +629,7 @@ vdev_ops_t vdev_replacing_ops = {
        vdev_mirror_state_change,
        NULL,
        NULL,
+       NULL,
        VDEV_TYPE_REPLACING,    /* name of this vdev type */
        B_FALSE                 /* not a leaf vdev */
 };
@@ -641,6 +643,7 @@ vdev_ops_t vdev_spare_ops = {
        vdev_mirror_state_change,
        NULL,
        NULL,
+       NULL,
        VDEV_TYPE_SPARE,        /* name of this vdev type */
        B_FALSE                 /* not a leaf vdev */
 };
index 228757334234d241f980058397438d3a80716dcf..d7d017fb8fbe34e9a8f7e78e547215d016c0cf05 100644 (file)
@@ -88,6 +88,7 @@ vdev_ops_t vdev_missing_ops = {
        NULL,
        NULL,
        NULL,
+       NULL,
        VDEV_TYPE_MISSING,      /* name of this vdev type */
        B_TRUE                  /* leaf vdev */
 };
@@ -101,6 +102,7 @@ vdev_ops_t vdev_hole_ops = {
        NULL,
        NULL,
        NULL,
+       NULL,
        VDEV_TYPE_HOLE,         /* name of this vdev type */
        B_TRUE                  /* leaf vdev */
 };
index c073f1374fa591b49a84399627c9ae164c6f077b..ba850b4f83d814648f5834f40dad3768236aef02 100644 (file)
@@ -330,18 +330,18 @@ static const zio_vsd_ops_t vdev_raidz_vsd_ops = {
  * is this functions only caller, as small as possible on the stack.
  */
 noinline raidz_map_t *
-vdev_raidz_map_alloc(zio_t *zio, uint64_t unit_shift, uint64_t dcols,
+vdev_raidz_map_alloc(zio_t *zio, uint64_t ashift, uint64_t dcols,
     uint64_t nparity)
 {
        raidz_map_t *rm;
        /* The starting RAIDZ (parent) vdev sector of the block. */
-       uint64_t b = zio->io_offset >> unit_shift;
+       uint64_t b = zio->io_offset >> ashift;
        /* The zio's size in units of the vdev's minimum sector size. */
-       uint64_t s = zio->io_size >> unit_shift;
+       uint64_t s = zio->io_size >> ashift;
        /* The first column for this stripe. */
        uint64_t f = b % dcols;
        /* The starting byte offset on each child vdev. */
-       uint64_t o = (b / dcols) << unit_shift;
+       uint64_t o = (b / dcols) << ashift;
        uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot;
        uint64_t off = 0;
 
@@ -400,7 +400,7 @@ vdev_raidz_map_alloc(zio_t *zio, uint64_t unit_shift, uint64_t dcols,
                coff = o;
                if (col >= dcols) {
                        col -= dcols;
-                       coff += 1ULL << unit_shift;
+                       coff += 1ULL << ashift;
                }
                rm->rm_col[c].rc_devidx = col;
                rm->rm_col[c].rc_offset = coff;
@@ -413,17 +413,17 @@ vdev_raidz_map_alloc(zio_t *zio, uint64_t unit_shift, uint64_t dcols,
                if (c >= acols)
                        rm->rm_col[c].rc_size = 0;
                else if (c < bc)
-                       rm->rm_col[c].rc_size = (q + 1) << unit_shift;
+                       rm->rm_col[c].rc_size = (q + 1) << ashift;
                else
-                       rm->rm_col[c].rc_size = q << unit_shift;
+                       rm->rm_col[c].rc_size = q << ashift;
 
                asize += rm->rm_col[c].rc_size;
        }
 
-       ASSERT3U(asize, ==, tot << unit_shift);
-       rm->rm_asize = roundup(asize, (nparity + 1) << unit_shift);
+       ASSERT3U(asize, ==, tot << ashift);
+       rm->rm_asize = roundup(asize, (nparity + 1) << ashift);
        rm->rm_nskip = roundup(tot, nparity + 1) - tot;
-       ASSERT3U(rm->rm_asize - asize, ==, rm->rm_nskip << unit_shift);
+       ASSERT3U(rm->rm_asize - asize, ==, rm->rm_nskip << ashift);
        ASSERT3U(rm->rm_nskip, <=, nparity);
 
        for (c = 0; c < rm->rm_firstdatacol; c++)
@@ -2299,6 +2299,44 @@ vdev_raidz_state_change(vdev_t *vd, int faulted, int degraded)
                vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE);
 }
 
+/*
+ * Determine if any portion of the provided block resides on a child vdev
+ * with a dirty DTL and therefore needs to be resilvered.  The function
+ * assumes that at least one DTL is dirty, which implies that blocks
+ * spanning the full stripe width (s + nparity >= dcols) must be resilvered.
+ */
+static boolean_t
+vdev_raidz_need_resilver(vdev_t *vd, uint64_t offset, size_t psize)
+{
+       uint64_t dcols = vd->vdev_children;
+       uint64_t nparity = vd->vdev_nparity;
+       uint64_t ashift = vd->vdev_top->vdev_ashift;
+       /* The starting RAIDZ (parent) vdev sector of the block. */
+       uint64_t b = offset >> ashift;
+       /* The block's psize in units of the vdev's sector size, rounded up. */
+       uint64_t s = ((psize - 1) >> ashift) + 1;
+       /* The first column (child index) touched by this block. */
+       uint64_t f = b % dcols;
+
+       if (s + nparity >= dcols)
+               return (B_TRUE);
+
+       for (uint64_t c = 0; c < s + nparity; c++) {
+               uint64_t devidx = (f + c) % dcols;
+               vdev_t *cvd = vd->vdev_child[devidx];
+
+               /*
+                * dsl_scan_need_resilver() already checked vd with
+                * vdev_dtl_contains(), so here just check cvd with
+                * vdev_dtl_empty(): cheaper and a good approximation.
+                */
+               if (!vdev_dtl_empty(cvd, DTL_PARTIAL))
+                       return (B_TRUE);
+       }
+
+       return (B_FALSE);
+}
+
 vdev_ops_t vdev_raidz_ops = {
        vdev_raidz_open,
        vdev_raidz_close,
@@ -2306,6 +2344,7 @@ vdev_ops_t vdev_raidz_ops = {
        vdev_raidz_io_start,
        vdev_raidz_io_done,
        vdev_raidz_state_change,
+       vdev_raidz_need_resilver,
        NULL,
        NULL,
        VDEV_TYPE_RAIDZ,        /* name of this vdev type */
index 90250b0fb99c7c2b2a5ff7f695411ced4a6f8aee..6b456dd2bde087c07c12dd24e5315671d08e36fc 100644 (file)
@@ -120,6 +120,7 @@ vdev_ops_t vdev_root_ops = {
        vdev_root_state_change,
        NULL,
        NULL,
+       NULL,
        VDEV_TYPE_ROOT,         /* name of this vdev type */
        B_FALSE                 /* not a leaf vdev */
 };