OpenZFS 9290 - device removal reduces redundancy of mirrors

author Matthew Ahrens <mahrens@delphix.com>

Tue, 13 Feb 2018 19:37:56 +0000 (11:37 -0800)

committer Brian Behlendorf <behlendorf1@llnl.gov>

Sat, 14 Apr 2018 19:21:39 +0000 (12:21 -0700)
author Matthew Ahrens <mahrens@delphix.com>
Tue, 13 Feb 2018 19:37:56 +0000 (11:37 -0800)
committer Brian Behlendorf <behlendorf1@llnl.gov>
Sat, 14 Apr 2018 19:21:39 +0000 (12:21 -0700)
diff --git a/cmd/zdb/zdb.c b/cmd/zdb/zdb.c

index 02fb55944e0575cb32dccd860b97e2069766d8ac..852f853559c7c37a575de590c092aa77b75b0429 100644 (file)
--- a/cmd/zdb/zdb.c
+++ b/cmd/zdb/zdb.c
@@ -3406,7 +3406,7 @@ zdb_claim_removing(spa_t *spa, zdb_cb_t *zcb)
         spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
  
         spa_vdev_removal_t *svr = spa->spa_vdev_removal;
-       vdev_t *vd = svr->svr_vdev;
+       vdev_t *vd = vdev_lookup_top(spa, svr->svr_vdev_id);
         vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;
  
         for (uint64_t msi = 0; msi < vd->vdev_ms_count; msi++) {
@@ -3422,13 +3422,17 @@ zdb_claim_removing(spa_t *spa, zdb_cb_t *zcb)
                             svr->svr_allocd_segs, SM_ALLOC));
  
                         /*
-                        * Clear everything past what has been synced,
-                        * because we have not allocated mappings for it yet.
+                        * Clear everything past what has been synced unless
+                        * it's past the spacemap, because we have not allocated
+                        * mappings for it yet.
                          */
-                       range_tree_clear(svr->svr_allocd_segs,
-                           vdev_indirect_mapping_max_offset(vim),
-                           msp->ms_sm->sm_start + msp->ms_sm->sm_size -
-                           vdev_indirect_mapping_max_offset(vim));
+                       uint64_t vim_max_offset =
+                           vdev_indirect_mapping_max_offset(vim);
+                       uint64_t sm_end = msp->ms_sm->sm_start +
+                           msp->ms_sm->sm_size;
+                       if (sm_end > vim_max_offset)
+                               range_tree_clear(svr->svr_allocd_segs,
+                                   vim_max_offset, sm_end - vim_max_offset);
                 }
  
                 zcb->zcb_removing_size +=
diff --git a/cmd/ztest/ztest.c b/cmd/ztest/ztest.c

index 0e3459d324b4af88a00e97a316f3f2479ac4a063..c81d446a542ffc9aad3985c005ff0010dc943c2d 100644 (file)
--- a/cmd/ztest/ztest.c
+++ b/cmd/ztest/ztest.c
@@ -445,6 +445,7 @@ static spa_t *ztest_spa = NULL;
  static ztest_ds_t *ztest_ds;
  
  static kmutex_t ztest_vdev_lock;
+static boolean_t ztest_device_removal_active = B_FALSE;
  
  /*
   * The ztest_name_lock protects the pool and dataset namespace used by
@@ -3203,7 +3204,7 @@ ztest_vdev_attach_detach(ztest_ds_t *zd, uint64_t id)
          * value.  Don't bother trying to attach while we are in the middle
          * of removal.
          */
-       if (spa->spa_vdev_removal != NULL) {
+       if (ztest_device_removal_active) {
                 spa_config_exit(spa, SCL_ALL, FTAG);
                 mutex_exit(&ztest_vdev_lock);
                 return;
@@ -3375,16 +3376,49 @@ ztest_device_removal(ztest_ds_t *zd, uint64_t id)
         spa_t *spa = ztest_spa;
         vdev_t *vd;
         uint64_t guid;
+       int error;
  
         mutex_enter(&ztest_vdev_lock);
  
+       if (ztest_device_removal_active) {
+               mutex_exit(&ztest_vdev_lock);
+               return;
+       }
+
+       /*
+        * Remove a random top-level vdev and wait for removal to finish.
+        */
         spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
         vd = vdev_lookup_top(spa, ztest_random_vdev_top(spa, B_FALSE));
         guid = vd->vdev_guid;
         spa_config_exit(spa, SCL_VDEV, FTAG);
  
-       (void) spa_vdev_remove(spa, guid, B_FALSE);
+       error = spa_vdev_remove(spa, guid, B_FALSE);
+       if (error == 0) {
+               ztest_device_removal_active = B_TRUE;
+               mutex_exit(&ztest_vdev_lock);
+
+               while (spa->spa_vdev_removal != NULL)
+                       txg_wait_synced(spa_get_dsl(spa), 0);
+       } else {
+               mutex_exit(&ztest_vdev_lock);
+               return;
+       }
  
+       /*
+        * The pool needs to be scrubbed after completing device removal.
+        * Failure to do so may result in checksum errors due to the
+        * strategy employed by ztest_fault_inject() when selecting which
+        * offset are redundant and can be damaged.
+        */
+       error = spa_scan(spa, POOL_SCAN_SCRUB);
+       if (error == 0) {
+               while (dsl_scan_scrubbing(spa_get_dsl(spa)))
+                       txg_wait_synced(spa_get_dsl(spa), 0);
+       }
+
+       mutex_enter(&ztest_vdev_lock);
+       ztest_device_removal_active = B_FALSE;
         mutex_exit(&ztest_vdev_lock);
  }
  
@@ -3524,7 +3558,7 @@ ztest_vdev_LUN_growth(ztest_ds_t *zd, uint64_t id)
          * that the metaslab_class space increased (because it decreases
          * when the device removal completes).
          */
-       if (spa->spa_vdev_removal != NULL) {
+       if (ztest_device_removal_active) {
                 spa_config_exit(spa, SCL_STATE, FTAG);
                 mutex_exit(&ztest_vdev_lock);
                 return;
@@ -5520,6 +5554,18 @@ ztest_fault_inject(ztest_ds_t *zd, uint64_t id)
         pathrand = umem_alloc(MAXPATHLEN, UMEM_NOFAIL);
  
         mutex_enter(&ztest_vdev_lock);
+
+       /*
+        * Device removal is in progress, fault injection must be disabled
+        * until it completes and the pool is scrubbed.  The fault injection
+        * strategy for damaging blocks does not take in to account evacuated
+        * blocks which may have already been damaged.
+        */
+       if (ztest_device_removal_active) {
+               mutex_exit(&ztest_vdev_lock);
+               goto out;
+       }
+
         maxfaults = MAXFAULTS(zs);
         leaves = MAX(zs->zs_mirrors, 1) * ztest_opts.zo_raidz;
         mirror_save = zs->zs_mirrors;
@@ -5875,6 +5921,12 @@ ztest_scrub(ztest_ds_t *zd, uint64_t id)
  {
         spa_t *spa = ztest_spa;
  
+       /*
+        * Scrub in progress by device removal.
+        */
+       if (ztest_device_removal_active)
+               return;
+
         (void) spa_scan(spa, POOL_SCAN_SCRUB);
         (void) poll(NULL, 0, 100); /* wait a moment, then force a restart */
         (void) spa_scan(spa, POOL_SCAN_SCRUB);
diff --git a/include/sys/vdev_removal.h b/include/sys/vdev_removal.h

index 5b1e3056be141fa2fd2f60a5ffcf906e1b63681a..80ee09456a27a69c84ec67740f93899f15a84ef6 100644 (file)
--- a/include/sys/vdev_removal.h
+++ b/include/sys/vdev_removal.h
@@ -30,7 +30,7 @@ extern "C" {
  #endif
  
  typedef struct spa_vdev_removal {
-       vdev_t          *svr_vdev;
+       uint64_t        svr_vdev_id;
         uint64_t        svr_max_offset_to_sync[TXG_SIZE];
         /* Thread performing a vdev removal. */
         kthread_t       *svr_thread;
diff --git a/include/sys/zio.h b/include/sys/zio.h

index a275b16de9cc5d939ab41111f35146673bcc3497..dc6841d839e62e4ad3f5a01ee9ac3cc07166d23c 100644 (file)
--- a/include/sys/zio.h
+++ b/include/sys/zio.h
@@ -600,7 +600,7 @@ extern zio_t *zio_vdev_child_io(zio_t *zio, blkptr_t *bp, vdev_t *vd,
      zio_done_func_t *done, void *private);
  
  extern zio_t *zio_vdev_delegated_io(vdev_t *vd, uint64_t offset,
-    struct abd *data, uint64_t size, int type, zio_priority_t priority,
+    struct abd *data, uint64_t size, zio_type_t type, zio_priority_t priority,
      enum zio_flag flags, zio_done_func_t *done, void *private);
  
  extern void zio_vdev_io_bypass(zio_t *zio);
diff --git a/lib/libzfs/libzfs_pool.c b/lib/libzfs/libzfs_pool.c

index 99b4ea29af86a8c2862b605f0c627ebb2d1c99de..2d94cd3204973ada5df84e7acbc3fc4be32ec2da 100644 (file)
--- a/lib/libzfs/libzfs_pool.c
+++ b/lib/libzfs/libzfs_pool.c
@@ -2879,7 +2879,7 @@ zpool_vdev_attach(zpool_handle_t *zhp,
  
         case EBUSY:
                 zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "%s is busy, "
-                   "or pool has removing/removed vdevs"),
+                   "or device removal is in progress"),
                     new_disk);
                 (void) zfs_error(hdl, EZFS_BADDEV, msg);
                 break;
diff --git a/man/man5/zfs-module-parameters.5 b/man/man5/zfs-module-parameters.5

index 818f26449ce9348c4a2f0ec3ad7cfd2d8edb32de..6e670facb5ce8e62b57b199169a189978fea4466 100644 (file)
--- a/man/man5/zfs-module-parameters.5
+++ b/man/man5/zfs-module-parameters.5
@@ -1739,6 +1739,23 @@ Include cache hits in read history
  Use \fB1\fR for yes and \fB0\fR for no (default).
  .RE
  
+.sp
+.ne 2
+.na
+\fBzfs_reconstruct_indirect_segments_max\fR (int)
+.ad
+.RS 12n
+When a split block which is part of a indirect vdev contains more than this
+many segments, consider it too computationally expensive to check all possible
+combinations.  Instead, operate under the assumption that only a few segment
+copies are damaged and the majority of segment copies are good, in which case
+it is reasonable to randomly select sample combinations.  This allows all the
+segment copies to participate fairly in the reconstruction and prevents the
+repeated use of one bad copy.
+.sp
+Default value: \fB10\fR.
+.RE
+
  .sp
  .ne 2
  .na
diff --git a/module/zfs/dsl_scan.c b/module/zfs/dsl_scan.c

index 53953a6c5851a19dc88bcf9158ef2eda3864cdaa..b87b4d5558f5cbc2c80b73a47c12c83318b72eba 100644 (file)
--- a/module/zfs/dsl_scan.c
+++ b/module/zfs/dsl_scan.c
@@ -2959,6 +2959,19 @@ dsl_scan_need_resilver(spa_t *spa, const dva_t *dva, size_t psize,
  {
         vdev_t *vd;
  
+       vd = vdev_lookup_top(spa, DVA_GET_VDEV(dva));
+
+       if (vd->vdev_ops == &vdev_indirect_ops) {
+               /*
+                * The indirect vdev can point to multiple
+                * vdevs.  For simplicity, always create
+                * the resilver zio_t. zio_vdev_io_start()
+                * will bypass the child resilver i/o's if
+                * they are on vdevs that don't have DTL's.
+                */
+               return (B_TRUE);
+       }
+
         if (DVA_GET_GANG(dva)) {
                 /*
                  * Gang members may be spread across multiple
@@ -2971,8 +2984,6 @@ dsl_scan_need_resilver(spa_t *spa, const dva_t *dva, size_t psize,
                 return (B_TRUE);
         }
  
-       vd = vdev_lookup_top(spa, DVA_GET_VDEV(dva));
-
         /*
          * Check if the txg falls within the range which must be
          * resilvered.  DVAs outside this range can always be skipped.
diff --git a/module/zfs/metaslab.c b/module/zfs/metaslab.c

index 25090f089be587b05f63563f25b0c3d1f5526c4d..52a60cc5eb08b56ee24117b9f9345ed80a6cfd11 100644 (file)
--- a/module/zfs/metaslab.c
+++ b/module/zfs/metaslab.c
@@ -3234,7 +3234,7 @@ metaslab_free_impl(vdev_t *vd, uint64_t offset, uint64_t size,
                 return;
  
         if (spa->spa_vdev_removal != NULL &&
-           spa->spa_vdev_removal->svr_vdev == vd &&
+           spa->spa_vdev_removal->svr_vdev_id == vd->vdev_id &&
             vdev_is_concrete(vd)) {
                 /*
                  * Note: we check if the vdev is concrete because when
diff --git a/module/zfs/spa.c b/module/zfs/spa.c

index 08fc7bbda71d97204c6b0f72bdddf158adce186f..9a5346b42aa96ffd104d27c1b93eb27bcc6596e1 100644 (file)
--- a/module/zfs/spa.c
+++ b/module/zfs/spa.c
@@ -4860,8 +4860,7 @@ spa_vdev_add(spa_t *spa, nvlist_t *nvroot)
                 for (int c = 0; c < vd->vdev_children; c++) {
                         tvd = vd->vdev_child[c];
                         if (spa->spa_vdev_removal != NULL &&
-                           tvd->vdev_ashift !=
-                           spa->spa_vdev_removal->svr_vdev->vdev_ashift) {
+                           tvd->vdev_ashift != spa->spa_max_ashift) {
                                 return (spa_vdev_exit(spa, vd, txg, EINVAL));
                         }
                         /* Fail if top level vdev is raidz */
@@ -4970,10 +4969,8 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing)
  
         oldvd = spa_lookup_by_guid(spa, guid, B_FALSE);
  
-       if (spa->spa_vdev_removal != NULL ||
-           spa->spa_removing_phys.sr_prev_indirect_vdev != -1) {
+       if (spa->spa_vdev_removal != NULL)
                 return (spa_vdev_exit(spa, NULL, txg, EBUSY));
-       }
  
         if (oldvd == NULL)
                 return (spa_vdev_exit(spa, NULL, txg, ENODEV));
diff --git a/module/zfs/spa_misc.c b/module/zfs/spa_misc.c

index 5c6e825671b012d8860bb13182903331c44e7586..54299a7e451663891daafd289972dc9ecdfb88fe 100644 (file)
--- a/module/zfs/spa_misc.c
+++ b/module/zfs/spa_misc.c
@@ -1708,9 +1708,12 @@ spa_update_dspace(spa_t *spa)
                  * allocated twice (on the old device and the new
                  * device).
                  */
-               vdev_t *vd = spa->spa_vdev_removal->svr_vdev;
+               spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
+               vdev_t *vd =
+                   vdev_lookup_top(spa, spa->spa_vdev_removal->svr_vdev_id);
                 spa->spa_dspace -= spa_deflate(spa) ?
                     vd->vdev_stat.vs_dspace : vd->vdev_stat.vs_space;
+               spa_config_exit(spa, SCL_VDEV, FTAG);
         }
  }
  
diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c

index f3e3f90fa6b1bcfc948b3ffde8e457f38de6bef0..0dea0b4e456bf5742543a77d6384542df7df1475 100644 (file)
--- a/module/zfs/vdev.c
+++ b/module/zfs/vdev.c
@@ -857,6 +857,32 @@ vdev_top_transfer(vdev_t *svd, vdev_t *tvd)
         svd->vdev_stat.vs_space = 0;
         svd->vdev_stat.vs_dspace = 0;
  
+       /*
+        * State which may be set on a top-level vdev that's in the
+        * process of being removed.
+        */
+       ASSERT0(tvd->vdev_indirect_config.vic_births_object);
+       ASSERT0(tvd->vdev_indirect_config.vic_mapping_object);
+       ASSERT3U(tvd->vdev_indirect_config.vic_prev_indirect_vdev, ==, -1ULL);
+       ASSERT3P(tvd->vdev_indirect_mapping, ==, NULL);
+       ASSERT3P(tvd->vdev_indirect_births, ==, NULL);
+       ASSERT3P(tvd->vdev_obsolete_sm, ==, NULL);
+       ASSERT0(tvd->vdev_removing);
+       tvd->vdev_removing = svd->vdev_removing;
+       tvd->vdev_indirect_config = svd->vdev_indirect_config;
+       tvd->vdev_indirect_mapping = svd->vdev_indirect_mapping;
+       tvd->vdev_indirect_births = svd->vdev_indirect_births;
+       range_tree_swap(&svd->vdev_obsolete_segments,
+           &tvd->vdev_obsolete_segments);
+       tvd->vdev_obsolete_sm = svd->vdev_obsolete_sm;
+       svd->vdev_indirect_config.vic_mapping_object = 0;
+       svd->vdev_indirect_config.vic_births_object = 0;
+       svd->vdev_indirect_config.vic_prev_indirect_vdev = -1ULL;
+       svd->vdev_indirect_mapping = NULL;
+       svd->vdev_indirect_births = NULL;
+       svd->vdev_obsolete_sm = NULL;
+       svd->vdev_removing = 0;
+
         for (t = 0; t < TXG_SIZE; t++) {
                 while ((msp = txg_list_remove(&svd->vdev_ms_list, t)) != NULL)
                         (void) txg_list_add(&tvd->vdev_ms_list, msp, t);
diff --git a/module/zfs/vdev_indirect.c b/module/zfs/vdev_indirect.c

index 86a05daa87a3cecc4f219c0f4503aa9459deb3d0..3ccdfee3bd88a8818f88ed1f12b8b639dd3ce0cf 100644 (file)
--- a/module/zfs/vdev_indirect.c
+++ b/module/zfs/vdev_indirect.c
@@ -23,6 +23,7 @@
  #include <sys/vdev_impl.h>
  #include <sys/fs/zfs.h>
  #include <sys/zio.h>
+#include <sys/zio_checksum.h>
  #include <sys/metaslab.h>
  #include <sys/refcount.h>
  #include <sys/dmu.h>
@@ -44,10 +45,11 @@
   * "vdev_remap" operation that executes a callback on each contiguous
   * segment of the new location.  This function is used in multiple ways:
   *
- *  - reads and repair writes to this device use the callback to create
- *    a child io for each mapped segment.
+ *  - i/os to this vdev use the callback to determine where the
+ *    data is now located, and issue child i/os for each segment's new
+ *    location.
   *
- *  - frees and claims to this device use the callback to free or claim
+ *  - frees and claims to this vdev use the callback to free or claim
   *    each mapped segment.  (Note that we don't actually need to claim
   *    log blocks on indirect vdevs, because we don't allocate to
   *    removing vdevs.  However, zdb uses zio_claim() for its leak
@@ -201,6 +203,95 @@ unsigned long zfs_condense_min_mapping_bytes = 128 * 1024;
   */
  int zfs_condense_indirect_commit_entry_delay_ms = 0;
  
+/*
+ * If a split block contains more than this many segments, consider it too
+ * computationally expensive to check all (2^num_segments) possible
+ * combinations. Instead, try at most 2^_segments_max randomly-selected
+ * combinations.
+ *
+ * This is reasonable if only a few segment copies are damaged and the
+ * majority of segment copies are good. It allows all segment copies to
+ * participate fairly in the reconstruction and prevents repeated use of
+ * one bad copy.
+ */
+int zfs_reconstruct_indirect_segments_max = 10;
+
+/*
+ * The indirect_child_t represents the vdev that we will read from, when we
+ * need to read all copies of the data (e.g. for scrub or reconstruction).
+ * For plain (non-mirror) top-level vdevs (i.e. is_vdev is not a mirror),
+ * ic_vdev is the same as is_vdev.  However, for mirror top-level vdevs,
+ * ic_vdev is a child of the mirror.
+ */
+typedef struct indirect_child {
+       abd_t *ic_data;
+       vdev_t *ic_vdev;
+} indirect_child_t;
+
+/*
+ * The indirect_split_t represents one mapped segment of an i/o to the
+ * indirect vdev. For non-split (contiguously-mapped) blocks, there will be
+ * only one indirect_split_t, with is_split_offset==0 and is_size==io_size.
+ * For split blocks, there will be several of these.
+ */
+typedef struct indirect_split {
+       list_node_t is_node; /* link on iv_splits */
+
+       /*
+        * is_split_offset is the offset into the i/o.
+        * This is the sum of the previous splits' is_size's.
+        */
+       uint64_t is_split_offset;
+
+       vdev_t *is_vdev; /* top-level vdev */
+       uint64_t is_target_offset; /* offset on is_vdev */
+       uint64_t is_size;
+       int is_children; /* number of entries in is_child[] */
+
+       /*
+        * is_good_child is the child that we are currently using to
+        * attempt reconstruction.
+        */
+       int is_good_child;
+
+       indirect_child_t is_child[1]; /* variable-length */
+} indirect_split_t;
+
+/*
+ * The indirect_vsd_t is associated with each i/o to the indirect vdev.
+ * It is the "Vdev-Specific Data" in the zio_t's io_vsd.
+ */
+typedef struct indirect_vsd {
+       boolean_t iv_split_block;
+       boolean_t iv_reconstruct;
+
+       list_t iv_splits; /* list of indirect_split_t's */
+} indirect_vsd_t;
+
+static void
+vdev_indirect_map_free(zio_t *zio)
+{
+       indirect_vsd_t *iv = zio->io_vsd;
+
+       indirect_split_t *is;
+       while ((is = list_head(&iv->iv_splits)) != NULL) {
+               for (int c = 0; c < is->is_children; c++) {
+                       indirect_child_t *ic = &is->is_child[c];
+                       if (ic->ic_data != NULL)
+                               abd_free(ic->ic_data);
+               }
+               list_remove(&iv->iv_splits, is);
+               kmem_free(is,
+                   offsetof(indirect_split_t, is_child[is->is_children]));
+       }
+       kmem_free(iv, sizeof (*iv));
+}
+
+static const zio_vsd_ops_t vdev_indirect_vsd_ops = {
+       vdev_indirect_map_free,
+       zio_vsd_default_cksum_report
+};
+
  /*
   * Mark the given offset and size as being obsolete in the given txg.
   */
@@ -813,12 +904,6 @@ vdev_indirect_close(vdev_t *vd)
  {
  }
  
-/* ARGSUSED */
-static void
-vdev_indirect_io_done(zio_t *zio)
-{
-}
-
  /* ARGSUSED */
  static int
  vdev_indirect_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize,
@@ -990,41 +1075,471 @@ vdev_indirect_child_io_done(zio_t *zio)
         abd_put(zio->io_abd);
  }
  
+/*
+ * This is a callback for vdev_indirect_remap() which allocates an
+ * indirect_split_t for each split segment and adds it to iv_splits.
+ */
  static void
-vdev_indirect_io_start_cb(uint64_t split_offset, vdev_t *vd, uint64_t offset,
+vdev_indirect_gather_splits(uint64_t split_offset, vdev_t *vd, uint64_t offset,
      uint64_t size, void *arg)
  {
         zio_t *zio = arg;
+       indirect_vsd_t *iv = zio->io_vsd;
  
         ASSERT3P(vd, !=, NULL);
  
         if (vd->vdev_ops == &vdev_indirect_ops)
                 return;
  
-       zio_nowait(zio_vdev_child_io(zio, NULL, vd, offset,
-           abd_get_offset(zio->io_abd, split_offset),
-           size, zio->io_type, zio->io_priority,
-           0, vdev_indirect_child_io_done, zio));
+       int n = 1;
+       if (vd->vdev_ops == &vdev_mirror_ops)
+               n = vd->vdev_children;
+
+       indirect_split_t *is =
+           kmem_zalloc(offsetof(indirect_split_t, is_child[n]), KM_SLEEP);
+
+       is->is_children = n;
+       is->is_size = size;
+       is->is_split_offset = split_offset;
+       is->is_target_offset = offset;
+       is->is_vdev = vd;
+
+       /*
+        * Note that we only consider multiple copies of the data for
+        * *mirror* vdevs.  We don't for "replacing" or "spare" vdevs, even
+        * though they use the same ops as mirror, because there's only one
+        * "good" copy under the replacing/spare.
+        */
+       if (vd->vdev_ops == &vdev_mirror_ops) {
+               for (int i = 0; i < n; i++) {
+                       is->is_child[i].ic_vdev = vd->vdev_child[i];
+               }
+       } else {
+               is->is_child[0].ic_vdev = vd;
+       }
+
+       list_insert_tail(&iv->iv_splits, is);
+}
+
+static void
+vdev_indirect_read_split_done(zio_t *zio)
+{
+       indirect_child_t *ic = zio->io_private;
+
+       if (zio->io_error != 0) {
+               /*
+                * Clear ic_data to indicate that we do not have data for this
+                * child.
+                */
+               abd_free(ic->ic_data);
+               ic->ic_data = NULL;
+       }
+}
+
+/*
+ * Issue reads for all copies (mirror children) of all splits.
+ */
+static void
+vdev_indirect_read_all(zio_t *zio)
+{
+       indirect_vsd_t *iv = zio->io_vsd;
+
+       for (indirect_split_t *is = list_head(&iv->iv_splits);
+           is != NULL; is = list_next(&iv->iv_splits, is)) {
+               for (int i = 0; i < is->is_children; i++) {
+                       indirect_child_t *ic = &is->is_child[i];
+
+                       if (!vdev_readable(ic->ic_vdev))
+                               continue;
+
+                       /*
+                        * Note, we may read from a child whose DTL
+                        * indicates that the data may not be present here.
+                        * While this might result in a few i/os that will
+                        * likely return incorrect data, it simplifies the
+                        * code since we can treat scrub and resilver
+                        * identically.  (The incorrect data will be
+                        * detected and ignored when we verify the
+                        * checksum.)
+                        */
+
+                       ic->ic_data = abd_alloc_sametype(zio->io_abd,
+                           is->is_size);
+
+                       zio_nowait(zio_vdev_child_io(zio, NULL,
+                           ic->ic_vdev, is->is_target_offset, ic->ic_data,
+                           is->is_size, zio->io_type, zio->io_priority, 0,
+                           vdev_indirect_read_split_done, ic));
+               }
+       }
+       iv->iv_reconstruct = B_TRUE;
  }
  
  static void
  vdev_indirect_io_start(zio_t *zio)
  {
         ASSERTV(spa_t *spa = zio->io_spa);
+       indirect_vsd_t *iv = kmem_zalloc(sizeof (*iv), KM_SLEEP);
+       list_create(&iv->iv_splits,
+           sizeof (indirect_split_t), offsetof(indirect_split_t, is_node));
+
+       zio->io_vsd = iv;
+       zio->io_vsd_ops = &vdev_indirect_vsd_ops;
  
         ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0);
         if (zio->io_type != ZIO_TYPE_READ) {
                 ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE);
-               ASSERT((zio->io_flags &
-                   (ZIO_FLAG_SELF_HEAL | ZIO_FLAG_INDUCE_DAMAGE)) != 0);
+               /*
+                * Note: this code can handle other kinds of writes,
+                * but we don't expect them.
+                */
+               ASSERT((zio->io_flags & (ZIO_FLAG_SELF_HEAL |
+                   ZIO_FLAG_RESILVER | ZIO_FLAG_INDUCE_DAMAGE)) != 0);
         }
  
         vdev_indirect_remap(zio->io_vd, zio->io_offset, zio->io_size,
-           vdev_indirect_io_start_cb, zio);
+           vdev_indirect_gather_splits, zio);
+
+       indirect_split_t *first = list_head(&iv->iv_splits);
+       if (first->is_size == zio->io_size) {
+               /*
+                * This is not a split block; we are pointing to the entire
+                * data, which will checksum the same as the original data.
+                * Pass the BP down so that the child i/o can verify the
+                * checksum, and try a different location if available
+                * (e.g. on a mirror).
+                *
+                * While this special case could be handled the same as the
+                * general (split block) case, doing it this way ensures
+                * that the vast majority of blocks on indirect vdevs
+                * (which are not split) are handled identically to blocks
+                * on non-indirect vdevs.  This allows us to be less strict
+                * about performance in the general (but rare) case.
+                */
+               ASSERT0(first->is_split_offset);
+               ASSERT3P(list_next(&iv->iv_splits, first), ==, NULL);
+               zio_nowait(zio_vdev_child_io(zio, zio->io_bp,
+                   first->is_vdev, first->is_target_offset,
+                   abd_get_offset(zio->io_abd, 0),
+                   zio->io_size, zio->io_type, zio->io_priority, 0,
+                   vdev_indirect_child_io_done, zio));
+       } else {
+               iv->iv_split_block = B_TRUE;
+               if (zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER)) {
+                       /*
+                        * Read all copies.  Note that for simplicity,
+                        * we don't bother consulting the DTL in the
+                        * resilver case.
+                        */
+                       vdev_indirect_read_all(zio);
+               } else {
+                       /*
+                        * Read one copy of each split segment, from the
+                        * top-level vdev.  Since we don't know the
+                        * checksum of each split individually, the child
+                        * zio can't ensure that we get the right data.
+                        * E.g. if it's a mirror, it will just read from a
+                        * random (healthy) leaf vdev.  We have to verify
+                        * the checksum in vdev_indirect_io_done().
+                        */
+                       for (indirect_split_t *is = list_head(&iv->iv_splits);
+                           is != NULL; is = list_next(&iv->iv_splits, is)) {
+                               zio_nowait(zio_vdev_child_io(zio, NULL,
+                                   is->is_vdev, is->is_target_offset,
+                                   abd_get_offset(zio->io_abd,
+                                   is->is_split_offset), is->is_size,
+                                   zio->io_type, zio->io_priority, 0,
+                                   vdev_indirect_child_io_done, zio));
+                       }
+
+               }
+       }
  
         zio_execute(zio);
  }
  
+/*
+ * Report a checksum error for a child.
+ */
+static void
+vdev_indirect_checksum_error(zio_t *zio,
+    indirect_split_t *is, indirect_child_t *ic)
+{
+       vdev_t *vd = ic->ic_vdev;
+
+       if (zio->io_flags & ZIO_FLAG_SPECULATIVE)
+               return;
+
+       mutex_enter(&vd->vdev_stat_lock);
+       vd->vdev_stat.vs_checksum_errors++;
+       mutex_exit(&vd->vdev_stat_lock);
+
+       zio_bad_cksum_t zbc = {{{ 0 }}};
+       abd_t *bad_abd = ic->ic_data;
+       abd_t *good_abd = is->is_child[is->is_good_child].ic_data;
+       zfs_ereport_post_checksum(zio->io_spa, vd, NULL, zio,
+           is->is_target_offset, is->is_size, good_abd, bad_abd, &zbc);
+}
+
+/*
+ * Issue repair i/os for any incorrect copies.  We do this by comparing
+ * each split segment's correct data (is_good_child's ic_data) with each
+ * other copy of the data.  If they differ, then we overwrite the bad data
+ * with the good copy.  Note that we do this without regard for the DTL's,
+ * which simplifies this code and also issues the optimal number of writes
+ * (based on which copies actually read bad data, as opposed to which we
+ * think might be wrong).  For the same reason, we always use
+ * ZIO_FLAG_SELF_HEAL, to bypass the DTL check in zio_vdev_io_start().
+ */
+static void
+vdev_indirect_repair(zio_t *zio)
+{
+       indirect_vsd_t *iv = zio->io_vsd;
+
+       enum zio_flag flags = ZIO_FLAG_IO_REPAIR;
+
+       if (!(zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER)))
+               flags |= ZIO_FLAG_SELF_HEAL;
+
+       if (!spa_writeable(zio->io_spa))
+               return;
+
+       for (indirect_split_t *is = list_head(&iv->iv_splits);
+           is != NULL; is = list_next(&iv->iv_splits, is)) {
+               indirect_child_t *good_child = &is->is_child[is->is_good_child];
+
+               for (int c = 0; c < is->is_children; c++) {
+                       indirect_child_t *ic = &is->is_child[c];
+                       if (ic == good_child)
+                               continue;
+                       if (ic->ic_data == NULL)
+                               continue;
+                       if (abd_cmp(good_child->ic_data, ic->ic_data) == 0)
+                               continue;
+
+                       zio_nowait(zio_vdev_child_io(zio, NULL,
+                           ic->ic_vdev, is->is_target_offset,
+                           good_child->ic_data, is->is_size,
+                           ZIO_TYPE_WRITE, ZIO_PRIORITY_ASYNC_WRITE,
+                           ZIO_FLAG_IO_REPAIR | ZIO_FLAG_SELF_HEAL,
+                           NULL, NULL));
+
+                       vdev_indirect_checksum_error(zio, is, ic);
+               }
+       }
+}
+
+/*
+ * Report checksum errors on all children that we read from.
+ */
+static void
+vdev_indirect_all_checksum_errors(zio_t *zio)
+{
+       indirect_vsd_t *iv = zio->io_vsd;
+
+       if (zio->io_flags & ZIO_FLAG_SPECULATIVE)
+               return;
+
+       for (indirect_split_t *is = list_head(&iv->iv_splits);
+           is != NULL; is = list_next(&iv->iv_splits, is)) {
+               for (int c = 0; c < is->is_children; c++) {
+                       indirect_child_t *ic = &is->is_child[c];
+
+                       if (ic->ic_data == NULL)
+                               continue;
+
+                       vdev_t *vd = ic->ic_vdev;
+
+                       mutex_enter(&vd->vdev_stat_lock);
+                       vd->vdev_stat.vs_checksum_errors++;
+                       mutex_exit(&vd->vdev_stat_lock);
+
+                       zfs_ereport_post_checksum(zio->io_spa, vd, NULL, zio,
+                           is->is_target_offset, is->is_size,
+                           NULL, NULL, NULL);
+               }
+       }
+}
+
+/*
+ * This function is called when we have read all copies of the data and need
+ * to try to find a combination of copies that gives us the right checksum.
+ *
+ * If we pointed to any mirror vdevs, this effectively does the job of the
+ * mirror.  The mirror vdev code can't do its own job because we don't know
+ * the checksum of each split segment individually.  We have to try every
+ * combination of copies of split segments, until we find one that checksums
+ * correctly.  (Or until we have tried all combinations, or have tried
+ * 2^zfs_reconstruct_indirect_segments_max combinations.  In these cases we
+ * set io_error to ECKSUM to propagate the error up to the user.)
+ *
+ * For example, if we have 3 segments in the split,
+ * and each points to a 2-way mirror, we will have the following pieces of
+ * data:
+ *
+ *       |     mirror child
+ * split |     [0]        [1]
+ * ======|=====================
+ *   A   |  data_A_0   data_A_1
+ *   B   |  data_B_0   data_B_1
+ *   C   |  data_C_0   data_C_1
+ *
+ * We will try the following (mirror children)^(number of splits) (2^3=8)
+ * combinations, which is similar to bitwise-little-endian counting in
+ * binary.  In general each "digit" corresponds to a split segment, and the
+ * base of each digit is is_children, which can be different for each
+ * digit.
+ *
+ * "low bit"        "high bit"
+ *        v                 v
+ * data_A_0 data_B_0 data_C_0
+ * data_A_1 data_B_0 data_C_0
+ * data_A_0 data_B_1 data_C_0
+ * data_A_1 data_B_1 data_C_0
+ * data_A_0 data_B_0 data_C_1
+ * data_A_1 data_B_0 data_C_1
+ * data_A_0 data_B_1 data_C_1
+ * data_A_1 data_B_1 data_C_1
+ *
+ * Note that the split segments may be on the same or different top-level
+ * vdevs. In either case, we try lots of combinations (see
+ * zfs_reconstruct_indirect_segments_max).  This ensures that if a mirror has
+ * small silent errors on all of its children, we can still reconstruct the
+ * correct data, as long as those errors are at sufficiently-separated
+ * offsets (specifically, separated by the largest block size - default of
+ * 128KB, but up to 16MB).
+ */
+static void
+vdev_indirect_reconstruct_io_done(zio_t *zio)
+{
+       indirect_vsd_t *iv = zio->io_vsd;
+       uint64_t attempts = 0;
+       uint64_t attempts_max = 1ULL << zfs_reconstruct_indirect_segments_max;
+       int segments = 0;
+
+       for (indirect_split_t *is = list_head(&iv->iv_splits);
+           is != NULL; is = list_next(&iv->iv_splits, is))
+               segments++;
+
+       for (;;) {
+               /* copy data from splits to main zio */
+               int ret;
+               for (indirect_split_t *is = list_head(&iv->iv_splits);
+                   is != NULL; is = list_next(&iv->iv_splits, is)) {
+
+                       /*
+                        * If this child failed, its ic_data will be NULL.
+                        * Skip this combination.
+                        */
+                       if (is->is_child[is->is_good_child].ic_data == NULL) {
+                               ret = EIO;
+                               goto next;
+                       }
+
+                       abd_copy_off(zio->io_abd,
+                           is->is_child[is->is_good_child].ic_data,
+                           is->is_split_offset, 0, is->is_size);
+               }
+
+               /* See if this checksum matches. */
+               zio_bad_cksum_t zbc;
+               ret = zio_checksum_error(zio, &zbc);
+               if (ret == 0) {
+                       /* Found a matching checksum.  Issue repair i/os. */
+                       vdev_indirect_repair(zio);
+                       zio_checksum_verified(zio);
+                       return;
+               }
+
+               /*
+                * Checksum failed; try a different combination of split
+                * children.
+                */
+               boolean_t more;
+next:
+               more = B_FALSE;
+               if (segments <= zfs_reconstruct_indirect_segments_max) {
+                       /*
+                        * There are relatively few segments, so
+                        * deterministically check all combinations.  We do
+                        * this by by adding one to the first split's
+                        * good_child.  If it overflows, then "carry over" to
+                        * the next split (like counting in base is_children,
+                        * but each digit can have a different base).
+                        */
+                       for (indirect_split_t *is = list_head(&iv->iv_splits);
+                           is != NULL; is = list_next(&iv->iv_splits, is)) {
+                               is->is_good_child++;
+                               if (is->is_good_child < is->is_children) {
+                                       more = B_TRUE;
+                                       break;
+                               }
+                               is->is_good_child = 0;
+                       }
+               } else if (++attempts < attempts_max) {
+                       /*
+                        * There are too many combinations to try all of them
+                        * in a reasonable amount of time, so try a fixed
+                        * number of random combinations, after which we'll
+                        * consider the block unrecoverable.
+                        */
+                       for (indirect_split_t *is = list_head(&iv->iv_splits);
+                           is != NULL; is = list_next(&iv->iv_splits, is)) {
+                               is->is_good_child =
+                                   spa_get_random(is->is_children);
+                       }
+                       more = B_TRUE;
+               }
+               if (!more) {
+                       /* All combinations failed. */
+                       zio->io_error = ret;
+                       vdev_indirect_all_checksum_errors(zio);
+                       zio_checksum_verified(zio);
+                       return;
+               }
+       }
+}
+
+static void
+vdev_indirect_io_done(zio_t *zio)
+{
+       indirect_vsd_t *iv = zio->io_vsd;
+
+       if (iv->iv_reconstruct) {
+               /*
+                * We have read all copies of the data (e.g. from mirrors),
+                * either because this was a scrub/resilver, or because the
+                * one-copy read didn't checksum correctly.
+                */
+               vdev_indirect_reconstruct_io_done(zio);
+               return;
+       }
+
+       if (!iv->iv_split_block) {
+               /*
+                * This was not a split block, so we passed the BP down,
+                * and the checksum was handled by the (one) child zio.
+                */
+               return;
+       }
+
+       zio_bad_cksum_t zbc;
+       int ret = zio_checksum_error(zio, &zbc);
+       if (ret == 0) {
+               zio_checksum_verified(zio);
+               return;
+       }
+
+       /*
+        * The checksum didn't match.  Read all copies of all splits, and
+        * then we will try to reconstruct.  The next time
+        * vdev_indirect_io_done() is called, iv_reconstruct will be set.
+        */
+       vdev_indirect_read_all(zio);
+
+       zio_vdev_io_redone(zio);
+}
+
  vdev_ops_t vdev_indirect_ops = {
         vdev_indirect_open,
         vdev_indirect_close,
@@ -1061,4 +1576,8 @@ MODULE_PARM_DESC(zfs_condense_min_mapping_bytes,
  module_param(zfs_condense_indirect_commit_entry_delay_ms, int, 0644);
  MODULE_PARM_DESC(zfs_condense_indirect_commit_entry_delay_ms,
         "Delay while condensing vdev mapping");
+
+module_param(zfs_reconstruct_indirect_segments_max, int, 0644);
+MODULE_PARM_DESC(zfs_reconstruct_indirect_segments_max,
+       "Maximum number of split segments check all combinations");
  #endif
diff --git a/module/zfs/vdev_mirror.c b/module/zfs/vdev_mirror.c

index b56f955eb01b5fb56c06dcd4c3c0a496f838f47e..4b01f317b4cc3a2daa60a73b0c07f7cd30dbac78 100644 (file)
--- a/module/zfs/vdev_mirror.c
+++ b/module/zfs/vdev_mirror.c
@@ -486,12 +486,15 @@ vdev_mirror_io_start(zio_t *zio)
         mm = vdev_mirror_map_init(zio);
  
         if (zio->io_type == ZIO_TYPE_READ) {
-               if ((zio->io_flags & ZIO_FLAG_SCRUB) && !mm->mm_replacing) {
+               if (zio->io_bp != NULL &&
+                   (zio->io_flags & ZIO_FLAG_SCRUB) && !mm->mm_replacing) {
                         /*
-                        * For scrubbing reads we need to allocate a read
-                        * buffer for each child and issue reads to all
-                        * children.  If any child succeeds, it will copy its
-                        * data into zio->io_data in vdev_mirror_scrub_done.
+                        * For scrubbing reads (if we can verify the
+                        * checksum here, as indicated by io_bp being
+                        * non-NULL) we need to allocate a read buffer for
+                        * each child and issue reads to all children.  If
+                        * any child succeeds, it will copy its data into
+                        * zio->io_data in vdev_mirror_scrub_done.
                          */
                         for (c = 0; c < mm->mm_children; c++) {
                                 mc = &mm->mm_child[c];
@@ -640,7 +643,21 @@ vdev_mirror_io_done(zio_t *zio)
                         if (mc->mc_error == 0) {
                                 if (mc->mc_tried)
                                         continue;
+                               /*
+                                * We didn't try this child.  We need to
+                                * repair it if:
+                                * 1. it's a scrub (in which case we have
+                                * tried everything that was healthy)
+                                *  - or -
+                                * 2. it's an indirect vdev (in which case
+                                * it could point to any other vdev, which
+                                * might have a bad DTL)
+                                *  - or -
+                                * 3. the DTL indicates that this data is
+                                * missing from this vdev
+                                */
                                 if (!(zio->io_flags & ZIO_FLAG_SCRUB) &&
+                                   mc->mc_vd->vdev_ops != &vdev_indirect_ops &&
                                     !vdev_dtl_contains(mc->mc_vd, DTL_PARTIAL,
                                     zio->io_txg, 1))
                                         continue;
diff --git a/module/zfs/vdev_removal.c b/module/zfs/vdev_removal.c

index 6e81bf014894d5a04f456e6eb1017d95dc17149f..0fca8fb03f2da00d7f1cbe80f99bcfc2b3fb70da 100644 (file)
--- a/module/zfs/vdev_removal.c
+++ b/module/zfs/vdev_removal.c
@@ -84,18 +84,12 @@ typedef struct vdev_copy_arg {
         kmutex_t        vca_lock;
  } vdev_copy_arg_t;
  
-typedef struct vdev_copy_seg_arg {
-       vdev_copy_arg_t *vcsa_copy_arg;
-       uint64_t        vcsa_txg;
-       dva_t           *vcsa_dest_dva;
-       blkptr_t        *vcsa_dest_bp;
-} vdev_copy_seg_arg_t;
-
  /*
- * The maximum amount of allowed data we're allowed to copy from a device
- * at a time when removing it.
+ * The maximum amount of memory we can use for outstanding i/o while
+ * doing a device removal.  This determines how much i/o we can have
+ * in flight concurrently.
   */
-int zfs_remove_max_copy_bytes = 8 * 1024 * 1024;
+int zfs_remove_max_copy_bytes = 64 * 1024 * 1024;
  
  /*
   * The largest contiguous segment that we will attempt to allocate when
@@ -165,7 +159,7 @@ spa_vdev_removal_create(vdev_t *vd)
         mutex_init(&svr->svr_lock, NULL, MUTEX_DEFAULT, NULL);
         cv_init(&svr->svr_cv, NULL, CV_DEFAULT, NULL);
         svr->svr_allocd_segs = range_tree_create(NULL, NULL);
-       svr->svr_vdev = vd;
+       svr->svr_vdev_id = vd->vdev_id;
  
         for (int i = 0; i < TXG_SIZE; i++) {
                 svr->svr_frees[i] = range_tree_create(NULL, NULL);
@@ -207,9 +201,10 @@ spa_vdev_removal_destroy(spa_vdev_removal_t *svr)
  static void
  vdev_remove_initiate_sync(void *arg, dmu_tx_t *tx)
  {
-       vdev_t *vd = arg;
+       int vdev_id = (uintptr_t)arg;
+       spa_t *spa = dmu_tx_pool(tx)->dp_spa;
+       vdev_t *vd = vdev_lookup_top(spa, vdev_id);
         vdev_indirect_config_t *vic = &vd->vdev_indirect_config;
-       spa_t *spa = vd->vdev_spa;
         objset_t *mos = spa->spa_dsl_pool->dp_meta_objset;
         spa_vdev_removal_t *svr = NULL;
         ASSERTV(uint64_t txg = dmu_tx_get_txg(tx));
@@ -331,7 +326,7 @@ vdev_remove_initiate_sync(void *arg, dmu_tx_t *tx)
         ASSERT3P(spa->spa_vdev_removal, ==, NULL);
         spa->spa_vdev_removal = svr;
         svr->svr_thread = thread_create(NULL, 0,
-           spa_vdev_remove_thread, vd, 0, &p0, TS_RUN, minclsyspri);
+           spa_vdev_remove_thread, spa, 0, &p0, TS_RUN, minclsyspri);
  }
  
  /*
@@ -372,21 +367,24 @@ spa_remove_init(spa_t *spa)
                 spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
                 vdev_t *vd = vdev_lookup_top(spa,
                     spa->spa_removing_phys.sr_removing_vdev);
-               spa_config_exit(spa, SCL_STATE, FTAG);
  
-               if (vd == NULL)
+               if (vd == NULL) {
+                       spa_config_exit(spa, SCL_STATE, FTAG);
                         return (EINVAL);
+               }
  
                 vdev_indirect_config_t *vic = &vd->vdev_indirect_config;
  
                 ASSERT(vdev_is_concrete(vd));
                 spa_vdev_removal_t *svr = spa_vdev_removal_create(vd);
-               ASSERT(svr->svr_vdev->vdev_removing);
+               ASSERT3U(svr->svr_vdev_id, ==, vd->vdev_id);
+               ASSERT(vd->vdev_removing);
  
                 vd->vdev_indirect_mapping = vdev_indirect_mapping_open(
                     spa->spa_meta_objset, vic->vic_mapping_object);
                 vd->vdev_indirect_births = vdev_indirect_births_open(
                     spa->spa_meta_objset, vic->vic_births_object);
+               spa_config_exit(spa, SCL_STATE, FTAG);
  
                 spa->spa_vdev_removal = svr;
         }
@@ -439,15 +437,8 @@ spa_restart_removal(spa_t *spa)
         if (!spa_writeable(spa))
                 return;
  
-       vdev_t *vd = svr->svr_vdev;
-       vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;
-
-       ASSERT3P(vd, !=, NULL);
-       ASSERT(vd->vdev_removing);
-
-       zfs_dbgmsg("restarting removal of %llu at count=%llu",
-           vd->vdev_id, vdev_indirect_mapping_num_entries(vim));
-       svr->svr_thread = thread_create(NULL, 0, spa_vdev_remove_thread, vd,
+       zfs_dbgmsg("restarting removal of %llu", svr->svr_vdev_id);
+       svr->svr_thread = thread_create(NULL, 0, spa_vdev_remove_thread, spa,
             0, &p0, TS_RUN, minclsyspri);
  }
  
@@ -468,7 +459,7 @@ free_from_removing_vdev(vdev_t *vd, uint64_t offset, uint64_t size,
         ASSERT(vd->vdev_indirect_config.vic_mapping_object != 0);
         ASSERT3U(vd->vdev_indirect_config.vic_mapping_object, ==,
             vdev_indirect_mapping_object(vim));
-       ASSERT3P(vd, ==, svr->svr_vdev);
+       ASSERT3U(vd->vdev_id, ==, svr->svr_vdev_id);
         ASSERT3U(spa_syncing_txg(spa), ==, txg);
  
         mutex_enter(&svr->svr_lock);
@@ -646,7 +637,7 @@ spa_finish_removal(spa_t *spa, dsl_scan_state_t state, dmu_tx_t *tx)
  
         if (state == DSS_FINISHED) {
                 spa_removing_phys_t *srp = &spa->spa_removing_phys;
-               vdev_t *vd = svr->svr_vdev;
+               vdev_t *vd = vdev_lookup_top(spa, svr->svr_vdev_id);
                 vdev_indirect_config_t *vic = &vd->vdev_indirect_config;
  
                 if (srp->sr_prev_indirect_vdev != UINT64_MAX) {
@@ -690,7 +681,7 @@ vdev_mapping_sync(void *arg, dmu_tx_t *tx)
  {
         spa_vdev_removal_t *svr = arg;
         spa_t *spa = dmu_tx_pool(tx)->dp_spa;
-       vdev_t *vd = svr->svr_vdev;
+       vdev_t *vd = vdev_lookup_top(spa, svr->svr_vdev_id);
         ASSERTV(vdev_indirect_config_t *vic = &vd->vdev_indirect_config);
         uint64_t txg = dmu_tx_get_txg(tx);
         vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;
@@ -718,64 +709,128 @@ vdev_mapping_sync(void *arg, dmu_tx_t *tx)
         spa_sync_removing_state(spa, tx);
  }
  
+/*
+ * All reads and writes associated with a call to spa_vdev_copy_segment()
+ * are done.
+ */
+static void
+spa_vdev_copy_nullzio_done(zio_t *zio)
+{
+       spa_config_exit(zio->io_spa, SCL_STATE, zio->io_spa);
+}
+
+/*
+ * The write of the new location is done.
+ */
  static void
  spa_vdev_copy_segment_write_done(zio_t *zio)
  {
-       vdev_copy_seg_arg_t *vcsa = zio->io_private;
-       vdev_copy_arg_t *vca = vcsa->vcsa_copy_arg;
-       spa_config_exit(zio->io_spa, SCL_STATE, FTAG);
+       vdev_copy_arg_t *vca = zio->io_private;
+
         abd_free(zio->io_abd);
  
         mutex_enter(&vca->vca_lock);
         vca->vca_outstanding_bytes -= zio->io_size;
         cv_signal(&vca->vca_cv);
         mutex_exit(&vca->vca_lock);
-
-       ASSERT0(zio->io_error);
-       kmem_free(vcsa->vcsa_dest_bp, sizeof (blkptr_t));
-       kmem_free(vcsa, sizeof (vdev_copy_seg_arg_t));
  }
  
+/*
+ * The read of the old location is done.  The parent zio is the write to
+ * the new location.  Allow it to start.
+ */
  static void
  spa_vdev_copy_segment_read_done(zio_t *zio)
  {
-       vdev_copy_seg_arg_t *vcsa = zio->io_private;
-       dva_t *dest_dva = vcsa->vcsa_dest_dva;
-       uint64_t txg = vcsa->vcsa_txg;
-       spa_t *spa = zio->io_spa;
-       ASSERTV(vdev_t *dest_vd = vdev_lookup_top(spa, DVA_GET_VDEV(dest_dva)));
-       blkptr_t *bp = NULL;
-       dva_t *dva = NULL;
-       uint64_t size = zio->io_size;
-
-       ASSERT3P(dest_vd, !=, NULL);
-       ASSERT0(zio->io_error);
-
-       vcsa->vcsa_dest_bp = kmem_alloc(sizeof (blkptr_t), KM_SLEEP);
-       bp = vcsa->vcsa_dest_bp;
-       dva = bp->blk_dva;
-
-       BP_ZERO(bp);
-
-       /* initialize with dest_dva */
-       bcopy(dest_dva, dva, sizeof (dva_t));
-       BP_SET_BIRTH(bp, TXG_INITIAL, TXG_INITIAL);
-
-       BP_SET_LSIZE(bp, size);
-       BP_SET_PSIZE(bp, size);
-       BP_SET_COMPRESS(bp, ZIO_COMPRESS_OFF);
-       BP_SET_CHECKSUM(bp, ZIO_CHECKSUM_OFF);
-       BP_SET_TYPE(bp, DMU_OT_NONE);
-       BP_SET_LEVEL(bp, 0);
-       BP_SET_DEDUP(bp, 0);
-       BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER);
-
-       zio_nowait(zio_rewrite(spa->spa_txg_zio[txg & TXG_MASK], spa,
-           txg, bp, zio->io_abd, size,
-           spa_vdev_copy_segment_write_done, vcsa,
-           ZIO_PRIORITY_REMOVAL, 0, NULL));
+       zio_nowait(zio_unique_parent(zio));
+}
+
+/*
+ * If the old and new vdevs are mirrors, we will read both sides of the old
+ * mirror, and write each copy to the corresponding side of the new mirror.
+ * If the old and new vdevs have a different number of children, we will do
+ * this as best as possible.  Since we aren't verifying checksums, this
+ * ensures that as long as there's a good copy of the data, we'll have a
+ * good copy after the removal, even if there's silent damage to one side
+ * of the mirror. If we're removing a mirror that has some silent damage,
+ * we'll have exactly the same damage in the new location (assuming that
+ * the new location is also a mirror).
+ *
+ * We accomplish this by creating a tree of zio_t's, with as many writes as
+ * there are "children" of the new vdev (a non-redundant vdev counts as one
+ * child, a 2-way mirror has 2 children, etc). Each write has an associated
+ * read from a child of the old vdev. Typically there will be the same
+ * number of children of the old and new vdevs.  However, if there are more
+ * children of the new vdev, some child(ren) of the old vdev will be issued
+ * multiple reads.  If there are more children of the old vdev, some copies
+ * will be dropped.
+ *
+ * For example, the tree of zio_t's for a 2-way mirror is:
+ *
+ *                            null
+ *                           /    \
+ *    write(new vdev, child 0)      write(new vdev, child 1)
+ *      |                             |
+ *    read(old vdev, child 0)       read(old vdev, child 1)
+ *
+ * Child zio's complete before their parents complete.  However, zio's
+ * created with zio_vdev_child_io() may be issued before their children
+ * complete.  In this case we need to make sure that the children (reads)
+ * complete before the parents (writes) are *issued*.  We do this by not
+ * calling zio_nowait() on each write until its corresponding read has
+ * completed.
+ *
+ * The spa_config_lock must be held while zio's created by
+ * zio_vdev_child_io() are in progress, to ensure that the vdev tree does
+ * not change (e.g. due to a concurrent "zpool attach/detach"). The "null"
+ * zio is needed to release the spa_config_lock after all the reads and
+ * writes complete. (Note that we can't grab the config lock for each read,
+ * because it is not reentrant - we could deadlock with a thread waiting
+ * for a write lock.)
+ */
+static void
+spa_vdev_copy_one_child(vdev_copy_arg_t *vca, zio_t *nzio,
+    vdev_t *source_vd, uint64_t source_offset,
+    vdev_t *dest_child_vd, uint64_t dest_offset, int dest_id, uint64_t size)
+{
+       ASSERT3U(spa_config_held(nzio->io_spa, SCL_ALL, RW_READER), !=, 0);
+
+       mutex_enter(&vca->vca_lock);
+       vca->vca_outstanding_bytes += size;
+       mutex_exit(&vca->vca_lock);
+
+       abd_t *abd = abd_alloc_for_io(size, B_FALSE);
+
+       vdev_t *source_child_vd;
+       if (source_vd->vdev_ops == &vdev_mirror_ops && dest_id != -1) {
+               /*
+                * Source and dest are both mirrors.  Copy from the same
+                * child id as we are copying to (wrapping around if there
+                * are more dest children than source children).
+                */
+               source_child_vd =
+                   source_vd->vdev_child[dest_id % source_vd->vdev_children];
+       } else {
+               source_child_vd = source_vd;
+       }
+
+       zio_t *write_zio = zio_vdev_child_io(nzio, NULL,
+           dest_child_vd, dest_offset, abd, size,
+           ZIO_TYPE_WRITE, ZIO_PRIORITY_REMOVAL,
+           ZIO_FLAG_CANFAIL,
+           spa_vdev_copy_segment_write_done, vca);
+
+       zio_nowait(zio_vdev_child_io(write_zio, NULL,
+           source_child_vd, source_offset, abd, size,
+           ZIO_TYPE_READ, ZIO_PRIORITY_REMOVAL,
+           ZIO_FLAG_CANFAIL,
+           spa_vdev_copy_segment_read_done, vca));
  }
  
+/*
+ * Allocate a new location for this segment, and create the zio_t's to
+ * read from the old location and write to the new location.
+ */
  static int
  spa_vdev_copy_segment(vdev_t *vd, uint64_t start, uint64_t size, uint64_t txg,
      vdev_copy_arg_t *vca, zio_alloc_list_t *zal)
@@ -784,10 +839,7 @@ spa_vdev_copy_segment(vdev_t *vd, uint64_t start, uint64_t size, uint64_t txg,
         spa_t *spa = vd->vdev_spa;
         spa_vdev_removal_t *svr = spa->spa_vdev_removal;
         vdev_indirect_mapping_entry_t *entry;
-       vdev_copy_seg_arg_t *private;
         dva_t dst = {{ 0 }};
-       blkptr_t blk, *bp = &blk;
-       dva_t *dva = bp->blk_dva;
  
         ASSERT3U(size, <=, SPA_MAXBLOCKSIZE);
  
@@ -804,51 +856,28 @@ spa_vdev_copy_segment(vdev_t *vd, uint64_t start, uint64_t size, uint64_t txg,
          */
         ASSERT3U(DVA_GET_ASIZE(&dst), ==, size);
  
-       mutex_enter(&vca->vca_lock);
-       vca->vca_outstanding_bytes += size;
-       mutex_exit(&vca->vca_lock);
-
         entry = kmem_zalloc(sizeof (vdev_indirect_mapping_entry_t), KM_SLEEP);
         DVA_MAPPING_SET_SRC_OFFSET(&entry->vime_mapping, start);
         entry->vime_mapping.vimep_dst = dst;
  
-       private = kmem_alloc(sizeof (vdev_copy_seg_arg_t), KM_SLEEP);
-       private->vcsa_dest_dva = &entry->vime_mapping.vimep_dst;
-       private->vcsa_txg = txg;
-       private->vcsa_copy_arg = vca;
-
         /*
-        * This lock is eventually released by the donefunc for the
-        * zio_write_phys that finishes copying the data.
+        * See comment before spa_vdev_copy_one_child().
          */
-       spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
-
-       /*
-        * Do logical I/O, letting the redundancy vdevs (like mirror)
-        * handle their own I/O instead of duplicating that code here.
-        */
-       BP_ZERO(bp);
-
-       DVA_SET_VDEV(&dva[0], vd->vdev_id);
-       DVA_SET_OFFSET(&dva[0], start);
-       DVA_SET_GANG(&dva[0], 0);
-       DVA_SET_ASIZE(&dva[0], vdev_psize_to_asize(vd, size));
-
-       BP_SET_BIRTH(bp, TXG_INITIAL, TXG_INITIAL);
-
-       BP_SET_LSIZE(bp, size);
-       BP_SET_PSIZE(bp, size);
-       BP_SET_COMPRESS(bp, ZIO_COMPRESS_OFF);
-       BP_SET_CHECKSUM(bp, ZIO_CHECKSUM_OFF);
-       BP_SET_TYPE(bp, DMU_OT_NONE);
-       BP_SET_LEVEL(bp, 0);
-       BP_SET_DEDUP(bp, 0);
-       BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER);
-
-       zio_nowait(zio_read(spa->spa_txg_zio[txg & TXG_MASK], spa,
-           bp, abd_alloc_for_io(size, B_FALSE), size,
-           spa_vdev_copy_segment_read_done, private,
-           ZIO_PRIORITY_REMOVAL, 0, NULL));
+       spa_config_enter(spa, SCL_STATE, spa, RW_READER);
+       zio_t *nzio = zio_null(spa->spa_txg_zio[txg & TXG_MASK], spa, NULL,
+           spa_vdev_copy_nullzio_done, NULL, 0);
+       vdev_t *dest_vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dst));
+       if (dest_vd->vdev_ops == &vdev_mirror_ops) {
+               for (int i = 0; i < dest_vd->vdev_children; i++) {
+                       vdev_t *child = dest_vd->vdev_child[i];
+                       spa_vdev_copy_one_child(vca, nzio, vd, start,
+                           child, DVA_GET_OFFSET(&dst), i, size);
+               }
+       } else {
+               spa_vdev_copy_one_child(vca, nzio, vd, start,
+                   dest_vd, DVA_GET_OFFSET(&dst), -1, size);
+       }
+       zio_nowait(nzio);
  
         list_insert_tail(&svr->svr_new_segments[txg & TXG_MASK], entry);
         ASSERT3U(start + size, <=, vd->vdev_ms_count << vd->vdev_ms_shift);
@@ -866,8 +895,8 @@ static void
  vdev_remove_complete_sync(void *arg, dmu_tx_t *tx)
  {
         spa_vdev_removal_t *svr = arg;
-       vdev_t *vd = svr->svr_vdev;
-       spa_t *spa = vd->vdev_spa;
+       spa_t *spa = dmu_tx_pool(tx)->dp_spa;
+       vdev_t *vd = vdev_lookup_top(spa, svr->svr_vdev_id);
  
         ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops);
  
@@ -895,37 +924,6 @@ vdev_remove_complete_sync(void *arg, dmu_tx_t *tx)
             "%s vdev %llu", spa_name(spa), vd->vdev_id);
  }
  
-static void
-vdev_indirect_state_transfer(vdev_t *ivd, vdev_t *vd)
-{
-       ivd->vdev_indirect_config = vd->vdev_indirect_config;
-
-       ASSERT3P(ivd->vdev_indirect_mapping, ==, NULL);
-       ASSERT(vd->vdev_indirect_mapping != NULL);
-       ivd->vdev_indirect_mapping = vd->vdev_indirect_mapping;
-       vd->vdev_indirect_mapping = NULL;
-
-       ASSERT3P(ivd->vdev_indirect_births, ==, NULL);
-       ASSERT(vd->vdev_indirect_births != NULL);
-       ivd->vdev_indirect_births = vd->vdev_indirect_births;
-       vd->vdev_indirect_births = NULL;
-
-       ASSERT0(range_tree_space(vd->vdev_obsolete_segments));
-       ASSERT0(range_tree_space(ivd->vdev_obsolete_segments));
-
-       if (vd->vdev_obsolete_sm != NULL) {
-               ASSERT3U(ivd->vdev_asize, ==, vd->vdev_asize);
-
-               /*
-                * We cannot use space_map_{open,close} because we hold all
-                * the config locks as writer.
-                */
-               ASSERT3P(ivd->vdev_obsolete_sm, ==, NULL);
-               ivd->vdev_obsolete_sm = vd->vdev_obsolete_sm;
-               vd->vdev_obsolete_sm = NULL;
-       }
-}
-
  static void
  vdev_remove_enlist_zaps(vdev_t *vd, nvlist_t *zlist)
  {
@@ -961,17 +959,13 @@ vdev_remove_replace_with_indirect(vdev_t *vd, uint64_t txg)
         vdev_remove_enlist_zaps(vd, svr->svr_zaplist);
  
         ivd = vdev_add_parent(vd, &vdev_indirect_ops);
+       ivd->vdev_removing = 0;
  
         vd->vdev_leaf_zap = 0;
  
         vdev_remove_child(ivd, vd);
         vdev_compact_children(ivd);
  
-       vdev_indirect_state_transfer(ivd, vd);
-
-       svr->svr_vdev = ivd;
-
-       ASSERT(!ivd->vdev_removing);
         ASSERT(!list_link_active(&vd->vdev_state_dirty_node));
  
         tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
@@ -994,9 +988,8 @@ vdev_remove_replace_with_indirect(vdev_t *vd, uint64_t txg)
   * context by the removal thread after we have copied all vdev's data.
   */
  static void
-vdev_remove_complete(vdev_t *vd)
+vdev_remove_complete(spa_t *spa)
  {
-       spa_t *spa = vd->vdev_spa;
         uint64_t txg;
  
         /*
@@ -1004,8 +997,12 @@ vdev_remove_complete(vdev_t *vd)
          * vdev_metaslab_fini()
          */
         txg_wait_synced(spa->spa_dsl_pool, 0);
-
         txg = spa_vdev_enter(spa);
+       vdev_t *vd = vdev_lookup_top(spa, spa->spa_vdev_removal->svr_vdev_id);
+
+       sysevent_t *ev = spa_event_create(spa, vd, NULL,
+           ESC_ZFS_VDEV_REMOVE_DEV);
+
         zfs_dbgmsg("finishing device removal for vdev %llu in txg %llu",
             vd->vdev_id, txg);
  
@@ -1025,6 +1022,10 @@ vdev_remove_complete(vdev_t *vd)
         /*
          * We now release the locks, allowing spa_sync to run and finish the
          * removal via vdev_remove_complete_sync in syncing context.
+        *
+        * Note that we hold on to the vdev_t that has been replaced.  Since
+        * it isn't part of the vdev tree any longer, it can't be concurrently
+        * manipulated, even while we don't have the config lock.
          */
         (void) spa_vdev_exit(spa, NULL, txg, 0);
  
@@ -1046,6 +1047,9 @@ vdev_remove_complete(vdev_t *vd)
          */
         vdev_config_dirty(spa->spa_root_vdev);
         (void) spa_vdev_exit(spa, vd, txg, 0);
+
+       if (ev != NULL)
+               spa_event_post(ev);
  }
  
  /*
@@ -1056,7 +1060,7 @@ vdev_remove_complete(vdev_t *vd)
   * this size again this txg.
   */
  static void
-spa_vdev_copy_impl(spa_vdev_removal_t *svr, vdev_copy_arg_t *vca,
+spa_vdev_copy_impl(vdev_t *vd, spa_vdev_removal_t *svr, vdev_copy_arg_t *vca,
      uint64_t *max_alloc, dmu_tx_t *tx)
  {
         uint64_t txg = dmu_tx_get_txg(tx);
@@ -1095,7 +1099,7 @@ spa_vdev_copy_impl(spa_vdev_removal_t *svr, vdev_copy_arg_t *vca,
         while (length > 0) {
                 uint64_t mylen = MIN(length, thismax);
  
-               int error = spa_vdev_copy_segment(svr->svr_vdev,
+               int error = spa_vdev_copy_segment(vd,
                     offset, mylen, txg, vca, &zal);
  
                 if (error == ENOSPC) {
@@ -1153,12 +1157,14 @@ spa_vdev_copy_impl(spa_vdev_removal_t *svr, vdev_copy_arg_t *vca,
  static void
  spa_vdev_remove_thread(void *arg)
  {
-       vdev_t *vd = arg;
-       spa_t *spa = vd->vdev_spa;
+       spa_t *spa = arg;
         spa_vdev_removal_t *svr = spa->spa_vdev_removal;
         vdev_copy_arg_t vca;
         uint64_t max_alloc = zfs_remove_max_segment;
         uint64_t last_txg = 0;
+
+       spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
+       vdev_t *vd = vdev_lookup_top(spa, svr->svr_vdev_id);
         vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;
         uint64_t start_offset = vdev_indirect_mapping_max_offset(vim);
  
@@ -1166,7 +1172,6 @@ spa_vdev_remove_thread(void *arg)
         ASSERT(vdev_is_concrete(vd));
         ASSERT(vd->vdev_removing);
         ASSERT(vd->vdev_indirect_config.vic_mapping_object != 0);
-       ASSERT3P(svr->svr_vdev, ==, vd);
         ASSERT(vim != NULL);
  
         mutex_init(&vca.vca_lock, NULL, MUTEX_DEFAULT, NULL);
@@ -1247,6 +1252,17 @@ spa_vdev_remove_thread(void *arg)
  
                         mutex_exit(&svr->svr_lock);
  
+                       /*
+                        * We need to periodically drop the config lock so that
+                        * writers can get in.  Additionally, we can't wait
+                        * for a txg to sync while holding a config lock
+                        * (since a waiting writer could cause a 3-way deadlock
+                        * with the sync thread, which also gets a config
+                        * lock for reader).  So we can't hold the config lock
+                        * while calling dmu_tx_assign().
+                        */
+                       spa_config_exit(spa, SCL_CONFIG, FTAG);
+
                         mutex_enter(&vca.vca_lock);
                         while (vca.vca_outstanding_bytes >
                             zfs_remove_max_copy_bytes) {
@@ -1260,11 +1276,19 @@ spa_vdev_remove_thread(void *arg)
                         VERIFY0(dmu_tx_assign(tx, TXG_WAIT));
                         uint64_t txg = dmu_tx_get_txg(tx);
  
+                       /*
+                        * Reacquire the vdev_config lock.  The vdev_t
+                        * that we're removing may have changed, e.g. due
+                        * to a vdev_attach or vdev_detach.
+                        */
+                       spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
+                       vd = vdev_lookup_top(spa, svr->svr_vdev_id);
+
                         if (txg != last_txg)
                                 max_alloc = zfs_remove_max_segment;
                         last_txg = txg;
  
-                       spa_vdev_copy_impl(svr, &vca, &max_alloc, tx);
+                       spa_vdev_copy_impl(vd, svr, &vca, &max_alloc, tx);
  
                         dmu_tx_commit(tx);
                         mutex_enter(&svr->svr_lock);
@@ -1272,6 +1296,9 @@ spa_vdev_remove_thread(void *arg)
         }
  
         mutex_exit(&svr->svr_lock);
+
+       spa_config_exit(spa, SCL_CONFIG, FTAG);
+
         /*
          * Wait for all copies to finish before cleaning up the vca.
          */
@@ -1289,7 +1316,7 @@ spa_vdev_remove_thread(void *arg)
                 mutex_exit(&svr->svr_lock);
         } else {
                 ASSERT0(range_tree_space(svr->svr_allocd_segs));
-               vdev_remove_complete(vd);
+               vdev_remove_complete(spa);
         }
  }
  
@@ -1330,7 +1357,7 @@ spa_vdev_remove_cancel_sync(void *arg, dmu_tx_t *tx)
  {
         spa_t *spa = dmu_tx_pool(tx)->dp_spa;
         spa_vdev_removal_t *svr = spa->spa_vdev_removal;
-       vdev_t *vd = svr->svr_vdev;
+       vdev_t *vd = vdev_lookup_top(spa, svr->svr_vdev_id);
         vdev_indirect_config_t *vic = &vd->vdev_indirect_config;
         vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;
         objset_t *mos = spa->spa_meta_objset;
@@ -1403,8 +1430,11 @@ spa_vdev_remove_cancel_sync(void *arg, dmu_tx_t *tx)
                          * because we have not allocated mappings for it yet.
                          */
                         uint64_t syncd = vdev_indirect_mapping_max_offset(vim);
-                       range_tree_clear(svr->svr_allocd_segs, syncd,
-                           msp->ms_sm->sm_start + msp->ms_sm->sm_size - syncd);
+                       uint64_t sm_end = msp->ms_sm->sm_start +
+                           msp->ms_sm->sm_size;
+                       if (sm_end > syncd)
+                               range_tree_clear(svr->svr_allocd_segs,
+                                   syncd, sm_end - syncd);
  
                         mutex_exit(&svr->svr_lock);
                 }
@@ -1465,7 +1495,7 @@ spa_vdev_remove_cancel(spa_t *spa)
         if (spa->spa_vdev_removal == NULL)
                 return (ENOTACTIVE);
  
-       uint64_t vdid = spa->spa_vdev_removal->svr_vdev->vdev_id;
+       uint64_t vdid = spa->spa_vdev_removal->svr_vdev_id;
  
         int error = dsl_sync_task(spa->spa_name, spa_vdev_remove_cancel_check,
             spa_vdev_remove_cancel_sync, NULL, 0, ZFS_SPACE_CHECK_NONE);
@@ -1774,7 +1804,7 @@ spa_vdev_remove_top(vdev_t *vd, uint64_t *txg)
         dmu_tx_t *tx = dmu_tx_create_assigned(spa->spa_dsl_pool, *txg);
         dsl_sync_task_nowait(spa->spa_dsl_pool,
             vdev_remove_initiate_sync,
-           vd, 0, ZFS_SPACE_CHECK_NONE, tx);
+           (void *)(uintptr_t)vd->vdev_id, 0, ZFS_SPACE_CHECK_NONE, tx);
         dmu_tx_commit(tx);
  
         return (0);
diff --git a/module/zfs/zio.c b/module/zfs/zio.c

index 5259a0c6f01df9b5bf5d698730d3f6835e3f6cf7..9bb2459bb3d130c897fc25062c10a62a116f0908 100644 (file)
--- a/module/zfs/zio.c
+++ b/module/zfs/zio.c
@@ -1212,17 +1212,6 @@ zio_vdev_child_io(zio_t *pio, blkptr_t *bp, vdev_t *vd, uint64_t offset,
         ASSERT((flags & ZIO_FLAG_OPTIONAL) || (flags & ZIO_FLAG_IO_REPAIR) ||
             done != NULL);
  
-       /*
-        * In the common case, where the parent zio was to a normal vdev,
-        * the child zio must be to a child vdev of that vdev.  Otherwise,
-        * the child zio must be to a top-level vdev.
-        */
-       if (pio->io_vd != NULL && pio->io_vd->vdev_ops != &vdev_indirect_ops) {
-               ASSERT3P(vd->vdev_parent, ==, pio->io_vd);
-       } else {
-               ASSERT3P(vd, ==, vd->vdev_top);
-       }
-
         if (type == ZIO_TYPE_READ && bp != NULL) {
                 /*
                  * If we have the bp, then the child should perform the
@@ -1283,7 +1272,7 @@ zio_vdev_child_io(zio_t *pio, blkptr_t *bp, vdev_t *vd, uint64_t offset,
  
  zio_t *
  zio_vdev_delegated_io(vdev_t *vd, uint64_t offset, abd_t *data, uint64_t size,
-    int type, zio_priority_t priority, enum zio_flag flags,
+    zio_type_t type, zio_priority_t priority, enum zio_flag flags,
      zio_done_func_t *done, void *private)
  {
         zio_t *zio;
@@ -3481,7 +3470,7 @@ zio_vdev_io_start(zio_t *zio)
                  */
                 ASSERT(zio->io_flags &
                     (ZIO_FLAG_PHYSICAL | ZIO_FLAG_SELF_HEAL |
-                   ZIO_FLAG_INDUCE_DAMAGE));
+                   ZIO_FLAG_RESILVER | ZIO_FLAG_INDUCE_DAMAGE));
         }
  
         align = 1ULL << vd->vdev_top->vdev_ashift;
@@ -3521,18 +3510,37 @@ zio_vdev_io_start(zio_t *zio)
          * If this is a repair I/O, and there's no self-healing involved --
          * that is, we're just resilvering what we expect to resilver --
          * then don't do the I/O unless zio's txg is actually in vd's DTL.
-        * This prevents spurious resilvering with nested replication.
-        * For example, given a mirror of mirrors, (A+B)+(C+D), if only
-        * A is out of date, we'll read from C+D, then use the data to
-        * resilver A+B -- but we don't actually want to resilver B, just A.
-        * The top-level mirror has no way to know this, so instead we just
-        * discard unnecessary repairs as we work our way down the vdev tree.
-        * The same logic applies to any form of nested replication:
-        * ditto + mirror, RAID-Z + replacing, etc.  This covers them all.
+        * This prevents spurious resilvering.
+        *
+        * There are a few ways that we can end up creating these spurious
+        * resilver i/os:
+        *
+        * 1. A resilver i/o will be issued if any DVA in the BP has a
+        * dirty DTL.  The mirror code will issue resilver writes to
+        * each DVA, including the one(s) that are not on vdevs with dirty
+        * DTLs.
+        *
+        * 2. With nested replication, which happens when we have a
+        * "replacing" or "spare" vdev that's a child of a mirror or raidz.
+        * For example, given mirror(replacing(A+B), C), it's likely that
+        * only A is out of date (it's the new device). In this case, we'll
+        * read from C, then use the data to resilver A+B -- but we don't
+        * actually want to resilver B, just A. The top-level mirror has no
+        * way to know this, so instead we just discard unnecessary repairs
+        * as we work our way down the vdev tree.
+        *
+        * 3. ZTEST also creates mirrors of mirrors, mirrors of raidz, etc.
+        * The same logic applies to any form of nested replication: ditto
+        * + mirror, RAID-Z + replacing, etc.
+        *
+        * However, indirect vdevs point off to other vdevs which may have
+        * DTL's, so we never bypass them.  The child i/os on concrete vdevs
+        * will be properly bypassed instead.
          */
         if ((zio->io_flags & ZIO_FLAG_IO_REPAIR) &&
             !(zio->io_flags & ZIO_FLAG_SELF_HEAL) &&
             zio->io_txg != 0 && /* not a delegated i/o */
+           vd->vdev_ops != &vdev_indirect_ops &&
             !vdev_dtl_contains(vd, DTL_PARTIAL, zio->io_txg, 1)) {
                 ASSERT(zio->io_type == ZIO_TYPE_WRITE);
                 zio_vdev_io_bypass(zio);
author	Matthew Ahrens <mahrens@delphix.com>
	Tue, 13 Feb 2018 19:37:56 +0000 (11:37 -0800)
committer	Brian Behlendorf <behlendorf1@llnl.gov>
	Sat, 14 Apr 2018 19:21:39 +0000 (12:21 -0700)
cmd/zdb/zdb.c		patch \| blob \| history
cmd/ztest/ztest.c		patch \| blob \| history
include/sys/vdev_removal.h		patch \| blob \| history
include/sys/zio.h		patch \| blob \| history
lib/libzfs/libzfs_pool.c		patch \| blob \| history
man/man5/zfs-module-parameters.5		patch \| blob \| history
module/zfs/dsl_scan.c		patch \| blob \| history
module/zfs/metaslab.c		patch \| blob \| history
module/zfs/spa.c		patch \| blob \| history
module/zfs/spa_misc.c		patch \| blob \| history
module/zfs/vdev.c		patch \| blob \| history
module/zfs/vdev_indirect.c		patch \| blob \| history
module/zfs/vdev_mirror.c		patch \| blob \| history
module/zfs/vdev_removal.c		patch \| blob \| history
module/zfs/zio.c		patch \| blob \| history