]> granicus.if.org Git - zfs/commitdiff
Optimize possible split block search space
authorBrian Behlendorf <behlendorf1@llnl.gov>
Thu, 29 Mar 2018 21:50:40 +0000 (14:50 -0700)
committerBrian Behlendorf <behlendorf1@llnl.gov>
Sat, 14 Apr 2018 19:22:43 +0000 (12:22 -0700)
Remove duplicate segment copies to minimize the possible search
space for reconstruction.  Once reduced an accurate assessment can
be made regarding the difficulty in reconstructing the block.

Also, ztest will now run zdb with
zfs_reconstruct_indirect_combinations_max set to 1000000 in an attempt
to avoid checksum errors.

Reviewed-by: Matthew Ahrens <mahrens@delphix.com>
Reviewed-by: Tim Chase <tim@chase2k.com>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #6900

cmd/ztest/ztest.c
man/man5/zfs-module-parameters.5
module/zfs/vdev_indirect.c

index c81d446a542ffc9aad3985c005ff0010dc943c2d..315169571db6bb44e2ca8371fba40bed921f77a5 100644 (file)
@@ -6221,7 +6221,8 @@ ztest_run_zdb(char *pool)
        ztest_get_zdb_bin(bin, len);
 
        (void) sprintf(zdb,
-           "%s -bcc%s%s -G -d -U %s %s",
+           "%s -bcc%s%s -G -d -U %s "
+           "-o zfs_reconstruct_indirect_combinations_max=1000000 %s",
            bin,
            ztest_opts.zo_verbose >= 3 ? "s" : "",
            ztest_opts.zo_verbose >= 4 ? "v" : "",
index 6e670facb5ce8e62b57b199169a189978fea4466..8b007ea17e8b1390e3ae868c117c8dfdd48d63d3 100644 (file)
@@ -1742,18 +1742,18 @@ Use \fB1\fR for yes and \fB0\fR for no (default).
 .sp
 .ne 2
 .na
-\fBzfs_reconstruct_indirect_segments_max\fR (int)
-.ad
-.RS 12n
-When a split block which is part of a indirect vdev contains more than this
-many segments, consider it too computationally expensive to check all possible
-combinations.  Instead, operate under the assumption that only a few segment
-copies are damaged and the majority of segment copies are good, in which case
-it is reasonable to randomly select sample combinations.  This allows all the
-segment copies to participate fairly in the reconstruction and prevents the
-repeated use of one bad copy.
-.sp
-Default value: \fB10\fR.
+\fBzfs_reconstruct_indirect_combinations_max\fR (int)
+.ad
+.RS 12na
+If an indirect split block contains more than this many possible unique
+combinations when being reconstructed, consider it too computationally
+expensive to check them all. Instead, try at most
+\fBzfs_reconstruct_indirect_combinations_max\fR randomly-selected
+combinations each time the block is accessed.  This allows all segment
+copies to participate fairly in the reconstruction when all combinations
+cannot be checked and prevents repeated use of one bad copy.
+.sp
+Default value: \fB100\fR.
 .RE
 
 .sp
index 3ccdfee3bd88a8818f88ed1f12b8b639dd3ce0cf..b30ddaf266bfca75461edfb2e1e34355d61c50e5 100644 (file)
@@ -204,17 +204,14 @@ unsigned long zfs_condense_min_mapping_bytes = 128 * 1024;
 int zfs_condense_indirect_commit_entry_delay_ms = 0;
 
 /*
- * If a split block contains more than this many segments, consider it too
- * computationally expensive to check all (2^num_segments) possible
- * combinations. Instead, try at most 2^_segments_max randomly-selected
- * combinations.
- *
- * This is reasonable if only a few segment copies are damaged and the
- * majority of segment copies are good. It allows all segment copies to
- * participate fairly in the reconstruction and prevents repeated use of
- * one bad copy.
+ * If an indirect split block contains more than this many possible unique
+ * combinations when being reconstructed, consider it too computationally
+ * expensive to check them all. Instead, try at most 100 randomly-selected
+ * combinations each time the block is accessed.  This allows all segment
+ * copies to participate fairly in the reconstruction when all combinations
+ * cannot be checked and prevents repeated use of one bad copy.
  */
-int zfs_reconstruct_indirect_segments_max = 10;
+int zfs_reconstruct_indirect_combinations_max = 100;
 
 /*
  * The indirect_child_t represents the vdev that we will read from, when we
@@ -226,6 +223,12 @@ int zfs_reconstruct_indirect_segments_max = 10;
 typedef struct indirect_child {
        abd_t *ic_data;
        vdev_t *ic_vdev;
+
+       /*
+        * ic_duplicate is -1 when the ic_data contents are unique, when it
+        * is determined to be a duplicate it refers to the primary child.
+        */
+       int ic_duplicate;
 } indirect_child_t;
 
 /*
@@ -1165,6 +1168,7 @@ vdev_indirect_read_all(zio_t *zio)
 
                        ic->ic_data = abd_alloc_sametype(zio->io_abd,
                            is->is_size);
+                       ic->ic_duplicate = -1;
 
                        zio_nowait(zio_vdev_child_io(zio, NULL,
                            ic->ic_vdev, is->is_target_offset, ic->ic_data,
@@ -1314,7 +1318,7 @@ vdev_indirect_repair(zio_t *zio)
                                continue;
                        if (ic->ic_data == NULL)
                                continue;
-                       if (abd_cmp(good_child->ic_data, ic->ic_data) == 0)
+                       if (ic->ic_duplicate == is->is_good_child)
                                continue;
 
                        zio_nowait(zio_vdev_child_io(zio, NULL,
@@ -1367,15 +1371,16 @@ vdev_indirect_all_checksum_errors(zio_t *zio)
  *
  * If we pointed to any mirror vdevs, this effectively does the job of the
  * mirror.  The mirror vdev code can't do its own job because we don't know
- * the checksum of each split segment individually.  We have to try every
- * combination of copies of split segments, until we find one that checksums
- * correctly.  (Or until we have tried all combinations, or have tried
- * 2^zfs_reconstruct_indirect_segments_max combinations.  In these cases we
- * set io_error to ECKSUM to propagate the error up to the user.)
+ * the checksum of each split segment individually.
  *
- * For example, if we have 3 segments in the split,
- * and each points to a 2-way mirror, we will have the following pieces of
- * data:
+ * We have to try every unique combination of copies of split segments, until
+ * we find one that checksums correctly.  Duplicate segment copies are first
+ * discarded as an optimization to reduce the search space.  After pruning
+ * there will exist at most one valid combination.
+ *
+ * When the total number of combinations is small they can all be checked.
+ * For example, if we have 3 segments in the split, and each points to a
+ * 2-way mirror with unique copies, we will have the following pieces of data:
  *
  *       |     mirror child
  * split |     [0]        [1]
@@ -1414,12 +1419,48 @@ vdev_indirect_reconstruct_io_done(zio_t *zio)
 {
        indirect_vsd_t *iv = zio->io_vsd;
        uint64_t attempts = 0;
-       uint64_t attempts_max = 1ULL << zfs_reconstruct_indirect_segments_max;
-       int segments = 0;
+       uint64_t attempts_max = UINT64_MAX;
+       uint64_t combinations = 1;
+
+       if (zfs_reconstruct_indirect_combinations_max > 0)
+               attempts_max = zfs_reconstruct_indirect_combinations_max;
 
+       /*
+        * Discard duplicate copies of split segments to minimize the
+        * number of unique combinations when attempting reconstruction.
+        */
        for (indirect_split_t *is = list_head(&iv->iv_splits);
-           is != NULL; is = list_next(&iv->iv_splits, is))
-               segments++;
+           is != NULL; is = list_next(&iv->iv_splits, is)) {
+               uint64_t is_copies = 0;
+
+               for (int i = 0; i < is->is_children; i++) {
+                       if (is->is_child[i].ic_data == NULL)
+                               continue;
+
+                       for (int j = i + 1; j < is->is_children; j++) {
+                               if (is->is_child[j].ic_data == NULL)
+                                       continue;
+
+                               if (is->is_child[j].ic_duplicate == -1 &&
+                                   abd_cmp(is->is_child[i].ic_data,
+                                   is->is_child[j].ic_data) == 0) {
+                                       is->is_child[j].ic_duplicate = i;
+                               }
+                       }
+
+                       is_copies++;
+               }
+
+               /* Reconstruction is impossible, no valid is->is_child[] */
+               if (is_copies == 0) {
+                       zio->io_error = EIO;
+                       vdev_indirect_all_checksum_errors(zio);
+                       zio_checksum_verified(zio);
+                       return;
+               }
+
+               combinations *= is_copies;
+       }
 
        for (;;) {
                /* copy data from splits to main zio */
@@ -1436,6 +1477,15 @@ vdev_indirect_reconstruct_io_done(zio_t *zio)
                                goto next;
                        }
 
+                       /*
+                        * If this child is a duplicate, its is_duplicate will
+                        * refer to the primary copy.  Skip this combination.
+                        */
+                       if (is->is_child[is->is_good_child].ic_duplicate >= 0) {
+                               ret = ECKSUM;
+                               goto next;
+                       }
+
                        abd_copy_off(zio->io_abd,
                            is->is_child[is->is_good_child].ic_data,
                            is->is_split_offset, 0, is->is_size);
@@ -1458,14 +1508,14 @@ vdev_indirect_reconstruct_io_done(zio_t *zio)
                boolean_t more;
 next:
                more = B_FALSE;
-               if (segments <= zfs_reconstruct_indirect_segments_max) {
+               if (combinations <= attempts_max) {
                        /*
-                        * There are relatively few segments, so
-                        * deterministically check all combinations.  We do
-                        * this by by adding one to the first split's
-                        * good_child.  If it overflows, then "carry over" to
-                        * the next split (like counting in base is_children,
-                        * but each digit can have a different base).
+                        * There are relatively few possible combinations, so
+                        * deterministically check them all.  We do this by
+                        * adding one to the first split's good_child.  If it
+                        * overflows, then "carry over" to the next split
+                        * (like counting in base is_children, but each
+                        * digit can have a different base).
                         */
                        for (indirect_split_t *is = list_head(&iv->iv_splits);
                            is != NULL; is = list_next(&iv->iv_splits, is)) {
@@ -1485,8 +1535,12 @@ next:
                         */
                        for (indirect_split_t *is = list_head(&iv->iv_splits);
                            is != NULL; is = list_next(&iv->iv_splits, is)) {
-                               is->is_good_child =
-                                   spa_get_random(is->is_children);
+                               int c = spa_get_random(is->is_children);
+
+                               while (is->is_child[c].ic_duplicate >= 0)
+                                       c = (c + 1) % is->is_children;
+
+                               is->is_good_child = c;
                        }
                        more = B_TRUE;
                }
@@ -1577,7 +1631,7 @@ module_param(zfs_condense_indirect_commit_entry_delay_ms, int, 0644);
 MODULE_PARM_DESC(zfs_condense_indirect_commit_entry_delay_ms,
        "Delay while condensing vdev mapping");
 
-module_param(zfs_reconstruct_indirect_segments_max, int, 0644);
-MODULE_PARM_DESC(zfs_reconstruct_indirect_segments_max,
-       "Maximum number of split segments check all combinations");
+module_param(zfs_reconstruct_indirect_combinations_max, int, 0644);
+MODULE_PARM_DESC(zfs_reconstruct_indirect_combinations_max,
+       "Maximum number of combinations when reconstructing split segments");
 #endif