Merge commit 'refs/top-bases/gcc-c90' into gcc-c90
author Brian Behlendorf <behlendorf1@llnl.gov>
Fri, 28 May 2010 21:19:22 +0000 (14:19 -0700)
committer Brian Behlendorf <behlendorf1@llnl.gov>
Fri, 28 May 2010 21:19:22 +0000 (14:19 -0700)
Conflicts:
cmd/zdb/zdb.c
cmd/ztest/ztest.c
module/zfs/dbuf.c
module/zfs/dsl_dataset.c
module/zfs/dsl_scrub.c
module/zfs/spa.c
module/zfs/vdev.c
module/zfs/zio.c
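
The gcc-c90 topic branch being merged here rewrites C99-style for-loop index declarations into C90 block-scope declarations so the tree builds without "'for' loop initial declarations are only allowed in C99 mode" errors; most of the conflicted hunks below are that pattern. A minimal sketch of the conversion, using a hypothetical helper that is not part of this commit:

        /*
         * C99 form removed by the gcc-c90 branch:
         *     for (int i = 0; i < n; i++)
         * C90 form used throughout this merge:
         */
        static int
        sum_c90(const int *a, int n)
        {
                int s = 0;
                int i;          /* index declared at block scope, not in for() */

                for (i = 0; i < n; i++)
                        s += a[i];

                return (s);
        }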

25 files changed:
cmd/zdb/zdb.c
cmd/zfs/zfs_main.c
cmd/ztest/ztest.c
lib/libzfs/libzfs_import.c
module/zcommon/zprop_common.c
module/zfs/arc.c
module/zfs/dbuf.c
module/zfs/ddt.c
module/zfs/dmu.c
module/zfs/dmu_objset.c
module/zfs/dmu_tx.c
module/zfs/dsl_dataset.c
module/zfs/dsl_dir.c
module/zfs/dsl_scan.c
module/zfs/include/sys/spa.h
module/zfs/metaslab.c
module/zfs/spa.c
module/zfs/spa_misc.c
module/zfs/vdev.c
module/zfs/vdev_label.c
module/zfs/vdev_mirror.c
module/zfs/vdev_queue.c
module/zfs/vdev_raidz.c
module/zfs/vdev_root.c
module/zfs/zio.c

diff --cc cmd/zdb/zdb.c
index 4787818825d2b7ef629b630f557ff332c143b02d,ff73072f8a64f5622c23cb5b2b3c625af3b9e9b4..202b5a6197268a2eb383308d84b429402bfd922d
@@@ -546,6 -637,133 +637,134 @@@ dump_metaslabs(spa_t *spa
        }
  }
  
 -      for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
+ static void
+ dump_dde(const ddt_t *ddt, const ddt_entry_t *dde, uint64_t index)
+ {
+       const ddt_phys_t *ddp = dde->dde_phys;
+       const ddt_key_t *ddk = &dde->dde_key;
+       char *types[4] = { "ditto", "single", "double", "triple" };
+       char blkbuf[BP_SPRINTF_LEN];
+       blkptr_t blk;
++      int p;
++      for (p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
+               if (ddp->ddp_phys_birth == 0)
+                       continue;
+               ddt_bp_create(ddt->ddt_checksum, ddk, ddp, &blk);
+               sprintf_blkptr(blkbuf, &blk);
+               (void) printf("index %llx refcnt %llu %s %s\n",
+                   (u_longlong_t)index, (u_longlong_t)ddp->ddp_refcnt,
+                   types[p], blkbuf);
+       }
+ }
+ static void
+ dump_dedup_ratio(const ddt_stat_t *dds)
+ {
+       double rL, rP, rD, D, dedup, compress, copies;
+       if (dds->dds_blocks == 0)
+               return;
+       rL = (double)dds->dds_ref_lsize;
+       rP = (double)dds->dds_ref_psize;
+       rD = (double)dds->dds_ref_dsize;
+       D = (double)dds->dds_dsize;
+       dedup = rD / D;
+       compress = rL / rP;
+       copies = rD / rP;
+       (void) printf("dedup = %.2f, compress = %.2f, copies = %.2f, "
+           "dedup * compress / copies = %.2f\n\n",
+           dedup, compress, copies, dedup * compress / copies);
+ }
+ static void
+ dump_ddt(ddt_t *ddt, enum ddt_type type, enum ddt_class class)
+ {
+       char name[DDT_NAMELEN];
+       ddt_entry_t dde;
+       uint64_t walk = 0;
+       dmu_object_info_t doi;
+       uint64_t count, dspace, mspace;
+       int error;
+       error = ddt_object_info(ddt, type, class, &doi);
+       if (error == ENOENT)
+               return;
+       ASSERT(error == 0);
+       count = ddt_object_count(ddt, type, class);
+       dspace = doi.doi_physical_blocks_512 << 9;
+       mspace = doi.doi_fill_count * doi.doi_data_block_size;
+       ASSERT(count != 0);     /* we should have destroyed it */
+       ddt_object_name(ddt, type, class, name);
+       (void) printf("%s: %llu entries, size %llu on disk, %llu in core\n",
+           name,
+           (u_longlong_t)count,
+           (u_longlong_t)(dspace / count),
+           (u_longlong_t)(mspace / count));
+       if (dump_opt['D'] < 3)
+               return;
+       zpool_dump_ddt(NULL, &ddt->ddt_histogram[type][class]);
+       if (dump_opt['D'] < 4)
+               return;
+       if (dump_opt['D'] < 5 && class == DDT_CLASS_UNIQUE)
+               return;
+       (void) printf("%s contents:\n\n", name);
+       while ((error = ddt_object_walk(ddt, type, class, &walk, &dde)) == 0)
+               dump_dde(ddt, &dde, walk);
+       ASSERT(error == ENOENT);
+       (void) printf("\n");
+ }
+ static void
+ dump_all_ddts(spa_t *spa)
+ {
+       ddt_histogram_t ddh_total = { 0 };
+       ddt_stat_t dds_total = { 0 };
+       for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
+               ddt_t *ddt = spa->spa_ddt[c];
+               for (enum ddt_type type = 0; type < DDT_TYPES; type++) {
+                       for (enum ddt_class class = 0; class < DDT_CLASSES;
+                           class++) {
+                               dump_ddt(ddt, type, class);
+                       }
+               }
+       }
+       ddt_get_dedup_stats(spa, &dds_total);
+       if (dds_total.dds_blocks == 0) {
+               (void) printf("All DDTs are empty\n");
+               return;
+       }
+       (void) printf("\n");
+       if (dump_opt['D'] > 1) {
+               (void) printf("DDT histogram (aggregated over all DDTs):\n");
+               ddt_get_dedup_histogram(spa, &ddh_total);
+               zpool_dump_ddt(&dds_total, &ddh_total);
+       }
+       dump_dedup_ratio(&dds_total);
+ }
  static void
  dump_dtl_seg(space_map_t *sm, uint64_t start, uint64_t size)
  {
@@@ -565,9 -783,8 +784,9 @@@ dump_dtl(vdev_t *vd, int indent
        boolean_t required;
        char *name[DTL_TYPES] = { "missing", "partial", "scrub", "outage" };
        char prefix[256];
 +      int c, t;
  
-       spa_vdev_state_enter(spa);
+       spa_vdev_state_enter(spa, SCL_NONE);
        required = vdev_dtl_required(vd);
        (void) spa_vdev_state_exit(spa, NULL, 0);
  
                dump_dtl(vd->vdev_child[c], indent + 4);
  }
  
 -      for (int i = 0; i < num; i++) {
+ static void
+ dump_history(spa_t *spa)
+ {
+       nvlist_t **events = NULL;
+       char buf[SPA_MAXBLOCKSIZE];
+       uint64_t resid, len, off = 0;
+       uint_t num = 0;
+       int error;
+       time_t tsec;
+       struct tm t;
+       char tbuf[30];
+       char internalstr[MAXPATHLEN];
++      int i;
+       do {
+               len = sizeof (buf);
+               if ((error = spa_history_get(spa, &off, &len, buf)) != 0) {
+                       (void) fprintf(stderr, "Unable to read history: "
+                           "error %d\n", error);
+                       return;
+               }
+               if (zpool_history_unpack(buf, len, &resid, &events, &num) != 0)
+                       break;
+               off -= resid;
+       } while (len != 0);
+       (void) printf("\nHistory:\n");
++      for (i = 0; i < num; i++) {
+               uint64_t time, txg, ievent;
+               char *cmd, *intstr;
+               if (nvlist_lookup_uint64(events[i], ZPOOL_HIST_TIME,
+                   &time) != 0)
+                       continue;
+               if (nvlist_lookup_string(events[i], ZPOOL_HIST_CMD,
+                   &cmd) != 0) {
+                       if (nvlist_lookup_uint64(events[i],
+                           ZPOOL_HIST_INT_EVENT, &ievent) != 0)
+                               continue;
+                       verify(nvlist_lookup_uint64(events[i],
+                           ZPOOL_HIST_TXG, &txg) == 0);
+                       verify(nvlist_lookup_string(events[i],
+                           ZPOOL_HIST_INT_STR, &intstr) == 0);
+                       if (ievent >= LOG_END)
+                               continue;
+                       (void) snprintf(internalstr,
+                           sizeof (internalstr),
+                           "[internal %s txg:%lld] %s",
+                           zfs_history_event_names[ievent], txg,
+                           intstr);
+                       cmd = internalstr;
+               }
+               tsec = time;
+               (void) localtime_r(&tsec, &t);
+               (void) strftime(tbuf, sizeof (tbuf), "%F.%T", &t);
+               (void) printf("%s %s\n", tbuf, cmd);
+       }
+ }
  /*ARGSUSED*/
  static void
  dump_dnode(objset_t *os, uint64_t object, void *data, size_t size)
@@@ -614,15 -900,19 +903,20 @@@ blkid2offset(const dnode_phys_t *dnp, c
  }
  
  static void
- sprintf_blkptr_compact(char *blkbuf, blkptr_t *bp, int alldvas)
+ sprintf_blkptr_compact(char *blkbuf, const blkptr_t *bp)
  {
-       dva_t *dva = bp->blk_dva;
-       int ndvas = alldvas ? BP_GET_NDVAS(bp) : 1;
+       const dva_t *dva = bp->blk_dva;
+       int ndvas = dump_opt['d'] > 5 ? BP_GET_NDVAS(bp) : 1;
 +      int i;
  
+       if (dump_opt['b'] >= 5) {
+               sprintf_blkptr(blkbuf, bp);
+               return;
+       }
        blkbuf[0] = '\0';
  
 -      for (int i = 0; i < ndvas; i++)
 +      for (i = 0; i < ndvas; i++)
                (void) sprintf(blkbuf + strlen(blkbuf), "%llu:%llx:%llx ",
                    (u_longlong_t)DVA_GET_VDEV(&dva[i]),
                    (u_longlong_t)DVA_GET_OFFSET(&dva[i]),
@@@ -1344,19 -1739,52 +1743,54 @@@ dump_cachefile(const char *cachefile
        nvlist_free(config);
  }
  
 -      for (int i = 0; i < VDEV_UBERBLOCK_COUNT(vdp); i++) {
+ #define       ZDB_MAX_UB_HEADER_SIZE 32
+ static void
+ dump_label_uberblocks(vdev_label_t *lbl, uint64_t ashift)
+ {
+       vdev_t vd;
+       vdev_t *vdp = &vd;
+       char header[ZDB_MAX_UB_HEADER_SIZE];
++      int i;
+       vd.vdev_ashift = ashift;
+       vdp->vdev_top = vdp;
++      for (i = 0; i < VDEV_UBERBLOCK_COUNT(vdp); i++) {
+               uint64_t uoff = VDEV_UBERBLOCK_OFFSET(vdp, i);
+               uberblock_t *ub = (void *)((char *)lbl + uoff);
+               if (uberblock_verify(ub))
+                       continue;
+               (void) snprintf(header, ZDB_MAX_UB_HEADER_SIZE,
+                   "Uberblock[%d]\n", i);
+               dump_uberblock(ub, header, "");
+       }
+ }
  static void
  dump_label(const char *dev)
  {
        int fd;
        vdev_label_t label;
-       char *buf = label.vl_vdev_phys.vp_nvlist;
+       char *path, *buf = label.vl_vdev_phys.vp_nvlist;
        size_t buflen = sizeof (label.vl_vdev_phys.vp_nvlist);
        struct stat64 statbuf;
-       uint64_t psize;
+       uint64_t psize, ashift;
+       int len = strlen(dev) + 1;
 +      int l;
  
-       if ((fd = open64(dev, O_RDONLY)) < 0) {
-               (void) printf("cannot open '%s': %s\n", dev, strerror(errno));
+       if (strncmp(dev, "/dev/dsk/", 9) == 0) {
+               len++;
+               path = malloc(len);
+               (void) snprintf(path, len, "%s%s", "/dev/rdsk/", dev + 9);
+       } else {
+               path = strdup(dev);
+       }
+       if ((fd = open64(path, O_RDONLY)) < 0) {
+               (void) printf("cannot open '%s': %s\n", path, strerror(errno));
+               free(path);
                exit(1);
        }
  
        psize = statbuf.st_size;
        psize = P2ALIGN(psize, (uint64_t)sizeof (vdev_label_t));
  
 -      for (int l = 0; l < VDEV_LABELS; l++) {
 +      for (l = 0; l < VDEV_LABELS; l++) {
                nvlist_t *config = NULL;
  
                (void) printf("--------------------------------------------\n");
@@@ -1507,13 -1897,19 +1903,20 @@@ typedef struct zdb_cb 
  } zdb_cb_t;
  
  static void
- zdb_count_block(spa_t *spa, zdb_cb_t *zcb, blkptr_t *bp, dmu_object_type_t type)
+ zdb_count_block(zdb_cb_t *zcb, zilog_t *zilog, const blkptr_t *bp,
+     dmu_object_type_t type)
  {
+       uint64_t refcnt = 0;
 +      int i;
  
 -      for (int i = 0; i < 4; i++) {
+       ASSERT(type < ZDB_OT_TOTAL);
+       if (zilog && zil_bp_tree_add(zilog, bp) != 0)
+               return;
 +      for (i = 0; i < 4; i++) {
                int l = (i < 2) ? BP_GET_LEVEL(bp) : ZB_TOTAL;
-               int t = (i & 1) ? type : DMU_OT_TOTAL;
+               int t = (i & 1) ? type : ZDB_OT_TOTAL;
                zdb_blkstats_t *zb = &zcb->zcb_type[l][t];
  
                zb->zb_asize += BP_GET_ASIZE(bp);
@@@ -1625,24 -2017,159 +2024,164 @@@ zdb_blkptr_cb(spa_t *spa, zilog_t *zilo
        return (0);
  }
  
 -              for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
+ static void
+ zdb_leak(space_map_t *sm, uint64_t start, uint64_t size)
+ {
+       vdev_t *vd = sm->sm_ppd;
+       (void) printf("leaked space: vdev %llu, offset 0x%llx, size %llu\n",
+           (u_longlong_t)vd->vdev_id, (u_longlong_t)start, (u_longlong_t)size);
+ }
+ /* ARGSUSED */
+ static void
+ zdb_space_map_load(space_map_t *sm)
+ {
+ }
+ static void
+ zdb_space_map_unload(space_map_t *sm)
+ {
+       space_map_vacate(sm, zdb_leak, sm);
+ }
+ /* ARGSUSED */
+ static void
+ zdb_space_map_claim(space_map_t *sm, uint64_t start, uint64_t size)
+ {
+ }
+ static space_map_ops_t zdb_space_map_ops = {
+       zdb_space_map_load,
+       zdb_space_map_unload,
+       NULL,   /* alloc */
+       zdb_space_map_claim,
+       NULL,   /* free */
+       NULL    /* maxsize */
+ };
+ static void
+ zdb_ddt_leak_init(spa_t *spa, zdb_cb_t *zcb)
+ {
+       ddt_bookmark_t ddb = { 0 };
+       ddt_entry_t dde;
+       int error;
++      int p;
+       while ((error = ddt_walk(spa, &ddb, &dde)) == 0) {
+               blkptr_t blk;
+               ddt_phys_t *ddp = dde.dde_phys;
+               if (ddb.ddb_class == DDT_CLASS_UNIQUE)
+                       return;
+               ASSERT(ddt_phys_total_refcnt(&dde) > 1);
 -              for (int c = 0; c < rvd->vdev_children; c++) {
++              for (p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
+                       if (ddp->ddp_phys_birth == 0)
+                               continue;
+                       ddt_bp_create(ddb.ddb_checksum,
+                           &dde.dde_key, ddp, &blk);
+                       if (p == DDT_PHYS_DITTO) {
+                               zdb_count_block(zcb, NULL, &blk, ZDB_OT_DITTO);
+                       } else {
+                               zcb->zcb_dedup_asize +=
+                                   BP_GET_ASIZE(&blk) * (ddp->ddp_refcnt - 1);
+                               zcb->zcb_dedup_blocks++;
+                       }
+               }
+               if (!dump_opt['L']) {
+                       ddt_t *ddt = spa->spa_ddt[ddb.ddb_checksum];
+                       ddt_enter(ddt);
+                       VERIFY(ddt_lookup(ddt, &blk, B_TRUE) != NULL);
+                       ddt_exit(ddt);
+               }
+       }
+       ASSERT(error == ENOENT);
+ }
+ static void
+ zdb_leak_init(spa_t *spa, zdb_cb_t *zcb)
+ {
+       zcb->zcb_spa = spa;
++      int c, m;
+       if (!dump_opt['L']) {
+               vdev_t *rvd = spa->spa_root_vdev;
 -                      for (int m = 0; m < vd->vdev_ms_count; m++) {
++              for (c = 0; c < rvd->vdev_children; c++) {
+                       vdev_t *vd = rvd->vdev_child[c];
 -              for (int c = 0; c < rvd->vdev_children; c++) {
++                      for (m = 0; m < vd->vdev_ms_count; m++) {
+                               metaslab_t *msp = vd->vdev_ms[m];
+                               mutex_enter(&msp->ms_lock);
+                               space_map_unload(&msp->ms_map);
+                               VERIFY(space_map_load(&msp->ms_map,
+                                   &zdb_space_map_ops, SM_ALLOC, &msp->ms_smo,
+                                   spa->spa_meta_objset) == 0);
+                               msp->ms_map.sm_ppd = vd;
+                               mutex_exit(&msp->ms_lock);
+                       }
+               }
+       }
+       spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
+       zdb_ddt_leak_init(spa, zcb);
+       spa_config_exit(spa, SCL_CONFIG, FTAG);
+ }
+ static void
+ zdb_leak_fini(spa_t *spa)
+ {
++      int c, m;
++
+       if (!dump_opt['L']) {
+               vdev_t *rvd = spa->spa_root_vdev;
 -                      for (int m = 0; m < vd->vdev_ms_count; m++) {
++              for (c = 0; c < rvd->vdev_children; c++) {
+                       vdev_t *vd = rvd->vdev_child[c];
++                      for (m = 0; m < vd->vdev_ms_count; m++) {
+                               metaslab_t *msp = vd->vdev_ms[m];
+                               mutex_enter(&msp->ms_lock);
+                               space_map_unload(&msp->ms_map);
+                               mutex_exit(&msp->ms_lock);
+                       }
+               }
+       }
+ }
+ /* ARGSUSED */
+ static int
+ count_block_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
+ {
+       zdb_cb_t *zcb = arg;
+       if (dump_opt['b'] >= 4) {
+               char blkbuf[BP_SPRINTF_LEN];
+               sprintf_blkptr(blkbuf, bp);
+               (void) printf("[%s] %s\n",
+                   "deferred free", blkbuf);
+       }
+       zdb_count_block(zcb, NULL, bp, ZDB_OT_DEFERRED);
+       return (0);
+ }
  static int
  dump_block_stats(spa_t *spa)
  {
        zdb_cb_t zcb = { 0 };
        zdb_blkstats_t *zb, *tzb;
-       uint64_t alloc, space, logalloc;
-       vdev_t *rvd = spa->spa_root_vdev;
+       uint64_t norm_alloc, norm_space, total_alloc, total_found;
+       int flags = TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA | TRAVERSE_HARD;
        int leaks = 0;
-       int c, e;
++      int e;
  
-       if (!dump_opt['S']) {
-               (void) printf("\nTraversing all blocks %s%s%s%s%s...\n",
-                   (dump_opt['c'] || !dump_opt['L']) ? "to verify " : "",
-                   (dump_opt['c'] == 1) ? "metadata " : "",
-                   dump_opt['c'] ? "checksums " : "",
-                   (dump_opt['c'] && !dump_opt['L']) ? "and verify " : "",
-                   !dump_opt['L'] ? "nothing leaked " : "");
-       }
+       (void) printf("\nTraversing all blocks %s%s%s%s%s...\n",
+           (dump_opt['c'] || !dump_opt['L']) ? "to verify " : "",
+           (dump_opt['c'] == 1) ? "metadata " : "",
+           dump_opt['c'] ? "checksums " : "",
+           (dump_opt['c'] && !dump_opt['L']) ? "and verify " : "",
+           !dump_opt['L'] ? "nothing leaked " : "");
  
        /*
         * Load all space maps as SM_ALLOC maps, then traverse the pool
        /*
         * If there's a deferred-free bplist, process that first.
         */
-       if (spa->spa_sync_bplist_obj != 0) {
-               bplist_t *bpl = &spa->spa_sync_bplist;
-               blkptr_t blk;
-               uint64_t itor = 0;
+       (void) bpobj_iterate_nofree(&spa->spa_deferred_bpobj,
+           count_block_cb, &zcb, NULL);
+       (void) bpobj_iterate_nofree(&spa->spa_dsl_pool->dp_free_bpobj,
+           count_block_cb, &zcb, NULL);
  
-               VERIFY(0 == bplist_open(bpl, spa->spa_meta_objset,
-                   spa->spa_sync_bplist_obj));
+       if (dump_opt['c'] > 1)
+               flags |= TRAVERSE_PREFETCH_DATA;
  
-               while (bplist_iterate(bpl, &itor, &blk) == 0) {
-                       if (dump_opt['b'] >= 4) {
-                               char blkbuf[BP_SPRINTF_LEN];
-                               sprintf_blkptr(blkbuf, BP_SPRINTF_LEN, &blk);
-                               (void) printf("[%s] %s\n",
-                                   "deferred free", blkbuf);
-                       }
-                       zdb_count_block(spa, &zcb, &blk, DMU_OT_DEFERRED);
-               }
-               bplist_close(bpl);
-       }
-       zcb.zcb_haderrors |= traverse_pool(spa, zdb_blkptr_cb, &zcb);
+       zcb.zcb_haderrors |= traverse_pool(spa, 0, flags, zdb_blkptr_cb, &zcb);
  
-       if (zcb.zcb_haderrors && !dump_opt['S']) {
+       if (zcb.zcb_haderrors) {
                (void) printf("\nError counts:\n\n");
                (void) printf("\t%5s  %s\n", "errno", "count");
 -              for (int e = 0; e < 256; e++) {
 +              for (e = 0; e < 256; e++) {
                        if (zcb.zcb_errors[e] != 0) {
                                (void) printf("\t%5d  %llu\n",
                                    e, (u_longlong_t)zcb.zcb_errors[e]);
Simple merge
diff --cc cmd/ztest/ztest.c
index 5ce76541863347b7df5a3e86ae2c8e01f6dd3613,eed92ec72ebb5546bf75d1f3d2a21c036067b68b..bdfde21bbbabca9ded853fd682d55f9f499e6ea0
@@@ -763,152 -871,1365 +871,1372 @@@ ztest_spa_prop_set_uint64(ztest_shared_
        return (error);
  }
  
- zil_replay_func_t *ztest_replay_vector[TX_MAX_TYPE] = {
-       NULL,                   /* 0 no such transaction type */
-       ztest_replay_create,    /* TX_CREATE */
-       NULL,                   /* TX_MKDIR */
-       NULL,                   /* TX_MKXATTR */
-       NULL,                   /* TX_SYMLINK */
-       ztest_replay_remove,    /* TX_REMOVE */
-       NULL,                   /* TX_RMDIR */
-       NULL,                   /* TX_LINK */
-       NULL,                   /* TX_RENAME */
-       NULL,                   /* TX_WRITE */
-       NULL,                   /* TX_TRUNCATE */
-       NULL,                   /* TX_SETATTR */
-       NULL,                   /* TX_ACL */
- };
+ static void
+ ztest_rll_init(rll_t *rll)
+ {
+       rll->rll_writer = NULL;
+       rll->rll_readers = 0;
+       VERIFY(_mutex_init(&rll->rll_lock, USYNC_THREAD, NULL) == 0);
+       VERIFY(cond_init(&rll->rll_cv, USYNC_THREAD, NULL) == 0);
+ }
  
- /*
-  * Verify that we can't destroy an active pool, create an existing pool,
-  * or create a pool with a bad vdev spec.
-  */
- void
- ztest_spa_create_destroy(ztest_args_t *za)
+ static void
+ ztest_rll_destroy(rll_t *rll)
  {
-       int error;
-       spa_t *spa;
-       nvlist_t *nvroot;
+       ASSERT(rll->rll_writer == NULL);
+       ASSERT(rll->rll_readers == 0);
+       VERIFY(_mutex_destroy(&rll->rll_lock) == 0);
+       VERIFY(cond_destroy(&rll->rll_cv) == 0);
+ }
  
-       /*
-        * Attempt to create using a bad file.
-        */
-       nvroot = make_vdev_root("/dev/bogus", NULL, 0, 0, 0, 0, 0, 1);
-       error = spa_create("ztest_bad_file", nvroot, NULL, NULL, NULL);
-       nvlist_free(nvroot);
-       if (error != ENOENT)
-               fatal(0, "spa_create(bad_file) = %d", error);
+ static void
+ ztest_rll_lock(rll_t *rll, rl_type_t type)
+ {
+       VERIFY(mutex_lock(&rll->rll_lock) == 0);
  
-       /*
-        * Attempt to create using a bad mirror.
-        */
-       nvroot = make_vdev_root("/dev/bogus", NULL, 0, 0, 0, 0, 2, 1);
-       error = spa_create("ztest_bad_mirror", nvroot, NULL, NULL, NULL);
-       nvlist_free(nvroot);
-       if (error != ENOENT)
-               fatal(0, "spa_create(bad_mirror) = %d", error);
+       if (type == RL_READER) {
+               while (rll->rll_writer != NULL)
+                       (void) cond_wait(&rll->rll_cv, &rll->rll_lock);
+               rll->rll_readers++;
+       } else {
+               while (rll->rll_writer != NULL || rll->rll_readers)
+                       (void) cond_wait(&rll->rll_cv, &rll->rll_lock);
+               rll->rll_writer = curthread;
+       }
  
-       /*
-        * Attempt to create an existing pool.  It shouldn't matter
-        * what's in the nvroot; we should fail with EEXIST.
-        */
-       (void) rw_rdlock(&ztest_shared->zs_name_lock);
-       nvroot = make_vdev_root("/dev/bogus", NULL, 0, 0, 0, 0, 0, 1);
-       error = spa_create(za->za_pool, nvroot, NULL, NULL, NULL);
-       nvlist_free(nvroot);
-       if (error != EEXIST)
-               fatal(0, "spa_create(whatever) = %d", error);
+       VERIFY(mutex_unlock(&rll->rll_lock) == 0);
+ }
  
-       error = spa_open(za->za_pool, &spa, FTAG);
-       if (error)
-               fatal(0, "spa_open() = %d", error);
+ static void
+ ztest_rll_unlock(rll_t *rll)
+ {
+       VERIFY(mutex_lock(&rll->rll_lock) == 0);
  
-       error = spa_destroy(za->za_pool);
-       if (error != EBUSY)
-               fatal(0, "spa_destroy() = %d", error);
+       if (rll->rll_writer) {
+               ASSERT(rll->rll_readers == 0);
+               rll->rll_writer = NULL;
+       } else {
+               ASSERT(rll->rll_readers != 0);
+               ASSERT(rll->rll_writer == NULL);
+               rll->rll_readers--;
+       }
  
-       spa_close(spa, FTAG);
-       (void) rw_unlock(&ztest_shared->zs_name_lock);
+       if (rll->rll_writer == NULL && rll->rll_readers == 0)
+               VERIFY(cond_broadcast(&rll->rll_cv) == 0);
+       VERIFY(mutex_unlock(&rll->rll_lock) == 0);
  }
  
- static vdev_t *
vdev_lookup_by_path(vdev_t *vd, const char *path)
+ static void
ztest_object_lock(ztest_ds_t *zd, uint64_t object, rl_type_t type)
  {
-       vdev_t *mvd;
-       int c;
+       rll_t *rll = &zd->zd_object_lock[object & (ZTEST_OBJECT_LOCKS - 1)];
  
-       if (vd->vdev_path != NULL && strcmp(path, vd->vdev_path) == 0)
-               return (vd);
+       ztest_rll_lock(rll, type);
+ }
  
-       for (c = 0; c < vd->vdev_children; c++)
-               if ((mvd = vdev_lookup_by_path(vd->vdev_child[c], path)) !=
-                   NULL)
-                       return (mvd);
+ static void
+ ztest_object_unlock(ztest_ds_t *zd, uint64_t object)
+ {
+       rll_t *rll = &zd->zd_object_lock[object & (ZTEST_OBJECT_LOCKS - 1)];
  
-       return (NULL);
+       ztest_rll_unlock(rll);
  }
  
- /*
-  * Verify that vdev_add() works as expected.
-  */
- void
- ztest_vdev_add_remove(ztest_args_t *za)
+ static rl_t *
+ ztest_range_lock(ztest_ds_t *zd, uint64_t object, uint64_t offset,
+     uint64_t size, rl_type_t type)
  {
-       spa_t *spa = za->za_spa;
-       uint64_t leaves = MAX(zopt_mirrors, 1) * zopt_raidz;
-       nvlist_t *nvroot;
-       int error;
+       uint64_t hash = object ^ (offset % (ZTEST_RANGE_LOCKS + 1));
+       rll_t *rll = &zd->zd_range_lock[hash & (ZTEST_RANGE_LOCKS - 1)];
+       rl_t *rl;
  
-       (void) mutex_lock(&ztest_shared->zs_vdev_lock);
+       rl = umem_alloc(sizeof (*rl), UMEM_NOFAIL);
+       rl->rl_object = object;
+       rl->rl_offset = offset;
+       rl->rl_size = size;
+       rl->rl_lock = rll;
  
-       spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
+       ztest_rll_lock(rll, type);
  
-       ztest_shared->zs_vdev_primaries =
-           spa->spa_root_vdev->vdev_children * leaves;
+       return (rl);
+ }
  
-       spa_config_exit(spa, SCL_VDEV, FTAG);
+ static void
+ ztest_range_unlock(rl_t *rl)
+ {
+       rll_t *rll = rl->rl_lock;
  
-       /*
-        * Make 1/4 of the devices be log devices.
-        */
-       nvroot = make_vdev_root(NULL, NULL, zopt_vdev_size, 0,
-           ztest_random(4) == 0, zopt_raidz, zopt_mirrors, 1);
+       ztest_rll_unlock(rll);
  
-       error = spa_vdev_add(spa, nvroot);
-       nvlist_free(nvroot);
+       umem_free(rl, sizeof (*rl));
+ }
+ static void
+ ztest_zd_init(ztest_ds_t *zd, objset_t *os)
+ {
+       zd->zd_os = os;
+       zd->zd_zilog = dmu_objset_zil(os);
+       zd->zd_seq = 0;
+       dmu_objset_name(os, zd->zd_name);
++      int l;
  
-       (void) mutex_unlock(&ztest_shared->zs_vdev_lock);
+       VERIFY(_mutex_init(&zd->zd_dirobj_lock, USYNC_THREAD, NULL) == 0);
  
-       if (error == ENOSPC)
-               ztest_record_enospc("spa_vdev_add");
-       else if (error != 0)
-               fatal(0, "spa_vdev_add() = %d", error);
 -      for (int l = 0; l < ZTEST_OBJECT_LOCKS; l++)
++      for (l = 0; l < ZTEST_OBJECT_LOCKS; l++)
+               ztest_rll_init(&zd->zd_object_lock[l]);
 -      for (int l = 0; l < ZTEST_RANGE_LOCKS; l++)
++      for (l = 0; l < ZTEST_RANGE_LOCKS; l++)
+               ztest_rll_init(&zd->zd_range_lock[l]);
  }
  
- /*
-  * Verify that adding/removing aux devices (l2arc, hot spare) works as expected.
-  */
- void
- ztest_vdev_aux_add_remove(ztest_args_t *za)
+ static void
+ ztest_zd_fini(ztest_ds_t *zd)
  {
-       spa_t *spa = za->za_spa;
-       vdev_t *rvd = spa->spa_root_vdev;
-       spa_aux_vdev_t *sav;
-       char *aux;
-       uint64_t guid = 0;
-       int error;
++      int l;
 +
-       if (ztest_random(2) == 0) {
-               sav = &spa->spa_spares;
-               aux = ZPOOL_CONFIG_SPARES;
-       } else {
-               sav = &spa->spa_l2cache;
-               aux = ZPOOL_CONFIG_L2CACHE;
-       }
+       VERIFY(_mutex_destroy(&zd->zd_dirobj_lock) == 0);
  
-       (void) mutex_lock(&ztest_shared->zs_vdev_lock);
 -      for (int l = 0; l < ZTEST_OBJECT_LOCKS; l++)
++      for (l = 0; l < ZTEST_OBJECT_LOCKS; l++)
+               ztest_rll_destroy(&zd->zd_object_lock[l]);
  
-       spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
 -      for (int l = 0; l < ZTEST_RANGE_LOCKS; l++)
++      for (l = 0; l < ZTEST_RANGE_LOCKS; l++)
+               ztest_rll_destroy(&zd->zd_range_lock[l]);
+ }
+ #define       TXG_MIGHTWAIT   (ztest_random(10) == 0 ? TXG_NOWAIT : TXG_WAIT)
+ static uint64_t
+ ztest_tx_assign(dmu_tx_t *tx, uint64_t txg_how, const char *tag)
+ {
+       uint64_t txg;
+       int error;
+       /*
+        * Attempt to assign tx to some transaction group.
+        */
+       error = dmu_tx_assign(tx, txg_how);
+       if (error) {
+               if (error == ERESTART) {
+                       ASSERT(txg_how == TXG_NOWAIT);
+                       dmu_tx_wait(tx);
+               } else {
+                       ASSERT3U(error, ==, ENOSPC);
+                       ztest_record_enospc(tag);
+               }
+               dmu_tx_abort(tx);
+               return (0);
+       }
+       txg = dmu_tx_get_txg(tx);
+       ASSERT(txg != 0);
+       return (txg);
+ }
+ static void
+ ztest_pattern_set(void *buf, uint64_t size, uint64_t value)
+ {
+       uint64_t *ip = buf;
+       uint64_t *ip_end = (uint64_t *)((uintptr_t)buf + (uintptr_t)size);
+       while (ip < ip_end)
+               *ip++ = value;
+ }
+ static boolean_t
+ ztest_pattern_match(void *buf, uint64_t size, uint64_t value)
+ {
+       uint64_t *ip = buf;
+       uint64_t *ip_end = (uint64_t *)((uintptr_t)buf + (uintptr_t)size);
+       uint64_t diff = 0;
+       while (ip < ip_end)
+               diff |= (value - *ip++);
+       return (diff == 0);
+ }
+ static void
+ ztest_bt_generate(ztest_block_tag_t *bt, objset_t *os, uint64_t object,
+     uint64_t offset, uint64_t gen, uint64_t txg, uint64_t crtxg)
+ {
+       bt->bt_magic = BT_MAGIC;
+       bt->bt_objset = dmu_objset_id(os);
+       bt->bt_object = object;
+       bt->bt_offset = offset;
+       bt->bt_gen = gen;
+       bt->bt_txg = txg;
+       bt->bt_crtxg = crtxg;
+ }
+ static void
+ ztest_bt_verify(ztest_block_tag_t *bt, objset_t *os, uint64_t object,
+     uint64_t offset, uint64_t gen, uint64_t txg, uint64_t crtxg)
+ {
+       ASSERT(bt->bt_magic == BT_MAGIC);
+       ASSERT(bt->bt_objset == dmu_objset_id(os));
+       ASSERT(bt->bt_object == object);
+       ASSERT(bt->bt_offset == offset);
+       ASSERT(bt->bt_gen <= gen);
+       ASSERT(bt->bt_txg <= txg);
+       ASSERT(bt->bt_crtxg == crtxg);
+ }
+ static ztest_block_tag_t *
+ ztest_bt_bonus(dmu_buf_t *db)
+ {
+       dmu_object_info_t doi;
+       ztest_block_tag_t *bt;
+       dmu_object_info_from_db(db, &doi);
+       ASSERT3U(doi.doi_bonus_size, <=, db->db_size);
+       ASSERT3U(doi.doi_bonus_size, >=, sizeof (*bt));
+       bt = (void *)((char *)db->db_data + doi.doi_bonus_size - sizeof (*bt));
+       return (bt);
+ }
+ /*
+  * ZIL logging ops
+  */
+ #define       lrz_type        lr_mode
+ #define       lrz_blocksize   lr_uid
+ #define       lrz_ibshift     lr_gid
+ #define       lrz_bonustype   lr_rdev
+ #define       lrz_bonuslen    lr_crtime[1]
+ static uint64_t
+ ztest_log_create(ztest_ds_t *zd, dmu_tx_t *tx, lr_create_t *lr)
+ {
+       char *name = (void *)(lr + 1);          /* name follows lr */
+       size_t namesize = strlen(name) + 1;
+       itx_t *itx;
+       if (zil_replaying(zd->zd_zilog, tx))
+               return (0);
+       itx = zil_itx_create(TX_CREATE, sizeof (*lr) + namesize);
+       bcopy(&lr->lr_common + 1, &itx->itx_lr + 1,
+           sizeof (*lr) + namesize - sizeof (lr_t));
+       return (zil_itx_assign(zd->zd_zilog, itx, tx));
+ }
+ static uint64_t
+ ztest_log_remove(ztest_ds_t *zd, dmu_tx_t *tx, lr_remove_t *lr)
+ {
+       char *name = (void *)(lr + 1);          /* name follows lr */
+       size_t namesize = strlen(name) + 1;
+       itx_t *itx;
+       if (zil_replaying(zd->zd_zilog, tx))
+               return (0);
+       itx = zil_itx_create(TX_REMOVE, sizeof (*lr) + namesize);
+       bcopy(&lr->lr_common + 1, &itx->itx_lr + 1,
+           sizeof (*lr) + namesize - sizeof (lr_t));
+       return (zil_itx_assign(zd->zd_zilog, itx, tx));
+ }
+ static uint64_t
+ ztest_log_write(ztest_ds_t *zd, dmu_tx_t *tx, lr_write_t *lr)
+ {
+       itx_t *itx;
+       itx_wr_state_t write_state = ztest_random(WR_NUM_STATES);
+       if (zil_replaying(zd->zd_zilog, tx))
+               return (0);
+       if (lr->lr_length > ZIL_MAX_LOG_DATA)
+               write_state = WR_INDIRECT;
+       itx = zil_itx_create(TX_WRITE,
+           sizeof (*lr) + (write_state == WR_COPIED ? lr->lr_length : 0));
+       if (write_state == WR_COPIED &&
+           dmu_read(zd->zd_os, lr->lr_foid, lr->lr_offset, lr->lr_length,
+           ((lr_write_t *)&itx->itx_lr) + 1, DMU_READ_NO_PREFETCH) != 0) {
+               zil_itx_destroy(itx);
+               itx = zil_itx_create(TX_WRITE, sizeof (*lr));
+               write_state = WR_NEED_COPY;
+       }
+       itx->itx_private = zd;
+       itx->itx_wr_state = write_state;
+       itx->itx_sync = (ztest_random(8) == 0);
+       itx->itx_sod += (write_state == WR_NEED_COPY ? lr->lr_length : 0);
+       bcopy(&lr->lr_common + 1, &itx->itx_lr + 1,
+           sizeof (*lr) - sizeof (lr_t));
+       return (zil_itx_assign(zd->zd_zilog, itx, tx));
+ }
+ static uint64_t
+ ztest_log_truncate(ztest_ds_t *zd, dmu_tx_t *tx, lr_truncate_t *lr)
+ {
+       itx_t *itx;
+       if (zil_replaying(zd->zd_zilog, tx))
+               return (0);
+       itx = zil_itx_create(TX_TRUNCATE, sizeof (*lr));
+       bcopy(&lr->lr_common + 1, &itx->itx_lr + 1,
+           sizeof (*lr) - sizeof (lr_t));
+       return (zil_itx_assign(zd->zd_zilog, itx, tx));
+ }
+ static uint64_t
+ ztest_log_setattr(ztest_ds_t *zd, dmu_tx_t *tx, lr_setattr_t *lr)
+ {
+       itx_t *itx;
+       if (zil_replaying(zd->zd_zilog, tx))
+               return (0);
+       itx = zil_itx_create(TX_SETATTR, sizeof (*lr));
+       bcopy(&lr->lr_common + 1, &itx->itx_lr + 1,
+           sizeof (*lr) - sizeof (lr_t));
+       return (zil_itx_assign(zd->zd_zilog, itx, tx));
+ }
+ /*
+  * ZIL replay ops
+  */
+ static int
+ ztest_replay_create(ztest_ds_t *zd, lr_create_t *lr, boolean_t byteswap)
+ {
+       char *name = (void *)(lr + 1);          /* name follows lr */
+       objset_t *os = zd->zd_os;
+       ztest_block_tag_t *bbt;
+       dmu_buf_t *db;
+       dmu_tx_t *tx;
+       uint64_t txg;
+       int error = 0;
+       if (byteswap)
+               byteswap_uint64_array(lr, sizeof (*lr));
+       ASSERT(lr->lr_doid == ZTEST_DIROBJ);
+       ASSERT(name[0] != '\0');
+       tx = dmu_tx_create(os);
+       dmu_tx_hold_zap(tx, lr->lr_doid, B_TRUE, name);
+       if (lr->lrz_type == DMU_OT_ZAP_OTHER) {
+               dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL);
+       } else {
+               dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
+       }
+       txg = ztest_tx_assign(tx, TXG_WAIT, FTAG);
+       if (txg == 0)
+               return (ENOSPC);
+       ASSERT(dmu_objset_zil(os)->zl_replay == !!lr->lr_foid);
+       if (lr->lrz_type == DMU_OT_ZAP_OTHER) {
+               if (lr->lr_foid == 0) {
+                       lr->lr_foid = zap_create(os,
+                           lr->lrz_type, lr->lrz_bonustype,
+                           lr->lrz_bonuslen, tx);
+               } else {
+                       error = zap_create_claim(os, lr->lr_foid,
+                           lr->lrz_type, lr->lrz_bonustype,
+                           lr->lrz_bonuslen, tx);
+               }
+       } else {
+               if (lr->lr_foid == 0) {
+                       lr->lr_foid = dmu_object_alloc(os,
+                           lr->lrz_type, 0, lr->lrz_bonustype,
+                           lr->lrz_bonuslen, tx);
+               } else {
+                       error = dmu_object_claim(os, lr->lr_foid,
+                           lr->lrz_type, 0, lr->lrz_bonustype,
+                           lr->lrz_bonuslen, tx);
+               }
+       }
+       if (error) {
+               ASSERT3U(error, ==, EEXIST);
+               ASSERT(zd->zd_zilog->zl_replay);
+               dmu_tx_commit(tx);
+               return (error);
+       }
+       ASSERT(lr->lr_foid != 0);
+       if (lr->lrz_type != DMU_OT_ZAP_OTHER)
+               VERIFY3U(0, ==, dmu_object_set_blocksize(os, lr->lr_foid,
+                   lr->lrz_blocksize, lr->lrz_ibshift, tx));
+       VERIFY3U(0, ==, dmu_bonus_hold(os, lr->lr_foid, FTAG, &db));
+       bbt = ztest_bt_bonus(db);
+       dmu_buf_will_dirty(db, tx);
+       ztest_bt_generate(bbt, os, lr->lr_foid, -1ULL, lr->lr_gen, txg, txg);
+       dmu_buf_rele(db, FTAG);
+       VERIFY3U(0, ==, zap_add(os, lr->lr_doid, name, sizeof (uint64_t), 1,
+           &lr->lr_foid, tx));
+       (void) ztest_log_create(zd, tx, lr);
+       dmu_tx_commit(tx);
+       return (0);
+ }
+ static int
+ ztest_replay_remove(ztest_ds_t *zd, lr_remove_t *lr, boolean_t byteswap)
+ {
+       char *name = (void *)(lr + 1);          /* name follows lr */
+       objset_t *os = zd->zd_os;
+       dmu_object_info_t doi;
+       dmu_tx_t *tx;
+       uint64_t object, txg;
+       if (byteswap)
+               byteswap_uint64_array(lr, sizeof (*lr));
+       ASSERT(lr->lr_doid == ZTEST_DIROBJ);
+       ASSERT(name[0] != '\0');
+       VERIFY3U(0, ==,
+           zap_lookup(os, lr->lr_doid, name, sizeof (object), 1, &object));
+       ASSERT(object != 0);
+       ztest_object_lock(zd, object, RL_WRITER);
+       VERIFY3U(0, ==, dmu_object_info(os, object, &doi));
+       tx = dmu_tx_create(os);
+       dmu_tx_hold_zap(tx, lr->lr_doid, B_FALSE, name);
+       dmu_tx_hold_free(tx, object, 0, DMU_OBJECT_END);
+       txg = ztest_tx_assign(tx, TXG_WAIT, FTAG);
+       if (txg == 0) {
+               ztest_object_unlock(zd, object);
+               return (ENOSPC);
+       }
+       if (doi.doi_type == DMU_OT_ZAP_OTHER) {
+               VERIFY3U(0, ==, zap_destroy(os, object, tx));
+       } else {
+               VERIFY3U(0, ==, dmu_object_free(os, object, tx));
+       }
+       VERIFY3U(0, ==, zap_remove(os, lr->lr_doid, name, tx));
+       (void) ztest_log_remove(zd, tx, lr);
+       dmu_tx_commit(tx);
+       ztest_object_unlock(zd, object);
+       return (0);
+ }
+ static int
+ ztest_replay_write(ztest_ds_t *zd, lr_write_t *lr, boolean_t byteswap)
+ {
+       objset_t *os = zd->zd_os;
+       void *data = lr + 1;                    /* data follows lr */
+       uint64_t offset, length;
+       ztest_block_tag_t *bt = data;
+       ztest_block_tag_t *bbt;
+       uint64_t gen, txg, lrtxg, crtxg;
+       dmu_object_info_t doi;
+       dmu_tx_t *tx;
+       dmu_buf_t *db;
+       arc_buf_t *abuf = NULL;
+       rl_t *rl;
+       if (byteswap)
+               byteswap_uint64_array(lr, sizeof (*lr));
+       offset = lr->lr_offset;
+       length = lr->lr_length;
+       /* If it's a dmu_sync() block, write the whole block */
+       if (lr->lr_common.lrc_reclen == sizeof (lr_write_t)) {
+               uint64_t blocksize = BP_GET_LSIZE(&lr->lr_blkptr);
+               if (length < blocksize) {
+                       offset -= offset % blocksize;
+                       length = blocksize;
+               }
+       }
+       if (bt->bt_magic == BSWAP_64(BT_MAGIC))
+               byteswap_uint64_array(bt, sizeof (*bt));
+       if (bt->bt_magic != BT_MAGIC)
+               bt = NULL;
+       ztest_object_lock(zd, lr->lr_foid, RL_READER);
+       rl = ztest_range_lock(zd, lr->lr_foid, offset, length, RL_WRITER);
+       VERIFY3U(0, ==, dmu_bonus_hold(os, lr->lr_foid, FTAG, &db));
+       dmu_object_info_from_db(db, &doi);
+       bbt = ztest_bt_bonus(db);
+       ASSERT3U(bbt->bt_magic, ==, BT_MAGIC);
+       gen = bbt->bt_gen;
+       crtxg = bbt->bt_crtxg;
+       lrtxg = lr->lr_common.lrc_txg;
+       tx = dmu_tx_create(os);
+       dmu_tx_hold_write(tx, lr->lr_foid, offset, length);
+       if (ztest_random(8) == 0 && length == doi.doi_data_block_size &&
+           P2PHASE(offset, length) == 0)
+               abuf = dmu_request_arcbuf(db, length);
+       txg = ztest_tx_assign(tx, TXG_WAIT, FTAG);
+       if (txg == 0) {
+               if (abuf != NULL)
+                       dmu_return_arcbuf(abuf);
+               dmu_buf_rele(db, FTAG);
+               ztest_range_unlock(rl);
+               ztest_object_unlock(zd, lr->lr_foid);
+               return (ENOSPC);
+       }
+       if (bt != NULL) {
+               /*
+                * Usually, verify the old data before writing new data --
+                * but not always, because we also want to verify correct
+                * behavior when the data was not recently read into cache.
+                */
+               ASSERT(offset % doi.doi_data_block_size == 0);
+               if (ztest_random(4) != 0) {
+                       int prefetch = ztest_random(2) ?
+                           DMU_READ_PREFETCH : DMU_READ_NO_PREFETCH;
+                       ztest_block_tag_t rbt;
+                       VERIFY(dmu_read(os, lr->lr_foid, offset,
+                           sizeof (rbt), &rbt, prefetch) == 0);
+                       if (rbt.bt_magic == BT_MAGIC) {
+                               ztest_bt_verify(&rbt, os, lr->lr_foid,
+                                   offset, gen, txg, crtxg);
+                       }
+               }
+               /*
+                * Writes can appear to be newer than the bonus buffer because
+                * the ztest_get_data() callback does a dmu_read() of the
+                * open-context data, which may be different than the data
+                * as it was when the write was generated.
+                */
+               if (zd->zd_zilog->zl_replay) {
+                       ztest_bt_verify(bt, os, lr->lr_foid, offset,
+                           MAX(gen, bt->bt_gen), MAX(txg, lrtxg),
+                           bt->bt_crtxg);
+               }
+               /*
+                * Set the bt's gen/txg to the bonus buffer's gen/txg
+                * so that all of the usual ASSERTs will work.
+                */
+               ztest_bt_generate(bt, os, lr->lr_foid, offset, gen, txg, crtxg);
+       }
+       if (abuf == NULL) {
+               dmu_write(os, lr->lr_foid, offset, length, data, tx);
+       } else {
+               bcopy(data, abuf->b_data, length);
+               dmu_assign_arcbuf(db, offset, abuf, tx);
+       }
+       (void) ztest_log_write(zd, tx, lr);
+       dmu_buf_rele(db, FTAG);
+       dmu_tx_commit(tx);
+       ztest_range_unlock(rl);
+       ztest_object_unlock(zd, lr->lr_foid);
+       return (0);
+ }
+ static int
+ ztest_replay_truncate(ztest_ds_t *zd, lr_truncate_t *lr, boolean_t byteswap)
+ {
+       objset_t *os = zd->zd_os;
+       dmu_tx_t *tx;
+       uint64_t txg;
+       rl_t *rl;
+       if (byteswap)
+               byteswap_uint64_array(lr, sizeof (*lr));
+       ztest_object_lock(zd, lr->lr_foid, RL_READER);
+       rl = ztest_range_lock(zd, lr->lr_foid, lr->lr_offset, lr->lr_length,
+           RL_WRITER);
+       tx = dmu_tx_create(os);
+       dmu_tx_hold_free(tx, lr->lr_foid, lr->lr_offset, lr->lr_length);
+       txg = ztest_tx_assign(tx, TXG_WAIT, FTAG);
+       if (txg == 0) {
+               ztest_range_unlock(rl);
+               ztest_object_unlock(zd, lr->lr_foid);
+               return (ENOSPC);
+       }
+       VERIFY(dmu_free_range(os, lr->lr_foid, lr->lr_offset,
+           lr->lr_length, tx) == 0);
+       (void) ztest_log_truncate(zd, tx, lr);
+       dmu_tx_commit(tx);
+       ztest_range_unlock(rl);
+       ztest_object_unlock(zd, lr->lr_foid);
+       return (0);
+ }
+ static int
+ ztest_replay_setattr(ztest_ds_t *zd, lr_setattr_t *lr, boolean_t byteswap)
+ {
+       objset_t *os = zd->zd_os;
+       dmu_tx_t *tx;
+       dmu_buf_t *db;
+       ztest_block_tag_t *bbt;
+       uint64_t txg, lrtxg, crtxg;
+       if (byteswap)
+               byteswap_uint64_array(lr, sizeof (*lr));
+       ztest_object_lock(zd, lr->lr_foid, RL_WRITER);
+       VERIFY3U(0, ==, dmu_bonus_hold(os, lr->lr_foid, FTAG, &db));
+       tx = dmu_tx_create(os);
+       dmu_tx_hold_bonus(tx, lr->lr_foid);
+       txg = ztest_tx_assign(tx, TXG_WAIT, FTAG);
+       if (txg == 0) {
+               dmu_buf_rele(db, FTAG);
+               ztest_object_unlock(zd, lr->lr_foid);
+               return (ENOSPC);
+       }
+       bbt = ztest_bt_bonus(db);
+       ASSERT3U(bbt->bt_magic, ==, BT_MAGIC);
+       crtxg = bbt->bt_crtxg;
+       lrtxg = lr->lr_common.lrc_txg;
+       if (zd->zd_zilog->zl_replay) {
+               ASSERT(lr->lr_size != 0);
+               ASSERT(lr->lr_mode != 0);
+               ASSERT(lrtxg != 0);
+       } else {
+               /*
+                * Randomly change the size and increment the generation.
+                */
+               lr->lr_size = (ztest_random(db->db_size / sizeof (*bbt)) + 1) *
+                   sizeof (*bbt);
+               lr->lr_mode = bbt->bt_gen + 1;
+               ASSERT(lrtxg == 0);
+       }
+       /*
+        * Verify that the current bonus buffer is not newer than our txg.
+        */
+       ztest_bt_verify(bbt, os, lr->lr_foid, -1ULL, lr->lr_mode,
+           MAX(txg, lrtxg), crtxg);
+       dmu_buf_will_dirty(db, tx);
+       ASSERT3U(lr->lr_size, >=, sizeof (*bbt));
+       ASSERT3U(lr->lr_size, <=, db->db_size);
+       VERIFY3U(dmu_set_bonus(db, lr->lr_size, tx), ==, 0);
+       bbt = ztest_bt_bonus(db);
+       ztest_bt_generate(bbt, os, lr->lr_foid, -1ULL, lr->lr_mode, txg, crtxg);
+       dmu_buf_rele(db, FTAG);
+       (void) ztest_log_setattr(zd, tx, lr);
+       dmu_tx_commit(tx);
+       ztest_object_unlock(zd, lr->lr_foid);
+       return (0);
+ }
+ zil_replay_func_t *ztest_replay_vector[TX_MAX_TYPE] = {
+       NULL,                   /* 0 no such transaction type */
+       ztest_replay_create,    /* TX_CREATE */
+       NULL,                   /* TX_MKDIR */
+       NULL,                   /* TX_MKXATTR */
+       NULL,                   /* TX_SYMLINK */
+       ztest_replay_remove,    /* TX_REMOVE */
+       NULL,                   /* TX_RMDIR */
+       NULL,                   /* TX_LINK */
+       NULL,                   /* TX_RENAME */
+       ztest_replay_write,     /* TX_WRITE */
+       ztest_replay_truncate,  /* TX_TRUNCATE */
+       ztest_replay_setattr,   /* TX_SETATTR */
+       NULL,                   /* TX_ACL */
+       NULL,                   /* TX_CREATE_ACL */
+       NULL,                   /* TX_CREATE_ATTR */
+       NULL,                   /* TX_CREATE_ACL_ATTR */
+       NULL,                   /* TX_MKDIR_ACL */
+       NULL,                   /* TX_MKDIR_ATTR */
+       NULL,                   /* TX_MKDIR_ACL_ATTR */
+       NULL,                   /* TX_WRITE2 */
+ };
+ /*
+  * ZIL get_data callbacks
+  */
+ static void
+ ztest_get_done(zgd_t *zgd, int error)
+ {
+       ztest_ds_t *zd = zgd->zgd_private;
+       uint64_t object = zgd->zgd_rl->rl_object;
+       if (zgd->zgd_db)
+               dmu_buf_rele(zgd->zgd_db, zgd);
+       ztest_range_unlock(zgd->zgd_rl);
+       ztest_object_unlock(zd, object);
+       if (error == 0 && zgd->zgd_bp)
+               zil_add_block(zgd->zgd_zilog, zgd->zgd_bp);
+       umem_free(zgd, sizeof (*zgd));
+ }
+ static int
+ ztest_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio)
+ {
+       ztest_ds_t *zd = arg;
+       objset_t *os = zd->zd_os;
+       uint64_t object = lr->lr_foid;
+       uint64_t offset = lr->lr_offset;
+       uint64_t size = lr->lr_length;
+       blkptr_t *bp = &lr->lr_blkptr;
+       uint64_t txg = lr->lr_common.lrc_txg;
+       uint64_t crtxg;
+       dmu_object_info_t doi;
+       dmu_buf_t *db;
+       zgd_t *zgd;
+       int error;
+       ztest_object_lock(zd, object, RL_READER);
+       error = dmu_bonus_hold(os, object, FTAG, &db);
+       if (error) {
+               ztest_object_unlock(zd, object);
+               return (error);
+       }
+       crtxg = ztest_bt_bonus(db)->bt_crtxg;
+       if (crtxg == 0 || crtxg > txg) {
+               dmu_buf_rele(db, FTAG);
+               ztest_object_unlock(zd, object);
+               return (ENOENT);
+       }
+       dmu_object_info_from_db(db, &doi);
+       dmu_buf_rele(db, FTAG);
+       db = NULL;
+       zgd = umem_zalloc(sizeof (*zgd), UMEM_NOFAIL);
+       zgd->zgd_zilog = zd->zd_zilog;
+       zgd->zgd_private = zd;
+       if (buf != NULL) {      /* immediate write */
+               zgd->zgd_rl = ztest_range_lock(zd, object, offset, size,
+                   RL_READER);
+               error = dmu_read(os, object, offset, size, buf,
+                   DMU_READ_NO_PREFETCH);
+               ASSERT(error == 0);
+       } else {
+               size = doi.doi_data_block_size;
+               if (ISP2(size)) {
+                       offset = P2ALIGN(offset, size);
+               } else {
+                       ASSERT(offset < size);
+                       offset = 0;
+               }
+               zgd->zgd_rl = ztest_range_lock(zd, object, offset, size,
+                   RL_READER);
+               error = dmu_buf_hold(os, object, offset, zgd, &db,
+                   DMU_READ_NO_PREFETCH);
+               if (error == 0) {
+                       zgd->zgd_db = db;
+                       zgd->zgd_bp = bp;
+                       ASSERT(db->db_offset == offset);
+                       ASSERT(db->db_size == size);
+                       error = dmu_sync(zio, lr->lr_common.lrc_txg,
+                           ztest_get_done, zgd);
+                       if (error == 0)
+                               return (0);
+               }
+       }
+       ztest_get_done(zgd, error);
+       return (error);
+ }
+ static void *
+ ztest_lr_alloc(size_t lrsize, char *name)
+ {
+       char *lr;
+       size_t namesize = name ? strlen(name) + 1 : 0;
+       lr = umem_zalloc(lrsize + namesize, UMEM_NOFAIL);
+       if (name)
+               bcopy(name, lr + lrsize, namesize);
+       return (lr);
+ }
+ void
+ ztest_lr_free(void *lr, size_t lrsize, char *name)
+ {
+       size_t namesize = name ? strlen(name) + 1 : 0;
+       umem_free(lr, lrsize + namesize);
+ }
+ /*
+  * Lookup a bunch of objects.  Returns the number of objects not found.
+  */
+ static int
+ ztest_lookup(ztest_ds_t *zd, ztest_od_t *od, int count)
+ {
+       int missing = 0;
+       int error;
++      int i;
+       ASSERT(_mutex_held(&zd->zd_dirobj_lock));
 -      for (int i = 0; i < count; i++, od++) {
++      for (i = 0; i < count; i++, od++) {
+               od->od_object = 0;
+               error = zap_lookup(zd->zd_os, od->od_dir, od->od_name,
+                   sizeof (uint64_t), 1, &od->od_object);
+               if (error) {
+                       ASSERT(error == ENOENT);
+                       ASSERT(od->od_object == 0);
+                       missing++;
+               } else {
+                       dmu_buf_t *db;
+                       ztest_block_tag_t *bbt;
+                       dmu_object_info_t doi;
+                       ASSERT(od->od_object != 0);
+                       ASSERT(missing == 0);   /* there should be no gaps */
+                       ztest_object_lock(zd, od->od_object, RL_READER);
+                       VERIFY3U(0, ==, dmu_bonus_hold(zd->zd_os,
+                           od->od_object, FTAG, &db));
+                       dmu_object_info_from_db(db, &doi);
+                       bbt = ztest_bt_bonus(db);
+                       ASSERT3U(bbt->bt_magic, ==, BT_MAGIC);
+                       od->od_type = doi.doi_type;
+                       od->od_blocksize = doi.doi_data_block_size;
+                       od->od_gen = bbt->bt_gen;
+                       dmu_buf_rele(db, FTAG);
+                       ztest_object_unlock(zd, od->od_object);
+               }
+       }
+       return (missing);
+ }
+ static int
+ ztest_create(ztest_ds_t *zd, ztest_od_t *od, int count)
+ {
+       int missing = 0;
++      int i;
+       ASSERT(_mutex_held(&zd->zd_dirobj_lock));
 -      for (int i = 0; i < count; i++, od++) {
++      for (i = 0; i < count; i++, od++) {
+               if (missing) {
+                       od->od_object = 0;
+                       missing++;
+                       continue;
+               }
+               lr_create_t *lr = ztest_lr_alloc(sizeof (*lr), od->od_name);
+               lr->lr_doid = od->od_dir;
+               lr->lr_foid = 0;        /* 0 to allocate, > 0 to claim */
+               lr->lrz_type = od->od_crtype;
+               lr->lrz_blocksize = od->od_crblocksize;
+               lr->lrz_ibshift = ztest_random_ibshift();
+               lr->lrz_bonustype = DMU_OT_UINT64_OTHER;
+               lr->lrz_bonuslen = dmu_bonus_max();
+               lr->lr_gen = od->od_crgen;
+               lr->lr_crtime[0] = time(NULL);
+               if (ztest_replay_create(zd, lr, B_FALSE) != 0) {
+                       ASSERT(missing == 0);
+                       od->od_object = 0;
+                       missing++;
+               } else {
+                       od->od_object = lr->lr_foid;
+                       od->od_type = od->od_crtype;
+                       od->od_blocksize = od->od_crblocksize;
+                       od->od_gen = od->od_crgen;
+                       ASSERT(od->od_object != 0);
+               }
+               ztest_lr_free(lr, sizeof (*lr), od->od_name);
+       }
+       return (missing);
+ }
+ static int
+ ztest_remove(ztest_ds_t *zd, ztest_od_t *od, int count)
+ {
+       int missing = 0;
+       int error;
++      int i;
+       ASSERT(_mutex_held(&zd->zd_dirobj_lock));
+       od += count - 1;
 -      for (int i = count - 1; i >= 0; i--, od--) {
++      for (i = count - 1; i >= 0; i--, od--) {
+               if (missing) {
+                       missing++;
+                       continue;
+               }
+               if (od->od_object == 0)
+                       continue;
+               lr_remove_t *lr = ztest_lr_alloc(sizeof (*lr), od->od_name);
+               lr->lr_doid = od->od_dir;
+               if ((error = ztest_replay_remove(zd, lr, B_FALSE)) != 0) {
+                       ASSERT3U(error, ==, ENOSPC);
+                       missing++;
+               } else {
+                       od->od_object = 0;
+               }
+               ztest_lr_free(lr, sizeof (*lr), od->od_name);
+       }
+       return (missing);
+ }
+ static int
+ ztest_write(ztest_ds_t *zd, uint64_t object, uint64_t offset, uint64_t size,
+     void *data)
+ {
+       lr_write_t *lr;
+       int error;
+       lr = ztest_lr_alloc(sizeof (*lr) + size, NULL);
+       lr->lr_foid = object;
+       lr->lr_offset = offset;
+       lr->lr_length = size;
+       lr->lr_blkoff = 0;
+       BP_ZERO(&lr->lr_blkptr);
+       bcopy(data, lr + 1, size);
+       error = ztest_replay_write(zd, lr, B_FALSE);
+       ztest_lr_free(lr, sizeof (*lr) + size, NULL);
+       return (error);
+ }
+ static int
+ ztest_truncate(ztest_ds_t *zd, uint64_t object, uint64_t offset, uint64_t size)
+ {
+       lr_truncate_t *lr;
+       int error;
+       lr = ztest_lr_alloc(sizeof (*lr), NULL);
+       lr->lr_foid = object;
+       lr->lr_offset = offset;
+       lr->lr_length = size;
+       error = ztest_replay_truncate(zd, lr, B_FALSE);
+       ztest_lr_free(lr, sizeof (*lr), NULL);
+       return (error);
+ }
+ static int
+ ztest_setattr(ztest_ds_t *zd, uint64_t object)
+ {
+       lr_setattr_t *lr;
+       int error;
+       lr = ztest_lr_alloc(sizeof (*lr), NULL);
+       lr->lr_foid = object;
+       lr->lr_size = 0;
+       lr->lr_mode = 0;
+       error = ztest_replay_setattr(zd, lr, B_FALSE);
+       ztest_lr_free(lr, sizeof (*lr), NULL);
+       return (error);
+ }
+ static void
+ ztest_prealloc(ztest_ds_t *zd, uint64_t object, uint64_t offset, uint64_t size)
+ {
+       objset_t *os = zd->zd_os;
+       dmu_tx_t *tx;
+       uint64_t txg;
+       rl_t *rl;
+       txg_wait_synced(dmu_objset_pool(os), 0);
+       ztest_object_lock(zd, object, RL_READER);
+       rl = ztest_range_lock(zd, object, offset, size, RL_WRITER);
+       tx = dmu_tx_create(os);
+       dmu_tx_hold_write(tx, object, offset, size);
+       txg = ztest_tx_assign(tx, TXG_WAIT, FTAG);
+       if (txg != 0) {
+               dmu_prealloc(os, object, offset, size, tx);
+               dmu_tx_commit(tx);
+               txg_wait_synced(dmu_objset_pool(os), txg);
+       } else {
+               (void) dmu_free_long_range(os, object, offset, size);
+       }
+       ztest_range_unlock(rl);
+       ztest_object_unlock(zd, object);
+ }
+ static void
+ ztest_io(ztest_ds_t *zd, uint64_t object, uint64_t offset)
+ {
+       ztest_block_tag_t wbt;
+       dmu_object_info_t doi;
+       enum ztest_io_type io_type;
+       uint64_t blocksize;
+       void *data;
+       VERIFY(dmu_object_info(zd->zd_os, object, &doi) == 0);
+       blocksize = doi.doi_data_block_size;
+       data = umem_alloc(blocksize, UMEM_NOFAIL);
+       /*
+        * Pick an i/o type at random, biased toward writing block tags.
+        */
+       io_type = ztest_random(ZTEST_IO_TYPES);
+       if (ztest_random(2) == 0)
+               io_type = ZTEST_IO_WRITE_TAG;
+       switch (io_type) {
+       case ZTEST_IO_WRITE_TAG:
+               ztest_bt_generate(&wbt, zd->zd_os, object, offset, 0, 0, 0);
+               (void) ztest_write(zd, object, offset, sizeof (wbt), &wbt);
+               break;
+       case ZTEST_IO_WRITE_PATTERN:
+               (void) memset(data, 'a' + (object + offset) % 5, blocksize);
+               if (ztest_random(2) == 0) {
+                       /*
+                        * Induce fletcher2 collisions to ensure that
+                        * zio_ddt_collision() detects and resolves them
+                        * when using fletcher2-verify for deduplication.
+                        */
+                       ((uint64_t *)data)[0] ^= 1ULL << 63;
+                       ((uint64_t *)data)[4] ^= 1ULL << 63;
+               }
+               (void) ztest_write(zd, object, offset, blocksize, data);
+               break;
+       case ZTEST_IO_WRITE_ZEROES:
+               bzero(data, blocksize);
+               (void) ztest_write(zd, object, offset, blocksize, data);
+               break;
+       case ZTEST_IO_TRUNCATE:
+               (void) ztest_truncate(zd, object, offset, blocksize);
+               break;
+       case ZTEST_IO_SETATTR:
+               (void) ztest_setattr(zd, object);
+               break;
+       }
+       umem_free(data, blocksize);
+ }
+ /*
+  * Initialize an object description template.
+  */
+ static void
+ ztest_od_init(ztest_od_t *od, uint64_t id, char *tag, uint64_t index,
+     dmu_object_type_t type, uint64_t blocksize, uint64_t gen)
+ {
+       od->od_dir = ZTEST_DIROBJ;
+       od->od_object = 0;
+       od->od_crtype = type;
+       od->od_crblocksize = blocksize ? blocksize : ztest_random_blocksize();
+       od->od_crgen = gen;
+       od->od_type = DMU_OT_NONE;
+       od->od_blocksize = 0;
+       od->od_gen = 0;
+       (void) snprintf(od->od_name, sizeof (od->od_name), "%s(%lld)[%llu]",
+           tag, (int64_t)id, index);
+ }
+ /*
+  * Lookup or create the objects for a test using the od template.
+  * If the objects do not all exist, or if 'remove' is specified,
+  * remove any existing objects and create new ones.  Otherwise,
+  * use the existing objects.
+  */
+ static int
+ ztest_object_init(ztest_ds_t *zd, ztest_od_t *od, size_t size, boolean_t remove)
+ {
+       int count = size / sizeof (*od);
+       int rv = 0;
+       VERIFY(mutex_lock(&zd->zd_dirobj_lock) == 0);
+       if ((ztest_lookup(zd, od, count) != 0 || remove) &&
+           (ztest_remove(zd, od, count) != 0 ||
+           ztest_create(zd, od, count) != 0))
+               rv = -1;
+       zd->zd_od = od;
+       VERIFY(mutex_unlock(&zd->zd_dirobj_lock) == 0);
+       return (rv);
+ }
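+
For callers, the flow implied by ztest_od_init() and ztest_object_init() above is: fill in a small array of ztest_od_t templates, hand the whole array (with its byte size) to ztest_object_init(), and treat a nonzero return as "skip this test".  A hedged usage sketch, assembled only from the signatures and call sites visible in this diff (the array length and object type are illustrative, not prescribed):

        ztest_od_t od[4];
        int b;

        /* Describe the objects this test case wants to operate on. */
        for (b = 0; b < 4; b++)
                ztest_od_init(&od[b], id, FTAG, b, DMU_OT_UINT64_OTHER, 0, 0);

        /* Look them up; remove and recreate the batch if any are missing. */
        if (ztest_object_init(zd, od, sizeof (od), B_FALSE) != 0)
                return;
+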
+ /* ARGSUSED */
+ void
+ ztest_zil_commit(ztest_ds_t *zd, uint64_t id)
+ {
+       zilog_t *zilog = zd->zd_zilog;
+       zil_commit(zilog, UINT64_MAX, ztest_random(ZTEST_OBJECTS));
+       /*
+        * Remember the committed values in zd, which is in parent/child
+        * shared memory.  If we die, the next iteration of ztest_run()
+        * will verify that the log really does contain this record.
+        */
+       mutex_enter(&zilog->zl_lock);
+       ASSERT(zd->zd_seq <= zilog->zl_commit_lr_seq);
+       zd->zd_seq = zilog->zl_commit_lr_seq;
+       mutex_exit(&zilog->zl_lock);
+ }
+ /*
+  * Verify that we can't destroy an active pool, create an existing pool,
+  * or create a pool with a bad vdev spec.
+  */
+ /* ARGSUSED */
+ void
+ ztest_spa_create_destroy(ztest_ds_t *zd, uint64_t id)
+ {
+       ztest_shared_t *zs = ztest_shared;
+       spa_t *spa;
+       nvlist_t *nvroot;
+       /*
+        * Attempt to create using a bad file.
+        */
+       nvroot = make_vdev_root("/dev/bogus", NULL, 0, 0, 0, 0, 0, 1);
+       VERIFY3U(ENOENT, ==,
+           spa_create("ztest_bad_file", nvroot, NULL, NULL, NULL));
+       nvlist_free(nvroot);
+       /*
+        * Attempt to create using a bad mirror.
+        */
+       nvroot = make_vdev_root("/dev/bogus", NULL, 0, 0, 0, 0, 2, 1);
+       VERIFY3U(ENOENT, ==,
+           spa_create("ztest_bad_mirror", nvroot, NULL, NULL, NULL));
+       nvlist_free(nvroot);
+       /*
+        * Attempt to create an existing pool.  It shouldn't matter
+        * what's in the nvroot; we should fail with EEXIST.
+        */
+       (void) rw_rdlock(&zs->zs_name_lock);
+       nvroot = make_vdev_root("/dev/bogus", NULL, 0, 0, 0, 0, 0, 1);
+       VERIFY3U(EEXIST, ==, spa_create(zs->zs_pool, nvroot, NULL, NULL, NULL));
+       nvlist_free(nvroot);
+       VERIFY3U(0, ==, spa_open(zs->zs_pool, &spa, FTAG));
+       VERIFY3U(EBUSY, ==, spa_destroy(zs->zs_pool));
+       spa_close(spa, FTAG);
+       (void) rw_unlock(&zs->zs_name_lock);
+ }
+ static vdev_t *
+ vdev_lookup_by_path(vdev_t *vd, const char *path)
+ {
+       vdev_t *mvd;
++      int c;
+       if (vd->vdev_path != NULL && strcmp(path, vd->vdev_path) == 0)
+               return (vd);
 -      for (int c = 0; c < vd->vdev_children; c++)
++      for (c = 0; c < vd->vdev_children; c++)
+               if ((mvd = vdev_lookup_by_path(vd->vdev_child[c], path)) !=
+                   NULL)
+                       return (mvd);
+       return (NULL);
+ }
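+
The loop in vdev_lookup_by_path() just above shows the change this merge applies throughout the file for C90 compliance: loop indices that C99 permits to be declared inside the for statement are hoisted into ordinary declarations at the top of the enclosing function.  A minimal standalone sketch of the pattern, with a made-up function name purely for illustration:

        static int
        count_nonzero(const int *v, int n)
        {
                int i;          /* C90: declare the index with the other locals */
                int hits = 0;

                /* C99 would also have allowed "for (int i = 0; ...)" here. */
                for (i = 0; i < n; i++) {
                        if (v[i] != 0)
                                hits++;
                }
                return (hits);
        }
+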
+ /*
+  * Find the first available hole which can be used as a top-level.
+  */
+ int
+ find_vdev_hole(spa_t *spa)
+ {
+       vdev_t *rvd = spa->spa_root_vdev;
+       int c;
+       ASSERT(spa_config_held(spa, SCL_VDEV, RW_READER) == SCL_VDEV);
+       for (c = 0; c < rvd->vdev_children; c++) {
+               vdev_t *cvd = rvd->vdev_child[c];
+               if (cvd->vdev_ishole)
+                       break;
+       }
+       return (c);
+ }
+ /*
+  * Verify that vdev_add() works as expected.
+  */
+ /* ARGSUSED */
+ void
+ ztest_vdev_add_remove(ztest_ds_t *zd, uint64_t id)
+ {
+       ztest_shared_t *zs = ztest_shared;
+       spa_t *spa = zs->zs_spa;
+       uint64_t leaves;
+       uint64_t guid;
+       nvlist_t *nvroot;
+       int error;
+       VERIFY(mutex_lock(&zs->zs_vdev_lock) == 0);
+       leaves = MAX(zs->zs_mirrors + zs->zs_splits, 1) * zopt_raidz;
+       spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
+       ztest_shared->zs_vdev_next_leaf = find_vdev_hole(spa) * leaves;
+       /*
+        * If we have slogs then remove them 1/4 of the time.
+        */
+       if (spa_has_slogs(spa) && ztest_random(4) == 0) {
+               /*
+                * Grab the guid from the head of the log class rotor.
+                */
+               guid = spa_log_class(spa)->mc_rotor->mg_vd->vdev_guid;
+               spa_config_exit(spa, SCL_VDEV, FTAG);
+               /*
+                * We have to grab the zs_name_lock as writer to
+                * prevent a race between removing a slog (dmu_objset_find)
+                * and destroying a dataset. Removing the slog will
+                * grab a reference on the dataset which may cause
+                * dmu_objset_destroy() to fail with EBUSY thus
+                * leaving the dataset in an inconsistent state.
+                */
+               VERIFY(rw_wrlock(&ztest_shared->zs_name_lock) == 0);
+               error = spa_vdev_remove(spa, guid, B_FALSE);
+               VERIFY(rw_unlock(&ztest_shared->zs_name_lock) == 0);
+               if (error && error != EEXIST)
+                       fatal(0, "spa_vdev_remove() = %d", error);
+       } else {
+               spa_config_exit(spa, SCL_VDEV, FTAG);
+               /*
+                * Make 1/4 of the devices be log devices.
+                */
+               nvroot = make_vdev_root(NULL, NULL, zopt_vdev_size, 0,
+                   ztest_random(4) == 0, zopt_raidz, zs->zs_mirrors, 1);
+               error = spa_vdev_add(spa, nvroot);
+               nvlist_free(nvroot);
+               if (error == ENOSPC)
+                       ztest_record_enospc("spa_vdev_add");
+               else if (error != 0)
+                       fatal(0, "spa_vdev_add() = %d", error);
+       }
+       VERIFY(mutex_unlock(&ztest_shared->zs_vdev_lock) == 0);
+ }
+ /*
+  * Verify that adding/removing aux devices (l2arc, hot spare) works as expected.
+  */
+ /* ARGSUSED */
+ void
+ ztest_vdev_aux_add_remove(ztest_ds_t *zd, uint64_t id)
+ {
+       ztest_shared_t *zs = ztest_shared;
+       spa_t *spa = zs->zs_spa;
+       vdev_t *rvd = spa->spa_root_vdev;
+       spa_aux_vdev_t *sav;
+       char *aux;
+       uint64_t guid = 0;
+       int error;
+       if (ztest_random(2) == 0) {
+               sav = &spa->spa_spares;
+               aux = ZPOOL_CONFIG_SPARES;
+       } else {
+               sav = &spa->spa_l2cache;
+               aux = ZPOOL_CONFIG_L2CACHE;
+       }
+       VERIFY(mutex_lock(&zs->zs_vdev_lock) == 0);
+       spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
  
        if (sav->sav_count != 0 && ztest_random(4) == 0) {
                /*
@@@ -1399,56 -2865,57 +2874,58 @@@ ztest_objset_destroy_cb(const char *nam
        return (0);
  }
  
- /*
-  * Verify that dmu_objset_{create,destroy,open,close} work as expected.
-  */
- static uint64_t
- ztest_log_create(zilog_t *zilog, dmu_tx_t *tx, uint64_t object, int mode)
+ static boolean_t
+ ztest_snapshot_create(char *osname, uint64_t id)
+ {
+       char snapname[MAXNAMELEN];
+       int error;
+       (void) snprintf(snapname, MAXNAMELEN, "%s@%llu", osname,
+           (u_longlong_t)id);
+       error = dmu_objset_snapshot(osname, strchr(snapname, '@') + 1,
+           NULL, B_FALSE);
+       if (error == ENOSPC) {
+               ztest_record_enospc(FTAG);
+               return (B_FALSE);
+       }
+       if (error != 0 && error != EEXIST)
+               fatal(0, "ztest_snapshot_create(%s) = %d", snapname, error);
+       return (B_TRUE);
+ }
+ static boolean_t
+ ztest_snapshot_destroy(char *osname, uint64_t id)
  {
-       itx_t *itx;
-       lr_create_t *lr;
-       size_t namesize;
-       char name[24];
-       (void) sprintf(name, "ZOBJ_%llu", (u_longlong_t)object);
-       namesize = strlen(name) + 1;
-       itx = zil_itx_create(TX_CREATE, sizeof (*lr) + namesize +
-           ztest_random(ZIL_MAX_BLKSZ));
-       lr = (lr_create_t *)&itx->itx_lr;
-       bzero(lr + 1, lr->lr_common.lrc_reclen - sizeof (*lr));
-       lr->lr_doid = object;
-       lr->lr_foid = 0;
-       lr->lr_mode = mode;
-       lr->lr_uid = 0;
-       lr->lr_gid = 0;
-       lr->lr_gen = dmu_tx_get_txg(tx);
-       lr->lr_crtime[0] = time(NULL);
-       lr->lr_crtime[1] = 0;
-       lr->lr_rdev = 0;
-       bcopy(name, (char *)(lr + 1), namesize);
-       return (zil_itx_assign(zilog, itx, tx));
+       char snapname[MAXNAMELEN];
+       int error;
+       (void) snprintf(snapname, MAXNAMELEN, "%s@%llu", osname,
+           (u_longlong_t)id);
+       error = dmu_objset_destroy(snapname, B_FALSE);
+       if (error != 0 && error != ENOENT)
+               fatal(0, "ztest_snapshot_destroy(%s) = %d", snapname, error);
+       return (B_TRUE);
  }
  
+ /* ARGSUSED */
  void
- ztest_dmu_objset_create_destroy(ztest_args_t *za)
+ ztest_dmu_objset_create_destroy(ztest_ds_t *zd, uint64_t id)
  {
+       ztest_shared_t *zs = ztest_shared;
+       ztest_ds_t zdtmp;
+       int iters;
        int error;
        objset_t *os, *os2;
-       char name[100];
-       int basemode, expected_error;
+       char name[MAXNAMELEN];
        zilog_t *zilog;
-       uint64_t seq;
-       uint64_t objects;
++      int i;
  
-       (void) rw_rdlock(&ztest_shared->zs_name_lock);
-       (void) snprintf(name, 100, "%s/%s_temp_%llu", za->za_pool, za->za_pool,
-           (u_longlong_t)za->za_instance);
+       (void) rw_rdlock(&zs->zs_name_lock);
  
-       basemode = DS_MODE_TYPE(za->za_instance);
-       if (basemode != DS_MODE_USER && basemode != DS_MODE_OWNER)
-               basemode = DS_MODE_USER;
+       (void) snprintf(name, MAXNAMELEN, "%s/temp_%llu",
+           zs->zs_pool, (u_longlong_t)id);
  
        /*
         * If this dataset exists from a previous run, process its replay log
        /*
         * Open the intent log for it.
         */
-       zilog = zil_open(os, NULL);
+       zilog = zil_open(os, ztest_get_data);
  
        /*
-        * Put a random number of objects in there.
+        * Put some objects in there, do a little I/O to them,
+        * and randomly take a couple of snapshots along the way.
         */
-       objects = ztest_random(20);
-       seq = 0;
-       while (objects-- != 0) {
-               uint64_t object;
-               dmu_tx_t *tx = dmu_tx_create(os);
-               dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, sizeof (name));
-               error = dmu_tx_assign(tx, TXG_WAIT);
-               if (error) {
-                       dmu_tx_abort(tx);
-               } else {
-                       object = dmu_object_alloc(os, DMU_OT_UINT64_OTHER, 0,
-                           DMU_OT_NONE, 0, tx);
-                       ztest_set_random_blocksize(os, object, tx);
-                       seq = ztest_log_create(zilog, tx, object,
-                           DMU_OT_UINT64_OTHER);
-                       dmu_write(os, object, 0, sizeof (name), name, tx);
-                       dmu_tx_commit(tx);
-               }
-               if (ztest_random(5) == 0) {
-                       zil_commit(zilog, seq, object);
-               }
-               if (ztest_random(100) == 0) {
-                       error = zil_suspend(zilog);
-                       if (error == 0) {
-                               zil_resume(zilog);
-                       }
-               }
+       iters = ztest_random(5);
 -      for (int i = 0; i < iters; i++) {
++      for (i = 0; i < iters; i++) {
+               ztest_dmu_object_alloc_free(&zdtmp, id);
+               if (ztest_random(iters) == 0)
+                       (void) ztest_snapshot_create(name, i);
        }
  
        /*
   * Verify that dmu_object_{alloc,free} work as expected.
   */
  void
- ztest_dmu_object_alloc_free(ztest_args_t *za)
+ ztest_dmu_object_alloc_free(ztest_ds_t *zd, uint64_t id)
  {
-       objset_t *os = za->za_os;
-       dmu_buf_t *db;
-       dmu_tx_t *tx;
-       uint64_t batchobj, object, batchsize, endoff, temp;
-       int b, c, error, bonuslen;
-       dmu_object_info_t *doi = &za->za_doi;
-       char osname[MAXNAMELEN];
-       dmu_objset_name(os, osname);
-       endoff = -8ULL;
-       batchsize = 2;
-       /*
-        * Create a batch object if necessary, and record it in the directory.
-        */
-       VERIFY3U(0, ==, dmu_read(os, ZTEST_DIROBJ, za->za_diroff,
-           sizeof (uint64_t), &batchobj, DMU_READ_PREFETCH));
-       if (batchobj == 0) {
-               tx = dmu_tx_create(os);
-               dmu_tx_hold_write(tx, ZTEST_DIROBJ, za->za_diroff,
-                   sizeof (uint64_t));
-               dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
-               error = dmu_tx_assign(tx, TXG_WAIT);
-               if (error) {
-                       ztest_record_enospc("create a batch object");
-                       dmu_tx_abort(tx);
-                       return;
-               }
-               batchobj = dmu_object_alloc(os, DMU_OT_UINT64_OTHER, 0,
-                   DMU_OT_NONE, 0, tx);
-               ztest_set_random_blocksize(os, batchobj, tx);
-               dmu_write(os, ZTEST_DIROBJ, za->za_diroff,
-                   sizeof (uint64_t), &batchobj, tx);
-               dmu_tx_commit(tx);
-       }
-       /*
-        * Destroy the previous batch of objects.
-        */
-       for (b = 0; b < batchsize; b++) {
-               VERIFY3U(0, ==, dmu_read(os, batchobj, b * sizeof (uint64_t),
-                   sizeof (uint64_t), &object, DMU_READ_PREFETCH));
-               if (object == 0)
-                       continue;
-               /*
-                * Read and validate contents.
-                * We expect the nth byte of the bonus buffer to be n.
-                */
-               VERIFY(0 == dmu_bonus_hold(os, object, FTAG, &db));
-               za->za_dbuf = db;
-               dmu_object_info_from_db(db, doi);
-               ASSERT(doi->doi_type == DMU_OT_UINT64_OTHER);
-               ASSERT(doi->doi_bonus_type == DMU_OT_PLAIN_OTHER);
-               ASSERT3S(doi->doi_physical_blks, >=, 0);
-               bonuslen = doi->doi_bonus_size;
-               for (c = 0; c < bonuslen; c++) {
-                       if (((uint8_t *)db->db_data)[c] !=
-                           (uint8_t)(c + bonuslen)) {
-                               fatal(0,
-                                   "bad bonus: %s, obj %llu, off %d: %u != %u",
-                                   osname, object, c,
-                                   ((uint8_t *)db->db_data)[c],
-                                   (uint8_t)(c + bonuslen));
-                       }
-               }
-               dmu_buf_rele(db, FTAG);
-               za->za_dbuf = NULL;
-               /*
-                * We expect the word at endoff to be our object number.
-                */
-               VERIFY(0 == dmu_read(os, object, endoff,
-                   sizeof (uint64_t), &temp, DMU_READ_PREFETCH));
-               if (temp != object) {
-                       fatal(0, "bad data in %s, got %llu, expected %llu",
-                           osname, temp, object);
-               }
-               /*
-                * Destroy old object and clear batch entry.
-                */
-               tx = dmu_tx_create(os);
-               dmu_tx_hold_write(tx, batchobj,
-                   b * sizeof (uint64_t), sizeof (uint64_t));
-               dmu_tx_hold_free(tx, object, 0, DMU_OBJECT_END);
-               error = dmu_tx_assign(tx, TXG_WAIT);
-               if (error) {
-                       ztest_record_enospc("free object");
-                       dmu_tx_abort(tx);
-                       return;
-               }
-               error = dmu_object_free(os, object, tx);
-               if (error) {
-                       fatal(0, "dmu_object_free('%s', %llu) = %d",
-                           osname, object, error);
-               }
-               object = 0;
-               dmu_object_set_checksum(os, batchobj,
-                   ztest_random_checksum(), tx);
-               dmu_object_set_compress(os, batchobj,
-                   ztest_random_compress(), tx);
-               dmu_write(os, batchobj, b * sizeof (uint64_t),
-                   sizeof (uint64_t), &object, tx);
-               dmu_tx_commit(tx);
-       }
+       ztest_od_t od[4];
+       int batchsize = sizeof (od) / sizeof (od[0]);
++      int b;
  
-       /*
-        * Before creating the new batch of objects, generate a bunch of churn.
-        */
-       for (b = ztest_random(100); b > 0; b--) {
-               tx = dmu_tx_create(os);
-               dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
-               error = dmu_tx_assign(tx, TXG_WAIT);
-               if (error) {
-                       ztest_record_enospc("churn objects");
-                       dmu_tx_abort(tx);
-                       return;
-               }
-               object = dmu_object_alloc(os, DMU_OT_UINT64_OTHER, 0,
-                   DMU_OT_NONE, 0, tx);
-               ztest_set_random_blocksize(os, object, tx);
-               error = dmu_object_free(os, object, tx);
-               if (error) {
-                       fatal(0, "dmu_object_free('%s', %llu) = %d",
-                           osname, object, error);
-               }
-               dmu_tx_commit(tx);
-       }
 -      for (int b = 0; b < batchsize; b++)
++      for (b = 0; b < batchsize; b++)
+               ztest_od_init(&od[b], id, FTAG, b, DMU_OT_UINT64_OTHER, 0, 0);
  
        /*
-        * Create a new batch of objects with randomly chosen
-        * blocksizes and record them in the batch directory.
+        * Destroy the previous batch of objects, create a new batch,
+        * and do some I/O on the new objects.
         */
-       for (b = 0; b < batchsize; b++) {
-               uint32_t va_blksize;
-               u_longlong_t va_nblocks;
-               tx = dmu_tx_create(os);
-               dmu_tx_hold_write(tx, batchobj, b * sizeof (uint64_t),
-                   sizeof (uint64_t));
-               dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
-               dmu_tx_hold_write(tx, DMU_NEW_OBJECT, endoff,
-                   sizeof (uint64_t));
-               error = dmu_tx_assign(tx, TXG_WAIT);
-               if (error) {
-                       ztest_record_enospc("create batchobj");
-                       dmu_tx_abort(tx);
-                       return;
-               }
-               bonuslen = (int)ztest_random(dmu_bonus_max()) + 1;
-               object = dmu_object_alloc(os, DMU_OT_UINT64_OTHER, 0,
-                   DMU_OT_PLAIN_OTHER, bonuslen, tx);
-               ztest_set_random_blocksize(os, object, tx);
-               dmu_object_set_checksum(os, object,
-                   ztest_random_checksum(), tx);
-               dmu_object_set_compress(os, object,
-                   ztest_random_compress(), tx);
-               dmu_write(os, batchobj, b * sizeof (uint64_t),
-                   sizeof (uint64_t), &object, tx);
-               /*
-                * Write to both the bonus buffer and the regular data.
-                */
-               VERIFY(dmu_bonus_hold(os, object, FTAG, &db) == 0);
-               za->za_dbuf = db;
-               ASSERT3U(bonuslen, <=, db->db_size);
-               dmu_object_size_from_db(db, &va_blksize, &va_nblocks);
-               ASSERT3S(va_nblocks, >=, 0);
-               dmu_buf_will_dirty(db, tx);
-               /*
-                * See comments above regarding the contents of
-                * the bonus buffer and the word at endoff.
-                */
-               for (c = 0; c < bonuslen; c++)
-                       ((uint8_t *)db->db_data)[c] = (uint8_t)(c + bonuslen);
-               dmu_buf_rele(db, FTAG);
-               za->za_dbuf = NULL;
-               /*
-                * Write to a large offset to increase indirection.
-                */
-               dmu_write(os, object, endoff, sizeof (uint64_t), &object, tx);
+       if (ztest_object_init(zd, od, sizeof (od), B_TRUE) != 0)
+               return;
  
-               dmu_tx_commit(tx);
-       }
+       while (ztest_random(4 * batchsize) != 0)
+               ztest_io(zd, od[ztest_random(batchsize)].od_object,
+                   ztest_random(ZTEST_RANGE_LOCKS) << SPA_MAXBLOCKSHIFT);
  }
  
  /*
@@@ -2918,168 -3859,430 +3870,432 @@@ ztest_zap(ztest_ds_t *zd, uint64_t id
        ASSERT3U(error, ==, 0);
  
        tx = dmu_tx_create(os);
-       dmu_tx_hold_zap(tx, object, TRUE, NULL);
-       error = dmu_tx_assign(tx, TXG_WAIT);
-       if (error) {
-               ztest_record_enospc("remove zap entry");
-               dmu_tx_abort(tx);
+       dmu_tx_hold_zap(tx, object, B_TRUE, NULL);
+       txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG);
+       if (txg == 0)
+               return;
+       VERIFY3U(0, ==, zap_remove(os, object, txgname, tx));
+       VERIFY3U(0, ==, zap_remove(os, object, propname, tx));
+       dmu_tx_commit(tx);
+ }
+ /*
+  * Testcase to test the upgrading of a microzap to fatzap.
+  */
+ void
+ ztest_fzap(ztest_ds_t *zd, uint64_t id)
+ {
+       objset_t *os = zd->zd_os;
+       ztest_od_t od[1];
+       uint64_t object, txg;
++      int i;
+       ztest_od_init(&od[0], id, FTAG, 0, DMU_OT_ZAP_OTHER, 0, 0);
+       if (ztest_object_init(zd, od, sizeof (od), !ztest_random(2)) != 0)
                return;
 -      for (int i = 0; i < 2050; i++) {
+       object = od[0].od_object;
+       /*
+        * Add entries to this ZAP and make sure it spills over
+        * and gets upgraded to a fatzap. Also, since we are adding
+        * 2050 entries we should see ptrtbl growth and leaf-block split.
+        */
++      for (i = 0; i < 2050; i++) {
+               char name[MAXNAMELEN];
+               uint64_t value = i;
+               dmu_tx_t *tx;
+               int error;
+               (void) snprintf(name, sizeof (name), "fzap-%llu-%llu",
+                   id, value);
+               tx = dmu_tx_create(os);
+               dmu_tx_hold_zap(tx, object, B_TRUE, name);
+               txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG);
+               if (txg == 0)
+                       return;
+               error = zap_add(os, object, name, sizeof (uint64_t), 1,
+                   &value, tx);
+               ASSERT(error == 0 || error == EEXIST);
+               dmu_tx_commit(tx);
        }
-       error = zap_remove(os, object, txgname, tx);
-       if (error)
-               fatal(0, "zap_remove('%s', %llu, '%s') = %d",
-                   osname, object, txgname, error);
+ }
  
-       error = zap_remove(os, object, propname, tx);
-       if (error)
-               fatal(0, "zap_remove('%s', %llu, '%s') = %d",
-                   osname, object, propname, error);
+ /* ARGSUSED */
+ void
+ ztest_zap_parallel(ztest_ds_t *zd, uint64_t id)
+ {
+       objset_t *os = zd->zd_os;
+       ztest_od_t od[1];
+       uint64_t txg, object, count, wsize, wc, zl_wsize, zl_wc;
+       dmu_tx_t *tx;
+       int i, namelen, error;
+       int micro = ztest_random(2);
+       char name[20], string_value[20];
+       void *data;
  
-       dmu_tx_commit(tx);
+       ztest_od_init(&od[0], ID_PARALLEL, FTAG, micro, DMU_OT_ZAP_OTHER, 0, 0);
+       if (ztest_object_init(zd, od, sizeof (od), B_FALSE) != 0)
+               return;
+       object = od[0].od_object;
+       /*
+        * Generate a random name of the form 'xxx.....' where each
+        * x is a random printable character and the dots are dots.
+        * There are 94 such characters, and the name length goes from
+        * 6 to 20, so there are 94^3 * 15 = 12,458,760 possible names.
+        */
+       namelen = ztest_random(sizeof (name) - 5) + 5 + 1;
+       for (i = 0; i < 3; i++)
+               name[i] = '!' + ztest_random('~' - '!' + 1);
+       for (; i < namelen - 1; i++)
+               name[i] = '.';
+       name[i] = '\0';
+       if ((namelen & 1) || micro) {
+               wsize = sizeof (txg);
+               wc = 1;
+               data = &txg;
+       } else {
+               wsize = 1;
+               wc = namelen;
+               data = string_value;
+       }
+       count = -1ULL;
+       VERIFY(zap_count(os, object, &count) == 0);
+       ASSERT(count != -1ULL);
  
        /*
-        * Once in a while, destroy the object.
+        * Select an operation: length, lookup, add, update, remove.
         */
-       if (ztest_random(1000) != 0)
+       i = ztest_random(5);
+       if (i >= 2) {
+               tx = dmu_tx_create(os);
+               dmu_tx_hold_zap(tx, object, B_TRUE, NULL);
+               txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG);
+               if (txg == 0)
+                       return;
+               bcopy(name, string_value, namelen);
+       } else {
+               tx = NULL;
+               txg = 0;
+               bzero(string_value, namelen);
+       }
+       switch (i) {
+       case 0:
+               error = zap_length(os, object, name, &zl_wsize, &zl_wc);
+               if (error == 0) {
+                       ASSERT3U(wsize, ==, zl_wsize);
+                       ASSERT3U(wc, ==, zl_wc);
+               } else {
+                       ASSERT3U(error, ==, ENOENT);
+               }
+               break;
+       case 1:
+               error = zap_lookup(os, object, name, wsize, wc, data);
+               if (error == 0) {
+                       if (data == string_value &&
+                           bcmp(name, data, namelen) != 0)
+                               fatal(0, "name '%s' != val '%s' len %d",
+                                   name, data, namelen);
+               } else {
+                       ASSERT3U(error, ==, ENOENT);
+               }
+               break;
+       case 2:
+               error = zap_add(os, object, name, wsize, wc, data, tx);
+               ASSERT(error == 0 || error == EEXIST);
+               break;
+       case 3:
+               VERIFY(zap_update(os, object, name, wsize, wc, data, tx) == 0);
+               break;
+       case 4:
+               error = zap_remove(os, object, name, tx);
+               ASSERT(error == 0 || error == ENOENT);
+               break;
+       }
+       if (tx != NULL)
+               dmu_tx_commit(tx);
+ }
+ /*
+  * Commit callback data.
+  */
+ typedef struct ztest_cb_data {
+       list_node_t             zcd_node;
+       uint64_t                zcd_txg;
+       int                     zcd_expected_err;
+       boolean_t               zcd_added;
+       boolean_t               zcd_called;
+       spa_t                   *zcd_spa;
+ } ztest_cb_data_t;
+ /* This is the actual commit callback function */
+ static void
+ ztest_commit_callback(void *arg, int error)
+ {
+       ztest_cb_data_t *data = arg;
+       uint64_t synced_txg;
+       VERIFY(data != NULL);
+       VERIFY3S(data->zcd_expected_err, ==, error);
+       VERIFY(!data->zcd_called);
+       synced_txg = spa_last_synced_txg(data->zcd_spa);
+       if (data->zcd_txg > synced_txg)
+               fatal(0, "commit callback of txg %" PRIu64 " called prematurely"
+                   ", last synced txg = %" PRIu64 "\n", data->zcd_txg,
+                   synced_txg);
+       data->zcd_called = B_TRUE;
+       if (error == ECANCELED) {
+               ASSERT3U(data->zcd_txg, ==, 0);
+               ASSERT(!data->zcd_added);
+               /*
+                * The private callback data should be destroyed here, but
+                * since we are going to check the zcd_called field after
+                * dmu_tx_abort(), we will destroy it there.
+                */
+               return;
+       }
+       /* Was this callback added to the global callback list? */
+       if (!data->zcd_added)
+               goto out;
+       ASSERT3U(data->zcd_txg, !=, 0);
+       /* Remove our callback from the list */
+       (void) mutex_lock(&zcl.zcl_callbacks_lock);
+       list_remove(&zcl.zcl_callbacks, data);
+       (void) mutex_unlock(&zcl.zcl_callbacks_lock);
+ out:
+       umem_free(data, sizeof (ztest_cb_data_t));
+ }
+ /* Allocate and initialize callback data structure */
+ static ztest_cb_data_t *
+ ztest_create_cb_data(objset_t *os, uint64_t txg)
+ {
+       ztest_cb_data_t *cb_data;
+       cb_data = umem_zalloc(sizeof (ztest_cb_data_t), UMEM_NOFAIL);
+       cb_data->zcd_txg = txg;
+       cb_data->zcd_spa = dmu_objset_spa(os);
+       return (cb_data);
+ }
+ /*
+  * If a number of txgs equal to this threshold have been created after a commit
+  * callback has been registered but not called, then we assume there is an
+  * implementation bug.
+  */
+ #define       ZTEST_COMMIT_CALLBACK_THRESH    (TXG_CONCURRENT_STATES + 2)
+ /*
+  * Commit callback test.
+  */
+ void
+ ztest_dmu_commit_callbacks(ztest_ds_t *zd, uint64_t id)
+ {
+       objset_t *os = zd->zd_os;
+       ztest_od_t od[1];
+       dmu_tx_t *tx;
+       ztest_cb_data_t *cb_data[3], *tmp_cb;
+       uint64_t old_txg, txg;
+       int i, error;
+       ztest_od_init(&od[0], id, FTAG, 0, DMU_OT_UINT64_OTHER, 0, 0);
+       if (ztest_object_init(zd, od, sizeof (od), B_FALSE) != 0)
                return;
  
        tx = dmu_tx_create(os);
-       dmu_tx_hold_write(tx, ZTEST_DIROBJ, za->za_diroff, sizeof (uint64_t));
-       dmu_tx_hold_free(tx, object, 0, DMU_OBJECT_END);
-       error = dmu_tx_assign(tx, TXG_WAIT);
+       cb_data[0] = ztest_create_cb_data(os, 0);
+       dmu_tx_callback_register(tx, ztest_commit_callback, cb_data[0]);
+       dmu_tx_hold_write(tx, od[0].od_object, 0, sizeof (uint64_t));
+       /* Every once in a while, abort the transaction on purpose */
+       if (ztest_random(100) == 0)
+               error = -1;
+       if (!error)
+               error = dmu_tx_assign(tx, TXG_NOWAIT);
+       txg = error ? 0 : dmu_tx_get_txg(tx);
+       cb_data[0]->zcd_txg = txg;
+       cb_data[1] = ztest_create_cb_data(os, txg);
+       dmu_tx_callback_register(tx, ztest_commit_callback, cb_data[1]);
        if (error) {
-               ztest_record_enospc("destroy zap object");
+               /*
+                * It's not a strict requirement to call the registered
+                * callbacks from inside dmu_tx_abort(), but that's what's
+                * supposed to happen in the current implementation, so we
+                * will check for that.
+                */
+               for (i = 0; i < 2; i++) {
+                       cb_data[i]->zcd_expected_err = ECANCELED;
+                       VERIFY(!cb_data[i]->zcd_called);
+               }
                dmu_tx_abort(tx);
+               for (i = 0; i < 2; i++) {
+                       VERIFY(cb_data[i]->zcd_called);
+                       umem_free(cb_data[i], sizeof (ztest_cb_data_t));
+               }
                return;
        }
-       error = zap_destroy(os, object, tx);
-       if (error)
-               fatal(0, "zap_destroy('%s', %llu) = %d",
-                   osname, object, error);
-       object = 0;
-       dmu_write(os, ZTEST_DIROBJ, za->za_diroff, sizeof (uint64_t),
-           &object, tx);
-       dmu_tx_commit(tx);
- }
  
- void
- ztest_zap_parallel(ztest_args_t *za)
- {
-       objset_t *os = za->za_os;
-       uint64_t txg, object, count, wsize, wc, zl_wsize, zl_wc;
-       dmu_tx_t *tx;
-       int i, namelen, error;
-       char name[20], string_value[20];
-       void *data;
+       cb_data[2] = ztest_create_cb_data(os, txg);
+       dmu_tx_callback_register(tx, ztest_commit_callback, cb_data[2]);
  
        /*
-        * Generate a random name of the form 'xxx.....' where each
-        * x is a random printable character and the dots are dots.
-        * There are 94 such characters, and the name length goes from
-        * 6 to 20, so there are 94^3 * 15 = 12,458,760 possible names.
+        * Read existing data to make sure there isn't a future leak.
         */
-       namelen = ztest_random(sizeof (name) - 5) + 5 + 1;
+       VERIFY(0 == dmu_read(os, od[0].od_object, 0, sizeof (uint64_t),
+           &old_txg, DMU_READ_PREFETCH));
  
-       for (i = 0; i < 3; i++)
-               name[i] = '!' + ztest_random('~' - '!' + 1);
-       for (; i < namelen - 1; i++)
-               name[i] = '.';
-       name[i] = '\0';
+       if (old_txg > txg)
+               fatal(0, "future leak: got %" PRIu64 ", open txg is %" PRIu64,
+                   old_txg, txg);
  
-       if (ztest_random(2) == 0)
-               object = ZTEST_MICROZAP_OBJ;
-       else
-               object = ZTEST_FATZAP_OBJ;
+       dmu_write(os, od[0].od_object, 0, sizeof (uint64_t), &txg, tx);
  
-       if ((namelen & 1) || object == ZTEST_MICROZAP_OBJ) {
-               wsize = sizeof (txg);
-               wc = 1;
-               data = &txg;
-       } else {
-               wsize = 1;
-               wc = namelen;
-               data = string_value;
-       }
+       (void) mutex_lock(&zcl.zcl_callbacks_lock);
  
-       count = -1ULL;
-       VERIFY(zap_count(os, object, &count) == 0);
-       ASSERT(count != -1ULL);
+       /*
+        * Since commit callbacks don't have any ordering requirement and since
+        * it is theoretically possible for a commit callback to be called
+        * after an arbitrary amount of time has elapsed since its txg has been
+        * synced, it is difficult to reliably determine whether a commit
+        * callback hasn't been called due to high load or due to a flawed
+        * implementation.
+        *
+        * In practice, we will assume that if after a certain number of txgs a
+        * commit callback hasn't been called, then most likely there's an
+        * implementation bug.
+        */
+       tmp_cb = list_head(&zcl.zcl_callbacks);
+       if (tmp_cb != NULL &&
+           tmp_cb->zcd_txg > txg - ZTEST_COMMIT_CALLBACK_THRESH) {
+               fatal(0, "Commit callback threshold exceeded, oldest txg: %"
+                   PRIu64 ", open txg: %" PRIu64 "\n", tmp_cb->zcd_txg, txg);
+       }
  
        /*
-        * Select an operation: length, lookup, add, update, remove.
+        * Let's find the place to insert our callbacks.
+        *
+        * Even though the list is ordered by txg, it is possible for the
+        * insertion point to not be the end because our txg may already be
+        * quiescing at this point and other callbacks in the open txg
+        * (from other objsets) may have sneaked in.
         */
-       i = ztest_random(5);
+       tmp_cb = list_tail(&zcl.zcl_callbacks);
+       while (tmp_cb != NULL && tmp_cb->zcd_txg > txg)
+               tmp_cb = list_prev(&zcl.zcl_callbacks, tmp_cb);
+       /* Add the 3 callbacks to the list */
+       for (i = 0; i < 3; i++) {
+               if (tmp_cb == NULL)
+                       list_insert_head(&zcl.zcl_callbacks, cb_data[i]);
+               else
+                       list_insert_after(&zcl.zcl_callbacks, tmp_cb,
+                           cb_data[i]);
  
-       if (i >= 2) {
-               tx = dmu_tx_create(os);
-               dmu_tx_hold_zap(tx, object, TRUE, NULL);
-               error = dmu_tx_assign(tx, TXG_WAIT);
-               if (error) {
-                       ztest_record_enospc("zap parallel");
-                       dmu_tx_abort(tx);
-                       return;
-               }
-               txg = dmu_tx_get_txg(tx);
-               bcopy(name, string_value, namelen);
-       } else {
-               tx = NULL;
-               txg = 0;
-               bzero(string_value, namelen);
+               cb_data[i]->zcd_added = B_TRUE;
+               VERIFY(!cb_data[i]->zcd_called);
+               tmp_cb = cb_data[i];
        }
  
-       switch (i) {
+       (void) mutex_unlock(&zcl.zcl_callbacks_lock);
  
-       case 0:
-               error = zap_length(os, object, name, &zl_wsize, &zl_wc);
-               if (error == 0) {
-                       ASSERT3U(wsize, ==, zl_wsize);
-                       ASSERT3U(wc, ==, zl_wc);
-               } else {
-                       ASSERT3U(error, ==, ENOENT);
-               }
-               break;
+       dmu_tx_commit(tx);
+ }
  
-       case 1:
-               error = zap_lookup(os, object, name, wsize, wc, data);
-               if (error == 0) {
-                       if (data == string_value &&
-                           bcmp(name, data, namelen) != 0)
-                               fatal(0, "name '%s' != val '%s' len %d",
-                                   name, data, namelen);
-               } else {
-                       ASSERT3U(error, ==, ENOENT);
-               }
-               break;
+ /* ARGSUSED */
+ void
+ ztest_dsl_prop_get_set(ztest_ds_t *zd, uint64_t id)
+ {
+       zfs_prop_t proplist[] = {
+               ZFS_PROP_CHECKSUM,
+               ZFS_PROP_COMPRESSION,
+               ZFS_PROP_COPIES,
+               ZFS_PROP_DEDUP
+       };
+       ztest_shared_t *zs = ztest_shared;
++      int p;
  
-       case 2:
-               error = zap_add(os, object, name, wsize, wc, data, tx);
-               ASSERT(error == 0 || error == EEXIST);
-               break;
+       (void) rw_rdlock(&zs->zs_name_lock);
  
-       case 3:
-               VERIFY(zap_update(os, object, name, wsize, wc, data, tx) == 0);
-               break;
 -      for (int p = 0; p < sizeof (proplist) / sizeof (proplist[0]); p++)
++      for (p = 0; p < sizeof (proplist) / sizeof (proplist[0]); p++)
+               (void) ztest_dsl_prop_set_uint64(zd->zd_name, proplist[p],
+                   ztest_random_dsl_prop(proplist[p]), (int)ztest_random(2));
  
-       case 4:
-               error = zap_remove(os, object, name, tx);
-               ASSERT(error == 0 || error == ENOENT);
-               break;
-       }
+       (void) rw_unlock(&zs->zs_name_lock);
+ }
  
-       if (tx != NULL)
-               dmu_tx_commit(tx);
+ /* ARGSUSED */
+ void
+ ztest_spa_prop_get_set(ztest_ds_t *zd, uint64_t id)
+ {
+       ztest_shared_t *zs = ztest_shared;
+       nvlist_t *props = NULL;
+       (void) rw_rdlock(&zs->zs_name_lock);
+       (void) ztest_spa_prop_set_uint64(zs, ZPOOL_PROP_DEDUPDITTO,
+           ZIO_DEDUPDITTO_MIN + ztest_random(ZIO_DEDUPDITTO_MIN));
+       VERIFY3U(spa_prop_get(zs->zs_spa, &props), ==, 0);
+       if (zopt_verbose >= 6)
+               dump_nvlist(props, 4);
+       nvlist_free(props);
+       (void) rw_unlock(&zs->zs_name_lock);
  }
  
+ /*
+  * Test snapshot hold/release and deferred destroy.
+  */
  void
- ztest_dsl_prop_get_set(ztest_args_t *za)
+ ztest_dmu_snapshot_hold(ztest_ds_t *zd, uint64_t id)
  {
-       objset_t *os = za->za_os;
-       int i, inherit;
-       uint64_t value;
-       const char *prop, *valname;
-       char setpoint[MAXPATHLEN];
-       char osname[MAXNAMELEN];
        int error;
+       objset_t *os = zd->zd_os;
+       objset_t *origin;
+       char snapname[100];
+       char fullname[100];
+       char clonename[100];
+       char tag[100];
+       char osname[MAXNAMELEN];
  
        (void) rw_rdlock(&ztest_shared->zs_name_lock);
  
@@@ -3262,160 -4558,176 +4571,177 @@@ ztest_fault_inject(ztest_ds_t *zd, uint
  }
  
  /*
-  * Scrub the pool.
+  * Verify that DDT repair works as expected.
   */
  void
- ztest_scrub(ztest_args_t *za)
+ ztest_ddt_repair(ztest_ds_t *zd, uint64_t id)
  {
-       spa_t *spa = za->za_spa;
-       (void) spa_scrub(spa, POOL_SCRUB_EVERYTHING);
-       (void) poll(NULL, 0, 1000); /* wait a second, then force a restart */
-       (void) spa_scrub(spa, POOL_SCRUB_EVERYTHING);
- }
+       ztest_shared_t *zs = ztest_shared;
+       spa_t *spa = zs->zs_spa;
+       objset_t *os = zd->zd_os;
+       ztest_od_t od[1];
+       uint64_t object, blocksize, txg, pattern, psize;
+       enum zio_checksum checksum = spa_dedup_checksum(spa);
+       dmu_buf_t *db;
+       dmu_tx_t *tx;
+       void *buf;
+       blkptr_t blk;
+       int copies = 2 * ZIO_DEDUPDITTO_MIN;
++      int i;
  
- /*
-  * Rename the pool to a different name and then rename it back.
-  */
- void
- ztest_spa_rename(ztest_args_t *za)
- {
-       char *oldname, *newname;
-       int error;
-       spa_t *spa;
+       blocksize = ztest_random_blocksize();
+       blocksize = MIN(blocksize, 2048);       /* because we write so many */
  
-       (void) rw_wrlock(&ztest_shared->zs_name_lock);
+       ztest_od_init(&od[0], id, FTAG, 0, DMU_OT_UINT64_OTHER, blocksize, 0);
  
-       oldname = za->za_pool;
-       newname = umem_alloc(strlen(oldname) + 5, UMEM_NOFAIL);
-       (void) strcpy(newname, oldname);
-       (void) strcat(newname, "_tmp");
+       if (ztest_object_init(zd, od, sizeof (od), B_FALSE) != 0)
+               return;
  
        /*
-        * Do the rename
+        * Take the name lock as writer to prevent anyone else from changing
+        * the pool and dataset properties we need to maintain during this test.
         */
-       error = spa_rename(oldname, newname);
-       if (error)
-               fatal(0, "spa_rename('%s', '%s') = %d", oldname,
-                   newname, error);
+       (void) rw_wrlock(&zs->zs_name_lock);
  
-       /*
-        * Try to open it under the old name, which shouldn't exist
-        */
-       error = spa_open(oldname, &spa, FTAG);
-       if (error != ENOENT)
-               fatal(0, "spa_open('%s') = %d", oldname, error);
+       if (ztest_dsl_prop_set_uint64(zd->zd_name, ZFS_PROP_DEDUP, checksum,
+           B_FALSE) != 0 ||
+           ztest_dsl_prop_set_uint64(zd->zd_name, ZFS_PROP_COPIES, 1,
+           B_FALSE) != 0) {
+               (void) rw_unlock(&zs->zs_name_lock);
+               return;
+       }
+       object = od[0].od_object;
+       blocksize = od[0].od_blocksize;
+       pattern = spa_guid(spa) ^ dmu_objset_fsid_guid(os);
+       ASSERT(object != 0);
+       tx = dmu_tx_create(os);
+       dmu_tx_hold_write(tx, object, 0, copies * blocksize);
+       txg = ztest_tx_assign(tx, TXG_WAIT, FTAG);
+       if (txg == 0) {
+               (void) rw_unlock(&zs->zs_name_lock);
+               return;
+       }
  
        /*
-        * Open it under the new name and make sure it's still the same spa_t.
+        * Write all the copies of our block.
         */
-       error = spa_open(newname, &spa, FTAG);
-       if (error != 0)
-               fatal(0, "spa_open('%s') = %d", newname, error);
 -      for (int i = 0; i < copies; i++) {
++      for (i = 0; i < copies; i++) {
+               uint64_t offset = i * blocksize;
+               VERIFY(dmu_buf_hold(os, object, offset, FTAG, &db,
+                   DMU_READ_NO_PREFETCH) == 0);
+               ASSERT(db->db_offset == offset);
+               ASSERT(db->db_size == blocksize);
+               ASSERT(ztest_pattern_match(db->db_data, db->db_size, pattern) ||
+                   ztest_pattern_match(db->db_data, db->db_size, 0ULL));
+               dmu_buf_will_fill(db, tx);
+               ztest_pattern_set(db->db_data, db->db_size, pattern);
+               dmu_buf_rele(db, FTAG);
+       }
  
-       ASSERT(spa == za->za_spa);
-       spa_close(spa, FTAG);
+       dmu_tx_commit(tx);
+       txg_wait_synced(spa_get_dsl(spa), txg);
  
        /*
-        * Rename it back to the original
+        * Find out what block we got.
         */
-       error = spa_rename(newname, oldname);
-       if (error)
-               fatal(0, "spa_rename('%s', '%s') = %d", newname,
-                   oldname, error);
+       VERIFY(dmu_buf_hold(os, object, 0, FTAG, &db,
+           DMU_READ_NO_PREFETCH) == 0);
+       blk = *((dmu_buf_impl_t *)db)->db_blkptr;
+       dmu_buf_rele(db, FTAG);
  
        /*
-        * Make sure it can still be opened
+        * Damage the block.  Dedup-ditto will save us when we read it later.
         */
-       error = spa_open(oldname, &spa, FTAG);
-       if (error != 0)
-               fatal(0, "spa_open('%s') = %d", oldname, error);
+       psize = BP_GET_PSIZE(&blk);
+       buf = zio_buf_alloc(psize);
+       ztest_pattern_set(buf, psize, ~pattern);
  
-       ASSERT(spa == za->za_spa);
-       spa_close(spa, FTAG);
+       (void) zio_wait(zio_rewrite(NULL, spa, 0, &blk,
+           buf, psize, NULL, NULL, ZIO_PRIORITY_SYNC_WRITE,
+           ZIO_FLAG_CANFAIL | ZIO_FLAG_INDUCE_DAMAGE, NULL));
  
-       umem_free(newname, strlen(newname) + 1);
+       zio_buf_free(buf, psize);
  
-       (void) rw_unlock(&ztest_shared->zs_name_lock);
+       (void) rw_unlock(&zs->zs_name_lock);
  }
  
  /*
-  * Completely obliterate one disk.
+  * Scrub the pool.
   */
- static void
- ztest_obliterate_one_disk(uint64_t vdev)
+ /* ARGSUSED */
+ void
+ ztest_scrub(ztest_ds_t *zd, uint64_t id)
  {
-       int fd;
-       char dev_name[MAXPATHLEN], copy_name[MAXPATHLEN];
-       size_t fsize;
+       ztest_shared_t *zs = ztest_shared;
+       spa_t *spa = zs->zs_spa;
  
-       if (zopt_maxfaults < 2)
-               return;
+       (void) spa_scan(spa, POOL_SCAN_SCRUB);
+       (void) poll(NULL, 0, 100); /* wait a moment, then force a restart */
+       (void) spa_scan(spa, POOL_SCAN_SCRUB);
+ }
  
-       (void) sprintf(dev_name, ztest_dev_template, zopt_dir, zopt_pool, vdev);
-       (void) snprintf(copy_name, MAXPATHLEN, "%s.old", dev_name);
+ /*
+  * Rename the pool to a different name and then rename it back.
+  */
+ /* ARGSUSED */
+ void
+ ztest_spa_rename(ztest_ds_t *zd, uint64_t id)
+ {
+       ztest_shared_t *zs = ztest_shared;
+       char *oldname, *newname;
+       spa_t *spa;
  
-       fd = open(dev_name, O_RDWR);
+       (void) rw_wrlock(&zs->zs_name_lock);
  
-       if (fd == -1)
-               fatal(1, "can't open %s", dev_name);
+       oldname = zs->zs_pool;
+       newname = umem_alloc(strlen(oldname) + 5, UMEM_NOFAIL);
+       (void) strcpy(newname, oldname);
+       (void) strcat(newname, "_tmp");
  
        /*
-        * Determine the size.
+        * Do the rename
         */
-       fsize = lseek(fd, 0, SEEK_END);
-       (void) close(fd);
+       VERIFY3U(0, ==, spa_rename(oldname, newname));
  
        /*
-        * Rename the old device to dev_name.old (useful for debugging).
+        * Try to open it under the old name, which shouldn't exist
         */
-       VERIFY(rename(dev_name, copy_name) == 0);
+       VERIFY3U(ENOENT, ==, spa_open(oldname, &spa, FTAG));
  
        /*
-        * Create a new one.
+        * Open it under the new name and make sure it's still the same spa_t.
         */
-       VERIFY((fd = open(dev_name, O_RDWR | O_CREAT | O_TRUNC, 0666)) >= 0);
-       VERIFY(ftruncate(fd, fsize) == 0);
-       (void) close(fd);
- }
+       VERIFY3U(0, ==, spa_open(newname, &spa, FTAG));
  
- static void
- ztest_replace_one_disk(spa_t *spa, uint64_t vdev)
- {
-       char dev_name[MAXPATHLEN];
-       nvlist_t *root;
-       int error;
-       uint64_t guid;
-       vdev_t *vd;
+       ASSERT(spa == zs->zs_spa);
+       spa_close(spa, FTAG);
  
-       (void) sprintf(dev_name, ztest_dev_template, zopt_dir, zopt_pool, vdev);
+       /*
+        * Rename it back to the original
+        */
+       VERIFY3U(0, ==, spa_rename(newname, oldname));
  
        /*
-        * Build the nvlist describing dev_name.
+        * Make sure it can still be opened
         */
-       root = make_vdev_root(dev_name, NULL, 0, 0, 0, 0, 0, 1);
+       VERIFY3U(0, ==, spa_open(oldname, &spa, FTAG));
  
-       spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
-       if ((vd = vdev_lookup_by_path(spa->spa_root_vdev, dev_name)) == NULL)
-               guid = 0;
-       else
-               guid = vd->vdev_guid;
-       spa_config_exit(spa, SCL_VDEV, FTAG);
-       error = spa_vdev_attach(spa, guid, root, B_TRUE);
-       if (error != 0 &&
-           error != EBUSY &&
-           error != ENOTSUP &&
-           error != ENODEV &&
-           error != EDOM)
-               fatal(0, "spa_vdev_attach(in-place) = %d", error);
+       ASSERT(spa == zs->zs_spa);
+       spa_close(spa, FTAG);
  
-       nvlist_free(root);
+       umem_free(newname, strlen(newname) + 1);
+       (void) rw_unlock(&zs->zs_name_lock);
  }
  
+ /*
+  * Verify pool integrity by running zdb.
+  */
  static void
- ztest_verify_blocks(char *pool)
+ ztest_run_zdb(char *pool)
  {
        int status;
        char zdb[MAXPATHLEN + MAXNAMELEN + 20];
@@@ -3597,6 -4896,45 +4910,46 @@@ ztest_resume_thread(void *arg
        return (NULL);
  }
  
 -      for (int i = 0; i < zi->zi_iters; i++)
+ static void *
+ ztest_deadman_thread(void *arg)
+ {
+       ztest_shared_t *zs = arg;
+       int grace = 300;
+       hrtime_t delta;
+       delta = (zs->zs_thread_stop - zs->zs_thread_start) / NANOSEC + grace;
+       (void) poll(NULL, 0, (int)(1000 * delta));
+       fatal(0, "failed to complete within %d seconds of deadline", grace);
+       return (NULL);
+ }
+ static void
+ ztest_execute(ztest_info_t *zi, uint64_t id)
+ {
+       ztest_shared_t *zs = ztest_shared;
+       ztest_ds_t *zd = &zs->zs_zd[id % zopt_datasets];
+       hrtime_t functime = gethrtime();
++      int i;
++      for (i = 0; i < zi->zi_iters; i++)
+               zi->zi_func(zd, id);
+       functime = gethrtime() - functime;
+       atomic_add_64(&zi->zi_call_count, 1);
+       atomic_add_64(&zi->zi_call_time, functime);
+       if (zopt_verbose >= 4) {
+               Dl_info dli;
+               (void) dladdr((void *)zi->zi_func, &dli);
+               (void) printf("%6.2f sec in %s\n",
+                   (double)functime / NANOSEC, dli.dli_sname);
+       }
+ }
  static void *
  ztest_thread(void *arg)
  {
                /*
                 * See if it's time to force a crash.
                 */
-               if (now > za->za_kill) {
-                       zs->zs_alloc = spa_get_alloc(za->za_spa);
-                       zs->zs_space = spa_get_space(za->za_spa);
-                       (void) kill(getpid(), SIGKILL);
-               }
+               if (now > zs->zs_thread_kill)
+                       ztest_kill(zs);
  
                /*
-                * Pick a random function.
+                * If we're getting ENOSPC with some regularity, stop.
                 */
-               f = ztest_random(ZTEST_FUNCS);
-               zi = &zs->zs_info[f];
+               if (zs->zs_enospc_count > 10)
+                       break;
  
                /*
-                * Decide whether to call it, based on the requested frequency.
+                * Pick a random function to execute.
                 */
-               if (zi->zi_call_target == 0 ||
-                   (double)zi->zi_call_total / zi->zi_call_target >
-                   (double)(now - zs->zs_start_time) / (zopt_time * NANOSEC))
-                       continue;
+               zi = &zs->zs_info[ztest_random(ZTEST_FUNCS)];
+               call_next = zi->zi_call_next;
+               if (now >= call_next &&
+                   atomic_cas_64(&zi->zi_call_next, call_next, call_next +
+                   ztest_random(2 * zi->zi_interval[0] + 1)) == call_next)
+                       ztest_execute(zi, id);
+       }
  
-               atomic_add_64(&zi->zi_calls, 1);
-               atomic_add_64(&zi->zi_call_total, 1);
+       return (NULL);
+ }
  
-               za->za_diroff = (za->za_instance * ZTEST_FUNCS + f) *
-                   ZTEST_DIRSIZE;
-               za->za_diroff_shared = (1ULL << 63);
+ static void
+ ztest_dataset_name(char *dsname, char *pool, int d)
+ {
+       (void) snprintf(dsname, MAXNAMELEN, "%s/ds_%d", pool, d);
+ }
  
-               for (i = 0; i < zi->zi_iters; i++)
-                       zi->zi_func(za);
+ static void
+ ztest_dataset_destroy(ztest_shared_t *zs, int d)
+ {
+       char name[MAXNAMELEN];
++      int t;
  
-               functime = gethrtime() - now;
+       ztest_dataset_name(name, zs->zs_pool, d);
  
-               atomic_add_64(&zi->zi_call_time, functime);
+       if (zopt_verbose >= 3)
+               (void) printf("Destroying %s to free up space\n", name);
  
-               if (zopt_verbose >= 4) {
-                       Dl_info dli;
-                       (void) dladdr((void *)zi->zi_func, &dli);
-                       (void) printf("%6.2f sec in %s\n",
-                           (double)functime / NANOSEC, dli.dli_sname);
-               }
+       /*
+        * Clean up any non-standard clones and snapshots.  In general,
+        * ztest thread t operates on dataset (t % zopt_datasets),
+        * so there may be more than one thing to clean up.
+        */
 -      for (int t = d; t < zopt_threads; t += zopt_datasets)
++      for (t = d; t < zopt_threads; t += zopt_datasets)
+               ztest_dsl_dataset_cleanup(name, t);
  
-               /*
-                * If we're getting ENOSPC with some regularity, stop.
-                */
-               if (zs->zs_enospc_count > 10)
-                       break;
+       (void) dmu_objset_find(name, ztest_objset_destroy_cb, NULL,
+           DS_FIND_SNAPSHOTS | DS_FIND_CHILDREN);
+ }
+ static void
+ ztest_dataset_dirobj_verify(ztest_ds_t *zd)
+ {
+       uint64_t usedobjs, dirobjs, scratch;
+       /*
+        * ZTEST_DIROBJ is the object directory for the entire dataset.
+        * Therefore, the number of objects in use should equal the
+        * number of ZTEST_DIROBJ entries, +1 for ZTEST_DIROBJ itself.
+        * If not, we have an object leak.
+        *
+        * Note that we can only check this in ztest_dataset_open(),
+        * when the open-context and syncing-context values agree.
+        * That's because zap_count() returns the open-context value,
+        * while dmu_objset_space() returns the rootbp fill count.
+        */
+       VERIFY3U(0, ==, zap_count(zd->zd_os, ZTEST_DIROBJ, &dirobjs));
+       dmu_objset_space(zd->zd_os, &scratch, &scratch, &usedobjs, &scratch);
+       ASSERT3U(dirobjs + 1, ==, usedobjs);
+ }
+ static int
+ ztest_dataset_open(ztest_shared_t *zs, int d)
+ {
+       ztest_ds_t *zd = &zs->zs_zd[d];
+       uint64_t committed_seq = zd->zd_seq;
+       objset_t *os;
+       zilog_t *zilog;
+       char name[MAXNAMELEN];
+       int error;
+       ztest_dataset_name(name, zs->zs_pool, d);
+       (void) rw_rdlock(&zs->zs_name_lock);
+       error = ztest_dataset_create(name);
+       if (error == ENOSPC) {
+               (void) rw_unlock(&zs->zs_name_lock);
+               ztest_record_enospc(FTAG);
+               return (error);
        }
+       ASSERT(error == 0 || error == EEXIST);
  
-       return (NULL);
+       VERIFY3U(dmu_objset_hold(name, zd, &os), ==, 0);
+       (void) rw_unlock(&zs->zs_name_lock);
+       ztest_zd_init(zd, os);
+       zilog = zd->zd_zilog;
+       if (zilog->zl_header->zh_claim_lr_seq != 0 &&
+           zilog->zl_header->zh_claim_lr_seq < committed_seq)
+               fatal(0, "missing log records: claimed %llu < committed %llu",
+                   zilog->zl_header->zh_claim_lr_seq, committed_seq);
+       ztest_dataset_dirobj_verify(zd);
+       zil_replay(os, zd, ztest_replay_vector);
+       ztest_dataset_dirobj_verify(zd);
+       if (zopt_verbose >= 6)
+               (void) printf("%s replay %llu blocks, %llu records, seq %llu\n",
+                   zd->zd_name,
+                   (u_longlong_t)zilog->zl_parse_blk_count,
+                   (u_longlong_t)zilog->zl_parse_lr_count,
+                   (u_longlong_t)zilog->zl_replaying_seq);
+       zilog = zil_open(os, ztest_get_data);
+       if (zilog->zl_replaying_seq != 0 &&
+           zilog->zl_replaying_seq < committed_seq)
+               fatal(0, "missing log records: replayed %llu < committed %llu",
+                   zilog->zl_replaying_seq, committed_seq);
+       return (0);
+ }
+ static void
+ ztest_dataset_close(ztest_shared_t *zs, int d)
+ {
+       ztest_ds_t *zd = &zs->zs_zd[d];
+       zil_close(zd->zd_zilog);
+       dmu_objset_rele(zd->zd_os, zd);
+       ztest_zd_fini(zd);
  }
  
  /*
   * Kick off threads to run tests on all datasets in parallel.
   */
  static void
- ztest_run(char *pool)
+ ztest_run(ztest_shared_t *zs)
  {
-       int t, d, error;
-       ztest_shared_t *zs = ztest_shared;
-       ztest_args_t *za;
+       thread_t *tid;
        spa_t *spa;
-       char name[100];
        thread_t resume_tid;
+       int error;
++      int t, d;
  
        ztest_exiting = B_FALSE;
  
        if (zopt_verbose >= 4)
                (void) printf("starting main threads...\n");
  
-       za[0].za_start = gethrtime();
-       za[0].za_stop = za[0].za_start + zopt_passtime * NANOSEC;
-       za[0].za_stop = MIN(za[0].za_stop, zs->zs_stop_time);
-       za[0].za_kill = za[0].za_stop;
-       if (ztest_random(100) < zopt_killrate)
-               za[0].za_kill -= ztest_random(zopt_passtime * NANOSEC);
+       /*
+        * Kick off all the tests that run in parallel.
+        */
 -      for (int t = 0; t < zopt_threads; t++) {
 +      for (t = 0; t < zopt_threads; t++) {
-               d = t % zopt_datasets;
-               (void) strcpy(za[t].za_pool, pool);
-               za[t].za_os = za[d].za_os;
-               za[t].za_spa = spa;
-               za[t].za_zilog = za[d].za_zilog;
-               za[t].za_instance = t;
-               za[t].za_random = ztest_random(-1ULL);
-               za[t].za_start = za[0].za_start;
-               za[t].za_stop = za[0].za_stop;
-               za[t].za_kill = za[0].za_kill;
-               if (t < zopt_datasets) {
-                       int test_future = FALSE;
-                       (void) rw_rdlock(&ztest_shared->zs_name_lock);
-                       (void) snprintf(name, 100, "%s/%s_%d", pool, pool, d);
-                       error = dmu_objset_create(name, DMU_OST_OTHER, NULL, 0,
-                           ztest_create_cb, NULL);
-                       if (error == EEXIST) {
-                               test_future = TRUE;
-                       } else if (error == ENOSPC) {
-                               zs->zs_enospc_count++;
-                               (void) rw_unlock(&ztest_shared->zs_name_lock);
-                               break;
-                       } else if (error != 0) {
-                               fatal(0, "dmu_objset_create(%s) = %d",
-                                   name, error);
-                       }
-                       error = dmu_objset_open(name, DMU_OST_OTHER,
-                           DS_MODE_USER, &za[d].za_os);
-                       if (error)
-                               fatal(0, "dmu_objset_open('%s') = %d",
-                                   name, error);
-                       (void) rw_unlock(&ztest_shared->zs_name_lock);
-                       if (test_future)
-                               ztest_dmu_check_future_leak(&za[t]);
-                       zil_replay(za[d].za_os, za[d].za_os,
-                           ztest_replay_vector);
-                       za[d].za_zilog = zil_open(za[d].za_os, NULL);
-               }
-               VERIFY(thr_create(0, 0, ztest_thread, &za[t], THR_BOUND,
-                   &za[t].za_thread) == 0);
+               if (t < zopt_datasets && ztest_dataset_open(zs, t) != 0)
+                       return;
+               VERIFY(thr_create(0, 0, ztest_thread, (void *)(uintptr_t)t,
+                   THR_BOUND, &tid[t]) == 0);
        }
  
-       while (--t >= 0) {
-               VERIFY(thr_join(za[t].za_thread, NULL, NULL) == 0);
-               if (t < zopt_datasets) {
-                       zil_close(za[t].za_zilog);
-                       dmu_objset_close(za[t].za_os);
-               }
+       /*
+        * Wait for all of the tests to complete.  We go in reverse order
+        * so we don't close datasets while threads are still using them.
+        */
 -      for (int t = zopt_threads - 1; t >= 0; t--) {
++      for (t = zopt_threads - 1; t >= 0; t--) {
+               VERIFY(thr_join(tid[t], NULL, NULL) == 0);
+               if (t < zopt_datasets)
+                       ztest_dataset_close(zs, t);
        }
  
-       if (zopt_verbose >= 3)
-               show_pool_stats(spa);
        txg_wait_synced(spa_get_dsl(spa), 0);
  
-       zs->zs_alloc = spa_get_alloc(spa);
-       zs->zs_space = spa_get_space(spa);
+       zs->zs_alloc = metaslab_class_get_alloc(spa_normal_class(spa));
+       zs->zs_space = metaslab_class_get_space(spa_normal_class(spa));
+       umem_free(tid, zopt_threads * sizeof (thread_t));
+       /* Kill the resume thread */
+       ztest_exiting = B_TRUE;
+       VERIFY(thr_join(resume_tid, NULL, NULL) == 0);
+       ztest_resume(spa);
+       /*
+        * Right before closing the pool, kick off a bunch of async I/O;
+        * spa_close() should wait for it to complete.
+        */
+       for (uint64_t object = 1; object < 50; object++)
+               dmu_prefetch(spa->spa_meta_objset, object, 0, 1ULL << 20);
+       spa_close(spa, FTAG);
  
        /*
-        * If we had out-of-space errors, destroy a random objset.
+        * Verify that we can loop over all pools.
         */
-       if (zs->zs_enospc_count != 0) {
-               (void) rw_rdlock(&ztest_shared->zs_name_lock);
-               d = (int)ztest_random(zopt_datasets);
-               (void) snprintf(name, 100, "%s/%s_%d", pool, pool, d);
-               if (zopt_verbose >= 3)
-                       (void) printf("Destroying %s to free up space\n", name);
+       mutex_enter(&spa_namespace_lock);
+       for (spa = spa_next(NULL); spa != NULL; spa = spa_next(spa))
+               if (zopt_verbose > 3)
+                       (void) printf("spa_next: found %s\n", spa_name(spa));
+       mutex_exit(&spa_namespace_lock);
+       /*
+        * Verify that we can export the pool and reimport it under a
+        * different name.
+        */
+       if (ztest_random(2) == 0) {
+               char name[MAXNAMELEN];
+               (void) snprintf(name, MAXNAMELEN, "%s_import", zs->zs_pool);
+               ztest_spa_import_export(zs->zs_pool, name);
+               ztest_spa_import_export(name, zs->zs_pool);
+       }
+       kernel_fini();
+ }
+ static void
+ ztest_freeze(ztest_shared_t *zs)
+ {
+       ztest_ds_t *zd = &zs->zs_zd[0];
+       spa_t *spa;
+       int numloops = 0;
+       if (zopt_verbose >= 3)
+               (void) printf("testing spa_freeze()...\n");
  
-               /* Cleanup any non-standard clones and snapshots */
-               ztest_dsl_dataset_cleanup(name, za[d].za_instance);
+       kernel_init(FREAD | FWRITE);
+       VERIFY3U(0, ==, spa_open(zs->zs_pool, &spa, FTAG));
+       VERIFY3U(0, ==, ztest_dataset_open(zs, 0));
  
-               (void) dmu_objset_find(name, ztest_destroy_cb, &za[d],
-                   DS_FIND_SNAPSHOTS | DS_FIND_CHILDREN);
-               (void) rw_unlock(&ztest_shared->zs_name_lock);
+       /*
+        * Force the first log block to be transactionally allocated.
+        * We have to do this before we freeze the pool -- otherwise
+        * the log chain won't be anchored.
+        */
+       while (BP_IS_HOLE(&zd->zd_zilog->zl_header->zh_log)) {
+               ztest_dmu_object_alloc_free(zd, 0);
+               zil_commit(zd->zd_zilog, UINT64_MAX, 0);
        }
  
        txg_wait_synced(spa_get_dsl(spa), 0);
@@@ -3957,6 -5413,7 +5430,8 @@@ main(int argc, char **argv
        ztest_info_t *zi;
        char timebuf[100];
        char numbuf[6];
+       spa_t *spa;
++      int i, f;
  
        (void) setvbuf(stdout, NULL, _IOLBF, 0);
  
                bzero(zs, sizeof (ztest_shared_t));
                if (zopt_verbose >= 3 && zopt_init != 1)
                        (void) printf("ztest_init(), pass %d\n", i);
-               ztest_init(zopt_pool);
+               zs->zs_pool = zopt_pool;
+               ztest_init(zs);
        }
  
-       /*
-        * Initialize the call targets for each function.
-        */
+       zs->zs_pool = zopt_pool;
+       zs->zs_proc_start = gethrtime();
+       zs->zs_proc_stop = zs->zs_proc_start + zopt_time * NANOSEC;
 -      for (int f = 0; f < ZTEST_FUNCS; f++) {
 +      for (f = 0; f < ZTEST_FUNCS; f++) {
                zi = &zs->zs_info[f];
                *zi = ztest_info[f];
-               if (*zi->zi_interval == 0)
-                       zi->zi_call_target = UINT64_MAX;
+               if (zs->zs_proc_start + zi->zi_interval[0] > zs->zs_proc_stop)
+                       zi->zi_call_next = UINT64_MAX;
                else
-                       zi->zi_call_target = zopt_time / *zi->zi_interval;
+                       zi->zi_call_next = zs->zs_proc_start +
+                           ztest_random(2 * zi->zi_interval[0] + 1);
        }
  
-       zs->zs_start_time = gethrtime();
-       zs->zs_stop_time = zs->zs_start_time + zopt_time * NANOSEC;
        /*
         * Run the tests in a loop.  These tests include fault injection
         * to verify that self-healing data works, and forced crashes
                /*
                 * Initialize the workload counters for each function.
                 */
 -              for (int f = 0; f < ZTEST_FUNCS; f++) {
 +              for (f = 0; f < ZTEST_FUNCS; f++) {
                        zi = &zs->zs_info[f];
-                       zi->zi_calls = 0;
+                       zi->zi_call_count = 0;
                        zi->zi_call_time = 0;
                }
  
index d67776889d3501faecf2ccddefb025cc21967f41,fd3044b1da333b261844210abca80c6c80702a09..95632d938f12bbf5d35ae1c1c95ef0438ec62788
@@@ -403,6 -405,21 +405,23 @@@ refresh_config(libzfs_handle_t *hdl, nv
        return (nvl);
  }
  
 -      for (int c = 0; c < holes; c++) {
+ /*
+  * Determine if the vdev id is a hole in the namespace.
+  */
+ boolean_t
+ vdev_is_hole(uint64_t *hole_array, uint_t holes, uint_t id)
+ {
++      int c;
++
++      for (c = 0; c < holes; c++) {
+               /* Top-level is a hole */
+               if (hole_array[c] == id)
+                       return (B_TRUE);
+       }
+       return (B_FALSE);
+ }
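
A minimal sketch of how a caller might use vdev_is_hole() while walking top-level vdev ids; it assumes hole_array and holes were already pulled out of the pool config (for example from the ZPOOL_CONFIG_HOLE_ARRAY entry), which is not shown here.

	/*
	 * Count the real (non-hole) top-level vdev ids in [0, children).
	 */
	static uint_t
	count_real_toplevels(uint64_t *hole_array, uint_t holes, uint_t children)
	{
		uint_t id, n = 0;

		for (id = 0; id < children; id++) {
			if (vdev_is_hole(hole_array, holes, id))
				continue;	/* id left behind by a removed vdev */
			n++;
		}
		return (n);
	}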
  /*
   * Convert our list of pools into the definitive set of configurations.  We
   * start by picking the best config for each toplevel vdev.  Once that's done,
Simple merge
Simple merge
index e9a8aab49737d5d15c7ac80b098b8ad491209ba3,42ae439972e4ce25ebd2ef38692af468bca7e0ac..22e7188bcf0d38bf2a6ea9359780e1961f6567db
@@@ -109,14 -106,12 +106,16 @@@ dmu_buf_impl_t 
  dbuf_find(dnode_t *dn, uint8_t level, uint64_t blkid)
  {
        dbuf_hash_table_t *h = &dbuf_hash_table;
-       objset_impl_t *os = dn->dn_objset;
-       uint64_t obj, hv, idx;
+       objset_t *os = dn->dn_objset;
+       uint64_t obj = dn->dn_object;
+       uint64_t hv = DBUF_HASH(os, obj, level, blkid);
+       uint64_t idx = hv & h->hash_table_mask;
        dmu_buf_impl_t *db;
  
 +      obj = dn->dn_object;
 +      hv = DBUF_HASH(os, obj, level, blkid);
 +      idx = hv & h->hash_table_mask;
 +
        mutex_enter(DBUF_HASH_MUTEX(h, idx));
        for (db = h->hash_table[idx]; db != NULL; db = db->db_hash_next) {
                if (DBUF_EQUAL(db, os, obj, level, blkid)) {
@@@ -142,16 -137,14 +141,16 @@@ static dmu_buf_impl_t 
  dbuf_hash_insert(dmu_buf_impl_t *db)
  {
        dbuf_hash_table_t *h = &dbuf_hash_table;
-       objset_impl_t *os = db->db_objset;
+       objset_t *os = db->db_objset;
        uint64_t obj = db->db.db_object;
        int level = db->db_level;
 -      uint64_t blkid = db->db_blkid;
 -      uint64_t hv = DBUF_HASH(os, obj, level, blkid);
 -      uint64_t idx = hv & h->hash_table_mask;
 +      uint64_t blkid, hv, idx;
        dmu_buf_impl_t *dbf;
  
 +      blkid = db->db_blkid;
 +      hv = DBUF_HASH(os, obj, level, blkid);
 +      idx = hv & h->hash_table_mask;
 +
        mutex_enter(DBUF_HASH_MUTEX(h, idx));
        for (dbf = h->hash_table[idx]; dbf != NULL; dbf = dbf->db_hash_next) {
                if (DBUF_EQUAL(dbf, os, obj, level, blkid)) {
index 0000000000000000000000000000000000000000,926b4df9a5d909d263e2bcb35c612f92e5304df1..cd4e8476c2117935bf598004f7a03f70082a2264
mode 000000,100644..100644
--- /dev/null
@@@ -1,0 -1,1140 +1,1155 @@@
 -      for (int d = 0; d < SPA_DVAS_PER_BP; d++)
+ /*
+  * CDDL HEADER START
+  *
+  * The contents of this file are subject to the terms of the
+  * Common Development and Distribution License (the "License").
+  * You may not use this file except in compliance with the License.
+  *
+  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+  * or http://www.opensolaris.org/os/licensing.
+  * See the License for the specific language governing permissions
+  * and limitations under the License.
+  *
+  * When distributing Covered Code, include this CDDL HEADER in each
+  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+  * If applicable, add the following below this CDDL HEADER, with the
+  * fields enclosed by brackets "[]" replaced with your own identifying
+  * information: Portions Copyright [yyyy] [name of copyright owner]
+  *
+  * CDDL HEADER END
+  */
+ /*
+  * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
+  */
+ #include <sys/zfs_context.h>
+ #include <sys/spa.h>
+ #include <sys/spa_impl.h>
+ #include <sys/zio.h>
+ #include <sys/ddt.h>
+ #include <sys/zap.h>
+ #include <sys/dmu_tx.h>
+ #include <sys/arc.h>
+ #include <sys/dsl_pool.h>
+ #include <sys/zio_checksum.h>
+ #include <sys/zio_compress.h>
+ #include <sys/dsl_scan.h>
+ static const ddt_ops_t *ddt_ops[DDT_TYPES] = {
+       &ddt_zap_ops,
+ };
+ static const char *ddt_class_name[DDT_CLASSES] = {
+       "ditto",
+       "duplicate",
+       "unique",
+ };
+ static void
+ ddt_object_create(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
+     dmu_tx_t *tx)
+ {
+       spa_t *spa = ddt->ddt_spa;
+       objset_t *os = ddt->ddt_os;
+       uint64_t *objectp = &ddt->ddt_object[type][class];
+       boolean_t prehash = zio_checksum_table[ddt->ddt_checksum].ci_dedup;
+       char name[DDT_NAMELEN];
+       ddt_object_name(ddt, type, class, name);
+       ASSERT(*objectp == 0);
+       VERIFY(ddt_ops[type]->ddt_op_create(os, objectp, tx, prehash) == 0);
+       ASSERT(*objectp != 0);
+       VERIFY(zap_add(os, DMU_POOL_DIRECTORY_OBJECT, name,
+           sizeof (uint64_t), 1, objectp, tx) == 0);
+       VERIFY(zap_add(os, spa->spa_ddt_stat_object, name,
+           sizeof (uint64_t), sizeof (ddt_histogram_t) / sizeof (uint64_t),
+           &ddt->ddt_histogram[type][class], tx) == 0);
+ }
+ static void
+ ddt_object_destroy(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
+     dmu_tx_t *tx)
+ {
+       spa_t *spa = ddt->ddt_spa;
+       objset_t *os = ddt->ddt_os;
+       uint64_t *objectp = &ddt->ddt_object[type][class];
+       char name[DDT_NAMELEN];
+       ddt_object_name(ddt, type, class, name);
+       ASSERT(*objectp != 0);
+       ASSERT(ddt_object_count(ddt, type, class) == 0);
+       ASSERT(ddt_histogram_empty(&ddt->ddt_histogram[type][class]));
+       VERIFY(zap_remove(os, DMU_POOL_DIRECTORY_OBJECT, name, tx) == 0);
+       VERIFY(zap_remove(os, spa->spa_ddt_stat_object, name, tx) == 0);
+       VERIFY(ddt_ops[type]->ddt_op_destroy(os, *objectp, tx) == 0);
+       bzero(&ddt->ddt_object_stats[type][class], sizeof (ddt_object_t));
+       *objectp = 0;
+ }
+ static int
+ ddt_object_load(ddt_t *ddt, enum ddt_type type, enum ddt_class class)
+ {
+       ddt_object_t *ddo = &ddt->ddt_object_stats[type][class];
+       dmu_object_info_t doi;
+       char name[DDT_NAMELEN];
+       int error;
+       ddt_object_name(ddt, type, class, name);
+       error = zap_lookup(ddt->ddt_os, DMU_POOL_DIRECTORY_OBJECT, name,
+           sizeof (uint64_t), 1, &ddt->ddt_object[type][class]);
+       if (error)
+               return (error);
+       error = zap_lookup(ddt->ddt_os, ddt->ddt_spa->spa_ddt_stat_object, name,
+           sizeof (uint64_t), sizeof (ddt_histogram_t) / sizeof (uint64_t),
+           &ddt->ddt_histogram[type][class]);
+       /*
+        * Seed the cached statistics.
+        */
+       VERIFY(ddt_object_info(ddt, type, class, &doi) == 0);
+       ddo->ddo_count = ddt_object_count(ddt, type, class);
+       ddo->ddo_dspace = doi.doi_physical_blocks_512 << 9;
+       ddo->ddo_mspace = doi.doi_fill_count * doi.doi_data_block_size;
+       ASSERT(error == 0);
+       return (error);
+ }
+ static void
+ ddt_object_sync(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
+     dmu_tx_t *tx)
+ {
+       ddt_object_t *ddo = &ddt->ddt_object_stats[type][class];
+       dmu_object_info_t doi;
+       char name[DDT_NAMELEN];
+       ddt_object_name(ddt, type, class, name);
+       VERIFY(zap_update(ddt->ddt_os, ddt->ddt_spa->spa_ddt_stat_object, name,
+           sizeof (uint64_t), sizeof (ddt_histogram_t) / sizeof (uint64_t),
+           &ddt->ddt_histogram[type][class], tx) == 0);
+       /*
+        * Cache DDT statistics; this is the only time they'll change.
+        */
+       VERIFY(ddt_object_info(ddt, type, class, &doi) == 0);
+       ddo->ddo_count = ddt_object_count(ddt, type, class);
+       ddo->ddo_dspace = doi.doi_physical_blocks_512 << 9;
+       ddo->ddo_mspace = doi.doi_fill_count * doi.doi_data_block_size;
+ }
+ static int
+ ddt_object_lookup(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
+     ddt_entry_t *dde)
+ {
+       if (!ddt_object_exists(ddt, type, class))
+               return (ENOENT);
+       return (ddt_ops[type]->ddt_op_lookup(ddt->ddt_os,
+           ddt->ddt_object[type][class], dde));
+ }
+ static void
+ ddt_object_prefetch(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
+     ddt_entry_t *dde)
+ {
+       if (!ddt_object_exists(ddt, type, class))
+               return;
+       ddt_ops[type]->ddt_op_prefetch(ddt->ddt_os,
+           ddt->ddt_object[type][class], dde);
+ }
+ int
+ ddt_object_update(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
+     ddt_entry_t *dde, dmu_tx_t *tx)
+ {
+       ASSERT(ddt_object_exists(ddt, type, class));
+       return (ddt_ops[type]->ddt_op_update(ddt->ddt_os,
+           ddt->ddt_object[type][class], dde, tx));
+ }
+ static int
+ ddt_object_remove(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
+     ddt_entry_t *dde, dmu_tx_t *tx)
+ {
+       ASSERT(ddt_object_exists(ddt, type, class));
+       return (ddt_ops[type]->ddt_op_remove(ddt->ddt_os,
+           ddt->ddt_object[type][class], dde, tx));
+ }
+ int
+ ddt_object_walk(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
+     uint64_t *walk, ddt_entry_t *dde)
+ {
+       ASSERT(ddt_object_exists(ddt, type, class));
+       return (ddt_ops[type]->ddt_op_walk(ddt->ddt_os,
+           ddt->ddt_object[type][class], dde, walk));
+ }
+ uint64_t
+ ddt_object_count(ddt_t *ddt, enum ddt_type type, enum ddt_class class)
+ {
+       ASSERT(ddt_object_exists(ddt, type, class));
+       return (ddt_ops[type]->ddt_op_count(ddt->ddt_os,
+           ddt->ddt_object[type][class]));
+ }
+ int
+ ddt_object_info(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
+     dmu_object_info_t *doi)
+ {
+       if (!ddt_object_exists(ddt, type, class))
+               return (ENOENT);
+       return (dmu_object_info(ddt->ddt_os, ddt->ddt_object[type][class],
+           doi));
+ }
+ boolean_t
+ ddt_object_exists(ddt_t *ddt, enum ddt_type type, enum ddt_class class)
+ {
+       return (!!ddt->ddt_object[type][class]);
+ }
+ void
+ ddt_object_name(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
+     char *name)
+ {
+       (void) sprintf(name, DMU_POOL_DDT,
+           zio_checksum_table[ddt->ddt_checksum].ci_name,
+           ddt_ops[type]->ddt_op_name, ddt_class_name[class]);
+ }
+ void
+ ddt_bp_fill(const ddt_phys_t *ddp, blkptr_t *bp, uint64_t txg)
+ {
++      int d;
+       ASSERT(txg != 0);
 -      for (int d = 0; d < SPA_DVAS_PER_BP; d++)
++      for (d = 0; d < SPA_DVAS_PER_BP; d++)
+               bp->blk_dva[d] = ddp->ddp_dva[d];
+       BP_SET_BIRTH(bp, txg, ddp->ddp_phys_birth);
+ }
+ void
+ ddt_bp_create(enum zio_checksum checksum,
+     const ddt_key_t *ddk, const ddt_phys_t *ddp, blkptr_t *bp)
+ {
+       BP_ZERO(bp);
+       if (ddp != NULL)
+               ddt_bp_fill(ddp, bp, ddp->ddp_phys_birth);
+       bp->blk_cksum = ddk->ddk_cksum;
+       bp->blk_fill = 1;
+       BP_SET_LSIZE(bp, DDK_GET_LSIZE(ddk));
+       BP_SET_PSIZE(bp, DDK_GET_PSIZE(ddk));
+       BP_SET_COMPRESS(bp, DDK_GET_COMPRESS(ddk));
+       BP_SET_CHECKSUM(bp, checksum);
+       BP_SET_TYPE(bp, DMU_OT_DEDUP);
+       BP_SET_LEVEL(bp, 0);
+       BP_SET_DEDUP(bp, 0);
+       BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER);
+ }
+ void
+ ddt_key_fill(ddt_key_t *ddk, const blkptr_t *bp)
+ {
+       ddk->ddk_cksum = bp->blk_cksum;
+       ddk->ddk_prop = 0;
+       DDK_SET_LSIZE(ddk, BP_GET_LSIZE(bp));
+       DDK_SET_PSIZE(ddk, BP_GET_PSIZE(bp));
+       DDK_SET_COMPRESS(ddk, BP_GET_COMPRESS(bp));
+ }
+ void
+ ddt_phys_fill(ddt_phys_t *ddp, const blkptr_t *bp)
+ {
++      int d;
+       ASSERT(ddp->ddp_phys_birth == 0);
 -      for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
++      for (d = 0; d < SPA_DVAS_PER_BP; d++)
+               ddp->ddp_dva[d] = bp->blk_dva[d];
+       ddp->ddp_phys_birth = BP_PHYSICAL_BIRTH(bp);
+ }
+ void
+ ddt_phys_clear(ddt_phys_t *ddp)
+ {
+       bzero(ddp, sizeof (*ddp));
+ }
+ void
+ ddt_phys_addref(ddt_phys_t *ddp)
+ {
+       ddp->ddp_refcnt++;
+ }
+ void
+ ddt_phys_decref(ddt_phys_t *ddp)
+ {
+       ASSERT((int64_t)ddp->ddp_refcnt > 0);
+       ddp->ddp_refcnt--;
+ }
+ void
+ ddt_phys_free(ddt_t *ddt, ddt_key_t *ddk, ddt_phys_t *ddp, uint64_t txg)
+ {
+       blkptr_t blk;
+       ddt_bp_create(ddt->ddt_checksum, ddk, ddp, &blk);
+       ddt_phys_clear(ddp);
+       zio_free(ddt->ddt_spa, txg, &blk);
+ }
+ ddt_phys_t *
+ ddt_phys_select(const ddt_entry_t *dde, const blkptr_t *bp)
+ {
+       ddt_phys_t *ddp = (ddt_phys_t *)dde->dde_phys;
++      int p;
 -      for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++)
++      for (p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
+               if (DVA_EQUAL(BP_IDENTITY(bp), &ddp->ddp_dva[0]) &&
+                   BP_PHYSICAL_BIRTH(bp) == ddp->ddp_phys_birth)
+                       return (ddp);
+       }
+       return (NULL);
+ }
+ uint64_t
+ ddt_phys_total_refcnt(const ddt_entry_t *dde)
+ {
+       uint64_t refcnt = 0;
++      int p;
 -      for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
++      for (p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++)
+               refcnt += dde->dde_phys[p].ddp_refcnt;
+       return (refcnt);
+ }
+ static void
+ ddt_stat_generate(ddt_t *ddt, ddt_entry_t *dde, ddt_stat_t *dds)
+ {
+       spa_t *spa = ddt->ddt_spa;
+       ddt_phys_t *ddp = dde->dde_phys;
+       ddt_key_t *ddk = &dde->dde_key;
+       uint64_t lsize = DDK_GET_LSIZE(ddk);
+       uint64_t psize = DDK_GET_PSIZE(ddk);
++      int p, d;
+       bzero(dds, sizeof (*dds));
 -              for (int d = 0; d < SPA_DVAS_PER_BP; d++)
++      for (p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
+               uint64_t dsize = 0;
+               uint64_t refcnt = ddp->ddp_refcnt;
+               if (ddp->ddp_phys_birth == 0)
+                       continue;
 -      for (int h = 0; h < 64; h++)
++              for (d = 0; d < SPA_DVAS_PER_BP; d++)
+                       dsize += dva_get_dsize_sync(spa, &ddp->ddp_dva[d]);
+               dds->dds_blocks += 1;
+               dds->dds_lsize += lsize;
+               dds->dds_psize += psize;
+               dds->dds_dsize += dsize;
+               dds->dds_ref_blocks += refcnt;
+               dds->dds_ref_lsize += lsize * refcnt;
+               dds->dds_ref_psize += psize * refcnt;
+               dds->dds_ref_dsize += dsize * refcnt;
+       }
+ }
+ void
+ ddt_stat_add(ddt_stat_t *dst, const ddt_stat_t *src, uint64_t neg)
+ {
+       const uint64_t *s = (const uint64_t *)src;
+       uint64_t *d = (uint64_t *)dst;
+       uint64_t *d_end = (uint64_t *)(dst + 1);
+       ASSERT(neg == 0 || neg == -1ULL);       /* add or subtract */
+       while (d < d_end)
+               *d++ += (*s++ ^ neg) - neg;
+ }
+ static void
+ ddt_stat_update(ddt_t *ddt, ddt_entry_t *dde, uint64_t neg)
+ {
+       ddt_stat_t dds;
+       ddt_histogram_t *ddh;
+       int bucket;
+       ddt_stat_generate(ddt, dde, &dds);
+       bucket = highbit(dds.dds_ref_blocks) - 1;
+       ASSERT(bucket >= 0);
+       ddh = &ddt->ddt_histogram[dde->dde_type][dde->dde_class];
+       ddt_stat_add(&ddh->ddh_stat[bucket], &dds, neg);
+ }
+ void
+ ddt_histogram_add(ddt_histogram_t *dst, const ddt_histogram_t *src)
+ {
 -      for (int h = 0; h < 64; h++)
++      int h;
++
++      for (h = 0; h < 64; h++)
+               ddt_stat_add(&dst->ddh_stat[h], &src->ddh_stat[h], 0);
+ }
+ void
+ ddt_histogram_stat(ddt_stat_t *dds, const ddt_histogram_t *ddh)
+ {
++      int h;
++
+       bzero(dds, sizeof (*dds));
 -      for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) {
++      for (h = 0; h < 64; h++)
+               ddt_stat_add(dds, &ddh->ddh_stat[h], 0);
+ }
+ boolean_t
+ ddt_histogram_empty(const ddt_histogram_t *ddh)
+ {
+       const uint64_t *s = (const uint64_t *)ddh;
+       const uint64_t *s_end = (const uint64_t *)(ddh + 1);
+       while (s < s_end)
+               if (*s++ != 0)
+                       return (B_FALSE);
+       return (B_TRUE);
+ }
+ void
+ ddt_get_dedup_object_stats(spa_t *spa, ddt_object_t *ddo_total)
+ {
+       /* Sum the statistics we cached in ddt_object_sync(). */
+       for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
+               ddt_t *ddt = spa->spa_ddt[c];
+               for (enum ddt_type type = 0; type < DDT_TYPES; type++) {
+                       for (enum ddt_class class = 0; class < DDT_CLASSES;
+                           class++) {
+                               ddt_object_t *ddo =
+                                   &ddt->ddt_object_stats[type][class];
+                               ddo_total->ddo_count += ddo->ddo_count;
+                               ddo_total->ddo_dspace += ddo->ddo_dspace;
+                               ddo_total->ddo_mspace += ddo->ddo_mspace;
+                       }
+               }
+       }
+       /* ... and compute the averages. */
+       if (ddo_total->ddo_count != 0) {
+               ddo_total->ddo_dspace /= ddo_total->ddo_count;
+               ddo_total->ddo_mspace /= ddo_total->ddo_count;
+       } else {
+               ASSERT(ddo_total->ddo_dspace == 0);
+               ASSERT(ddo_total->ddo_mspace == 0);
+       }
+ }
+ void
+ ddt_get_dedup_histogram(spa_t *spa, ddt_histogram_t *ddh)
+ {
+       for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
+               ddt_t *ddt = spa->spa_ddt[c];
+               for (enum ddt_type type = 0; type < DDT_TYPES; type++) {
+                       for (enum ddt_class class = 0; class < DDT_CLASSES;
+                           class++) {
+                               ddt_histogram_add(ddh,
+                                   &ddt->ddt_histogram_cache[type][class]);
+                       }
+               }
+       }
+ }
+ void
+ ddt_get_dedup_stats(spa_t *spa, ddt_stat_t *dds_total)
+ {
+       ddt_histogram_t *ddh_total;
+       ddh_total = kmem_zalloc(sizeof (ddt_histogram_t), KM_SLEEP);
+       ddt_get_dedup_histogram(spa, ddh_total);
+       ddt_histogram_stat(dds_total, ddh_total);
+       kmem_free(ddh_total, sizeof (ddt_histogram_t));
+ }
+ uint64_t
+ ddt_get_dedup_dspace(spa_t *spa)
+ {
+       ddt_stat_t dds_total = { 0 };
+       ddt_get_dedup_stats(spa, &dds_total);
+       return (dds_total.dds_ref_dsize - dds_total.dds_dsize);
+ }
+ uint64_t
+ ddt_get_pool_dedup_ratio(spa_t *spa)
+ {
+       ddt_stat_t dds_total = { 0 };
+       ddt_get_dedup_stats(spa, &dds_total);
+       if (dds_total.dds_dsize == 0)
+               return (100);
+       return (dds_total.dds_ref_dsize * 100 / dds_total.dds_dsize);
+ }
+ int
+ ddt_ditto_copies_needed(ddt_t *ddt, ddt_entry_t *dde, ddt_phys_t *ddp_willref)
+ {
+       spa_t *spa = ddt->ddt_spa;
+       uint64_t total_refcnt = 0;
+       uint64_t ditto = spa->spa_dedup_ditto;
+       int total_copies = 0;
+       int desired_copies = 0;
++      int p;
 -      for (int d = 0; d < SPA_DVAS_PER_BP; d++, dva++)
++      for (p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) {
+               ddt_phys_t *ddp = &dde->dde_phys[p];
+               zio_t *zio = dde->dde_lead_zio[p];
+               uint64_t refcnt = ddp->ddp_refcnt;      /* committed refs */
+               if (zio != NULL)
+                       refcnt += zio->io_parent_count; /* pending refs */
+               if (ddp == ddp_willref)
+                       refcnt++;                       /* caller's ref */
+               if (refcnt != 0) {
+                       total_refcnt += refcnt;
+                       total_copies += p;
+               }
+       }
+       if (ditto == 0 || ditto > UINT32_MAX)
+               ditto = UINT32_MAX;
+       if (total_refcnt >= 1)
+               desired_copies++;
+       if (total_refcnt >= ditto)
+               desired_copies++;
+       if (total_refcnt >= ditto * ditto)
+               desired_copies++;
+       return (MAX(desired_copies, total_copies) - total_copies);
+ }
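
As a worked example of the thresholds above: with spa_dedup_ditto set to a hypothetical 100, a total reference count of 1..99 wants one copy, 100..9999 wants two, and 10000 or more wants three; the function then returns only the copies needed beyond those already present or in flight. A small sketch of just the threshold arithmetic:

	static int
	dedup_desired_copies(uint64_t total_refcnt, uint64_t ditto)
	{
		int desired = 0;

		if (ditto == 0 || ditto > UINT32_MAX)
			ditto = UINT32_MAX;
		if (total_refcnt >= 1)
			desired++;
		if (total_refcnt >= ditto)
			desired++;
		if (total_refcnt >= ditto * ditto)
			desired++;
		return (desired);	/* e.g. refcnt 250, ditto 100 -> 2 */
	}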
+ int
+ ddt_ditto_copies_present(ddt_entry_t *dde)
+ {
+       ddt_phys_t *ddp = &dde->dde_phys[DDT_PHYS_DITTO];
+       dva_t *dva = ddp->ddp_dva;
+       int copies = 0 - DVA_GET_GANG(dva);
++      int d;
 -      for (int p = 0; p < DDT_PHYS_TYPES; p++)
++      for (d = 0; d < SPA_DVAS_PER_BP; d++, dva++)
+               if (DVA_IS_VALID(dva))
+                       copies++;
+       ASSERT(copies >= 0 && copies < SPA_DVAS_PER_BP);
+       return (copies);
+ }
+ size_t
+ ddt_compress(void *src, uchar_t *dst, size_t s_len, size_t d_len)
+ {
+       uchar_t *version = dst++;
+       int cpfunc = ZIO_COMPRESS_ZLE;
+       zio_compress_info_t *ci = &zio_compress_table[cpfunc];
+       size_t c_len;
+       ASSERT(d_len >= s_len + 1);     /* no compression plus version byte */
+       c_len = ci->ci_compress(src, dst, s_len, d_len - 1, ci->ci_level);
+       if (c_len == s_len) {
+               cpfunc = ZIO_COMPRESS_OFF;
+               bcopy(src, dst, s_len);
+       }
+       *version = (ZFS_HOST_BYTEORDER & DDT_COMPRESS_BYTEORDER_MASK) | cpfunc;
+       return (c_len + 1);
+ }
+ void
+ ddt_decompress(uchar_t *src, void *dst, size_t s_len, size_t d_len)
+ {
+       uchar_t version = *src++;
+       int cpfunc = version & DDT_COMPRESS_FUNCTION_MASK;
+       zio_compress_info_t *ci = &zio_compress_table[cpfunc];
+       if (ci->ci_decompress != NULL)
+               (void) ci->ci_decompress(src, dst, s_len, d_len, ci->ci_level);
+       else
+               bcopy(src, dst, d_len);
+       if ((version ^ ZFS_HOST_BYTEORDER) & DDT_COMPRESS_BYTEORDER_MASK)
+               byteswap_uint64_array(dst, d_len);
+ }
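
A round-trip sketch of the two helpers above, roughly in the style of how the on-disk DDT code would store a dde_phys array: ddt_compress() prepends a version byte encoding the compression function and host byte order, and ddt_decompress() consumes that byte to undo both. The buffer sizing and argument usage here are illustrative assumptions, not the exact on-disk code path.

	static void
	ddt_phys_roundtrip(ddt_entry_t *dde)
	{
		uchar_t cbuf[sizeof (dde->dde_phys) + 1];	/* +1 version byte */
		size_t c_len;

		c_len = ddt_compress(dde->dde_phys, cbuf,
		    sizeof (dde->dde_phys), sizeof (cbuf));
		/* c_len bytes (version byte included) would be written out */
		ddt_decompress(cbuf, dde->dde_phys, c_len,
		    sizeof (dde->dde_phys));
	}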
+ ddt_t *
+ ddt_select_by_checksum(spa_t *spa, enum zio_checksum c)
+ {
+       return (spa->spa_ddt[c]);
+ }
+ ddt_t *
+ ddt_select(spa_t *spa, const blkptr_t *bp)
+ {
+       return (spa->spa_ddt[BP_GET_CHECKSUM(bp)]);
+ }
+ void
+ ddt_enter(ddt_t *ddt)
+ {
+       mutex_enter(&ddt->ddt_lock);
+ }
+ void
+ ddt_exit(ddt_t *ddt)
+ {
+       mutex_exit(&ddt->ddt_lock);
+ }
+ static ddt_entry_t *
+ ddt_alloc(const ddt_key_t *ddk)
+ {
+       ddt_entry_t *dde;
+       dde = kmem_zalloc(sizeof (ddt_entry_t), KM_SLEEP);
+       cv_init(&dde->dde_cv, NULL, CV_DEFAULT, NULL);
+       dde->dde_key = *ddk;
+       return (dde);
+ }
+ static void
+ ddt_free(ddt_entry_t *dde)
+ {
+       ASSERT(!dde->dde_loading);
++      int p;
 -      for (int i = 0; i < DDT_KEY_WORDS; i++) {
++      for (p = 0; p < DDT_PHYS_TYPES; p++)
+               ASSERT(dde->dde_lead_zio[p] == NULL);
+       if (dde->dde_repair_data != NULL)
+               zio_buf_free(dde->dde_repair_data,
+                   DDK_GET_PSIZE(&dde->dde_key));
+       cv_destroy(&dde->dde_cv);
+       kmem_free(dde, sizeof (*dde));
+ }
+ void
+ ddt_remove(ddt_t *ddt, ddt_entry_t *dde)
+ {
+       ASSERT(MUTEX_HELD(&ddt->ddt_lock));
+       avl_remove(&ddt->ddt_tree, dde);
+       ddt_free(dde);
+ }
+ ddt_entry_t *
+ ddt_lookup(ddt_t *ddt, const blkptr_t *bp, boolean_t add)
+ {
+       ddt_entry_t *dde, dde_search;
+       enum ddt_type type;
+       enum ddt_class class;
+       avl_index_t where;
+       int error;
+       ASSERT(MUTEX_HELD(&ddt->ddt_lock));
+       ddt_key_fill(&dde_search.dde_key, bp);
+       dde = avl_find(&ddt->ddt_tree, &dde_search, &where);
+       if (dde == NULL) {
+               if (!add)
+                       return (NULL);
+               dde = ddt_alloc(&dde_search.dde_key);
+               avl_insert(&ddt->ddt_tree, dde, where);
+       }
+       while (dde->dde_loading)
+               cv_wait(&dde->dde_cv, &ddt->ddt_lock);
+       if (dde->dde_loaded)
+               return (dde);
+       dde->dde_loading = B_TRUE;
+       ddt_exit(ddt);
+       error = ENOENT;
+       for (type = 0; type < DDT_TYPES; type++) {
+               for (class = 0; class < DDT_CLASSES; class++) {
+                       error = ddt_object_lookup(ddt, type, class, dde);
+                       if (error != ENOENT)
+                               break;
+               }
+               if (error != ENOENT)
+                       break;
+       }
+       ASSERT(error == 0 || error == ENOENT);
+       ddt_enter(ddt);
+       ASSERT(dde->dde_loaded == B_FALSE);
+       ASSERT(dde->dde_loading == B_TRUE);
+       dde->dde_type = type;   /* will be DDT_TYPES if no entry found */
+       dde->dde_class = class; /* will be DDT_CLASSES if no entry found */
+       dde->dde_loaded = B_TRUE;
+       dde->dde_loading = B_FALSE;
+       if (error == 0)
+               ddt_stat_update(ddt, dde, -1ULL);
+       cv_broadcast(&dde->dde_cv);
+       return (dde);
+ }
+ void
+ ddt_prefetch(spa_t *spa, const blkptr_t *bp)
+ {
+       ddt_t *ddt;
+       ddt_entry_t dde;
+       if (!BP_GET_DEDUP(bp))
+               return;
+       /*
+        * We remove the DDT once it's empty and only prefetch dedup blocks
+        * when there are entries in the DDT.  Thus no locking is required
+        * as the DDT can't disappear on us.
+        */
+       ddt = ddt_select(spa, bp);
+       ddt_key_fill(&dde.dde_key, bp);
+       for (enum ddt_type type = 0; type < DDT_TYPES; type++) {
+               for (enum ddt_class class = 0; class < DDT_CLASSES; class++) {
+                       ddt_object_prefetch(ddt, type, class, &dde);
+               }
+       }
+ }
+ int
+ ddt_entry_compare(const void *x1, const void *x2)
+ {
+       const ddt_entry_t *dde1 = x1;
+       const ddt_entry_t *dde2 = x2;
+       const uint64_t *u1 = (const uint64_t *)&dde1->dde_key;
+       const uint64_t *u2 = (const uint64_t *)&dde2->dde_key;
++      int i;
 -      for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++, rddp++) {
++      for (i = 0; i < DDT_KEY_WORDS; i++) {
+               if (u1[i] < u2[i])
+                       return (-1);
+               if (u1[i] > u2[i])
+                       return (1);
+       }
+       return (0);
+ }
+ static ddt_t *
+ ddt_table_alloc(spa_t *spa, enum zio_checksum c)
+ {
+       ddt_t *ddt;
+       ddt = kmem_zalloc(sizeof (*ddt), KM_SLEEP);
+       mutex_init(&ddt->ddt_lock, NULL, MUTEX_DEFAULT, NULL);
+       avl_create(&ddt->ddt_tree, ddt_entry_compare,
+           sizeof (ddt_entry_t), offsetof(ddt_entry_t, dde_node));
+       avl_create(&ddt->ddt_repair_tree, ddt_entry_compare,
+           sizeof (ddt_entry_t), offsetof(ddt_entry_t, dde_node));
+       ddt->ddt_checksum = c;
+       ddt->ddt_spa = spa;
+       ddt->ddt_os = spa->spa_meta_objset;
+       return (ddt);
+ }
+ static void
+ ddt_table_free(ddt_t *ddt)
+ {
+       ASSERT(avl_numnodes(&ddt->ddt_tree) == 0);
+       ASSERT(avl_numnodes(&ddt->ddt_repair_tree) == 0);
+       avl_destroy(&ddt->ddt_tree);
+       avl_destroy(&ddt->ddt_repair_tree);
+       mutex_destroy(&ddt->ddt_lock);
+       kmem_free(ddt, sizeof (*ddt));
+ }
+ void
+ ddt_create(spa_t *spa)
+ {
+       spa->spa_dedup_checksum = ZIO_DEDUPCHECKSUM;
+       for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++)
+               spa->spa_ddt[c] = ddt_table_alloc(spa, c);
+ }
+ int
+ ddt_load(spa_t *spa)
+ {
+       int error;
+       ddt_create(spa);
+       error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+           DMU_POOL_DDT_STATS, sizeof (uint64_t), 1,
+           &spa->spa_ddt_stat_object);
+       if (error)
+               return (error == ENOENT ? 0 : error);
+       for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
+               ddt_t *ddt = spa->spa_ddt[c];
+               for (enum ddt_type type = 0; type < DDT_TYPES; type++) {
+                       for (enum ddt_class class = 0; class < DDT_CLASSES;
+                           class++) {
+                               error = ddt_object_load(ddt, type, class);
+                               if (error != 0 && error != ENOENT)
+                                       return (error);
+                       }
+               }
+               /*
+                * Seed the cached histograms.
+                */
+               bcopy(ddt->ddt_histogram, &ddt->ddt_histogram_cache,
+                   sizeof (ddt->ddt_histogram));
+       }
+       return (0);
+ }
+ void
+ ddt_unload(spa_t *spa)
+ {
+       for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
+               if (spa->spa_ddt[c]) {
+                       ddt_table_free(spa->spa_ddt[c]);
+                       spa->spa_ddt[c] = NULL;
+               }
+       }
+ }
+ boolean_t
+ ddt_class_contains(spa_t *spa, enum ddt_class max_class, const blkptr_t *bp)
+ {
+       ddt_t *ddt;
+       ddt_entry_t dde;
+       if (!BP_GET_DEDUP(bp))
+               return (B_FALSE);
+       if (max_class == DDT_CLASS_UNIQUE)
+               return (B_TRUE);
+       ddt = spa->spa_ddt[BP_GET_CHECKSUM(bp)];
+       ddt_key_fill(&dde.dde_key, bp);
+       for (enum ddt_type type = 0; type < DDT_TYPES; type++)
+               for (enum ddt_class class = 0; class <= max_class; class++)
+                       if (ddt_object_lookup(ddt, type, class, &dde) == 0)
+                               return (B_TRUE);
+       return (B_FALSE);
+ }
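
A sketch of the intended use of ddt_class_contains(): a scan or scrub that already walks the DDT up to some class limit can skip a dedup'd block pointer during normal traversal, since that block will be visited via the DDT walk instead. scn_ddt_class_max here stands in for whatever class limit the caller is scanning with; this is an assumed usage pattern, not the exact scan code.

	static boolean_t
	scan_bp_via_ddt(spa_t *spa, enum ddt_class scn_ddt_class_max,
	    const blkptr_t *bp)
	{
		/*
		 * If the bp is dedup'd and its entry falls at or below the
		 * class limit the scan is walking, it is covered by the
		 * ddt_walk() pass rather than the normal traversal.
		 */
		return (BP_GET_DEDUP(bp) &&
		    ddt_class_contains(spa, scn_ddt_class_max, bp));
	}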
+ ddt_entry_t *
+ ddt_repair_start(ddt_t *ddt, const blkptr_t *bp)
+ {
+       ddt_key_t ddk;
+       ddt_entry_t *dde;
+       ddt_key_fill(&ddk, bp);
+       dde = ddt_alloc(&ddk);
+       for (enum ddt_type type = 0; type < DDT_TYPES; type++) {
+               for (enum ddt_class class = 0; class < DDT_CLASSES; class++) {
+                       /*
+                        * We can only do repair if there are multiple copies
+                        * of the block.  For anything in the UNIQUE class,
+                        * there's definitely only one copy, so don't even try.
+                        */
+                       if (class != DDT_CLASS_UNIQUE &&
+                           ddt_object_lookup(ddt, type, class, dde) == 0)
+                               return (dde);
+               }
+       }
+       bzero(dde->dde_phys, sizeof (dde->dde_phys));
+       return (dde);
+ }
+ void
+ ddt_repair_done(ddt_t *ddt, ddt_entry_t *dde)
+ {
+       avl_index_t where;
+       ddt_enter(ddt);
+       if (dde->dde_repair_data != NULL && spa_writeable(ddt->ddt_spa) &&
+           avl_find(&ddt->ddt_repair_tree, dde, &where) == NULL)
+               avl_insert(&ddt->ddt_repair_tree, dde, where);
+       else
+               ddt_free(dde);
+       ddt_exit(ddt);
+ }
+ static void
+ ddt_repair_entry_done(zio_t *zio)
+ {
+       ddt_entry_t *rdde = zio->io_private;
+       ddt_free(rdde);
+ }
+ static void
+ ddt_repair_entry(ddt_t *ddt, ddt_entry_t *dde, ddt_entry_t *rdde, zio_t *rio)
+ {
+       ddt_phys_t *ddp = dde->dde_phys;
+       ddt_phys_t *rddp = rdde->dde_phys;
+       ddt_key_t *ddk = &dde->dde_key;
+       ddt_key_t *rddk = &rdde->dde_key;
+       zio_t *zio;
+       blkptr_t blk;
++      int p;
+       zio = zio_null(rio, rio->io_spa, NULL,
+           ddt_repair_entry_done, rdde, rio->io_flags);
 -      for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
++      for (p = 0; p < DDT_PHYS_TYPES; p++, ddp++, rddp++) {
+               if (ddp->ddp_phys_birth == 0 ||
+                   ddp->ddp_phys_birth != rddp->ddp_phys_birth ||
+                   bcmp(ddp->ddp_dva, rddp->ddp_dva, sizeof (ddp->ddp_dva)))
+                       continue;
+               ddt_bp_create(ddt->ddt_checksum, ddk, ddp, &blk);
+               zio_nowait(zio_rewrite(zio, zio->io_spa, 0, &blk,
+                   rdde->dde_repair_data, DDK_GET_PSIZE(rddk), NULL, NULL,
+                   ZIO_PRIORITY_SYNC_WRITE, ZIO_DDT_CHILD_FLAGS(zio), NULL));
+       }
+       zio_nowait(zio);
+ }
+ static void
+ ddt_repair_table(ddt_t *ddt, zio_t *rio)
+ {
+       spa_t *spa = ddt->ddt_spa;
+       ddt_entry_t *dde, *rdde_next, *rdde;
+       avl_tree_t *t = &ddt->ddt_repair_tree;
+       blkptr_t blk;
+       if (spa_sync_pass(spa) > 1)
+               return;
+       ddt_enter(ddt);
+       for (rdde = avl_first(t); rdde != NULL; rdde = rdde_next) {
+               rdde_next = AVL_NEXT(t, rdde);
+               avl_remove(&ddt->ddt_repair_tree, rdde);
+               ddt_exit(ddt);
+               ddt_bp_create(ddt->ddt_checksum, &rdde->dde_key, NULL, &blk);
+               dde = ddt_repair_start(ddt, &blk);
+               ddt_repair_entry(ddt, dde, rdde, rio);
+               ddt_repair_done(ddt, dde);
+               ddt_enter(ddt);
+       }
+       ddt_exit(ddt);
+ }
+ static void
+ ddt_sync_entry(ddt_t *ddt, ddt_entry_t *dde, dmu_tx_t *tx, uint64_t txg)
+ {
+       dsl_pool_t *dp = ddt->ddt_spa->spa_dsl_pool;
+       ddt_phys_t *ddp = dde->dde_phys;
+       ddt_key_t *ddk = &dde->dde_key;
+       enum ddt_type otype = dde->dde_type;
+       enum ddt_type ntype = DDT_TYPE_CURRENT;
+       enum ddt_class oclass = dde->dde_class;
+       enum ddt_class nclass;
+       uint64_t total_refcnt = 0;
++      int p;
+       ASSERT(dde->dde_loaded);
+       ASSERT(!dde->dde_loading);
++      for (p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
+               ASSERT(dde->dde_lead_zio[p] == NULL);
+               ASSERT((int64_t)ddp->ddp_refcnt >= 0);
+               if (ddp->ddp_phys_birth == 0) {
+                       ASSERT(ddp->ddp_refcnt == 0);
+                       continue;
+               }
+               if (p == DDT_PHYS_DITTO) {
+                       if (ddt_ditto_copies_needed(ddt, dde, NULL) == 0)
+                               ddt_phys_free(ddt, ddk, ddp, txg);
+                       continue;
+               }
+               if (ddp->ddp_refcnt == 0)
+                       ddt_phys_free(ddt, ddk, ddp, txg);
+               total_refcnt += ddp->ddp_refcnt;
+       }
+       if (dde->dde_phys[DDT_PHYS_DITTO].ddp_phys_birth != 0)
+               nclass = DDT_CLASS_DITTO;
+       else if (total_refcnt > 1)
+               nclass = DDT_CLASS_DUPLICATE;
+       else
+               nclass = DDT_CLASS_UNIQUE;
+       if (otype != DDT_TYPES &&
+           (otype != ntype || oclass != nclass || total_refcnt == 0)) {
+               VERIFY(ddt_object_remove(ddt, otype, oclass, dde, tx) == 0);
+               ASSERT(ddt_object_lookup(ddt, otype, oclass, dde) == ENOENT);
+       }
+       if (total_refcnt != 0) {
+               dde->dde_type = ntype;
+               dde->dde_class = nclass;
+               ddt_stat_update(ddt, dde, 0);
+               if (!ddt_object_exists(ddt, ntype, nclass))
+                       ddt_object_create(ddt, ntype, nclass, tx);
+               VERIFY(ddt_object_update(ddt, ntype, nclass, dde, tx) == 0);
+               /*
+                * If the class changes, the order that we scan this bp
+                * changes.  If it decreases, we could miss it, so
+                * scan it right now.  (This covers both class changing
+                * while we are doing ddt_walk(), and when we are
+                * traversing.)
+                */
+               if (nclass < oclass) {
+                       dsl_scan_ddt_entry(dp->dp_scan,
+                           ddt->ddt_checksum, dde, tx);
+               }
+       }
+ }
+ static void
+ ddt_sync_table(ddt_t *ddt, dmu_tx_t *tx, uint64_t txg)
+ {
+       spa_t *spa = ddt->ddt_spa;
+       ddt_entry_t *dde;
+       void *cookie = NULL;
+       if (avl_numnodes(&ddt->ddt_tree) == 0)
+               return;
+       ASSERT(spa->spa_uberblock.ub_version >= SPA_VERSION_DEDUP);
+       if (spa->spa_ddt_stat_object == 0) {
+               spa->spa_ddt_stat_object = zap_create(ddt->ddt_os,
+                   DMU_OT_DDT_STATS, DMU_OT_NONE, 0, tx);
+               VERIFY(zap_add(ddt->ddt_os, DMU_POOL_DIRECTORY_OBJECT,
+                   DMU_POOL_DDT_STATS, sizeof (uint64_t), 1,
+                   &spa->spa_ddt_stat_object, tx) == 0);
+       }
+       while ((dde = avl_destroy_nodes(&ddt->ddt_tree, &cookie)) != NULL) {
+               ddt_sync_entry(ddt, dde, tx, txg);
+               ddt_free(dde);
+       }
+       for (enum ddt_type type = 0; type < DDT_TYPES; type++) {
+               for (enum ddt_class class = 0; class < DDT_CLASSES; class++) {
+                       if (!ddt_object_exists(ddt, type, class))
+                               continue;
+                       ddt_object_sync(ddt, type, class, tx);
+                       if (ddt_object_count(ddt, type, class) == 0)
+                               ddt_object_destroy(ddt, type, class, tx);
+               }
+       }
+       bcopy(ddt->ddt_histogram, &ddt->ddt_histogram_cache,
+           sizeof (ddt->ddt_histogram));
+ }
+ void
+ ddt_sync(spa_t *spa, uint64_t txg)
+ {
+       dmu_tx_t *tx;
+       zio_t *rio = zio_root(spa, NULL, NULL,
+           ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE);
+       ASSERT(spa_syncing_txg(spa) == txg);
+       tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
+       for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
+               ddt_t *ddt = spa->spa_ddt[c];
+               if (ddt == NULL)
+                       continue;
+               ddt_sync_table(ddt, tx, txg);
+               ddt_repair_table(ddt, rio);
+       }
+       (void) zio_wait(rio);
+       dmu_tx_commit(tx);
+ }
+ int
+ ddt_walk(spa_t *spa, ddt_bookmark_t *ddb, ddt_entry_t *dde)
+ {
+       do {
+               do {
+                       do {
+                               ddt_t *ddt = spa->spa_ddt[ddb->ddb_checksum];
+                               int error = ENOENT;
+                               if (ddt_object_exists(ddt, ddb->ddb_type,
+                                   ddb->ddb_class)) {
+                                       error = ddt_object_walk(ddt,
+                                           ddb->ddb_type, ddb->ddb_class,
+                                           &ddb->ddb_cursor, dde);
+                               }
+                               dde->dde_type = ddb->ddb_type;
+                               dde->dde_class = ddb->ddb_class;
+                               if (error == 0)
+                                       return (0);
+                               if (error != ENOENT)
+                                       return (error);
+                               ddb->ddb_cursor = 0;
+                       } while (++ddb->ddb_checksum < ZIO_CHECKSUM_FUNCTIONS);
+                       ddb->ddb_checksum = 0;
+               } while (++ddb->ddb_type < DDT_TYPES);
+               ddb->ddb_type = 0;
+       } while (++ddb->ddb_class < DDT_CLASSES);
+       return (ENOENT);
+ }
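
ddt_walk() iterates every entry of every (checksum, type, class) table and returns ENOENT once the cursor wraps past the last class. A typical caller drives it from a zeroed bookmark, roughly as in this sketch (the bookmark can also be saved and passed back in later to resume the walk); the counting helper itself is hypothetical.

	static uint64_t
	count_ddt_entries(spa_t *spa)
	{
		ddt_bookmark_t ddb;
		ddt_entry_t dde;
		uint64_t n = 0;

		bzero(&ddb, sizeof (ddb));
		while (ddt_walk(spa, &ddb, &dde) == 0)
			n++;	/* one live entry per successful walk step */
		return (n);
	}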
index d864682024026ca44deaf837949f6430daf2ad73,5b87c81c639af00528e1db63967985b678d36b75..ad7a8f74f33cfe300c62cc692fb205017080c523
@@@ -1148,6 -1519,8 +1519,9 @@@ dmu_offset_next(objset_t *os, uint64_t 
  void
  dmu_object_info_from_dnode(dnode_t *dn, dmu_object_info_t *doi)
  {
+       dnode_phys_t *dnp;
++      int i;
        rw_enter(&dn->dn_struct_rwlock, RW_READER);
        mutex_enter(&dn->dn_mtx);
  
        doi->doi_indirection = dn->dn_nlevels;
        doi->doi_checksum = dn->dn_checksum;
        doi->doi_compress = dn->dn_compress;
-       doi->doi_physical_blks = (DN_USED_BYTES(dn->dn_phys) +
-           SPA_MINBLOCKSIZE/2) >> SPA_MINBLOCKSHIFT;
-       doi->doi_max_block_offset = dn->dn_phys->dn_maxblkid;
-       doi->doi_type = dn->dn_type;
-       doi->doi_bonus_size = dn->dn_bonuslen;
-       doi->doi_bonus_type = dn->dn_bonustype;
+       doi->doi_physical_blocks_512 = (DN_USED_BYTES(dnp) + 256) >> 9;
+       doi->doi_max_offset = (dnp->dn_maxblkid + 1) * dn->dn_datablksz;
+       doi->doi_fill_count = 0;
 -      for (int i = 0; i < dnp->dn_nblkptr; i++)
++      for (i = 0; i < dnp->dn_nblkptr; i++)
+               doi->doi_fill_count += dnp->dn_blkptr[i].blk_fill;
  
        mutex_exit(&dn->dn_mtx);
        rw_exit(&dn->dn_struct_rwlock);
index 8bb6ce2e3d201d82cd35ced9fc665e51fa55df93,690e6ecdee6ab84e5611ee49ca8ca295fb25c97a..2ff085e4491fa459bf72543d5b4786e2fb84384f
@@@ -452,16 -502,12 +502,13 @@@ dmu_objset_evict_dbufs(objset_t *os
  }
  
  void
- dmu_objset_evict(dsl_dataset_t *ds, void *arg)
+ dmu_objset_evict(objset_t *os)
  {
-       objset_impl_t *osi = arg;
-       objset_t os;
-       int i;
+       dsl_dataset_t *ds = os->os_dsl_dataset;
++      int t;
  
-       for (i = 0; i < TXG_SIZE; i++) {
-               ASSERT(list_head(&osi->os_dirty_dnodes[i]) == NULL);
-               ASSERT(list_head(&osi->os_free_dnodes[i]) == NULL);
-       }
 -      for (int t = 0; t < TXG_SIZE; t++)
++      for (t = 0; t < TXG_SIZE; t++)
+               ASSERT(!dmu_objset_is_dirty(os, t));
  
        if (ds) {
                if (!dsl_dataset_is_snapshot(ds)) {
@@@ -888,13 -949,10 +950,12 @@@ dmu_objset_sync_dnodes(list_t *list, li
  
  /* ARGSUSED */
  static void
- ready(zio_t *zio, arc_buf_t *abuf, void *arg)
dmu_objset_write_ready(zio_t *zio, arc_buf_t *abuf, void *arg)
  {
 +      int i;
 +
        blkptr_t *bp = zio->io_bp;
-       blkptr_t *bp_orig = &zio->io_bp_orig;
-       objset_impl_t *os = arg;
+       objset_t *os = arg;
        dnode_phys_t *dnp = &os->os_phys->os_meta_dnode;
  
        ASSERT(bp == os->os_rootbp);
         * dnode and user/group accounting objects).
         */
        bp->blk_fill = 0;
 -      for (int i = 0; i < dnp->dn_nblkptr; i++)
 +      for (i = 0; i < dnp->dn_nblkptr; i++)
                bp->blk_fill += dnp->dn_blkptr[i].blk_fill;
+ }
+ /* ARGSUSED */
+ static void
+ dmu_objset_write_done(zio_t *zio, arc_buf_t *abuf, void *arg)
+ {
+       blkptr_t *bp = zio->io_bp;
+       blkptr_t *bp_orig = &zio->io_bp_orig;
+       objset_t *os = arg;
  
        if (zio->io_flags & ZIO_FLAG_IO_REWRITE) {
-               ASSERT(DVA_EQUAL(BP_IDENTITY(bp), BP_IDENTITY(bp_orig)));
+               ASSERT(BP_EQUAL(bp, bp_orig));
        } else {
-               if (zio->io_bp_orig.blk_birth == os->os_synctx->tx_txg)
-                       (void) dsl_dataset_block_kill(os->os_dsl_dataset,
-                           &zio->io_bp_orig, zio, os->os_synctx);
-               dsl_dataset_block_born(os->os_dsl_dataset, bp, os->os_synctx);
+               dsl_dataset_t *ds = os->os_dsl_dataset;
+               dmu_tx_t *tx = os->os_synctx;
+               (void) dsl_dataset_block_kill(ds, bp_orig, tx, B_TRUE);
+               dsl_dataset_block_born(ds, bp, tx);
        }
  }
  
index c6fbeeef0105d019ef9b90575682960da786bab2,5fc062c16b4b4867e196357e02f1a1cb9bc06e06..32dbea622f8af4c2ebea5a540dd9a324a32778f1
@@@ -203,6 -216,6 +216,7 @@@ dmu_tx_count_write(dmu_tx_hold_t *txh, 
        uint64_t start, end, i;
        int min_bs, max_bs, min_ibs, max_ibs, epbs, bits;
        int err = 0;
++      int l;
  
        if (len == 0)
                return;
                 * If this write is not off the end of the file
                 * we need to account for overwrites/unref.
                 */
-               if (start <= dn->dn_maxblkid)
-                       bzero(last, sizeof (dmu_buf_impl_t *) * DN_MAX_LEVELS);
+               if (start <= dn->dn_maxblkid) {
 -                      for (int l = 0; l < DN_MAX_LEVELS; l++)
++                      for (l = 0; l < DN_MAX_LEVELS; l++)
+                               history[l] = -1ULL;
+               }
                while (start <= dn->dn_maxblkid) {
-                       spa_t *spa = txh->txh_tx->tx_pool->dp_spa;
-                       dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset;
                        dmu_buf_impl_t *db;
  
                        rw_enter(&dn->dn_struct_rwlock, RW_READER);
index 58fc786846c258819d41af1a5021389a5c7b07a6,ddd83576c65e8fbc69cb6b6eba180bc89cc0f66f..2e1fff35a9a499127fc7cc4244f17bc0e3627323
@@@ -77,16 -87,14 +87,16 @@@ parent_delta(dsl_dataset_t *ds, int64_
  }
  
  void
- dsl_dataset_block_born(dsl_dataset_t *ds, blkptr_t *bp, dmu_tx_t *tx)
+ dsl_dataset_block_born(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx)
  {
 -      int used = bp_get_dsize_sync(tx->tx_pool->dp_spa, bp);
 -      int compressed = BP_GET_PSIZE(bp);
 -      int uncompressed = BP_GET_UCSIZE(bp);
 +      int used, compressed, uncompressed;
        int64_t delta;
  
-       dprintf_bp(bp, "born, ds=%p\n", ds);
 +      used = bp_get_dasize(tx->tx_pool->dp_spa, bp);
 +      compressed = BP_GET_PSIZE(bp);
 +      uncompressed = BP_GET_UCSIZE(bp);
 +
+       dprintf_bp(bp, "ds=%p", ds);
  
        ASSERT(dmu_tx_is_syncing(tx));
        /* It could have been compressed away to nothing */
Simple merge
index 0000000000000000000000000000000000000000,23c37c7ccfd23923c2fdbaed8e466eb740c6e631..e402dde7c3a2582466413e42d2cb2d2579d872a4
mode 000000,100644..100644
--- /dev/null
@@@ -1,0 -1,1739 +1,1741 @@@
 -      for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
+ /*
+  * CDDL HEADER START
+  *
+  * The contents of this file are subject to the terms of the
+  * Common Development and Distribution License (the "License").
+  * You may not use this file except in compliance with the License.
+  *
+  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+  * or http://www.opensolaris.org/os/licensing.
+  * See the License for the specific language governing permissions
+  * and limitations under the License.
+  *
+  * When distributing Covered Code, include this CDDL HEADER in each
+  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+  * If applicable, add the following below this CDDL HEADER, with the
+  * fields enclosed by brackets "[]" replaced with your own identifying
+  * information: Portions Copyright [yyyy] [name of copyright owner]
+  *
+  * CDDL HEADER END
+  */
+ /*
+  * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+  */
+ #include <sys/dsl_scan.h>
+ #include <sys/dsl_pool.h>
+ #include <sys/dsl_dataset.h>
+ #include <sys/dsl_prop.h>
+ #include <sys/dsl_dir.h>
+ #include <sys/dsl_synctask.h>
+ #include <sys/dnode.h>
+ #include <sys/dmu_tx.h>
+ #include <sys/dmu_objset.h>
+ #include <sys/arc.h>
+ #include <sys/zap.h>
+ #include <sys/zio.h>
+ #include <sys/zfs_context.h>
+ #include <sys/fs/zfs.h>
+ #include <sys/zfs_znode.h>
+ #include <sys/spa_impl.h>
+ #include <sys/vdev_impl.h>
+ #include <sys/zil_impl.h>
+ #include <sys/zio_checksum.h>
+ #include <sys/ddt.h>
+ #include <sys/sa.h>
+ #include <sys/sa_impl.h>
+ #ifdef _KERNEL
+ #include <sys/zfs_vfsops.h>
+ #endif
+ typedef int (scan_cb_t)(dsl_pool_t *, const blkptr_t *, const zbookmark_t *);
+ static scan_cb_t dsl_scan_defrag_cb;
+ static scan_cb_t dsl_scan_scrub_cb;
+ static scan_cb_t dsl_scan_remove_cb;
+ static dsl_syncfunc_t dsl_scan_cancel_sync;
+ static void dsl_scan_sync_state(dsl_scan_t *, dmu_tx_t *tx);
+ int zfs_scan_min_time_ms = 1000; /* min millisecs to scrub per txg */
+ int zfs_free_min_time_ms = 1000; /* min millisecs to free per txg */
+ int zfs_resilver_min_time_ms = 3000; /* min millisecs to resilver per txg */
+ boolean_t zfs_no_scrub_io = B_FALSE; /* set to disable scrub i/o */
+ boolean_t zfs_no_scrub_prefetch = B_FALSE; /* set to disable scrub prefetching */
+ enum ddt_class zfs_scrub_ddt_class_max = DDT_CLASS_DUPLICATE;
+ int dsl_scan_delay_completion = B_FALSE; /* set to delay scan completion */
+ #define       DSL_SCAN_IS_SCRUB_RESILVER(scn) \
+       ((scn)->scn_phys.scn_func == POOL_SCAN_SCRUB || \
+       (scn)->scn_phys.scn_func == POOL_SCAN_RESILVER)
+ extern int zfs_txg_timeout;
+ /* the order has to match pool_scan_type */
+ static scan_cb_t *scan_funcs[POOL_SCAN_FUNCS] = {
+       NULL,
+       dsl_scan_scrub_cb,      /* POOL_SCAN_SCRUB */
+       dsl_scan_scrub_cb,      /* POOL_SCAN_RESILVER */
+ };
+ int
+ dsl_scan_init(dsl_pool_t *dp, uint64_t txg)
+ {
+       int err;
+       dsl_scan_t *scn;
+       spa_t *spa = dp->dp_spa;
+       uint64_t f;
+       scn = dp->dp_scan = kmem_zalloc(sizeof (dsl_scan_t), KM_SLEEP);
+       scn->scn_dp = dp;
+       err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+           "scrub_func", sizeof (uint64_t), 1, &f);
+       if (err == 0) {
+               /*
+                * There was an old-style scrub in progress.  Restart a
+                * new-style scrub from the beginning.
+                */
+               scn->scn_restart_txg = txg;
+               zfs_dbgmsg("old-style scrub was in progress; "
+                   "restarting new-style scrub in txg %llu",
+                   scn->scn_restart_txg);
+               /*
+                * Load the queue obj from the old location so that it
+                * can be freed by dsl_scan_done().
+                */
+               (void) zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+                   "scrub_queue", sizeof (uint64_t), 1,
+                   &scn->scn_phys.scn_queue_obj);
+       } else {
+               err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+                   DMU_POOL_SCAN, sizeof (uint64_t), SCAN_PHYS_NUMINTS,
+                   &scn->scn_phys);
+               if (err == ENOENT)
+                       return (0);
+               else if (err)
+                       return (err);
+               if (scn->scn_phys.scn_state == DSS_SCANNING &&
+                   spa_prev_software_version(dp->dp_spa) < SPA_VERSION_SCAN) {
+                       /*
+                        * A new-type scrub was in progress on an old
+                        * pool, and the pool was accessed by old
+                        * software.  Restart from the beginning, since
+                        * the old software may have changed the pool in
+                        * the meantime.
+                        */
+                       scn->scn_restart_txg = txg;
+                       zfs_dbgmsg("new-style scrub was modified "
+                           "by old software; restarting in txg %llu",
+                           scn->scn_restart_txg);
+               }
+       }
+       spa_scan_stat_init(spa);
+       return (0);
+ }
+ void
+ dsl_scan_fini(dsl_pool_t *dp)
+ {
+       if (dp->dp_scan) {
+               kmem_free(dp->dp_scan, sizeof (dsl_scan_t));
+               dp->dp_scan = NULL;
+       }
+ }
+ /* ARGSUSED */
+ static int
+ dsl_scan_setup_check(void *arg1, void *arg2, dmu_tx_t *tx)
+ {
+       dsl_scan_t *scn = arg1;
+       if (scn->scn_phys.scn_state == DSS_SCANNING)
+               return (EBUSY);
+       return (0);
+ }
+ /* ARGSUSED */
+ static void
+ dsl_scan_setup_sync(void *arg1, void *arg2, dmu_tx_t *tx)
+ {
+       dsl_scan_t *scn = arg1;
+       pool_scan_func_t *funcp = arg2;
+       dmu_object_type_t ot = 0;
+       dsl_pool_t *dp = scn->scn_dp;
+       spa_t *spa = dp->dp_spa;
+       ASSERT(scn->scn_phys.scn_state != DSS_SCANNING);
+       ASSERT(*funcp > POOL_SCAN_NONE && *funcp < POOL_SCAN_FUNCS);
+       bzero(&scn->scn_phys, sizeof (scn->scn_phys));
+       scn->scn_phys.scn_func = *funcp;
+       scn->scn_phys.scn_state = DSS_SCANNING;
+       scn->scn_phys.scn_min_txg = 0;
+       scn->scn_phys.scn_max_txg = tx->tx_txg;
+       scn->scn_phys.scn_ddt_class_max = DDT_CLASSES - 1; /* the entire DDT */
+       scn->scn_phys.scn_start_time = gethrestime_sec();
+       scn->scn_phys.scn_errors = 0;
+       scn->scn_phys.scn_to_examine = spa->spa_root_vdev->vdev_stat.vs_alloc;
+       scn->scn_restart_txg = 0;
+       spa_scan_stat_init(spa);
+       if (DSL_SCAN_IS_SCRUB_RESILVER(scn)) {
+               scn->scn_phys.scn_ddt_class_max = zfs_scrub_ddt_class_max;
+               /* rewrite all disk labels */
+               vdev_config_dirty(spa->spa_root_vdev);
+               if (vdev_resilver_needed(spa->spa_root_vdev,
+                   &scn->scn_phys.scn_min_txg, &scn->scn_phys.scn_max_txg)) {
+                       spa_event_notify(spa, NULL, ESC_ZFS_RESILVER_START);
+               } else {
+                       spa_event_notify(spa, NULL, ESC_ZFS_SCRUB_START);
+               }
+               spa->spa_scrub_started = B_TRUE;
+               /*
+                * If this is an incremental scrub, limit the DDT scrub phase
+                * to just the auto-ditto class (for correctness); the rest
+                * of the scrub should go faster using top-down pruning.
+                */
+               if (scn->scn_phys.scn_min_txg > TXG_INITIAL)
+                       scn->scn_phys.scn_ddt_class_max = DDT_CLASS_DITTO;
+       }
+       /* back to the generic stuff */
+       if (dp->dp_blkstats == NULL) {
+               dp->dp_blkstats =
+                   kmem_alloc(sizeof (zfs_all_blkstats_t), KM_SLEEP);
+       }
+       bzero(dp->dp_blkstats, sizeof (zfs_all_blkstats_t));
+       if (spa_version(spa) < SPA_VERSION_DSL_SCRUB)
+               ot = DMU_OT_ZAP_OTHER;
+       scn->scn_phys.scn_queue_obj = zap_create(dp->dp_meta_objset,
+           ot ? ot : DMU_OT_SCAN_QUEUE, DMU_OT_NONE, 0, tx);
+       dsl_scan_sync_state(scn, tx);
+       spa_history_log_internal(LOG_POOL_SCAN, spa, tx,
+           "func=%u mintxg=%llu maxtxg=%llu",
+           *funcp, scn->scn_phys.scn_min_txg, scn->scn_phys.scn_max_txg);
+ }
+ /* ARGSUSED */
+ static void
+ dsl_scan_done(dsl_scan_t *scn, boolean_t complete, dmu_tx_t *tx)
+ {
+       static const char *old_names[] = {
+               "scrub_bookmark",
+               "scrub_ddt_bookmark",
+               "scrub_ddt_class_max",
+               "scrub_queue",
+               "scrub_min_txg",
+               "scrub_max_txg",
+               "scrub_func",
+               "scrub_errors",
+               NULL
+       };
+       dsl_pool_t *dp = scn->scn_dp;
+       spa_t *spa = dp->dp_spa;
+       int i;
+       /* Remove any remnants of an old-style scrub. */
+       for (i = 0; old_names[i]; i++) {
+               (void) zap_remove(dp->dp_meta_objset,
+                   DMU_POOL_DIRECTORY_OBJECT, old_names[i], tx);
+       }
+       if (scn->scn_phys.scn_queue_obj != 0) {
+               VERIFY(0 == dmu_object_free(dp->dp_meta_objset,
+                   scn->scn_phys.scn_queue_obj, tx));
+               scn->scn_phys.scn_queue_obj = 0;
+       }
+       /*
+        * If we were "restarted" from a stopped state, don't bother
+        * with anything else.
+        */
+       if (scn->scn_phys.scn_state != DSS_SCANNING)
+               return;
+       if (complete)
+               scn->scn_phys.scn_state = DSS_FINISHED;
+       else
+               scn->scn_phys.scn_state = DSS_CANCELED;
+       spa_history_log_internal(LOG_POOL_SCAN_DONE, spa, tx,
+           "complete=%u", complete);
+       if (DSL_SCAN_IS_SCRUB_RESILVER(scn)) {
+               mutex_enter(&spa->spa_scrub_lock);
+               while (spa->spa_scrub_inflight > 0) {
+                       cv_wait(&spa->spa_scrub_io_cv,
+                           &spa->spa_scrub_lock);
+               }
+               mutex_exit(&spa->spa_scrub_lock);
+               spa->spa_scrub_started = B_FALSE;
+               spa->spa_scrub_active = B_FALSE;
+               /*
+                * If the scrub/resilver completed, update all DTLs to
+                * reflect this.  Whether it succeeded or not, vacate
+                * all temporary scrub DTLs.
+                */
+               vdev_dtl_reassess(spa->spa_root_vdev, tx->tx_txg,
+                   complete ? scn->scn_phys.scn_max_txg : 0, B_TRUE);
+               if (complete) {
+                       spa_event_notify(spa, NULL, scn->scn_phys.scn_min_txg ?
+                           ESC_ZFS_RESILVER_FINISH : ESC_ZFS_SCRUB_FINISH);
+               }
+               spa_errlog_rotate(spa);
+               /*
+                * We may have finished replacing a device.
+                * Let the async thread assess this and handle the detach.
+                */
+               spa_async_request(spa, SPA_ASYNC_RESILVER_DONE);
+       }
+       scn->scn_phys.scn_end_time = gethrestime_sec();
+ }
+ /* ARGSUSED */
+ static int
+ dsl_scan_cancel_check(void *arg1, void *arg2, dmu_tx_t *tx)
+ {
+       dsl_scan_t *scn = arg1;
+       if (scn->scn_phys.scn_state != DSS_SCANNING)
+               return (ENOENT);
+       return (0);
+ }
+ /* ARGSUSED */
+ static void
+ dsl_scan_cancel_sync(void *arg1, void *arg2, dmu_tx_t *tx)
+ {
+       dsl_scan_t *scn = arg1;
+       dsl_scan_done(scn, B_FALSE, tx);
+       dsl_scan_sync_state(scn, tx);
+ }
+ int
+ dsl_scan_cancel(dsl_pool_t *dp)
+ {
+       boolean_t complete = B_FALSE;
+       int err;
+       err = dsl_sync_task_do(dp, dsl_scan_cancel_check,
+           dsl_scan_cancel_sync, dp->dp_scan, &complete, 3);
+       return (err);
+ }
+ static void dsl_scan_visitbp(blkptr_t *bp,
+     const zbookmark_t *zb, dnode_phys_t *dnp, arc_buf_t *pbuf,
+     dsl_dataset_t *ds, dsl_scan_t *scn, dmu_objset_type_t ostype,
+     dmu_tx_t *tx);
+ static void dsl_scan_visitdnode(dsl_scan_t *, dsl_dataset_t *ds,
+     dmu_objset_type_t ostype,
+     dnode_phys_t *dnp, arc_buf_t *buf, uint64_t object, dmu_tx_t *tx);
+ void
+ dsl_free(dsl_pool_t *dp, uint64_t txg, const blkptr_t *bp)
+ {
+       zio_free(dp->dp_spa, txg, bp);
+ }
+ void
+ dsl_free_sync(zio_t *pio, dsl_pool_t *dp, uint64_t txg, const blkptr_t *bpp)
+ {
+       ASSERT(dsl_pool_sync_context(dp));
+       zio_nowait(zio_free_sync(pio, dp->dp_spa, txg, bpp, pio->io_flags));
+ }
+ int
+ dsl_read(zio_t *pio, spa_t *spa, const blkptr_t *bpp, arc_buf_t *pbuf,
+     arc_done_func_t *done, void *private, int priority, int zio_flags,
+     uint32_t *arc_flags, const zbookmark_t *zb)
+ {
+       return (arc_read(pio, spa, bpp, pbuf, done, private,
+           priority, zio_flags, arc_flags, zb));
+ }
+ int
+ dsl_read_nolock(zio_t *pio, spa_t *spa, const blkptr_t *bpp,
+     arc_done_func_t *done, void *private, int priority, int zio_flags,
+     uint32_t *arc_flags, const zbookmark_t *zb)
+ {
+       return (arc_read_nolock(pio, spa, bpp, done, private,
+           priority, zio_flags, arc_flags, zb));
+ }
+ static boolean_t
+ bookmark_is_zero(const zbookmark_t *zb)
+ {
+       return (zb->zb_objset == 0 && zb->zb_object == 0 &&
+           zb->zb_level == 0 && zb->zb_blkid == 0);
+ }
+ /* dnp is the dnode for zb1->zb_object */
+ static boolean_t
+ bookmark_is_before(const dnode_phys_t *dnp, const zbookmark_t *zb1,
+     const zbookmark_t *zb2)
+ {
+       uint64_t zb1nextL0, zb2thisobj;
+       ASSERT(zb1->zb_objset == zb2->zb_objset);
+       ASSERT(zb2->zb_level == 0);
+       /*
+        * A bookmark in the deadlist is considered to be after
+        * everything else.
+        */
+       if (zb2->zb_object == DMU_DEADLIST_OBJECT)
+               return (B_TRUE);
+       /* The objset_phys_t isn't before anything. */
+       if (dnp == NULL)
+               return (B_FALSE);
+       zb1nextL0 = (zb1->zb_blkid + 1) <<
+           ((zb1->zb_level) * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT));
+       zb2thisobj = zb2->zb_object ? zb2->zb_object :
+           zb2->zb_blkid << (DNODE_BLOCK_SHIFT - DNODE_SHIFT);
+       if (zb1->zb_object == DMU_META_DNODE_OBJECT) {
+               uint64_t nextobj = zb1nextL0 *
+                   (dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT) >> DNODE_SHIFT;
+               return (nextobj <= zb2thisobj);
+       }
+       if (zb1->zb_object < zb2thisobj)
+               return (B_TRUE);
+       if (zb1->zb_object > zb2thisobj)
+               return (B_FALSE);
+       if (zb2->zb_object == DMU_META_DNODE_OBJECT)
+               return (B_FALSE);
+       return (zb1nextL0 <= zb2->zb_blkid);
+ }
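For reference, the zb1nextL0 arithmetic above can be exercised on its own. The sketch below is a minimal user-space approximation; SPA_BLKPTRSHIFT and the example indirect-block shift are assumed values for illustration, not taken from this change.

#include <stdio.h>
#include <stdint.h>

#define SPA_BLKPTRSHIFT 7       /* log2 of an assumed 128-byte blkptr_t */

/* First level-0 blkid that falls after the subtree rooted at (level, blkid). */
static uint64_t
next_l0_blkid(uint64_t blkid, int level, int indblkshift)
{
        return ((blkid + 1) << (level * (indblkshift - SPA_BLKPTRSHIFT)));
}

int
main(void)
{
        /* A level-2 block with a 128K indirect shift (17) spans 1M level-0
         * blocks, so blkid 3 at level 2 ends just before level-0 blkid 4194304. */
        printf("%llu\n", (unsigned long long)next_l0_blkid(3, 2, 17));
        return (0);
}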
+ static uint64_t
+ dsl_scan_ds_maxtxg(dsl_dataset_t *ds)
+ {
+       uint64_t smt = ds->ds_dir->dd_pool->dp_scan->scn_phys.scn_max_txg;
+       if (dsl_dataset_is_snapshot(ds))
+               return (MIN(smt, ds->ds_phys->ds_creation_txg));
+       return (smt);
+ }
+ static void
+ dsl_scan_sync_state(dsl_scan_t *scn, dmu_tx_t *tx)
+ {
+       VERIFY(0 == zap_update(scn->scn_dp->dp_meta_objset,
+           DMU_POOL_DIRECTORY_OBJECT,
+           DMU_POOL_SCAN, sizeof (uint64_t), SCAN_PHYS_NUMINTS,
+           &scn->scn_phys, tx));
+ }
+ static boolean_t
+ dsl_scan_check_pause(dsl_scan_t *scn, const zbookmark_t *zb)
+ {
+       uint64_t elapsed_nanosecs;
+       int mintime;
+       /* we never skip user/group accounting objects */
+       if (zb && (int64_t)zb->zb_object < 0)
+               return (B_FALSE);
+       if (scn->scn_pausing)
+               return (B_TRUE); /* we're already pausing */
+       if (!bookmark_is_zero(&scn->scn_phys.scn_bookmark))
+               return (B_FALSE); /* we're resuming */
+       /* We only know how to resume from level-0 blocks. */
+       if (zb && zb->zb_level != 0)
+               return (B_FALSE);
+       mintime = (scn->scn_phys.scn_func == POOL_SCAN_RESILVER) ?
+           zfs_resilver_min_time_ms : zfs_scan_min_time_ms;
+       elapsed_nanosecs = gethrtime() - scn->scn_sync_start_time;
+       if (elapsed_nanosecs / NANOSEC > zfs_txg_timeout ||
+           (elapsed_nanosecs / MICROSEC > mintime &&
+           txg_sync_waiting(scn->scn_dp)) ||
+           spa_shutting_down(scn->scn_dp->dp_spa)) {
+               if (zb) {
+                       dprintf("pausing at bookmark %llx/%llx/%llx/%llx\n",
+                           (longlong_t)zb->zb_objset,
+                           (longlong_t)zb->zb_object,
+                           (longlong_t)zb->zb_level,
+                           (longlong_t)zb->zb_blkid);
+                       scn->scn_phys.scn_bookmark = *zb;
+               }
+               dprintf("pausing at DDT bookmark %llx/%llx/%llx/%llx\n",
+                   (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_class,
+                   (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_type,
+                   (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_checksum,
+                   (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_cursor);
+               scn->scn_pausing = B_TRUE;
+               return (B_TRUE);
+       }
+       return (B_FALSE);
+ }
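The pause check above mixes three time scales: elapsed nanoseconds since the sync started, a per-function minimum expressed in milliseconds, and zfs_txg_timeout in seconds. A minimal standalone sketch of that comparison, with assumed example values, is:

#include <stdio.h>
#include <stdint.h>

#define NANOSEC  1000000000LL
#define MICROSEC 1000000LL

/* Mirrors the elapsed-time portion of the pause decision; units as above. */
static int
should_pause(int64_t elapsed_ns, int mintime_ms, int txg_timeout_s,
    int sync_waiting, int shutting_down)
{
        return (elapsed_ns / NANOSEC > txg_timeout_s ||
            (elapsed_ns / MICROSEC > mintime_ms && sync_waiting) ||
            shutting_down);
}

int
main(void)
{
        /* 1.5s elapsed, 1000ms scrub minimum, 5s txg timeout, sync waiting. */
        printf("pause=%d\n",
            should_pause(1500 * MICROSEC, 1000, 5, 1, 0));
        return (0);
}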
+ typedef struct zil_scan_arg {
+       dsl_pool_t      *zsa_dp;
+       zil_header_t    *zsa_zh;
+ } zil_scan_arg_t;
+ /* ARGSUSED */
+ static int
+ dsl_scan_zil_block(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg)
+ {
+       zil_scan_arg_t *zsa = arg;
+       dsl_pool_t *dp = zsa->zsa_dp;
+       dsl_scan_t *scn = dp->dp_scan;
+       zil_header_t *zh = zsa->zsa_zh;
+       zbookmark_t zb;
+       if (bp->blk_birth <= scn->scn_phys.scn_cur_min_txg)
+               return (0);
+       /*
+        * One block ("stubby") can be allocated a long time ago; we
+        * want to visit that one because it has been allocated
+        * (on-disk) even if it hasn't been claimed (even though for
+        * scrub there's nothing to do to it).
+        */
+       if (claim_txg == 0 && bp->blk_birth >= spa_first_txg(dp->dp_spa))
+               return (0);
+       SET_BOOKMARK(&zb, zh->zh_log.blk_cksum.zc_word[ZIL_ZC_OBJSET],
+           ZB_ZIL_OBJECT, ZB_ZIL_LEVEL, bp->blk_cksum.zc_word[ZIL_ZC_SEQ]);
+       VERIFY(0 == scan_funcs[scn->scn_phys.scn_func](dp, bp, &zb));
+       return (0);
+ }
+ /* ARGSUSED */
+ static int
+ dsl_scan_zil_record(zilog_t *zilog, lr_t *lrc, void *arg, uint64_t claim_txg)
+ {
+       if (lrc->lrc_txtype == TX_WRITE) {
+               zil_scan_arg_t *zsa = arg;
+               dsl_pool_t *dp = zsa->zsa_dp;
+               dsl_scan_t *scn = dp->dp_scan;
+               zil_header_t *zh = zsa->zsa_zh;
+               lr_write_t *lr = (lr_write_t *)lrc;
+               blkptr_t *bp = &lr->lr_blkptr;
+               zbookmark_t zb;
+               if (bp->blk_birth <= scn->scn_phys.scn_cur_min_txg)
+                       return (0);
+               /*
+                * birth can be < claim_txg if this record's txg is
+                * already txg sync'ed (but this log block contains
+                * other records that are not synced)
+                */
+               if (claim_txg == 0 || bp->blk_birth < claim_txg)
+                       return (0);
+               SET_BOOKMARK(&zb, zh->zh_log.blk_cksum.zc_word[ZIL_ZC_OBJSET],
+                   lr->lr_foid, ZB_ZIL_LEVEL,
+                   lr->lr_offset / BP_GET_LSIZE(bp));
+               VERIFY(0 == scan_funcs[scn->scn_phys.scn_func](dp, bp, &zb));
+       }
+       return (0);
+ }
+ static void
+ dsl_scan_zil(dsl_pool_t *dp, zil_header_t *zh)
+ {
+       uint64_t claim_txg = zh->zh_claim_txg;
+       zil_scan_arg_t zsa = { dp, zh };
+       zilog_t *zilog;
+       /*
+        * We only want to visit blocks that have been claimed but not yet
+        * replayed (or, in read-only mode, blocks that *would* be claimed).
+        */
+       if (claim_txg == 0 && spa_writeable(dp->dp_spa))
+               return;
+       zilog = zil_alloc(dp->dp_meta_objset, zh);
+       (void) zil_parse(zilog, dsl_scan_zil_block, dsl_scan_zil_record, &zsa,
+           claim_txg);
+       zil_free(zilog);
+ }
+ /* ARGSUSED */
+ static void
+ dsl_scan_prefetch(dsl_scan_t *scn, arc_buf_t *buf, blkptr_t *bp,
+     uint64_t objset, uint64_t object, uint64_t blkid)
+ {
+       zbookmark_t czb;
+       uint32_t flags = ARC_NOWAIT | ARC_PREFETCH;
+       if (zfs_no_scrub_prefetch)
+               return;
+       if (BP_IS_HOLE(bp) || bp->blk_birth <= scn->scn_phys.scn_min_txg ||
+           (BP_GET_LEVEL(bp) == 0 && BP_GET_TYPE(bp) != DMU_OT_DNODE))
+               return;
+       SET_BOOKMARK(&czb, objset, object, BP_GET_LEVEL(bp), blkid);
+       /*
+        * XXX need to make sure all of these arc_read() prefetches are
+        * done before setting xlateall (similar to dsl_read())
+        */
+       (void) arc_read(scn->scn_zio_root, scn->scn_dp->dp_spa, bp,
+           buf, NULL, NULL, ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL,
+           &flags, &czb);
+ }
+ static boolean_t
+ dsl_scan_check_resume(dsl_scan_t *scn, const dnode_phys_t *dnp,
+     const zbookmark_t *zb)
+ {
+       /*
+        * We never skip over user/group accounting objects (obj<0)
+        */
+       if (!bookmark_is_zero(&scn->scn_phys.scn_bookmark) &&
+           (int64_t)zb->zb_object >= 0) {
+               /*
+                * If we already visited this bp & everything below (in
+                * a prior txg sync), don't bother doing it again.
+                */
+               if (bookmark_is_before(dnp, zb, &scn->scn_phys.scn_bookmark))
+                       return (B_TRUE);
+               /*
+                * If we found the block we're trying to resume from, or
+                * we went past it to a different object, zero it out to
+                * indicate that it's OK to start checking for pausing
+                * again.
+                */
+               if (bcmp(zb, &scn->scn_phys.scn_bookmark, sizeof (*zb)) == 0 ||
+                   zb->zb_object > scn->scn_phys.scn_bookmark.zb_object) {
+                       dprintf("resuming at %llx/%llx/%llx/%llx\n",
+                           (longlong_t)zb->zb_objset,
+                           (longlong_t)zb->zb_object,
+                           (longlong_t)zb->zb_level,
+                           (longlong_t)zb->zb_blkid);
+                       bzero(&scn->scn_phys.scn_bookmark, sizeof (*zb));
+               }
+       }
+       return (B_FALSE);
+ }
+ /*
+  * Return nonzero on i/o error.
+  * Return new buf to write out in *bufp.
+  */
+ static int
+ dsl_scan_recurse(dsl_scan_t *scn, dsl_dataset_t *ds, dmu_objset_type_t ostype,
+     dnode_phys_t *dnp, const blkptr_t *bp,
+     const zbookmark_t *zb, dmu_tx_t *tx, arc_buf_t **bufp)
+ {
+       dsl_pool_t *dp = scn->scn_dp;
+       int err;
+       if (BP_GET_LEVEL(bp) > 0) {
+               uint32_t flags = ARC_WAIT;
+               int i;
+               blkptr_t *cbp;
+               int epb = BP_GET_LSIZE(bp) >> SPA_BLKPTRSHIFT;
+               err = arc_read_nolock(NULL, dp->dp_spa, bp,
+                   arc_getbuf_func, bufp,
+                   ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb);
+               if (err) {
+                       scn->scn_phys.scn_errors++;
+                       return (err);
+               }
+               for (i = 0, cbp = (*bufp)->b_data; i < epb; i++, cbp++) {
+                       dsl_scan_prefetch(scn, *bufp, cbp, zb->zb_objset,
+                           zb->zb_object, zb->zb_blkid * epb + i);
+               }
+               for (i = 0, cbp = (*bufp)->b_data; i < epb; i++, cbp++) {
+                       zbookmark_t czb;
+                       SET_BOOKMARK(&czb, zb->zb_objset, zb->zb_object,
+                           zb->zb_level - 1,
+                           zb->zb_blkid * epb + i);
+                       dsl_scan_visitbp(cbp, &czb, dnp,
+                           *bufp, ds, scn, ostype, tx);
+               }
+       } else if (BP_GET_TYPE(bp) == DMU_OT_USERGROUP_USED) {
+               uint32_t flags = ARC_WAIT;
+               err = arc_read_nolock(NULL, dp->dp_spa, bp,
+                   arc_getbuf_func, bufp,
+                   ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb);
+               if (err) {
+                       scn->scn_phys.scn_errors++;
+                       return (err);
+               }
+       } else if (BP_GET_TYPE(bp) == DMU_OT_DNODE) {
+               uint32_t flags = ARC_WAIT;
+               dnode_phys_t *cdnp;
+               int i, j;
+               int epb = BP_GET_LSIZE(bp) >> DNODE_SHIFT;
+               err = arc_read_nolock(NULL, dp->dp_spa, bp,
+                   arc_getbuf_func, bufp,
+                   ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb);
+               if (err) {
+                       scn->scn_phys.scn_errors++;
+                       return (err);
+               }
+               for (i = 0, cdnp = (*bufp)->b_data; i < epb; i++, cdnp++) {
+                       for (j = 0; j < cdnp->dn_nblkptr; j++) {
+                               blkptr_t *cbp = &cdnp->dn_blkptr[j];
+                               dsl_scan_prefetch(scn, *bufp, cbp,
+                                   zb->zb_objset, zb->zb_blkid * epb + i, j);
+                       }
+               }
+               for (i = 0, cdnp = (*bufp)->b_data; i < epb; i++, cdnp++) {
+                       dsl_scan_visitdnode(scn, ds, ostype,
+                           cdnp, *bufp, zb->zb_blkid * epb + i, tx);
+               }
+       } else if (BP_GET_TYPE(bp) == DMU_OT_OBJSET) {
+               uint32_t flags = ARC_WAIT;
+               objset_phys_t *osp;
+               err = arc_read_nolock(NULL, dp->dp_spa, bp,
+                   arc_getbuf_func, bufp,
+                   ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb);
+               if (err) {
+                       scn->scn_phys.scn_errors++;
+                       return (err);
+               }
+               osp = (*bufp)->b_data;
+               if (DSL_SCAN_IS_SCRUB_RESILVER(scn))
+                       dsl_scan_zil(dp, &osp->os_zil_header);
+               dsl_scan_visitdnode(scn, ds, osp->os_type,
+                   &osp->os_meta_dnode, *bufp, DMU_META_DNODE_OBJECT, tx);
+               if (OBJSET_BUF_HAS_USERUSED(*bufp)) {
+                       /*
+                        * We also always visit user/group accounting
+                        * objects, and never skip them, even if we are
+                        * pausing.  This is necessary so that the space
+                        * deltas from this txg get integrated.
+                        */
+                       dsl_scan_visitdnode(scn, ds, osp->os_type,
+                           &osp->os_groupused_dnode, *bufp,
+                           DMU_GROUPUSED_OBJECT, tx);
+                       dsl_scan_visitdnode(scn, ds, osp->os_type,
+                           &osp->os_userused_dnode, *bufp,
+                           DMU_USERUSED_OBJECT, tx);
+               }
+       }
+       return (0);
+ }
+ static void
+ dsl_scan_visitdnode(dsl_scan_t *scn, dsl_dataset_t *ds,
+     dmu_objset_type_t ostype, dnode_phys_t *dnp, arc_buf_t *buf,
+     uint64_t object, dmu_tx_t *tx)
+ {
+       int j;
+       for (j = 0; j < dnp->dn_nblkptr; j++) {
+               zbookmark_t czb;
+               SET_BOOKMARK(&czb, ds ? ds->ds_object : 0, object,
+                   dnp->dn_nlevels - 1, j);
+               dsl_scan_visitbp(&dnp->dn_blkptr[j],
+                   &czb, dnp, buf, ds, scn, ostype, tx);
+       }
+       if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) {
+               zbookmark_t czb;
+               SET_BOOKMARK(&czb, ds ? ds->ds_object : 0, object,
+                   0, DMU_SPILL_BLKID);
+               dsl_scan_visitbp(&dnp->dn_spill,
+                   &czb, dnp, buf, ds, scn, ostype, tx);
+       }
+ }
+ /*
+  * The arguments are in this order because mdb can only print the
+  * first 5; we want them to be useful.
+  */
+ static void
+ dsl_scan_visitbp(blkptr_t *bp, const zbookmark_t *zb,
+     dnode_phys_t *dnp, arc_buf_t *pbuf,
+     dsl_dataset_t *ds, dsl_scan_t *scn, dmu_objset_type_t ostype,
+     dmu_tx_t *tx)
+ {
+       dsl_pool_t *dp = scn->scn_dp;
+       arc_buf_t *buf = NULL;
+       blkptr_t bp_toread = *bp;
+       /* ASSERT(pbuf == NULL || arc_released(pbuf)); */
+       if (dsl_scan_check_pause(scn, zb))
+               return;
+       if (dsl_scan_check_resume(scn, dnp, zb))
+               return;
+       if (bp->blk_birth == 0)
+               return;
+       scn->scn_visited_this_txg++;
+       dprintf_bp(bp,
+           "visiting ds=%p/%llu zb=%llx/%llx/%llx/%llx buf=%p bp=%p",
+           ds, ds ? ds->ds_object : 0,
+           zb->zb_objset, zb->zb_object, zb->zb_level, zb->zb_blkid,
+           pbuf, bp);
+       if (bp->blk_birth <= scn->scn_phys.scn_cur_min_txg)
+               return;
+       if (BP_GET_TYPE(bp) != DMU_OT_USERGROUP_USED) {
+               /*
+                * For non-user-accounting blocks, we need to read the
+                * new bp (from a deleted snapshot, found in
+                * check_existing_xlation).  If we used the old bp,
+                * pointers inside this block from before we resumed
+                * would be untranslated.
+                *
+                * For user-accounting blocks, we need to read the old
+                * bp, because we will apply the entire space delta to
+                * it (original untranslated -> translations from
+                * deleted snap -> now).
+                */
+               bp_toread = *bp;
+       }
+       if (dsl_scan_recurse(scn, ds, ostype, dnp, &bp_toread, zb, tx,
+           &buf) != 0)
+               return;
+       /*
+        * If dsl_scan_ddt() has already visited this block, it will have
+        * already done any translations or scrubbing, so don't call the
+        * callback again.
+        */
+       if (ddt_class_contains(dp->dp_spa,
+           scn->scn_phys.scn_ddt_class_max, bp)) {
+               ASSERT(buf == NULL);
+               return;
+       }
+       /*
+        * If this block is from the future (after cur_max_txg), then we
+        * are doing this on behalf of a deleted snapshot, and we will
+        * revisit the future block on the next pass of this dataset.
+        * Don't scan it now unless we need to because something
+        * under it was modified.
+        */
+       if (bp->blk_birth <= scn->scn_phys.scn_cur_max_txg) {
+               scan_funcs[scn->scn_phys.scn_func](dp, bp, zb);
+       }
+       if (buf)
+               (void) arc_buf_remove_ref(buf, &buf);
+ }
+ static void
+ dsl_scan_visit_rootbp(dsl_scan_t *scn, dsl_dataset_t *ds, blkptr_t *bp,
+     dmu_tx_t *tx)
+ {
+       zbookmark_t zb;
+       SET_BOOKMARK(&zb, ds ? ds->ds_object : DMU_META_OBJSET,
+           ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID);
+       dsl_scan_visitbp(bp, &zb, NULL, NULL,
+           ds, scn, DMU_OST_NONE, tx);
+       dprintf_ds(ds, "finished scan%s", "");
+ }
+ void
+ dsl_scan_ds_destroyed(dsl_dataset_t *ds, dmu_tx_t *tx)
+ {
+       dsl_pool_t *dp = ds->ds_dir->dd_pool;
+       dsl_scan_t *scn = dp->dp_scan;
+       uint64_t mintxg;
+       if (scn->scn_phys.scn_state != DSS_SCANNING)
+               return;
+       if (scn->scn_phys.scn_bookmark.zb_objset == ds->ds_object) {
+               if (dsl_dataset_is_snapshot(ds)) {
+                       /* Note, scn_cur_{min,max}_txg stays the same. */
+                       scn->scn_phys.scn_bookmark.zb_objset =
+                           ds->ds_phys->ds_next_snap_obj;
+                       zfs_dbgmsg("destroying ds %llu; currently traversing; "
+                           "reset zb_objset to %llu",
+                           (u_longlong_t)ds->ds_object,
+                           (u_longlong_t)ds->ds_phys->ds_next_snap_obj);
+                       scn->scn_phys.scn_flags |= DSF_VISIT_DS_AGAIN;
+               } else {
+                       SET_BOOKMARK(&scn->scn_phys.scn_bookmark,
+                           ZB_DESTROYED_OBJSET, 0, 0, 0);
+                       zfs_dbgmsg("destroying ds %llu; currently traversing; "
+                           "reset bookmark to -1,0,0,0",
+                           (u_longlong_t)ds->ds_object);
+               }
+       } else if (zap_lookup_int_key(dp->dp_meta_objset,
+           scn->scn_phys.scn_queue_obj, ds->ds_object, &mintxg) == 0) {
+               ASSERT3U(ds->ds_phys->ds_num_children, <=, 1);
+               VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset,
+                   scn->scn_phys.scn_queue_obj, ds->ds_object, tx));
+               if (dsl_dataset_is_snapshot(ds)) {
+                       /*
+                        * We keep the same mintxg; it could be >
+                        * ds_creation_txg if the previous snapshot was
+                        * deleted too.
+                        */
+                       VERIFY(zap_add_int_key(dp->dp_meta_objset,
+                           scn->scn_phys.scn_queue_obj,
+                           ds->ds_phys->ds_next_snap_obj, mintxg, tx) == 0);
+                       zfs_dbgmsg("destroying ds %llu; in queue; "
+                           "replacing with %llu",
+                           (u_longlong_t)ds->ds_object,
+                           (u_longlong_t)ds->ds_phys->ds_next_snap_obj);
+               } else {
+                       zfs_dbgmsg("destroying ds %llu; in queue; removing",
+                           (u_longlong_t)ds->ds_object);
+               }
+       } else {
+               zfs_dbgmsg("destroying ds %llu; ignoring",
+                   (u_longlong_t)ds->ds_object);
+       }
+       /*
+        * dsl_scan_sync() should be called after this, and should sync
+        * out our changed state, but just to be safe, do it here.
+        */
+       dsl_scan_sync_state(scn, tx);
+ }
+ void
+ dsl_scan_ds_snapshotted(dsl_dataset_t *ds, dmu_tx_t *tx)
+ {
+       dsl_pool_t *dp = ds->ds_dir->dd_pool;
+       dsl_scan_t *scn = dp->dp_scan;
+       uint64_t mintxg;
+       if (scn->scn_phys.scn_state != DSS_SCANNING)
+               return;
+       ASSERT(ds->ds_phys->ds_prev_snap_obj != 0);
+       if (scn->scn_phys.scn_bookmark.zb_objset == ds->ds_object) {
+               scn->scn_phys.scn_bookmark.zb_objset =
+                   ds->ds_phys->ds_prev_snap_obj;
+               zfs_dbgmsg("snapshotting ds %llu; currently traversing; "
+                   "reset zb_objset to %llu",
+                   (u_longlong_t)ds->ds_object,
+                   (u_longlong_t)ds->ds_phys->ds_prev_snap_obj);
+       } else if (zap_lookup_int_key(dp->dp_meta_objset,
+           scn->scn_phys.scn_queue_obj, ds->ds_object, &mintxg) == 0) {
+               VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset,
+                   scn->scn_phys.scn_queue_obj, ds->ds_object, tx));
+               VERIFY(zap_add_int_key(dp->dp_meta_objset,
+                   scn->scn_phys.scn_queue_obj,
+                   ds->ds_phys->ds_prev_snap_obj, mintxg, tx) == 0);
+               zfs_dbgmsg("snapshotting ds %llu; in queue; "
+                   "replacing with %llu",
+                   (u_longlong_t)ds->ds_object,
+                   (u_longlong_t)ds->ds_phys->ds_prev_snap_obj);
+       }
+       dsl_scan_sync_state(scn, tx);
+ }
+ void
+ dsl_scan_ds_clone_swapped(dsl_dataset_t *ds1, dsl_dataset_t *ds2, dmu_tx_t *tx)
+ {
+       dsl_pool_t *dp = ds1->ds_dir->dd_pool;
+       dsl_scan_t *scn = dp->dp_scan;
+       uint64_t mintxg;
+       if (scn->scn_phys.scn_state != DSS_SCANNING)
+               return;
+       if (scn->scn_phys.scn_bookmark.zb_objset == ds1->ds_object) {
+               scn->scn_phys.scn_bookmark.zb_objset = ds2->ds_object;
+               zfs_dbgmsg("clone_swap ds %llu; currently traversing; "
+                   "reset zb_objset to %llu",
+                   (u_longlong_t)ds1->ds_object,
+                   (u_longlong_t)ds2->ds_object);
+       } else if (scn->scn_phys.scn_bookmark.zb_objset == ds2->ds_object) {
+               scn->scn_phys.scn_bookmark.zb_objset = ds1->ds_object;
+               zfs_dbgmsg("clone_swap ds %llu; currently traversing; "
+                   "reset zb_objset to %llu",
+                   (u_longlong_t)ds2->ds_object,
+                   (u_longlong_t)ds1->ds_object);
+       }
+       if (zap_lookup_int_key(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj,
+           ds1->ds_object, &mintxg) == 0) {
+               int err;
+               ASSERT3U(mintxg, ==, ds1->ds_phys->ds_prev_snap_txg);
+               ASSERT3U(mintxg, ==, ds2->ds_phys->ds_prev_snap_txg);
+               VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset,
+                   scn->scn_phys.scn_queue_obj, ds1->ds_object, tx));
+               err = zap_add_int_key(dp->dp_meta_objset,
+                   scn->scn_phys.scn_queue_obj, ds2->ds_object, mintxg, tx);
+               VERIFY(err == 0 || err == EEXIST);
+               if (err == EEXIST) {
+                       /* Both were there to begin with */
+                       VERIFY(0 == zap_add_int_key(dp->dp_meta_objset,
+                           scn->scn_phys.scn_queue_obj,
+                           ds1->ds_object, mintxg, tx));
+               }
+               zfs_dbgmsg("clone_swap ds %llu; in queue; "
+                   "replacing with %llu",
+                   (u_longlong_t)ds1->ds_object,
+                   (u_longlong_t)ds2->ds_object);
+       } else if (zap_lookup_int_key(dp->dp_meta_objset,
+           scn->scn_phys.scn_queue_obj, ds2->ds_object, &mintxg) == 0) {
+               ASSERT3U(mintxg, ==, ds1->ds_phys->ds_prev_snap_txg);
+               ASSERT3U(mintxg, ==, ds2->ds_phys->ds_prev_snap_txg);
+               VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset,
+                   scn->scn_phys.scn_queue_obj, ds2->ds_object, tx));
+               VERIFY(0 == zap_add_int_key(dp->dp_meta_objset,
+                   scn->scn_phys.scn_queue_obj, ds1->ds_object, mintxg, tx));
+               zfs_dbgmsg("clone_swap ds %llu; in queue; "
+                   "replacing with %llu",
+                   (u_longlong_t)ds2->ds_object,
+                   (u_longlong_t)ds1->ds_object);
+       }
+       dsl_scan_sync_state(scn, tx);
+ }
+ struct enqueue_clones_arg {
+       dmu_tx_t *tx;
+       uint64_t originobj;
+ };
+ /* ARGSUSED */
+ static int
+ enqueue_clones_cb(spa_t *spa, uint64_t dsobj, const char *dsname, void *arg)
+ {
+       struct enqueue_clones_arg *eca = arg;
+       dsl_dataset_t *ds;
+       int err;
+       dsl_pool_t *dp = spa->spa_dsl_pool;
+       dsl_scan_t *scn = dp->dp_scan;
+       err = dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds);
+       if (err)
+               return (err);
+       if (ds->ds_dir->dd_phys->dd_origin_obj == eca->originobj) {
+               while (ds->ds_phys->ds_prev_snap_obj != eca->originobj) {
+                       dsl_dataset_t *prev;
+                       err = dsl_dataset_hold_obj(dp,
+                           ds->ds_phys->ds_prev_snap_obj, FTAG, &prev);
+                       dsl_dataset_rele(ds, FTAG);
+                       if (err)
+                               return (err);
+                       ds = prev;
+               }
+               VERIFY(zap_add_int_key(dp->dp_meta_objset,
+                   scn->scn_phys.scn_queue_obj, ds->ds_object,
+                   ds->ds_phys->ds_prev_snap_txg, eca->tx) == 0);
+       }
+       dsl_dataset_rele(ds, FTAG);
+       return (0);
+ }
+ static void
+ dsl_scan_visitds(dsl_scan_t *scn, uint64_t dsobj, dmu_tx_t *tx)
+ {
+       dsl_pool_t *dp = scn->scn_dp;
+       dsl_dataset_t *ds;
+       VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds));
+       /*
+        * Iterate over the bps in this ds.
+        */
+       dmu_buf_will_dirty(ds->ds_dbuf, tx);
+       dsl_scan_visit_rootbp(scn, ds, &ds->ds_phys->ds_bp, tx);
+       char *dsname = kmem_alloc(ZFS_MAXNAMELEN, KM_SLEEP);
+       dsl_dataset_name(ds, dsname);
+       zfs_dbgmsg("scanned dataset %llu (%s) with min=%llu max=%llu; "
+           "pausing=%u",
+           (longlong_t)dsobj, dsname,
+           (longlong_t)scn->scn_phys.scn_cur_min_txg,
+           (longlong_t)scn->scn_phys.scn_cur_max_txg,
+           (int)scn->scn_pausing);
+       kmem_free(dsname, ZFS_MAXNAMELEN);
+       if (scn->scn_pausing)
+               goto out;
+       /*
+        * We've finished this pass over this dataset.
+        */
+       /*
+        * If we did not completely visit this dataset, do another pass.
+        */
+       if (scn->scn_phys.scn_flags & DSF_VISIT_DS_AGAIN) {
+               zfs_dbgmsg("incomplete pass; visiting again");
+               scn->scn_phys.scn_flags &= ~DSF_VISIT_DS_AGAIN;
+               VERIFY(zap_add_int_key(dp->dp_meta_objset,
+                   scn->scn_phys.scn_queue_obj, ds->ds_object,
+                   scn->scn_phys.scn_cur_max_txg, tx) == 0);
+               goto out;
+       }
+       /*
+        * Add descendant datasets to work queue.
+        */
+       if (ds->ds_phys->ds_next_snap_obj != 0) {
+               VERIFY(zap_add_int_key(dp->dp_meta_objset,
+                   scn->scn_phys.scn_queue_obj, ds->ds_phys->ds_next_snap_obj,
+                   ds->ds_phys->ds_creation_txg, tx) == 0);
+       }
+       if (ds->ds_phys->ds_num_children > 1) {
+               boolean_t usenext = B_FALSE;
+               if (ds->ds_phys->ds_next_clones_obj != 0) {
+                       uint64_t count;
+                       /*
+                        * A bug in a previous version of the code could
+                        * cause upgrade_clones_cb() to not set
+                        * ds_next_snap_obj when it should, leading to a
+                        * missing entry.  Therefore we can only use the
+                        * next_clones_obj when its count is correct.
+                        */
+                       int err = zap_count(dp->dp_meta_objset,
+                           ds->ds_phys->ds_next_clones_obj, &count);
+                       if (err == 0 &&
+                           count == ds->ds_phys->ds_num_children - 1)
+                               usenext = B_TRUE;
+               }
+               if (usenext) {
+                       VERIFY(zap_join_key(dp->dp_meta_objset,
+                           ds->ds_phys->ds_next_clones_obj,
+                           scn->scn_phys.scn_queue_obj,
+                           ds->ds_phys->ds_creation_txg, tx) == 0);
+               } else {
+                       struct enqueue_clones_arg eca;
+                       eca.tx = tx;
+                       eca.originobj = ds->ds_object;
+                       (void) dmu_objset_find_spa(ds->ds_dir->dd_pool->dp_spa,
+                           NULL, enqueue_clones_cb, &eca, DS_FIND_CHILDREN);
+               }
+       }
+ out:
+       dsl_dataset_rele(ds, FTAG);
+ }
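dsl_scan_visitds() above treats scn_queue_obj as a work queue: each ZAP entry maps a dataset object number to the min txg to scan from, and finishing a dataset enqueues its next snapshot keyed by the dataset's creation txg. A toy user-space model of that discipline (fixed-size array, made-up object numbers and txgs) is:

#include <stdio.h>
#include <stdint.h>

struct q_entry { uint64_t dsobj; uint64_t mintxg; };

static struct q_entry queue[16];
static int qlen;

static void
enqueue(uint64_t dsobj, uint64_t mintxg)
{
        queue[qlen].dsobj = dsobj;
        queue[qlen].mintxg = mintxg;
        qlen++;
}

int
main(void)
{
        /* Dataset 10 has a next snapshot, object 11, created at txg 100. */
        enqueue(10, 0);
        while (qlen > 0) {
                struct q_entry e = queue[--qlen];
                printf("scan ds %llu from txg %llu\n",
                    (unsigned long long)e.dsobj,
                    (unsigned long long)e.mintxg);
                if (e.dsobj == 10)
                        enqueue(11, 100);       /* min txg = creation txg */
        }
        return (0);
}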
+ /* ARGSUSED */
+ static int
+ enqueue_cb(spa_t *spa, uint64_t dsobj, const char *dsname, void *arg)
+ {
+       dmu_tx_t *tx = arg;
+       dsl_dataset_t *ds;
+       int err;
+       dsl_pool_t *dp = spa->spa_dsl_pool;
+       dsl_scan_t *scn = dp->dp_scan;
+       err = dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds);
+       if (err)
+               return (err);
+       while (ds->ds_phys->ds_prev_snap_obj != 0) {
+               dsl_dataset_t *prev;
+               err = dsl_dataset_hold_obj(dp, ds->ds_phys->ds_prev_snap_obj,
+                   FTAG, &prev);
+               if (err) {
+                       dsl_dataset_rele(ds, FTAG);
+                       return (err);
+               }
+               /*
+                * If this is a clone, we don't need to worry about it for now.
+                */
+               if (prev->ds_phys->ds_next_snap_obj != ds->ds_object) {
+                       dsl_dataset_rele(ds, FTAG);
+                       dsl_dataset_rele(prev, FTAG);
+                       return (0);
+               }
+               dsl_dataset_rele(ds, FTAG);
+               ds = prev;
+       }
+       VERIFY(zap_add_int_key(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj,
+           ds->ds_object, ds->ds_phys->ds_prev_snap_txg, tx) == 0);
+       dsl_dataset_rele(ds, FTAG);
+       return (0);
+ }
+ /*
+  * Scrub/dedup interaction.
+  *
+  * If there are N references to a deduped block, we don't want to scrub it
+  * N times -- ideally, we should scrub it exactly once.
+  *
+  * We leverage the fact that the dde's replication class (enum ddt_class)
+  * is ordered from highest replication class (DDT_CLASS_DITTO) to lowest
+  * (DDT_CLASS_UNIQUE) so that we may walk the DDT in that order.
+  *
+  * To prevent excess scrubbing, the scrub begins by walking the DDT
+  * to find all blocks with refcnt > 1, and scrubs each of these once.
+  * Since there are two replication classes which contain blocks with
+  * refcnt > 1, we scrub the highest replication class (DDT_CLASS_DITTO) first.
+  * Finally the top-down scrub begins, only visiting blocks with refcnt == 1.
+  *
+  * There would be nothing more to say if a block's refcnt couldn't change
+  * during a scrub, but of course it can so we must account for changes
+  * in a block's replication class.
+  *
+  * Here's an example of what can occur:
+  *
+  * If a block has refcnt > 1 during the DDT scrub phase, but has refcnt == 1
+  * when visited during the top-down scrub phase, it will be scrubbed twice.
+  * This negates our scrub optimization, but is otherwise harmless.
+  *
+  * If a block has refcnt == 1 during the DDT scrub phase, but has refcnt > 1
+  * on each visit during the top-down scrub phase, it will never be scrubbed.
+  * To catch this, ddt_sync_entry() notifies the scrub code whenever a block's
+  * reference class transitions to a higher level (i.e. DDT_CLASS_UNIQUE to
+  * DDT_CLASS_DUPLICATE); if it transitions from refcnt == 1 to refcnt > 1
+  * while a scrub is in progress, it scrubs the block right then.
+  */
+ static void
+ dsl_scan_ddt(dsl_scan_t *scn, dmu_tx_t *tx)
+ {
+       ddt_bookmark_t *ddb = &scn->scn_phys.scn_ddt_bookmark;
+       ddt_entry_t dde = { 0 };
+       int error;
+       uint64_t n = 0;
+       while ((error = ddt_walk(scn->scn_dp->dp_spa, ddb, &dde)) == 0) {
+               ddt_t *ddt;
+               if (ddb->ddb_class > scn->scn_phys.scn_ddt_class_max)
+                       break;
+               dprintf("visiting ddb=%llu/%llu/%llu/%llx\n",
+                   (longlong_t)ddb->ddb_class,
+                   (longlong_t)ddb->ddb_type,
+                   (longlong_t)ddb->ddb_checksum,
+                   (longlong_t)ddb->ddb_cursor);
+               /* There should be no pending changes to the dedup table */
+               ddt = scn->scn_dp->dp_spa->spa_ddt[ddb->ddb_checksum];
+               ASSERT(avl_first(&ddt->ddt_tree) == NULL);
+               dsl_scan_ddt_entry(scn, ddb->ddb_checksum, &dde, tx);
+               n++;
+               if (dsl_scan_check_pause(scn, NULL))
+                       break;
+       }
+       zfs_dbgmsg("scanned %llu ddt entries with class_max = %u; pausing=%u",
+           (longlong_t)n, (int)scn->scn_phys.scn_ddt_class_max,
+           (int)scn->scn_pausing);
+       ASSERT(error == 0 || error == ENOENT);
+       ASSERT(error != ENOENT ||
+           ddb->ddb_class > scn->scn_phys.scn_ddt_class_max);
+ }
+ /* ARGSUSED */
+ void
+ dsl_scan_ddt_entry(dsl_scan_t *scn, enum zio_checksum checksum,
+     ddt_entry_t *dde, dmu_tx_t *tx)
+ {
+       const ddt_key_t *ddk = &dde->dde_key;
+       ddt_phys_t *ddp = dde->dde_phys;
+       blkptr_t bp;
+       zbookmark_t zb = { 0 };
++      int p;
+       if (scn->scn_phys.scn_state != DSS_SCANNING)
+               return;
 -      for (int d = 0; d < BP_GET_NDVAS(bp); d++) {
++      for (p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
+               if (ddp->ddp_phys_birth == 0 ||
+                   ddp->ddp_phys_birth > scn->scn_phys.scn_cur_max_txg)
+                       continue;
+               ddt_bp_create(checksum, ddk, ddp, &bp);
+               scn->scn_visited_this_txg++;
+               scan_funcs[scn->scn_phys.scn_func](scn->scn_dp, &bp, &zb);
+       }
+ }
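The ordering the scrub/dedup comment above relies on (DDT_CLASS_DITTO before DDT_CLASS_DUPLICATE before DDT_CLASS_UNIQUE) is what lets a class-containment test reduce to a simple comparison against scn_ddt_class_max. A minimal sketch, with the enum redeclared locally purely for illustration, is:

#include <stdio.h>

enum ddt_class { DDT_CLASS_DITTO, DDT_CLASS_DUPLICATE, DDT_CLASS_UNIQUE };

/* Blocks whose class sorts at or before class_max were already covered by
 * the DDT walk, so the later top-down phase should skip them. */
static int
visited_in_ddt_phase(enum ddt_class class, enum ddt_class class_max)
{
        return (class <= class_max);
}

int
main(void)
{
        /* A full scrub walks DITTO and DUPLICATE; UNIQUE waits for top-down. */
        printf("%d %d\n",
            visited_in_ddt_phase(DDT_CLASS_DUPLICATE, DDT_CLASS_DUPLICATE),
            visited_in_ddt_phase(DDT_CLASS_UNIQUE, DDT_CLASS_DUPLICATE));
        return (0);
}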
+ static void
+ dsl_scan_visit(dsl_scan_t *scn, dmu_tx_t *tx)
+ {
+       dsl_pool_t *dp = scn->scn_dp;
+       zap_cursor_t zc;
+       zap_attribute_t za;
+       if (scn->scn_phys.scn_ddt_bookmark.ddb_class <=
+           scn->scn_phys.scn_ddt_class_max) {
+               scn->scn_phys.scn_cur_min_txg = scn->scn_phys.scn_min_txg;
+               scn->scn_phys.scn_cur_max_txg = scn->scn_phys.scn_max_txg;
+               dsl_scan_ddt(scn, tx);
+               if (scn->scn_pausing)
+                       return;
+       }
+       if (scn->scn_phys.scn_bookmark.zb_objset == DMU_META_OBJSET) {
+               /* First do the MOS & ORIGIN */
+               scn->scn_phys.scn_cur_min_txg = scn->scn_phys.scn_min_txg;
+               scn->scn_phys.scn_cur_max_txg = scn->scn_phys.scn_max_txg;
+               dsl_scan_visit_rootbp(scn, NULL,
+                   &dp->dp_meta_rootbp, tx);
+               spa_set_rootblkptr(dp->dp_spa, &dp->dp_meta_rootbp);
+               if (scn->scn_pausing)
+                       return;
+               if (spa_version(dp->dp_spa) < SPA_VERSION_DSL_SCRUB) {
+                       VERIFY(0 == dmu_objset_find_spa(dp->dp_spa,
+                           NULL, enqueue_cb, tx, DS_FIND_CHILDREN));
+               } else {
+                       dsl_scan_visitds(scn,
+                           dp->dp_origin_snap->ds_object, tx);
+               }
+               ASSERT(!scn->scn_pausing);
+       } else if (scn->scn_phys.scn_bookmark.zb_objset !=
+           ZB_DESTROYED_OBJSET) {
+               /*
+                * If we were paused, continue from here.  Note if the
+                * ds we were paused on was deleted, the zb_objset may
+                * be -1, so we will skip this and find a new objset
+                * below.
+                */
+               dsl_scan_visitds(scn, scn->scn_phys.scn_bookmark.zb_objset, tx);
+               if (scn->scn_pausing)
+                       return;
+       }
+       /*
+        * In case we were paused right at the end of the ds, zero the
+        * bookmark so we don't think that we're still trying to resume.
+        */
+       bzero(&scn->scn_phys.scn_bookmark, sizeof (zbookmark_t));
+       /* keep pulling things out of the zap-object-as-queue */
+       while (zap_cursor_init(&zc, dp->dp_meta_objset,
+           scn->scn_phys.scn_queue_obj),
+           zap_cursor_retrieve(&zc, &za) == 0) {
+               dsl_dataset_t *ds;
+               uint64_t dsobj;
+               dsobj = strtonum(za.za_name, NULL);
+               VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset,
+                   scn->scn_phys.scn_queue_obj, dsobj, tx));
+               /* Set up min/max txg */
+               VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds));
+               if (za.za_first_integer != 0) {
+                       scn->scn_phys.scn_cur_min_txg =
+                           MAX(scn->scn_phys.scn_min_txg,
+                           za.za_first_integer);
+               } else {
+                       scn->scn_phys.scn_cur_min_txg =
+                           MAX(scn->scn_phys.scn_min_txg,
+                           ds->ds_phys->ds_prev_snap_txg);
+               }
+               scn->scn_phys.scn_cur_max_txg = dsl_scan_ds_maxtxg(ds);
+               dsl_dataset_rele(ds, FTAG);
+               dsl_scan_visitds(scn, dsobj, tx);
+               zap_cursor_fini(&zc);
+               if (scn->scn_pausing)
+                       return;
+       }
+       zap_cursor_fini(&zc);
+ }
+ static int
+ dsl_scan_free_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
+ {
+       dsl_scan_t *scn = arg;
+       uint64_t elapsed_nanosecs;
+       elapsed_nanosecs = gethrtime() - scn->scn_sync_start_time;
+       if (elapsed_nanosecs / NANOSEC > zfs_txg_timeout ||
+           (elapsed_nanosecs / MICROSEC > zfs_free_min_time_ms &&
+           txg_sync_waiting(scn->scn_dp)) ||
+           spa_shutting_down(scn->scn_dp->dp_spa))
+               return (ERESTART);
+       zio_nowait(zio_free_sync(scn->scn_zio_root, scn->scn_dp->dp_spa,
+           dmu_tx_get_txg(tx), bp, 0));
+       dsl_dir_diduse_space(tx->tx_pool->dp_free_dir, DD_USED_HEAD,
+           -bp_get_dsize_sync(scn->scn_dp->dp_spa, bp),
+           -BP_GET_PSIZE(bp), -BP_GET_UCSIZE(bp), tx);
+       scn->scn_visited_this_txg++;
+       return (0);
+ }
+ boolean_t
+ dsl_scan_active(dsl_scan_t *scn)
+ {
+       spa_t *spa = scn->scn_dp->dp_spa;
+       uint64_t used = 0, comp, uncomp;
+       if (spa->spa_load_state != SPA_LOAD_NONE)
+               return (B_FALSE);
+       if (spa_shutting_down(spa))
+               return (B_FALSE);
+       if (scn->scn_phys.scn_state == DSS_SCANNING)
+               return (B_TRUE);
+       if (spa_version(scn->scn_dp->dp_spa) >= SPA_VERSION_DEADLISTS) {
+               (void) bpobj_space(&scn->scn_dp->dp_free_bpobj,
+                   &used, &comp, &uncomp);
+       }
+       return (used != 0);
+ }
+ void
+ dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx)
+ {
+       dsl_scan_t *scn = dp->dp_scan;
+       spa_t *spa = dp->dp_spa;
+       int err;
+       /*
+        * Check for scn_restart_txg before checking spa_load_state, so
+        * that we can restart an old-style scan while the pool is being
+        * imported (see dsl_scan_init).
+        */
+       if (scn->scn_restart_txg != 0 &&
+           scn->scn_restart_txg <= tx->tx_txg) {
+               pool_scan_func_t func = POOL_SCAN_SCRUB;
+               dsl_scan_done(scn, B_FALSE, tx);
+               if (vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL))
+                       func = POOL_SCAN_RESILVER;
+               zfs_dbgmsg("restarting scan func=%u txg=%llu",
+                   func, tx->tx_txg);
+               dsl_scan_setup_sync(scn, &func, tx);
+       }
+       if (!dsl_scan_active(scn) ||
+           spa_sync_pass(dp->dp_spa) > 1)
+               return;
+       scn->scn_visited_this_txg = 0;
+       scn->scn_pausing = B_FALSE;
+       scn->scn_sync_start_time = gethrtime();
+       spa->spa_scrub_active = B_TRUE;
+       /*
+        * First process the free list.  If we pause the free, don't do
+        * any scanning.  This ensures that there is no free list when
+        * we are scanning, so the scan code doesn't have to worry about
+        * traversing it.
+        */
+       if (spa_version(dp->dp_spa) >= SPA_VERSION_DEADLISTS) {
+               scn->scn_zio_root = zio_root(dp->dp_spa, NULL,
+                   NULL, ZIO_FLAG_MUSTSUCCEED);
+               err = bpobj_iterate(&dp->dp_free_bpobj,
+                   dsl_scan_free_cb, scn, tx);
+               VERIFY3U(0, ==, zio_wait(scn->scn_zio_root));
+               if (scn->scn_visited_this_txg) {
+                       zfs_dbgmsg("freed %llu blocks in %llums from "
+                           "free_bpobj txg %llu",
+                           (longlong_t)scn->scn_visited_this_txg,
+                           (longlong_t)
+                           (gethrtime() - scn->scn_sync_start_time) / MICROSEC,
+                           (longlong_t)tx->tx_txg);
+                       scn->scn_visited_this_txg = 0;
+                       /*
+                        * Re-sync the ddt so that we can further modify
+                        * it when doing bprewrite.
+                        */
+                       ddt_sync(spa, tx->tx_txg);
+               }
+               if (err == ERESTART)
+                       return;
+       }
+       if (scn->scn_phys.scn_state != DSS_SCANNING)
+               return;
+       if (scn->scn_phys.scn_ddt_bookmark.ddb_class <=
+           scn->scn_phys.scn_ddt_class_max) {
+               zfs_dbgmsg("doing scan sync txg %llu; "
+                   "ddt bm=%llu/%llu/%llu/%llx",
+                   (longlong_t)tx->tx_txg,
+                   (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_class,
+                   (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_type,
+                   (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_checksum,
+                   (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_cursor);
+               ASSERT(scn->scn_phys.scn_bookmark.zb_objset == 0);
+               ASSERT(scn->scn_phys.scn_bookmark.zb_object == 0);
+               ASSERT(scn->scn_phys.scn_bookmark.zb_level == 0);
+               ASSERT(scn->scn_phys.scn_bookmark.zb_blkid == 0);
+       } else {
+               zfs_dbgmsg("doing scan sync txg %llu; bm=%llu/%llu/%llu/%llu",
+                   (longlong_t)tx->tx_txg,
+                   (longlong_t)scn->scn_phys.scn_bookmark.zb_objset,
+                   (longlong_t)scn->scn_phys.scn_bookmark.zb_object,
+                   (longlong_t)scn->scn_phys.scn_bookmark.zb_level,
+                   (longlong_t)scn->scn_phys.scn_bookmark.zb_blkid);
+       }
+       scn->scn_zio_root = zio_root(dp->dp_spa, NULL,
+           NULL, ZIO_FLAG_CANFAIL);
+       dsl_scan_visit(scn, tx);
+       (void) zio_wait(scn->scn_zio_root);
+       scn->scn_zio_root = NULL;
+       zfs_dbgmsg("visited %llu blocks in %llums",
+           (longlong_t)scn->scn_visited_this_txg,
+           (longlong_t)(gethrtime() - scn->scn_sync_start_time) / MICROSEC);
+       if (!scn->scn_pausing) {
+               /* finished with scan. */
+               zfs_dbgmsg("finished scan txg %llu", (longlong_t)tx->tx_txg);
+               dsl_scan_done(scn, B_TRUE, tx);
+       }
+       if (DSL_SCAN_IS_SCRUB_RESILVER(scn)) {
+               mutex_enter(&spa->spa_scrub_lock);
+               while (spa->spa_scrub_inflight > 0) {
+                       cv_wait(&spa->spa_scrub_io_cv,
+                           &spa->spa_scrub_lock);
+               }
+               mutex_exit(&spa->spa_scrub_lock);
+       }
+       dsl_scan_sync_state(scn, tx);
+ }
+ /*
+  * This will start a new scan, or restart an existing one.
+  */
+ void
+ dsl_resilver_restart(dsl_pool_t *dp, uint64_t txg)
+ {
+       if (txg == 0) {
+               dmu_tx_t *tx;
+               tx = dmu_tx_create_dd(dp->dp_mos_dir);
+               VERIFY(0 == dmu_tx_assign(tx, TXG_WAIT));
+               txg = dmu_tx_get_txg(tx);
+               dp->dp_scan->scn_restart_txg = txg;
+               dmu_tx_commit(tx);
+       } else {
+               dp->dp_scan->scn_restart_txg = txg;
+       }
+       zfs_dbgmsg("restarting resilver txg=%llu", txg);
+ }
+ boolean_t
+ dsl_scan_resilvering(dsl_pool_t *dp)
+ {
+       return (dp->dp_scan->scn_phys.scn_state == DSS_SCANNING &&
+           dp->dp_scan->scn_phys.scn_func == POOL_SCAN_RESILVER);
+ }
+ /*
+  * scrub consumers
+  */
+ static void
+ count_block(zfs_all_blkstats_t *zab, const blkptr_t *bp)
+ {
+       int i;
+       /*
+        * If we resume after a reboot, zab will be NULL; don't record
+        * incomplete stats in that case.
+        */
+       if (zab == NULL)
+               return;
+       for (i = 0; i < 4; i++) {
+               int l = (i < 2) ? BP_GET_LEVEL(bp) : DN_MAX_LEVELS;
+               int t = (i & 1) ? BP_GET_TYPE(bp) : DMU_OT_TOTAL;
+               zfs_blkstat_t *zb = &zab->zab_type[l][t];
+               int equal;
+               zb->zb_count++;
+               zb->zb_asize += BP_GET_ASIZE(bp);
+               zb->zb_lsize += BP_GET_LSIZE(bp);
+               zb->zb_psize += BP_GET_PSIZE(bp);
+               zb->zb_gangs += BP_COUNT_GANG(bp);
+               switch (BP_GET_NDVAS(bp)) {
+               case 2:
+                       if (DVA_GET_VDEV(&bp->blk_dva[0]) ==
+                           DVA_GET_VDEV(&bp->blk_dva[1]))
+                               zb->zb_ditto_2_of_2_samevdev++;
+                       break;
+               case 3:
+                       equal = (DVA_GET_VDEV(&bp->blk_dva[0]) ==
+                           DVA_GET_VDEV(&bp->blk_dva[1])) +
+                           (DVA_GET_VDEV(&bp->blk_dva[0]) ==
+                           DVA_GET_VDEV(&bp->blk_dva[2])) +
+                           (DVA_GET_VDEV(&bp->blk_dva[1]) ==
+                           DVA_GET_VDEV(&bp->blk_dva[2]));
+                       if (equal == 1)
+                               zb->zb_ditto_2_of_3_samevdev++;
+                       else if (equal == 3)
+                               zb->zb_ditto_3_of_3_samevdev++;
+                       break;
+               }
+       }
+ }
+ static void
+ dsl_scan_scrub_done(zio_t *zio)
+ {
+       spa_t *spa = zio->io_spa;
+       zio_data_buf_free(zio->io_data, zio->io_size);
+       mutex_enter(&spa->spa_scrub_lock);
+       spa->spa_scrub_inflight--;
+       cv_broadcast(&spa->spa_scrub_io_cv);
+       if (zio->io_error && (zio->io_error != ECKSUM ||
+           !(zio->io_flags & ZIO_FLAG_SPECULATIVE))) {
+               spa->spa_dsl_pool->dp_scan->scn_phys.scn_errors++;
+       }
+       mutex_exit(&spa->spa_scrub_lock);
+ }
+ static int
+ dsl_scan_scrub_cb(dsl_pool_t *dp,
+     const blkptr_t *bp, const zbookmark_t *zb)
+ {
+       dsl_scan_t *scn = dp->dp_scan;
+       size_t size = BP_GET_PSIZE(bp);
+       spa_t *spa = dp->dp_spa;
+       uint64_t phys_birth = BP_PHYSICAL_BIRTH(bp);
+       boolean_t needs_io;
+       int zio_flags = ZIO_FLAG_SCRUB_THREAD | ZIO_FLAG_RAW | ZIO_FLAG_CANFAIL;
+       int zio_priority;
++      int d;
+       if (phys_birth <= scn->scn_phys.scn_min_txg ||
+           phys_birth >= scn->scn_phys.scn_max_txg)
+               return (0);
+       count_block(dp->dp_blkstats, bp);
+       ASSERT(DSL_SCAN_IS_SCRUB_RESILVER(scn));
+       if (scn->scn_phys.scn_func == POOL_SCAN_SCRUB) {
+               zio_flags |= ZIO_FLAG_SCRUB;
+               zio_priority = ZIO_PRIORITY_SCRUB;
+               needs_io = B_TRUE;
+       } else if (scn->scn_phys.scn_func == POOL_SCAN_RESILVER) {
+               zio_flags |= ZIO_FLAG_RESILVER;
+               zio_priority = ZIO_PRIORITY_RESILVER;
+               needs_io = B_FALSE;
+       }
+       /* If it's an intent log block, failure is expected. */
+       if (zb->zb_level == ZB_ZIL_LEVEL)
+               zio_flags |= ZIO_FLAG_SPECULATIVE;
++      for (d = 0; d < BP_GET_NDVAS(bp); d++) {
+               vdev_t *vd = vdev_lookup_top(spa,
+                   DVA_GET_VDEV(&bp->blk_dva[d]));
+               /*
+                * Keep track of how much data we've examined so that
+                * zpool(1M) status can make useful progress reports.
+                */
+               scn->scn_phys.scn_examined += DVA_GET_ASIZE(&bp->blk_dva[d]);
+               spa->spa_scan_pass_exam += DVA_GET_ASIZE(&bp->blk_dva[d]);
+               /* if it's a resilver, this may not be in the target range */
+               if (!needs_io) {
+                       if (DVA_GET_GANG(&bp->blk_dva[d])) {
+                               /*
+                                * Gang members may be spread across multiple
+                                * vdevs, so the best estimate we have is the
+                                * scrub range, which has already been checked.
+                                * XXX -- it would be better to change our
+                                * allocation policy to ensure that all
+                                * gang members reside on the same vdev.
+                                */
+                               needs_io = B_TRUE;
+                       } else {
+                               needs_io = vdev_dtl_contains(vd, DTL_PARTIAL,
+                                   phys_birth, 1);
+                       }
+               }
+       }
+       if (needs_io && !zfs_no_scrub_io) {
+               void *data = zio_data_buf_alloc(size);
+               mutex_enter(&spa->spa_scrub_lock);
+               while (spa->spa_scrub_inflight >= spa->spa_scrub_maxinflight)
+                       cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock);
+               spa->spa_scrub_inflight++;
+               mutex_exit(&spa->spa_scrub_lock);
+               zio_nowait(zio_read(NULL, spa, bp, data, size,
+                   dsl_scan_scrub_done, NULL, zio_priority,
+                   zio_flags, zb));
+       }
+       /* do not relocate this block */
+       return (0);
+ }
+ int
+ dsl_scan(dsl_pool_t *dp, pool_scan_func_t func)
+ {
+       spa_t *spa = dp->dp_spa;
+       /*
+        * Purge all vdev caches and probe all devices.  We do this here
+        * rather than in sync context because this requires a writer lock
+        * on the spa_config lock, which we can't do from sync context.  The
+        * spa_scrub_reopen flag indicates that vdev_open() should not
+        * attempt to start another scrub.
+        */
+       spa_vdev_state_enter(spa, SCL_NONE);
+       spa->spa_scrub_reopen = B_TRUE;
+       vdev_reopen(spa->spa_root_vdev);
+       spa->spa_scrub_reopen = B_FALSE;
+       (void) spa_vdev_state_exit(spa, NULL, 0);
+       return (dsl_sync_task_do(dp, dsl_scan_setup_check,
+           dsl_scan_setup_sync, dp->dp_scan, &func, 0));
+ }
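
The scrub path in dsl_scan_scrub_cb() and dsl_scan_scrub_done() above throttles reads with a counter guarded by spa_scrub_lock and spa_scrub_io_cv: the issuer sleeps while spa_scrub_inflight has reached spa_scrub_maxinflight, and each completion decrements the counter and broadcasts. Below is a minimal user-space sketch of the same throttling pattern using POSIX threads; the names are hypothetical stand-ins, not code from this commit.

#include <pthread.h>
#include <stdio.h>
#include <unistd.h>

/* Hypothetical stand-ins for spa_scrub_lock, spa_scrub_io_cv and counters. */
static pthread_mutex_t scrub_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t scrub_io_cv = PTHREAD_COND_INITIALIZER;
static int scrub_inflight = 0;
static int scrub_maxinflight = 4;

/* Completion side: drop the in-flight count and wake any waiting issuer. */
static void *
scrub_done(void *arg)
{
	(void) arg;
	usleep(1000);		/* pretend the read took a little while */
	pthread_mutex_lock(&scrub_lock);
	scrub_inflight--;
	pthread_cond_broadcast(&scrub_io_cv);
	pthread_mutex_unlock(&scrub_lock);
	return (NULL);
}

/* Issue side: block while the in-flight limit is reached, then dispatch. */
static void
scrub_issue(void)
{
	pthread_t tid;

	pthread_mutex_lock(&scrub_lock);
	while (scrub_inflight >= scrub_maxinflight)
		pthread_cond_wait(&scrub_io_cv, &scrub_lock);
	scrub_inflight++;
	pthread_mutex_unlock(&scrub_lock);

	(void) pthread_create(&tid, NULL, scrub_done, NULL);
	(void) pthread_detach(tid);
}

int
main(void)
{
	int i;

	for (i = 0; i < 32; i++)
		scrub_issue();

	/* Drain, as dsl_scan_sync() does before updating on-disk state. */
	pthread_mutex_lock(&scrub_lock);
	while (scrub_inflight > 0)
		pthread_cond_wait(&scrub_io_cv, &scrub_lock);
	pthread_mutex_unlock(&scrub_lock);
	printf("all scrub reads drained\n");
	return (0);
}

(Compile with cc -pthread; the real callback additionally counts checksum errors in scn_phys.scn_errors, which is omitted here.)
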
index 0a4d55097685e265a1d72f2268bafd9045e6ee5d,41a40300ebfec2234b4385b3ab33065947a45f3e..86fe01553d7342d91897e372623906263b1ac5c2
@@@ -309,6 -338,66 +338,67 @@@ typedef struct blkptr 
  
  #define       BP_SPRINTF_LEN  320
  
 -              for (int d = 0; d < BP_GET_NDVAS(bp); d++) {            \
+ /*
+  * This macro allows code sharing between zfs, libzpool, and mdb.
+  * 'func' is either snprintf() or mdb_snprintf().
+  * 'ws' (whitespace) can be ' ' for single-line format, '\n' for multi-line.
+  */
+ #define       SPRINTF_BLKPTR(func, ws, buf, bp, type, checksum, compress)     \
+ {                                                                     \
+       static const char *copyname[] =                                 \
+           { "zero", "single", "double", "triple" };                   \
+       int size = BP_SPRINTF_LEN;                                      \
+       int len = 0;                                                    \
+       int copies = 0;                                                 \
++      int d;                                                          \
+                                                                       \
+       if (bp == NULL) {                                               \
+               len = func(buf + len, size - len, "<NULL>");            \
+       } else if (BP_IS_HOLE(bp)) {                                    \
+               len = func(buf + len, size - len, "<hole>");            \
+       } else {                                                        \
++              for (d = 0; d < BP_GET_NDVAS(bp); d++) {                \
+                       const dva_t *dva = &bp->blk_dva[d];             \
+                       if (DVA_IS_VALID(dva))                          \
+                               copies++;                               \
+                       len += func(buf + len, size - len,              \
+                           "DVA[%d]=<%llu:%llx:%llx>%c", d,            \
+                           (u_longlong_t)DVA_GET_VDEV(dva),            \
+                           (u_longlong_t)DVA_GET_OFFSET(dva),          \
+                           (u_longlong_t)DVA_GET_ASIZE(dva),           \
+                           ws);                                        \
+               }                                                       \
+               if (BP_IS_GANG(bp) &&                                   \
+                   DVA_GET_ASIZE(&bp->blk_dva[2]) <=                   \
+                   DVA_GET_ASIZE(&bp->blk_dva[1]) / 2)                 \
+                       copies--;                                       \
+               len += func(buf + len, size - len,                      \
+                   "[L%llu %s] %s %s %s %s %s %s%c"                    \
+                   "size=%llxL/%llxP birth=%lluL/%lluP fill=%llu%c"    \
+                   "cksum=%llx:%llx:%llx:%llx",                        \
+                   (u_longlong_t)BP_GET_LEVEL(bp),                     \
+                   type,                                               \
+                   checksum,                                           \
+                   compress,                                           \
+                   BP_GET_BYTEORDER(bp) == 0 ? "BE" : "LE",            \
+                   BP_IS_GANG(bp) ? "gang" : "contiguous",             \
+                   BP_GET_DEDUP(bp) ? "dedup" : "unique",              \
+                   copyname[copies],                                   \
+                   ws,                                                 \
+                   (u_longlong_t)BP_GET_LSIZE(bp),                     \
+                   (u_longlong_t)BP_GET_PSIZE(bp),                     \
+                   (u_longlong_t)bp->blk_birth,                        \
+                   (u_longlong_t)BP_PHYSICAL_BIRTH(bp),                \
+                   (u_longlong_t)bp->blk_fill,                         \
+                   ws,                                                 \
+                   (u_longlong_t)bp->blk_cksum.zc_word[0],             \
+                   (u_longlong_t)bp->blk_cksum.zc_word[1],             \
+                   (u_longlong_t)bp->blk_cksum.zc_word[2],             \
+                   (u_longlong_t)bp->blk_cksum.zc_word[3]);            \
+       }                                                               \
+       ASSERT(len < size);                                             \
+ }
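
The SPRINTF_BLKPTR hunk above also shows the pattern this whole gcc-c90 merge applies everywhere: the C99 initial declaration `for (int d = 0; ...)` is dropped and an `int d;` declaration is hoisted to the top of the enclosing block, so the code also builds as C90. A minimal before/after sketch of the transformation (hypothetical function, not from this commit):

#include <stdio.h>

#define	NDVAS	3

/*
 * C99 form rejected by a strict C90 compiler:
 *
 *	for (int d = 0; d < NDVAS; d++)
 *		printf("dva %d\n", d);
 */

/* C90 form used throughout this merge: the declaration is hoisted. */
static void
print_dvas(void)
{
	int d;

	for (d = 0; d < NDVAS; d++)
		printf("dva %d\n", d);
}

int
main(void)
{
	print_dvas();
	return (0);
}
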
  #include <sys/dmu.h>
  
  #define       BP_GET_BUFC_TYPE(bp)                                            \
index 987617ffeeb338206c5c32175589f66349433b9e,17b4b12c4ee4db8f4aa0b5af2e915e9d459294c2..1722a53fc9bd65a2778aa55d5b575dcc44b34654
@@@ -495,10 -730,9 +730,10 @@@ voi
  metaslab_fini(metaslab_t *msp)
  {
        metaslab_group_t *mg = msp->ms_group;
 +      int t;
  
-       vdev_space_update(mg->mg_vd, -msp->ms_map.sm_size,
-           -msp->ms_smo.smo_alloc, B_TRUE);
+       vdev_space_update(mg->mg_vd,
+           -msp->ms_smo.smo_alloc, 0, -msp->ms_map.sm_size);
  
        metaslab_group_remove(mg, msp);
  
                space_map_destroy(&msp->ms_freemap[t]);
        }
  
 -      for (int t = 0; t < TXG_DEFER_SIZE; t++)
++      for (t = 0; t < TXG_DEFER_SIZE; t++)
+               space_map_destroy(&msp->ms_defermap[t]);
+       ASSERT3S(msp->ms_deferspace, ==, 0);
        mutex_exit(&msp->ms_lock);
        mutex_destroy(&msp->ms_lock);
  
@@@ -574,17 -846,35 +847,36 @@@ metaslab_prefetch(metaslab_group_t *mg
  static int
  metaslab_activate(metaslab_t *msp, uint64_t activation_weight, uint64_t size)
  {
+       metaslab_group_t *mg = msp->ms_group;
        space_map_t *sm = &msp->ms_map;
        space_map_ops_t *sm_ops = msp->ms_group->mg_class->mc_ops;
++      int t;
  
        ASSERT(MUTEX_HELD(&msp->ms_lock));
  
        if ((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0) {
-               int error = space_map_load(sm, sm_ops, SM_FREE, &msp->ms_smo,
-                   msp->ms_group->mg_vd->vdev_spa->spa_meta_objset);
-               if (error) {
-                       metaslab_group_sort(msp->ms_group, msp, 0);
-                       return (error);
+               space_map_load_wait(sm);
+               if (!sm->sm_loaded) {
+                       int error = space_map_load(sm, sm_ops, SM_FREE,
+                           &msp->ms_smo,
+                           spa_meta_objset(msp->ms_group->mg_vd->vdev_spa));
+                       if (error)  {
+                               metaslab_group_sort(msp->ms_group, msp, 0);
+                               return (error);
+                       }
 -                      for (int t = 0; t < TXG_DEFER_SIZE; t++)
++                      for (t = 0; t < TXG_DEFER_SIZE; t++)
+                               space_map_walk(&msp->ms_defermap[t],
+                                   space_map_claim, sm);
+               }
+               /*
+                * Track the bonus area as we activate new metaslabs.
+                */
+               if (sm->sm_start > mg->mg_bonus_area) {
+                       mutex_enter(&mg->mg_lock);
+                       mg->mg_bonus_area = sm->sm_start;
+                       mutex_exit(&mg->mg_lock);
                }
  
                /*
@@@ -632,9 -922,11 +924,12 @@@ metaslab_sync(metaslab_t *msp, uint64_
        space_map_obj_t *smo = &msp->ms_smo_syncing;
        dmu_buf_t *db;
        dmu_tx_t *tx;
 +      int t;
  
-       tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg);
+       ASSERT(!vd->vdev_ishole);
+       if (allocmap->sm_space == 0 && freemap->sm_space == 0)
+               return;
  
        /*
         * The only state that can actually be changing concurrently with
                space_map_walk(sm, space_map_remove, allocmap);
                space_map_walk(freed_map, space_map_remove, allocmap);
  
 -              for (int t = 0; t < TXG_DEFER_SIZE; t++)
++              for (t = 0; t < TXG_DEFER_SIZE; t++)
+                       space_map_walk(&msp->ms_defermap[t],
+                           space_map_remove, allocmap);
 -              for (int t = 1; t < TXG_CONCURRENT_STATES; t++)
 +              for (t = 1; t < TXG_CONCURRENT_STATES; t++)
                        space_map_walk(&msp->ms_allocmap[(txg + t) & TXG_MASK],
                            space_map_remove, allocmap);
  
@@@ -717,10 -1015,13 +1018,14 @@@ metaslab_sync_done(metaslab_t *msp, uin
        space_map_obj_t *smosync = &msp->ms_smo_syncing;
        space_map_t *sm = &msp->ms_map;
        space_map_t *freed_map = &msp->ms_freemap[TXG_CLEAN(txg) & TXG_MASK];
+       space_map_t *defer_map = &msp->ms_defermap[txg % TXG_DEFER_SIZE];
        metaslab_group_t *mg = msp->ms_group;
        vdev_t *vd = mg->mg_vd;
+       int64_t alloc_delta, defer_delta;
 +      int t;
  
+       ASSERT(!vd->vdev_ishole);
        mutex_enter(&msp->ms_lock);
  
        /*
                        space_map_create(&msp->ms_freemap[t], sm->sm_start,
                            sm->sm_size, sm->sm_shift, sm->sm_lock);
                }
-               vdev_space_update(vd, sm->sm_size, 0, B_TRUE);
 -              for (int t = 0; t < TXG_DEFER_SIZE; t++)
++              for (t = 0; t < TXG_DEFER_SIZE; t++)
+                       space_map_create(&msp->ms_defermap[t], sm->sm_start,
+                           sm->sm_size, sm->sm_shift, sm->sm_lock);
+               vdev_space_update(vd, 0, 0, sm->sm_size);
        }
  
-       vdev_space_update(vd, 0, smosync->smo_alloc - smo->smo_alloc, B_TRUE);
+       alloc_delta = smosync->smo_alloc - smo->smo_alloc;
+       defer_delta = freed_map->sm_space - defer_map->sm_space;
+       vdev_space_update(vd, alloc_delta + defer_delta, defer_delta, 0);
  
        ASSERT(msp->ms_allocmap[txg & TXG_MASK].sm_space == 0);
        ASSERT(msp->ms_freemap[txg & TXG_MASK].sm_space == 0);
        mutex_exit(&msp->ms_lock);
  }
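
metaslab_sync_done() above drives the per-vdev accounting from two deltas: alloc_delta (the change in the synced space map object's allocated bytes) and defer_delta (this txg's newly deferred frees minus the previously deferred frees now handed back), passed to the four-argument vdev_space_update(vd, alloc_delta, defer_delta, space_delta) seen in the hunk. A toy sketch of that bookkeeping with hypothetical counters, assuming the same delta semantics:

#include <stdio.h>
#include <stdint.h>
#include <inttypes.h>

/* Hypothetical per-vdev counters standing in for the real vdev stats. */
typedef struct toy_vdev {
	int64_t alloc;	/* bytes currently allocated */
	int64_t defer;	/* bytes freed but still deferred */
	int64_t space;	/* total bytes managed */
} toy_vdev_t;

/* Same shape as vdev_space_update(vd, alloc_delta, defer_delta, space_delta). */
static void
toy_space_update(toy_vdev_t *vd, int64_t alloc_delta, int64_t defer_delta,
    int64_t space_delta)
{
	vd->alloc += alloc_delta;
	vd->defer += defer_delta;
	vd->space += space_delta;
}

int
main(void)
{
	toy_vdev_t vd = { 0, 0, 0 };
	int64_t smo_alloc = 1000;	/* allocated per the old space map object */
	int64_t smosync_alloc = 1400;	/* allocated per the newly synced one */
	int64_t freed_space = 300;	/* this txg's freed map */
	int64_t defer_space = 100;	/* the defer map being handed back */
	int64_t alloc_delta, defer_delta;

	/* A brand-new metaslab only grows the managed space. */
	toy_space_update(&vd, 0, 0, 1 << 20);

	/* Per-txg accounting, mirroring metaslab_sync_done(). */
	alloc_delta = smosync_alloc - smo_alloc;
	defer_delta = freed_space - defer_space;
	toy_space_update(&vd, alloc_delta + defer_delta, defer_delta, 0);

	printf("alloc=%" PRId64 " defer=%" PRId64 " space=%" PRId64 "\n",
	    vd.alloc, vd.defer, vd.space);
	return (0);
}
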
  
 -      for (int m = 0; m < vd->vdev_ms_count; m++) {
+ void
+ metaslab_sync_reassess(metaslab_group_t *mg)
+ {
+       vdev_t *vd = mg->mg_vd;
++      int m;
+       /*
+        * Re-evaluate all metaslabs which have lower offsets than the
+        * bonus area.
+        */
++      for (m = 0; m < vd->vdev_ms_count; m++) {
+               metaslab_t *msp = vd->vdev_ms[m];
+               if (msp->ms_map.sm_start > mg->mg_bonus_area)
+                       break;
+               mutex_enter(&msp->ms_lock);
+               metaslab_group_sort(mg, msp, metaslab_weight(msp));
+               mutex_exit(&msp->ms_lock);
+       }
+       /*
+        * Prefetch the next potential metaslabs
+        */
+       metaslab_prefetch(mg);
+ }
  static uint64_t
  metaslab_distance(metaslab_t *msp, dva_t *dva)
  {
@@@ -1154,9 -1517,10 +1522,10 @@@ metaslab_alloc(spa_t *spa, metaslab_cla
  {
        dva_t *dva = bp->blk_dva;
        dva_t *hintdva = hintbp->blk_dva;
 -      int error = 0;
 +      int d, error = 0;
  
        ASSERT(bp->blk_birth == 0);
+       ASSERT(BP_PHYSICAL_BIRTH(bp) == 0);
  
        spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER);
  
  metaslab_free(spa_t *spa, const blkptr_t *bp, uint64_t txg, boolean_t now)
  {
        const dva_t *dva = bp->blk_dva;
 -      int ndvas = BP_GET_NDVAS(bp);
 +      int d, ndvas = BP_GET_NDVAS(bp);
  
        ASSERT(!BP_IS_HOLE(bp));
-       ASSERT(!now || bp->blk_birth >= spa->spa_syncing_txg);
+       ASSERT(!now || bp->blk_birth >= spa_syncing_txg(spa));
  
        spa_config_enter(spa, SCL_FREE, FTAG, RW_READER);
  
index 705dda4df6e653a1937bef6b15417b1c852854a1,d7c5de0d357a2fab7e3a281ed52196aab97519c8..b0236e49f9da173182392ea62e17f63cfbc85af4
@@@ -565,27 -601,58 +601,60 @@@ spa_get_errlists(spa_t *spa, avl_tree_
            offsetof(spa_error_entry_t, se_avl));
  }
  
- /*
-  * Activate an uninitialized pool.
-  */
- static void
- spa_activate(spa_t *spa, int mode)
+ static taskq_t *
+ spa_taskq_create(spa_t *spa, const char *name, enum zti_modes mode,
+     uint_t value)
  {
-       int t, q;
+       uint_t flags = TASKQ_PREPOPULATE;
+       boolean_t batch = B_FALSE;
  
-       ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED);
+       switch (mode) {
+       case zti_mode_null:
+               return (NULL);          /* no taskq needed */
  
-       spa->spa_state = POOL_STATE_ACTIVE;
-       spa->spa_mode = mode;
+       case zti_mode_fixed:
+               ASSERT3U(value, >=, 1);
+               value = MAX(value, 1);
+               break;
+       case zti_mode_batch:
+               batch = B_TRUE;
+               flags |= TASKQ_THREADS_CPU_PCT;
+               value = zio_taskq_batch_pct;
+               break;
+       case zti_mode_online_percent:
+               flags |= TASKQ_THREADS_CPU_PCT;
+               break;
+       default:
+               panic("unrecognized mode for %s taskq (%u:%u) in "
+                   "spa_activate()",
+                   name, mode, value);
+               break;
+       }
  
-       spa->spa_normal_class = metaslab_class_create(zfs_metaslab_ops);
-       spa->spa_log_class = metaslab_class_create(zfs_metaslab_ops);
+       if (zio_taskq_sysdc && spa->spa_proc != &p0) {
+               if (batch)
+                       flags |= TASKQ_DC_BATCH;
+               return (taskq_create_sysdc(name, value, 50, INT_MAX,
+                   spa->spa_proc, zio_taskq_basedc, flags));
+       }
+       return (taskq_create_proc(name, value, maxclsyspri, 50, INT_MAX,
+           spa->spa_proc, flags));
+ }
+ static void
+ spa_create_zio_taskqs(spa_t *spa)
+ {
 -      for (int t = 0; t < ZIO_TYPES; t++) {
 -              for (int q = 0; q < ZIO_TASKQ_TYPES; q++) {
++      int t, q;
 +
 +      for (t = 0; t < ZIO_TYPES; t++) {
-               const zio_taskq_info_t *ztip = &zio_taskqs[t];
 +              for (q = 0; q < ZIO_TASKQ_TYPES; q++) {
-                       enum zti_modes mode = ztip->zti_nthreads[q].zti_mode;
-                       uint_t value = ztip->zti_nthreads[q].zti_value;
+                       const zio_taskq_info_t *ztip = &zio_taskqs[t][q];
+                       enum zti_modes mode = ztip->zti_mode;
+                       uint_t value = ztip->zti_value;
                        char name[32];
  
                        (void) snprintf(name, sizeof (name),
@@@ -660,9 -814,10 +818,10 @@@ spa_deactivate(spa_t *spa
        list_destroy(&spa->spa_config_dirty_list);
        list_destroy(&spa->spa_state_dirty_list);
  
 -      for (int t = 0; t < ZIO_TYPES; t++) {
 -              for (int q = 0; q < ZIO_TASKQ_TYPES; q++) {
 +      for (t = 0; t < ZIO_TYPES; t++) {
 +              for (q = 0; q < ZIO_TASKQ_TYPES; q++) {
-                       taskq_destroy(spa->spa_zio_taskq[t][q]);
+                       if (spa->spa_zio_taskq[t][q] != NULL)
+                               taskq_destroy(spa->spa_zio_taskq[t][q]);
                        spa->spa_zio_taskq[t][q] = NULL;
                }
        }
@@@ -1106,27 -1288,23 +1295,24 @@@ spa_check_removed(vdev_t *vd
   * that the label does not contain the most up-to-date information.
   */
  void
- spa_load_log_state(spa_t *spa)
+ spa_load_log_state(spa_t *spa, nvlist_t *nv)
  {
-       nvlist_t *nv, *nvroot, **child;
-       uint64_t is_log;
-       uint_t children;
-       vdev_t *rvd = spa->spa_root_vdev;
+       vdev_t *ovd, *rvd = spa->spa_root_vdev;
 +      int c;
  
-       VERIFY(load_nvlist(spa, spa->spa_config_object, &nv) == 0);
-       VERIFY(nvlist_lookup_nvlist(nv, ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0);
-       VERIFY(nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN,
-           &child, &children) == 0);
-       for (c = 0; c < children; c++) {
-               vdev_t *tvd = rvd->vdev_child[c];
+       /*
+        * Load the original root vdev tree from the passed config.
+        */
+       spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
+       VERIFY(spa_config_parse(spa, &ovd, nv, NULL, 0, VDEV_ALLOC_LOAD) == 0);
  
-               if (nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_LOG,
-                   &is_log) == 0 && is_log)
-                       vdev_load_log_state(tvd, child[c]);
 -      for (int c = 0; c < rvd->vdev_children; c++) {
++      for (c = 0; c < rvd->vdev_children; c++) {
+               vdev_t *cvd = rvd->vdev_child[c];
+               if (cvd->vdev_islog)
+                       vdev_load_log_state(cvd, ovd->vdev_child[c]);
        }
-       nvlist_free(nv);
+       vdev_free(ovd);
+       spa_config_exit(spa, SCL_ALL, FTAG);
  }
  
  /*
@@@ -1149,181 -1327,481 +1335,486 @@@ spa_check_logs(spa_t *spa
        return (0);
  }
  
- /*
-  * Load an existing storage pool, using the pool's builtin spa_config as a
-  * source of configuration information.
-  */
- static int
- spa_load(spa_t *spa, nvlist_t *config, spa_load_state_t state, int mosconfig)
+ static boolean_t
+ spa_passivate_log(spa_t *spa)
  {
-       int error = 0;
-       nvlist_t *nvroot = NULL;
-       vdev_t *rvd;
-       uberblock_t *ub = &spa->spa_uberblock;
-       uint64_t config_cache_txg = spa->spa_config_txg;
-       uint64_t pool_guid;
-       uint64_t version;
-       uint64_t autoreplace = 0;
-       int orig_mode = spa->spa_mode;
-       char *ereport = FM_EREPORT_ZFS_POOL;
+       vdev_t *rvd = spa->spa_root_vdev;
+       boolean_t slog_found = B_FALSE;
++      int c;
  
-       /*
-        * If this is an untrusted config, access the pool in read-only mode.
-        * This prevents things like resilvering recently removed devices.
-        */
-       if (!mosconfig)
-               spa->spa_mode = FREAD;
+       ASSERT(spa_config_held(spa, SCL_ALLOC, RW_WRITER));
  
-       ASSERT(MUTEX_HELD(&spa_namespace_lock));
+       if (!spa_has_slogs(spa))
+               return (B_FALSE);
  
-       spa->spa_load_state = state;
 -      for (int c = 0; c < rvd->vdev_children; c++) {
++      for (c = 0; c < rvd->vdev_children; c++) {
+               vdev_t *tvd = rvd->vdev_child[c];
+               metaslab_group_t *mg = tvd->vdev_mg;
  
-       if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot) ||
-           nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid)) {
-               error = EINVAL;
-               goto out;
+               if (tvd->vdev_islog) {
+                       metaslab_group_passivate(mg);
+                       slog_found = B_TRUE;
+               }
        }
  
-       /*
-        * Versioning wasn't explicitly added to the label until later, so if
-        * it's not present treat it as the initial version.
-        */
-       if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, &version) != 0)
-               version = SPA_VERSION_INITIAL;
+       return (slog_found);
+ }
  
-       (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG,
-           &spa->spa_config_txg);
+ static void
+ spa_activate_log(spa_t *spa)
+ {
+       vdev_t *rvd = spa->spa_root_vdev;
++      int c;
  
-       if ((state == SPA_LOAD_IMPORT || state == SPA_LOAD_TRYIMPORT) &&
-           spa_guid_exists(pool_guid, 0)) {
-               error = EEXIST;
-               goto out;
-       }
+       ASSERT(spa_config_held(spa, SCL_ALLOC, RW_WRITER));
  
-       spa->spa_load_guid = pool_guid;
 -      for (int c = 0; c < rvd->vdev_children; c++) {
++      for (c = 0; c < rvd->vdev_children; c++) {
+               vdev_t *tvd = rvd->vdev_child[c];
+               metaslab_group_t *mg = tvd->vdev_mg;
  
-       /*
-        * Create "The Godfather" zio to hold all async IOs
-        */
-       spa->spa_async_zio_root = zio_root(spa, NULL, NULL,
-           ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_GODFATHER);
+               if (tvd->vdev_islog)
+                       metaslab_group_activate(mg);
+       }
+ }
  
-       /*
-        * Parse the configuration into a vdev tree.  We explicitly set the
-        * value that will be returned by spa_version() since parsing the
-        * configuration requires knowing the version number.
-        */
-       spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
-       spa->spa_ubsync.ub_version = version;
-       error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, VDEV_ALLOC_LOAD);
-       spa_config_exit(spa, SCL_ALL, FTAG);
+ int
+ spa_offline_log(spa_t *spa)
+ {
+       int error = 0;
  
-       if (error != 0)
-               goto out;
+       if ((error = dmu_objset_find(spa_name(spa), zil_vdev_offline,
+           NULL, DS_FIND_CHILDREN)) == 0) {
  
-       ASSERT(spa->spa_root_vdev == rvd);
-       ASSERT(spa_guid(spa) == pool_guid);
+               /*
+                * We successfully offlined the log device, sync out the
+                * current txg so that the "stubby" block can be removed
+                * by zil_sync().
+                */
+               txg_wait_synced(spa->spa_dsl_pool, 0);
+       }
+       return (error);
+ }
  
-       /*
-        * Try to open all vdevs, loading each label in the process.
-        */
-       spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
-       error = vdev_open(rvd);
-       spa_config_exit(spa, SCL_ALL, FTAG);
-       if (error != 0)
-               goto out;
+ static void
+ spa_aux_check_removed(spa_aux_vdev_t *sav)
+ {
 -      for (int i = 0; i < sav->sav_count; i++)
++      int i;
 +
-       /*
-        * We need to validate the vdev labels against the configuration that
-        * we have in hand, which is dependent on the setting of mosconfig. If
-        * mosconfig is true then we're validating the vdev labels based on
-        * that config. Otherwise, we're validating against the cached config
-        * (zpool.cache) that was read when we loaded the zfs module, and then
-        * later we will recursively call spa_load() and validate against
-        * the vdev config.
-        */
-       spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
-       error = vdev_validate(rvd);
-       spa_config_exit(spa, SCL_ALL, FTAG);
-       if (error != 0)
-               goto out;
++      for (i = 0; i < sav->sav_count; i++)
+               spa_check_removed(sav->sav_vdevs[i]);
+ }
  
-       if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) {
-               error = ENXIO;
-               goto out;
-       }
+ void
+ spa_claim_notify(zio_t *zio)
+ {
+       spa_t *spa = zio->io_spa;
  
-       /*
-        * Find the best uberblock.
-        */
-       vdev_uberblock_load(NULL, rvd, ub);
+       if (zio->io_error)
+               return;
  
-       /*
-        * If we weren't able to find a single valid uberblock, return failure.
-        */
-       if (ub->ub_txg == 0) {
-               vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
-                   VDEV_AUX_CORRUPT_DATA);
-               error = ENXIO;
-               goto out;
-       }
+       mutex_enter(&spa->spa_props_lock);      /* any mutex will do */
+       if (spa->spa_claim_max_txg < zio->io_bp->blk_birth)
+               spa->spa_claim_max_txg = zio->io_bp->blk_birth;
+       mutex_exit(&spa->spa_props_lock);
+ }
  
-       /*
-        * If the pool is newer than the code, we can't open it.
-        */
-       if (ub->ub_version > SPA_VERSION) {
-               vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
-                   VDEV_AUX_VERSION_NEWER);
-               error = ENOTSUP;
-               goto out;
-       }
+ typedef struct spa_load_error {
+       uint64_t        sle_meta_count;
+       uint64_t        sle_data_count;
+ } spa_load_error_t;
  
-       /*
-        * If the vdev guid sum doesn't match the uberblock, we have an
-        * incomplete configuration.
-        */
-       if (rvd->vdev_guid_sum != ub->ub_guid_sum && mosconfig) {
-               vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
-                   VDEV_AUX_BAD_GUID_SUM);
-               error = ENXIO;
-               goto out;
-       }
+ static void
+ spa_load_verify_done(zio_t *zio)
+ {
+       blkptr_t *bp = zio->io_bp;
+       spa_load_error_t *sle = zio->io_private;
+       dmu_object_type_t type = BP_GET_TYPE(bp);
+       int error = zio->io_error;
  
-       /*
-        * Initialize internal SPA structures.
-        */
-       spa->spa_state = POOL_STATE_ACTIVE;
-       spa->spa_ubsync = spa->spa_uberblock;
-       spa->spa_first_txg = spa_last_synced_txg(spa) + 1;
-       error = dsl_pool_open(spa, spa->spa_first_txg, &spa->spa_dsl_pool);
        if (error) {
-               vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
-                   VDEV_AUX_CORRUPT_DATA);
-               goto out;
+               if ((BP_GET_LEVEL(bp) != 0 || dmu_ot[type].ot_metadata) &&
+                   type != DMU_OT_INTENT_LOG)
+                       atomic_add_64(&sle->sle_meta_count, 1);
+               else
+                       atomic_add_64(&sle->sle_data_count, 1);
        }
-       spa->spa_meta_objset = spa->spa_dsl_pool->dp_meta_objset;
+       zio_data_buf_free(zio->io_data, zio->io_size);
+ }
  
-       if (zap_lookup(spa->spa_meta_objset,
-           DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG,
-           sizeof (uint64_t), 1, &spa->spa_config_object) != 0) {
-               vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
-                   VDEV_AUX_CORRUPT_DATA);
-               error = EIO;
-               goto out;
-       }
+ /*ARGSUSED*/
+ static int
+ spa_load_verify_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
+     arc_buf_t *pbuf, const zbookmark_t *zb, const dnode_phys_t *dnp, void *arg)
+ {
+       if (bp != NULL) {
+               zio_t *rio = arg;
+               size_t size = BP_GET_PSIZE(bp);
+               void *data = zio_data_buf_alloc(size);
  
-       if (!mosconfig) {
-               nvlist_t *newconfig;
-               uint64_t hostid;
+               zio_nowait(zio_read(rio, spa, bp, data, size,
+                   spa_load_verify_done, rio->io_private, ZIO_PRIORITY_SCRUB,
+                   ZIO_FLAG_SPECULATIVE | ZIO_FLAG_CANFAIL |
+                   ZIO_FLAG_SCRUB | ZIO_FLAG_RAW, zb));
+       }
+       return (0);
+ }
  
-               if (load_nvlist(spa, spa->spa_config_object, &newconfig) != 0) {
-                       vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
-                           VDEV_AUX_CORRUPT_DATA);
-                       error = EIO;
-                       goto out;
+ static int
+ spa_load_verify(spa_t *spa)
+ {
+       zio_t *rio;
+       spa_load_error_t sle = { 0 };
+       zpool_rewind_policy_t policy;
+       boolean_t verify_ok = B_FALSE;
+       int error;
+       zpool_get_rewind_policy(spa->spa_config, &policy);
+       if (policy.zrp_request & ZPOOL_NEVER_REWIND)
+               return (0);
+       rio = zio_root(spa, NULL, &sle,
+           ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE);
+       error = traverse_pool(spa, spa->spa_verify_min_txg,
+           TRAVERSE_PRE | TRAVERSE_PREFETCH, spa_load_verify_cb, rio);
+       (void) zio_wait(rio);
+       spa->spa_load_meta_errors = sle.sle_meta_count;
+       spa->spa_load_data_errors = sle.sle_data_count;
+       if (!error && sle.sle_meta_count <= policy.zrp_maxmeta &&
+           sle.sle_data_count <= policy.zrp_maxdata) {
+               verify_ok = B_TRUE;
+               spa->spa_load_txg = spa->spa_uberblock.ub_txg;
+               spa->spa_load_txg_ts = spa->spa_uberblock.ub_timestamp;
+       } else {
+               spa->spa_load_max_txg = spa->spa_uberblock.ub_txg;
+       }
+       if (error) {
+               if (error != ENXIO && error != EIO)
+                       error = EIO;
+               return (error);
+       }
+       return (verify_ok ? 0 : EIO);
+ }
+ /*
+  * Find a value in the pool props object.
+  */
+ static void
+ spa_prop_find(spa_t *spa, zpool_prop_t prop, uint64_t *val)
+ {
+       (void) zap_lookup(spa->spa_meta_objset, spa->spa_pool_props_object,
+           zpool_prop_to_name(prop), sizeof (uint64_t), 1, val);
+ }
+ /*
+  * Find a value in the pool directory object.
+  */
+ static int
+ spa_dir_prop(spa_t *spa, const char *name, uint64_t *val)
+ {
+       return (zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+           name, sizeof (uint64_t), 1, val));
+ }
+ static int
+ spa_vdev_err(vdev_t *vdev, vdev_aux_t aux, int err)
+ {
+       vdev_set_state(vdev, B_TRUE, VDEV_STATE_CANT_OPEN, aux);
+       return (err);
+ }
+ /*
+  * Fix up config after a partly-completed split.  This is done with the
+  * ZPOOL_CONFIG_SPLIT nvlist.  Both the splitting pool and the split-off
+  * pool have that entry in their config, but only the splitting one contains
+  * a list of all the guids of the vdevs that are being split off.
+  *
+  * This function determines what to do with that list: either rejoin
+  * all the disks to the pool, or complete the splitting process.  To attempt
+  * the rejoin, each disk that is offlined is marked online again, and
+  * we do a reopen() call.  If the vdev label for every disk that was
+  * marked online indicates it was successfully split off (VDEV_AUX_SPLIT_POOL)
+  * then we call vdev_split() on each disk, and complete the split.
+  *
+  * Otherwise we leave the config alone, with all the vdevs in place in
+  * the original pool.
+  */
+ static void
+ spa_try_repair(spa_t *spa, nvlist_t *config)
+ {
+       uint_t extracted;
+       uint64_t *glist;
+       uint_t i, gcount;
+       nvlist_t *nvl;
+       vdev_t **vd;
+       boolean_t attempt_reopen;
+       if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_SPLIT, &nvl) != 0)
+               return;
+       /* check that the config is complete */
+       if (nvlist_lookup_uint64_array(nvl, ZPOOL_CONFIG_SPLIT_LIST,
+           &glist, &gcount) != 0)
+               return;
+       vd = kmem_zalloc(gcount * sizeof (vdev_t *), KM_SLEEP);
+       /* attempt to online all the vdevs & validate */
+       attempt_reopen = B_TRUE;
+       for (i = 0; i < gcount; i++) {
+               if (glist[i] == 0)      /* vdev is hole */
+                       continue;
+               vd[i] = spa_lookup_by_guid(spa, glist[i], B_FALSE);
+               if (vd[i] == NULL) {
+                       /*
+                        * Don't bother attempting to reopen the disks;
+                        * just do the split.
+                        */
+                       attempt_reopen = B_FALSE;
+               } else {
+                       /* attempt to re-online it */
+                       vd[i]->vdev_offline = B_FALSE;
+               }
+       }
+       if (attempt_reopen) {
+               vdev_reopen(spa->spa_root_vdev);
+               /* check each device to see what state it's in */
+               for (extracted = 0, i = 0; i < gcount; i++) {
+                       if (vd[i] != NULL &&
+                           vd[i]->vdev_stat.vs_aux != VDEV_AUX_SPLIT_POOL)
+                               break;
+                       ++extracted;
                }
+       }
+       /*
+        * If every disk has been moved to the new pool, or if we never
+        * even attempted to look at them, then we split them off for
+        * good.
+        */
+       if (!attempt_reopen || gcount == extracted) {
+               for (i = 0; i < gcount; i++)
+                       if (vd[i] != NULL)
+                               vdev_split(vd[i]);
+               vdev_reopen(spa->spa_root_vdev);
+       }
+       kmem_free(vd, gcount * sizeof (vdev_t *));
+ }
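
When spa_try_repair() above does attempt the reopen, the decision is all-or-nothing: every vdev named in the split list must come back marked VDEV_AUX_SPLIT_POOL, otherwise everything stays in the original pool (and if the reopen could not even be attempted, the split is completed regardless). A toy sketch of the all-or-nothing check (hypothetical names):

#include <stdio.h>

#define	GCOUNT	4

/* 1 == the reopened label reported VDEV_AUX_SPLIT_POOL, 0 == anything else. */
static int
split_complete(const int *split_ok, int gcount)
{
	int i, extracted = 0;

	for (i = 0; i < gcount; i++) {
		if (!split_ok[i])
			break;		/* one holdout cancels the split */
		extracted++;
	}
	return (extracted == gcount);
}

int
main(void)
{
	int all_moved[GCOUNT] = { 1, 1, 1, 1 };
	int one_back[GCOUNT] = { 1, 0, 1, 1 };

	printf("all moved  -> %s\n",
	    split_complete(all_moved, GCOUNT) ? "complete split" : "rejoin");
	printf("one failed -> %s\n",
	    split_complete(one_back, GCOUNT) ? "complete split" : "rejoin");
	return (0);
}
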
+ static int
+ spa_load(spa_t *spa, spa_load_state_t state, spa_import_type_t type,
+     boolean_t mosconfig)
+ {
+       nvlist_t *config = spa->spa_config;
+       char *ereport = FM_EREPORT_ZFS_POOL;
+       int error;
+       uint64_t pool_guid;
+       nvlist_t *nvl;
+       if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid))
+               return (EINVAL);
  
-               if (!spa_is_root(spa) && nvlist_lookup_uint64(newconfig,
+       /*
+        * Versioning wasn't explicitly added to the label until later, so if
+        * it's not present treat it as the initial version.
+        */
+       if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION,
+           &spa->spa_ubsync.ub_version) != 0)
+               spa->spa_ubsync.ub_version = SPA_VERSION_INITIAL;
+       (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG,
+           &spa->spa_config_txg);
+       if ((state == SPA_LOAD_IMPORT || state == SPA_LOAD_TRYIMPORT) &&
+           spa_guid_exists(pool_guid, 0)) {
+               error = EEXIST;
+       } else {
+               spa->spa_load_guid = pool_guid;
+               if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_SPLIT,
+                   &nvl) == 0) {
+                       VERIFY(nvlist_dup(nvl, &spa->spa_config_splitting,
+                           KM_SLEEP) == 0);
+               }
+               error = spa_load_impl(spa, pool_guid, config, state, type,
+                   mosconfig, &ereport);
+       }
+       spa->spa_minref = refcount_count(&spa->spa_refcount);
+       if (error && error != EBADF)
+               zfs_ereport_post(ereport, spa, NULL, NULL, 0, 0);
+       spa->spa_load_state = error ? SPA_LOAD_ERROR : SPA_LOAD_NONE;
+       spa->spa_ena = 0;
+       return (error);
+ }
+ /*
+  * Load an existing storage pool, using the pool's builtin spa_config as a
+  * source of configuration information.
+  */
+ static int
+ spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config,
+     spa_load_state_t state, spa_import_type_t type, boolean_t mosconfig,
+     char **ereport)
+ {
+       int error = 0;
+       nvlist_t *nvroot = NULL;
+       vdev_t *rvd;
+       uberblock_t *ub = &spa->spa_uberblock;
+       uint64_t config_cache_txg = spa->spa_config_txg;
+       int orig_mode = spa->spa_mode;
+       int parse;
+       uint64_t obj;
++      int c;
+       /*
+        * If this is an untrusted config, access the pool in read-only mode.
+        * This prevents things like resilvering recently removed devices.
+        */
+       if (!mosconfig)
+               spa->spa_mode = FREAD;
+       ASSERT(MUTEX_HELD(&spa_namespace_lock));
+       spa->spa_load_state = state;
+       if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot))
+               return (EINVAL);
+       parse = (type == SPA_IMPORT_EXISTING ?
+           VDEV_ALLOC_LOAD : VDEV_ALLOC_SPLIT);
+       /*
+        * Create "The Godfather" zio to hold all async IOs
+        */
+       spa->spa_async_zio_root = zio_root(spa, NULL, NULL,
+           ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_GODFATHER);
+       /*
+        * Parse the configuration into a vdev tree.  We explicitly set the
+        * value that will be returned by spa_version() since parsing the
+        * configuration requires knowing the version number.
+        */
+       spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
+       error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, parse);
+       spa_config_exit(spa, SCL_ALL, FTAG);
+       if (error != 0)
+               return (error);
+       ASSERT(spa->spa_root_vdev == rvd);
+       if (type != SPA_IMPORT_ASSEMBLE) {
+               ASSERT(spa_guid(spa) == pool_guid);
+       }
+       /*
+        * Try to open all vdevs, loading each label in the process.
+        */
+       spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
+       error = vdev_open(rvd);
+       spa_config_exit(spa, SCL_ALL, FTAG);
+       if (error != 0)
+               return (error);
+       /*
+        * We need to validate the vdev labels against the configuration that
+        * we have in hand, which is dependent on the setting of mosconfig. If
+        * mosconfig is true then we're validating the vdev labels based on
+        * that config.  Otherwise, we're validating against the cached config
+        * (zpool.cache) that was read when we loaded the zfs module, and then
+        * later we will recursively call spa_load() and validate against
+        * the vdev config.
+        *
+        * If we're assembling a new pool that's been split off from an
+        * existing pool, the labels haven't yet been updated so we skip
+        * validation for now.
+        */
+       if (type != SPA_IMPORT_ASSEMBLE) {
+               spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
+               error = vdev_validate(rvd);
+               spa_config_exit(spa, SCL_ALL, FTAG);
+               if (error != 0)
+                       return (error);
+               if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN)
+                       return (ENXIO);
+       }
+       /*
+        * Find the best uberblock.
+        */
+       vdev_uberblock_load(NULL, rvd, ub);
+       /*
+        * If we weren't able to find a single valid uberblock, return failure.
+        */
+       if (ub->ub_txg == 0)
+               return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, ENXIO));
+       /*
+        * If the pool is newer than the code, we can't open it.
+        */
+       if (ub->ub_version > SPA_VERSION)
+               return (spa_vdev_err(rvd, VDEV_AUX_VERSION_NEWER, ENOTSUP));
+       /*
+        * If the vdev guid sum doesn't match the uberblock, we have an
+        * incomplete configuration.
+        */
+       if (mosconfig && type != SPA_IMPORT_ASSEMBLE &&
+           rvd->vdev_guid_sum != ub->ub_guid_sum)
+               return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM, ENXIO));
+       if (type != SPA_IMPORT_ASSEMBLE && spa->spa_config_splitting) {
+               spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
+               spa_try_repair(spa, config);
+               spa_config_exit(spa, SCL_ALL, FTAG);
+               nvlist_free(spa->spa_config_splitting);
+               spa->spa_config_splitting = NULL;
+       }
+       /*
+        * Initialize internal SPA structures.
+        */
+       spa->spa_state = POOL_STATE_ACTIVE;
+       spa->spa_ubsync = spa->spa_uberblock;
+       spa->spa_verify_min_txg = spa->spa_extreme_rewind ?
+           TXG_INITIAL - 1 : spa_last_synced_txg(spa) - TXG_DEFER_SIZE - 1;
+       spa->spa_first_txg = spa->spa_last_ubsync_txg ?
+           spa->spa_last_ubsync_txg : spa_last_synced_txg(spa) + 1;
+       spa->spa_claim_max_txg = spa->spa_first_txg;
+       spa->spa_prev_software_version = ub->ub_software_version;
+       error = dsl_pool_open(spa, spa->spa_first_txg, &spa->spa_dsl_pool);
+       if (error)
+               return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
+       spa->spa_meta_objset = spa->spa_dsl_pool->dp_meta_objset;
+       if (spa_dir_prop(spa, DMU_POOL_CONFIG, &spa->spa_config_object) != 0)
+               return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
+       if (!mosconfig) {
+               uint64_t hostid;
+               nvlist_t *policy = NULL, *nvconfig;
+               if (load_nvlist(spa, spa->spa_config_object, &nvconfig) != 0)
+                       return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
+               if (!spa_is_root(spa) && nvlist_lookup_uint64(nvconfig,
                    ZPOOL_CONFIG_HOSTID, &hostid) == 0) {
                        char *hostname;
                        unsigned long myhostid = 0;
         * Check the state of the root vdev.  If it can't be opened, it
         * indicates one or more toplevel vdevs are faulted.
         */
-       if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) {
-               error = ENXIO;
-               goto out;
+       if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN)
+               return (ENXIO);
+       /*
+        * Load the DDTs (dedup tables).
+        */
+       error = ddt_load(spa);
+       if (error != 0)
+               return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
+       spa_update_dspace(spa);
+       if (state != SPA_LOAD_TRYIMPORT) {
+               error = spa_load_verify(spa);
+               if (error)
+                       return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA,
+                           error));
        }
  
-       if (spa_writeable(spa)) {
+       /*
+        * Load the intent log state and check log integrity.  If we're
+        * assembling a pool from a split, the log is not transferred over.
+        */
+       if (type != SPA_IMPORT_ASSEMBLE) {
+               nvlist_t *nvconfig;
+               if (load_nvlist(spa, spa->spa_config_object, &nvconfig) != 0)
+                       return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
+               VERIFY(nvlist_lookup_nvlist(nvconfig, ZPOOL_CONFIG_VDEV_TREE,
+                   &nvroot) == 0);
+               spa_load_log_state(spa, nvroot);
+               nvlist_free(nvconfig);
+               if (spa_check_logs(spa)) {
+                       *ereport = FM_EREPORT_ZFS_LOG_REPLAY;
+                       return (spa_vdev_err(rvd, VDEV_AUX_BAD_LOG, ENXIO));
+               }
+       }
+       if (spa_writeable(spa) && (state == SPA_LOAD_RECOVER ||
+           spa->spa_load_max_txg == UINT64_MAX)) {
                dmu_tx_t *tx;
                int need_update = B_FALSE;
 +              int c;
  
                ASSERT(state != SPA_LOAD_TRYIMPORT);
  
                    zil_claim, tx, DS_FIND_CHILDREN);
                dmu_tx_commit(tx);
  
-               spa->spa_log_state = SPA_LOG_GOOD;
-               spa->spa_sync_on = B_TRUE;
-               txg_sync_start(spa->spa_dsl_pool);
+               spa->spa_claiming = B_FALSE;
+               spa_set_log_state(spa, SPA_LOG_GOOD);
+               spa->spa_sync_on = B_TRUE;
+               txg_sync_start(spa->spa_dsl_pool);
+               /*
+                * Wait for all claims to sync.  We sync up to the highest
+                * claimed log block birth time so that claimed log blocks
+                * don't appear to be from the future.  spa_claim_max_txg
+                * will have been set for us by either zil_check_log_chain()
+                * (invoked from spa_check_logs()) or zil_claim() above.
+                */
+               txg_wait_synced(spa->spa_dsl_pool, spa->spa_claim_max_txg);
+               /*
+                * If the config cache is stale, or we have uninitialized
+                * metaslabs (see spa_vdev_add()), then update the config.
+                *
+                * If spa_load_verbatim is true, trust the current
+                * in-core spa_config and update the disk labels.
+                */
+               if (config_cache_txg != spa->spa_config_txg ||
+                   state == SPA_LOAD_IMPORT || spa->spa_load_verbatim ||
+                   state == SPA_LOAD_RECOVER)
+                       need_update = B_TRUE;
 -              for (int c = 0; c < rvd->vdev_children; c++)
++              for (c = 0; c < rvd->vdev_children; c++)
+                       if (rvd->vdev_child[c]->vdev_ms_array == 0)
+                               need_update = B_TRUE;
+               /*
+                * Update the config cache asynchronously in case we're the
+                * root pool, in which case the config cache isn't writable yet.
+                */
+               if (need_update)
+                       spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);
+               /*
+                * Check all DTLs to see if anything needs resilvering.
+                */
+               if (!dsl_scan_resilvering(spa->spa_dsl_pool) &&
+                   vdev_resilver_needed(rvd, NULL, NULL))
+                       spa_async_request(spa, SPA_ASYNC_RESILVER);
+               /*
+                * Delete any inconsistent datasets.
+                */
+               (void) dmu_objset_find(spa_name(spa),
+                   dsl_destroy_inconsistent, NULL, DS_FIND_CHILDREN);
+               /*
+                * Clean up any stale temporary dataset userrefs.
+                */
+               dsl_pool_clean_tmp_userrefs(spa->spa_dsl_pool);
+       }
+       return (0);
+ }
+ static int
+ spa_load_retry(spa_t *spa, spa_load_state_t state, int mosconfig)
+ {
+       spa_unload(spa);
+       spa_deactivate(spa);
+       spa->spa_load_max_txg--;
+       spa_activate(spa, spa_mode_global);
+       spa_async_suspend(spa);
+       return (spa_load(spa, state, SPA_IMPORT_EXISTING, mosconfig));
+ }
+ static int
+ spa_load_best(spa_t *spa, spa_load_state_t state, int mosconfig,
+     uint64_t max_request, int rewind_flags)
+ {
+       nvlist_t *config = NULL;
+       int load_error, rewind_error;
+       uint64_t safe_rewind_txg;
+       uint64_t min_txg;
+       if (spa->spa_load_txg && state == SPA_LOAD_RECOVER) {
+               spa->spa_load_max_txg = spa->spa_load_txg;
+               spa_set_log_state(spa, SPA_LOG_CLEAR);
+       } else {
+               spa->spa_load_max_txg = max_request;
+       }
  
-               /*
-                * Wait for all claims to sync.
-                */
-               txg_wait_synced(spa->spa_dsl_pool, 0);
+       load_error = rewind_error = spa_load(spa, state, SPA_IMPORT_EXISTING,
+           mosconfig);
+       if (load_error == 0)
+               return (0);
  
-               /*
-                * If the config cache is stale, or we have uninitialized
-                * metaslabs (see spa_vdev_add()), then update the config.
-                *
-                * If spa_load_verbatim is true, trust the current
-                * in-core spa_config and update the disk labels.
-                */
-               if (config_cache_txg != spa->spa_config_txg ||
-                   state == SPA_LOAD_IMPORT || spa->spa_load_verbatim)
-                       need_update = B_TRUE;
+       if (spa->spa_root_vdev != NULL)
+               config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);
  
-               for (c = 0; c < rvd->vdev_children; c++)
-                       if (rvd->vdev_child[c]->vdev_ms_array == 0)
-                               need_update = B_TRUE;
+       spa->spa_last_ubsync_txg = spa->spa_uberblock.ub_txg;
+       spa->spa_last_ubsync_txg_ts = spa->spa_uberblock.ub_timestamp;
  
-               /*
-                * Update the config cache asynchronously in case we're the
-                * root pool, in which case the config cache isn't writable yet.
-                */
-               if (need_update)
-                       spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);
+       if (rewind_flags & ZPOOL_NEVER_REWIND) {
+               nvlist_free(config);
+               return (load_error);
+       }
  
-               /*
-                * Check all DTLs to see if anything needs resilvering.
-                */
-               if (vdev_resilver_needed(rvd, NULL, NULL))
-                       spa_async_request(spa, SPA_ASYNC_RESILVER);
+       /* Price of rolling back is discarding txgs, including log */
+       if (state == SPA_LOAD_RECOVER)
+               spa_set_log_state(spa, SPA_LOG_CLEAR);
+       spa->spa_load_max_txg = spa->spa_last_ubsync_txg;
+       safe_rewind_txg = spa->spa_last_ubsync_txg - TXG_DEFER_SIZE;
+       min_txg = (rewind_flags & ZPOOL_EXTREME_REWIND) ?
+           TXG_INITIAL : safe_rewind_txg;
+       /*
+        * Continue as long as we're finding errors, we're still within
+        * the acceptable rewind range, and we're still finding uberblocks
+        */
+       while (rewind_error && spa->spa_uberblock.ub_txg >= min_txg &&
+           spa->spa_uberblock.ub_txg <= spa->spa_load_max_txg) {
+               if (spa->spa_load_max_txg < safe_rewind_txg)
+                       spa->spa_extreme_rewind = B_TRUE;
+               rewind_error = spa_load_retry(spa, state, mosconfig);
        }
  
-       error = 0;
- out:
-       spa->spa_minref = refcount_count(&spa->spa_refcount);
-       if (error && error != EBADF)
-               zfs_ereport_post(ereport, spa, NULL, NULL, 0, 0);
-       spa->spa_load_state = SPA_LOAD_NONE;
-       spa->spa_ena = 0;
+       if (config)
+               spa_rewind_data_to_nvlist(spa, config);
  
-       return (error);
+       spa->spa_extreme_rewind = B_FALSE;
+       spa->spa_load_max_txg = UINT64_MAX;
+       if (config && (rewind_error || state != SPA_LOAD_RECOVER))
+               spa_config_set(spa, config);
+       return (state == SPA_LOAD_RECOVER ? rewind_error : load_error);
  }
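
spa_load_best() above implements the rewind: if the initial load fails, it keeps calling spa_load_retry(), which lowers spa_load_max_txg by one and reloads, for as long as errors persist and the selected uberblock txg stays inside the acceptable window. A stripped-down sketch of that control flow with a stubbed-out load (hypothetical names, not the real import path):

#include <stdio.h>
#include <stdint.h>
#include <inttypes.h>

/*
 * Stub "load": pretend every txg above 95 is damaged, so the rewind has
 * to walk back until it reaches a loadable txg.
 */
static int
toy_load(uint64_t txg)
{
	return (txg > 95 ? -1 : 0);	/* nonzero means load error */
}

/* Shape of the spa_load_best()/spa_load_retry() rewind loop. */
static int
toy_load_best(uint64_t start_txg, uint64_t min_txg)
{
	uint64_t max_txg = start_txg;
	int error = toy_load(max_txg);

	while (error && max_txg > min_txg) {
		max_txg--;			/* spa_load_retry() */
		error = toy_load(max_txg);
		printf("retry at txg %" PRIu64 " -> %s\n",
		    max_txg, error ? "error" : "ok");
	}
	return (error);
}

int
main(void)
{
	return (toy_load_best(100, 90));
}
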
  
  /*
@@@ -2950,10 -3575,20 +3592,20 @@@ spa_vdev_add(spa_t *spa, nvlist_t *nvro
        /*
         * Transfer each new top-level vdev from vd to rvd.
         */
 -      for (int c = 0; c < vd->vdev_children; c++) {
 +      for (c = 0; c < vd->vdev_children; c++) {
+               /*
+                * Set the vdev id to the first hole, if one exists.
+                */
+               for (id = 0; id < rvd->vdev_children; id++) {
+                       if (rvd->vdev_child[id]->vdev_ishole) {
+                               vdev_free(rvd->vdev_child[id]);
+                               break;
+                       }
+               }
                tvd = vd->vdev_child[c];
                vdev_remove_child(vd, tvd);
-               tvd->vdev_id = rvd->vdev_children;
+               tvd->vdev_id = id;
                vdev_add_child(rvd, tvd);
                vdev_config_dirty(tvd);
        }
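
The spa_vdev_add() hunk above changes how a new top-level vdev picks its id: rather than always appending at rvd->vdev_children, it first scans the existing children for a hole vdev, frees that hole, and reuses its slot. A small self-contained sketch of the slot-reuse idea (hypothetical names):

#include <stdio.h>

#define	NSLOTS	5
#define	HOLE	(-1)	/* marks a removed/hole top-level vdev */

/* Return the first hole slot, or the index one past the end if there is none. */
static int
pick_vdev_id(const int *slots, int count)
{
	int id;

	for (id = 0; id < count; id++) {
		if (slots[id] == HOLE)
			return (id);	/* reuse the hole's id */
	}
	return (count);			/* no hole: append at the end */
}

int
main(void)
{
	int slots[NSLOTS] = { 10, HOLE, 12, 13, HOLE };

	printf("new top-level vdev gets id %d\n",
	    pick_vdev_id(slots, NSLOTS));	/* prints 1 */
	return (0);
}
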
@@@ -3201,7 -3841,7 +3858,8 @@@ spa_vdev_detach(spa_t *spa, uint64_t gu
        boolean_t unspare = B_FALSE;
        uint64_t unspare_guid;
        size_t len;
+       char *vdpath;
 +      int t;
  
        txg = spa_vdev_enter(spa);
  
                (void) spa_vdev_remove(spa, unspare_guid, B_TRUE);
        }
  
-       /*
-        * If the parent mirror/replacing vdev only has one child,
-        * the parent is no longer needed.  Remove it from the tree.
-        */
-       if (pvd->vdev_children == 1)
-               vdev_remove_parent(cvd);
+       /*
+        * If the parent mirror/replacing vdev only has one child,
+        * the parent is no longer needed.  Remove it from the tree.
+        */
+       if (pvd->vdev_children == 1)
+               vdev_remove_parent(cvd);
+       /*
+        * We don't set tvd until now because the parent we just removed
+        * may have been the previous top-level vdev.
+        */
+       tvd = cvd->vdev_top;
+       ASSERT(tvd->vdev_parent == rvd);
+       /*
+        * Reevaluate the parent vdev state.
+        */
+       vdev_propagate_state(cvd);
+       /*
+        * If the 'autoexpand' property is set on the pool then automatically
+        * try to expand the size of the pool. For example if the device we
+        * just detached was smaller than the others, it may be possible to
+        * add metaslabs (i.e. grow the pool). We need to reopen the vdev
+        * first so that we can obtain the updated sizes of the leaf vdevs.
+        */
+       if (spa->spa_autoexpand) {
+               vdev_reopen(tvd);
+               vdev_expand(tvd, txg);
+       }
+       vdev_config_dirty(tvd);
+       /*
+        * Mark vd's DTL as dirty in this txg.  vdev_dtl_sync() will see that
+        * vd->vdev_detached is set and free vd's DTL object in syncing context.
+        * But first make sure we're not on any *other* txg's DTL list, to
+        * prevent vd from being accessed after it's freed.
+        */
+       vdpath = spa_strdup(vd->vdev_path);
 -      for (int t = 0; t < TXG_SIZE; t++)
++      for (t = 0; t < TXG_SIZE; t++)
+               (void) txg_list_remove_this(&tvd->vdev_dtl_list, vd, t);
+       vd->vdev_detached = B_TRUE;
+       vdev_dirty(tvd, VDD_DTL, vd, txg);
+       spa_event_notify(spa, vd, ESC_ZFS_VDEV_REMOVE);
+       error = spa_vdev_exit(spa, vd, txg, 0);
+       spa_history_log_internal(LOG_POOL_VDEV_DETACH, spa, NULL,
+           "vdev=%s", vdpath);
+       spa_strfree(vdpath);
+       /*
+        * If this was the removal of the original device in a hot spare vdev,
+        * then we want to go through and remove the device from the hot spare
+        * list of every other pool.
+        */
+       if (unspare) {
+               spa_t *myspa = spa;
+               spa = NULL;
+               mutex_enter(&spa_namespace_lock);
+               while ((spa = spa_next(spa)) != NULL) {
+                       if (spa->spa_state != POOL_STATE_ACTIVE)
+                               continue;
+                       if (spa == myspa)
+                               continue;
+                       spa_open_ref(spa, FTAG);
+                       mutex_exit(&spa_namespace_lock);
+                       (void) spa_vdev_remove(spa, unspare_guid,
+                           B_TRUE);
+                       mutex_enter(&spa_namespace_lock);
+                       spa_close(spa, FTAG);
+               }
+               mutex_exit(&spa_namespace_lock);
+       }
+       return (error);
+ }
+ /*
+  * Split a set of devices from their mirrors, and create a new pool from them.
+  */
+ int
+ spa_vdev_split_mirror(spa_t *spa, char *newname, nvlist_t *config,
+     nvlist_t *props, boolean_t exp)
+ {
+       int error = 0;
+       uint64_t txg, *glist;
+       spa_t *newspa;
+       uint_t c, children, lastlog;
+       nvlist_t **child, *nvl, *tmp;
+       dmu_tx_t *tx;
+       char *altroot = NULL;
+       vdev_t *rvd, **vml = NULL;                      /* vdev modify list */
+       boolean_t activate_slog;
+       if (!spa_writeable(spa))
+               return (EROFS);
+       txg = spa_vdev_enter(spa);
+       /* clear the log and flush everything up to now */
+       activate_slog = spa_passivate_log(spa);
+       (void) spa_vdev_config_exit(spa, NULL, txg, 0, FTAG);
+       error = spa_offline_log(spa);
+       txg = spa_vdev_config_enter(spa);
+       if (activate_slog)
+               spa_activate_log(spa);
+       if (error != 0)
+               return (spa_vdev_exit(spa, NULL, txg, error));
+       /* check new spa name before going any further */
+       if (spa_lookup(newname) != NULL)
+               return (spa_vdev_exit(spa, NULL, txg, EEXIST));
+       /*
+        * scan through all the children to ensure they're all mirrors
+        */
+       if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvl) != 0 ||
+           nvlist_lookup_nvlist_array(nvl, ZPOOL_CONFIG_CHILDREN, &child,
+           &children) != 0)
+               return (spa_vdev_exit(spa, NULL, txg, EINVAL));
+       /* first, check to ensure we've got the right child count */
+       rvd = spa->spa_root_vdev;
+       lastlog = 0;
+       for (c = 0; c < rvd->vdev_children; c++) {
+               vdev_t *vd = rvd->vdev_child[c];
+               /* don't count the holes & logs as children */
+               if (vd->vdev_islog || vd->vdev_ishole) {
+                       if (lastlog == 0)
+                               lastlog = c;
+                       continue;
+               }
+               lastlog = 0;
+       }
+       if (children != (lastlog != 0 ? lastlog : rvd->vdev_children))
+               return (spa_vdev_exit(spa, NULL, txg, EINVAL));
+       /* next, ensure no spare or cache devices are part of the split */
+       if (nvlist_lookup_nvlist(nvl, ZPOOL_CONFIG_SPARES, &tmp) == 0 ||
+           nvlist_lookup_nvlist(nvl, ZPOOL_CONFIG_L2CACHE, &tmp) == 0)
+               return (spa_vdev_exit(spa, NULL, txg, EINVAL));
+       vml = kmem_zalloc(children * sizeof (vdev_t *), KM_SLEEP);
+       glist = kmem_zalloc(children * sizeof (uint64_t), KM_SLEEP);
+       /* then, loop over each vdev and validate it */
+       for (c = 0; c < children; c++) {
+               uint64_t is_hole = 0;
+               (void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_HOLE,
+                   &is_hole);
+               if (is_hole != 0) {
+                       if (spa->spa_root_vdev->vdev_child[c]->vdev_ishole ||
+                           spa->spa_root_vdev->vdev_child[c]->vdev_islog) {
+                               continue;
+                       } else {
+                               error = EINVAL;
+                               break;
+                       }
+               }
+               /* which disk is going to be split? */
+               if (nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_GUID,
+                   &glist[c]) != 0) {
+                       error = EINVAL;
+                       break;
+               }
+               /* look it up in the spa */
+               vml[c] = spa_lookup_by_guid(spa, glist[c], B_FALSE);
+               if (vml[c] == NULL) {
+                       error = ENODEV;
+                       break;
+               }
+               /* make sure there's nothing stopping the split */
+               if (vml[c]->vdev_parent->vdev_ops != &vdev_mirror_ops ||
+                   vml[c]->vdev_islog ||
+                   vml[c]->vdev_ishole ||
+                   vml[c]->vdev_isspare ||
+                   vml[c]->vdev_isl2cache ||
+                   !vdev_writeable(vml[c]) ||
+                   vml[c]->vdev_children != 0 ||
+                   vml[c]->vdev_state != VDEV_STATE_HEALTHY ||
+                   c != spa->spa_root_vdev->vdev_child[c]->vdev_id) {
+                       error = EINVAL;
+                       break;
+               }
+               if (vdev_dtl_required(vml[c])) {
+                       error = EBUSY;
+                       break;
+               }
+               /* we need certain info from the top level */
+               VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_METASLAB_ARRAY,
+                   vml[c]->vdev_top->vdev_ms_array) == 0);
+               VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_METASLAB_SHIFT,
+                   vml[c]->vdev_top->vdev_ms_shift) == 0);
+               VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_ASIZE,
+                   vml[c]->vdev_top->vdev_asize) == 0);
+               VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_ASHIFT,
+                   vml[c]->vdev_top->vdev_ashift) == 0);
+       }
+       if (error != 0) {
+               kmem_free(vml, children * sizeof (vdev_t *));
+               kmem_free(glist, children * sizeof (uint64_t));
+               return (spa_vdev_exit(spa, NULL, txg, error));
+       }
+       /* stop writers from using the disks */
+       for (c = 0; c < children; c++) {
+               if (vml[c] != NULL)
+                       vml[c]->vdev_offline = B_TRUE;
+       }
+       vdev_reopen(spa->spa_root_vdev);
  
        /*
-        * We don't set tvd until now because the parent we just removed
-        * may have been the previous top-level vdev.
+        * Temporarily record the splitting vdevs in the spa config.  This
+        * will disappear once the config is regenerated.
         */
-       tvd = cvd->vdev_top;
-       ASSERT(tvd->vdev_parent == rvd);
+       VERIFY(nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP) == 0);
+       VERIFY(nvlist_add_uint64_array(nvl, ZPOOL_CONFIG_SPLIT_LIST,
+           glist, children) == 0);
+       kmem_free(glist, children * sizeof (uint64_t));
  
-       /*
-        * Reevaluate the parent vdev state.
-        */
-       vdev_propagate_state(cvd);
+       mutex_enter(&spa->spa_props_lock);
+       VERIFY(nvlist_add_nvlist(spa->spa_config, ZPOOL_CONFIG_SPLIT,
+           nvl) == 0);
+       mutex_exit(&spa->spa_props_lock);
+       spa->spa_config_splitting = nvl;
+       vdev_config_dirty(spa->spa_root_vdev);
+       /* configure and create the new pool */
+       VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, newname) == 0);
+       VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE,
+           exp ? POOL_STATE_EXPORTED : POOL_STATE_ACTIVE) == 0);
+       VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_VERSION,
+           spa_version(spa)) == 0);
+       VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_TXG,
+           spa->spa_config_txg) == 0);
+       VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_GUID,
+           spa_generate_guid(NULL)) == 0);
+       (void) nvlist_lookup_string(props,
+           zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot);
  
-       /*
-        * If the 'autoexpand' property is set on the pool then automatically
-        * try to expand the size of the pool. For example if the device we
-        * just detached was smaller than the others, it may be possible to
-        * add metaslabs (i.e. grow the pool). We need to reopen the vdev
-        * first so that we can obtain the updated sizes of the leaf vdevs.
-        */
-       if (spa->spa_autoexpand) {
-               vdev_reopen(tvd);
-               vdev_expand(tvd, txg);
+       /* add the new pool to the namespace */
+       newspa = spa_add(newname, config, altroot);
+       newspa->spa_config_txg = spa->spa_config_txg;
+       spa_set_log_state(newspa, SPA_LOG_CLEAR);
+       /* release the spa config lock, retaining the namespace lock */
+       spa_vdev_config_exit(spa, NULL, txg, 0, FTAG);
+       if (zio_injection_enabled)
+               zio_handle_panic_injection(spa, FTAG, 1);
+       spa_activate(newspa, spa_mode_global);
+       spa_async_suspend(newspa);
+       /* create the new pool from the disks of the original pool */
+       error = spa_load(newspa, SPA_LOAD_IMPORT, SPA_IMPORT_ASSEMBLE, B_TRUE);
+       if (error)
+               goto out;
+       /* if that worked, generate a real config for the new pool */
+       if (newspa->spa_root_vdev != NULL) {
+               VERIFY(nvlist_alloc(&newspa->spa_config_splitting,
+                   NV_UNIQUE_NAME, KM_SLEEP) == 0);
+               VERIFY(nvlist_add_uint64(newspa->spa_config_splitting,
+                   ZPOOL_CONFIG_SPLIT_GUID, spa_guid(spa)) == 0);
+               spa_config_set(newspa, spa_config_generate(newspa, NULL, -1ULL,
+                   B_TRUE));
        }
  
-       vdev_config_dirty(tvd);
+       /* set the props */
+       if (props != NULL) {
+               spa_configfile_set(newspa, props, B_FALSE);
+               error = spa_prop_set(newspa, props);
+               if (error)
+                       goto out;
+       }
  
-       /*
-        * Mark vd's DTL as dirty in this txg.  vdev_dtl_sync() will see that
-        * vd->vdev_detached is set and free vd's DTL object in syncing context.
-        * But first make sure we're not on any *other* txg's DTL list, to
-        * prevent vd from being accessed after it's freed.
-        */
-       for (t = 0; t < TXG_SIZE; t++)
-               (void) txg_list_remove_this(&tvd->vdev_dtl_list, vd, t);
-       vd->vdev_detached = B_TRUE;
-       vdev_dirty(tvd, VDD_DTL, vd, txg);
+       /* flush everything */
+       txg = spa_vdev_config_enter(newspa);
+       vdev_config_dirty(newspa->spa_root_vdev);
+       (void) spa_vdev_config_exit(newspa, NULL, txg, 0, FTAG);
  
-       spa_event_notify(spa, vd, ESC_ZFS_VDEV_REMOVE);
+       if (zio_injection_enabled)
+               zio_handle_panic_injection(spa, FTAG, 2);
  
-       error = spa_vdev_exit(spa, vd, txg, 0);
+       spa_async_resume(newspa);
  
-       /*
-        * If this was the removal of the original device in a hot spare vdev,
-        * then we want to go through and remove the device from the hot spare
-        * list of every other pool.
-        */
-       if (unspare) {
-               spa_t *myspa = spa;
-               spa = NULL;
-               mutex_enter(&spa_namespace_lock);
-               while ((spa = spa_next(spa)) != NULL) {
-                       if (spa->spa_state != POOL_STATE_ACTIVE)
-                               continue;
-                       if (spa == myspa)
-                               continue;
-                       spa_open_ref(spa, FTAG);
-                       mutex_exit(&spa_namespace_lock);
-                       (void) spa_vdev_remove(spa, unspare_guid, B_TRUE);
-                       mutex_enter(&spa_namespace_lock);
-                       spa_close(spa, FTAG);
+       /* finally, update the original pool's config */
+       txg = spa_vdev_config_enter(spa);
+       tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
+       error = dmu_tx_assign(tx, TXG_WAIT);
+       if (error != 0)
+               dmu_tx_abort(tx);
+       for (c = 0; c < children; c++) {
+               if (vml[c] != NULL) {
+                       vdev_split(vml[c]);
+                       if (error == 0)
+                               spa_history_log_internal(LOG_POOL_VDEV_DETACH,
+                                   spa, tx, "vdev=%s",
+                                   vml[c]->vdev_path);
+                       vdev_free(vml[c]);
                }
-               mutex_exit(&spa_namespace_lock);
        }
+       vdev_config_dirty(spa->spa_root_vdev);
+       spa->spa_config_splitting = NULL;
+       nvlist_free(nvl);
+       if (error == 0)
+               dmu_tx_commit(tx);
+       (void) spa_vdev_exit(spa, NULL, txg, 0);
+       if (zio_injection_enabled)
+               zio_handle_panic_injection(spa, FTAG, 3);
+       /* split is complete; log a history record */
+       spa_history_log_internal(LOG_POOL_SPLIT, newspa, NULL,
+           "split new pool %s from pool %s", newname, spa_name(spa));
+       kmem_free(vml, children * sizeof (vdev_t *));
+       /* if we're not going to mount the filesystems in userland, export */
+       if (exp)
+               error = spa_export_common(newname, POOL_STATE_EXPORTED, NULL,
+                   B_FALSE, B_FALSE);
+       return (error);
+ out:
+       spa_unload(newspa);
+       spa_deactivate(newspa);
+       spa_remove(newspa);
+       txg = spa_vdev_config_enter(spa);
+       /* re-online all offlined disks */
+       for (c = 0; c < children; c++) {
+               if (vml[c] != NULL)
+                       vml[c]->vdev_offline = B_FALSE;
+       }
+       vdev_reopen(spa->spa_root_vdev);
+       nvlist_free(spa->spa_config_splitting);
+       spa->spa_config_splitting = NULL;
+       (void) spa_vdev_exit(spa, NULL, txg, error);
  
+       kmem_free(vml, children * sizeof (vdev_t *));
        return (error);
  }
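
In the split validation above, `lastlog` remembers where a trailing run of log/hole top-level vdevs begins, so the caller-supplied child list is checked against only the splittable data vdevs. A self-contained sketch of that counting rule, with an illustrative `is_log_or_hole` array standing in for the vdev tree:

#include <stdio.h>

/*
 * Return the number of children the split config must name: all
 * top-level vdevs except a trailing run of log/hole vdevs, mirroring
 * the lastlog bookkeeping in spa_vdev_split_mirror() above.
 */
static unsigned int
splittable_children(const int *is_log_or_hole, unsigned int nchildren)
{
	unsigned int c, lastlog = 0;

	for (c = 0; c < nchildren; c++) {
		if (is_log_or_hole[c]) {
			if (lastlog == 0)
				lastlog = c;	/* possible start of a trailing run */
			continue;
		}
		lastlog = 0;			/* run was not trailing after all */
	}
	return (lastlog != 0 ? lastlog : nchildren);
}

int
main(void)
{
	int tree1[] = { 0, 0, 1, 1 };	/* two data vdevs, then two logs */
	int tree2[] = { 0, 1, 0 };	/* a log in the middle still counts */

	(void) printf("%u\n", splittable_children(tree1, 4));	/* prints 2 */
	(void) printf("%u\n", splittable_children(tree2, 3));	/* prints 3 */
	return (0);
}
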
  
@@@ -3685,12 -4755,21 +4777,23 @@@ spa_scan(spa_t *spa, pool_scan_func_t f
  static void
  spa_async_remove(spa_t *spa, vdev_t *vd)
  {
 +      int c;
 +
        if (vd->vdev_remove_wanted) {
-               vd->vdev_remove_wanted = 0;
+               vd->vdev_remove_wanted = B_FALSE;
+               vd->vdev_delayed_close = B_FALSE;
                vdev_set_state(vd, B_FALSE, VDEV_STATE_REMOVED, VDEV_AUX_NONE);
-               vdev_clear(spa, vd);
+               /*
+                * We want to clear the stats, but we don't want to do a full
+                * vdev_clear() as that will cause us to throw away
+                * degraded/faulted state as well as attempt to reopen the
+                * device, all of which is a waste.
+                */
+               vd->vdev_stat.vs_read_errors = 0;
+               vd->vdev_stat.vs_write_errors = 0;
+               vd->vdev_stat.vs_checksum_errors = 0;
                vdev_state_dirty(vd->vdev_top);
        }
  
  static void
  spa_async_probe(spa_t *spa, vdev_t *vd)
  {
 +      int c;
 +
        if (vd->vdev_probe_wanted) {
-               vd->vdev_probe_wanted = 0;
+               vd->vdev_probe_wanted = B_FALSE;
                vdev_reopen(vd);        /* vdev_open() does the actual probe */
        }
  
@@@ -3785,11 -4860,11 +4887,11 @@@ spa_async_thread(spa_t *spa
         * See if any devices need to be marked REMOVED.
         */
        if (tasks & SPA_ASYNC_REMOVE) {
-               spa_vdev_state_enter(spa);
+               spa_vdev_state_enter(spa, SCL_NONE);
                spa_async_remove(spa, spa->spa_root_vdev);
 -              for (int i = 0; i < spa->spa_l2cache.sav_count; i++)
 +              for (i = 0; i < spa->spa_l2cache.sav_count; i++)
                        spa_async_remove(spa, spa->spa_l2cache.sav_vdevs[i]);
 -              for (int i = 0; i < spa->spa_spares.sav_count; i++)
 +              for (i = 0; i < spa->spa_spares.sav_count; i++)
                        spa_async_remove(spa, spa->spa_spares.sav_vdevs[i]);
                (void) spa_vdev_state_exit(spa, NULL, 0);
        }
@@@ -4146,9 -5247,7 +5274,8 @@@ spa_sync(spa_t *spa, uint64_t txg
        vdev_t *rvd = spa->spa_root_vdev;
        vdev_t *vd;
        dmu_tx_t *tx;
-       int dirty_vdevs;
        int error;
 +      int c;
  
        /*
         * Lock out configuration changes.
index 88ae172b45304256ff3fa104a430c890871d0ff8,52af7fcb71210c7ec4e990ce7409e02e0eb4968c..20946c4e7580457a3d8cead97691974f9e9f9e34
@@@ -433,6 -424,6 +433,7 @@@ spa_add(const char *name, nvlist_t *con
  {
        spa_t *spa;
        spa_config_dirent_t *dp;
++      int t;
  
        ASSERT(MUTEX_HELD(&spa_namespace_lock));
  
        cv_init(&spa->spa_scrub_io_cv, NULL, CV_DEFAULT, NULL);
        cv_init(&spa->spa_suspend_cv, NULL, CV_DEFAULT, NULL);
  
 -      for (int t = 0; t < TXG_SIZE; t++)
++      for (t = 0; t < TXG_SIZE; t++)
+               bplist_create(&spa->spa_free_bplist[t]);
        (void) strlcpy(spa->spa_name, name, sizeof (spa->spa_name));
        spa->spa_state = POOL_STATE_UNINITIALIZED;
        spa->spa_freeze_txg = UINT64_MAX;
@@@ -492,6 -493,6 +503,7 @@@ voi
  spa_remove(spa_t *spa)
  {
        spa_config_dirent_t *dp;
++      int t;
  
        ASSERT(MUTEX_HELD(&spa_namespace_lock));
        ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED);
  
        spa_config_lock_destroy(spa);
  
 -      for (int t = 0; t < TXG_SIZE; t++)
++      for (t = 0; t < TXG_SIZE; t++)
+               bplist_destroy(&spa->spa_free_bplist[t]);
        cv_destroy(&spa->spa_async_cv);
+       cv_destroy(&spa->spa_proc_cv);
        cv_destroy(&spa->spa_scrub_io_cv);
        cv_destroy(&spa->spa_suspend_cv);
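
The `int t;` hoisted into spa_add() and spa_remove() here, like the `int c`, `int d`, and `int p` declarations added throughout this merge, is the gcc-c90 conversion itself: C90 does not allow a declaration inside the `for` statement, so the counter has to be declared at the top of its block. A minimal illustration (not ZFS code):

/*
 * C99 allows
 *
 *	for (int t = 0; t < n; t++)
 *		...
 *
 * but gcc in C90 mode rejects the in-statement declaration, so this
 * merge hoists every such counter to the top of its block:
 */
static int
sum_to(int n)
{
	int t;			/* declared before any statement (C90) */
	int sum = 0;

	for (t = 0; t < n; t++)
		sum += t;
	return (sum);
}

int
main(void)
{
	return (sum_to(4) == 6 ? 0 : 1);	/* 0 + 1 + 2 + 3 */
}
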
  
@@@ -1302,24 -1419,52 +1430,54 @@@ spa_max_replication(spa_t *spa
        return (MIN(SPA_DVAS_PER_BP, spa_max_replication_override));
  }
  
+ int
+ spa_prev_software_version(spa_t *spa)
+ {
+       return (spa->spa_prev_software_version);
+ }
  uint64_t
- bp_get_dasize(spa_t *spa, const blkptr_t *bp)
+ dva_get_dsize_sync(spa_t *spa, const dva_t *dva)
  {
-       int sz = 0, i;
+       uint64_t asize = DVA_GET_ASIZE(dva);
+       uint64_t dsize = asize;
  
-       if (!spa->spa_deflate)
-               return (BP_GET_ASIZE(bp));
+       ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0);
  
-       spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
-       for (i = 0; i < SPA_DVAS_PER_BP; i++) {
-               vdev_t *vd =
-                   vdev_lookup_top(spa, DVA_GET_VDEV(&bp->blk_dva[i]));
-               if (vd)
-                       sz += (DVA_GET_ASIZE(&bp->blk_dva[i]) >>
-                           SPA_MINBLOCKSHIFT) * vd->vdev_deflate_ratio;
+       if (asize != 0 && spa->spa_deflate) {
+               vdev_t *vd = vdev_lookup_top(spa, DVA_GET_VDEV(dva));
+               dsize = (asize >> SPA_MINBLOCKSHIFT) * vd->vdev_deflate_ratio;
        }
 -      for (int d = 0; d < SPA_DVAS_PER_BP; d++)
+       return (dsize);
+ }
+ uint64_t
+ bp_get_dsize_sync(spa_t *spa, const blkptr_t *bp)
+ {
+       uint64_t dsize = 0;
++      int d;
 -      for (int d = 0; d < SPA_DVAS_PER_BP; d++)
++      for (d = 0; d < SPA_DVAS_PER_BP; d++)
+               dsize += dva_get_dsize_sync(spa, &bp->blk_dva[d]);
+       return (dsize);
+ }
+ uint64_t
+ bp_get_dsize(spa_t *spa, const blkptr_t *bp)
+ {
+       uint64_t dsize = 0;
++      int d;
+       spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
++      for (d = 0; d < SPA_DVAS_PER_BP; d++)
+               dsize += dva_get_dsize_sync(spa, &bp->blk_dva[d]);
        spa_config_exit(spa, SCL_VDEV, FTAG);
-       return (sz);
+       return (dsize);
  }
  
  /*
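
dva_get_dsize_sync() above replaces bp_get_dasize(): for each DVA it converts the allocated size to 512-byte units (SPA_MINBLOCKSHIFT is 9) and scales by the top-level vdev's deflate ratio, and the bp_get_dsize*() wrappers sum that over the block pointer's DVAs. A toy arithmetic sketch of the same computation, with made-up sizes and ratios:

#include <stdio.h>

#define	MINBLOCKSHIFT	9	/* 512-byte units, as with SPA_MINBLOCKSHIFT */

/*
 * Toy model of the per-DVA computation above: express each allocated
 * size in 512-byte units, scale by that vdev's deflate ratio, and sum
 * the results over the block pointer's DVAs.  All numbers are made up
 * for illustration.
 */
int
main(void)
{
	unsigned long asize[2] = { 131072UL, 131072UL };  /* two 128K allocations */
	unsigned long ratio[2] = { 512UL, 384UL };        /* per-vdev deflate ratios */
	unsigned long dsize = 0;
	int d;

	for (d = 0; d < 2; d++)
		dsize += (asize[d] >> MINBLOCKSHIFT) * ratio[d];

	(void) printf("%lu\n", dsize);	/* 256*512 + 256*384 = 229376 */
	return (0);
}
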
index cb4a3e252453e400cbe57241684733abdc0d71cb,a61f29b8e78a6deb58690a813aeed9ccee715ad1..e4c1a7707fb7f8ac3f2e8a602f5cec6b542079a1
@@@ -1016,13 -1058,35 +1067,38 @@@ vdev_open_child(void *arg
        vd->vdev_open_thread = NULL;
  }
  
 -      for (int c = 0; c < vd->vdev_children; c++)
+ boolean_t
+ vdev_uses_zvols(vdev_t *vd)
+ {
++      int c;
++
+       if (vd->vdev_path && strncmp(vd->vdev_path, ZVOL_DIR,
+           strlen(ZVOL_DIR)) == 0)
+               return (B_TRUE);
++      for (c = 0; c < vd->vdev_children; c++)
+               if (vdev_uses_zvols(vd->vdev_child[c]))
+                       return (B_TRUE);
+       return (B_FALSE);
+ }
  void
  vdev_open_children(vdev_t *vd)
  {
        taskq_t *tq;
        int children = vd->vdev_children;
 +      int c;
  
 -              for (int c = 0; c < children; c++)
+       /*
+        * in order to handle pools on top of zvols, do the opens
+        * in a single thread so that the same thread holds the
+        * spa_namespace_lock
+        */
+       if (vdev_uses_zvols(vd)) {
++              for (c = 0; c < children; c++)
+                       vd->vdev_child[c]->vdev_open_error =
+                           vdev_open(vd->vdev_child[c]);
+               return;
+       }
        tq = taskq_create("vdev_open", children, minclsyspri,
            children, children, TASKQ_PREPOPULATE);
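
vdev_uses_zvols() above is a plain recursive predicate: a vdev counts as zvol-backed when its own path sits under ZVOL_DIR or when any child does, and vdev_open_children() then opens the children serially so the same thread keeps holding the spa_namespace_lock. A self-contained sketch of that recursion over a toy node type (the struct, the helper name, and the "/dev/zvol" prefix are illustrative, not the ZFS definitions):

#include <stddef.h>
#include <string.h>

/* Toy stand-in for a vdev: a path plus an array of children. */
struct node {
	const char	*path;
	struct node	**child;
	int		nchildren;
};

/*
 * Same shape as vdev_uses_zvols() above: true for a node whose own path
 * starts with the given prefix, or for any node with a matching
 * descendant.
 */
static int
subtree_matches(const struct node *n, const char *prefix)
{
	int c;

	if (n->path != NULL && strncmp(n->path, prefix, strlen(prefix)) == 0)
		return (1);
	for (c = 0; c < n->nchildren; c++)
		if (subtree_matches(n->child[c], prefix))
			return (1);
	return (0);
}

static struct node leaf = { "/dev/zvol/rdsk/tank/vol0", NULL, 0 };
static struct node *kids[] = { &leaf };
static struct node root = { NULL, kids, 1 };

int
main(void)
{
	return (!subtree_matches(&root, "/dev/zvol"));	/* exits 0: match found */
}
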
  
@@@ -1090,10 -1177,16 +1190,16 @@@ vdev_open(vdev_t *vd
                vdev_set_state(vd, B_TRUE, VDEV_STATE_DEGRADED,
                    VDEV_AUX_ERR_EXCEEDED);
        } else {
-               vd->vdev_state = VDEV_STATE_HEALTHY;
+               vdev_set_state(vd, B_TRUE, VDEV_STATE_HEALTHY, 0);
        }
  
 -      for (int c = 0; c < vd->vdev_children; c++) {
+       /*
+        * For hole or missing vdevs we just return success.
+        */
+       if (vd->vdev_ishole || vd->vdev_ops == &vdev_missing_ops)
+               return (0);
 +      for (c = 0; c < vd->vdev_children; c++) {
                if (vd->vdev_child[c]->vdev_state != VDEV_STATE_HEALTHY) {
                        vdev_set_state(vd, B_TRUE, VDEV_STATE_DEGRADED,
                            VDEV_AUX_NONE);
@@@ -1200,11 -1293,10 +1306,11 @@@ vdev_validate(vdev_t *vd
  {
        spa_t *spa = vd->vdev_spa;
        nvlist_t *label;
-       uint64_t guid, top_guid;
+       uint64_t guid = 0, top_guid;
        uint64_t state;
 +      int c;
  
 -      for (int c = 0; c < vd->vdev_children; c++)
 +      for (c = 0; c < vd->vdev_children; c++)
                if (vdev_validate(vd->vdev_child[c]) != 0)
                        return (EBADF);
  
@@@ -1308,6 -1431,41 +1445,43 @@@ vdev_close(vdev_t *vd
        vd->vdev_stat.vs_aux = VDEV_AUX_NONE;
  }
  
 -      for (int c = 0; c < vd->vdev_children; c++)
+ void
+ vdev_hold(vdev_t *vd)
+ {
+       spa_t *spa = vd->vdev_spa;
++      int c;
+       ASSERT(spa_is_root(spa));
+       if (spa->spa_state == POOL_STATE_UNINITIALIZED)
+               return;
 -      for (int c = 0; c < vd->vdev_children; c++)
++      for (c = 0; c < vd->vdev_children; c++)
+               vdev_hold(vd->vdev_child[c]);
+       if (vd->vdev_ops->vdev_op_leaf)
+               vd->vdev_ops->vdev_op_hold(vd);
+ }
+ void
+ vdev_rele(vdev_t *vd)
+ {
+       spa_t *spa = vd->vdev_spa;
++      int c;
+       ASSERT(spa_is_root(spa));
++      for (c = 0; c < vd->vdev_children; c++)
+               vdev_rele(vd->vdev_child[c]);
+       if (vd->vdev_ops->vdev_op_leaf)
+               vd->vdev_ops->vdev_op_rele(vd);
+ }
+ /*
+  * Reopen all interior vdevs and any unopened leaves.  We don't actually
+  * reopen leaf vdevs which had previously been opened as they might deadlock
+  * on the spa_config_lock.  Instead we only obtain the leaf's physical size.
+  * If the leaf has never been opened then open it, as usual.
+  */
  void
  vdev_reopen(vdev_t *vd)
  {
@@@ -1545,7 -1708,9 +1724,9 @@@ vdev_dtl_reassess(vdev_t *vd, uint64_t 
        }
  
        mutex_enter(&vd->vdev_dtl_lock);
 -      for (int t = 0; t < DTL_TYPES; t++) {
 +      for (t = 0; t < DTL_TYPES; t++) {
+               /* account for child's outage in parent's missing map */
+               int s = (t == DTL_MISSING) ? DTL_OUTAGE: t;
                if (t == DTL_SCRUB)
                        continue;                       /* leaf vdevs only */
                if (t == DTL_PARTIAL)
                else
                        minref = vd->vdev_children;     /* any kind of mirror */
                space_map_ref_create(&reftree);
 -              for (int c = 0; c < vd->vdev_children; c++) {
 +              for (c = 0; c < vd->vdev_children; c++) {
                        vdev_t *cvd = vd->vdev_child[c];
                        mutex_enter(&cvd->vdev_dtl_lock);
-                       space_map_ref_add_map(&reftree, &cvd->vdev_dtl[t], 1);
+                       space_map_ref_add_map(&reftree, &cvd->vdev_dtl[s], 1);
                        mutex_exit(&cvd->vdev_dtl_lock);
                }
                space_map_ref_generate_map(&reftree, &vd->vdev_dtl[t], minref);
@@@ -1803,6 -1969,42 +1988,43 @@@ vdev_validate_aux(vdev_t *vd
        return (0);
  }
  
 -              for (int m = 0; m < vd->vdev_ms_count; m++) {
+ void
+ vdev_remove(vdev_t *vd, uint64_t txg)
+ {
+       spa_t *spa = vd->vdev_spa;
+       objset_t *mos = spa->spa_meta_objset;
+       dmu_tx_t *tx;
++      int m;
+       tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg);
+       if (vd->vdev_dtl_smo.smo_object) {
+               ASSERT3U(vd->vdev_dtl_smo.smo_alloc, ==, 0);
+               (void) dmu_object_free(mos, vd->vdev_dtl_smo.smo_object, tx);
+               vd->vdev_dtl_smo.smo_object = 0;
+       }
+       if (vd->vdev_ms != NULL) {
++              for (m = 0; m < vd->vdev_ms_count; m++) {
+                       metaslab_t *msp = vd->vdev_ms[m];
+                       if (msp == NULL || msp->ms_smo.smo_object == 0)
+                               continue;
+                       ASSERT3U(msp->ms_smo.smo_alloc, ==, 0);
+                       (void) dmu_object_free(mos, msp->ms_smo.smo_object, tx);
+                       msp->ms_smo.smo_object = 0;
+               }
+       }
+       if (vd->vdev_ms_array) {
+               (void) dmu_object_free(mos, vd->vdev_ms_array, tx);
+               vd->vdev_ms_array = 0;
+               vd->vdev_ms_shift = 0;
+       }
+       dmu_tx_commit(tx);
+ }
  void
  vdev_sync_done(vdev_t *vd, uint64_t txg)
  {
@@@ -2201,6 -2484,19 +2506,20 @@@ vdev_clear_stats(vdev_t *vd
        mutex_exit(&vd->vdev_stat_lock);
  }
  
 -      for (int c = 0; c < vd->vdev_children; c++)
+ void
+ vdev_scan_stat_init(vdev_t *vd)
+ {
+       vdev_stat_t *vs = &vd->vdev_stat;
++      int c;
++      for (c = 0; c < vd->vdev_children; c++)
+               vdev_scan_stat_init(vd->vdev_child[c]);
+       mutex_enter(&vd->vdev_stat_lock);
+       vs->vs_scan_processed = 0;
+       mutex_exit(&vd->vdev_stat_lock);
+ }
  void
  vdev_stat_update(zio_t *zio, uint64_t psize)
  {
@@@ -2536,12 -2827,17 +2850,18 @@@ vdev_propagate_state(vdev_t *vd
        int degraded = 0, faulted = 0;
        int corrupted = 0;
        vdev_t *child;
 +      int c;
  
        if (vd->vdev_children > 0) {
 -              for (int c = 0; c < vd->vdev_children; c++) {
 +              for (c = 0; c < vd->vdev_children; c++) {
                        child = vd->vdev_child[c];
  
+                       /*
+                        * Don't factor holes into the decision.
+                        */
+                       if (child->vdev_ishole)
+                               continue;
                        if (!vdev_readable(child) ||
                            (!vdev_writeable(child) && spa_writeable(spa))) {
                                /*
@@@ -2737,24 -3039,24 +3065,25 @@@ vdev_is_bootable(vdev_t *vd
        return (B_TRUE);
  }
  
+ /*
+  * Load the state from the original vdev tree (ovd) which
+  * we've retrieved from the MOS config object. If the original
+  * vdev was offline then we transfer that state to the device
+  * in the current vdev tree (nvd).
+  */
  void
- vdev_load_log_state(vdev_t *vd, nvlist_t *nv)
+ vdev_load_log_state(vdev_t *nvd, vdev_t *ovd)
  {
-       uint_t children;
-       nvlist_t **child;
-       uint64_t val;
-       spa_t *spa = vd->vdev_spa;
+       spa_t *spa = nvd->vdev_spa;
 +      int c;
  
-       if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
-           &child, &children) == 0) {
-               for (c = 0; c < children; c++)
-                       vdev_load_log_state(vd->vdev_child[c], child[c]);
-       }
+       ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);
+       ASSERT3U(nvd->vdev_guid, ==, ovd->vdev_guid);
  
-       if (vd->vdev_ops->vdev_op_leaf && nvlist_lookup_uint64(nv,
-           ZPOOL_CONFIG_OFFLINE, &val) == 0 && val) {
 -      for (int c = 0; c < nvd->vdev_children; c++)
++      for (c = 0; c < nvd->vdev_children; c++)
+               vdev_load_log_state(nvd->vdev_child[c], ovd->vdev_child[c]);
  
+       if (nvd->vdev_ops->vdev_op_leaf && ovd->vdev_offline) {
                /*
                 * It would be nice to call vdev_offline()
                 * directly but the pool isn't fully loaded and
Simple merge
Simple merge
Simple merge
Simple merge
Simple merge
index 6efce7705ef2e116f9c379f12e2df6ff30298302,88d80af4e99ff2ccd8a3e812234933a8be16bd4b..520639e063e40b4c1593eaf1b06e3776d7b50bf1
@@@ -904,10 -991,10 +992,10 @@@ zio_write_bp_init(zio_t *zio
         * spa_sync() to allocate new blocks, but force rewrites after that.
         * There should only be a handful of blocks after pass 1 in any case.
         */
-       if (bp->blk_birth == zio->io_txg && BP_GET_PSIZE(bp) == csize &&
+       if (bp->blk_birth == zio->io_txg && BP_GET_PSIZE(bp) == psize &&
            pass > SYNC_PASS_REWRITE) {
-               uint32_t gang_stages = zio->io_pipeline & ZIO_GANG_STAGES;
-               ASSERT(csize != 0);
 -              ASSERT(psize != 0);
+               enum zio_stage gang_stages = zio->io_pipeline & ZIO_GANG_STAGES;
++              ASSERT(psize != 0);
                zio->io_pipeline = ZIO_REWRITE_PIPELINE | gang_stages;
                zio->io_flags |= ZIO_FLAG_IO_REWRITE;
        } else {
@@@ -1113,24 -1234,13 +1237,13 @@@ zio_reexecute(zio_t *pio
        pio->io_pipeline = pio->io_orig_pipeline;
        pio->io_reexecute = 0;
        pio->io_error = 0;
 -      for (int w = 0; w < ZIO_WAIT_TYPES; w++)
 +      for (w = 0; w < ZIO_WAIT_TYPES; w++)
                pio->io_state[w] = 0;
 -      for (int c = 0; c < ZIO_CHILD_TYPES; c++)
 +      for (c = 0; c < ZIO_CHILD_TYPES; c++)
                pio->io_child_error[c] = 0;
  
-       if (IO_IS_ALLOCATING(pio)) {
-               /*
-                * Remember the failed bp so that the io_ready() callback
-                * can update its accounting upon reexecution.  The block
-                * was already freed in zio_done(); we indicate this with
-                * a fill count of -1 so that zio_free() knows to skip it.
-                */
-               blkptr_t *bp = pio->io_bp;
-               ASSERT(bp->blk_birth == 0 || bp->blk_birth == pio->io_txg);
-               bp->blk_fill = BLK_FILL_ALREADY_FREED;
-               pio->io_bp_orig = *bp;
-               BP_ZERO(bp);
-       }
+       if (IO_IS_ALLOCATING(pio))
+               BP_ZERO(pio->io_bp);
  
        /*
         * As we reexecute pio's children, new children could be created.
@@@ -1416,10 -1530,9 +1535,10 @@@ zio_gang_tree_assemble_done(zio_t *zio
        zio_t *gio = zio->io_gang_leader;
        zio_gang_node_t *gn = zio->io_private;
        blkptr_t *bp = zio->io_bp;
 +      int g;
  
        ASSERT(gio == zio_unique_parent(zio));
-       ASSERT(zio_walk_children(zio) == NULL);
+       ASSERT(zio->io_child_count == 0);
  
        if (zio->io_error)
                return;
  
        ASSERT(zio->io_data == gn->gn_gbh);
        ASSERT(zio->io_size == SPA_GANGBLOCKSIZE);
-       ASSERT(gn->gn_gbh->zg_tail.zbt_magic == ZBT_MAGIC);
+       ASSERT(gn->gn_gbh->zg_tail.zec_magic == ZEC_MAGIC);
  
 -      for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) {
 +      for (g = 0; g < SPA_GBH_NBLKPTRS; g++) {
                blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g];
                if (!BP_IS_GANG(gbp))
                        continue;
@@@ -1457,9 -1569,9 +1576,9 @@@ zio_gang_tree_issue(zio_t *pio, zio_gan
        zio = zio_gang_issue_func[gio->io_type](pio, bp, gn, data);
  
        if (gn != NULL) {
-               ASSERT(gn->gn_gbh->zg_tail.zbt_magic == ZBT_MAGIC);
+               ASSERT(gn->gn_gbh->zg_tail.zec_magic == ZEC_MAGIC);
  
 -              for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) {
 +              for (g = 0; g < SPA_GBH_NBLKPTRS; g++) {
                        blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g];
                        if (BP_IS_HOLE(gbp))
                                continue;
@@@ -1554,13 -1665,13 +1673,13 @@@ zio_write_gang_block(zio_t *pio
        uint64_t txg = pio->io_txg;
        uint64_t resid = pio->io_size;
        uint64_t lsize;
-       int ndvas = gio->io_prop.zp_ndvas;
-       int gbh_ndvas = MIN(ndvas + 1, spa_max_replication(spa));
+       int copies = gio->io_prop.zp_copies;
+       int gbh_copies = MIN(copies + 1, spa_max_replication(spa));
        zio_prop_t zp;
 -      int error;
 +      int g, error;
  
-       error = metaslab_alloc(spa, spa->spa_normal_class, SPA_GANGBLOCKSIZE,
-           bp, gbh_ndvas, txg, pio == gio ? NULL : gio->io_bp,
+       error = metaslab_alloc(spa, spa_normal_class(spa), SPA_GANGBLOCKSIZE,
+           bp, gbh_copies, txg, pio == gio ? NULL : gio->io_bp,
            METASLAB_HINTBP_FAVOR | METASLAB_GANG_HEADER);
        if (error) {
                pio->io_error = error;
  
  /*
   * ==========================================================================
-  * Allocate and free blocks
+  * Dedup
   * ==========================================================================
   */
 -              for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
+ static void
+ zio_ddt_child_read_done(zio_t *zio)
+ {
+       blkptr_t *bp = zio->io_bp;
+       ddt_entry_t *dde = zio->io_private;
+       ddt_phys_t *ddp;
+       zio_t *pio = zio_unique_parent(zio);
+       mutex_enter(&pio->io_lock);
+       ddp = ddt_phys_select(dde, bp);
+       if (zio->io_error == 0)
+               ddt_phys_clear(ddp);    /* this ddp doesn't need repair */
+       if (zio->io_error == 0 && dde->dde_repair_data == NULL)
+               dde->dde_repair_data = zio->io_data;
+       else
+               zio_buf_free(zio->io_data, zio->io_size);
+       mutex_exit(&pio->io_lock);
+ }
+ static int
+ zio_ddt_read_start(zio_t *zio)
+ {
+       blkptr_t *bp = zio->io_bp;
++      int p;
+       ASSERT(BP_GET_DEDUP(bp));
+       ASSERT(BP_GET_PSIZE(bp) == zio->io_size);
+       ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
+       if (zio->io_child_error[ZIO_CHILD_DDT]) {
+               ddt_t *ddt = ddt_select(zio->io_spa, bp);
+               ddt_entry_t *dde = ddt_repair_start(ddt, bp);
+               ddt_phys_t *ddp = dde->dde_phys;
+               ddt_phys_t *ddp_self = ddt_phys_select(dde, bp);
+               blkptr_t blk;
+               ASSERT(zio->io_vsd == NULL);
+               zio->io_vsd = dde;
+               if (ddp_self == NULL)
+                       return (ZIO_PIPELINE_CONTINUE);
++              for (p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
+                       if (ddp->ddp_phys_birth == 0 || ddp == ddp_self)
+                               continue;
+                       ddt_bp_create(ddt->ddt_checksum, &dde->dde_key, ddp,
+                           &blk);
+                       zio_nowait(zio_read(zio, zio->io_spa, &blk,
+                           zio_buf_alloc(zio->io_size), zio->io_size,
+                           zio_ddt_child_read_done, dde, zio->io_priority,
+                           ZIO_DDT_CHILD_FLAGS(zio) | ZIO_FLAG_DONT_PROPAGATE,
+                           &zio->io_bookmark));
+               }
+               return (ZIO_PIPELINE_CONTINUE);
+       }
+       zio_nowait(zio_read(zio, zio->io_spa, bp,
+           zio->io_data, zio->io_size, NULL, NULL, zio->io_priority,
+           ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark));
+       return (ZIO_PIPELINE_CONTINUE);
+ }
  
 -      for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) {
+ static int
+ zio_ddt_read_done(zio_t *zio)
+ {
+       blkptr_t *bp = zio->io_bp;
+       if (zio_wait_for_children(zio, ZIO_CHILD_DDT, ZIO_WAIT_DONE))
+               return (ZIO_PIPELINE_STOP);
+       ASSERT(BP_GET_DEDUP(bp));
+       ASSERT(BP_GET_PSIZE(bp) == zio->io_size);
+       ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
+       if (zio->io_child_error[ZIO_CHILD_DDT]) {
+               ddt_t *ddt = ddt_select(zio->io_spa, bp);
+               ddt_entry_t *dde = zio->io_vsd;
+               if (ddt == NULL) {
+                       ASSERT(spa_load_state(zio->io_spa) != SPA_LOAD_NONE);
+                       return (ZIO_PIPELINE_CONTINUE);
+               }
+               if (dde == NULL) {
+                       zio->io_stage = ZIO_STAGE_DDT_READ_START >> 1;
+                       zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_FALSE);
+                       return (ZIO_PIPELINE_STOP);
+               }
+               if (dde->dde_repair_data != NULL) {
+                       bcopy(dde->dde_repair_data, zio->io_data, zio->io_size);
+                       zio->io_child_error[ZIO_CHILD_DDT] = 0;
+               }
+               ddt_repair_done(ddt, dde);
+               zio->io_vsd = NULL;
+       }
+       ASSERT(zio->io_vsd == NULL);
+       return (ZIO_PIPELINE_CONTINUE);
+ }
+ static boolean_t
+ zio_ddt_collision(zio_t *zio, ddt_t *ddt, ddt_entry_t *dde)
+ {
+       spa_t *spa = zio->io_spa;
++      int p;
+       /*
+        * Note: we compare the original data, not the transformed data,
+        * because when zio->io_bp is an override bp, we will not have
+        * pushed the I/O transforms.  That's an important optimization
+        * because otherwise we'd compress/encrypt all dmu_sync() data twice.
+        */
 -      for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) {
++      for (p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) {
+               zio_t *lio = dde->dde_lead_zio[p];
+               if (lio != NULL) {
+                       return (lio->io_orig_size != zio->io_orig_size ||
+                           bcmp(zio->io_orig_data, lio->io_orig_data,
+                           zio->io_orig_size) != 0);
+               }
+       }
++      for (p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) {
+               ddt_phys_t *ddp = &dde->dde_phys[p];
+               if (ddp->ddp_phys_birth != 0) {
+                       arc_buf_t *abuf = NULL;
+                       uint32_t aflags = ARC_WAIT;
+                       blkptr_t blk = *zio->io_bp;
+                       int error;
+                       ddt_bp_fill(ddp, &blk, ddp->ddp_phys_birth);
+                       ddt_exit(ddt);
+                       error = arc_read_nolock(NULL, spa, &blk,
+                           arc_getbuf_func, &abuf, ZIO_PRIORITY_SYNC_READ,
+                           ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
+                           &aflags, &zio->io_bookmark);
+                       if (error == 0) {
+                               if (arc_buf_size(abuf) != zio->io_orig_size ||
+                                   bcmp(abuf->b_data, zio->io_orig_data,
+                                   zio->io_orig_size) != 0)
+                                       error = EEXIST;
+                               VERIFY(arc_buf_remove_ref(abuf, &abuf) == 1);
+                       }
+                       ddt_enter(ddt);
+                       return (error != 0);
+               }
+       }
+       return (B_FALSE);
+ }
+ static void
+ zio_ddt_child_write_ready(zio_t *zio)
+ {
+       int p = zio->io_prop.zp_copies;
+       ddt_t *ddt = ddt_select(zio->io_spa, zio->io_bp);
+       ddt_entry_t *dde = zio->io_private;
+       ddt_phys_t *ddp = &dde->dde_phys[p];
+       zio_t *pio;
+       if (zio->io_error)
+               return;
+       ddt_enter(ddt);
+       ASSERT(dde->dde_lead_zio[p] == zio);
+       ddt_phys_fill(ddp, zio->io_bp);
+       while ((pio = zio_walk_parents(zio)) != NULL)
+               ddt_bp_fill(ddp, pio->io_bp, zio->io_txg);
+       ddt_exit(ddt);
+ }
+ static void
+ zio_ddt_child_write_done(zio_t *zio)
+ {
+       int p = zio->io_prop.zp_copies;
+       ddt_t *ddt = ddt_select(zio->io_spa, zio->io_bp);
+       ddt_entry_t *dde = zio->io_private;
+       ddt_phys_t *ddp = &dde->dde_phys[p];
+       ddt_enter(ddt);
+       ASSERT(ddp->ddp_refcnt == 0);
+       ASSERT(dde->dde_lead_zio[p] == zio);
+       dde->dde_lead_zio[p] = NULL;
+       if (zio->io_error == 0) {
+               while (zio_walk_parents(zio) != NULL)
+                       ddt_phys_addref(ddp);
+       } else {
+               ddt_phys_clear(ddp);
+       }
+       ddt_exit(ddt);
+ }
+ static void
+ zio_ddt_ditto_write_done(zio_t *zio)
+ {
+       int p = DDT_PHYS_DITTO;
+       zio_prop_t *zp = &zio->io_prop;
+       blkptr_t *bp = zio->io_bp;
+       ddt_t *ddt = ddt_select(zio->io_spa, bp);
+       ddt_entry_t *dde = zio->io_private;
+       ddt_phys_t *ddp = &dde->dde_phys[p];
+       ddt_key_t *ddk = &dde->dde_key;
+       ddt_enter(ddt);
+       ASSERT(ddp->ddp_refcnt == 0);
+       ASSERT(dde->dde_lead_zio[p] == zio);
+       dde->dde_lead_zio[p] = NULL;
+       if (zio->io_error == 0) {
+               ASSERT(ZIO_CHECKSUM_EQUAL(bp->blk_cksum, ddk->ddk_cksum));
+               ASSERT(zp->zp_copies < SPA_DVAS_PER_BP);
+               ASSERT(zp->zp_copies == BP_GET_NDVAS(bp) - BP_IS_GANG(bp));
+               if (ddp->ddp_phys_birth != 0)
+                       ddt_phys_free(ddt, ddk, ddp, zio->io_txg);
+               ddt_phys_fill(ddp, bp);
+       }
+       ddt_exit(ddt);
+ }
+ static int
+ zio_ddt_write(zio_t *zio)
+ {
+       spa_t *spa = zio->io_spa;
+       blkptr_t *bp = zio->io_bp;
+       uint64_t txg = zio->io_txg;
+       zio_prop_t *zp = &zio->io_prop;
+       int p = zp->zp_copies;
+       int ditto_copies;
+       zio_t *cio = NULL;
+       zio_t *dio = NULL;
+       ddt_t *ddt = ddt_select(spa, bp);
+       ddt_entry_t *dde;
+       ddt_phys_t *ddp;
+       ASSERT(BP_GET_DEDUP(bp));
+       ASSERT(BP_GET_CHECKSUM(bp) == zp->zp_checksum);
+       ASSERT(BP_IS_HOLE(bp) || zio->io_bp_override);
+       ddt_enter(ddt);
+       dde = ddt_lookup(ddt, bp, B_TRUE);
+       ddp = &dde->dde_phys[p];
+       if (zp->zp_dedup_verify && zio_ddt_collision(zio, ddt, dde)) {
+               /*
+                * If we're using a weak checksum, upgrade to a strong checksum
+                * and try again.  If we're already using a strong checksum,
+                * we can't resolve it, so just convert to an ordinary write.
+                * (And automatically e-mail a paper to Nature?)
+                */
+               if (!zio_checksum_table[zp->zp_checksum].ci_dedup) {
+                       zp->zp_checksum = spa_dedup_checksum(spa);
+                       zio_pop_transforms(zio);
+                       zio->io_stage = ZIO_STAGE_OPEN;
+                       BP_ZERO(bp);
+               } else {
+                       zp->zp_dedup = 0;
+               }
+               zio->io_pipeline = ZIO_WRITE_PIPELINE;
+               ddt_exit(ddt);
+               return (ZIO_PIPELINE_CONTINUE);
+       }
+       ditto_copies = ddt_ditto_copies_needed(ddt, dde, ddp);
+       ASSERT(ditto_copies < SPA_DVAS_PER_BP);
+       if (ditto_copies > ddt_ditto_copies_present(dde) &&
+           dde->dde_lead_zio[DDT_PHYS_DITTO] == NULL) {
+               zio_prop_t czp = *zp;
+               czp.zp_copies = ditto_copies;
+               /*
+                * If we arrived here with an override bp, we won't have run
+                * the transform stack, so we won't have the data we need to
+                * generate a child i/o.  So, toss the override bp and restart.
+                * This is safe, because using the override bp is just an
+                * optimization; and it's rare, so the cost doesn't matter.
+                */
+               if (zio->io_bp_override) {
+                       zio_pop_transforms(zio);
+                       zio->io_stage = ZIO_STAGE_OPEN;
+                       zio->io_pipeline = ZIO_WRITE_PIPELINE;
+                       zio->io_bp_override = NULL;
+                       BP_ZERO(bp);
+                       ddt_exit(ddt);
+                       return (ZIO_PIPELINE_CONTINUE);
+               }
+               dio = zio_write(zio, spa, txg, bp, zio->io_orig_data,
+                   zio->io_orig_size, &czp, NULL,
+                   zio_ddt_ditto_write_done, dde, zio->io_priority,
+                   ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark);
+               zio_push_transform(dio, zio->io_data, zio->io_size, 0, NULL);
+               dde->dde_lead_zio[DDT_PHYS_DITTO] = dio;
+       }
+       if (ddp->ddp_phys_birth != 0 || dde->dde_lead_zio[p] != NULL) {
+               if (ddp->ddp_phys_birth != 0)
+                       ddt_bp_fill(ddp, bp, txg);
+               if (dde->dde_lead_zio[p] != NULL)
+                       zio_add_child(zio, dde->dde_lead_zio[p]);
+               else
+                       ddt_phys_addref(ddp);
+       } else if (zio->io_bp_override) {
+               ASSERT(bp->blk_birth == txg);
+               ASSERT(BP_EQUAL(bp, zio->io_bp_override));
+               ddt_phys_fill(ddp, bp);
+               ddt_phys_addref(ddp);
+       } else {
+               cio = zio_write(zio, spa, txg, bp, zio->io_orig_data,
+                   zio->io_orig_size, zp, zio_ddt_child_write_ready,
+                   zio_ddt_child_write_done, dde, zio->io_priority,
+                   ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark);
+               zio_push_transform(cio, zio->io_data, zio->io_size, 0, NULL);
+               dde->dde_lead_zio[p] = cio;
+       }
+       ddt_exit(ddt);
+       if (cio)
+               zio_nowait(cio);
+       if (dio)
+               zio_nowait(dio);
+       return (ZIO_PIPELINE_CONTINUE);
+ }
+ ddt_entry_t *freedde; /* for debugging */
+ static int
+ zio_ddt_free(zio_t *zio)
+ {
+       spa_t *spa = zio->io_spa;
+       blkptr_t *bp = zio->io_bp;
+       ddt_t *ddt = ddt_select(spa, bp);
+       ddt_entry_t *dde;
+       ddt_phys_t *ddp;
+       ASSERT(BP_GET_DEDUP(bp));
+       ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
+       ddt_enter(ddt);
+       freedde = dde = ddt_lookup(ddt, bp, B_TRUE);
+       ddp = ddt_phys_select(dde, bp);
+       ddt_phys_decref(ddp);
+       ddt_exit(ddt);
+       return (ZIO_PIPELINE_CONTINUE);
+ }
+ /*
+  * ==========================================================================
+  * Allocate and free blocks
+  * ==========================================================================
+  */
  static int
  zio_dva_allocate(zio_t *zio)
  {
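
When zp_dedup_verify is set, zio_ddt_write() above does not trust a checksum match by itself: zio_ddt_collision() compares the candidate block against the existing data (length check, then bcmp, re-reading from disk via arc_read_nolock() when necessary), and any mismatch forces the write back onto the ordinary non-dedup pipeline. A minimal sketch of that verify step, with memcmp standing in for bcmp:

#include <stdio.h>
#include <string.h>

/*
 * Toy version of the verify step: two blocks are considered duplicates
 * only when both the length and every byte agree; a checksum match
 * alone is not trusted.
 */
static int
is_collision(const void *a, size_t asize, const void *b, size_t bsize)
{
	return (asize != bsize || memcmp(a, b, asize) != 0);
}

int
main(void)
{
	const char stored[]    = "same payload";
	const char duplicate[] = "same payload";
	const char imposter[]  = "same checksum, other bytes";

	(void) printf("%d\n", is_collision(stored, sizeof (stored),
	    duplicate, sizeof (duplicate)));		/* prints 0 */
	(void) printf("%d\n", is_collision(stored, sizeof (stored),
	    imposter, sizeof (imposter)));		/* prints 1 */
	return (0);
}
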
@@@ -1680,40 -2161,14 +2171,14 @@@ zio_dva_claim(zio_t *zio
  static void
  zio_dva_unallocate(zio_t *zio, zio_gang_node_t *gn, blkptr_t *bp)
  {
-       spa_t *spa = zio->io_spa;
-       boolean_t now = !(zio->io_flags & ZIO_FLAG_IO_REWRITE);
-       int g;
        ASSERT(bp->blk_birth == zio->io_txg || BP_IS_HOLE(bp));
-       if (zio->io_bp == bp && !now) {
-               /*
-                * This is a rewrite for sync-to-convergence.
-                * We can't do a metaslab_free(NOW) because bp wasn't allocated
-                * during this sync pass, which means that metaslab_sync()
-                * already committed the allocation.
-                */
-               ASSERT(DVA_EQUAL(BP_IDENTITY(bp),
-                   BP_IDENTITY(&zio->io_bp_orig)));
-               ASSERT(spa_sync_pass(spa) > 1);
-               if (BP_IS_GANG(bp) && gn == NULL) {
-                       /*
-                        * This is a gang leader whose gang header(s) we
-                        * couldn't read now, so defer the free until later.
-                        * The block should still be intact because without
-                        * the headers, we'd never even start the rewrite.
-                        */
-                       bplist_enqueue_deferred(&spa->spa_sync_bplist, bp);
-                       return;
-               }
-       }
+       ASSERT(zio->io_bp_override == NULL);
  
        if (!BP_IS_HOLE(bp))
-               metaslab_free(spa, bp, bp->blk_birth, now);
+               metaslab_free(zio->io_spa, bp, bp->blk_birth, B_TRUE);
  
        if (gn != NULL) {
 -              for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) {
 +              for (g = 0; g < SPA_GBH_NBLKPTRS; g++) {
                        zio_dva_unallocate(zio, gn->gn_child[g],
                            &gn->gn_gbh->zg_blkptr[g]);
                }