From: Brian Behlendorf <behlendorf1@llnl.gov>
Date: Fri, 28 May 2010 21:19:22 +0000 (-0700)
Subject: Merge commit 'refs/top-bases/gcc-c90' into gcc-c90
X-Git-Tag: zfs-0.5.0~38^2^2~1^2^2~34^2~1^2^2~12^2
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=957b7b41d48c05e0b9fbdd28a98744ea2298f5a4;p=zfs

Merge commit 'refs/top-bases/gcc-c90' into gcc-c90

Conflicts:
	cmd/zdb/zdb.c
	cmd/ztest/ztest.c
	module/zfs/dbuf.c
	module/zfs/dsl_dataset.c
	module/zfs/dsl_scrub.c
	module/zfs/spa.c
	module/zfs/vdev.c
	module/zfs/zio.c
---

957b7b41d48c05e0b9fbdd28a98744ea2298f5a4
diff --cc cmd/zdb/zdb.c
index 478781882,ff73072f8..202b5a619
--- a/cmd/zdb/zdb.c
+++ b/cmd/zdb/zdb.c
@@@ -546,6 -637,133 +637,134 @@@ dump_metaslabs(spa_t *spa
  	}
  }
  
+ static void
+ dump_dde(const ddt_t *ddt, const ddt_entry_t *dde, uint64_t index)
+ {
+ 	const ddt_phys_t *ddp = dde->dde_phys;
+ 	const ddt_key_t *ddk = &dde->dde_key;
+ 	char *types[4] = { "ditto", "single", "double", "triple" };
+ 	char blkbuf[BP_SPRINTF_LEN];
+ 	blkptr_t blk;
++	int p;
+ 
 -	for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
++	for (p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
+ 		if (ddp->ddp_phys_birth == 0)
+ 			continue;
+ 		ddt_bp_create(ddt->ddt_checksum, ddk, ddp, &blk);
+ 		sprintf_blkptr(blkbuf, &blk);
+ 		(void) printf("index %llx refcnt %llu %s %s\n",
+ 		    (u_longlong_t)index, (u_longlong_t)ddp->ddp_refcnt,
+ 		    types[p], blkbuf);
+ 	}
+ }
+ 
+ static void
+ dump_dedup_ratio(const ddt_stat_t *dds)
+ {
+ 	double rL, rP, rD, D, dedup, compress, copies;
+ 
+ 	if (dds->dds_blocks == 0)
+ 		return;
+ 
+ 	rL = (double)dds->dds_ref_lsize;
+ 	rP = (double)dds->dds_ref_psize;
+ 	rD = (double)dds->dds_ref_dsize;
+ 	D = (double)dds->dds_dsize;
+ 
+ 	dedup = rD / D;
+ 	compress = rL / rP;
+ 	copies = rD / rP;
+ 
+ 	(void) printf("dedup = %.2f, compress = %.2f, copies = %.2f, "
+ 	    "dedup * compress / copies = %.2f\n\n",
+ 	    dedup, compress, copies, dedup * compress / copies);
+ }
+ 
+ static void
+ dump_ddt(ddt_t *ddt, enum ddt_type type, enum ddt_class class)
+ {
+ 	char name[DDT_NAMELEN];
+ 	ddt_entry_t dde;
+ 	uint64_t walk = 0;
+ 	dmu_object_info_t doi;
+ 	uint64_t count, dspace, mspace;
+ 	int error;
+ 
+ 	error = ddt_object_info(ddt, type, class, &doi);
+ 
+ 	if (error == ENOENT)
+ 		return;
+ 	ASSERT(error == 0);
+ 
+ 	count = ddt_object_count(ddt, type, class);
+ 	dspace = doi.doi_physical_blocks_512 << 9;
+ 	mspace = doi.doi_fill_count * doi.doi_data_block_size;
+ 
+ 	ASSERT(count != 0);	/* we should have destroyed it */
+ 
+ 	ddt_object_name(ddt, type, class, name);
+ 
+ 	(void) printf("%s: %llu entries, size %llu on disk, %llu in core\n",
+ 	    name,
+ 	    (u_longlong_t)count,
+ 	    (u_longlong_t)(dspace / count),
+ 	    (u_longlong_t)(mspace / count));
+ 
+ 	if (dump_opt['D'] < 3)
+ 		return;
+ 
+ 	zpool_dump_ddt(NULL, &ddt->ddt_histogram[type][class]);
+ 
+ 	if (dump_opt['D'] < 4)
+ 		return;
+ 
+ 	if (dump_opt['D'] < 5 && class == DDT_CLASS_UNIQUE)
+ 		return;
+ 
+ 	(void) printf("%s contents:\n\n", name);
+ 
+ 	while ((error = ddt_object_walk(ddt, type, class, &walk, &dde)) == 0)
+ 		dump_dde(ddt, &dde, walk);
+ 
+ 	ASSERT(error == ENOENT);
+ 
+ 	(void) printf("\n");
+ }
+ 
+ static void
+ dump_all_ddts(spa_t *spa)
+ {
+ 	ddt_histogram_t ddh_total = { 0 };
+ 	ddt_stat_t dds_total = { 0 };
+ 
+ 	for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
+ 		ddt_t *ddt = spa->spa_ddt[c];
+ 		for (enum ddt_type type = 0; type < DDT_TYPES; type++) {
+ 			for (enum ddt_class class = 0; class < DDT_CLASSES;
+ 			    class++) {
+ 				dump_ddt(ddt, type, class);
+ 			}
+ 		}
+ 	}
+ 
+ 	ddt_get_dedup_stats(spa, &dds_total);
+ 
+ 	if (dds_total.dds_blocks == 0) {
+ 		(void) printf("All DDTs are empty\n");
+ 		return;
+ 	}
+ 
+ 	(void) printf("\n");
+ 
+ 	if (dump_opt['D'] > 1) {
+ 		(void) printf("DDT histogram (aggregated over all DDTs):\n");
+ 		ddt_get_dedup_histogram(spa, &ddh_total);
+ 		zpool_dump_ddt(&dds_total, &ddh_total);
+ 	}
+ 
+ 	dump_dedup_ratio(&dds_total);
+ }
+ 
  static void
  dump_dtl_seg(space_map_t *sm, uint64_t start, uint64_t size)
  {
@@@ -565,9 -783,8 +784,9 @@@ dump_dtl(vdev_t *vd, int indent
  	boolean_t required;
  	char *name[DTL_TYPES] = { "missing", "partial", "scrub", "outage" };
  	char prefix[256];
 +	int c, t;
  
- 	spa_vdev_state_enter(spa);
+ 	spa_vdev_state_enter(spa, SCL_NONE);
  	required = vdev_dtl_required(vd);
  	(void) spa_vdev_state_exit(spa, NULL, 0);
  
@@@ -597,6 -814,68 +816,69 @@@
  		dump_dtl(vd->vdev_child[c], indent + 4);
  }
  
+ static void
+ dump_history(spa_t *spa)
+ {
+ 	nvlist_t **events = NULL;
+ 	char buf[SPA_MAXBLOCKSIZE];
+ 	uint64_t resid, len, off = 0;
+ 	uint_t num = 0;
+ 	int error;
+ 	time_t tsec;
+ 	struct tm t;
+ 	char tbuf[30];
+ 	char internalstr[MAXPATHLEN];
++	int i;
+ 
+ 	do {
+ 		len = sizeof (buf);
+ 
+ 		if ((error = spa_history_get(spa, &off, &len, buf)) != 0) {
+ 			(void) fprintf(stderr, "Unable to read history: "
+ 			    "error %d\n", error);
+ 			return;
+ 		}
+ 
+ 		if (zpool_history_unpack(buf, len, &resid, &events, &num) != 0)
+ 			break;
+ 
+ 		off -= resid;
+ 	} while (len != 0);
+ 
+ 	(void) printf("\nHistory:\n");
 -	for (int i = 0; i < num; i++) {
++	for (i = 0; i < num; i++) {
+ 		uint64_t time, txg, ievent;
+ 		char *cmd, *intstr;
+ 
+ 		if (nvlist_lookup_uint64(events[i], ZPOOL_HIST_TIME,
+ 		    &time) != 0)
+ 			continue;
+ 		if (nvlist_lookup_string(events[i], ZPOOL_HIST_CMD,
+ 		    &cmd) != 0) {
+ 			if (nvlist_lookup_uint64(events[i],
+ 			    ZPOOL_HIST_INT_EVENT, &ievent) != 0)
+ 				continue;
+ 			verify(nvlist_lookup_uint64(events[i],
+ 			    ZPOOL_HIST_TXG, &txg) == 0);
+ 			verify(nvlist_lookup_string(events[i],
+ 			    ZPOOL_HIST_INT_STR, &intstr) == 0);
+ 			if (ievent >= LOG_END)
+ 				continue;
+ 
+ 			(void) snprintf(internalstr,
+ 			    sizeof (internalstr),
+ 			    "[internal %s txg:%lld] %s",
+ 			    zfs_history_event_names[ievent], txg,
+ 			    intstr);
+ 			cmd = internalstr;
+ 		}
+ 		tsec = time;
+ 		(void) localtime_r(&tsec, &t);
+ 		(void) strftime(tbuf, sizeof (tbuf), "%F.%T", &t);
+ 		(void) printf("%s %s\n", tbuf, cmd);
+ 	}
+ }
+ 
  /*ARGSUSED*/
  static void
  dump_dnode(objset_t *os, uint64_t object, void *data, size_t size)
@@@ -614,15 -900,19 +903,20 @@@ blkid2offset(const dnode_phys_t *dnp, c
  }
  
  static void
- sprintf_blkptr_compact(char *blkbuf, blkptr_t *bp, int alldvas)
+ sprintf_blkptr_compact(char *blkbuf, const blkptr_t *bp)
  {
- 	dva_t *dva = bp->blk_dva;
- 	int ndvas = alldvas ? BP_GET_NDVAS(bp) : 1;
+ 	const dva_t *dva = bp->blk_dva;
+ 	int ndvas = dump_opt['d'] > 5 ? BP_GET_NDVAS(bp) : 1;
 +	int i;
  
+ 	if (dump_opt['b'] >= 5) {
+ 		sprintf_blkptr(blkbuf, bp);
+ 		return;
+ 	}
+ 
  	blkbuf[0] = '\0';
  
 -	for (int i = 0; i < ndvas; i++)
 +	for (i = 0; i < ndvas; i++)
  		(void) sprintf(blkbuf + strlen(blkbuf), "%llu:%llx:%llx ",
  		    (u_longlong_t)DVA_GET_VDEV(&dva[i]),
  		    (u_longlong_t)DVA_GET_OFFSET(&dva[i]),
@@@ -1344,19 -1739,52 +1743,54 @@@ dump_cachefile(const char *cachefile
  	nvlist_free(config);
  }
  
+ #define	ZDB_MAX_UB_HEADER_SIZE 32
+ 
+ static void
+ dump_label_uberblocks(vdev_label_t *lbl, uint64_t ashift)
+ {
+ 	vdev_t vd;
+ 	vdev_t *vdp = &vd;
+ 	char header[ZDB_MAX_UB_HEADER_SIZE];
++	int i;
+ 
+ 	vd.vdev_ashift = ashift;
+ 	vdp->vdev_top = vdp;
+ 
 -	for (int i = 0; i < VDEV_UBERBLOCK_COUNT(vdp); i++) {
++	for (i = 0; i < VDEV_UBERBLOCK_COUNT(vdp); i++) {
+ 		uint64_t uoff = VDEV_UBERBLOCK_OFFSET(vdp, i);
+ 		uberblock_t *ub = (void *)((char *)lbl + uoff);
+ 
+ 		if (uberblock_verify(ub))
+ 			continue;
+ 		(void) snprintf(header, ZDB_MAX_UB_HEADER_SIZE,
+ 		    "Uberblock[%d]\n", i);
+ 		dump_uberblock(ub, header, "");
+ 	}
+ }
+ 
  static void
  dump_label(const char *dev)
  {
  	int fd;
  	vdev_label_t label;
- 	char *buf = label.vl_vdev_phys.vp_nvlist;
+ 	char *path, *buf = label.vl_vdev_phys.vp_nvlist;
  	size_t buflen = sizeof (label.vl_vdev_phys.vp_nvlist);
  	struct stat64 statbuf;
- 	uint64_t psize;
+ 	uint64_t psize, ashift;
+ 	int len = strlen(dev) + 1;
 +	int l;
  
- 	if ((fd = open64(dev, O_RDONLY)) < 0) {
- 		(void) printf("cannot open '%s': %s\n", dev, strerror(errno));
+ 	if (strncmp(dev, "/dev/dsk/", 9) == 0) {
+ 		len++;
+ 		path = malloc(len);
+ 		(void) snprintf(path, len, "%s%s", "/dev/rdsk/", dev + 9);
+ 	} else {
+ 		path = strdup(dev);
+ 	}
+ 
+ 	if ((fd = open64(path, O_RDONLY)) < 0) {
+ 		(void) printf("cannot open '%s': %s\n", path, strerror(errno));
+ 		free(path);
  		exit(1);
  	}
  
@@@ -1369,8 -1807,7 +1813,7 @@@
  	psize = statbuf.st_size;
  	psize = P2ALIGN(psize, (uint64_t)sizeof (vdev_label_t));
  
 -	for (int l = 0; l < VDEV_LABELS; l++) {
 +	for (l = 0; l < VDEV_LABELS; l++) {
- 
  		nvlist_t *config = NULL;
  
  		(void) printf("--------------------------------------------\n");
@@@ -1507,13 -1897,19 +1903,20 @@@ typedef struct zdb_cb 
  } zdb_cb_t;
  
  static void
- zdb_count_block(spa_t *spa, zdb_cb_t *zcb, blkptr_t *bp, dmu_object_type_t type)
+ zdb_count_block(zdb_cb_t *zcb, zilog_t *zilog, const blkptr_t *bp,
+     dmu_object_type_t type)
  {
+ 	uint64_t refcnt = 0;
 +	int i;
  
+ 	ASSERT(type < ZDB_OT_TOTAL);
+ 
+ 	if (zilog && zil_bp_tree_add(zilog, bp) != 0)
+ 		return;
+ 
 -	for (int i = 0; i < 4; i++) {
 +	for (i = 0; i < 4; i++) {
  		int l = (i < 2) ? BP_GET_LEVEL(bp) : ZB_TOTAL;
- 		int t = (i & 1) ? type : DMU_OT_TOTAL;
+ 		int t = (i & 1) ? type : ZDB_OT_TOTAL;
  		zdb_blkstats_t *zb = &zcb->zcb_type[l][t];
  
  		zb->zb_asize += BP_GET_ASIZE(bp);
@@@ -1625,24 -2017,159 +2024,164 @@@ zdb_blkptr_cb(spa_t *spa, zilog_t *zilo
  	return (0);
  }
  
+ static void
+ zdb_leak(space_map_t *sm, uint64_t start, uint64_t size)
+ {
+ 	vdev_t *vd = sm->sm_ppd;
+ 
+ 	(void) printf("leaked space: vdev %llu, offset 0x%llx, size %llu\n",
+ 	    (u_longlong_t)vd->vdev_id, (u_longlong_t)start, (u_longlong_t)size);
+ }
+ 
+ /* ARGSUSED */
+ static void
+ zdb_space_map_load(space_map_t *sm)
+ {
+ }
+ 
+ static void
+ zdb_space_map_unload(space_map_t *sm)
+ {
+ 	space_map_vacate(sm, zdb_leak, sm);
+ }
+ 
+ /* ARGSUSED */
+ static void
+ zdb_space_map_claim(space_map_t *sm, uint64_t start, uint64_t size)
+ {
+ }
+ 
+ static space_map_ops_t zdb_space_map_ops = {
+ 	zdb_space_map_load,
+ 	zdb_space_map_unload,
+ 	NULL,	/* alloc */
+ 	zdb_space_map_claim,
+ 	NULL,	/* free */
+ 	NULL	/* maxsize */
+ };
+ 
+ static void
+ zdb_ddt_leak_init(spa_t *spa, zdb_cb_t *zcb)
+ {
+ 	ddt_bookmark_t ddb = { 0 };
+ 	ddt_entry_t dde;
+ 	int error;
++	int p;
+ 
+ 	while ((error = ddt_walk(spa, &ddb, &dde)) == 0) {
+ 		blkptr_t blk;
+ 		ddt_phys_t *ddp = dde.dde_phys;
+ 
+ 		if (ddb.ddb_class == DDT_CLASS_UNIQUE)
+ 			return;
+ 
+ 		ASSERT(ddt_phys_total_refcnt(&dde) > 1);
+ 
 -		for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
++		for (p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
+ 			if (ddp->ddp_phys_birth == 0)
+ 				continue;
+ 			ddt_bp_create(ddb.ddb_checksum,
+ 			    &dde.dde_key, ddp, &blk);
+ 			if (p == DDT_PHYS_DITTO) {
+ 				zdb_count_block(zcb, NULL, &blk, ZDB_OT_DITTO);
+ 			} else {
+ 				zcb->zcb_dedup_asize +=
+ 				    BP_GET_ASIZE(&blk) * (ddp->ddp_refcnt - 1);
+ 				zcb->zcb_dedup_blocks++;
+ 			}
+ 		}
+ 		if (!dump_opt['L']) {
+ 			ddt_t *ddt = spa->spa_ddt[ddb.ddb_checksum];
+ 			ddt_enter(ddt);
+ 			VERIFY(ddt_lookup(ddt, &blk, B_TRUE) != NULL);
+ 			ddt_exit(ddt);
+ 		}
+ 	}
+ 
+ 	ASSERT(error == ENOENT);
+ }
+ 
+ static void
+ zdb_leak_init(spa_t *spa, zdb_cb_t *zcb)
+ {
+ 	zcb->zcb_spa = spa;
++	int c, m;
+ 
+ 	if (!dump_opt['L']) {
+ 		vdev_t *rvd = spa->spa_root_vdev;
 -		for (int c = 0; c < rvd->vdev_children; c++) {
++		for (c = 0; c < rvd->vdev_children; c++) {
+ 			vdev_t *vd = rvd->vdev_child[c];
 -			for (int m = 0; m < vd->vdev_ms_count; m++) {
++			for (m = 0; m < vd->vdev_ms_count; m++) {
+ 				metaslab_t *msp = vd->vdev_ms[m];
+ 				mutex_enter(&msp->ms_lock);
+ 				space_map_unload(&msp->ms_map);
+ 				VERIFY(space_map_load(&msp->ms_map,
+ 				    &zdb_space_map_ops, SM_ALLOC, &msp->ms_smo,
+ 				    spa->spa_meta_objset) == 0);
+ 				msp->ms_map.sm_ppd = vd;
+ 				mutex_exit(&msp->ms_lock);
+ 			}
+ 		}
+ 	}
+ 
+ 	spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
+ 
+ 	zdb_ddt_leak_init(spa, zcb);
+ 
+ 	spa_config_exit(spa, SCL_CONFIG, FTAG);
+ }
+ 
+ static void
+ zdb_leak_fini(spa_t *spa)
+ {
++	int c, m;
++
+ 	if (!dump_opt['L']) {
+ 		vdev_t *rvd = spa->spa_root_vdev;
 -		for (int c = 0; c < rvd->vdev_children; c++) {
++		for (c = 0; c < rvd->vdev_children; c++) {
+ 			vdev_t *vd = rvd->vdev_child[c];
 -			for (int m = 0; m < vd->vdev_ms_count; m++) {
++			for (m = 0; m < vd->vdev_ms_count; m++) {
+ 				metaslab_t *msp = vd->vdev_ms[m];
+ 				mutex_enter(&msp->ms_lock);
+ 				space_map_unload(&msp->ms_map);
+ 				mutex_exit(&msp->ms_lock);
+ 			}
+ 		}
+ 	}
+ }
+ 
+ /* ARGSUSED */
+ static int
+ count_block_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
+ {
+ 	zdb_cb_t *zcb = arg;
+ 
+ 	if (dump_opt['b'] >= 4) {
+ 		char blkbuf[BP_SPRINTF_LEN];
+ 		sprintf_blkptr(blkbuf, bp);
+ 		(void) printf("[%s] %s\n",
+ 		    "deferred free", blkbuf);
+ 	}
+ 	zdb_count_block(zcb, NULL, bp, ZDB_OT_DEFERRED);
+ 	return (0);
+ }
+ 
  static int
  dump_block_stats(spa_t *spa)
  {
  	zdb_cb_t zcb = { 0 };
  	zdb_blkstats_t *zb, *tzb;
- 	uint64_t alloc, space, logalloc;
- 	vdev_t *rvd = spa->spa_root_vdev;
+ 	uint64_t norm_alloc, norm_space, total_alloc, total_found;
+ 	int flags = TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA | TRAVERSE_HARD;
  	int leaks = 0;
- 	int c, e;
++	int e;
  
- 	if (!dump_opt['S']) {
- 		(void) printf("\nTraversing all blocks %s%s%s%s%s...\n",
- 		    (dump_opt['c'] || !dump_opt['L']) ? "to verify " : "",
- 		    (dump_opt['c'] == 1) ? "metadata " : "",
- 		    dump_opt['c'] ? "checksums " : "",
- 		    (dump_opt['c'] && !dump_opt['L']) ? "and verify " : "",
- 		    !dump_opt['L'] ? "nothing leaked " : "");
- 	}
+ 	(void) printf("\nTraversing all blocks %s%s%s%s%s...\n",
+ 	    (dump_opt['c'] || !dump_opt['L']) ? "to verify " : "",
+ 	    (dump_opt['c'] == 1) ? "metadata " : "",
+ 	    dump_opt['c'] ? "checksums " : "",
+ 	    (dump_opt['c'] && !dump_opt['L']) ? "and verify " : "",
+ 	    !dump_opt['L'] ? "nothing leaked " : "");
  
  	/*
  	 * Load all space maps as SM_ALLOC maps, then traverse the pool
@@@ -1658,33 -2184,20 +2196,20 @@@
  	/*
  	 * If there's a deferred-free bplist, process that first.
  	 */
- 	if (spa->spa_sync_bplist_obj != 0) {
- 		bplist_t *bpl = &spa->spa_sync_bplist;
- 		blkptr_t blk;
- 		uint64_t itor = 0;
+ 	(void) bpobj_iterate_nofree(&spa->spa_deferred_bpobj,
+ 	    count_block_cb, &zcb, NULL);
+ 	(void) bpobj_iterate_nofree(&spa->spa_dsl_pool->dp_free_bpobj,
+ 	    count_block_cb, &zcb, NULL);
  
- 		VERIFY(0 == bplist_open(bpl, spa->spa_meta_objset,
- 		    spa->spa_sync_bplist_obj));
+ 	if (dump_opt['c'] > 1)
+ 		flags |= TRAVERSE_PREFETCH_DATA;
  
- 		while (bplist_iterate(bpl, &itor, &blk) == 0) {
- 			if (dump_opt['b'] >= 4) {
- 				char blkbuf[BP_SPRINTF_LEN];
- 				sprintf_blkptr(blkbuf, BP_SPRINTF_LEN, &blk);
- 				(void) printf("[%s] %s\n",
- 				    "deferred free", blkbuf);
- 			}
- 			zdb_count_block(spa, &zcb, &blk, DMU_OT_DEFERRED);
- 		}
- 
- 		bplist_close(bpl);
- 	}
- 
- 	zcb.zcb_haderrors |= traverse_pool(spa, zdb_blkptr_cb, &zcb);
+ 	zcb.zcb_haderrors |= traverse_pool(spa, 0, flags, zdb_blkptr_cb, &zcb);
  
- 	if (zcb.zcb_haderrors && !dump_opt['S']) {
+ 	if (zcb.zcb_haderrors) {
  		(void) printf("\nError counts:\n\n");
  		(void) printf("\t%5s  %s\n", "errno", "count");
 -		for (int e = 0; e < 256; e++) {
 +		for (e = 0; e < 256; e++) {
  			if (zcb.zcb_errors[e] != 0) {
  				(void) printf("\t%5d  %llu\n",
  				    e, (u_longlong_t)zcb.zcb_errors[e]);
diff --cc cmd/ztest/ztest.c
index 5ce765418,eed92ec72..bdfde21bb
--- a/cmd/ztest/ztest.c
+++ b/cmd/ztest/ztest.c
@@@ -763,152 -871,1365 +871,1372 @@@ ztest_spa_prop_set_uint64(ztest_shared_
  	return (error);
  }
  
- zil_replay_func_t *ztest_replay_vector[TX_MAX_TYPE] = {
- 	NULL,			/* 0 no such transaction type */
- 	ztest_replay_create,	/* TX_CREATE */
- 	NULL,			/* TX_MKDIR */
- 	NULL,			/* TX_MKXATTR */
- 	NULL,			/* TX_SYMLINK */
- 	ztest_replay_remove,	/* TX_REMOVE */
- 	NULL,			/* TX_RMDIR */
- 	NULL,			/* TX_LINK */
- 	NULL,			/* TX_RENAME */
- 	NULL,			/* TX_WRITE */
- 	NULL,			/* TX_TRUNCATE */
- 	NULL,			/* TX_SETATTR */
- 	NULL,			/* TX_ACL */
- };
+ static void
+ ztest_rll_init(rll_t *rll)
+ {
+ 	rll->rll_writer = NULL;
+ 	rll->rll_readers = 0;
+ 	VERIFY(_mutex_init(&rll->rll_lock, USYNC_THREAD, NULL) == 0);
+ 	VERIFY(cond_init(&rll->rll_cv, USYNC_THREAD, NULL) == 0);
+ }
  
- /*
-  * Verify that we can't destroy an active pool, create an existing pool,
-  * or create a pool with a bad vdev spec.
-  */
- void
- ztest_spa_create_destroy(ztest_args_t *za)
+ static void
+ ztest_rll_destroy(rll_t *rll)
  {
- 	int error;
- 	spa_t *spa;
- 	nvlist_t *nvroot;
+ 	ASSERT(rll->rll_writer == NULL);
+ 	ASSERT(rll->rll_readers == 0);
+ 	VERIFY(_mutex_destroy(&rll->rll_lock) == 0);
+ 	VERIFY(cond_destroy(&rll->rll_cv) == 0);
+ }
  
- 	/*
- 	 * Attempt to create using a bad file.
- 	 */
- 	nvroot = make_vdev_root("/dev/bogus", NULL, 0, 0, 0, 0, 0, 1);
- 	error = spa_create("ztest_bad_file", nvroot, NULL, NULL, NULL);
- 	nvlist_free(nvroot);
- 	if (error != ENOENT)
- 		fatal(0, "spa_create(bad_file) = %d", error);
+ static void
+ ztest_rll_lock(rll_t *rll, rl_type_t type)
+ {
+ 	VERIFY(mutex_lock(&rll->rll_lock) == 0);
  
- 	/*
- 	 * Attempt to create using a bad mirror.
- 	 */
- 	nvroot = make_vdev_root("/dev/bogus", NULL, 0, 0, 0, 0, 2, 1);
- 	error = spa_create("ztest_bad_mirror", nvroot, NULL, NULL, NULL);
- 	nvlist_free(nvroot);
- 	if (error != ENOENT)
- 		fatal(0, "spa_create(bad_mirror) = %d", error);
+ 	if (type == RL_READER) {
+ 		while (rll->rll_writer != NULL)
+ 			(void) cond_wait(&rll->rll_cv, &rll->rll_lock);
+ 		rll->rll_readers++;
+ 	} else {
+ 		while (rll->rll_writer != NULL || rll->rll_readers)
+ 			(void) cond_wait(&rll->rll_cv, &rll->rll_lock);
+ 		rll->rll_writer = curthread;
+ 	}
  
- 	/*
- 	 * Attempt to create an existing pool.  It shouldn't matter
- 	 * what's in the nvroot; we should fail with EEXIST.
- 	 */
- 	(void) rw_rdlock(&ztest_shared->zs_name_lock);
- 	nvroot = make_vdev_root("/dev/bogus", NULL, 0, 0, 0, 0, 0, 1);
- 	error = spa_create(za->za_pool, nvroot, NULL, NULL, NULL);
- 	nvlist_free(nvroot);
- 	if (error != EEXIST)
- 		fatal(0, "spa_create(whatever) = %d", error);
+ 	VERIFY(mutex_unlock(&rll->rll_lock) == 0);
+ }
  
- 	error = spa_open(za->za_pool, &spa, FTAG);
- 	if (error)
- 		fatal(0, "spa_open() = %d", error);
+ static void
+ ztest_rll_unlock(rll_t *rll)
+ {
+ 	VERIFY(mutex_lock(&rll->rll_lock) == 0);
  
- 	error = spa_destroy(za->za_pool);
- 	if (error != EBUSY)
- 		fatal(0, "spa_destroy() = %d", error);
+ 	if (rll->rll_writer) {
+ 		ASSERT(rll->rll_readers == 0);
+ 		rll->rll_writer = NULL;
+ 	} else {
+ 		ASSERT(rll->rll_readers != 0);
+ 		ASSERT(rll->rll_writer == NULL);
+ 		rll->rll_readers--;
+ 	}
  
- 	spa_close(spa, FTAG);
- 	(void) rw_unlock(&ztest_shared->zs_name_lock);
+ 	if (rll->rll_writer == NULL && rll->rll_readers == 0)
+ 		VERIFY(cond_broadcast(&rll->rll_cv) == 0);
+ 
+ 	VERIFY(mutex_unlock(&rll->rll_lock) == 0);
  }
  
- static vdev_t *
- vdev_lookup_by_path(vdev_t *vd, const char *path)
+ static void
+ ztest_object_lock(ztest_ds_t *zd, uint64_t object, rl_type_t type)
  {
- 	vdev_t *mvd;
- 	int c;
+ 	rll_t *rll = &zd->zd_object_lock[object & (ZTEST_OBJECT_LOCKS - 1)];
  
- 	if (vd->vdev_path != NULL && strcmp(path, vd->vdev_path) == 0)
- 		return (vd);
+ 	ztest_rll_lock(rll, type);
+ }
  
- 	for (c = 0; c < vd->vdev_children; c++)
- 		if ((mvd = vdev_lookup_by_path(vd->vdev_child[c], path)) !=
- 		    NULL)
- 			return (mvd);
+ static void
+ ztest_object_unlock(ztest_ds_t *zd, uint64_t object)
+ {
+ 	rll_t *rll = &zd->zd_object_lock[object & (ZTEST_OBJECT_LOCKS - 1)];
  
- 	return (NULL);
+ 	ztest_rll_unlock(rll);
  }
  
- /*
-  * Verify that vdev_add() works as expected.
-  */
- void
- ztest_vdev_add_remove(ztest_args_t *za)
+ static rl_t *
+ ztest_range_lock(ztest_ds_t *zd, uint64_t object, uint64_t offset,
+     uint64_t size, rl_type_t type)
  {
- 	spa_t *spa = za->za_spa;
- 	uint64_t leaves = MAX(zopt_mirrors, 1) * zopt_raidz;
- 	nvlist_t *nvroot;
- 	int error;
+ 	uint64_t hash = object ^ (offset % (ZTEST_RANGE_LOCKS + 1));
+ 	rll_t *rll = &zd->zd_range_lock[hash & (ZTEST_RANGE_LOCKS - 1)];
+ 	rl_t *rl;
  
- 	(void) mutex_lock(&ztest_shared->zs_vdev_lock);
+ 	rl = umem_alloc(sizeof (*rl), UMEM_NOFAIL);
+ 	rl->rl_object = object;
+ 	rl->rl_offset = offset;
+ 	rl->rl_size = size;
+ 	rl->rl_lock = rll;
  
- 	spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
+ 	ztest_rll_lock(rll, type);
  
- 	ztest_shared->zs_vdev_primaries =
- 	    spa->spa_root_vdev->vdev_children * leaves;
+ 	return (rl);
+ }
  
- 	spa_config_exit(spa, SCL_VDEV, FTAG);
+ static void
+ ztest_range_unlock(rl_t *rl)
+ {
+ 	rll_t *rll = rl->rl_lock;
  
- 	/*
- 	 * Make 1/4 of the devices be log devices.
- 	 */
- 	nvroot = make_vdev_root(NULL, NULL, zopt_vdev_size, 0,
- 	    ztest_random(4) == 0, zopt_raidz, zopt_mirrors, 1);
+ 	ztest_rll_unlock(rll);
  
- 	error = spa_vdev_add(spa, nvroot);
- 	nvlist_free(nvroot);
+ 	umem_free(rl, sizeof (*rl));
+ }
+ 
+ static void
+ ztest_zd_init(ztest_ds_t *zd, objset_t *os)
+ {
+ 	zd->zd_os = os;
+ 	zd->zd_zilog = dmu_objset_zil(os);
+ 	zd->zd_seq = 0;
+ 	dmu_objset_name(os, zd->zd_name);
++	int l;
  
- 	(void) mutex_unlock(&ztest_shared->zs_vdev_lock);
+ 	VERIFY(_mutex_init(&zd->zd_dirobj_lock, USYNC_THREAD, NULL) == 0);
  
- 	if (error == ENOSPC)
- 		ztest_record_enospc("spa_vdev_add");
- 	else if (error != 0)
- 		fatal(0, "spa_vdev_add() = %d", error);
 -	for (int l = 0; l < ZTEST_OBJECT_LOCKS; l++)
++	for (l = 0; l < ZTEST_OBJECT_LOCKS; l++)
+ 		ztest_rll_init(&zd->zd_object_lock[l]);
+ 
 -	for (int l = 0; l < ZTEST_RANGE_LOCKS; l++)
++	for (l = 0; l < ZTEST_RANGE_LOCKS; l++)
+ 		ztest_rll_init(&zd->zd_range_lock[l]);
  }
  
- /*
-  * Verify that adding/removing aux devices (l2arc, hot spare) works as expected.
-  */
- void
- ztest_vdev_aux_add_remove(ztest_args_t *za)
+ static void
+ ztest_zd_fini(ztest_ds_t *zd)
  {
- 	spa_t *spa = za->za_spa;
- 	vdev_t *rvd = spa->spa_root_vdev;
- 	spa_aux_vdev_t *sav;
- 	char *aux;
- 	uint64_t guid = 0;
- 	int error;
++	int l;
 +
- 	if (ztest_random(2) == 0) {
- 		sav = &spa->spa_spares;
- 		aux = ZPOOL_CONFIG_SPARES;
- 	} else {
- 		sav = &spa->spa_l2cache;
- 		aux = ZPOOL_CONFIG_L2CACHE;
- 	}
+ 	VERIFY(_mutex_destroy(&zd->zd_dirobj_lock) == 0);
  
- 	(void) mutex_lock(&ztest_shared->zs_vdev_lock);
 -	for (int l = 0; l < ZTEST_OBJECT_LOCKS; l++)
++	for (l = 0; l < ZTEST_OBJECT_LOCKS; l++)
+ 		ztest_rll_destroy(&zd->zd_object_lock[l]);
  
- 	spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
 -	for (int l = 0; l < ZTEST_RANGE_LOCKS; l++)
++	for (l = 0; l < ZTEST_RANGE_LOCKS; l++)
+ 		ztest_rll_destroy(&zd->zd_range_lock[l]);
+ }
+ 
+ #define	TXG_MIGHTWAIT	(ztest_random(10) == 0 ? TXG_NOWAIT : TXG_WAIT)
+ 
+ static uint64_t
+ ztest_tx_assign(dmu_tx_t *tx, uint64_t txg_how, const char *tag)
+ {
+ 	uint64_t txg;
+ 	int error;
+ 
+ 	/*
+ 	 * Attempt to assign tx to some transaction group.
+ 	 */
+ 	error = dmu_tx_assign(tx, txg_how);
+ 	if (error) {
+ 		if (error == ERESTART) {
+ 			ASSERT(txg_how == TXG_NOWAIT);
+ 			dmu_tx_wait(tx);
+ 		} else {
+ 			ASSERT3U(error, ==, ENOSPC);
+ 			ztest_record_enospc(tag);
+ 		}
+ 		dmu_tx_abort(tx);
+ 		return (0);
+ 	}
+ 	txg = dmu_tx_get_txg(tx);
+ 	ASSERT(txg != 0);
+ 	return (txg);
+ }
+ 
+ static void
+ ztest_pattern_set(void *buf, uint64_t size, uint64_t value)
+ {
+ 	uint64_t *ip = buf;
+ 	uint64_t *ip_end = (uint64_t *)((uintptr_t)buf + (uintptr_t)size);
+ 
+ 	while (ip < ip_end)
+ 		*ip++ = value;
+ }
+ 
+ static boolean_t
+ ztest_pattern_match(void *buf, uint64_t size, uint64_t value)
+ {
+ 	uint64_t *ip = buf;
+ 	uint64_t *ip_end = (uint64_t *)((uintptr_t)buf + (uintptr_t)size);
+ 	uint64_t diff = 0;
+ 
+ 	while (ip < ip_end)
+ 		diff |= (value - *ip++);
+ 
+ 	return (diff == 0);
+ }
+ 
+ static void
+ ztest_bt_generate(ztest_block_tag_t *bt, objset_t *os, uint64_t object,
+     uint64_t offset, uint64_t gen, uint64_t txg, uint64_t crtxg)
+ {
+ 	bt->bt_magic = BT_MAGIC;
+ 	bt->bt_objset = dmu_objset_id(os);
+ 	bt->bt_object = object;
+ 	bt->bt_offset = offset;
+ 	bt->bt_gen = gen;
+ 	bt->bt_txg = txg;
+ 	bt->bt_crtxg = crtxg;
+ }
+ 
+ static void
+ ztest_bt_verify(ztest_block_tag_t *bt, objset_t *os, uint64_t object,
+     uint64_t offset, uint64_t gen, uint64_t txg, uint64_t crtxg)
+ {
+ 	ASSERT(bt->bt_magic == BT_MAGIC);
+ 	ASSERT(bt->bt_objset == dmu_objset_id(os));
+ 	ASSERT(bt->bt_object == object);
+ 	ASSERT(bt->bt_offset == offset);
+ 	ASSERT(bt->bt_gen <= gen);
+ 	ASSERT(bt->bt_txg <= txg);
+ 	ASSERT(bt->bt_crtxg == crtxg);
+ }
+ 
+ static ztest_block_tag_t *
+ ztest_bt_bonus(dmu_buf_t *db)
+ {
+ 	dmu_object_info_t doi;
+ 	ztest_block_tag_t *bt;
+ 
+ 	dmu_object_info_from_db(db, &doi);
+ 	ASSERT3U(doi.doi_bonus_size, <=, db->db_size);
+ 	ASSERT3U(doi.doi_bonus_size, >=, sizeof (*bt));
+ 	bt = (void *)((char *)db->db_data + doi.doi_bonus_size - sizeof (*bt));
+ 
+ 	return (bt);
+ }
+ 
+ /*
+  * ZIL logging ops
+  */
+ 
+ #define	lrz_type	lr_mode
+ #define	lrz_blocksize	lr_uid
+ #define	lrz_ibshift	lr_gid
+ #define	lrz_bonustype	lr_rdev
+ #define	lrz_bonuslen	lr_crtime[1]
+ 
+ static uint64_t
+ ztest_log_create(ztest_ds_t *zd, dmu_tx_t *tx, lr_create_t *lr)
+ {
+ 	char *name = (void *)(lr + 1);		/* name follows lr */
+ 	size_t namesize = strlen(name) + 1;
+ 	itx_t *itx;
+ 
+ 	if (zil_replaying(zd->zd_zilog, tx))
+ 		return (0);
+ 
+ 	itx = zil_itx_create(TX_CREATE, sizeof (*lr) + namesize);
+ 	bcopy(&lr->lr_common + 1, &itx->itx_lr + 1,
+ 	    sizeof (*lr) + namesize - sizeof (lr_t));
+ 
+ 	return (zil_itx_assign(zd->zd_zilog, itx, tx));
+ }
+ 
+ static uint64_t
+ ztest_log_remove(ztest_ds_t *zd, dmu_tx_t *tx, lr_remove_t *lr)
+ {
+ 	char *name = (void *)(lr + 1);		/* name follows lr */
+ 	size_t namesize = strlen(name) + 1;
+ 	itx_t *itx;
+ 
+ 	if (zil_replaying(zd->zd_zilog, tx))
+ 		return (0);
+ 
+ 	itx = zil_itx_create(TX_REMOVE, sizeof (*lr) + namesize);
+ 	bcopy(&lr->lr_common + 1, &itx->itx_lr + 1,
+ 	    sizeof (*lr) + namesize - sizeof (lr_t));
+ 
+ 	return (zil_itx_assign(zd->zd_zilog, itx, tx));
+ }
+ 
+ static uint64_t
+ ztest_log_write(ztest_ds_t *zd, dmu_tx_t *tx, lr_write_t *lr)
+ {
+ 	itx_t *itx;
+ 	itx_wr_state_t write_state = ztest_random(WR_NUM_STATES);
+ 
+ 	if (zil_replaying(zd->zd_zilog, tx))
+ 		return (0);
+ 
+ 	if (lr->lr_length > ZIL_MAX_LOG_DATA)
+ 		write_state = WR_INDIRECT;
+ 
+ 	itx = zil_itx_create(TX_WRITE,
+ 	    sizeof (*lr) + (write_state == WR_COPIED ? lr->lr_length : 0));
+ 
+ 	if (write_state == WR_COPIED &&
+ 	    dmu_read(zd->zd_os, lr->lr_foid, lr->lr_offset, lr->lr_length,
+ 	    ((lr_write_t *)&itx->itx_lr) + 1, DMU_READ_NO_PREFETCH) != 0) {
+ 		zil_itx_destroy(itx);
+ 		itx = zil_itx_create(TX_WRITE, sizeof (*lr));
+ 		write_state = WR_NEED_COPY;
+ 	}
+ 	itx->itx_private = zd;
+ 	itx->itx_wr_state = write_state;
+ 	itx->itx_sync = (ztest_random(8) == 0);
+ 	itx->itx_sod += (write_state == WR_NEED_COPY ? lr->lr_length : 0);
+ 
+ 	bcopy(&lr->lr_common + 1, &itx->itx_lr + 1,
+ 	    sizeof (*lr) - sizeof (lr_t));
+ 
+ 	return (zil_itx_assign(zd->zd_zilog, itx, tx));
+ }
+ 
+ static uint64_t
+ ztest_log_truncate(ztest_ds_t *zd, dmu_tx_t *tx, lr_truncate_t *lr)
+ {
+ 	itx_t *itx;
+ 
+ 	if (zil_replaying(zd->zd_zilog, tx))
+ 		return (0);
+ 
+ 	itx = zil_itx_create(TX_TRUNCATE, sizeof (*lr));
+ 	bcopy(&lr->lr_common + 1, &itx->itx_lr + 1,
+ 	    sizeof (*lr) - sizeof (lr_t));
+ 
+ 	return (zil_itx_assign(zd->zd_zilog, itx, tx));
+ }
+ 
+ static uint64_t
+ ztest_log_setattr(ztest_ds_t *zd, dmu_tx_t *tx, lr_setattr_t *lr)
+ {
+ 	itx_t *itx;
+ 
+ 	if (zil_replaying(zd->zd_zilog, tx))
+ 		return (0);
+ 
+ 	itx = zil_itx_create(TX_SETATTR, sizeof (*lr));
+ 	bcopy(&lr->lr_common + 1, &itx->itx_lr + 1,
+ 	    sizeof (*lr) - sizeof (lr_t));
+ 
+ 	return (zil_itx_assign(zd->zd_zilog, itx, tx));
+ }
+ 
+ /*
+  * ZIL replay ops
+  */
+ static int
+ ztest_replay_create(ztest_ds_t *zd, lr_create_t *lr, boolean_t byteswap)
+ {
+ 	char *name = (void *)(lr + 1);		/* name follows lr */
+ 	objset_t *os = zd->zd_os;
+ 	ztest_block_tag_t *bbt;
+ 	dmu_buf_t *db;
+ 	dmu_tx_t *tx;
+ 	uint64_t txg;
+ 	int error = 0;
+ 
+ 	if (byteswap)
+ 		byteswap_uint64_array(lr, sizeof (*lr));
+ 
+ 	ASSERT(lr->lr_doid == ZTEST_DIROBJ);
+ 	ASSERT(name[0] != '\0');
+ 
+ 	tx = dmu_tx_create(os);
+ 
+ 	dmu_tx_hold_zap(tx, lr->lr_doid, B_TRUE, name);
+ 
+ 	if (lr->lrz_type == DMU_OT_ZAP_OTHER) {
+ 		dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL);
+ 	} else {
+ 		dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
+ 	}
+ 
+ 	txg = ztest_tx_assign(tx, TXG_WAIT, FTAG);
+ 	if (txg == 0)
+ 		return (ENOSPC);
+ 
+ 	ASSERT(dmu_objset_zil(os)->zl_replay == !!lr->lr_foid);
+ 
+ 	if (lr->lrz_type == DMU_OT_ZAP_OTHER) {
+ 		if (lr->lr_foid == 0) {
+ 			lr->lr_foid = zap_create(os,
+ 			    lr->lrz_type, lr->lrz_bonustype,
+ 			    lr->lrz_bonuslen, tx);
+ 		} else {
+ 			error = zap_create_claim(os, lr->lr_foid,
+ 			    lr->lrz_type, lr->lrz_bonustype,
+ 			    lr->lrz_bonuslen, tx);
+ 		}
+ 	} else {
+ 		if (lr->lr_foid == 0) {
+ 			lr->lr_foid = dmu_object_alloc(os,
+ 			    lr->lrz_type, 0, lr->lrz_bonustype,
+ 			    lr->lrz_bonuslen, tx);
+ 		} else {
+ 			error = dmu_object_claim(os, lr->lr_foid,
+ 			    lr->lrz_type, 0, lr->lrz_bonustype,
+ 			    lr->lrz_bonuslen, tx);
+ 		}
+ 	}
+ 
+ 	if (error) {
+ 		ASSERT3U(error, ==, EEXIST);
+ 		ASSERT(zd->zd_zilog->zl_replay);
+ 		dmu_tx_commit(tx);
+ 		return (error);
+ 	}
+ 
+ 	ASSERT(lr->lr_foid != 0);
+ 
+ 	if (lr->lrz_type != DMU_OT_ZAP_OTHER)
+ 		VERIFY3U(0, ==, dmu_object_set_blocksize(os, lr->lr_foid,
+ 		    lr->lrz_blocksize, lr->lrz_ibshift, tx));
+ 
+ 	VERIFY3U(0, ==, dmu_bonus_hold(os, lr->lr_foid, FTAG, &db));
+ 	bbt = ztest_bt_bonus(db);
+ 	dmu_buf_will_dirty(db, tx);
+ 	ztest_bt_generate(bbt, os, lr->lr_foid, -1ULL, lr->lr_gen, txg, txg);
+ 	dmu_buf_rele(db, FTAG);
+ 
+ 	VERIFY3U(0, ==, zap_add(os, lr->lr_doid, name, sizeof (uint64_t), 1,
+ 	    &lr->lr_foid, tx));
+ 
+ 	(void) ztest_log_create(zd, tx, lr);
+ 
+ 	dmu_tx_commit(tx);
+ 
+ 	return (0);
+ }
+ 
+ static int
+ ztest_replay_remove(ztest_ds_t *zd, lr_remove_t *lr, boolean_t byteswap)
+ {
+ 	char *name = (void *)(lr + 1);		/* name follows lr */
+ 	objset_t *os = zd->zd_os;
+ 	dmu_object_info_t doi;
+ 	dmu_tx_t *tx;
+ 	uint64_t object, txg;
+ 
+ 	if (byteswap)
+ 		byteswap_uint64_array(lr, sizeof (*lr));
+ 
+ 	ASSERT(lr->lr_doid == ZTEST_DIROBJ);
+ 	ASSERT(name[0] != '\0');
+ 
+ 	VERIFY3U(0, ==,
+ 	    zap_lookup(os, lr->lr_doid, name, sizeof (object), 1, &object));
+ 	ASSERT(object != 0);
+ 
+ 	ztest_object_lock(zd, object, RL_WRITER);
+ 
+ 	VERIFY3U(0, ==, dmu_object_info(os, object, &doi));
+ 
+ 	tx = dmu_tx_create(os);
+ 
+ 	dmu_tx_hold_zap(tx, lr->lr_doid, B_FALSE, name);
+ 	dmu_tx_hold_free(tx, object, 0, DMU_OBJECT_END);
+ 
+ 	txg = ztest_tx_assign(tx, TXG_WAIT, FTAG);
+ 	if (txg == 0) {
+ 		ztest_object_unlock(zd, object);
+ 		return (ENOSPC);
+ 	}
+ 
+ 	if (doi.doi_type == DMU_OT_ZAP_OTHER) {
+ 		VERIFY3U(0, ==, zap_destroy(os, object, tx));
+ 	} else {
+ 		VERIFY3U(0, ==, dmu_object_free(os, object, tx));
+ 	}
+ 
+ 	VERIFY3U(0, ==, zap_remove(os, lr->lr_doid, name, tx));
+ 
+ 	(void) ztest_log_remove(zd, tx, lr);
+ 
+ 	dmu_tx_commit(tx);
+ 
+ 	ztest_object_unlock(zd, object);
+ 
+ 	return (0);
+ }
+ 
+ /*
+  * Replay (or, when called from ztest_write(), directly perform) a
+  * TX_WRITE record.  The data to write follows 'lr' in memory.  If the
+  * data begins with a valid block tag, the tag is checked and then
+  * regenerated from the object's bonus-buffer tag before being written.
+  *
+  * Returns 0 on success, or ENOSPC if ztest_tx_assign() returned txg 0.
+  */
+ static int
+ ztest_replay_write(ztest_ds_t *zd, lr_write_t *lr, boolean_t byteswap)
+ {
+ 	objset_t *os = zd->zd_os;
+ 	void *data = lr + 1;			/* data follows lr */
+ 	uint64_t offset, length;
+ 	ztest_block_tag_t *bt = data;
+ 	ztest_block_tag_t *bbt;
+ 	uint64_t gen, txg, lrtxg, crtxg;
+ 	dmu_object_info_t doi;
+ 	dmu_tx_t *tx;
+ 	dmu_buf_t *db;
+ 	arc_buf_t *abuf = NULL;
+ 	rl_t *rl;
+ 
+ 	if (byteswap)
+ 		byteswap_uint64_array(lr, sizeof (*lr));
+ 
+ 	offset = lr->lr_offset;
+ 	length = lr->lr_length;
+ 
+ 	/* If it's a dmu_sync() block, write the whole block */
+ 	if (lr->lr_common.lrc_reclen == sizeof (lr_write_t)) {
+ 		uint64_t blocksize = BP_GET_LSIZE(&lr->lr_blkptr);
+ 		if (length < blocksize) {
+ 			offset -= offset % blocksize;
+ 			length = blocksize;
+ 		}
+ 	}
+ 
+ 	/* The tag in the payload may be in the opposite byte order. */
+ 	if (bt->bt_magic == BSWAP_64(BT_MAGIC))
+ 		byteswap_uint64_array(bt, sizeof (*bt));
+ 
+ 	/* Payload does not start with a block tag: skip the tag checks. */
+ 	if (bt->bt_magic != BT_MAGIC)
+ 		bt = NULL;
+ 
+ 	ztest_object_lock(zd, lr->lr_foid, RL_READER);
+ 	rl = ztest_range_lock(zd, lr->lr_foid, offset, length, RL_WRITER);
+ 
+ 	VERIFY3U(0, ==, dmu_bonus_hold(os, lr->lr_foid, FTAG, &db));
+ 
+ 	dmu_object_info_from_db(db, &doi);
+ 
+ 	/* The bonus-buffer tag supplies the object's gen and creation txg. */
+ 	bbt = ztest_bt_bonus(db);
+ 	ASSERT3U(bbt->bt_magic, ==, BT_MAGIC);
+ 	gen = bbt->bt_gen;
+ 	crtxg = bbt->bt_crtxg;
+ 	lrtxg = lr->lr_common.lrc_txg;
+ 
+ 	tx = dmu_tx_create(os);
+ 
+ 	dmu_tx_hold_write(tx, lr->lr_foid, offset, length);
+ 
+ 	/*
+ 	 * When ztest_random(8) returns 0 and the write covers exactly one
+ 	 * aligned data block, request an ARC buffer so that the
+ 	 * dmu_assign_arcbuf() path below is used instead of dmu_write().
+ 	 */
+ 	if (ztest_random(8) == 0 && length == doi.doi_data_block_size &&
+ 	    P2PHASE(offset, length) == 0)
+ 		abuf = dmu_request_arcbuf(db, length);
+ 
+ 	txg = ztest_tx_assign(tx, TXG_WAIT, FTAG);
+ 	if (txg == 0) {
+ 		/* No txg assigned: give back everything acquired above. */
+ 		if (abuf != NULL)
+ 			dmu_return_arcbuf(abuf);
+ 		dmu_buf_rele(db, FTAG);
+ 		ztest_range_unlock(rl);
+ 		ztest_object_unlock(zd, lr->lr_foid);
+ 		return (ENOSPC);
+ 	}
+ 
+ 	if (bt != NULL) {
+ 		/*
+ 		 * Usually, verify the old data before writing new data --
+ 		 * but not always, because we also want to verify correct
+ 		 * behavior when the data was not recently read into cache.
+ 		 */
+ 		ASSERT(offset % doi.doi_data_block_size == 0);
+ 		if (ztest_random(4) != 0) {
+ 			int prefetch = ztest_random(2) ?
+ 			    DMU_READ_PREFETCH : DMU_READ_NO_PREFETCH;
+ 			ztest_block_tag_t rbt;
+ 
+ 			VERIFY(dmu_read(os, lr->lr_foid, offset,
+ 			    sizeof (rbt), &rbt, prefetch) == 0);
+ 			if (rbt.bt_magic == BT_MAGIC) {
+ 				ztest_bt_verify(&rbt, os, lr->lr_foid,
+ 				    offset, gen, txg, crtxg);
+ 			}
+ 		}
+ 
+ 		/*
+ 		 * Writes can appear to be newer than the bonus buffer because
+ 		 * the ztest_get_data() callback does a dmu_read() of the
+ 		 * open-context data, which may be different than the data
+ 		 * as it was when the write was generated.
+ 		 */
+ 		if (zd->zd_zilog->zl_replay) {
+ 			ztest_bt_verify(bt, os, lr->lr_foid, offset,
+ 			    MAX(gen, bt->bt_gen), MAX(txg, lrtxg),
+ 			    bt->bt_crtxg);
+ 		}
+ 
+ 		/*
+ 		 * Set the bt's gen/txg to the bonus buffer's gen/txg
+ 		 * so that all of the usual ASSERTs will work.
+ 		 */
+ 		ztest_bt_generate(bt, os, lr->lr_foid, offset, gen, txg, crtxg);
+ 	}
+ 
+ 	if (abuf == NULL) {
+ 		dmu_write(os, lr->lr_foid, offset, length, data, tx);
+ 	} else {
+ 		/* Fill the requested ARC buffer and hand it to the DMU. */
+ 		bcopy(data, abuf->b_data, length);
+ 		dmu_assign_arcbuf(db, offset, abuf, tx);
+ 	}
+ 
+ 	(void) ztest_log_write(zd, tx, lr);
+ 
+ 	dmu_buf_rele(db, FTAG);
+ 
+ 	dmu_tx_commit(tx);
+ 
+ 	ztest_range_unlock(rl);
+ 	ztest_object_unlock(zd, lr->lr_foid);
+ 
+ 	return (0);
+ }
+ 
+ /*
+  * Replay (or, when called from ztest_truncate(), directly perform) a
+  * TX_TRUNCATE record: free the range described by lr_offset/lr_length
+  * in object lr_foid.  The object lock is held as reader and the range
+  * lock as writer for the duration.
+  *
+  * Returns 0 on success, or ENOSPC if ztest_tx_assign() returned txg 0.
+  */
+ static int
+ ztest_replay_truncate(ztest_ds_t *zd, lr_truncate_t *lr, boolean_t byteswap)
+ {
+ 	objset_t *os = zd->zd_os;
+ 	dmu_tx_t *tx;
+ 	uint64_t txg;
+ 	rl_t *rl;
+ 
+ 	if (byteswap)
+ 		byteswap_uint64_array(lr, sizeof (*lr));
+ 
+ 	ztest_object_lock(zd, lr->lr_foid, RL_READER);
+ 	rl = ztest_range_lock(zd, lr->lr_foid, lr->lr_offset, lr->lr_length,
+ 	    RL_WRITER);
+ 
+ 	tx = dmu_tx_create(os);
+ 
+ 	dmu_tx_hold_free(tx, lr->lr_foid, lr->lr_offset, lr->lr_length);
+ 
+ 	txg = ztest_tx_assign(tx, TXG_WAIT, FTAG);
+ 	if (txg == 0) {
+ 		/* No txg assigned: drop both locks and report ENOSPC. */
+ 		ztest_range_unlock(rl);
+ 		ztest_object_unlock(zd, lr->lr_foid);
+ 		return (ENOSPC);
+ 	}
+ 
+ 	VERIFY(dmu_free_range(os, lr->lr_foid, lr->lr_offset,
+ 	    lr->lr_length, tx) == 0);
+ 
+ 	(void) ztest_log_truncate(zd, tx, lr);
+ 
+ 	dmu_tx_commit(tx);
+ 
+ 	ztest_range_unlock(rl);
+ 	ztest_object_unlock(zd, lr->lr_foid);
+ 
+ 	return (0);
+ }
+ 
+ /*
+  * Replay (or, when called from ztest_setattr(), directly perform) a
+  * TX_SETATTR record.  As used here, lr_size is the new bonus buffer
+  * length passed to dmu_set_bonus() and lr_mode is the generation number
+  * written into the bonus-buffer block tag.
+  *
+  * Returns 0 on success, or ENOSPC if ztest_tx_assign() returned txg 0.
+  */
+ static int
+ ztest_replay_setattr(ztest_ds_t *zd, lr_setattr_t *lr, boolean_t byteswap)
+ {
+ 	objset_t *os = zd->zd_os;
+ 	dmu_tx_t *tx;
+ 	dmu_buf_t *db;
+ 	ztest_block_tag_t *bbt;
+ 	uint64_t txg, lrtxg, crtxg;
+ 
+ 	if (byteswap)
+ 		byteswap_uint64_array(lr, sizeof (*lr));
+ 
+ 	ztest_object_lock(zd, lr->lr_foid, RL_WRITER);
+ 
+ 	VERIFY3U(0, ==, dmu_bonus_hold(os, lr->lr_foid, FTAG, &db));
+ 
+ 	tx = dmu_tx_create(os);
+ 	dmu_tx_hold_bonus(tx, lr->lr_foid);
+ 
+ 	txg = ztest_tx_assign(tx, TXG_WAIT, FTAG);
+ 	if (txg == 0) {
+ 		/* No txg assigned: release the bonus hold and the lock. */
+ 		dmu_buf_rele(db, FTAG);
+ 		ztest_object_unlock(zd, lr->lr_foid);
+ 		return (ENOSPC);
+ 	}
+ 
+ 	bbt = ztest_bt_bonus(db);
+ 	ASSERT3U(bbt->bt_magic, ==, BT_MAGIC);
+ 	crtxg = bbt->bt_crtxg;
+ 	lrtxg = lr->lr_common.lrc_txg;
+ 
+ 	if (zd->zd_zilog->zl_replay) {
+ 		/* Replayed records must already carry size, gen and txg. */
+ 		ASSERT(lr->lr_size != 0);
+ 		ASSERT(lr->lr_mode != 0);
+ 		ASSERT(lrtxg != 0);
+ 	} else {
+ 		/*
+ 		 * Randomly change the size and increment the generation.
+ 		 */
+ 		lr->lr_size = (ztest_random(db->db_size / sizeof (*bbt)) + 1) *
+ 		    sizeof (*bbt);
+ 		lr->lr_mode = bbt->bt_gen + 1;
+ 		ASSERT(lrtxg == 0);
+ 	}
+ 
+ 	/*
+ 	 * Verify that the current bonus buffer is not newer than our txg.
+ 	 */
+ 	ztest_bt_verify(bbt, os, lr->lr_foid, -1ULL, lr->lr_mode,
+ 	    MAX(txg, lrtxg), crtxg);
+ 
+ 	dmu_buf_will_dirty(db, tx);
+ 
+ 	ASSERT3U(lr->lr_size, >=, sizeof (*bbt));
+ 	ASSERT3U(lr->lr_size, <=, db->db_size);
+ 	VERIFY3U(dmu_set_bonus(db, lr->lr_size, tx), ==, 0);
+ 	/* Re-fetch the tag pointer after the bonus length has changed. */
+ 	bbt = ztest_bt_bonus(db);
+ 
+ 	ztest_bt_generate(bbt, os, lr->lr_foid, -1ULL, lr->lr_mode, txg, crtxg);
+ 
+ 	dmu_buf_rele(db, FTAG);
+ 
+ 	(void) ztest_log_setattr(zd, tx, lr);
+ 
+ 	dmu_tx_commit(tx);
+ 
+ 	ztest_object_unlock(zd, lr->lr_foid);
+ 
+ 	return (0);
+ }
+ 
+ /*
+  * Table of replay handlers, one slot per ZIL transaction type in the
+  * order given by the per-entry comments.  Transaction types that have
+  * no ztest handler are NULL.
+  */
+ zil_replay_func_t *ztest_replay_vector[TX_MAX_TYPE] = {
+ 	NULL,			/* 0 no such transaction type */
+ 	ztest_replay_create,	/* TX_CREATE */
+ 	NULL,			/* TX_MKDIR */
+ 	NULL,			/* TX_MKXATTR */
+ 	NULL,			/* TX_SYMLINK */
+ 	ztest_replay_remove,	/* TX_REMOVE */
+ 	NULL,			/* TX_RMDIR */
+ 	NULL,			/* TX_LINK */
+ 	NULL,			/* TX_RENAME */
+ 	ztest_replay_write,	/* TX_WRITE */
+ 	ztest_replay_truncate,	/* TX_TRUNCATE */
+ 	ztest_replay_setattr,	/* TX_SETATTR */
+ 	NULL,			/* TX_ACL */
+ 	NULL,			/* TX_CREATE_ACL */
+ 	NULL,			/* TX_CREATE_ATTR */
+ 	NULL,			/* TX_CREATE_ACL_ATTR */
+ 	NULL,			/* TX_MKDIR_ACL */
+ 	NULL,			/* TX_MKDIR_ATTR */
+ 	NULL,			/* TX_MKDIR_ACL_ATTR */
+ 	NULL,			/* TX_WRITE2 */
+ };
+ 
+ /*
+  * ZIL get_data callbacks
+  */
+ 
+ /*
+  * Completion routine for ztest_get_data(): release the data buffer hold
+  * (if one was taken), drop the range and object locks, hand the block
+  * pointer to zil_add_block() when the operation succeeded, and free zgd.
+  */
+ static void
+ ztest_get_done(zgd_t *zgd, int error)
+ {
+ 	ztest_ds_t *zd = zgd->zgd_private;
+ 	/* Read the object number before the range lock is released below. */
+ 	uint64_t object = zgd->zgd_rl->rl_object;
+ 
+ 	if (zgd->zgd_db)
+ 		dmu_buf_rele(zgd->zgd_db, zgd);
+ 
+ 	ztest_range_unlock(zgd->zgd_rl);
+ 	ztest_object_unlock(zd, object);
+ 
+ 	if (error == 0 && zgd->zgd_bp)
+ 		zil_add_block(zgd->zgd_zilog, zgd->zgd_bp);
+ 
+ 	umem_free(zgd, sizeof (*zgd));
+ }
+ 
+ /*
+  * ZIL get_data callback (passed to zil_open() elsewhere in this file).
+  * 'arg' is the ztest_ds_t for the dataset and 'lr' the write record
+  * whose data is wanted.
+  *
+  * If 'buf' is non-NULL the data is copied into it with dmu_read();
+  * otherwise the data block containing lr_offset is held and passed to
+  * dmu_sync(), with ztest_get_done() as the completion callback.
+  *
+  * Returns 0 on success, ENOENT if the object's bonus tag has a creation
+  * txg of 0 or one newer than the record's txg, or the error from
+  * dmu_bonus_hold(), dmu_read(), dmu_buf_hold() or dmu_sync().
+  */
+ static int
+ ztest_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio)
+ {
+ 	ztest_ds_t *zd = arg;
+ 	objset_t *os = zd->zd_os;
+ 	uint64_t object = lr->lr_foid;
+ 	uint64_t offset = lr->lr_offset;
+ 	uint64_t size = lr->lr_length;
+ 	blkptr_t *bp = &lr->lr_blkptr;
+ 	uint64_t txg = lr->lr_common.lrc_txg;
+ 	uint64_t crtxg;
+ 	dmu_object_info_t doi;
+ 	dmu_buf_t *db;
+ 	zgd_t *zgd;
+ 	int error;
+ 
+ 	ztest_object_lock(zd, object, RL_READER);
+ 	error = dmu_bonus_hold(os, object, FTAG, &db);
+ 	if (error) {
+ 		ztest_object_unlock(zd, object);
+ 		return (error);
+ 	}
+ 
+ 	crtxg = ztest_bt_bonus(db)->bt_crtxg;
+ 
+ 	/*
+ 	 * NOTE(review): presumably this rejects an object that was freed or
+ 	 * re-created after the record was logged -- confirm.
+ 	 */
+ 	if (crtxg == 0 || crtxg > txg) {
+ 		dmu_buf_rele(db, FTAG);
+ 		ztest_object_unlock(zd, object);
+ 		return (ENOENT);
+ 	}
+ 
+ 	dmu_object_info_from_db(db, &doi);
+ 	dmu_buf_rele(db, FTAG);
+ 	db = NULL;
+ 
+ 	/* From here on, cleanup is done by ztest_get_done(zgd, ...). */
+ 	zgd = umem_zalloc(sizeof (*zgd), UMEM_NOFAIL);
+ 	zgd->zgd_zilog = zd->zd_zilog;
+ 	zgd->zgd_private = zd;
+ 
+ 	if (buf != NULL) {	/* immediate write */
+ 		zgd->zgd_rl = ztest_range_lock(zd, object, offset, size,
+ 		    RL_READER);
+ 
+ 		error = dmu_read(os, object, offset, size, buf,
+ 		    DMU_READ_NO_PREFETCH);
+ 		ASSERT(error == 0);
+ 	} else {
+ 		/*
+ 		 * No buffer supplied: operate on the whole data block that
+ 		 * contains the offset.  A non-power-of-2 block size is only
+ 		 * expected when the offset lies within the first block.
+ 		 */
+ 		size = doi.doi_data_block_size;
+ 		if (ISP2(size)) {
+ 			offset = P2ALIGN(offset, size);
+ 		} else {
+ 			ASSERT(offset < size);
+ 			offset = 0;
+ 		}
+ 
+ 		zgd->zgd_rl = ztest_range_lock(zd, object, offset, size,
+ 		    RL_READER);
+ 
+ 		error = dmu_buf_hold(os, object, offset, zgd, &db,
+ 		    DMU_READ_NO_PREFETCH);
+ 
+ 		if (error == 0) {
+ 			zgd->zgd_db = db;
+ 			zgd->zgd_bp = bp;
+ 
+ 			ASSERT(db->db_offset == offset);
+ 			ASSERT(db->db_size == size);
+ 
+ 			error = dmu_sync(zio, lr->lr_common.lrc_txg,
+ 			    ztest_get_done, zgd);
+ 
+ 			/*
+ 			 * On success zgd is left alone here; presumably
+ 			 * dmu_sync() invokes ztest_get_done() itself later
+ 			 * -- confirm against dmu_sync().
+ 			 */
+ 			if (error == 0)
+ 				return (0);
+ 		}
+ 	}
+ 
+ 	ztest_get_done(zgd, error);
+ 
+ 	return (error);
+ }
+ 
+ /*
+  * Allocate a zero-filled log record of 'lrsize' bytes.  If 'name' is
+  * non-NULL, the buffer is extended by strlen(name) + 1 bytes and the
+  * string (including its terminating NUL) is copied in at offset lrsize.
+  *
+  * Release with ztest_lr_free(), passing the same lrsize and name.
+  */
+ static void *
+ ztest_lr_alloc(size_t lrsize, char *name)
+ {
+ 	char *lr;
+ 	size_t namesize = name ? strlen(name) + 1 : 0;
+ 
+ 	lr = umem_zalloc(lrsize + namesize, UMEM_NOFAIL);
+ 
+ 	if (name)
+ 		bcopy(name, lr + lrsize, namesize);
+ 
+ 	return (lr);
+ }
+ 
+ /*
+  * Free a log record obtained from ztest_lr_alloc().  The size handed to
+  * umem_free() is recomputed from 'lrsize' and 'name', so both must be
+  * the values that were used for the allocation.
+  */
+ void
+ ztest_lr_free(void *lr, size_t lrsize, char *name)
+ {
+ 	size_t namesize = name ? strlen(name) + 1 : 0;
+ 
+ 	umem_free(lr, lrsize + namesize);
+ }
+ 
+ /*
+  * Lookup a bunch of objects.  Returns the number of objects not found.
+  *
+  * For each of the 'count' descriptors in 'od', the name od_name is looked
+  * up in directory od_dir.  On a hit, od_object, od_type, od_blocksize and
+  * od_gen are filled in from the object's info and bonus-buffer block tag;
+  * on a miss, od_object is left at 0.  The caller must hold
+  * zd->zd_dirobj_lock.
+  */
+ static int
+ ztest_lookup(ztest_ds_t *zd, ztest_od_t *od, int count)
+ {
+ 	int missing = 0;
+ 	int error;
++	int i;
+ 
+ 	ASSERT(_mutex_held(&zd->zd_dirobj_lock));
+ 
 -	for (int i = 0; i < count; i++, od++) {
++	for (i = 0; i < count; i++, od++) {
+ 		od->od_object = 0;
+ 		error = zap_lookup(zd->zd_os, od->od_dir, od->od_name,
+ 		    sizeof (uint64_t), 1, &od->od_object);
+ 		if (error) {
+ 			/* The only lookup failure expected is ENOENT. */
+ 			ASSERT(error == ENOENT);
+ 			ASSERT(od->od_object == 0);
+ 			missing++;
+ 		} else {
+ 			dmu_buf_t *db;
+ 			ztest_block_tag_t *bbt;
+ 			dmu_object_info_t doi;
+ 
+ 			ASSERT(od->od_object != 0);
+ 			ASSERT(missing == 0);	/* there should be no gaps */
+ 
+ 			/* Record the object's current type, blocksize, gen. */
+ 			ztest_object_lock(zd, od->od_object, RL_READER);
+ 			VERIFY3U(0, ==, dmu_bonus_hold(zd->zd_os,
+ 			    od->od_object, FTAG, &db));
+ 			dmu_object_info_from_db(db, &doi);
+ 			bbt = ztest_bt_bonus(db);
+ 			ASSERT3U(bbt->bt_magic, ==, BT_MAGIC);
+ 			od->od_type = doi.doi_type;
+ 			od->od_blocksize = doi.doi_data_block_size;
+ 			od->od_gen = bbt->bt_gen;
+ 			dmu_buf_rele(db, FTAG);
+ 			ztest_object_unlock(zd, od->od_object);
+ 		}
+ 	}
+ 
+ 	return (missing);
+ }
+ 
+ /*
+  * Create the objects described by od[0 .. count-1].  A create record is
+  * built for each descriptor and applied with ztest_replay_create().
+  * Once one creation fails, no further creations are attempted: every
+  * remaining descriptor gets od_object = 0 and is counted as missing.
+  *
+  * Returns the number of objects that were not created.  The caller must
+  * hold zd->zd_dirobj_lock.
+  */
+ static int
+ ztest_create(ztest_ds_t *zd, ztest_od_t *od, int count)
+ {
+ 	int missing = 0;
++	int i;
+ 
+ 	ASSERT(_mutex_held(&zd->zd_dirobj_lock));
+ 
 -	for (int i = 0; i < count; i++, od++) {
++	for (i = 0; i < count; i++, od++) {
+ 		/* Declared at the top of the block to stay C90-clean. */
+ 		lr_create_t *lr;
+ 
+ 		if (missing) {
+ 			od->od_object = 0;
+ 			missing++;
+ 			continue;
+ 		}
+ 
+ 		lr = ztest_lr_alloc(sizeof (*lr), od->od_name);
+ 
+ 		lr->lr_doid = od->od_dir;
+ 		lr->lr_foid = 0;	/* 0 to allocate, > 0 to claim */
+ 		lr->lrz_type = od->od_crtype;
+ 		lr->lrz_blocksize = od->od_crblocksize;
+ 		lr->lrz_ibshift = ztest_random_ibshift();
+ 		lr->lrz_bonustype = DMU_OT_UINT64_OTHER;
+ 		lr->lrz_bonuslen = dmu_bonus_max();
+ 		lr->lr_gen = od->od_crgen;
+ 		lr->lr_crtime[0] = time(NULL);
+ 
+ 		if (ztest_replay_create(zd, lr, B_FALSE) != 0) {
+ 			ASSERT(missing == 0);
+ 			od->od_object = 0;
+ 			missing++;
+ 		} else {
+ 			/* The replay routine stored the new object number. */
+ 			od->od_object = lr->lr_foid;
+ 			od->od_type = od->od_crtype;
+ 			od->od_blocksize = od->od_crblocksize;
+ 			od->od_gen = od->od_crgen;
+ 			ASSERT(od->od_object != 0);
+ 		}
+ 
+ 		ztest_lr_free(lr, sizeof (*lr), od->od_name);
+ 	}
+ 
+ 	return (missing);
+ }
+ 
+ /*
+  * Remove the objects described by od[0 .. count-1], walking the array
+  * from the last descriptor to the first.  Descriptors whose od_object
+  * is 0 are skipped.  A remove record is built for each remaining one
+  * and applied with ztest_replay_remove(); on success od_object is reset
+  * to 0.  Once one removal fails, no further removals are attempted and
+  * each descriptor still to be visited is counted as missing.
+  *
+  * Returns the number of descriptors counted as missing.  The caller
+  * must hold zd->zd_dirobj_lock.
+  */
+ static int
+ ztest_remove(ztest_ds_t *zd, ztest_od_t *od, int count)
+ {
+ 	int missing = 0;
+ 	int error;
++	int i;
+ 
+ 	ASSERT(_mutex_held(&zd->zd_dirobj_lock));
+ 
+ 	od += count - 1;
+ 
 -	for (int i = count - 1; i >= 0; i--, od--) {
++	for (i = count - 1; i >= 0; i--, od--) {
+ 		/* Declared at the top of the block to stay C90-clean. */
+ 		lr_remove_t *lr;
+ 
+ 		if (missing) {
+ 			missing++;
+ 			continue;
+ 		}
+ 
+ 		if (od->od_object == 0)
+ 			continue;
+ 
+ 		lr = ztest_lr_alloc(sizeof (*lr), od->od_name);
+ 
+ 		lr->lr_doid = od->od_dir;
+ 
+ 		if ((error = ztest_replay_remove(zd, lr, B_FALSE)) != 0) {
+ 			/* ENOSPC is the only failure expected here. */
+ 			ASSERT3U(error, ==, ENOSPC);
+ 			missing++;
+ 		} else {
+ 			od->od_object = 0;
+ 		}
+ 		ztest_lr_free(lr, sizeof (*lr), od->od_name);
+ 	}
+ 
+ 	return (missing);
+ }
+ 
+ /*
+  * Write 'size' bytes of 'data' to 'object' at 'offset' by building an
+  * in-memory lr_write_t (with the data appended after the record) and
+  * applying it with ztest_replay_write().
+  *
+  * Returns the result of ztest_replay_write().
+  */
+ static int
+ ztest_write(ztest_ds_t *zd, uint64_t object, uint64_t offset, uint64_t size,
+     void *data)
+ {
+ 	lr_write_t *lr;
+ 	int error;
+ 
+ 	lr = ztest_lr_alloc(sizeof (*lr) + size, NULL);
+ 
+ 	lr->lr_foid = object;
+ 	lr->lr_offset = offset;
+ 	lr->lr_length = size;
+ 	lr->lr_blkoff = 0;
+ 	BP_ZERO(&lr->lr_blkptr);
+ 
+ 	/* The payload lives immediately after the record header. */
+ 	bcopy(data, lr + 1, size);
+ 
+ 	error = ztest_replay_write(zd, lr, B_FALSE);
+ 
+ 	ztest_lr_free(lr, sizeof (*lr) + size, NULL);
+ 
+ 	return (error);
+ }
+ 
+ /*
+  * Free the range described by 'offset' and 'size' in 'object' by
+  * building an in-memory lr_truncate_t and applying it with
+  * ztest_replay_truncate().
+  *
+  * Returns the result of ztest_replay_truncate().
+  */
+ static int
+ ztest_truncate(ztest_ds_t *zd, uint64_t object, uint64_t offset, uint64_t size)
+ {
+ 	lr_truncate_t *lr;
+ 	int error;
+ 
+ 	lr = ztest_lr_alloc(sizeof (*lr), NULL);
+ 
+ 	lr->lr_foid = object;
+ 	lr->lr_offset = offset;
+ 	lr->lr_length = size;
+ 
+ 	error = ztest_replay_truncate(zd, lr, B_FALSE);
+ 
+ 	ztest_lr_free(lr, sizeof (*lr), NULL);
+ 
+ 	return (error);
+ }
+ 
+ /*
+  * Apply a setattr to 'object' by building an in-memory lr_setattr_t and
+  * passing it to ztest_replay_setattr().  lr_size and lr_mode are left
+  * at 0 here; when not replaying, ztest_replay_setattr() chooses the
+  * actual values itself.
+  *
+  * Returns the result of ztest_replay_setattr().
+  */
+ static int
+ ztest_setattr(ztest_ds_t *zd, uint64_t object)
+ {
+ 	lr_setattr_t *lr;
+ 	int error;
+ 
+ 	lr = ztest_lr_alloc(sizeof (*lr), NULL);
+ 
+ 	lr->lr_foid = object;
+ 	lr->lr_size = 0;
+ 	lr->lr_mode = 0;
+ 
+ 	error = ztest_replay_setattr(zd, lr, B_FALSE);
+ 
+ 	ztest_lr_free(lr, sizeof (*lr), NULL);
+ 
+ 	return (error);
+ }
+ 
+ /*
+  * Preallocate the range described by 'offset' and 'size' in 'object'
+  * with dmu_prealloc() in its own transaction, then wait for that txg to
+  * sync.  If ztest_tx_assign() returns txg 0, the range is freed with
+  * dmu_free_long_range() instead (its result is ignored).
+  */
+ static void
+ ztest_prealloc(ztest_ds_t *zd, uint64_t object, uint64_t offset, uint64_t size)
+ {
+ 	objset_t *os = zd->zd_os;
+ 	dmu_tx_t *tx;
+ 	uint64_t txg;
+ 	rl_t *rl;
+ 
+ 	/*
+ 	 * NOTE(review): a txg argument of 0 presumably means "wait for
+ 	 * everything currently dirty" -- confirm against txg_wait_synced().
+ 	 */
+ 	txg_wait_synced(dmu_objset_pool(os), 0);
+ 
+ 	ztest_object_lock(zd, object, RL_READER);
+ 	rl = ztest_range_lock(zd, object, offset, size, RL_WRITER);
+ 
+ 	tx = dmu_tx_create(os);
+ 
+ 	dmu_tx_hold_write(tx, object, offset, size);
+ 
+ 	txg = ztest_tx_assign(tx, TXG_WAIT, FTAG);
+ 
+ 	if (txg != 0) {
+ 		dmu_prealloc(os, object, offset, size, tx);
+ 		dmu_tx_commit(tx);
+ 		txg_wait_synced(dmu_objset_pool(os), txg);
+ 	} else {
+ 		(void) dmu_free_long_range(os, object, offset, size);
+ 	}
+ 
+ 	ztest_range_unlock(rl);
+ 	ztest_object_unlock(zd, object);
+ }
+ 
+ /*
+  * Perform one randomly chosen I/O operation on 'object' at 'offset':
+  * a block-tag write, a pattern write, a write of zeroes, a truncate or
+  * a setattr.  The block-sized operations use the object's current data
+  * block size.  Errors from the individual operations are ignored.
+  */
+ static void
+ ztest_io(ztest_ds_t *zd, uint64_t object, uint64_t offset)
+ {
+ 	ztest_block_tag_t wbt;
+ 	dmu_object_info_t doi;
+ 	enum ztest_io_type io_type;
+ 	uint64_t blocksize;
+ 	void *data;
+ 
+ 	VERIFY(dmu_object_info(zd->zd_os, object, &doi) == 0);
+ 	blocksize = doi.doi_data_block_size;
+ 	/* Scratch buffer of one data block, freed at the end. */
+ 	data = umem_alloc(blocksize, UMEM_NOFAIL);
+ 
+ 	/*
+ 	 * Pick an i/o type at random, biased toward writing block tags.
+ 	 */
+ 	io_type = ztest_random(ZTEST_IO_TYPES);
+ 	if (ztest_random(2) == 0)
+ 		io_type = ZTEST_IO_WRITE_TAG;
+ 
+ 	switch (io_type) {
+ 
+ 	case ZTEST_IO_WRITE_TAG:
+ 		/*
+ 		 * gen/txg/crtxg are passed as 0 here; ztest_replay_write()
+ 		 * regenerates the tag before it is written.
+ 		 */
+ 		ztest_bt_generate(&wbt, zd->zd_os, object, offset, 0, 0, 0);
+ 		(void) ztest_write(zd, object, offset, sizeof (wbt), &wbt);
+ 		break;
+ 
+ 	case ZTEST_IO_WRITE_PATTERN:
+ 		/* Fill byte is one of 'a'..'e', chosen by object + offset. */
+ 		(void) memset(data, 'a' + (object + offset) % 5, blocksize);
+ 		if (ztest_random(2) == 0) {
+ 			/*
+ 			 * Induce fletcher2 collisions to ensure that
+ 			 * zio_ddt_collision() detects and resolves them
+ 			 * when using fletcher2-verify for deduplication.
+ 			 */
+ 			((uint64_t *)data)[0] ^= 1ULL << 63;
+ 			((uint64_t *)data)[4] ^= 1ULL << 63;
+ 		}
+ 		(void) ztest_write(zd, object, offset, blocksize, data);
+ 		break;
+ 
+ 	case ZTEST_IO_WRITE_ZEROES:
+ 		bzero(data, blocksize);
+ 		(void) ztest_write(zd, object, offset, blocksize, data);
+ 		break;
+ 
+ 	case ZTEST_IO_TRUNCATE:
+ 		(void) ztest_truncate(zd, object, offset, blocksize);
+ 		break;
+ 
+ 	case ZTEST_IO_SETATTR:
+ 		(void) ztest_setattr(zd, object);
+ 		break;
+ 	}
+ 
+ 	umem_free(data, blocksize);
+ }
+ 
+ /*
+  * Initialize an object description template.
+  *
+  * The template names directory ZTEST_DIROBJ, records the creation
+  * parameters (type, blocksize, gen) and clears the fields describing the
+  * current object.  A 'blocksize' of 0 selects ztest_random_blocksize().
+  * The object name is built as "<tag>(<id>)[<index>]", with 'id' printed
+  * as a signed value.
+  */
+ static void
+ ztest_od_init(ztest_od_t *od, uint64_t id, char *tag, uint64_t index,
+     dmu_object_type_t type, uint64_t blocksize, uint64_t gen)
+ {
+ 	od->od_dir = ZTEST_DIROBJ;
+ 	od->od_object = 0;
+ 
+ 	od->od_crtype = type;
+ 	od->od_crblocksize = blocksize ? blocksize : ztest_random_blocksize();
+ 	od->od_crgen = gen;
+ 
+ 	od->od_type = DMU_OT_NONE;
+ 	od->od_blocksize = 0;
+ 	od->od_gen = 0;
+ 
+ 	/*
+ 	 * %lld and %llu take long long arguments.  int64_t and uint64_t
+ 	 * may be plain long, so cast explicitly to the types the format
+ 	 * string expects.
+ 	 */
+ 	(void) snprintf(od->od_name, sizeof (od->od_name), "%s(%lld)[%llu]",
+ 	    tag, (long long)id, (u_longlong_t)index);
+ }
+ 
+ /*
+  * Lookup or create the objects for a test using the od template.
+  * If the objects do not all exist, or if 'remove' is specified,
+  * remove any existing objects and create new ones.  Otherwise,
+  * use the existing objects.
+  *
+  * 'size' is the size of the od array in bytes.  Returns 0 on success,
+  * or -1 if removing or creating the objects failed.  In either case
+  * 'od' is recorded in zd->zd_od.
+  */
+ static int
+ ztest_object_init(ztest_ds_t *zd, ztest_od_t *od, size_t size, boolean_t remove)
+ {
+ 	int count = size / sizeof (*od);
+ 	int rv = 0;
+ 
+ 	VERIFY(mutex_lock(&zd->zd_dirobj_lock) == 0);
+ 	/* ztest_create() runs only if ztest_remove() reported no failures. */
+ 	if ((ztest_lookup(zd, od, count) != 0 || remove) &&
+ 	    (ztest_remove(zd, od, count) != 0 ||
+ 	    ztest_create(zd, od, count) != 0))
+ 		rv = -1;
+ 	zd->zd_od = od;
+ 	VERIFY(mutex_unlock(&zd->zd_dirobj_lock) == 0);
+ 
+ 	return (rv);
+ }
+ 
+ /*
+  * Commit the dataset's intent log for a randomly chosen object number
+  * and record the log's committed record sequence number in zd->zd_seq.
+  * 'id' is unused.
+  */
+ /* ARGSUSED */
+ void
+ ztest_zil_commit(ztest_ds_t *zd, uint64_t id)
+ {
+ 	zilog_t *zilog = zd->zd_zilog;
+ 
+ 	zil_commit(zilog, UINT64_MAX, ztest_random(ZTEST_OBJECTS));
+ 
+ 	/*
+ 	 * Remember the committed values in zd, which is in parent/child
+ 	 * shared memory.  If we die, the next iteration of ztest_run()
+ 	 * will verify that the log really does contain this record.
+ 	 */
+ 	mutex_enter(&zilog->zl_lock);
+ 	ASSERT(zd->zd_seq <= zilog->zl_commit_lr_seq);
+ 	zd->zd_seq = zilog->zl_commit_lr_seq;
+ 	mutex_exit(&zilog->zl_lock);
+ }
+ 
+ /*
+  * Verify that we can't destroy an active pool, create an existing pool,
+  * or create a pool with a bad vdev spec.
+  *
+  * Each expected errno is enforced with VERIFY3U.  Both arguments are
+  * unused.
+  */
+ /* ARGSUSED */
+ void
+ ztest_spa_create_destroy(ztest_ds_t *zd, uint64_t id)
+ {
+ 	ztest_shared_t *zs = ztest_shared;
+ 	spa_t *spa;
+ 	nvlist_t *nvroot;
+ 
+ 	/*
+ 	 * Attempt to create using a bad file.
+ 	 */
+ 	nvroot = make_vdev_root("/dev/bogus", NULL, 0, 0, 0, 0, 0, 1);
+ 	VERIFY3U(ENOENT, ==,
+ 	    spa_create("ztest_bad_file", nvroot, NULL, NULL, NULL));
+ 	nvlist_free(nvroot);
+ 
+ 	/*
+ 	 * Attempt to create using a bad mirror.
+ 	 */
+ 	nvroot = make_vdev_root("/dev/bogus", NULL, 0, 0, 0, 0, 2, 1);
+ 	VERIFY3U(ENOENT, ==,
+ 	    spa_create("ztest_bad_mirror", nvroot, NULL, NULL, NULL));
+ 	nvlist_free(nvroot);
+ 
+ 	/*
+ 	 * Attempt to create an existing pool.  It shouldn't matter
+ 	 * what's in the nvroot; we should fail with EEXIST.
+ 	 */
+ 	(void) rw_rdlock(&zs->zs_name_lock);
+ 	nvroot = make_vdev_root("/dev/bogus", NULL, 0, 0, 0, 0, 0, 1);
+ 	VERIFY3U(EEXIST, ==, spa_create(zs->zs_pool, nvroot, NULL, NULL, NULL));
+ 	nvlist_free(nvroot);
+ 	/* With the pool held open, destroying it must fail with EBUSY. */
+ 	VERIFY3U(0, ==, spa_open(zs->zs_pool, &spa, FTAG));
+ 	VERIFY3U(EBUSY, ==, spa_destroy(zs->zs_pool));
+ 	spa_close(spa, FTAG);
+ 
+ 	(void) rw_unlock(&zs->zs_name_lock);
+ }
+ 
+ /*
+  * Search the vdev tree rooted at 'vd' (the vdev itself first, then each
+  * child subtree in order) for a vdev whose vdev_path equals 'path'.
+  *
+  * Returns the first matching vdev, or NULL if there is none.
+  */
+ static vdev_t *
+ vdev_lookup_by_path(vdev_t *vd, const char *path)
+ {
+ 	vdev_t *mvd;
++	int c;
+ 
+ 	if (vd->vdev_path != NULL && strcmp(path, vd->vdev_path) == 0)
+ 		return (vd);
+ 
 -	for (int c = 0; c < vd->vdev_children; c++)
++	for (c = 0; c < vd->vdev_children; c++)
+ 		if ((mvd = vdev_lookup_by_path(vd->vdev_child[c], path)) !=
+ 		    NULL)
+ 			return (mvd);
+ 
+ 	return (NULL);
+ }
+ 
+ /*
+  * Find the first available hole which can be used as a top-level.
+  *
+  * Returns the index of the first child of the root vdev that has
+  * vdev_ishole set, or the root's vdev_children count if no child is a
+  * hole.  The caller must hold SCL_VDEV on the spa config lock as reader.
+  */
+ int
+ find_vdev_hole(spa_t *spa)
+ {
+ 	vdev_t *rvd = spa->spa_root_vdev;
+ 	int c;
+ 
+ 	ASSERT(spa_config_held(spa, SCL_VDEV, RW_READER) == SCL_VDEV);
+ 
+ 	for (c = 0; c < rvd->vdev_children; c++) {
+ 		vdev_t *cvd = rvd->vdev_child[c];
+ 
+ 		if (cvd->vdev_ishole)
+ 			break;
+ 	}
+ 	return (c);
+ }
+ 
+ /*
+  * Verify that vdev_add() works as expected.
+  *
+  * Under zs_vdev_lock, either removes the slog at the head of the log
+  * class rotor (only when the pool has slogs and ztest_random(4) returns
+  * 0) or adds a new top-level vdev built by make_vdev_root().  ENOSPC
+  * from spa_vdev_add() is recorded; EEXIST from spa_vdev_remove() is
+  * tolerated; any other error is fatal.  Both arguments are unused.
+  */
+ /* ARGSUSED */
+ void
+ ztest_vdev_add_remove(ztest_ds_t *zd, uint64_t id)
+ {
+ 	ztest_shared_t *zs = ztest_shared;
+ 	spa_t *spa = zs->zs_spa;
+ 	uint64_t leaves;
+ 	uint64_t guid;
+ 	nvlist_t *nvroot;
+ 	int error;
+ 
+ 	VERIFY(mutex_lock(&zs->zs_vdev_lock) == 0);
+ 	/* NOTE(review): presumably leaf vdevs per top-level vdev -- confirm. */
+ 	leaves = MAX(zs->zs_mirrors + zs->zs_splits, 1) * zopt_raidz;
+ 
+ 	spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
+ 
+ 	/* The next leaf number follows from the first free top-level slot. */
+ 	ztest_shared->zs_vdev_next_leaf = find_vdev_hole(spa) * leaves;
+ 
+ 	/*
+ 	 * If we have slogs then remove them 1/4 of the time.
+ 	 */
+ 	if (spa_has_slogs(spa) && ztest_random(4) == 0) {
+ 		/*
+ 		 * Grab the guid from the head of the log class rotor.
+ 		 */
+ 		guid = spa_log_class(spa)->mc_rotor->mg_vd->vdev_guid;
+ 
+ 		spa_config_exit(spa, SCL_VDEV, FTAG);
+ 
+ 		/*
+ 		 * We have to grab the zs_name_lock as writer to
+ 		 * prevent a race between removing a slog (dmu_objset_find)
+ 		 * and destroying a dataset. Removing the slog will
+ 		 * grab a reference on the dataset which may cause
+ 		 * dmu_objset_destroy() to fail with EBUSY thus
+ 		 * leaving the dataset in an inconsistent state.
+ 		 */
+ 		VERIFY(rw_wrlock(&ztest_shared->zs_name_lock) == 0);
+ 		error = spa_vdev_remove(spa, guid, B_FALSE);
+ 		VERIFY(rw_unlock(&ztest_shared->zs_name_lock) == 0);
+ 
+ 		if (error && error != EEXIST)
+ 			fatal(0, "spa_vdev_remove() = %d", error);
+ 	} else {
+ 		spa_config_exit(spa, SCL_VDEV, FTAG);
+ 
+ 		/*
+ 		 * Make 1/4 of the devices be log devices.
+ 		 */
+ 		nvroot = make_vdev_root(NULL, NULL, zopt_vdev_size, 0,
+ 		    ztest_random(4) == 0, zopt_raidz, zs->zs_mirrors, 1);
+ 
+ 		error = spa_vdev_add(spa, nvroot);
+ 		nvlist_free(nvroot);
+ 
+ 		if (error == ENOSPC)
+ 			ztest_record_enospc("spa_vdev_add");
+ 		else if (error != 0)
+ 			fatal(0, "spa_vdev_add() = %d", error);
+ 	}
+ 
+ 	VERIFY(mutex_unlock(&ztest_shared->zs_vdev_lock) == 0);
+ }
+ 
+ /*
+  * Verify that adding/removing aux devices (l2arc, hot spare) works as expected.
+  */
+ /* ARGSUSED */
+ void
+ ztest_vdev_aux_add_remove(ztest_ds_t *zd, uint64_t id)
+ {
+ 	ztest_shared_t *zs = ztest_shared;
+ 	spa_t *spa = zs->zs_spa;
+ 	vdev_t *rvd = spa->spa_root_vdev;
+ 	spa_aux_vdev_t *sav;
+ 	char *aux;
+ 	uint64_t guid = 0;
+ 	int error;
+ 
+ 	if (ztest_random(2) == 0) {
+ 		sav = &spa->spa_spares;
+ 		aux = ZPOOL_CONFIG_SPARES;
+ 	} else {
+ 		sav = &spa->spa_l2cache;
+ 		aux = ZPOOL_CONFIG_L2CACHE;
+ 	}
+ 
+ 	VERIFY(mutex_lock(&zs->zs_vdev_lock) == 0);
+ 
+ 	spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
  
  	if (sav->sav_count != 0 && ztest_random(4) == 0) {
  		/*
@@@ -1399,56 -2865,57 +2874,58 @@@ ztest_objset_destroy_cb(const char *nam
  	return (0);
  }
  
- /*
-  * Verify that dmu_objset_{create,destroy,open,close} work as expected.
-  */
- static uint64_t
- ztest_log_create(zilog_t *zilog, dmu_tx_t *tx, uint64_t object, int mode)
+ /*
+  * Create the snapshot "<osname>@<id>".
+  *
+  * Returns B_FALSE if dmu_objset_snapshot() failed with ENOSPC (which is
+  * recorded via ztest_record_enospc()), and B_TRUE if the snapshot was
+  * created or already existed (EEXIST).  Any other error is fatal.
+  */
+ static boolean_t
+ ztest_snapshot_create(char *osname, uint64_t id)
+ {
+ 	char snapname[MAXNAMELEN];
+ 	int error;
+ 
+ 	(void) snprintf(snapname, MAXNAMELEN, "%s@%llu", osname,
+ 	    (u_longlong_t)id);
+ 
+ 	/* Only the part after the '@' is passed as the snapshot name. */
+ 	error = dmu_objset_snapshot(osname, strchr(snapname, '@') + 1,
+ 	    NULL, B_FALSE);
+ 	if (error == ENOSPC) {
+ 		ztest_record_enospc(FTAG);
+ 		return (B_FALSE);
+ 	}
+ 	if (error != 0 && error != EEXIST)
+ 		fatal(0, "ztest_snapshot_create(%s) = %d", snapname, error);
+ 	return (B_TRUE);
+ }
+ 
+ /*
+  * Destroy the snapshot "<osname>@<id>".  A snapshot that does not exist
+  * (ENOENT) is tolerated; any other error is fatal.  Always returns
+  * B_TRUE.
+  */
+ static boolean_t
+ ztest_snapshot_destroy(char *osname, uint64_t id)
  {
- 	itx_t *itx;
- 	lr_create_t *lr;
- 	size_t namesize;
- 	char name[24];
- 
- 	(void) sprintf(name, "ZOBJ_%llu", (u_longlong_t)object);
- 	namesize = strlen(name) + 1;
- 
- 	itx = zil_itx_create(TX_CREATE, sizeof (*lr) + namesize +
- 	    ztest_random(ZIL_MAX_BLKSZ));
- 	lr = (lr_create_t *)&itx->itx_lr;
- 	bzero(lr + 1, lr->lr_common.lrc_reclen - sizeof (*lr));
- 	lr->lr_doid = object;
- 	lr->lr_foid = 0;
- 	lr->lr_mode = mode;
- 	lr->lr_uid = 0;
- 	lr->lr_gid = 0;
- 	lr->lr_gen = dmu_tx_get_txg(tx);
- 	lr->lr_crtime[0] = time(NULL);
- 	lr->lr_crtime[1] = 0;
- 	lr->lr_rdev = 0;
- 	bcopy(name, (char *)(lr + 1), namesize);
- 
- 	return (zil_itx_assign(zilog, itx, tx));
+ 	char snapname[MAXNAMELEN];
+ 	int error;
+ 
+ 	(void) snprintf(snapname, MAXNAMELEN, "%s@%llu", osname,
+ 	    (u_longlong_t)id);
+ 
+ 	error = dmu_objset_destroy(snapname, B_FALSE);
+ 	if (error != 0 && error != ENOENT)
+ 		fatal(0, "ztest_snapshot_destroy(%s) = %d", snapname, error);
+ 	return (B_TRUE);
  }
  
+ /* ARGSUSED */
  void
- ztest_dmu_objset_create_destroy(ztest_args_t *za)
+ ztest_dmu_objset_create_destroy(ztest_ds_t *zd, uint64_t id)
  {
+ 	ztest_shared_t *zs = ztest_shared;
+ 	ztest_ds_t zdtmp;
+ 	int iters;
  	int error;
  	objset_t *os, *os2;
- 	char name[100];
- 	int basemode, expected_error;
+ 	char name[MAXNAMELEN];
  	zilog_t *zilog;
- 	uint64_t seq;
- 	uint64_t objects;
++	int i;
  
- 	(void) rw_rdlock(&ztest_shared->zs_name_lock);
- 	(void) snprintf(name, 100, "%s/%s_temp_%llu", za->za_pool, za->za_pool,
- 	    (u_longlong_t)za->za_instance);
+ 	(void) rw_rdlock(&zs->zs_name_lock);
  
- 	basemode = DS_MODE_TYPE(za->za_instance);
- 	if (basemode != DS_MODE_USER && basemode != DS_MODE_OWNER)
- 		basemode = DS_MODE_USER;
+ 	(void) snprintf(name, MAXNAMELEN, "%s/temp_%llu",
+ 	    zs->zs_pool, (u_longlong_t)id);
  
  	/*
  	 * If this dataset exists from a previous run, process its replay log
@@@ -1499,38 -2964,17 +2974,17 @@@
  	/*
  	 * Open the intent log for it.
  	 */
- 	zilog = zil_open(os, NULL);
+ 	zilog = zil_open(os, ztest_get_data);
  
  	/*
- 	 * Put a random number of objects in there.
+ 	 * Put some objects in there, do a little I/O to them,
+ 	 * and randomly take a couple of snapshots along the way.
  	 */
- 	objects = ztest_random(20);
- 	seq = 0;
- 	while (objects-- != 0) {
- 		uint64_t object;
- 		dmu_tx_t *tx = dmu_tx_create(os);
- 		dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, sizeof (name));
- 		error = dmu_tx_assign(tx, TXG_WAIT);
- 		if (error) {
- 			dmu_tx_abort(tx);
- 		} else {
- 			object = dmu_object_alloc(os, DMU_OT_UINT64_OTHER, 0,
- 			    DMU_OT_NONE, 0, tx);
- 			ztest_set_random_blocksize(os, object, tx);
- 			seq = ztest_log_create(zilog, tx, object,
- 			    DMU_OT_UINT64_OTHER);
- 			dmu_write(os, object, 0, sizeof (name), name, tx);
- 			dmu_tx_commit(tx);
- 		}
- 		if (ztest_random(5) == 0) {
- 			zil_commit(zilog, seq, object);
- 		}
- 		if (ztest_random(100) == 0) {
- 			error = zil_suspend(zilog);
- 			if (error == 0) {
- 				zil_resume(zilog);
- 			}
- 		}
+ 	iters = ztest_random(5);
 -	for (int i = 0; i < iters; i++) {
++	for (i = 0; i < iters; i++) {
+ 		ztest_dmu_object_alloc_free(&zdtmp, id);
+ 		if (ztest_random(iters) == 0)
+ 			(void) ztest_snapshot_create(name, i);
  	}
  
  	/*
@@@ -1744,210 -3156,24 +3166,25 @@@ out
   * Verify that dmu_object_{alloc,free} work as expected.
   */
+ /*
+  * Initializes a batch of four object descriptors, has
+  * ztest_object_init() remove and re-create the objects, then issues
+  * ztest_io() calls against randomly chosen objects of the batch until
+  * ztest_random(4 * batchsize) returns 0.
+  */
  void
- ztest_dmu_object_alloc_free(ztest_args_t *za)
+ ztest_dmu_object_alloc_free(ztest_ds_t *zd, uint64_t id)
  {
- 	objset_t *os = za->za_os;
- 	dmu_buf_t *db;
- 	dmu_tx_t *tx;
- 	uint64_t batchobj, object, batchsize, endoff, temp;
- 	int b, c, error, bonuslen;
- 	dmu_object_info_t *doi = &za->za_doi;
- 	char osname[MAXNAMELEN];
- 
- 	dmu_objset_name(os, osname);
- 
- 	endoff = -8ULL;
- 	batchsize = 2;
- 
- 	/*
- 	 * Create a batch object if necessary, and record it in the directory.
- 	 */
- 	VERIFY3U(0, ==, dmu_read(os, ZTEST_DIROBJ, za->za_diroff,
- 	    sizeof (uint64_t), &batchobj, DMU_READ_PREFETCH));
- 	if (batchobj == 0) {
- 		tx = dmu_tx_create(os);
- 		dmu_tx_hold_write(tx, ZTEST_DIROBJ, za->za_diroff,
- 		    sizeof (uint64_t));
- 		dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
- 		error = dmu_tx_assign(tx, TXG_WAIT);
- 		if (error) {
- 			ztest_record_enospc("create a batch object");
- 			dmu_tx_abort(tx);
- 			return;
- 		}
- 		batchobj = dmu_object_alloc(os, DMU_OT_UINT64_OTHER, 0,
- 		    DMU_OT_NONE, 0, tx);
- 		ztest_set_random_blocksize(os, batchobj, tx);
- 		dmu_write(os, ZTEST_DIROBJ, za->za_diroff,
- 		    sizeof (uint64_t), &batchobj, tx);
- 		dmu_tx_commit(tx);
- 	}
- 
- 	/*
- 	 * Destroy the previous batch of objects.
- 	 */
- 	for (b = 0; b < batchsize; b++) {
- 		VERIFY3U(0, ==, dmu_read(os, batchobj, b * sizeof (uint64_t),
- 		    sizeof (uint64_t), &object, DMU_READ_PREFETCH));
- 		if (object == 0)
- 			continue;
- 		/*
- 		 * Read and validate contents.
- 		 * We expect the nth byte of the bonus buffer to be n.
- 		 */
- 		VERIFY(0 == dmu_bonus_hold(os, object, FTAG, &db));
- 		za->za_dbuf = db;
- 
- 		dmu_object_info_from_db(db, doi);
- 		ASSERT(doi->doi_type == DMU_OT_UINT64_OTHER);
- 		ASSERT(doi->doi_bonus_type == DMU_OT_PLAIN_OTHER);
- 		ASSERT3S(doi->doi_physical_blks, >=, 0);
- 
- 		bonuslen = doi->doi_bonus_size;
- 
- 		for (c = 0; c < bonuslen; c++) {
- 			if (((uint8_t *)db->db_data)[c] !=
- 			    (uint8_t)(c + bonuslen)) {
- 				fatal(0,
- 				    "bad bonus: %s, obj %llu, off %d: %u != %u",
- 				    osname, object, c,
- 				    ((uint8_t *)db->db_data)[c],
- 				    (uint8_t)(c + bonuslen));
- 			}
- 		}
- 
- 		dmu_buf_rele(db, FTAG);
- 		za->za_dbuf = NULL;
- 
- 		/*
- 		 * We expect the word at endoff to be our object number.
- 		 */
- 		VERIFY(0 == dmu_read(os, object, endoff,
- 		    sizeof (uint64_t), &temp, DMU_READ_PREFETCH));
- 
- 		if (temp != object) {
- 			fatal(0, "bad data in %s, got %llu, expected %llu",
- 			    osname, temp, object);
- 		}
- 
- 		/*
- 		 * Destroy old object and clear batch entry.
- 		 */
- 		tx = dmu_tx_create(os);
- 		dmu_tx_hold_write(tx, batchobj,
- 		    b * sizeof (uint64_t), sizeof (uint64_t));
- 		dmu_tx_hold_free(tx, object, 0, DMU_OBJECT_END);
- 		error = dmu_tx_assign(tx, TXG_WAIT);
- 		if (error) {
- 			ztest_record_enospc("free object");
- 			dmu_tx_abort(tx);
- 			return;
- 		}
- 		error = dmu_object_free(os, object, tx);
- 		if (error) {
- 			fatal(0, "dmu_object_free('%s', %llu) = %d",
- 			    osname, object, error);
- 		}
- 		object = 0;
- 
- 		dmu_object_set_checksum(os, batchobj,
- 		    ztest_random_checksum(), tx);
- 		dmu_object_set_compress(os, batchobj,
- 		    ztest_random_compress(), tx);
- 
- 		dmu_write(os, batchobj, b * sizeof (uint64_t),
- 		    sizeof (uint64_t), &object, tx);
- 
- 		dmu_tx_commit(tx);
- 	}
+ 	ztest_od_t od[4];
+ 	int batchsize = sizeof (od) / sizeof (od[0]);
++	int b;
  
- 	/*
- 	 * Before creating the new batch of objects, generate a bunch of churn.
- 	 */
- 	for (b = ztest_random(100); b > 0; b--) {
- 		tx = dmu_tx_create(os);
- 		dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
- 		error = dmu_tx_assign(tx, TXG_WAIT);
- 		if (error) {
- 			ztest_record_enospc("churn objects");
- 			dmu_tx_abort(tx);
- 			return;
- 		}
- 		object = dmu_object_alloc(os, DMU_OT_UINT64_OTHER, 0,
- 		    DMU_OT_NONE, 0, tx);
- 		ztest_set_random_blocksize(os, object, tx);
- 		error = dmu_object_free(os, object, tx);
- 		if (error) {
- 			fatal(0, "dmu_object_free('%s', %llu) = %d",
- 			    osname, object, error);
- 		}
- 		dmu_tx_commit(tx);
- 	}
 -	for (int b = 0; b < batchsize; b++)
++	for (b = 0; b < batchsize; b++)
+ 		ztest_od_init(&od[b], id, FTAG, b, DMU_OT_UINT64_OTHER, 0, 0);
  
  	/*
- 	 * Create a new batch of objects with randomly chosen
- 	 * blocksizes and record them in the batch directory.
+ 	 * Destroy the previous batch of objects, create a new batch,
+ 	 * and do some I/O on the new objects.
  	 */
- 	for (b = 0; b < batchsize; b++) {
- 		uint32_t va_blksize;
- 		u_longlong_t va_nblocks;
- 
- 		tx = dmu_tx_create(os);
- 		dmu_tx_hold_write(tx, batchobj, b * sizeof (uint64_t),
- 		    sizeof (uint64_t));
- 		dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
- 		dmu_tx_hold_write(tx, DMU_NEW_OBJECT, endoff,
- 		    sizeof (uint64_t));
- 		error = dmu_tx_assign(tx, TXG_WAIT);
- 		if (error) {
- 			ztest_record_enospc("create batchobj");
- 			dmu_tx_abort(tx);
- 			return;
- 		}
- 		bonuslen = (int)ztest_random(dmu_bonus_max()) + 1;
- 
- 		object = dmu_object_alloc(os, DMU_OT_UINT64_OTHER, 0,
- 		    DMU_OT_PLAIN_OTHER, bonuslen, tx);
- 
- 		ztest_set_random_blocksize(os, object, tx);
- 
- 		dmu_object_set_checksum(os, object,
- 		    ztest_random_checksum(), tx);
- 		dmu_object_set_compress(os, object,
- 		    ztest_random_compress(), tx);
- 
- 		dmu_write(os, batchobj, b * sizeof (uint64_t),
- 		    sizeof (uint64_t), &object, tx);
- 
- 		/*
- 		 * Write to both the bonus buffer and the regular data.
- 		 */
- 		VERIFY(dmu_bonus_hold(os, object, FTAG, &db) == 0);
- 		za->za_dbuf = db;
- 		ASSERT3U(bonuslen, <=, db->db_size);
- 
- 		dmu_object_size_from_db(db, &va_blksize, &va_nblocks);
- 		ASSERT3S(va_nblocks, >=, 0);
- 
- 		dmu_buf_will_dirty(db, tx);
- 
- 		/*
- 		 * See comments above regarding the contents of
- 		 * the bonus buffer and the word at endoff.
- 		 */
- 		for (c = 0; c < bonuslen; c++)
- 			((uint8_t *)db->db_data)[c] = (uint8_t)(c + bonuslen);
- 
- 		dmu_buf_rele(db, FTAG);
- 		za->za_dbuf = NULL;
- 
- 		/*
- 		 * Write to a large offset to increase indirection.
- 		 */
- 		dmu_write(os, object, endoff, sizeof (uint64_t), &object, tx);
+ 	if (ztest_object_init(zd, od, sizeof (od), B_TRUE) != 0)
+ 		return;
  
- 		dmu_tx_commit(tx);
- 	}
+ 	/* Each I/O offset is a random multiple of 1 << SPA_MAXBLOCKSHIFT. */
+ 	while (ztest_random(4 * batchsize) != 0)
+ 		ztest_io(zd, od[ztest_random(batchsize)].od_object,
+ 		    ztest_random(ZTEST_RANGE_LOCKS) << SPA_MAXBLOCKSHIFT);
  }
  
  /*
@@@ -2918,168 -3859,430 +3870,432 @@@ ztest_zap(ztest_ds_t *zd, uint64_t id
  	ASSERT3U(error, ==, 0);
  
  	tx = dmu_tx_create(os);
- 	dmu_tx_hold_zap(tx, object, TRUE, NULL);
- 	error = dmu_tx_assign(tx, TXG_WAIT);
- 	if (error) {
- 		ztest_record_enospc("remove zap entry");
- 		dmu_tx_abort(tx);
+ 	dmu_tx_hold_zap(tx, object, B_TRUE, NULL);
+ 	txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG);
+ 	if (txg == 0)
+ 		return;
+ 	VERIFY3U(0, ==, zap_remove(os, object, txgname, tx));
+ 	VERIFY3U(0, ==, zap_remove(os, object, propname, tx));
+ 	dmu_tx_commit(tx);
+ }
+ 
+ /*
+  * Testcase to test the upgrading of a microzap to fatzap.
+  */
+ void
+ ztest_fzap(ztest_ds_t *zd, uint64_t id)
+ {
+ 	objset_t *os = zd->zd_os;
+ 	ztest_od_t od[1];
+ 	uint64_t object, txg;
++	int i;
+ 
+ 	ztest_od_init(&od[0], id, FTAG, 0, DMU_OT_ZAP_OTHER, 0, 0);
+ 
+ 	if (ztest_object_init(zd, od, sizeof (od), !ztest_random(2)) != 0)
  		return;
+ 
+ 	object = od[0].od_object;
+ 
+ 	/*
+ 	 * Add entries to this ZAP and make sure it spills over
+ 	 * and gets upgraded to a fatzap. Also, since we are adding
+ 	 * 2050 entries we should see ptrtbl growth and leaf-block split.
+ 	 */
 -	for (int i = 0; i < 2050; i++) {
++	for (i = 0; i < 2050; i++) {
+ 		char name[MAXNAMELEN];
+ 		uint64_t value = i;
+ 		dmu_tx_t *tx;
+ 		int error;
+ 
+ 		(void) snprintf(name, sizeof (name), "fzap-%llu-%llu",
+ 		    id, value);
+ 
+ 		tx = dmu_tx_create(os);
+ 		dmu_tx_hold_zap(tx, object, B_TRUE, name);
+ 		txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG);
+ 		if (txg == 0)
+ 			return;
+ 		error = zap_add(os, object, name, sizeof (uint64_t), 1,
+ 		    &value, tx);
+ 		ASSERT(error == 0 || error == EEXIST);
+ 		dmu_tx_commit(tx);
  	}
- 	error = zap_remove(os, object, txgname, tx);
- 	if (error)
- 		fatal(0, "zap_remove('%s', %llu, '%s') = %d",
- 		    osname, object, txgname, error);
+ }
  
- 	error = zap_remove(os, object, propname, tx);
- 	if (error)
- 		fatal(0, "zap_remove('%s', %llu, '%s') = %d",
- 		    osname, object, propname, error);
+ /* ARGSUSED */
+ void
+ ztest_zap_parallel(ztest_ds_t *zd, uint64_t id)
+ {
+ 	objset_t *os = zd->zd_os;
+ 	ztest_od_t od[1];
+ 	uint64_t txg, object, count, wsize, wc, zl_wsize, zl_wc;
+ 	dmu_tx_t *tx;
+ 	int i, namelen, error;
+ 	int micro = ztest_random(2);
+ 	char name[20], string_value[20];
+ 	void *data;
  
- 	dmu_tx_commit(tx);
+ 	ztest_od_init(&od[0], ID_PARALLEL, FTAG, micro, DMU_OT_ZAP_OTHER, 0, 0);
+ 
+ 	if (ztest_object_init(zd, od, sizeof (od), B_FALSE) != 0)
+ 		return;
+ 
+ 	object = od[0].od_object;
+ 
+ 	/*
+ 	 * Generate a random name of the form 'xxx.....' where each
+ 	 * x is a random printable character and the dots are dots.
+ 	 * There are 94 such characters, and the name length goes from
+ 	 * 6 to 20, so there are 94^3 * 15 = 12,458,760 possible names.
+ 	 */
+ 	namelen = ztest_random(sizeof (name) - 5) + 5 + 1;
+ 
+ 	for (i = 0; i < 3; i++)
+ 		name[i] = '!' + ztest_random('~' - '!' + 1);
+ 	for (; i < namelen - 1; i++)
+ 		name[i] = '.';
+ 	name[i] = '\0';
+ 
+ 	if ((namelen & 1) || micro) {
+ 		wsize = sizeof (txg);
+ 		wc = 1;
+ 		data = &txg;
+ 	} else {
+ 		wsize = 1;
+ 		wc = namelen;
+ 		data = string_value;
+ 	}
+ 
+ 	count = -1ULL;
+ 	VERIFY(zap_count(os, object, &count) == 0);
+ 	ASSERT(count != -1ULL);
  
  	/*
- 	 * Once in a while, destroy the object.
+ 	 * Select an operation: length, lookup, add, update, remove.
  	 */
- 	if (ztest_random(1000) != 0)
+ 	i = ztest_random(5);
+ 
+ 	if (i >= 2) {
+ 		tx = dmu_tx_create(os);
+ 		dmu_tx_hold_zap(tx, object, B_TRUE, NULL);
+ 		txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG);
+ 		if (txg == 0)
+ 			return;
+ 		bcopy(name, string_value, namelen);
+ 	} else {
+ 		tx = NULL;
+ 		txg = 0;
+ 		bzero(string_value, namelen);
+ 	}
+ 
+ 	switch (i) {
+ 
+ 	case 0:
+ 		error = zap_length(os, object, name, &zl_wsize, &zl_wc);
+ 		if (error == 0) {
+ 			ASSERT3U(wsize, ==, zl_wsize);
+ 			ASSERT3U(wc, ==, zl_wc);
+ 		} else {
+ 			ASSERT3U(error, ==, ENOENT);
+ 		}
+ 		break;
+ 
+ 	case 1:
+ 		error = zap_lookup(os, object, name, wsize, wc, data);
+ 		if (error == 0) {
+ 			if (data == string_value &&
+ 			    bcmp(name, data, namelen) != 0)
+ 				fatal(0, "name '%s' != val '%s' len %d",
+ 				    name, data, namelen);
+ 		} else {
+ 			ASSERT3U(error, ==, ENOENT);
+ 		}
+ 		break;
+ 
+ 	case 2:
+ 		error = zap_add(os, object, name, wsize, wc, data, tx);
+ 		ASSERT(error == 0 || error == EEXIST);
+ 		break;
+ 
+ 	case 3:
+ 		VERIFY(zap_update(os, object, name, wsize, wc, data, tx) == 0);
+ 		break;
+ 
+ 	case 4:
+ 		error = zap_remove(os, object, name, tx);
+ 		ASSERT(error == 0 || error == ENOENT);
+ 		break;
+ 	}
+ 
+ 	if (tx != NULL)
+ 		dmu_tx_commit(tx);
+ }
+ 
+ /*
+  * Commit callback data.
+  */
+ typedef struct ztest_cb_data {
+ 	list_node_t		zcd_node;
+ 	uint64_t		zcd_txg;
+ 	int			zcd_expected_err;
+ 	boolean_t		zcd_added;
+ 	boolean_t		zcd_called;
+ 	spa_t			*zcd_spa;
+ } ztest_cb_data_t;
+ 
+ /* This is the actual commit callback function */
+ static void
+ ztest_commit_callback(void *arg, int error)
+ {
+ 	ztest_cb_data_t *data = arg;
+ 	uint64_t synced_txg;
+ 
+ 	VERIFY(data != NULL);
+ 	VERIFY3S(data->zcd_expected_err, ==, error);
+ 	VERIFY(!data->zcd_called);
+ 
+ 	synced_txg = spa_last_synced_txg(data->zcd_spa);
+ 	if (data->zcd_txg > synced_txg)
+ 		fatal(0, "commit callback of txg %" PRIu64 " called prematurely"
+ 		    ", last synced txg = %" PRIu64 "\n", data->zcd_txg,
+ 		    synced_txg);
+ 
+ 	data->zcd_called = B_TRUE;
+ 
+ 	if (error == ECANCELED) {
+ 		ASSERT3U(data->zcd_txg, ==, 0);
+ 		ASSERT(!data->zcd_added);
+ 
+ 		/*
+ 		 * The private callback data should be destroyed here, but
+ 		 * since we are going to check the zcd_called field after
+ 		 * dmu_tx_abort(), we will destroy it there.
+ 		 */
+ 		return;
+ 	}
+ 
+ 	/* Was this callback added to the global callback list? */
+ 	if (!data->zcd_added)
+ 		goto out;
+ 
+ 	ASSERT3U(data->zcd_txg, !=, 0);
+ 
+ 	/* Remove our callback from the list */
+ 	(void) mutex_lock(&zcl.zcl_callbacks_lock);
+ 	list_remove(&zcl.zcl_callbacks, data);
+ 	(void) mutex_unlock(&zcl.zcl_callbacks_lock);
+ 
+ out:
+ 	umem_free(data, sizeof (ztest_cb_data_t));
+ }
+ 
+ /* Allocate and initialize callback data structure */
+ static ztest_cb_data_t *
+ ztest_create_cb_data(objset_t *os, uint64_t txg)
+ {
+ 	ztest_cb_data_t *cb_data;
+ 
+ 	cb_data = umem_zalloc(sizeof (ztest_cb_data_t), UMEM_NOFAIL);
+ 
+ 	cb_data->zcd_txg = txg;
+ 	cb_data->zcd_spa = dmu_objset_spa(os);
+ 
+ 	return (cb_data);
+ }
+ 
+ /*
+  * If a number of txgs equal to this threshold have been created after a commit
+  * callback has been registered but not called, then we assume there is an
+  * implementation bug.
+  */
+ #define	ZTEST_COMMIT_CALLBACK_THRESH	(TXG_CONCURRENT_STATES + 2)
+ 
+ /*
+  * Commit callback test.
+  */
+ void
+ ztest_dmu_commit_callbacks(ztest_ds_t *zd, uint64_t id)
+ {
+ 	objset_t *os = zd->zd_os;
+ 	ztest_od_t od[1];
+ 	dmu_tx_t *tx;
+ 	ztest_cb_data_t *cb_data[3], *tmp_cb;
+ 	uint64_t old_txg, txg;
+ 	int i, error;
+ 
+ 	ztest_od_init(&od[0], id, FTAG, 0, DMU_OT_UINT64_OTHER, 0, 0);
+ 
+ 	if (ztest_object_init(zd, od, sizeof (od), B_FALSE) != 0)
  		return;
  
  	tx = dmu_tx_create(os);
- 	dmu_tx_hold_write(tx, ZTEST_DIROBJ, za->za_diroff, sizeof (uint64_t));
- 	dmu_tx_hold_free(tx, object, 0, DMU_OBJECT_END);
- 	error = dmu_tx_assign(tx, TXG_WAIT);
+ 
+ 	cb_data[0] = ztest_create_cb_data(os, 0);
+ 	dmu_tx_callback_register(tx, ztest_commit_callback, cb_data[0]);
+ 
+ 	dmu_tx_hold_write(tx, od[0].od_object, 0, sizeof (uint64_t));
+ 
+ 	/* Every once in a while, abort the transaction on purpose */
+ 	if (ztest_random(100) == 0)
+ 		error = -1;
+ 
+ 	if (!error)
+ 		error = dmu_tx_assign(tx, TXG_NOWAIT);
+ 
+ 	txg = error ? 0 : dmu_tx_get_txg(tx);
+ 
+ 	cb_data[0]->zcd_txg = txg;
+ 	cb_data[1] = ztest_create_cb_data(os, txg);
+ 	dmu_tx_callback_register(tx, ztest_commit_callback, cb_data[1]);
+ 
  	if (error) {
- 		ztest_record_enospc("destroy zap object");
+ 		/*
+ 		 * It's not a strict requirement to call the registered
+ 		 * callbacks from inside dmu_tx_abort(), but that's what
+ 		 * is supposed to happen in the current implementation
+ 		 * so we will check for that.
+ 		 */
+ 		for (i = 0; i < 2; i++) {
+ 			cb_data[i]->zcd_expected_err = ECANCELED;
+ 			VERIFY(!cb_data[i]->zcd_called);
+ 		}
+ 
  		dmu_tx_abort(tx);
+ 
+ 		for (i = 0; i < 2; i++) {
+ 			VERIFY(cb_data[i]->zcd_called);
+ 			umem_free(cb_data[i], sizeof (ztest_cb_data_t));
+ 		}
+ 
  		return;
  	}
- 	error = zap_destroy(os, object, tx);
- 	if (error)
- 		fatal(0, "zap_destroy('%s', %llu) = %d",
- 		    osname, object, error);
- 	object = 0;
- 	dmu_write(os, ZTEST_DIROBJ, za->za_diroff, sizeof (uint64_t),
- 	    &object, tx);
- 	dmu_tx_commit(tx);
- }
  
- void
- ztest_zap_parallel(ztest_args_t *za)
- {
- 	objset_t *os = za->za_os;
- 	uint64_t txg, object, count, wsize, wc, zl_wsize, zl_wc;
- 	dmu_tx_t *tx;
- 	int i, namelen, error;
- 	char name[20], string_value[20];
- 	void *data;
+ 	cb_data[2] = ztest_create_cb_data(os, txg);
+ 	dmu_tx_callback_register(tx, ztest_commit_callback, cb_data[2]);
  
  	/*
- 	 * Generate a random name of the form 'xxx.....' where each
- 	 * x is a random printable character and the dots are dots.
- 	 * There are 94 such characters, and the name length goes from
- 	 * 6 to 20, so there are 94^3 * 15 = 12,458,760 possible names.
+ 	 * Read existing data to make sure there isn't a future leak.
  	 */
- 	namelen = ztest_random(sizeof (name) - 5) + 5 + 1;
+ 	VERIFY(0 == dmu_read(os, od[0].od_object, 0, sizeof (uint64_t),
+ 	    &old_txg, DMU_READ_PREFETCH));
  
- 	for (i = 0; i < 3; i++)
- 		name[i] = '!' + ztest_random('~' - '!' + 1);
- 	for (; i < namelen - 1; i++)
- 		name[i] = '.';
- 	name[i] = '\0';
+ 	if (old_txg > txg)
+ 		fatal(0, "future leak: got %" PRIu64 ", open txg is %" PRIu64,
+ 		    old_txg, txg);
  
- 	if (ztest_random(2) == 0)
- 		object = ZTEST_MICROZAP_OBJ;
- 	else
- 		object = ZTEST_FATZAP_OBJ;
+ 	dmu_write(os, od[0].od_object, 0, sizeof (uint64_t), &txg, tx);
  
- 	if ((namelen & 1) || object == ZTEST_MICROZAP_OBJ) {
- 		wsize = sizeof (txg);
- 		wc = 1;
- 		data = &txg;
- 	} else {
- 		wsize = 1;
- 		wc = namelen;
- 		data = string_value;
- 	}
+ 	(void) mutex_lock(&zcl.zcl_callbacks_lock);
  
- 	count = -1ULL;
- 	VERIFY(zap_count(os, object, &count) == 0);
- 	ASSERT(count != -1ULL);
+ 	/*
+ 	 * Since commit callbacks don't have any ordering requirement and since
+ 	 * it is theoretically possible for a commit callback to be called
+ 	 * after an arbitrary amount of time has elapsed since its txg has been
+ 	 * synced, it is difficult to reliably determine whether a commit
+ 	 * callback hasn't been called due to high load or due to a flawed
+ 	 * implementation.
+ 	 *
+ 	 * In practice, we will assume that if after a certain number of txgs a
+ 	 * commit callback hasn't been called, then most likely there's an
+ 	 * implementation bug.
+ 	 */
+ 	tmp_cb = list_head(&zcl.zcl_callbacks);
+ 	if (tmp_cb != NULL &&
+ 	    tmp_cb->zcd_txg > txg - ZTEST_COMMIT_CALLBACK_THRESH) {
+ 		fatal(0, "Commit callback threshold exceeded, oldest txg: %"
+ 		    PRIu64 ", open txg: %" PRIu64 "\n", tmp_cb->zcd_txg, txg);
+ 	}
  
  	/*
- 	 * Select an operation: length, lookup, add, update, remove.
+ 	 * Let's find the place to insert our callbacks.
+ 	 *
+ 	 * Even though the list is ordered by txg, it is possible for the
+ 	 * insertion point to not be the end because our txg may already be
+ 	 * quiescing at this point and other callbacks in the open txg
+ 	 * (from other objsets) may have sneaked in.
  	 */
- 	i = ztest_random(5);
+ 	tmp_cb = list_tail(&zcl.zcl_callbacks);
+ 	while (tmp_cb != NULL && tmp_cb->zcd_txg > txg)
+ 		tmp_cb = list_prev(&zcl.zcl_callbacks, tmp_cb);
+ 
+ 	/* Add the 3 callbacks to the list */
+ 	for (i = 0; i < 3; i++) {
+ 		if (tmp_cb == NULL)
+ 			list_insert_head(&zcl.zcl_callbacks, cb_data[i]);
+ 		else
+ 			list_insert_after(&zcl.zcl_callbacks, tmp_cb,
+ 			    cb_data[i]);
  
- 	if (i >= 2) {
- 		tx = dmu_tx_create(os);
- 		dmu_tx_hold_zap(tx, object, TRUE, NULL);
- 		error = dmu_tx_assign(tx, TXG_WAIT);
- 		if (error) {
- 			ztest_record_enospc("zap parallel");
- 			dmu_tx_abort(tx);
- 			return;
- 		}
- 		txg = dmu_tx_get_txg(tx);
- 		bcopy(name, string_value, namelen);
- 	} else {
- 		tx = NULL;
- 		txg = 0;
- 		bzero(string_value, namelen);
+ 		cb_data[i]->zcd_added = B_TRUE;
+ 		VERIFY(!cb_data[i]->zcd_called);
+ 
+ 		tmp_cb = cb_data[i];
  	}
  
- 	switch (i) {
+ 	(void) mutex_unlock(&zcl.zcl_callbacks_lock);
  
- 	case 0:
- 		error = zap_length(os, object, name, &zl_wsize, &zl_wc);
- 		if (error == 0) {
- 			ASSERT3U(wsize, ==, zl_wsize);
- 			ASSERT3U(wc, ==, zl_wc);
- 		} else {
- 			ASSERT3U(error, ==, ENOENT);
- 		}
- 		break;
+ 	dmu_tx_commit(tx);
+ }
  
- 	case 1:
- 		error = zap_lookup(os, object, name, wsize, wc, data);
- 		if (error == 0) {
- 			if (data == string_value &&
- 			    bcmp(name, data, namelen) != 0)
- 				fatal(0, "name '%s' != val '%s' len %d",
- 				    name, data, namelen);
- 		} else {
- 			ASSERT3U(error, ==, ENOENT);
- 		}
- 		break;
+ /* ARGSUSED */
+ void
+ ztest_dsl_prop_get_set(ztest_ds_t *zd, uint64_t id)
+ {
+ 	zfs_prop_t proplist[] = {
+ 		ZFS_PROP_CHECKSUM,
+ 		ZFS_PROP_COMPRESSION,
+ 		ZFS_PROP_COPIES,
+ 		ZFS_PROP_DEDUP
+ 	};
+ 	ztest_shared_t *zs = ztest_shared;
++	int p;
  
- 	case 2:
- 		error = zap_add(os, object, name, wsize, wc, data, tx);
- 		ASSERT(error == 0 || error == EEXIST);
- 		break;
+ 	(void) rw_rdlock(&zs->zs_name_lock);
  
- 	case 3:
- 		VERIFY(zap_update(os, object, name, wsize, wc, data, tx) == 0);
- 		break;
 -	for (int p = 0; p < sizeof (proplist) / sizeof (proplist[0]); p++)
++	for (p = 0; p < sizeof (proplist) / sizeof (proplist[0]); p++)
+ 		(void) ztest_dsl_prop_set_uint64(zd->zd_name, proplist[p],
+ 		    ztest_random_dsl_prop(proplist[p]), (int)ztest_random(2));
  
- 	case 4:
- 		error = zap_remove(os, object, name, tx);
- 		ASSERT(error == 0 || error == ENOENT);
- 		break;
- 	}
+ 	(void) rw_unlock(&zs->zs_name_lock);
+ }
  
- 	if (tx != NULL)
- 		dmu_tx_commit(tx);
+ /* ARGSUSED */
+ void
+ ztest_spa_prop_get_set(ztest_ds_t *zd, uint64_t id)
+ {
+ 	ztest_shared_t *zs = ztest_shared;
+ 	nvlist_t *props = NULL;
+ 
+ 	(void) rw_rdlock(&zs->zs_name_lock);
+ 
+ 	(void) ztest_spa_prop_set_uint64(zs, ZPOOL_PROP_DEDUPDITTO,
+ 	    ZIO_DEDUPDITTO_MIN + ztest_random(ZIO_DEDUPDITTO_MIN));
+ 
+ 	VERIFY3U(spa_prop_get(zs->zs_spa, &props), ==, 0);
+ 
+ 	if (zopt_verbose >= 6)
+ 		dump_nvlist(props, 4);
+ 
+ 	nvlist_free(props);
+ 
+ 	(void) rw_unlock(&zs->zs_name_lock);
  }
  
+ /*
+  * Test snapshot hold/release and deferred destroy.
+  */
  void
- ztest_dsl_prop_get_set(ztest_args_t *za)
+ ztest_dmu_snapshot_hold(ztest_ds_t *zd, uint64_t id)
  {
- 	objset_t *os = za->za_os;
- 	int i, inherit;
- 	uint64_t value;
- 	const char *prop, *valname;
- 	char setpoint[MAXPATHLEN];
- 	char osname[MAXNAMELEN];
  	int error;
+ 	objset_t *os = zd->zd_os;
+ 	objset_t *origin;
+ 	char snapname[100];
+ 	char fullname[100];
+ 	char clonename[100];
+ 	char tag[100];
+ 	char osname[MAXNAMELEN];
  
  	(void) rw_rdlock(&ztest_shared->zs_name_lock);
  
@@@ -3262,160 -4558,176 +4571,177 @@@ ztest_fault_inject(ztest_ds_t *zd, uint
  }
  
  /*
-  * Scrub the pool.
+  * Verify that DDT repair works as expected.
   */
  void
- ztest_scrub(ztest_args_t *za)
+ ztest_ddt_repair(ztest_ds_t *zd, uint64_t id)
  {
- 	spa_t *spa = za->za_spa;
- 
- 	(void) spa_scrub(spa, POOL_SCRUB_EVERYTHING);
- 	(void) poll(NULL, 0, 1000); /* wait a second, then force a restart */
- 	(void) spa_scrub(spa, POOL_SCRUB_EVERYTHING);
- }
+ 	ztest_shared_t *zs = ztest_shared;
+ 	spa_t *spa = zs->zs_spa;
+ 	objset_t *os = zd->zd_os;
+ 	ztest_od_t od[1];
+ 	uint64_t object, blocksize, txg, pattern, psize;
+ 	enum zio_checksum checksum = spa_dedup_checksum(spa);
+ 	dmu_buf_t *db;
+ 	dmu_tx_t *tx;
+ 	void *buf;
+ 	blkptr_t blk;
+ 	int copies = 2 * ZIO_DEDUPDITTO_MIN;
++	int i;
  
- /*
-  * Rename the pool to a different name and then rename it back.
-  */
- void
- ztest_spa_rename(ztest_args_t *za)
- {
- 	char *oldname, *newname;
- 	int error;
- 	spa_t *spa;
+ 	blocksize = ztest_random_blocksize();
+ 	blocksize = MIN(blocksize, 2048);	/* because we write so many */
  
- 	(void) rw_wrlock(&ztest_shared->zs_name_lock);
+ 	ztest_od_init(&od[0], id, FTAG, 0, DMU_OT_UINT64_OTHER, blocksize, 0);
  
- 	oldname = za->za_pool;
- 	newname = umem_alloc(strlen(oldname) + 5, UMEM_NOFAIL);
- 	(void) strcpy(newname, oldname);
- 	(void) strcat(newname, "_tmp");
+ 	if (ztest_object_init(zd, od, sizeof (od), B_FALSE) != 0)
+ 		return;
  
  	/*
- 	 * Do the rename
+ 	 * Take the name lock as writer to prevent anyone else from changing
+ 	 * the pool and dataset properties we need to maintain during this test.
  	 */
- 	error = spa_rename(oldname, newname);
- 	if (error)
- 		fatal(0, "spa_rename('%s', '%s') = %d", oldname,
- 		    newname, error);
+ 	(void) rw_wrlock(&zs->zs_name_lock);
  
- 	/*
- 	 * Try to open it under the old name, which shouldn't exist
- 	 */
- 	error = spa_open(oldname, &spa, FTAG);
- 	if (error != ENOENT)
- 		fatal(0, "spa_open('%s') = %d", oldname, error);
+ 	if (ztest_dsl_prop_set_uint64(zd->zd_name, ZFS_PROP_DEDUP, checksum,
+ 	    B_FALSE) != 0 ||
+ 	    ztest_dsl_prop_set_uint64(zd->zd_name, ZFS_PROP_COPIES, 1,
+ 	    B_FALSE) != 0) {
+ 		(void) rw_unlock(&zs->zs_name_lock);
+ 		return;
+ 	}
+ 
+ 	object = od[0].od_object;
+ 	blocksize = od[0].od_blocksize;
+ 	pattern = spa_guid(spa) ^ dmu_objset_fsid_guid(os);
+ 
+ 	ASSERT(object != 0);
+ 
+ 	tx = dmu_tx_create(os);
+ 	dmu_tx_hold_write(tx, object, 0, copies * blocksize);
+ 	txg = ztest_tx_assign(tx, TXG_WAIT, FTAG);
+ 	if (txg == 0) {
+ 		(void) rw_unlock(&zs->zs_name_lock);
+ 		return;
+ 	}
  
  	/*
- 	 * Open it under the new name and make sure it's still the same spa_t.
+ 	 * Write all the copies of our block.
  	 */
- 	error = spa_open(newname, &spa, FTAG);
- 	if (error != 0)
- 		fatal(0, "spa_open('%s') = %d", newname, error);
 -	for (int i = 0; i < copies; i++) {
++	for (i = 0; i < copies; i++) {
+ 		uint64_t offset = i * blocksize;
+ 		VERIFY(dmu_buf_hold(os, object, offset, FTAG, &db,
+ 		    DMU_READ_NO_PREFETCH) == 0);
+ 		ASSERT(db->db_offset == offset);
+ 		ASSERT(db->db_size == blocksize);
+ 		ASSERT(ztest_pattern_match(db->db_data, db->db_size, pattern) ||
+ 		    ztest_pattern_match(db->db_data, db->db_size, 0ULL));
+ 		dmu_buf_will_fill(db, tx);
+ 		ztest_pattern_set(db->db_data, db->db_size, pattern);
+ 		dmu_buf_rele(db, FTAG);
+ 	}
  
- 	ASSERT(spa == za->za_spa);
- 	spa_close(spa, FTAG);
+ 	dmu_tx_commit(tx);
+ 	txg_wait_synced(spa_get_dsl(spa), txg);
  
  	/*
- 	 * Rename it back to the original
+ 	 * Find out what block we got.
  	 */
- 	error = spa_rename(newname, oldname);
- 	if (error)
- 		fatal(0, "spa_rename('%s', '%s') = %d", newname,
- 		    oldname, error);
+ 	VERIFY(dmu_buf_hold(os, object, 0, FTAG, &db,
+ 	    DMU_READ_NO_PREFETCH) == 0);
+ 	blk = *((dmu_buf_impl_t *)db)->db_blkptr;
+ 	dmu_buf_rele(db, FTAG);
  
  	/*
- 	 * Make sure it can still be opened
+ 	 * Damage the block.  Dedup-ditto will save us when we read it later.
  	 */
- 	error = spa_open(oldname, &spa, FTAG);
- 	if (error != 0)
- 		fatal(0, "spa_open('%s') = %d", oldname, error);
+ 	psize = BP_GET_PSIZE(&blk);
+ 	buf = zio_buf_alloc(psize);
+ 	ztest_pattern_set(buf, psize, ~pattern);
  
- 	ASSERT(spa == za->za_spa);
- 	spa_close(spa, FTAG);
+ 	(void) zio_wait(zio_rewrite(NULL, spa, 0, &blk,
+ 	    buf, psize, NULL, NULL, ZIO_PRIORITY_SYNC_WRITE,
+ 	    ZIO_FLAG_CANFAIL | ZIO_FLAG_INDUCE_DAMAGE, NULL));
  
- 	umem_free(newname, strlen(newname) + 1);
+ 	zio_buf_free(buf, psize);
  
- 	(void) rw_unlock(&ztest_shared->zs_name_lock);
+ 	(void) rw_unlock(&zs->zs_name_lock);
  }
  
- 
  /*
-  * Completely obliterate one disk.
+  * Scrub the pool.
   */
- static void
- ztest_obliterate_one_disk(uint64_t vdev)
+ /* ARGSUSED */
+ void
+ ztest_scrub(ztest_ds_t *zd, uint64_t id)
  {
- 	int fd;
- 	char dev_name[MAXPATHLEN], copy_name[MAXPATHLEN];
- 	size_t fsize;
+ 	ztest_shared_t *zs = ztest_shared;
+ 	spa_t *spa = zs->zs_spa;
  
- 	if (zopt_maxfaults < 2)
- 		return;
+ 	(void) spa_scan(spa, POOL_SCAN_SCRUB);
+ 	(void) poll(NULL, 0, 100); /* wait a moment, then force a restart */
+ 	(void) spa_scan(spa, POOL_SCAN_SCRUB);
+ }
  
- 	(void) sprintf(dev_name, ztest_dev_template, zopt_dir, zopt_pool, vdev);
- 	(void) snprintf(copy_name, MAXPATHLEN, "%s.old", dev_name);
+ /*
+  * Rename the pool to a different name and then rename it back.
+  */
+ /* ARGSUSED */
+ void
+ ztest_spa_rename(ztest_ds_t *zd, uint64_t id)
+ {
+ 	ztest_shared_t *zs = ztest_shared;
+ 	char *oldname, *newname;
+ 	spa_t *spa;
  
- 	fd = open(dev_name, O_RDWR);
+ 	(void) rw_wrlock(&zs->zs_name_lock);
  
- 	if (fd == -1)
- 		fatal(1, "can't open %s", dev_name);
+ 	oldname = zs->zs_pool;
+ 	newname = umem_alloc(strlen(oldname) + 5, UMEM_NOFAIL);
+ 	(void) strcpy(newname, oldname);
+ 	(void) strcat(newname, "_tmp");
  
  	/*
- 	 * Determine the size.
+ 	 * Do the rename
  	 */
- 	fsize = lseek(fd, 0, SEEK_END);
- 
- 	(void) close(fd);
+ 	VERIFY3U(0, ==, spa_rename(oldname, newname));
  
  	/*
- 	 * Rename the old device to dev_name.old (useful for debugging).
+ 	 * Try to open it under the old name, which shouldn't exist
  	 */
- 	VERIFY(rename(dev_name, copy_name) == 0);
+ 	VERIFY3U(ENOENT, ==, spa_open(oldname, &spa, FTAG));
  
  	/*
- 	 * Create a new one.
+ 	 * Open it under the new name and make sure it's still the same spa_t.
  	 */
- 	VERIFY((fd = open(dev_name, O_RDWR | O_CREAT | O_TRUNC, 0666)) >= 0);
- 	VERIFY(ftruncate(fd, fsize) == 0);
- 	(void) close(fd);
- }
+ 	VERIFY3U(0, ==, spa_open(newname, &spa, FTAG));
  
- static void
- ztest_replace_one_disk(spa_t *spa, uint64_t vdev)
- {
- 	char dev_name[MAXPATHLEN];
- 	nvlist_t *root;
- 	int error;
- 	uint64_t guid;
- 	vdev_t *vd;
+ 	ASSERT(spa == zs->zs_spa);
+ 	spa_close(spa, FTAG);
  
- 	(void) sprintf(dev_name, ztest_dev_template, zopt_dir, zopt_pool, vdev);
+ 	/*
+ 	 * Rename it back to the original
+ 	 */
+ 	VERIFY3U(0, ==, spa_rename(newname, oldname));
  
  	/*
- 	 * Build the nvlist describing dev_name.
+ 	 * Make sure it can still be opened
  	 */
- 	root = make_vdev_root(dev_name, NULL, 0, 0, 0, 0, 0, 1);
+ 	VERIFY3U(0, ==, spa_open(oldname, &spa, FTAG));
  
- 	spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
- 	if ((vd = vdev_lookup_by_path(spa->spa_root_vdev, dev_name)) == NULL)
- 		guid = 0;
- 	else
- 		guid = vd->vdev_guid;
- 	spa_config_exit(spa, SCL_VDEV, FTAG);
- 	error = spa_vdev_attach(spa, guid, root, B_TRUE);
- 	if (error != 0 &&
- 	    error != EBUSY &&
- 	    error != ENOTSUP &&
- 	    error != ENODEV &&
- 	    error != EDOM)
- 		fatal(0, "spa_vdev_attach(in-place) = %d", error);
+ 	ASSERT(spa == zs->zs_spa);
+ 	spa_close(spa, FTAG);
  
- 	nvlist_free(root);
+ 	umem_free(newname, strlen(newname) + 1);
+ 
+ 	(void) rw_unlock(&zs->zs_name_lock);
  }
  
+ /*
+  * Verify pool integrity by running zdb.
+  */
  static void
- ztest_verify_blocks(char *pool)
+ ztest_run_zdb(char *pool)
  {
  	int status;
  	char zdb[MAXPATHLEN + MAXNAMELEN + 20];
@@@ -3597,6 -4896,45 +4910,46 @@@ ztest_resume_thread(void *arg
  	return (NULL);
  }
  
+ static void *
+ ztest_deadman_thread(void *arg)
+ {
+ 	ztest_shared_t *zs = arg;
+ 	int grace = 300;
+ 	hrtime_t delta;
+ 
+ 	delta = (zs->zs_thread_stop - zs->zs_thread_start) / NANOSEC + grace;
+ 
+ 	(void) poll(NULL, 0, (int)(1000 * delta));
+ 
+ 	fatal(0, "failed to complete within %d seconds of deadline", grace);
+ 
+ 	return (NULL);
+ }
+ 
+ static void
+ ztest_execute(ztest_info_t *zi, uint64_t id)
+ {
+ 	ztest_shared_t *zs = ztest_shared;
+ 	ztest_ds_t *zd = &zs->zs_zd[id % zopt_datasets];
+ 	hrtime_t functime = gethrtime();
++	int i;
+ 
 -	for (int i = 0; i < zi->zi_iters; i++)
++	for (i = 0; i < zi->zi_iters; i++)
+ 		zi->zi_func(zd, id);
+ 
+ 	functime = gethrtime() - functime;
+ 
+ 	atomic_add_64(&zi->zi_call_count, 1);
+ 	atomic_add_64(&zi->zi_call_time, functime);
+ 
+ 	if (zopt_verbose >= 4) {
+ 		Dl_info dli;
+ 		(void) dladdr((void *)zi->zi_func, &dli);
+ 		(void) printf("%6.2f sec in %s\n",
+ 		    (double)functime / NANOSEC, dli.dli_sname);
+ 	}
+ }
+ 
  static void *
  ztest_thread(void *arg)
  {
@@@ -3610,69 -4948,157 +4963,159 @@@
  		/*
  		 * See if it's time to force a crash.
  		 */
- 		if (now > za->za_kill) {
- 			zs->zs_alloc = spa_get_alloc(za->za_spa);
- 			zs->zs_space = spa_get_space(za->za_spa);
- 			(void) kill(getpid(), SIGKILL);
- 		}
+ 		if (now > zs->zs_thread_kill)
+ 			ztest_kill(zs);
  
  		/*
- 		 * Pick a random function.
+ 		 * If we're getting ENOSPC with some regularity, stop.
  		 */
- 		f = ztest_random(ZTEST_FUNCS);
- 		zi = &zs->zs_info[f];
+ 		if (zs->zs_enospc_count > 10)
+ 			break;
  
  		/*
- 		 * Decide whether to call it, based on the requested frequency.
+ 		 * Pick a random function to execute.
  		 */
- 		if (zi->zi_call_target == 0 ||
- 		    (double)zi->zi_call_total / zi->zi_call_target >
- 		    (double)(now - zs->zs_start_time) / (zopt_time * NANOSEC))
- 			continue;
+ 		zi = &zs->zs_info[ztest_random(ZTEST_FUNCS)];
+ 		call_next = zi->zi_call_next;
+ 
+ 		if (now >= call_next &&
+ 		    atomic_cas_64(&zi->zi_call_next, call_next, call_next +
+ 		    ztest_random(2 * zi->zi_interval[0] + 1)) == call_next)
+ 			ztest_execute(zi, id);
+ 	}
  
- 		atomic_add_64(&zi->zi_calls, 1);
- 		atomic_add_64(&zi->zi_call_total, 1);
+ 	return (NULL);
+ }
  
- 		za->za_diroff = (za->za_instance * ZTEST_FUNCS + f) *
- 		    ZTEST_DIRSIZE;
- 		za->za_diroff_shared = (1ULL << 63);
+ static void
+ ztest_dataset_name(char *dsname, char *pool, int d)
+ {
+ 	(void) snprintf(dsname, MAXNAMELEN, "%s/ds_%d", pool, d);
+ }
  
- 		for (i = 0; i < zi->zi_iters; i++)
- 			zi->zi_func(za);
+ static void
+ ztest_dataset_destroy(ztest_shared_t *zs, int d)
+ {
+ 	char name[MAXNAMELEN];
++	int t;
  
- 		functime = gethrtime() - now;
+ 	ztest_dataset_name(name, zs->zs_pool, d);
  
- 		atomic_add_64(&zi->zi_call_time, functime);
+ 	if (zopt_verbose >= 3)
+ 		(void) printf("Destroying %s to free up space\n", name);
  
- 		if (zopt_verbose >= 4) {
- 			Dl_info dli;
- 			(void) dladdr((void *)zi->zi_func, &dli);
- 			(void) printf("%6.2f sec in %s\n",
- 			    (double)functime / NANOSEC, dli.dli_sname);
- 		}
+ 	/*
+ 	 * Clean up any non-standard clones and snapshots.  In general,
+ 	 * ztest thread t operates on dataset (t % zopt_datasets),
+ 	 * so there may be more than one thing to clean up.
+ 	 */
 -	for (int t = d; t < zopt_threads; t += zopt_datasets)
++	for (t = d; t < zopt_threads; t += zopt_datasets)
+ 		ztest_dsl_dataset_cleanup(name, t);
  
- 		/*
- 		 * If we're getting ENOSPC with some regularity, stop.
- 		 */
- 		if (zs->zs_enospc_count > 10)
- 			break;
+ 	(void) dmu_objset_find(name, ztest_objset_destroy_cb, NULL,
+ 	    DS_FIND_SNAPSHOTS | DS_FIND_CHILDREN);
+ }
+ 
+ static void
+ ztest_dataset_dirobj_verify(ztest_ds_t *zd)
+ {
+ 	uint64_t usedobjs, dirobjs, scratch;
+ 
+ 	/*
+ 	 * ZTEST_DIROBJ is the object directory for the entire dataset.
+ 	 * Therefore, the number of objects in use should equal the
+ 	 * number of ZTEST_DIROBJ entries, +1 for ZTEST_DIROBJ itself.
+ 	 * If not, we have an object leak.
+ 	 *
+ 	 * Note that we can only check this in ztest_dataset_open(),
+ 	 * when the open-context and syncing-context values agree.
+ 	 * That's because zap_count() returns the open-context value,
+ 	 * while dmu_objset_space() returns the rootbp fill count.
+ 	 */
+ 	VERIFY3U(0, ==, zap_count(zd->zd_os, ZTEST_DIROBJ, &dirobjs));
+ 	dmu_objset_space(zd->zd_os, &scratch, &scratch, &usedobjs, &scratch);
+ 	ASSERT3U(dirobjs + 1, ==, usedobjs);
+ }
+ 
+ static int
+ ztest_dataset_open(ztest_shared_t *zs, int d)
+ {
+ 	ztest_ds_t *zd = &zs->zs_zd[d];
+ 	uint64_t committed_seq = zd->zd_seq;
+ 	objset_t *os;
+ 	zilog_t *zilog;
+ 	char name[MAXNAMELEN];
+ 	int error;
+ 
+ 	ztest_dataset_name(name, zs->zs_pool, d);
+ 
+ 	(void) rw_rdlock(&zs->zs_name_lock);
+ 
+ 	error = ztest_dataset_create(name);
+ 	if (error == ENOSPC) {
+ 		(void) rw_unlock(&zs->zs_name_lock);
+ 		ztest_record_enospc(FTAG);
+ 		return (error);
  	}
+ 	ASSERT(error == 0 || error == EEXIST);
  
- 	return (NULL);
+ 	VERIFY3U(dmu_objset_hold(name, zd, &os), ==, 0);
+ 	(void) rw_unlock(&zs->zs_name_lock);
+ 
+ 	ztest_zd_init(zd, os);
+ 
+ 	zilog = zd->zd_zilog;
+ 
+ 	if (zilog->zl_header->zh_claim_lr_seq != 0 &&
+ 	    zilog->zl_header->zh_claim_lr_seq < committed_seq)
+ 		fatal(0, "missing log records: claimed %llu < committed %llu",
+ 		    zilog->zl_header->zh_claim_lr_seq, committed_seq);
+ 
+ 	ztest_dataset_dirobj_verify(zd);
+ 
+ 	zil_replay(os, zd, ztest_replay_vector);
+ 
+ 	ztest_dataset_dirobj_verify(zd);
+ 
+ 	if (zopt_verbose >= 6)
+ 		(void) printf("%s replay %llu blocks, %llu records, seq %llu\n",
+ 		    zd->zd_name,
+ 		    (u_longlong_t)zilog->zl_parse_blk_count,
+ 		    (u_longlong_t)zilog->zl_parse_lr_count,
+ 		    (u_longlong_t)zilog->zl_replaying_seq);
+ 
+ 	zilog = zil_open(os, ztest_get_data);
+ 
+ 	if (zilog->zl_replaying_seq != 0 &&
+ 	    zilog->zl_replaying_seq < committed_seq)
+ 		fatal(0, "missing log records: replayed %llu < committed %llu",
+ 		    zilog->zl_replaying_seq, committed_seq);
+ 
+ 	return (0);
+ }
+ 
+ static void
+ ztest_dataset_close(ztest_shared_t *zs, int d)
+ {
+ 	ztest_ds_t *zd = &zs->zs_zd[d];
+ 
+ 	zil_close(zd->zd_zilog);
+ 	dmu_objset_rele(zd->zd_os, zd);
+ 
+ 	ztest_zd_fini(zd);
  }
  
  /*
   * Kick off threads to run tests on all datasets in parallel.
   */
  static void
- ztest_run(char *pool)
+ ztest_run(ztest_shared_t *zs)
  {
- 	int t, d, error;
- 	ztest_shared_t *zs = ztest_shared;
- 	ztest_args_t *za;
+ 	thread_t *tid;
  	spa_t *spa;
- 	char name[100];
  	thread_t resume_tid;
+ 	int error;
++	int t, d;
  
  	ztest_exiting = B_FALSE;
  
@@@ -3775,91 -5180,92 +5197,92 @@@
  	if (zopt_verbose >= 4)
  		(void) printf("starting main threads...\n");
  
- 	za[0].za_start = gethrtime();
- 	za[0].za_stop = za[0].za_start + zopt_passtime * NANOSEC;
- 	za[0].za_stop = MIN(za[0].za_stop, zs->zs_stop_time);
- 	za[0].za_kill = za[0].za_stop;
- 	if (ztest_random(100) < zopt_killrate)
- 		za[0].za_kill -= ztest_random(zopt_passtime * NANOSEC);
- 
+ 	/*
+ 	 * Kick off all the tests that run in parallel.
+ 	 */
 -	for (int t = 0; t < zopt_threads; t++) {
 +	for (t = 0; t < zopt_threads; t++) {
- 		d = t % zopt_datasets;
- 
- 		(void) strcpy(za[t].za_pool, pool);
- 		za[t].za_os = za[d].za_os;
- 		za[t].za_spa = spa;
- 		za[t].za_zilog = za[d].za_zilog;
- 		za[t].za_instance = t;
- 		za[t].za_random = ztest_random(-1ULL);
- 		za[t].za_start = za[0].za_start;
- 		za[t].za_stop = za[0].za_stop;
- 		za[t].za_kill = za[0].za_kill;
- 
- 		if (t < zopt_datasets) {
- 			int test_future = FALSE;
- 			(void) rw_rdlock(&ztest_shared->zs_name_lock);
- 			(void) snprintf(name, 100, "%s/%s_%d", pool, pool, d);
- 			error = dmu_objset_create(name, DMU_OST_OTHER, NULL, 0,
- 			    ztest_create_cb, NULL);
- 			if (error == EEXIST) {
- 				test_future = TRUE;
- 			} else if (error == ENOSPC) {
- 				zs->zs_enospc_count++;
- 				(void) rw_unlock(&ztest_shared->zs_name_lock);
- 				break;
- 			} else if (error != 0) {
- 				fatal(0, "dmu_objset_create(%s) = %d",
- 				    name, error);
- 			}
- 			error = dmu_objset_open(name, DMU_OST_OTHER,
- 			    DS_MODE_USER, &za[d].za_os);
- 			if (error)
- 				fatal(0, "dmu_objset_open('%s') = %d",
- 				    name, error);
- 			(void) rw_unlock(&ztest_shared->zs_name_lock);
- 			if (test_future)
- 				ztest_dmu_check_future_leak(&za[t]);
- 			zil_replay(za[d].za_os, za[d].za_os,
- 			    ztest_replay_vector);
- 			za[d].za_zilog = zil_open(za[d].za_os, NULL);
- 		}
- 
- 		VERIFY(thr_create(0, 0, ztest_thread, &za[t], THR_BOUND,
- 		    &za[t].za_thread) == 0);
+ 		if (t < zopt_datasets && ztest_dataset_open(zs, t) != 0)
+ 			return;
+ 		VERIFY(thr_create(0, 0, ztest_thread, (void *)(uintptr_t)t,
+ 		    THR_BOUND, &tid[t]) == 0);
  	}
  
- 	while (--t >= 0) {
- 		VERIFY(thr_join(za[t].za_thread, NULL, NULL) == 0);
- 		if (t < zopt_datasets) {
- 			zil_close(za[t].za_zilog);
- 			dmu_objset_close(za[t].za_os);
- 		}
+ 	/*
+ 	 * Wait for all of the tests to complete.  We go in reverse order
+ 	 * so we don't close datasets while threads are still using them.
+ 	 */
 -	for (int t = zopt_threads - 1; t >= 0; t--) {
++	for (t = zopt_threads - 1; t >= 0; t--) {
+ 		VERIFY(thr_join(tid[t], NULL, NULL) == 0);
+ 		if (t < zopt_datasets)
+ 			ztest_dataset_close(zs, t);
  	}
  
- 	if (zopt_verbose >= 3)
- 		show_pool_stats(spa);
- 
  	txg_wait_synced(spa_get_dsl(spa), 0);
  
- 	zs->zs_alloc = spa_get_alloc(spa);
- 	zs->zs_space = spa_get_space(spa);
+ 	zs->zs_alloc = metaslab_class_get_alloc(spa_normal_class(spa));
+ 	zs->zs_space = metaslab_class_get_space(spa_normal_class(spa));
+ 
+ 	umem_free(tid, zopt_threads * sizeof (thread_t));
+ 
+ 	/* Kill the resume thread */
+ 	ztest_exiting = B_TRUE;
+ 	VERIFY(thr_join(resume_tid, NULL, NULL) == 0);
+ 	ztest_resume(spa);
+ 
+ 	/*
+ 	 * Right before closing the pool, kick off a bunch of async I/O;
+ 	 * spa_close() should wait for it to complete.
+ 	 */
+ 	for (uint64_t object = 1; object < 50; object++)
+ 		dmu_prefetch(spa->spa_meta_objset, object, 0, 1ULL << 20);
+ 
+ 	spa_close(spa, FTAG);
  
  	/*
- 	 * If we had out-of-space errors, destroy a random objset.
+ 	 * Verify that we can loop over all pools.
  	 */
- 	if (zs->zs_enospc_count != 0) {
- 		(void) rw_rdlock(&ztest_shared->zs_name_lock);
- 		d = (int)ztest_random(zopt_datasets);
- 		(void) snprintf(name, 100, "%s/%s_%d", pool, pool, d);
- 		if (zopt_verbose >= 3)
- 			(void) printf("Destroying %s to free up space\n", name);
+ 	mutex_enter(&spa_namespace_lock);
+ 	for (spa = spa_next(NULL); spa != NULL; spa = spa_next(spa))
+ 		if (zopt_verbose > 3)
+ 			(void) printf("spa_next: found %s\n", spa_name(spa));
+ 	mutex_exit(&spa_namespace_lock);
+ 
+ 	/*
+ 	 * Verify that we can export the pool and reimport it under a
+ 	 * different name.
+ 	 */
+ 	if (ztest_random(2) == 0) {
+ 		char name[MAXNAMELEN];
+ 		(void) snprintf(name, MAXNAMELEN, "%s_import", zs->zs_pool);
+ 		ztest_spa_import_export(zs->zs_pool, name);
+ 		ztest_spa_import_export(name, zs->zs_pool);
+ 	}
+ 
+ 	kernel_fini();
+ }
+ 
+ static void
+ ztest_freeze(ztest_shared_t *zs)
+ {
+ 	ztest_ds_t *zd = &zs->zs_zd[0];
+ 	spa_t *spa;
+ 	int numloops = 0;
+ 
+ 	if (zopt_verbose >= 3)
+ 		(void) printf("testing spa_freeze()...\n");
  
- 		/* Cleanup any non-standard clones and snapshots */
- 		ztest_dsl_dataset_cleanup(name, za[d].za_instance);
+ 	kernel_init(FREAD | FWRITE);
+ 	VERIFY3U(0, ==, spa_open(zs->zs_pool, &spa, FTAG));
+ 	VERIFY3U(0, ==, ztest_dataset_open(zs, 0));
  
- 		(void) dmu_objset_find(name, ztest_destroy_cb, &za[d],
- 		    DS_FIND_SNAPSHOTS | DS_FIND_CHILDREN);
- 		(void) rw_unlock(&ztest_shared->zs_name_lock);
+ 	/*
+ 	 * Force the first log block to be transactionally allocated.
+ 	 * We have to do this before we freeze the pool -- otherwise
+ 	 * the log chain won't be anchored.
+ 	 */
+ 	while (BP_IS_HOLE(&zd->zd_zilog->zl_header->zh_log)) {
+ 		ztest_dmu_object_alloc_free(zd, 0);
+ 		zil_commit(zd->zd_zilog, UINT64_MAX, 0);
  	}
  
  	txg_wait_synced(spa_get_dsl(spa), 0);
@@@ -3957,6 -5413,7 +5430,8 @@@ main(int argc, char **argv
  	ztest_info_t *zi;
  	char timebuf[100];
  	char numbuf[6];
+ 	spa_t *spa;
++	int i, f;
  
  	(void) setvbuf(stdout, NULL, _IOLBF, 0);
  
@@@ -3991,26 -5450,24 +5468,24 @@@
  		bzero(zs, sizeof (ztest_shared_t));
  		if (zopt_verbose >= 3 && zopt_init != 1)
  			(void) printf("ztest_init(), pass %d\n", i);
- 		ztest_init(zopt_pool);
+ 		zs->zs_pool = zopt_pool;
+ 		ztest_init(zs);
  	}
  
- 	/*
- 	 * Initialize the call targets for each function.
- 	 */
+ 	zs->zs_pool = zopt_pool;
+ 	zs->zs_proc_start = gethrtime();
+ 	zs->zs_proc_stop = zs->zs_proc_start + zopt_time * NANOSEC;
+ 
 -	for (int f = 0; f < ZTEST_FUNCS; f++) {
 +	for (f = 0; f < ZTEST_FUNCS; f++) {
  		zi = &zs->zs_info[f];
- 
  		*zi = ztest_info[f];
- 
- 		if (*zi->zi_interval == 0)
- 			zi->zi_call_target = UINT64_MAX;
+ 		if (zs->zs_proc_start + zi->zi_interval[0] > zs->zs_proc_stop)
+ 			zi->zi_call_next = UINT64_MAX;
  		else
- 			zi->zi_call_target = zopt_time / *zi->zi_interval;
+ 			zi->zi_call_next = zs->zs_proc_start +
+ 			    ztest_random(2 * zi->zi_interval[0] + 1);
  	}
  
- 	zs->zs_start_time = gethrtime();
- 	zs->zs_stop_time = zs->zs_start_time + zopt_time * NANOSEC;
- 
  	/*
  	 * Run the tests in a loop.  These tests include fault injection
  	 * to verify that self-healing data works, and forced crashes
@@@ -4024,9 -5480,9 +5498,9 @@@
  		/*
  		 * Initialize the workload counters for each function.
  		 */
 -		for (int f = 0; f < ZTEST_FUNCS; f++) {
 +		for (f = 0; f < ZTEST_FUNCS; f++) {
  			zi = &zs->zs_info[f];
- 			zi->zi_calls = 0;
+ 			zi->zi_call_count = 0;
  			zi->zi_call_time = 0;
  		}
  
diff --cc lib/libzfs/libzfs_import.c
index d67776889,fd3044b1d..95632d938
--- a/lib/libzfs/libzfs_import.c
+++ b/lib/libzfs/libzfs_import.c
@@@ -403,6 -405,21 +405,23 @@@ refresh_config(libzfs_handle_t *hdl, nv
  	return (nvl);
  }
  
+ /*
+  * Determine if the vdev id is a hole in the namespace.
+  */
+ boolean_t
+ vdev_is_hole(uint64_t *hole_array, uint_t holes, uint_t id)
+ {
 -	for (int c = 0; c < holes; c++) {
++	uint_t c;	/* unsigned like 'holes': avoids a mixed-sign compare */
++
++	for (c = 0; c < holes; c++) {
+ 
+ 		/* Top-level is a hole */
+ 		if (hole_array[c] == id)
+ 			return (B_TRUE);
+ 	}
+ 	return (B_FALSE);
+ }
+ 
  /*
   * Convert our list of pools into the definitive set of configurations.  We
   * start by picking the best config for each toplevel vdev.  Once that's done,
diff --cc module/zfs/dbuf.c
index e9a8aab49,42ae43997..22e7188bc
--- a/module/zfs/dbuf.c
+++ b/module/zfs/dbuf.c
@@@ -109,14 -106,12 +106,16 @@@ dmu_buf_impl_t 
  dbuf_find(dnode_t *dn, uint8_t level, uint64_t blkid)
  {
  	dbuf_hash_table_t *h = &dbuf_hash_table;
- 	objset_impl_t *os = dn->dn_objset;
- 	uint64_t obj, hv, idx;
+ 	objset_t *os = dn->dn_objset;
+ 	uint64_t obj = dn->dn_object;
+ 	uint64_t hv = DBUF_HASH(os, obj, level, blkid);
+ 	uint64_t idx = hv & h->hash_table_mask;
  	dmu_buf_impl_t *db;
  
 +	obj = dn->dn_object;
 +	hv = DBUF_HASH(os, obj, level, blkid);
 +	idx = hv & h->hash_table_mask;
 +
  	mutex_enter(DBUF_HASH_MUTEX(h, idx));
  	for (db = h->hash_table[idx]; db != NULL; db = db->db_hash_next) {
  		if (DBUF_EQUAL(db, os, obj, level, blkid)) {
@@@ -142,16 -137,14 +141,16 @@@ static dmu_buf_impl_t 
  dbuf_hash_insert(dmu_buf_impl_t *db)
  {
  	dbuf_hash_table_t *h = &dbuf_hash_table;
- 	objset_impl_t *os = db->db_objset;
+ 	objset_t *os = db->db_objset;
  	uint64_t obj = db->db.db_object;
  	int level = db->db_level;
 -	uint64_t blkid = db->db_blkid;
 -	uint64_t hv = DBUF_HASH(os, obj, level, blkid);
 -	uint64_t idx = hv & h->hash_table_mask;
 +	uint64_t blkid, hv, idx;
  	dmu_buf_impl_t *dbf;
  
 +	blkid = db->db_blkid;
 +	hv = DBUF_HASH(os, obj, level, blkid);
 +	idx = hv & h->hash_table_mask;
 +
  	mutex_enter(DBUF_HASH_MUTEX(h, idx));
  	for (dbf = h->hash_table[idx]; dbf != NULL; dbf = dbf->db_hash_next) {
  		if (DBUF_EQUAL(dbf, os, obj, level, blkid)) {
diff --cc module/zfs/ddt.c
index 000000000,926b4df9a..cd4e8476c
mode 000000,100644..100644
--- a/module/zfs/ddt.c
+++ b/module/zfs/ddt.c
@@@ -1,0 -1,1140 +1,1155 @@@
+ /*
+  * CDDL HEADER START
+  *
+  * The contents of this file are subject to the terms of the
+  * Common Development and Distribution License (the "License").
+  * You may not use this file except in compliance with the License.
+  *
+  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+  * or http://www.opensolaris.org/os/licensing.
+  * See the License for the specific language governing permissions
+  * and limitations under the License.
+  *
+  * When distributing Covered Code, include this CDDL HEADER in each
+  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+  * If applicable, add the following below this CDDL HEADER, with the
+  * fields enclosed by brackets "[]" replaced with your own identifying
+  * information: Portions Copyright [yyyy] [name of copyright owner]
+  *
+  * CDDL HEADER END
+  */
+ 
+ /*
+  * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
+  */
+ 
+ #include <sys/zfs_context.h>
+ #include <sys/spa.h>
+ #include <sys/spa_impl.h>
+ #include <sys/zio.h>
+ #include <sys/ddt.h>
+ #include <sys/zap.h>
+ #include <sys/dmu_tx.h>
+ #include <sys/arc.h>
+ #include <sys/dsl_pool.h>
+ #include <sys/zio_checksum.h>
+ #include <sys/zio_compress.h>
+ #include <sys/dsl_scan.h>
+ 
+ static const ddt_ops_t *ddt_ops[DDT_TYPES] = {
+ 	&ddt_zap_ops,
+ };
+ 
+ static const char *ddt_class_name[DDT_CLASSES] = {
+ 	"ditto",
+ 	"duplicate",
+ 	"unique",
+ };
+ 
+ static void
+ ddt_object_create(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
+     dmu_tx_t *tx)
+ {
+ 	spa_t *spa = ddt->ddt_spa;
+ 	objset_t *os = ddt->ddt_os;
+ 	uint64_t *objectp = &ddt->ddt_object[type][class];
+ 	boolean_t prehash = zio_checksum_table[ddt->ddt_checksum].ci_dedup;
+ 	char name[DDT_NAMELEN];
+ 
+ 	ddt_object_name(ddt, type, class, name);
+ 
+ 	ASSERT(*objectp == 0);
+ 	VERIFY(ddt_ops[type]->ddt_op_create(os, objectp, tx, prehash) == 0);
+ 	ASSERT(*objectp != 0);
+ 
+ 	VERIFY(zap_add(os, DMU_POOL_DIRECTORY_OBJECT, name,
+ 	    sizeof (uint64_t), 1, objectp, tx) == 0);
+ 
+ 	VERIFY(zap_add(os, spa->spa_ddt_stat_object, name,
+ 	    sizeof (uint64_t), sizeof (ddt_histogram_t) / sizeof (uint64_t),
+ 	    &ddt->ddt_histogram[type][class], tx) == 0);
+ }
+ 
+ static void
+ ddt_object_destroy(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
+     dmu_tx_t *tx)
+ {
+ 	spa_t *spa = ddt->ddt_spa;
+ 	objset_t *os = ddt->ddt_os;
+ 	uint64_t *objectp = &ddt->ddt_object[type][class];
+ 	char name[DDT_NAMELEN];
+ 
+ 	ddt_object_name(ddt, type, class, name);
+ 
+ 	ASSERT(*objectp != 0);
+ 	ASSERT(ddt_object_count(ddt, type, class) == 0);
+ 	ASSERT(ddt_histogram_empty(&ddt->ddt_histogram[type][class]));
+ 	VERIFY(zap_remove(os, DMU_POOL_DIRECTORY_OBJECT, name, tx) == 0);
+ 	VERIFY(zap_remove(os, spa->spa_ddt_stat_object, name, tx) == 0);
+ 	VERIFY(ddt_ops[type]->ddt_op_destroy(os, *objectp, tx) == 0);
+ 	bzero(&ddt->ddt_object_stats[type][class], sizeof (ddt_object_t));
+ 
+ 	*objectp = 0;
+ }
+ 
+ static int
+ ddt_object_load(ddt_t *ddt, enum ddt_type type, enum ddt_class class)
+ {
+ 	ddt_object_t *ddo = &ddt->ddt_object_stats[type][class];
+ 	dmu_object_info_t doi;
+ 	char name[DDT_NAMELEN];
+ 	int error;
+ 
+ 	ddt_object_name(ddt, type, class, name);
+ 
+ 	error = zap_lookup(ddt->ddt_os, DMU_POOL_DIRECTORY_OBJECT, name,
+ 	    sizeof (uint64_t), 1, &ddt->ddt_object[type][class]);
+ 
+ 	if (error)
+ 		return (error);
+ 
+ 	error = zap_lookup(ddt->ddt_os, ddt->ddt_spa->spa_ddt_stat_object, name,
+ 	    sizeof (uint64_t), sizeof (ddt_histogram_t) / sizeof (uint64_t),
+ 	    &ddt->ddt_histogram[type][class]);
+ 
+ 	/*
+ 	 * Seed the cached statistics.
+ 	 */
+ 	VERIFY(ddt_object_info(ddt, type, class, &doi) == 0);
+ 
+ 	ddo->ddo_count = ddt_object_count(ddt, type, class);
+ 	ddo->ddo_dspace = doi.doi_physical_blocks_512 << 9;
+ 	ddo->ddo_mspace = doi.doi_fill_count * doi.doi_data_block_size;
+ 
+ 	ASSERT(error == 0);
+ 	return (error);
+ }
+ 
+ static void
+ ddt_object_sync(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
+     dmu_tx_t *tx)
+ {
+ 	ddt_object_t *ddo = &ddt->ddt_object_stats[type][class];
+ 	dmu_object_info_t doi;
+ 	char name[DDT_NAMELEN];
+ 
+ 	ddt_object_name(ddt, type, class, name);
+ 
+ 	VERIFY(zap_update(ddt->ddt_os, ddt->ddt_spa->spa_ddt_stat_object, name,
+ 	    sizeof (uint64_t), sizeof (ddt_histogram_t) / sizeof (uint64_t),
+ 	    &ddt->ddt_histogram[type][class], tx) == 0);
+ 
+ 	/*
+ 	 * Cache DDT statistics; this is the only time they'll change.
+ 	 */
+ 	VERIFY(ddt_object_info(ddt, type, class, &doi) == 0);
+ 
+ 	ddo->ddo_count = ddt_object_count(ddt, type, class);
+ 	ddo->ddo_dspace = doi.doi_physical_blocks_512 << 9;
+ 	ddo->ddo_mspace = doi.doi_fill_count * doi.doi_data_block_size;
+ }
+ 
+ static int
+ ddt_object_lookup(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
+     ddt_entry_t *dde)
+ {
+ 	if (!ddt_object_exists(ddt, type, class))
+ 		return (ENOENT);
+ 
+ 	return (ddt_ops[type]->ddt_op_lookup(ddt->ddt_os,
+ 	    ddt->ddt_object[type][class], dde));
+ }
+ 
+ static void
+ ddt_object_prefetch(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
+     ddt_entry_t *dde)
+ {
+ 	if (!ddt_object_exists(ddt, type, class))
+ 		return;
+ 
+ 	ddt_ops[type]->ddt_op_prefetch(ddt->ddt_os,
+ 	    ddt->ddt_object[type][class], dde);
+ }
+ 
+ int
+ ddt_object_update(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
+     ddt_entry_t *dde, dmu_tx_t *tx)
+ {
+ 	ASSERT(ddt_object_exists(ddt, type, class));
+ 
+ 	return (ddt_ops[type]->ddt_op_update(ddt->ddt_os,
+ 	    ddt->ddt_object[type][class], dde, tx));
+ }
+ 
+ static int
+ ddt_object_remove(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
+     ddt_entry_t *dde, dmu_tx_t *tx)
+ {
+ 	ASSERT(ddt_object_exists(ddt, type, class));
+ 
+ 	return (ddt_ops[type]->ddt_op_remove(ddt->ddt_os,
+ 	    ddt->ddt_object[type][class], dde, tx));
+ }
+ 
+ int
+ ddt_object_walk(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
+     uint64_t *walk, ddt_entry_t *dde)
+ {
+ 	ASSERT(ddt_object_exists(ddt, type, class));
+ 
+ 	return (ddt_ops[type]->ddt_op_walk(ddt->ddt_os,
+ 	    ddt->ddt_object[type][class], dde, walk));
+ }
+ 
+ uint64_t
+ ddt_object_count(ddt_t *ddt, enum ddt_type type, enum ddt_class class)
+ {
+ 	ASSERT(ddt_object_exists(ddt, type, class));
+ 
+ 	return (ddt_ops[type]->ddt_op_count(ddt->ddt_os,
+ 	    ddt->ddt_object[type][class]));
+ }
+ 
+ int
+ ddt_object_info(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
+     dmu_object_info_t *doi)
+ {
+ 	if (!ddt_object_exists(ddt, type, class))
+ 		return (ENOENT);
+ 
+ 	return (dmu_object_info(ddt->ddt_os, ddt->ddt_object[type][class],
+ 	    doi));
+ }
+ 
+ boolean_t
+ ddt_object_exists(ddt_t *ddt, enum ddt_type type, enum ddt_class class)
+ {
+ 	return (!!ddt->ddt_object[type][class]);
+ }
+ 
+ void
+ ddt_object_name(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
+     char *name)
+ {
 -	(void) sprintf(name, DMU_POOL_DDT,
++	(void) snprintf(name, DDT_NAMELEN, DMU_POOL_DDT,
+ 	    zio_checksum_table[ddt->ddt_checksum].ci_name,
+ 	    ddt_ops[type]->ddt_op_name, ddt_class_name[class]);
+ }
+ 
+ void
+ ddt_bp_fill(const ddt_phys_t *ddp, blkptr_t *bp, uint64_t txg)
+ {
++	int d;
+ 	ASSERT(txg != 0);
+ 
 -	for (int d = 0; d < SPA_DVAS_PER_BP; d++)
++	for (d = 0; d < SPA_DVAS_PER_BP; d++)
+ 		bp->blk_dva[d] = ddp->ddp_dva[d];
+ 	BP_SET_BIRTH(bp, txg, ddp->ddp_phys_birth);
+ }
+ 
+ void
+ ddt_bp_create(enum zio_checksum checksum,
+     const ddt_key_t *ddk, const ddt_phys_t *ddp, blkptr_t *bp)
+ {
+ 	BP_ZERO(bp);
+ 
+ 	if (ddp != NULL)
+ 		ddt_bp_fill(ddp, bp, ddp->ddp_phys_birth);
+ 
+ 	bp->blk_cksum = ddk->ddk_cksum;
+ 	bp->blk_fill = 1;
+ 
+ 	BP_SET_LSIZE(bp, DDK_GET_LSIZE(ddk));
+ 	BP_SET_PSIZE(bp, DDK_GET_PSIZE(ddk));
+ 	BP_SET_COMPRESS(bp, DDK_GET_COMPRESS(ddk));
+ 	BP_SET_CHECKSUM(bp, checksum);
+ 	BP_SET_TYPE(bp, DMU_OT_DEDUP);
+ 	BP_SET_LEVEL(bp, 0);
+ 	BP_SET_DEDUP(bp, 0);
+ 	BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER);
+ }
+ 
+ void
+ ddt_key_fill(ddt_key_t *ddk, const blkptr_t *bp)
+ {
+ 	ddk->ddk_cksum = bp->blk_cksum;
+ 	ddk->ddk_prop = 0;
+ 
+ 	DDK_SET_LSIZE(ddk, BP_GET_LSIZE(bp));
+ 	DDK_SET_PSIZE(ddk, BP_GET_PSIZE(bp));
+ 	DDK_SET_COMPRESS(ddk, BP_GET_COMPRESS(bp));
+ }
+ 
+ void
+ ddt_phys_fill(ddt_phys_t *ddp, const blkptr_t *bp)
+ {
++	int d;
+ 	ASSERT(ddp->ddp_phys_birth == 0);
+ 
 -	for (int d = 0; d < SPA_DVAS_PER_BP; d++)
++	for (d = 0; d < SPA_DVAS_PER_BP; d++)
+ 		ddp->ddp_dva[d] = bp->blk_dva[d];
+ 	ddp->ddp_phys_birth = BP_PHYSICAL_BIRTH(bp);
+ }
+ 
+ void
+ ddt_phys_clear(ddt_phys_t *ddp)
+ {
+ 	bzero(ddp, sizeof (*ddp));
+ }
+ 
+ void
+ ddt_phys_addref(ddt_phys_t *ddp)
+ {
+ 	ddp->ddp_refcnt++;
+ }
+ 
+ void
+ ddt_phys_decref(ddt_phys_t *ddp)
+ {
+ 	ASSERT((int64_t)ddp->ddp_refcnt > 0);
+ 	ddp->ddp_refcnt--;
+ }
+ 
+ void
+ ddt_phys_free(ddt_t *ddt, ddt_key_t *ddk, ddt_phys_t *ddp, uint64_t txg)
+ {
+ 	blkptr_t blk;
+ 
+ 	ddt_bp_create(ddt->ddt_checksum, ddk, ddp, &blk);
+ 	ddt_phys_clear(ddp);
+ 	zio_free(ddt->ddt_spa, txg, &blk);
+ }
+ 
+ ddt_phys_t *
+ ddt_phys_select(const ddt_entry_t *dde, const blkptr_t *bp)
+ {
+ 	ddt_phys_t *ddp = (ddt_phys_t *)dde->dde_phys;
++	int p;
+ 
 -	for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
++	for (p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
+ 		if (DVA_EQUAL(BP_IDENTITY(bp), &ddp->ddp_dva[0]) &&
+ 		    BP_PHYSICAL_BIRTH(bp) == ddp->ddp_phys_birth)
+ 			return (ddp);
+ 	}
+ 	return (NULL);
+ }
+ 
+ uint64_t
+ ddt_phys_total_refcnt(const ddt_entry_t *dde)
+ {
+ 	uint64_t refcnt = 0;
++	int p;
+ 
 -	for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++)
++	for (p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++)
+ 		refcnt += dde->dde_phys[p].ddp_refcnt;
+ 
+ 	return (refcnt);
+ }
+ 
+ static void
+ ddt_stat_generate(ddt_t *ddt, ddt_entry_t *dde, ddt_stat_t *dds)
+ {
+ 	spa_t *spa = ddt->ddt_spa;
+ 	ddt_phys_t *ddp = dde->dde_phys;
+ 	ddt_key_t *ddk = &dde->dde_key;
+ 	uint64_t lsize = DDK_GET_LSIZE(ddk);
+ 	uint64_t psize = DDK_GET_PSIZE(ddk);
++	int p, d;
+ 
+ 	bzero(dds, sizeof (*dds));
+ 
 -	for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
++	for (p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
+ 		uint64_t dsize = 0;
+ 		uint64_t refcnt = ddp->ddp_refcnt;
+ 
+ 		if (ddp->ddp_phys_birth == 0)
+ 			continue;
+ 
 -		for (int d = 0; d < SPA_DVAS_PER_BP; d++)
++		for (d = 0; d < SPA_DVAS_PER_BP; d++)
+ 			dsize += dva_get_dsize_sync(spa, &ddp->ddp_dva[d]);
+ 
+ 		dds->dds_blocks += 1;
+ 		dds->dds_lsize += lsize;
+ 		dds->dds_psize += psize;
+ 		dds->dds_dsize += dsize;
+ 
+ 		dds->dds_ref_blocks += refcnt;
+ 		dds->dds_ref_lsize += lsize * refcnt;
+ 		dds->dds_ref_psize += psize * refcnt;
+ 		dds->dds_ref_dsize += dsize * refcnt;
+ 	}
+ }
+ 
+ void
+ ddt_stat_add(ddt_stat_t *dst, const ddt_stat_t *src, uint64_t neg)
+ {
+ 	const uint64_t *s = (const uint64_t *)src;
+ 	uint64_t *d = (uint64_t *)dst;
+ 	uint64_t *d_end = (uint64_t *)(dst + 1);
+ 
+ 	ASSERT(neg == 0 || neg == -1ULL);	/* add or subtract */
+ 
+ 	while (d < d_end)
+ 		*d++ += (*s++ ^ neg) - neg;
+ }
+ 
+ static void
+ ddt_stat_update(ddt_t *ddt, ddt_entry_t *dde, uint64_t neg)
+ {
+ 	ddt_stat_t dds;
+ 	ddt_histogram_t *ddh;
+ 	int bucket;
+ 
+ 	ddt_stat_generate(ddt, dde, &dds);
+ 
+ 	bucket = highbit(dds.dds_ref_blocks) - 1;
+ 	ASSERT(bucket >= 0);
+ 
+ 	ddh = &ddt->ddt_histogram[dde->dde_type][dde->dde_class];
+ 
+ 	ddt_stat_add(&ddh->ddh_stat[bucket], &dds, neg);
+ }
+ 
+ void
+ ddt_histogram_add(ddt_histogram_t *dst, const ddt_histogram_t *src)
+ {
 -	for (int h = 0; h < 64; h++)
++	int h;
++
++	for (h = 0; h < 64; h++)
+ 		ddt_stat_add(&dst->ddh_stat[h], &src->ddh_stat[h], 0);
+ }
+ 
+ void
+ ddt_histogram_stat(ddt_stat_t *dds, const ddt_histogram_t *ddh)
+ {
++	int h;
++
+ 	bzero(dds, sizeof (*dds));
+ 
 -	for (int h = 0; h < 64; h++)
++	for (h = 0; h < 64; h++)
+ 		ddt_stat_add(dds, &ddh->ddh_stat[h], 0);
+ }
+ 
+ boolean_t
+ ddt_histogram_empty(const ddt_histogram_t *ddh)
+ {
+ 	const uint64_t *s = (const uint64_t *)ddh;
+ 	const uint64_t *s_end = (const uint64_t *)(ddh + 1);
+ 
+ 	while (s < s_end)
+ 		if (*s++ != 0)
+ 			return (B_FALSE);
+ 
+ 	return (B_TRUE);
+ }
+ 
+ void
+ ddt_get_dedup_object_stats(spa_t *spa, ddt_object_t *ddo_total)
+ {
+ 	/* Sum the statistics we cached in ddt_object_sync(). */
+ 	for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
+ 		ddt_t *ddt = spa->spa_ddt[c];
+ 		for (enum ddt_type type = 0; type < DDT_TYPES; type++) {
+ 			for (enum ddt_class class = 0; class < DDT_CLASSES;
+ 			    class++) {
+ 				ddt_object_t *ddo =
+ 				    &ddt->ddt_object_stats[type][class];
+ 				ddo_total->ddo_count += ddo->ddo_count;
+ 				ddo_total->ddo_dspace += ddo->ddo_dspace;
+ 				ddo_total->ddo_mspace += ddo->ddo_mspace;
+ 			}
+ 		}
+ 	}
+ 
+ 	/* ... and compute the averages. */
+ 	if (ddo_total->ddo_count != 0) {
+ 		ddo_total->ddo_dspace /= ddo_total->ddo_count;
+ 		ddo_total->ddo_mspace /= ddo_total->ddo_count;
+ 	} else {
+ 		ASSERT(ddo_total->ddo_dspace == 0);
+ 		ASSERT(ddo_total->ddo_mspace == 0);
+ 	}
+ }
+ 
+ void
+ ddt_get_dedup_histogram(spa_t *spa, ddt_histogram_t *ddh)
+ {
+ 	for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
+ 		ddt_t *ddt = spa->spa_ddt[c];
+ 		for (enum ddt_type type = 0; type < DDT_TYPES; type++) {
+ 			for (enum ddt_class class = 0; class < DDT_CLASSES;
+ 			    class++) {
+ 				ddt_histogram_add(ddh,
+ 				    &ddt->ddt_histogram_cache[type][class]);
+ 			}
+ 		}
+ 	}
+ }
+ 
+ void
+ ddt_get_dedup_stats(spa_t *spa, ddt_stat_t *dds_total)
+ {
+ 	ddt_histogram_t *ddh_total;
+ 
+ 	ddh_total = kmem_zalloc(sizeof (ddt_histogram_t), KM_SLEEP);
+ 	ddt_get_dedup_histogram(spa, ddh_total);
+ 	ddt_histogram_stat(dds_total, ddh_total);
+ 	kmem_free(ddh_total, sizeof (ddt_histogram_t));
+ }
+ 
+ uint64_t
+ ddt_get_dedup_dspace(spa_t *spa)
+ {
+ 	ddt_stat_t dds_total = { 0 };
+ 
+ 	ddt_get_dedup_stats(spa, &dds_total);
+ 	return (dds_total.dds_ref_dsize - dds_total.dds_dsize);
+ }
+ 
+ uint64_t
+ ddt_get_pool_dedup_ratio(spa_t *spa)
+ {
+ 	ddt_stat_t dds_total = { 0 };
+ 
+ 	ddt_get_dedup_stats(spa, &dds_total);
+ 	if (dds_total.dds_dsize == 0)
+ 		return (100);
+ 
+ 	return (dds_total.dds_ref_dsize * 100 / dds_total.dds_dsize);
+ }
+ 
+ int
+ ddt_ditto_copies_needed(ddt_t *ddt, ddt_entry_t *dde, ddt_phys_t *ddp_willref)
+ {
+ 	spa_t *spa = ddt->ddt_spa;
+ 	uint64_t total_refcnt = 0;
+ 	uint64_t ditto = spa->spa_dedup_ditto;
+ 	int total_copies = 0;
+ 	int desired_copies = 0;
++	int p;
+ 
 -	for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) {
++	for (p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) {
+ 		ddt_phys_t *ddp = &dde->dde_phys[p];
+ 		zio_t *zio = dde->dde_lead_zio[p];
+ 		uint64_t refcnt = ddp->ddp_refcnt;	/* committed refs */
+ 		if (zio != NULL)
+ 			refcnt += zio->io_parent_count;	/* pending refs */
+ 		if (ddp == ddp_willref)
+ 			refcnt++;			/* caller's ref */
+ 		if (refcnt != 0) {
+ 			total_refcnt += refcnt;
+ 			total_copies += p;
+ 		}
+ 	}
+ 
+ 	if (ditto == 0 || ditto > UINT32_MAX)
+ 		ditto = UINT32_MAX;
+ 
+ 	if (total_refcnt >= 1)
+ 		desired_copies++;
+ 	if (total_refcnt >= ditto)
+ 		desired_copies++;
+ 	if (total_refcnt >= ditto * ditto)
+ 		desired_copies++;
+ 
+ 	return (MAX(desired_copies, total_copies) - total_copies);
+ }
+ 
+ int
+ ddt_ditto_copies_present(ddt_entry_t *dde)
+ {
+ 	ddt_phys_t *ddp = &dde->dde_phys[DDT_PHYS_DITTO];
+ 	dva_t *dva = ddp->ddp_dva;
+ 	int copies = 0 - DVA_GET_GANG(dva);
++	int d;
+ 
 -	for (int d = 0; d < SPA_DVAS_PER_BP; d++, dva++)
++	for (d = 0; d < SPA_DVAS_PER_BP; d++, dva++)
+ 		if (DVA_IS_VALID(dva))
+ 			copies++;
+ 
+ 	ASSERT(copies >= 0 && copies < SPA_DVAS_PER_BP);
+ 
+ 	return (copies);
+ }
+ 
+ size_t
+ ddt_compress(void *src, uchar_t *dst, size_t s_len, size_t d_len)
+ {
+ 	uchar_t *version = dst++;
+ 	int cpfunc = ZIO_COMPRESS_ZLE;
+ 	zio_compress_info_t *ci = &zio_compress_table[cpfunc];
+ 	size_t c_len;
+ 
+ 	ASSERT(d_len >= s_len + 1);	/* no compression plus version byte */
+ 
+ 	c_len = ci->ci_compress(src, dst, s_len, d_len - 1, ci->ci_level);
+ 
+ 	if (c_len == s_len) {
+ 		cpfunc = ZIO_COMPRESS_OFF;
+ 		bcopy(src, dst, s_len);
+ 	}
+ 
+ 	*version = (ZFS_HOST_BYTEORDER & DDT_COMPRESS_BYTEORDER_MASK) | cpfunc;
+ 
+ 	return (c_len + 1);
+ }
+ 
+ void
+ ddt_decompress(uchar_t *src, void *dst, size_t s_len, size_t d_len)
+ {
+ 	uchar_t version = *src++;
+ 	int cpfunc = version & DDT_COMPRESS_FUNCTION_MASK;
+ 	zio_compress_info_t *ci = &zio_compress_table[cpfunc];
+ 
+ 	if (ci->ci_decompress != NULL)
+ 		(void) ci->ci_decompress(src, dst, s_len, d_len, ci->ci_level);
+ 	else
+ 		bcopy(src, dst, d_len);
+ 
+ 	if ((version ^ ZFS_HOST_BYTEORDER) & DDT_COMPRESS_BYTEORDER_MASK)
+ 		byteswap_uint64_array(dst, d_len);
+ }
+ 
+ ddt_t *
+ ddt_select_by_checksum(spa_t *spa, enum zio_checksum c)
+ {
+ 	return (spa->spa_ddt[c]);
+ }
+ 
+ ddt_t *
+ ddt_select(spa_t *spa, const blkptr_t *bp)
+ {
+ 	return (spa->spa_ddt[BP_GET_CHECKSUM(bp)]);
+ }
+ 
+ void
+ ddt_enter(ddt_t *ddt)
+ {
+ 	mutex_enter(&ddt->ddt_lock);
+ }
+ 
+ void
+ ddt_exit(ddt_t *ddt)
+ {
+ 	mutex_exit(&ddt->ddt_lock);
+ }
+ 
+ static ddt_entry_t *
+ ddt_alloc(const ddt_key_t *ddk)
+ {
+ 	ddt_entry_t *dde;
+ 
+ 	dde = kmem_zalloc(sizeof (ddt_entry_t), KM_SLEEP);
+ 	cv_init(&dde->dde_cv, NULL, CV_DEFAULT, NULL);
+ 
+ 	dde->dde_key = *ddk;
+ 
+ 	return (dde);
+ }
+ 
+ static void
+ ddt_free(ddt_entry_t *dde)
+ {
++	int p;	/* must precede ASSERT(): C90 has no mixed declarations */
+ 	ASSERT(!dde->dde_loading);
+ 
 -	for (int p = 0; p < DDT_PHYS_TYPES; p++)
++	for (p = 0; p < DDT_PHYS_TYPES; p++)
+ 		ASSERT(dde->dde_lead_zio[p] == NULL);
+ 
+ 	if (dde->dde_repair_data != NULL)
+ 		zio_buf_free(dde->dde_repair_data,
+ 		    DDK_GET_PSIZE(&dde->dde_key));
+ 
+ 	cv_destroy(&dde->dde_cv);
+ 	kmem_free(dde, sizeof (*dde));
+ }
+ 
+ void
+ ddt_remove(ddt_t *ddt, ddt_entry_t *dde)
+ {
+ 	ASSERT(MUTEX_HELD(&ddt->ddt_lock));
+ 
+ 	avl_remove(&ddt->ddt_tree, dde);
+ 	ddt_free(dde);
+ }
+ 
+ ddt_entry_t *
+ ddt_lookup(ddt_t *ddt, const blkptr_t *bp, boolean_t add)
+ {
+ 	ddt_entry_t *dde, dde_search;
+ 	enum ddt_type type;
+ 	enum ddt_class class;
+ 	avl_index_t where;
+ 	int error;
+ 
+ 	ASSERT(MUTEX_HELD(&ddt->ddt_lock));
+ 
+ 	ddt_key_fill(&dde_search.dde_key, bp);
+ 
+ 	dde = avl_find(&ddt->ddt_tree, &dde_search, &where);
+ 	if (dde == NULL) {
+ 		if (!add)
+ 			return (NULL);
+ 		dde = ddt_alloc(&dde_search.dde_key);
+ 		avl_insert(&ddt->ddt_tree, dde, where);
+ 	}
+ 
+ 	while (dde->dde_loading)
+ 		cv_wait(&dde->dde_cv, &ddt->ddt_lock);
+ 
+ 	if (dde->dde_loaded)
+ 		return (dde);
+ 
+ 	dde->dde_loading = B_TRUE;
+ 
+ 	ddt_exit(ddt);
+ 
+ 	error = ENOENT;
+ 
+ 	for (type = 0; type < DDT_TYPES; type++) {
+ 		for (class = 0; class < DDT_CLASSES; class++) {
+ 			error = ddt_object_lookup(ddt, type, class, dde);
+ 			if (error != ENOENT)
+ 				break;
+ 		}
+ 		if (error != ENOENT)
+ 			break;
+ 	}
+ 
+ 	ASSERT(error == 0 || error == ENOENT);
+ 
+ 	ddt_enter(ddt);
+ 
+ 	ASSERT(dde->dde_loaded == B_FALSE);
+ 	ASSERT(dde->dde_loading == B_TRUE);
+ 
+ 	dde->dde_type = type;	/* will be DDT_TYPES if no entry found */
+ 	dde->dde_class = class;	/* will be DDT_CLASSES if no entry found */
+ 	dde->dde_loaded = B_TRUE;
+ 	dde->dde_loading = B_FALSE;
+ 
+ 	if (error == 0)
+ 		ddt_stat_update(ddt, dde, -1ULL);
+ 
+ 	cv_broadcast(&dde->dde_cv);
+ 
+ 	return (dde);
+ }
+ 
+ void
+ ddt_prefetch(spa_t *spa, const blkptr_t *bp)
+ {
+ 	ddt_t *ddt;
+ 	ddt_entry_t dde;
+ 
+ 	if (!BP_GET_DEDUP(bp))
+ 		return;
+ 
+ 	/*
+ 	 * We remove the DDT once it's empty and only prefetch dedup blocks
+ 	 * when there are entries in the DDT.  Thus no locking is required
+ 	 * as the DDT can't disappear on us.
+ 	 */
+ 	ddt = ddt_select(spa, bp);
+ 	ddt_key_fill(&dde.dde_key, bp);
+ 
+ 	for (enum ddt_type type = 0; type < DDT_TYPES; type++) {
+ 		for (enum ddt_class class = 0; class < DDT_CLASSES; class++) {
+ 			ddt_object_prefetch(ddt, type, class, &dde);
+ 		}
+ 	}
+ }
+ 
+ int
+ ddt_entry_compare(const void *x1, const void *x2)
+ {
+ 	const ddt_entry_t *dde1 = x1;
+ 	const ddt_entry_t *dde2 = x2;
+ 	const uint64_t *u1 = (const uint64_t *)&dde1->dde_key;
+ 	const uint64_t *u2 = (const uint64_t *)&dde2->dde_key;
++	int i;
+ 
 -	for (int i = 0; i < DDT_KEY_WORDS; i++) {
++	for (i = 0; i < DDT_KEY_WORDS; i++) {
+ 		if (u1[i] < u2[i])
+ 			return (-1);
+ 		if (u1[i] > u2[i])
+ 			return (1);
+ 	}
+ 
+ 	return (0);
+ }
+ 
+ static ddt_t *
+ ddt_table_alloc(spa_t *spa, enum zio_checksum c)
+ {
+ 	ddt_t *ddt;
+ 
+ 	ddt = kmem_zalloc(sizeof (*ddt), KM_SLEEP);
+ 
+ 	mutex_init(&ddt->ddt_lock, NULL, MUTEX_DEFAULT, NULL);
+ 	avl_create(&ddt->ddt_tree, ddt_entry_compare,
+ 	    sizeof (ddt_entry_t), offsetof(ddt_entry_t, dde_node));
+ 	avl_create(&ddt->ddt_repair_tree, ddt_entry_compare,
+ 	    sizeof (ddt_entry_t), offsetof(ddt_entry_t, dde_node));
+ 	ddt->ddt_checksum = c;
+ 	ddt->ddt_spa = spa;
+ 	ddt->ddt_os = spa->spa_meta_objset;
+ 
+ 	return (ddt);
+ }
+ 
+ static void
+ ddt_table_free(ddt_t *ddt)
+ {
+ 	ASSERT(avl_numnodes(&ddt->ddt_tree) == 0);
+ 	ASSERT(avl_numnodes(&ddt->ddt_repair_tree) == 0);
+ 	avl_destroy(&ddt->ddt_tree);
+ 	avl_destroy(&ddt->ddt_repair_tree);
+ 	mutex_destroy(&ddt->ddt_lock);
+ 	kmem_free(ddt, sizeof (*ddt));
+ }
+ 
+ void
+ ddt_create(spa_t *spa)
+ {
+ 	spa->spa_dedup_checksum = ZIO_DEDUPCHECKSUM;
+ 
+ 	for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++)
+ 		spa->spa_ddt[c] = ddt_table_alloc(spa, c);
+ }
+ 
+ int
+ ddt_load(spa_t *spa)
+ {
+ 	int error;
+ 
+ 	ddt_create(spa);
+ 
+ 	error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+ 	    DMU_POOL_DDT_STATS, sizeof (uint64_t), 1,
+ 	    &spa->spa_ddt_stat_object);
+ 
+ 	if (error)
+ 		return (error == ENOENT ? 0 : error);
+ 
+ 	for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
+ 		ddt_t *ddt = spa->spa_ddt[c];
+ 		for (enum ddt_type type = 0; type < DDT_TYPES; type++) {
+ 			for (enum ddt_class class = 0; class < DDT_CLASSES;
+ 			    class++) {
+ 				error = ddt_object_load(ddt, type, class);
+ 				if (error != 0 && error != ENOENT)
+ 					return (error);
+ 			}
+ 		}
+ 
+ 		/*
+ 		 * Seed the cached histograms.
+ 		 */
+ 		bcopy(ddt->ddt_histogram, &ddt->ddt_histogram_cache,
+ 		    sizeof (ddt->ddt_histogram));
+ 	}
+ 
+ 	return (0);
+ }
+ 
+ void
+ ddt_unload(spa_t *spa)
+ {
+ 	for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
+ 		if (spa->spa_ddt[c]) {
+ 			ddt_table_free(spa->spa_ddt[c]);
+ 			spa->spa_ddt[c] = NULL;
+ 		}
+ 	}
+ }
+ 
++/*
++ * Report whether bp is a dedup block with an entry in a DDT object of
++ * class <= max_class.
++ *
++ * Non-dedup block pointers always yield B_FALSE.  When max_class is
++ * DDT_CLASS_UNIQUE the answer is B_TRUE without any lookup (presumably
++ * because UNIQUE is the last class, so every dedup bp qualifies --
++ * confirm against the ddt_class enum).
++ */
+ boolean_t
+ ddt_class_contains(spa_t *spa, enum ddt_class max_class, const blkptr_t *bp)
+ {
+ 	ddt_t *ddt;
+ 	ddt_entry_t dde;
++	enum ddt_type type;
++	enum ddt_class class;
+ 
+ 	if (!BP_GET_DEDUP(bp))
+ 		return (B_FALSE);
+ 
+ 	if (max_class == DDT_CLASS_UNIQUE)
+ 		return (B_TRUE);
+ 
+ 	ddt = spa->spa_ddt[BP_GET_CHECKSUM(bp)];
+ 
+ 	ddt_key_fill(&dde.dde_key, bp);
+ 
 -	for (enum ddt_type type = 0; type < DDT_TYPES; type++)
 -		for (enum ddt_class class = 0; class <= max_class; class++)
++	for (type = 0; type < DDT_TYPES; type++)
++		for (class = 0; class <= max_class; class++)
+ 			if (ddt_object_lookup(ddt, type, class, &dde) == 0)
+ 				return (B_TRUE);
+ 
+ 	return (B_FALSE);
+ }
+ 
++/*
++ * Allocate a DDT entry keyed by bp and try to populate it from the
++ * on-disk DDT objects of every class except UNIQUE.
++ *
++ * Always returns the allocated entry: filled in by the first successful
++ * lookup, or with dde_phys zeroed when no lookup succeeded.
++ */
+ ddt_entry_t *
+ ddt_repair_start(ddt_t *ddt, const blkptr_t *bp)
+ {
+ 	ddt_key_t ddk;
+ 	ddt_entry_t *dde;
++	enum ddt_type type;
++	enum ddt_class class;
+ 
+ 	ddt_key_fill(&ddk, bp);
+ 
+ 	dde = ddt_alloc(&ddk);
+ 
 -	for (enum ddt_type type = 0; type < DDT_TYPES; type++) {
 -		for (enum ddt_class class = 0; class < DDT_CLASSES; class++) {
++	for (type = 0; type < DDT_TYPES; type++) {
++		for (class = 0; class < DDT_CLASSES; class++) {
+ 			/*
+ 			 * We can only do repair if there are multiple copies
+ 			 * of the block.  For anything in the UNIQUE class,
+ 			 * there's definitely only one copy, so don't even try.
+ 			 */
+ 			if (class != DDT_CLASS_UNIQUE &&
+ 			    ddt_object_lookup(ddt, type, class, dde) == 0)
+ 				return (dde);
+ 		}
+ 	}
+ 
+ 	bzero(dde->dde_phys, sizeof (dde->dde_phys));
+ 
+ 	return (dde);
+ }
+ 
++/*
++ * Finish a repair started with ddt_repair_start().  Under the DDT lock,
++ * the entry is queued on the repair tree only if it carries repair data,
++ * the pool is writeable, and no equal entry is already queued; in every
++ * other case the entry is freed.
++ */
+ void
+ ddt_repair_done(ddt_t *ddt, ddt_entry_t *dde)
+ {
+ 	avl_index_t where;
+ 
+ 	ddt_enter(ddt);
+ 
+ 	if (dde->dde_repair_data != NULL && spa_writeable(ddt->ddt_spa) &&
+ 	    avl_find(&ddt->ddt_repair_tree, dde, &where) == NULL)
+ 		avl_insert(&ddt->ddt_repair_tree, dde, where);
+ 	else
+ 		ddt_free(dde);
+ 
+ 	ddt_exit(ddt);
+ }
+ 
++/*
++ * Done callback of the parent zio created by ddt_repair_entry(): frees
++ * the repair entry that was stashed in io_private.
++ */
+ static void
+ ddt_repair_entry_done(zio_t *zio)
+ {
+ 	ddt_entry_t *rdde = zio->io_private;
+ 
+ 	ddt_free(rdde);
+ }
+ 
++/*
++ * Rewrite, with the repair data held by rdde, every physical copy of dde
++ * that exists (non-zero birth) and matches rdde in both birth txg and
++ * DVAs.  The rewrites are children of a null zio hung off rio; that
++ * zio's done callback frees rdde.
++ */
+ static void
+ ddt_repair_entry(ddt_t *ddt, ddt_entry_t *dde, ddt_entry_t *rdde, zio_t *rio)
+ {
+ 	ddt_phys_t *ddp = dde->dde_phys;
+ 	ddt_phys_t *rddp = rdde->dde_phys;
+ 	ddt_key_t *ddk = &dde->dde_key;
+ 	ddt_key_t *rddk = &rdde->dde_key;
+ 	zio_t *zio;
+ 	blkptr_t blk;
++	int p;
+ 
+ 	zio = zio_null(rio, rio->io_spa, NULL,
+ 	    ddt_repair_entry_done, rdde, rio->io_flags);
+ 
 -	for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++, rddp++) {
++	for (p = 0; p < DDT_PHYS_TYPES; p++, ddp++, rddp++) {
+ 		if (ddp->ddp_phys_birth == 0 ||
+ 		    ddp->ddp_phys_birth != rddp->ddp_phys_birth ||
+ 		    bcmp(ddp->ddp_dva, rddp->ddp_dva, sizeof (ddp->ddp_dva)))
+ 			continue;
+ 		ddt_bp_create(ddt->ddt_checksum, ddk, ddp, &blk);
+ 		zio_nowait(zio_rewrite(zio, zio->io_spa, 0, &blk,
+ 		    rdde->dde_repair_data, DDK_GET_PSIZE(rddk), NULL, NULL,
+ 		    ZIO_PRIORITY_SYNC_WRITE, ZIO_DDT_CHILD_FLAGS(zio), NULL));
+ 	}
+ 
+ 	zio_nowait(zio);
+ }
+ 
++/*
++ * Drain the repair tree of one DDT, issuing repair writes under rio.
++ * Does nothing after the first sync pass.  The DDT lock is dropped
++ * around the per-entry work, so the successor is fetched before the
++ * current node is removed.
++ */
+ static void
+ ddt_repair_table(ddt_t *ddt, zio_t *rio)
+ {
+ 	spa_t *spa = ddt->ddt_spa;
+ 	ddt_entry_t *dde, *rdde_next, *rdde;
+ 	avl_tree_t *t = &ddt->ddt_repair_tree;
+ 	blkptr_t blk;
+ 
+ 	if (spa_sync_pass(spa) > 1)
+ 		return;
+ 
+ 	ddt_enter(ddt);
+ 	for (rdde = avl_first(t); rdde != NULL; rdde = rdde_next) {
+ 		rdde_next = AVL_NEXT(t, rdde);
+ 		avl_remove(&ddt->ddt_repair_tree, rdde);
+ 		ddt_exit(ddt);
+ 		ddt_bp_create(ddt->ddt_checksum, &rdde->dde_key, NULL, &blk);
+ 		dde = ddt_repair_start(ddt, &blk);
+ 		ddt_repair_entry(ddt, dde, rdde, rio);
+ 		ddt_repair_done(ddt, dde);
+ 		ddt_enter(ddt);
+ 	}
+ 	ddt_exit(ddt);
+ }
+ 
++/*
++ * Sync one in-core DDT entry to disk.
++ *
++ * Frees physical copies that are no longer referenced (and the ditto
++ * copy once ddt_ditto_copies_needed() reports 0), derives the entry's
++ * new class from the ditto copy and the total refcount, removes it from
++ * its old on-disk object when type/class changed or it became
++ * unreferenced, and writes it to the new object if still referenced.
++ */
+ static void
+ ddt_sync_entry(ddt_t *ddt, ddt_entry_t *dde, dmu_tx_t *tx, uint64_t txg)
+ {
+ 	dsl_pool_t *dp = ddt->ddt_spa->spa_dsl_pool;
+ 	ddt_phys_t *ddp = dde->dde_phys;
+ 	ddt_key_t *ddk = &dde->dde_key;
+ 	enum ddt_type otype = dde->dde_type;
+ 	enum ddt_type ntype = DDT_TYPE_CURRENT;
+ 	enum ddt_class oclass = dde->dde_class;
+ 	enum ddt_class nclass;
+ 	uint64_t total_refcnt = 0;
++	int p;
+ 
+ 	ASSERT(dde->dde_loaded);
+ 	ASSERT(!dde->dde_loading);
+ 
 -	for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
++	for (p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
+ 		ASSERT(dde->dde_lead_zio[p] == NULL);
+ 		ASSERT((int64_t)ddp->ddp_refcnt >= 0);
+ 		if (ddp->ddp_phys_birth == 0) {
+ 			ASSERT(ddp->ddp_refcnt == 0);
+ 			continue;
+ 		}
+ 		if (p == DDT_PHYS_DITTO) {
+ 			if (ddt_ditto_copies_needed(ddt, dde, NULL) == 0)
+ 				ddt_phys_free(ddt, ddk, ddp, txg);
+ 			continue;
+ 		}
+ 		if (ddp->ddp_refcnt == 0)
+ 			ddt_phys_free(ddt, ddk, ddp, txg);
+ 		total_refcnt += ddp->ddp_refcnt;
+ 	}
+ 
+ 	if (dde->dde_phys[DDT_PHYS_DITTO].ddp_phys_birth != 0)
+ 		nclass = DDT_CLASS_DITTO;
+ 	else if (total_refcnt > 1)
+ 		nclass = DDT_CLASS_DUPLICATE;
+ 	else
+ 		nclass = DDT_CLASS_UNIQUE;
+ 
++	/* NOTE(review): otype == DDT_TYPES appears to mean "not yet on disk". */
+ 	if (otype != DDT_TYPES &&
+ 	    (otype != ntype || oclass != nclass || total_refcnt == 0)) {
+ 		VERIFY(ddt_object_remove(ddt, otype, oclass, dde, tx) == 0);
+ 		ASSERT(ddt_object_lookup(ddt, otype, oclass, dde) == ENOENT);
+ 	}
+ 
+ 	if (total_refcnt != 0) {
+ 		dde->dde_type = ntype;
+ 		dde->dde_class = nclass;
+ 		ddt_stat_update(ddt, dde, 0);
+ 		if (!ddt_object_exists(ddt, ntype, nclass))
+ 			ddt_object_create(ddt, ntype, nclass, tx);
+ 		VERIFY(ddt_object_update(ddt, ntype, nclass, dde, tx) == 0);
+ 
+ 		/*
+ 		 * If the class changes, the order that we scan this bp
+ 		 * changes.  If it decreases, we could miss it, so
+ 		 * scan it right now.  (This covers both class changing
+ 		 * while we are doing ddt_walk(), and when we are
+ 		 * traversing.)
+ 		 */
+ 		if (nclass < oclass) {
+ 			dsl_scan_ddt_entry(dp->dp_scan,
+ 			    ddt->ddt_checksum, dde, tx);
+ 		}
+ 	}
+ }
+ 
++/*
++ * Sync one in-core DDT: create the pool-wide DDT stats object on first
++ * use, sync and free every entry of the in-core tree, then sync each
++ * existing on-disk object (destroying those that became empty) and
++ * refresh the cached histograms.  A table with an empty tree is left
++ * untouched.
++ */
+ static void
+ ddt_sync_table(ddt_t *ddt, dmu_tx_t *tx, uint64_t txg)
+ {
+ 	spa_t *spa = ddt->ddt_spa;
+ 	ddt_entry_t *dde;
+ 	void *cookie = NULL;
++	enum ddt_type type;
++	enum ddt_class class;
+ 
+ 	if (avl_numnodes(&ddt->ddt_tree) == 0)
+ 		return;
+ 
+ 	ASSERT(spa->spa_uberblock.ub_version >= SPA_VERSION_DEDUP);
+ 
+ 	if (spa->spa_ddt_stat_object == 0) {
+ 		spa->spa_ddt_stat_object = zap_create(ddt->ddt_os,
+ 		    DMU_OT_DDT_STATS, DMU_OT_NONE, 0, tx);
+ 		VERIFY(zap_add(ddt->ddt_os, DMU_POOL_DIRECTORY_OBJECT,
+ 		    DMU_POOL_DDT_STATS, sizeof (uint64_t), 1,
+ 		    &spa->spa_ddt_stat_object, tx) == 0);
+ 	}
+ 
+ 	while ((dde = avl_destroy_nodes(&ddt->ddt_tree, &cookie)) != NULL) {
+ 		ddt_sync_entry(ddt, dde, tx, txg);
+ 		ddt_free(dde);
+ 	}
+ 
 -	for (enum ddt_type type = 0; type < DDT_TYPES; type++) {
 -		for (enum ddt_class class = 0; class < DDT_CLASSES; class++) {
++	for (type = 0; type < DDT_TYPES; type++) {
++		for (class = 0; class < DDT_CLASSES; class++) {
+ 			if (!ddt_object_exists(ddt, type, class))
+ 				continue;
+ 			ddt_object_sync(ddt, type, class, tx);
+ 			if (ddt_object_count(ddt, type, class) == 0)
+ 				ddt_object_destroy(ddt, type, class, tx);
+ 		}
+ 	}
+ 
+ 	bcopy(ddt->ddt_histogram, &ddt->ddt_histogram_cache,
+ 	    sizeof (ddt->ddt_histogram));
+ }
+ 
++/*
++ * Sync all DDTs of the pool for the syncing txg.  Each allocated table
++ * is synced and then has its repair queue processed; repair I/O hangs
++ * off a root zio that is waited on (result ignored) before the tx is
++ * committed.
++ */
+ void
+ ddt_sync(spa_t *spa, uint64_t txg)
+ {
+ 	dmu_tx_t *tx;
+ 	zio_t *rio = zio_root(spa, NULL, NULL,
+ 	    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE);
++	enum zio_checksum c;
+ 
+ 	ASSERT(spa_syncing_txg(spa) == txg);
+ 
+ 	tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
+ 
 -	for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
++	for (c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
+ 		ddt_t *ddt = spa->spa_ddt[c];
+ 		if (ddt == NULL)
+ 			continue;
+ 		ddt_sync_table(ddt, tx, txg);
+ 		ddt_repair_table(ddt, rio);
+ 	}
+ 
+ 	(void) zio_wait(rio);
+ 
+ 	dmu_tx_commit(tx);
+ }
+ 
++/*
++ * Advance the bookmark ddb to the next on-disk DDT entry and return it
++ * in dde.  Iteration order is class (outermost), then type, then
++ * checksum (innermost); the cursor is reset whenever an object is
++ * exhausted.
++ *
++ * Returns 0 with dde filled in, ENOENT once every class has been
++ * walked, or the first other error from ddt_object_walk().
++ */
+ int
+ ddt_walk(spa_t *spa, ddt_bookmark_t *ddb, ddt_entry_t *dde)
+ {
+ 	do {
+ 		do {
+ 			do {
+ 				ddt_t *ddt = spa->spa_ddt[ddb->ddb_checksum];
+ 				int error = ENOENT;
+ 				if (ddt_object_exists(ddt, ddb->ddb_type,
+ 				    ddb->ddb_class)) {
+ 					error = ddt_object_walk(ddt,
+ 					    ddb->ddb_type, ddb->ddb_class,
+ 					    &ddb->ddb_cursor, dde);
+ 				}
+ 				dde->dde_type = ddb->ddb_type;
+ 				dde->dde_class = ddb->ddb_class;
+ 				if (error == 0)
+ 					return (0);
+ 				if (error != ENOENT)
+ 					return (error);
+ 				ddb->ddb_cursor = 0;
+ 			} while (++ddb->ddb_checksum < ZIO_CHECKSUM_FUNCTIONS);
+ 			ddb->ddb_checksum = 0;
+ 		} while (++ddb->ddb_type < DDT_TYPES);
+ 		ddb->ddb_type = 0;
+ 	} while (++ddb->ddb_class < DDT_CLASSES);
+ 
+ 	return (ENOENT);
+ }
diff --cc module/zfs/dmu.c
index d86468202,5b87c81c6..ad7a8f74f
--- a/module/zfs/dmu.c
+++ b/module/zfs/dmu.c
@@@ -1148,6 -1519,8 +1519,9 @@@ dmu_offset_next(objset_t *os, uint64_t 
  void
  dmu_object_info_from_dnode(dnode_t *dn, dmu_object_info_t *doi)
  {
+ 	dnode_phys_t *dnp;
++	int i;
+ 
  	rw_enter(&dn->dn_struct_rwlock, RW_READER);
  	mutex_enter(&dn->dn_mtx);
  
@@@ -1157,12 -1535,11 +1536,11 @@@
  	doi->doi_indirection = dn->dn_nlevels;
  	doi->doi_checksum = dn->dn_checksum;
  	doi->doi_compress = dn->dn_compress;
- 	doi->doi_physical_blks = (DN_USED_BYTES(dn->dn_phys) +
- 	    SPA_MINBLOCKSIZE/2) >> SPA_MINBLOCKSHIFT;
- 	doi->doi_max_block_offset = dn->dn_phys->dn_maxblkid;
- 	doi->doi_type = dn->dn_type;
- 	doi->doi_bonus_size = dn->dn_bonuslen;
- 	doi->doi_bonus_type = dn->dn_bonustype;
+ 	doi->doi_physical_blocks_512 = (DN_USED_BYTES(dnp) + 256) >> 9;
+ 	doi->doi_max_offset = (dnp->dn_maxblkid + 1) * dn->dn_datablksz;
+ 	doi->doi_fill_count = 0;
 -	for (int i = 0; i < dnp->dn_nblkptr; i++)
++	for (i = 0; i < dnp->dn_nblkptr; i++)
+ 		doi->doi_fill_count += dnp->dn_blkptr[i].blk_fill;
  
  	mutex_exit(&dn->dn_mtx);
  	rw_exit(&dn->dn_struct_rwlock);
diff --cc module/zfs/dmu_objset.c
index 8bb6ce2e3,690e6ecde..2ff085e44
--- a/module/zfs/dmu_objset.c
+++ b/module/zfs/dmu_objset.c
@@@ -452,16 -502,12 +502,13 @@@ dmu_objset_evict_dbufs(objset_t *os
  }
  
  void
- dmu_objset_evict(dsl_dataset_t *ds, void *arg)
+ dmu_objset_evict(objset_t *os)
  {
- 	objset_impl_t *osi = arg;
- 	objset_t os;
- 	int i;
+ 	dsl_dataset_t *ds = os->os_dsl_dataset;
++	int t;
  
- 	for (i = 0; i < TXG_SIZE; i++) {
- 		ASSERT(list_head(&osi->os_dirty_dnodes[i]) == NULL);
- 		ASSERT(list_head(&osi->os_free_dnodes[i]) == NULL);
- 	}
 -	for (int t = 0; t < TXG_SIZE; t++)
++	for (t = 0; t < TXG_SIZE; t++)
+ 		ASSERT(!dmu_objset_is_dirty(os, t));
  
  	if (ds) {
  		if (!dsl_dataset_is_snapshot(ds)) {
@@@ -888,13 -949,10 +950,12 @@@ dmu_objset_sync_dnodes(list_t *list, li
  
  /* ARGSUSED */
  static void
- ready(zio_t *zio, arc_buf_t *abuf, void *arg)
+ dmu_objset_write_ready(zio_t *zio, arc_buf_t *abuf, void *arg)
  {
 +	int i;
 +
  	blkptr_t *bp = zio->io_bp;
- 	blkptr_t *bp_orig = &zio->io_bp_orig;
- 	objset_impl_t *os = arg;
+ 	objset_t *os = arg;
  	dnode_phys_t *dnp = &os->os_phys->os_meta_dnode;
  
  	ASSERT(bp == os->os_rootbp);
@@@ -908,16 -966,26 +969,26 @@@
  	 * dnode and user/group accounting objects).
  	 */
  	bp->blk_fill = 0;
 -	for (int i = 0; i < dnp->dn_nblkptr; i++)
 +	for (i = 0; i < dnp->dn_nblkptr; i++)
  		bp->blk_fill += dnp->dn_blkptr[i].blk_fill;
+ }
+ 
++/*
++ * Write-done callback for the objset root block.  For a rewrite the new
++ * bp must equal the original (asserted); otherwise the original bp is
++ * killed and the new bp is recorded as born in the objset's dataset,
++ * both under the objset's syncing tx.
++ */
+ /* ARGSUSED */
+ static void
+ dmu_objset_write_done(zio_t *zio, arc_buf_t *abuf, void *arg)
+ {
+ 	blkptr_t *bp = zio->io_bp;
+ 	blkptr_t *bp_orig = &zio->io_bp_orig;
+ 	objset_t *os = arg;
  
  	if (zio->io_flags & ZIO_FLAG_IO_REWRITE) {
- 		ASSERT(DVA_EQUAL(BP_IDENTITY(bp), BP_IDENTITY(bp_orig)));
+ 		ASSERT(BP_EQUAL(bp, bp_orig));
  	} else {
- 		if (zio->io_bp_orig.blk_birth == os->os_synctx->tx_txg)
- 			(void) dsl_dataset_block_kill(os->os_dsl_dataset,
- 			    &zio->io_bp_orig, zio, os->os_synctx);
- 		dsl_dataset_block_born(os->os_dsl_dataset, bp, os->os_synctx);
+ 		dsl_dataset_t *ds = os->os_dsl_dataset;
+ 		dmu_tx_t *tx = os->os_synctx;
+ 
+ 		(void) dsl_dataset_block_kill(ds, bp_orig, tx, B_TRUE);
+ 		dsl_dataset_block_born(ds, bp, tx);
  	}
  }
  
diff --cc module/zfs/dmu_tx.c
index c6fbeeef0,5fc062c16..32dbea622
--- a/module/zfs/dmu_tx.c
+++ b/module/zfs/dmu_tx.c
@@@ -203,6 -216,6 +216,7 @@@ dmu_tx_count_write(dmu_tx_hold_t *txh, 
  	uint64_t start, end, i;
  	int min_bs, max_bs, min_ibs, max_ibs, epbs, bits;
  	int err = 0;
++	int l;
  
  	if (len == 0)
  		return;
@@@ -289,11 -302,11 +303,11 @@@
  		 * If this write is not off the end of the file
  		 * we need to account for overwrites/unref.
  		 */
- 		if (start <= dn->dn_maxblkid)
- 			bzero(last, sizeof (dmu_buf_impl_t *) * DN_MAX_LEVELS);
+ 		if (start <= dn->dn_maxblkid) {
 -			for (int l = 0; l < DN_MAX_LEVELS; l++)
++			for (l = 0; l < DN_MAX_LEVELS; l++)
+ 				history[l] = -1ULL;
+ 		}
  		while (start <= dn->dn_maxblkid) {
- 			spa_t *spa = txh->txh_tx->tx_pool->dp_spa;
- 			dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset;
  			dmu_buf_impl_t *db;
  
  			rw_enter(&dn->dn_struct_rwlock, RW_READER);
diff --cc module/zfs/dsl_dataset.c
index 58fc78684,ddd83576c..2e1fff35a
--- a/module/zfs/dsl_dataset.c
+++ b/module/zfs/dsl_dataset.c
@@@ -77,16 -87,14 +87,16 @@@ parent_delta(dsl_dataset_t *ds, int64_
  }
  
  void
- dsl_dataset_block_born(dsl_dataset_t *ds, blkptr_t *bp, dmu_tx_t *tx)
+ dsl_dataset_block_born(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx)
  {
 -	int used = bp_get_dsize_sync(tx->tx_pool->dp_spa, bp);
 -	int compressed = BP_GET_PSIZE(bp);
 -	int uncompressed = BP_GET_UCSIZE(bp);
 +	int used, compressed, uncompressed;
  	int64_t delta;
  
++	/*
++	 * NOTE(review): upstream sizes the block with bp_get_dsize_sync();
++	 * the C90 split of the declarations must call that same routine
++	 * rather than the pre-merge bp_get_dasize().
++	 */
- 	used = bp_get_dasize(tx->tx_pool->dp_spa, bp);
++	used = bp_get_dsize_sync(tx->tx_pool->dp_spa, bp);
 +	compressed = BP_GET_PSIZE(bp);
 +	uncompressed = BP_GET_UCSIZE(bp);
 +
- 	dprintf_bp(bp, "born, ds=%p\n", ds);
+ 	dprintf_bp(bp, "ds=%p", ds);
  
  	ASSERT(dmu_tx_is_syncing(tx));
  	/* It could have been compressed away to nothing */
diff --cc module/zfs/dsl_scan.c
index 000000000,23c37c7cc..e402dde7c
mode 000000,100644..100644
--- a/module/zfs/dsl_scan.c
+++ b/module/zfs/dsl_scan.c
@@@ -1,0 -1,1739 +1,1741 @@@
+ /*
+  * CDDL HEADER START
+  *
+  * The contents of this file are subject to the terms of the
+  * Common Development and Distribution License (the "License").
+  * You may not use this file except in compliance with the License.
+  *
+  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+  * or http://www.opensolaris.org/os/licensing.
+  * See the License for the specific language governing permissions
+  * and limitations under the License.
+  *
+  * When distributing Covered Code, include this CDDL HEADER in each
+  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+  * If applicable, add the following below this CDDL HEADER, with the
+  * fields enclosed by brackets "[]" replaced with your own identifying
+  * information: Portions Copyright [yyyy] [name of copyright owner]
+  *
+  * CDDL HEADER END
+  */
+ /*
+  * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+  */
+ 
+ #include <sys/dsl_scan.h>
+ #include <sys/dsl_pool.h>
+ #include <sys/dsl_dataset.h>
+ #include <sys/dsl_prop.h>
+ #include <sys/dsl_dir.h>
+ #include <sys/dsl_synctask.h>
+ #include <sys/dnode.h>
+ #include <sys/dmu_tx.h>
+ #include <sys/dmu_objset.h>
+ #include <sys/arc.h>
+ #include <sys/zap.h>
+ #include <sys/zio.h>
+ #include <sys/zfs_context.h>
+ #include <sys/fs/zfs.h>
+ #include <sys/zfs_znode.h>
+ #include <sys/spa_impl.h>
+ #include <sys/vdev_impl.h>
+ #include <sys/zil_impl.h>
+ #include <sys/zio_checksum.h>
+ #include <sys/ddt.h>
+ #include <sys/sa.h>
+ #include <sys/sa_impl.h>
+ #ifdef _KERNEL
+ #include <sys/zfs_vfsops.h>
+ #endif
+ 
+ typedef int (scan_cb_t)(dsl_pool_t *, const blkptr_t *, const zbookmark_t *);
+ 
+ static scan_cb_t dsl_scan_defrag_cb;
+ static scan_cb_t dsl_scan_scrub_cb;
+ static scan_cb_t dsl_scan_remove_cb;
+ static dsl_syncfunc_t dsl_scan_cancel_sync;
+ static void dsl_scan_sync_state(dsl_scan_t *, dmu_tx_t *tx);
+ 
+ int zfs_scan_min_time_ms = 1000; /* min millisecs to scrub per txg */
+ int zfs_free_min_time_ms = 1000; /* min millisecs to free per txg */
+ int zfs_resilver_min_time_ms = 3000; /* min millisecs to resilver per txg */
+ boolean_t zfs_no_scrub_io = B_FALSE; /* set to disable scrub i/o */
 -boolean_t zfs_no_scrub_prefetch = B_FALSE; /* set to disable srub prefetching */
++boolean_t zfs_no_scrub_prefetch = B_FALSE; /* set to disable scrub prefetching */
+ enum ddt_class zfs_scrub_ddt_class_max = DDT_CLASS_DUPLICATE;
+ int dsl_scan_delay_completion = B_FALSE; /* set to delay scan completion */
+ 
++/* True when the scan's function is a scrub or a resilver. */
+ #define	DSL_SCAN_IS_SCRUB_RESILVER(scn) \
+ 	((scn)->scn_phys.scn_func == POOL_SCAN_SCRUB || \
+ 	(scn)->scn_phys.scn_func == POOL_SCAN_RESILVER)
+ 
+ extern int zfs_txg_timeout;
+ 
+ /* the order has to match pool_scan_type */
++/*
++ * Per-block callbacks, indexed by scn_phys.scn_func.  Slot 0 has no
++ * callback (presumably POOL_SCAN_NONE -- confirm against the enum).
++ */
+ static scan_cb_t *scan_funcs[POOL_SCAN_FUNCS] = {
+ 	NULL,
+ 	dsl_scan_scrub_cb,	/* POOL_SCAN_SCRUB */
+ 	dsl_scan_scrub_cb,	/* POOL_SCAN_RESILVER */
+ };
+ 
++/*
++ * Allocate the pool's in-core scan state and load any persisted scan.
++ *
++ * If the old-style "scrub_func" entry exists, a restart at txg is
++ * scheduled and the old queue object number is loaded so it can be
++ * freed later.  Otherwise the DMU_POOL_SCAN record is read; a scan that
++ * was in progress while the pool was last touched by software older
++ * than SPA_VERSION_SCAN is also scheduled for restart.
++ *
++ * Returns 0 on success or when no scan record exists (ENOENT), else the
++ * zap_lookup() error.  dp->dp_scan stays allocated in every case.
++ */
+ int
+ dsl_scan_init(dsl_pool_t *dp, uint64_t txg)
+ {
+ 	int err;
+ 	dsl_scan_t *scn;
+ 	spa_t *spa = dp->dp_spa;
+ 	uint64_t f;
+ 
+ 	scn = dp->dp_scan = kmem_zalloc(sizeof (dsl_scan_t), KM_SLEEP);
+ 	scn->scn_dp = dp;
+ 
+ 	err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+ 	    "scrub_func", sizeof (uint64_t), 1, &f);
+ 	if (err == 0) {
+ 		/*
+ 		 * There was an old-style scrub in progress.  Restart a
+ 		 * new-style scrub from the beginning.
+ 		 */
+ 		scn->scn_restart_txg = txg;
+ 		zfs_dbgmsg("old-style scrub was in progress; "
+ 		    "restarting new-style scrub in txg %llu",
 -		    scn->scn_restart_txg);
++		    (u_longlong_t)scn->scn_restart_txg);
+ 
+ 		/*
+ 		 * Load the queue obj from the old location so that it
+ 		 * can be freed by dsl_scan_done().
+ 		 */
+ 		(void) zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+ 		    "scrub_queue", sizeof (uint64_t), 1,
+ 		    &scn->scn_phys.scn_queue_obj);
+ 	} else {
+ 		err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+ 		    DMU_POOL_SCAN, sizeof (uint64_t), SCAN_PHYS_NUMINTS,
+ 		    &scn->scn_phys);
+ 		if (err == ENOENT)
+ 			return (0);
+ 		else if (err)
+ 			return (err);
+ 
+ 		if (scn->scn_phys.scn_state == DSS_SCANNING &&
+ 		    spa_prev_software_version(dp->dp_spa) < SPA_VERSION_SCAN) {
+ 			/*
+ 			 * A new-type scrub was in progress on an old
+ 			 * pool, and the pool was accessed by old
+ 			 * software.  Restart from the beginning, since
+ 			 * the old software may have changed the pool in
+ 			 * the meantime.
+ 			 */
+ 			scn->scn_restart_txg = txg;
+ 			zfs_dbgmsg("new-style scrub was modified "
+ 			    "by old software; restarting in txg %llu",
 -			    scn->scn_restart_txg);
++			    (u_longlong_t)scn->scn_restart_txg);
+ 		}
+ 	}
+ 
+ 	spa_scan_stat_init(spa);
+ 	return (0);
+ }
+ 
++/* Free the pool's in-core scan state, if any, and clear the pointer. */
+ void
+ dsl_scan_fini(dsl_pool_t *dp)
+ {
+ 	if (dp->dp_scan) {
+ 		kmem_free(dp->dp_scan, sizeof (dsl_scan_t));
+ 		dp->dp_scan = NULL;
+ 	}
+ }
+ 
++/*
++ * Check function for starting a scan (arg1 is the dsl_scan_t): returns
++ * EBUSY while a scan is already in progress, 0 otherwise.
++ */
+ /* ARGSUSED */
+ static int
+ dsl_scan_setup_check(void *arg1, void *arg2, dmu_tx_t *tx)
+ {
+ 	dsl_scan_t *scn = arg1;
+ 
+ 	if (scn->scn_phys.scn_state == DSS_SCANNING)
+ 		return (EBUSY);
+ 
+ 	return (0);
+ }
+ 
++/*
++ * Sync function that starts a scan.  arg1 is the dsl_scan_t, arg2
++ * points at the pool_scan_func_t to run.
++ *
++ * Resets the persistent scan state, applies the scrub/resilver setup
++ * (DDT class limit, label rewrite, start event, txg range from
++ * vdev_resilver_needed()), resets the block statistics, creates the
++ * scan queue object, persists the state and logs the start to the pool
++ * history.
++ */
+ /* ARGSUSED */
+ static void
+ dsl_scan_setup_sync(void *arg1, void *arg2, dmu_tx_t *tx)
+ {
+ 	dsl_scan_t *scn = arg1;
+ 	pool_scan_func_t *funcp = arg2;
+ 	dmu_object_type_t ot = 0;
+ 	dsl_pool_t *dp = scn->scn_dp;
+ 	spa_t *spa = dp->dp_spa;
+ 
+ 	ASSERT(scn->scn_phys.scn_state != DSS_SCANNING);
+ 	ASSERT(*funcp > POOL_SCAN_NONE && *funcp < POOL_SCAN_FUNCS);
+ 	bzero(&scn->scn_phys, sizeof (scn->scn_phys));
+ 	scn->scn_phys.scn_func = *funcp;
+ 	scn->scn_phys.scn_state = DSS_SCANNING;
+ 	scn->scn_phys.scn_min_txg = 0;
+ 	scn->scn_phys.scn_max_txg = tx->tx_txg;
+ 	scn->scn_phys.scn_ddt_class_max = DDT_CLASSES - 1; /* the entire DDT */
+ 	scn->scn_phys.scn_start_time = gethrestime_sec();
+ 	scn->scn_phys.scn_errors = 0;
+ 	scn->scn_phys.scn_to_examine = spa->spa_root_vdev->vdev_stat.vs_alloc;
+ 	scn->scn_restart_txg = 0;
+ 	spa_scan_stat_init(spa);
+ 
+ 	if (DSL_SCAN_IS_SCRUB_RESILVER(scn)) {
+ 		scn->scn_phys.scn_ddt_class_max = zfs_scrub_ddt_class_max;
+ 
+ 		/* rewrite all disk labels */
+ 		vdev_config_dirty(spa->spa_root_vdev);
+ 
+ 		if (vdev_resilver_needed(spa->spa_root_vdev,
+ 		    &scn->scn_phys.scn_min_txg, &scn->scn_phys.scn_max_txg)) {
+ 			spa_event_notify(spa, NULL, ESC_ZFS_RESILVER_START);
+ 		} else {
+ 			spa_event_notify(spa, NULL, ESC_ZFS_SCRUB_START);
+ 		}
+ 
+ 		spa->spa_scrub_started = B_TRUE;
+ 		/*
+ 		 * If this is an incremental scrub, limit the DDT scrub phase
+ 		 * to just the auto-ditto class (for correctness); the rest
+ 		 * of the scrub should go faster using top-down pruning.
+ 		 */
+ 		if (scn->scn_phys.scn_min_txg > TXG_INITIAL)
+ 			scn->scn_phys.scn_ddt_class_max = DDT_CLASS_DITTO;
+ 
+ 	}
+ 
+ 	/* back to the generic stuff */
+ 
+ 	if (dp->dp_blkstats == NULL) {
+ 		dp->dp_blkstats =
+ 		    kmem_alloc(sizeof (zfs_all_blkstats_t), KM_SLEEP);
+ 	}
+ 	bzero(dp->dp_blkstats, sizeof (zfs_all_blkstats_t));
+ 
+ 	if (spa_version(spa) < SPA_VERSION_DSL_SCRUB)
+ 		ot = DMU_OT_ZAP_OTHER;
+ 
+ 	scn->scn_phys.scn_queue_obj = zap_create(dp->dp_meta_objset,
+ 	    ot ? ot : DMU_OT_SCAN_QUEUE, DMU_OT_NONE, 0, tx);
+ 
+ 	dsl_scan_sync_state(scn, tx);
+ 
+ 	spa_history_log_internal(LOG_POOL_SCAN, spa, tx,
+ 	    "func=%u mintxg=%llu maxtxg=%llu",
 -	    *funcp, scn->scn_phys.scn_min_txg, scn->scn_phys.scn_max_txg);
++	    *funcp, (u_longlong_t)scn->scn_phys.scn_min_txg,
++	    (u_longlong_t)scn->scn_phys.scn_max_txg);
+ }
+ 
++/*
++ * Finish a scan, either complete (B_TRUE) or canceled (B_FALSE).
++ *
++ * Always removes the old-style scrub entries from the pool directory
++ * and frees the scan queue object.  If a scan was actually running, the
++ * state becomes FINISHED or CANCELED, the event is logged, and for a
++ * scrub/resilver the in-flight scrub I/O is drained, DTLs are
++ * reassessed, the error log is rotated and the async thread is asked to
++ * handle a possibly finished replacement.  The caller persists the
++ * state (see dsl_scan_cancel_sync()).
++ */
+ /* ARGSUSED */
+ static void
+ dsl_scan_done(dsl_scan_t *scn, boolean_t complete, dmu_tx_t *tx)
+ {
+ 	static const char *old_names[] = {
+ 		"scrub_bookmark",
+ 		"scrub_ddt_bookmark",
+ 		"scrub_ddt_class_max",
+ 		"scrub_queue",
+ 		"scrub_min_txg",
+ 		"scrub_max_txg",
+ 		"scrub_func",
+ 		"scrub_errors",
+ 		NULL
+ 	};
+ 
+ 	dsl_pool_t *dp = scn->scn_dp;
+ 	spa_t *spa = dp->dp_spa;
+ 	int i;
+ 
+ 	/* Remove any remnants of an old-style scrub. */
+ 	for (i = 0; old_names[i]; i++) {
+ 		(void) zap_remove(dp->dp_meta_objset,
+ 		    DMU_POOL_DIRECTORY_OBJECT, old_names[i], tx);
+ 	}
+ 
+ 	if (scn->scn_phys.scn_queue_obj != 0) {
+ 		VERIFY(0 == dmu_object_free(dp->dp_meta_objset,
+ 		    scn->scn_phys.scn_queue_obj, tx));
+ 		scn->scn_phys.scn_queue_obj = 0;
+ 	}
+ 
+ 	/*
+ 	 * If we were "restarted" from a stopped state, don't bother
+ 	 * with anything else.
+ 	 */
+ 	if (scn->scn_phys.scn_state != DSS_SCANNING)
+ 		return;
+ 
+ 	if (complete)
+ 		scn->scn_phys.scn_state = DSS_FINISHED;
+ 	else
+ 		scn->scn_phys.scn_state = DSS_CANCELED;
+ 
+ 	spa_history_log_internal(LOG_POOL_SCAN_DONE, spa, tx,
+ 	    "complete=%u", complete);
+ 
+ 	if (DSL_SCAN_IS_SCRUB_RESILVER(scn)) {
+ 		mutex_enter(&spa->spa_scrub_lock);
+ 		while (spa->spa_scrub_inflight > 0) {
+ 			cv_wait(&spa->spa_scrub_io_cv,
+ 			    &spa->spa_scrub_lock);
+ 		}
+ 		mutex_exit(&spa->spa_scrub_lock);
+ 		spa->spa_scrub_started = B_FALSE;
+ 		spa->spa_scrub_active = B_FALSE;
+ 
+ 		/*
+ 		 * If the scrub/resilver completed, update all DTLs to
+ 		 * reflect this.  Whether it succeeded or not, vacate
+ 		 * all temporary scrub DTLs.
+ 		 */
+ 		vdev_dtl_reassess(spa->spa_root_vdev, tx->tx_txg,
+ 		    complete ? scn->scn_phys.scn_max_txg : 0, B_TRUE);
+ 		if (complete) {
+ 			spa_event_notify(spa, NULL, scn->scn_phys.scn_min_txg ?
+ 			    ESC_ZFS_RESILVER_FINISH : ESC_ZFS_SCRUB_FINISH);
+ 		}
+ 		spa_errlog_rotate(spa);
+ 
+ 		/*
+ 		 * We may have finished replacing a device.
+ 		 * Let the async thread assess this and handle the detach.
+ 		 */
+ 		spa_async_request(spa, SPA_ASYNC_RESILVER_DONE);
+ 	}
+ 
+ 	scn->scn_phys.scn_end_time = gethrestime_sec();
+ }
+ 
++/*
++ * Check function for canceling a scan (arg1 is the dsl_scan_t): returns
++ * ENOENT when no scan is in progress, 0 otherwise.
++ */
+ /* ARGSUSED */
+ static int
+ dsl_scan_cancel_check(void *arg1, void *arg2, dmu_tx_t *tx)
+ {
+ 	dsl_scan_t *scn = arg1;
+ 
+ 	if (scn->scn_phys.scn_state != DSS_SCANNING)
+ 		return (ENOENT);
+ 	return (0);
+ }
+ 
++/*
++ * Sync function for canceling a scan: finishes the scan as not complete
++ * and persists the resulting state.  arg2 is not used.
++ */
+ /* ARGSUSED */
+ static void
+ dsl_scan_cancel_sync(void *arg1, void *arg2, dmu_tx_t *tx)
+ {
+ 	dsl_scan_t *scn = arg1;
+ 
+ 	dsl_scan_done(scn, B_FALSE, tx);
+ 	dsl_scan_sync_state(scn, tx);
+ }
+ 
++/*
++ * Cancel the pool's running scan through a sync task.  Returns the
++ * result of dsl_sync_task_do(); the check function yields ENOENT when
++ * no scan is in progress.
++ */
+ int
+ dsl_scan_cancel(dsl_pool_t *dp)
+ {
++	/* Passed as arg2; dsl_scan_cancel_sync() does not read it. */
+ 	boolean_t complete = B_FALSE;
+ 	int err;
+ 
+ 	err = dsl_sync_task_do(dp, dsl_scan_cancel_check,
+ 	    dsl_scan_cancel_sync, dp->dp_scan, &complete, 3);
+ 	return (err);
+ }
+ 
+ static void dsl_scan_visitbp(blkptr_t *bp,
+     const zbookmark_t *zb, dnode_phys_t *dnp, arc_buf_t *pbuf,
+     dsl_dataset_t *ds, dsl_scan_t *scn, dmu_objset_type_t ostype,
+     dmu_tx_t *tx);
+ static void dsl_scan_visitdnode(dsl_scan_t *, dsl_dataset_t *ds,
+     dmu_objset_type_t ostype,
+     dnode_phys_t *dnp, arc_buf_t *buf, uint64_t object, dmu_tx_t *tx);
+ 
++/* Free bp in the given txg; thin wrapper around zio_free(). */
+ void
+ dsl_free(dsl_pool_t *dp, uint64_t txg, const blkptr_t *bp)
+ {
+ 	zio_free(dp->dp_spa, txg, bp);
+ }
+ 
++/*
++ * Free bpp from sync context (asserted) by issuing a no-wait
++ * zio_free_sync() child of pio that inherits pio's flags.
++ */
+ void
+ dsl_free_sync(zio_t *pio, dsl_pool_t *dp, uint64_t txg, const blkptr_t *bpp)
+ {
+ 	ASSERT(dsl_pool_sync_context(dp));
+ 	zio_nowait(zio_free_sync(pio, dp->dp_spa, txg, bpp, pio->io_flags));
+ }
+ 
++/* Thin wrapper around arc_read(); all arguments are passed through. */
+ int
+ dsl_read(zio_t *pio, spa_t *spa, const blkptr_t *bpp, arc_buf_t *pbuf,
+     arc_done_func_t *done, void *private, int priority, int zio_flags,
+     uint32_t *arc_flags, const zbookmark_t *zb)
+ {
+ 	return (arc_read(pio, spa, bpp, pbuf, done, private,
+ 	    priority, zio_flags, arc_flags, zb));
+ }
+ 
++/* Thin wrapper around arc_read_nolock(); all arguments are passed through. */
+ int
+ dsl_read_nolock(zio_t *pio, spa_t *spa, const blkptr_t *bpp,
+     arc_done_func_t *done, void *private, int priority, int zio_flags,
+     uint32_t *arc_flags, const zbookmark_t *zb)
+ {
+ 	return (arc_read_nolock(pio, spa, bpp, done, private,
+ 	    priority, zio_flags, arc_flags, zb));
+ }
+ 
++/* True when all four fields of the bookmark are zero. */
+ static boolean_t
+ bookmark_is_zero(const zbookmark_t *zb)
+ {
+ 	return (zb->zb_objset == 0 && zb->zb_object == 0 &&
+ 	    zb->zb_level == 0 && zb->zb_blkid == 0);
+ }
+ 
++/*
++ * Decide whether the block described by zb1 (at any level) lies
++ * entirely before the level-0 bookmark zb2 of the same objset.
++ * A NULL dnp is never before anything.
++ */
+ /* dnp is the dnode for zb1->zb_object */
+ static boolean_t
+ bookmark_is_before(const dnode_phys_t *dnp, const zbookmark_t *zb1,
+     const zbookmark_t *zb2)
+ {
+ 	uint64_t zb1nextL0, zb2thisobj;
+ 
+ 	ASSERT(zb1->zb_objset == zb2->zb_objset);
+ 	ASSERT(zb2->zb_level == 0);
+ 
+ 	/*
+ 	 * A bookmark in the deadlist is considered to be after
+ 	 * everything else.
+ 	 */
+ 	if (zb2->zb_object == DMU_DEADLIST_OBJECT)
+ 		return (B_TRUE);
+ 
+ 	/* The objset_phys_t isn't before anything. */
+ 	if (dnp == NULL)
+ 		return (B_FALSE);
+ 
++	/* First level-0 block id past the range covered by zb1. */
+ 	zb1nextL0 = (zb1->zb_blkid + 1) <<
+ 	    ((zb1->zb_level) * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT));
+ 
++	/* Object zb2 refers to; derived from the blkid when zb_object is 0. */
+ 	zb2thisobj = zb2->zb_object ? zb2->zb_object :
+ 	    zb2->zb_blkid << (DNODE_BLOCK_SHIFT - DNODE_SHIFT);
+ 
+ 	if (zb1->zb_object == DMU_META_DNODE_OBJECT) {
+ 		uint64_t nextobj = zb1nextL0 *
+ 		    (dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT) >> DNODE_SHIFT;
+ 		return (nextobj <= zb2thisobj);
+ 	}
+ 
+ 	if (zb1->zb_object < zb2thisobj)
+ 		return (B_TRUE);
+ 	if (zb1->zb_object > zb2thisobj)
+ 		return (B_FALSE);
+ 	if (zb2->zb_object == DMU_META_DNODE_OBJECT)
+ 		return (B_FALSE);
+ 	return (zb1nextL0 <= zb2->zb_blkid);
+ }
+ 
++/*
++ * Highest txg the scan needs to visit in ds: the scan's max txg, capped
++ * at the creation txg when ds is a snapshot.
++ */
+ static uint64_t
+ dsl_scan_ds_maxtxg(dsl_dataset_t *ds)
+ {
+ 	uint64_t smt = ds->ds_dir->dd_pool->dp_scan->scn_phys.scn_max_txg;
+ 	if (dsl_dataset_is_snapshot(ds))
+ 		return (MIN(smt, ds->ds_phys->ds_creation_txg));
+ 	return (smt);
+ }
+ 
++/*
++ * Persist scn_phys under DMU_POOL_SCAN in the pool directory as
++ * SCAN_PHYS_NUMINTS 64-bit integers; failure is fatal (VERIFY).
++ */
+ static void
+ dsl_scan_sync_state(dsl_scan_t *scn, dmu_tx_t *tx)
+ {
+ 	VERIFY(0 == zap_update(scn->scn_dp->dp_meta_objset,
+ 	    DMU_POOL_DIRECTORY_OBJECT,
+ 	    DMU_POOL_SCAN, sizeof (uint64_t), SCAN_PHYS_NUMINTS,
+ 	    &scn->scn_phys, tx));
+ }
+ 
++/*
++ * Decide whether the scan should pause for this txg.  zb, when
++ * non-NULL, is the bookmark about to be visited and is recorded as the
++ * resume point when pausing.
++ *
++ * Pausing is triggered once the sync has run longer than
++ * zfs_txg_timeout seconds, or longer than the minimum scan/resilver
++ * time while a txg sync is waiting, or when the pool is shutting down.
++ */
+ static boolean_t
+ dsl_scan_check_pause(dsl_scan_t *scn, const zbookmark_t *zb)
+ {
+ 	uint64_t elapsed_nanosecs;
+ 	int mintime;
+ 
+ 	/* we never skip user/group accounting objects */
+ 	if (zb && (int64_t)zb->zb_object < 0)
+ 		return (B_FALSE);
+ 
+ 	if (scn->scn_pausing)
+ 		return (B_TRUE); /* we're already pausing */
+ 
+ 	if (!bookmark_is_zero(&scn->scn_phys.scn_bookmark))
+ 		return (B_FALSE); /* we're resuming */
+ 
+ 	/* We only know how to resume from level-0 blocks. */
+ 	if (zb && zb->zb_level != 0)
+ 		return (B_FALSE);
+ 
+ 	mintime = (scn->scn_phys.scn_func == POOL_SCAN_RESILVER) ?
+ 	    zfs_resilver_min_time_ms : zfs_scan_min_time_ms;
+ 	elapsed_nanosecs = gethrtime() - scn->scn_sync_start_time;
++	/* NOTE(review): ns / MICROSEC is compared to a ms tunable -- assumes MICROSEC == 1000000. */
+ 	if (elapsed_nanosecs / NANOSEC > zfs_txg_timeout ||
+ 	    (elapsed_nanosecs / MICROSEC > mintime &&
+ 	    txg_sync_waiting(scn->scn_dp)) ||
+ 	    spa_shutting_down(scn->scn_dp->dp_spa)) {
+ 		if (zb) {
+ 			dprintf("pausing at bookmark %llx/%llx/%llx/%llx\n",
+ 			    (longlong_t)zb->zb_objset,
+ 			    (longlong_t)zb->zb_object,
+ 			    (longlong_t)zb->zb_level,
+ 			    (longlong_t)zb->zb_blkid);
+ 			scn->scn_phys.scn_bookmark = *zb;
+ 		}
+ 		dprintf("pausing at DDT bookmark %llx/%llx/%llx/%llx\n",
+ 		    (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_class,
+ 		    (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_type,
+ 		    (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_checksum,
+ 		    (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_cursor);
+ 		scn->scn_pausing = B_TRUE;
+ 		return (B_TRUE);
+ 	}
+ 	return (B_FALSE);
+ }
+ 
++/* Argument handed to the ZIL parse callbacks by dsl_scan_zil(). */
+ typedef struct zil_scan_arg {
+ 	dsl_pool_t	*zsa_dp;
+ 	zil_header_t	*zsa_zh;
+ } zil_scan_arg_t;
+ 
++/*
++ * zil_parse() block callback: runs the current scan function on a ZIL
++ * log block.  Blocks born at or before scn_cur_min_txg are skipped, as
++ * are unclaimed blocks born at or after the pool's first txg.
++ * Always returns 0.
++ */
+ /* ARGSUSED */
+ static int
+ dsl_scan_zil_block(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg)
+ {
+ 	zil_scan_arg_t *zsa = arg;
+ 	dsl_pool_t *dp = zsa->zsa_dp;
+ 	dsl_scan_t *scn = dp->dp_scan;
+ 	zil_header_t *zh = zsa->zsa_zh;
+ 	zbookmark_t zb;
+ 
+ 	if (bp->blk_birth <= scn->scn_phys.scn_cur_min_txg)
+ 		return (0);
+ 
+ 	/*
+ 	 * One block ("stubby") can be allocated a long time ago; we
+ 	 * want to visit that one because it has been allocated
+ 	 * (on-disk) even if it hasn't been claimed (even though for
+ 	 * scrub there's nothing to do to it).
+ 	 */
+ 	if (claim_txg == 0 && bp->blk_birth >= spa_first_txg(dp->dp_spa))
+ 		return (0);
+ 
+ 	SET_BOOKMARK(&zb, zh->zh_log.blk_cksum.zc_word[ZIL_ZC_OBJSET],
+ 	    ZB_ZIL_OBJECT, ZB_ZIL_LEVEL, bp->blk_cksum.zc_word[ZIL_ZC_SEQ]);
+ 
+ 	VERIFY(0 == scan_funcs[scn->scn_phys.scn_func](dp, bp, &zb));
+ 	return (0);
+ }
+ 
++/*
++ * zil_parse() record callback: for TX_WRITE records, runs the current
++ * scan function on the block the record points at.  Records are skipped
++ * when the block was born at or before scn_cur_min_txg, when the log is
++ * unclaimed, or when the block was born before the claim txg.
++ * Always returns 0.
++ */
+ /* ARGSUSED */
+ static int
+ dsl_scan_zil_record(zilog_t *zilog, lr_t *lrc, void *arg, uint64_t claim_txg)
+ {
+ 	if (lrc->lrc_txtype == TX_WRITE) {
+ 		zil_scan_arg_t *zsa = arg;
+ 		dsl_pool_t *dp = zsa->zsa_dp;
+ 		dsl_scan_t *scn = dp->dp_scan;
+ 		zil_header_t *zh = zsa->zsa_zh;
+ 		lr_write_t *lr = (lr_write_t *)lrc;
+ 		blkptr_t *bp = &lr->lr_blkptr;
+ 		zbookmark_t zb;
+ 
+ 		if (bp->blk_birth <= scn->scn_phys.scn_cur_min_txg)
+ 			return (0);
+ 
+ 		/*
+ 		 * birth can be < claim_txg if this record's txg is
+ 		 * already txg sync'ed (but this log block contains
+ 		 * other records that are not synced)
+ 		 */
+ 		if (claim_txg == 0 || bp->blk_birth < claim_txg)
+ 			return (0);
+ 
+ 		SET_BOOKMARK(&zb, zh->zh_log.blk_cksum.zc_word[ZIL_ZC_OBJSET],
+ 		    lr->lr_foid, ZB_ZIL_LEVEL,
+ 		    lr->lr_offset / BP_GET_LSIZE(bp));
+ 
+ 		VERIFY(0 == scan_funcs[scn->scn_phys.scn_func](dp, bp, &zb));
+ 	}
+ 	return (0);
+ }
+ 
++/*
++ * Scan the intent log described by zh: a temporary zilog is allocated,
++ * parsed with the block/record callbacks above, and freed again.  The
++ * zil_parse() result is ignored.
++ */
+ static void
+ dsl_scan_zil(dsl_pool_t *dp, zil_header_t *zh)
+ {
+ 	uint64_t claim_txg = zh->zh_claim_txg;
+ 	zil_scan_arg_t zsa = { dp, zh };
+ 	zilog_t *zilog;
+ 
+ 	/*
+ 	 * We only want to visit blocks that have been claimed but not yet
+ 	 * replayed (or, in read-only mode, blocks that *would* be claimed).
+ 	 */
+ 	if (claim_txg == 0 && spa_writeable(dp->dp_spa))
+ 		return;
+ 
+ 	zilog = zil_alloc(dp->dp_meta_objset, zh);
+ 
+ 	(void) zil_parse(zilog, dsl_scan_zil_block, dsl_scan_zil_record, &zsa,
+ 	    claim_txg);
+ 
+ 	zil_free(zilog);
+ }
+ 
++/*
++ * Issue a no-wait ARC prefetch read for bp under the scan's root zio.
++ * Nothing is issued when prefetching is disabled, bp is a hole, bp was
++ * born at or before scn_min_txg, or bp is a level-0 block that is not a
++ * dnode block.  The arc_read() result is ignored.
++ */
+ /* ARGSUSED */
+ static void
+ dsl_scan_prefetch(dsl_scan_t *scn, arc_buf_t *buf, blkptr_t *bp,
+     uint64_t objset, uint64_t object, uint64_t blkid)
+ {
+ 	zbookmark_t czb;
+ 	uint32_t flags = ARC_NOWAIT | ARC_PREFETCH;
+ 
+ 	if (zfs_no_scrub_prefetch)
+ 		return;
+ 
+ 	if (BP_IS_HOLE(bp) || bp->blk_birth <= scn->scn_phys.scn_min_txg ||
+ 	    (BP_GET_LEVEL(bp) == 0 && BP_GET_TYPE(bp) != DMU_OT_DNODE))
+ 		return;
+ 
+ 	SET_BOOKMARK(&czb, objset, object, BP_GET_LEVEL(bp), blkid);
+ 
+ 	/*
+ 	 * XXX need to make sure all of these arc_read() prefetches are
+ 	 * done before setting xlateall (similar to dsl_read())
+ 	 */
+ 	(void) arc_read(scn->scn_zio_root, scn->scn_dp->dp_spa, bp,
+ 	    buf, NULL, NULL, ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL,
+ 	    &flags, &czb);
+ }
+ 
++/*
++ * While resuming from a saved bookmark, decide whether zb can be
++ * skipped.  Returns B_TRUE when zb lies before the saved bookmark.
++ * Once the saved block is reached, or a later object is seen, the saved
++ * bookmark is zeroed so that pause checks become active again, and
++ * B_FALSE is returned.
++ */
+ static boolean_t
+ dsl_scan_check_resume(dsl_scan_t *scn, const dnode_phys_t *dnp,
+     const zbookmark_t *zb)
+ {
+ 	/*
+ 	 * We never skip over user/group accounting objects (obj<0)
+ 	 */
+ 	if (!bookmark_is_zero(&scn->scn_phys.scn_bookmark) &&
+ 	    (int64_t)zb->zb_object >= 0) {
+ 		/*
+ 		 * If we already visited this bp & everything below (in
+ 		 * a prior txg sync), don't bother doing it again.
+ 		 */
+ 		if (bookmark_is_before(dnp, zb, &scn->scn_phys.scn_bookmark))
+ 			return (B_TRUE);
+ 
+ 		/*
+ 		 * If we found the block we're trying to resume from, or
+ 		 * we went past it to a different object, zero it out to
+ 		 * indicate that it's OK to start checking for pausing
+ 		 * again.
+ 		 */
+ 		if (bcmp(zb, &scn->scn_phys.scn_bookmark, sizeof (*zb)) == 0 ||
+ 		    zb->zb_object > scn->scn_phys.scn_bookmark.zb_object) {
+ 			dprintf("resuming at %llx/%llx/%llx/%llx\n",
+ 			    (longlong_t)zb->zb_objset,
+ 			    (longlong_t)zb->zb_object,
+ 			    (longlong_t)zb->zb_level,
+ 			    (longlong_t)zb->zb_blkid);
+ 			bzero(&scn->scn_phys.scn_bookmark, sizeof (*zb));
+ 		}
+ 	}
+ 	return (B_FALSE);
+ }
+ 
+ /*
+  * Return nonzero on i/o error.
+  * Return new buf to write out in *bufp.
++ *
++ * The block at 'bp' is read (ARC_WAIT) only when it is an indirect block,
++ * a DMU_OT_USERGROUP_USED block, a DMU_OT_DNODE block or a DMU_OT_OBJSET
++ * block; any other level-0 block is left unread and 0 is returned.
++ * Indirect and dnode blocks have all of their children prefetched first
++ * and are then visited child by child.  A failed read also increments
++ * scn_phys.scn_errors.
+  */
+ static int
+ dsl_scan_recurse(dsl_scan_t *scn, dsl_dataset_t *ds, dmu_objset_type_t ostype,
+     dnode_phys_t *dnp, const blkptr_t *bp,
+     const zbookmark_t *zb, dmu_tx_t *tx, arc_buf_t **bufp)
+ {
+ 	dsl_pool_t *dp = scn->scn_dp;
+ 	int err;
+ 
+ 	if (BP_GET_LEVEL(bp) > 0) {
+ 		uint32_t flags = ARC_WAIT;
+ 		int i;
+ 		blkptr_t *cbp;
++		/* number of block pointers held by this indirect block */
+ 		int epb = BP_GET_LSIZE(bp) >> SPA_BLKPTRSHIFT;
+ 
+ 		err = arc_read_nolock(NULL, dp->dp_spa, bp,
+ 		    arc_getbuf_func, bufp,
+ 		    ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb);
+ 		if (err) {
+ 			scn->scn_phys.scn_errors++;
+ 			return (err);
+ 		}
+ 		for (i = 0, cbp = (*bufp)->b_data; i < epb; i++, cbp++) {
+ 			dsl_scan_prefetch(scn, *bufp, cbp, zb->zb_objset,
+ 			    zb->zb_object, zb->zb_blkid * epb + i);
+ 		}
+ 		for (i = 0, cbp = (*bufp)->b_data; i < epb; i++, cbp++) {
+ 			zbookmark_t czb;
+ 
+ 			SET_BOOKMARK(&czb, zb->zb_objset, zb->zb_object,
+ 			    zb->zb_level - 1,
+ 			    zb->zb_blkid * epb + i);
+ 			dsl_scan_visitbp(cbp, &czb, dnp,
+ 			    *bufp, ds, scn, ostype, tx);
+ 		}
+ 	} else if (BP_GET_TYPE(bp) == DMU_OT_USERGROUP_USED) {
+ 		uint32_t flags = ARC_WAIT;
+ 
+ 		err = arc_read_nolock(NULL, dp->dp_spa, bp,
+ 		    arc_getbuf_func, bufp,
+ 		    ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb);
+ 		if (err) {
+ 			scn->scn_phys.scn_errors++;
+ 			return (err);
+ 		}
+ 	} else if (BP_GET_TYPE(bp) == DMU_OT_DNODE) {
+ 		uint32_t flags = ARC_WAIT;
+ 		dnode_phys_t *cdnp;
+ 		int i, j;
++		/* number of dnodes held by this block */
+ 		int epb = BP_GET_LSIZE(bp) >> DNODE_SHIFT;
+ 
+ 		err = arc_read_nolock(NULL, dp->dp_spa, bp,
+ 		    arc_getbuf_func, bufp,
+ 		    ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb);
+ 		if (err) {
+ 			scn->scn_phys.scn_errors++;
+ 			return (err);
+ 		}
+ 		for (i = 0, cdnp = (*bufp)->b_data; i < epb; i++, cdnp++) {
+ 			for (j = 0; j < cdnp->dn_nblkptr; j++) {
+ 				blkptr_t *cbp = &cdnp->dn_blkptr[j];
+ 				dsl_scan_prefetch(scn, *bufp, cbp,
+ 				    zb->zb_objset, zb->zb_blkid * epb + i, j);
+ 			}
+ 		}
+ 		for (i = 0, cdnp = (*bufp)->b_data; i < epb; i++, cdnp++) {
+ 			dsl_scan_visitdnode(scn, ds, ostype,
+ 			    cdnp, *bufp, zb->zb_blkid * epb + i, tx);
+ 		}
+ 
+ 	} else if (BP_GET_TYPE(bp) == DMU_OT_OBJSET) {
+ 		uint32_t flags = ARC_WAIT;
+ 		objset_phys_t *osp;
+ 
+ 		err = arc_read_nolock(NULL, dp->dp_spa, bp,
+ 		    arc_getbuf_func, bufp,
+ 		    ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb);
+ 		if (err) {
+ 			scn->scn_phys.scn_errors++;
+ 			return (err);
+ 		}
+ 
+ 		osp = (*bufp)->b_data;
+ 
+ 		if (DSL_SCAN_IS_SCRUB_RESILVER(scn))
+ 			dsl_scan_zil(dp, &osp->os_zil_header);
+ 
+ 		dsl_scan_visitdnode(scn, ds, osp->os_type,
+ 		    &osp->os_meta_dnode, *bufp, DMU_META_DNODE_OBJECT, tx);
+ 
+ 		if (OBJSET_BUF_HAS_USERUSED(*bufp)) {
+ 			/*
+ 			 * We also always visit user/group accounting
+ 			 * objects, and never skip them, even if we are
+ 			 * pausing.  This is necessary so that the space
+ 			 * deltas from this txg get integrated.
+ 			 */
+ 			dsl_scan_visitdnode(scn, ds, osp->os_type,
+ 			    &osp->os_groupused_dnode, *bufp,
+ 			    DMU_GROUPUSED_OBJECT, tx);
+ 			dsl_scan_visitdnode(scn, ds, osp->os_type,
+ 			    &osp->os_userused_dnode, *bufp,
+ 			    DMU_USERUSED_OBJECT, tx);
+ 		}
+ 	}
+ 
+ 	return (0);
+ }
+ 
++/*
++ * Visit each of the dn_nblkptr block pointers of 'dnp' and, when
++ * DNODE_FLAG_SPILL_BLKPTR is set, its spill block pointer, by calling
++ * dsl_scan_visitbp().  Bookmarks use ds->ds_object as the objset, or 0
++ * when ds is NULL, and 'object' as the object number.
++ */
+ static void
+ dsl_scan_visitdnode(dsl_scan_t *scn, dsl_dataset_t *ds,
+     dmu_objset_type_t ostype, dnode_phys_t *dnp, arc_buf_t *buf,
+     uint64_t object, dmu_tx_t *tx)
+ {
+ 	int j;
+ 
+ 	for (j = 0; j < dnp->dn_nblkptr; j++) {
+ 		zbookmark_t czb;
+ 
+ 		SET_BOOKMARK(&czb, ds ? ds->ds_object : 0, object,
+ 		    dnp->dn_nlevels - 1, j);
+ 		dsl_scan_visitbp(&dnp->dn_blkptr[j],
+ 		    &czb, dnp, buf, ds, scn, ostype, tx);
+ 	}
+ 
+ 	if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) {
+ 		zbookmark_t czb;
+ 		SET_BOOKMARK(&czb, ds ? ds->ds_object : 0, object,
+ 		    0, DMU_SPILL_BLKID);
+ 		dsl_scan_visitbp(&dnp->dn_spill,
+ 		    &czb, dnp, buf, ds, scn, ostype, tx);
+ 	}
+ }
+ 
+ /*
+  * The arguments are in this order because mdb can only print the
+  * first 5; we want them to be useful.
++ *
++ * Visit one block pointer: honour pause/resume, skip holes (blk_birth 0)
++ * and blocks born at or before scn_cur_min_txg, recurse into the block's
++ * children via dsl_scan_recurse(), and finally hand the block to the
++ * current scan function unless the DDT phase covers it or it was born
++ * after scn_cur_max_txg.  Any buffer returned by the recursion is
++ * released before returning.
+  */
+ static void
+ dsl_scan_visitbp(blkptr_t *bp, const zbookmark_t *zb,
+     dnode_phys_t *dnp, arc_buf_t *pbuf,
+     dsl_dataset_t *ds, dsl_scan_t *scn, dmu_objset_type_t ostype,
+     dmu_tx_t *tx)
+ {
+ 	dsl_pool_t *dp = scn->scn_dp;
+ 	arc_buf_t *buf = NULL;
+ 	blkptr_t bp_toread = *bp;
+ 
+ 	/* ASSERT(pbuf == NULL || arc_released(pbuf)); */
+ 
+ 	if (dsl_scan_check_pause(scn, zb))
+ 		return;
+ 
+ 	if (dsl_scan_check_resume(scn, dnp, zb))
+ 		return;
+ 
+ 	if (bp->blk_birth == 0)
+ 		return;
+ 
+ 	scn->scn_visited_this_txg++;
+ 
++	/* Cast the 64-bit values so they match the %llu/%llx conversions. */
+ 	dprintf_bp(bp,
+ 	    "visiting ds=%p/%llu zb=%llx/%llx/%llx/%llx buf=%p bp=%p",
 -	    ds, ds ? ds->ds_object : 0,
 -	    zb->zb_objset, zb->zb_object, zb->zb_level, zb->zb_blkid,
++	    ds, (u_longlong_t)(ds ? ds->ds_object : 0),
++	    (u_longlong_t)zb->zb_objset, (u_longlong_t)zb->zb_object,
++	    (u_longlong_t)zb->zb_level, (u_longlong_t)zb->zb_blkid,
+ 	    pbuf, bp);
+ 
+ 	if (bp->blk_birth <= scn->scn_phys.scn_cur_min_txg)
+ 		return;
+ 
+ 	if (BP_GET_TYPE(bp) != DMU_OT_USERGROUP_USED) {
+ 		/*
+ 		 * For non-user-accounting blocks, we need to read the
+ 		 * new bp (from a deleted snapshot, found in
+ 		 * check_existing_xlation).  If we used the old bp,
+ 		 * pointers inside this block from before we resumed
+ 		 * would be untranslated.
+ 		 *
+ 		 * For user-accounting blocks, we need to read the old
+ 		 * bp, because we will apply the entire space delta to
+ 		 * it (original untranslated -> translations from
+ 		 * deleted snap -> now).
+ 		 */
+ 		bp_toread = *bp;
+ 	}
+ 
+ 	if (dsl_scan_recurse(scn, ds, ostype, dnp, &bp_toread, zb, tx,
+ 	    &buf) != 0)
+ 		return;
+ 
+ 	/*
 -	 * If dsl_scan_ddt() has aready visited this block, it will have
++	 * If dsl_scan_ddt() has already visited this block, it will have
+ 	 * already done any translations or scrubbing, so don't call the
+ 	 * callback again.
+ 	 */
+ 	if (ddt_class_contains(dp->dp_spa,
+ 	    scn->scn_phys.scn_ddt_class_max, bp)) {
+ 		ASSERT(buf == NULL);
+ 		return;
+ 	}
+ 
+ 	/*
+ 	 * If this block is from the future (after cur_max_txg), then we
+ 	 * are doing this on behalf of a deleted snapshot, and we will
+ 	 * revisit the future block on the next pass of this dataset.
+ 	 * Don't scan it now unless we need to because something
+ 	 * under it was modified.
+ 	 */
+ 	if (bp->blk_birth <= scn->scn_phys.scn_cur_max_txg) {
+ 		scan_funcs[scn->scn_phys.scn_func](dp, bp, zb);
+ 	}
+ 	if (buf)
+ 		(void) arc_buf_remove_ref(buf, &buf);
+ }
+ 
++/*
++ * Start a traversal at root block pointer 'bp'.  The root bookmark uses
++ * ds->ds_object as its objset, or DMU_META_OBJSET when ds is NULL, and
++ * the traversal begins with no parent dnode or buffer.
++ */
+ static void
+ dsl_scan_visit_rootbp(dsl_scan_t *scn, dsl_dataset_t *ds, blkptr_t *bp,
+     dmu_tx_t *tx)
+ {
+ 	zbookmark_t zb;
+ 
+ 	SET_BOOKMARK(&zb, ds ? ds->ds_object : DMU_META_OBJSET,
+ 	    ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID);
+ 	dsl_scan_visitbp(bp, &zb, NULL, NULL,
+ 	    ds, scn, DMU_OST_NONE, tx);
+ 
+ 	dprintf_ds(ds, "finished scan%s", "");
+ }
+ 
++/*
++ * Adjust in-progress scan state for the destruction of 'ds'; a no-op
++ * unless scn_state is DSS_SCANNING.  If the scan bookmark points at ds it
++ * is redirected to ds_next_snap_obj (for a snapshot, which also sets
++ * DSF_VISIT_DS_AGAIN) or reset to ZB_DESTROYED_OBJSET (otherwise).  Else,
++ * if ds is in the scan queue, its entry is removed and, for a snapshot,
++ * replaced by one for ds_next_snap_obj with the same mintxg.  The scan
++ * state is synced out before returning.
++ */
+ void
+ dsl_scan_ds_destroyed(dsl_dataset_t *ds, dmu_tx_t *tx)
+ {
+ 	dsl_pool_t *dp = ds->ds_dir->dd_pool;
+ 	dsl_scan_t *scn = dp->dp_scan;
+ 	uint64_t mintxg;
+ 
+ 	if (scn->scn_phys.scn_state != DSS_SCANNING)
+ 		return;
+ 
+ 	if (scn->scn_phys.scn_bookmark.zb_objset == ds->ds_object) {
+ 		if (dsl_dataset_is_snapshot(ds)) {
+ 			/* Note, scn_cur_{min,max}_txg stays the same. */
+ 			scn->scn_phys.scn_bookmark.zb_objset =
+ 			    ds->ds_phys->ds_next_snap_obj;
+ 			zfs_dbgmsg("destroying ds %llu; currently traversing; "
+ 			    "reset zb_objset to %llu",
+ 			    (u_longlong_t)ds->ds_object,
+ 			    (u_longlong_t)ds->ds_phys->ds_next_snap_obj);
+ 			scn->scn_phys.scn_flags |= DSF_VISIT_DS_AGAIN;
+ 		} else {
+ 			SET_BOOKMARK(&scn->scn_phys.scn_bookmark,
+ 			    ZB_DESTROYED_OBJSET, 0, 0, 0);
+ 			zfs_dbgmsg("destroying ds %llu; currently traversing; "
+ 			    "reset bookmark to -1,0,0,0",
+ 			    (u_longlong_t)ds->ds_object);
+ 		}
+ 	} else if (zap_lookup_int_key(dp->dp_meta_objset,
+ 	    scn->scn_phys.scn_queue_obj, ds->ds_object, &mintxg) == 0) {
+ 		ASSERT3U(ds->ds_phys->ds_num_children, <=, 1);
+ 		VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset,
+ 		    scn->scn_phys.scn_queue_obj, ds->ds_object, tx));
+ 		if (dsl_dataset_is_snapshot(ds)) {
+ 			/*
+ 			 * We keep the same mintxg; it could be >
+ 			 * ds_creation_txg if the previous snapshot was
+ 			 * deleted too.
+ 			 */
+ 			VERIFY(zap_add_int_key(dp->dp_meta_objset,
+ 			    scn->scn_phys.scn_queue_obj,
+ 			    ds->ds_phys->ds_next_snap_obj, mintxg, tx) == 0);
+ 			zfs_dbgmsg("destroying ds %llu; in queue; "
+ 			    "replacing with %llu",
+ 			    (u_longlong_t)ds->ds_object,
+ 			    (u_longlong_t)ds->ds_phys->ds_next_snap_obj);
+ 		} else {
+ 			zfs_dbgmsg("destroying ds %llu; in queue; removing",
+ 			    (u_longlong_t)ds->ds_object);
+ 		}
+ 	} else {
+ 		zfs_dbgmsg("destroying ds %llu; ignoring",
+ 		    (u_longlong_t)ds->ds_object);
+ 	}
+ 
+ 	/*
+ 	 * dsl_scan_sync() should be called after this, and should sync
+ 	 * out our changed state, but just to be safe, do it here.
+ 	 */
+ 	dsl_scan_sync_state(scn, tx);
+ }
+ 
++/*
++ * Adjust in-progress scan state after 'ds' has been snapshotted; a no-op
++ * unless scn_state is DSS_SCANNING.  A scan bookmark or scan-queue entry
++ * that refers to ds is switched to ds_prev_snap_obj (a queue entry keeps
++ * its mintxg), and the scan state is synced out.
++ */
+ void
+ dsl_scan_ds_snapshotted(dsl_dataset_t *ds, dmu_tx_t *tx)
+ {
+ 	dsl_pool_t *dp = ds->ds_dir->dd_pool;
+ 	dsl_scan_t *scn = dp->dp_scan;
+ 	uint64_t mintxg;
+ 
+ 	if (scn->scn_phys.scn_state != DSS_SCANNING)
+ 		return;
+ 
+ 	ASSERT(ds->ds_phys->ds_prev_snap_obj != 0);
+ 
+ 	if (scn->scn_phys.scn_bookmark.zb_objset == ds->ds_object) {
+ 		scn->scn_phys.scn_bookmark.zb_objset =
+ 		    ds->ds_phys->ds_prev_snap_obj;
+ 		zfs_dbgmsg("snapshotting ds %llu; currently traversing; "
+ 		    "reset zb_objset to %llu",
+ 		    (u_longlong_t)ds->ds_object,
+ 		    (u_longlong_t)ds->ds_phys->ds_prev_snap_obj);
+ 	} else if (zap_lookup_int_key(dp->dp_meta_objset,
+ 	    scn->scn_phys.scn_queue_obj, ds->ds_object, &mintxg) == 0) {
+ 		VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset,
+ 		    scn->scn_phys.scn_queue_obj, ds->ds_object, tx));
+ 		VERIFY(zap_add_int_key(dp->dp_meta_objset,
+ 		    scn->scn_phys.scn_queue_obj,
+ 		    ds->ds_phys->ds_prev_snap_obj, mintxg, tx) == 0);
+ 		zfs_dbgmsg("snapshotting ds %llu; in queue; "
+ 		    "replacing with %llu",
+ 		    (u_longlong_t)ds->ds_object,
+ 		    (u_longlong_t)ds->ds_phys->ds_prev_snap_obj);
+ 	}
+ 	dsl_scan_sync_state(scn, tx);
+ }
+ 
++/*
++ * Adjust in-progress scan state after ds1 and ds2 have been swapped; a
++ * no-op unless scn_state is DSS_SCANNING.  A scan bookmark that refers to
++ * one dataset is pointed at the other.  A scan-queue entry for one is
++ * replaced by an entry for the other with the same mintxg; if both were
++ * already queued, both remain queued.  The scan state is then synced out.
++ */
+ void
+ dsl_scan_ds_clone_swapped(dsl_dataset_t *ds1, dsl_dataset_t *ds2, dmu_tx_t *tx)
+ {
+ 	dsl_pool_t *dp = ds1->ds_dir->dd_pool;
+ 	dsl_scan_t *scn = dp->dp_scan;
+ 	uint64_t mintxg;
+ 
+ 	if (scn->scn_phys.scn_state != DSS_SCANNING)
+ 		return;
+ 
+ 	if (scn->scn_phys.scn_bookmark.zb_objset == ds1->ds_object) {
+ 		scn->scn_phys.scn_bookmark.zb_objset = ds2->ds_object;
+ 		zfs_dbgmsg("clone_swap ds %llu; currently traversing; "
+ 		    "reset zb_objset to %llu",
+ 		    (u_longlong_t)ds1->ds_object,
+ 		    (u_longlong_t)ds2->ds_object);
+ 	} else if (scn->scn_phys.scn_bookmark.zb_objset == ds2->ds_object) {
+ 		scn->scn_phys.scn_bookmark.zb_objset = ds1->ds_object;
+ 		zfs_dbgmsg("clone_swap ds %llu; currently traversing; "
+ 		    "reset zb_objset to %llu",
+ 		    (u_longlong_t)ds2->ds_object,
+ 		    (u_longlong_t)ds1->ds_object);
+ 	}
+ 
+ 	if (zap_lookup_int_key(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj,
+ 	    ds1->ds_object, &mintxg) == 0) {
+ 		int err;
+ 
+ 		ASSERT3U(mintxg, ==, ds1->ds_phys->ds_prev_snap_txg);
+ 		ASSERT3U(mintxg, ==, ds2->ds_phys->ds_prev_snap_txg);
+ 		VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset,
+ 		    scn->scn_phys.scn_queue_obj, ds1->ds_object, tx));
+ 		err = zap_add_int_key(dp->dp_meta_objset,
+ 		    scn->scn_phys.scn_queue_obj, ds2->ds_object, mintxg, tx);
+ 		VERIFY(err == 0 || err == EEXIST);
+ 		if (err == EEXIST) {
+ 			/* Both were there to begin with */
+ 			VERIFY(0 == zap_add_int_key(dp->dp_meta_objset,
+ 			    scn->scn_phys.scn_queue_obj,
+ 			    ds1->ds_object, mintxg, tx));
+ 		}
+ 		zfs_dbgmsg("clone_swap ds %llu; in queue; "
+ 		    "replacing with %llu",
+ 		    (u_longlong_t)ds1->ds_object,
+ 		    (u_longlong_t)ds2->ds_object);
+ 	} else if (zap_lookup_int_key(dp->dp_meta_objset,
+ 	    scn->scn_phys.scn_queue_obj, ds2->ds_object, &mintxg) == 0) {
+ 		ASSERT3U(mintxg, ==, ds1->ds_phys->ds_prev_snap_txg);
+ 		ASSERT3U(mintxg, ==, ds2->ds_phys->ds_prev_snap_txg);
+ 		VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset,
+ 		    scn->scn_phys.scn_queue_obj, ds2->ds_object, tx));
+ 		VERIFY(0 == zap_add_int_key(dp->dp_meta_objset,
+ 		    scn->scn_phys.scn_queue_obj, ds1->ds_object, mintxg, tx));
+ 		zfs_dbgmsg("clone_swap ds %llu; in queue; "
+ 		    "replacing with %llu",
+ 		    (u_longlong_t)ds2->ds_object,
+ 		    (u_longlong_t)ds1->ds_object);
+ 	}
+ 
+ 	dsl_scan_sync_state(scn, tx);
+ }
+ 
++/* Argument handed to enqueue_clones_cb() through dmu_objset_find_spa(). */
+ struct enqueue_clones_arg {
+ 	dmu_tx_t *tx;		/* tx used when adding to the scan queue */
+ 	uint64_t originobj;	/* object number of the origin dataset */
+ };
+ 
++/*
++ * dmu_objset_find_spa() callback.  If dataset 'dsobj' lives in a dsl_dir
++ * whose dd_origin_obj is eca->originobj, walk back along ds_prev_snap_obj
++ * to the dataset whose previous snapshot is the origin, and add that
++ * dataset to the scan queue keyed with its ds_prev_snap_txg.  Returns 0,
++ * or the error from a failed dsl_dataset_hold_obj().
++ */
+ /* ARGSUSED */
+ static int
+ enqueue_clones_cb(spa_t *spa, uint64_t dsobj, const char *dsname, void *arg)
+ {
+ 	struct enqueue_clones_arg *eca = arg;
+ 	dsl_dataset_t *ds;
+ 	int err;
+ 	dsl_pool_t *dp = spa->spa_dsl_pool;
+ 	dsl_scan_t *scn = dp->dp_scan;
+ 
+ 	err = dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds);
+ 	if (err)
+ 		return (err);
+ 
+ 	if (ds->ds_dir->dd_phys->dd_origin_obj == eca->originobj) {
+ 		while (ds->ds_phys->ds_prev_snap_obj != eca->originobj) {
+ 			dsl_dataset_t *prev;
+ 			err = dsl_dataset_hold_obj(dp,
+ 			    ds->ds_phys->ds_prev_snap_obj, FTAG, &prev);
+ 
++			/* ds is released whether or not the hold worked */
+ 			dsl_dataset_rele(ds, FTAG);
+ 			if (err)
+ 				return (err);
+ 			ds = prev;
+ 		}
+ 		VERIFY(zap_add_int_key(dp->dp_meta_objset,
+ 		    scn->scn_phys.scn_queue_obj, ds->ds_object,
+ 		    ds->ds_phys->ds_prev_snap_txg, eca->tx) == 0);
+ 	}
+ 	dsl_dataset_rele(ds, FTAG);
+ 	return (0);
+ }
+ 
++/*
++ * Scan dataset 'dsobj' starting from its root block pointer.  If the
++ * pass finishes without pausing, either requeue the dataset (when
++ * DSF_VISIT_DS_AGAIN was set during the pass) or add its next snapshot
++ * and its clones to the scan queue.
++ */
+ static void
+ dsl_scan_visitds(dsl_scan_t *scn, uint64_t dsobj, dmu_tx_t *tx)
+ {
+ 	dsl_pool_t *dp = scn->scn_dp;
+ 	dsl_dataset_t *ds;
++	char *dsname;	/* declared up front: C90 forbids mixed declarations */
+ 
+ 	VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds));
+ 
+ 	/*
+ 	 * Iterate over the bps in this ds.
+ 	 */
+ 	dmu_buf_will_dirty(ds->ds_dbuf, tx);
+ 	dsl_scan_visit_rootbp(scn, ds, &ds->ds_phys->ds_bp, tx);
+ 
 -	char *dsname = kmem_alloc(ZFS_MAXNAMELEN, KM_SLEEP);
++	dsname = kmem_alloc(ZFS_MAXNAMELEN, KM_SLEEP);
+ 	dsl_dataset_name(ds, dsname);
+ 	zfs_dbgmsg("scanned dataset %llu (%s) with min=%llu max=%llu; "
+ 	    "pausing=%u",
+ 	    (longlong_t)dsobj, dsname,
+ 	    (longlong_t)scn->scn_phys.scn_cur_min_txg,
+ 	    (longlong_t)scn->scn_phys.scn_cur_max_txg,
+ 	    (int)scn->scn_pausing);
+ 	kmem_free(dsname, ZFS_MAXNAMELEN);
+ 
+ 	if (scn->scn_pausing)
+ 		goto out;
+ 
+ 	/*
+ 	 * We've finished this pass over this dataset.
+ 	 */
+ 
+ 	/*
+ 	 * If we did not completely visit this dataset, do another pass.
+ 	 */
+ 	if (scn->scn_phys.scn_flags & DSF_VISIT_DS_AGAIN) {
+ 		zfs_dbgmsg("incomplete pass; visiting again");
+ 		scn->scn_phys.scn_flags &= ~DSF_VISIT_DS_AGAIN;
+ 		VERIFY(zap_add_int_key(dp->dp_meta_objset,
+ 		    scn->scn_phys.scn_queue_obj, ds->ds_object,
+ 		    scn->scn_phys.scn_cur_max_txg, tx) == 0);
+ 		goto out;
+ 	}
+ 
+ 	/*
+ 	 * Add descendent datasets to work queue.
+ 	 */
+ 	if (ds->ds_phys->ds_next_snap_obj != 0) {
+ 		VERIFY(zap_add_int_key(dp->dp_meta_objset,
+ 		    scn->scn_phys.scn_queue_obj, ds->ds_phys->ds_next_snap_obj,
+ 		    ds->ds_phys->ds_creation_txg, tx) == 0);
+ 	}
+ 	if (ds->ds_phys->ds_num_children > 1) {
+ 		boolean_t usenext = B_FALSE;
+ 		if (ds->ds_phys->ds_next_clones_obj != 0) {
+ 			uint64_t count;
+ 			/*
+ 			 * A bug in a previous version of the code could
+ 			 * cause upgrade_clones_cb() to not set
+ 			 * ds_next_snap_obj when it should, leading to a
+ 			 * missing entry.  Therefore we can only use the
+ 			 * next_clones_obj when its count is correct.
+ 			 */
+ 			int err = zap_count(dp->dp_meta_objset,
+ 			    ds->ds_phys->ds_next_clones_obj, &count);
+ 			if (err == 0 &&
+ 			    count == ds->ds_phys->ds_num_children - 1)
+ 				usenext = B_TRUE;
+ 		}
+ 
+ 		if (usenext) {
+ 			VERIFY(zap_join_key(dp->dp_meta_objset,
+ 			    ds->ds_phys->ds_next_clones_obj,
+ 			    scn->scn_phys.scn_queue_obj,
+ 			    ds->ds_phys->ds_creation_txg, tx) == 0);
+ 		} else {
+ 			struct enqueue_clones_arg eca;
+ 			eca.tx = tx;
+ 			eca.originobj = ds->ds_object;
+ 
+ 			(void) dmu_objset_find_spa(ds->ds_dir->dd_pool->dp_spa,
+ 			    NULL, enqueue_clones_cb, &eca, DS_FIND_CHILDREN);
+ 		}
+ 	}
+ 
+ out:
+ 	dsl_dataset_rele(ds, FTAG);
+ }
+ 
++/*
++ * dmu_objset_find_spa() callback; 'arg' is the dmu_tx_t to use.  Walk back
++ * from dataset 'dsobj' along ds_prev_snap_obj to the dataset that has no
++ * previous snapshot and add it to the scan queue keyed with its
++ * ds_prev_snap_txg.  Nothing is queued if the walk reaches a snapshot
++ * whose ds_next_snap_obj is not the dataset we came from.  Returns 0, or
++ * the error from a failed dsl_dataset_hold_obj().
++ */
+ /* ARGSUSED */
+ static int
+ enqueue_cb(spa_t *spa, uint64_t dsobj, const char *dsname, void *arg)
+ {
+ 	dmu_tx_t *tx = arg;
+ 	dsl_dataset_t *ds;
+ 	int err;
+ 	dsl_pool_t *dp = spa->spa_dsl_pool;
+ 	dsl_scan_t *scn = dp->dp_scan;
+ 
+ 	err = dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds);
+ 	if (err)
+ 		return (err);
+ 
+ 	while (ds->ds_phys->ds_prev_snap_obj != 0) {
+ 		dsl_dataset_t *prev;
+ 		err = dsl_dataset_hold_obj(dp, ds->ds_phys->ds_prev_snap_obj,
+ 		    FTAG, &prev);
+ 		if (err) {
+ 			dsl_dataset_rele(ds, FTAG);
+ 			return (err);
+ 		}
+ 
+ 		/*
+ 		 * If this is a clone, we don't need to worry about it for now.
+ 		 */
+ 		if (prev->ds_phys->ds_next_snap_obj != ds->ds_object) {
+ 			dsl_dataset_rele(ds, FTAG);
+ 			dsl_dataset_rele(prev, FTAG);
+ 			return (0);
+ 		}
+ 		dsl_dataset_rele(ds, FTAG);
+ 		ds = prev;
+ 	}
+ 
+ 	VERIFY(zap_add_int_key(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj,
+ 	    ds->ds_object, ds->ds_phys->ds_prev_snap_txg, tx) == 0);
+ 	dsl_dataset_rele(ds, FTAG);
+ 	return (0);
+ }
+ 
+ /*
+  * Scrub/dedup interaction.
+  *
+  * If there are N references to a deduped block, we don't want to scrub it
+  * N times -- ideally, we should scrub it exactly once.
+  *
+  * We leverage the fact that the dde's replication class (enum ddt_class)
+  * is ordered from highest replication class (DDT_CLASS_DITTO) to lowest
+  * (DDT_CLASS_UNIQUE) so that we may walk the DDT in that order.
+  *
+  * To prevent excess scrubbing, the scrub begins by walking the DDT
+  * to find all blocks with refcnt > 1, and scrubs each of these once.
+  * Since there are two replication classes which contain blocks with
+  * refcnt > 1, we scrub the highest replication class (DDT_CLASS_DITTO) first.
+  * Finally the top-down scrub begins, only visiting blocks with refcnt == 1.
+  *
+  * There would be nothing more to say if a block's refcnt couldn't change
+  * during a scrub, but of course it can so we must account for changes
+  * in a block's replication class.
+  *
+  * Here's an example of what can occur:
+  *
+  * If a block has refcnt > 1 during the DDT scrub phase, but has refcnt == 1
+  * when visited during the top-down scrub phase, it will be scrubbed twice.
+  * This negates our scrub optimization, but is otherwise harmless.
+  *
+  * If a block has refcnt == 1 during the DDT scrub phase, but has refcnt > 1
+  * on each visit during the top-down scrub phase, it will never be scrubbed.
+  * To catch this, ddt_sync_entry() notifies the scrub code whenever a block's
+  * reference class transitions to a higher level (i.e DDT_CLASS_UNIQUE to
+  * DDT_CLASS_DUPLICATE); if it transitions from refcnt == 1 to refcnt > 1
+  * while a scrub is in progress, it scrubs the block right then.
+  */
++/*
++ * Walk the DDT from scn_phys.scn_ddt_bookmark, passing each entry to
++ * dsl_scan_ddt_entry(), until ddt_walk() returns an error, the bookmark's
++ * class exceeds scn_ddt_class_max, or dsl_scan_check_pause() says stop.
++ */
+ static void
+ dsl_scan_ddt(dsl_scan_t *scn, dmu_tx_t *tx)
+ {
+ 	ddt_bookmark_t *ddb = &scn->scn_phys.scn_ddt_bookmark;
+ 	ddt_entry_t dde = { 0 };
+ 	int error;
++	/* number of entries handed to dsl_scan_ddt_entry(), for the dbgmsg */
+ 	uint64_t n = 0;
+ 
+ 	while ((error = ddt_walk(scn->scn_dp->dp_spa, ddb, &dde)) == 0) {
+ 		ddt_t *ddt;
+ 
+ 		if (ddb->ddb_class > scn->scn_phys.scn_ddt_class_max)
+ 			break;
+ 		dprintf("visiting ddb=%llu/%llu/%llu/%llx\n",
+ 		    (longlong_t)ddb->ddb_class,
+ 		    (longlong_t)ddb->ddb_type,
+ 		    (longlong_t)ddb->ddb_checksum,
+ 		    (longlong_t)ddb->ddb_cursor);
+ 
+ 		/* There should be no pending changes to the dedup table */
+ 		ddt = scn->scn_dp->dp_spa->spa_ddt[ddb->ddb_checksum];
+ 		ASSERT(avl_first(&ddt->ddt_tree) == NULL);
+ 
+ 		dsl_scan_ddt_entry(scn, ddb->ddb_checksum, &dde, tx);
+ 		n++;
+ 
+ 		if (dsl_scan_check_pause(scn, NULL))
+ 			break;
+ 	}
+ 
+ 	zfs_dbgmsg("scanned %llu ddt entries with class_max = %u; pausing=%u",
+ 	    (longlong_t)n, (int)scn->scn_phys.scn_ddt_class_max,
+ 	    (int)scn->scn_pausing);
+ 
+ 	ASSERT(error == 0 || error == ENOENT);
+ 	ASSERT(error != ENOENT ||
+ 	    ddb->ddb_class > scn->scn_phys.scn_ddt_class_max);
+ }
+ 
++/*
++ * Run the current scan function on each of the DDT_PHYS_TYPES physical
++ * variants of 'dde' whose ddp_phys_birth is nonzero and no later than
++ * scn_cur_max_txg.  Does nothing unless scn_state is DSS_SCANNING.  The
++ * bookmark passed to the scan function is all zeroes.
++ */
+ /* ARGSUSED */
+ void
+ dsl_scan_ddt_entry(dsl_scan_t *scn, enum zio_checksum checksum,
+     ddt_entry_t *dde, dmu_tx_t *tx)
+ {
+ 	const ddt_key_t *ddk = &dde->dde_key;
+ 	ddt_phys_t *ddp = dde->dde_phys;
+ 	blkptr_t bp;
+ 	zbookmark_t zb = { 0 };
++	int p;
+ 
+ 	if (scn->scn_phys.scn_state != DSS_SCANNING)
+ 		return;
+ 
 -	for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
++	for (p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
+ 		if (ddp->ddp_phys_birth == 0 ||
+ 		    ddp->ddp_phys_birth > scn->scn_phys.scn_cur_max_txg)
+ 			continue;
++		/* rebuild a block pointer from the DDT key and this variant */
+ 		ddt_bp_create(checksum, ddk, ddp, &bp);
+ 
+ 		scn->scn_visited_this_txg++;
+ 		scan_funcs[scn->scn_phys.scn_func](scn->scn_dp, &bp, &zb);
+ 	}
+ }
+ 
++/*
++ * Do the scanning work for one sync: first the DDT phase (while the DDT
++ * bookmark's class is within scn_ddt_class_max), then either the MOS
++ * (when the saved bookmark's objset is DMU_META_OBJSET) or the dataset
++ * named by the saved bookmark, and finally datasets pulled one at a time
++ * from the scn_queue_obj zap.  Returns early whenever scn_pausing is set.
++ */
+ static void
+ dsl_scan_visit(dsl_scan_t *scn, dmu_tx_t *tx)
+ {
+ 	dsl_pool_t *dp = scn->scn_dp;
+ 	zap_cursor_t zc;
+ 	zap_attribute_t za;
+ 
+ 	if (scn->scn_phys.scn_ddt_bookmark.ddb_class <=
+ 	    scn->scn_phys.scn_ddt_class_max) {
+ 		scn->scn_phys.scn_cur_min_txg = scn->scn_phys.scn_min_txg;
+ 		scn->scn_phys.scn_cur_max_txg = scn->scn_phys.scn_max_txg;
+ 		dsl_scan_ddt(scn, tx);
+ 		if (scn->scn_pausing)
+ 			return;
+ 	}
+ 
+ 	if (scn->scn_phys.scn_bookmark.zb_objset == DMU_META_OBJSET) {
+ 		/* First do the MOS & ORIGIN */
+ 
+ 		scn->scn_phys.scn_cur_min_txg = scn->scn_phys.scn_min_txg;
+ 		scn->scn_phys.scn_cur_max_txg = scn->scn_phys.scn_max_txg;
+ 		dsl_scan_visit_rootbp(scn, NULL,
+ 		    &dp->dp_meta_rootbp, tx);
+ 		spa_set_rootblkptr(dp->dp_spa, &dp->dp_meta_rootbp);
+ 		if (scn->scn_pausing)
+ 			return;
+ 
+ 		if (spa_version(dp->dp_spa) < SPA_VERSION_DSL_SCRUB) {
+ 			VERIFY(0 == dmu_objset_find_spa(dp->dp_spa,
+ 			    NULL, enqueue_cb, tx, DS_FIND_CHILDREN));
+ 		} else {
+ 			dsl_scan_visitds(scn,
+ 			    dp->dp_origin_snap->ds_object, tx);
+ 		}
+ 		ASSERT(!scn->scn_pausing);
+ 	} else if (scn->scn_phys.scn_bookmark.zb_objset !=
+ 	    ZB_DESTROYED_OBJSET) {
+ 		/*
+ 		 * If we were paused, continue from here.  Note if the
+ 		 * ds we were paused on was deleted, the zb_objset may
+ 		 * be -1, so we will skip this and find a new objset
+ 		 * below.
+ 		 */
+ 		dsl_scan_visitds(scn, scn->scn_phys.scn_bookmark.zb_objset, tx);
+ 		if (scn->scn_pausing)
+ 			return;
+ 	}
+ 
+ 	/*
+ 	 * In case we were paused right at the end of the ds, zero the
+ 	 * bookmark so we don't think that we're still trying to resume.
+ 	 */
+ 	bzero(&scn->scn_phys.scn_bookmark, sizeof (zbookmark_t));
+ 
+ 	/* keep pulling things out of the zap-object-as-queue */
++	/*
++	 * The comma expression re-initializes the cursor on every iteration;
++	 * the entry just retrieved is removed from the zap in the loop body
++	 * and the cursor is finished before the next pass.
++	 */
+ 	while (zap_cursor_init(&zc, dp->dp_meta_objset,
+ 	    scn->scn_phys.scn_queue_obj),
+ 	    zap_cursor_retrieve(&zc, &za) == 0) {
+ 		dsl_dataset_t *ds;
+ 		uint64_t dsobj;
+ 
+ 		dsobj = strtonum(za.za_name, NULL);
+ 		VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset,
+ 		    scn->scn_phys.scn_queue_obj, dsobj, tx));
+ 
+ 		/* Set up min/max txg */
+ 		VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds));
+ 		if (za.za_first_integer != 0) {
+ 			scn->scn_phys.scn_cur_min_txg =
+ 			    MAX(scn->scn_phys.scn_min_txg,
+ 			    za.za_first_integer);
+ 		} else {
+ 			scn->scn_phys.scn_cur_min_txg =
+ 			    MAX(scn->scn_phys.scn_min_txg,
+ 			    ds->ds_phys->ds_prev_snap_txg);
+ 		}
+ 		scn->scn_phys.scn_cur_max_txg = dsl_scan_ds_maxtxg(ds);
+ 		dsl_dataset_rele(ds, FTAG);
+ 
+ 		dsl_scan_visitds(scn, dsobj, tx);
+ 		zap_cursor_fini(&zc);
+ 		if (scn->scn_pausing)
+ 			return;
+ 	}
+ 	zap_cursor_fini(&zc);
+ }
+ 
++/*
++ * bpobj_iterate() callback used by dsl_scan_sync() to free one block;
++ * 'arg' is the dsl_scan_t.  Returns ERESTART, without freeing, once the
++ * time since scn_sync_start_time exceeds zfs_txg_timeout, or exceeds
++ * zfs_free_min_time_ms while txg_sync_waiting() is true, or when the pool
++ * is shutting down.  Otherwise it issues the free under scn_zio_root,
++ * charges the space back to dp_free_dir, and returns 0.
++ */
+ static int
+ dsl_scan_free_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
+ {
+ 	dsl_scan_t *scn = arg;
+ 	uint64_t elapsed_nanosecs;
+ 
+ 	elapsed_nanosecs = gethrtime() - scn->scn_sync_start_time;
+ 
+ 	if (elapsed_nanosecs / NANOSEC > zfs_txg_timeout ||
+ 	    (elapsed_nanosecs / MICROSEC > zfs_free_min_time_ms &&
+ 	    txg_sync_waiting(scn->scn_dp)) ||
+ 	    spa_shutting_down(scn->scn_dp->dp_spa))
+ 		return (ERESTART);
+ 
+ 	zio_nowait(zio_free_sync(scn->scn_zio_root, scn->scn_dp->dp_spa,
+ 	    dmu_tx_get_txg(tx), bp, 0));
+ 	dsl_dir_diduse_space(tx->tx_pool->dp_free_dir, DD_USED_HEAD,
+ 	    -bp_get_dsize_sync(scn->scn_dp->dp_spa, bp),
+ 	    -BP_GET_PSIZE(bp), -BP_GET_UCSIZE(bp), tx);
+ 	scn->scn_visited_this_txg++;
+ 	return (0);
+ }
+ 
++/*
++ * Return B_TRUE if there is scan work pending: the scan is in the
++ * DSS_SCANNING state or, on pools at SPA_VERSION_DEADLISTS or later,
++ * dp_free_bpobj reports nonzero used space.  Always B_FALSE while
++ * spa_load_state is not SPA_LOAD_NONE or the pool is shutting down.
++ */
+ boolean_t
+ dsl_scan_active(dsl_scan_t *scn)
+ {
+ 	spa_t *spa = scn->scn_dp->dp_spa;
+ 	uint64_t used = 0, comp, uncomp;
+ 
+ 	if (spa->spa_load_state != SPA_LOAD_NONE)
+ 		return (B_FALSE);
+ 	if (spa_shutting_down(spa))
+ 		return (B_FALSE);
+ 
+ 	if (scn->scn_phys.scn_state == DSS_SCANNING)
+ 		return (B_TRUE);
+ 
+ 	if (spa_version(scn->scn_dp->dp_spa) >= SPA_VERSION_DEADLISTS) {
+ 		(void) bpobj_space(&scn->scn_dp->dp_free_bpobj,
+ 		    &used, &comp, &uncomp);
+ 	}
+ 	return (used != 0);
+ }
+ 
++/*
++ * Per-txg scan work.  Restarts the scan if scn_restart_txg has been
++ * reached; then, if dsl_scan_active() and this is the first sync pass,
++ * processes the free bpobj (on SPA_VERSION_DEADLISTS pools) and, when a
++ * scan is in progress and the free did not have to pause, runs
++ * dsl_scan_visit() for this txg.  The scan is finished via
++ * dsl_scan_done() if the visit did not pause, and the scan state is
++ * synced out.
++ */
+ void
+ dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx)
+ {
+ 	dsl_scan_t *scn = dp->dp_scan;
+ 	spa_t *spa = dp->dp_spa;
+ 	int err;
+ 
+ 	/*
+ 	 * Check for scn_restart_txg before checking spa_load_state, so
+ 	 * that we can restart an old-style scan while the pool is being
+ 	 * imported (see dsl_scan_init).
+ 	 */
+ 	if (scn->scn_restart_txg != 0 &&
+ 	    scn->scn_restart_txg <= tx->tx_txg) {
+ 		pool_scan_func_t func = POOL_SCAN_SCRUB;
+ 		dsl_scan_done(scn, B_FALSE, tx);
+ 		if (vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL))
+ 			func = POOL_SCAN_RESILVER;
+ 		zfs_dbgmsg("restarting scan func=%u txg=%llu",
 -		    func, tx->tx_txg);
++		    func, (longlong_t)tx->tx_txg);
+ 		dsl_scan_setup_sync(scn, &func, tx);
+ 	}
+ 
+ 
+ 	if (!dsl_scan_active(scn) ||
+ 	    spa_sync_pass(dp->dp_spa) > 1)
+ 		return;
+ 
+ 	scn->scn_visited_this_txg = 0;
+ 	scn->scn_pausing = B_FALSE;
+ 	scn->scn_sync_start_time = gethrtime();
+ 	spa->spa_scrub_active = B_TRUE;
+ 
+ 	/*
+ 	 * First process the free list.  If we pause the free, don't do
+ 	 * any scanning.  This ensures that there is no free list when
+ 	 * we are scanning, so the scan code doesn't have to worry about
+ 	 * traversing it.
+ 	 */
+ 	if (spa_version(dp->dp_spa) >= SPA_VERSION_DEADLISTS) {
+ 		scn->scn_zio_root = zio_root(dp->dp_spa, NULL,
+ 		    NULL, ZIO_FLAG_MUSTSUCCEED);
+ 		err = bpobj_iterate(&dp->dp_free_bpobj,
+ 		    dsl_scan_free_cb, scn, tx);
+ 		VERIFY3U(0, ==, zio_wait(scn->scn_zio_root));
+ 		if (scn->scn_visited_this_txg) {
+ 			zfs_dbgmsg("freed %llu blocks in %llums from "
+ 			    "free_bpobj txg %llu",
+ 			    (longlong_t)scn->scn_visited_this_txg,
+ 			    (longlong_t)
+ 			    (gethrtime() - scn->scn_sync_start_time) / MICROSEC,
+ 			    (longlong_t)tx->tx_txg);
+ 			scn->scn_visited_this_txg = 0;
+ 			/*
+ 			 * Re-sync the ddt so that we can further modify
+ 			 * it when doing bprewrite.
+ 			 */
+ 			ddt_sync(spa, tx->tx_txg);
+ 		}
+ 		if (err == ERESTART)
+ 			return;
+ 	}
+ 
+ 	if (scn->scn_phys.scn_state != DSS_SCANNING)
+ 		return;
+ 
+ 
+ 	if (scn->scn_phys.scn_ddt_bookmark.ddb_class <=
+ 	    scn->scn_phys.scn_ddt_class_max) {
+ 		zfs_dbgmsg("doing scan sync txg %llu; "
+ 		    "ddt bm=%llu/%llu/%llu/%llx",
+ 		    (longlong_t)tx->tx_txg,
+ 		    (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_class,
+ 		    (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_type,
+ 		    (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_checksum,
+ 		    (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_cursor);
+ 		ASSERT(scn->scn_phys.scn_bookmark.zb_objset == 0);
+ 		ASSERT(scn->scn_phys.scn_bookmark.zb_object == 0);
+ 		ASSERT(scn->scn_phys.scn_bookmark.zb_level == 0);
+ 		ASSERT(scn->scn_phys.scn_bookmark.zb_blkid == 0);
+ 	} else {
+ 		zfs_dbgmsg("doing scan sync txg %llu; bm=%llu/%llu/%llu/%llu",
+ 		    (longlong_t)tx->tx_txg,
+ 		    (longlong_t)scn->scn_phys.scn_bookmark.zb_objset,
+ 		    (longlong_t)scn->scn_phys.scn_bookmark.zb_object,
+ 		    (longlong_t)scn->scn_phys.scn_bookmark.zb_level,
+ 		    (longlong_t)scn->scn_phys.scn_bookmark.zb_blkid);
+ 	}
+ 
+ 	scn->scn_zio_root = zio_root(dp->dp_spa, NULL,
+ 	    NULL, ZIO_FLAG_CANFAIL);
+ 	dsl_scan_visit(scn, tx);
+ 	(void) zio_wait(scn->scn_zio_root);
+ 	scn->scn_zio_root = NULL;
+ 
+ 	zfs_dbgmsg("visited %llu blocks in %llums",
+ 	    (longlong_t)scn->scn_visited_this_txg,
+ 	    (longlong_t)(gethrtime() - scn->scn_sync_start_time) / MICROSEC);
+ 
+ 	if (!scn->scn_pausing) {
+ 		/* finished with scan. */
+ 		zfs_dbgmsg("finished scan txg %llu", (longlong_t)tx->tx_txg);
+ 		dsl_scan_done(scn, B_TRUE, tx);
+ 	}
+ 
+ 	if (DSL_SCAN_IS_SCRUB_RESILVER(scn)) {
++		/* wait for every outstanding scrub i/o to complete */
+ 		mutex_enter(&spa->spa_scrub_lock);
+ 		while (spa->spa_scrub_inflight > 0) {
+ 			cv_wait(&spa->spa_scrub_io_cv,
+ 			    &spa->spa_scrub_lock);
+ 		}
+ 		mutex_exit(&spa->spa_scrub_lock);
+ 	}
+ 
+ 	dsl_scan_sync_state(scn, tx);
+ }
+ 
+ /*
+  * This will start a new scan, or restart an existing one.
++ *
++ * The request is recorded by storing 'txg' in scn_restart_txg.  When txg
++ * is 0, a tx is created and assigned (TXG_WAIT) and its txg is used
++ * instead.
+  */
+ void
+ dsl_resilver_restart(dsl_pool_t *dp, uint64_t txg)
+ {
+ 	if (txg == 0) {
+ 		dmu_tx_t *tx;
+ 		tx = dmu_tx_create_dd(dp->dp_mos_dir);
+ 		VERIFY(0 == dmu_tx_assign(tx, TXG_WAIT));
+ 
+ 		txg = dmu_tx_get_txg(tx);
+ 		dp->dp_scan->scn_restart_txg = txg;
+ 		dmu_tx_commit(tx);
+ 	} else {
+ 		dp->dp_scan->scn_restart_txg = txg;
+ 	}
++	/* cast so the uint64_t argument matches the %llu conversion */
 -	zfs_dbgmsg("restarting resilver txg=%llu", txg);
++	zfs_dbgmsg("restarting resilver txg=%llu", (longlong_t)txg);
+ }
+ 
++/*
++ * Return B_TRUE if the pool's scan is in the DSS_SCANNING state and its
++ * scan function is POOL_SCAN_RESILVER.
++ */
+ boolean_t
+ dsl_scan_resilvering(dsl_pool_t *dp)
+ {
+ 	return (dp->dp_scan->scn_phys.scn_state == DSS_SCANNING &&
+ 	    dp->dp_scan->scn_phys.scn_func == POOL_SCAN_RESILVER);
+ }
+ 
+ /*
+  * scrub consumers
+  */
+ 
++/*
++ * Add 'bp' to the block statistics in 'zab'.  Each block is counted in
++ * four buckets: row = its own level or DN_MAX_LEVELS, crossed with
++ * column = its own type or DMU_OT_TOTAL.  For blocks with 2 or 3 DVAs it
++ * also counts how many of the copies report the same DVA_GET_VDEV().
++ */
+ static void
+ count_block(zfs_all_blkstats_t *zab, const blkptr_t *bp)
+ {
+ 	int i;
+ 
+ 	/*
+ 	 * If we resume after a reboot, zab will be NULL; don't record
+ 	 * incomplete stats in that case.
+ 	 */
+ 	if (zab == NULL)
+ 		return;
+ 
+ 	for (i = 0; i < 4; i++) {
+ 		int l = (i < 2) ? BP_GET_LEVEL(bp) : DN_MAX_LEVELS;
+ 		int t = (i & 1) ? BP_GET_TYPE(bp) : DMU_OT_TOTAL;
+ 		zfs_blkstat_t *zb = &zab->zab_type[l][t];
+ 		int equal;
+ 
+ 		zb->zb_count++;
+ 		zb->zb_asize += BP_GET_ASIZE(bp);
+ 		zb->zb_lsize += BP_GET_LSIZE(bp);
+ 		zb->zb_psize += BP_GET_PSIZE(bp);
+ 		zb->zb_gangs += BP_COUNT_GANG(bp);
+ 
+ 		switch (BP_GET_NDVAS(bp)) {
+ 		case 2:
+ 			if (DVA_GET_VDEV(&bp->blk_dva[0]) ==
+ 			    DVA_GET_VDEV(&bp->blk_dva[1]))
+ 				zb->zb_ditto_2_of_2_samevdev++;
+ 			break;
+ 		case 3:
++			/* number of DVA pairs (0-1, 0-2, 1-2) on one vdev */
+ 			equal = (DVA_GET_VDEV(&bp->blk_dva[0]) ==
+ 			    DVA_GET_VDEV(&bp->blk_dva[1])) +
+ 			    (DVA_GET_VDEV(&bp->blk_dva[0]) ==
+ 			    DVA_GET_VDEV(&bp->blk_dva[2])) +
+ 			    (DVA_GET_VDEV(&bp->blk_dva[1]) ==
+ 			    DVA_GET_VDEV(&bp->blk_dva[2]));
+ 			if (equal == 1)
+ 				zb->zb_ditto_2_of_3_samevdev++;
+ 			else if (equal == 3)
+ 				zb->zb_ditto_3_of_3_samevdev++;
+ 			break;
+ 		}
+ 	}
+ }
+ 
++/*
++ * Completion callback for the reads issued by dsl_scan_scrub_cb(): frees
++ * the data buffer, decrements spa_scrub_inflight and wakes any waiters,
++ * and counts the i/o error in scn_errors unless it is an ECKSUM on a
++ * ZIO_FLAG_SPECULATIVE i/o.
++ */
+ static void
+ dsl_scan_scrub_done(zio_t *zio)
+ {
+ 	spa_t *spa = zio->io_spa;
+ 
+ 	zio_data_buf_free(zio->io_data, zio->io_size);
+ 
+ 	mutex_enter(&spa->spa_scrub_lock);
+ 	spa->spa_scrub_inflight--;
+ 	cv_broadcast(&spa->spa_scrub_io_cv);
+ 
+ 	if (zio->io_error && (zio->io_error != ECKSUM ||
+ 	    !(zio->io_flags & ZIO_FLAG_SPECULATIVE))) {
+ 		spa->spa_dsl_pool->dp_scan->scn_phys.scn_errors++;
+ 	}
+ 	mutex_exit(&spa->spa_scrub_lock);
+ }
+ 
++/*
++ * Scan function for scrub and resilver.  Blocks whose physical birth is
++ * outside (scn_min_txg, scn_max_txg) are ignored.  Otherwise the block is
++ * added to the block statistics and the examined-bytes counters, and a
++ * raw read is issued for it: always for a scrub; for a resilver only if
++ * one of its DVAs is a gang block or lies on a vdev whose DTL_PARTIAL
++ * contains the birth txg.  The number of reads in flight is capped at
++ * spa_scrub_maxinflight.  Always returns 0.
++ */
+ static int
+ dsl_scan_scrub_cb(dsl_pool_t *dp,
+     const blkptr_t *bp, const zbookmark_t *zb)
+ {
+ 	dsl_scan_t *scn = dp->dp_scan;
+ 	size_t size = BP_GET_PSIZE(bp);
+ 	spa_t *spa = dp->dp_spa;
+ 	uint64_t phys_birth = BP_PHYSICAL_BIRTH(bp);
++	/*
++	 * needs_io and zio_priority get defaults because the ASSERT below
++	 * compiles away in non-debug builds; without them an unexpected
++	 * scn_func would leave both uninitialized.
++	 */
 -	boolean_t needs_io;
++	boolean_t needs_io = B_FALSE;
+ 	int zio_flags = ZIO_FLAG_SCRUB_THREAD | ZIO_FLAG_RAW | ZIO_FLAG_CANFAIL;
 -	int zio_priority;
++	int zio_priority = ZIO_PRIORITY_SCRUB;
++	int d;
+ 
+ 	if (phys_birth <= scn->scn_phys.scn_min_txg ||
+ 	    phys_birth >= scn->scn_phys.scn_max_txg)
+ 		return (0);
+ 
+ 	count_block(dp->dp_blkstats, bp);
+ 
+ 	ASSERT(DSL_SCAN_IS_SCRUB_RESILVER(scn));
+ 	if (scn->scn_phys.scn_func == POOL_SCAN_SCRUB) {
+ 		zio_flags |= ZIO_FLAG_SCRUB;
+ 		zio_priority = ZIO_PRIORITY_SCRUB;
+ 		needs_io = B_TRUE;
+ 	} else if (scn->scn_phys.scn_func == POOL_SCAN_RESILVER) {
+ 		zio_flags |= ZIO_FLAG_RESILVER;
+ 		zio_priority = ZIO_PRIORITY_RESILVER;
+ 		needs_io = B_FALSE;
+ 	}
+ 
+ 	/* If it's an intent log block, failure is expected. */
+ 	if (zb->zb_level == ZB_ZIL_LEVEL)
+ 		zio_flags |= ZIO_FLAG_SPECULATIVE;
+ 
 -	for (int d = 0; d < BP_GET_NDVAS(bp); d++) {
++	for (d = 0; d < BP_GET_NDVAS(bp); d++) {
+ 		vdev_t *vd = vdev_lookup_top(spa,
+ 		    DVA_GET_VDEV(&bp->blk_dva[d]));
+ 
+ 		/*
+ 		 * Keep track of how much data we've examined so that
+ 		 * zpool(1M) status can make useful progress reports.
+ 		 */
+ 		scn->scn_phys.scn_examined += DVA_GET_ASIZE(&bp->blk_dva[d]);
+ 		spa->spa_scan_pass_exam += DVA_GET_ASIZE(&bp->blk_dva[d]);
+ 
+ 		/* if it's a resilver, this may not be in the target range */
+ 		if (!needs_io) {
+ 			if (DVA_GET_GANG(&bp->blk_dva[d])) {
+ 				/*
+ 				 * Gang members may be spread across multiple
+ 				 * vdevs, so the best estimate we have is the
+ 				 * scrub range, which has already been checked.
+ 				 * XXX -- it would be better to change our
+ 				 * allocation policy to ensure that all
+ 				 * gang members reside on the same vdev.
+ 				 */
+ 				needs_io = B_TRUE;
+ 			} else {
+ 				needs_io = vdev_dtl_contains(vd, DTL_PARTIAL,
+ 				    phys_birth, 1);
+ 			}
+ 		}
+ 	}
+ 
+ 	if (needs_io && !zfs_no_scrub_io) {
+ 		void *data = zio_data_buf_alloc(size);
+ 
++		/* throttle: wait until an in-flight slot is available */
+ 		mutex_enter(&spa->spa_scrub_lock);
+ 		while (spa->spa_scrub_inflight >= spa->spa_scrub_maxinflight)
+ 			cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock);
+ 		spa->spa_scrub_inflight++;
+ 		mutex_exit(&spa->spa_scrub_lock);
+ 
+ 		zio_nowait(zio_read(NULL, spa, bp, data, size,
+ 		    dsl_scan_scrub_done, NULL, zio_priority,
+ 		    zio_flags, zb));
+ 	}
+ 
+ 	/* do not relocate this block */
+ 	return (0);
+ }
+ 
++/*
++ * Start a scan of type 'func' on pool 'dp': reopen the vdev tree, then
++ * run dsl_scan_setup_check()/dsl_scan_setup_sync() as a sync task and
++ * return that task's result.
++ */
+ int
+ dsl_scan(dsl_pool_t *dp, pool_scan_func_t func)
+ {
+ 	spa_t *spa = dp->dp_spa;
+ 
+ 	/*
+ 	 * Purge all vdev caches and probe all devices.  We do this here
+ 	 * rather than in sync context because this requires a writer lock
+ 	 * on the spa_config lock, which we can't do from sync context.  The
+ 	 * spa_scrub_reopen flag indicates that vdev_open() should not
+ 	 * attempt to start another scrub.
+ 	 */
+ 	spa_vdev_state_enter(spa, SCL_NONE);
+ 	spa->spa_scrub_reopen = B_TRUE;
+ 	vdev_reopen(spa->spa_root_vdev);
+ 	spa->spa_scrub_reopen = B_FALSE;
+ 	(void) spa_vdev_state_exit(spa, NULL, 0);
+ 
+ 	return (dsl_sync_task_do(dp, dsl_scan_setup_check,
+ 	    dsl_scan_setup_sync, dp->dp_scan, &func, 0));
+ }
diff --cc module/zfs/include/sys/spa.h
index 0a4d55097,41a40300e..86fe01553
--- a/module/zfs/include/sys/spa.h
+++ b/module/zfs/include/sys/spa.h
@@@ -309,6 -338,66 +338,67 @@@ typedef struct blkptr 
  
  #define	BP_SPRINTF_LEN	320
  
+ /*
+  * This macro allows code sharing between zfs, libzpool, and mdb.
+  * 'func' is either snprintf() or mdb_snprintf().
+  * 'ws' (whitespace) can be ' ' for single-line format, '\n' for multi-line.
+  */
+ #define	SPRINTF_BLKPTR(func, ws, buf, bp, type, checksum, compress)	\
+ {									\
+ 	static const char *copyname[] =					\
+ 	    { "zero", "single", "double", "triple" };			\
+ 	int size = BP_SPRINTF_LEN;					\
+ 	int len = 0;							\
+ 	int copies = 0;							\
++	int d;								\
+ 									\
+ 	if (bp == NULL) {						\
+ 		len = func(buf + len, size - len, "<NULL>");		\
+ 	} else if (BP_IS_HOLE(bp)) {					\
+ 		len = func(buf + len, size - len, "<hole>");		\
+ 	} else {							\
 -		for (int d = 0; d < BP_GET_NDVAS(bp); d++) {		\
++		for (d = 0; d < BP_GET_NDVAS(bp); d++) {		\
+ 			const dva_t *dva = &bp->blk_dva[d];		\
+ 			if (DVA_IS_VALID(dva))				\
+ 				copies++;				\
+ 			len += func(buf + len, size - len,		\
+ 			    "DVA[%d]=<%llu:%llx:%llx>%c", d,		\
+ 			    (u_longlong_t)DVA_GET_VDEV(dva),		\
+ 			    (u_longlong_t)DVA_GET_OFFSET(dva),		\
+ 			    (u_longlong_t)DVA_GET_ASIZE(dva),		\
+ 			    ws);					\
+ 		}							\
+ 		if (BP_IS_GANG(bp) &&					\
+ 		    DVA_GET_ASIZE(&bp->blk_dva[2]) <=			\
+ 		    DVA_GET_ASIZE(&bp->blk_dva[1]) / 2)			\
+ 			copies--;					\
+ 		len += func(buf + len, size - len,			\
+ 		    "[L%llu %s] %s %s %s %s %s %s%c"			\
+ 		    "size=%llxL/%llxP birth=%lluL/%lluP fill=%llu%c"	\
+ 		    "cksum=%llx:%llx:%llx:%llx",			\
+ 		    (u_longlong_t)BP_GET_LEVEL(bp),			\
+ 		    type,						\
+ 		    checksum,						\
+ 		    compress,						\
+ 		    BP_GET_BYTEORDER(bp) == 0 ? "BE" : "LE",		\
+ 		    BP_IS_GANG(bp) ? "gang" : "contiguous",		\
+ 		    BP_GET_DEDUP(bp) ? "dedup" : "unique",		\
+ 		    copyname[copies],					\
+ 		    ws,							\
+ 		    (u_longlong_t)BP_GET_LSIZE(bp),			\
+ 		    (u_longlong_t)BP_GET_PSIZE(bp),			\
+ 		    (u_longlong_t)bp->blk_birth,			\
+ 		    (u_longlong_t)BP_PHYSICAL_BIRTH(bp),		\
+ 		    (u_longlong_t)bp->blk_fill,				\
+ 		    ws,							\
+ 		    (u_longlong_t)bp->blk_cksum.zc_word[0],		\
+ 		    (u_longlong_t)bp->blk_cksum.zc_word[1],		\
+ 		    (u_longlong_t)bp->blk_cksum.zc_word[2],		\
+ 		    (u_longlong_t)bp->blk_cksum.zc_word[3]);		\
+ 	}								\
+ 	ASSERT(len < size);						\
+ }
+ 
  #include <sys/dmu.h>
  
  #define	BP_GET_BUFC_TYPE(bp)						\
diff --cc module/zfs/metaslab.c
index 987617ffe,17b4b12c4..1722a53fc
--- a/module/zfs/metaslab.c
+++ b/module/zfs/metaslab.c
@@@ -495,10 -730,9 +730,10 @@@ voi
  metaslab_fini(metaslab_t *msp)
  {
  	metaslab_group_t *mg = msp->ms_group;
 +	int t;
  
- 	vdev_space_update(mg->mg_vd, -msp->ms_map.sm_size,
- 	    -msp->ms_smo.smo_alloc, B_TRUE);
+ 	vdev_space_update(mg->mg_vd,
+ 	    -msp->ms_smo.smo_alloc, 0, -msp->ms_map.sm_size);
  
  	metaslab_group_remove(mg, msp);
  
@@@ -512,6 -746,11 +747,11 @@@
  		space_map_destroy(&msp->ms_freemap[t]);
  	}
  
 -	for (int t = 0; t < TXG_DEFER_SIZE; t++)
++	for (t = 0; t < TXG_DEFER_SIZE; t++)
+ 		space_map_destroy(&msp->ms_defermap[t]);
+ 
+ 	ASSERT3S(msp->ms_deferspace, ==, 0);
+ 
  	mutex_exit(&msp->ms_lock);
  	mutex_destroy(&msp->ms_lock);
  
@@@ -574,17 -846,35 +847,36 @@@ metaslab_prefetch(metaslab_group_t *mg
  static int
  metaslab_activate(metaslab_t *msp, uint64_t activation_weight, uint64_t size)
  {
+ 	metaslab_group_t *mg = msp->ms_group;
  	space_map_t *sm = &msp->ms_map;
  	space_map_ops_t *sm_ops = msp->ms_group->mg_class->mc_ops;
++	int t;
  
  	ASSERT(MUTEX_HELD(&msp->ms_lock));
  
  	if ((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0) {
- 		int error = space_map_load(sm, sm_ops, SM_FREE, &msp->ms_smo,
- 		    msp->ms_group->mg_vd->vdev_spa->spa_meta_objset);
- 		if (error) {
- 			metaslab_group_sort(msp->ms_group, msp, 0);
- 			return (error);
+ 		space_map_load_wait(sm);
+ 		if (!sm->sm_loaded) {
+ 			int error = space_map_load(sm, sm_ops, SM_FREE,
+ 			    &msp->ms_smo,
+ 			    spa_meta_objset(msp->ms_group->mg_vd->vdev_spa));
+ 			if (error)  {
+ 				metaslab_group_sort(msp->ms_group, msp, 0);
+ 				return (error);
+ 			}
 -			for (int t = 0; t < TXG_DEFER_SIZE; t++)
++			for (t = 0; t < TXG_DEFER_SIZE; t++)
+ 				space_map_walk(&msp->ms_defermap[t],
+ 				    space_map_claim, sm);
+ 
+ 		}
+ 
+ 		/*
+ 		 * Track the bonus area as we activate new metaslabs.
+ 		 */
+ 		if (sm->sm_start > mg->mg_bonus_area) {
+ 			mutex_enter(&mg->mg_lock);
+ 			mg->mg_bonus_area = sm->sm_start;
+ 			mutex_exit(&mg->mg_lock);
  		}
  
  		/*
@@@ -632,9 -922,11 +924,12 @@@ metaslab_sync(metaslab_t *msp, uint64_
  	space_map_obj_t *smo = &msp->ms_smo_syncing;
  	dmu_buf_t *db;
  	dmu_tx_t *tx;
 +	int t;
  
- 	tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg);
+ 	ASSERT(!vd->vdev_ishole);
+ 
+ 	if (allocmap->sm_space == 0 && freemap->sm_space == 0)
+ 		return;
  
  	/*
  	 * The only state that can actually be changing concurrently with
@@@ -683,7 -977,11 +980,11 @@@
  		space_map_walk(sm, space_map_remove, allocmap);
  		space_map_walk(freed_map, space_map_remove, allocmap);
  
 -		for (int t = 0; t < TXG_DEFER_SIZE; t++)
++		for (t = 0; t < TXG_DEFER_SIZE; t++)
+ 			space_map_walk(&msp->ms_defermap[t],
+ 			    space_map_remove, allocmap);
+ 
 -		for (int t = 1; t < TXG_CONCURRENT_STATES; t++)
 +		for (t = 1; t < TXG_CONCURRENT_STATES; t++)
  			space_map_walk(&msp->ms_allocmap[(txg + t) & TXG_MASK],
  			    space_map_remove, allocmap);
  
@@@ -717,10 -1015,13 +1018,14 @@@ metaslab_sync_done(metaslab_t *msp, uin
  	space_map_obj_t *smosync = &msp->ms_smo_syncing;
  	space_map_t *sm = &msp->ms_map;
  	space_map_t *freed_map = &msp->ms_freemap[TXG_CLEAN(txg) & TXG_MASK];
+ 	space_map_t *defer_map = &msp->ms_defermap[txg % TXG_DEFER_SIZE];
  	metaslab_group_t *mg = msp->ms_group;
  	vdev_t *vd = mg->mg_vd;
+ 	int64_t alloc_delta, defer_delta;
 +	int t;
  
+ 	ASSERT(!vd->vdev_ishole);
+ 
  	mutex_enter(&msp->ms_lock);
  
  	/*
@@@ -734,10 -1035,18 +1039,18 @@@
  			space_map_create(&msp->ms_freemap[t], sm->sm_start,
  			    sm->sm_size, sm->sm_shift, sm->sm_lock);
  		}
- 		vdev_space_update(vd, sm->sm_size, 0, B_TRUE);
+ 
 -		for (int t = 0; t < TXG_DEFER_SIZE; t++)
++		for (t = 0; t < TXG_DEFER_SIZE; t++)
+ 			space_map_create(&msp->ms_defermap[t], sm->sm_start,
+ 			    sm->sm_size, sm->sm_shift, sm->sm_lock);
+ 
+ 		vdev_space_update(vd, 0, 0, sm->sm_size);
  	}
  
- 	vdev_space_update(vd, 0, smosync->smo_alloc - smo->smo_alloc, B_TRUE);
+ 	alloc_delta = smosync->smo_alloc - smo->smo_alloc;
+ 	defer_delta = freed_map->sm_space - defer_map->sm_space;
+ 
+ 	vdev_space_update(vd, alloc_delta + defer_delta, defer_delta, 0);
  
  	ASSERT(msp->ms_allocmap[txg & TXG_MASK].sm_space == 0);
  	ASSERT(msp->ms_freemap[txg & TXG_MASK].sm_space == 0);
@@@ -773,6 -1095,32 +1099,33 @@@
  	mutex_exit(&msp->ms_lock);
  }
  
+ void
+ metaslab_sync_reassess(metaslab_group_t *mg)
+ {
+ 	vdev_t *vd = mg->mg_vd;
++	int m;
+ 
+ 	/*
+ 	 * Re-evaluate all metaslabs which have lower offsets than the
+ 	 * bonus area.
+ 	 */
 -	for (int m = 0; m < vd->vdev_ms_count; m++) {
++	for (m = 0; m < vd->vdev_ms_count; m++) {
+ 		metaslab_t *msp = vd->vdev_ms[m];
+ 
+ 		if (msp->ms_map.sm_start > mg->mg_bonus_area)
+ 			break;
+ 
+ 		mutex_enter(&msp->ms_lock);
+ 		metaslab_group_sort(mg, msp, metaslab_weight(msp));
+ 		mutex_exit(&msp->ms_lock);
+ 	}
+ 
+ 	/*
+ 	 * Prefetch the next potential metaslabs
+ 	 */
+ 	metaslab_prefetch(mg);
+ }
+ 
  static uint64_t
  metaslab_distance(metaslab_t *msp, dva_t *dva)
  {
@@@ -1154,9 -1517,10 +1522,10 @@@ metaslab_alloc(spa_t *spa, metaslab_cla
  {
  	dva_t *dva = bp->blk_dva;
  	dva_t *hintdva = hintbp->blk_dva;
 -	int error = 0;
 +	int d, error = 0;
  
  	ASSERT(bp->blk_birth == 0);
+ 	ASSERT(BP_PHYSICAL_BIRTH(bp) == 0);
  
  	spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER);
  
@@@ -1195,10 -1559,10 +1564,10 @@@ voi
  metaslab_free(spa_t *spa, const blkptr_t *bp, uint64_t txg, boolean_t now)
  {
  	const dva_t *dva = bp->blk_dva;
 -	int ndvas = BP_GET_NDVAS(bp);
 +	int d, ndvas = BP_GET_NDVAS(bp);
  
  	ASSERT(!BP_IS_HOLE(bp));
- 	ASSERT(!now || bp->blk_birth >= spa->spa_syncing_txg);
+ 	ASSERT(!now || bp->blk_birth >= spa_syncing_txg(spa));
  
  	spa_config_enter(spa, SCL_FREE, FTAG, RW_READER);
  
diff --cc module/zfs/spa.c
index 705dda4df,d7c5de0d3..b0236e49f
--- a/module/zfs/spa.c
+++ b/module/zfs/spa.c
@@@ -565,27 -601,58 +601,60 @@@ spa_get_errlists(spa_t *spa, avl_tree_
  	    offsetof(spa_error_entry_t, se_avl));
  }
  
- /*
-  * Activate an uninitialized pool.
-  */
- static void
- spa_activate(spa_t *spa, int mode)
+ static taskq_t *
+ spa_taskq_create(spa_t *spa, const char *name, enum zti_modes mode,
+     uint_t value)
  {
- 	int t, q;
+ 	uint_t flags = TASKQ_PREPOPULATE;
+ 	boolean_t batch = B_FALSE;
  
- 	ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED);
+ 	switch (mode) {
+ 	case zti_mode_null:
+ 		return (NULL);		/* no taskq needed */
  
- 	spa->spa_state = POOL_STATE_ACTIVE;
- 	spa->spa_mode = mode;
+ 	case zti_mode_fixed:
+ 		ASSERT3U(value, >=, 1);
+ 		value = MAX(value, 1);
+ 		break;
+ 
+ 	case zti_mode_batch:
+ 		batch = B_TRUE;
+ 		flags |= TASKQ_THREADS_CPU_PCT;
+ 		value = zio_taskq_batch_pct;
+ 		break;
+ 
+ 	case zti_mode_online_percent:
+ 		flags |= TASKQ_THREADS_CPU_PCT;
+ 		break;
+ 
+ 	default:
+ 		panic("unrecognized mode for %s taskq (%u:%u) in "
+ 		    "spa_activate()",
+ 		    name, mode, value);
+ 		break;
+ 	}
  
- 	spa->spa_normal_class = metaslab_class_create(zfs_metaslab_ops);
- 	spa->spa_log_class = metaslab_class_create(zfs_metaslab_ops);
+ 	if (zio_taskq_sysdc && spa->spa_proc != &p0) {
+ 		if (batch)
+ 			flags |= TASKQ_DC_BATCH;
+ 
+ 		return (taskq_create_sysdc(name, value, 50, INT_MAX,
+ 		    spa->spa_proc, zio_taskq_basedc, flags));
+ 	}
+ 	return (taskq_create_proc(name, value, maxclsyspri, 50, INT_MAX,
+ 	    spa->spa_proc, flags));
+ }
+ 
+ static void
+ spa_create_zio_taskqs(spa_t *spa)
+ {
 -	for (int t = 0; t < ZIO_TYPES; t++) {
 -		for (int q = 0; q < ZIO_TASKQ_TYPES; q++) {
++	int t, q;
 +
 +	for (t = 0; t < ZIO_TYPES; t++) {
- 		const zio_taskq_info_t *ztip = &zio_taskqs[t];
 +		for (q = 0; q < ZIO_TASKQ_TYPES; q++) {
- 			enum zti_modes mode = ztip->zti_nthreads[q].zti_mode;
- 			uint_t value = ztip->zti_nthreads[q].zti_value;
+ 			const zio_taskq_info_t *ztip = &zio_taskqs[t][q];
+ 			enum zti_modes mode = ztip->zti_mode;
+ 			uint_t value = ztip->zti_value;
  			char name[32];
  
  			(void) snprintf(name, sizeof (name),
@@@ -660,9 -814,10 +818,10 @@@ spa_deactivate(spa_t *spa
  	list_destroy(&spa->spa_config_dirty_list);
  	list_destroy(&spa->spa_state_dirty_list);
  
 -	for (int t = 0; t < ZIO_TYPES; t++) {
 -		for (int q = 0; q < ZIO_TASKQ_TYPES; q++) {
 +	for (t = 0; t < ZIO_TYPES; t++) {
 +		for (q = 0; q < ZIO_TASKQ_TYPES; q++) {
- 			taskq_destroy(spa->spa_zio_taskq[t][q]);
+ 			if (spa->spa_zio_taskq[t][q] != NULL)
+ 				taskq_destroy(spa->spa_zio_taskq[t][q]);
  			spa->spa_zio_taskq[t][q] = NULL;
  		}
  	}
@@@ -1106,27 -1288,23 +1295,24 @@@ spa_check_removed(vdev_t *vd
   * that the label does not contain the most up-to-date information.
   */
  void
- spa_load_log_state(spa_t *spa)
+ spa_load_log_state(spa_t *spa, nvlist_t *nv)
  {
- 	nvlist_t *nv, *nvroot, **child;
- 	uint64_t is_log;
- 	uint_t children;
- 	vdev_t *rvd = spa->spa_root_vdev;
+ 	vdev_t *ovd, *rvd = spa->spa_root_vdev;
 +	int c;
  
- 	VERIFY(load_nvlist(spa, spa->spa_config_object, &nv) == 0);
- 	VERIFY(nvlist_lookup_nvlist(nv, ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0);
- 	VERIFY(nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN,
- 	    &child, &children) == 0);
- 
- 	for (c = 0; c < children; c++) {
- 		vdev_t *tvd = rvd->vdev_child[c];
+ 	/*
+ 	 * Load the original root vdev tree from the passed config.
+ 	 */
+ 	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
+ 	VERIFY(spa_config_parse(spa, &ovd, nv, NULL, 0, VDEV_ALLOC_LOAD) == 0);
  
- 		if (nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_LOG,
- 		    &is_log) == 0 && is_log)
- 			vdev_load_log_state(tvd, child[c]);
 -	for (int c = 0; c < rvd->vdev_children; c++) {
++	for (c = 0; c < rvd->vdev_children; c++) {
+ 		vdev_t *cvd = rvd->vdev_child[c];
+ 		if (cvd->vdev_islog)
+ 			vdev_load_log_state(cvd, ovd->vdev_child[c]);
  	}
- 	nvlist_free(nv);
+ 	vdev_free(ovd);
+ 	spa_config_exit(spa, SCL_ALL, FTAG);
  }
  
  /*
@@@ -1149,181 -1327,481 +1335,486 @@@ spa_check_logs(spa_t *spa
  	return (0);
  }
  
- /*
-  * Load an existing storage pool, using the pool's builtin spa_config as a
-  * source of configuration information.
-  */
- static int
- spa_load(spa_t *spa, nvlist_t *config, spa_load_state_t state, int mosconfig)
+ static boolean_t
+ spa_passivate_log(spa_t *spa)
  {
- 	int error = 0;
- 	nvlist_t *nvroot = NULL;
- 	vdev_t *rvd;
- 	uberblock_t *ub = &spa->spa_uberblock;
- 	uint64_t config_cache_txg = spa->spa_config_txg;
- 	uint64_t pool_guid;
- 	uint64_t version;
- 	uint64_t autoreplace = 0;
- 	int orig_mode = spa->spa_mode;
- 	char *ereport = FM_EREPORT_ZFS_POOL;
+ 	vdev_t *rvd = spa->spa_root_vdev;
+ 	boolean_t slog_found = B_FALSE;
++	int c;
  
- 	/*
- 	 * If this is an untrusted config, access the pool in read-only mode.
- 	 * This prevents things like resilvering recently removed devices.
- 	 */
- 	if (!mosconfig)
- 		spa->spa_mode = FREAD;
+ 	ASSERT(spa_config_held(spa, SCL_ALLOC, RW_WRITER));
  
- 	ASSERT(MUTEX_HELD(&spa_namespace_lock));
+ 	if (!spa_has_slogs(spa))
+ 		return (B_FALSE);
  
- 	spa->spa_load_state = state;
 -	for (int c = 0; c < rvd->vdev_children; c++) {
++	for (c = 0; c < rvd->vdev_children; c++) {
+ 		vdev_t *tvd = rvd->vdev_child[c];
+ 		metaslab_group_t *mg = tvd->vdev_mg;
  
- 	if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot) ||
- 	    nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid)) {
- 		error = EINVAL;
- 		goto out;
+ 		if (tvd->vdev_islog) {
+ 			metaslab_group_passivate(mg);
+ 			slog_found = B_TRUE;
+ 		}
  	}
  
- 	/*
- 	 * Versioning wasn't explicitly added to the label until later, so if
- 	 * it's not present treat it as the initial version.
- 	 */
- 	if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, &version) != 0)
- 		version = SPA_VERSION_INITIAL;
+ 	return (slog_found);
+ }
  
- 	(void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG,
- 	    &spa->spa_config_txg);
+ static void
+ spa_activate_log(spa_t *spa)
+ {
+ 	vdev_t *rvd = spa->spa_root_vdev;
++	int c;
  
- 	if ((state == SPA_LOAD_IMPORT || state == SPA_LOAD_TRYIMPORT) &&
- 	    spa_guid_exists(pool_guid, 0)) {
- 		error = EEXIST;
- 		goto out;
- 	}
+ 	ASSERT(spa_config_held(spa, SCL_ALLOC, RW_WRITER));
  
- 	spa->spa_load_guid = pool_guid;
 -	for (int c = 0; c < rvd->vdev_children; c++) {
++	for (c = 0; c < rvd->vdev_children; c++) {
+ 		vdev_t *tvd = rvd->vdev_child[c];
+ 		metaslab_group_t *mg = tvd->vdev_mg;
  
- 	/*
- 	 * Create "The Godfather" zio to hold all async IOs
- 	 */
- 	spa->spa_async_zio_root = zio_root(spa, NULL, NULL,
- 	    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_GODFATHER);
+ 		if (tvd->vdev_islog)
+ 			metaslab_group_activate(mg);
+ 	}
+ }
  
- 	/*
- 	 * Parse the configuration into a vdev tree.  We explicitly set the
- 	 * value that will be returned by spa_version() since parsing the
- 	 * configuration requires knowing the version number.
- 	 */
- 	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
- 	spa->spa_ubsync.ub_version = version;
- 	error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, VDEV_ALLOC_LOAD);
- 	spa_config_exit(spa, SCL_ALL, FTAG);
+ int
+ spa_offline_log(spa_t *spa)
+ {
+ 	int error = 0;
  
- 	if (error != 0)
- 		goto out;
+ 	if ((error = dmu_objset_find(spa_name(spa), zil_vdev_offline,
+ 	    NULL, DS_FIND_CHILDREN)) == 0) {
  
- 	ASSERT(spa->spa_root_vdev == rvd);
- 	ASSERT(spa_guid(spa) == pool_guid);
+ 		/*
+ 		 * We successfully offlined the log device, sync out the
+ 		 * current txg so that the "stubby" block can be removed
+ 		 * by zil_sync().
+ 		 */
+ 		txg_wait_synced(spa->spa_dsl_pool, 0);
+ 	}
+ 	return (error);
+ }
  
- 	/*
- 	 * Try to open all vdevs, loading each label in the process.
- 	 */
- 	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
- 	error = vdev_open(rvd);
- 	spa_config_exit(spa, SCL_ALL, FTAG);
- 	if (error != 0)
- 		goto out;
+ static void
+ spa_aux_check_removed(spa_aux_vdev_t *sav)
+ {
 -	for (int i = 0; i < sav->sav_count; i++)
++	int i;
 +
- 	/*
- 	 * We need to validate the vdev labels against the configuration that
- 	 * we have in hand, which is dependent on the setting of mosconfig. If
- 	 * mosconfig is true then we're validating the vdev labels based on
- 	 * that config. Otherwise, we're validating against the cached config
- 	 * (zpool.cache) that was read when we loaded the zfs module, and then
- 	 * later we will recursively call spa_load() and validate against
- 	 * the vdev config.
- 	 */
- 	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
- 	error = vdev_validate(rvd);
- 	spa_config_exit(spa, SCL_ALL, FTAG);
- 	if (error != 0)
- 		goto out;
++	for (i = 0; i < sav->sav_count; i++)
+ 		spa_check_removed(sav->sav_vdevs[i]);
+ }
  
- 	if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) {
- 		error = ENXIO;
- 		goto out;
- 	}
+ void
+ spa_claim_notify(zio_t *zio)
+ {
+ 	spa_t *spa = zio->io_spa;
  
- 	/*
- 	 * Find the best uberblock.
- 	 */
- 	vdev_uberblock_load(NULL, rvd, ub);
+ 	if (zio->io_error)
+ 		return;
  
- 	/*
- 	 * If we weren't able to find a single valid uberblock, return failure.
- 	 */
- 	if (ub->ub_txg == 0) {
- 		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
- 		    VDEV_AUX_CORRUPT_DATA);
- 		error = ENXIO;
- 		goto out;
- 	}
+ 	mutex_enter(&spa->spa_props_lock);	/* any mutex will do */
+ 	if (spa->spa_claim_max_txg < zio->io_bp->blk_birth)
+ 		spa->spa_claim_max_txg = zio->io_bp->blk_birth;
+ 	mutex_exit(&spa->spa_props_lock);
+ }
  
- 	/*
- 	 * If the pool is newer than the code, we can't open it.
- 	 */
- 	if (ub->ub_version > SPA_VERSION) {
- 		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
- 		    VDEV_AUX_VERSION_NEWER);
- 		error = ENOTSUP;
- 		goto out;
- 	}
+ typedef struct spa_load_error {
+ 	uint64_t	sle_meta_count;
+ 	uint64_t	sle_data_count;
+ } spa_load_error_t;
  
- 	/*
- 	 * If the vdev guid sum doesn't match the uberblock, we have an
- 	 * incomplete configuration.
- 	 */
- 	if (rvd->vdev_guid_sum != ub->ub_guid_sum && mosconfig) {
- 		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
- 		    VDEV_AUX_BAD_GUID_SUM);
- 		error = ENXIO;
- 		goto out;
- 	}
+ static void
+ spa_load_verify_done(zio_t *zio)
+ {
+ 	blkptr_t *bp = zio->io_bp;
+ 	spa_load_error_t *sle = zio->io_private;
+ 	dmu_object_type_t type = BP_GET_TYPE(bp);
+ 	int error = zio->io_error;
  
- 	/*
- 	 * Initialize internal SPA structures.
- 	 */
- 	spa->spa_state = POOL_STATE_ACTIVE;
- 	spa->spa_ubsync = spa->spa_uberblock;
- 	spa->spa_first_txg = spa_last_synced_txg(spa) + 1;
- 	error = dsl_pool_open(spa, spa->spa_first_txg, &spa->spa_dsl_pool);
  	if (error) {
- 		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
- 		    VDEV_AUX_CORRUPT_DATA);
- 		goto out;
+ 		if ((BP_GET_LEVEL(bp) != 0 || dmu_ot[type].ot_metadata) &&
+ 		    type != DMU_OT_INTENT_LOG)
+ 			atomic_add_64(&sle->sle_meta_count, 1);
+ 		else
+ 			atomic_add_64(&sle->sle_data_count, 1);
  	}
- 	spa->spa_meta_objset = spa->spa_dsl_pool->dp_meta_objset;
+ 	zio_data_buf_free(zio->io_data, zio->io_size);
+ }
  
- 	if (zap_lookup(spa->spa_meta_objset,
- 	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG,
- 	    sizeof (uint64_t), 1, &spa->spa_config_object) != 0) {
- 		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
- 		    VDEV_AUX_CORRUPT_DATA);
- 		error = EIO;
- 		goto out;
- 	}
+ /*ARGSUSED*/
+ static int
+ spa_load_verify_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
+     arc_buf_t *pbuf, const zbookmark_t *zb, const dnode_phys_t *dnp, void *arg)
+ {
+ 	if (bp != NULL) {
+ 		zio_t *rio = arg;
+ 		size_t size = BP_GET_PSIZE(bp);
+ 		void *data = zio_data_buf_alloc(size);
  
- 	if (!mosconfig) {
- 		nvlist_t *newconfig;
- 		uint64_t hostid;
+ 		zio_nowait(zio_read(rio, spa, bp, data, size,
+ 		    spa_load_verify_done, rio->io_private, ZIO_PRIORITY_SCRUB,
+ 		    ZIO_FLAG_SPECULATIVE | ZIO_FLAG_CANFAIL |
+ 		    ZIO_FLAG_SCRUB | ZIO_FLAG_RAW, zb));
+ 	}
+ 	return (0);
+ }
  
- 		if (load_nvlist(spa, spa->spa_config_object, &newconfig) != 0) {
- 			vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
- 			    VDEV_AUX_CORRUPT_DATA);
- 			error = EIO;
- 			goto out;
+ static int
+ spa_load_verify(spa_t *spa)
+ {
+ 	zio_t *rio;
+ 	spa_load_error_t sle = { 0 };
+ 	zpool_rewind_policy_t policy;
+ 	boolean_t verify_ok = B_FALSE;
+ 	int error;
+ 
+ 	zpool_get_rewind_policy(spa->spa_config, &policy);
+ 
+ 	if (policy.zrp_request & ZPOOL_NEVER_REWIND)
+ 		return (0);
+ 
+ 	rio = zio_root(spa, NULL, &sle,
+ 	    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE);
+ 
+ 	error = traverse_pool(spa, spa->spa_verify_min_txg,
+ 	    TRAVERSE_PRE | TRAVERSE_PREFETCH, spa_load_verify_cb, rio);
+ 
+ 	(void) zio_wait(rio);
+ 
+ 	spa->spa_load_meta_errors = sle.sle_meta_count;
+ 	spa->spa_load_data_errors = sle.sle_data_count;
+ 
+ 	if (!error && sle.sle_meta_count <= policy.zrp_maxmeta &&
+ 	    sle.sle_data_count <= policy.zrp_maxdata) {
+ 		verify_ok = B_TRUE;
+ 		spa->spa_load_txg = spa->spa_uberblock.ub_txg;
+ 		spa->spa_load_txg_ts = spa->spa_uberblock.ub_timestamp;
+ 	} else {
+ 		spa->spa_load_max_txg = spa->spa_uberblock.ub_txg;
+ 	}
+ 
+ 	if (error) {
+ 		if (error != ENXIO && error != EIO)
+ 			error = EIO;
+ 		return (error);
+ 	}
+ 
+ 	return (verify_ok ? 0 : EIO);
+ }
+ 
+ /*
+  * Find a value in the pool props object.
+  */
+ static void
+ spa_prop_find(spa_t *spa, zpool_prop_t prop, uint64_t *val)
+ {
+ 	(void) zap_lookup(spa->spa_meta_objset, spa->spa_pool_props_object,
+ 	    zpool_prop_to_name(prop), sizeof (uint64_t), 1, val);
+ }
+ 
+ /*
+  * Find a value in the pool directory object.
+  */
+ static int
+ spa_dir_prop(spa_t *spa, const char *name, uint64_t *val)
+ {
+ 	return (zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+ 	    name, sizeof (uint64_t), 1, val));
+ }
+ 
+ static int
+ spa_vdev_err(vdev_t *vdev, vdev_aux_t aux, int err)
+ {
+ 	vdev_set_state(vdev, B_TRUE, VDEV_STATE_CANT_OPEN, aux);
+ 	return (err);
+ }
+ 
+ /*
+  * Fix up config after a partly-completed split.  This is done with the
+  * ZPOOL_CONFIG_SPLIT nvlist.  Both the splitting pool and the split-off
+  * pool have that entry in their config, but only the splitting one contains
+  * a list of all the guids of the vdevs that are being split off.
+  *
+  * This function determines what to do with that list: either rejoin
+  * all the disks to the pool, or complete the splitting process.  To attempt
+  * the rejoin, each disk that is offlined is marked online again, and
+  * we do a reopen() call.  If the vdev label for every disk that was
+  * marked online indicates it was successfully split off (VDEV_AUX_SPLIT_POOL)
+  * then we call vdev_split() on each disk, and complete the split.
+  *
+  * Otherwise we leave the config alone, with all the vdevs in place in
+  * the original pool.
+  */
+ static void
+ spa_try_repair(spa_t *spa, nvlist_t *config)
+ {
+ 	uint_t extracted;
+ 	uint64_t *glist;
+ 	uint_t i, gcount;
+ 	nvlist_t *nvl;
+ 	vdev_t **vd;
+ 	boolean_t attempt_reopen;
+ 
+ 	if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_SPLIT, &nvl) != 0)
+ 		return;
+ 
+ 	/* check that the config is complete */
+ 	if (nvlist_lookup_uint64_array(nvl, ZPOOL_CONFIG_SPLIT_LIST,
+ 	    &glist, &gcount) != 0)
+ 		return;
+ 
+ 	vd = kmem_zalloc(gcount * sizeof (vdev_t *), KM_SLEEP);
+ 
+ 	/* attempt to online all the vdevs & validate */
+ 	attempt_reopen = B_TRUE;
+ 	for (i = 0; i < gcount; i++) {
+ 		if (glist[i] == 0)	/* vdev is hole */
+ 			continue;
+ 
+ 		vd[i] = spa_lookup_by_guid(spa, glist[i], B_FALSE);
+ 		if (vd[i] == NULL) {
+ 			/*
+ 			 * Don't bother attempting to reopen the disks;
+ 			 * just do the split.
+ 			 */
+ 			attempt_reopen = B_FALSE;
+ 		} else {
+ 			/* attempt to re-online it */
+ 			vd[i]->vdev_offline = B_FALSE;
+ 		}
+ 	}
+ 
+ 	if (attempt_reopen) {
+ 		vdev_reopen(spa->spa_root_vdev);
+ 
+ 		/* check each device to see what state it's in */
+ 		for (extracted = 0, i = 0; i < gcount; i++) {
+ 			if (vd[i] != NULL &&
+ 			    vd[i]->vdev_stat.vs_aux != VDEV_AUX_SPLIT_POOL)
+ 				break;
+ 			++extracted;
  		}
+ 	}
+ 
+ 	/*
+ 	 * If every disk has been moved to the new pool, or if we never
+ 	 * even attempted to look at them, then we split them off for
+ 	 * good.
+ 	 */
+ 	if (!attempt_reopen || gcount == extracted) {
+ 		for (i = 0; i < gcount; i++)
+ 			if (vd[i] != NULL)
+ 				vdev_split(vd[i]);
+ 		vdev_reopen(spa->spa_root_vdev);
+ 	}
+ 
+ 	kmem_free(vd, gcount * sizeof (vdev_t *));
+ }
+ 
+ static int
+ spa_load(spa_t *spa, spa_load_state_t state, spa_import_type_t type,
+     boolean_t mosconfig)
+ {
+ 	nvlist_t *config = spa->spa_config;
+ 	char *ereport = FM_EREPORT_ZFS_POOL;
+ 	int error;
+ 	uint64_t pool_guid;
+ 	nvlist_t *nvl;
+ 
+ 	if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid))
+ 		return (EINVAL);
  
- 		if (!spa_is_root(spa) && nvlist_lookup_uint64(newconfig,
+ 	/*
+ 	 * Versioning wasn't explicitly added to the label until later, so if
+ 	 * it's not present treat it as the initial version.
+ 	 */
+ 	if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION,
+ 	    &spa->spa_ubsync.ub_version) != 0)
+ 		spa->spa_ubsync.ub_version = SPA_VERSION_INITIAL;
+ 
+ 	(void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG,
+ 	    &spa->spa_config_txg);
+ 
+ 	if ((state == SPA_LOAD_IMPORT || state == SPA_LOAD_TRYIMPORT) &&
+ 	    spa_guid_exists(pool_guid, 0)) {
+ 		error = EEXIST;
+ 	} else {
+ 		spa->spa_load_guid = pool_guid;
+ 
+ 		if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_SPLIT,
+ 		    &nvl) == 0) {
+ 			VERIFY(nvlist_dup(nvl, &spa->spa_config_splitting,
+ 			    KM_SLEEP) == 0);
+ 		}
+ 
+ 		error = spa_load_impl(spa, pool_guid, config, state, type,
+ 		    mosconfig, &ereport);
+ 	}
+ 
+ 	spa->spa_minref = refcount_count(&spa->spa_refcount);
+ 	if (error && error != EBADF)
+ 		zfs_ereport_post(ereport, spa, NULL, NULL, 0, 0);
+ 	spa->spa_load_state = error ? SPA_LOAD_ERROR : SPA_LOAD_NONE;
+ 	spa->spa_ena = 0;
+ 
+ 	return (error);
+ }
+ 
+ /*
+  * Load an existing storage pool, using the pool's builtin spa_config as a
+  * source of configuration information.
+  */
+ static int
+ spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config,
+     spa_load_state_t state, spa_import_type_t type, boolean_t mosconfig,
+     char **ereport)
+ {
+ 	int error = 0;
+ 	nvlist_t *nvroot = NULL;
+ 	vdev_t *rvd;
+ 	uberblock_t *ub = &spa->spa_uberblock;
+ 	uint64_t config_cache_txg = spa->spa_config_txg;
+ 	int orig_mode = spa->spa_mode;
+ 	int parse;
+ 	uint64_t obj;
++	int c;
+ 
+ 	/*
+ 	 * If this is an untrusted config, access the pool in read-only mode.
+ 	 * This prevents things like resilvering recently removed devices.
+ 	 */
+ 	if (!mosconfig)
+ 		spa->spa_mode = FREAD;
+ 
+ 	ASSERT(MUTEX_HELD(&spa_namespace_lock));
+ 
+ 	spa->spa_load_state = state;
+ 
+ 	if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot))
+ 		return (EINVAL);
+ 
+ 	parse = (type == SPA_IMPORT_EXISTING ?
+ 	    VDEV_ALLOC_LOAD : VDEV_ALLOC_SPLIT);
+ 
+ 	/*
+ 	 * Create "The Godfather" zio to hold all async IOs
+ 	 */
+ 	spa->spa_async_zio_root = zio_root(spa, NULL, NULL,
+ 	    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_GODFATHER);
+ 
+ 	/*
+ 	 * Parse the configuration into a vdev tree.  We explicitly set the
+ 	 * value that will be returned by spa_version() since parsing the
+ 	 * configuration requires knowing the version number.
+ 	 */
+ 	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
+ 	error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, parse);
+ 	spa_config_exit(spa, SCL_ALL, FTAG);
+ 
+ 	if (error != 0)
+ 		return (error);
+ 
+ 	ASSERT(spa->spa_root_vdev == rvd);
+ 
+ 	if (type != SPA_IMPORT_ASSEMBLE) {
+ 		ASSERT(spa_guid(spa) == pool_guid);
+ 	}
+ 
+ 	/*
+ 	 * Try to open all vdevs, loading each label in the process.
+ 	 */
+ 	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
+ 	error = vdev_open(rvd);
+ 	spa_config_exit(spa, SCL_ALL, FTAG);
+ 	if (error != 0)
+ 		return (error);
+ 
+ 	/*
+ 	 * We need to validate the vdev labels against the configuration that
+ 	 * we have in hand, which is dependent on the setting of mosconfig. If
+ 	 * mosconfig is true then we're validating the vdev labels based on
+ 	 * that config.  Otherwise, we're validating against the cached config
+ 	 * (zpool.cache) that was read when we loaded the zfs module, and then
+ 	 * later we will recursively call spa_load() and validate against
+ 	 * the vdev config.
+ 	 *
+ 	 * If we're assembling a new pool that's been split off from an
+ 	 * existing pool, the labels haven't yet been updated so we skip
+ 	 * validation for now.
+ 	 */
+ 	if (type != SPA_IMPORT_ASSEMBLE) {
+ 		spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
+ 		error = vdev_validate(rvd);
+ 		spa_config_exit(spa, SCL_ALL, FTAG);
+ 
+ 		if (error != 0)
+ 			return (error);
+ 
+ 		if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN)
+ 			return (ENXIO);
+ 	}
+ 
+ 	/*
+ 	 * Find the best uberblock.
+ 	 */
+ 	vdev_uberblock_load(NULL, rvd, ub);
+ 
+ 	/*
+ 	 * If we weren't able to find a single valid uberblock, return failure.
+ 	 */
+ 	if (ub->ub_txg == 0)
+ 		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, ENXIO));
+ 
+ 	/*
+ 	 * If the pool is newer than the code, we can't open it.
+ 	 */
+ 	if (ub->ub_version > SPA_VERSION)
+ 		return (spa_vdev_err(rvd, VDEV_AUX_VERSION_NEWER, ENOTSUP));
+ 
+ 	/*
+ 	 * If the vdev guid sum doesn't match the uberblock, we have an
+ 	 * incomplete configuration.
+ 	 */
+ 	if (mosconfig && type != SPA_IMPORT_ASSEMBLE &&
+ 	    rvd->vdev_guid_sum != ub->ub_guid_sum)
+ 		return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM, ENXIO));
+ 
+ 	if (type != SPA_IMPORT_ASSEMBLE && spa->spa_config_splitting) {
+ 		spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
+ 		spa_try_repair(spa, config);
+ 		spa_config_exit(spa, SCL_ALL, FTAG);
+ 		nvlist_free(spa->spa_config_splitting);
+ 		spa->spa_config_splitting = NULL;
+ 	}
+ 
+ 	/*
+ 	 * Initialize internal SPA structures.
+ 	 */
+ 	spa->spa_state = POOL_STATE_ACTIVE;
+ 	spa->spa_ubsync = spa->spa_uberblock;
+ 	spa->spa_verify_min_txg = spa->spa_extreme_rewind ?
+ 	    TXG_INITIAL - 1 : spa_last_synced_txg(spa) - TXG_DEFER_SIZE - 1;
+ 	spa->spa_first_txg = spa->spa_last_ubsync_txg ?
+ 	    spa->spa_last_ubsync_txg : spa_last_synced_txg(spa) + 1;
+ 	spa->spa_claim_max_txg = spa->spa_first_txg;
+ 	spa->spa_prev_software_version = ub->ub_software_version;
+ 
+ 	error = dsl_pool_open(spa, spa->spa_first_txg, &spa->spa_dsl_pool);
+ 	if (error)
+ 		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
+ 	spa->spa_meta_objset = spa->spa_dsl_pool->dp_meta_objset;
+ 
+ 	if (spa_dir_prop(spa, DMU_POOL_CONFIG, &spa->spa_config_object) != 0)
+ 		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
+ 
+ 	if (!mosconfig) {
+ 		uint64_t hostid;
+ 		nvlist_t *policy = NULL, *nvconfig;
+ 
+ 		if (load_nvlist(spa, spa->spa_config_object, &nvconfig) != 0)
+ 			return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
+ 
+ 		if (!spa_is_root(spa) && nvlist_lookup_uint64(nvconfig,
  		    ZPOOL_CONFIG_HOSTID, &hostid) == 0) {
  			char *hostname;
  			unsigned long myhostid = 0;
@@@ -1549,15 -1985,50 +1998,51 @@@
  	 * Check the state of the root vdev.  If it can't be opened, it
  	 * indicates one or more toplevel vdevs are faulted.
  	 */
- 	if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) {
- 		error = ENXIO;
- 		goto out;
+ 	if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN)
+ 		return (ENXIO);
+ 
+ 	/*
+ 	 * Load the DDTs (dedup tables).
+ 	 */
+ 	error = ddt_load(spa);
+ 	if (error != 0)
+ 		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
+ 
+ 	spa_update_dspace(spa);
+ 
+ 	if (state != SPA_LOAD_TRYIMPORT) {
+ 		error = spa_load_verify(spa);
+ 		if (error)
+ 			return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA,
+ 			    error));
  	}
  
- 	if (spa_writeable(spa)) {
+ 	/*
+ 	 * Load the intent log state and check log integrity.  If we're
+ 	 * assembling a pool from a split, the log is not transferred over.
+ 	 */
+ 	if (type != SPA_IMPORT_ASSEMBLE) {
+ 		nvlist_t *nvconfig;
+ 
+ 		if (load_nvlist(spa, spa->spa_config_object, &nvconfig) != 0)
+ 			return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
+ 
+ 		VERIFY(nvlist_lookup_nvlist(nvconfig, ZPOOL_CONFIG_VDEV_TREE,
+ 		    &nvroot) == 0);
+ 		spa_load_log_state(spa, nvroot);
+ 		nvlist_free(nvconfig);
+ 
+ 		if (spa_check_logs(spa)) {
+ 			*ereport = FM_EREPORT_ZFS_LOG_REPLAY;
+ 			return (spa_vdev_err(rvd, VDEV_AUX_BAD_LOG, ENXIO));
+ 		}
+ 	}
+ 
+ 	if (spa_writeable(spa) && (state == SPA_LOAD_RECOVER ||
+ 	    spa->spa_load_max_txg == UINT64_MAX)) {
  		dmu_tx_t *tx;
  		int need_update = B_FALSE;
 +		int c;
  
  		ASSERT(state != SPA_LOAD_TRYIMPORT);
  
@@@ -1571,53 -2047,142 +2061,142 @@@
  		    zil_claim, tx, DS_FIND_CHILDREN);
  		dmu_tx_commit(tx);
  
- 		spa->spa_log_state = SPA_LOG_GOOD;
- 		spa->spa_sync_on = B_TRUE;
- 		txg_sync_start(spa->spa_dsl_pool);
+ 		spa->spa_claiming = B_FALSE;
+ 
+ 		spa_set_log_state(spa, SPA_LOG_GOOD);
+ 		spa->spa_sync_on = B_TRUE;
+ 		txg_sync_start(spa->spa_dsl_pool);
+ 
+ 		/*
+ 		 * Wait for all claims to sync.  We sync up to the highest
+ 		 * claimed log block birth time so that claimed log blocks
+ 		 * don't appear to be from the future.  spa_claim_max_txg
+ 		 * will have been set for us by either zil_check_log_chain()
+ 		 * (invoked from spa_check_logs()) or zil_claim() above.
+ 		 */
+ 		txg_wait_synced(spa->spa_dsl_pool, spa->spa_claim_max_txg);
+ 
+ 		/*
+ 		 * If the config cache is stale, or we have uninitialized
+ 		 * metaslabs (see spa_vdev_add()), then update the config.
+ 		 *
+ 		 * If spa_load_verbatim is true, trust the current
+ 		 * in-core spa_config and update the disk labels.
+ 		 */
+ 		if (config_cache_txg != spa->spa_config_txg ||
+ 		    state == SPA_LOAD_IMPORT || spa->spa_load_verbatim ||
+ 		    state == SPA_LOAD_RECOVER)
+ 			need_update = B_TRUE;
+ 
 -		for (int c = 0; c < rvd->vdev_children; c++)
++		for (c = 0; c < rvd->vdev_children; c++)
+ 			if (rvd->vdev_child[c]->vdev_ms_array == 0)
+ 				need_update = B_TRUE;
+ 
+ 		/*
+ 		 * Update the config cache asynchronously in case we're the
+ 		 * root pool, in which case the config cache isn't writable yet.
+ 		 */
+ 		if (need_update)
+ 			spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);
+ 
+ 		/*
+ 		 * Check all DTLs to see if anything needs resilvering.
+ 		 */
+ 		if (!dsl_scan_resilvering(spa->spa_dsl_pool) &&
+ 		    vdev_resilver_needed(rvd, NULL, NULL))
+ 			spa_async_request(spa, SPA_ASYNC_RESILVER);
+ 
+ 		/*
+ 		 * Delete any inconsistent datasets.
+ 		 */
+ 		(void) dmu_objset_find(spa_name(spa),
+ 		    dsl_destroy_inconsistent, NULL, DS_FIND_CHILDREN);
+ 
+ 		/*
+ 		 * Clean up any stale temporary dataset userrefs.
+ 		 */
+ 		dsl_pool_clean_tmp_userrefs(spa->spa_dsl_pool);
+ 	}
+ 
+ 	return (0);
+ }
+ 
+ static int
+ spa_load_retry(spa_t *spa, spa_load_state_t state, int mosconfig)
+ {
+ 	spa_unload(spa);
+ 	spa_deactivate(spa);
+ 
+ 	spa->spa_load_max_txg--;
+ 
+ 	spa_activate(spa, spa_mode_global);
+ 	spa_async_suspend(spa);
+ 
+ 	return (spa_load(spa, state, SPA_IMPORT_EXISTING, mosconfig));
+ }
+ 
+ static int
+ spa_load_best(spa_t *spa, spa_load_state_t state, int mosconfig,
+     uint64_t max_request, int rewind_flags)
+ {
+ 	nvlist_t *config = NULL;
+ 	int load_error, rewind_error;
+ 	uint64_t safe_rewind_txg;
+ 	uint64_t min_txg;
+ 
+ 	if (spa->spa_load_txg && state == SPA_LOAD_RECOVER) {
+ 		spa->spa_load_max_txg = spa->spa_load_txg;
+ 		spa_set_log_state(spa, SPA_LOG_CLEAR);
+ 	} else {
+ 		spa->spa_load_max_txg = max_request;
+ 	}
  
- 		/*
- 		 * Wait for all claims to sync.
- 		 */
- 		txg_wait_synced(spa->spa_dsl_pool, 0);
+ 	load_error = rewind_error = spa_load(spa, state, SPA_IMPORT_EXISTING,
+ 	    mosconfig);
+ 	if (load_error == 0)
+ 		return (0);
  
- 		/*
- 		 * If the config cache is stale, or we have uninitialized
- 		 * metaslabs (see spa_vdev_add()), then update the config.
- 		 *
- 		 * If spa_load_verbatim is true, trust the current
- 		 * in-core spa_config and update the disk labels.
- 		 */
- 		if (config_cache_txg != spa->spa_config_txg ||
- 		    state == SPA_LOAD_IMPORT || spa->spa_load_verbatim)
- 			need_update = B_TRUE;
+ 	if (spa->spa_root_vdev != NULL)
+ 		config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);
  
- 		for (c = 0; c < rvd->vdev_children; c++)
- 			if (rvd->vdev_child[c]->vdev_ms_array == 0)
- 				need_update = B_TRUE;
+ 	spa->spa_last_ubsync_txg = spa->spa_uberblock.ub_txg;
+ 	spa->spa_last_ubsync_txg_ts = spa->spa_uberblock.ub_timestamp;
  
- 		/*
- 		 * Update the config cache asychronously in case we're the
- 		 * root pool, in which case the config cache isn't writable yet.
- 		 */
- 		if (need_update)
- 			spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);
+ 	if (rewind_flags & ZPOOL_NEVER_REWIND) {
+ 		nvlist_free(config);
+ 		return (load_error);
+ 	}
  
- 		/*
- 		 * Check all DTLs to see if anything needs resilvering.
- 		 */
- 		if (vdev_resilver_needed(rvd, NULL, NULL))
- 			spa_async_request(spa, SPA_ASYNC_RESILVER);
+ 	/* Price of rolling back is discarding txgs, including log */
+ 	if (state == SPA_LOAD_RECOVER)
+ 		spa_set_log_state(spa, SPA_LOG_CLEAR);
+ 
+ 	spa->spa_load_max_txg = spa->spa_last_ubsync_txg;
+ 	safe_rewind_txg = spa->spa_last_ubsync_txg - TXG_DEFER_SIZE;
+ 	min_txg = (rewind_flags & ZPOOL_EXTREME_REWIND) ?
+ 	    TXG_INITIAL : safe_rewind_txg;
+ 
+ 	/*
+ 	 * Continue as long as we're finding errors, we're still within
+ 	 * the acceptable rewind range, and we're still finding uberblocks
+ 	 */
+ 	while (rewind_error && spa->spa_uberblock.ub_txg >= min_txg &&
+ 	    spa->spa_uberblock.ub_txg <= spa->spa_load_max_txg) {
+ 		if (spa->spa_load_max_txg < safe_rewind_txg)
+ 			spa->spa_extreme_rewind = B_TRUE;
+ 		rewind_error = spa_load_retry(spa, state, mosconfig);
  	}
  
- 	error = 0;
- out:
- 	spa->spa_minref = refcount_count(&spa->spa_refcount);
- 	if (error && error != EBADF)
- 		zfs_ereport_post(ereport, spa, NULL, NULL, 0, 0);
- 	spa->spa_load_state = SPA_LOAD_NONE;
- 	spa->spa_ena = 0;
+ 	if (config)
+ 		spa_rewind_data_to_nvlist(spa, config);
  
- 	return (error);
+ 	spa->spa_extreme_rewind = B_FALSE;
+ 	spa->spa_load_max_txg = UINT64_MAX;
+ 
+ 	if (config && (rewind_error || state != SPA_LOAD_RECOVER))
+ 		spa_config_set(spa, config);
+ 
+ 	return (state == SPA_LOAD_RECOVER ? rewind_error : load_error);
  }
  
  /*
@@@ -2950,10 -3575,20 +3592,20 @@@ spa_vdev_add(spa_t *spa, nvlist_t *nvro
  	/*
  	 * Transfer each new top-level vdev from vd to rvd.
  	 */
 -	for (int c = 0; c < vd->vdev_children; c++) {
 +	for (c = 0; c < vd->vdev_children; c++) {
+ 
+ 		/*
+ 		 * Set the vdev id to the first hole, if one exists.
+ 		 */
+ 		for (id = 0; id < rvd->vdev_children; id++) {
+ 			if (rvd->vdev_child[id]->vdev_ishole) {
+ 				vdev_free(rvd->vdev_child[id]);
+ 				break;
+ 			}
+ 		}
  		tvd = vd->vdev_child[c];
  		vdev_remove_child(vd, tvd);
- 		tvd->vdev_id = rvd->vdev_children;
+ 		tvd->vdev_id = id;
  		vdev_add_child(rvd, tvd);
  		vdev_config_dirty(tvd);
  	}
@@@ -3201,7 -3841,7 +3858,8 @@@ spa_vdev_detach(spa_t *spa, uint64_t gu
  	boolean_t unspare = B_FALSE;
  	uint64_t unspare_guid;
  	size_t len;
+ 	char *vdpath;
 +	int t;
  
  	txg = spa_vdev_enter(spa);
  
@@@ -3329,77 -3969,366 +3987,366 @@@
  		(void) spa_vdev_remove(spa, unspare_guid, B_TRUE);
  	}
  
- 	/*
- 	 * If the parent mirror/replacing vdev only has one child,
- 	 * the parent is no longer needed.  Remove it from the tree.
- 	 */
- 	if (pvd->vdev_children == 1)
- 		vdev_remove_parent(cvd);
+ 	/*
+ 	 * If the parent mirror/replacing vdev only has one child,
+ 	 * the parent is no longer needed.  Remove it from the tree.
+ 	 */
+ 	if (pvd->vdev_children == 1)
+ 		vdev_remove_parent(cvd);
+ 
+ 	/*
+ 	 * We don't set tvd until now because the parent we just removed
+ 	 * may have been the previous top-level vdev.
+ 	 */
+ 	tvd = cvd->vdev_top;
+ 	ASSERT(tvd->vdev_parent == rvd);
+ 
+ 	/*
+ 	 * Reevaluate the parent vdev state.
+ 	 */
+ 	vdev_propagate_state(cvd);
+ 
+ 	/*
+ 	 * If the 'autoexpand' property is set on the pool then automatically
+ 	 * try to expand the size of the pool. For example if the device we
+ 	 * just detached was smaller than the others, it may be possible to
+ 	 * add metaslabs (i.e. grow the pool). We need to reopen the vdev
+ 	 * first so that we can obtain the updated sizes of the leaf vdevs.
+ 	 */
+ 	if (spa->spa_autoexpand) {
+ 		vdev_reopen(tvd);
+ 		vdev_expand(tvd, txg);
+ 	}
+ 
+ 	vdev_config_dirty(tvd);
+ 
+ 	/*
+ 	 * Mark vd's DTL as dirty in this txg.  vdev_dtl_sync() will see that
+ 	 * vd->vdev_detached is set and free vd's DTL object in syncing context.
+ 	 * But first make sure we're not on any *other* txg's DTL list, to
+ 	 * prevent vd from being accessed after it's freed.
+ 	 */
+ 	vdpath = spa_strdup(vd->vdev_path);
 -	for (int t = 0; t < TXG_SIZE; t++)
++	for (t = 0; t < TXG_SIZE; t++)
+ 		(void) txg_list_remove_this(&tvd->vdev_dtl_list, vd, t);
+ 	vd->vdev_detached = B_TRUE;
+ 	vdev_dirty(tvd, VDD_DTL, vd, txg);
+ 
+ 	spa_event_notify(spa, vd, ESC_ZFS_VDEV_REMOVE);
+ 
+ 	error = spa_vdev_exit(spa, vd, txg, 0);
+ 
+ 	spa_history_log_internal(LOG_POOL_VDEV_DETACH, spa, NULL,
+ 	    "vdev=%s", vdpath);
+ 	spa_strfree(vdpath);
+ 
+ 	/*
+ 	 * If this was the removal of the original device in a hot spare vdev,
+ 	 * then we want to go through and remove the device from the hot spare
+ 	 * list of every other pool.
+ 	 */
+ 	if (unspare) {
+ 		spa_t *myspa = spa;
+ 		spa = NULL;
+ 		mutex_enter(&spa_namespace_lock);
+ 		while ((spa = spa_next(spa)) != NULL) {
+ 			if (spa->spa_state != POOL_STATE_ACTIVE)
+ 				continue;
+ 			if (spa == myspa)
+ 				continue;
+ 			spa_open_ref(spa, FTAG);
+ 			mutex_exit(&spa_namespace_lock);
+ 			(void) spa_vdev_remove(spa, unspare_guid,
+ 			    B_TRUE);
+ 			mutex_enter(&spa_namespace_lock);
+ 			spa_close(spa, FTAG);
+ 		}
+ 		mutex_exit(&spa_namespace_lock);
+ 	}
+ 
+ 	return (error);
+ }
+ 
+ /*
+  * Split a set of devices from their mirrors, and create a new pool from them.
+  */
+ int
+ spa_vdev_split_mirror(spa_t *spa, char *newname, nvlist_t *config,
+     nvlist_t *props, boolean_t exp)
+ {
+ 	int error = 0;
+ 	uint64_t txg, *glist;
+ 	spa_t *newspa;
+ 	uint_t c, children, lastlog;
+ 	nvlist_t **child, *nvl, *tmp;
+ 	dmu_tx_t *tx;
+ 	char *altroot = NULL;
+ 	vdev_t *rvd, **vml = NULL;			/* vdev modify list */
+ 	boolean_t activate_slog;
+ 
+ 	if (!spa_writeable(spa))
+ 		return (EROFS);
+ 
+ 	txg = spa_vdev_enter(spa);
+ 
+ 	/* clear the log and flush everything up to now */
+ 	activate_slog = spa_passivate_log(spa);
+ 	(void) spa_vdev_config_exit(spa, NULL, txg, 0, FTAG);
+ 	error = spa_offline_log(spa);
+ 	txg = spa_vdev_config_enter(spa);
+ 
+ 	if (activate_slog)
+ 		spa_activate_log(spa);
+ 
+ 	if (error != 0)
+ 		return (spa_vdev_exit(spa, NULL, txg, error));
+ 
+ 	/* check new spa name before going any further */
+ 	if (spa_lookup(newname) != NULL)
+ 		return (spa_vdev_exit(spa, NULL, txg, EEXIST));
+ 
+ 	/*
+ 	 * scan through all the children to ensure they're all mirrors
+ 	 */
+ 	if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvl) != 0 ||
+ 	    nvlist_lookup_nvlist_array(nvl, ZPOOL_CONFIG_CHILDREN, &child,
+ 	    &children) != 0)
+ 		return (spa_vdev_exit(spa, NULL, txg, EINVAL));
+ 
+ 	/* first, check to ensure we've got the right child count */
+ 	rvd = spa->spa_root_vdev;
+ 	lastlog = 0;
+ 	for (c = 0; c < rvd->vdev_children; c++) {
+ 		vdev_t *vd = rvd->vdev_child[c];
+ 
+ 		/* don't count the holes & logs as children */
+ 		if (vd->vdev_islog || vd->vdev_ishole) {
+ 			if (lastlog == 0)
+ 				lastlog = c;
+ 			continue;
+ 		}
+ 
+ 		lastlog = 0;
+ 	}
+ 	if (children != (lastlog != 0 ? lastlog : rvd->vdev_children))
+ 		return (spa_vdev_exit(spa, NULL, txg, EINVAL));
+ 
+ 	/* next, ensure no spare or cache devices are part of the split */
+ 	if (nvlist_lookup_nvlist(nvl, ZPOOL_CONFIG_SPARES, &tmp) == 0 ||
+ 	    nvlist_lookup_nvlist(nvl, ZPOOL_CONFIG_L2CACHE, &tmp) == 0)
+ 		return (spa_vdev_exit(spa, NULL, txg, EINVAL));
+ 
+ 	vml = kmem_zalloc(children * sizeof (vdev_t *), KM_SLEEP);
+ 	glist = kmem_zalloc(children * sizeof (uint64_t), KM_SLEEP);
+ 
+ 	/* then, loop over each vdev and validate it */
+ 	for (c = 0; c < children; c++) {
+ 		uint64_t is_hole = 0;
+ 
+ 		(void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_HOLE,
+ 		    &is_hole);
+ 
+ 		if (is_hole != 0) {
+ 			if (spa->spa_root_vdev->vdev_child[c]->vdev_ishole ||
+ 			    spa->spa_root_vdev->vdev_child[c]->vdev_islog) {
+ 				continue;
+ 			} else {
+ 				error = EINVAL;
+ 				break;
+ 			}
+ 		}
+ 
+ 		/* which disk is going to be split? */
+ 		if (nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_GUID,
+ 		    &glist[c]) != 0) {
+ 			error = EINVAL;
+ 			break;
+ 		}
+ 
+ 		/* look it up in the spa */
+ 		vml[c] = spa_lookup_by_guid(spa, glist[c], B_FALSE);
+ 		if (vml[c] == NULL) {
+ 			error = ENODEV;
+ 			break;
+ 		}
+ 
+ 		/* make sure there's nothing stopping the split */
+ 		if (vml[c]->vdev_parent->vdev_ops != &vdev_mirror_ops ||
+ 		    vml[c]->vdev_islog ||
+ 		    vml[c]->vdev_ishole ||
+ 		    vml[c]->vdev_isspare ||
+ 		    vml[c]->vdev_isl2cache ||
+ 		    !vdev_writeable(vml[c]) ||
+ 		    vml[c]->vdev_children != 0 ||
+ 		    vml[c]->vdev_state != VDEV_STATE_HEALTHY ||
+ 		    c != spa->spa_root_vdev->vdev_child[c]->vdev_id) {
+ 			error = EINVAL;
+ 			break;
+ 		}
+ 
+ 		if (vdev_dtl_required(vml[c])) {
+ 			error = EBUSY;
+ 			break;
+ 		}
+ 
+ 		/* we need certain info from the top level */
+ 		VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_METASLAB_ARRAY,
+ 		    vml[c]->vdev_top->vdev_ms_array) == 0);
+ 		VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_METASLAB_SHIFT,
+ 		    vml[c]->vdev_top->vdev_ms_shift) == 0);
+ 		VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_ASIZE,
+ 		    vml[c]->vdev_top->vdev_asize) == 0);
+ 		VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_ASHIFT,
+ 		    vml[c]->vdev_top->vdev_ashift) == 0);
+ 	}
+ 
+ 	if (error != 0) {
+ 		kmem_free(vml, children * sizeof (vdev_t *));
+ 		kmem_free(glist, children * sizeof (uint64_t));
+ 		return (spa_vdev_exit(spa, NULL, txg, error));
+ 	}
+ 
+ 	/* stop writers from using the disks */
+ 	for (c = 0; c < children; c++) {
+ 		if (vml[c] != NULL)
+ 			vml[c]->vdev_offline = B_TRUE;
+ 	}
+ 	vdev_reopen(spa->spa_root_vdev);
  
  	/*
- 	 * We don't set tvd until now because the parent we just removed
- 	 * may have been the previous top-level vdev.
+ 	 * Temporarily record the splitting vdevs in the spa config.  This
+ 	 * will disappear once the config is regenerated.
  	 */
- 	tvd = cvd->vdev_top;
- 	ASSERT(tvd->vdev_parent == rvd);
+ 	VERIFY(nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP) == 0);
+ 	VERIFY(nvlist_add_uint64_array(nvl, ZPOOL_CONFIG_SPLIT_LIST,
+ 	    glist, children) == 0);
+ 	kmem_free(glist, children * sizeof (uint64_t));
  
- 	/*
- 	 * Reevaluate the parent vdev state.
- 	 */
- 	vdev_propagate_state(cvd);
+ 	mutex_enter(&spa->spa_props_lock);
+ 	VERIFY(nvlist_add_nvlist(spa->spa_config, ZPOOL_CONFIG_SPLIT,
+ 	    nvl) == 0);
+ 	mutex_exit(&spa->spa_props_lock);
+ 	spa->spa_config_splitting = nvl;
+ 	vdev_config_dirty(spa->spa_root_vdev);
+ 
+ 	/* configure and create the new pool */
+ 	VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, newname) == 0);
+ 	VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE,
+ 	    exp ? POOL_STATE_EXPORTED : POOL_STATE_ACTIVE) == 0);
+ 	VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_VERSION,
+ 	    spa_version(spa)) == 0);
+ 	VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_TXG,
+ 	    spa->spa_config_txg) == 0);
+ 	VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_GUID,
+ 	    spa_generate_guid(NULL)) == 0);
+ 	(void) nvlist_lookup_string(props,
+ 	    zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot);
  
- 	/*
- 	 * If the 'autoexpand' property is set on the pool then automatically
- 	 * try to expand the size of the pool. For example if the device we
- 	 * just detached was smaller than the others, it may be possible to
- 	 * add metaslabs (i.e. grow the pool). We need to reopen the vdev
- 	 * first so that we can obtain the updated sizes of the leaf vdevs.
- 	 */
- 	if (spa->spa_autoexpand) {
- 		vdev_reopen(tvd);
- 		vdev_expand(tvd, txg);
+ 	/* add the new pool to the namespace */
+ 	newspa = spa_add(newname, config, altroot);
+ 	newspa->spa_config_txg = spa->spa_config_txg;
+ 	spa_set_log_state(newspa, SPA_LOG_CLEAR);
+ 
+ 	/* release the spa config lock, retaining the namespace lock */
+ 	spa_vdev_config_exit(spa, NULL, txg, 0, FTAG);
+ 
+ 	if (zio_injection_enabled)
+ 		zio_handle_panic_injection(spa, FTAG, 1);
+ 
+ 	spa_activate(newspa, spa_mode_global);
+ 	spa_async_suspend(newspa);
+ 
+ 	/* create the new pool from the disks of the original pool */
+ 	error = spa_load(newspa, SPA_LOAD_IMPORT, SPA_IMPORT_ASSEMBLE, B_TRUE);
+ 	if (error)
+ 		goto out;
+ 
+ 	/* if that worked, generate a real config for the new pool */
+ 	if (newspa->spa_root_vdev != NULL) {
+ 		VERIFY(nvlist_alloc(&newspa->spa_config_splitting,
+ 		    NV_UNIQUE_NAME, KM_SLEEP) == 0);
+ 		VERIFY(nvlist_add_uint64(newspa->spa_config_splitting,
+ 		    ZPOOL_CONFIG_SPLIT_GUID, spa_guid(spa)) == 0);
+ 		spa_config_set(newspa, spa_config_generate(newspa, NULL, -1ULL,
+ 		    B_TRUE));
  	}
  
- 	vdev_config_dirty(tvd);
+ 	/* set the props */
+ 	if (props != NULL) {
+ 		spa_configfile_set(newspa, props, B_FALSE);
+ 		error = spa_prop_set(newspa, props);
+ 		if (error)
+ 			goto out;
+ 	}
  
- 	/*
- 	 * Mark vd's DTL as dirty in this txg.  vdev_dtl_sync() will see that
- 	 * vd->vdev_detached is set and free vd's DTL object in syncing context.
- 	 * But first make sure we're not on any *other* txg's DTL list, to
- 	 * prevent vd from being accessed after it's freed.
- 	 */
- 	for (t = 0; t < TXG_SIZE; t++)
- 		(void) txg_list_remove_this(&tvd->vdev_dtl_list, vd, t);
- 	vd->vdev_detached = B_TRUE;
- 	vdev_dirty(tvd, VDD_DTL, vd, txg);
+ 	/* flush everything */
+ 	txg = spa_vdev_config_enter(newspa);
+ 	vdev_config_dirty(newspa->spa_root_vdev);
+ 	(void) spa_vdev_config_exit(newspa, NULL, txg, 0, FTAG);
  
- 	spa_event_notify(spa, vd, ESC_ZFS_VDEV_REMOVE);
+ 	if (zio_injection_enabled)
+ 		zio_handle_panic_injection(spa, FTAG, 2);
  
- 	error = spa_vdev_exit(spa, vd, txg, 0);
+ 	spa_async_resume(newspa);
  
- 	/*
- 	 * If this was the removal of the original device in a hot spare vdev,
- 	 * then we want to go through and remove the device from the hot spare
- 	 * list of every other pool.
- 	 */
- 	if (unspare) {
- 		spa_t *myspa = spa;
- 		spa = NULL;
- 		mutex_enter(&spa_namespace_lock);
- 		while ((spa = spa_next(spa)) != NULL) {
- 			if (spa->spa_state != POOL_STATE_ACTIVE)
- 				continue;
- 			if (spa == myspa)
- 				continue;
- 			spa_open_ref(spa, FTAG);
- 			mutex_exit(&spa_namespace_lock);
- 			(void) spa_vdev_remove(spa, unspare_guid, B_TRUE);
- 			mutex_enter(&spa_namespace_lock);
- 			spa_close(spa, FTAG);
+ 	/* finally, update the original pool's config */
+ 	txg = spa_vdev_config_enter(spa);
+ 	tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
+ 	error = dmu_tx_assign(tx, TXG_WAIT);
+ 	if (error != 0)
+ 		dmu_tx_abort(tx);
+ 	for (c = 0; c < children; c++) {
+ 		if (vml[c] != NULL) {
+ 			vdev_split(vml[c]);
+ 			if (error == 0)
+ 				spa_history_log_internal(LOG_POOL_VDEV_DETACH,
+ 				    spa, tx, "vdev=%s",
+ 				    vml[c]->vdev_path);
+ 			vdev_free(vml[c]);
  		}
- 		mutex_exit(&spa_namespace_lock);
  	}
+ 	vdev_config_dirty(spa->spa_root_vdev);
+ 	spa->spa_config_splitting = NULL;
+ 	nvlist_free(nvl);
+ 	if (error == 0)
+ 		dmu_tx_commit(tx);
+ 	(void) spa_vdev_exit(spa, NULL, txg, 0);
+ 
+ 	if (zio_injection_enabled)
+ 		zio_handle_panic_injection(spa, FTAG, 3);
+ 
+ 	/* split is complete; log a history record */
+ 	spa_history_log_internal(LOG_POOL_SPLIT, newspa, NULL,
+ 	    "split new pool %s from pool %s", newname, spa_name(spa));
+ 
+ 	kmem_free(vml, children * sizeof (vdev_t *));
+ 
+ 	/* if we're not going to mount the filesystems in userland, export */
+ 	if (exp)
+ 		error = spa_export_common(newname, POOL_STATE_EXPORTED, NULL,
+ 		    B_FALSE, B_FALSE);
+ 
+ 	return (error);
+ 
+ out:
+ 	spa_unload(newspa);
+ 	spa_deactivate(newspa);
+ 	spa_remove(newspa);
+ 
+ 	txg = spa_vdev_config_enter(spa);
+ 
+ 	/* re-online all offlined disks */
+ 	for (c = 0; c < children; c++) {
+ 		if (vml[c] != NULL)
+ 			vml[c]->vdev_offline = B_FALSE;
+ 	}
+ 	vdev_reopen(spa->spa_root_vdev);
+ 
+ 	nvlist_free(spa->spa_config_splitting);
+ 	spa->spa_config_splitting = NULL;
+ 	(void) spa_vdev_exit(spa, NULL, txg, error);
  
+ 	kmem_free(vml, children * sizeof (vdev_t *));
  	return (error);
  }
  
@@@ -3685,12 -4755,21 +4777,23 @@@ spa_scan(spa_t *spa, pool_scan_func_t f
  static void
  spa_async_remove(spa_t *spa, vdev_t *vd)
  {
 +	int c;
 +
  	if (vd->vdev_remove_wanted) {
- 		vd->vdev_remove_wanted = 0;
+ 		vd->vdev_remove_wanted = B_FALSE;
+ 		vd->vdev_delayed_close = B_FALSE;
  		vdev_set_state(vd, B_FALSE, VDEV_STATE_REMOVED, VDEV_AUX_NONE);
- 		vdev_clear(spa, vd);
+ 
+ 		/*
+ 		 * We want to clear the stats, but we don't want to do a full
+ 		 * vdev_clear() as that will cause us to throw away
+ 		 * degraded/faulted state as well as attempt to reopen the
+ 		 * device, all of which is a waste.
+ 		 */
+ 		vd->vdev_stat.vs_read_errors = 0;
+ 		vd->vdev_stat.vs_write_errors = 0;
+ 		vd->vdev_stat.vs_checksum_errors = 0;
+ 
  		vdev_state_dirty(vd->vdev_top);
  	}
  
@@@ -3701,10 -4780,8 +4804,10 @@@
  static void
  spa_async_probe(spa_t *spa, vdev_t *vd)
  {
 +	int c;
 +
  	if (vd->vdev_probe_wanted) {
- 		vd->vdev_probe_wanted = 0;
+ 		vd->vdev_probe_wanted = B_FALSE;
  		vdev_reopen(vd);	/* vdev_open() does the actual probe */
  	}
  
@@@ -3785,11 -4860,11 +4887,11 @@@ spa_async_thread(spa_t *spa
  	 * See if any devices need to be marked REMOVED.
  	 */
  	if (tasks & SPA_ASYNC_REMOVE) {
- 		spa_vdev_state_enter(spa);
+ 		spa_vdev_state_enter(spa, SCL_NONE);
  		spa_async_remove(spa, spa->spa_root_vdev);
 -		for (int i = 0; i < spa->spa_l2cache.sav_count; i++)
 +		for (i = 0; i < spa->spa_l2cache.sav_count; i++)
  			spa_async_remove(spa, spa->spa_l2cache.sav_vdevs[i]);
 -		for (int i = 0; i < spa->spa_spares.sav_count; i++)
 +		for (i = 0; i < spa->spa_spares.sav_count; i++)
  			spa_async_remove(spa, spa->spa_spares.sav_vdevs[i]);
  		(void) spa_vdev_state_exit(spa, NULL, 0);
  	}
@@@ -4146,9 -5247,7 +5274,8 @@@ spa_sync(spa_t *spa, uint64_t txg
  	vdev_t *rvd = spa->spa_root_vdev;
  	vdev_t *vd;
  	dmu_tx_t *tx;
- 	int dirty_vdevs;
  	int error;
 +	int c;
  
  	/*
  	 * Lock out configuration changes.
diff --cc module/zfs/spa_misc.c
index 88ae172b4,52af7fcb7..20946c4e7
--- a/module/zfs/spa_misc.c
+++ b/module/zfs/spa_misc.c
@@@ -433,6 -424,6 +433,7 @@@ spa_add(const char *name, nvlist_t *con
  {
  	spa_t *spa;
  	spa_config_dirent_t *dp;
++	int t;
  
  	ASSERT(MUTEX_HELD(&spa_namespace_lock));
  
@@@ -450,6 -444,9 +454,9 @@@
  	cv_init(&spa->spa_scrub_io_cv, NULL, CV_DEFAULT, NULL);
  	cv_init(&spa->spa_suspend_cv, NULL, CV_DEFAULT, NULL);
  
 -	for (int t = 0; t < TXG_SIZE; t++)
++	for (t = 0; t < TXG_SIZE; t++)
+ 		bplist_create(&spa->spa_free_bplist[t]);
+ 
  	(void) strlcpy(spa->spa_name, name, sizeof (spa->spa_name));
  	spa->spa_state = POOL_STATE_UNINITIALIZED;
  	spa->spa_freeze_txg = UINT64_MAX;
@@@ -492,6 -493,6 +503,7 @@@ voi
  spa_remove(spa_t *spa)
  {
  	spa_config_dirent_t *dp;
++	int t;
  
  	ASSERT(MUTEX_HELD(&spa_namespace_lock));
  	ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED);
@@@ -519,7 -522,11 +533,11 @@@
  
  	spa_config_lock_destroy(spa);
  
 -	for (int t = 0; t < TXG_SIZE; t++)
++	for (t = 0; t < TXG_SIZE; t++)
+ 		bplist_destroy(&spa->spa_free_bplist[t]);
+ 
  	cv_destroy(&spa->spa_async_cv);
+ 	cv_destroy(&spa->spa_proc_cv);
  	cv_destroy(&spa->spa_scrub_io_cv);
  	cv_destroy(&spa->spa_suspend_cv);
  
@@@ -1302,24 -1419,52 +1430,54 @@@ spa_max_replication(spa_t *spa
  	return (MIN(SPA_DVAS_PER_BP, spa_max_replication_override));
  }
  
+ int
+ spa_prev_software_version(spa_t *spa)
+ {
+ 	return (spa->spa_prev_software_version);
+ }
+ 
  uint64_t
- bp_get_dasize(spa_t *spa, const blkptr_t *bp)
+ dva_get_dsize_sync(spa_t *spa, const dva_t *dva)
  {
- 	int sz = 0, i;
+ 	uint64_t asize = DVA_GET_ASIZE(dva);
+ 	uint64_t dsize = asize;
  
- 	if (!spa->spa_deflate)
- 		return (BP_GET_ASIZE(bp));
+ 	ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0);
  
- 	spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
- 	for (i = 0; i < SPA_DVAS_PER_BP; i++) {
- 		vdev_t *vd =
- 		    vdev_lookup_top(spa, DVA_GET_VDEV(&bp->blk_dva[i]));
- 		if (vd)
- 			sz += (DVA_GET_ASIZE(&bp->blk_dva[i]) >>
- 			    SPA_MINBLOCKSHIFT) * vd->vdev_deflate_ratio;
+ 	if (asize != 0 && spa->spa_deflate) {
+ 		vdev_t *vd = vdev_lookup_top(spa, DVA_GET_VDEV(dva));
+ 		dsize = (asize >> SPA_MINBLOCKSHIFT) * vd->vdev_deflate_ratio;
  	}
+ 
+ 	return (dsize);
+ }
+ 
+ uint64_t
+ bp_get_dsize_sync(spa_t *spa, const blkptr_t *bp)
+ {
+ 	uint64_t dsize = 0;
++	int d;
+ 
 -	for (int d = 0; d < SPA_DVAS_PER_BP; d++)
++	for (d = 0; d < SPA_DVAS_PER_BP; d++)
+ 		dsize += dva_get_dsize_sync(spa, &bp->blk_dva[d]);
+ 
+ 	return (dsize);
+ }
+ 
+ uint64_t
+ bp_get_dsize(spa_t *spa, const blkptr_t *bp)
+ {
+ 	uint64_t dsize = 0;
++	int d;
+ 
+ 	spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
+ 
 -	for (int d = 0; d < SPA_DVAS_PER_BP; d++)
++	for (d = 0; d < SPA_DVAS_PER_BP; d++)
+ 		dsize += dva_get_dsize_sync(spa, &bp->blk_dva[d]);
+ 
  	spa_config_exit(spa, SCL_VDEV, FTAG);
- 	return (sz);
+ 
+ 	return (dsize);
  }
  
  /*
diff --cc module/zfs/vdev.c
index cb4a3e252,a61f29b8e..e4c1a7707
--- a/module/zfs/vdev.c
+++ b/module/zfs/vdev.c
@@@ -1016,13 -1058,35 +1067,38 @@@ vdev_open_child(void *arg
  	vd->vdev_open_thread = NULL;
  }
  
+ boolean_t
+ vdev_uses_zvols(vdev_t *vd)
+ {
++	int c;
++
+ 	if (vd->vdev_path && strncmp(vd->vdev_path, ZVOL_DIR,
+ 	    strlen(ZVOL_DIR)) == 0)
+ 		return (B_TRUE);
 -	for (int c = 0; c < vd->vdev_children; c++)
++	for (c = 0; c < vd->vdev_children; c++)
+ 		if (vdev_uses_zvols(vd->vdev_child[c]))
+ 			return (B_TRUE);
+ 	return (B_FALSE);
+ }
+ 
  void
  vdev_open_children(vdev_t *vd)
  {
  	taskq_t *tq;
  	int children = vd->vdev_children;
 +	int c;
  
+ 	/*
+ 	 * in order to handle pools on top of zvols, do the opens
+ 	 * in a single thread so that the same thread holds the
+ 	 * spa_namespace_lock
+ 	 */
+ 	if (vdev_uses_zvols(vd)) {
 -		for (int c = 0; c < children; c++)
++		for (c = 0; c < children; c++)
+ 			vd->vdev_child[c]->vdev_open_error =
+ 			    vdev_open(vd->vdev_child[c]);
+ 		return;
+ 	}
  	tq = taskq_create("vdev_open", children, minclsyspri,
  	    children, children, TASKQ_PREPOPULATE);
  
@@@ -1090,10 -1177,16 +1190,16 @@@ vdev_open(vdev_t *vd
  		vdev_set_state(vd, B_TRUE, VDEV_STATE_DEGRADED,
  		    VDEV_AUX_ERR_EXCEEDED);
  	} else {
- 		vd->vdev_state = VDEV_STATE_HEALTHY;
+ 		vdev_set_state(vd, B_TRUE, VDEV_STATE_HEALTHY, 0);
  	}
  
+ 	/*
+ 	 * For hole or missing vdevs we just return success.
+ 	 */
+ 	if (vd->vdev_ishole || vd->vdev_ops == &vdev_missing_ops)
+ 		return (0);
+ 
 -	for (int c = 0; c < vd->vdev_children; c++) {
 +	for (c = 0; c < vd->vdev_children; c++) {
  		if (vd->vdev_child[c]->vdev_state != VDEV_STATE_HEALTHY) {
  			vdev_set_state(vd, B_TRUE, VDEV_STATE_DEGRADED,
  			    VDEV_AUX_NONE);
@@@ -1200,11 -1293,10 +1306,11 @@@ vdev_validate(vdev_t *vd
  {
  	spa_t *spa = vd->vdev_spa;
  	nvlist_t *label;
- 	uint64_t guid, top_guid;
+ 	uint64_t guid = 0, top_guid;
  	uint64_t state;
 +	int c;
  
 -	for (int c = 0; c < vd->vdev_children; c++)
 +	for (c = 0; c < vd->vdev_children; c++)
  		if (vdev_validate(vd->vdev_child[c]) != 0)
  			return (EBADF);
  
@@@ -1308,6 -1431,41 +1445,43 @@@ vdev_close(vdev_t *vd
  	vd->vdev_stat.vs_aux = VDEV_AUX_NONE;
  }
  
+ void
+ vdev_hold(vdev_t *vd)
+ {
+ 	spa_t *spa = vd->vdev_spa;
++	int c;
+ 
+ 	ASSERT(spa_is_root(spa));
+ 	if (spa->spa_state == POOL_STATE_UNINITIALIZED)
+ 		return;
+ 
 -	for (int c = 0; c < vd->vdev_children; c++)
++	for (c = 0; c < vd->vdev_children; c++)
+ 		vdev_hold(vd->vdev_child[c]);
+ 
+ 	if (vd->vdev_ops->vdev_op_leaf)
+ 		vd->vdev_ops->vdev_op_hold(vd);
+ }
+ 
+ void
+ vdev_rele(vdev_t *vd)
+ {
+ 	spa_t *spa = vd->vdev_spa;
++	int c;
+ 
+ 	ASSERT(spa_is_root(spa));
 -	for (int c = 0; c < vd->vdev_children; c++)
++	for (c = 0; c < vd->vdev_children; c++)
+ 		vdev_rele(vd->vdev_child[c]);
+ 
+ 	if (vd->vdev_ops->vdev_op_leaf)
+ 		vd->vdev_ops->vdev_op_rele(vd);
+ }
+ 
+ /*
+  * Reopen all interior vdevs and any unopened leaves.  We don't actually
+  * reopen leaf vdevs which had previously been opened as they might deadlock
+  * on the spa_config_lock.  Instead we only obtain the leaf's physical size.
+  * If the leaf has never been opened then open it, as usual.
+  */
  void
  vdev_reopen(vdev_t *vd)
  {
@@@ -1545,7 -1708,9 +1724,9 @@@ vdev_dtl_reassess(vdev_t *vd, uint64_t 
  	}
  
  	mutex_enter(&vd->vdev_dtl_lock);
 -	for (int t = 0; t < DTL_TYPES; t++) {
 +	for (t = 0; t < DTL_TYPES; t++) {
+ 		/* account for child's outage in parent's missing map */
+ 		int s = (t == DTL_MISSING) ? DTL_OUTAGE: t;
  		if (t == DTL_SCRUB)
  			continue;			/* leaf vdevs only */
  		if (t == DTL_PARTIAL)
@@@ -1555,10 -1720,10 +1736,10 @@@
  		else
  			minref = vd->vdev_children;	/* any kind of mirror */
  		space_map_ref_create(&reftree);
 -		for (int c = 0; c < vd->vdev_children; c++) {
 +		for (c = 0; c < vd->vdev_children; c++) {
  			vdev_t *cvd = vd->vdev_child[c];
  			mutex_enter(&cvd->vdev_dtl_lock);
- 			space_map_ref_add_map(&reftree, &cvd->vdev_dtl[t], 1);
+ 			space_map_ref_add_map(&reftree, &cvd->vdev_dtl[s], 1);
  			mutex_exit(&cvd->vdev_dtl_lock);
  		}
  		space_map_ref_generate_map(&reftree, &vd->vdev_dtl[t], minref);
@@@ -1803,6 -1969,42 +1988,43 @@@ vdev_validate_aux(vdev_t *vd
  	return (0);
  }
  
+ void
+ vdev_remove(vdev_t *vd, uint64_t txg)
+ {
+ 	spa_t *spa = vd->vdev_spa;
+ 	objset_t *mos = spa->spa_meta_objset;
+ 	dmu_tx_t *tx;
++	int m;
+ 
+ 	tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg);
+ 
+ 	if (vd->vdev_dtl_smo.smo_object) {
+ 		ASSERT3U(vd->vdev_dtl_smo.smo_alloc, ==, 0);
+ 		(void) dmu_object_free(mos, vd->vdev_dtl_smo.smo_object, tx);
+ 		vd->vdev_dtl_smo.smo_object = 0;
+ 	}
+ 
+ 	if (vd->vdev_ms != NULL) {
 -		for (int m = 0; m < vd->vdev_ms_count; m++) {
++		for (m = 0; m < vd->vdev_ms_count; m++) {
+ 			metaslab_t *msp = vd->vdev_ms[m];
+ 
+ 			if (msp == NULL || msp->ms_smo.smo_object == 0)
+ 				continue;
+ 
+ 			ASSERT3U(msp->ms_smo.smo_alloc, ==, 0);
+ 			(void) dmu_object_free(mos, msp->ms_smo.smo_object, tx);
+ 			msp->ms_smo.smo_object = 0;
+ 		}
+ 	}
+ 
+ 	if (vd->vdev_ms_array) {
+ 		(void) dmu_object_free(mos, vd->vdev_ms_array, tx);
+ 		vd->vdev_ms_array = 0;
+ 		vd->vdev_ms_shift = 0;
+ 	}
+ 	dmu_tx_commit(tx);
+ }
+ 
  void
  vdev_sync_done(vdev_t *vd, uint64_t txg)
  {
@@@ -2201,6 -2484,19 +2506,20 @@@ vdev_clear_stats(vdev_t *vd
  	mutex_exit(&vd->vdev_stat_lock);
  }
  
+ void
+ vdev_scan_stat_init(vdev_t *vd)
+ {
+ 	vdev_stat_t *vs = &vd->vdev_stat;
++	int c;
+ 
 -	for (int c = 0; c < vd->vdev_children; c++)
++	for (c = 0; c < vd->vdev_children; c++)
+ 		vdev_scan_stat_init(vd->vdev_child[c]);
+ 
+ 	mutex_enter(&vd->vdev_stat_lock);
+ 	vs->vs_scan_processed = 0;
+ 	mutex_exit(&vd->vdev_stat_lock);
+ }
+ 
  void
  vdev_stat_update(zio_t *zio, uint64_t psize)
  {
@@@ -2536,12 -2827,17 +2850,18 @@@ vdev_propagate_state(vdev_t *vd
  	int degraded = 0, faulted = 0;
  	int corrupted = 0;
  	vdev_t *child;
 +	int c;
  
  	if (vd->vdev_children > 0) {
 -		for (int c = 0; c < vd->vdev_children; c++) {
 +		for (c = 0; c < vd->vdev_children; c++) {
  			child = vd->vdev_child[c];
  
+ 			/*
+ 			 * Don't factor holes into the decision.
+ 			 */
+ 			if (child->vdev_ishole)
+ 				continue;
+ 
  			if (!vdev_readable(child) ||
  			    (!vdev_writeable(child) && spa_writeable(spa))) {
  				/*
@@@ -2737,24 -3039,24 +3065,25 @@@ vdev_is_bootable(vdev_t *vd
  	return (B_TRUE);
  }
  
+ /*
+  * Load the state from the original vdev tree (ovd) which
+  * we've retrieved from the MOS config object. If the original
+  * vdev was offline then we transfer that state to the device
+  * in the current vdev tree (nvd).
+  */
  void
- vdev_load_log_state(vdev_t *vd, nvlist_t *nv)
+ vdev_load_log_state(vdev_t *nvd, vdev_t *ovd)
  {
- 	uint_t children;
- 	nvlist_t **child;
- 	uint64_t val;
- 	spa_t *spa = vd->vdev_spa;
+ 	spa_t *spa = nvd->vdev_spa;
 +	int c;
  
- 	if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
- 	    &child, &children) == 0) {
- 		for (c = 0; c < children; c++)
- 			vdev_load_log_state(vd->vdev_child[c], child[c]);
- 	}
+ 	ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);
+ 	ASSERT3U(nvd->vdev_guid, ==, ovd->vdev_guid);
  
- 	if (vd->vdev_ops->vdev_op_leaf && nvlist_lookup_uint64(nv,
- 	    ZPOOL_CONFIG_OFFLINE, &val) == 0 && val) {
 -	for (int c = 0; c < nvd->vdev_children; c++)
++	for (c = 0; c < nvd->vdev_children; c++)
+ 		vdev_load_log_state(nvd->vdev_child[c], ovd->vdev_child[c]);
  
+ 	if (nvd->vdev_ops->vdev_op_leaf && ovd->vdev_offline) {
  		/*
  		 * It would be nice to call vdev_offline()
  		 * directly but the pool isn't fully loaded and
diff --cc module/zfs/zio.c
index 6efce7705,88d80af4e..520639e06
--- a/module/zfs/zio.c
+++ b/module/zfs/zio.c
@@@ -904,10 -991,10 +992,10 @@@ zio_write_bp_init(zio_t *zio
  	 * spa_sync() to allocate new blocks, but force rewrites after that.
  	 * There should only be a handful of blocks after pass 1 in any case.
  	 */
- 	if (bp->blk_birth == zio->io_txg && BP_GET_PSIZE(bp) == csize &&
+ 	if (bp->blk_birth == zio->io_txg && BP_GET_PSIZE(bp) == psize &&
  	    pass > SYNC_PASS_REWRITE) {
- 		uint32_t gang_stages = zio->io_pipeline & ZIO_GANG_STAGES;
- 		ASSERT(csize != 0);
 -		ASSERT(psize != 0);
+ 		enum zio_stage gang_stages = zio->io_pipeline & ZIO_GANG_STAGES;
++		ASSERT(psize != 0);
  		zio->io_pipeline = ZIO_REWRITE_PIPELINE | gang_stages;
  		zio->io_flags |= ZIO_FLAG_IO_REWRITE;
  	} else {
@@@ -1113,24 -1234,13 +1237,13 @@@ zio_reexecute(zio_t *pio
  	pio->io_pipeline = pio->io_orig_pipeline;
  	pio->io_reexecute = 0;
  	pio->io_error = 0;
 -	for (int w = 0; w < ZIO_WAIT_TYPES; w++)
 +	for (w = 0; w < ZIO_WAIT_TYPES; w++)
  		pio->io_state[w] = 0;
 -	for (int c = 0; c < ZIO_CHILD_TYPES; c++)
 +	for (c = 0; c < ZIO_CHILD_TYPES; c++)
  		pio->io_child_error[c] = 0;
  
- 	if (IO_IS_ALLOCATING(pio)) {
- 		/*
- 		 * Remember the failed bp so that the io_ready() callback
- 		 * can update its accounting upon reexecution.  The block
- 		 * was already freed in zio_done(); we indicate this with
- 		 * a fill count of -1 so that zio_free() knows to skip it.
- 		 */
- 		blkptr_t *bp = pio->io_bp;
- 		ASSERT(bp->blk_birth == 0 || bp->blk_birth == pio->io_txg);
- 		bp->blk_fill = BLK_FILL_ALREADY_FREED;
- 		pio->io_bp_orig = *bp;
- 		BP_ZERO(bp);
- 	}
+ 	if (IO_IS_ALLOCATING(pio))
+ 		BP_ZERO(pio->io_bp);
  
  	/*
  	 * As we reexecute pio's children, new children could be created.
@@@ -1416,10 -1530,9 +1535,10 @@@ zio_gang_tree_assemble_done(zio_t *zio
  	zio_t *gio = zio->io_gang_leader;
  	zio_gang_node_t *gn = zio->io_private;
  	blkptr_t *bp = zio->io_bp;
 +	int g;
  
  	ASSERT(gio == zio_unique_parent(zio));
- 	ASSERT(zio_walk_children(zio) == NULL);
+ 	ASSERT(zio->io_child_count == 0);
  
  	if (zio->io_error)
  		return;
@@@ -1429,9 -1542,9 +1548,9 @@@
  
  	ASSERT(zio->io_data == gn->gn_gbh);
  	ASSERT(zio->io_size == SPA_GANGBLOCKSIZE);
- 	ASSERT(gn->gn_gbh->zg_tail.zbt_magic == ZBT_MAGIC);
+ 	ASSERT(gn->gn_gbh->zg_tail.zec_magic == ZEC_MAGIC);
  
 -	for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) {
 +	for (g = 0; g < SPA_GBH_NBLKPTRS; g++) {
  		blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g];
  		if (!BP_IS_GANG(gbp))
  			continue;
@@@ -1457,9 -1569,9 +1576,9 @@@ zio_gang_tree_issue(zio_t *pio, zio_gan
  	zio = zio_gang_issue_func[gio->io_type](pio, bp, gn, data);
  
  	if (gn != NULL) {
- 		ASSERT(gn->gn_gbh->zg_tail.zbt_magic == ZBT_MAGIC);
+ 		ASSERT(gn->gn_gbh->zg_tail.zec_magic == ZEC_MAGIC);
  
 -		for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) {
 +		for (g = 0; g < SPA_GBH_NBLKPTRS; g++) {
  			blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g];
  			if (BP_IS_HOLE(gbp))
  				continue;
@@@ -1554,13 -1665,13 +1673,13 @@@ zio_write_gang_block(zio_t *pio
  	uint64_t txg = pio->io_txg;
  	uint64_t resid = pio->io_size;
  	uint64_t lsize;
- 	int ndvas = gio->io_prop.zp_ndvas;
- 	int gbh_ndvas = MIN(ndvas + 1, spa_max_replication(spa));
+ 	int copies = gio->io_prop.zp_copies;
+ 	int gbh_copies = MIN(copies + 1, spa_max_replication(spa));
  	zio_prop_t zp;
 -	int error;
 +	int g, error;
  
- 	error = metaslab_alloc(spa, spa->spa_normal_class, SPA_GANGBLOCKSIZE,
- 	    bp, gbh_ndvas, txg, pio == gio ? NULL : gio->io_bp,
+ 	error = metaslab_alloc(spa, spa_normal_class(spa), SPA_GANGBLOCKSIZE,
+ 	    bp, gbh_copies, txg, pio == gio ? NULL : gio->io_bp,
  	    METASLAB_HINTBP_FAVOR | METASLAB_GANG_HEADER);
  	if (error) {
  		pio->io_error = error;
@@@ -1617,10 -1730,378 +1738,380 @@@
  
  /*
   * ==========================================================================
-  * Allocate and free blocks
+  * Dedup
   * ==========================================================================
   */
+ static void
+ zio_ddt_child_read_done(zio_t *zio)
+ {
+ 	blkptr_t *bp = zio->io_bp;
+ 	ddt_entry_t *dde = zio->io_private;
+ 	ddt_phys_t *ddp;
+ 	zio_t *pio = zio_unique_parent(zio);
+ 
+ 	mutex_enter(&pio->io_lock);
+ 	ddp = ddt_phys_select(dde, bp);
+ 	if (zio->io_error == 0)
+ 		ddt_phys_clear(ddp);	/* this ddp doesn't need repair */
+ 	if (zio->io_error == 0 && dde->dde_repair_data == NULL)
+ 		dde->dde_repair_data = zio->io_data;
+ 	else
+ 		zio_buf_free(zio->io_data, zio->io_size);
+ 	mutex_exit(&pio->io_lock);
+ }
+ 
+ static int
+ zio_ddt_read_start(zio_t *zio)
+ {
+ 	blkptr_t *bp = zio->io_bp;
++	int p;
+ 
+ 	ASSERT(BP_GET_DEDUP(bp));
+ 	ASSERT(BP_GET_PSIZE(bp) == zio->io_size);
+ 	ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
+ 
+ 	if (zio->io_child_error[ZIO_CHILD_DDT]) {
+ 		ddt_t *ddt = ddt_select(zio->io_spa, bp);
+ 		ddt_entry_t *dde = ddt_repair_start(ddt, bp);
+ 		ddt_phys_t *ddp = dde->dde_phys;
+ 		ddt_phys_t *ddp_self = ddt_phys_select(dde, bp);
+ 		blkptr_t blk;
+ 
+ 		ASSERT(zio->io_vsd == NULL);
+ 		zio->io_vsd = dde;
+ 
+ 		if (ddp_self == NULL)
+ 			return (ZIO_PIPELINE_CONTINUE);
+ 
 -		for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
++		for (p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
+ 			if (ddp->ddp_phys_birth == 0 || ddp == ddp_self)
+ 				continue;
+ 			ddt_bp_create(ddt->ddt_checksum, &dde->dde_key, ddp,
+ 			    &blk);
+ 			zio_nowait(zio_read(zio, zio->io_spa, &blk,
+ 			    zio_buf_alloc(zio->io_size), zio->io_size,
+ 			    zio_ddt_child_read_done, dde, zio->io_priority,
+ 			    ZIO_DDT_CHILD_FLAGS(zio) | ZIO_FLAG_DONT_PROPAGATE,
+ 			    &zio->io_bookmark));
+ 		}
+ 		return (ZIO_PIPELINE_CONTINUE);
+ 	}
+ 
+ 	zio_nowait(zio_read(zio, zio->io_spa, bp,
+ 	    zio->io_data, zio->io_size, NULL, NULL, zio->io_priority,
+ 	    ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark));
+ 
+ 	return (ZIO_PIPELINE_CONTINUE);
+ }
  
+ static int
+ zio_ddt_read_done(zio_t *zio)
+ {
+ 	blkptr_t *bp = zio->io_bp;
+ 
+ 	if (zio_wait_for_children(zio, ZIO_CHILD_DDT, ZIO_WAIT_DONE))
+ 		return (ZIO_PIPELINE_STOP);
+ 
+ 	ASSERT(BP_GET_DEDUP(bp));
+ 	ASSERT(BP_GET_PSIZE(bp) == zio->io_size);
+ 	ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
+ 
+ 	if (zio->io_child_error[ZIO_CHILD_DDT]) {
+ 		ddt_t *ddt = ddt_select(zio->io_spa, bp);
+ 		ddt_entry_t *dde = zio->io_vsd;
+ 		if (ddt == NULL) {
+ 			ASSERT(spa_load_state(zio->io_spa) != SPA_LOAD_NONE);
+ 			return (ZIO_PIPELINE_CONTINUE);
+ 		}
+ 		if (dde == NULL) {
+ 			zio->io_stage = ZIO_STAGE_DDT_READ_START >> 1;
+ 			zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_FALSE);
+ 			return (ZIO_PIPELINE_STOP);
+ 		}
+ 		if (dde->dde_repair_data != NULL) {
+ 			bcopy(dde->dde_repair_data, zio->io_data, zio->io_size);
+ 			zio->io_child_error[ZIO_CHILD_DDT] = 0;
+ 		}
+ 		ddt_repair_done(ddt, dde);
+ 		zio->io_vsd = NULL;
+ 	}
+ 
+ 	ASSERT(zio->io_vsd == NULL);
+ 
+ 	return (ZIO_PIPELINE_CONTINUE);
+ }
+ 
+ static boolean_t
+ zio_ddt_collision(zio_t *zio, ddt_t *ddt, ddt_entry_t *dde)
+ {
+ 	spa_t *spa = zio->io_spa;
++	int p;
+ 
+ 	/*
+ 	 * Note: we compare the original data, not the transformed data,
+ 	 * because when zio->io_bp is an override bp, we will not have
+ 	 * pushed the I/O transforms.  That's an important optimization
+ 	 * because otherwise we'd compress/encrypt all dmu_sync() data twice.
+ 	 */
 -	for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) {
++	for (p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) {
+ 		zio_t *lio = dde->dde_lead_zio[p];
+ 
+ 		if (lio != NULL) {
+ 			return (lio->io_orig_size != zio->io_orig_size ||
+ 			    bcmp(zio->io_orig_data, lio->io_orig_data,
+ 			    zio->io_orig_size) != 0);
+ 		}
+ 	}
+ 
 -	for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) {
++	for (p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) {
+ 		ddt_phys_t *ddp = &dde->dde_phys[p];
+ 
+ 		if (ddp->ddp_phys_birth != 0) {
+ 			arc_buf_t *abuf = NULL;
+ 			uint32_t aflags = ARC_WAIT;
+ 			blkptr_t blk = *zio->io_bp;
+ 			int error;
+ 
+ 			ddt_bp_fill(ddp, &blk, ddp->ddp_phys_birth);
+ 
+ 			ddt_exit(ddt);
+ 
+ 			error = arc_read_nolock(NULL, spa, &blk,
+ 			    arc_getbuf_func, &abuf, ZIO_PRIORITY_SYNC_READ,
+ 			    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
+ 			    &aflags, &zio->io_bookmark);
+ 
+ 			if (error == 0) {
+ 				if (arc_buf_size(abuf) != zio->io_orig_size ||
+ 				    bcmp(abuf->b_data, zio->io_orig_data,
+ 				    zio->io_orig_size) != 0)
+ 					error = EEXIST;
+ 				VERIFY(arc_buf_remove_ref(abuf, &abuf) == 1);
+ 			}
+ 
+ 			ddt_enter(ddt);
+ 			return (error != 0);
+ 		}
+ 	}
+ 
+ 	return (B_FALSE);
+ }
+ 
+ static void
+ zio_ddt_child_write_ready(zio_t *zio)
+ {
+ 	int p = zio->io_prop.zp_copies;
+ 	ddt_t *ddt = ddt_select(zio->io_spa, zio->io_bp);
+ 	ddt_entry_t *dde = zio->io_private;
+ 	ddt_phys_t *ddp = &dde->dde_phys[p];
+ 	zio_t *pio;
+ 
+ 	if (zio->io_error)
+ 		return;
+ 
+ 	ddt_enter(ddt);
+ 
+ 	ASSERT(dde->dde_lead_zio[p] == zio);
+ 
+ 	ddt_phys_fill(ddp, zio->io_bp);
+ 
+ 	while ((pio = zio_walk_parents(zio)) != NULL)
+ 		ddt_bp_fill(ddp, pio->io_bp, zio->io_txg);
+ 
+ 	ddt_exit(ddt);
+ }
+ 
+ static void
+ zio_ddt_child_write_done(zio_t *zio)
+ {
+ 	int p = zio->io_prop.zp_copies;
+ 	ddt_t *ddt = ddt_select(zio->io_spa, zio->io_bp);
+ 	ddt_entry_t *dde = zio->io_private;
+ 	ddt_phys_t *ddp = &dde->dde_phys[p];
+ 
+ 	ddt_enter(ddt);
+ 
+ 	ASSERT(ddp->ddp_refcnt == 0);
+ 	ASSERT(dde->dde_lead_zio[p] == zio);
+ 	dde->dde_lead_zio[p] = NULL;
+ 
+ 	if (zio->io_error == 0) {
+ 		while (zio_walk_parents(zio) != NULL)
+ 			ddt_phys_addref(ddp);
+ 	} else {
+ 		ddt_phys_clear(ddp);
+ 	}
+ 
+ 	ddt_exit(ddt);
+ }
+ 
+ static void
+ zio_ddt_ditto_write_done(zio_t *zio)
+ {
+ 	int p = DDT_PHYS_DITTO;
+ 	zio_prop_t *zp = &zio->io_prop;
+ 	blkptr_t *bp = zio->io_bp;
+ 	ddt_t *ddt = ddt_select(zio->io_spa, bp);
+ 	ddt_entry_t *dde = zio->io_private;
+ 	ddt_phys_t *ddp = &dde->dde_phys[p];
+ 	ddt_key_t *ddk = &dde->dde_key;
+ 
+ 	ddt_enter(ddt);
+ 
+ 	ASSERT(ddp->ddp_refcnt == 0);
+ 	ASSERT(dde->dde_lead_zio[p] == zio);
+ 	dde->dde_lead_zio[p] = NULL;
+ 
+ 	if (zio->io_error == 0) {
+ 		ASSERT(ZIO_CHECKSUM_EQUAL(bp->blk_cksum, ddk->ddk_cksum));
+ 		ASSERT(zp->zp_copies < SPA_DVAS_PER_BP);
+ 		ASSERT(zp->zp_copies == BP_GET_NDVAS(bp) - BP_IS_GANG(bp));
+ 		if (ddp->ddp_phys_birth != 0)
+ 			ddt_phys_free(ddt, ddk, ddp, zio->io_txg);
+ 		ddt_phys_fill(ddp, bp);
+ 	}
+ 
+ 	ddt_exit(ddt);
+ }
+ 
+ static int
+ zio_ddt_write(zio_t *zio)
+ {
+ 	spa_t *spa = zio->io_spa;
+ 	blkptr_t *bp = zio->io_bp;
+ 	uint64_t txg = zio->io_txg;
+ 	zio_prop_t *zp = &zio->io_prop;
+ 	int p = zp->zp_copies;
+ 	int ditto_copies;
+ 	zio_t *cio = NULL;
+ 	zio_t *dio = NULL;
+ 	ddt_t *ddt = ddt_select(spa, bp);
+ 	ddt_entry_t *dde;
+ 	ddt_phys_t *ddp;
+ 
+ 	ASSERT(BP_GET_DEDUP(bp));
+ 	ASSERT(BP_GET_CHECKSUM(bp) == zp->zp_checksum);
+ 	ASSERT(BP_IS_HOLE(bp) || zio->io_bp_override);
+ 
+ 	ddt_enter(ddt);
+ 	dde = ddt_lookup(ddt, bp, B_TRUE);
+ 	ddp = &dde->dde_phys[p];
+ 
+ 	if (zp->zp_dedup_verify && zio_ddt_collision(zio, ddt, dde)) {
+ 		/*
+ 		 * If we're using a weak checksum, upgrade to a strong checksum
+ 		 * and try again.  If we're already using a strong checksum,
+ 		 * we can't resolve it, so just convert to an ordinary write.
+ 		 * (And automatically e-mail a paper to Nature?)
+ 		 */
+ 		if (!zio_checksum_table[zp->zp_checksum].ci_dedup) {
+ 			zp->zp_checksum = spa_dedup_checksum(spa);
+ 			zio_pop_transforms(zio);
+ 			zio->io_stage = ZIO_STAGE_OPEN;
+ 			BP_ZERO(bp);
+ 		} else {
+ 			zp->zp_dedup = 0;
+ 		}
+ 		zio->io_pipeline = ZIO_WRITE_PIPELINE;
+ 		ddt_exit(ddt);
+ 		return (ZIO_PIPELINE_CONTINUE);
+ 	}
+ 
+ 	ditto_copies = ddt_ditto_copies_needed(ddt, dde, ddp);
+ 	ASSERT(ditto_copies < SPA_DVAS_PER_BP);
+ 
+ 	if (ditto_copies > ddt_ditto_copies_present(dde) &&
+ 	    dde->dde_lead_zio[DDT_PHYS_DITTO] == NULL) {
+ 		zio_prop_t czp = *zp;
+ 
+ 		czp.zp_copies = ditto_copies;
+ 
+ 		/*
+ 		 * If we arrived here with an override bp, we won't have run
+ 		 * the transform stack, so we won't have the data we need to
+ 		 * generate a child i/o.  So, toss the override bp and restart.
+ 		 * This is safe, because using the override bp is just an
+ 		 * optimization; and it's rare, so the cost doesn't matter.
+ 		 */
+ 		if (zio->io_bp_override) {
+ 			zio_pop_transforms(zio);
+ 			zio->io_stage = ZIO_STAGE_OPEN;
+ 			zio->io_pipeline = ZIO_WRITE_PIPELINE;
+ 			zio->io_bp_override = NULL;
+ 			BP_ZERO(bp);
+ 			ddt_exit(ddt);
+ 			return (ZIO_PIPELINE_CONTINUE);
+ 		}
+ 
+ 		dio = zio_write(zio, spa, txg, bp, zio->io_orig_data,
+ 		    zio->io_orig_size, &czp, NULL,
+ 		    zio_ddt_ditto_write_done, dde, zio->io_priority,
+ 		    ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark);
+ 
+ 		zio_push_transform(dio, zio->io_data, zio->io_size, 0, NULL);
+ 		dde->dde_lead_zio[DDT_PHYS_DITTO] = dio;
+ 	}
+ 
+ 	if (ddp->ddp_phys_birth != 0 || dde->dde_lead_zio[p] != NULL) {
+ 		if (ddp->ddp_phys_birth != 0)
+ 			ddt_bp_fill(ddp, bp, txg);
+ 		if (dde->dde_lead_zio[p] != NULL)
+ 			zio_add_child(zio, dde->dde_lead_zio[p]);
+ 		else
+ 			ddt_phys_addref(ddp);
+ 	} else if (zio->io_bp_override) {
+ 		ASSERT(bp->blk_birth == txg);
+ 		ASSERT(BP_EQUAL(bp, zio->io_bp_override));
+ 		ddt_phys_fill(ddp, bp);
+ 		ddt_phys_addref(ddp);
+ 	} else {
+ 		cio = zio_write(zio, spa, txg, bp, zio->io_orig_data,
+ 		    zio->io_orig_size, zp, zio_ddt_child_write_ready,
+ 		    zio_ddt_child_write_done, dde, zio->io_priority,
+ 		    ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark);
+ 
+ 		zio_push_transform(cio, zio->io_data, zio->io_size, 0, NULL);
+ 		dde->dde_lead_zio[p] = cio;
+ 	}
+ 
+ 	ddt_exit(ddt);
+ 
+ 	if (cio)
+ 		zio_nowait(cio);
+ 	if (dio)
+ 		zio_nowait(dio);
+ 
+ 	return (ZIO_PIPELINE_CONTINUE);
+ }
+ 
+ ddt_entry_t *freedde; /* for debugging */
+ 
+ static int
+ zio_ddt_free(zio_t *zio)
+ {
+ 	spa_t *spa = zio->io_spa;
+ 	blkptr_t *bp = zio->io_bp;
+ 	ddt_t *ddt = ddt_select(spa, bp);
+ 	ddt_entry_t *dde;
+ 	ddt_phys_t *ddp;
+ 
+ 	ASSERT(BP_GET_DEDUP(bp));
+ 	ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
+ 
+ 	ddt_enter(ddt);
+ 	freedde = dde = ddt_lookup(ddt, bp, B_TRUE);
+ 	ddp = ddt_phys_select(dde, bp);
+ 	ddt_phys_decref(ddp);
+ 	ddt_exit(ddt);
+ 
+ 	return (ZIO_PIPELINE_CONTINUE);
+ }
+ 
+ /*
+  * ==========================================================================
+  * Allocate and free blocks
+  * ==========================================================================
+  */
  static int
  zio_dva_allocate(zio_t *zio)
  {
@@@ -1680,40 -2161,14 +2171,14 @@@ zio_dva_claim(zio_t *zio
  static void
  zio_dva_unallocate(zio_t *zio, zio_gang_node_t *gn, blkptr_t *bp)
  {
- 	spa_t *spa = zio->io_spa;
- 	boolean_t now = !(zio->io_flags & ZIO_FLAG_IO_REWRITE);
- 	int g;
- 
  	ASSERT(bp->blk_birth == zio->io_txg || BP_IS_HOLE(bp));
- 
- 	if (zio->io_bp == bp && !now) {
- 		/*
- 		 * This is a rewrite for sync-to-convergence.
- 		 * We can't do a metaslab_free(NOW) because bp wasn't allocated
- 		 * during this sync pass, which means that metaslab_sync()
- 		 * already committed the allocation.
- 		 */
- 		ASSERT(DVA_EQUAL(BP_IDENTITY(bp),
- 		    BP_IDENTITY(&zio->io_bp_orig)));
- 		ASSERT(spa_sync_pass(spa) > 1);
- 
- 		if (BP_IS_GANG(bp) && gn == NULL) {
- 			/*
- 			 * This is a gang leader whose gang header(s) we
- 			 * couldn't read now, so defer the free until later.
- 			 * The block should still be intact because without
- 			 * the headers, we'd never even start the rewrite.
- 			 */
- 			bplist_enqueue_deferred(&spa->spa_sync_bplist, bp);
- 			return;
- 		}
- 	}
+ 	ASSERT(zio->io_bp_override == NULL);
  
  	if (!BP_IS_HOLE(bp))
- 		metaslab_free(spa, bp, bp->blk_birth, now);
+ 		metaslab_free(zio->io_spa, bp, bp->blk_birth, B_TRUE);
  
  	if (gn != NULL) {
 -		for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) {
 +		for (g = 0; g < SPA_GBH_NBLKPTRS; g++) {
  			zio_dva_unallocate(zio, gn->gn_child[g],
  			    &gn->gn_gbh->zg_blkptr[g]);
  		}