}
}
- for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
+ static void
+ dump_dde(const ddt_t *ddt, const ddt_entry_t *dde, uint64_t index)
+ {
+ const ddt_phys_t *ddp = dde->dde_phys;
+ const ddt_key_t *ddk = &dde->dde_key;
+ const char *types[4] = { "ditto", "single", "double", "triple" };
+ char blkbuf[BP_SPRINTF_LEN];
+ blkptr_t blk;
++ int p;
+
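+ /*
+ * Print one line for each physical variant of this entry
+ * (ditto, single, double, triple) that has actually been born.
+ */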
++ for (p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
+ if (ddp->ddp_phys_birth == 0)
+ continue;
+ ddt_bp_create(ddt->ddt_checksum, ddk, ddp, &blk);
+ sprintf_blkptr(blkbuf, &blk);
+ (void) printf("index %llx refcnt %llu %s %s\n",
+ (u_longlong_t)index, (u_longlong_t)ddp->ddp_refcnt,
+ types[p], blkbuf);
+ }
+ }
+
+ static void
+ dump_dedup_ratio(const ddt_stat_t *dds)
+ {
+ double rL, rP, rD, D, dedup, compress, copies;
+
+ if (dds->dds_blocks == 0)
+ return;
+
+ rL = (double)dds->dds_ref_lsize;
+ rP = (double)dds->dds_ref_psize;
+ rD = (double)dds->dds_ref_dsize;
+ D = (double)dds->dds_dsize;
+
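+ /*
+ * dedup = referenced dsize / actual dsize,
+ * compress = referenced lsize / referenced psize,
+ * copies = referenced dsize / referenced psize.
+ */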
+ dedup = rD / D;
+ compress = rL / rP;
+ copies = rD / rP;
+
+ (void) printf("dedup = %.2f, compress = %.2f, copies = %.2f, "
+ "dedup * compress / copies = %.2f\n\n",
+ dedup, compress, copies, dedup * compress / copies);
+ }
+
+ static void
+ dump_ddt(ddt_t *ddt, enum ddt_type type, enum ddt_class class)
+ {
+ char name[DDT_NAMELEN];
+ ddt_entry_t dde;
+ uint64_t walk = 0;
+ dmu_object_info_t doi;
+ uint64_t count, dspace, mspace;
+ int error;
+
+ error = ddt_object_info(ddt, type, class, &doi);
+
+ if (error == ENOENT)
+ return;
+ ASSERT(error == 0);
+
+ count = ddt_object_count(ddt, type, class);
+ dspace = doi.doi_physical_blocks_512 << 9;
+ mspace = doi.doi_fill_count * doi.doi_data_block_size;
+
+ ASSERT(count != 0); /* we should have destroyed it */
+
+ ddt_object_name(ddt, type, class, name);
+
+ (void) printf("%s: %llu entries, size %llu on disk, %llu in core\n",
+ name,
+ (u_longlong_t)count,
+ (u_longlong_t)(dspace / count),
+ (u_longlong_t)(mspace / count));
+
+ if (dump_opt['D'] < 3)
+ return;
+
+ zpool_dump_ddt(NULL, &ddt->ddt_histogram[type][class]);
+
+ if (dump_opt['D'] < 4)
+ return;
+
+ if (dump_opt['D'] < 5 && class == DDT_CLASS_UNIQUE)
+ return;
+
+ (void) printf("%s contents:\n\n", name);
+
+ while ((error = ddt_object_walk(ddt, type, class, &walk, &dde)) == 0)
+ dump_dde(ddt, &dde, walk);
+
+ ASSERT(error == ENOENT);
+
+ (void) printf("\n");
+ }
+
+ static void
+ dump_all_ddts(spa_t *spa)
+ {
+ ddt_histogram_t ddh_total = { 0 };
+ ddt_stat_t dds_total = { 0 };
+
- for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
++ enum zio_checksum c;
++ enum ddt_type type;
++ enum ddt_class class;
++
++ for (c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
+ ddt_t *ddt = spa->spa_ddt[c];
- for (enum ddt_type type = 0; type < DDT_TYPES; type++) {
++ for (type = 0; type < DDT_TYPES; type++) {
- for (enum ddt_class class = 0; class < DDT_CLASSES;
- class++) {
++ for (class = 0; class < DDT_CLASSES;
++ class++) {
+ dump_ddt(ddt, type, class);
+ }
+ }
+ }
+
+ ddt_get_dedup_stats(spa, &dds_total);
+
+ if (dds_total.dds_blocks == 0) {
+ (void) printf("All DDTs are empty\n");
+ return;
+ }
+
+ (void) printf("\n");
+
+ if (dump_opt['D'] > 1) {
+ (void) printf("DDT histogram (aggregated over all DDTs):\n");
+ ddt_get_dedup_histogram(spa, &ddh_total);
+ zpool_dump_ddt(&dds_total, &ddh_total);
+ }
+
+ dump_dedup_ratio(&dds_total);
+ }
+
static void
dump_dtl_seg(space_map_t *sm, uint64_t start, uint64_t size)
{
boolean_t required;
char *name[DTL_TYPES] = { "missing", "partial", "scrub", "outage" };
char prefix[256];
+ int c, t;
- spa_vdev_state_enter(spa);
+ spa_vdev_state_enter(spa, SCL_NONE);
required = vdev_dtl_required(vd);
(void) spa_vdev_state_exit(spa, NULL, 0);
dump_dtl(vd->vdev_child[c], indent + 4);
}
- for (int i = 0; i < num; i++) {
+ static void
+ dump_history(spa_t *spa)
+ {
+ nvlist_t **events = NULL;
+ char buf[SPA_MAXBLOCKSIZE];
+ uint64_t resid, len, off = 0;
+ uint_t num = 0;
+ int error;
+ time_t tsec;
+ struct tm t;
+ char tbuf[30];
+ char internalstr[MAXPATHLEN];
++ uint_t i;
+
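+ /*
+ * Read the history in buf-sized chunks; zpool_history_unpack()
+ * leaves any trailing partial record behind in resid, and we
+ * rewind the offset so the next read starts on a record boundary.
+ */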
+ do {
+ len = sizeof (buf);
+
+ if ((error = spa_history_get(spa, &off, &len, buf)) != 0) {
+ (void) fprintf(stderr, "Unable to read history: "
+ "error %d\n", error);
+ return;
+ }
+
+ if (zpool_history_unpack(buf, len, &resid, &events, &num) != 0)
+ break;
+
+ off -= resid;
+ } while (len != 0);
+
+ (void) printf("\nHistory:\n");
++ for (i = 0; i < num; i++) {
+ uint64_t time, txg, ievent;
+ char *cmd, *intstr;
+
+ if (nvlist_lookup_uint64(events[i], ZPOOL_HIST_TIME,
+ &time) != 0)
+ continue;
+ if (nvlist_lookup_string(events[i], ZPOOL_HIST_CMD,
+ &cmd) != 0) {
+ if (nvlist_lookup_uint64(events[i],
+ ZPOOL_HIST_INT_EVENT, &ievent) != 0)
+ continue;
+ verify(nvlist_lookup_uint64(events[i],
+ ZPOOL_HIST_TXG, &txg) == 0);
+ verify(nvlist_lookup_string(events[i],
+ ZPOOL_HIST_INT_STR, &intstr) == 0);
+ if (ievent >= LOG_END)
+ continue;
+
+ (void) snprintf(internalstr,
+ sizeof (internalstr),
+ "[internal %s txg:%lld] %s",
+ zfs_history_event_names[ievent], txg,
+ intstr);
+ cmd = internalstr;
+ }
+ tsec = time;
+ (void) localtime_r(&tsec, &t);
+ (void) strftime(tbuf, sizeof (tbuf), "%F.%T", &t);
+ (void) printf("%s %s\n", tbuf, cmd);
+ }
+ }
+
/*ARGSUSED*/
static void
dump_dnode(objset_t *os, uint64_t object, void *data, size_t size)
}
static void
- sprintf_blkptr_compact(char *blkbuf, blkptr_t *bp, int alldvas)
+ sprintf_blkptr_compact(char *blkbuf, const blkptr_t *bp)
{
- dva_t *dva = bp->blk_dva;
- int ndvas = alldvas ? BP_GET_NDVAS(bp) : 1;
+ const dva_t *dva = bp->blk_dva;
+ int ndvas = dump_opt['d'] > 5 ? BP_GET_NDVAS(bp) : 1;
+ int i;
+ if (dump_opt['b'] >= 5) {
+ sprintf_blkptr(blkbuf, bp);
+ return;
+ }
+
blkbuf[0] = '\0';
- for (int i = 0; i < ndvas; i++)
+ for (i = 0; i < ndvas; i++)
(void) sprintf(blkbuf + strlen(blkbuf), "%llu:%llx:%llx ",
(u_longlong_t)DVA_GET_VDEV(&dva[i]),
(u_longlong_t)DVA_GET_OFFSET(&dva[i]),
nvlist_free(config);
}
- for (int i = 0; i < VDEV_UBERBLOCK_COUNT(vdp); i++) {
+ #define ZDB_MAX_UB_HEADER_SIZE 32
+
+ static void
+ dump_label_uberblocks(vdev_label_t *lbl, uint64_t ashift)
+ {
+ vdev_t vd;
+ vdev_t *vdp = &vd;
+ char header[ZDB_MAX_UB_HEADER_SIZE];
++ int i;
+
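+ /*
+ * Fake up just enough of a top-level vdev for the VDEV_UBERBLOCK_*
+ * macros to compute the uberblock ring geometry.
+ */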
+ vd.vdev_ashift = ashift;
+ vdp->vdev_top = vdp;
+
++ for (i = 0; i < VDEV_UBERBLOCK_COUNT(vdp); i++) {
+ uint64_t uoff = VDEV_UBERBLOCK_OFFSET(vdp, i);
+ uberblock_t *ub = (void *)((char *)lbl + uoff);
+
+ if (uberblock_verify(ub))
+ continue;
+ (void) snprintf(header, ZDB_MAX_UB_HEADER_SIZE,
+ "Uberblock[%d]\n", i);
+ dump_uberblock(ub, header, "");
+ }
+ }
+
static void
dump_label(const char *dev)
{
int fd;
vdev_label_t label;
- char *buf = label.vl_vdev_phys.vp_nvlist;
+ char *path, *buf = label.vl_vdev_phys.vp_nvlist;
size_t buflen = sizeof (label.vl_vdev_phys.vp_nvlist);
struct stat64 statbuf;
- uint64_t psize;
+ uint64_t psize, ashift;
+ int len = strlen(dev) + 1;
+ int l;
- if ((fd = open64(dev, O_RDONLY)) < 0) {
- (void) printf("cannot open '%s': %s\n", dev, strerror(errno));
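+ /* Map a /dev/dsk/ block-device path onto its /dev/rdsk/ raw twin. */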
+ if (strncmp(dev, "/dev/dsk/", 9) == 0) {
+ len++;
+ path = malloc(len);
+ (void) snprintf(path, len, "%s%s", "/dev/rdsk/", dev + 9);
+ } else {
+ path = strdup(dev);
+ }
+
+ if ((fd = open64(path, O_RDONLY)) < 0) {
+ (void) printf("cannot open '%s': %s\n", path, strerror(errno));
+ free(path);
exit(1);
}
psize = statbuf.st_size;
psize = P2ALIGN(psize, (uint64_t)sizeof (vdev_label_t));
- for (int l = 0; l < VDEV_LABELS; l++) {
+ for (l = 0; l < VDEV_LABELS; l++) {
-
nvlist_t *config = NULL;
(void) printf("--------------------------------------------\n");
} zdb_cb_t;
static void
- zdb_count_block(spa_t *spa, zdb_cb_t *zcb, blkptr_t *bp, dmu_object_type_t type)
+ zdb_count_block(zdb_cb_t *zcb, zilog_t *zilog, const blkptr_t *bp,
+ dmu_object_type_t type)
{
+ uint64_t refcnt = 0;
+ int i;
- for (int i = 0; i < 4; i++) {
+ ASSERT(type < ZDB_OT_TOTAL);
+
+ if (zilog && zil_bp_tree_add(zilog, bp) != 0)
+ return;
+
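+ /*
+ * Credit this block at each of the four rollups:
+ * (level, type), (level, total), (total, type), (total, total).
+ */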
+ for (i = 0; i < 4; i++) {
int l = (i < 2) ? BP_GET_LEVEL(bp) : ZB_TOTAL;
- int t = (i & 1) ? type : DMU_OT_TOTAL;
+ int t = (i & 1) ? type : ZDB_OT_TOTAL;
zdb_blkstats_t *zb = &zcb->zcb_type[l][t];
zb->zb_asize += BP_GET_ASIZE(bp);
return (0);
}
- for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
+ static void
+ zdb_leak(space_map_t *sm, uint64_t start, uint64_t size)
+ {
+ vdev_t *vd = sm->sm_ppd;
+
+ (void) printf("leaked space: vdev %llu, offset 0x%llx, size %llu\n",
+ (u_longlong_t)vd->vdev_id, (u_longlong_t)start, (u_longlong_t)size);
+ }
+
+ /* ARGSUSED */
+ static void
+ zdb_space_map_load(space_map_t *sm)
+ {
+ }
+
+ static void
+ zdb_space_map_unload(space_map_t *sm)
+ {
+ space_map_vacate(sm, zdb_leak, sm);
+ }
+
+ /* ARGSUSED */
+ static void
+ zdb_space_map_claim(space_map_t *sm, uint64_t start, uint64_t size)
+ {
+ }
+
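+ /*
+ * Leak checking: each metaslab's space map is loaded as SM_ALLOC
+ * and blocks found by the traversal are claimed out of it, so
+ * anything still mapped when zdb_space_map_unload() vacates the
+ * map is reported by zdb_leak() as space that nothing referenced.
+ */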
+ static space_map_ops_t zdb_space_map_ops = {
+ zdb_space_map_load,
+ zdb_space_map_unload,
+ NULL, /* alloc */
+ zdb_space_map_claim,
+ NULL, /* free */
+ NULL /* maxsize */
+ };
+
+ static void
+ zdb_ddt_leak_init(spa_t *spa, zdb_cb_t *zcb)
+ {
+ ddt_bookmark_t ddb = { 0 };
+ ddt_entry_t dde;
+ int error;
++ int p;
+
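+ /*
+ * Walk every DDT entry with more than one reference: tally the
+ * space dedup saved and, unless -L is in effect, pre-load the
+ * entry into the in-core DDT for the block traversal to check
+ * against.
+ */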
+ while ((error = ddt_walk(spa, &ddb, &dde)) == 0) {
+ blkptr_t blk;
+ ddt_phys_t *ddp = dde.dde_phys;
+
+ if (ddb.ddb_class == DDT_CLASS_UNIQUE)
+ return;
+
+ ASSERT(ddt_phys_total_refcnt(&dde) > 1);
+
- for (int c = 0; c < rvd->vdev_children; c++) {
++ for (p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
+ if (ddp->ddp_phys_birth == 0)
+ continue;
+ ddt_bp_create(ddb.ddb_checksum,
+ &dde.dde_key, ddp, &blk);
+ if (p == DDT_PHYS_DITTO) {
+ zdb_count_block(zcb, NULL, &blk, ZDB_OT_DITTO);
+ } else {
+ zcb->zcb_dedup_asize +=
+ BP_GET_ASIZE(&blk) * (ddp->ddp_refcnt - 1);
+ zcb->zcb_dedup_blocks++;
+ }
+ }
+ if (!dump_opt['L']) {
+ ddt_t *ddt = spa->spa_ddt[ddb.ddb_checksum];
+ ddt_enter(ddt);
+ VERIFY(ddt_lookup(ddt, &blk, B_TRUE) != NULL);
+ ddt_exit(ddt);
+ }
+ }
+
+ ASSERT(error == ENOENT);
+ }
+
+ static void
+ zdb_leak_init(spa_t *spa, zdb_cb_t *zcb)
+ {
++ int c, m;
++
+ zcb->zcb_spa = spa;
+
+ if (!dump_opt['L']) {
+ vdev_t *rvd = spa->spa_root_vdev;
- for (int m = 0; m < vd->vdev_ms_count; m++) {
++ for (c = 0; c < rvd->vdev_children; c++) {
+ vdev_t *vd = rvd->vdev_child[c];
- for (int c = 0; c < rvd->vdev_children; c++) {
++ for (m = 0; m < vd->vdev_ms_count; m++) {
+ metaslab_t *msp = vd->vdev_ms[m];
+ mutex_enter(&msp->ms_lock);
+ space_map_unload(&msp->ms_map);
+ VERIFY(space_map_load(&msp->ms_map,
+ &zdb_space_map_ops, SM_ALLOC, &msp->ms_smo,
+ spa->spa_meta_objset) == 0);
+ msp->ms_map.sm_ppd = vd;
+ mutex_exit(&msp->ms_lock);
+ }
+ }
+ }
+
+ spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
+
+ zdb_ddt_leak_init(spa, zcb);
+
+ spa_config_exit(spa, SCL_CONFIG, FTAG);
+ }
+
+ static void
+ zdb_leak_fini(spa_t *spa)
+ {
++ int c, m;
++
+ if (!dump_opt['L']) {
+ vdev_t *rvd = spa->spa_root_vdev;
- for (int m = 0; m < vd->vdev_ms_count; m++) {
++ for (c = 0; c < rvd->vdev_children; c++) {
+ vdev_t *vd = rvd->vdev_child[c];
++ for (m = 0; m < vd->vdev_ms_count; m++) {
+ metaslab_t *msp = vd->vdev_ms[m];
+ mutex_enter(&msp->ms_lock);
+ space_map_unload(&msp->ms_map);
+ mutex_exit(&msp->ms_lock);
+ }
+ }
+ }
+ }
+
+ /* ARGSUSED */
+ static int
+ count_block_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
+ {
+ zdb_cb_t *zcb = arg;
+
+ if (dump_opt['b'] >= 4) {
+ char blkbuf[BP_SPRINTF_LEN];
+ sprintf_blkptr(blkbuf, bp);
+ (void) printf("[%s] %s\n",
+ "deferred free", blkbuf);
+ }
+ zdb_count_block(zcb, NULL, bp, ZDB_OT_DEFERRED);
+ return (0);
+ }
+
static int
dump_block_stats(spa_t *spa)
{
zdb_cb_t zcb = { 0 };
zdb_blkstats_t *zb, *tzb;
- uint64_t alloc, space, logalloc;
- vdev_t *rvd = spa->spa_root_vdev;
+ uint64_t norm_alloc, norm_space, total_alloc, total_found;
+ int flags = TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA | TRAVERSE_HARD;
int leaks = 0;
- int c, e;
++ int e;
- if (!dump_opt['S']) {
- (void) printf("\nTraversing all blocks %s%s%s%s%s...\n",
- (dump_opt['c'] || !dump_opt['L']) ? "to verify " : "",
- (dump_opt['c'] == 1) ? "metadata " : "",
- dump_opt['c'] ? "checksums " : "",
- (dump_opt['c'] && !dump_opt['L']) ? "and verify " : "",
- !dump_opt['L'] ? "nothing leaked " : "");
- }
+ (void) printf("\nTraversing all blocks %s%s%s%s%s...\n",
+ (dump_opt['c'] || !dump_opt['L']) ? "to verify " : "",
+ (dump_opt['c'] == 1) ? "metadata " : "",
+ dump_opt['c'] ? "checksums " : "",
+ (dump_opt['c'] && !dump_opt['L']) ? "and verify " : "",
+ !dump_opt['L'] ? "nothing leaked " : "");
/*
* Load all space maps as SM_ALLOC maps, then traverse the pool
/*
* If there's a deferred-free bplist, process that first.
*/
- if (spa->spa_sync_bplist_obj != 0) {
- bplist_t *bpl = &spa->spa_sync_bplist;
- blkptr_t blk;
- uint64_t itor = 0;
+ (void) bpobj_iterate_nofree(&spa->spa_deferred_bpobj,
+ count_block_cb, &zcb, NULL);
+ (void) bpobj_iterate_nofree(&spa->spa_dsl_pool->dp_free_bpobj,
+ count_block_cb, &zcb, NULL);
- VERIFY(0 == bplist_open(bpl, spa->spa_meta_objset,
- spa->spa_sync_bplist_obj));
+ if (dump_opt['c'] > 1)
+ flags |= TRAVERSE_PREFETCH_DATA;
- while (bplist_iterate(bpl, &itor, &blk) == 0) {
- if (dump_opt['b'] >= 4) {
- char blkbuf[BP_SPRINTF_LEN];
- sprintf_blkptr(blkbuf, BP_SPRINTF_LEN, &blk);
- (void) printf("[%s] %s\n",
- "deferred free", blkbuf);
- }
- zdb_count_block(spa, &zcb, &blk, DMU_OT_DEFERRED);
- }
-
- bplist_close(bpl);
- }
-
- zcb.zcb_haderrors |= traverse_pool(spa, zdb_blkptr_cb, &zcb);
+ zcb.zcb_haderrors |= traverse_pool(spa, 0, flags, zdb_blkptr_cb, &zcb);
- if (zcb.zcb_haderrors && !dump_opt['S']) {
+ if (zcb.zcb_haderrors) {
(void) printf("\nError counts:\n\n");
(void) printf("\t%5s %s\n", "errno", "count");
- for (int e = 0; e < 256; e++) {
+ for (e = 0; e < 256; e++) {
if (zcb.zcb_errors[e] != 0) {
(void) printf("\t%5d %llu\n",
e, (u_longlong_t)zcb.zcb_errors[e]);
return (error);
}
- zil_replay_func_t *ztest_replay_vector[TX_MAX_TYPE] = {
- NULL, /* 0 no such transaction type */
- ztest_replay_create, /* TX_CREATE */
- NULL, /* TX_MKDIR */
- NULL, /* TX_MKXATTR */
- NULL, /* TX_SYMLINK */
- ztest_replay_remove, /* TX_REMOVE */
- NULL, /* TX_RMDIR */
- NULL, /* TX_LINK */
- NULL, /* TX_RENAME */
- NULL, /* TX_WRITE */
- NULL, /* TX_TRUNCATE */
- NULL, /* TX_SETATTR */
- NULL, /* TX_ACL */
- };
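+ /*
+ * rll_t is a minimal reader/writer lock built on a mutex and a
+ * condition variable; ztest layers its per-object and per-range
+ * locks on top of it.
+ */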
+ static void
+ ztest_rll_init(rll_t *rll)
+ {
+ rll->rll_writer = NULL;
+ rll->rll_readers = 0;
+ VERIFY(_mutex_init(&rll->rll_lock, USYNC_THREAD, NULL) == 0);
+ VERIFY(cond_init(&rll->rll_cv, USYNC_THREAD, NULL) == 0);
+ }
- /*
- * Verify that we can't destroy an active pool, create an existing pool,
- * or create a pool with a bad vdev spec.
- */
- void
- ztest_spa_create_destroy(ztest_args_t *za)
+ static void
+ ztest_rll_destroy(rll_t *rll)
{
- int error;
- spa_t *spa;
- nvlist_t *nvroot;
+ ASSERT(rll->rll_writer == NULL);
+ ASSERT(rll->rll_readers == 0);
+ VERIFY(_mutex_destroy(&rll->rll_lock) == 0);
+ VERIFY(cond_destroy(&rll->rll_cv) == 0);
+ }
- /*
- * Attempt to create using a bad file.
- */
- nvroot = make_vdev_root("/dev/bogus", NULL, 0, 0, 0, 0, 0, 1);
- error = spa_create("ztest_bad_file", nvroot, NULL, NULL, NULL);
- nvlist_free(nvroot);
- if (error != ENOENT)
- fatal(0, "spa_create(bad_file) = %d", error);
+ static void
+ ztest_rll_lock(rll_t *rll, rl_type_t type)
+ {
+ VERIFY(mutex_lock(&rll->rll_lock) == 0);
- /*
- * Attempt to create using a bad mirror.
- */
- nvroot = make_vdev_root("/dev/bogus", NULL, 0, 0, 0, 0, 2, 1);
- error = spa_create("ztest_bad_mirror", nvroot, NULL, NULL, NULL);
- nvlist_free(nvroot);
- if (error != ENOENT)
- fatal(0, "spa_create(bad_mirror) = %d", error);
+ if (type == RL_READER) {
+ while (rll->rll_writer != NULL)
+ (void) cond_wait(&rll->rll_cv, &rll->rll_lock);
+ rll->rll_readers++;
+ } else {
+ while (rll->rll_writer != NULL || rll->rll_readers)
+ (void) cond_wait(&rll->rll_cv, &rll->rll_lock);
+ rll->rll_writer = curthread;
+ }
- /*
- * Attempt to create an existing pool. It shouldn't matter
- * what's in the nvroot; we should fail with EEXIST.
- */
- (void) rw_rdlock(&ztest_shared->zs_name_lock);
- nvroot = make_vdev_root("/dev/bogus", NULL, 0, 0, 0, 0, 0, 1);
- error = spa_create(za->za_pool, nvroot, NULL, NULL, NULL);
- nvlist_free(nvroot);
- if (error != EEXIST)
- fatal(0, "spa_create(whatever) = %d", error);
+ VERIFY(mutex_unlock(&rll->rll_lock) == 0);
+ }
- error = spa_open(za->za_pool, &spa, FTAG);
- if (error)
- fatal(0, "spa_open() = %d", error);
+ static void
+ ztest_rll_unlock(rll_t *rll)
+ {
+ VERIFY(mutex_lock(&rll->rll_lock) == 0);
- error = spa_destroy(za->za_pool);
- if (error != EBUSY)
- fatal(0, "spa_destroy() = %d", error);
+ if (rll->rll_writer) {
+ ASSERT(rll->rll_readers == 0);
+ rll->rll_writer = NULL;
+ } else {
+ ASSERT(rll->rll_readers != 0);
+ ASSERT(rll->rll_writer == NULL);
+ rll->rll_readers--;
+ }
- spa_close(spa, FTAG);
- (void) rw_unlock(&ztest_shared->zs_name_lock);
+ if (rll->rll_writer == NULL && rll->rll_readers == 0)
+ VERIFY(cond_broadcast(&rll->rll_cv) == 0);
+
+ VERIFY(mutex_unlock(&rll->rll_lock) == 0);
}
- static vdev_t *
- vdev_lookup_by_path(vdev_t *vd, const char *path)
+ static void
+ ztest_object_lock(ztest_ds_t *zd, uint64_t object, rl_type_t type)
{
- vdev_t *mvd;
- int c;
+ rll_t *rll = &zd->zd_object_lock[object & (ZTEST_OBJECT_LOCKS - 1)];
- if (vd->vdev_path != NULL && strcmp(path, vd->vdev_path) == 0)
- return (vd);
+ ztest_rll_lock(rll, type);
+ }
- for (c = 0; c < vd->vdev_children; c++)
- if ((mvd = vdev_lookup_by_path(vd->vdev_child[c], path)) !=
- NULL)
- return (mvd);
+ static void
+ ztest_object_unlock(ztest_ds_t *zd, uint64_t object)
+ {
+ rll_t *rll = &zd->zd_object_lock[object & (ZTEST_OBJECT_LOCKS - 1)];
- return (NULL);
+ ztest_rll_unlock(rll);
}
- /*
- * Verify that vdev_add() works as expected.
- */
- void
- ztest_vdev_add_remove(ztest_args_t *za)
+ static rl_t *
+ ztest_range_lock(ztest_ds_t *zd, uint64_t object, uint64_t offset,
+ uint64_t size, rl_type_t type)
{
- spa_t *spa = za->za_spa;
- uint64_t leaves = MAX(zopt_mirrors, 1) * zopt_raidz;
- nvlist_t *nvroot;
- int error;
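+ /* Hash (object, offset) onto one of the shared range-lock buckets. */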
+ uint64_t hash = object ^ (offset % (ZTEST_RANGE_LOCKS + 1));
+ rll_t *rll = &zd->zd_range_lock[hash & (ZTEST_RANGE_LOCKS - 1)];
+ rl_t *rl;
- (void) mutex_lock(&ztest_shared->zs_vdev_lock);
+ rl = umem_alloc(sizeof (*rl), UMEM_NOFAIL);
+ rl->rl_object = object;
+ rl->rl_offset = offset;
+ rl->rl_size = size;
+ rl->rl_lock = rll;
- spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
+ ztest_rll_lock(rll, type);
- ztest_shared->zs_vdev_primaries =
- spa->spa_root_vdev->vdev_children * leaves;
+ return (rl);
+ }
- spa_config_exit(spa, SCL_VDEV, FTAG);
+ static void
+ ztest_range_unlock(rl_t *rl)
+ {
+ rll_t *rll = rl->rl_lock;
- /*
- * Make 1/4 of the devices be log devices.
- */
- nvroot = make_vdev_root(NULL, NULL, zopt_vdev_size, 0,
- ztest_random(4) == 0, zopt_raidz, zopt_mirrors, 1);
+ ztest_rll_unlock(rll);
- error = spa_vdev_add(spa, nvroot);
- nvlist_free(nvroot);
+ umem_free(rl, sizeof (*rl));
+ }
+
+ static void
+ ztest_zd_init(ztest_ds_t *zd, objset_t *os)
+ {
++ int l;
++
+ zd->zd_os = os;
+ zd->zd_zilog = dmu_objset_zil(os);
+ zd->zd_seq = 0;
+ dmu_objset_name(os, zd->zd_name);
- (void) mutex_unlock(&ztest_shared->zs_vdev_lock);
+ VERIFY(_mutex_init(&zd->zd_dirobj_lock, USYNC_THREAD, NULL) == 0);
- if (error == ENOSPC)
- ztest_record_enospc("spa_vdev_add");
- else if (error != 0)
- fatal(0, "spa_vdev_add() = %d", error);
- for (int l = 0; l < ZTEST_OBJECT_LOCKS; l++)
++ for (l = 0; l < ZTEST_OBJECT_LOCKS; l++)
+ ztest_rll_init(&zd->zd_object_lock[l]);
+
- for (int l = 0; l < ZTEST_RANGE_LOCKS; l++)
++ for (l = 0; l < ZTEST_RANGE_LOCKS; l++)
+ ztest_rll_init(&zd->zd_range_lock[l]);
}
- /*
- * Verify that adding/removing aux devices (l2arc, hot spare) works as expected.
- */
- void
- ztest_vdev_aux_add_remove(ztest_args_t *za)
+ static void
+ ztest_zd_fini(ztest_ds_t *zd)
{
- spa_t *spa = za->za_spa;
- vdev_t *rvd = spa->spa_root_vdev;
- spa_aux_vdev_t *sav;
- char *aux;
- uint64_t guid = 0;
- int error;
++ int l;
+
- if (ztest_random(2) == 0) {
- sav = &spa->spa_spares;
- aux = ZPOOL_CONFIG_SPARES;
- } else {
- sav = &spa->spa_l2cache;
- aux = ZPOOL_CONFIG_L2CACHE;
- }
+ VERIFY(_mutex_destroy(&zd->zd_dirobj_lock) == 0);
- (void) mutex_lock(&ztest_shared->zs_vdev_lock);
- for (int l = 0; l < ZTEST_OBJECT_LOCKS; l++)
++ for (l = 0; l < ZTEST_OBJECT_LOCKS; l++)
+ ztest_rll_destroy(&zd->zd_object_lock[l]);
- spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
- for (int l = 0; l < ZTEST_RANGE_LOCKS; l++)
++ for (l = 0; l < ZTEST_RANGE_LOCKS; l++)
+ ztest_rll_destroy(&zd->zd_range_lock[l]);
+ }
+
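+ /* Randomly exercise both the blocking and non-blocking assignment paths. */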
+ #define TXG_MIGHTWAIT (ztest_random(10) == 0 ? TXG_NOWAIT : TXG_WAIT)
+
+ static uint64_t
+ ztest_tx_assign(dmu_tx_t *tx, uint64_t txg_how, const char *tag)
+ {
+ uint64_t txg;
+ int error;
+
+ /*
+ * Attempt to assign tx to some transaction group.
+ */
+ error = dmu_tx_assign(tx, txg_how);
+ if (error) {
+ if (error == ERESTART) {
+ ASSERT(txg_how == TXG_NOWAIT);
+ dmu_tx_wait(tx);
+ } else {
+ ASSERT3U(error, ==, ENOSPC);
+ ztest_record_enospc(tag);
+ }
+ dmu_tx_abort(tx);
+ return (0);
+ }
+ txg = dmu_tx_get_txg(tx);
+ ASSERT(txg != 0);
+ return (txg);
+ }
+
+ static void
+ ztest_pattern_set(void *buf, uint64_t size, uint64_t value)
+ {
+ uint64_t *ip = buf;
+ uint64_t *ip_end = (uint64_t *)((uintptr_t)buf + (uintptr_t)size);
+
+ while (ip < ip_end)
+ *ip++ = value;
+ }
+
+ static boolean_t
+ ztest_pattern_match(void *buf, uint64_t size, uint64_t value)
+ {
+ uint64_t *ip = buf;
+ uint64_t *ip_end = (uint64_t *)((uintptr_t)buf + (uintptr_t)size);
+ uint64_t diff = 0;
+
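+ /* Branch-free compare: diff stays zero iff every word equals value. */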
+ while (ip < ip_end)
+ diff |= (value - *ip++);
+
+ return (diff == 0);
+ }
+
+ static void
+ ztest_bt_generate(ztest_block_tag_t *bt, objset_t *os, uint64_t object,
+ uint64_t offset, uint64_t gen, uint64_t txg, uint64_t crtxg)
+ {
+ bt->bt_magic = BT_MAGIC;
+ bt->bt_objset = dmu_objset_id(os);
+ bt->bt_object = object;
+ bt->bt_offset = offset;
+ bt->bt_gen = gen;
+ bt->bt_txg = txg;
+ bt->bt_crtxg = crtxg;
+ }
+
+ static void
+ ztest_bt_verify(ztest_block_tag_t *bt, objset_t *os, uint64_t object,
+ uint64_t offset, uint64_t gen, uint64_t txg, uint64_t crtxg)
+ {
+ ASSERT(bt->bt_magic == BT_MAGIC);
+ ASSERT(bt->bt_objset == dmu_objset_id(os));
+ ASSERT(bt->bt_object == object);
+ ASSERT(bt->bt_offset == offset);
+ ASSERT(bt->bt_gen <= gen);
+ ASSERT(bt->bt_txg <= txg);
+ ASSERT(bt->bt_crtxg == crtxg);
+ }
+
+ static ztest_block_tag_t *
+ ztest_bt_bonus(dmu_buf_t *db)
+ {
+ dmu_object_info_t doi;
+ ztest_block_tag_t *bt;
+
+ dmu_object_info_from_db(db, &doi);
+ ASSERT3U(doi.doi_bonus_size, <=, db->db_size);
+ ASSERT3U(doi.doi_bonus_size, >=, sizeof (*bt));
+ bt = (void *)((char *)db->db_data + doi.doi_bonus_size - sizeof (*bt));
+
+ return (bt);
+ }
+
+ /*
+ * ZIL logging ops
+ */
+
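+ /*
+ * ztest reuses otherwise-idle lr_create_t fields to carry its own
+ * object attributes (type, blocksize, etc.) through the ZIL.
+ */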
+ #define lrz_type lr_mode
+ #define lrz_blocksize lr_uid
+ #define lrz_ibshift lr_gid
+ #define lrz_bonustype lr_rdev
+ #define lrz_bonuslen lr_crtime[1]
+
+ static uint64_t
+ ztest_log_create(ztest_ds_t *zd, dmu_tx_t *tx, lr_create_t *lr)
+ {
+ char *name = (void *)(lr + 1); /* name follows lr */
+ size_t namesize = strlen(name) + 1;
+ itx_t *itx;
+
+ if (zil_replaying(zd->zd_zilog, tx))
+ return (0);
+
+ itx = zil_itx_create(TX_CREATE, sizeof (*lr) + namesize);
+ bcopy(&lr->lr_common + 1, &itx->itx_lr + 1,
+ sizeof (*lr) + namesize - sizeof (lr_t));
+
+ return (zil_itx_assign(zd->zd_zilog, itx, tx));
+ }
+
+ static uint64_t
+ ztest_log_remove(ztest_ds_t *zd, dmu_tx_t *tx, lr_remove_t *lr)
+ {
+ char *name = (void *)(lr + 1); /* name follows lr */
+ size_t namesize = strlen(name) + 1;
+ itx_t *itx;
+
+ if (zil_replaying(zd->zd_zilog, tx))
+ return (0);
+
+ itx = zil_itx_create(TX_REMOVE, sizeof (*lr) + namesize);
+ bcopy(&lr->lr_common + 1, &itx->itx_lr + 1,
+ sizeof (*lr) + namesize - sizeof (lr_t));
+
+ return (zil_itx_assign(zd->zd_zilog, itx, tx));
+ }
+
+ static uint64_t
+ ztest_log_write(ztest_ds_t *zd, dmu_tx_t *tx, lr_write_t *lr)
+ {
+ itx_t *itx;
+ itx_wr_state_t write_state = ztest_random(WR_NUM_STATES);
+
+ if (zil_replaying(zd->zd_zilog, tx))
+ return (0);
+
+ if (lr->lr_length > ZIL_MAX_LOG_DATA)
+ write_state = WR_INDIRECT;
+
+ itx = zil_itx_create(TX_WRITE,
+ sizeof (*lr) + (write_state == WR_COPIED ? lr->lr_length : 0));
+
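+ /*
+ * If the open-context data for a WR_COPIED record can't be read
+ * back, downgrade to WR_NEED_COPY and log just the header.
+ */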
+ if (write_state == WR_COPIED &&
+ dmu_read(zd->zd_os, lr->lr_foid, lr->lr_offset, lr->lr_length,
+ ((lr_write_t *)&itx->itx_lr) + 1, DMU_READ_NO_PREFETCH) != 0) {
+ zil_itx_destroy(itx);
+ itx = zil_itx_create(TX_WRITE, sizeof (*lr));
+ write_state = WR_NEED_COPY;
+ }
+ itx->itx_private = zd;
+ itx->itx_wr_state = write_state;
+ itx->itx_sync = (ztest_random(8) == 0);
+ itx->itx_sod += (write_state == WR_NEED_COPY ? lr->lr_length : 0);
+
+ bcopy(&lr->lr_common + 1, &itx->itx_lr + 1,
+ sizeof (*lr) - sizeof (lr_t));
+
+ return (zil_itx_assign(zd->zd_zilog, itx, tx));
+ }
+
+ static uint64_t
+ ztest_log_truncate(ztest_ds_t *zd, dmu_tx_t *tx, lr_truncate_t *lr)
+ {
+ itx_t *itx;
+
+ if (zil_replaying(zd->zd_zilog, tx))
+ return (0);
+
+ itx = zil_itx_create(TX_TRUNCATE, sizeof (*lr));
+ bcopy(&lr->lr_common + 1, &itx->itx_lr + 1,
+ sizeof (*lr) - sizeof (lr_t));
+
+ return (zil_itx_assign(zd->zd_zilog, itx, tx));
+ }
+
+ static uint64_t
+ ztest_log_setattr(ztest_ds_t *zd, dmu_tx_t *tx, lr_setattr_t *lr)
+ {
+ itx_t *itx;
+
+ if (zil_replaying(zd->zd_zilog, tx))
+ return (0);
+
+ itx = zil_itx_create(TX_SETATTR, sizeof (*lr));
+ bcopy(&lr->lr_common + 1, &itx->itx_lr + 1,
+ sizeof (*lr) - sizeof (lr_t));
+
+ return (zil_itx_assign(zd->zd_zilog, itx, tx));
+ }
+
+ /*
+ * ZIL replay ops
+ */
+ static int
+ ztest_replay_create(ztest_ds_t *zd, lr_create_t *lr, boolean_t byteswap)
+ {
+ char *name = (void *)(lr + 1); /* name follows lr */
+ objset_t *os = zd->zd_os;
+ ztest_block_tag_t *bbt;
+ dmu_buf_t *db;
+ dmu_tx_t *tx;
+ uint64_t txg;
+ int error = 0;
+
+ if (byteswap)
+ byteswap_uint64_array(lr, sizeof (*lr));
+
+ ASSERT(lr->lr_doid == ZTEST_DIROBJ);
+ ASSERT(name[0] != '\0');
+
+ tx = dmu_tx_create(os);
+
+ dmu_tx_hold_zap(tx, lr->lr_doid, B_TRUE, name);
+
+ if (lr->lrz_type == DMU_OT_ZAP_OTHER) {
+ dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL);
+ } else {
+ dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
+ }
+
+ txg = ztest_tx_assign(tx, TXG_WAIT, FTAG);
+ if (txg == 0)
+ return (ENOSPC);
+
+ ASSERT(dmu_objset_zil(os)->zl_replay == !!lr->lr_foid);
+
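+ /*
+ * lr_foid == 0 means allocate a new object; nonzero means we are
+ * replaying the log and must claim the original object number.
+ */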
+ if (lr->lrz_type == DMU_OT_ZAP_OTHER) {
+ if (lr->lr_foid == 0) {
+ lr->lr_foid = zap_create(os,
+ lr->lrz_type, lr->lrz_bonustype,
+ lr->lrz_bonuslen, tx);
+ } else {
+ error = zap_create_claim(os, lr->lr_foid,
+ lr->lrz_type, lr->lrz_bonustype,
+ lr->lrz_bonuslen, tx);
+ }
+ } else {
+ if (lr->lr_foid == 0) {
+ lr->lr_foid = dmu_object_alloc(os,
+ lr->lrz_type, 0, lr->lrz_bonustype,
+ lr->lrz_bonuslen, tx);
+ } else {
+ error = dmu_object_claim(os, lr->lr_foid,
+ lr->lrz_type, 0, lr->lrz_bonustype,
+ lr->lrz_bonuslen, tx);
+ }
+ }
+
+ if (error) {
+ ASSERT3U(error, ==, EEXIST);
+ ASSERT(zd->zd_zilog->zl_replay);
+ dmu_tx_commit(tx);
+ return (error);
+ }
+
+ ASSERT(lr->lr_foid != 0);
+
+ if (lr->lrz_type != DMU_OT_ZAP_OTHER)
+ VERIFY3U(0, ==, dmu_object_set_blocksize(os, lr->lr_foid,
+ lr->lrz_blocksize, lr->lrz_ibshift, tx));
+
+ VERIFY3U(0, ==, dmu_bonus_hold(os, lr->lr_foid, FTAG, &db));
+ bbt = ztest_bt_bonus(db);
+ dmu_buf_will_dirty(db, tx);
+ ztest_bt_generate(bbt, os, lr->lr_foid, -1ULL, lr->lr_gen, txg, txg);
+ dmu_buf_rele(db, FTAG);
+
+ VERIFY3U(0, ==, zap_add(os, lr->lr_doid, name, sizeof (uint64_t), 1,
+ &lr->lr_foid, tx));
+
+ (void) ztest_log_create(zd, tx, lr);
+
+ dmu_tx_commit(tx);
+
+ return (0);
+ }
+
+ static int
+ ztest_replay_remove(ztest_ds_t *zd, lr_remove_t *lr, boolean_t byteswap)
+ {
+ char *name = (void *)(lr + 1); /* name follows lr */
+ objset_t *os = zd->zd_os;
+ dmu_object_info_t doi;
+ dmu_tx_t *tx;
+ uint64_t object, txg;
+
+ if (byteswap)
+ byteswap_uint64_array(lr, sizeof (*lr));
+
+ ASSERT(lr->lr_doid == ZTEST_DIROBJ);
+ ASSERT(name[0] != '\0');
+
+ VERIFY3U(0, ==,
+ zap_lookup(os, lr->lr_doid, name, sizeof (object), 1, &object));
+ ASSERT(object != 0);
+
+ ztest_object_lock(zd, object, RL_WRITER);
+
+ VERIFY3U(0, ==, dmu_object_info(os, object, &doi));
+
+ tx = dmu_tx_create(os);
+
+ dmu_tx_hold_zap(tx, lr->lr_doid, B_FALSE, name);
+ dmu_tx_hold_free(tx, object, 0, DMU_OBJECT_END);
+
+ txg = ztest_tx_assign(tx, TXG_WAIT, FTAG);
+ if (txg == 0) {
+ ztest_object_unlock(zd, object);
+ return (ENOSPC);
+ }
+
+ if (doi.doi_type == DMU_OT_ZAP_OTHER) {
+ VERIFY3U(0, ==, zap_destroy(os, object, tx));
+ } else {
+ VERIFY3U(0, ==, dmu_object_free(os, object, tx));
+ }
+
+ VERIFY3U(0, ==, zap_remove(os, lr->lr_doid, name, tx));
+
+ (void) ztest_log_remove(zd, tx, lr);
+
+ dmu_tx_commit(tx);
+
+ ztest_object_unlock(zd, object);
+
+ return (0);
+ }
+
+ static int
+ ztest_replay_write(ztest_ds_t *zd, lr_write_t *lr, boolean_t byteswap)
+ {
+ objset_t *os = zd->zd_os;
+ void *data = lr + 1; /* data follows lr */
+ uint64_t offset, length;
+ ztest_block_tag_t *bt = data;
+ ztest_block_tag_t *bbt;
+ uint64_t gen, txg, lrtxg, crtxg;
+ dmu_object_info_t doi;
+ dmu_tx_t *tx;
+ dmu_buf_t *db;
+ arc_buf_t *abuf = NULL;
+ rl_t *rl;
+
+ if (byteswap)
+ byteswap_uint64_array(lr, sizeof (*lr));
+
+ offset = lr->lr_offset;
+ length = lr->lr_length;
+
+ /* If it's a dmu_sync() block, write the whole block */
+ if (lr->lr_common.lrc_reclen == sizeof (lr_write_t)) {
+ uint64_t blocksize = BP_GET_LSIZE(&lr->lr_blkptr);
+ if (length < blocksize) {
+ offset -= offset % blocksize;
+ length = blocksize;
+ }
+ }
+
+ if (bt->bt_magic == BSWAP_64(BT_MAGIC))
+ byteswap_uint64_array(bt, sizeof (*bt));
+
+ if (bt->bt_magic != BT_MAGIC)
+ bt = NULL;
+
+ ztest_object_lock(zd, lr->lr_foid, RL_READER);
+ rl = ztest_range_lock(zd, lr->lr_foid, offset, length, RL_WRITER);
+
+ VERIFY3U(0, ==, dmu_bonus_hold(os, lr->lr_foid, FTAG, &db));
+
+ dmu_object_info_from_db(db, &doi);
+
+ bbt = ztest_bt_bonus(db);
+ ASSERT3U(bbt->bt_magic, ==, BT_MAGIC);
+ gen = bbt->bt_gen;
+ crtxg = bbt->bt_crtxg;
+ lrtxg = lr->lr_common.lrc_txg;
+
+ tx = dmu_tx_create(os);
+
+ dmu_tx_hold_write(tx, lr->lr_foid, offset, length);
+
+ if (ztest_random(8) == 0 && length == doi.doi_data_block_size &&
+ P2PHASE(offset, length) == 0)
+ abuf = dmu_request_arcbuf(db, length);
+
+ txg = ztest_tx_assign(tx, TXG_WAIT, FTAG);
+ if (txg == 0) {
+ if (abuf != NULL)
+ dmu_return_arcbuf(abuf);
+ dmu_buf_rele(db, FTAG);
+ ztest_range_unlock(rl);
+ ztest_object_unlock(zd, lr->lr_foid);
+ return (ENOSPC);
+ }
+
+ if (bt != NULL) {
+ /*
+ * Usually, verify the old data before writing new data --
+ * but not always, because we also want to verify correct
+ * behavior when the data was not recently read into cache.
+ */
+ ASSERT(offset % doi.doi_data_block_size == 0);
+ if (ztest_random(4) != 0) {
+ int prefetch = ztest_random(2) ?
+ DMU_READ_PREFETCH : DMU_READ_NO_PREFETCH;
+ ztest_block_tag_t rbt;
+
+ VERIFY(dmu_read(os, lr->lr_foid, offset,
+ sizeof (rbt), &rbt, prefetch) == 0);
+ if (rbt.bt_magic == BT_MAGIC) {
+ ztest_bt_verify(&rbt, os, lr->lr_foid,
+ offset, gen, txg, crtxg);
+ }
+ }
+
+ /*
+ * Writes can appear to be newer than the bonus buffer because
+ * the ztest_get_data() callback does a dmu_read() of the
+ * open-context data, which may be different than the data
+ * as it was when the write was generated.
+ */
+ if (zd->zd_zilog->zl_replay) {
+ ztest_bt_verify(bt, os, lr->lr_foid, offset,
+ MAX(gen, bt->bt_gen), MAX(txg, lrtxg),
+ bt->bt_crtxg);
+ }
+
+ /*
+ * Set the bt's gen/txg to the bonus buffer's gen/txg
+ * so that all of the usual ASSERTs will work.
+ */
+ ztest_bt_generate(bt, os, lr->lr_foid, offset, gen, txg, crtxg);
+ }
+
+ if (abuf == NULL) {
+ dmu_write(os, lr->lr_foid, offset, length, data, tx);
+ } else {
+ bcopy(data, abuf->b_data, length);
+ dmu_assign_arcbuf(db, offset, abuf, tx);
+ }
+
+ (void) ztest_log_write(zd, tx, lr);
+
+ dmu_buf_rele(db, FTAG);
+
+ dmu_tx_commit(tx);
+
+ ztest_range_unlock(rl);
+ ztest_object_unlock(zd, lr->lr_foid);
+
+ return (0);
+ }
+
+ static int
+ ztest_replay_truncate(ztest_ds_t *zd, lr_truncate_t *lr, boolean_t byteswap)
+ {
+ objset_t *os = zd->zd_os;
+ dmu_tx_t *tx;
+ uint64_t txg;
+ rl_t *rl;
+
+ if (byteswap)
+ byteswap_uint64_array(lr, sizeof (*lr));
+
+ ztest_object_lock(zd, lr->lr_foid, RL_READER);
+ rl = ztest_range_lock(zd, lr->lr_foid, lr->lr_offset, lr->lr_length,
+ RL_WRITER);
+
+ tx = dmu_tx_create(os);
+
+ dmu_tx_hold_free(tx, lr->lr_foid, lr->lr_offset, lr->lr_length);
+
+ txg = ztest_tx_assign(tx, TXG_WAIT, FTAG);
+ if (txg == 0) {
+ ztest_range_unlock(rl);
+ ztest_object_unlock(zd, lr->lr_foid);
+ return (ENOSPC);
+ }
+
+ VERIFY(dmu_free_range(os, lr->lr_foid, lr->lr_offset,
+ lr->lr_length, tx) == 0);
+
+ (void) ztest_log_truncate(zd, tx, lr);
+
+ dmu_tx_commit(tx);
+
+ ztest_range_unlock(rl);
+ ztest_object_unlock(zd, lr->lr_foid);
+
+ return (0);
+ }
+
+ static int
+ ztest_replay_setattr(ztest_ds_t *zd, lr_setattr_t *lr, boolean_t byteswap)
+ {
+ objset_t *os = zd->zd_os;
+ dmu_tx_t *tx;
+ dmu_buf_t *db;
+ ztest_block_tag_t *bbt;
+ uint64_t txg, lrtxg, crtxg;
+
+ if (byteswap)
+ byteswap_uint64_array(lr, sizeof (*lr));
+
+ ztest_object_lock(zd, lr->lr_foid, RL_WRITER);
+
+ VERIFY3U(0, ==, dmu_bonus_hold(os, lr->lr_foid, FTAG, &db));
+
+ tx = dmu_tx_create(os);
+ dmu_tx_hold_bonus(tx, lr->lr_foid);
+
+ txg = ztest_tx_assign(tx, TXG_WAIT, FTAG);
+ if (txg == 0) {
+ dmu_buf_rele(db, FTAG);
+ ztest_object_unlock(zd, lr->lr_foid);
+ return (ENOSPC);
+ }
+
+ bbt = ztest_bt_bonus(db);
+ ASSERT3U(bbt->bt_magic, ==, BT_MAGIC);
+ crtxg = bbt->bt_crtxg;
+ lrtxg = lr->lr_common.lrc_txg;
+
+ if (zd->zd_zilog->zl_replay) {
+ ASSERT(lr->lr_size != 0);
+ ASSERT(lr->lr_mode != 0);
+ ASSERT(lrtxg != 0);
+ } else {
+ /*
+ * Randomly change the size and increment the generation.
+ */
+ lr->lr_size = (ztest_random(db->db_size / sizeof (*bbt)) + 1) *
+ sizeof (*bbt);
+ lr->lr_mode = bbt->bt_gen + 1;
+ ASSERT(lrtxg == 0);
+ }
+
+ /*
+ * Verify that the current bonus buffer is not newer than our txg.
+ */
+ ztest_bt_verify(bbt, os, lr->lr_foid, -1ULL, lr->lr_mode,
+ MAX(txg, lrtxg), crtxg);
+
+ dmu_buf_will_dirty(db, tx);
+
+ ASSERT3U(lr->lr_size, >=, sizeof (*bbt));
+ ASSERT3U(lr->lr_size, <=, db->db_size);
+ VERIFY3U(dmu_set_bonus(db, lr->lr_size, tx), ==, 0);
+ bbt = ztest_bt_bonus(db);
+
+ ztest_bt_generate(bbt, os, lr->lr_foid, -1ULL, lr->lr_mode, txg, crtxg);
+
+ dmu_buf_rele(db, FTAG);
+
+ (void) ztest_log_setattr(zd, tx, lr);
+
+ dmu_tx_commit(tx);
+
+ ztest_object_unlock(zd, lr->lr_foid);
+
+ return (0);
+ }
+
+ zil_replay_func_t *ztest_replay_vector[TX_MAX_TYPE] = {
+ NULL, /* 0 no such transaction type */
+ ztest_replay_create, /* TX_CREATE */
+ NULL, /* TX_MKDIR */
+ NULL, /* TX_MKXATTR */
+ NULL, /* TX_SYMLINK */
+ ztest_replay_remove, /* TX_REMOVE */
+ NULL, /* TX_RMDIR */
+ NULL, /* TX_LINK */
+ NULL, /* TX_RENAME */
+ ztest_replay_write, /* TX_WRITE */
+ ztest_replay_truncate, /* TX_TRUNCATE */
+ ztest_replay_setattr, /* TX_SETATTR */
+ NULL, /* TX_ACL */
+ NULL, /* TX_CREATE_ACL */
+ NULL, /* TX_CREATE_ATTR */
+ NULL, /* TX_CREATE_ACL_ATTR */
+ NULL, /* TX_MKDIR_ACL */
+ NULL, /* TX_MKDIR_ATTR */
+ NULL, /* TX_MKDIR_ACL_ATTR */
+ NULL, /* TX_WRITE2 */
+ };
+
+ /*
+ * ZIL get_data callbacks
+ */
+
+ static void
+ ztest_get_done(zgd_t *zgd, int error)
+ {
+ ztest_ds_t *zd = zgd->zgd_private;
+ uint64_t object = zgd->zgd_rl->rl_object;
+
+ if (zgd->zgd_db)
+ dmu_buf_rele(zgd->zgd_db, zgd);
+
+ ztest_range_unlock(zgd->zgd_rl);
+ ztest_object_unlock(zd, object);
+
+ if (error == 0 && zgd->zgd_bp)
+ zil_add_block(zgd->zgd_zilog, zgd->zgd_bp);
+
+ umem_free(zgd, sizeof (*zgd));
+ }
+
+ static int
+ ztest_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio)
+ {
+ ztest_ds_t *zd = arg;
+ objset_t *os = zd->zd_os;
+ uint64_t object = lr->lr_foid;
+ uint64_t offset = lr->lr_offset;
+ uint64_t size = lr->lr_length;
+ blkptr_t *bp = &lr->lr_blkptr;
+ uint64_t txg = lr->lr_common.lrc_txg;
+ uint64_t crtxg;
+ dmu_object_info_t doi;
+ dmu_buf_t *db;
+ zgd_t *zgd;
+ int error;
+
+ ztest_object_lock(zd, object, RL_READER);
+ error = dmu_bonus_hold(os, object, FTAG, &db);
+ if (error) {
+ ztest_object_unlock(zd, object);
+ return (error);
+ }
+
+ crtxg = ztest_bt_bonus(db)->bt_crtxg;
+
+ if (crtxg == 0 || crtxg > txg) {
+ dmu_buf_rele(db, FTAG);
+ ztest_object_unlock(zd, object);
+ return (ENOENT);
+ }
+
+ dmu_object_info_from_db(db, &doi);
+ dmu_buf_rele(db, FTAG);
+ db = NULL;
+
+ zgd = umem_zalloc(sizeof (*zgd), UMEM_NOFAIL);
+ zgd->zgd_zilog = zd->zd_zilog;
+ zgd->zgd_private = zd;
+
+ if (buf != NULL) { /* immediate write */
+ zgd->zgd_rl = ztest_range_lock(zd, object, offset, size,
+ RL_READER);
+
+ error = dmu_read(os, object, offset, size, buf,
+ DMU_READ_NO_PREFETCH);
+ ASSERT(error == 0);
+ } else {
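+ /* Indirect write: have dmu_sync() write out the containing block. */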
+ size = doi.doi_data_block_size;
+ if (ISP2(size)) {
+ offset = P2ALIGN(offset, size);
+ } else {
+ ASSERT(offset < size);
+ offset = 0;
+ }
+
+ zgd->zgd_rl = ztest_range_lock(zd, object, offset, size,
+ RL_READER);
+
+ error = dmu_buf_hold(os, object, offset, zgd, &db,
+ DMU_READ_NO_PREFETCH);
+
+ if (error == 0) {
+ zgd->zgd_db = db;
+ zgd->zgd_bp = bp;
+
+ ASSERT(db->db_offset == offset);
+ ASSERT(db->db_size == size);
+
+ error = dmu_sync(zio, lr->lr_common.lrc_txg,
+ ztest_get_done, zgd);
+
+ if (error == 0)
+ return (0);
+ }
+ }
+
+ ztest_get_done(zgd, error);
+
+ return (error);
+ }
+
+ static void *
+ ztest_lr_alloc(size_t lrsize, char *name)
+ {
+ char *lr;
+ size_t namesize = name ? strlen(name) + 1 : 0;
+
+ lr = umem_zalloc(lrsize + namesize, UMEM_NOFAIL);
+
+ if (name)
+ bcopy(name, lr + lrsize, namesize);
+
+ return (lr);
+ }
+
+ void
+ ztest_lr_free(void *lr, size_t lrsize, char *name)
+ {
+ size_t namesize = name ? strlen(name) + 1 : 0;
+
+ umem_free(lr, lrsize + namesize);
+ }
+
+ /*
+ * Lookup a bunch of objects. Returns the number of objects not found.
+ */
+ static int
+ ztest_lookup(ztest_ds_t *zd, ztest_od_t *od, int count)
+ {
+ int missing = 0;
+ int error;
++ int i;
+
+ ASSERT(_mutex_held(&zd->zd_dirobj_lock));
+
- for (int i = 0; i < count; i++, od++) {
++ for (i = 0; i < count; i++, od++) {
+ od->od_object = 0;
+ error = zap_lookup(zd->zd_os, od->od_dir, od->od_name,
+ sizeof (uint64_t), 1, &od->od_object);
+ if (error) {
+ ASSERT(error == ENOENT);
+ ASSERT(od->od_object == 0);
+ missing++;
+ } else {
+ dmu_buf_t *db;
+ ztest_block_tag_t *bbt;
+ dmu_object_info_t doi;
+
+ ASSERT(od->od_object != 0);
+ ASSERT(missing == 0); /* there should be no gaps */
+
+ ztest_object_lock(zd, od->od_object, RL_READER);
+ VERIFY3U(0, ==, dmu_bonus_hold(zd->zd_os,
+ od->od_object, FTAG, &db));
+ dmu_object_info_from_db(db, &doi);
+ bbt = ztest_bt_bonus(db);
+ ASSERT3U(bbt->bt_magic, ==, BT_MAGIC);
+ od->od_type = doi.doi_type;
+ od->od_blocksize = doi.doi_data_block_size;
+ od->od_gen = bbt->bt_gen;
+ dmu_buf_rele(db, FTAG);
+ ztest_object_unlock(zd, od->od_object);
+ }
+ }
+
+ return (missing);
+ }
+
+ static int
+ ztest_create(ztest_ds_t *zd, ztest_od_t *od, int count)
+ {
+ int missing = 0;
++ int i;
+
+ ASSERT(_mutex_held(&zd->zd_dirobj_lock));
+
- for (int i = 0; i < count; i++, od++) {
++ for (i = 0; i < count; i++, od++) {
+ if (missing) {
+ od->od_object = 0;
+ missing++;
+ continue;
+ }
+
+ lr_create_t *lr = ztest_lr_alloc(sizeof (*lr), od->od_name);
+
+ lr->lr_doid = od->od_dir;
+ lr->lr_foid = 0; /* 0 to allocate, > 0 to claim */
+ lr->lrz_type = od->od_crtype;
+ lr->lrz_blocksize = od->od_crblocksize;
+ lr->lrz_ibshift = ztest_random_ibshift();
+ lr->lrz_bonustype = DMU_OT_UINT64_OTHER;
+ lr->lrz_bonuslen = dmu_bonus_max();
+ lr->lr_gen = od->od_crgen;
+ lr->lr_crtime[0] = time(NULL);
+
+ if (ztest_replay_create(zd, lr, B_FALSE) != 0) {
+ ASSERT(missing == 0);
+ od->od_object = 0;
+ missing++;
+ } else {
+ od->od_object = lr->lr_foid;
+ od->od_type = od->od_crtype;
+ od->od_blocksize = od->od_crblocksize;
+ od->od_gen = od->od_crgen;
+ ASSERT(od->od_object != 0);
+ }
+
+ ztest_lr_free(lr, sizeof (*lr), od->od_name);
+ }
+
+ return (missing);
+ }
+
+ static int
+ ztest_remove(ztest_ds_t *zd, ztest_od_t *od, int count)
+ {
+ int missing = 0;
+ int error;
++ int i;
+
+ ASSERT(_mutex_held(&zd->zd_dirobj_lock));
+
+ od += count - 1;
+
- for (int i = count - 1; i >= 0; i--, od--) {
++ for (i = count - 1; i >= 0; i--, od--) {
+ if (missing) {
+ missing++;
+ continue;
+ }
+
+ if (od->od_object == 0)
+ continue;
+
+ lr_remove_t *lr = ztest_lr_alloc(sizeof (*lr), od->od_name);
+
+ lr->lr_doid = od->od_dir;
+
+ if ((error = ztest_replay_remove(zd, lr, B_FALSE)) != 0) {
+ ASSERT3U(error, ==, ENOSPC);
+ missing++;
+ } else {
+ od->od_object = 0;
+ }
+ ztest_lr_free(lr, sizeof (*lr), od->od_name);
+ }
+
+ return (missing);
+ }
+
+ static int
+ ztest_write(ztest_ds_t *zd, uint64_t object, uint64_t offset, uint64_t size,
+ void *data)
+ {
+ lr_write_t *lr;
+ int error;
+
+ lr = ztest_lr_alloc(sizeof (*lr) + size, NULL);
+
+ lr->lr_foid = object;
+ lr->lr_offset = offset;
+ lr->lr_length = size;
+ lr->lr_blkoff = 0;
+ BP_ZERO(&lr->lr_blkptr);
+
+ bcopy(data, lr + 1, size);
+
+ error = ztest_replay_write(zd, lr, B_FALSE);
+
+ ztest_lr_free(lr, sizeof (*lr) + size, NULL);
+
+ return (error);
+ }
+
+ static int
+ ztest_truncate(ztest_ds_t *zd, uint64_t object, uint64_t offset, uint64_t size)
+ {
+ lr_truncate_t *lr;
+ int error;
+
+ lr = ztest_lr_alloc(sizeof (*lr), NULL);
+
+ lr->lr_foid = object;
+ lr->lr_offset = offset;
+ lr->lr_length = size;
+
+ error = ztest_replay_truncate(zd, lr, B_FALSE);
+
+ ztest_lr_free(lr, sizeof (*lr), NULL);
+
+ return (error);
+ }
+
+ static int
+ ztest_setattr(ztest_ds_t *zd, uint64_t object)
+ {
+ lr_setattr_t *lr;
+ int error;
+
+ lr = ztest_lr_alloc(sizeof (*lr), NULL);
+
+ lr->lr_foid = object;
+ lr->lr_size = 0;
+ lr->lr_mode = 0;
+
+ error = ztest_replay_setattr(zd, lr, B_FALSE);
+
+ ztest_lr_free(lr, sizeof (*lr), NULL);
+
+ return (error);
+ }
+
+ static void
+ ztest_prealloc(ztest_ds_t *zd, uint64_t object, uint64_t offset, uint64_t size)
+ {
+ objset_t *os = zd->zd_os;
+ dmu_tx_t *tx;
+ uint64_t txg;
+ rl_t *rl;
+
+ txg_wait_synced(dmu_objset_pool(os), 0);
+
+ ztest_object_lock(zd, object, RL_READER);
+ rl = ztest_range_lock(zd, object, offset, size, RL_WRITER);
+
+ tx = dmu_tx_create(os);
+
+ dmu_tx_hold_write(tx, object, offset, size);
+
+ txg = ztest_tx_assign(tx, TXG_WAIT, FTAG);
+
+ if (txg != 0) {
+ dmu_prealloc(os, object, offset, size, tx);
+ dmu_tx_commit(tx);
+ txg_wait_synced(dmu_objset_pool(os), txg);
+ } else {
+ (void) dmu_free_long_range(os, object, offset, size);
+ }
+
+ ztest_range_unlock(rl);
+ ztest_object_unlock(zd, object);
+ }
+
+ static void
+ ztest_io(ztest_ds_t *zd, uint64_t object, uint64_t offset)
+ {
+ ztest_block_tag_t wbt;
+ dmu_object_info_t doi;
+ enum ztest_io_type io_type;
+ uint64_t blocksize;
+ void *data;
+
+ VERIFY(dmu_object_info(zd->zd_os, object, &doi) == 0);
+ blocksize = doi.doi_data_block_size;
+ data = umem_alloc(blocksize, UMEM_NOFAIL);
+
+ /*
+ * Pick an i/o type at random, biased toward writing block tags.
+ */
+ io_type = ztest_random(ZTEST_IO_TYPES);
+ if (ztest_random(2) == 0)
+ io_type = ZTEST_IO_WRITE_TAG;
+
+ switch (io_type) {
+
+ case ZTEST_IO_WRITE_TAG:
+ ztest_bt_generate(&wbt, zd->zd_os, object, offset, 0, 0, 0);
+ (void) ztest_write(zd, object, offset, sizeof (wbt), &wbt);
+ break;
+
+ case ZTEST_IO_WRITE_PATTERN:
+ (void) memset(data, 'a' + (object + offset) % 5, blocksize);
+ if (ztest_random(2) == 0) {
+ /*
+ * Induce fletcher2 collisions to ensure that
+ * zio_ddt_collision() detects and resolves them
+ * when using fletcher2-verify for deduplication.
+ */
+ ((uint64_t *)data)[0] ^= 1ULL << 63;
+ ((uint64_t *)data)[4] ^= 1ULL << 63;
+ }
+ (void) ztest_write(zd, object, offset, blocksize, data);
+ break;
+
+ case ZTEST_IO_WRITE_ZEROES:
+ bzero(data, blocksize);
+ (void) ztest_write(zd, object, offset, blocksize, data);
+ break;
+
+ case ZTEST_IO_TRUNCATE:
+ (void) ztest_truncate(zd, object, offset, blocksize);
+ break;
+
+ case ZTEST_IO_SETATTR:
+ (void) ztest_setattr(zd, object);
+ break;
+ }
+
+ umem_free(data, blocksize);
+ }
+
+ /*
+ * Initialize an object description template.
+ */
+ static void
+ ztest_od_init(ztest_od_t *od, uint64_t id, char *tag, uint64_t index,
+ dmu_object_type_t type, uint64_t blocksize, uint64_t gen)
+ {
+ od->od_dir = ZTEST_DIROBJ;
+ od->od_object = 0;
+
+ od->od_crtype = type;
+ od->od_crblocksize = blocksize ? blocksize : ztest_random_blocksize();
+ od->od_crgen = gen;
+
+ od->od_type = DMU_OT_NONE;
+ od->od_blocksize = 0;
+ od->od_gen = 0;
+
+ (void) snprintf(od->od_name, sizeof (od->od_name), "%s(%lld)[%llu]",
+ tag, (int64_t)id, index);
+ }
+
+ /*
+ * Lookup or create the objects for a test using the od template.
+ * If the objects do not all exist, or if 'remove' is specified,
+ * remove any existing objects and create new ones. Otherwise,
+ * use the existing objects.
+ */
+ static int
+ ztest_object_init(ztest_ds_t *zd, ztest_od_t *od, size_t size, boolean_t remove)
+ {
+ int count = size / sizeof (*od);
+ int rv = 0;
+
+ VERIFY(mutex_lock(&zd->zd_dirobj_lock) == 0);
+ if ((ztest_lookup(zd, od, count) != 0 || remove) &&
+ (ztest_remove(zd, od, count) != 0 ||
+ ztest_create(zd, od, count) != 0))
+ rv = -1;
+ zd->zd_od = od;
+ VERIFY(mutex_unlock(&zd->zd_dirobj_lock) == 0);
+
+ return (rv);
+ }
+
+ /* ARGSUSED */
+ void
+ ztest_zil_commit(ztest_ds_t *zd, uint64_t id)
+ {
+ zilog_t *zilog = zd->zd_zilog;
+
+ zil_commit(zilog, UINT64_MAX, ztest_random(ZTEST_OBJECTS));
+
+ /*
+ * Remember the committed values in zd, which is in parent/child
+ * shared memory. If we die, the next iteration of ztest_run()
+ * will verify that the log really does contain this record.
+ */
+ mutex_enter(&zilog->zl_lock);
+ ASSERT(zd->zd_seq <= zilog->zl_commit_lr_seq);
+ zd->zd_seq = zilog->zl_commit_lr_seq;
+ mutex_exit(&zilog->zl_lock);
+ }
+
+ /*
+ * Verify that we can't destroy an active pool, create an existing pool,
+ * or create a pool with a bad vdev spec.
+ */
+ /* ARGSUSED */
+ void
+ ztest_spa_create_destroy(ztest_ds_t *zd, uint64_t id)
+ {
+ ztest_shared_t *zs = ztest_shared;
+ spa_t *spa;
+ nvlist_t *nvroot;
+
+ /*
+ * Attempt to create using a bad file.
+ */
+ nvroot = make_vdev_root("/dev/bogus", NULL, 0, 0, 0, 0, 0, 1);
+ VERIFY3U(ENOENT, ==,
+ spa_create("ztest_bad_file", nvroot, NULL, NULL, NULL));
+ nvlist_free(nvroot);
+
+ /*
+ * Attempt to create using a bad mirror.
+ */
+ nvroot = make_vdev_root("/dev/bogus", NULL, 0, 0, 0, 0, 2, 1);
+ VERIFY3U(ENOENT, ==,
+ spa_create("ztest_bad_mirror", nvroot, NULL, NULL, NULL));
+ nvlist_free(nvroot);
+
+ /*
+ * Attempt to create an existing pool. It shouldn't matter
+ * what's in the nvroot; we should fail with EEXIST.
+ */
+ (void) rw_rdlock(&zs->zs_name_lock);
+ nvroot = make_vdev_root("/dev/bogus", NULL, 0, 0, 0, 0, 0, 1);
+ VERIFY3U(EEXIST, ==, spa_create(zs->zs_pool, nvroot, NULL, NULL, NULL));
+ nvlist_free(nvroot);
+ VERIFY3U(0, ==, spa_open(zs->zs_pool, &spa, FTAG));
+ VERIFY3U(EBUSY, ==, spa_destroy(zs->zs_pool));
+ spa_close(spa, FTAG);
+
+ (void) rw_unlock(&zs->zs_name_lock);
+ }
+
+ static vdev_t *
+ vdev_lookup_by_path(vdev_t *vd, const char *path)
+ {
+ vdev_t *mvd;
++ int c;
+
+ if (vd->vdev_path != NULL && strcmp(path, vd->vdev_path) == 0)
+ return (vd);
+
- for (int c = 0; c < vd->vdev_children; c++)
++ for (c = 0; c < vd->vdev_children; c++)
+ if ((mvd = vdev_lookup_by_path(vd->vdev_child[c], path)) !=
+ NULL)
+ return (mvd);
+
+ return (NULL);
+ }
+
+ /*
+ * Find the first available hole which can be used as a top-level.
+ */
+ int
+ find_vdev_hole(spa_t *spa)
+ {
+ vdev_t *rvd = spa->spa_root_vdev;
+ int c;
+
+ ASSERT(spa_config_held(spa, SCL_VDEV, RW_READER) == SCL_VDEV);
+
+ for (c = 0; c < rvd->vdev_children; c++) {
+ vdev_t *cvd = rvd->vdev_child[c];
+
+ if (cvd->vdev_ishole)
+ break;
+ }
+ return (c);
+ }
+
+ /*
+ * Verify that vdev_add() works as expected.
+ */
+ /* ARGSUSED */
+ void
+ ztest_vdev_add_remove(ztest_ds_t *zd, uint64_t id)
+ {
+ ztest_shared_t *zs = ztest_shared;
+ spa_t *spa = zs->zs_spa;
+ uint64_t leaves;
+ uint64_t guid;
+ nvlist_t *nvroot;
+ int error;
+
+ VERIFY(mutex_lock(&zs->zs_vdev_lock) == 0);
+ leaves = MAX(zs->zs_mirrors + zs->zs_splits, 1) * zopt_raidz;
+
+ spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
+
+ ztest_shared->zs_vdev_next_leaf = find_vdev_hole(spa) * leaves;
+
+ /*
+ * If we have slogs then remove them 1/4 of the time.
+ */
+ if (spa_has_slogs(spa) && ztest_random(4) == 0) {
+ /*
+ * Grab the guid from the head of the log class rotor.
+ */
+ guid = spa_log_class(spa)->mc_rotor->mg_vd->vdev_guid;
+
+ spa_config_exit(spa, SCL_VDEV, FTAG);
+
+ /*
+ * We have to grab the zs_name_lock as writer to
+ * prevent a race between removing a slog (dmu_objset_find)
+ * and destroying a dataset. Removing the slog will
+ * grab a reference on the dataset which may cause
+ * dmu_objset_destroy() to fail with EBUSY thus
+ * leaving the dataset in an inconsistent state.
+ */
+ VERIFY(rw_wrlock(&ztest_shared->zs_name_lock) == 0);
+ error = spa_vdev_remove(spa, guid, B_FALSE);
+ VERIFY(rw_unlock(&ztest_shared->zs_name_lock) == 0);
+
+ if (error && error != EEXIST)
+ fatal(0, "spa_vdev_remove() = %d", error);
+ } else {
+ spa_config_exit(spa, SCL_VDEV, FTAG);
+
+ /*
+ * Make 1/4 of the devices be log devices.
+ */
+ nvroot = make_vdev_root(NULL, NULL, zopt_vdev_size, 0,
+ ztest_random(4) == 0, zopt_raidz, zs->zs_mirrors, 1);
+
+ error = spa_vdev_add(spa, nvroot);
+ nvlist_free(nvroot);
+
+ if (error == ENOSPC)
+ ztest_record_enospc("spa_vdev_add");
+ else if (error != 0)
+ fatal(0, "spa_vdev_add() = %d", error);
+ }
+
+ VERIFY(mutex_unlock(&ztest_shared->zs_vdev_lock) == 0);
+ }
+
+ /*
+ * Verify that adding/removing aux devices (l2arc, hot spare) works as expected.
+ */
+ /* ARGSUSED */
+ void
+ ztest_vdev_aux_add_remove(ztest_ds_t *zd, uint64_t id)
+ {
+ ztest_shared_t *zs = ztest_shared;
+ spa_t *spa = zs->zs_spa;
+ vdev_t *rvd = spa->spa_root_vdev;
+ spa_aux_vdev_t *sav;
+ char *aux;
+ uint64_t guid = 0;
+ int error;
+
+ if (ztest_random(2) == 0) {
+ sav = &spa->spa_spares;
+ aux = ZPOOL_CONFIG_SPARES;
+ } else {
+ sav = &spa->spa_l2cache;
+ aux = ZPOOL_CONFIG_L2CACHE;
+ }
+
+ VERIFY(mutex_lock(&zs->zs_vdev_lock) == 0);
+
+ spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
if (sav->sav_count != 0 && ztest_random(4) == 0) {
/*
return (0);
}
- /*
- * Verify that dmu_objset_{create,destroy,open,close} work as expected.
- */
- static uint64_t
- ztest_log_create(zilog_t *zilog, dmu_tx_t *tx, uint64_t object, int mode)
+ static boolean_t
+ ztest_snapshot_create(char *osname, uint64_t id)
+ {
+ char snapname[MAXNAMELEN];
+ int error;
+
+ (void) snprintf(snapname, MAXNAMELEN, "%s@%llu", osname,
+ (u_longlong_t)id);
+
+ error = dmu_objset_snapshot(osname, strchr(snapname, '@') + 1,
+ NULL, B_FALSE);
+ if (error == ENOSPC) {
+ ztest_record_enospc(FTAG);
+ return (B_FALSE);
+ }
+ if (error != 0 && error != EEXIST)
+ fatal(0, "ztest_snapshot_create(%s) = %d", snapname, error);
+ return (B_TRUE);
+ }
+
+ static boolean_t
+ ztest_snapshot_destroy(char *osname, uint64_t id)
{
- itx_t *itx;
- lr_create_t *lr;
- size_t namesize;
- char name[24];
-
- (void) sprintf(name, "ZOBJ_%llu", (u_longlong_t)object);
- namesize = strlen(name) + 1;
-
- itx = zil_itx_create(TX_CREATE, sizeof (*lr) + namesize +
- ztest_random(ZIL_MAX_BLKSZ));
- lr = (lr_create_t *)&itx->itx_lr;
- bzero(lr + 1, lr->lr_common.lrc_reclen - sizeof (*lr));
- lr->lr_doid = object;
- lr->lr_foid = 0;
- lr->lr_mode = mode;
- lr->lr_uid = 0;
- lr->lr_gid = 0;
- lr->lr_gen = dmu_tx_get_txg(tx);
- lr->lr_crtime[0] = time(NULL);
- lr->lr_crtime[1] = 0;
- lr->lr_rdev = 0;
- bcopy(name, (char *)(lr + 1), namesize);
-
- return (zil_itx_assign(zilog, itx, tx));
+ char snapname[MAXNAMELEN];
+ int error;
+
+ (void) snprintf(snapname, MAXNAMELEN, "%s@%llu", osname,
+ (u_longlong_t)id);
+
+ error = dmu_objset_destroy(snapname, B_FALSE);
+ if (error != 0 && error != ENOENT)
+ fatal(0, "ztest_snapshot_destroy(%s) = %d", snapname, error);
+ return (B_TRUE);
}
+ /* ARGSUSED */
void
- ztest_dmu_objset_create_destroy(ztest_args_t *za)
+ ztest_dmu_objset_create_destroy(ztest_ds_t *zd, uint64_t id)
{
+ ztest_shared_t *zs = ztest_shared;
+ ztest_ds_t zdtmp;
+ int iters;
int error;
objset_t *os, *os2;
- char name[100];
- int basemode, expected_error;
+ char name[MAXNAMELEN];
zilog_t *zilog;
- uint64_t seq;
- uint64_t objects;
++ int i;
- (void) rw_rdlock(&ztest_shared->zs_name_lock);
- (void) snprintf(name, 100, "%s/%s_temp_%llu", za->za_pool, za->za_pool,
- (u_longlong_t)za->za_instance);
+ (void) rw_rdlock(&zs->zs_name_lock);
- basemode = DS_MODE_TYPE(za->za_instance);
- if (basemode != DS_MODE_USER && basemode != DS_MODE_OWNER)
- basemode = DS_MODE_USER;
+ (void) snprintf(name, MAXNAMELEN, "%s/temp_%llu",
+ zs->zs_pool, (u_longlong_t)id);
/*
* If this dataset exists from a previous run, process its replay log
/*
* Open the intent log for it.
*/
- zilog = zil_open(os, NULL);
+ zilog = zil_open(os, ztest_get_data);
/*
- * Put a random number of objects in there.
+ * Put some objects in there, do a little I/O to them,
+ * and randomly take a couple of snapshots along the way.
*/
- objects = ztest_random(20);
- seq = 0;
- while (objects-- != 0) {
- uint64_t object;
- dmu_tx_t *tx = dmu_tx_create(os);
- dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, sizeof (name));
- error = dmu_tx_assign(tx, TXG_WAIT);
- if (error) {
- dmu_tx_abort(tx);
- } else {
- object = dmu_object_alloc(os, DMU_OT_UINT64_OTHER, 0,
- DMU_OT_NONE, 0, tx);
- ztest_set_random_blocksize(os, object, tx);
- seq = ztest_log_create(zilog, tx, object,
- DMU_OT_UINT64_OTHER);
- dmu_write(os, object, 0, sizeof (name), name, tx);
- dmu_tx_commit(tx);
- }
- if (ztest_random(5) == 0) {
- zil_commit(zilog, seq, object);
- }
- if (ztest_random(100) == 0) {
- error = zil_suspend(zilog);
- if (error == 0) {
- zil_resume(zilog);
- }
- }
+ iters = ztest_random(5);
- for (int i = 0; i < iters; i++) {
++ for (i = 0; i < iters; i++) {
+ ztest_dmu_object_alloc_free(&zdtmp, id);
+ if (ztest_random(iters) == 0)
+ (void) ztest_snapshot_create(name, i);
}
/*
* Verify that dmu_object_{alloc,free} work as expected.
*/
void
- ztest_dmu_object_alloc_free(ztest_args_t *za)
+ ztest_dmu_object_alloc_free(ztest_ds_t *zd, uint64_t id)
{
- objset_t *os = za->za_os;
- dmu_buf_t *db;
- dmu_tx_t *tx;
- uint64_t batchobj, object, batchsize, endoff, temp;
- int b, c, error, bonuslen;
- dmu_object_info_t *doi = &za->za_doi;
- char osname[MAXNAMELEN];
-
- dmu_objset_name(os, osname);
-
- endoff = -8ULL;
- batchsize = 2;
-
- /*
- * Create a batch object if necessary, and record it in the directory.
- */
- VERIFY3U(0, ==, dmu_read(os, ZTEST_DIROBJ, za->za_diroff,
- sizeof (uint64_t), &batchobj, DMU_READ_PREFETCH));
- if (batchobj == 0) {
- tx = dmu_tx_create(os);
- dmu_tx_hold_write(tx, ZTEST_DIROBJ, za->za_diroff,
- sizeof (uint64_t));
- dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
- error = dmu_tx_assign(tx, TXG_WAIT);
- if (error) {
- ztest_record_enospc("create a batch object");
- dmu_tx_abort(tx);
- return;
- }
- batchobj = dmu_object_alloc(os, DMU_OT_UINT64_OTHER, 0,
- DMU_OT_NONE, 0, tx);
- ztest_set_random_blocksize(os, batchobj, tx);
- dmu_write(os, ZTEST_DIROBJ, za->za_diroff,
- sizeof (uint64_t), &batchobj, tx);
- dmu_tx_commit(tx);
- }
-
- /*
- * Destroy the previous batch of objects.
- */
- for (b = 0; b < batchsize; b++) {
- VERIFY3U(0, ==, dmu_read(os, batchobj, b * sizeof (uint64_t),
- sizeof (uint64_t), &object, DMU_READ_PREFETCH));
- if (object == 0)
- continue;
- /*
- * Read and validate contents.
- * We expect the nth byte of the bonus buffer to be n.
- */
- VERIFY(0 == dmu_bonus_hold(os, object, FTAG, &db));
- za->za_dbuf = db;
-
- dmu_object_info_from_db(db, doi);
- ASSERT(doi->doi_type == DMU_OT_UINT64_OTHER);
- ASSERT(doi->doi_bonus_type == DMU_OT_PLAIN_OTHER);
- ASSERT3S(doi->doi_physical_blks, >=, 0);
-
- bonuslen = doi->doi_bonus_size;
-
- for (c = 0; c < bonuslen; c++) {
- if (((uint8_t *)db->db_data)[c] !=
- (uint8_t)(c + bonuslen)) {
- fatal(0,
- "bad bonus: %s, obj %llu, off %d: %u != %u",
- osname, object, c,
- ((uint8_t *)db->db_data)[c],
- (uint8_t)(c + bonuslen));
- }
- }
-
- dmu_buf_rele(db, FTAG);
- za->za_dbuf = NULL;
-
- /*
- * We expect the word at endoff to be our object number.
- */
- VERIFY(0 == dmu_read(os, object, endoff,
- sizeof (uint64_t), &temp, DMU_READ_PREFETCH));
-
- if (temp != object) {
- fatal(0, "bad data in %s, got %llu, expected %llu",
- osname, temp, object);
- }
-
- /*
- * Destroy old object and clear batch entry.
- */
- tx = dmu_tx_create(os);
- dmu_tx_hold_write(tx, batchobj,
- b * sizeof (uint64_t), sizeof (uint64_t));
- dmu_tx_hold_free(tx, object, 0, DMU_OBJECT_END);
- error = dmu_tx_assign(tx, TXG_WAIT);
- if (error) {
- ztest_record_enospc("free object");
- dmu_tx_abort(tx);
- return;
- }
- error = dmu_object_free(os, object, tx);
- if (error) {
- fatal(0, "dmu_object_free('%s', %llu) = %d",
- osname, object, error);
- }
- object = 0;
-
- dmu_object_set_checksum(os, batchobj,
- ztest_random_checksum(), tx);
- dmu_object_set_compress(os, batchobj,
- ztest_random_compress(), tx);
-
- dmu_write(os, batchobj, b * sizeof (uint64_t),
- sizeof (uint64_t), &object, tx);
-
- dmu_tx_commit(tx);
- }
+ ztest_od_t od[4];
+ int batchsize = sizeof (od) / sizeof (od[0]);
++ int b;
- /*
- * Before creating the new batch of objects, generate a bunch of churn.
- */
- for (b = ztest_random(100); b > 0; b--) {
- tx = dmu_tx_create(os);
- dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
- error = dmu_tx_assign(tx, TXG_WAIT);
- if (error) {
- ztest_record_enospc("churn objects");
- dmu_tx_abort(tx);
- return;
- }
- object = dmu_object_alloc(os, DMU_OT_UINT64_OTHER, 0,
- DMU_OT_NONE, 0, tx);
- ztest_set_random_blocksize(os, object, tx);
- error = dmu_object_free(os, object, tx);
- if (error) {
- fatal(0, "dmu_object_free('%s', %llu) = %d",
- osname, object, error);
- }
- dmu_tx_commit(tx);
- }
- for (int b = 0; b < batchsize; b++)
++ for (b = 0; b < batchsize; b++)
+ ztest_od_init(&od[b], id, FTAG, b, DMU_OT_UINT64_OTHER, 0, 0);
/*
- * Create a new batch of objects with randomly chosen
- * blocksizes and record them in the batch directory.
+ * Destroy the previous batch of objects, create a new batch,
+ * and do some I/O on the new objects.
*/
- for (b = 0; b < batchsize; b++) {
- uint32_t va_blksize;
- u_longlong_t va_nblocks;
-
- tx = dmu_tx_create(os);
- dmu_tx_hold_write(tx, batchobj, b * sizeof (uint64_t),
- sizeof (uint64_t));
- dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
- dmu_tx_hold_write(tx, DMU_NEW_OBJECT, endoff,
- sizeof (uint64_t));
- error = dmu_tx_assign(tx, TXG_WAIT);
- if (error) {
- ztest_record_enospc("create batchobj");
- dmu_tx_abort(tx);
- return;
- }
- bonuslen = (int)ztest_random(dmu_bonus_max()) + 1;
-
- object = dmu_object_alloc(os, DMU_OT_UINT64_OTHER, 0,
- DMU_OT_PLAIN_OTHER, bonuslen, tx);
-
- ztest_set_random_blocksize(os, object, tx);
-
- dmu_object_set_checksum(os, object,
- ztest_random_checksum(), tx);
- dmu_object_set_compress(os, object,
- ztest_random_compress(), tx);
-
- dmu_write(os, batchobj, b * sizeof (uint64_t),
- sizeof (uint64_t), &object, tx);
-
- /*
- * Write to both the bonus buffer and the regular data.
- */
- VERIFY(dmu_bonus_hold(os, object, FTAG, &db) == 0);
- za->za_dbuf = db;
- ASSERT3U(bonuslen, <=, db->db_size);
-
- dmu_object_size_from_db(db, &va_blksize, &va_nblocks);
- ASSERT3S(va_nblocks, >=, 0);
-
- dmu_buf_will_dirty(db, tx);
-
- /*
- * See comments above regarding the contents of
- * the bonus buffer and the word at endoff.
- */
- for (c = 0; c < bonuslen; c++)
- ((uint8_t *)db->db_data)[c] = (uint8_t)(c + bonuslen);
-
- dmu_buf_rele(db, FTAG);
- za->za_dbuf = NULL;
-
- /*
- * Write to a large offset to increase indirection.
- */
- dmu_write(os, object, endoff, sizeof (uint64_t), &object, tx);
+ if (ztest_object_init(zd, od, sizeof (od), B_TRUE) != 0)
+ return;
- dmu_tx_commit(tx);
- }
+ while (ztest_random(4 * batchsize) != 0)
+ ztest_io(zd, od[ztest_random(batchsize)].od_object,
+ ztest_random(ZTEST_RANGE_LOCKS) << SPA_MAXBLOCKSHIFT);
}
/*
ASSERT3U(error, ==, 0);
tx = dmu_tx_create(os);
- dmu_tx_hold_zap(tx, object, TRUE, NULL);
- error = dmu_tx_assign(tx, TXG_WAIT);
- if (error) {
- ztest_record_enospc("remove zap entry");
- dmu_tx_abort(tx);
+ dmu_tx_hold_zap(tx, object, B_TRUE, NULL);
+ txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG);
+ if (txg == 0)
+ return;
+ VERIFY3U(0, ==, zap_remove(os, object, txgname, tx));
+ VERIFY3U(0, ==, zap_remove(os, object, propname, tx));
+ dmu_tx_commit(tx);
+ }
+
+ /*
+ * Testcase to test the upgrading of a microzap to fatzap.
+ */
+ void
+ ztest_fzap(ztest_ds_t *zd, uint64_t id)
+ {
+ objset_t *os = zd->zd_os;
+ ztest_od_t od[1];
+ uint64_t object, txg;
++ int i;
+
+ ztest_od_init(&od[0], id, FTAG, 0, DMU_OT_ZAP_OTHER, 0, 0);
+
+ if (ztest_object_init(zd, od, sizeof (od), !ztest_random(2)) != 0)
return;
- for (int i = 0; i < 2050; i++) {
+
+ object = od[0].od_object;
+
+ /*
+ * Add entries to this ZAP and make sure it spills over
+ * and gets upgraded to a fatzap. Also, since we are adding
+ * 2050 entries we should see ptrtbl growth and leaf-block splits.
+ */
++ for (i = 0; i < 2050; i++) {
+ char name[MAXNAMELEN];
+ uint64_t value = i;
+ dmu_tx_t *tx;
+ int error;
+
+ (void) snprintf(name, sizeof (name), "fzap-%llu-%llu",
+ id, value);
+
+ tx = dmu_tx_create(os);
+ dmu_tx_hold_zap(tx, object, B_TRUE, name);
+ txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG);
+ if (txg == 0)
+ return;
+ error = zap_add(os, object, name, sizeof (uint64_t), 1,
+ &value, tx);
+ ASSERT(error == 0 || error == EEXIST);
+ dmu_tx_commit(tx);
}
- error = zap_remove(os, object, txgname, tx);
- if (error)
- fatal(0, "zap_remove('%s', %llu, '%s') = %d",
- osname, object, txgname, error);
+ }
- error = zap_remove(os, object, propname, tx);
- if (error)
- fatal(0, "zap_remove('%s', %llu, '%s') = %d",
- osname, object, propname, error);
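On the 2050-entry count in ztest_fzap() above: a microzap is confined to a single block, so even at the maximum 128K block size it can hold only on the order of two thousand of its fixed 64-byte entries (the exact ceiling is an assumption here, derived from the entry size). 2050 insertions therefore guarantee the spill into a fatzap that the comment promises, along with the ptrtbl growth and leaf splits that follow.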
+ /* ARGSUSED */
+ void
+ ztest_zap_parallel(ztest_ds_t *zd, uint64_t id)
+ {
+ objset_t *os = zd->zd_os;
+ ztest_od_t od[1];
+ uint64_t txg, object, count, wsize, wc, zl_wsize, zl_wc;
+ dmu_tx_t *tx;
+ int i, namelen, error;
+ int micro = ztest_random(2);
+ char name[20], string_value[20];
+ void *data;
- dmu_tx_commit(tx);
+ ztest_od_init(&od[0], ID_PARALLEL, FTAG, micro, DMU_OT_ZAP_OTHER, 0, 0);
+
+ if (ztest_object_init(zd, od, sizeof (od), B_FALSE) != 0)
+ return;
+
+ object = od[0].od_object;
+
+ /*
+ * Generate a random name of the form 'xxx.....' where each
+ * x is a random printable character and the dots are dots.
+ * There are 94 such characters, and the name length goes from
+ * 6 to 20, so there are 94^3 * 15 = 12,458,760 possible names.
+ */
+ namelen = ztest_random(sizeof (name) - 5) + 5 + 1;
+
+ for (i = 0; i < 3; i++)
+ name[i] = '!' + ztest_random('~' - '!' + 1);
+ for (; i < namelen - 1; i++)
+ name[i] = '.';
+ name[i] = '\0';
+
+ if ((namelen & 1) || micro) {
+ wsize = sizeof (txg);
+ wc = 1;
+ data = &txg;
+ } else {
+ wsize = 1;
+ wc = namelen;
+ data = string_value;
+ }
+
+ count = -1ULL;
+ VERIFY(zap_count(os, object, &count) == 0);
+ ASSERT(count != -1ULL);
/*
- * Once in a while, destroy the object.
+ * Select an operation: length, lookup, add, update, remove.
*/
- if (ztest_random(1000) != 0)
+ i = ztest_random(5);
+
+ if (i >= 2) {
+ tx = dmu_tx_create(os);
+ dmu_tx_hold_zap(tx, object, B_TRUE, NULL);
+ txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG);
+ if (txg == 0)
+ return;
+ bcopy(name, string_value, namelen);
+ } else {
+ tx = NULL;
+ txg = 0;
+ bzero(string_value, namelen);
+ }
+
+ switch (i) {
+
+ case 0:
+ error = zap_length(os, object, name, &zl_wsize, &zl_wc);
+ if (error == 0) {
+ ASSERT3U(wsize, ==, zl_wsize);
+ ASSERT3U(wc, ==, zl_wc);
+ } else {
+ ASSERT3U(error, ==, ENOENT);
+ }
+ break;
+
+ case 1:
+ error = zap_lookup(os, object, name, wsize, wc, data);
+ if (error == 0) {
+ if (data == string_value &&
+ bcmp(name, data, namelen) != 0)
+ fatal(0, "name '%s' != val '%s' len %d",
+ name, data, namelen);
+ } else {
+ ASSERT3U(error, ==, ENOENT);
+ }
+ break;
+
+ case 2:
+ error = zap_add(os, object, name, wsize, wc, data, tx);
+ ASSERT(error == 0 || error == EEXIST);
+ break;
+
+ case 3:
+ VERIFY(zap_update(os, object, name, wsize, wc, data, tx) == 0);
+ break;
+
+ case 4:
+ error = zap_remove(os, object, name, tx);
+ ASSERT(error == 0 || error == ENOENT);
+ break;
+ }
+
+ if (tx != NULL)
+ dmu_tx_commit(tx);
+ }
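As a concrete example of the name generation in ztest_zap_parallel() above: a draw of namelen == 9 produces an 8-character name such as "zQ%....." (three random printable characters followed by five dots; the particular characters shown are illustrative only, since any of the 94 printables can appear in the first three slots).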
+
+ /*
+ * Commit callback data.
+ */
+ typedef struct ztest_cb_data {
+ list_node_t zcd_node;
+ uint64_t zcd_txg;
+ int zcd_expected_err;
+ boolean_t zcd_added;
+ boolean_t zcd_called;
+ spa_t *zcd_spa;
+ } ztest_cb_data_t;
+
+ /* This is the actual commit callback function */
+ static void
+ ztest_commit_callback(void *arg, int error)
+ {
+ ztest_cb_data_t *data = arg;
+ uint64_t synced_txg;
+
+ VERIFY(data != NULL);
+ VERIFY3S(data->zcd_expected_err, ==, error);
+ VERIFY(!data->zcd_called);
+
+ synced_txg = spa_last_synced_txg(data->zcd_spa);
+ if (data->zcd_txg > synced_txg)
+ fatal(0, "commit callback of txg %" PRIu64 " called prematurely"
+ ", last synced txg = %" PRIu64 "\n", data->zcd_txg,
+ synced_txg);
+
+ data->zcd_called = B_TRUE;
+
+ if (error == ECANCELED) {
+ ASSERT3U(data->zcd_txg, ==, 0);
+ ASSERT(!data->zcd_added);
+
+ /*
+ * The private callback data should be destroyed here, but
+ * since we are going to check the zcd_called field after
+ * dmu_tx_abort(), we will destroy it there.
+ */
+ return;
+ }
+
+ /* Was this callback added to the global callback list? */
+ if (!data->zcd_added)
+ goto out;
+
+ ASSERT3U(data->zcd_txg, !=, 0);
+
+ /* Remove our callback from the list */
+ (void) mutex_lock(&zcl.zcl_callbacks_lock);
+ list_remove(&zcl.zcl_callbacks, data);
+ (void) mutex_unlock(&zcl.zcl_callbacks_lock);
+
+ out:
+ umem_free(data, sizeof (ztest_cb_data_t));
+ }
+
+ /* Allocate and initialize callback data structure */
+ static ztest_cb_data_t *
+ ztest_create_cb_data(objset_t *os, uint64_t txg)
+ {
+ ztest_cb_data_t *cb_data;
+
+ cb_data = umem_zalloc(sizeof (ztest_cb_data_t), UMEM_NOFAIL);
+
+ cb_data->zcd_txg = txg;
+ cb_data->zcd_spa = dmu_objset_spa(os);
+
+ return (cb_data);
+ }
+
+ /*
+ * If a number of txgs equal to this threshold have been created after a commit
+ * callback has been registered but not called, then we assume there is an
+ * implementation bug.
+ */
+ #define ZTEST_COMMIT_CALLBACK_THRESH (TXG_CONCURRENT_STATES + 2)
+
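Assuming the usual DMU value of TXG_CONCURRENT_STATES (3, for the open, quiescing, and syncing states), this threshold works out to 5 txgs: a callback may legitimately stay pending while its txg drains through the pipeline, but not for two full txgs beyond that.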
+ /*
+ * Commit callback test.
+ */
+ void
+ ztest_dmu_commit_callbacks(ztest_ds_t *zd, uint64_t id)
+ {
+ objset_t *os = zd->zd_os;
+ ztest_od_t od[1];
+ dmu_tx_t *tx;
+ ztest_cb_data_t *cb_data[3], *tmp_cb;
+ uint64_t old_txg, txg;
+ int i, error = 0;
+
+ ztest_od_init(&od[0], id, FTAG, 0, DMU_OT_UINT64_OTHER, 0, 0);
+
+ if (ztest_object_init(zd, od, sizeof (od), B_FALSE) != 0)
return;
tx = dmu_tx_create(os);
- dmu_tx_hold_write(tx, ZTEST_DIROBJ, za->za_diroff, sizeof (uint64_t));
- dmu_tx_hold_free(tx, object, 0, DMU_OBJECT_END);
- error = dmu_tx_assign(tx, TXG_WAIT);
+
+ cb_data[0] = ztest_create_cb_data(os, 0);
+ dmu_tx_callback_register(tx, ztest_commit_callback, cb_data[0]);
+
+ dmu_tx_hold_write(tx, od[0].od_object, 0, sizeof (uint64_t));
+
+ /* Every once in a while, abort the transaction on purpose */
+ if (ztest_random(100) == 0)
+ error = -1;
+
+ if (!error)
+ error = dmu_tx_assign(tx, TXG_NOWAIT);
+
+ txg = error ? 0 : dmu_tx_get_txg(tx);
+
+ cb_data[0]->zcd_txg = txg;
+ cb_data[1] = ztest_create_cb_data(os, txg);
+ dmu_tx_callback_register(tx, ztest_commit_callback, cb_data[1]);
+
if (error) {
- ztest_record_enospc("destroy zap object");
+ /*
+ * It's not a strict requirement for the registered
+ * callbacks to be called from inside dmu_tx_abort(), but
+ * that's what the current implementation does, so we
+ * will check for that.
+ */
+ for (i = 0; i < 2; i++) {
+ cb_data[i]->zcd_expected_err = ECANCELED;
+ VERIFY(!cb_data[i]->zcd_called);
+ }
+
dmu_tx_abort(tx);
+
+ for (i = 0; i < 2; i++) {
+ VERIFY(cb_data[i]->zcd_called);
+ umem_free(cb_data[i], sizeof (ztest_cb_data_t));
+ }
+
return;
}
- error = zap_destroy(os, object, tx);
- if (error)
- fatal(0, "zap_destroy('%s', %llu) = %d",
- osname, object, error);
- object = 0;
- dmu_write(os, ZTEST_DIROBJ, za->za_diroff, sizeof (uint64_t),
- &object, tx);
- dmu_tx_commit(tx);
- }
- void
- ztest_zap_parallel(ztest_args_t *za)
- {
- objset_t *os = za->za_os;
- uint64_t txg, object, count, wsize, wc, zl_wsize, zl_wc;
- dmu_tx_t *tx;
- int i, namelen, error;
- char name[20], string_value[20];
- void *data;
+ cb_data[2] = ztest_create_cb_data(os, txg);
+ dmu_tx_callback_register(tx, ztest_commit_callback, cb_data[2]);
/*
- * Generate a random name of the form 'xxx.....' where each
- * x is a random printable character and the dots are dots.
- * There are 94 such characters, and the name length goes from
- * 6 to 20, so there are 94^3 * 15 = 12,458,760 possible names.
+ * Read existing data to make sure there isn't a future leak.
*/
- namelen = ztest_random(sizeof (name) - 5) + 5 + 1;
+ VERIFY(0 == dmu_read(os, od[0].od_object, 0, sizeof (uint64_t),
+ &old_txg, DMU_READ_PREFETCH));
- for (i = 0; i < 3; i++)
- name[i] = '!' + ztest_random('~' - '!' + 1);
- for (; i < namelen - 1; i++)
- name[i] = '.';
- name[i] = '\0';
+ if (old_txg > txg)
+ fatal(0, "future leak: got %" PRIu64 ", open txg is %" PRIu64,
+ old_txg, txg);
- if (ztest_random(2) == 0)
- object = ZTEST_MICROZAP_OBJ;
- else
- object = ZTEST_FATZAP_OBJ;
+ dmu_write(os, od[0].od_object, 0, sizeof (uint64_t), &txg, tx);
- if ((namelen & 1) || object == ZTEST_MICROZAP_OBJ) {
- wsize = sizeof (txg);
- wc = 1;
- data = &txg;
- } else {
- wsize = 1;
- wc = namelen;
- data = string_value;
- }
+ (void) mutex_lock(&zcl.zcl_callbacks_lock);
- count = -1ULL;
- VERIFY(zap_count(os, object, &count) == 0);
- ASSERT(count != -1ULL);
+ /*
+ * Since commit callbacks don't have any ordering requirement and since
+ * it is theoretically possible for a commit callback to be called
+ * after an arbitrary amount of time has elapsed since its txg has been
+ * synced, it is difficult to reliably determine whether a commit
+ * callback hasn't been called due to high load or due to a flawed
+ * implementation.
+ *
+ * In practice, we will assume that if a commit callback hasn't been
+ * called after a certain number of txgs, then most likely there's an
+ * implementation bug.
+ */
+ tmp_cb = list_head(&zcl.zcl_callbacks);
+ if (tmp_cb != NULL &&
+ tmp_cb->zcd_txg > txg - ZTEST_COMMIT_CALLBACK_THRESH) {
+ fatal(0, "Commit callback threshold exceeded, oldest txg: %"
+ PRIu64 ", open txg: %" PRIu64 "\n", tmp_cb->zcd_txg, txg);
+ }
/*
- * Select an operation: length, lookup, add, update, remove.
+ * Let's find the place to insert our callbacks.
+ *
+ * Even though the list is ordered by txg, it is possible for the
+ * insertion point to not be the end because our txg may already be
+ * quiescing at this point and other callbacks in the open txg
+ * (from other objsets) may have sneaked in.
*/
- i = ztest_random(5);
+ tmp_cb = list_tail(&zcl.zcl_callbacks);
+ while (tmp_cb != NULL && tmp_cb->zcd_txg > txg)
+ tmp_cb = list_prev(&zcl.zcl_callbacks, tmp_cb);
+
+ /* Add the 3 callbacks to the list */
+ for (i = 0; i < 3; i++) {
+ if (tmp_cb == NULL)
+ list_insert_head(&zcl.zcl_callbacks, cb_data[i]);
+ else
+ list_insert_after(&zcl.zcl_callbacks, tmp_cb,
+ cb_data[i]);
- if (i >= 2) {
- tx = dmu_tx_create(os);
- dmu_tx_hold_zap(tx, object, TRUE, NULL);
- error = dmu_tx_assign(tx, TXG_WAIT);
- if (error) {
- ztest_record_enospc("zap parallel");
- dmu_tx_abort(tx);
- return;
- }
- txg = dmu_tx_get_txg(tx);
- bcopy(name, string_value, namelen);
- } else {
- tx = NULL;
- txg = 0;
- bzero(string_value, namelen);
+ cb_data[i]->zcd_added = B_TRUE;
+ VERIFY(!cb_data[i]->zcd_called);
+
+ tmp_cb = cb_data[i];
}
- switch (i) {
+ (void) mutex_unlock(&zcl.zcl_callbacks_lock);
- case 0:
- error = zap_length(os, object, name, &zl_wsize, &zl_wc);
- if (error == 0) {
- ASSERT3U(wsize, ==, zl_wsize);
- ASSERT3U(wc, ==, zl_wc);
- } else {
- ASSERT3U(error, ==, ENOENT);
- }
- break;
+ dmu_tx_commit(tx);
+ }
- case 1:
- error = zap_lookup(os, object, name, wsize, wc, data);
- if (error == 0) {
- if (data == string_value &&
- bcmp(name, data, namelen) != 0)
- fatal(0, "name '%s' != val '%s' len %d",
- name, data, namelen);
- } else {
- ASSERT3U(error, ==, ENOENT);
- }
- break;
+ /* ARGSUSED */
+ void
+ ztest_dsl_prop_get_set(ztest_ds_t *zd, uint64_t id)
+ {
+ zfs_prop_t proplist[] = {
+ ZFS_PROP_CHECKSUM,
+ ZFS_PROP_COMPRESSION,
+ ZFS_PROP_COPIES,
+ ZFS_PROP_DEDUP
+ };
+ ztest_shared_t *zs = ztest_shared;
++ int p;
- case 2:
- error = zap_add(os, object, name, wsize, wc, data, tx);
- ASSERT(error == 0 || error == EEXIST);
- break;
+ (void) rw_rdlock(&zs->zs_name_lock);
- case 3:
- VERIFY(zap_update(os, object, name, wsize, wc, data, tx) == 0);
- break;
- for (int p = 0; p < sizeof (proplist) / sizeof (proplist[0]); p++)
++ for (p = 0; p < sizeof (proplist) / sizeof (proplist[0]); p++)
+ (void) ztest_dsl_prop_set_uint64(zd->zd_name, proplist[p],
+ ztest_random_dsl_prop(proplist[p]), (int)ztest_random(2));
- case 4:
- error = zap_remove(os, object, name, tx);
- ASSERT(error == 0 || error == ENOENT);
- break;
- }
+ (void) rw_unlock(&zs->zs_name_lock);
+ }
- if (tx != NULL)
- dmu_tx_commit(tx);
+ /* ARGSUSED */
+ void
+ ztest_spa_prop_get_set(ztest_ds_t *zd, uint64_t id)
+ {
+ ztest_shared_t *zs = ztest_shared;
+ nvlist_t *props = NULL;
+
+ (void) rw_rdlock(&zs->zs_name_lock);
+
+ (void) ztest_spa_prop_set_uint64(zs, ZPOOL_PROP_DEDUPDITTO,
+ ZIO_DEDUPDITTO_MIN + ztest_random(ZIO_DEDUPDITTO_MIN));
+
+ VERIFY3U(spa_prop_get(zs->zs_spa, &props), ==, 0);
+
+ if (zopt_verbose >= 6)
+ dump_nvlist(props, 4);
+
+ nvlist_free(props);
+
+ (void) rw_unlock(&zs->zs_name_lock);
}
+ /*
+ * Test snapshot hold/release and deferred destroy.
+ */
void
- ztest_dsl_prop_get_set(ztest_args_t *za)
+ ztest_dmu_snapshot_hold(ztest_ds_t *zd, uint64_t id)
{
- objset_t *os = za->za_os;
- int i, inherit;
- uint64_t value;
- const char *prop, *valname;
- char setpoint[MAXPATHLEN];
- char osname[MAXNAMELEN];
int error;
+ objset_t *os = zd->zd_os;
+ objset_t *origin;
+ char snapname[100];
+ char fullname[100];
+ char clonename[100];
+ char tag[100];
+ char osname[MAXNAMELEN];
(void) rw_rdlock(&ztest_shared->zs_name_lock);
}
/*
- * Scrub the pool.
+ * Verify that DDT repair works as expected.
*/
void
- ztest_scrub(ztest_args_t *za)
+ ztest_ddt_repair(ztest_ds_t *zd, uint64_t id)
{
- spa_t *spa = za->za_spa;
-
- (void) spa_scrub(spa, POOL_SCRUB_EVERYTHING);
- (void) poll(NULL, 0, 1000); /* wait a second, then force a restart */
- (void) spa_scrub(spa, POOL_SCRUB_EVERYTHING);
- }
+ ztest_shared_t *zs = ztest_shared;
+ spa_t *spa = zs->zs_spa;
+ objset_t *os = zd->zd_os;
+ ztest_od_t od[1];
+ uint64_t object, blocksize, txg, pattern, psize;
+ enum zio_checksum checksum = spa_dedup_checksum(spa);
+ dmu_buf_t *db;
+ dmu_tx_t *tx;
+ void *buf;
+ blkptr_t blk;
+ int copies = 2 * ZIO_DEDUPDITTO_MIN;
++ int i;
- /*
- * Rename the pool to a different name and then rename it back.
- */
- void
- ztest_spa_rename(ztest_args_t *za)
- {
- char *oldname, *newname;
- int error;
- spa_t *spa;
+ blocksize = ztest_random_blocksize();
+ blocksize = MIN(blocksize, 2048); /* because we write so many */
- (void) rw_wrlock(&ztest_shared->zs_name_lock);
+ ztest_od_init(&od[0], id, FTAG, 0, DMU_OT_UINT64_OTHER, blocksize, 0);
- oldname = za->za_pool;
- newname = umem_alloc(strlen(oldname) + 5, UMEM_NOFAIL);
- (void) strcpy(newname, oldname);
- (void) strcat(newname, "_tmp");
+ if (ztest_object_init(zd, od, sizeof (od), B_FALSE) != 0)
+ return;
/*
- * Do the rename
+ * Take the name lock as writer to prevent anyone else from changing
+ * the pool and dataset properties we need to maintain during this test.
*/
- error = spa_rename(oldname, newname);
- if (error)
- fatal(0, "spa_rename('%s', '%s') = %d", oldname,
- newname, error);
+ (void) rw_wrlock(&zs->zs_name_lock);
- /*
- * Try to open it under the old name, which shouldn't exist
- */
- error = spa_open(oldname, &spa, FTAG);
- if (error != ENOENT)
- fatal(0, "spa_open('%s') = %d", oldname, error);
+ if (ztest_dsl_prop_set_uint64(zd->zd_name, ZFS_PROP_DEDUP, checksum,
+ B_FALSE) != 0 ||
+ ztest_dsl_prop_set_uint64(zd->zd_name, ZFS_PROP_COPIES, 1,
+ B_FALSE) != 0) {
+ (void) rw_unlock(&zs->zs_name_lock);
+ return;
+ }
+
+ object = od[0].od_object;
+ blocksize = od[0].od_blocksize;
+ pattern = spa_guid(spa) ^ dmu_objset_fsid_guid(os);
+
+ ASSERT(object != 0);
+
+ tx = dmu_tx_create(os);
+ dmu_tx_hold_write(tx, object, 0, copies * blocksize);
+ txg = ztest_tx_assign(tx, TXG_WAIT, FTAG);
+ if (txg == 0) {
+ (void) rw_unlock(&zs->zs_name_lock);
+ return;
+ }
/*
- * Open it under the new name and make sure it's still the same spa_t.
+ * Write all the copies of our block.
*/
- error = spa_open(newname, &spa, FTAG);
- if (error != 0)
- fatal(0, "spa_open('%s') = %d", newname, error);
- for (int i = 0; i < copies; i++) {
++ for (i = 0; i < copies; i++) {
+ uint64_t offset = i * blocksize;
+ VERIFY(dmu_buf_hold(os, object, offset, FTAG, &db,
+ DMU_READ_NO_PREFETCH) == 0);
+ ASSERT(db->db_offset == offset);
+ ASSERT(db->db_size == blocksize);
+ ASSERT(ztest_pattern_match(db->db_data, db->db_size, pattern) ||
+ ztest_pattern_match(db->db_data, db->db_size, 0ULL));
+ dmu_buf_will_fill(db, tx);
+ ztest_pattern_set(db->db_data, db->db_size, pattern);
+ dmu_buf_rele(db, FTAG);
+ }
- ASSERT(spa == za->za_spa);
- spa_close(spa, FTAG);
+ dmu_tx_commit(tx);
+ txg_wait_synced(spa_get_dsl(spa), txg);
/*
- * Rename it back to the original
+ * Find out what block we got.
*/
- error = spa_rename(newname, oldname);
- if (error)
- fatal(0, "spa_rename('%s', '%s') = %d", newname,
- oldname, error);
+ VERIFY(dmu_buf_hold(os, object, 0, FTAG, &db,
+ DMU_READ_NO_PREFETCH) == 0);
+ blk = *((dmu_buf_impl_t *)db)->db_blkptr;
+ dmu_buf_rele(db, FTAG);
/*
- * Make sure it can still be opened
+ * Damage the block. Dedup-ditto will save us when we read it later.
*/
- error = spa_open(oldname, &spa, FTAG);
- if (error != 0)
- fatal(0, "spa_open('%s') = %d", oldname, error);
+ psize = BP_GET_PSIZE(&blk);
+ buf = zio_buf_alloc(psize);
+ ztest_pattern_set(buf, psize, ~pattern);
- ASSERT(spa == za->za_spa);
- spa_close(spa, FTAG);
+ (void) zio_wait(zio_rewrite(NULL, spa, 0, &blk,
+ buf, psize, NULL, NULL, ZIO_PRIORITY_SYNC_WRITE,
+ ZIO_FLAG_CANFAIL | ZIO_FLAG_INDUCE_DAMAGE, NULL));
- umem_free(newname, strlen(newname) + 1);
+ zio_buf_free(buf, psize);
- (void) rw_unlock(&ztest_shared->zs_name_lock);
+ (void) rw_unlock(&zs->zs_name_lock);
}
-
/*
- * Completely obliterate one disk.
+ * Scrub the pool.
*/
- static void
- ztest_obliterate_one_disk(uint64_t vdev)
+ /* ARGSUSED */
+ void
+ ztest_scrub(ztest_ds_t *zd, uint64_t id)
{
- int fd;
- char dev_name[MAXPATHLEN], copy_name[MAXPATHLEN];
- size_t fsize;
+ ztest_shared_t *zs = ztest_shared;
+ spa_t *spa = zs->zs_spa;
- if (zopt_maxfaults < 2)
- return;
+ (void) spa_scan(spa, POOL_SCAN_SCRUB);
+ (void) poll(NULL, 0, 100); /* wait a moment, then force a restart */
+ (void) spa_scan(spa, POOL_SCAN_SCRUB);
+ }
- (void) sprintf(dev_name, ztest_dev_template, zopt_dir, zopt_pool, vdev);
- (void) snprintf(copy_name, MAXPATHLEN, "%s.old", dev_name);
+ /*
+ * Rename the pool to a different name and then rename it back.
+ */
+ /* ARGSUSED */
+ void
+ ztest_spa_rename(ztest_ds_t *zd, uint64_t id)
+ {
+ ztest_shared_t *zs = ztest_shared;
+ char *oldname, *newname;
+ spa_t *spa;
- fd = open(dev_name, O_RDWR);
+ (void) rw_wrlock(&zs->zs_name_lock);
- if (fd == -1)
- fatal(1, "can't open %s", dev_name);
+ oldname = zs->zs_pool;
+ newname = umem_alloc(strlen(oldname) + 5, UMEM_NOFAIL);
+ (void) strcpy(newname, oldname);
+ (void) strcat(newname, "_tmp");
/*
- * Determine the size.
+ * Do the rename
*/
- fsize = lseek(fd, 0, SEEK_END);
-
- (void) close(fd);
+ VERIFY3U(0, ==, spa_rename(oldname, newname));
/*
- * Rename the old device to dev_name.old (useful for debugging).
+ * Try to open it under the old name, which shouldn't exist
*/
- VERIFY(rename(dev_name, copy_name) == 0);
+ VERIFY3U(ENOENT, ==, spa_open(oldname, &spa, FTAG));
/*
- * Create a new one.
+ * Open it under the new name and make sure it's still the same spa_t.
*/
- VERIFY((fd = open(dev_name, O_RDWR | O_CREAT | O_TRUNC, 0666)) >= 0);
- VERIFY(ftruncate(fd, fsize) == 0);
- (void) close(fd);
- }
+ VERIFY3U(0, ==, spa_open(newname, &spa, FTAG));
- static void
- ztest_replace_one_disk(spa_t *spa, uint64_t vdev)
- {
- char dev_name[MAXPATHLEN];
- nvlist_t *root;
- int error;
- uint64_t guid;
- vdev_t *vd;
+ ASSERT(spa == zs->zs_spa);
+ spa_close(spa, FTAG);
- (void) sprintf(dev_name, ztest_dev_template, zopt_dir, zopt_pool, vdev);
+ /*
+ * Rename it back to the original
+ */
+ VERIFY3U(0, ==, spa_rename(newname, oldname));
/*
- * Build the nvlist describing dev_name.
+ * Make sure it can still be opened
*/
- root = make_vdev_root(dev_name, NULL, 0, 0, 0, 0, 0, 1);
+ VERIFY3U(0, ==, spa_open(oldname, &spa, FTAG));
- spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
- if ((vd = vdev_lookup_by_path(spa->spa_root_vdev, dev_name)) == NULL)
- guid = 0;
- else
- guid = vd->vdev_guid;
- spa_config_exit(spa, SCL_VDEV, FTAG);
- error = spa_vdev_attach(spa, guid, root, B_TRUE);
- if (error != 0 &&
- error != EBUSY &&
- error != ENOTSUP &&
- error != ENODEV &&
- error != EDOM)
- fatal(0, "spa_vdev_attach(in-place) = %d", error);
+ ASSERT(spa == zs->zs_spa);
+ spa_close(spa, FTAG);
- nvlist_free(root);
+ umem_free(newname, strlen(newname) + 1);
+
+ (void) rw_unlock(&zs->zs_name_lock);
}
+ /*
+ * Verify pool integrity by running zdb.
+ */
static void
- ztest_verify_blocks(char *pool)
+ ztest_run_zdb(char *pool)
{
int status;
char zdb[MAXPATHLEN + MAXNAMELEN + 20];
return (NULL);
}
+ static void *
+ ztest_deadman_thread(void *arg)
+ {
+ ztest_shared_t *zs = arg;
+ int grace = 300;
+ hrtime_t delta;
+
+ delta = (zs->zs_thread_stop - zs->zs_thread_start) / NANOSEC + grace;
+
+ (void) poll(NULL, 0, (int)(1000 * delta));
+
+ fatal(0, "failed to complete within %d seconds of deadline", grace);
+
+ return (NULL);
+ }
+
+ static void
+ ztest_execute(ztest_info_t *zi, uint64_t id)
+ {
+ ztest_shared_t *zs = ztest_shared;
+ ztest_ds_t *zd = &zs->zs_zd[id % zopt_datasets];
+ hrtime_t functime = gethrtime();
++ int i;
+
- for (int i = 0; i < zi->zi_iters; i++)
++ for (i = 0; i < zi->zi_iters; i++)
+ zi->zi_func(zd, id);
+
+ functime = gethrtime() - functime;
+
+ atomic_add_64(&zi->zi_call_count, 1);
+ atomic_add_64(&zi->zi_call_time, functime);
+
+ if (zopt_verbose >= 4) {
+ Dl_info dli;
+ (void) dladdr((void *)zi->zi_func, &dli);
+ (void) printf("%6.2f sec in %s\n",
+ (double)functime / NANOSEC, dli.dli_sname);
+ }
+ }
+
static void *
ztest_thread(void *arg)
{
/*
* See if it's time to force a crash.
*/
- if (now > za->za_kill) {
- zs->zs_alloc = spa_get_alloc(za->za_spa);
- zs->zs_space = spa_get_space(za->za_spa);
- (void) kill(getpid(), SIGKILL);
- }
+ if (now > zs->zs_thread_kill)
+ ztest_kill(zs);
/*
- * Pick a random function.
+ * If we're getting ENOSPC with some regularity, stop.
*/
- f = ztest_random(ZTEST_FUNCS);
- zi = &zs->zs_info[f];
+ if (zs->zs_enospc_count > 10)
+ break;
/*
- * Decide whether to call it, based on the requested frequency.
+ * Pick a random function to execute.
*/
- if (zi->zi_call_target == 0 ||
- (double)zi->zi_call_total / zi->zi_call_target >
- (double)(now - zs->zs_start_time) / (zopt_time * NANOSEC))
- continue;
+ zi = &zs->zs_info[ztest_random(ZTEST_FUNCS)];
+ call_next = zi->zi_call_next;
+
+ if (now >= call_next &&
+ atomic_cas_64(&zi->zi_call_next, call_next, call_next +
+ ztest_random(2 * zi->zi_interval[0] + 1)) == call_next)
+ ztest_execute(zi, id);
+ }
- atomic_add_64(&zi->zi_calls, 1);
- atomic_add_64(&zi->zi_call_total, 1);
+ return (NULL);
+ }
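The atomic_cas_64() in the loop above is what keeps the per-function schedule race-free: all threads read the same zi_call_next deadline, but only the thread whose compare-and-swap succeeds both advances the deadline (by a random increment averaging zi_interval[0]) and runs the function, so each scheduled slot fires exactly once no matter how many threads race for it.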
- za->za_diroff = (za->za_instance * ZTEST_FUNCS + f) *
- ZTEST_DIRSIZE;
- za->za_diroff_shared = (1ULL << 63);
+ static void
+ ztest_dataset_name(char *dsname, char *pool, int d)
+ {
+ (void) snprintf(dsname, MAXNAMELEN, "%s/ds_%d", pool, d);
+ }
- for (i = 0; i < zi->zi_iters; i++)
- zi->zi_func(za);
+ static void
+ ztest_dataset_destroy(ztest_shared_t *zs, int d)
+ {
+ char name[MAXNAMELEN];
++ int t;
- functime = gethrtime() - now;
+ ztest_dataset_name(name, zs->zs_pool, d);
- atomic_add_64(&zi->zi_call_time, functime);
+ if (zopt_verbose >= 3)
+ (void) printf("Destroying %s to free up space\n", name);
- if (zopt_verbose >= 4) {
- Dl_info dli;
- (void) dladdr((void *)zi->zi_func, &dli);
- (void) printf("%6.2f sec in %s\n",
- (double)functime / NANOSEC, dli.dli_sname);
- }
+ /*
+ * Clean up any non-standard clones and snapshots. In general,
+ * ztest thread t operates on dataset (t % zopt_datasets),
+ * so there may be more than one thing to clean up.
+ */
- for (int t = d; t < zopt_threads; t += zopt_datasets)
++ for (t = d; t < zopt_threads; t += zopt_datasets)
+ ztest_dsl_dataset_cleanup(name, t);
- /*
- * If we're getting ENOSPC with some regularity, stop.
- */
- if (zs->zs_enospc_count > 10)
- break;
+ (void) dmu_objset_find(name, ztest_objset_destroy_cb, NULL,
+ DS_FIND_SNAPSHOTS | DS_FIND_CHILDREN);
+ }
+
+ static void
+ ztest_dataset_dirobj_verify(ztest_ds_t *zd)
+ {
+ uint64_t usedobjs, dirobjs, scratch;
+
+ /*
+ * ZTEST_DIROBJ is the object directory for the entire dataset.
+ * Therefore, the number of objects in use should equal the
+ * number of ZTEST_DIROBJ entries, +1 for ZTEST_DIROBJ itself.
+ * If not, we have an object leak.
+ *
+ * Note that we can only check this in ztest_dataset_open(),
+ * when the open-context and syncing-context values agree.
+ * That's because zap_count() returns the open-context value,
+ * while dmu_objset_space() returns the rootbp fill count.
+ */
+ VERIFY3U(0, ==, zap_count(zd->zd_os, ZTEST_DIROBJ, &dirobjs));
+ dmu_objset_space(zd->zd_os, &scratch, &scratch, &usedobjs, &scratch);
+ ASSERT3U(dirobjs + 1, ==, usedobjs);
+ }
+
+ static int
+ ztest_dataset_open(ztest_shared_t *zs, int d)
+ {
+ ztest_ds_t *zd = &zs->zs_zd[d];
+ uint64_t committed_seq = zd->zd_seq;
+ objset_t *os;
+ zilog_t *zilog;
+ char name[MAXNAMELEN];
+ int error;
+
+ ztest_dataset_name(name, zs->zs_pool, d);
+
+ (void) rw_rdlock(&zs->zs_name_lock);
+
+ error = ztest_dataset_create(name);
+ if (error == ENOSPC) {
+ (void) rw_unlock(&zs->zs_name_lock);
+ ztest_record_enospc(FTAG);
+ return (error);
}
+ ASSERT(error == 0 || error == EEXIST);
- return (NULL);
+ VERIFY3U(dmu_objset_hold(name, zd, &os), ==, 0);
+ (void) rw_unlock(&zs->zs_name_lock);
+
+ ztest_zd_init(zd, os);
+
+ zilog = zd->zd_zilog;
+
+ if (zilog->zl_header->zh_claim_lr_seq != 0 &&
+ zilog->zl_header->zh_claim_lr_seq < committed_seq)
+ fatal(0, "missing log records: claimed %llu < committed %llu",
+ zilog->zl_header->zh_claim_lr_seq, committed_seq);
+
+ ztest_dataset_dirobj_verify(zd);
+
+ zil_replay(os, zd, ztest_replay_vector);
+
+ ztest_dataset_dirobj_verify(zd);
+
+ if (zopt_verbose >= 6)
+ (void) printf("%s replay %llu blocks, %llu records, seq %llu\n",
+ zd->zd_name,
+ (u_longlong_t)zilog->zl_parse_blk_count,
+ (u_longlong_t)zilog->zl_parse_lr_count,
+ (u_longlong_t)zilog->zl_replaying_seq);
+
+ zilog = zil_open(os, ztest_get_data);
+
+ if (zilog->zl_replaying_seq != 0 &&
+ zilog->zl_replaying_seq < committed_seq)
+ fatal(0, "missing log records: replayed %llu < committed %llu",
+ zilog->zl_replaying_seq, committed_seq);
+
+ return (0);
+ }
+
+ static void
+ ztest_dataset_close(ztest_shared_t *zs, int d)
+ {
+ ztest_ds_t *zd = &zs->zs_zd[d];
+
+ zil_close(zd->zd_zilog);
+ dmu_objset_rele(zd->zd_os, zd);
+
+ ztest_zd_fini(zd);
}
/*
* Kick off threads to run tests on all datasets in parallel.
*/
static void
- ztest_run(char *pool)
+ ztest_run(ztest_shared_t *zs)
{
- int t, d, error;
- ztest_shared_t *zs = ztest_shared;
- ztest_args_t *za;
+ thread_t *tid;
spa_t *spa;
- char name[100];
thread_t resume_tid;
+ int error;
++ int t, d;
ztest_exiting = B_FALSE;
if (zopt_verbose >= 4)
(void) printf("starting main threads...\n");
- za[0].za_start = gethrtime();
- za[0].za_stop = za[0].za_start + zopt_passtime * NANOSEC;
- za[0].za_stop = MIN(za[0].za_stop, zs->zs_stop_time);
- za[0].za_kill = za[0].za_stop;
- if (ztest_random(100) < zopt_killrate)
- za[0].za_kill -= ztest_random(zopt_passtime * NANOSEC);
-
+ /*
+ * Kick off all the tests that run in parallel.
+ */
- for (int t = 0; t < zopt_threads; t++) {
+ for (t = 0; t < zopt_threads; t++) {
- d = t % zopt_datasets;
-
- (void) strcpy(za[t].za_pool, pool);
- za[t].za_os = za[d].za_os;
- za[t].za_spa = spa;
- za[t].za_zilog = za[d].za_zilog;
- za[t].za_instance = t;
- za[t].za_random = ztest_random(-1ULL);
- za[t].za_start = za[0].za_start;
- za[t].za_stop = za[0].za_stop;
- za[t].za_kill = za[0].za_kill;
-
- if (t < zopt_datasets) {
- int test_future = FALSE;
- (void) rw_rdlock(&ztest_shared->zs_name_lock);
- (void) snprintf(name, 100, "%s/%s_%d", pool, pool, d);
- error = dmu_objset_create(name, DMU_OST_OTHER, NULL, 0,
- ztest_create_cb, NULL);
- if (error == EEXIST) {
- test_future = TRUE;
- } else if (error == ENOSPC) {
- zs->zs_enospc_count++;
- (void) rw_unlock(&ztest_shared->zs_name_lock);
- break;
- } else if (error != 0) {
- fatal(0, "dmu_objset_create(%s) = %d",
- name, error);
- }
- error = dmu_objset_open(name, DMU_OST_OTHER,
- DS_MODE_USER, &za[d].za_os);
- if (error)
- fatal(0, "dmu_objset_open('%s') = %d",
- name, error);
- (void) rw_unlock(&ztest_shared->zs_name_lock);
- if (test_future)
- ztest_dmu_check_future_leak(&za[t]);
- zil_replay(za[d].za_os, za[d].za_os,
- ztest_replay_vector);
- za[d].za_zilog = zil_open(za[d].za_os, NULL);
- }
-
- VERIFY(thr_create(0, 0, ztest_thread, &za[t], THR_BOUND,
- &za[t].za_thread) == 0);
+ if (t < zopt_datasets && ztest_dataset_open(zs, t) != 0)
+ return;
+ VERIFY(thr_create(0, 0, ztest_thread, (void *)(uintptr_t)t,
+ THR_BOUND, &tid[t]) == 0);
}
- while (--t >= 0) {
- VERIFY(thr_join(za[t].za_thread, NULL, NULL) == 0);
- if (t < zopt_datasets) {
- zil_close(za[t].za_zilog);
- dmu_objset_close(za[t].za_os);
- }
+ /*
+ * Wait for all of the tests to complete. We go in reverse order
+ * so we don't close datasets while threads are still using them.
+ */
- for (int t = zopt_threads - 1; t >= 0; t--) {
++ for (t = zopt_threads - 1; t >= 0; t--) {
+ VERIFY(thr_join(tid[t], NULL, NULL) == 0);
+ if (t < zopt_datasets)
+ ztest_dataset_close(zs, t);
}
- if (zopt_verbose >= 3)
- show_pool_stats(spa);
-
txg_wait_synced(spa_get_dsl(spa), 0);
- zs->zs_alloc = spa_get_alloc(spa);
- zs->zs_space = spa_get_space(spa);
+ zs->zs_alloc = metaslab_class_get_alloc(spa_normal_class(spa));
+ zs->zs_space = metaslab_class_get_space(spa_normal_class(spa));
+
+ umem_free(tid, zopt_threads * sizeof (thread_t));
+
+ /* Kill the resume thread */
+ ztest_exiting = B_TRUE;
+ VERIFY(thr_join(resume_tid, NULL, NULL) == 0);
+ ztest_resume(spa);
+
+ /*
+ * Right before closing the pool, kick off a bunch of async I/O;
+ * spa_close() should wait for it to complete.
+ */
+ for (uint64_t object = 1; object < 50; object++)
+ dmu_prefetch(spa->spa_meta_objset, object, 0, 1ULL << 20);
+
+ spa_close(spa, FTAG);
/*
- * If we had out-of-space errors, destroy a random objset.
+ * Verify that we can loop over all pools.
*/
- if (zs->zs_enospc_count != 0) {
- (void) rw_rdlock(&ztest_shared->zs_name_lock);
- d = (int)ztest_random(zopt_datasets);
- (void) snprintf(name, 100, "%s/%s_%d", pool, pool, d);
- if (zopt_verbose >= 3)
- (void) printf("Destroying %s to free up space\n", name);
+ mutex_enter(&spa_namespace_lock);
+ for (spa = spa_next(NULL); spa != NULL; spa = spa_next(spa))
+ if (zopt_verbose > 3)
+ (void) printf("spa_next: found %s\n", spa_name(spa));
+ mutex_exit(&spa_namespace_lock);
+
+ /*
+ * Verify that we can export the pool and reimport it under a
+ * different name.
+ */
+ if (ztest_random(2) == 0) {
+ char name[MAXNAMELEN];
+ (void) snprintf(name, MAXNAMELEN, "%s_import", zs->zs_pool);
+ ztest_spa_import_export(zs->zs_pool, name);
+ ztest_spa_import_export(name, zs->zs_pool);
+ }
+
+ kernel_fini();
+ }
+
+ static void
+ ztest_freeze(ztest_shared_t *zs)
+ {
+ ztest_ds_t *zd = &zs->zs_zd[0];
+ spa_t *spa;
+ int numloops = 0;
+
+ if (zopt_verbose >= 3)
+ (void) printf("testing spa_freeze()...\n");
- /* Cleanup any non-standard clones and snapshots */
- ztest_dsl_dataset_cleanup(name, za[d].za_instance);
+ kernel_init(FREAD | FWRITE);
+ VERIFY3U(0, ==, spa_open(zs->zs_pool, &spa, FTAG));
+ VERIFY3U(0, ==, ztest_dataset_open(zs, 0));
- (void) dmu_objset_find(name, ztest_destroy_cb, &za[d],
- DS_FIND_SNAPSHOTS | DS_FIND_CHILDREN);
- (void) rw_unlock(&ztest_shared->zs_name_lock);
+ /*
+ * Force the first log block to be transactionally allocated.
+ * We have to do this before we freeze the pool -- otherwise
+ * the log chain won't be anchored.
+ */
+ while (BP_IS_HOLE(&zd->zd_zilog->zl_header->zh_log)) {
+ ztest_dmu_object_alloc_free(zd, 0);
+ zil_commit(zd->zd_zilog, UINT64_MAX, 0);
}
txg_wait_synced(spa_get_dsl(spa), 0);
ztest_info_t *zi;
char timebuf[100];
char numbuf[6];
+ spa_t *spa;
++ int i, f;
(void) setvbuf(stdout, NULL, _IOLBF, 0);
bzero(zs, sizeof (ztest_shared_t));
if (zopt_verbose >= 3 && zopt_init != 1)
(void) printf("ztest_init(), pass %d\n", i);
- ztest_init(zopt_pool);
+ zs->zs_pool = zopt_pool;
+ ztest_init(zs);
}
- /*
- * Initialize the call targets for each function.
- */
+ zs->zs_pool = zopt_pool;
+ zs->zs_proc_start = gethrtime();
+ zs->zs_proc_stop = zs->zs_proc_start + zopt_time * NANOSEC;
+
- for (int f = 0; f < ZTEST_FUNCS; f++) {
+ for (f = 0; f < ZTEST_FUNCS; f++) {
zi = &zs->zs_info[f];
-
*zi = ztest_info[f];
-
- if (*zi->zi_interval == 0)
- zi->zi_call_target = UINT64_MAX;
+ if (zs->zs_proc_start + zi->zi_interval[0] > zs->zs_proc_stop)
+ zi->zi_call_next = UINT64_MAX;
else
- zi->zi_call_target = zopt_time / *zi->zi_interval;
+ zi->zi_call_next = zs->zs_proc_start +
+ ztest_random(2 * zi->zi_interval[0] + 1);
}
- zs->zs_start_time = gethrtime();
- zs->zs_stop_time = zs->zs_start_time + zopt_time * NANOSEC;
-
/*
* Run the tests in a loop. These tests include fault injection
* to verify that self-healing data works, and forced crashes
/*
* Initialize the workload counters for each function.
*/
- for (int f = 0; f < ZTEST_FUNCS; f++) {
+ for (f = 0; f < ZTEST_FUNCS; f++) {
zi = &zs->zs_info[f];
- zi->zi_calls = 0;
+ zi->zi_call_count = 0;
zi->zi_call_time = 0;
}
return (nvl);
}
- for (int c = 0; c < holes; c++) {
+ /*
+ * Determine if the vdev id is a hole in the namespace.
+ */
+ boolean_t
+ vdev_is_hole(uint64_t *hole_array, uint_t holes, uint_t id)
+ {
++ int c;
++
++ for (c = 0; c < holes; c++) {
+
+ /* Top-level is a hole */
+ if (hole_array[c] == id)
+ return (B_TRUE);
+ }
+ return (B_FALSE);
+ }
+
/*
* Convert our list of pools into the definitive set of configurations. We
* start by picking the best config for each toplevel vdev. Once that's done,
dbuf_find(dnode_t *dn, uint8_t level, uint64_t blkid)
{
dbuf_hash_table_t *h = &dbuf_hash_table;
- objset_impl_t *os = dn->dn_objset;
- uint64_t obj, hv, idx;
+ objset_t *os = dn->dn_objset;
+ uint64_t obj, hv, idx;
dmu_buf_impl_t *db;
+ obj = dn->dn_object;
+ hv = DBUF_HASH(os, obj, level, blkid);
+ idx = hv & h->hash_table_mask;
+
mutex_enter(DBUF_HASH_MUTEX(h, idx));
for (db = h->hash_table[idx]; db != NULL; db = db->db_hash_next) {
if (DBUF_EQUAL(db, os, obj, level, blkid)) {
dbuf_hash_insert(dmu_buf_impl_t *db)
{
dbuf_hash_table_t *h = &dbuf_hash_table;
- objset_impl_t *os = db->db_objset;
+ objset_t *os = db->db_objset;
uint64_t obj = db->db.db_object;
int level = db->db_level;
- uint64_t blkid = db->db_blkid;
- uint64_t hv = DBUF_HASH(os, obj, level, blkid);
- uint64_t idx = hv & h->hash_table_mask;
+ uint64_t blkid, hv, idx;
dmu_buf_impl_t *dbf;
+ blkid = db->db_blkid;
+ hv = DBUF_HASH(os, obj, level, blkid);
+ idx = hv & h->hash_table_mask;
+
mutex_enter(DBUF_HASH_MUTEX(h, idx));
for (dbf = h->hash_table[idx]; dbf != NULL; dbf = dbf->db_hash_next) {
if (DBUF_EQUAL(dbf, os, obj, level, blkid)) {
--- /dev/null
+ /*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+ /*
+ * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+ #include <sys/zfs_context.h>
+ #include <sys/spa.h>
+ #include <sys/spa_impl.h>
+ #include <sys/zio.h>
+ #include <sys/ddt.h>
+ #include <sys/zap.h>
+ #include <sys/dmu_tx.h>
+ #include <sys/arc.h>
+ #include <sys/dsl_pool.h>
+ #include <sys/zio_checksum.h>
+ #include <sys/zio_compress.h>
+ #include <sys/dsl_scan.h>
+
+ static const ddt_ops_t *ddt_ops[DDT_TYPES] = {
+ &ddt_zap_ops,
+ };
+
+ static const char *ddt_class_name[DDT_CLASSES] = {
+ "ditto",
+ "duplicate",
+ "unique",
+ };
+
+ static void
+ ddt_object_create(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
+ dmu_tx_t *tx)
+ {
+ spa_t *spa = ddt->ddt_spa;
+ objset_t *os = ddt->ddt_os;
+ uint64_t *objectp = &ddt->ddt_object[type][class];
+ boolean_t prehash = zio_checksum_table[ddt->ddt_checksum].ci_dedup;
+ char name[DDT_NAMELEN];
+
+ ddt_object_name(ddt, type, class, name);
+
+ ASSERT(*objectp == 0);
+ VERIFY(ddt_ops[type]->ddt_op_create(os, objectp, tx, prehash) == 0);
+ ASSERT(*objectp != 0);
+
+ VERIFY(zap_add(os, DMU_POOL_DIRECTORY_OBJECT, name,
+ sizeof (uint64_t), 1, objectp, tx) == 0);
+
+ VERIFY(zap_add(os, spa->spa_ddt_stat_object, name,
+ sizeof (uint64_t), sizeof (ddt_histogram_t) / sizeof (uint64_t),
+ &ddt->ddt_histogram[type][class], tx) == 0);
+ }
+
+ static void
+ ddt_object_destroy(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
+ dmu_tx_t *tx)
+ {
+ spa_t *spa = ddt->ddt_spa;
+ objset_t *os = ddt->ddt_os;
+ uint64_t *objectp = &ddt->ddt_object[type][class];
+ char name[DDT_NAMELEN];
+
+ ddt_object_name(ddt, type, class, name);
+
+ ASSERT(*objectp != 0);
+ ASSERT(ddt_object_count(ddt, type, class) == 0);
+ ASSERT(ddt_histogram_empty(&ddt->ddt_histogram[type][class]));
+ VERIFY(zap_remove(os, DMU_POOL_DIRECTORY_OBJECT, name, tx) == 0);
+ VERIFY(zap_remove(os, spa->spa_ddt_stat_object, name, tx) == 0);
+ VERIFY(ddt_ops[type]->ddt_op_destroy(os, *objectp, tx) == 0);
+ bzero(&ddt->ddt_object_stats[type][class], sizeof (ddt_object_t));
+
+ *objectp = 0;
+ }
+
+ static int
+ ddt_object_load(ddt_t *ddt, enum ddt_type type, enum ddt_class class)
+ {
+ ddt_object_t *ddo = &ddt->ddt_object_stats[type][class];
+ dmu_object_info_t doi;
+ char name[DDT_NAMELEN];
+ int error;
+
+ ddt_object_name(ddt, type, class, name);
+
+ error = zap_lookup(ddt->ddt_os, DMU_POOL_DIRECTORY_OBJECT, name,
+ sizeof (uint64_t), 1, &ddt->ddt_object[type][class]);
+
+ if (error)
+ return (error);
+
+ error = zap_lookup(ddt->ddt_os, ddt->ddt_spa->spa_ddt_stat_object, name,
+ sizeof (uint64_t), sizeof (ddt_histogram_t) / sizeof (uint64_t),
+ &ddt->ddt_histogram[type][class]);
+
+ /*
+ * Seed the cached statistics.
+ */
+ VERIFY(ddt_object_info(ddt, type, class, &doi) == 0);
+
+ ddo->ddo_count = ddt_object_count(ddt, type, class);
+ ddo->ddo_dspace = doi.doi_physical_blocks_512 << 9;
+ ddo->ddo_mspace = doi.doi_fill_count * doi.doi_data_block_size;
+
+ ASSERT(error == 0);
+ return (error);
+ }
+
+ static void
+ ddt_object_sync(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
+ dmu_tx_t *tx)
+ {
+ ddt_object_t *ddo = &ddt->ddt_object_stats[type][class];
+ dmu_object_info_t doi;
+ char name[DDT_NAMELEN];
+
+ ddt_object_name(ddt, type, class, name);
+
+ VERIFY(zap_update(ddt->ddt_os, ddt->ddt_spa->spa_ddt_stat_object, name,
+ sizeof (uint64_t), sizeof (ddt_histogram_t) / sizeof (uint64_t),
+ &ddt->ddt_histogram[type][class], tx) == 0);
+
+ /*
+ * Cache DDT statistics; this is the only time they'll change.
+ */
+ VERIFY(ddt_object_info(ddt, type, class, &doi) == 0);
+
+ ddo->ddo_count = ddt_object_count(ddt, type, class);
+ ddo->ddo_dspace = doi.doi_physical_blocks_512 << 9;
+ ddo->ddo_mspace = doi.doi_fill_count * doi.doi_data_block_size;
+ }
+
+ static int
+ ddt_object_lookup(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
+ ddt_entry_t *dde)
+ {
+ if (!ddt_object_exists(ddt, type, class))
+ return (ENOENT);
+
+ return (ddt_ops[type]->ddt_op_lookup(ddt->ddt_os,
+ ddt->ddt_object[type][class], dde));
+ }
+
+ static void
+ ddt_object_prefetch(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
+ ddt_entry_t *dde)
+ {
+ if (!ddt_object_exists(ddt, type, class))
+ return;
+
+ ddt_ops[type]->ddt_op_prefetch(ddt->ddt_os,
+ ddt->ddt_object[type][class], dde);
+ }
+
+ int
+ ddt_object_update(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
+ ddt_entry_t *dde, dmu_tx_t *tx)
+ {
+ ASSERT(ddt_object_exists(ddt, type, class));
+
+ return (ddt_ops[type]->ddt_op_update(ddt->ddt_os,
+ ddt->ddt_object[type][class], dde, tx));
+ }
+
+ static int
+ ddt_object_remove(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
+ ddt_entry_t *dde, dmu_tx_t *tx)
+ {
+ ASSERT(ddt_object_exists(ddt, type, class));
+
+ return (ddt_ops[type]->ddt_op_remove(ddt->ddt_os,
+ ddt->ddt_object[type][class], dde, tx));
+ }
+
+ int
+ ddt_object_walk(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
+ uint64_t *walk, ddt_entry_t *dde)
+ {
+ ASSERT(ddt_object_exists(ddt, type, class));
+
+ return (ddt_ops[type]->ddt_op_walk(ddt->ddt_os,
+ ddt->ddt_object[type][class], dde, walk));
+ }
+
+ uint64_t
+ ddt_object_count(ddt_t *ddt, enum ddt_type type, enum ddt_class class)
+ {
+ ASSERT(ddt_object_exists(ddt, type, class));
+
+ return (ddt_ops[type]->ddt_op_count(ddt->ddt_os,
+ ddt->ddt_object[type][class]));
+ }
+
+ int
+ ddt_object_info(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
+ dmu_object_info_t *doi)
+ {
+ if (!ddt_object_exists(ddt, type, class))
+ return (ENOENT);
+
+ return (dmu_object_info(ddt->ddt_os, ddt->ddt_object[type][class],
+ doi));
+ }
+
+ boolean_t
+ ddt_object_exists(ddt_t *ddt, enum ddt_type type, enum ddt_class class)
+ {
+ return (!!ddt->ddt_object[type][class]);
+ }
+
+ void
+ ddt_object_name(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
+ char *name)
+ {
+ (void) sprintf(name, DMU_POOL_DDT,
+ zio_checksum_table[ddt->ddt_checksum].ci_name,
+ ddt_ops[type]->ddt_op_name, ddt_class_name[class]);
+ }
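Assuming DMU_POOL_DDT is the usual "DDT-%s-%s-%s" format string, the generated names compose the checksum, table type, and class, e.g. "DDT-sha256-zap-duplicate" for the sha256 ZAP table of duplicate entries.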
+
+ void
+ ddt_bp_fill(const ddt_phys_t *ddp, blkptr_t *bp, uint64_t txg)
+ {
++ int d;
+ ASSERT(txg != 0);
+
- for (int d = 0; d < SPA_DVAS_PER_BP; d++)
++ for (d = 0; d < SPA_DVAS_PER_BP; d++)
+ bp->blk_dva[d] = ddp->ddp_dva[d];
+ BP_SET_BIRTH(bp, txg, ddp->ddp_phys_birth);
+ }
+
+ void
+ ddt_bp_create(enum zio_checksum checksum,
+ const ddt_key_t *ddk, const ddt_phys_t *ddp, blkptr_t *bp)
+ {
+ BP_ZERO(bp);
+
+ if (ddp != NULL)
+ ddt_bp_fill(ddp, bp, ddp->ddp_phys_birth);
+
+ bp->blk_cksum = ddk->ddk_cksum;
+ bp->blk_fill = 1;
+
+ BP_SET_LSIZE(bp, DDK_GET_LSIZE(ddk));
+ BP_SET_PSIZE(bp, DDK_GET_PSIZE(ddk));
+ BP_SET_COMPRESS(bp, DDK_GET_COMPRESS(ddk));
+ BP_SET_CHECKSUM(bp, checksum);
+ BP_SET_TYPE(bp, DMU_OT_DEDUP);
+ BP_SET_LEVEL(bp, 0);
+ BP_SET_DEDUP(bp, 0);
+ BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER);
+ }
+
+ void
+ ddt_key_fill(ddt_key_t *ddk, const blkptr_t *bp)
+ {
+ ddk->ddk_cksum = bp->blk_cksum;
+ ddk->ddk_prop = 0;
+
+ DDK_SET_LSIZE(ddk, BP_GET_LSIZE(bp));
+ DDK_SET_PSIZE(ddk, BP_GET_PSIZE(bp));
+ DDK_SET_COMPRESS(ddk, BP_GET_COMPRESS(bp));
+ }
+
+ void
+ ddt_phys_fill(ddt_phys_t *ddp, const blkptr_t *bp)
+ {
++ int d;
+ ASSERT(ddp->ddp_phys_birth == 0);
+
- for (int d = 0; d < SPA_DVAS_PER_BP; d++)
++ for (d = 0; d < SPA_DVAS_PER_BP; d++)
+ ddp->ddp_dva[d] = bp->blk_dva[d];
+ ddp->ddp_phys_birth = BP_PHYSICAL_BIRTH(bp);
+ }
+
+ void
+ ddt_phys_clear(ddt_phys_t *ddp)
+ {
+ bzero(ddp, sizeof (*ddp));
+ }
+
+ void
+ ddt_phys_addref(ddt_phys_t *ddp)
+ {
+ ddp->ddp_refcnt++;
+ }
+
+ void
+ ddt_phys_decref(ddt_phys_t *ddp)
+ {
+ ASSERT((int64_t)ddp->ddp_refcnt > 0);
+ ddp->ddp_refcnt--;
+ }
+
+ void
+ ddt_phys_free(ddt_t *ddt, ddt_key_t *ddk, ddt_phys_t *ddp, uint64_t txg)
+ {
+ blkptr_t blk;
+
+ ddt_bp_create(ddt->ddt_checksum, ddk, ddp, &blk);
+ ddt_phys_clear(ddp);
+ zio_free(ddt->ddt_spa, txg, &blk);
+ }
+
+ ddt_phys_t *
+ ddt_phys_select(const ddt_entry_t *dde, const blkptr_t *bp)
+ {
+ ddt_phys_t *ddp = (ddt_phys_t *)dde->dde_phys;
++ int p;
+
- for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
++ for (p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
+ if (DVA_EQUAL(BP_IDENTITY(bp), &ddp->ddp_dva[0]) &&
+ BP_PHYSICAL_BIRTH(bp) == ddp->ddp_phys_birth)
+ return (ddp);
+ }
+ return (NULL);
+ }
+
+ uint64_t
+ ddt_phys_total_refcnt(const ddt_entry_t *dde)
+ {
+ uint64_t refcnt = 0;
++ int p;
+
- for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++)
++ for (p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++)
+ refcnt += dde->dde_phys[p].ddp_refcnt;
+
+ return (refcnt);
+ }
+
+ static void
+ ddt_stat_generate(ddt_t *ddt, ddt_entry_t *dde, ddt_stat_t *dds)
+ {
+ spa_t *spa = ddt->ddt_spa;
+ ddt_phys_t *ddp = dde->dde_phys;
+ ddt_key_t *ddk = &dde->dde_key;
+ uint64_t lsize = DDK_GET_LSIZE(ddk);
+ uint64_t psize = DDK_GET_PSIZE(ddk);
++ int p, d;
+
+ bzero(dds, sizeof (*dds));
+
- for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
++ for (p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
+ uint64_t dsize = 0;
+ uint64_t refcnt = ddp->ddp_refcnt;
+
+ if (ddp->ddp_phys_birth == 0)
+ continue;
+
- for (int d = 0; d < SPA_DVAS_PER_BP; d++)
++ for (d = 0; d < SPA_DVAS_PER_BP; d++)
+ dsize += dva_get_dsize_sync(spa, &ddp->ddp_dva[d]);
+
+ dds->dds_blocks += 1;
+ dds->dds_lsize += lsize;
+ dds->dds_psize += psize;
+ dds->dds_dsize += dsize;
+
+ dds->dds_ref_blocks += refcnt;
+ dds->dds_ref_lsize += lsize * refcnt;
+ dds->dds_ref_psize += psize * refcnt;
+ dds->dds_ref_dsize += dsize * refcnt;
+ }
+ }
+
+ void
+ ddt_stat_add(ddt_stat_t *dst, const ddt_stat_t *src, uint64_t neg)
+ {
+ const uint64_t *s = (const uint64_t *)src;
+ uint64_t *d = (uint64_t *)dst;
+ uint64_t *d_end = (uint64_t *)(dst + 1);
+
+ ASSERT(neg == 0 || neg == -1ULL); /* add or subtract */
+
+ while (d < d_end)
+ *d++ += (*s++ ^ neg) - neg;
+ }
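The (*s++ ^ neg) - neg expression above folds add and subtract into a single loop body: with neg == 0 it adds *s unchanged, and with neg == -1ULL it adds ~*s + 1, which is -*s in two's complement. A minimal standalone sketch of the identity (a hypothetical demo program, not part of the patch):

#include <assert.h>
#include <stdint.h>

/* Standalone demo of the ddt_stat_add() add-or-subtract identity. */
int
main(void)
{
	uint64_t x = 42, acc = 100, neg;

	neg = 0;		/* neg == 0: (x ^ 0) - 0 == x, so this adds */
	acc += (x ^ neg) - neg;
	assert(acc == 142);

	neg = -1ULL;		/* neg == -1ULL: (~x) - (-1ULL) == ~x + 1 == -x */
	acc += (x ^ neg) - neg;
	assert(acc == 100);

	return (0);
}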
+
+ static void
+ ddt_stat_update(ddt_t *ddt, ddt_entry_t *dde, uint64_t neg)
+ {
+ ddt_stat_t dds;
+ ddt_histogram_t *ddh;
+ int bucket;
+
+ ddt_stat_generate(ddt, dde, &dds);
+
+ bucket = highbit(dds.dds_ref_blocks) - 1;
+ ASSERT(bucket >= 0);
+
+ ddh = &ddt->ddt_histogram[dde->dde_type][dde->dde_class];
+
+ ddt_stat_add(&ddh->ddh_stat[bucket], &dds, neg);
+ }
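Since highbit() returns the 1-based position of the highest set bit, the bucket index is floor(log2(dds_ref_blocks)); the 64 ddh_stat slots thus form a power-of-two histogram of reference counts.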
+
+ void
+ ddt_histogram_add(ddt_histogram_t *dst, const ddt_histogram_t *src)
+ {
- for (int h = 0; h < 64; h++)
++ int h;
++
++ for (h = 0; h < 64; h++)
+ ddt_stat_add(&dst->ddh_stat[h], &src->ddh_stat[h], 0);
+ }
+
+ void
+ ddt_histogram_stat(ddt_stat_t *dds, const ddt_histogram_t *ddh)
+ {
++ int h;
++
+ bzero(dds, sizeof (*dds));
+
- for (int h = 0; h < 64; h++)
++ for (h = 0; h < 64; h++)
+ ddt_stat_add(dds, &ddh->ddh_stat[h], 0);
+ }
+
+ boolean_t
+ ddt_histogram_empty(const ddt_histogram_t *ddh)
+ {
+ const uint64_t *s = (const uint64_t *)ddh;
+ const uint64_t *s_end = (const uint64_t *)(ddh + 1);
+
+ while (s < s_end)
+ if (*s++ != 0)
+ return (B_FALSE);
+
+ return (B_TRUE);
+ }
+
+ void
+ ddt_get_dedup_object_stats(spa_t *spa, ddt_object_t *ddo_total)
+ {
+ /* Sum the statistics we cached in ddt_object_sync(). */
+ for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
+ ddt_t *ddt = spa->spa_ddt[c];
+ for (enum ddt_type type = 0; type < DDT_TYPES; type++) {
+ for (enum ddt_class class = 0; class < DDT_CLASSES;
+ class++) {
+ ddt_object_t *ddo =
+ &ddt->ddt_object_stats[type][class];
+ ddo_total->ddo_count += ddo->ddo_count;
+ ddo_total->ddo_dspace += ddo->ddo_dspace;
+ ddo_total->ddo_mspace += ddo->ddo_mspace;
+ }
+ }
+ }
+
+ /* ... and compute the averages. */
+ if (ddo_total->ddo_count != 0) {
+ ddo_total->ddo_dspace /= ddo_total->ddo_count;
+ ddo_total->ddo_mspace /= ddo_total->ddo_count;
+ } else {
+ ASSERT(ddo_total->ddo_dspace == 0);
+ ASSERT(ddo_total->ddo_mspace == 0);
+ }
+ }
+
+ void
+ ddt_get_dedup_histogram(spa_t *spa, ddt_histogram_t *ddh)
+ {
+ for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
+ ddt_t *ddt = spa->spa_ddt[c];
+ for (enum ddt_type type = 0; type < DDT_TYPES; type++) {
+ for (enum ddt_class class = 0; class < DDT_CLASSES;
+ class++) {
+ ddt_histogram_add(ddh,
+ &ddt->ddt_histogram_cache[type][class]);
+ }
+ }
+ }
+ }
+
+ void
+ ddt_get_dedup_stats(spa_t *spa, ddt_stat_t *dds_total)
+ {
+ ddt_histogram_t *ddh_total;
+
+ ddh_total = kmem_zalloc(sizeof (ddt_histogram_t), KM_SLEEP);
+ ddt_get_dedup_histogram(spa, ddh_total);
+ ddt_histogram_stat(dds_total, ddh_total);
+ kmem_free(ddh_total, sizeof (ddt_histogram_t));
+ }
+
+ uint64_t
+ ddt_get_dedup_dspace(spa_t *spa)
+ {
+ ddt_stat_t dds_total = { 0 };
+
+ ddt_get_dedup_stats(spa, &dds_total);
+ return (dds_total.dds_ref_dsize - dds_total.dds_dsize);
+ }
+
+ uint64_t
+ ddt_get_pool_dedup_ratio(spa_t *spa)
+ {
+ ddt_stat_t dds_total = { 0 };
+
+ ddt_get_dedup_stats(spa, &dds_total);
+ if (dds_total.dds_dsize == 0)
+ return (100);
+
+ return (dds_total.dds_ref_dsize * 100 / dds_total.dds_dsize);
+ }
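A worked example: if deduped data is logically referenced as 300 GB (dds_ref_dsize) but occupies only 100 GB on disk (dds_dsize), the function returns 300 * 100 GB / 100 GB = 300, i.e. the ratio scaled by 100, which callers would display as 3.00x.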
+
+ int
+ ddt_ditto_copies_needed(ddt_t *ddt, ddt_entry_t *dde, ddt_phys_t *ddp_willref)
+ {
+ spa_t *spa = ddt->ddt_spa;
+ uint64_t total_refcnt = 0;
+ uint64_t ditto = spa->spa_dedup_ditto;
+ int total_copies = 0;
+ int desired_copies = 0;
++ int p;
+
- for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) {
++ for (p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) {
+ ddt_phys_t *ddp = &dde->dde_phys[p];
+ zio_t *zio = dde->dde_lead_zio[p];
+ uint64_t refcnt = ddp->ddp_refcnt; /* committed refs */
+ if (zio != NULL)
+ refcnt += zio->io_parent_count; /* pending refs */
+ if (ddp == ddp_willref)
+ refcnt++; /* caller's ref */
+ if (refcnt != 0) {
+ total_refcnt += refcnt;
+ total_copies += p;
+ }
+ }
+
+ if (ditto == 0 || ditto > UINT32_MAX)
+ ditto = UINT32_MAX;
+
+ if (total_refcnt >= 1)
+ desired_copies++;
+ if (total_refcnt >= ditto)
+ desired_copies++;
+ if (total_refcnt >= ditto * ditto)
+ desired_copies++;
+
+ return (MAX(desired_copies, total_copies) - total_copies);
+ }
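A worked example, assuming spa_dedup_ditto is set to 100: an entry wants one copy as soon as it is referenced at all, a second copy once it reaches 100 references, and a third at 10,000 (100 * 100). Because total_copies already counts the DVAs that exist or have a lead zio in flight, the return value is only the shortfall the caller must still write.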
+
+ int
+ ddt_ditto_copies_present(ddt_entry_t *dde)
+ {
+ ddt_phys_t *ddp = &dde->dde_phys[DDT_PHYS_DITTO];
+ dva_t *dva = ddp->ddp_dva;
+ int copies = 0 - DVA_GET_GANG(dva);
++ int d;
+
- for (int d = 0; d < SPA_DVAS_PER_BP; d++, dva++)
++ for (d = 0; d < SPA_DVAS_PER_BP; d++, dva++)
+ if (DVA_IS_VALID(dva))
+ copies++;
+
+ ASSERT(copies >= 0 && copies < SPA_DVAS_PER_BP);
+
+ return (copies);
+ }
+
+ size_t
+ ddt_compress(void *src, uchar_t *dst, size_t s_len, size_t d_len)
+ {
+ uchar_t *version = dst++;
+ int cpfunc = ZIO_COMPRESS_ZLE;
+ zio_compress_info_t *ci = &zio_compress_table[cpfunc];
+ size_t c_len;
+
+ ASSERT(d_len >= s_len + 1); /* no compression plus version byte */
+
+ c_len = ci->ci_compress(src, dst, s_len, d_len - 1, ci->ci_level);
+
+ if (c_len == s_len) {
+ cpfunc = ZIO_COMPRESS_OFF;
+ bcopy(src, dst, s_len);
+ }
+
+ *version = (ZFS_HOST_BYTEORDER & DDT_COMPRESS_BYTEORDER_MASK) | cpfunc;
+
+ return (c_len + 1);
+ }
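The leading version byte packs the compression function into the low bits and the writer's endianness into DDT_COMPRESS_BYTEORDER_MASK; ddt_decompress() below reads the same byte back to select the decompressor and to decide whether byteswap_uint64_array() must be applied.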
+
+ void
+ ddt_decompress(uchar_t *src, void *dst, size_t s_len, size_t d_len)
+ {
+ uchar_t version = *src++;
+ int cpfunc = version & DDT_COMPRESS_FUNCTION_MASK;
+ zio_compress_info_t *ci = &zio_compress_table[cpfunc];
+
+ if (ci->ci_decompress != NULL)
+ (void) ci->ci_decompress(src, dst, s_len, d_len, ci->ci_level);
+ else
+ bcopy(src, dst, d_len);
+
+ if ((version ^ ZFS_HOST_BYTEORDER) & DDT_COMPRESS_BYTEORDER_MASK)
+ byteswap_uint64_array(dst, d_len);
+ }
+
+ ddt_t *
+ ddt_select_by_checksum(spa_t *spa, enum zio_checksum c)
+ {
+ return (spa->spa_ddt[c]);
+ }
+
+ ddt_t *
+ ddt_select(spa_t *spa, const blkptr_t *bp)
+ {
+ return (spa->spa_ddt[BP_GET_CHECKSUM(bp)]);
+ }
+
+ void
+ ddt_enter(ddt_t *ddt)
+ {
+ mutex_enter(&ddt->ddt_lock);
+ }
+
+ void
+ ddt_exit(ddt_t *ddt)
+ {
+ mutex_exit(&ddt->ddt_lock);
+ }
+
+ static ddt_entry_t *
+ ddt_alloc(const ddt_key_t *ddk)
+ {
+ ddt_entry_t *dde;
+
+ dde = kmem_zalloc(sizeof (ddt_entry_t), KM_SLEEP);
+ cv_init(&dde->dde_cv, NULL, CV_DEFAULT, NULL);
+
+ dde->dde_key = *ddk;
+
+ return (dde);
+ }
+
+ static void
+ ddt_free(ddt_entry_t *dde)
+ {
++ int p;
++
+ ASSERT(!dde->dde_loading);
+
- for (int p = 0; p < DDT_PHYS_TYPES; p++)
++ for (p = 0; p < DDT_PHYS_TYPES; p++)
+ ASSERT(dde->dde_lead_zio[p] == NULL);
+
+ if (dde->dde_repair_data != NULL)
+ zio_buf_free(dde->dde_repair_data,
+ DDK_GET_PSIZE(&dde->dde_key));
+
+ cv_destroy(&dde->dde_cv);
+ kmem_free(dde, sizeof (*dde));
+ }
+
+ void
+ ddt_remove(ddt_t *ddt, ddt_entry_t *dde)
+ {
+ ASSERT(MUTEX_HELD(&ddt->ddt_lock));
+
+ avl_remove(&ddt->ddt_tree, dde);
+ ddt_free(dde);
+ }
+
+ ddt_entry_t *
+ ddt_lookup(ddt_t *ddt, const blkptr_t *bp, boolean_t add)
+ {
+ ddt_entry_t *dde, dde_search;
+ enum ddt_type type;
+ enum ddt_class class;
+ avl_index_t where;
+ int error;
+
+ ASSERT(MUTEX_HELD(&ddt->ddt_lock));
+
+ ddt_key_fill(&dde_search.dde_key, bp);
+
+ dde = avl_find(&ddt->ddt_tree, &dde_search, &where);
+ if (dde == NULL) {
+ if (!add)
+ return (NULL);
+ dde = ddt_alloc(&dde_search.dde_key);
+ avl_insert(&ddt->ddt_tree, dde, where);
+ }
+
+ while (dde->dde_loading)
+ cv_wait(&dde->dde_cv, &ddt->ddt_lock);
+
+ if (dde->dde_loaded)
+ return (dde);
+
+ dde->dde_loading = B_TRUE;
+
+ ddt_exit(ddt);
+
+ error = ENOENT;
+
+ for (type = 0; type < DDT_TYPES; type++) {
+ for (class = 0; class < DDT_CLASSES; class++) {
+ error = ddt_object_lookup(ddt, type, class, dde);
+ if (error != ENOENT)
+ break;
+ }
+ if (error != ENOENT)
+ break;
+ }
+
+ ASSERT(error == 0 || error == ENOENT);
+
+ ddt_enter(ddt);
+
+ ASSERT(dde->dde_loaded == B_FALSE);
+ ASSERT(dde->dde_loading == B_TRUE);
+
+ dde->dde_type = type; /* will be DDT_TYPES if no entry found */
+ dde->dde_class = class; /* will be DDT_CLASSES if no entry found */
+ dde->dde_loaded = B_TRUE;
+ dde->dde_loading = B_FALSE;
+
+ if (error == 0)
+ ddt_stat_update(ddt, dde, -1ULL);
+
+ cv_broadcast(&dde->dde_cv);
+
+ return (dde);
+ }
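
The loaded/loading handshake in ddt_lookup() above is the classic
"first caller does the slow load, everyone else waits on a condvar"
pattern: the first thread marks the entry loading, drops the table lock
for the on-disk lookup, then retakes it, sets loaded, and broadcasts.
A hedged pthreads sketch of just that pattern (names and structure are
illustrative, not taken from the ZFS sources):

    #include <pthread.h>
    #include <stdio.h>
    #include <unistd.h>

    typedef struct entry {
            pthread_mutex_t *lock;   /* table lock, shared by entries */
            pthread_cond_t  cv;
            int             loading;
            int             loaded;
            int             value;   /* stands in for the dde payload */
    } entry_t;

    static int
    slow_load(void)
    {
            usleep(1000);            /* pretend to hit disk */
            return (42);
    }

    static int
    lookup(entry_t *e)
    {
            int v;

            pthread_mutex_lock(e->lock);
            while (e->loading)       /* another thread is loading */
                    pthread_cond_wait(&e->cv, e->lock);
            if (!e->loaded) {
                    e->loading = 1;
                    pthread_mutex_unlock(e->lock);  /* drop for I/O */
                    v = slow_load();
                    pthread_mutex_lock(e->lock);
                    e->value = v;
                    e->loaded = 1;
                    e->loading = 0;
                    pthread_cond_broadcast(&e->cv);
            }
            v = e->value;
            pthread_mutex_unlock(e->lock);
            return (v);
    }

    int
    main(void)
    {
            pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
            entry_t e = { &lock, PTHREAD_COND_INITIALIZER, 0, 0, 0 };

            printf("first  lookup -> %d\n", lookup(&e)); /* loads */
            printf("second lookup -> %d\n", lookup(&e)); /* cached */
            return (0);
    }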
+
+ void
+ ddt_prefetch(spa_t *spa, const blkptr_t *bp)
+ {
+ ddt_t *ddt;
+ ddt_entry_t dde;
+
+ if (!BP_GET_DEDUP(bp))
+ return;
+
+ /*
+ * We remove the DDT once it's empty and only prefetch dedup blocks
+ * when there are entries in the DDT. Thus no locking is required
+ * as the DDT can't disappear on us.
+ */
+ ddt = ddt_select(spa, bp);
+ ddt_key_fill(&dde.dde_key, bp);
+
+ for (enum ddt_type type = 0; type < DDT_TYPES; type++) {
+ for (enum ddt_class class = 0; class < DDT_CLASSES; class++) {
+ ddt_object_prefetch(ddt, type, class, &dde);
+ }
+ }
+ }
+
+ int
+ ddt_entry_compare(const void *x1, const void *x2)
+ {
+ const ddt_entry_t *dde1 = x1;
+ const ddt_entry_t *dde2 = x2;
+ const uint64_t *u1 = (const uint64_t *)&dde1->dde_key;
+ const uint64_t *u2 = (const uint64_t *)&dde2->dde_key;
++ int i;
+
- for (int i = 0; i < DDT_KEY_WORDS; i++) {
++ for (i = 0; i < DDT_KEY_WORDS; i++) {
+ if (u1[i] < u2[i])
+ return (-1);
+ if (u1[i] > u2[i])
+ return (1);
+ }
+
+ return (0);
+ }
+
+ static ddt_t *
+ ddt_table_alloc(spa_t *spa, enum zio_checksum c)
+ {
+ ddt_t *ddt;
+
+ ddt = kmem_zalloc(sizeof (*ddt), KM_SLEEP);
+
+ mutex_init(&ddt->ddt_lock, NULL, MUTEX_DEFAULT, NULL);
+ avl_create(&ddt->ddt_tree, ddt_entry_compare,
+ sizeof (ddt_entry_t), offsetof(ddt_entry_t, dde_node));
+ avl_create(&ddt->ddt_repair_tree, ddt_entry_compare,
+ sizeof (ddt_entry_t), offsetof(ddt_entry_t, dde_node));
+ ddt->ddt_checksum = c;
+ ddt->ddt_spa = spa;
+ ddt->ddt_os = spa->spa_meta_objset;
+
+ return (ddt);
+ }
+
+ static void
+ ddt_table_free(ddt_t *ddt)
+ {
+ ASSERT(avl_numnodes(&ddt->ddt_tree) == 0);
+ ASSERT(avl_numnodes(&ddt->ddt_repair_tree) == 0);
+ avl_destroy(&ddt->ddt_tree);
+ avl_destroy(&ddt->ddt_repair_tree);
+ mutex_destroy(&ddt->ddt_lock);
+ kmem_free(ddt, sizeof (*ddt));
+ }
+
+ void
+ ddt_create(spa_t *spa)
+ {
+ spa->spa_dedup_checksum = ZIO_DEDUPCHECKSUM;
+
+ for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++)
+ spa->spa_ddt[c] = ddt_table_alloc(spa, c);
+ }
+
+ int
+ ddt_load(spa_t *spa)
+ {
+ int error;
+
+ ddt_create(spa);
+
+ error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+ DMU_POOL_DDT_STATS, sizeof (uint64_t), 1,
+ &spa->spa_ddt_stat_object);
+
+ if (error)
+ return (error == ENOENT ? 0 : error);
+
+ for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
+ ddt_t *ddt = spa->spa_ddt[c];
+ for (enum ddt_type type = 0; type < DDT_TYPES; type++) {
+ for (enum ddt_class class = 0; class < DDT_CLASSES;
+ class++) {
+ error = ddt_object_load(ddt, type, class);
+ if (error != 0 && error != ENOENT)
+ return (error);
+ }
+ }
+
+ /*
+ * Seed the cached histograms.
+ */
+ bcopy(ddt->ddt_histogram, &ddt->ddt_histogram_cache,
+ sizeof (ddt->ddt_histogram));
+ }
+
+ return (0);
+ }
+
+ void
+ ddt_unload(spa_t *spa)
+ {
+ for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
+ if (spa->spa_ddt[c]) {
+ ddt_table_free(spa->spa_ddt[c]);
+ spa->spa_ddt[c] = NULL;
+ }
+ }
+ }
+
+ boolean_t
+ ddt_class_contains(spa_t *spa, enum ddt_class max_class, const blkptr_t *bp)
+ {
+ ddt_t *ddt;
+ ddt_entry_t dde;
+
+ if (!BP_GET_DEDUP(bp))
+ return (B_FALSE);
+
+ if (max_class == DDT_CLASS_UNIQUE)
+ return (B_TRUE);
+
+ ddt = spa->spa_ddt[BP_GET_CHECKSUM(bp)];
+
+ ddt_key_fill(&dde.dde_key, bp);
+
+ for (enum ddt_type type = 0; type < DDT_TYPES; type++)
+ for (enum ddt_class class = 0; class <= max_class; class++)
+ if (ddt_object_lookup(ddt, type, class, &dde) == 0)
+ return (B_TRUE);
+
+ return (B_FALSE);
+ }
+
+ ddt_entry_t *
+ ddt_repair_start(ddt_t *ddt, const blkptr_t *bp)
+ {
+ ddt_key_t ddk;
+ ddt_entry_t *dde;
+
+ ddt_key_fill(&ddk, bp);
+
+ dde = ddt_alloc(&ddk);
+
+ for (enum ddt_type type = 0; type < DDT_TYPES; type++) {
+ for (enum ddt_class class = 0; class < DDT_CLASSES; class++) {
+ /*
+ * We can only do repair if there are multiple copies
+ * of the block. For anything in the UNIQUE class,
+ * there's definitely only one copy, so don't even try.
+ */
+ if (class != DDT_CLASS_UNIQUE &&
+ ddt_object_lookup(ddt, type, class, dde) == 0)
+ return (dde);
+ }
+ }
+
+ bzero(dde->dde_phys, sizeof (dde->dde_phys));
+
+ return (dde);
+ }
+
+ void
+ ddt_repair_done(ddt_t *ddt, ddt_entry_t *dde)
+ {
+ avl_index_t where;
+
+ ddt_enter(ddt);
+
+ if (dde->dde_repair_data != NULL && spa_writeable(ddt->ddt_spa) &&
+ avl_find(&ddt->ddt_repair_tree, dde, &where) == NULL)
+ avl_insert(&ddt->ddt_repair_tree, dde, where);
+ else
+ ddt_free(dde);
+
+ ddt_exit(ddt);
+ }
+
+ static void
+ ddt_repair_entry_done(zio_t *zio)
+ {
+ ddt_entry_t *rdde = zio->io_private;
+
+ ddt_free(rdde);
+ }
+
+ static void
+ ddt_repair_entry(ddt_t *ddt, ddt_entry_t *dde, ddt_entry_t *rdde, zio_t *rio)
+ {
+ ddt_phys_t *ddp = dde->dde_phys;
+ ddt_phys_t *rddp = rdde->dde_phys;
+ ddt_key_t *ddk = &dde->dde_key;
+ ddt_key_t *rddk = &rdde->dde_key;
+ zio_t *zio;
+ blkptr_t blk;
++ int p;
+
+ zio = zio_null(rio, rio->io_spa, NULL,
+ ddt_repair_entry_done, rdde, rio->io_flags);
+
- for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++, rddp++) {
++ for (p = 0; p < DDT_PHYS_TYPES; p++, ddp++, rddp++) {
+ if (ddp->ddp_phys_birth == 0 ||
+ ddp->ddp_phys_birth != rddp->ddp_phys_birth ||
+ bcmp(ddp->ddp_dva, rddp->ddp_dva, sizeof (ddp->ddp_dva)))
+ continue;
+ ddt_bp_create(ddt->ddt_checksum, ddk, ddp, &blk);
+ zio_nowait(zio_rewrite(zio, zio->io_spa, 0, &blk,
+ rdde->dde_repair_data, DDK_GET_PSIZE(rddk), NULL, NULL,
+ ZIO_PRIORITY_SYNC_WRITE, ZIO_DDT_CHILD_FLAGS(zio), NULL));
+ }
+
+ zio_nowait(zio);
+ }
+
+ static void
+ ddt_repair_table(ddt_t *ddt, zio_t *rio)
+ {
+ spa_t *spa = ddt->ddt_spa;
+ ddt_entry_t *dde, *rdde_next, *rdde;
+ avl_tree_t *t = &ddt->ddt_repair_tree;
+ blkptr_t blk;
+
+ if (spa_sync_pass(spa) > 1)
+ return;
+
+ ddt_enter(ddt);
+ for (rdde = avl_first(t); rdde != NULL; rdde = rdde_next) {
+ rdde_next = AVL_NEXT(t, rdde);
+ avl_remove(&ddt->ddt_repair_tree, rdde);
+ ddt_exit(ddt);
+ ddt_bp_create(ddt->ddt_checksum, &rdde->dde_key, NULL, &blk);
+ dde = ddt_repair_start(ddt, &blk);
+ ddt_repair_entry(ddt, dde, rdde, rio);
+ ddt_repair_done(ddt, dde);
+ ddt_enter(ddt);
+ }
+ ddt_exit(ddt);
+ }
+
+ static void
+ ddt_sync_entry(ddt_t *ddt, ddt_entry_t *dde, dmu_tx_t *tx, uint64_t txg)
+ {
+ dsl_pool_t *dp = ddt->ddt_spa->spa_dsl_pool;
+ ddt_phys_t *ddp = dde->dde_phys;
+ ddt_key_t *ddk = &dde->dde_key;
+ enum ddt_type otype = dde->dde_type;
+ enum ddt_type ntype = DDT_TYPE_CURRENT;
+ enum ddt_class oclass = dde->dde_class;
+ enum ddt_class nclass;
+ uint64_t total_refcnt = 0;
++ int p;
+
+ ASSERT(dde->dde_loaded);
+ ASSERT(!dde->dde_loading);
+
- for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
++ for (p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
+ ASSERT(dde->dde_lead_zio[p] == NULL);
+ ASSERT((int64_t)ddp->ddp_refcnt >= 0);
+ if (ddp->ddp_phys_birth == 0) {
+ ASSERT(ddp->ddp_refcnt == 0);
+ continue;
+ }
+ if (p == DDT_PHYS_DITTO) {
+ if (ddt_ditto_copies_needed(ddt, dde, NULL) == 0)
+ ddt_phys_free(ddt, ddk, ddp, txg);
+ continue;
+ }
+ if (ddp->ddp_refcnt == 0)
+ ddt_phys_free(ddt, ddk, ddp, txg);
+ total_refcnt += ddp->ddp_refcnt;
+ }
+
+ if (dde->dde_phys[DDT_PHYS_DITTO].ddp_phys_birth != 0)
+ nclass = DDT_CLASS_DITTO;
+ else if (total_refcnt > 1)
+ nclass = DDT_CLASS_DUPLICATE;
+ else
+ nclass = DDT_CLASS_UNIQUE;
+
+ if (otype != DDT_TYPES &&
+ (otype != ntype || oclass != nclass || total_refcnt == 0)) {
+ VERIFY(ddt_object_remove(ddt, otype, oclass, dde, tx) == 0);
+ ASSERT(ddt_object_lookup(ddt, otype, oclass, dde) == ENOENT);
+ }
+
+ if (total_refcnt != 0) {
+ dde->dde_type = ntype;
+ dde->dde_class = nclass;
+ ddt_stat_update(ddt, dde, 0);
+ if (!ddt_object_exists(ddt, ntype, nclass))
+ ddt_object_create(ddt, ntype, nclass, tx);
+ VERIFY(ddt_object_update(ddt, ntype, nclass, dde, tx) == 0);
+
+ /*
+ * If the class changes, the order that we scan this bp
+ * changes. If it decreases, we could miss it, so
+ * scan it right now. (This covers both class changing
+ * while we are doing ddt_walk(), and when we are
+ * traversing.)
+ */
+ if (nclass < oclass) {
+ dsl_scan_ddt_entry(dp->dp_scan,
+ ddt->ddt_checksum, dde, tx);
+ }
+ }
+ }
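
The class selection at the end of ddt_sync_entry() is compact enough to
restate: a live ditto phys forces DDT_CLASS_DITTO, anything still
referenced more than once is DDT_CLASS_DUPLICATE, and the rest fall to
DDT_CLASS_UNIQUE; only a move to a lower-numbered (more replicated)
class triggers the immediate dsl_scan_ddt_entry() call, because the
ordered scan would otherwise pass over the entry. A small sketch of
that decision (the enum merely mirrors the ordering the scrub relies
on; it is not the ZFS header):

    #include <stdio.h>

    /* Ordered most- to least-replicated, as the scan walk assumes. */
    enum ddt_class { CLASS_DITTO, CLASS_DUPLICATE, CLASS_UNIQUE };

    static enum ddt_class
    new_class(unsigned long long ditto_birth,
        unsigned long long total_refcnt)
    {
            if (ditto_birth != 0)
                    return (CLASS_DITTO);
            if (total_refcnt > 1)
                    return (CLASS_DUPLICATE);
            return (CLASS_UNIQUE);
    }

    int
    main(void)
    {
            enum ddt_class oclass = CLASS_UNIQUE;
            /* refcnt grew from 1 to 2 while a scrub was running ... */
            enum ddt_class nclass = new_class(0, 2);

            if (nclass < oclass)
                    printf("class rose (%d -> %d): scrub it now\n",
                        oclass, nclass);
            return (0);
    }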
+
+ static void
+ ddt_sync_table(ddt_t *ddt, dmu_tx_t *tx, uint64_t txg)
+ {
+ spa_t *spa = ddt->ddt_spa;
+ ddt_entry_t *dde;
+ void *cookie = NULL;
+
+ if (avl_numnodes(&ddt->ddt_tree) == 0)
+ return;
+
+ ASSERT(spa->spa_uberblock.ub_version >= SPA_VERSION_DEDUP);
+
+ if (spa->spa_ddt_stat_object == 0) {
+ spa->spa_ddt_stat_object = zap_create(ddt->ddt_os,
+ DMU_OT_DDT_STATS, DMU_OT_NONE, 0, tx);
+ VERIFY(zap_add(ddt->ddt_os, DMU_POOL_DIRECTORY_OBJECT,
+ DMU_POOL_DDT_STATS, sizeof (uint64_t), 1,
+ &spa->spa_ddt_stat_object, tx) == 0);
+ }
+
+ while ((dde = avl_destroy_nodes(&ddt->ddt_tree, &cookie)) != NULL) {
+ ddt_sync_entry(ddt, dde, tx, txg);
+ ddt_free(dde);
+ }
+
+ for (enum ddt_type type = 0; type < DDT_TYPES; type++) {
+ for (enum ddt_class class = 0; class < DDT_CLASSES; class++) {
+ if (!ddt_object_exists(ddt, type, class))
+ continue;
+ ddt_object_sync(ddt, type, class, tx);
+ if (ddt_object_count(ddt, type, class) == 0)
+ ddt_object_destroy(ddt, type, class, tx);
+ }
+ }
+
+ bcopy(ddt->ddt_histogram, &ddt->ddt_histogram_cache,
+ sizeof (ddt->ddt_histogram));
+ }
+
+ void
+ ddt_sync(spa_t *spa, uint64_t txg)
+ {
+ dmu_tx_t *tx;
+ zio_t *rio = zio_root(spa, NULL, NULL,
+ ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE);
+
+ ASSERT(spa_syncing_txg(spa) == txg);
+
+ tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
+
+ for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
+ ddt_t *ddt = spa->spa_ddt[c];
+ if (ddt == NULL)
+ continue;
+ ddt_sync_table(ddt, tx, txg);
+ ddt_repair_table(ddt, rio);
+ }
+
+ (void) zio_wait(rio);
+
+ dmu_tx_commit(tx);
+ }
+
+ int
+ ddt_walk(spa_t *spa, ddt_bookmark_t *ddb, ddt_entry_t *dde)
+ {
+ do {
+ do {
+ do {
+ ddt_t *ddt = spa->spa_ddt[ddb->ddb_checksum];
+ int error = ENOENT;
+ if (ddt_object_exists(ddt, ddb->ddb_type,
+ ddb->ddb_class)) {
+ error = ddt_object_walk(ddt,
+ ddb->ddb_type, ddb->ddb_class,
+ &ddb->ddb_cursor, dde);
+ }
+ dde->dde_type = ddb->ddb_type;
+ dde->dde_class = ddb->ddb_class;
+ if (error == 0)
+ return (0);
+ if (error != ENOENT)
+ return (error);
+ ddb->ddb_cursor = 0;
+ } while (++ddb->ddb_checksum < ZIO_CHECKSUM_FUNCTIONS);
+ ddb->ddb_checksum = 0;
+ } while (++ddb->ddb_type < DDT_TYPES);
+ ddb->ddb_type = 0;
+ } while (++ddb->ddb_class < DDT_CLASSES);
+
+ return (ENOENT);
+ }
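
ddt_walk() packs a four-dimensional position (class, type, checksum,
cursor) into the ddt_bookmark_t and resumes from it on every call: each
inner do/while exhausts one dimension, resets it to zero, and lets the
next dimension advance. The same shape in miniature, walking a 3-D
space one element per call (a sketch with toy dimensions, not the ZFS
types; the real walk also differs in which dimension nests innermost):

    #include <stdio.h>

    #define NC 2   /* "classes" */
    #define NT 2   /* "types" */
    #define NK 3   /* "cursor" positions per table */

    typedef struct bookmark { int c, t, k; } bookmark_t;

    /* Return 0 and advance *bm on success; -1 once the space is done. */
    static int
    walk(bookmark_t *bm, int *out)
    {
            if (bm->c >= NC)         /* stay done once exhausted */
                    return (-1);
            do {
                    do {
                            if (bm->k < NK) {
                                    *out = bm->c * 100 + bm->t * 10 +
                                        bm->k;
                                    bm->k++;
                                    return (0);
                            }
                            bm->k = 0;
                    } while (++bm->t < NT);
                    bm->t = 0;
            } while (++bm->c < NC);
            return (-1);
    }

    int
    main(void)
    {
            bookmark_t bm = { 0, 0, 0 };
            int v;

            while (walk(&bm, &v) == 0)  /* could pause/resume anywhere */
                    printf("%d ", v);
            printf("\n");
            return (0);
    }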
void
dmu_object_info_from_dnode(dnode_t *dn, dmu_object_info_t *doi)
{
+ dnode_phys_t *dnp = dn->dn_phys;
++ int i;
+
rw_enter(&dn->dn_struct_rwlock, RW_READER);
mutex_enter(&dn->dn_mtx);
doi->doi_indirection = dn->dn_nlevels;
doi->doi_checksum = dn->dn_checksum;
doi->doi_compress = dn->dn_compress;
- doi->doi_physical_blks = (DN_USED_BYTES(dn->dn_phys) +
- SPA_MINBLOCKSIZE/2) >> SPA_MINBLOCKSHIFT;
- doi->doi_max_block_offset = dn->dn_phys->dn_maxblkid;
- doi->doi_type = dn->dn_type;
- doi->doi_bonus_size = dn->dn_bonuslen;
- doi->doi_bonus_type = dn->dn_bonustype;
+ doi->doi_physical_blocks_512 = (DN_USED_BYTES(dnp) + 256) >> 9;
+ doi->doi_max_offset = (dnp->dn_maxblkid + 1) * dn->dn_datablksz;
+ doi->doi_fill_count = 0;
- for (int i = 0; i < dnp->dn_nblkptr; i++)
++ for (i = 0; i < dnp->dn_nblkptr; i++)
+ doi->doi_fill_count += dnp->dn_blkptr[i].blk_fill;
mutex_exit(&dn->dn_mtx);
rw_exit(&dn->dn_struct_rwlock);
}
void
- dmu_objset_evict(dsl_dataset_t *ds, void *arg)
+ dmu_objset_evict(objset_t *os)
{
- objset_impl_t *osi = arg;
- objset_t os;
- int i;
+ dsl_dataset_t *ds = os->os_dsl_dataset;
++ int t;
- for (i = 0; i < TXG_SIZE; i++) {
- ASSERT(list_head(&osi->os_dirty_dnodes[i]) == NULL);
- ASSERT(list_head(&osi->os_free_dnodes[i]) == NULL);
- }
- for (int t = 0; t < TXG_SIZE; t++)
++ for (t = 0; t < TXG_SIZE; t++)
+ ASSERT(!dmu_objset_is_dirty(os, t));
if (ds) {
if (!dsl_dataset_is_snapshot(ds)) {
/* ARGSUSED */
static void
- ready(zio_t *zio, arc_buf_t *abuf, void *arg)
+ dmu_objset_write_ready(zio_t *zio, arc_buf_t *abuf, void *arg)
{
+ int i;
+
blkptr_t *bp = zio->io_bp;
- blkptr_t *bp_orig = &zio->io_bp_orig;
- objset_impl_t *os = arg;
+ objset_t *os = arg;
dnode_phys_t *dnp = &os->os_phys->os_meta_dnode;
ASSERT(bp == os->os_rootbp);
* dnode and user/group accounting objects).
*/
bp->blk_fill = 0;
- for (int i = 0; i < dnp->dn_nblkptr; i++)
+ for (i = 0; i < dnp->dn_nblkptr; i++)
bp->blk_fill += dnp->dn_blkptr[i].blk_fill;
+ }
+
+ /* ARGSUSED */
+ static void
+ dmu_objset_write_done(zio_t *zio, arc_buf_t *abuf, void *arg)
+ {
+ blkptr_t *bp = zio->io_bp;
+ blkptr_t *bp_orig = &zio->io_bp_orig;
+ objset_t *os = arg;
if (zio->io_flags & ZIO_FLAG_IO_REWRITE) {
- ASSERT(DVA_EQUAL(BP_IDENTITY(bp), BP_IDENTITY(bp_orig)));
+ ASSERT(BP_EQUAL(bp, bp_orig));
} else {
- if (zio->io_bp_orig.blk_birth == os->os_synctx->tx_txg)
- (void) dsl_dataset_block_kill(os->os_dsl_dataset,
- &zio->io_bp_orig, zio, os->os_synctx);
- dsl_dataset_block_born(os->os_dsl_dataset, bp, os->os_synctx);
+ dsl_dataset_t *ds = os->os_dsl_dataset;
+ dmu_tx_t *tx = os->os_synctx;
+
+ (void) dsl_dataset_block_kill(ds, bp_orig, tx, B_TRUE);
+ dsl_dataset_block_born(ds, bp, tx);
}
}
uint64_t start, end, i;
int min_bs, max_bs, min_ibs, max_ibs, epbs, bits;
int err = 0;
++ int l;
if (len == 0)
return;
* If this write is not off the end of the file
* we need to account for overwrites/unref.
*/
- if (start <= dn->dn_maxblkid)
- bzero(last, sizeof (dmu_buf_impl_t *) * DN_MAX_LEVELS);
+ if (start <= dn->dn_maxblkid) {
- for (int l = 0; l < DN_MAX_LEVELS; l++)
++ for (l = 0; l < DN_MAX_LEVELS; l++)
+ history[l] = -1ULL;
+ }
while (start <= dn->dn_maxblkid) {
- spa_t *spa = txh->txh_tx->tx_pool->dp_spa;
- dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset;
dmu_buf_impl_t *db;
rw_enter(&dn->dn_struct_rwlock, RW_READER);
}
void
- dsl_dataset_block_born(dsl_dataset_t *ds, blkptr_t *bp, dmu_tx_t *tx)
+ dsl_dataset_block_born(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx)
{
- int used = bp_get_dsize_sync(tx->tx_pool->dp_spa, bp);
- int compressed = BP_GET_PSIZE(bp);
- int uncompressed = BP_GET_UCSIZE(bp);
+ int used, compressed, uncompressed;
int64_t delta;
- dprintf_bp(bp, "born, ds=%p\n", ds);
+ used = bp_get_dasize(tx->tx_pool->dp_spa, bp);
+ compressed = BP_GET_PSIZE(bp);
+ uncompressed = BP_GET_UCSIZE(bp);
+
+ dprintf_bp(bp, "ds=%p", ds);
ASSERT(dmu_tx_is_syncing(tx));
/* It could have been compressed away to nothing */
--- /dev/null
+ /*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+ /*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+ #include <sys/dsl_scan.h>
+ #include <sys/dsl_pool.h>
+ #include <sys/dsl_dataset.h>
+ #include <sys/dsl_prop.h>
+ #include <sys/dsl_dir.h>
+ #include <sys/dsl_synctask.h>
+ #include <sys/dnode.h>
+ #include <sys/dmu_tx.h>
+ #include <sys/dmu_objset.h>
+ #include <sys/arc.h>
+ #include <sys/zap.h>
+ #include <sys/zio.h>
+ #include <sys/zfs_context.h>
+ #include <sys/fs/zfs.h>
+ #include <sys/zfs_znode.h>
+ #include <sys/spa_impl.h>
+ #include <sys/vdev_impl.h>
+ #include <sys/zil_impl.h>
+ #include <sys/zio_checksum.h>
+ #include <sys/ddt.h>
+ #include <sys/sa.h>
+ #include <sys/sa_impl.h>
+ #ifdef _KERNEL
+ #include <sys/zfs_vfsops.h>
+ #endif
+
+ typedef int (scan_cb_t)(dsl_pool_t *, const blkptr_t *, const zbookmark_t *);
+
+ static scan_cb_t dsl_scan_defrag_cb;
+ static scan_cb_t dsl_scan_scrub_cb;
+ static scan_cb_t dsl_scan_remove_cb;
+ static dsl_syncfunc_t dsl_scan_cancel_sync;
+ static void dsl_scan_sync_state(dsl_scan_t *, dmu_tx_t *tx);
+
+ int zfs_scan_min_time_ms = 1000; /* min millisecs to scrub per txg */
+ int zfs_free_min_time_ms = 1000; /* min millisecs to free per txg */
+ int zfs_resilver_min_time_ms = 3000; /* min millisecs to resilver per txg */
+ boolean_t zfs_no_scrub_io = B_FALSE; /* set to disable scrub i/o */
+ boolean_t zfs_no_scrub_prefetch = B_FALSE; /* set to disable scrub prefetching */
+ enum ddt_class zfs_scrub_ddt_class_max = DDT_CLASS_DUPLICATE;
+ int dsl_scan_delay_completion = B_FALSE; /* set to delay scan completion */
+
+ #define DSL_SCAN_IS_SCRUB_RESILVER(scn) \
+ ((scn)->scn_phys.scn_func == POOL_SCAN_SCRUB || \
+ (scn)->scn_phys.scn_func == POOL_SCAN_RESILVER)
+
+ extern int zfs_txg_timeout;
+
+ /* the order has to match pool_scan_type */
+ static scan_cb_t *scan_funcs[POOL_SCAN_FUNCS] = {
+ NULL,
+ dsl_scan_scrub_cb, /* POOL_SCAN_SCRUB */
+ dsl_scan_scrub_cb, /* POOL_SCAN_RESILVER */
+ };
+
+ int
+ dsl_scan_init(dsl_pool_t *dp, uint64_t txg)
+ {
+ int err;
+ dsl_scan_t *scn;
+ spa_t *spa = dp->dp_spa;
+ uint64_t f;
+
+ scn = dp->dp_scan = kmem_zalloc(sizeof (dsl_scan_t), KM_SLEEP);
+ scn->scn_dp = dp;
+
+ err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+ "scrub_func", sizeof (uint64_t), 1, &f);
+ if (err == 0) {
+ /*
+ * There was an old-style scrub in progress. Restart a
+ * new-style scrub from the beginning.
+ */
+ scn->scn_restart_txg = txg;
+ zfs_dbgmsg("old-style scrub was in progress; "
+ "restarting new-style scrub in txg %llu",
+ scn->scn_restart_txg);
+
+ /*
+ * Load the queue obj from the old location so that it
+ * can be freed by dsl_scan_done().
+ */
+ (void) zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+ "scrub_queue", sizeof (uint64_t), 1,
+ &scn->scn_phys.scn_queue_obj);
+ } else {
+ err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+ DMU_POOL_SCAN, sizeof (uint64_t), SCAN_PHYS_NUMINTS,
+ &scn->scn_phys);
+ if (err == ENOENT)
+ return (0);
+ else if (err)
+ return (err);
+
+ if (scn->scn_phys.scn_state == DSS_SCANNING &&
+ spa_prev_software_version(dp->dp_spa) < SPA_VERSION_SCAN) {
+ /*
+ * A new-type scrub was in progress on an old
+ * pool, and the pool was accessed by old
+ * software. Restart from the beginning, since
+ * the old software may have changed the pool in
+ * the meantime.
+ */
+ scn->scn_restart_txg = txg;
+ zfs_dbgmsg("new-style scrub was modified "
+ "by old software; restarting in txg %llu",
+ scn->scn_restart_txg);
+ }
+ }
+
+ spa_scan_stat_init(spa);
+ return (0);
+ }
+
+ void
+ dsl_scan_fini(dsl_pool_t *dp)
+ {
+ if (dp->dp_scan) {
+ kmem_free(dp->dp_scan, sizeof (dsl_scan_t));
+ dp->dp_scan = NULL;
+ }
+ }
+
+ /* ARGSUSED */
+ static int
+ dsl_scan_setup_check(void *arg1, void *arg2, dmu_tx_t *tx)
+ {
+ dsl_scan_t *scn = arg1;
+
+ if (scn->scn_phys.scn_state == DSS_SCANNING)
+ return (EBUSY);
+
+ return (0);
+ }
+
+ /* ARGSUSED */
+ static void
+ dsl_scan_setup_sync(void *arg1, void *arg2, dmu_tx_t *tx)
+ {
+ dsl_scan_t *scn = arg1;
+ pool_scan_func_t *funcp = arg2;
+ dmu_object_type_t ot = 0;
+ dsl_pool_t *dp = scn->scn_dp;
+ spa_t *spa = dp->dp_spa;
+
+ ASSERT(scn->scn_phys.scn_state != DSS_SCANNING);
+ ASSERT(*funcp > POOL_SCAN_NONE && *funcp < POOL_SCAN_FUNCS);
+ bzero(&scn->scn_phys, sizeof (scn->scn_phys));
+ scn->scn_phys.scn_func = *funcp;
+ scn->scn_phys.scn_state = DSS_SCANNING;
+ scn->scn_phys.scn_min_txg = 0;
+ scn->scn_phys.scn_max_txg = tx->tx_txg;
+ scn->scn_phys.scn_ddt_class_max = DDT_CLASSES - 1; /* the entire DDT */
+ scn->scn_phys.scn_start_time = gethrestime_sec();
+ scn->scn_phys.scn_errors = 0;
+ scn->scn_phys.scn_to_examine = spa->spa_root_vdev->vdev_stat.vs_alloc;
+ scn->scn_restart_txg = 0;
+ spa_scan_stat_init(spa);
+
+ if (DSL_SCAN_IS_SCRUB_RESILVER(scn)) {
+ scn->scn_phys.scn_ddt_class_max = zfs_scrub_ddt_class_max;
+
+ /* rewrite all disk labels */
+ vdev_config_dirty(spa->spa_root_vdev);
+
+ if (vdev_resilver_needed(spa->spa_root_vdev,
+ &scn->scn_phys.scn_min_txg, &scn->scn_phys.scn_max_txg)) {
+ spa_event_notify(spa, NULL, ESC_ZFS_RESILVER_START);
+ } else {
+ spa_event_notify(spa, NULL, ESC_ZFS_SCRUB_START);
+ }
+
+ spa->spa_scrub_started = B_TRUE;
+ /*
+ * If this is an incremental scrub, limit the DDT scrub phase
+ * to just the auto-ditto class (for correctness); the rest
+ * of the scrub should go faster using top-down pruning.
+ */
+ if (scn->scn_phys.scn_min_txg > TXG_INITIAL)
+ scn->scn_phys.scn_ddt_class_max = DDT_CLASS_DITTO;
+ }
+
+ /* back to the generic stuff */
+
+ if (dp->dp_blkstats == NULL) {
+ dp->dp_blkstats =
+ kmem_alloc(sizeof (zfs_all_blkstats_t), KM_SLEEP);
+ }
+ bzero(dp->dp_blkstats, sizeof (zfs_all_blkstats_t));
+
+ if (spa_version(spa) < SPA_VERSION_DSL_SCRUB)
+ ot = DMU_OT_ZAP_OTHER;
+
+ scn->scn_phys.scn_queue_obj = zap_create(dp->dp_meta_objset,
+ ot ? ot : DMU_OT_SCAN_QUEUE, DMU_OT_NONE, 0, tx);
+
+ dsl_scan_sync_state(scn, tx);
+
+ spa_history_log_internal(LOG_POOL_SCAN, spa, tx,
+ "func=%u mintxg=%llu maxtxg=%llu",
+ *funcp, scn->scn_phys.scn_min_txg, scn->scn_phys.scn_max_txg);
+ }
+
+ /* ARGSUSED */
+ static void
+ dsl_scan_done(dsl_scan_t *scn, boolean_t complete, dmu_tx_t *tx)
+ {
+ static const char *old_names[] = {
+ "scrub_bookmark",
+ "scrub_ddt_bookmark",
+ "scrub_ddt_class_max",
+ "scrub_queue",
+ "scrub_min_txg",
+ "scrub_max_txg",
+ "scrub_func",
+ "scrub_errors",
+ NULL
+ };
+
+ dsl_pool_t *dp = scn->scn_dp;
+ spa_t *spa = dp->dp_spa;
+ int i;
+
+ /* Remove any remnants of an old-style scrub. */
+ for (i = 0; old_names[i]; i++) {
+ (void) zap_remove(dp->dp_meta_objset,
+ DMU_POOL_DIRECTORY_OBJECT, old_names[i], tx);
+ }
+
+ if (scn->scn_phys.scn_queue_obj != 0) {
+ VERIFY(0 == dmu_object_free(dp->dp_meta_objset,
+ scn->scn_phys.scn_queue_obj, tx));
+ scn->scn_phys.scn_queue_obj = 0;
+ }
+
+ /*
+ * If we were "restarted" from a stopped state, don't bother
+ * with anything else.
+ */
+ if (scn->scn_phys.scn_state != DSS_SCANNING)
+ return;
+
+ if (complete)
+ scn->scn_phys.scn_state = DSS_FINISHED;
+ else
+ scn->scn_phys.scn_state = DSS_CANCELED;
+
+ spa_history_log_internal(LOG_POOL_SCAN_DONE, spa, tx,
+ "complete=%u", complete);
+
+ if (DSL_SCAN_IS_SCRUB_RESILVER(scn)) {
+ mutex_enter(&spa->spa_scrub_lock);
+ while (spa->spa_scrub_inflight > 0) {
+ cv_wait(&spa->spa_scrub_io_cv,
+ &spa->spa_scrub_lock);
+ }
+ mutex_exit(&spa->spa_scrub_lock);
+ spa->spa_scrub_started = B_FALSE;
+ spa->spa_scrub_active = B_FALSE;
+
+ /*
+ * If the scrub/resilver completed, update all DTLs to
+ * reflect this. Whether it succeeded or not, vacate
+ * all temporary scrub DTLs.
+ */
+ vdev_dtl_reassess(spa->spa_root_vdev, tx->tx_txg,
+ complete ? scn->scn_phys.scn_max_txg : 0, B_TRUE);
+ if (complete) {
+ spa_event_notify(spa, NULL, scn->scn_phys.scn_min_txg ?
+ ESC_ZFS_RESILVER_FINISH : ESC_ZFS_SCRUB_FINISH);
+ }
+ spa_errlog_rotate(spa);
+
+ /*
+ * We may have finished replacing a device.
+ * Let the async thread assess this and handle the detach.
+ */
+ spa_async_request(spa, SPA_ASYNC_RESILVER_DONE);
+ }
+
+ scn->scn_phys.scn_end_time = gethrestime_sec();
+ }
+
+ /* ARGSUSED */
+ static int
+ dsl_scan_cancel_check(void *arg1, void *arg2, dmu_tx_t *tx)
+ {
+ dsl_scan_t *scn = arg1;
+
+ if (scn->scn_phys.scn_state != DSS_SCANNING)
+ return (ENOENT);
+ return (0);
+ }
+
+ /* ARGSUSED */
+ static void
+ dsl_scan_cancel_sync(void *arg1, void *arg2, dmu_tx_t *tx)
+ {
+ dsl_scan_t *scn = arg1;
+
+ dsl_scan_done(scn, B_FALSE, tx);
+ dsl_scan_sync_state(scn, tx);
+ }
+
+ int
+ dsl_scan_cancel(dsl_pool_t *dp)
+ {
+ boolean_t complete = B_FALSE;
+ int err;
+
+ err = dsl_sync_task_do(dp, dsl_scan_cancel_check,
+ dsl_scan_cancel_sync, dp->dp_scan, &complete, 3);
+ return (err);
+ }
+
+ static void dsl_scan_visitbp(blkptr_t *bp,
+ const zbookmark_t *zb, dnode_phys_t *dnp, arc_buf_t *pbuf,
+ dsl_dataset_t *ds, dsl_scan_t *scn, dmu_objset_type_t ostype,
+ dmu_tx_t *tx);
+ static void dsl_scan_visitdnode(dsl_scan_t *, dsl_dataset_t *ds,
+ dmu_objset_type_t ostype,
+ dnode_phys_t *dnp, arc_buf_t *buf, uint64_t object, dmu_tx_t *tx);
+
+ void
+ dsl_free(dsl_pool_t *dp, uint64_t txg, const blkptr_t *bp)
+ {
+ zio_free(dp->dp_spa, txg, bp);
+ }
+
+ void
+ dsl_free_sync(zio_t *pio, dsl_pool_t *dp, uint64_t txg, const blkptr_t *bpp)
+ {
+ ASSERT(dsl_pool_sync_context(dp));
+ zio_nowait(zio_free_sync(pio, dp->dp_spa, txg, bpp, pio->io_flags));
+ }
+
+ int
+ dsl_read(zio_t *pio, spa_t *spa, const blkptr_t *bpp, arc_buf_t *pbuf,
+ arc_done_func_t *done, void *private, int priority, int zio_flags,
+ uint32_t *arc_flags, const zbookmark_t *zb)
+ {
+ return (arc_read(pio, spa, bpp, pbuf, done, private,
+ priority, zio_flags, arc_flags, zb));
+ }
+
+ int
+ dsl_read_nolock(zio_t *pio, spa_t *spa, const blkptr_t *bpp,
+ arc_done_func_t *done, void *private, int priority, int zio_flags,
+ uint32_t *arc_flags, const zbookmark_t *zb)
+ {
+ return (arc_read_nolock(pio, spa, bpp, done, private,
+ priority, zio_flags, arc_flags, zb));
+ }
+
+ static boolean_t
+ bookmark_is_zero(const zbookmark_t *zb)
+ {
+ return (zb->zb_objset == 0 && zb->zb_object == 0 &&
+ zb->zb_level == 0 && zb->zb_blkid == 0);
+ }
+
+ /* dnp is the dnode for zb1->zb_object */
+ static boolean_t
+ bookmark_is_before(const dnode_phys_t *dnp, const zbookmark_t *zb1,
+ const zbookmark_t *zb2)
+ {
+ uint64_t zb1nextL0, zb2thisobj;
+
+ ASSERT(zb1->zb_objset == zb2->zb_objset);
+ ASSERT(zb2->zb_level == 0);
+
+ /*
+ * A bookmark in the deadlist is considered to be after
+ * everything else.
+ */
+ if (zb2->zb_object == DMU_DEADLIST_OBJECT)
+ return (B_TRUE);
+
+ /* The objset_phys_t isn't before anything. */
+ if (dnp == NULL)
+ return (B_FALSE);
+
+ zb1nextL0 = (zb1->zb_blkid + 1) <<
+ ((zb1->zb_level) * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT));
+
+ zb2thisobj = zb2->zb_object ? zb2->zb_object :
+ zb2->zb_blkid << (DNODE_BLOCK_SHIFT - DNODE_SHIFT);
+
+ if (zb1->zb_object == DMU_META_DNODE_OBJECT) {
+ uint64_t nextobj = zb1nextL0 *
+ (dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT) >> DNODE_SHIFT;
+ return (nextobj <= zb2thisobj);
+ }
+
+ if (zb1->zb_object < zb2thisobj)
+ return (B_TRUE);
+ if (zb1->zb_object > zb2thisobj)
+ return (B_FALSE);
+ if (zb2->zb_object == DMU_META_DNODE_OBJECT)
+ return (B_FALSE);
+ return (zb1nextL0 <= zb2->zb_blkid);
+ }
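
The key line in bookmark_is_before() is the shift that converts an
indirect-block position into the first level-0 blkid past its subtree:
each level multiplies the span by 2^(indblkshift - SPA_BLKPTRSHIFT),
the number of block pointers per indirect block. Worked with typical
values, assuming 16K indirect blocks (indblkshift = 14) and 128-byte
block pointers (SPA_BLKPTRSHIFT = 7), i.e. 128 children per indirect:

    #include <stdio.h>
    #include <stdint.h>

    #define SPA_BLKPTRSHIFT 7       /* 128-byte block pointers */

    int
    main(void)
    {
            int indblkshift = 14;   /* 16K indirect: 128 children */
            int level = 2;          /* an L2 indirect block */
            uint64_t blkid = 3;     /* the fourth L2 block */

            /* First L0 blkid past this subtree, as in the function. */
            uint64_t next_l0 = (blkid + 1) <<
                (level * (indblkshift - SPA_BLKPTRSHIFT));

            /* (3+1) << 14 = 65536: L0 ids 49152..65535 sit below. */
            printf("next L0 after L%d blkid %llu: %llu\n", level,
                (unsigned long long)blkid, (unsigned long long)next_l0);
            return (0);
    }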
+
+ static uint64_t
+ dsl_scan_ds_maxtxg(dsl_dataset_t *ds)
+ {
+ uint64_t smt = ds->ds_dir->dd_pool->dp_scan->scn_phys.scn_max_txg;
+ if (dsl_dataset_is_snapshot(ds))
+ return (MIN(smt, ds->ds_phys->ds_creation_txg));
+ return (smt);
+ }
+
+ static void
+ dsl_scan_sync_state(dsl_scan_t *scn, dmu_tx_t *tx)
+ {
+ VERIFY(0 == zap_update(scn->scn_dp->dp_meta_objset,
+ DMU_POOL_DIRECTORY_OBJECT,
+ DMU_POOL_SCAN, sizeof (uint64_t), SCAN_PHYS_NUMINTS,
+ &scn->scn_phys, tx));
+ }
+
+ static boolean_t
+ dsl_scan_check_pause(dsl_scan_t *scn, const zbookmark_t *zb)
+ {
+ uint64_t elapsed_nanosecs;
+ int mintime;
+
+ /* we never skip user/group accounting objects */
+ if (zb && (int64_t)zb->zb_object < 0)
+ return (B_FALSE);
+
+ if (scn->scn_pausing)
+ return (B_TRUE); /* we're already pausing */
+
+ if (!bookmark_is_zero(&scn->scn_phys.scn_bookmark))
+ return (B_FALSE); /* we're resuming */
+
+ /* We only know how to resume from level-0 blocks. */
+ if (zb && zb->zb_level != 0)
+ return (B_FALSE);
+
+ mintime = (scn->scn_phys.scn_func == POOL_SCAN_RESILVER) ?
+ zfs_resilver_min_time_ms : zfs_scan_min_time_ms;
+ elapsed_nanosecs = gethrtime() - scn->scn_sync_start_time;
+ if (elapsed_nanosecs / NANOSEC > zfs_txg_timeout ||
+ (elapsed_nanosecs / MICROSEC > mintime &&
+ txg_sync_waiting(scn->scn_dp)) ||
+ spa_shutting_down(scn->scn_dp->dp_spa)) {
+ if (zb) {
+ dprintf("pausing at bookmark %llx/%llx/%llx/%llx\n",
+ (longlong_t)zb->zb_objset,
+ (longlong_t)zb->zb_object,
+ (longlong_t)zb->zb_level,
+ (longlong_t)zb->zb_blkid);
+ scn->scn_phys.scn_bookmark = *zb;
+ }
+ dprintf("pausing at DDT bookmark %llx/%llx/%llx/%llx\n",
+ (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_class,
+ (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_type,
+ (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_checksum,
+ (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_cursor);
+ scn->scn_pausing = B_TRUE;
+ return (B_TRUE);
+ }
+ return (B_FALSE);
+ }
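
dsl_scan_check_pause() is a time-budget check: pause when the sync has
run longer than zfs_txg_timeout seconds outright, or longer than the
per-scan minimum while a txg sync is waiting, or the pool is shutting
down. Note the units: elapsed_nanosecs / NANOSEC is seconds, while
elapsed_nanosecs / MICROSEC is milliseconds, matching the *_min_time_ms
tunables. A standalone sketch of the same predicate (constants and
names are stand-ins):

    #include <stdio.h>
    #include <stdint.h>

    #define NANOSEC  1000000000LL
    #define MICROSEC 1000000LL

    /* elapsed is nanoseconds; /NANOSEC -> sec, /MICROSEC -> msec. */
    static int
    should_pause(int64_t elapsed_ns, int txg_timeout_sec, int min_ms,
        int sync_waiting, int shutting_down)
    {
            return (elapsed_ns / NANOSEC > txg_timeout_sec ||
                (elapsed_ns / MICROSEC > min_ms && sync_waiting) ||
                shutting_down);
    }

    int
    main(void)
    {
            /*
             * 1.5s elapsed, 5s txg timeout, 1000ms budget, sync
             * waiting: over budget, so the scan should pause.
             */
            printf("pause? %d\n",
                should_pause(1500LL * MICROSEC, 5, 1000, 1, 0));
            return (0);
    }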
+
+ typedef struct zil_scan_arg {
+ dsl_pool_t *zsa_dp;
+ zil_header_t *zsa_zh;
+ } zil_scan_arg_t;
+
+ /* ARGSUSED */
+ static int
+ dsl_scan_zil_block(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg)
+ {
+ zil_scan_arg_t *zsa = arg;
+ dsl_pool_t *dp = zsa->zsa_dp;
+ dsl_scan_t *scn = dp->dp_scan;
+ zil_header_t *zh = zsa->zsa_zh;
+ zbookmark_t zb;
+
+ if (bp->blk_birth <= scn->scn_phys.scn_cur_min_txg)
+ return (0);
+
+ /*
+ * One block ("stubby") can be allocated a long time ago; we
+ * want to visit that one because it has been allocated
+ * (on-disk) even if it hasn't been claimed (even though for
+ * scrub there's nothing to do to it).
+ */
+ if (claim_txg == 0 && bp->blk_birth >= spa_first_txg(dp->dp_spa))
+ return (0);
+
+ SET_BOOKMARK(&zb, zh->zh_log.blk_cksum.zc_word[ZIL_ZC_OBJSET],
+ ZB_ZIL_OBJECT, ZB_ZIL_LEVEL, bp->blk_cksum.zc_word[ZIL_ZC_SEQ]);
+
+ VERIFY(0 == scan_funcs[scn->scn_phys.scn_func](dp, bp, &zb));
+ return (0);
+ }
+
+ /* ARGSUSED */
+ static int
+ dsl_scan_zil_record(zilog_t *zilog, lr_t *lrc, void *arg, uint64_t claim_txg)
+ {
+ if (lrc->lrc_txtype == TX_WRITE) {
+ zil_scan_arg_t *zsa = arg;
+ dsl_pool_t *dp = zsa->zsa_dp;
+ dsl_scan_t *scn = dp->dp_scan;
+ zil_header_t *zh = zsa->zsa_zh;
+ lr_write_t *lr = (lr_write_t *)lrc;
+ blkptr_t *bp = &lr->lr_blkptr;
+ zbookmark_t zb;
+
+ if (bp->blk_birth <= scn->scn_phys.scn_cur_min_txg)
+ return (0);
+
+ /*
+ * birth can be < claim_txg if this record's txg is
+ * already txg sync'ed (but this log block contains
+ * other records that are not synced)
+ */
+ if (claim_txg == 0 || bp->blk_birth < claim_txg)
+ return (0);
+
+ SET_BOOKMARK(&zb, zh->zh_log.blk_cksum.zc_word[ZIL_ZC_OBJSET],
+ lr->lr_foid, ZB_ZIL_LEVEL,
+ lr->lr_offset / BP_GET_LSIZE(bp));
+
+ VERIFY(0 == scan_funcs[scn->scn_phys.scn_func](dp, bp, &zb));
+ }
+ return (0);
+ }
+
+ static void
+ dsl_scan_zil(dsl_pool_t *dp, zil_header_t *zh)
+ {
+ uint64_t claim_txg = zh->zh_claim_txg;
+ zil_scan_arg_t zsa = { dp, zh };
+ zilog_t *zilog;
+
+ /*
+ * We only want to visit blocks that have been claimed but not yet
+ * replayed (or, in read-only mode, blocks that *would* be claimed).
+ */
+ if (claim_txg == 0 && spa_writeable(dp->dp_spa))
+ return;
+
+ zilog = zil_alloc(dp->dp_meta_objset, zh);
+
+ (void) zil_parse(zilog, dsl_scan_zil_block, dsl_scan_zil_record, &zsa,
+ claim_txg);
+
+ zil_free(zilog);
+ }
+
+ /* ARGSUSED */
+ static void
+ dsl_scan_prefetch(dsl_scan_t *scn, arc_buf_t *buf, blkptr_t *bp,
+ uint64_t objset, uint64_t object, uint64_t blkid)
+ {
+ zbookmark_t czb;
+ uint32_t flags = ARC_NOWAIT | ARC_PREFETCH;
+
+ if (zfs_no_scrub_prefetch)
+ return;
+
+ if (BP_IS_HOLE(bp) || bp->blk_birth <= scn->scn_phys.scn_min_txg ||
+ (BP_GET_LEVEL(bp) == 0 && BP_GET_TYPE(bp) != DMU_OT_DNODE))
+ return;
+
+ SET_BOOKMARK(&czb, objset, object, BP_GET_LEVEL(bp), blkid);
+
+ /*
+ * XXX need to make sure all of these arc_read() prefetches are
+ * done before setting xlateall (similar to dsl_read())
+ */
+ (void) arc_read(scn->scn_zio_root, scn->scn_dp->dp_spa, bp,
+ buf, NULL, NULL, ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL,
+ &flags, &czb);
+ }
+
+ static boolean_t
+ dsl_scan_check_resume(dsl_scan_t *scn, const dnode_phys_t *dnp,
+ const zbookmark_t *zb)
+ {
+ /*
+ * We never skip over user/group accounting objects (obj<0)
+ */
+ if (!bookmark_is_zero(&scn->scn_phys.scn_bookmark) &&
+ (int64_t)zb->zb_object >= 0) {
+ /*
+ * If we already visited this bp & everything below (in
+ * a prior txg sync), don't bother doing it again.
+ */
+ if (bookmark_is_before(dnp, zb, &scn->scn_phys.scn_bookmark))
+ return (B_TRUE);
+
+ /*
+ * If we found the block we're trying to resume from, or
+ * we went past it to a different object, zero it out to
+ * indicate that it's OK to start checking for pausing
+ * again.
+ */
+ if (bcmp(zb, &scn->scn_phys.scn_bookmark, sizeof (*zb)) == 0 ||
+ zb->zb_object > scn->scn_phys.scn_bookmark.zb_object) {
+ dprintf("resuming at %llx/%llx/%llx/%llx\n",
+ (longlong_t)zb->zb_objset,
+ (longlong_t)zb->zb_object,
+ (longlong_t)zb->zb_level,
+ (longlong_t)zb->zb_blkid);
+ bzero(&scn->scn_phys.scn_bookmark, sizeof (*zb));
+ }
+ }
+ return (B_FALSE);
+ }
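
dsl_scan_check_resume() is the mirror image of the pause check: while a
saved bookmark is nonzero, every position strictly before it is skipped
as already visited, and reaching (or passing) the bookmark zeroes it so
pausing may be considered again. A toy resume over a flat sequence of
positions (sketch only; the real comparison is the multi-level
bookmark_is_before() above):

    #include <stdio.h>

    static int bookmark = 6;        /* saved when we last "paused" */

    /* Return 1 if this position was covered by a previous pass. */
    static int
    check_resume(int pos)
    {
            if (bookmark != 0) {
                    if (pos < bookmark)
                            return (1);     /* skip: already done */
                    bookmark = 0;           /* reached it: resume */
            }
            return (0);
    }

    int
    main(void)
    {
            int pos;

            for (pos = 0; pos < 10; pos++) {
                    if (check_resume(pos))
                            continue;
                    printf("visit %d\n", pos);      /* prints 6..9 */
            }
            return (0);
    }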
+
+ /*
+ * Return nonzero on i/o error.
+ * Return new buf to write out in *bufp.
+ */
+ static int
+ dsl_scan_recurse(dsl_scan_t *scn, dsl_dataset_t *ds, dmu_objset_type_t ostype,
+ dnode_phys_t *dnp, const blkptr_t *bp,
+ const zbookmark_t *zb, dmu_tx_t *tx, arc_buf_t **bufp)
+ {
+ dsl_pool_t *dp = scn->scn_dp;
+ int err;
+
+ if (BP_GET_LEVEL(bp) > 0) {
+ uint32_t flags = ARC_WAIT;
+ int i;
+ blkptr_t *cbp;
+ int epb = BP_GET_LSIZE(bp) >> SPA_BLKPTRSHIFT;
+
+ err = arc_read_nolock(NULL, dp->dp_spa, bp,
+ arc_getbuf_func, bufp,
+ ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb);
+ if (err) {
+ scn->scn_phys.scn_errors++;
+ return (err);
+ }
+ for (i = 0, cbp = (*bufp)->b_data; i < epb; i++, cbp++) {
+ dsl_scan_prefetch(scn, *bufp, cbp, zb->zb_objset,
+ zb->zb_object, zb->zb_blkid * epb + i);
+ }
+ for (i = 0, cbp = (*bufp)->b_data; i < epb; i++, cbp++) {
+ zbookmark_t czb;
+
+ SET_BOOKMARK(&czb, zb->zb_objset, zb->zb_object,
+ zb->zb_level - 1,
+ zb->zb_blkid * epb + i);
+ dsl_scan_visitbp(cbp, &czb, dnp,
+ *bufp, ds, scn, ostype, tx);
+ }
+ } else if (BP_GET_TYPE(bp) == DMU_OT_USERGROUP_USED) {
+ uint32_t flags = ARC_WAIT;
+
+ err = arc_read_nolock(NULL, dp->dp_spa, bp,
+ arc_getbuf_func, bufp,
+ ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb);
+ if (err) {
+ scn->scn_phys.scn_errors++;
+ return (err);
+ }
+ } else if (BP_GET_TYPE(bp) == DMU_OT_DNODE) {
+ uint32_t flags = ARC_WAIT;
+ dnode_phys_t *cdnp;
+ int i, j;
+ int epb = BP_GET_LSIZE(bp) >> DNODE_SHIFT;
+
+ err = arc_read_nolock(NULL, dp->dp_spa, bp,
+ arc_getbuf_func, bufp,
+ ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb);
+ if (err) {
+ scn->scn_phys.scn_errors++;
+ return (err);
+ }
+ for (i = 0, cdnp = (*bufp)->b_data; i < epb; i++, cdnp++) {
+ for (j = 0; j < cdnp->dn_nblkptr; j++) {
+ blkptr_t *cbp = &cdnp->dn_blkptr[j];
+ dsl_scan_prefetch(scn, *bufp, cbp,
+ zb->zb_objset, zb->zb_blkid * epb + i, j);
+ }
+ }
+ for (i = 0, cdnp = (*bufp)->b_data; i < epb; i++, cdnp++) {
+ dsl_scan_visitdnode(scn, ds, ostype,
+ cdnp, *bufp, zb->zb_blkid * epb + i, tx);
+ }
+
+ } else if (BP_GET_TYPE(bp) == DMU_OT_OBJSET) {
+ uint32_t flags = ARC_WAIT;
+ objset_phys_t *osp;
+
+ err = arc_read_nolock(NULL, dp->dp_spa, bp,
+ arc_getbuf_func, bufp,
+ ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb);
+ if (err) {
+ scn->scn_phys.scn_errors++;
+ return (err);
+ }
+
+ osp = (*bufp)->b_data;
+
+ if (DSL_SCAN_IS_SCRUB_RESILVER(scn))
+ dsl_scan_zil(dp, &osp->os_zil_header);
+
+ dsl_scan_visitdnode(scn, ds, osp->os_type,
+ &osp->os_meta_dnode, *bufp, DMU_META_DNODE_OBJECT, tx);
+
+ if (OBJSET_BUF_HAS_USERUSED(*bufp)) {
+ /*
+ * We also always visit user/group accounting
+ * objects, and never skip them, even if we are
+ * pausing. This is necessary so that the space
+ * deltas from this txg get integrated.
+ */
+ dsl_scan_visitdnode(scn, ds, osp->os_type,
+ &osp->os_groupused_dnode, *bufp,
+ DMU_GROUPUSED_OBJECT, tx);
+ dsl_scan_visitdnode(scn, ds, osp->os_type,
+ &osp->os_userused_dnode, *bufp,
+ DMU_USERUSED_OBJECT, tx);
+ }
+ }
+
+ return (0);
+ }
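
The epb ("entries per block") values in dsl_scan_recurse() fall
straight out of shifts: an indirect block holds LSIZE >>
SPA_BLKPTRSHIFT block pointers (128 bytes each), a dnode block holds
LSIZE >> DNODE_SHIFT dnodes (512 bytes each), and child i of parent
blkid b has id b * epb + i at the next level down. The arithmetic,
worked for 128K metadata blocks:

    #include <stdio.h>
    #include <stdint.h>

    #define SPA_BLKPTRSHIFT 7       /* sizeof (blkptr_t) == 128 */
    #define DNODE_SHIFT     9       /* sizeof (dnode_phys_t) == 512 */

    int
    main(void)
    {
            uint64_t lsize = 128 * 1024;    /* 128K block */
            uint64_t epb_bp = lsize >> SPA_BLKPTRSHIFT;  /* 1024 */
            uint64_t epb_dn = lsize >> DNODE_SHIFT;      /* 256 */
            uint64_t blkid = 5, i = 17;

            printf("blkptrs/indirect: %llu, dnodes/block: %llu\n",
                (unsigned long long)epb_bp, (unsigned long long)epb_dn);
            /* Child 17 of indirect blkid 5 maps to blkid 5*1024+17. */
            printf("child blkid: %llu\n",
                (unsigned long long)(blkid * epb_bp + i));
            return (0);
    }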
+
+ static void
+ dsl_scan_visitdnode(dsl_scan_t *scn, dsl_dataset_t *ds,
+ dmu_objset_type_t ostype, dnode_phys_t *dnp, arc_buf_t *buf,
+ uint64_t object, dmu_tx_t *tx)
+ {
+ int j;
+
+ for (j = 0; j < dnp->dn_nblkptr; j++) {
+ zbookmark_t czb;
+
+ SET_BOOKMARK(&czb, ds ? ds->ds_object : 0, object,
+ dnp->dn_nlevels - 1, j);
+ dsl_scan_visitbp(&dnp->dn_blkptr[j],
+ &czb, dnp, buf, ds, scn, ostype, tx);
+ }
+
+ if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) {
+ zbookmark_t czb;
+ SET_BOOKMARK(&czb, ds ? ds->ds_object : 0, object,
+ 0, DMU_SPILL_BLKID);
+ dsl_scan_visitbp(&dnp->dn_spill,
+ &czb, dnp, buf, ds, scn, ostype, tx);
+ }
+ }
+
+ /*
+ * The arguments are in this order because mdb can only print the
+ * first 5; we want them to be useful.
+ */
+ static void
+ dsl_scan_visitbp(blkptr_t *bp, const zbookmark_t *zb,
+ dnode_phys_t *dnp, arc_buf_t *pbuf,
+ dsl_dataset_t *ds, dsl_scan_t *scn, dmu_objset_type_t ostype,
+ dmu_tx_t *tx)
+ {
+ dsl_pool_t *dp = scn->scn_dp;
+ arc_buf_t *buf = NULL;
+ blkptr_t bp_toread = *bp;
+
+ /* ASSERT(pbuf == NULL || arc_released(pbuf)); */
+
+ if (dsl_scan_check_pause(scn, zb))
+ return;
+
+ if (dsl_scan_check_resume(scn, dnp, zb))
+ return;
+
+ if (bp->blk_birth == 0)
+ return;
+
+ scn->scn_visited_this_txg++;
+
+ dprintf_bp(bp,
+ "visiting ds=%p/%llu zb=%llx/%llx/%llx/%llx buf=%p bp=%p",
+ ds, ds ? ds->ds_object : 0,
+ zb->zb_objset, zb->zb_object, zb->zb_level, zb->zb_blkid,
+ pbuf, bp);
+
+ if (bp->blk_birth <= scn->scn_phys.scn_cur_min_txg)
+ return;
+
+ if (BP_GET_TYPE(bp) != DMU_OT_USERGROUP_USED) {
+ /*
+ * For non-user-accounting blocks, we need to read the
+ * new bp (from a deleted snapshot, found in
+ * check_existing_xlation). If we used the old bp,
+ * pointers inside this block from before we resumed
+ * would be untranslated.
+ *
+ * For user-accounting blocks, we need to read the old
+ * bp, because we will apply the entire space delta to
+ * it (original untranslated -> translations from
+ * deleted snap -> now).
+ */
+ bp_toread = *bp;
+ }
+
+ if (dsl_scan_recurse(scn, ds, ostype, dnp, &bp_toread, zb, tx,
+ &buf) != 0)
+ return;
+
+ /*
+ * If dsl_scan_ddt() has already visited this block, it will have
+ * already done any translations or scrubbing, so don't call the
+ * callback again.
+ */
+ if (ddt_class_contains(dp->dp_spa,
+ scn->scn_phys.scn_ddt_class_max, bp)) {
+ ASSERT(buf == NULL);
+ return;
+ }
+
+ /*
+ * If this block is from the future (after cur_max_txg), then we
+ * are doing this on behalf of a deleted snapshot, and we will
+ * revisit the future block on the next pass of this dataset.
+ * Don't scan it now unless we need to because something
+ * under it was modified.
+ */
+ if (bp->blk_birth <= scn->scn_phys.scn_cur_max_txg) {
+ scan_funcs[scn->scn_phys.scn_func](dp, bp, zb);
+ }
+ if (buf)
+ (void) arc_buf_remove_ref(buf, &buf);
+ }
+
+ static void
+ dsl_scan_visit_rootbp(dsl_scan_t *scn, dsl_dataset_t *ds, blkptr_t *bp,
+ dmu_tx_t *tx)
+ {
+ zbookmark_t zb;
+
+ SET_BOOKMARK(&zb, ds ? ds->ds_object : DMU_META_OBJSET,
+ ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID);
+ dsl_scan_visitbp(bp, &zb, NULL, NULL,
+ ds, scn, DMU_OST_NONE, tx);
+
+ dprintf_ds(ds, "finished scan%s", "");
+ }
+
+ void
+ dsl_scan_ds_destroyed(dsl_dataset_t *ds, dmu_tx_t *tx)
+ {
+ dsl_pool_t *dp = ds->ds_dir->dd_pool;
+ dsl_scan_t *scn = dp->dp_scan;
+ uint64_t mintxg;
+
+ if (scn->scn_phys.scn_state != DSS_SCANNING)
+ return;
+
+ if (scn->scn_phys.scn_bookmark.zb_objset == ds->ds_object) {
+ if (dsl_dataset_is_snapshot(ds)) {
+ /* Note, scn_cur_{min,max}_txg stays the same. */
+ scn->scn_phys.scn_bookmark.zb_objset =
+ ds->ds_phys->ds_next_snap_obj;
+ zfs_dbgmsg("destroying ds %llu; currently traversing; "
+ "reset zb_objset to %llu",
+ (u_longlong_t)ds->ds_object,
+ (u_longlong_t)ds->ds_phys->ds_next_snap_obj);
+ scn->scn_phys.scn_flags |= DSF_VISIT_DS_AGAIN;
+ } else {
+ SET_BOOKMARK(&scn->scn_phys.scn_bookmark,
+ ZB_DESTROYED_OBJSET, 0, 0, 0);
+ zfs_dbgmsg("destroying ds %llu; currently traversing; "
+ "reset bookmark to -1,0,0,0",
+ (u_longlong_t)ds->ds_object);
+ }
+ } else if (zap_lookup_int_key(dp->dp_meta_objset,
+ scn->scn_phys.scn_queue_obj, ds->ds_object, &mintxg) == 0) {
+ ASSERT3U(ds->ds_phys->ds_num_children, <=, 1);
+ VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset,
+ scn->scn_phys.scn_queue_obj, ds->ds_object, tx));
+ if (dsl_dataset_is_snapshot(ds)) {
+ /*
+ * We keep the same mintxg; it could be >
+ * ds_creation_txg if the previous snapshot was
+ * deleted too.
+ */
+ VERIFY(zap_add_int_key(dp->dp_meta_objset,
+ scn->scn_phys.scn_queue_obj,
+ ds->ds_phys->ds_next_snap_obj, mintxg, tx) == 0);
+ zfs_dbgmsg("destroying ds %llu; in queue; "
+ "replacing with %llu",
+ (u_longlong_t)ds->ds_object,
+ (u_longlong_t)ds->ds_phys->ds_next_snap_obj);
+ } else {
+ zfs_dbgmsg("destroying ds %llu; in queue; removing",
+ (u_longlong_t)ds->ds_object);
+ }
+ } else {
+ zfs_dbgmsg("destroying ds %llu; ignoring",
+ (u_longlong_t)ds->ds_object);
+ }
+
+ /*
+ * dsl_scan_sync() should be called after this, and should sync
+ * out our changed state, but just to be safe, do it here.
+ */
+ dsl_scan_sync_state(scn, tx);
+ }
+
+ void
+ dsl_scan_ds_snapshotted(dsl_dataset_t *ds, dmu_tx_t *tx)
+ {
+ dsl_pool_t *dp = ds->ds_dir->dd_pool;
+ dsl_scan_t *scn = dp->dp_scan;
+ uint64_t mintxg;
+
+ if (scn->scn_phys.scn_state != DSS_SCANNING)
+ return;
+
+ ASSERT(ds->ds_phys->ds_prev_snap_obj != 0);
+
+ if (scn->scn_phys.scn_bookmark.zb_objset == ds->ds_object) {
+ scn->scn_phys.scn_bookmark.zb_objset =
+ ds->ds_phys->ds_prev_snap_obj;
+ zfs_dbgmsg("snapshotting ds %llu; currently traversing; "
+ "reset zb_objset to %llu",
+ (u_longlong_t)ds->ds_object,
+ (u_longlong_t)ds->ds_phys->ds_prev_snap_obj);
+ } else if (zap_lookup_int_key(dp->dp_meta_objset,
+ scn->scn_phys.scn_queue_obj, ds->ds_object, &mintxg) == 0) {
+ VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset,
+ scn->scn_phys.scn_queue_obj, ds->ds_object, tx));
+ VERIFY(zap_add_int_key(dp->dp_meta_objset,
+ scn->scn_phys.scn_queue_obj,
+ ds->ds_phys->ds_prev_snap_obj, mintxg, tx) == 0);
+ zfs_dbgmsg("snapshotting ds %llu; in queue; "
+ "replacing with %llu",
+ (u_longlong_t)ds->ds_object,
+ (u_longlong_t)ds->ds_phys->ds_prev_snap_obj);
+ }
+ dsl_scan_sync_state(scn, tx);
+ }
+
+ void
+ dsl_scan_ds_clone_swapped(dsl_dataset_t *ds1, dsl_dataset_t *ds2, dmu_tx_t *tx)
+ {
+ dsl_pool_t *dp = ds1->ds_dir->dd_pool;
+ dsl_scan_t *scn = dp->dp_scan;
+ uint64_t mintxg;
+
+ if (scn->scn_phys.scn_state != DSS_SCANNING)
+ return;
+
+ if (scn->scn_phys.scn_bookmark.zb_objset == ds1->ds_object) {
+ scn->scn_phys.scn_bookmark.zb_objset = ds2->ds_object;
+ zfs_dbgmsg("clone_swap ds %llu; currently traversing; "
+ "reset zb_objset to %llu",
+ (u_longlong_t)ds1->ds_object,
+ (u_longlong_t)ds2->ds_object);
+ } else if (scn->scn_phys.scn_bookmark.zb_objset == ds2->ds_object) {
+ scn->scn_phys.scn_bookmark.zb_objset = ds1->ds_object;
+ zfs_dbgmsg("clone_swap ds %llu; currently traversing; "
+ "reset zb_objset to %llu",
+ (u_longlong_t)ds2->ds_object,
+ (u_longlong_t)ds1->ds_object);
+ }
+
+ if (zap_lookup_int_key(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj,
+ ds1->ds_object, &mintxg) == 0) {
+ int err;
+
+ ASSERT3U(mintxg, ==, ds1->ds_phys->ds_prev_snap_txg);
+ ASSERT3U(mintxg, ==, ds2->ds_phys->ds_prev_snap_txg);
+ VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset,
+ scn->scn_phys.scn_queue_obj, ds1->ds_object, tx));
+ err = zap_add_int_key(dp->dp_meta_objset,
+ scn->scn_phys.scn_queue_obj, ds2->ds_object, mintxg, tx);
+ VERIFY(err == 0 || err == EEXIST);
+ if (err == EEXIST) {
+ /* Both were there to begin with */
+ VERIFY(0 == zap_add_int_key(dp->dp_meta_objset,
+ scn->scn_phys.scn_queue_obj,
+ ds1->ds_object, mintxg, tx));
+ }
+ zfs_dbgmsg("clone_swap ds %llu; in queue; "
+ "replacing with %llu",
+ (u_longlong_t)ds1->ds_object,
+ (u_longlong_t)ds2->ds_object);
+ } else if (zap_lookup_int_key(dp->dp_meta_objset,
+ scn->scn_phys.scn_queue_obj, ds2->ds_object, &mintxg) == 0) {
+ ASSERT3U(mintxg, ==, ds1->ds_phys->ds_prev_snap_txg);
+ ASSERT3U(mintxg, ==, ds2->ds_phys->ds_prev_snap_txg);
+ VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset,
+ scn->scn_phys.scn_queue_obj, ds2->ds_object, tx));
+ VERIFY(0 == zap_add_int_key(dp->dp_meta_objset,
+ scn->scn_phys.scn_queue_obj, ds1->ds_object, mintxg, tx));
+ zfs_dbgmsg("clone_swap ds %llu; in queue; "
+ "replacing with %llu",
+ (u_longlong_t)ds2->ds_object,
+ (u_longlong_t)ds1->ds_object);
+ }
+
+ dsl_scan_sync_state(scn, tx);
+ }
+
+ struct enqueue_clones_arg {
+ dmu_tx_t *tx;
+ uint64_t originobj;
+ };
+
+ /* ARGSUSED */
+ static int
+ enqueue_clones_cb(spa_t *spa, uint64_t dsobj, const char *dsname, void *arg)
+ {
+ struct enqueue_clones_arg *eca = arg;
+ dsl_dataset_t *ds;
+ int err;
+ dsl_pool_t *dp = spa->spa_dsl_pool;
+ dsl_scan_t *scn = dp->dp_scan;
+
+ err = dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds);
+ if (err)
+ return (err);
+
+ if (ds->ds_dir->dd_phys->dd_origin_obj == eca->originobj) {
+ while (ds->ds_phys->ds_prev_snap_obj != eca->originobj) {
+ dsl_dataset_t *prev;
+ err = dsl_dataset_hold_obj(dp,
+ ds->ds_phys->ds_prev_snap_obj, FTAG, &prev);
+
+ dsl_dataset_rele(ds, FTAG);
+ if (err)
+ return (err);
+ ds = prev;
+ }
+ VERIFY(zap_add_int_key(dp->dp_meta_objset,
+ scn->scn_phys.scn_queue_obj, ds->ds_object,
+ ds->ds_phys->ds_prev_snap_txg, eca->tx) == 0);
+ }
+ dsl_dataset_rele(ds, FTAG);
+ return (0);
+ }
+
+ static void
+ dsl_scan_visitds(dsl_scan_t *scn, uint64_t dsobj, dmu_tx_t *tx)
+ {
+ dsl_pool_t *dp = scn->scn_dp;
+ dsl_dataset_t *ds;
+
+ VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds));
+
+ /*
+ * Iterate over the bps in this ds.
+ */
+ dmu_buf_will_dirty(ds->ds_dbuf, tx);
+ dsl_scan_visit_rootbp(scn, ds, &ds->ds_phys->ds_bp, tx);
+
+ char *dsname = kmem_alloc(ZFS_MAXNAMELEN, KM_SLEEP);
+ dsl_dataset_name(ds, dsname);
+ zfs_dbgmsg("scanned dataset %llu (%s) with min=%llu max=%llu; "
+ "pausing=%u",
+ (longlong_t)dsobj, dsname,
+ (longlong_t)scn->scn_phys.scn_cur_min_txg,
+ (longlong_t)scn->scn_phys.scn_cur_max_txg,
+ (int)scn->scn_pausing);
+ kmem_free(dsname, ZFS_MAXNAMELEN);
+
+ if (scn->scn_pausing)
+ goto out;
+
+ /*
+ * We've finished this pass over this dataset.
+ */
+
+ /*
+ * If we did not completely visit this dataset, do another pass.
+ */
+ if (scn->scn_phys.scn_flags & DSF_VISIT_DS_AGAIN) {
+ zfs_dbgmsg("incomplete pass; visiting again");
+ scn->scn_phys.scn_flags &= ~DSF_VISIT_DS_AGAIN;
+ VERIFY(zap_add_int_key(dp->dp_meta_objset,
+ scn->scn_phys.scn_queue_obj, ds->ds_object,
+ scn->scn_phys.scn_cur_max_txg, tx) == 0);
+ goto out;
+ }
+
+ /*
+ * Add descendent datasets to work queue.
+ */
+ if (ds->ds_phys->ds_next_snap_obj != 0) {
+ VERIFY(zap_add_int_key(dp->dp_meta_objset,
+ scn->scn_phys.scn_queue_obj, ds->ds_phys->ds_next_snap_obj,
+ ds->ds_phys->ds_creation_txg, tx) == 0);
+ }
+ if (ds->ds_phys->ds_num_children > 1) {
+ boolean_t usenext = B_FALSE;
+ if (ds->ds_phys->ds_next_clones_obj != 0) {
+ uint64_t count;
+ /*
+ * A bug in a previous version of the code could
+ * cause upgrade_clones_cb() to not set
+ * ds_next_snap_obj when it should, leading to a
+ * missing entry. Therefore we can only use the
+ * next_clones_obj when its count is correct.
+ */
+ int err = zap_count(dp->dp_meta_objset,
+ ds->ds_phys->ds_next_clones_obj, &count);
+ if (err == 0 &&
+ count == ds->ds_phys->ds_num_children - 1)
+ usenext = B_TRUE;
+ }
+
+ if (usenext) {
+ VERIFY(zap_join_key(dp->dp_meta_objset,
+ ds->ds_phys->ds_next_clones_obj,
+ scn->scn_phys.scn_queue_obj,
+ ds->ds_phys->ds_creation_txg, tx) == 0);
+ } else {
+ struct enqueue_clones_arg eca;
+ eca.tx = tx;
+ eca.originobj = ds->ds_object;
+
+ (void) dmu_objset_find_spa(ds->ds_dir->dd_pool->dp_spa,
+ NULL, enqueue_clones_cb, &eca, DS_FIND_CHILDREN);
+ }
+ }
+
+ out:
+ dsl_dataset_rele(ds, FTAG);
+ }
+
+ /* ARGSUSED */
+ static int
+ enqueue_cb(spa_t *spa, uint64_t dsobj, const char *dsname, void *arg)
+ {
+ dmu_tx_t *tx = arg;
+ dsl_dataset_t *ds;
+ int err;
+ dsl_pool_t *dp = spa->spa_dsl_pool;
+ dsl_scan_t *scn = dp->dp_scan;
+
+ err = dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds);
+ if (err)
+ return (err);
+
+ while (ds->ds_phys->ds_prev_snap_obj != 0) {
+ dsl_dataset_t *prev;
+ err = dsl_dataset_hold_obj(dp, ds->ds_phys->ds_prev_snap_obj,
+ FTAG, &prev);
+ if (err) {
+ dsl_dataset_rele(ds, FTAG);
+ return (err);
+ }
+
+ /*
+ * If this is a clone, we don't need to worry about it for now.
+ */
+ if (prev->ds_phys->ds_next_snap_obj != ds->ds_object) {
+ dsl_dataset_rele(ds, FTAG);
+ dsl_dataset_rele(prev, FTAG);
+ return (0);
+ }
+ dsl_dataset_rele(ds, FTAG);
+ ds = prev;
+ }
+
+ VERIFY(zap_add_int_key(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj,
+ ds->ds_object, ds->ds_phys->ds_prev_snap_txg, tx) == 0);
+ dsl_dataset_rele(ds, FTAG);
+ return (0);
+ }
+
+ /*
+ * Scrub/dedup interaction.
+ *
+ * If there are N references to a deduped block, we don't want to scrub it
+ * N times -- ideally, we should scrub it exactly once.
+ *
+ * We leverage the fact that the dde's replication class (enum ddt_class)
+ * is ordered from highest replication class (DDT_CLASS_DITTO) to lowest
+ * (DDT_CLASS_UNIQUE) so that we may walk the DDT in that order.
+ *
+ * To prevent excess scrubbing, the scrub begins by walking the DDT
+ * to find all blocks with refcnt > 1, and scrubs each of these once.
+ * Since there are two replication classes which contain blocks with
+ * refcnt > 1, we scrub the highest replication class (DDT_CLASS_DITTO) first.
+ * Finally the top-down scrub begins, only visiting blocks with refcnt == 1.
+ *
+ * There would be nothing more to say if a block's refcnt couldn't change
+ * during a scrub, but of course it can so we must account for changes
+ * in a block's replication class.
+ *
+ * Here's an example of what can occur:
+ *
+ * If a block has refcnt > 1 during the DDT scrub phase, but has refcnt == 1
+ * when visited during the top-down scrub phase, it will be scrubbed twice.
+ * This negates our scrub optimization, but is otherwise harmless.
+ *
+ * If a block has refcnt == 1 during the DDT scrub phase, but has refcnt > 1
+ * on each visit during the top-down scrub phase, it will never be scrubbed.
+ * To catch this, ddt_sync_entry() notifies the scrub code whenever a block's
+ * reference class transitions to a higher level (i.e., DDT_CLASS_UNIQUE to
+ * DDT_CLASS_DUPLICATE); if it transitions from refcnt == 1 to refcnt > 1
+ * while a scrub is in progress, it scrubs the block right then.
+ */
+ static void
+ dsl_scan_ddt(dsl_scan_t *scn, dmu_tx_t *tx)
+ {
+ ddt_bookmark_t *ddb = &scn->scn_phys.scn_ddt_bookmark;
+ ddt_entry_t dde = { 0 };
+ int error;
+ uint64_t n = 0;
+
+ while ((error = ddt_walk(scn->scn_dp->dp_spa, ddb, &dde)) == 0) {
+ ddt_t *ddt;
+
+ if (ddb->ddb_class > scn->scn_phys.scn_ddt_class_max)
+ break;
+ dprintf("visiting ddb=%llu/%llu/%llu/%llx\n",
+ (longlong_t)ddb->ddb_class,
+ (longlong_t)ddb->ddb_type,
+ (longlong_t)ddb->ddb_checksum,
+ (longlong_t)ddb->ddb_cursor);
+
+ /* There should be no pending changes to the dedup table */
+ ddt = scn->scn_dp->dp_spa->spa_ddt[ddb->ddb_checksum];
+ ASSERT(avl_first(&ddt->ddt_tree) == NULL);
+
+ dsl_scan_ddt_entry(scn, ddb->ddb_checksum, &dde, tx);
+ n++;
+
+ if (dsl_scan_check_pause(scn, NULL))
+ break;
+ }
+
+ zfs_dbgmsg("scanned %llu ddt entries with class_max = %u; pausing=%u",
+ (longlong_t)n, (int)scn->scn_phys.scn_ddt_class_max,
+ (int)scn->scn_pausing);
+
+ ASSERT(error == 0 || error == ENOENT);
+ ASSERT(error != ENOENT ||
+ ddb->ddb_class > scn->scn_phys.scn_ddt_class_max);
+ }
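
The block comment above dsl_scan_ddt() boils down to a visit-once
invariant: phase one walks the DDT in replication-class order and
scrubs every refcnt > 1 block exactly once; phase two walks the pool
top-down and, via ddt_class_contains(), skips whatever phase one
already covered. A toy model of that bookkeeping (pure illustration:
five blocks carrying eight references need only five scrub visits):

    #include <stdio.h>

    typedef struct blk { int refcnt; int scrubbed; } blk_t;

    int
    main(void)
    {
            blk_t pool[5] = { {3, 0}, {1, 0}, {2, 0}, {1, 0}, {1, 0} };
            int i, r, visits = 0;

            /* Phase 1: DDT walk scrubs each deduped block once. */
            for (i = 0; i < 5; i++) {
                    if (pool[i].refcnt > 1) {
                            pool[i].scrubbed = 1;
                            visits++;
                    }
            }

            /*
             * Phase 2: the top-down walk reaches each block once per
             * reference, but skips blocks the DDT phase covered (the
             * ddt_class_contains() check in dsl_scan_visitbp()).
             */
            for (i = 0; i < 5; i++) {
                    for (r = 0; r < pool[i].refcnt; r++) {
                            if (pool[i].scrubbed)
                                    continue;
                            pool[i].scrubbed = 1;
                            visits++;
                    }
            }

            printf("5 blocks, 8 references, %d scrub visits\n", visits);
            return (0);
    }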
+
+ /* ARGSUSED */
+ void
+ dsl_scan_ddt_entry(dsl_scan_t *scn, enum zio_checksum checksum,
+ ddt_entry_t *dde, dmu_tx_t *tx)
+ {
+ const ddt_key_t *ddk = &dde->dde_key;
+ ddt_phys_t *ddp = dde->dde_phys;
+ blkptr_t bp;
+ zbookmark_t zb = { 0 };
++ int p;
+
+ if (scn->scn_phys.scn_state != DSS_SCANNING)
+ return;
+
- for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
++ for (p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
+ if (ddp->ddp_phys_birth == 0 ||
+ ddp->ddp_phys_birth > scn->scn_phys.scn_cur_max_txg)
+ continue;
+ ddt_bp_create(checksum, ddk, ddp, &bp);
+
+ scn->scn_visited_this_txg++;
+ scan_funcs[scn->scn_phys.scn_func](scn->scn_dp, &bp, &zb);
+ }
+ }
+
+ static void
+ dsl_scan_visit(dsl_scan_t *scn, dmu_tx_t *tx)
+ {
+ dsl_pool_t *dp = scn->scn_dp;
+ zap_cursor_t zc;
+ zap_attribute_t za;
+
+ if (scn->scn_phys.scn_ddt_bookmark.ddb_class <=
+ scn->scn_phys.scn_ddt_class_max) {
+ scn->scn_phys.scn_cur_min_txg = scn->scn_phys.scn_min_txg;
+ scn->scn_phys.scn_cur_max_txg = scn->scn_phys.scn_max_txg;
+ dsl_scan_ddt(scn, tx);
+ if (scn->scn_pausing)
+ return;
+ }
+
+ if (scn->scn_phys.scn_bookmark.zb_objset == DMU_META_OBJSET) {
+ /* First do the MOS & ORIGIN */
+
+ scn->scn_phys.scn_cur_min_txg = scn->scn_phys.scn_min_txg;
+ scn->scn_phys.scn_cur_max_txg = scn->scn_phys.scn_max_txg;
+ dsl_scan_visit_rootbp(scn, NULL,
+ &dp->dp_meta_rootbp, tx);
+ spa_set_rootblkptr(dp->dp_spa, &dp->dp_meta_rootbp);
+ if (scn->scn_pausing)
+ return;
+
+ if (spa_version(dp->dp_spa) < SPA_VERSION_DSL_SCRUB) {
+ VERIFY(0 == dmu_objset_find_spa(dp->dp_spa,
+ NULL, enqueue_cb, tx, DS_FIND_CHILDREN));
+ } else {
+ dsl_scan_visitds(scn,
+ dp->dp_origin_snap->ds_object, tx);
+ }
+ ASSERT(!scn->scn_pausing);
+ } else if (scn->scn_phys.scn_bookmark.zb_objset !=
+ ZB_DESTROYED_OBJSET) {
+ /*
+ * If we were paused, continue from here. Note if the
+ * ds we were paused on was deleted, the zb_objset may
+ * be -1, so we will skip this and find a new objset
+ * below.
+ */
+ dsl_scan_visitds(scn, scn->scn_phys.scn_bookmark.zb_objset, tx);
+ if (scn->scn_pausing)
+ return;
+ }
+
+ /*
+ * In case we were paused right at the end of the ds, zero the
+ * bookmark so we don't think that we're still trying to resume.
+ */
+ bzero(&scn->scn_phys.scn_bookmark, sizeof (zbookmark_t));
+
+ /* keep pulling things out of the zap-object-as-queue */
+ while (zap_cursor_init(&zc, dp->dp_meta_objset,
+ scn->scn_phys.scn_queue_obj),
+ zap_cursor_retrieve(&zc, &za) == 0) {
+ dsl_dataset_t *ds;
+ uint64_t dsobj;
+
+ dsobj = strtonum(za.za_name, NULL);
+ VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset,
+ scn->scn_phys.scn_queue_obj, dsobj, tx));
+
+ /* Set up min/max txg */
+ VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds));
+ if (za.za_first_integer != 0) {
+ scn->scn_phys.scn_cur_min_txg =
+ MAX(scn->scn_phys.scn_min_txg,
+ za.za_first_integer);
+ } else {
+ scn->scn_phys.scn_cur_min_txg =
+ MAX(scn->scn_phys.scn_min_txg,
+ ds->ds_phys->ds_prev_snap_txg);
+ }
+ scn->scn_phys.scn_cur_max_txg = dsl_scan_ds_maxtxg(ds);
+ dsl_dataset_rele(ds, FTAG);
+
+ dsl_scan_visitds(scn, dsobj, tx);
+ zap_cursor_fini(&zc);
+ if (scn->scn_pausing)
+ return;
+ }
+ zap_cursor_fini(&zc);
+ }
+
+ static int
+ dsl_scan_free_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
+ {
+ dsl_scan_t *scn = arg;
+ uint64_t elapsed_nanosecs;
+
+ elapsed_nanosecs = gethrtime() - scn->scn_sync_start_time;
+
+ if (elapsed_nanosecs / NANOSEC > zfs_txg_timeout ||
+ (elapsed_nanosecs / MICROSEC > zfs_free_min_time_ms &&
+ txg_sync_waiting(scn->scn_dp)) ||
+ spa_shutting_down(scn->scn_dp->dp_spa))
+ return (ERESTART);
+
+ zio_nowait(zio_free_sync(scn->scn_zio_root, scn->scn_dp->dp_spa,
+ dmu_tx_get_txg(tx), bp, 0));
+ dsl_dir_diduse_space(tx->tx_pool->dp_free_dir, DD_USED_HEAD,
+ -bp_get_dsize_sync(scn->scn_dp->dp_spa, bp),
+ -BP_GET_PSIZE(bp), -BP_GET_UCSIZE(bp), tx);
+ scn->scn_visited_this_txg++;
+ return (0);
+ }
+
+ boolean_t
+ dsl_scan_active(dsl_scan_t *scn)
+ {
+ spa_t *spa = scn->scn_dp->dp_spa;
+ uint64_t used = 0, comp, uncomp;
+
+ if (spa->spa_load_state != SPA_LOAD_NONE)
+ return (B_FALSE);
+ if (spa_shutting_down(spa))
+ return (B_FALSE);
+
+ if (scn->scn_phys.scn_state == DSS_SCANNING)
+ return (B_TRUE);
+
+ if (spa_version(scn->scn_dp->dp_spa) >= SPA_VERSION_DEADLISTS) {
+ (void) bpobj_space(&scn->scn_dp->dp_free_bpobj,
+ &used, &comp, &uncomp);
+ }
+ return (used != 0);
+ }
+
+ void
+ dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx)
+ {
+ dsl_scan_t *scn = dp->dp_scan;
+ spa_t *spa = dp->dp_spa;
+ int err;
+
+ /*
+ * Check for scn_restart_txg before checking spa_load_state, so
+ * that we can restart an old-style scan while the pool is being
+ * imported (see dsl_scan_init).
+ */
+ if (scn->scn_restart_txg != 0 &&
+ scn->scn_restart_txg <= tx->tx_txg) {
+ pool_scan_func_t func = POOL_SCAN_SCRUB;
+ dsl_scan_done(scn, B_FALSE, tx);
+ if (vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL))
+ func = POOL_SCAN_RESILVER;
+ zfs_dbgmsg("restarting scan func=%u txg=%llu",
+ func, tx->tx_txg);
+ dsl_scan_setup_sync(scn, &func, tx);
+ }
+
+ if (!dsl_scan_active(scn) ||
+ spa_sync_pass(dp->dp_spa) > 1)
+ return;
+
+ scn->scn_visited_this_txg = 0;
+ scn->scn_pausing = B_FALSE;
+ scn->scn_sync_start_time = gethrtime();
+ spa->spa_scrub_active = B_TRUE;
+
+ /*
+ * First process the free list. If we pause the free, don't do
+ * any scanning. This ensures that there is no free list when
+ * we are scanning, so the scan code doesn't have to worry about
+ * traversing it.
+ */
+ if (spa_version(dp->dp_spa) >= SPA_VERSION_DEADLISTS) {
+ scn->scn_zio_root = zio_root(dp->dp_spa, NULL,
+ NULL, ZIO_FLAG_MUSTSUCCEED);
+ err = bpobj_iterate(&dp->dp_free_bpobj,
+ dsl_scan_free_cb, scn, tx);
+ VERIFY3U(0, ==, zio_wait(scn->scn_zio_root));
+ if (scn->scn_visited_this_txg) {
+ zfs_dbgmsg("freed %llu blocks in %llums from "
+ "free_bpobj txg %llu",
+ (longlong_t)scn->scn_visited_this_txg,
+ (longlong_t)
+ (gethrtime() - scn->scn_sync_start_time) / MICROSEC,
+ (longlong_t)tx->tx_txg);
+ scn->scn_visited_this_txg = 0;
+ /*
+ * Re-sync the ddt so that we can further modify
+ * it when doing bprewrite.
+ */
+ ddt_sync(spa, tx->tx_txg);
+ }
+ if (err == ERESTART)
+ return;
+ }
+
+ if (scn->scn_phys.scn_state != DSS_SCANNING)
+ return;
+
+ if (scn->scn_phys.scn_ddt_bookmark.ddb_class <=
+ scn->scn_phys.scn_ddt_class_max) {
+ zfs_dbgmsg("doing scan sync txg %llu; "
+ "ddt bm=%llu/%llu/%llu/%llx",
+ (longlong_t)tx->tx_txg,
+ (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_class,
+ (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_type,
+ (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_checksum,
+ (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_cursor);
+ ASSERT(scn->scn_phys.scn_bookmark.zb_objset == 0);
+ ASSERT(scn->scn_phys.scn_bookmark.zb_object == 0);
+ ASSERT(scn->scn_phys.scn_bookmark.zb_level == 0);
+ ASSERT(scn->scn_phys.scn_bookmark.zb_blkid == 0);
+ } else {
+ zfs_dbgmsg("doing scan sync txg %llu; bm=%llu/%llu/%llu/%llu",
+ (longlong_t)tx->tx_txg,
+ (longlong_t)scn->scn_phys.scn_bookmark.zb_objset,
+ (longlong_t)scn->scn_phys.scn_bookmark.zb_object,
+ (longlong_t)scn->scn_phys.scn_bookmark.zb_level,
+ (longlong_t)scn->scn_phys.scn_bookmark.zb_blkid);
+ }
+
+ scn->scn_zio_root = zio_root(dp->dp_spa, NULL,
+ NULL, ZIO_FLAG_CANFAIL);
+ dsl_scan_visit(scn, tx);
+ (void) zio_wait(scn->scn_zio_root);
+ scn->scn_zio_root = NULL;
+
+ zfs_dbgmsg("visited %llu blocks in %llums",
+ (longlong_t)scn->scn_visited_this_txg,
+ (longlong_t)(gethrtime() - scn->scn_sync_start_time) / MICROSEC);
+
+ if (!scn->scn_pausing) {
+ /* finished with scan. */
+ zfs_dbgmsg("finished scan txg %llu", (longlong_t)tx->tx_txg);
+ dsl_scan_done(scn, B_TRUE, tx);
+ }
+
+ if (DSL_SCAN_IS_SCRUB_RESILVER(scn)) {
+ mutex_enter(&spa->spa_scrub_lock);
+ while (spa->spa_scrub_inflight > 0) {
+ cv_wait(&spa->spa_scrub_io_cv,
+ &spa->spa_scrub_lock);
+ }
+ mutex_exit(&spa->spa_scrub_lock);
+ }
+
+ dsl_scan_sync_state(scn, tx);
+ }
+
+ /*
+ * This will start a new scan, or restart an existing one.
+ */
+ void
+ dsl_resilver_restart(dsl_pool_t *dp, uint64_t txg)
+ {
+ if (txg == 0) {
+ dmu_tx_t *tx;
+ tx = dmu_tx_create_dd(dp->dp_mos_dir);
+ VERIFY(0 == dmu_tx_assign(tx, TXG_WAIT));
+
+ txg = dmu_tx_get_txg(tx);
+ dp->dp_scan->scn_restart_txg = txg;
+ dmu_tx_commit(tx);
+ } else {
+ dp->dp_scan->scn_restart_txg = txg;
+ }
+ zfs_dbgmsg("restarting resilver txg=%llu", txg);
+ }
+
+ boolean_t
+ dsl_scan_resilvering(dsl_pool_t *dp)
+ {
+ return (dp->dp_scan->scn_phys.scn_state == DSS_SCANNING &&
+ dp->dp_scan->scn_phys.scn_func == POOL_SCAN_RESILVER);
+ }
+
+ /*
+ * scrub consumers
+ */
+
+ static void
+ count_block(zfs_all_blkstats_t *zab, const blkptr_t *bp)
+ {
+ int i;
+
+ /*
+ * If we resume after a reboot, zab will be NULL; don't record
+ * incomplete stats in that case.
+ */
+ if (zab == NULL)
+ return;
+
+ for (i = 0; i < 4; i++) {
+ int l = (i < 2) ? BP_GET_LEVEL(bp) : DN_MAX_LEVELS;
+ int t = (i & 1) ? BP_GET_TYPE(bp) : DMU_OT_TOTAL;
+ zfs_blkstat_t *zb = &zab->zab_type[l][t];
+ int equal;
+
+ zb->zb_count++;
+ zb->zb_asize += BP_GET_ASIZE(bp);
+ zb->zb_lsize += BP_GET_LSIZE(bp);
+ zb->zb_psize += BP_GET_PSIZE(bp);
+ zb->zb_gangs += BP_COUNT_GANG(bp);
+
+ switch (BP_GET_NDVAS(bp)) {
+ case 2:
+ if (DVA_GET_VDEV(&bp->blk_dva[0]) ==
+ DVA_GET_VDEV(&bp->blk_dva[1]))
+ zb->zb_ditto_2_of_2_samevdev++;
+ break;
+ case 3:
+ equal = (DVA_GET_VDEV(&bp->blk_dva[0]) ==
+ DVA_GET_VDEV(&bp->blk_dva[1])) +
+ (DVA_GET_VDEV(&bp->blk_dva[0]) ==
+ DVA_GET_VDEV(&bp->blk_dva[2])) +
+ (DVA_GET_VDEV(&bp->blk_dva[1]) ==
+ DVA_GET_VDEV(&bp->blk_dva[2]));
+ if (equal == 1)
+ zb->zb_ditto_2_of_3_samevdev++;
+ else if (equal == 3)
+ zb->zb_ditto_3_of_3_samevdev++;
+ break;
+ }
+ }
+ }
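With three DVAs, the switch above sums three pairwise vdev comparisons, so the count can only be 0, 1, or 3: if two pairs matched, transitivity would force the third pair to match as well. A small sketch of that counting trick (the helper name is hypothetical):

	static int
	dva_pairs_same_vdev(const blkptr_t *bp)	/* hypothetical helper */
	{
		uint64_t v0 = DVA_GET_VDEV(&bp->blk_dva[0]);
		uint64_t v1 = DVA_GET_VDEV(&bp->blk_dva[1]);
		uint64_t v2 = DVA_GET_VDEV(&bp->blk_dva[2]);
		int equal = (v0 == v1) + (v0 == v2) + (v1 == v2);

		ASSERT(equal != 2);	/* two equal pairs imply all three match */
		return (equal);
	}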
+
+ static void
+ dsl_scan_scrub_done(zio_t *zio)
+ {
+ spa_t *spa = zio->io_spa;
+
+ zio_data_buf_free(zio->io_data, zio->io_size);
+
+ mutex_enter(&spa->spa_scrub_lock);
+ spa->spa_scrub_inflight--;
+ cv_broadcast(&spa->spa_scrub_io_cv);
+
+ if (zio->io_error && (zio->io_error != ECKSUM ||
+ !(zio->io_flags & ZIO_FLAG_SPECULATIVE))) {
+ spa->spa_dsl_pool->dp_scan->scn_phys.scn_errors++;
+ }
+ mutex_exit(&spa->spa_scrub_lock);
+ }
+
+ static int
+ dsl_scan_scrub_cb(dsl_pool_t *dp,
+ const blkptr_t *bp, const zbookmark_t *zb)
+ {
+ dsl_scan_t *scn = dp->dp_scan;
+ size_t size = BP_GET_PSIZE(bp);
+ spa_t *spa = dp->dp_spa;
+ uint64_t phys_birth = BP_PHYSICAL_BIRTH(bp);
+ boolean_t needs_io;
+ int zio_flags = ZIO_FLAG_SCRUB_THREAD | ZIO_FLAG_RAW | ZIO_FLAG_CANFAIL;
+ int zio_priority;
++ int d;
+
+ if (phys_birth <= scn->scn_phys.scn_min_txg ||
+ phys_birth >= scn->scn_phys.scn_max_txg)
+ return (0);
+
+ count_block(dp->dp_blkstats, bp);
+
+ ASSERT(DSL_SCAN_IS_SCRUB_RESILVER(scn));
+ if (scn->scn_phys.scn_func == POOL_SCAN_SCRUB) {
+ zio_flags |= ZIO_FLAG_SCRUB;
+ zio_priority = ZIO_PRIORITY_SCRUB;
+ needs_io = B_TRUE;
+ } else if (scn->scn_phys.scn_func == POOL_SCAN_RESILVER) {
+ zio_flags |= ZIO_FLAG_RESILVER;
+ zio_priority = ZIO_PRIORITY_RESILVER;
+ needs_io = B_FALSE;
+ }
+
+ /* If it's an intent log block, failure is expected. */
+ if (zb->zb_level == ZB_ZIL_LEVEL)
+ zio_flags |= ZIO_FLAG_SPECULATIVE;
+
++ for (d = 0; d < BP_GET_NDVAS(bp); d++) {
+ vdev_t *vd = vdev_lookup_top(spa,
+ DVA_GET_VDEV(&bp->blk_dva[d]));
+
+ /*
+ * Keep track of how much data we've examined so that
+ * zpool(1M) status can make useful progress reports.
+ */
+ scn->scn_phys.scn_examined += DVA_GET_ASIZE(&bp->blk_dva[d]);
+ spa->spa_scan_pass_exam += DVA_GET_ASIZE(&bp->blk_dva[d]);
+
+ /* if it's a resilver, this may not be in the target range */
+ if (!needs_io) {
+ if (DVA_GET_GANG(&bp->blk_dva[d])) {
+ /*
+ * Gang members may be spread across multiple
+ * vdevs, so the best estimate we have is the
+ * scrub range, which has already been checked.
+ * XXX -- it would be better to change our
+ * allocation policy to ensure that all
+ * gang members reside on the same vdev.
+ */
+ needs_io = B_TRUE;
+ } else {
+ needs_io = vdev_dtl_contains(vd, DTL_PARTIAL,
+ phys_birth, 1);
+ }
+ }
+ }
+
+ if (needs_io && !zfs_no_scrub_io) {
+ void *data = zio_data_buf_alloc(size);
+
+ mutex_enter(&spa->spa_scrub_lock);
+ while (spa->spa_scrub_inflight >= spa->spa_scrub_maxinflight)
+ cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock);
+ spa->spa_scrub_inflight++;
+ mutex_exit(&spa->spa_scrub_lock);
+
+ zio_nowait(zio_read(NULL, spa, bp, data, size,
+ dsl_scan_scrub_done, NULL, zio_priority,
+ zio_flags, zb));
+ }
+
+ /* do not relocate this block */
+ return (0);
+ }
+
+ int
+ dsl_scan(dsl_pool_t *dp, pool_scan_func_t func)
+ {
+ spa_t *spa = dp->dp_spa;
+
+ /*
+ * Purge all vdev caches and probe all devices. We do this here
+ * rather than in sync context because this requires a writer lock
+ * on the spa_config lock, which we can't do from sync context. The
+ * spa_scrub_reopen flag indicates that vdev_open() should not
+ * attempt to start another scrub.
+ */
+ spa_vdev_state_enter(spa, SCL_NONE);
+ spa->spa_scrub_reopen = B_TRUE;
+ vdev_reopen(spa->spa_root_vdev);
+ spa->spa_scrub_reopen = B_FALSE;
+ (void) spa_vdev_state_exit(spa, NULL, 0);
+
+ return (dsl_sync_task_do(dp, dsl_scan_setup_check,
+ dsl_scan_setup_sync, dp->dp_scan, &func, 0));
+ }
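A minimal caller sketch, assuming the pool is already open and dp_scan has been set up by dsl_scan_init(); this is illustrative of how a scrub request would reach dsl_scan(), not a function added by the patch:

	/* hypothetical caller, for illustration only */
	static int
	start_scrub(spa_t *spa)
	{
		return (dsl_scan(spa->spa_dsl_pool, POOL_SCAN_SCRUB));
	}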
#define BP_SPRINTF_LEN 320
- for (int d = 0; d < BP_GET_NDVAS(bp); d++) { \
+ /*
+ * This macro allows code sharing between zfs, libzpool, and mdb.
+ * 'func' is either snprintf() or mdb_snprintf().
+ * 'ws' (whitespace) can be ' ' for single-line format, '\n' for multi-line.
+ */
+ #define SPRINTF_BLKPTR(func, ws, buf, bp, type, checksum, compress) \
+ { \
+ static const char *copyname[] = \
+ { "zero", "single", "double", "triple" }; \
+ int size = BP_SPRINTF_LEN; \
+ int len = 0; \
+ int copies = 0; \
++ int d; \
+ \
+ if (bp == NULL) { \
+ len = func(buf + len, size - len, "<NULL>"); \
+ } else if (BP_IS_HOLE(bp)) { \
+ len = func(buf + len, size - len, "<hole>"); \
+ } else { \
++ for (d = 0; d < BP_GET_NDVAS(bp); d++) { \
+ const dva_t *dva = &bp->blk_dva[d]; \
+ if (DVA_IS_VALID(dva)) \
+ copies++; \
+ len += func(buf + len, size - len, \
+ "DVA[%d]=<%llu:%llx:%llx>%c", d, \
+ (u_longlong_t)DVA_GET_VDEV(dva), \
+ (u_longlong_t)DVA_GET_OFFSET(dva), \
+ (u_longlong_t)DVA_GET_ASIZE(dva), \
+ ws); \
+ } \
+ if (BP_IS_GANG(bp) && \
+ DVA_GET_ASIZE(&bp->blk_dva[2]) <= \
+ DVA_GET_ASIZE(&bp->blk_dva[1]) / 2) \
+ copies--; \
+ len += func(buf + len, size - len, \
+ "[L%llu %s] %s %s %s %s %s %s%c" \
+ "size=%llxL/%llxP birth=%lluL/%lluP fill=%llu%c" \
+ "cksum=%llx:%llx:%llx:%llx", \
+ (u_longlong_t)BP_GET_LEVEL(bp), \
+ type, \
+ checksum, \
+ compress, \
+ BP_GET_BYTEORDER(bp) == 0 ? "BE" : "LE", \
+ BP_IS_GANG(bp) ? "gang" : "contiguous", \
+ BP_GET_DEDUP(bp) ? "dedup" : "unique", \
+ copyname[copies], \
+ ws, \
+ (u_longlong_t)BP_GET_LSIZE(bp), \
+ (u_longlong_t)BP_GET_PSIZE(bp), \
+ (u_longlong_t)bp->blk_birth, \
+ (u_longlong_t)BP_PHYSICAL_BIRTH(bp), \
+ (u_longlong_t)bp->blk_fill, \
+ ws, \
+ (u_longlong_t)bp->blk_cksum.zc_word[0], \
+ (u_longlong_t)bp->blk_cksum.zc_word[1], \
+ (u_longlong_t)bp->blk_cksum.zc_word[2], \
+ (u_longlong_t)bp->blk_cksum.zc_word[3]); \
+ } \
+ ASSERT(len < size); \
+ }
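As a sketch of how a consumer might instantiate the macro; this wrapper is illustrative, not the patch's actual sprintf_blkptr(), and the type/checksum/compress strings are placeholder assumptions (real callers would derive them from the block pointer's name tables):

	void
	example_sprintf_blkptr(char *buf, const blkptr_t *bp)	/* hypothetical */
	{
		const char *type = "object";		/* assumed placeholder */
		const char *checksum = "fletcher4";	/* assumed placeholder */
		const char *compress = "lzjb";		/* assumed placeholder */

		SPRINTF_BLKPTR(snprintf, ' ', buf, bp, type, checksum, compress);
	}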
+
#include <sys/dmu.h>
#define BP_GET_BUFC_TYPE(bp) \
metaslab_fini(metaslab_t *msp)
{
metaslab_group_t *mg = msp->ms_group;
+ int t;
- vdev_space_update(mg->mg_vd, -msp->ms_map.sm_size,
- -msp->ms_smo.smo_alloc, B_TRUE);
+ vdev_space_update(mg->mg_vd,
+ -msp->ms_smo.smo_alloc, 0, -msp->ms_map.sm_size);
metaslab_group_remove(mg, msp);
space_map_destroy(&msp->ms_freemap[t]);
}
- for (int t = 0; t < TXG_DEFER_SIZE; t++)
++ for (t = 0; t < TXG_DEFER_SIZE; t++)
+ space_map_destroy(&msp->ms_defermap[t]);
+
+ ASSERT3S(msp->ms_deferspace, ==, 0);
+
mutex_exit(&msp->ms_lock);
mutex_destroy(&msp->ms_lock);
static int
metaslab_activate(metaslab_t *msp, uint64_t activation_weight, uint64_t size)
{
+ metaslab_group_t *mg = msp->ms_group;
space_map_t *sm = &msp->ms_map;
space_map_ops_t *sm_ops = msp->ms_group->mg_class->mc_ops;
++ int t;
ASSERT(MUTEX_HELD(&msp->ms_lock));
if ((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0) {
- int error = space_map_load(sm, sm_ops, SM_FREE, &msp->ms_smo,
- msp->ms_group->mg_vd->vdev_spa->spa_meta_objset);
- if (error) {
- metaslab_group_sort(msp->ms_group, msp, 0);
- return (error);
+ space_map_load_wait(sm);
+ if (!sm->sm_loaded) {
+ int error = space_map_load(sm, sm_ops, SM_FREE,
+ &msp->ms_smo,
+ spa_meta_objset(msp->ms_group->mg_vd->vdev_spa));
+ if (error) {
+ metaslab_group_sort(msp->ms_group, msp, 0);
+ return (error);
+ }
- for (int t = 0; t < TXG_DEFER_SIZE; t++)
++ for (t = 0; t < TXG_DEFER_SIZE; t++)
+ space_map_walk(&msp->ms_defermap[t],
+ space_map_claim, sm);
+
+ }
+
+ /*
+ * Track the bonus area as we activate new metaslabs.
+ */
+ if (sm->sm_start > mg->mg_bonus_area) {
+ mutex_enter(&mg->mg_lock);
+ mg->mg_bonus_area = sm->sm_start;
+ mutex_exit(&mg->mg_lock);
}
/*
space_map_obj_t *smo = &msp->ms_smo_syncing;
dmu_buf_t *db;
dmu_tx_t *tx;
+ int t;
- tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg);
+ ASSERT(!vd->vdev_ishole);
+
+ if (allocmap->sm_space == 0 && freemap->sm_space == 0)
+ return;
/*
* The only state that can actually be changing concurrently with
space_map_walk(sm, space_map_remove, allocmap);
space_map_walk(freed_map, space_map_remove, allocmap);
- for (int t = 0; t < TXG_DEFER_SIZE; t++)
++ for (t = 0; t < TXG_DEFER_SIZE; t++)
+ space_map_walk(&msp->ms_defermap[t],
+ space_map_remove, allocmap);
+
- for (int t = 1; t < TXG_CONCURRENT_STATES; t++)
+ for (t = 1; t < TXG_CONCURRENT_STATES; t++)
space_map_walk(&msp->ms_allocmap[(txg + t) & TXG_MASK],
space_map_remove, allocmap);
space_map_obj_t *smosync = &msp->ms_smo_syncing;
space_map_t *sm = &msp->ms_map;
space_map_t *freed_map = &msp->ms_freemap[TXG_CLEAN(txg) & TXG_MASK];
+ space_map_t *defer_map = &msp->ms_defermap[txg % TXG_DEFER_SIZE];
metaslab_group_t *mg = msp->ms_group;
vdev_t *vd = mg->mg_vd;
+ int64_t alloc_delta, defer_delta;
+ int t;
+ ASSERT(!vd->vdev_ishole);
+
mutex_enter(&msp->ms_lock);
/*
space_map_create(&msp->ms_freemap[t], sm->sm_start,
sm->sm_size, sm->sm_shift, sm->sm_lock);
}
- vdev_space_update(vd, sm->sm_size, 0, B_TRUE);
+
- for (int t = 0; t < TXG_DEFER_SIZE; t++)
++ for (t = 0; t < TXG_DEFER_SIZE; t++)
+ space_map_create(&msp->ms_defermap[t], sm->sm_start,
+ sm->sm_size, sm->sm_shift, sm->sm_lock);
+
+ vdev_space_update(vd, 0, 0, sm->sm_size);
}
- vdev_space_update(vd, 0, smosync->smo_alloc - smo->smo_alloc, B_TRUE);
+ alloc_delta = smosync->smo_alloc - smo->smo_alloc;
+ defer_delta = freed_map->sm_space - defer_map->sm_space;
+
+ vdev_space_update(vd, alloc_delta + defer_delta, defer_delta, 0);
ASSERT(msp->ms_allocmap[txg & TXG_MASK].sm_space == 0);
ASSERT(msp->ms_freemap[txg & TXG_MASK].sm_space == 0);
mutex_exit(&msp->ms_lock);
}
- for (int m = 0; m < vd->vdev_ms_count; m++) {
+ void
+ metaslab_sync_reassess(metaslab_group_t *mg)
+ {
+ vdev_t *vd = mg->mg_vd;
++ int m;
+
+ /*
+ * Re-evaluate all metaslabs which have lower offsets than the
+ * bonus area.
+ */
++ for (m = 0; m < vd->vdev_ms_count; m++) {
+ metaslab_t *msp = vd->vdev_ms[m];
+
+ if (msp->ms_map.sm_start > mg->mg_bonus_area)
+ break;
+
+ mutex_enter(&msp->ms_lock);
+ metaslab_group_sort(mg, msp, metaslab_weight(msp));
+ mutex_exit(&msp->ms_lock);
+ }
+
+ /*
+ * Prefetch the next potential metaslabs.
+ */
+ metaslab_prefetch(mg);
+ }
+
static uint64_t
metaslab_distance(metaslab_t *msp, dva_t *dva)
{
{
dva_t *dva = bp->blk_dva;
dva_t *hintdva = hintbp->blk_dva;
- int error = 0;
+ int d, error = 0;
ASSERT(bp->blk_birth == 0);
+ ASSERT(BP_PHYSICAL_BIRTH(bp) == 0);
spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER);
metaslab_free(spa_t *spa, const blkptr_t *bp, uint64_t txg, boolean_t now)
{
const dva_t *dva = bp->blk_dva;
- int ndvas = BP_GET_NDVAS(bp);
+ int d, ndvas = BP_GET_NDVAS(bp);
ASSERT(!BP_IS_HOLE(bp));
- ASSERT(!now || bp->blk_birth >= spa->spa_syncing_txg);
+ ASSERT(!now || bp->blk_birth >= spa_syncing_txg(spa));
spa_config_enter(spa, SCL_FREE, FTAG, RW_READER);
offsetof(spa_error_entry_t, se_avl));
}
- /*
- * Activate an uninitialized pool.
- */
- static void
- spa_activate(spa_t *spa, int mode)
+ static taskq_t *
+ spa_taskq_create(spa_t *spa, const char *name, enum zti_modes mode,
+ uint_t value)
{
- int t, q;
+ uint_t flags = TASKQ_PREPOPULATE;
+ boolean_t batch = B_FALSE;
- ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED);
+ switch (mode) {
+ case zti_mode_null:
+ return (NULL); /* no taskq needed */
- spa->spa_state = POOL_STATE_ACTIVE;
- spa->spa_mode = mode;
+ case zti_mode_fixed:
+ ASSERT3U(value, >=, 1);
+ value = MAX(value, 1);
+ break;
+
+ case zti_mode_batch:
+ batch = B_TRUE;
+ flags |= TASKQ_THREADS_CPU_PCT;
+ value = zio_taskq_batch_pct;
+ break;
+
+ case zti_mode_online_percent:
+ flags |= TASKQ_THREADS_CPU_PCT;
+ break;
+
+ default:
+ panic("unrecognized mode for %s taskq (%u:%u) in "
+ "spa_activate()",
+ name, mode, value);
+ break;
+ }
- spa->spa_normal_class = metaslab_class_create(zfs_metaslab_ops);
- spa->spa_log_class = metaslab_class_create(zfs_metaslab_ops);
+ if (zio_taskq_sysdc && spa->spa_proc != &p0) {
+ if (batch)
+ flags |= TASKQ_DC_BATCH;
+
+ return (taskq_create_sysdc(name, value, 50, INT_MAX,
+ spa->spa_proc, zio_taskq_basedc, flags));
+ }
+ return (taskq_create_proc(name, value, maxclsyspri, 50, INT_MAX,
+ spa->spa_proc, flags));
+ }
+
+ static void
+ spa_create_zio_taskqs(spa_t *spa)
+ {
- for (int t = 0; t < ZIO_TYPES; t++) {
- for (int q = 0; q < ZIO_TASKQ_TYPES; q++) {
++ int t, q;
+
+ for (t = 0; t < ZIO_TYPES; t++) {
- const zio_taskq_info_t *ztip = &zio_taskqs[t];
+ for (q = 0; q < ZIO_TASKQ_TYPES; q++) {
- enum zti_modes mode = ztip->zti_nthreads[q].zti_mode;
- uint_t value = ztip->zti_nthreads[q].zti_value;
+ const zio_taskq_info_t *ztip = &zio_taskqs[t][q];
+ enum zti_modes mode = ztip->zti_mode;
+ uint_t value = ztip->zti_value;
char name[32];
(void) snprintf(name, sizeof (name),
list_destroy(&spa->spa_config_dirty_list);
list_destroy(&spa->spa_state_dirty_list);
- for (int t = 0; t < ZIO_TYPES; t++) {
- for (int q = 0; q < ZIO_TASKQ_TYPES; q++) {
+ for (t = 0; t < ZIO_TYPES; t++) {
+ for (q = 0; q < ZIO_TASKQ_TYPES; q++) {
- taskq_destroy(spa->spa_zio_taskq[t][q]);
+ if (spa->spa_zio_taskq[t][q] != NULL)
+ taskq_destroy(spa->spa_zio_taskq[t][q]);
spa->spa_zio_taskq[t][q] = NULL;
}
}
* that the label does not contain the most up-to-date information.
*/
void
- spa_load_log_state(spa_t *spa)
+ spa_load_log_state(spa_t *spa, nvlist_t *nv)
{
- nvlist_t *nv, *nvroot, **child;
- uint64_t is_log;
- uint_t children;
- vdev_t *rvd = spa->spa_root_vdev;
+ vdev_t *ovd, *rvd = spa->spa_root_vdev;
+ int c;
- VERIFY(load_nvlist(spa, spa->spa_config_object, &nv) == 0);
- VERIFY(nvlist_lookup_nvlist(nv, ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0);
- VERIFY(nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN,
- &child, &children) == 0);
-
- for (c = 0; c < children; c++) {
- vdev_t *tvd = rvd->vdev_child[c];
+ /*
+ * Load the original root vdev tree from the passed config.
+ */
+ spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
+ VERIFY(spa_config_parse(spa, &ovd, nv, NULL, 0, VDEV_ALLOC_LOAD) == 0);
- if (nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_LOG,
- &is_log) == 0 && is_log)
- vdev_load_log_state(tvd, child[c]);
- for (int c = 0; c < rvd->vdev_children; c++) {
++ for (c = 0; c < rvd->vdev_children; c++) {
+ vdev_t *cvd = rvd->vdev_child[c];
+ if (cvd->vdev_islog)
+ vdev_load_log_state(cvd, ovd->vdev_child[c]);
}
- nvlist_free(nv);
+ vdev_free(ovd);
+ spa_config_exit(spa, SCL_ALL, FTAG);
}
/*
return (0);
}
- /*
- * Load an existing storage pool, using the pool's builtin spa_config as a
- * source of configuration information.
- */
- static int
- spa_load(spa_t *spa, nvlist_t *config, spa_load_state_t state, int mosconfig)
+ static boolean_t
+ spa_passivate_log(spa_t *spa)
{
- int error = 0;
- nvlist_t *nvroot = NULL;
- vdev_t *rvd;
- uberblock_t *ub = &spa->spa_uberblock;
- uint64_t config_cache_txg = spa->spa_config_txg;
- uint64_t pool_guid;
- uint64_t version;
- uint64_t autoreplace = 0;
- int orig_mode = spa->spa_mode;
- char *ereport = FM_EREPORT_ZFS_POOL;
+ vdev_t *rvd = spa->spa_root_vdev;
+ boolean_t slog_found = B_FALSE;
++ int c;
- /*
- * If this is an untrusted config, access the pool in read-only mode.
- * This prevents things like resilvering recently removed devices.
- */
- if (!mosconfig)
- spa->spa_mode = FREAD;
+ ASSERT(spa_config_held(spa, SCL_ALLOC, RW_WRITER));
- ASSERT(MUTEX_HELD(&spa_namespace_lock));
+ if (!spa_has_slogs(spa))
+ return (B_FALSE);
- spa->spa_load_state = state;
- for (int c = 0; c < rvd->vdev_children; c++) {
++ for (c = 0; c < rvd->vdev_children; c++) {
+ vdev_t *tvd = rvd->vdev_child[c];
+ metaslab_group_t *mg = tvd->vdev_mg;
- if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot) ||
- nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid)) {
- error = EINVAL;
- goto out;
+ if (tvd->vdev_islog) {
+ metaslab_group_passivate(mg);
+ slog_found = B_TRUE;
+ }
}
- /*
- * Versioning wasn't explicitly added to the label until later, so if
- * it's not present treat it as the initial version.
- */
- if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, &version) != 0)
- version = SPA_VERSION_INITIAL;
+ return (slog_found);
+ }
- (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG,
- &spa->spa_config_txg);
+ static void
+ spa_activate_log(spa_t *spa)
+ {
+ vdev_t *rvd = spa->spa_root_vdev;
++ int c;
- if ((state == SPA_LOAD_IMPORT || state == SPA_LOAD_TRYIMPORT) &&
- spa_guid_exists(pool_guid, 0)) {
- error = EEXIST;
- goto out;
- }
+ ASSERT(spa_config_held(spa, SCL_ALLOC, RW_WRITER));
- spa->spa_load_guid = pool_guid;
- for (int c = 0; c < rvd->vdev_children; c++) {
++ for (c = 0; c < rvd->vdev_children; c++) {
+ vdev_t *tvd = rvd->vdev_child[c];
+ metaslab_group_t *mg = tvd->vdev_mg;
- /*
- * Create "The Godfather" zio to hold all async IOs
- */
- spa->spa_async_zio_root = zio_root(spa, NULL, NULL,
- ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_GODFATHER);
+ if (tvd->vdev_islog)
+ metaslab_group_activate(mg);
+ }
+ }
- /*
- * Parse the configuration into a vdev tree. We explicitly set the
- * value that will be returned by spa_version() since parsing the
- * configuration requires knowing the version number.
- */
- spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
- spa->spa_ubsync.ub_version = version;
- error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, VDEV_ALLOC_LOAD);
- spa_config_exit(spa, SCL_ALL, FTAG);
+ int
+ spa_offline_log(spa_t *spa)
+ {
+ int error = 0;
- if (error != 0)
- goto out;
+ if ((error = dmu_objset_find(spa_name(spa), zil_vdev_offline,
+ NULL, DS_FIND_CHILDREN)) == 0) {
- ASSERT(spa->spa_root_vdev == rvd);
- ASSERT(spa_guid(spa) == pool_guid);
+ /*
+ * We successfully offlined the log device, sync out the
+ * current txg so that the "stubby" block can be removed
+ * by zil_sync().
+ */
+ txg_wait_synced(spa->spa_dsl_pool, 0);
+ }
+ return (error);
+ }
- /*
- * Try to open all vdevs, loading each label in the process.
- */
- spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
- error = vdev_open(rvd);
- spa_config_exit(spa, SCL_ALL, FTAG);
- if (error != 0)
- goto out;
+ static void
+ spa_aux_check_removed(spa_aux_vdev_t *sav)
+ {
- for (int i = 0; i < sav->sav_count; i++)
++ int i;
+
- /*
- * We need to validate the vdev labels against the configuration that
- * we have in hand, which is dependent on the setting of mosconfig. If
- * mosconfig is true then we're validating the vdev labels based on
- * that config. Otherwise, we're validating against the cached config
- * (zpool.cache) that was read when we loaded the zfs module, and then
- * later we will recursively call spa_load() and validate against
- * the vdev config.
- */
- spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
- error = vdev_validate(rvd);
- spa_config_exit(spa, SCL_ALL, FTAG);
- if (error != 0)
- goto out;
++ for (i = 0; i < sav->sav_count; i++)
+ spa_check_removed(sav->sav_vdevs[i]);
+ }
- if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) {
- error = ENXIO;
- goto out;
- }
+ void
+ spa_claim_notify(zio_t *zio)
+ {
+ spa_t *spa = zio->io_spa;
- /*
- * Find the best uberblock.
- */
- vdev_uberblock_load(NULL, rvd, ub);
+ if (zio->io_error)
+ return;
- /*
- * If we weren't able to find a single valid uberblock, return failure.
- */
- if (ub->ub_txg == 0) {
- vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
- VDEV_AUX_CORRUPT_DATA);
- error = ENXIO;
- goto out;
- }
+ mutex_enter(&spa->spa_props_lock); /* any mutex will do */
+ if (spa->spa_claim_max_txg < zio->io_bp->blk_birth)
+ spa->spa_claim_max_txg = zio->io_bp->blk_birth;
+ mutex_exit(&spa->spa_props_lock);
+ }
- /*
- * If the pool is newer than the code, we can't open it.
- */
- if (ub->ub_version > SPA_VERSION) {
- vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
- VDEV_AUX_VERSION_NEWER);
- error = ENOTSUP;
- goto out;
- }
+ typedef struct spa_load_error {
+ uint64_t sle_meta_count;
+ uint64_t sle_data_count;
+ } spa_load_error_t;
- /*
- * If the vdev guid sum doesn't match the uberblock, we have an
- * incomplete configuration.
- */
- if (rvd->vdev_guid_sum != ub->ub_guid_sum && mosconfig) {
- vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
- VDEV_AUX_BAD_GUID_SUM);
- error = ENXIO;
- goto out;
- }
+ static void
+ spa_load_verify_done(zio_t *zio)
+ {
+ blkptr_t *bp = zio->io_bp;
+ spa_load_error_t *sle = zio->io_private;
+ dmu_object_type_t type = BP_GET_TYPE(bp);
+ int error = zio->io_error;
- /*
- * Initialize internal SPA structures.
- */
- spa->spa_state = POOL_STATE_ACTIVE;
- spa->spa_ubsync = spa->spa_uberblock;
- spa->spa_first_txg = spa_last_synced_txg(spa) + 1;
- error = dsl_pool_open(spa, spa->spa_first_txg, &spa->spa_dsl_pool);
if (error) {
- vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
- VDEV_AUX_CORRUPT_DATA);
- goto out;
+ if ((BP_GET_LEVEL(bp) != 0 || dmu_ot[type].ot_metadata) &&
+ type != DMU_OT_INTENT_LOG)
+ atomic_add_64(&sle->sle_meta_count, 1);
+ else
+ atomic_add_64(&sle->sle_data_count, 1);
}
- spa->spa_meta_objset = spa->spa_dsl_pool->dp_meta_objset;
+ zio_data_buf_free(zio->io_data, zio->io_size);
+ }
- if (zap_lookup(spa->spa_meta_objset,
- DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG,
- sizeof (uint64_t), 1, &spa->spa_config_object) != 0) {
- vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
- VDEV_AUX_CORRUPT_DATA);
- error = EIO;
- goto out;
- }
+ /*ARGSUSED*/
+ static int
+ spa_load_verify_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
+ arc_buf_t *pbuf, const zbookmark_t *zb, const dnode_phys_t *dnp, void *arg)
+ {
+ if (bp != NULL) {
+ zio_t *rio = arg;
+ size_t size = BP_GET_PSIZE(bp);
+ void *data = zio_data_buf_alloc(size);
- if (!mosconfig) {
- nvlist_t *newconfig;
- uint64_t hostid;
+ zio_nowait(zio_read(rio, spa, bp, data, size,
+ spa_load_verify_done, rio->io_private, ZIO_PRIORITY_SCRUB,
+ ZIO_FLAG_SPECULATIVE | ZIO_FLAG_CANFAIL |
+ ZIO_FLAG_SCRUB | ZIO_FLAG_RAW, zb));
+ }
+ return (0);
+ }
- if (load_nvlist(spa, spa->spa_config_object, &newconfig) != 0) {
- vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
- VDEV_AUX_CORRUPT_DATA);
- error = EIO;
- goto out;
+ static int
+ spa_load_verify(spa_t *spa)
+ {
+ zio_t *rio;
+ spa_load_error_t sle = { 0 };
+ zpool_rewind_policy_t policy;
+ boolean_t verify_ok = B_FALSE;
+ int error;
+
+ zpool_get_rewind_policy(spa->spa_config, &policy);
+
+ if (policy.zrp_request & ZPOOL_NEVER_REWIND)
+ return (0);
+
+ rio = zio_root(spa, NULL, &sle,
+ ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE);
+
+ error = traverse_pool(spa, spa->spa_verify_min_txg,
+ TRAVERSE_PRE | TRAVERSE_PREFETCH, spa_load_verify_cb, rio);
+
+ (void) zio_wait(rio);
+
+ spa->spa_load_meta_errors = sle.sle_meta_count;
+ spa->spa_load_data_errors = sle.sle_data_count;
+
+ if (!error && sle.sle_meta_count <= policy.zrp_maxmeta &&
+ sle.sle_data_count <= policy.zrp_maxdata) {
+ verify_ok = B_TRUE;
+ spa->spa_load_txg = spa->spa_uberblock.ub_txg;
+ spa->spa_load_txg_ts = spa->spa_uberblock.ub_timestamp;
+ } else {
+ spa->spa_load_max_txg = spa->spa_uberblock.ub_txg;
+ }
+
+ if (error) {
+ if (error != ENXIO && error != EIO)
+ error = EIO;
+ return (error);
+ }
+
+ return (verify_ok ? 0 : EIO);
+ }
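The acceptance test above reads as a single predicate: the traversal must have succeeded outright, and the accumulated metadata and data error counts must both fit within the rewind policy's budgets. Restated as a hypothetical helper:

	static boolean_t
	load_verify_ok(int error, const spa_load_error_t *sle,
	    const zpool_rewind_policy_t *policy)	/* hypothetical */
	{
		return (error == 0 &&
		    sle->sle_meta_count <= policy->zrp_maxmeta &&
		    sle->sle_data_count <= policy->zrp_maxdata);
	}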
+
+ /*
+ * Find a value in the pool props object.
+ */
+ static void
+ spa_prop_find(spa_t *spa, zpool_prop_t prop, uint64_t *val)
+ {
+ (void) zap_lookup(spa->spa_meta_objset, spa->spa_pool_props_object,
+ zpool_prop_to_name(prop), sizeof (uint64_t), 1, val);
+ }
+
+ /*
+ * Find a value in the pool directory object.
+ */
+ static int
+ spa_dir_prop(spa_t *spa, const char *name, uint64_t *val)
+ {
+ return (zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+ name, sizeof (uint64_t), 1, val));
+ }
+
+ static int
+ spa_vdev_err(vdev_t *vdev, vdev_aux_t aux, int err)
+ {
+ vdev_set_state(vdev, B_TRUE, VDEV_STATE_CANT_OPEN, aux);
+ return (err);
+ }
+
+ /*
+ * Fix up config after a partly-completed split. This is done with the
+ * ZPOOL_CONFIG_SPLIT nvlist. Both the splitting pool and the split-off
+ * pool have that entry in their config, but only the splitting one contains
+ * a list of all the guids of the vdevs that are being split off.
+ *
+ * This function determines what to do with that list: either rejoin
+ * all the disks to the pool, or complete the splitting process. To attempt
+ * the rejoin, each disk that is offlined is marked online again, and
+ * we do a reopen() call. If the vdev label for every disk that was
+ * marked online indicates it was successfully split off (VDEV_AUX_SPLIT_POOL)
+ * then we call vdev_split() on each disk, and complete the split.
+ *
+ * Otherwise we leave the config alone, with all the vdevs in place in
+ * the original pool.
+ */
+ static void
+ spa_try_repair(spa_t *spa, nvlist_t *config)
+ {
+ uint_t extracted;
+ uint64_t *glist;
+ uint_t i, gcount;
+ nvlist_t *nvl;
+ vdev_t **vd;
+ boolean_t attempt_reopen;
+
+ if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_SPLIT, &nvl) != 0)
+ return;
+
+ /* check that the config is complete */
+ if (nvlist_lookup_uint64_array(nvl, ZPOOL_CONFIG_SPLIT_LIST,
+ &glist, &gcount) != 0)
+ return;
+
+ vd = kmem_zalloc(gcount * sizeof (vdev_t *), KM_SLEEP);
+
+ /* attempt to online all the vdevs & validate */
+ attempt_reopen = B_TRUE;
+ for (i = 0; i < gcount; i++) {
+ if (glist[i] == 0) /* vdev is hole */
+ continue;
+
+ vd[i] = spa_lookup_by_guid(spa, glist[i], B_FALSE);
+ if (vd[i] == NULL) {
+ /*
+ * Don't bother attempting to reopen the disks;
+ * just do the split.
+ */
+ attempt_reopen = B_FALSE;
+ } else {
+ /* attempt to re-online it */
+ vd[i]->vdev_offline = B_FALSE;
+ }
+ }
+
+ if (attempt_reopen) {
+ vdev_reopen(spa->spa_root_vdev);
+
+ /* check each device to see what state it's in */
+ for (extracted = 0, i = 0; i < gcount; i++) {
+ if (vd[i] != NULL &&
+ vd[i]->vdev_stat.vs_aux != VDEV_AUX_SPLIT_POOL)
+ break;
+ ++extracted;
}
+ }
+
+ /*
+ * If every disk has been moved to the new pool, or if we never
+ * even attempted to look at them, then we split them off for
+ * good.
+ */
+ if (!attempt_reopen || gcount == extracted) {
+ for (i = 0; i < gcount; i++)
+ if (vd[i] != NULL)
+ vdev_split(vd[i]);
+ vdev_reopen(spa->spa_root_vdev);
+ }
+
+ kmem_free(vd, gcount * sizeof (vdev_t *));
+ }
+
+ static int
+ spa_load(spa_t *spa, spa_load_state_t state, spa_import_type_t type,
+ boolean_t mosconfig)
+ {
+ nvlist_t *config = spa->spa_config;
+ char *ereport = FM_EREPORT_ZFS_POOL;
+ int error;
+ uint64_t pool_guid;
+ nvlist_t *nvl;
+
+ if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid))
+ return (EINVAL);
- if (!spa_is_root(spa) && nvlist_lookup_uint64(newconfig,
+ /*
+ * Versioning wasn't explicitly added to the label until later, so if
+ * it's not present treat it as the initial version.
+ */
+ if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION,
+ &spa->spa_ubsync.ub_version) != 0)
+ spa->spa_ubsync.ub_version = SPA_VERSION_INITIAL;
+
+ (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG,
+ &spa->spa_config_txg);
+
+ if ((state == SPA_LOAD_IMPORT || state == SPA_LOAD_TRYIMPORT) &&
+ spa_guid_exists(pool_guid, 0)) {
+ error = EEXIST;
+ } else {
+ spa->spa_load_guid = pool_guid;
+
+ if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_SPLIT,
+ &nvl) == 0) {
+ VERIFY(nvlist_dup(nvl, &spa->spa_config_splitting,
+ KM_SLEEP) == 0);
+ }
+
+ error = spa_load_impl(spa, pool_guid, config, state, type,
+ mosconfig, &ereport);
+ }
+
+ spa->spa_minref = refcount_count(&spa->spa_refcount);
+ if (error && error != EBADF)
+ zfs_ereport_post(ereport, spa, NULL, NULL, 0, 0);
+ spa->spa_load_state = error ? SPA_LOAD_ERROR : SPA_LOAD_NONE;
+ spa->spa_ena = 0;
+
+ return (error);
+ }
+
+ /*
+ * Load an existing storage pool, using the pool's builtin spa_config as a
+ * source of configuration information.
+ */
+ static int
+ spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config,
+ spa_load_state_t state, spa_import_type_t type, boolean_t mosconfig,
+ char **ereport)
+ {
+ int error = 0;
+ nvlist_t *nvroot = NULL;
+ vdev_t *rvd;
+ uberblock_t *ub = &spa->spa_uberblock;
+ uint64_t config_cache_txg = spa->spa_config_txg;
+ int orig_mode = spa->spa_mode;
+ int parse;
+ uint64_t obj;
++ int c;
+
+ /*
+ * If this is an untrusted config, access the pool in read-only mode.
+ * This prevents things like resilvering recently removed devices.
+ */
+ if (!mosconfig)
+ spa->spa_mode = FREAD;
+
+ ASSERT(MUTEX_HELD(&spa_namespace_lock));
+
+ spa->spa_load_state = state;
+
+ if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot))
+ return (EINVAL);
+
+ parse = (type == SPA_IMPORT_EXISTING ?
+ VDEV_ALLOC_LOAD : VDEV_ALLOC_SPLIT);
+
+ /*
+ * Create "The Godfather" zio to hold all async IOs
+ */
+ spa->spa_async_zio_root = zio_root(spa, NULL, NULL,
+ ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_GODFATHER);
+
+ /*
+ * Parse the configuration into a vdev tree. We explicitly set the
+ * value that will be returned by spa_version() since parsing the
+ * configuration requires knowing the version number.
+ */
+ spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
+ error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, parse);
+ spa_config_exit(spa, SCL_ALL, FTAG);
+
+ if (error != 0)
+ return (error);
+
+ ASSERT(spa->spa_root_vdev == rvd);
+
+ if (type != SPA_IMPORT_ASSEMBLE) {
+ ASSERT(spa_guid(spa) == pool_guid);
+ }
+
+ /*
+ * Try to open all vdevs, loading each label in the process.
+ */
+ spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
+ error = vdev_open(rvd);
+ spa_config_exit(spa, SCL_ALL, FTAG);
+ if (error != 0)
+ return (error);
+
+ /*
+ * We need to validate the vdev labels against the configuration that
+ * we have in hand, which is dependent on the setting of mosconfig. If
+ * mosconfig is true then we're validating the vdev labels based on
+ * that config. Otherwise, we're validating against the cached config
+ * (zpool.cache) that was read when we loaded the zfs module, and then
+ * later we will recursively call spa_load() and validate against
+ * the vdev config.
+ *
+ * If we're assembling a new pool that's been split off from an
+ * existing pool, the labels haven't yet been updated so we skip
+ * validation for now.
+ */
+ if (type != SPA_IMPORT_ASSEMBLE) {
+ spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
+ error = vdev_validate(rvd);
+ spa_config_exit(spa, SCL_ALL, FTAG);
+
+ if (error != 0)
+ return (error);
+
+ if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN)
+ return (ENXIO);
+ }
+
+ /*
+ * Find the best uberblock.
+ */
+ vdev_uberblock_load(NULL, rvd, ub);
+
+ /*
+ * If we weren't able to find a single valid uberblock, return failure.
+ */
+ if (ub->ub_txg == 0)
+ return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, ENXIO));
+
+ /*
+ * If the pool is newer than the code, we can't open it.
+ */
+ if (ub->ub_version > SPA_VERSION)
+ return (spa_vdev_err(rvd, VDEV_AUX_VERSION_NEWER, ENOTSUP));
+
+ /*
+ * If the vdev guid sum doesn't match the uberblock, we have an
+ * incomplete configuration.
+ */
+ if (mosconfig && type != SPA_IMPORT_ASSEMBLE &&
+ rvd->vdev_guid_sum != ub->ub_guid_sum)
+ return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM, ENXIO));
+
+ if (type != SPA_IMPORT_ASSEMBLE && spa->spa_config_splitting) {
+ spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
+ spa_try_repair(spa, config);
+ spa_config_exit(spa, SCL_ALL, FTAG);
+ nvlist_free(spa->spa_config_splitting);
+ spa->spa_config_splitting = NULL;
+ }
+
+ /*
+ * Initialize internal SPA structures.
+ */
+ spa->spa_state = POOL_STATE_ACTIVE;
+ spa->spa_ubsync = spa->spa_uberblock;
+ spa->spa_verify_min_txg = spa->spa_extreme_rewind ?
+ TXG_INITIAL - 1 : spa_last_synced_txg(spa) - TXG_DEFER_SIZE - 1;
+ spa->spa_first_txg = spa->spa_last_ubsync_txg ?
+ spa->spa_last_ubsync_txg : spa_last_synced_txg(spa) + 1;
+ spa->spa_claim_max_txg = spa->spa_first_txg;
+ spa->spa_prev_software_version = ub->ub_software_version;
+
+ error = dsl_pool_open(spa, spa->spa_first_txg, &spa->spa_dsl_pool);
+ if (error)
+ return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
+ spa->spa_meta_objset = spa->spa_dsl_pool->dp_meta_objset;
+
+ if (spa_dir_prop(spa, DMU_POOL_CONFIG, &spa->spa_config_object) != 0)
+ return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
+
+ if (!mosconfig) {
+ uint64_t hostid;
+ nvlist_t *policy = NULL, *nvconfig;
+
+ if (load_nvlist(spa, spa->spa_config_object, &nvconfig) != 0)
+ return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
+
+ if (!spa_is_root(spa) && nvlist_lookup_uint64(nvconfig,
ZPOOL_CONFIG_HOSTID, &hostid) == 0) {
char *hostname;
unsigned long myhostid = 0;
* Check the state of the root vdev. If it can't be opened, it
* indicates one or more toplevel vdevs are faulted.
*/
- if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) {
- error = ENXIO;
- goto out;
+ if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN)
+ return (ENXIO);
+
+ /*
+ * Load the DDTs (dedup tables).
+ */
+ error = ddt_load(spa);
+ if (error != 0)
+ return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
+
+ spa_update_dspace(spa);
+
+ if (state != SPA_LOAD_TRYIMPORT) {
+ error = spa_load_verify(spa);
+ if (error)
+ return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA,
+ error));
}
- if (spa_writeable(spa)) {
+ /*
+ * Load the intent log state and check log integrity. If we're
+ * assembling a pool from a split, the log is not transferred over.
+ */
+ if (type != SPA_IMPORT_ASSEMBLE) {
+ nvlist_t *nvconfig;
+
+ if (load_nvlist(spa, spa->spa_config_object, &nvconfig) != 0)
+ return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
+
+ VERIFY(nvlist_lookup_nvlist(nvconfig, ZPOOL_CONFIG_VDEV_TREE,
+ &nvroot) == 0);
+ spa_load_log_state(spa, nvroot);
+ nvlist_free(nvconfig);
+
+ if (spa_check_logs(spa)) {
+ *ereport = FM_EREPORT_ZFS_LOG_REPLAY;
+ return (spa_vdev_err(rvd, VDEV_AUX_BAD_LOG, ENXIO));
+ }
+ }
+
+ if (spa_writeable(spa) && (state == SPA_LOAD_RECOVER ||
+ spa->spa_load_max_txg == UINT64_MAX)) {
dmu_tx_t *tx;
int need_update = B_FALSE;
+ int c;
ASSERT(state != SPA_LOAD_TRYIMPORT);
zil_claim, tx, DS_FIND_CHILDREN);
dmu_tx_commit(tx);
- spa->spa_log_state = SPA_LOG_GOOD;
- spa->spa_sync_on = B_TRUE;
- txg_sync_start(spa->spa_dsl_pool);
+ spa->spa_claiming = B_FALSE;
+
+ spa_set_log_state(spa, SPA_LOG_GOOD);
+ spa->spa_sync_on = B_TRUE;
+ txg_sync_start(spa->spa_dsl_pool);
+
+ /*
+ * Wait for all claims to sync. We sync up to the highest
+ * claimed log block birth time so that claimed log blocks
+ * don't appear to be from the future. spa_claim_max_txg
+ * will have been set for us by either zil_check_log_chain()
+ * (invoked from spa_check_logs()) or zil_claim() above.
+ */
+ txg_wait_synced(spa->spa_dsl_pool, spa->spa_claim_max_txg);
+
+ /*
+ * If the config cache is stale, or we have uninitialized
+ * metaslabs (see spa_vdev_add()), then update the config.
+ *
+ * If spa_load_verbatim is true, trust the current
+ * in-core spa_config and update the disk labels.
+ */
+ if (config_cache_txg != spa->spa_config_txg ||
+ state == SPA_LOAD_IMPORT || spa->spa_load_verbatim ||
+ state == SPA_LOAD_RECOVER)
+ need_update = B_TRUE;
+
- for (int c = 0; c < rvd->vdev_children; c++)
++ for (c = 0; c < rvd->vdev_children; c++)
+ if (rvd->vdev_child[c]->vdev_ms_array == 0)
+ need_update = B_TRUE;
+
+ /*
+ * Update the config cache asynchronously in case we're the
+ * root pool, in which case the config cache isn't writable yet.
+ */
+ if (need_update)
+ spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);
+
+ /*
+ * Check all DTLs to see if anything needs resilvering.
+ */
+ if (!dsl_scan_resilvering(spa->spa_dsl_pool) &&
+ vdev_resilver_needed(rvd, NULL, NULL))
+ spa_async_request(spa, SPA_ASYNC_RESILVER);
+
+ /*
+ * Delete any inconsistent datasets.
+ */
+ (void) dmu_objset_find(spa_name(spa),
+ dsl_destroy_inconsistent, NULL, DS_FIND_CHILDREN);
+
+ /*
+ * Clean up any stale temporary dataset userrefs.
+ */
+ dsl_pool_clean_tmp_userrefs(spa->spa_dsl_pool);
+ }
+
+ return (0);
+ }
+
+ static int
+ spa_load_retry(spa_t *spa, spa_load_state_t state, int mosconfig)
+ {
+ spa_unload(spa);
+ spa_deactivate(spa);
+
+ spa->spa_load_max_txg--;
+
+ spa_activate(spa, spa_mode_global);
+ spa_async_suspend(spa);
+
+ return (spa_load(spa, state, SPA_IMPORT_EXISTING, mosconfig));
+ }
+
+ static int
+ spa_load_best(spa_t *spa, spa_load_state_t state, int mosconfig,
+ uint64_t max_request, int rewind_flags)
+ {
+ nvlist_t *config = NULL;
+ int load_error, rewind_error;
+ uint64_t safe_rewind_txg;
+ uint64_t min_txg;
+
+ if (spa->spa_load_txg && state == SPA_LOAD_RECOVER) {
+ spa->spa_load_max_txg = spa->spa_load_txg;
+ spa_set_log_state(spa, SPA_LOG_CLEAR);
+ } else {
+ spa->spa_load_max_txg = max_request;
+ }
- /*
- * Wait for all claims to sync.
- */
- txg_wait_synced(spa->spa_dsl_pool, 0);
+ load_error = rewind_error = spa_load(spa, state, SPA_IMPORT_EXISTING,
+ mosconfig);
+ if (load_error == 0)
+ return (0);
- /*
- * If the config cache is stale, or we have uninitialized
- * metaslabs (see spa_vdev_add()), then update the config.
- *
- * If spa_load_verbatim is true, trust the current
- * in-core spa_config and update the disk labels.
- */
- if (config_cache_txg != spa->spa_config_txg ||
- state == SPA_LOAD_IMPORT || spa->spa_load_verbatim)
- need_update = B_TRUE;
+ if (spa->spa_root_vdev != NULL)
+ config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);
- for (c = 0; c < rvd->vdev_children; c++)
- if (rvd->vdev_child[c]->vdev_ms_array == 0)
- need_update = B_TRUE;
+ spa->spa_last_ubsync_txg = spa->spa_uberblock.ub_txg;
+ spa->spa_last_ubsync_txg_ts = spa->spa_uberblock.ub_timestamp;
- /*
- * Update the config cache asychronously in case we're the
- * root pool, in which case the config cache isn't writable yet.
- */
- if (need_update)
- spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);
+ if (rewind_flags & ZPOOL_NEVER_REWIND) {
+ nvlist_free(config);
+ return (load_error);
+ }
- /*
- * Check all DTLs to see if anything needs resilvering.
- */
- if (vdev_resilver_needed(rvd, NULL, NULL))
- spa_async_request(spa, SPA_ASYNC_RESILVER);
+ /* Price of rolling back is discarding txgs, including log */
+ if (state == SPA_LOAD_RECOVER)
+ spa_set_log_state(spa, SPA_LOG_CLEAR);
+
+ spa->spa_load_max_txg = spa->spa_last_ubsync_txg;
+ safe_rewind_txg = spa->spa_last_ubsync_txg - TXG_DEFER_SIZE;
+ min_txg = (rewind_flags & ZPOOL_EXTREME_REWIND) ?
+ TXG_INITIAL : safe_rewind_txg;
+
+ /*
+ * Continue as long as we're finding errors, we're still within
+ * the acceptable rewind range, and we're still finding uberblocks
+ */
+ while (rewind_error && spa->spa_uberblock.ub_txg >= min_txg &&
+ spa->spa_uberblock.ub_txg <= spa->spa_load_max_txg) {
+ if (spa->spa_load_max_txg < safe_rewind_txg)
+ spa->spa_extreme_rewind = B_TRUE;
+ rewind_error = spa_load_retry(spa, state, mosconfig);
}
- error = 0;
- out:
- spa->spa_minref = refcount_count(&spa->spa_refcount);
- if (error && error != EBADF)
- zfs_ereport_post(ereport, spa, NULL, NULL, 0, 0);
- spa->spa_load_state = SPA_LOAD_NONE;
- spa->spa_ena = 0;
+ if (config)
+ spa_rewind_data_to_nvlist(spa, config);
- return (error);
+ spa->spa_extreme_rewind = B_FALSE;
+ spa->spa_load_max_txg = UINT64_MAX;
+
+ if (config && (rewind_error || state != SPA_LOAD_RECOVER))
+ spa_config_set(spa, config);
+
+ return (state == SPA_LOAD_RECOVER ? rewind_error : load_error);
}
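To make the rewind window concrete, a worked example with assumed values (not part of the patch):

	static void
	rewind_window_example(void)	/* hypothetical, assumed numbers */
	{
		uint64_t last_ubsync_txg = 1000;	/* assumed */
		uint64_t safe_rewind_txg = last_ubsync_txg - TXG_DEFER_SIZE;

		/*
		 * With TXG_DEFER_SIZE == 2, safe_rewind_txg == 998: an
		 * ordinary SPA_LOAD_RECOVER may retry txgs 1000, 999, 998.
		 * Only ZPOOL_EXTREME_REWIND sets min_txg to TXG_INITIAL,
		 * letting spa_load_retry() step below 998; each such retry
		 * also sets spa_extreme_rewind, which widens
		 * spa_verify_min_txg on the next spa_load().
		 */
		(void) safe_rewind_txg;
	}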
/*
/*
* Transfer each new top-level vdev from vd to rvd.
*/
- for (int c = 0; c < vd->vdev_children; c++) {
+ for (c = 0; c < vd->vdev_children; c++) {
+
+ /*
+ * Set the vdev id to the first hole, if one exists.
+ */
+ for (id = 0; id < rvd->vdev_children; id++) {
+ if (rvd->vdev_child[id]->vdev_ishole) {
+ vdev_free(rvd->vdev_child[id]);
+ break;
+ }
+ }
tvd = vd->vdev_child[c];
vdev_remove_child(vd, tvd);
- tvd->vdev_id = rvd->vdev_children;
+ tvd->vdev_id = id;
vdev_add_child(rvd, tvd);
vdev_config_dirty(tvd);
}
boolean_t unspare = B_FALSE;
uint64_t unspare_guid;
size_t len;
+ char *vdpath;
+ int t;
txg = spa_vdev_enter(spa);
(void) spa_vdev_remove(spa, unspare_guid, B_TRUE);
}
- /*
- * If the parent mirror/replacing vdev only has one child,
- * the parent is no longer needed. Remove it from the tree.
- */
- if (pvd->vdev_children == 1)
- vdev_remove_parent(cvd);
+ /*
+ * If the parent mirror/replacing vdev only has one child,
+ * the parent is no longer needed. Remove it from the tree.
+ */
+ if (pvd->vdev_children == 1)
+ vdev_remove_parent(cvd);
+
+ /*
+ * We don't set tvd until now because the parent we just removed
+ * may have been the previous top-level vdev.
+ */
+ tvd = cvd->vdev_top;
+ ASSERT(tvd->vdev_parent == rvd);
+
+ /*
+ * Reevaluate the parent vdev state.
+ */
+ vdev_propagate_state(cvd);
+
+ /*
+ * If the 'autoexpand' property is set on the pool then automatically
+ * try to expand the size of the pool. For example, if the device we
+ * just detached was smaller than the others, it may be possible to
+ * add metaslabs (i.e. grow the pool). We need to reopen the vdev
+ * first so that we can obtain the updated sizes of the leaf vdevs.
+ */
+ if (spa->spa_autoexpand) {
+ vdev_reopen(tvd);
+ vdev_expand(tvd, txg);
+ }
+
+ vdev_config_dirty(tvd);
+
+ /*
+ * Mark vd's DTL as dirty in this txg. vdev_dtl_sync() will see that
+ * vd->vdev_detached is set and free vd's DTL object in syncing context.
+ * But first make sure we're not on any *other* txg's DTL list, to
+ * prevent vd from being accessed after it's freed.
+ */
+ vdpath = spa_strdup(vd->vdev_path);
- for (int t = 0; t < TXG_SIZE; t++)
++ for (t = 0; t < TXG_SIZE; t++)
+ (void) txg_list_remove_this(&tvd->vdev_dtl_list, vd, t);
+ vd->vdev_detached = B_TRUE;
+ vdev_dirty(tvd, VDD_DTL, vd, txg);
+
+ spa_event_notify(spa, vd, ESC_ZFS_VDEV_REMOVE);
+
+ error = spa_vdev_exit(spa, vd, txg, 0);
+
+ spa_history_log_internal(LOG_POOL_VDEV_DETACH, spa, NULL,
+ "vdev=%s", vdpath);
+ spa_strfree(vdpath);
+
+ /*
+ * If this was the removal of the original device in a hot spare vdev,
+ * then we want to go through and remove the device from the hot spare
+ * list of every other pool.
+ */
+ if (unspare) {
+ spa_t *myspa = spa;
+ spa = NULL;
+ mutex_enter(&spa_namespace_lock);
+ while ((spa = spa_next(spa)) != NULL) {
+ if (spa->spa_state != POOL_STATE_ACTIVE)
+ continue;
+ if (spa == myspa)
+ continue;
+ spa_open_ref(spa, FTAG);
+ mutex_exit(&spa_namespace_lock);
+ (void) spa_vdev_remove(spa, unspare_guid,
+ B_TRUE);
+ mutex_enter(&spa_namespace_lock);
+ spa_close(spa, FTAG);
+ }
+ mutex_exit(&spa_namespace_lock);
+ }
+
+ return (error);
+ }
+
+ /*
+ * Split a set of devices from their mirrors, and create a new pool from them.
+ */
+ int
+ spa_vdev_split_mirror(spa_t *spa, char *newname, nvlist_t *config,
+ nvlist_t *props, boolean_t exp)
+ {
+ int error = 0;
+ uint64_t txg, *glist;
+ spa_t *newspa;
+ uint_t c, children, lastlog;
+ nvlist_t **child, *nvl, *tmp;
+ dmu_tx_t *tx;
+ char *altroot = NULL;
+ vdev_t *rvd, **vml = NULL; /* vdev modify list */
+ boolean_t activate_slog;
+
+ if (!spa_writeable(spa))
+ return (EROFS);
+
+ txg = spa_vdev_enter(spa);
+
+ /* clear the log and flush everything up to now */
+ activate_slog = spa_passivate_log(spa);
+ (void) spa_vdev_config_exit(spa, NULL, txg, 0, FTAG);
+ error = spa_offline_log(spa);
+ txg = spa_vdev_config_enter(spa);
+
+ if (activate_slog)
+ spa_activate_log(spa);
+
+ if (error != 0)
+ return (spa_vdev_exit(spa, NULL, txg, error));
+
+ /* check new spa name before going any further */
+ if (spa_lookup(newname) != NULL)
+ return (spa_vdev_exit(spa, NULL, txg, EEXIST));
+
+ /*
+ * scan through all the children to ensure they're all mirrors
+ */
+ if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvl) != 0 ||
+ nvlist_lookup_nvlist_array(nvl, ZPOOL_CONFIG_CHILDREN, &child,
+ &children) != 0)
+ return (spa_vdev_exit(spa, NULL, txg, EINVAL));
+
+ /* first, check to ensure we've got the right child count */
+ rvd = spa->spa_root_vdev;
+ lastlog = 0;
+ for (c = 0; c < rvd->vdev_children; c++) {
+ vdev_t *vd = rvd->vdev_child[c];
+
+ /* don't count the holes & logs as children */
+ if (vd->vdev_islog || vd->vdev_ishole) {
+ if (lastlog == 0)
+ lastlog = c;
+ continue;
+ }
+
+ lastlog = 0;
+ }
+ if (children != (lastlog != 0 ? lastlog : rvd->vdev_children))
+ return (spa_vdev_exit(spa, NULL, txg, EINVAL));
+
+ /* next, ensure no spare or cache devices are part of the split */
+ if (nvlist_lookup_nvlist(nvl, ZPOOL_CONFIG_SPARES, &tmp) == 0 ||
+ nvlist_lookup_nvlist(nvl, ZPOOL_CONFIG_L2CACHE, &tmp) == 0)
+ return (spa_vdev_exit(spa, NULL, txg, EINVAL));
+
+ vml = kmem_zalloc(children * sizeof (vdev_t *), KM_SLEEP);
+ glist = kmem_zalloc(children * sizeof (uint64_t), KM_SLEEP);
+
+ /* then, loop over each vdev and validate it */
+ for (c = 0; c < children; c++) {
+ uint64_t is_hole = 0;
+
+ (void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_HOLE,
+ &is_hole);
+
+ if (is_hole != 0) {
+ if (spa->spa_root_vdev->vdev_child[c]->vdev_ishole ||
+ spa->spa_root_vdev->vdev_child[c]->vdev_islog) {
+ continue;
+ } else {
+ error = EINVAL;
+ break;
+ }
+ }
+
+ /* which disk is going to be split? */
+ if (nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_GUID,
+ &glist[c]) != 0) {
+ error = EINVAL;
+ break;
+ }
+
+ /* look it up in the spa */
+ vml[c] = spa_lookup_by_guid(spa, glist[c], B_FALSE);
+ if (vml[c] == NULL) {
+ error = ENODEV;
+ break;
+ }
+
+ /* make sure there's nothing stopping the split */
+ if (vml[c]->vdev_parent->vdev_ops != &vdev_mirror_ops ||
+ vml[c]->vdev_islog ||
+ vml[c]->vdev_ishole ||
+ vml[c]->vdev_isspare ||
+ vml[c]->vdev_isl2cache ||
+ !vdev_writeable(vml[c]) ||
+ vml[c]->vdev_children != 0 ||
+ vml[c]->vdev_state != VDEV_STATE_HEALTHY ||
+ c != spa->spa_root_vdev->vdev_child[c]->vdev_id) {
+ error = EINVAL;
+ break;
+ }
+
+ if (vdev_dtl_required(vml[c])) {
+ error = EBUSY;
+ break;
+ }
+
+ /* we need certain info from the top level */
+ VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_METASLAB_ARRAY,
+ vml[c]->vdev_top->vdev_ms_array) == 0);
+ VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_METASLAB_SHIFT,
+ vml[c]->vdev_top->vdev_ms_shift) == 0);
+ VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_ASIZE,
+ vml[c]->vdev_top->vdev_asize) == 0);
+ VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_ASHIFT,
+ vml[c]->vdev_top->vdev_ashift) == 0);
+ }
+
+ if (error != 0) {
+ kmem_free(vml, children * sizeof (vdev_t *));
+ kmem_free(glist, children * sizeof (uint64_t));
+ return (spa_vdev_exit(spa, NULL, txg, error));
+ }
+
+ /* stop writers from using the disks */
+ for (c = 0; c < children; c++) {
+ if (vml[c] != NULL)
+ vml[c]->vdev_offline = B_TRUE;
+ }
+ vdev_reopen(spa->spa_root_vdev);
/*
- * We don't set tvd until now because the parent we just removed
- * may have been the previous top-level vdev.
+ * Temporarily record the splitting vdevs in the spa config. This
+ * will disappear once the config is regenerated.
*/
- tvd = cvd->vdev_top;
- ASSERT(tvd->vdev_parent == rvd);
+ VERIFY(nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP) == 0);
+ VERIFY(nvlist_add_uint64_array(nvl, ZPOOL_CONFIG_SPLIT_LIST,
+ glist, children) == 0);
+ kmem_free(glist, children * sizeof (uint64_t));
- /*
- * Reevaluate the parent vdev state.
- */
- vdev_propagate_state(cvd);
+ mutex_enter(&spa->spa_props_lock);
+ VERIFY(nvlist_add_nvlist(spa->spa_config, ZPOOL_CONFIG_SPLIT,
+ nvl) == 0);
+ mutex_exit(&spa->spa_props_lock);
+ spa->spa_config_splitting = nvl;
+ vdev_config_dirty(spa->spa_root_vdev);
+
+ /* configure and create the new pool */
+ VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, newname) == 0);
+ VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE,
+ exp ? POOL_STATE_EXPORTED : POOL_STATE_ACTIVE) == 0);
+ VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_VERSION,
+ spa_version(spa)) == 0);
+ VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_TXG,
+ spa->spa_config_txg) == 0);
+ VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_GUID,
+ spa_generate_guid(NULL)) == 0);
+ (void) nvlist_lookup_string(props,
+ zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot);
- /*
- * If the 'autoexpand' property is set on the pool then automatically
- * try to expand the size of the pool. For example if the device we
- * just detached was smaller than the others, it may be possible to
- * add metaslabs (i.e. grow the pool). We need to reopen the vdev
- * first so that we can obtain the updated sizes of the leaf vdevs.
- */
- if (spa->spa_autoexpand) {
- vdev_reopen(tvd);
- vdev_expand(tvd, txg);
+ /* add the new pool to the namespace */
+ newspa = spa_add(newname, config, altroot);
+ newspa->spa_config_txg = spa->spa_config_txg;
+ spa_set_log_state(newspa, SPA_LOG_CLEAR);
+
+ /* release the spa config lock, retaining the namespace lock */
+ spa_vdev_config_exit(spa, NULL, txg, 0, FTAG);
+
+ if (zio_injection_enabled)
+ zio_handle_panic_injection(spa, FTAG, 1);
+
+ spa_activate(newspa, spa_mode_global);
+ spa_async_suspend(newspa);
+
+ /* create the new pool from the disks of the original pool */
+ error = spa_load(newspa, SPA_LOAD_IMPORT, SPA_IMPORT_ASSEMBLE, B_TRUE);
+ if (error)
+ goto out;
+
+ /* if that worked, generate a real config for the new pool */
+ if (newspa->spa_root_vdev != NULL) {
+ VERIFY(nvlist_alloc(&newspa->spa_config_splitting,
+ NV_UNIQUE_NAME, KM_SLEEP) == 0);
+ VERIFY(nvlist_add_uint64(newspa->spa_config_splitting,
+ ZPOOL_CONFIG_SPLIT_GUID, spa_guid(spa)) == 0);
+ spa_config_set(newspa, spa_config_generate(newspa, NULL, -1ULL,
+ B_TRUE));
}
- vdev_config_dirty(tvd);
+ /* set the props */
+ if (props != NULL) {
+ spa_configfile_set(newspa, props, B_FALSE);
+ error = spa_prop_set(newspa, props);
+ if (error)
+ goto out;
+ }
- /*
- * Mark vd's DTL as dirty in this txg. vdev_dtl_sync() will see that
- * vd->vdev_detached is set and free vd's DTL object in syncing context.
- * But first make sure we're not on any *other* txg's DTL list, to
- * prevent vd from being accessed after it's freed.
- */
- for (t = 0; t < TXG_SIZE; t++)
- (void) txg_list_remove_this(&tvd->vdev_dtl_list, vd, t);
- vd->vdev_detached = B_TRUE;
- vdev_dirty(tvd, VDD_DTL, vd, txg);
+ /* flush everything */
+ txg = spa_vdev_config_enter(newspa);
+ vdev_config_dirty(newspa->spa_root_vdev);
+ (void) spa_vdev_config_exit(newspa, NULL, txg, 0, FTAG);
- spa_event_notify(spa, vd, ESC_ZFS_VDEV_REMOVE);
+ if (zio_injection_enabled)
+ zio_handle_panic_injection(spa, FTAG, 2);
- error = spa_vdev_exit(spa, vd, txg, 0);
+ spa_async_resume(newspa);
- /*
- * If this was the removal of the original device in a hot spare vdev,
- * then we want to go through and remove the device from the hot spare
- * list of every other pool.
- */
- if (unspare) {
- spa_t *myspa = spa;
- spa = NULL;
- mutex_enter(&spa_namespace_lock);
- while ((spa = spa_next(spa)) != NULL) {
- if (spa->spa_state != POOL_STATE_ACTIVE)
- continue;
- if (spa == myspa)
- continue;
- spa_open_ref(spa, FTAG);
- mutex_exit(&spa_namespace_lock);
- (void) spa_vdev_remove(spa, unspare_guid, B_TRUE);
- mutex_enter(&spa_namespace_lock);
- spa_close(spa, FTAG);
+ /* finally, update the original pool's config */
+ txg = spa_vdev_config_enter(spa);
+ tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
+ error = dmu_tx_assign(tx, TXG_WAIT);
+ if (error != 0)
+ dmu_tx_abort(tx);
+ for (c = 0; c < children; c++) {
+ if (vml[c] != NULL) {
+ vdev_split(vml[c]);
+ if (error == 0)
+ spa_history_log_internal(LOG_POOL_VDEV_DETACH,
+ spa, tx, "vdev=%s",
+ vml[c]->vdev_path);
+ vdev_free(vml[c]);
}
- mutex_exit(&spa_namespace_lock);
}
+ vdev_config_dirty(spa->spa_root_vdev);
+ spa->spa_config_splitting = NULL;
+ nvlist_free(nvl);
+ if (error == 0)
+ dmu_tx_commit(tx);
+ (void) spa_vdev_exit(spa, NULL, txg, 0);
+
+ if (zio_injection_enabled)
+ zio_handle_panic_injection(spa, FTAG, 3);
+
+ /* split is complete; log a history record */
+ spa_history_log_internal(LOG_POOL_SPLIT, newspa, NULL,
+ "split new pool %s from pool %s", newname, spa_name(spa));
+
+ kmem_free(vml, children * sizeof (vdev_t *));
+
+ /* if we're not going to mount the filesystems in userland, export */
+ if (exp)
+ error = spa_export_common(newname, POOL_STATE_EXPORTED, NULL,
+ B_FALSE, B_FALSE);
+
+ return (error);
+
+ out:
+ spa_unload(newspa);
+ spa_deactivate(newspa);
+ spa_remove(newspa);
+
+ txg = spa_vdev_config_enter(spa);
+
+ /* re-online all offlined disks */
+ for (c = 0; c < children; c++) {
+ if (vml[c] != NULL)
+ vml[c]->vdev_offline = B_FALSE;
+ }
+ vdev_reopen(spa->spa_root_vdev);
+
+ nvlist_free(spa->spa_config_splitting);
+ spa->spa_config_splitting = NULL;
+ (void) spa_vdev_exit(spa, NULL, txg, error);
+ kmem_free(vml, children * sizeof (vdev_t *));
return (error);
}
static void
spa_async_remove(spa_t *spa, vdev_t *vd)
{
+ int c;
+
if (vd->vdev_remove_wanted) {
- vd->vdev_remove_wanted = 0;
+ vd->vdev_remove_wanted = B_FALSE;
+ vd->vdev_delayed_close = B_FALSE;
vdev_set_state(vd, B_FALSE, VDEV_STATE_REMOVED, VDEV_AUX_NONE);
- vdev_clear(spa, vd);
+
+ /*
+ * We want to clear the stats, but we don't want to do a full
+ * vdev_clear() as that will cause us to throw away
+ * degraded/faulted state as well as attempt to reopen the
+ * device, all of which is a waste.
+ */
+ vd->vdev_stat.vs_read_errors = 0;
+ vd->vdev_stat.vs_write_errors = 0;
+ vd->vdev_stat.vs_checksum_errors = 0;
+
vdev_state_dirty(vd->vdev_top);
}
static void
spa_async_probe(spa_t *spa, vdev_t *vd)
{
+ int c;
+
if (vd->vdev_probe_wanted) {
- vd->vdev_probe_wanted = 0;
+ vd->vdev_probe_wanted = B_FALSE;
vdev_reopen(vd); /* vdev_open() does the actual probe */
}
* See if any devices need to be marked REMOVED.
*/
if (tasks & SPA_ASYNC_REMOVE) {
- spa_vdev_state_enter(spa);
+ spa_vdev_state_enter(spa, SCL_NONE);
spa_async_remove(spa, spa->spa_root_vdev);
- for (int i = 0; i < spa->spa_l2cache.sav_count; i++)
+ for (i = 0; i < spa->spa_l2cache.sav_count; i++)
spa_async_remove(spa, spa->spa_l2cache.sav_vdevs[i]);
- for (int i = 0; i < spa->spa_spares.sav_count; i++)
+ for (i = 0; i < spa->spa_spares.sav_count; i++)
spa_async_remove(spa, spa->spa_spares.sav_vdevs[i]);
(void) spa_vdev_state_exit(spa, NULL, 0);
}
vdev_t *rvd = spa->spa_root_vdev;
vdev_t *vd;
dmu_tx_t *tx;
- int dirty_vdevs;
int error;
+ int c;
/*
* Lock out configuration changes.
{
spa_t *spa;
spa_config_dirent_t *dp;
++ int t;
ASSERT(MUTEX_HELD(&spa_namespace_lock));
cv_init(&spa->spa_scrub_io_cv, NULL, CV_DEFAULT, NULL);
cv_init(&spa->spa_suspend_cv, NULL, CV_DEFAULT, NULL);
- for (int t = 0; t < TXG_SIZE; t++)
++ for (t = 0; t < TXG_SIZE; t++)
+ bplist_create(&spa->spa_free_bplist[t]);
+
(void) strlcpy(spa->spa_name, name, sizeof (spa->spa_name));
spa->spa_state = POOL_STATE_UNINITIALIZED;
spa->spa_freeze_txg = UINT64_MAX;
spa_remove(spa_t *spa)
{
spa_config_dirent_t *dp;
++ int t;
ASSERT(MUTEX_HELD(&spa_namespace_lock));
ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED);
spa_config_lock_destroy(spa);
- for (int t = 0; t < TXG_SIZE; t++)
++ for (t = 0; t < TXG_SIZE; t++)
+ bplist_destroy(&spa->spa_free_bplist[t]);
+
cv_destroy(&spa->spa_async_cv);
+ cv_destroy(&spa->spa_proc_cv);
cv_destroy(&spa->spa_scrub_io_cv);
cv_destroy(&spa->spa_suspend_cv);
return (MIN(SPA_DVAS_PER_BP, spa_max_replication_override));
}
+ int
+ spa_prev_software_version(spa_t *spa)
+ {
+ return (spa->spa_prev_software_version);
+ }
+
uint64_t
- bp_get_dasize(spa_t *spa, const blkptr_t *bp)
+ dva_get_dsize_sync(spa_t *spa, const dva_t *dva)
{
- int sz = 0, i;
+ uint64_t asize = DVA_GET_ASIZE(dva);
+ uint64_t dsize = asize;
- if (!spa->spa_deflate)
- return (BP_GET_ASIZE(bp));
+ ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0);
- spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
- for (i = 0; i < SPA_DVAS_PER_BP; i++) {
- vdev_t *vd =
- vdev_lookup_top(spa, DVA_GET_VDEV(&bp->blk_dva[i]));
- if (vd)
- sz += (DVA_GET_ASIZE(&bp->blk_dva[i]) >>
- SPA_MINBLOCKSHIFT) * vd->vdev_deflate_ratio;
+ if (asize != 0 && spa->spa_deflate) {
+ vdev_t *vd = vdev_lookup_top(spa, DVA_GET_VDEV(dva));
+ dsize = (asize >> SPA_MINBLOCKSHIFT) * vd->vdev_deflate_ratio;
}
- for (int d = 0; d < SPA_DVAS_PER_BP; d++)
+
+ return (dsize);
+ }
+
+ uint64_t
+ bp_get_dsize_sync(spa_t *spa, const blkptr_t *bp)
+ {
+ uint64_t dsize = 0;
++ int d;
+
- for (int d = 0; d < SPA_DVAS_PER_BP; d++)
++ for (d = 0; d < SPA_DVAS_PER_BP; d++)
+ dsize += dva_get_dsize_sync(spa, &bp->blk_dva[d]);
+
+ return (dsize);
+ }
+
+ uint64_t
+ bp_get_dsize(spa_t *spa, const blkptr_t *bp)
+ {
+ uint64_t dsize = 0;
++ int d;
+
+ spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
+
++ for (d = 0; d < SPA_DVAS_PER_BP; d++)
+ dsize += dva_get_dsize_sync(spa, &bp->blk_dva[d]);
+
spa_config_exit(spa, SCL_VDEV, FTAG);
- return (sz);
+
+ return (dsize);
}
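
The three helpers above split one job by locking discipline: dva_get_dsize_sync() and bp_get_dsize_sync() assume the caller already holds the config lock (e.g. syncing context), while bp_get_dsize() takes SCL_VDEV itself. A hedged usage sketch (example_charge_bp is hypothetical):

	static uint64_t
	example_charge_bp(spa_t *spa, const blkptr_t *bp, boolean_t syncing)
	{
		/* deflated size: what this block actually charges the pool */
		if (syncing)
			return (bp_get_dsize_sync(spa, bp)); /* lock held */
		return (bp_get_dsize(spa, bp));		/* takes SCL_VDEV */
	}
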
/*
vd->vdev_open_thread = NULL;
}
- for (int c = 0; c < vd->vdev_children; c++)
+ boolean_t
+ vdev_uses_zvols(vdev_t *vd)
+ {
++ int c;
++
+ if (vd->vdev_path && strncmp(vd->vdev_path, ZVOL_DIR,
+ strlen(ZVOL_DIR)) == 0)
+ return (B_TRUE);
++ for (c = 0; c < vd->vdev_children; c++)
+ if (vdev_uses_zvols(vd->vdev_child[c]))
+ return (B_TRUE);
+ return (B_FALSE);
+ }
+
void
vdev_open_children(vdev_t *vd)
{
taskq_t *tq;
int children = vd->vdev_children;
+ int c;
- for (int c = 0; c < children; c++)
+ /*
+ * In order to handle pools on top of zvols, do the opens
+ * in a single thread so that the same thread holds the
+ * spa_namespace_lock.
+ */
+ if (vdev_uses_zvols(vd)) {
++ for (c = 0; c < children; c++)
+ vd->vdev_child[c]->vdev_open_error =
+ vdev_open(vd->vdev_child[c]);
+ return;
+ }
tq = taskq_create("vdev_open", children, minclsyspri,
children, children, TASKQ_PREPOPULATE);
vdev_set_state(vd, B_TRUE, VDEV_STATE_DEGRADED,
VDEV_AUX_ERR_EXCEEDED);
} else {
- vd->vdev_state = VDEV_STATE_HEALTHY;
+ vdev_set_state(vd, B_TRUE, VDEV_STATE_HEALTHY, 0);
}
- for (int c = 0; c < vd->vdev_children; c++) {
+ /*
+ * For hole or missing vdevs we just return success.
+ */
+ if (vd->vdev_ishole || vd->vdev_ops == &vdev_missing_ops)
+ return (0);
+
+ for (c = 0; c < vd->vdev_children; c++) {
if (vd->vdev_child[c]->vdev_state != VDEV_STATE_HEALTHY) {
vdev_set_state(vd, B_TRUE, VDEV_STATE_DEGRADED,
VDEV_AUX_NONE);
{
spa_t *spa = vd->vdev_spa;
nvlist_t *label;
- uint64_t guid, top_guid;
+ uint64_t guid = 0, top_guid;
uint64_t state;
+ int c;
- for (int c = 0; c < vd->vdev_children; c++)
+ for (c = 0; c < vd->vdev_children; c++)
if (vdev_validate(vd->vdev_child[c]) != 0)
return (EBADF);
vd->vdev_stat.vs_aux = VDEV_AUX_NONE;
}
- for (int c = 0; c < vd->vdev_children; c++)
+ void
+ vdev_hold(vdev_t *vd)
+ {
+ spa_t *spa = vd->vdev_spa;
++ int c;
+
+ ASSERT(spa_is_root(spa));
+ if (spa->spa_state == POOL_STATE_UNINITIALIZED)
+ return;
+
- for (int c = 0; c < vd->vdev_children; c++)
++ for (c = 0; c < vd->vdev_children; c++)
+ vdev_hold(vd->vdev_child[c]);
+
+ if (vd->vdev_ops->vdev_op_leaf)
+ vd->vdev_ops->vdev_op_hold(vd);
+ }
+
+ void
+ vdev_rele(vdev_t *vd)
+ {
+ spa_t *spa = vd->vdev_spa;
++ int c;
+
+ ASSERT(spa_is_root(spa));
++ for (c = 0; c < vd->vdev_children; c++)
+ vdev_rele(vd->vdev_child[c]);
+
+ if (vd->vdev_ops->vdev_op_leaf)
+ vd->vdev_ops->vdev_op_rele(vd);
+ }
+
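vdev_hold() and vdev_rele() above form a recursive pair used for root-pool devices; callers bracket the window during which the leaves must stay open. A minimal pairing sketch, assuming a root pool (example_with_held_vdevs is hypothetical):

	static void
	example_with_held_vdevs(spa_t *spa)
	{
		ASSERT(spa_is_root(spa));
		vdev_hold(spa->spa_root_vdev);	/* recursively hold leaves */
		/* ... touch the devices while they cannot be closed ... */
		vdev_rele(spa->spa_root_vdev);	/* matching recursive release */
	}
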
+ /*
+ * Reopen all interior vdevs and any unopened leaves. We don't actually
+ * reopen leaf vdevs which had previously been opened as they might deadlock
+ * on the spa_config_lock. Instead we only obtain the leaf's physical size.
+ * If the leaf has never been opened then open it, as usual.
+ */
void
vdev_reopen(vdev_t *vd)
{
}
mutex_enter(&vd->vdev_dtl_lock);
- for (int t = 0; t < DTL_TYPES; t++) {
+ for (t = 0; t < DTL_TYPES; t++) {
+ /* account for child's outage in parent's missing map */
+ int s = (t == DTL_MISSING) ? DTL_OUTAGE : t;
if (t == DTL_SCRUB)
continue; /* leaf vdevs only */
if (t == DTL_PARTIAL)
else
minref = vd->vdev_children; /* any kind of mirror */
space_map_ref_create(&reftree);
- for (int c = 0; c < vd->vdev_children; c++) {
+ for (c = 0; c < vd->vdev_children; c++) {
vdev_t *cvd = vd->vdev_child[c];
mutex_enter(&cvd->vdev_dtl_lock);
- space_map_ref_add_map(&reftree, &cvd->vdev_dtl[t], 1);
+ space_map_ref_add_map(&reftree, &cvd->vdev_dtl[s], 1);
mutex_exit(&cvd->vdev_dtl_lock);
}
space_map_ref_generate_map(&reftree, &vd->vdev_dtl[t], minref);
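
The s/t indirection above is the subtle part: when a parent rebuilds its DTL_MISSING map, each child contributes its DTL_OUTAGE map (every interval in which the child could not have absorbed a write), not the child's own DTL_MISSING; the other DTL types feed through unchanged. A one-line restatement of the mapping (hypothetical helper):

	static int
	example_child_dtl_source(int t)
	{
		/* parent's MISSING is accumulated from children's OUTAGE */
		return (t == DTL_MISSING ? DTL_OUTAGE : t);
	}
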
return (0);
}
- for (int m = 0; m < vd->vdev_ms_count; m++) {
+ void
+ vdev_remove(vdev_t *vd, uint64_t txg)
+ {
+ spa_t *spa = vd->vdev_spa;
+ objset_t *mos = spa->spa_meta_objset;
+ dmu_tx_t *tx;
++ int m;
+
+ tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg);
+
+ if (vd->vdev_dtl_smo.smo_object) {
+ ASSERT3U(vd->vdev_dtl_smo.smo_alloc, ==, 0);
+ (void) dmu_object_free(mos, vd->vdev_dtl_smo.smo_object, tx);
+ vd->vdev_dtl_smo.smo_object = 0;
+ }
+
+ if (vd->vdev_ms != NULL) {
++ for (m = 0; m < vd->vdev_ms_count; m++) {
+ metaslab_t *msp = vd->vdev_ms[m];
+
+ if (msp == NULL || msp->ms_smo.smo_object == 0)
+ continue;
+
+ ASSERT3U(msp->ms_smo.smo_alloc, ==, 0);
+ (void) dmu_object_free(mos, msp->ms_smo.smo_object, tx);
+ msp->ms_smo.smo_object = 0;
+ }
+ }
+
+ if (vd->vdev_ms_array) {
+ (void) dmu_object_free(mos, vd->vdev_ms_array, tx);
+ vd->vdev_ms_array = 0;
+ vd->vdev_ms_shift = 0;
+ }
+ dmu_tx_commit(tx);
+ }
+
void
vdev_sync_done(vdev_t *vd, uint64_t txg)
{
mutex_exit(&vd->vdev_stat_lock);
}
- for (int c = 0; c < vd->vdev_children; c++)
+ void
+ vdev_scan_stat_init(vdev_t *vd)
+ {
+ vdev_stat_t *vs = &vd->vdev_stat;
++ int c;
+
++ for (c = 0; c < vd->vdev_children; c++)
+ vdev_scan_stat_init(vd->vdev_child[c]);
+
+ mutex_enter(&vd->vdev_stat_lock);
+ vs->vs_scan_processed = 0;
+ mutex_exit(&vd->vdev_stat_lock);
+ }
+
void
vdev_stat_update(zio_t *zio, uint64_t psize)
{
int degraded = 0, faulted = 0;
int corrupted = 0;
vdev_t *child;
+ int c;
if (vd->vdev_children > 0) {
- for (int c = 0; c < vd->vdev_children; c++) {
+ for (c = 0; c < vd->vdev_children; c++) {
child = vd->vdev_child[c];
+ /*
+ * Don't factor holes into the decision.
+ */
+ if (child->vdev_ishole)
+ continue;
+
if (!vdev_readable(child) ||
(!vdev_writeable(child) && spa_writeable(spa))) {
/*
return (B_TRUE);
}
+ /*
+ * Load the state from the original vdev tree (ovd) which
+ * we've retrieved from the MOS config object. If the original
+ * vdev was offline then we transfer that state to the device
+ * in the current vdev tree (nvd).
+ */
void
- vdev_load_log_state(vdev_t *vd, nvlist_t *nv)
+ vdev_load_log_state(vdev_t *nvd, vdev_t *ovd)
{
- uint_t children;
- nvlist_t **child;
- uint64_t val;
- spa_t *spa = vd->vdev_spa;
+ spa_t *spa = nvd->vdev_spa;
+ int c;
- if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
- &child, &children) == 0) {
- for (c = 0; c < children; c++)
- vdev_load_log_state(vd->vdev_child[c], child[c]);
- }
+ ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);
+ ASSERT3U(nvd->vdev_guid, ==, ovd->vdev_guid);
- if (vd->vdev_ops->vdev_op_leaf && nvlist_lookup_uint64(nv,
- ZPOOL_CONFIG_OFFLINE, &val) == 0 && val) {
- for (int c = 0; c < nvd->vdev_children; c++)
++ for (c = 0; c < nvd->vdev_children; c++)
+ vdev_load_log_state(nvd->vdev_child[c], ovd->vdev_child[c]);
+ if (nvd->vdev_ops->vdev_op_leaf && ovd->vdev_offline) {
/*
* It would be nice to call vdev_offline()
* directly but the pool isn't fully loaded and
* spa_sync() to allocate new blocks, but force rewrites after that.
* There should only be a handful of blocks after pass 1 in any case.
*/
- if (bp->blk_birth == zio->io_txg && BP_GET_PSIZE(bp) == csize &&
+ if (bp->blk_birth == zio->io_txg && BP_GET_PSIZE(bp) == psize &&
pass > SYNC_PASS_REWRITE) {
- uint32_t gang_stages = zio->io_pipeline & ZIO_GANG_STAGES;
- ASSERT(csize != 0);
- ASSERT(psize != 0);
+ enum zio_stage gang_stages = zio->io_pipeline & ZIO_GANG_STAGES;
++ ASSERT(psize != 0);
zio->io_pipeline = ZIO_REWRITE_PIPELINE | gang_stages;
zio->io_flags |= ZIO_FLAG_IO_REWRITE;
} else {
pio->io_pipeline = pio->io_orig_pipeline;
pio->io_reexecute = 0;
pio->io_error = 0;
- for (int w = 0; w < ZIO_WAIT_TYPES; w++)
+ for (w = 0; w < ZIO_WAIT_TYPES; w++)
pio->io_state[w] = 0;
- for (int c = 0; c < ZIO_CHILD_TYPES; c++)
+ for (c = 0; c < ZIO_CHILD_TYPES; c++)
pio->io_child_error[c] = 0;
- if (IO_IS_ALLOCATING(pio)) {
- /*
- * Remember the failed bp so that the io_ready() callback
- * can update its accounting upon reexecution. The block
- * was already freed in zio_done(); we indicate this with
- * a fill count of -1 so that zio_free() knows to skip it.
- */
- blkptr_t *bp = pio->io_bp;
- ASSERT(bp->blk_birth == 0 || bp->blk_birth == pio->io_txg);
- bp->blk_fill = BLK_FILL_ALREADY_FREED;
- pio->io_bp_orig = *bp;
- BP_ZERO(bp);
- }
+ if (IO_IS_ALLOCATING(pio))
+ BP_ZERO(pio->io_bp);
/*
* As we reexecute pio's children, new children could be created.
zio_t *gio = zio->io_gang_leader;
zio_gang_node_t *gn = zio->io_private;
blkptr_t *bp = zio->io_bp;
+ int g;
ASSERT(gio == zio_unique_parent(zio));
- ASSERT(zio_walk_children(zio) == NULL);
+ ASSERT(zio->io_child_count == 0);
if (zio->io_error)
return;
ASSERT(zio->io_data == gn->gn_gbh);
ASSERT(zio->io_size == SPA_GANGBLOCKSIZE);
- ASSERT(gn->gn_gbh->zg_tail.zbt_magic == ZBT_MAGIC);
+ ASSERT(gn->gn_gbh->zg_tail.zec_magic == ZEC_MAGIC);
- for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) {
+ for (g = 0; g < SPA_GBH_NBLKPTRS; g++) {
blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g];
if (!BP_IS_GANG(gbp))
continue;
zio = zio_gang_issue_func[gio->io_type](pio, bp, gn, data);
if (gn != NULL) {
- ASSERT(gn->gn_gbh->zg_tail.zbt_magic == ZBT_MAGIC);
+ ASSERT(gn->gn_gbh->zg_tail.zec_magic == ZEC_MAGIC);
- for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) {
+ for (g = 0; g < SPA_GBH_NBLKPTRS; g++) {
blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g];
if (BP_IS_HOLE(gbp))
continue;
uint64_t txg = pio->io_txg;
uint64_t resid = pio->io_size;
uint64_t lsize;
- int ndvas = gio->io_prop.zp_ndvas;
- int gbh_ndvas = MIN(ndvas + 1, spa_max_replication(spa));
+ int copies = gio->io_prop.zp_copies;
+ int gbh_copies = MIN(copies + 1, spa_max_replication(spa));
zio_prop_t zp;
- int error;
+ int g, error;
- error = metaslab_alloc(spa, spa->spa_normal_class, SPA_GANGBLOCKSIZE,
- bp, gbh_ndvas, txg, pio == gio ? NULL : gio->io_bp,
+ error = metaslab_alloc(spa, spa_normal_class(spa), SPA_GANGBLOCKSIZE,
+ bp, gbh_copies, txg, pio == gio ? NULL : gio->io_bp,
METASLAB_HINTBP_FAVOR | METASLAB_GANG_HEADER);
if (error) {
pio->io_error = error;
/*
* ==========================================================================
- * Allocate and free blocks
+ * Dedup
* ==========================================================================
*/
- for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
+ static void
+ zio_ddt_child_read_done(zio_t *zio)
+ {
+ blkptr_t *bp = zio->io_bp;
+ ddt_entry_t *dde = zio->io_private;
+ ddt_phys_t *ddp;
+ zio_t *pio = zio_unique_parent(zio);
+
+ mutex_enter(&pio->io_lock);
+ ddp = ddt_phys_select(dde, bp);
+ if (zio->io_error == 0)
+ ddt_phys_clear(ddp); /* this ddp doesn't need repair */
+ if (zio->io_error == 0 && dde->dde_repair_data == NULL)
+ dde->dde_repair_data = zio->io_data;
+ else
+ zio_buf_free(zio->io_data, zio->io_size);
+ mutex_exit(&pio->io_lock);
+ }
+
+ static int
+ zio_ddt_read_start(zio_t *zio)
+ {
+ blkptr_t *bp = zio->io_bp;
++ int p;
+
+ ASSERT(BP_GET_DEDUP(bp));
+ ASSERT(BP_GET_PSIZE(bp) == zio->io_size);
+ ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
+
+ if (zio->io_child_error[ZIO_CHILD_DDT]) {
+ ddt_t *ddt = ddt_select(zio->io_spa, bp);
+ ddt_entry_t *dde = ddt_repair_start(ddt, bp);
+ ddt_phys_t *ddp = dde->dde_phys;
+ ddt_phys_t *ddp_self = ddt_phys_select(dde, bp);
+ blkptr_t blk;
+
+ ASSERT(zio->io_vsd == NULL);
+ zio->io_vsd = dde;
+
+ if (ddp_self == NULL)
+ return (ZIO_PIPELINE_CONTINUE);
+
++ for (p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
+ if (ddp->ddp_phys_birth == 0 || ddp == ddp_self)
+ continue;
+ ddt_bp_create(ddt->ddt_checksum, &dde->dde_key, ddp,
+ &blk);
+ zio_nowait(zio_read(zio, zio->io_spa, &blk,
+ zio_buf_alloc(zio->io_size), zio->io_size,
+ zio_ddt_child_read_done, dde, zio->io_priority,
+ ZIO_DDT_CHILD_FLAGS(zio) | ZIO_FLAG_DONT_PROPAGATE,
+ &zio->io_bookmark));
+ }
+ return (ZIO_PIPELINE_CONTINUE);
+ }
+
+ zio_nowait(zio_read(zio, zio->io_spa, bp,
+ zio->io_data, zio->io_size, NULL, NULL, zio->io_priority,
+ ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark));
+
+ return (ZIO_PIPELINE_CONTINUE);
+ }
- for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) {
+ static int
+ zio_ddt_read_done(zio_t *zio)
+ {
+ blkptr_t *bp = zio->io_bp;
+
+ if (zio_wait_for_children(zio, ZIO_CHILD_DDT, ZIO_WAIT_DONE))
+ return (ZIO_PIPELINE_STOP);
+
+ ASSERT(BP_GET_DEDUP(bp));
+ ASSERT(BP_GET_PSIZE(bp) == zio->io_size);
+ ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
+
+ if (zio->io_child_error[ZIO_CHILD_DDT]) {
+ ddt_t *ddt = ddt_select(zio->io_spa, bp);
+ ddt_entry_t *dde = zio->io_vsd;
+ if (ddt == NULL) {
+ ASSERT(spa_load_state(zio->io_spa) != SPA_LOAD_NONE);
+ return (ZIO_PIPELINE_CONTINUE);
+ }
+ if (dde == NULL) {
+ zio->io_stage = ZIO_STAGE_DDT_READ_START >> 1;
+ zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_FALSE);
+ return (ZIO_PIPELINE_STOP);
+ }
+ if (dde->dde_repair_data != NULL) {
+ bcopy(dde->dde_repair_data, zio->io_data, zio->io_size);
+ zio->io_child_error[ZIO_CHILD_DDT] = 0;
+ }
+ ddt_repair_done(ddt, dde);
+ zio->io_vsd = NULL;
+ }
+
+ ASSERT(zio->io_vsd == NULL);
+
+ return (ZIO_PIPELINE_CONTINUE);
+ }
+
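Reading the two stages together: zio_ddt_read_start() first issues an ordinary child read of bp; only if that fails does zio_ddt_read_done() rewind the pipeline to DDT_READ_START, which then tries every other ddt_phys carrying the same key and stashes the first good copy in dde_repair_data. A hedged predicate for the final outcome (example only; assumes io_vsd still holds the dde):

	static boolean_t
	example_ddt_read_recovered(zio_t *zio)
	{
		ddt_entry_t *dde = zio->io_vsd;

		/* either the original read worked, or some other copy did */
		return (zio->io_child_error[ZIO_CHILD_DDT] == 0 ||
		    (dde != NULL && dde->dde_repair_data != NULL));
	}
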
+ static boolean_t
+ zio_ddt_collision(zio_t *zio, ddt_t *ddt, ddt_entry_t *dde)
+ {
+ spa_t *spa = zio->io_spa;
++ int p;
+
+ /*
+ * Note: we compare the original data, not the transformed data,
+ * because when zio->io_bp is an override bp, we will not have
+ * pushed the I/O transforms. That's an important optimization
+ * because otherwise we'd compress/encrypt all dmu_sync() data twice.
+ */
- for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) {
++ for (p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) {
+ zio_t *lio = dde->dde_lead_zio[p];
+
+ if (lio != NULL) {
+ return (lio->io_orig_size != zio->io_orig_size ||
+ bcmp(zio->io_orig_data, lio->io_orig_data,
+ zio->io_orig_size) != 0);
+ }
+ }
+
++ for (p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) {
+ ddt_phys_t *ddp = &dde->dde_phys[p];
+
+ if (ddp->ddp_phys_birth != 0) {
+ arc_buf_t *abuf = NULL;
+ uint32_t aflags = ARC_WAIT;
+ blkptr_t blk = *zio->io_bp;
+ int error;
+
+ ddt_bp_fill(ddp, &blk, ddp->ddp_phys_birth);
+
+ ddt_exit(ddt);
+
+ error = arc_read_nolock(NULL, spa, &blk,
+ arc_getbuf_func, &abuf, ZIO_PRIORITY_SYNC_READ,
+ ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
+ &aflags, &zio->io_bookmark);
+
+ if (error == 0) {
+ if (arc_buf_size(abuf) != zio->io_orig_size ||
+ bcmp(abuf->b_data, zio->io_orig_data,
+ zio->io_orig_size) != 0)
+ error = EEXIST;
+ VERIFY(arc_buf_remove_ref(abuf, &abuf) == 1);
+ }
+
+ ddt_enter(ddt);
+ return (error != 0);
+ }
+ }
+
+ return (B_FALSE);
+ }
+
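zio_ddt_collision() is the heart of dedup=verify: a matching checksum only nominates a candidate, and the block counts as a duplicate only if the actual bytes match, first against any in-flight lead zio and then against an on-disk copy read back through the ARC. The decisive test reduces to this hypothetical distillation:

	static boolean_t
	example_is_collision(const void *data, uint64_t size,
	    const void *candidate, uint64_t candidate_size)
	{
		/* same checksum but different contents => collision */
		return (size != candidate_size ||
		    bcmp(data, candidate, size) != 0);
	}
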
+ static void
+ zio_ddt_child_write_ready(zio_t *zio)
+ {
+ int p = zio->io_prop.zp_copies;
+ ddt_t *ddt = ddt_select(zio->io_spa, zio->io_bp);
+ ddt_entry_t *dde = zio->io_private;
+ ddt_phys_t *ddp = &dde->dde_phys[p];
+ zio_t *pio;
+
+ if (zio->io_error)
+ return;
+
+ ddt_enter(ddt);
+
+ ASSERT(dde->dde_lead_zio[p] == zio);
+
+ ddt_phys_fill(ddp, zio->io_bp);
+
+ while ((pio = zio_walk_parents(zio)) != NULL)
+ ddt_bp_fill(ddp, pio->io_bp, zio->io_txg);
+
+ ddt_exit(ddt);
+ }
+
+ static void
+ zio_ddt_child_write_done(zio_t *zio)
+ {
+ int p = zio->io_prop.zp_copies;
+ ddt_t *ddt = ddt_select(zio->io_spa, zio->io_bp);
+ ddt_entry_t *dde = zio->io_private;
+ ddt_phys_t *ddp = &dde->dde_phys[p];
+
+ ddt_enter(ddt);
+
+ ASSERT(ddp->ddp_refcnt == 0);
+ ASSERT(dde->dde_lead_zio[p] == zio);
+ dde->dde_lead_zio[p] = NULL;
+
+ if (zio->io_error == 0) {
+ while (zio_walk_parents(zio) != NULL)
+ ddt_phys_addref(ddp);
+ } else {
+ ddt_phys_clear(ddp);
+ }
+
+ ddt_exit(ddt);
+ }
+
+ static void
+ zio_ddt_ditto_write_done(zio_t *zio)
+ {
+ int p = DDT_PHYS_DITTO;
+ zio_prop_t *zp = &zio->io_prop;
+ blkptr_t *bp = zio->io_bp;
+ ddt_t *ddt = ddt_select(zio->io_spa, bp);
+ ddt_entry_t *dde = zio->io_private;
+ ddt_phys_t *ddp = &dde->dde_phys[p];
+ ddt_key_t *ddk = &dde->dde_key;
+
+ ddt_enter(ddt);
+
+ ASSERT(ddp->ddp_refcnt == 0);
+ ASSERT(dde->dde_lead_zio[p] == zio);
+ dde->dde_lead_zio[p] = NULL;
+
+ if (zio->io_error == 0) {
+ ASSERT(ZIO_CHECKSUM_EQUAL(bp->blk_cksum, ddk->ddk_cksum));
+ ASSERT(zp->zp_copies < SPA_DVAS_PER_BP);
+ ASSERT(zp->zp_copies == BP_GET_NDVAS(bp) - BP_IS_GANG(bp));
+ if (ddp->ddp_phys_birth != 0)
+ ddt_phys_free(ddt, ddk, ddp, zio->io_txg);
+ ddt_phys_fill(ddp, bp);
+ }
+
+ ddt_exit(ddt);
+ }
+
+ static int
+ zio_ddt_write(zio_t *zio)
+ {
+ spa_t *spa = zio->io_spa;
+ blkptr_t *bp = zio->io_bp;
+ uint64_t txg = zio->io_txg;
+ zio_prop_t *zp = &zio->io_prop;
+ int p = zp->zp_copies;
+ int ditto_copies;
+ zio_t *cio = NULL;
+ zio_t *dio = NULL;
+ ddt_t *ddt = ddt_select(spa, bp);
+ ddt_entry_t *dde;
+ ddt_phys_t *ddp;
+
+ ASSERT(BP_GET_DEDUP(bp));
+ ASSERT(BP_GET_CHECKSUM(bp) == zp->zp_checksum);
+ ASSERT(BP_IS_HOLE(bp) || zio->io_bp_override);
+
+ ddt_enter(ddt);
+ dde = ddt_lookup(ddt, bp, B_TRUE);
+ ddp = &dde->dde_phys[p];
+
+ if (zp->zp_dedup_verify && zio_ddt_collision(zio, ddt, dde)) {
+ /*
+ * If we're using a weak checksum, upgrade to a strong checksum
+ * and try again. If we're already using a strong checksum,
+ * we can't resolve it, so just convert to an ordinary write.
+ * (And automatically e-mail a paper to Nature?)
+ */
+ if (!zio_checksum_table[zp->zp_checksum].ci_dedup) {
+ zp->zp_checksum = spa_dedup_checksum(spa);
+ zio_pop_transforms(zio);
+ zio->io_stage = ZIO_STAGE_OPEN;
+ BP_ZERO(bp);
+ } else {
+ zp->zp_dedup = 0;
+ }
+ zio->io_pipeline = ZIO_WRITE_PIPELINE;
+ ddt_exit(ddt);
+ return (ZIO_PIPELINE_CONTINUE);
+ }
+
+ ditto_copies = ddt_ditto_copies_needed(ddt, dde, ddp);
+ ASSERT(ditto_copies < SPA_DVAS_PER_BP);
+
+ if (ditto_copies > ddt_ditto_copies_present(dde) &&
+ dde->dde_lead_zio[DDT_PHYS_DITTO] == NULL) {
+ zio_prop_t czp = *zp;
+
+ czp.zp_copies = ditto_copies;
+
+ /*
+ * If we arrived here with an override bp, we won't have run
+ * the transform stack, so we won't have the data we need to
+ * generate a child i/o. So, toss the override bp and restart.
+ * This is safe, because using the override bp is just an
+ * optimization; and it's rare, so the cost doesn't matter.
+ */
+ if (zio->io_bp_override) {
+ zio_pop_transforms(zio);
+ zio->io_stage = ZIO_STAGE_OPEN;
+ zio->io_pipeline = ZIO_WRITE_PIPELINE;
+ zio->io_bp_override = NULL;
+ BP_ZERO(bp);
+ ddt_exit(ddt);
+ return (ZIO_PIPELINE_CONTINUE);
+ }
+
+ dio = zio_write(zio, spa, txg, bp, zio->io_orig_data,
+ zio->io_orig_size, &czp, NULL,
+ zio_ddt_ditto_write_done, dde, zio->io_priority,
+ ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark);
+
+ zio_push_transform(dio, zio->io_data, zio->io_size, 0, NULL);
+ dde->dde_lead_zio[DDT_PHYS_DITTO] = dio;
+ }
+
+ if (ddp->ddp_phys_birth != 0 || dde->dde_lead_zio[p] != NULL) {
+ if (ddp->ddp_phys_birth != 0)
+ ddt_bp_fill(ddp, bp, txg);
+ if (dde->dde_lead_zio[p] != NULL)
+ zio_add_child(zio, dde->dde_lead_zio[p]);
+ else
+ ddt_phys_addref(ddp);
+ } else if (zio->io_bp_override) {
+ ASSERT(bp->blk_birth == txg);
+ ASSERT(BP_EQUAL(bp, zio->io_bp_override));
+ ddt_phys_fill(ddp, bp);
+ ddt_phys_addref(ddp);
+ } else {
+ cio = zio_write(zio, spa, txg, bp, zio->io_orig_data,
+ zio->io_orig_size, zp, zio_ddt_child_write_ready,
+ zio_ddt_child_write_done, dde, zio->io_priority,
+ ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark);
+
+ zio_push_transform(cio, zio->io_data, zio->io_size, 0, NULL);
+ dde->dde_lead_zio[p] = cio;
+ }
+
+ ddt_exit(ddt);
+
+ if (cio)
+ zio_nowait(cio);
+ if (dio)
+ zio_nowait(dio);
+
+ return (ZIO_PIPELINE_CONTINUE);
+ }
+
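Roughly, zio_ddt_write() above resolves into four outcomes, plus an optional ditto child when ddt_ditto_copies_needed() says the entry has become popular enough to deserve extra physical copies. A hedged distillation of the dispatch (hypothetical helper; the strings stand in for the actions taken):

	static const char *
	example_ddt_write_case(const ddt_entry_t *dde, const ddt_phys_t *ddp,
	    int p, boolean_t have_override)
	{
		if (ddp->ddp_phys_birth != 0)
			return ("reuse: fill bp from ddp, take a reference");
		if (dde->dde_lead_zio[p] != NULL)
			return ("wait: become a child of the lead zio");
		if (have_override)
			return ("adopt: override bp matches, take a reference");
		return ("write: issue the lead child write for this entry");
	}
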
+ ddt_entry_t *freedde; /* for debugging */
+
+ static int
+ zio_ddt_free(zio_t *zio)
+ {
+ spa_t *spa = zio->io_spa;
+ blkptr_t *bp = zio->io_bp;
+ ddt_t *ddt = ddt_select(spa, bp);
+ ddt_entry_t *dde;
+ ddt_phys_t *ddp;
+
+ ASSERT(BP_GET_DEDUP(bp));
+ ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
+
+ ddt_enter(ddt);
+ freedde = dde = ddt_lookup(ddt, bp, B_TRUE);
+ ddp = ddt_phys_select(dde, bp);
+ ddt_phys_decref(ddp);
+ ddt_exit(ddt);
+
+ return (ZIO_PIPELINE_CONTINUE);
+ }
+
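Note that freeing a dedup'd block here is only a reference drop; no space comes back immediately. The physical blocks are reclaimed later, when ddt_sync() processes entries whose phys refcounts have fallen to zero. A hedged sketch of the invariant:

	static boolean_t
	example_ddt_phys_still_live(const ddt_phys_t *ddp)
	{
		/* the block must survive until its last referent is freed */
		return (ddp->ddp_refcnt > 0);
	}
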
+ /*
+ * ==========================================================================
+ * Allocate and free blocks
+ * ==========================================================================
+ */
static int
zio_dva_allocate(zio_t *zio)
{
static void
zio_dva_unallocate(zio_t *zio, zio_gang_node_t *gn, blkptr_t *bp)
{
- spa_t *spa = zio->io_spa;
- boolean_t now = !(zio->io_flags & ZIO_FLAG_IO_REWRITE);
++ int g;
-
ASSERT(bp->blk_birth == zio->io_txg || BP_IS_HOLE(bp));
-
- if (zio->io_bp == bp && !now) {
- /*
- * This is a rewrite for sync-to-convergence.
- * We can't do a metaslab_free(NOW) because bp wasn't allocated
- * during this sync pass, which means that metaslab_sync()
- * already committed the allocation.
- */
- ASSERT(DVA_EQUAL(BP_IDENTITY(bp),
- BP_IDENTITY(&zio->io_bp_orig)));
- ASSERT(spa_sync_pass(spa) > 1);
-
- if (BP_IS_GANG(bp) && gn == NULL) {
- /*
- * This is a gang leader whose gang header(s) we
- * couldn't read now, so defer the free until later.
- * The block should still be intact because without
- * the headers, we'd never even start the rewrite.
- */
- bplist_enqueue_deferred(&spa->spa_sync_bplist, bp);
- return;
- }
- }
+ ASSERT(zio->io_bp_override == NULL);
if (!BP_IS_HOLE(bp))
- metaslab_free(spa, bp, bp->blk_birth, now);
+ metaslab_free(zio->io_spa, bp, bp->blk_birth, B_TRUE);
if (gn != NULL) {
- for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) {
+ for (g = 0; g < SPA_GBH_NBLKPTRS; g++) {
zio_dva_unallocate(zio, gn->gn_child[g],
&gn->gn_gbh->zg_blkptr[g]);
}