From: Brian Behlendorf Date: Fri, 28 May 2010 21:19:22 +0000 (-0700) Subject: Merge commit 'refs/top-bases/gcc-c90' into gcc-c90 X-Git-Tag: zfs-0.5.0~38^2^2~1^2^2~34^2~1^2^2~12^2 X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=957b7b41d48c05e0b9fbdd28a98744ea2298f5a4;p=zfs Merge commit 'refs/top-bases/gcc-c90' into gcc-c90 Conflicts: cmd/zdb/zdb.c cmd/ztest/ztest.c module/zfs/dbuf.c module/zfs/dsl_dataset.c module/zfs/dsl_scrub.c module/zfs/spa.c module/zfs/vdev.c module/zfs/zio.c --- 957b7b41d48c05e0b9fbdd28a98744ea2298f5a4 diff --cc cmd/zdb/zdb.c index 478781882,ff73072f8..202b5a619 --- a/cmd/zdb/zdb.c +++ b/cmd/zdb/zdb.c @@@ -546,6 -637,133 +637,134 @@@ dump_metaslabs(spa_t *spa } } + static void + dump_dde(const ddt_t *ddt, const ddt_entry_t *dde, uint64_t index) + { + const ddt_phys_t *ddp = dde->dde_phys; + const ddt_key_t *ddk = &dde->dde_key; + char *types[4] = { "ditto", "single", "double", "triple" }; + char blkbuf[BP_SPRINTF_LEN]; + blkptr_t blk; ++ int p; + - for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) { ++ for (p = 0; p < DDT_PHYS_TYPES; p++, ddp++) { + if (ddp->ddp_phys_birth == 0) + continue; + ddt_bp_create(ddt->ddt_checksum, ddk, ddp, &blk); + sprintf_blkptr(blkbuf, &blk); + (void) printf("index %llx refcnt %llu %s %s\n", + (u_longlong_t)index, (u_longlong_t)ddp->ddp_refcnt, + types[p], blkbuf); + } + } + + static void + dump_dedup_ratio(const ddt_stat_t *dds) + { + double rL, rP, rD, D, dedup, compress, copies; + + if (dds->dds_blocks == 0) + return; + + rL = (double)dds->dds_ref_lsize; + rP = (double)dds->dds_ref_psize; + rD = (double)dds->dds_ref_dsize; + D = (double)dds->dds_dsize; + + dedup = rD / D; + compress = rL / rP; + copies = rD / rP; + + (void) printf("dedup = %.2f, compress = %.2f, copies = %.2f, " + "dedup * compress / copies = %.2f\n\n", + dedup, compress, copies, dedup * compress / copies); + } + + static void + dump_ddt(ddt_t *ddt, enum ddt_type type, enum ddt_class class) + { + char name[DDT_NAMELEN]; + 
ddt_entry_t dde; + uint64_t walk = 0; + dmu_object_info_t doi; + uint64_t count, dspace, mspace; + int error; + + error = ddt_object_info(ddt, type, class, &doi); + + if (error == ENOENT) + return; + ASSERT(error == 0); + + count = ddt_object_count(ddt, type, class); + dspace = doi.doi_physical_blocks_512 << 9; + mspace = doi.doi_fill_count * doi.doi_data_block_size; + + ASSERT(count != 0); /* we should have destroyed it */ + + ddt_object_name(ddt, type, class, name); + + (void) printf("%s: %llu entries, size %llu on disk, %llu in core\n", + name, + (u_longlong_t)count, + (u_longlong_t)(dspace / count), + (u_longlong_t)(mspace / count)); + + if (dump_opt['D'] < 3) + return; + + zpool_dump_ddt(NULL, &ddt->ddt_histogram[type][class]); + + if (dump_opt['D'] < 4) + return; + + if (dump_opt['D'] < 5 && class == DDT_CLASS_UNIQUE) + return; + + (void) printf("%s contents:\n\n", name); + + while ((error = ddt_object_walk(ddt, type, class, &walk, &dde)) == 0) + dump_dde(ddt, &dde, walk); + + ASSERT(error == ENOENT); + + (void) printf("\n"); + } + + static void + dump_all_ddts(spa_t *spa) + { + ddt_histogram_t ddh_total = { 0 }; + ddt_stat_t dds_total = { 0 }; + + for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) { + ddt_t *ddt = spa->spa_ddt[c]; + for (enum ddt_type type = 0; type < DDT_TYPES; type++) { + for (enum ddt_class class = 0; class < DDT_CLASSES; + class++) { + dump_ddt(ddt, type, class); + } + } + } + + ddt_get_dedup_stats(spa, &dds_total); + + if (dds_total.dds_blocks == 0) { + (void) printf("All DDTs are empty\n"); + return; + } + + (void) printf("\n"); + + if (dump_opt['D'] > 1) { + (void) printf("DDT histogram (aggregated over all DDTs):\n"); + ddt_get_dedup_histogram(spa, &ddh_total); + zpool_dump_ddt(&dds_total, &ddh_total); + } + + dump_dedup_ratio(&dds_total); + } + static void dump_dtl_seg(space_map_t *sm, uint64_t start, uint64_t size) { @@@ -565,9 -783,8 +784,9 @@@ dump_dtl(vdev_t *vd, int indent boolean_t required; char *name[DTL_TYPES] = 
{ "missing", "partial", "scrub", "outage" }; char prefix[256]; + int c, t; - spa_vdev_state_enter(spa); + spa_vdev_state_enter(spa, SCL_NONE); required = vdev_dtl_required(vd); (void) spa_vdev_state_exit(spa, NULL, 0); @@@ -597,6 -814,68 +816,69 @@@ dump_dtl(vd->vdev_child[c], indent + 4); } + static void + dump_history(spa_t *spa) + { + nvlist_t **events = NULL; + char buf[SPA_MAXBLOCKSIZE]; + uint64_t resid, len, off = 0; + uint_t num = 0; + int error; + time_t tsec; + struct tm t; + char tbuf[30]; + char internalstr[MAXPATHLEN]; ++ int i; + + do { + len = sizeof (buf); + + if ((error = spa_history_get(spa, &off, &len, buf)) != 0) { + (void) fprintf(stderr, "Unable to read history: " + "error %d\n", error); + return; + } + + if (zpool_history_unpack(buf, len, &resid, &events, &num) != 0) + break; + + off -= resid; + } while (len != 0); + + (void) printf("\nHistory:\n"); - for (int i = 0; i < num; i++) { ++ for (i = 0; i < num; i++) { + uint64_t time, txg, ievent; + char *cmd, *intstr; + + if (nvlist_lookup_uint64(events[i], ZPOOL_HIST_TIME, + &time) != 0) + continue; + if (nvlist_lookup_string(events[i], ZPOOL_HIST_CMD, + &cmd) != 0) { + if (nvlist_lookup_uint64(events[i], + ZPOOL_HIST_INT_EVENT, &ievent) != 0) + continue; + verify(nvlist_lookup_uint64(events[i], + ZPOOL_HIST_TXG, &txg) == 0); + verify(nvlist_lookup_string(events[i], + ZPOOL_HIST_INT_STR, &intstr) == 0); + if (ievent >= LOG_END) + continue; + + (void) snprintf(internalstr, + sizeof (internalstr), + "[internal %s txg:%lld] %s", + zfs_history_event_names[ievent], txg, + intstr); + cmd = internalstr; + } + tsec = time; + (void) localtime_r(&tsec, &t); + (void) strftime(tbuf, sizeof (tbuf), "%F.%T", &t); + (void) printf("%s %s\n", tbuf, cmd); + } + } + /*ARGSUSED*/ static void dump_dnode(objset_t *os, uint64_t object, void *data, size_t size) @@@ -614,15 -900,19 +903,20 @@@ blkid2offset(const dnode_phys_t *dnp, c } static void - sprintf_blkptr_compact(char *blkbuf, blkptr_t *bp, int alldvas) + 
sprintf_blkptr_compact(char *blkbuf, const blkptr_t *bp) { - dva_t *dva = bp->blk_dva; - int ndvas = alldvas ? BP_GET_NDVAS(bp) : 1; + const dva_t *dva = bp->blk_dva; + int ndvas = dump_opt['d'] > 5 ? BP_GET_NDVAS(bp) : 1; + int i; + if (dump_opt['b'] >= 5) { + sprintf_blkptr(blkbuf, bp); + return; + } + blkbuf[0] = '\0'; - for (int i = 0; i < ndvas; i++) + for (i = 0; i < ndvas; i++) (void) sprintf(blkbuf + strlen(blkbuf), "%llu:%llx:%llx ", (u_longlong_t)DVA_GET_VDEV(&dva[i]), (u_longlong_t)DVA_GET_OFFSET(&dva[i]), @@@ -1344,19 -1739,52 +1743,54 @@@ dump_cachefile(const char *cachefile nvlist_free(config); } + #define ZDB_MAX_UB_HEADER_SIZE 32 + + static void + dump_label_uberblocks(vdev_label_t *lbl, uint64_t ashift) + { + vdev_t vd; + vdev_t *vdp = &vd; + char header[ZDB_MAX_UB_HEADER_SIZE]; ++ int i; + + vd.vdev_ashift = ashift; + vdp->vdev_top = vdp; + - for (int i = 0; i < VDEV_UBERBLOCK_COUNT(vdp); i++) { ++ for (i = 0; i < VDEV_UBERBLOCK_COUNT(vdp); i++) { + uint64_t uoff = VDEV_UBERBLOCK_OFFSET(vdp, i); + uberblock_t *ub = (void *)((char *)lbl + uoff); + + if (uberblock_verify(ub)) + continue; + (void) snprintf(header, ZDB_MAX_UB_HEADER_SIZE, + "Uberblock[%d]\n", i); + dump_uberblock(ub, header, ""); + } + } + static void dump_label(const char *dev) { int fd; vdev_label_t label; - char *buf = label.vl_vdev_phys.vp_nvlist; + char *path, *buf = label.vl_vdev_phys.vp_nvlist; size_t buflen = sizeof (label.vl_vdev_phys.vp_nvlist); struct stat64 statbuf; - uint64_t psize; + uint64_t psize, ashift; + int len = strlen(dev) + 1; + int l; - if ((fd = open64(dev, O_RDONLY)) < 0) { - (void) printf("cannot open '%s': %s\n", dev, strerror(errno)); + if (strncmp(dev, "/dev/dsk/", 9) == 0) { + len++; + path = malloc(len); + (void) snprintf(path, len, "%s%s", "/dev/rdsk/", dev + 9); + } else { + path = strdup(dev); + } + + if ((fd = open64(path, O_RDONLY)) < 0) { + (void) printf("cannot open '%s': %s\n", path, strerror(errno)); + free(path); exit(1); } @@@ -1369,8 -1807,7 
+1813,7 @@@ psize = statbuf.st_size; psize = P2ALIGN(psize, (uint64_t)sizeof (vdev_label_t)); - for (int l = 0; l < VDEV_LABELS; l++) { + for (l = 0; l < VDEV_LABELS; l++) { - nvlist_t *config = NULL; (void) printf("--------------------------------------------\n"); @@@ -1507,13 -1897,19 +1903,20 @@@ typedef struct zdb_cb } zdb_cb_t; static void - zdb_count_block(spa_t *spa, zdb_cb_t *zcb, blkptr_t *bp, dmu_object_type_t type) + zdb_count_block(zdb_cb_t *zcb, zilog_t *zilog, const blkptr_t *bp, + dmu_object_type_t type) { + uint64_t refcnt = 0; + int i; + ASSERT(type < ZDB_OT_TOTAL); + + if (zilog && zil_bp_tree_add(zilog, bp) != 0) + return; + - for (int i = 0; i < 4; i++) { + for (i = 0; i < 4; i++) { int l = (i < 2) ? BP_GET_LEVEL(bp) : ZB_TOTAL; - int t = (i & 1) ? type : DMU_OT_TOTAL; + int t = (i & 1) ? type : ZDB_OT_TOTAL; zdb_blkstats_t *zb = &zcb->zcb_type[l][t]; zb->zb_asize += BP_GET_ASIZE(bp); @@@ -1625,24 -2017,159 +2024,164 @@@ zdb_blkptr_cb(spa_t *spa, zilog_t *zilo return (0); } + static void + zdb_leak(space_map_t *sm, uint64_t start, uint64_t size) + { + vdev_t *vd = sm->sm_ppd; + + (void) printf("leaked space: vdev %llu, offset 0x%llx, size %llu\n", + (u_longlong_t)vd->vdev_id, (u_longlong_t)start, (u_longlong_t)size); + } + + /* ARGSUSED */ + static void + zdb_space_map_load(space_map_t *sm) + { + } + + static void + zdb_space_map_unload(space_map_t *sm) + { + space_map_vacate(sm, zdb_leak, sm); + } + + /* ARGSUSED */ + static void + zdb_space_map_claim(space_map_t *sm, uint64_t start, uint64_t size) + { + } + + static space_map_ops_t zdb_space_map_ops = { + zdb_space_map_load, + zdb_space_map_unload, + NULL, /* alloc */ + zdb_space_map_claim, + NULL, /* free */ + NULL /* maxsize */ + }; + + static void + zdb_ddt_leak_init(spa_t *spa, zdb_cb_t *zcb) + { + ddt_bookmark_t ddb = { 0 }; + ddt_entry_t dde; + int error; ++ int p; + + while ((error = ddt_walk(spa, &ddb, &dde)) == 0) { + blkptr_t blk; + ddt_phys_t *ddp = dde.dde_phys; + + if 
(ddb.ddb_class == DDT_CLASS_UNIQUE) + return; + + ASSERT(ddt_phys_total_refcnt(&dde) > 1); + - for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) { ++ for (p = 0; p < DDT_PHYS_TYPES; p++, ddp++) { + if (ddp->ddp_phys_birth == 0) + continue; + ddt_bp_create(ddb.ddb_checksum, + &dde.dde_key, ddp, &blk); + if (p == DDT_PHYS_DITTO) { + zdb_count_block(zcb, NULL, &blk, ZDB_OT_DITTO); + } else { + zcb->zcb_dedup_asize += + BP_GET_ASIZE(&blk) * (ddp->ddp_refcnt - 1); + zcb->zcb_dedup_blocks++; + } + } + if (!dump_opt['L']) { + ddt_t *ddt = spa->spa_ddt[ddb.ddb_checksum]; + ddt_enter(ddt); + VERIFY(ddt_lookup(ddt, &blk, B_TRUE) != NULL); + ddt_exit(ddt); + } + } + + ASSERT(error == ENOENT); + } + + static void + zdb_leak_init(spa_t *spa, zdb_cb_t *zcb) + { + zcb->zcb_spa = spa; ++ int c, m; + + if (!dump_opt['L']) { + vdev_t *rvd = spa->spa_root_vdev; - for (int c = 0; c < rvd->vdev_children; c++) { ++ for (c = 0; c < rvd->vdev_children; c++) { + vdev_t *vd = rvd->vdev_child[c]; - for (int m = 0; m < vd->vdev_ms_count; m++) { ++ for (m = 0; m < vd->vdev_ms_count; m++) { + metaslab_t *msp = vd->vdev_ms[m]; + mutex_enter(&msp->ms_lock); + space_map_unload(&msp->ms_map); + VERIFY(space_map_load(&msp->ms_map, + &zdb_space_map_ops, SM_ALLOC, &msp->ms_smo, + spa->spa_meta_objset) == 0); + msp->ms_map.sm_ppd = vd; + mutex_exit(&msp->ms_lock); + } + } + } + + spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); + + zdb_ddt_leak_init(spa, zcb); + + spa_config_exit(spa, SCL_CONFIG, FTAG); + } + + static void + zdb_leak_fini(spa_t *spa) + { ++ int c, m; ++ + if (!dump_opt['L']) { + vdev_t *rvd = spa->spa_root_vdev; - for (int c = 0; c < rvd->vdev_children; c++) { ++ for (c = 0; c < rvd->vdev_children; c++) { + vdev_t *vd = rvd->vdev_child[c]; - for (int m = 0; m < vd->vdev_ms_count; m++) { ++ for (m = 0; m < vd->vdev_ms_count; m++) { + metaslab_t *msp = vd->vdev_ms[m]; + mutex_enter(&msp->ms_lock); + space_map_unload(&msp->ms_map); + mutex_exit(&msp->ms_lock); + } + } + } + } + + /* 
ARGSUSED */ + static int + count_block_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) + { + zdb_cb_t *zcb = arg; + + if (dump_opt['b'] >= 4) { + char blkbuf[BP_SPRINTF_LEN]; + sprintf_blkptr(blkbuf, bp); + (void) printf("[%s] %s\n", + "deferred free", blkbuf); + } + zdb_count_block(zcb, NULL, bp, ZDB_OT_DEFERRED); + return (0); + } + static int dump_block_stats(spa_t *spa) { zdb_cb_t zcb = { 0 }; zdb_blkstats_t *zb, *tzb; - uint64_t alloc, space, logalloc; - vdev_t *rvd = spa->spa_root_vdev; + uint64_t norm_alloc, norm_space, total_alloc, total_found; + int flags = TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA | TRAVERSE_HARD; int leaks = 0; - int c, e; ++ int e; - if (!dump_opt['S']) { - (void) printf("\nTraversing all blocks %s%s%s%s%s...\n", - (dump_opt['c'] || !dump_opt['L']) ? "to verify " : "", - (dump_opt['c'] == 1) ? "metadata " : "", - dump_opt['c'] ? "checksums " : "", - (dump_opt['c'] && !dump_opt['L']) ? "and verify " : "", - !dump_opt['L'] ? "nothing leaked " : ""); - } + (void) printf("\nTraversing all blocks %s%s%s%s%s...\n", + (dump_opt['c'] || !dump_opt['L']) ? "to verify " : "", + (dump_opt['c'] == 1) ? "metadata " : "", + dump_opt['c'] ? "checksums " : "", + (dump_opt['c'] && !dump_opt['L']) ? "and verify " : "", + !dump_opt['L'] ? "nothing leaked " : ""); /* * Load all space maps as SM_ALLOC maps, then traverse the pool @@@ -1658,33 -2184,20 +2196,20 @@@ /* * If there's a deferred-free bplist, process that first. 
*/ - if (spa->spa_sync_bplist_obj != 0) { - bplist_t *bpl = &spa->spa_sync_bplist; - blkptr_t blk; - uint64_t itor = 0; + (void) bpobj_iterate_nofree(&spa->spa_deferred_bpobj, + count_block_cb, &zcb, NULL); + (void) bpobj_iterate_nofree(&spa->spa_dsl_pool->dp_free_bpobj, + count_block_cb, &zcb, NULL); - VERIFY(0 == bplist_open(bpl, spa->spa_meta_objset, - spa->spa_sync_bplist_obj)); + if (dump_opt['c'] > 1) + flags |= TRAVERSE_PREFETCH_DATA; - while (bplist_iterate(bpl, &itor, &blk) == 0) { - if (dump_opt['b'] >= 4) { - char blkbuf[BP_SPRINTF_LEN]; - sprintf_blkptr(blkbuf, BP_SPRINTF_LEN, &blk); - (void) printf("[%s] %s\n", - "deferred free", blkbuf); - } - zdb_count_block(spa, &zcb, &blk, DMU_OT_DEFERRED); - } - - bplist_close(bpl); - } - - zcb.zcb_haderrors |= traverse_pool(spa, zdb_blkptr_cb, &zcb); + zcb.zcb_haderrors |= traverse_pool(spa, 0, flags, zdb_blkptr_cb, &zcb); - if (zcb.zcb_haderrors && !dump_opt['S']) { + if (zcb.zcb_haderrors) { (void) printf("\nError counts:\n\n"); (void) printf("\t%5s %s\n", "errno", "count"); - for (int e = 0; e < 256; e++) { + for (e = 0; e < 256; e++) { if (zcb.zcb_errors[e] != 0) { (void) printf("\t%5d %llu\n", e, (u_longlong_t)zcb.zcb_errors[e]); diff --cc cmd/ztest/ztest.c index 5ce765418,eed92ec72..bdfde21bb --- a/cmd/ztest/ztest.c +++ b/cmd/ztest/ztest.c @@@ -763,152 -871,1365 +871,1372 @@@ ztest_spa_prop_set_uint64(ztest_shared_ return (error); } - zil_replay_func_t *ztest_replay_vector[TX_MAX_TYPE] = { - NULL, /* 0 no such transaction type */ - ztest_replay_create, /* TX_CREATE */ - NULL, /* TX_MKDIR */ - NULL, /* TX_MKXATTR */ - NULL, /* TX_SYMLINK */ - ztest_replay_remove, /* TX_REMOVE */ - NULL, /* TX_RMDIR */ - NULL, /* TX_LINK */ - NULL, /* TX_RENAME */ - NULL, /* TX_WRITE */ - NULL, /* TX_TRUNCATE */ - NULL, /* TX_SETATTR */ - NULL, /* TX_ACL */ - }; + static void + ztest_rll_init(rll_t *rll) + { + rll->rll_writer = NULL; + rll->rll_readers = 0; + VERIFY(_mutex_init(&rll->rll_lock, USYNC_THREAD, NULL) == 0); + 
VERIFY(cond_init(&rll->rll_cv, USYNC_THREAD, NULL) == 0); + } - /* - * Verify that we can't destroy an active pool, create an existing pool, - * or create a pool with a bad vdev spec. - */ - void - ztest_spa_create_destroy(ztest_args_t *za) + static void + ztest_rll_destroy(rll_t *rll) { - int error; - spa_t *spa; - nvlist_t *nvroot; + ASSERT(rll->rll_writer == NULL); + ASSERT(rll->rll_readers == 0); + VERIFY(_mutex_destroy(&rll->rll_lock) == 0); + VERIFY(cond_destroy(&rll->rll_cv) == 0); + } - /* - * Attempt to create using a bad file. - */ - nvroot = make_vdev_root("/dev/bogus", NULL, 0, 0, 0, 0, 0, 1); - error = spa_create("ztest_bad_file", nvroot, NULL, NULL, NULL); - nvlist_free(nvroot); - if (error != ENOENT) - fatal(0, "spa_create(bad_file) = %d", error); + static void + ztest_rll_lock(rll_t *rll, rl_type_t type) + { + VERIFY(mutex_lock(&rll->rll_lock) == 0); - /* - * Attempt to create using a bad mirror. - */ - nvroot = make_vdev_root("/dev/bogus", NULL, 0, 0, 0, 0, 2, 1); - error = spa_create("ztest_bad_mirror", nvroot, NULL, NULL, NULL); - nvlist_free(nvroot); - if (error != ENOENT) - fatal(0, "spa_create(bad_mirror) = %d", error); + if (type == RL_READER) { + while (rll->rll_writer != NULL) + (void) cond_wait(&rll->rll_cv, &rll->rll_lock); + rll->rll_readers++; + } else { + while (rll->rll_writer != NULL || rll->rll_readers) + (void) cond_wait(&rll->rll_cv, &rll->rll_lock); + rll->rll_writer = curthread; + } - /* - * Attempt to create an existing pool. It shouldn't matter - * what's in the nvroot; we should fail with EEXIST. 
- */ - (void) rw_rdlock(&ztest_shared->zs_name_lock); - nvroot = make_vdev_root("/dev/bogus", NULL, 0, 0, 0, 0, 0, 1); - error = spa_create(za->za_pool, nvroot, NULL, NULL, NULL); - nvlist_free(nvroot); - if (error != EEXIST) - fatal(0, "spa_create(whatever) = %d", error); + VERIFY(mutex_unlock(&rll->rll_lock) == 0); + } - error = spa_open(za->za_pool, &spa, FTAG); - if (error) - fatal(0, "spa_open() = %d", error); + static void + ztest_rll_unlock(rll_t *rll) + { + VERIFY(mutex_lock(&rll->rll_lock) == 0); - error = spa_destroy(za->za_pool); - if (error != EBUSY) - fatal(0, "spa_destroy() = %d", error); + if (rll->rll_writer) { + ASSERT(rll->rll_readers == 0); + rll->rll_writer = NULL; + } else { + ASSERT(rll->rll_readers != 0); + ASSERT(rll->rll_writer == NULL); + rll->rll_readers--; + } - spa_close(spa, FTAG); - (void) rw_unlock(&ztest_shared->zs_name_lock); + if (rll->rll_writer == NULL && rll->rll_readers == 0) + VERIFY(cond_broadcast(&rll->rll_cv) == 0); + + VERIFY(mutex_unlock(&rll->rll_lock) == 0); } - static vdev_t * - vdev_lookup_by_path(vdev_t *vd, const char *path) + static void + ztest_object_lock(ztest_ds_t *zd, uint64_t object, rl_type_t type) { - vdev_t *mvd; - int c; + rll_t *rll = &zd->zd_object_lock[object & (ZTEST_OBJECT_LOCKS - 1)]; - if (vd->vdev_path != NULL && strcmp(path, vd->vdev_path) == 0) - return (vd); + ztest_rll_lock(rll, type); + } - for (c = 0; c < vd->vdev_children; c++) - if ((mvd = vdev_lookup_by_path(vd->vdev_child[c], path)) != - NULL) - return (mvd); + static void + ztest_object_unlock(ztest_ds_t *zd, uint64_t object) + { + rll_t *rll = &zd->zd_object_lock[object & (ZTEST_OBJECT_LOCKS - 1)]; - return (NULL); + ztest_rll_unlock(rll); } - /* - * Verify that vdev_add() works as expected. 
- */ - void - ztest_vdev_add_remove(ztest_args_t *za) + static rl_t * + ztest_range_lock(ztest_ds_t *zd, uint64_t object, uint64_t offset, + uint64_t size, rl_type_t type) { - spa_t *spa = za->za_spa; - uint64_t leaves = MAX(zopt_mirrors, 1) * zopt_raidz; - nvlist_t *nvroot; - int error; + uint64_t hash = object ^ (offset % (ZTEST_RANGE_LOCKS + 1)); + rll_t *rll = &zd->zd_range_lock[hash & (ZTEST_RANGE_LOCKS - 1)]; + rl_t *rl; - (void) mutex_lock(&ztest_shared->zs_vdev_lock); + rl = umem_alloc(sizeof (*rl), UMEM_NOFAIL); + rl->rl_object = object; + rl->rl_offset = offset; + rl->rl_size = size; + rl->rl_lock = rll; - spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); + ztest_rll_lock(rll, type); - ztest_shared->zs_vdev_primaries = - spa->spa_root_vdev->vdev_children * leaves; + return (rl); + } - spa_config_exit(spa, SCL_VDEV, FTAG); + static void + ztest_range_unlock(rl_t *rl) + { + rll_t *rll = rl->rl_lock; - /* - * Make 1/4 of the devices be log devices. - */ - nvroot = make_vdev_root(NULL, NULL, zopt_vdev_size, 0, - ztest_random(4) == 0, zopt_raidz, zopt_mirrors, 1); + ztest_rll_unlock(rll); - error = spa_vdev_add(spa, nvroot); - nvlist_free(nvroot); + umem_free(rl, sizeof (*rl)); + } + + static void + ztest_zd_init(ztest_ds_t *zd, objset_t *os) + { + zd->zd_os = os; + zd->zd_zilog = dmu_objset_zil(os); + zd->zd_seq = 0; + dmu_objset_name(os, zd->zd_name); ++ int l; - (void) mutex_unlock(&ztest_shared->zs_vdev_lock); + VERIFY(_mutex_init(&zd->zd_dirobj_lock, USYNC_THREAD, NULL) == 0); - if (error == ENOSPC) - ztest_record_enospc("spa_vdev_add"); - else if (error != 0) - fatal(0, "spa_vdev_add() = %d", error); - for (int l = 0; l < ZTEST_OBJECT_LOCKS; l++) ++ for (l = 0; l < ZTEST_OBJECT_LOCKS; l++) + ztest_rll_init(&zd->zd_object_lock[l]); + - for (int l = 0; l < ZTEST_RANGE_LOCKS; l++) ++ for (l = 0; l < ZTEST_RANGE_LOCKS; l++) + ztest_rll_init(&zd->zd_range_lock[l]); } - /* - * Verify that adding/removing aux devices (l2arc, hot spare) works as expected. 
- */ - void - ztest_vdev_aux_add_remove(ztest_args_t *za) + static void + ztest_zd_fini(ztest_ds_t *zd) { - spa_t *spa = za->za_spa; - vdev_t *rvd = spa->spa_root_vdev; - spa_aux_vdev_t *sav; - char *aux; - uint64_t guid = 0; - int error; ++ int l; + - if (ztest_random(2) == 0) { - sav = &spa->spa_spares; - aux = ZPOOL_CONFIG_SPARES; - } else { - sav = &spa->spa_l2cache; - aux = ZPOOL_CONFIG_L2CACHE; - } + VERIFY(_mutex_destroy(&zd->zd_dirobj_lock) == 0); - (void) mutex_lock(&ztest_shared->zs_vdev_lock); - for (int l = 0; l < ZTEST_OBJECT_LOCKS; l++) ++ for (l = 0; l < ZTEST_OBJECT_LOCKS; l++) + ztest_rll_destroy(&zd->zd_object_lock[l]); - spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); - for (int l = 0; l < ZTEST_RANGE_LOCKS; l++) ++ for (l = 0; l < ZTEST_RANGE_LOCKS; l++) + ztest_rll_destroy(&zd->zd_range_lock[l]); + } + + #define TXG_MIGHTWAIT (ztest_random(10) == 0 ? TXG_NOWAIT : TXG_WAIT) + + static uint64_t + ztest_tx_assign(dmu_tx_t *tx, uint64_t txg_how, const char *tag) + { + uint64_t txg; + int error; + + /* + * Attempt to assign tx to some transaction group. 
+ */ + error = dmu_tx_assign(tx, txg_how); + if (error) { + if (error == ERESTART) { + ASSERT(txg_how == TXG_NOWAIT); + dmu_tx_wait(tx); + } else { + ASSERT3U(error, ==, ENOSPC); + ztest_record_enospc(tag); + } + dmu_tx_abort(tx); + return (0); + } + txg = dmu_tx_get_txg(tx); + ASSERT(txg != 0); + return (txg); + } + + static void + ztest_pattern_set(void *buf, uint64_t size, uint64_t value) + { + uint64_t *ip = buf; + uint64_t *ip_end = (uint64_t *)((uintptr_t)buf + (uintptr_t)size); + + while (ip < ip_end) + *ip++ = value; + } + + static boolean_t + ztest_pattern_match(void *buf, uint64_t size, uint64_t value) + { + uint64_t *ip = buf; + uint64_t *ip_end = (uint64_t *)((uintptr_t)buf + (uintptr_t)size); + uint64_t diff = 0; + + while (ip < ip_end) + diff |= (value - *ip++); + + return (diff == 0); + } + + static void + ztest_bt_generate(ztest_block_tag_t *bt, objset_t *os, uint64_t object, + uint64_t offset, uint64_t gen, uint64_t txg, uint64_t crtxg) + { + bt->bt_magic = BT_MAGIC; + bt->bt_objset = dmu_objset_id(os); + bt->bt_object = object; + bt->bt_offset = offset; + bt->bt_gen = gen; + bt->bt_txg = txg; + bt->bt_crtxg = crtxg; + } + + static void + ztest_bt_verify(ztest_block_tag_t *bt, objset_t *os, uint64_t object, + uint64_t offset, uint64_t gen, uint64_t txg, uint64_t crtxg) + { + ASSERT(bt->bt_magic == BT_MAGIC); + ASSERT(bt->bt_objset == dmu_objset_id(os)); + ASSERT(bt->bt_object == object); + ASSERT(bt->bt_offset == offset); + ASSERT(bt->bt_gen <= gen); + ASSERT(bt->bt_txg <= txg); + ASSERT(bt->bt_crtxg == crtxg); + } + + static ztest_block_tag_t * + ztest_bt_bonus(dmu_buf_t *db) + { + dmu_object_info_t doi; + ztest_block_tag_t *bt; + + dmu_object_info_from_db(db, &doi); + ASSERT3U(doi.doi_bonus_size, <=, db->db_size); + ASSERT3U(doi.doi_bonus_size, >=, sizeof (*bt)); + bt = (void *)((char *)db->db_data + doi.doi_bonus_size - sizeof (*bt)); + + return (bt); + } + + /* + * ZIL logging ops + */ + + #define lrz_type lr_mode + #define lrz_blocksize lr_uid 
+ #define lrz_ibshift lr_gid + #define lrz_bonustype lr_rdev + #define lrz_bonuslen lr_crtime[1] + + static uint64_t + ztest_log_create(ztest_ds_t *zd, dmu_tx_t *tx, lr_create_t *lr) + { + char *name = (void *)(lr + 1); /* name follows lr */ + size_t namesize = strlen(name) + 1; + itx_t *itx; + + if (zil_replaying(zd->zd_zilog, tx)) + return (0); + + itx = zil_itx_create(TX_CREATE, sizeof (*lr) + namesize); + bcopy(&lr->lr_common + 1, &itx->itx_lr + 1, + sizeof (*lr) + namesize - sizeof (lr_t)); + + return (zil_itx_assign(zd->zd_zilog, itx, tx)); + } + + static uint64_t + ztest_log_remove(ztest_ds_t *zd, dmu_tx_t *tx, lr_remove_t *lr) + { + char *name = (void *)(lr + 1); /* name follows lr */ + size_t namesize = strlen(name) + 1; + itx_t *itx; + + if (zil_replaying(zd->zd_zilog, tx)) + return (0); + + itx = zil_itx_create(TX_REMOVE, sizeof (*lr) + namesize); + bcopy(&lr->lr_common + 1, &itx->itx_lr + 1, + sizeof (*lr) + namesize - sizeof (lr_t)); + + return (zil_itx_assign(zd->zd_zilog, itx, tx)); + } + + static uint64_t + ztest_log_write(ztest_ds_t *zd, dmu_tx_t *tx, lr_write_t *lr) + { + itx_t *itx; + itx_wr_state_t write_state = ztest_random(WR_NUM_STATES); + + if (zil_replaying(zd->zd_zilog, tx)) + return (0); + + if (lr->lr_length > ZIL_MAX_LOG_DATA) + write_state = WR_INDIRECT; + + itx = zil_itx_create(TX_WRITE, + sizeof (*lr) + (write_state == WR_COPIED ? lr->lr_length : 0)); + + if (write_state == WR_COPIED && + dmu_read(zd->zd_os, lr->lr_foid, lr->lr_offset, lr->lr_length, + ((lr_write_t *)&itx->itx_lr) + 1, DMU_READ_NO_PREFETCH) != 0) { + zil_itx_destroy(itx); + itx = zil_itx_create(TX_WRITE, sizeof (*lr)); + write_state = WR_NEED_COPY; + } + itx->itx_private = zd; + itx->itx_wr_state = write_state; + itx->itx_sync = (ztest_random(8) == 0); + itx->itx_sod += (write_state == WR_NEED_COPY ? 
lr->lr_length : 0); + + bcopy(&lr->lr_common + 1, &itx->itx_lr + 1, + sizeof (*lr) - sizeof (lr_t)); + + return (zil_itx_assign(zd->zd_zilog, itx, tx)); + } + + static uint64_t + ztest_log_truncate(ztest_ds_t *zd, dmu_tx_t *tx, lr_truncate_t *lr) + { + itx_t *itx; + + if (zil_replaying(zd->zd_zilog, tx)) + return (0); + + itx = zil_itx_create(TX_TRUNCATE, sizeof (*lr)); + bcopy(&lr->lr_common + 1, &itx->itx_lr + 1, + sizeof (*lr) - sizeof (lr_t)); + + return (zil_itx_assign(zd->zd_zilog, itx, tx)); + } + + static uint64_t + ztest_log_setattr(ztest_ds_t *zd, dmu_tx_t *tx, lr_setattr_t *lr) + { + itx_t *itx; + + if (zil_replaying(zd->zd_zilog, tx)) + return (0); + + itx = zil_itx_create(TX_SETATTR, sizeof (*lr)); + bcopy(&lr->lr_common + 1, &itx->itx_lr + 1, + sizeof (*lr) - sizeof (lr_t)); + + return (zil_itx_assign(zd->zd_zilog, itx, tx)); + } + + /* + * ZIL replay ops + */ + static int + ztest_replay_create(ztest_ds_t *zd, lr_create_t *lr, boolean_t byteswap) + { + char *name = (void *)(lr + 1); /* name follows lr */ + objset_t *os = zd->zd_os; + ztest_block_tag_t *bbt; + dmu_buf_t *db; + dmu_tx_t *tx; + uint64_t txg; + int error = 0; + + if (byteswap) + byteswap_uint64_array(lr, sizeof (*lr)); + + ASSERT(lr->lr_doid == ZTEST_DIROBJ); + ASSERT(name[0] != '\0'); + + tx = dmu_tx_create(os); + + dmu_tx_hold_zap(tx, lr->lr_doid, B_TRUE, name); + + if (lr->lrz_type == DMU_OT_ZAP_OTHER) { + dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL); + } else { + dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT); + } + + txg = ztest_tx_assign(tx, TXG_WAIT, FTAG); + if (txg == 0) + return (ENOSPC); + + ASSERT(dmu_objset_zil(os)->zl_replay == !!lr->lr_foid); + + if (lr->lrz_type == DMU_OT_ZAP_OTHER) { + if (lr->lr_foid == 0) { + lr->lr_foid = zap_create(os, + lr->lrz_type, lr->lrz_bonustype, + lr->lrz_bonuslen, tx); + } else { + error = zap_create_claim(os, lr->lr_foid, + lr->lrz_type, lr->lrz_bonustype, + lr->lrz_bonuslen, tx); + } + } else { + if (lr->lr_foid == 0) { + lr->lr_foid = 
dmu_object_alloc(os, + lr->lrz_type, 0, lr->lrz_bonustype, + lr->lrz_bonuslen, tx); + } else { + error = dmu_object_claim(os, lr->lr_foid, + lr->lrz_type, 0, lr->lrz_bonustype, + lr->lrz_bonuslen, tx); + } + } + + if (error) { + ASSERT3U(error, ==, EEXIST); + ASSERT(zd->zd_zilog->zl_replay); + dmu_tx_commit(tx); + return (error); + } + + ASSERT(lr->lr_foid != 0); + + if (lr->lrz_type != DMU_OT_ZAP_OTHER) + VERIFY3U(0, ==, dmu_object_set_blocksize(os, lr->lr_foid, + lr->lrz_blocksize, lr->lrz_ibshift, tx)); + + VERIFY3U(0, ==, dmu_bonus_hold(os, lr->lr_foid, FTAG, &db)); + bbt = ztest_bt_bonus(db); + dmu_buf_will_dirty(db, tx); + ztest_bt_generate(bbt, os, lr->lr_foid, -1ULL, lr->lr_gen, txg, txg); + dmu_buf_rele(db, FTAG); + + VERIFY3U(0, ==, zap_add(os, lr->lr_doid, name, sizeof (uint64_t), 1, + &lr->lr_foid, tx)); + + (void) ztest_log_create(zd, tx, lr); + + dmu_tx_commit(tx); + + return (0); + } + + static int + ztest_replay_remove(ztest_ds_t *zd, lr_remove_t *lr, boolean_t byteswap) + { + char *name = (void *)(lr + 1); /* name follows lr */ + objset_t *os = zd->zd_os; + dmu_object_info_t doi; + dmu_tx_t *tx; + uint64_t object, txg; + + if (byteswap) + byteswap_uint64_array(lr, sizeof (*lr)); + + ASSERT(lr->lr_doid == ZTEST_DIROBJ); + ASSERT(name[0] != '\0'); + + VERIFY3U(0, ==, + zap_lookup(os, lr->lr_doid, name, sizeof (object), 1, &object)); + ASSERT(object != 0); + + ztest_object_lock(zd, object, RL_WRITER); + + VERIFY3U(0, ==, dmu_object_info(os, object, &doi)); + + tx = dmu_tx_create(os); + + dmu_tx_hold_zap(tx, lr->lr_doid, B_FALSE, name); + dmu_tx_hold_free(tx, object, 0, DMU_OBJECT_END); + + txg = ztest_tx_assign(tx, TXG_WAIT, FTAG); + if (txg == 0) { + ztest_object_unlock(zd, object); + return (ENOSPC); + } + + if (doi.doi_type == DMU_OT_ZAP_OTHER) { + VERIFY3U(0, ==, zap_destroy(os, object, tx)); + } else { + VERIFY3U(0, ==, dmu_object_free(os, object, tx)); + } + + VERIFY3U(0, ==, zap_remove(os, lr->lr_doid, name, tx)); + + (void) 
ztest_log_remove(zd, tx, lr); + + dmu_tx_commit(tx); + + ztest_object_unlock(zd, object); + + return (0); + } + + static int + ztest_replay_write(ztest_ds_t *zd, lr_write_t *lr, boolean_t byteswap) + { + objset_t *os = zd->zd_os; + void *data = lr + 1; /* data follows lr */ + uint64_t offset, length; + ztest_block_tag_t *bt = data; + ztest_block_tag_t *bbt; + uint64_t gen, txg, lrtxg, crtxg; + dmu_object_info_t doi; + dmu_tx_t *tx; + dmu_buf_t *db; + arc_buf_t *abuf = NULL; + rl_t *rl; + + if (byteswap) + byteswap_uint64_array(lr, sizeof (*lr)); + + offset = lr->lr_offset; + length = lr->lr_length; + + /* If it's a dmu_sync() block, write the whole block */ + if (lr->lr_common.lrc_reclen == sizeof (lr_write_t)) { + uint64_t blocksize = BP_GET_LSIZE(&lr->lr_blkptr); + if (length < blocksize) { + offset -= offset % blocksize; + length = blocksize; + } + } + + if (bt->bt_magic == BSWAP_64(BT_MAGIC)) + byteswap_uint64_array(bt, sizeof (*bt)); + + if (bt->bt_magic != BT_MAGIC) + bt = NULL; + + ztest_object_lock(zd, lr->lr_foid, RL_READER); + rl = ztest_range_lock(zd, lr->lr_foid, offset, length, RL_WRITER); + + VERIFY3U(0, ==, dmu_bonus_hold(os, lr->lr_foid, FTAG, &db)); + + dmu_object_info_from_db(db, &doi); + + bbt = ztest_bt_bonus(db); + ASSERT3U(bbt->bt_magic, ==, BT_MAGIC); + gen = bbt->bt_gen; + crtxg = bbt->bt_crtxg; + lrtxg = lr->lr_common.lrc_txg; + + tx = dmu_tx_create(os); + + dmu_tx_hold_write(tx, lr->lr_foid, offset, length); + + if (ztest_random(8) == 0 && length == doi.doi_data_block_size && + P2PHASE(offset, length) == 0) + abuf = dmu_request_arcbuf(db, length); + + txg = ztest_tx_assign(tx, TXG_WAIT, FTAG); + if (txg == 0) { + if (abuf != NULL) + dmu_return_arcbuf(abuf); + dmu_buf_rele(db, FTAG); + ztest_range_unlock(rl); + ztest_object_unlock(zd, lr->lr_foid); + return (ENOSPC); + } + + if (bt != NULL) { + /* + * Usually, verify the old data before writing new data -- + * but not always, because we also want to verify correct + * behavior when the data 
was not recently read into cache. + */ + ASSERT(offset % doi.doi_data_block_size == 0); + if (ztest_random(4) != 0) { + int prefetch = ztest_random(2) ? + DMU_READ_PREFETCH : DMU_READ_NO_PREFETCH; + ztest_block_tag_t rbt; + + VERIFY(dmu_read(os, lr->lr_foid, offset, + sizeof (rbt), &rbt, prefetch) == 0); + if (rbt.bt_magic == BT_MAGIC) { + ztest_bt_verify(&rbt, os, lr->lr_foid, + offset, gen, txg, crtxg); + } + } + + /* + * Writes can appear to be newer than the bonus buffer because + * the ztest_get_data() callback does a dmu_read() of the + * open-context data, which may be different than the data + * as it was when the write was generated. + */ + if (zd->zd_zilog->zl_replay) { + ztest_bt_verify(bt, os, lr->lr_foid, offset, + MAX(gen, bt->bt_gen), MAX(txg, lrtxg), + bt->bt_crtxg); + } + + /* + * Set the bt's gen/txg to the bonus buffer's gen/txg + * so that all of the usual ASSERTs will work. + */ + ztest_bt_generate(bt, os, lr->lr_foid, offset, gen, txg, crtxg); + } + + if (abuf == NULL) { + dmu_write(os, lr->lr_foid, offset, length, data, tx); + } else { + bcopy(data, abuf->b_data, length); + dmu_assign_arcbuf(db, offset, abuf, tx); + } + + (void) ztest_log_write(zd, tx, lr); + + dmu_buf_rele(db, FTAG); + + dmu_tx_commit(tx); + + ztest_range_unlock(rl); + ztest_object_unlock(zd, lr->lr_foid); + + return (0); + } + + static int + ztest_replay_truncate(ztest_ds_t *zd, lr_truncate_t *lr, boolean_t byteswap) + { + objset_t *os = zd->zd_os; + dmu_tx_t *tx; + uint64_t txg; + rl_t *rl; + + if (byteswap) + byteswap_uint64_array(lr, sizeof (*lr)); + + ztest_object_lock(zd, lr->lr_foid, RL_READER); + rl = ztest_range_lock(zd, lr->lr_foid, lr->lr_offset, lr->lr_length, + RL_WRITER); + + tx = dmu_tx_create(os); + + dmu_tx_hold_free(tx, lr->lr_foid, lr->lr_offset, lr->lr_length); + + txg = ztest_tx_assign(tx, TXG_WAIT, FTAG); + if (txg == 0) { + ztest_range_unlock(rl); + ztest_object_unlock(zd, lr->lr_foid); + return (ENOSPC); + } + + VERIFY(dmu_free_range(os, lr->lr_foid, 
lr->lr_offset, + lr->lr_length, tx) == 0); + + (void) ztest_log_truncate(zd, tx, lr); + + dmu_tx_commit(tx); + + ztest_range_unlock(rl); + ztest_object_unlock(zd, lr->lr_foid); + + return (0); + } + + static int + ztest_replay_setattr(ztest_ds_t *zd, lr_setattr_t *lr, boolean_t byteswap) + { + objset_t *os = zd->zd_os; + dmu_tx_t *tx; + dmu_buf_t *db; + ztest_block_tag_t *bbt; + uint64_t txg, lrtxg, crtxg; + + if (byteswap) + byteswap_uint64_array(lr, sizeof (*lr)); + + ztest_object_lock(zd, lr->lr_foid, RL_WRITER); + + VERIFY3U(0, ==, dmu_bonus_hold(os, lr->lr_foid, FTAG, &db)); + + tx = dmu_tx_create(os); + dmu_tx_hold_bonus(tx, lr->lr_foid); + + txg = ztest_tx_assign(tx, TXG_WAIT, FTAG); + if (txg == 0) { + dmu_buf_rele(db, FTAG); + ztest_object_unlock(zd, lr->lr_foid); + return (ENOSPC); + } + + bbt = ztest_bt_bonus(db); + ASSERT3U(bbt->bt_magic, ==, BT_MAGIC); + crtxg = bbt->bt_crtxg; + lrtxg = lr->lr_common.lrc_txg; + + if (zd->zd_zilog->zl_replay) { + ASSERT(lr->lr_size != 0); + ASSERT(lr->lr_mode != 0); + ASSERT(lrtxg != 0); + } else { + /* + * Randomly change the size and increment the generation. + */ + lr->lr_size = (ztest_random(db->db_size / sizeof (*bbt)) + 1) * + sizeof (*bbt); + lr->lr_mode = bbt->bt_gen + 1; + ASSERT(lrtxg == 0); + } + + /* + * Verify that the current bonus buffer is not newer than our txg. 
+ */ + ztest_bt_verify(bbt, os, lr->lr_foid, -1ULL, lr->lr_mode, + MAX(txg, lrtxg), crtxg); + + dmu_buf_will_dirty(db, tx); + + ASSERT3U(lr->lr_size, >=, sizeof (*bbt)); + ASSERT3U(lr->lr_size, <=, db->db_size); + VERIFY3U(dmu_set_bonus(db, lr->lr_size, tx), ==, 0); + bbt = ztest_bt_bonus(db); + + ztest_bt_generate(bbt, os, lr->lr_foid, -1ULL, lr->lr_mode, txg, crtxg); + + dmu_buf_rele(db, FTAG); + + (void) ztest_log_setattr(zd, tx, lr); + + dmu_tx_commit(tx); + + ztest_object_unlock(zd, lr->lr_foid); + + return (0); + } + + zil_replay_func_t *ztest_replay_vector[TX_MAX_TYPE] = { + NULL, /* 0 no such transaction type */ + ztest_replay_create, /* TX_CREATE */ + NULL, /* TX_MKDIR */ + NULL, /* TX_MKXATTR */ + NULL, /* TX_SYMLINK */ + ztest_replay_remove, /* TX_REMOVE */ + NULL, /* TX_RMDIR */ + NULL, /* TX_LINK */ + NULL, /* TX_RENAME */ + ztest_replay_write, /* TX_WRITE */ + ztest_replay_truncate, /* TX_TRUNCATE */ + ztest_replay_setattr, /* TX_SETATTR */ + NULL, /* TX_ACL */ + NULL, /* TX_CREATE_ACL */ + NULL, /* TX_CREATE_ATTR */ + NULL, /* TX_CREATE_ACL_ATTR */ + NULL, /* TX_MKDIR_ACL */ + NULL, /* TX_MKDIR_ATTR */ + NULL, /* TX_MKDIR_ACL_ATTR */ + NULL, /* TX_WRITE2 */ + }; + + /* + * ZIL get_data callbacks + */ + + static void + ztest_get_done(zgd_t *zgd, int error) + { + ztest_ds_t *zd = zgd->zgd_private; + uint64_t object = zgd->zgd_rl->rl_object; + + if (zgd->zgd_db) + dmu_buf_rele(zgd->zgd_db, zgd); + + ztest_range_unlock(zgd->zgd_rl); + ztest_object_unlock(zd, object); + + if (error == 0 && zgd->zgd_bp) + zil_add_block(zgd->zgd_zilog, zgd->zgd_bp); + + umem_free(zgd, sizeof (*zgd)); + } + + static int + ztest_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio) + { + ztest_ds_t *zd = arg; + objset_t *os = zd->zd_os; + uint64_t object = lr->lr_foid; + uint64_t offset = lr->lr_offset; + uint64_t size = lr->lr_length; + blkptr_t *bp = &lr->lr_blkptr; + uint64_t txg = lr->lr_common.lrc_txg; + uint64_t crtxg; + dmu_object_info_t doi; + dmu_buf_t *db; + 
zgd_t *zgd; + int error; + + ztest_object_lock(zd, object, RL_READER); + error = dmu_bonus_hold(os, object, FTAG, &db); + if (error) { + ztest_object_unlock(zd, object); + return (error); + } + + crtxg = ztest_bt_bonus(db)->bt_crtxg; + + if (crtxg == 0 || crtxg > txg) { + dmu_buf_rele(db, FTAG); + ztest_object_unlock(zd, object); + return (ENOENT); + } + + dmu_object_info_from_db(db, &doi); + dmu_buf_rele(db, FTAG); + db = NULL; + + zgd = umem_zalloc(sizeof (*zgd), UMEM_NOFAIL); + zgd->zgd_zilog = zd->zd_zilog; + zgd->zgd_private = zd; + + if (buf != NULL) { /* immediate write */ + zgd->zgd_rl = ztest_range_lock(zd, object, offset, size, + RL_READER); + + error = dmu_read(os, object, offset, size, buf, + DMU_READ_NO_PREFETCH); + ASSERT(error == 0); + } else { + size = doi.doi_data_block_size; + if (ISP2(size)) { + offset = P2ALIGN(offset, size); + } else { + ASSERT(offset < size); + offset = 0; + } + + zgd->zgd_rl = ztest_range_lock(zd, object, offset, size, + RL_READER); + + error = dmu_buf_hold(os, object, offset, zgd, &db, + DMU_READ_NO_PREFETCH); + + if (error == 0) { + zgd->zgd_db = db; + zgd->zgd_bp = bp; + + ASSERT(db->db_offset == offset); + ASSERT(db->db_size == size); + + error = dmu_sync(zio, lr->lr_common.lrc_txg, + ztest_get_done, zgd); + + if (error == 0) + return (0); + } + } + + ztest_get_done(zgd, error); + + return (error); + } + + static void * + ztest_lr_alloc(size_t lrsize, char *name) + { + char *lr; + size_t namesize = name ? strlen(name) + 1 : 0; + + lr = umem_zalloc(lrsize + namesize, UMEM_NOFAIL); + + if (name) + bcopy(name, lr + lrsize, namesize); + + return (lr); + } + + void + ztest_lr_free(void *lr, size_t lrsize, char *name) + { + size_t namesize = name ? strlen(name) + 1 : 0; + + umem_free(lr, lrsize + namesize); + } + + /* + * Lookup a bunch of objects. Returns the number of objects not found. 
+ */ + static int + ztest_lookup(ztest_ds_t *zd, ztest_od_t *od, int count) + { + int missing = 0; + int error; ++ int i; + + ASSERT(_mutex_held(&zd->zd_dirobj_lock)); + - for (int i = 0; i < count; i++, od++) { ++ for (i = 0; i < count; i++, od++) { + od->od_object = 0; + error = zap_lookup(zd->zd_os, od->od_dir, od->od_name, + sizeof (uint64_t), 1, &od->od_object); + if (error) { + ASSERT(error == ENOENT); + ASSERT(od->od_object == 0); + missing++; + } else { + dmu_buf_t *db; + ztest_block_tag_t *bbt; + dmu_object_info_t doi; + + ASSERT(od->od_object != 0); + ASSERT(missing == 0); /* there should be no gaps */ + + ztest_object_lock(zd, od->od_object, RL_READER); + VERIFY3U(0, ==, dmu_bonus_hold(zd->zd_os, + od->od_object, FTAG, &db)); + dmu_object_info_from_db(db, &doi); + bbt = ztest_bt_bonus(db); + ASSERT3U(bbt->bt_magic, ==, BT_MAGIC); + od->od_type = doi.doi_type; + od->od_blocksize = doi.doi_data_block_size; + od->od_gen = bbt->bt_gen; + dmu_buf_rele(db, FTAG); + ztest_object_unlock(zd, od->od_object); + } + } + + return (missing); + } + + static int + ztest_create(ztest_ds_t *zd, ztest_od_t *od, int count) + { + int missing = 0; ++ int i; + + ASSERT(_mutex_held(&zd->zd_dirobj_lock)); + - for (int i = 0; i < count; i++, od++) { ++ for (i = 0; i < count; i++, od++) { + if (missing) { + od->od_object = 0; + missing++; + continue; + } + + lr_create_t *lr = ztest_lr_alloc(sizeof (*lr), od->od_name); + + lr->lr_doid = od->od_dir; + lr->lr_foid = 0; /* 0 to allocate, > 0 to claim */ + lr->lrz_type = od->od_crtype; + lr->lrz_blocksize = od->od_crblocksize; + lr->lrz_ibshift = ztest_random_ibshift(); + lr->lrz_bonustype = DMU_OT_UINT64_OTHER; + lr->lrz_bonuslen = dmu_bonus_max(); + lr->lr_gen = od->od_crgen; + lr->lr_crtime[0] = time(NULL); + + if (ztest_replay_create(zd, lr, B_FALSE) != 0) { + ASSERT(missing == 0); + od->od_object = 0; + missing++; + } else { + od->od_object = lr->lr_foid; + od->od_type = od->od_crtype; + od->od_blocksize = od->od_crblocksize; + 
od->od_gen = od->od_crgen; + ASSERT(od->od_object != 0); + } + + ztest_lr_free(lr, sizeof (*lr), od->od_name); + } + + return (missing); + } + + static int + ztest_remove(ztest_ds_t *zd, ztest_od_t *od, int count) + { + int missing = 0; + int error; ++ int i; + + ASSERT(_mutex_held(&zd->zd_dirobj_lock)); + + od += count - 1; + - for (int i = count - 1; i >= 0; i--, od--) { ++ for (i = count - 1; i >= 0; i--, od--) { + if (missing) { + missing++; + continue; + } + + if (od->od_object == 0) + continue; + + lr_remove_t *lr = ztest_lr_alloc(sizeof (*lr), od->od_name); + + lr->lr_doid = od->od_dir; + + if ((error = ztest_replay_remove(zd, lr, B_FALSE)) != 0) { + ASSERT3U(error, ==, ENOSPC); + missing++; + } else { + od->od_object = 0; + } + ztest_lr_free(lr, sizeof (*lr), od->od_name); + } + + return (missing); + } + + static int + ztest_write(ztest_ds_t *zd, uint64_t object, uint64_t offset, uint64_t size, + void *data) + { + lr_write_t *lr; + int error; + + lr = ztest_lr_alloc(sizeof (*lr) + size, NULL); + + lr->lr_foid = object; + lr->lr_offset = offset; + lr->lr_length = size; + lr->lr_blkoff = 0; + BP_ZERO(&lr->lr_blkptr); + + bcopy(data, lr + 1, size); + + error = ztest_replay_write(zd, lr, B_FALSE); + + ztest_lr_free(lr, sizeof (*lr) + size, NULL); + + return (error); + } + + static int + ztest_truncate(ztest_ds_t *zd, uint64_t object, uint64_t offset, uint64_t size) + { + lr_truncate_t *lr; + int error; + + lr = ztest_lr_alloc(sizeof (*lr), NULL); + + lr->lr_foid = object; + lr->lr_offset = offset; + lr->lr_length = size; + + error = ztest_replay_truncate(zd, lr, B_FALSE); + + ztest_lr_free(lr, sizeof (*lr), NULL); + + return (error); + } + + static int + ztest_setattr(ztest_ds_t *zd, uint64_t object) + { + lr_setattr_t *lr; + int error; + + lr = ztest_lr_alloc(sizeof (*lr), NULL); + + lr->lr_foid = object; + lr->lr_size = 0; + lr->lr_mode = 0; + + error = ztest_replay_setattr(zd, lr, B_FALSE); + + ztest_lr_free(lr, sizeof (*lr), NULL); + + return (error); + } + 
+ static void + ztest_prealloc(ztest_ds_t *zd, uint64_t object, uint64_t offset, uint64_t size) + { + objset_t *os = zd->zd_os; + dmu_tx_t *tx; + uint64_t txg; + rl_t *rl; + + txg_wait_synced(dmu_objset_pool(os), 0); + + ztest_object_lock(zd, object, RL_READER); + rl = ztest_range_lock(zd, object, offset, size, RL_WRITER); + + tx = dmu_tx_create(os); + + dmu_tx_hold_write(tx, object, offset, size); + + txg = ztest_tx_assign(tx, TXG_WAIT, FTAG); + + if (txg != 0) { + dmu_prealloc(os, object, offset, size, tx); + dmu_tx_commit(tx); + txg_wait_synced(dmu_objset_pool(os), txg); + } else { + (void) dmu_free_long_range(os, object, offset, size); + } + + ztest_range_unlock(rl); + ztest_object_unlock(zd, object); + } + + static void + ztest_io(ztest_ds_t *zd, uint64_t object, uint64_t offset) + { + ztest_block_tag_t wbt; + dmu_object_info_t doi; + enum ztest_io_type io_type; + uint64_t blocksize; + void *data; + + VERIFY(dmu_object_info(zd->zd_os, object, &doi) == 0); + blocksize = doi.doi_data_block_size; + data = umem_alloc(blocksize, UMEM_NOFAIL); + + /* + * Pick an i/o type at random, biased toward writing block tags. + */ + io_type = ztest_random(ZTEST_IO_TYPES); + if (ztest_random(2) == 0) + io_type = ZTEST_IO_WRITE_TAG; + + switch (io_type) { + + case ZTEST_IO_WRITE_TAG: + ztest_bt_generate(&wbt, zd->zd_os, object, offset, 0, 0, 0); + (void) ztest_write(zd, object, offset, sizeof (wbt), &wbt); + break; + + case ZTEST_IO_WRITE_PATTERN: + (void) memset(data, 'a' + (object + offset) % 5, blocksize); + if (ztest_random(2) == 0) { + /* + * Induce fletcher2 collisions to ensure that + * zio_ddt_collision() detects and resolves them + * when using fletcher2-verify for deduplication. 
+ */ + ((uint64_t *)data)[0] ^= 1ULL << 63; + ((uint64_t *)data)[4] ^= 1ULL << 63; + } + (void) ztest_write(zd, object, offset, blocksize, data); + break; + + case ZTEST_IO_WRITE_ZEROES: + bzero(data, blocksize); + (void) ztest_write(zd, object, offset, blocksize, data); + break; + + case ZTEST_IO_TRUNCATE: + (void) ztest_truncate(zd, object, offset, blocksize); + break; + + case ZTEST_IO_SETATTR: + (void) ztest_setattr(zd, object); + break; + } + + umem_free(data, blocksize); + } + + /* + * Initialize an object description template. + */ + static void + ztest_od_init(ztest_od_t *od, uint64_t id, char *tag, uint64_t index, + dmu_object_type_t type, uint64_t blocksize, uint64_t gen) + { + od->od_dir = ZTEST_DIROBJ; + od->od_object = 0; + + od->od_crtype = type; + od->od_crblocksize = blocksize ? blocksize : ztest_random_blocksize(); + od->od_crgen = gen; + + od->od_type = DMU_OT_NONE; + od->od_blocksize = 0; + od->od_gen = 0; + + (void) snprintf(od->od_name, sizeof (od->od_name), "%s(%lld)[%llu]", + tag, (int64_t)id, index); + } + + /* + * Lookup or create the objects for a test using the od template. + * If the objects do not all exist, or if 'remove' is specified, + * remove any existing objects and create new ones. Otherwise, + * use the existing objects. + */ + static int + ztest_object_init(ztest_ds_t *zd, ztest_od_t *od, size_t size, boolean_t remove) + { + int count = size / sizeof (*od); + int rv = 0; + + VERIFY(mutex_lock(&zd->zd_dirobj_lock) == 0); + if ((ztest_lookup(zd, od, count) != 0 || remove) && + (ztest_remove(zd, od, count) != 0 || + ztest_create(zd, od, count) != 0)) + rv = -1; + zd->zd_od = od; + VERIFY(mutex_unlock(&zd->zd_dirobj_lock) == 0); + + return (rv); + } + + /* ARGSUSED */ + void + ztest_zil_commit(ztest_ds_t *zd, uint64_t id) + { + zilog_t *zilog = zd->zd_zilog; + + zil_commit(zilog, UINT64_MAX, ztest_random(ZTEST_OBJECTS)); + + /* + * Remember the committed values in zd, which is in parent/child + * shared memory. 
If we die, the next iteration of ztest_run() + * will verify that the log really does contain this record. + */ + mutex_enter(&zilog->zl_lock); + ASSERT(zd->zd_seq <= zilog->zl_commit_lr_seq); + zd->zd_seq = zilog->zl_commit_lr_seq; + mutex_exit(&zilog->zl_lock); + } + + /* + * Verify that we can't destroy an active pool, create an existing pool, + * or create a pool with a bad vdev spec. + */ + /* ARGSUSED */ + void + ztest_spa_create_destroy(ztest_ds_t *zd, uint64_t id) + { + ztest_shared_t *zs = ztest_shared; + spa_t *spa; + nvlist_t *nvroot; + + /* + * Attempt to create using a bad file. + */ + nvroot = make_vdev_root("/dev/bogus", NULL, 0, 0, 0, 0, 0, 1); + VERIFY3U(ENOENT, ==, + spa_create("ztest_bad_file", nvroot, NULL, NULL, NULL)); + nvlist_free(nvroot); + + /* + * Attempt to create using a bad mirror. + */ + nvroot = make_vdev_root("/dev/bogus", NULL, 0, 0, 0, 0, 2, 1); + VERIFY3U(ENOENT, ==, + spa_create("ztest_bad_mirror", nvroot, NULL, NULL, NULL)); + nvlist_free(nvroot); + + /* + * Attempt to create an existing pool. It shouldn't matter + * what's in the nvroot; we should fail with EEXIST. + */ + (void) rw_rdlock(&zs->zs_name_lock); + nvroot = make_vdev_root("/dev/bogus", NULL, 0, 0, 0, 0, 0, 1); + VERIFY3U(EEXIST, ==, spa_create(zs->zs_pool, nvroot, NULL, NULL, NULL)); + nvlist_free(nvroot); + VERIFY3U(0, ==, spa_open(zs->zs_pool, &spa, FTAG)); + VERIFY3U(EBUSY, ==, spa_destroy(zs->zs_pool)); + spa_close(spa, FTAG); + + (void) rw_unlock(&zs->zs_name_lock); + } + + static vdev_t * + vdev_lookup_by_path(vdev_t *vd, const char *path) + { + vdev_t *mvd; ++ int c; + + if (vd->vdev_path != NULL && strcmp(path, vd->vdev_path) == 0) + return (vd); + - for (int c = 0; c < vd->vdev_children; c++) ++ for (c = 0; c < vd->vdev_children; c++) + if ((mvd = vdev_lookup_by_path(vd->vdev_child[c], path)) != + NULL) + return (mvd); + + return (NULL); + } + + /* + * Find the first available hole which can be used as a top-level. 
+ */ + int + find_vdev_hole(spa_t *spa) + { + vdev_t *rvd = spa->spa_root_vdev; + int c; + + ASSERT(spa_config_held(spa, SCL_VDEV, RW_READER) == SCL_VDEV); + + for (c = 0; c < rvd->vdev_children; c++) { + vdev_t *cvd = rvd->vdev_child[c]; + + if (cvd->vdev_ishole) + break; + } + return (c); + } + + /* + * Verify that vdev_add() works as expected. + */ + /* ARGSUSED */ + void + ztest_vdev_add_remove(ztest_ds_t *zd, uint64_t id) + { + ztest_shared_t *zs = ztest_shared; + spa_t *spa = zs->zs_spa; + uint64_t leaves; + uint64_t guid; + nvlist_t *nvroot; + int error; + + VERIFY(mutex_lock(&zs->zs_vdev_lock) == 0); + leaves = MAX(zs->zs_mirrors + zs->zs_splits, 1) * zopt_raidz; + + spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); + + ztest_shared->zs_vdev_next_leaf = find_vdev_hole(spa) * leaves; + + /* + * If we have slogs then remove them 1/4 of the time. + */ + if (spa_has_slogs(spa) && ztest_random(4) == 0) { + /* + * Grab the guid from the head of the log class rotor. + */ + guid = spa_log_class(spa)->mc_rotor->mg_vd->vdev_guid; + + spa_config_exit(spa, SCL_VDEV, FTAG); + + /* + * We have to grab the zs_name_lock as writer to + * prevent a race between removing a slog (dmu_objset_find) + * and destroying a dataset. Removing the slog will + * grab a reference on the dataset which may cause + * dmu_objset_destroy() to fail with EBUSY thus + * leaving the dataset in an inconsistent state. + */ + VERIFY(rw_wrlock(&ztest_shared->zs_name_lock) == 0); + error = spa_vdev_remove(spa, guid, B_FALSE); + VERIFY(rw_unlock(&ztest_shared->zs_name_lock) == 0); + + if (error && error != EEXIST) + fatal(0, "spa_vdev_remove() = %d", error); + } else { + spa_config_exit(spa, SCL_VDEV, FTAG); + + /* + * Make 1/4 of the devices be log devices. 
+ */ + nvroot = make_vdev_root(NULL, NULL, zopt_vdev_size, 0, + ztest_random(4) == 0, zopt_raidz, zs->zs_mirrors, 1); + + error = spa_vdev_add(spa, nvroot); + nvlist_free(nvroot); + + if (error == ENOSPC) + ztest_record_enospc("spa_vdev_add"); + else if (error != 0) + fatal(0, "spa_vdev_add() = %d", error); + } + + VERIFY(mutex_unlock(&ztest_shared->zs_vdev_lock) == 0); + } + + /* + * Verify that adding/removing aux devices (l2arc, hot spare) works as expected. + */ + /* ARGSUSED */ + void + ztest_vdev_aux_add_remove(ztest_ds_t *zd, uint64_t id) + { + ztest_shared_t *zs = ztest_shared; + spa_t *spa = zs->zs_spa; + vdev_t *rvd = spa->spa_root_vdev; + spa_aux_vdev_t *sav; + char *aux; + uint64_t guid = 0; + int error; + + if (ztest_random(2) == 0) { + sav = &spa->spa_spares; + aux = ZPOOL_CONFIG_SPARES; + } else { + sav = &spa->spa_l2cache; + aux = ZPOOL_CONFIG_L2CACHE; + } + + VERIFY(mutex_lock(&zs->zs_vdev_lock) == 0); + + spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); if (sav->sav_count != 0 && ztest_random(4) == 0) { /* @@@ -1399,56 -2865,57 +2874,58 @@@ ztest_objset_destroy_cb(const char *nam return (0); } - /* - * Verify that dmu_objset_{create,destroy,open,close} work as expected. 
- */ - static uint64_t - ztest_log_create(zilog_t *zilog, dmu_tx_t *tx, uint64_t object, int mode) + static boolean_t + ztest_snapshot_create(char *osname, uint64_t id) + { + char snapname[MAXNAMELEN]; + int error; + + (void) snprintf(snapname, MAXNAMELEN, "%s@%llu", osname, + (u_longlong_t)id); + + error = dmu_objset_snapshot(osname, strchr(snapname, '@') + 1, + NULL, B_FALSE); + if (error == ENOSPC) { + ztest_record_enospc(FTAG); + return (B_FALSE); + } + if (error != 0 && error != EEXIST) + fatal(0, "ztest_snapshot_create(%s) = %d", snapname, error); + return (B_TRUE); + } + + static boolean_t + ztest_snapshot_destroy(char *osname, uint64_t id) { - itx_t *itx; - lr_create_t *lr; - size_t namesize; - char name[24]; - - (void) sprintf(name, "ZOBJ_%llu", (u_longlong_t)object); - namesize = strlen(name) + 1; - - itx = zil_itx_create(TX_CREATE, sizeof (*lr) + namesize + - ztest_random(ZIL_MAX_BLKSZ)); - lr = (lr_create_t *)&itx->itx_lr; - bzero(lr + 1, lr->lr_common.lrc_reclen - sizeof (*lr)); - lr->lr_doid = object; - lr->lr_foid = 0; - lr->lr_mode = mode; - lr->lr_uid = 0; - lr->lr_gid = 0; - lr->lr_gen = dmu_tx_get_txg(tx); - lr->lr_crtime[0] = time(NULL); - lr->lr_crtime[1] = 0; - lr->lr_rdev = 0; - bcopy(name, (char *)(lr + 1), namesize); - - return (zil_itx_assign(zilog, itx, tx)); + char snapname[MAXNAMELEN]; + int error; + + (void) snprintf(snapname, MAXNAMELEN, "%s@%llu", osname, + (u_longlong_t)id); + + error = dmu_objset_destroy(snapname, B_FALSE); + if (error != 0 && error != ENOENT) + fatal(0, "ztest_snapshot_destroy(%s) = %d", snapname, error); + return (B_TRUE); } + /* ARGSUSED */ void - ztest_dmu_objset_create_destroy(ztest_args_t *za) + ztest_dmu_objset_create_destroy(ztest_ds_t *zd, uint64_t id) { + ztest_shared_t *zs = ztest_shared; + ztest_ds_t zdtmp; + int iters; int error; objset_t *os, *os2; - char name[100]; - int basemode, expected_error; + char name[MAXNAMELEN]; zilog_t *zilog; - uint64_t seq; - uint64_t objects; ++ int i; - (void) 
rw_rdlock(&ztest_shared->zs_name_lock); - (void) snprintf(name, 100, "%s/%s_temp_%llu", za->za_pool, za->za_pool, - (u_longlong_t)za->za_instance); + (void) rw_rdlock(&zs->zs_name_lock); - basemode = DS_MODE_TYPE(za->za_instance); - if (basemode != DS_MODE_USER && basemode != DS_MODE_OWNER) - basemode = DS_MODE_USER; + (void) snprintf(name, MAXNAMELEN, "%s/temp_%llu", + zs->zs_pool, (u_longlong_t)id); /* * If this dataset exists from a previous run, process its replay log @@@ -1499,38 -2964,17 +2974,17 @@@ /* * Open the intent log for it. */ - zilog = zil_open(os, NULL); + zilog = zil_open(os, ztest_get_data); /* - * Put a random number of objects in there. + * Put some objects in there, do a little I/O to them, + * and randomly take a couple of snapshots along the way. */ - objects = ztest_random(20); - seq = 0; - while (objects-- != 0) { - uint64_t object; - dmu_tx_t *tx = dmu_tx_create(os); - dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, sizeof (name)); - error = dmu_tx_assign(tx, TXG_WAIT); - if (error) { - dmu_tx_abort(tx); - } else { - object = dmu_object_alloc(os, DMU_OT_UINT64_OTHER, 0, - DMU_OT_NONE, 0, tx); - ztest_set_random_blocksize(os, object, tx); - seq = ztest_log_create(zilog, tx, object, - DMU_OT_UINT64_OTHER); - dmu_write(os, object, 0, sizeof (name), name, tx); - dmu_tx_commit(tx); - } - if (ztest_random(5) == 0) { - zil_commit(zilog, seq, object); - } - if (ztest_random(100) == 0) { - error = zil_suspend(zilog); - if (error == 0) { - zil_resume(zilog); - } - } + iters = ztest_random(5); - for (int i = 0; i < iters; i++) { ++ for (i = 0; i < iters; i++) { + ztest_dmu_object_alloc_free(&zdtmp, id); + if (ztest_random(iters) == 0) + (void) ztest_snapshot_create(name, i); } /* @@@ -1744,210 -3156,24 +3166,25 @@@ out * Verify that dmu_object_{alloc,free} work as expected. 
*/ void - ztest_dmu_object_alloc_free(ztest_args_t *za) + ztest_dmu_object_alloc_free(ztest_ds_t *zd, uint64_t id) { - objset_t *os = za->za_os; - dmu_buf_t *db; - dmu_tx_t *tx; - uint64_t batchobj, object, batchsize, endoff, temp; - int b, c, error, bonuslen; - dmu_object_info_t *doi = &za->za_doi; - char osname[MAXNAMELEN]; - - dmu_objset_name(os, osname); - - endoff = -8ULL; - batchsize = 2; - - /* - * Create a batch object if necessary, and record it in the directory. - */ - VERIFY3U(0, ==, dmu_read(os, ZTEST_DIROBJ, za->za_diroff, - sizeof (uint64_t), &batchobj, DMU_READ_PREFETCH)); - if (batchobj == 0) { - tx = dmu_tx_create(os); - dmu_tx_hold_write(tx, ZTEST_DIROBJ, za->za_diroff, - sizeof (uint64_t)); - dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT); - error = dmu_tx_assign(tx, TXG_WAIT); - if (error) { - ztest_record_enospc("create a batch object"); - dmu_tx_abort(tx); - return; - } - batchobj = dmu_object_alloc(os, DMU_OT_UINT64_OTHER, 0, - DMU_OT_NONE, 0, tx); - ztest_set_random_blocksize(os, batchobj, tx); - dmu_write(os, ZTEST_DIROBJ, za->za_diroff, - sizeof (uint64_t), &batchobj, tx); - dmu_tx_commit(tx); - } - - /* - * Destroy the previous batch of objects. - */ - for (b = 0; b < batchsize; b++) { - VERIFY3U(0, ==, dmu_read(os, batchobj, b * sizeof (uint64_t), - sizeof (uint64_t), &object, DMU_READ_PREFETCH)); - if (object == 0) - continue; - /* - * Read and validate contents. - * We expect the nth byte of the bonus buffer to be n. 
- */ - VERIFY(0 == dmu_bonus_hold(os, object, FTAG, &db)); - za->za_dbuf = db; - - dmu_object_info_from_db(db, doi); - ASSERT(doi->doi_type == DMU_OT_UINT64_OTHER); - ASSERT(doi->doi_bonus_type == DMU_OT_PLAIN_OTHER); - ASSERT3S(doi->doi_physical_blks, >=, 0); - - bonuslen = doi->doi_bonus_size; - - for (c = 0; c < bonuslen; c++) { - if (((uint8_t *)db->db_data)[c] != - (uint8_t)(c + bonuslen)) { - fatal(0, - "bad bonus: %s, obj %llu, off %d: %u != %u", - osname, object, c, - ((uint8_t *)db->db_data)[c], - (uint8_t)(c + bonuslen)); - } - } - - dmu_buf_rele(db, FTAG); - za->za_dbuf = NULL; - - /* - * We expect the word at endoff to be our object number. - */ - VERIFY(0 == dmu_read(os, object, endoff, - sizeof (uint64_t), &temp, DMU_READ_PREFETCH)); - - if (temp != object) { - fatal(0, "bad data in %s, got %llu, expected %llu", - osname, temp, object); - } - - /* - * Destroy old object and clear batch entry. - */ - tx = dmu_tx_create(os); - dmu_tx_hold_write(tx, batchobj, - b * sizeof (uint64_t), sizeof (uint64_t)); - dmu_tx_hold_free(tx, object, 0, DMU_OBJECT_END); - error = dmu_tx_assign(tx, TXG_WAIT); - if (error) { - ztest_record_enospc("free object"); - dmu_tx_abort(tx); - return; - } - error = dmu_object_free(os, object, tx); - if (error) { - fatal(0, "dmu_object_free('%s', %llu) = %d", - osname, object, error); - } - object = 0; - - dmu_object_set_checksum(os, batchobj, - ztest_random_checksum(), tx); - dmu_object_set_compress(os, batchobj, - ztest_random_compress(), tx); - - dmu_write(os, batchobj, b * sizeof (uint64_t), - sizeof (uint64_t), &object, tx); - - dmu_tx_commit(tx); - } + ztest_od_t od[4]; + int batchsize = sizeof (od) / sizeof (od[0]); ++ int b; - /* - * Before creating the new batch of objects, generate a bunch of churn. 
- */ - for (b = ztest_random(100); b > 0; b--) { - tx = dmu_tx_create(os); - dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT); - error = dmu_tx_assign(tx, TXG_WAIT); - if (error) { - ztest_record_enospc("churn objects"); - dmu_tx_abort(tx); - return; - } - object = dmu_object_alloc(os, DMU_OT_UINT64_OTHER, 0, - DMU_OT_NONE, 0, tx); - ztest_set_random_blocksize(os, object, tx); - error = dmu_object_free(os, object, tx); - if (error) { - fatal(0, "dmu_object_free('%s', %llu) = %d", - osname, object, error); - } - dmu_tx_commit(tx); - } - for (int b = 0; b < batchsize; b++) ++ for (b = 0; b < batchsize; b++) + ztest_od_init(&od[b], id, FTAG, b, DMU_OT_UINT64_OTHER, 0, 0); /* - * Create a new batch of objects with randomly chosen - * blocksizes and record them in the batch directory. + * Destroy the previous batch of objects, create a new batch, + * and do some I/O on the new objects. */ - for (b = 0; b < batchsize; b++) { - uint32_t va_blksize; - u_longlong_t va_nblocks; - - tx = dmu_tx_create(os); - dmu_tx_hold_write(tx, batchobj, b * sizeof (uint64_t), - sizeof (uint64_t)); - dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT); - dmu_tx_hold_write(tx, DMU_NEW_OBJECT, endoff, - sizeof (uint64_t)); - error = dmu_tx_assign(tx, TXG_WAIT); - if (error) { - ztest_record_enospc("create batchobj"); - dmu_tx_abort(tx); - return; - } - bonuslen = (int)ztest_random(dmu_bonus_max()) + 1; - - object = dmu_object_alloc(os, DMU_OT_UINT64_OTHER, 0, - DMU_OT_PLAIN_OTHER, bonuslen, tx); - - ztest_set_random_blocksize(os, object, tx); - - dmu_object_set_checksum(os, object, - ztest_random_checksum(), tx); - dmu_object_set_compress(os, object, - ztest_random_compress(), tx); - - dmu_write(os, batchobj, b * sizeof (uint64_t), - sizeof (uint64_t), &object, tx); - - /* - * Write to both the bonus buffer and the regular data. 
- */ - VERIFY(dmu_bonus_hold(os, object, FTAG, &db) == 0); - za->za_dbuf = db; - ASSERT3U(bonuslen, <=, db->db_size); - - dmu_object_size_from_db(db, &va_blksize, &va_nblocks); - ASSERT3S(va_nblocks, >=, 0); - - dmu_buf_will_dirty(db, tx); - - /* - * See comments above regarding the contents of - * the bonus buffer and the word at endoff. - */ - for (c = 0; c < bonuslen; c++) - ((uint8_t *)db->db_data)[c] = (uint8_t)(c + bonuslen); - - dmu_buf_rele(db, FTAG); - za->za_dbuf = NULL; - - /* - * Write to a large offset to increase indirection. - */ - dmu_write(os, object, endoff, sizeof (uint64_t), &object, tx); + if (ztest_object_init(zd, od, sizeof (od), B_TRUE) != 0) + return; - dmu_tx_commit(tx); - } + while (ztest_random(4 * batchsize) != 0) + ztest_io(zd, od[ztest_random(batchsize)].od_object, + ztest_random(ZTEST_RANGE_LOCKS) << SPA_MAXBLOCKSHIFT); } /* @@@ -2918,168 -3859,430 +3870,432 @@@ ztest_zap(ztest_ds_t *zd, uint64_t id ASSERT3U(error, ==, 0); tx = dmu_tx_create(os); - dmu_tx_hold_zap(tx, object, TRUE, NULL); - error = dmu_tx_assign(tx, TXG_WAIT); - if (error) { - ztest_record_enospc("remove zap entry"); - dmu_tx_abort(tx); + dmu_tx_hold_zap(tx, object, B_TRUE, NULL); + txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG); + if (txg == 0) + return; + VERIFY3U(0, ==, zap_remove(os, object, txgname, tx)); + VERIFY3U(0, ==, zap_remove(os, object, propname, tx)); + dmu_tx_commit(tx); + } + + /* + * Testcase to test the upgrading of a microzap to fatzap. + */ + void + ztest_fzap(ztest_ds_t *zd, uint64_t id) + { + objset_t *os = zd->zd_os; + ztest_od_t od[1]; + uint64_t object, txg; ++ int i; + + ztest_od_init(&od[0], id, FTAG, 0, DMU_OT_ZAP_OTHER, 0, 0); + + if (ztest_object_init(zd, od, sizeof (od), !ztest_random(2)) != 0) return; + + object = od[0].od_object; + + /* + * Add entries to this ZAP and make sure it spills over + * and gets upgraded to a fatzap. Also, since we are adding + * 2050 entries we should see ptrtbl growth and leaf-block split. 
+ */ - for (int i = 0; i < 2050; i++) { ++ for (i = 0; i < 2050; i++) { + char name[MAXNAMELEN]; + uint64_t value = i; + dmu_tx_t *tx; + int error; + + (void) snprintf(name, sizeof (name), "fzap-%llu-%llu", + id, value); + + tx = dmu_tx_create(os); + dmu_tx_hold_zap(tx, object, B_TRUE, name); + txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG); + if (txg == 0) + return; + error = zap_add(os, object, name, sizeof (uint64_t), 1, + &value, tx); + ASSERT(error == 0 || error == EEXIST); + dmu_tx_commit(tx); } - error = zap_remove(os, object, txgname, tx); - if (error) - fatal(0, "zap_remove('%s', %llu, '%s') = %d", - osname, object, txgname, error); + } - error = zap_remove(os, object, propname, tx); - if (error) - fatal(0, "zap_remove('%s', %llu, '%s') = %d", - osname, object, propname, error); + /* ARGSUSED */ + void + ztest_zap_parallel(ztest_ds_t *zd, uint64_t id) + { + objset_t *os = zd->zd_os; + ztest_od_t od[1]; + uint64_t txg, object, count, wsize, wc, zl_wsize, zl_wc; + dmu_tx_t *tx; + int i, namelen, error; + int micro = ztest_random(2); + char name[20], string_value[20]; + void *data; - dmu_tx_commit(tx); + ztest_od_init(&od[0], ID_PARALLEL, FTAG, micro, DMU_OT_ZAP_OTHER, 0, 0); + + if (ztest_object_init(zd, od, sizeof (od), B_FALSE) != 0) + return; + + object = od[0].od_object; + + /* + * Generate a random name of the form 'xxx.....' where each + * x is a random printable character and the dots are dots. + * There are 94 such characters, and the name length goes from + * 6 to 20, so there are 94^3 * 15 = 12,458,760 possible names. + */ + namelen = ztest_random(sizeof (name) - 5) + 5 + 1; + + for (i = 0; i < 3; i++) + name[i] = '!' + ztest_random('~' - '!' 
+ 1); + for (; i < namelen - 1; i++) + name[i] = '.'; + name[i] = '\0'; + + if ((namelen & 1) || micro) { + wsize = sizeof (txg); + wc = 1; + data = &txg; + } else { + wsize = 1; + wc = namelen; + data = string_value; + } + + count = -1ULL; + VERIFY(zap_count(os, object, &count) == 0); + ASSERT(count != -1ULL); /* - * Once in a while, destroy the object. + * Select an operation: length, lookup, add, update, remove. */ - if (ztest_random(1000) != 0) + i = ztest_random(5); + + if (i >= 2) { + tx = dmu_tx_create(os); + dmu_tx_hold_zap(tx, object, B_TRUE, NULL); + txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG); + if (txg == 0) + return; + bcopy(name, string_value, namelen); + } else { + tx = NULL; + txg = 0; + bzero(string_value, namelen); + } + + switch (i) { + + case 0: + error = zap_length(os, object, name, &zl_wsize, &zl_wc); + if (error == 0) { + ASSERT3U(wsize, ==, zl_wsize); + ASSERT3U(wc, ==, zl_wc); + } else { + ASSERT3U(error, ==, ENOENT); + } + break; + + case 1: + error = zap_lookup(os, object, name, wsize, wc, data); + if (error == 0) { + if (data == string_value && + bcmp(name, data, namelen) != 0) + fatal(0, "name '%s' != val '%s' len %d", + name, data, namelen); + } else { + ASSERT3U(error, ==, ENOENT); + } + break; + + case 2: + error = zap_add(os, object, name, wsize, wc, data, tx); + ASSERT(error == 0 || error == EEXIST); + break; + + case 3: + VERIFY(zap_update(os, object, name, wsize, wc, data, tx) == 0); + break; + + case 4: + error = zap_remove(os, object, name, tx); + ASSERT(error == 0 || error == ENOENT); + break; + } + + if (tx != NULL) + dmu_tx_commit(tx); + } + + /* + * Commit callback data. 
+ */ + typedef struct ztest_cb_data { + list_node_t zcd_node; + uint64_t zcd_txg; + int zcd_expected_err; + boolean_t zcd_added; + boolean_t zcd_called; + spa_t *zcd_spa; + } ztest_cb_data_t; + + /* This is the actual commit callback function */ + static void + ztest_commit_callback(void *arg, int error) + { + ztest_cb_data_t *data = arg; + uint64_t synced_txg; + + VERIFY(data != NULL); + VERIFY3S(data->zcd_expected_err, ==, error); + VERIFY(!data->zcd_called); + + synced_txg = spa_last_synced_txg(data->zcd_spa); + if (data->zcd_txg > synced_txg) + fatal(0, "commit callback of txg %" PRIu64 " called prematurely" + ", last synced txg = %" PRIu64 "\n", data->zcd_txg, + synced_txg); + + data->zcd_called = B_TRUE; + + if (error == ECANCELED) { + ASSERT3U(data->zcd_txg, ==, 0); + ASSERT(!data->zcd_added); + + /* + * The private callback data should be destroyed here, but + * since we are going to check the zcd_called field after + * dmu_tx_abort(), we will destroy it there. + */ + return; + } + + /* Was this callback added to the global callback list? */ + if (!data->zcd_added) + goto out; + + ASSERT3U(data->zcd_txg, !=, 0); + + /* Remove our callback from the list */ + (void) mutex_lock(&zcl.zcl_callbacks_lock); + list_remove(&zcl.zcl_callbacks, data); + (void) mutex_unlock(&zcl.zcl_callbacks_lock); + + out: + umem_free(data, sizeof (ztest_cb_data_t)); + } + + /* Allocate and initialize callback data structure */ + static ztest_cb_data_t * + ztest_create_cb_data(objset_t *os, uint64_t txg) + { + ztest_cb_data_t *cb_data; + + cb_data = umem_zalloc(sizeof (ztest_cb_data_t), UMEM_NOFAIL); + + cb_data->zcd_txg = txg; + cb_data->zcd_spa = dmu_objset_spa(os); + + return (cb_data); + } + + /* + * If a number of txgs equal to this threshold have been created after a commit + * callback has been registered but not called, then we assume there is an + * implementation bug. + */ + #define ZTEST_COMMIT_CALLBACK_THRESH (TXG_CONCURRENT_STATES + 2) + + /* + * Commit callback test. 
+ */ + void + ztest_dmu_commit_callbacks(ztest_ds_t *zd, uint64_t id) + { + objset_t *os = zd->zd_os; + ztest_od_t od[1]; + dmu_tx_t *tx; + ztest_cb_data_t *cb_data[3], *tmp_cb; + uint64_t old_txg, txg; + int i, error; + + ztest_od_init(&od[0], id, FTAG, 0, DMU_OT_UINT64_OTHER, 0, 0); + + if (ztest_object_init(zd, od, sizeof (od), B_FALSE) != 0) return; tx = dmu_tx_create(os); - dmu_tx_hold_write(tx, ZTEST_DIROBJ, za->za_diroff, sizeof (uint64_t)); - dmu_tx_hold_free(tx, object, 0, DMU_OBJECT_END); - error = dmu_tx_assign(tx, TXG_WAIT); + + cb_data[0] = ztest_create_cb_data(os, 0); + dmu_tx_callback_register(tx, ztest_commit_callback, cb_data[0]); + + dmu_tx_hold_write(tx, od[0].od_object, 0, sizeof (uint64_t)); + + /* Every once in a while, abort the transaction on purpose */ + if (ztest_random(100) == 0) + error = -1; + + if (!error) + error = dmu_tx_assign(tx, TXG_NOWAIT); + + txg = error ? 0 : dmu_tx_get_txg(tx); + + cb_data[0]->zcd_txg = txg; + cb_data[1] = ztest_create_cb_data(os, txg); + dmu_tx_callback_register(tx, ztest_commit_callback, cb_data[1]); + if (error) { - ztest_record_enospc("destroy zap object"); + /* + * It's not a strict requirement to call the registered + * callbacks from inside dmu_tx_abort(), but that's what + * it's supposed to happen in the current implementation + * so we will check for that. 
+ */ + for (i = 0; i < 2; i++) { + cb_data[i]->zcd_expected_err = ECANCELED; + VERIFY(!cb_data[i]->zcd_called); + } + dmu_tx_abort(tx); + + for (i = 0; i < 2; i++) { + VERIFY(cb_data[i]->zcd_called); + umem_free(cb_data[i], sizeof (ztest_cb_data_t)); + } + return; } - error = zap_destroy(os, object, tx); - if (error) - fatal(0, "zap_destroy('%s', %llu) = %d", - osname, object, error); - object = 0; - dmu_write(os, ZTEST_DIROBJ, za->za_diroff, sizeof (uint64_t), - &object, tx); - dmu_tx_commit(tx); - } - void - ztest_zap_parallel(ztest_args_t *za) - { - objset_t *os = za->za_os; - uint64_t txg, object, count, wsize, wc, zl_wsize, zl_wc; - dmu_tx_t *tx; - int i, namelen, error; - char name[20], string_value[20]; - void *data; + cb_data[2] = ztest_create_cb_data(os, txg); + dmu_tx_callback_register(tx, ztest_commit_callback, cb_data[2]); /* - * Generate a random name of the form 'xxx.....' where each - * x is a random printable character and the dots are dots. - * There are 94 such characters, and the name length goes from - * 6 to 20, so there are 94^3 * 15 = 12,458,760 possible names. + * Read existing data to make sure there isn't a future leak. */ - namelen = ztest_random(sizeof (name) - 5) + 5 + 1; + VERIFY(0 == dmu_read(os, od[0].od_object, 0, sizeof (uint64_t), + &old_txg, DMU_READ_PREFETCH)); - for (i = 0; i < 3; i++) - name[i] = '!' + ztest_random('~' - '!' 
+ 1); - for (; i < namelen - 1; i++) - name[i] = '.'; - name[i] = '\0'; + if (old_txg > txg) + fatal(0, "future leak: got %" PRIu64 ", open txg is %" PRIu64, + old_txg, txg); - if (ztest_random(2) == 0) - object = ZTEST_MICROZAP_OBJ; - else - object = ZTEST_FATZAP_OBJ; + dmu_write(os, od[0].od_object, 0, sizeof (uint64_t), &txg, tx); - if ((namelen & 1) || object == ZTEST_MICROZAP_OBJ) { - wsize = sizeof (txg); - wc = 1; - data = &txg; - } else { - wsize = 1; - wc = namelen; - data = string_value; - } + (void) mutex_lock(&zcl.zcl_callbacks_lock); - count = -1ULL; - VERIFY(zap_count(os, object, &count) == 0); - ASSERT(count != -1ULL); + /* + * Since commit callbacks don't have any ordering requirement and since + * it is theoretically possible for a commit callback to be called + * after an arbitrary amount of time has elapsed since its txg has been + * synced, it is difficult to reliably determine whether a commit + * callback hasn't been called due to high load or due to a flawed + * implementation. + * + * In practice, we will assume that if after a certain number of txgs a + * commit callback hasn't been called, then most likely there's an + * implementation bug.. + */ + tmp_cb = list_head(&zcl.zcl_callbacks); + if (tmp_cb != NULL && + tmp_cb->zcd_txg > txg - ZTEST_COMMIT_CALLBACK_THRESH) { + fatal(0, "Commit callback threshold exceeded, oldest txg: %" + PRIu64 ", open txg: %" PRIu64 "\n", tmp_cb->zcd_txg, txg); + } /* - * Select an operation: length, lookup, add, update, remove. + * Let's find the place to insert our callbacks. + * + * Even though the list is ordered by txg, it is possible for the + * insertion point to not be the end because our txg may already be + * quiescing at this point and other callbacks in the open txg + * (from other objsets) may have sneaked in. 
*/ - i = ztest_random(5); + tmp_cb = list_tail(&zcl.zcl_callbacks); + while (tmp_cb != NULL && tmp_cb->zcd_txg > txg) + tmp_cb = list_prev(&zcl.zcl_callbacks, tmp_cb); + + /* Add the 3 callbacks to the list */ + for (i = 0; i < 3; i++) { + if (tmp_cb == NULL) + list_insert_head(&zcl.zcl_callbacks, cb_data[i]); + else + list_insert_after(&zcl.zcl_callbacks, tmp_cb, + cb_data[i]); - if (i >= 2) { - tx = dmu_tx_create(os); - dmu_tx_hold_zap(tx, object, TRUE, NULL); - error = dmu_tx_assign(tx, TXG_WAIT); - if (error) { - ztest_record_enospc("zap parallel"); - dmu_tx_abort(tx); - return; - } - txg = dmu_tx_get_txg(tx); - bcopy(name, string_value, namelen); - } else { - tx = NULL; - txg = 0; - bzero(string_value, namelen); + cb_data[i]->zcd_added = B_TRUE; + VERIFY(!cb_data[i]->zcd_called); + + tmp_cb = cb_data[i]; } - switch (i) { + (void) mutex_unlock(&zcl.zcl_callbacks_lock); - case 0: - error = zap_length(os, object, name, &zl_wsize, &zl_wc); - if (error == 0) { - ASSERT3U(wsize, ==, zl_wsize); - ASSERT3U(wc, ==, zl_wc); - } else { - ASSERT3U(error, ==, ENOENT); - } - break; + dmu_tx_commit(tx); + } - case 1: - error = zap_lookup(os, object, name, wsize, wc, data); - if (error == 0) { - if (data == string_value && - bcmp(name, data, namelen) != 0) - fatal(0, "name '%s' != val '%s' len %d", - name, data, namelen); - } else { - ASSERT3U(error, ==, ENOENT); - } - break; + /* ARGSUSED */ + void + ztest_dsl_prop_get_set(ztest_ds_t *zd, uint64_t id) + { + zfs_prop_t proplist[] = { + ZFS_PROP_CHECKSUM, + ZFS_PROP_COMPRESSION, + ZFS_PROP_COPIES, + ZFS_PROP_DEDUP + }; + ztest_shared_t *zs = ztest_shared; ++ int p; - case 2: - error = zap_add(os, object, name, wsize, wc, data, tx); - ASSERT(error == 0 || error == EEXIST); - break; + (void) rw_rdlock(&zs->zs_name_lock); - case 3: - VERIFY(zap_update(os, object, name, wsize, wc, data, tx) == 0); - break; - for (int p = 0; p < sizeof (proplist) / sizeof (proplist[0]); p++) ++ for (p = 0; p < sizeof (proplist) / sizeof 
(proplist[0]); p++) + (void) ztest_dsl_prop_set_uint64(zd->zd_name, proplist[p], + ztest_random_dsl_prop(proplist[p]), (int)ztest_random(2)); - case 4: - error = zap_remove(os, object, name, tx); - ASSERT(error == 0 || error == ENOENT); - break; - } + (void) rw_unlock(&zs->zs_name_lock); + } - if (tx != NULL) - dmu_tx_commit(tx); + /* ARGSUSED */ + void + ztest_spa_prop_get_set(ztest_ds_t *zd, uint64_t id) + { + ztest_shared_t *zs = ztest_shared; + nvlist_t *props = NULL; + + (void) rw_rdlock(&zs->zs_name_lock); + + (void) ztest_spa_prop_set_uint64(zs, ZPOOL_PROP_DEDUPDITTO, + ZIO_DEDUPDITTO_MIN + ztest_random(ZIO_DEDUPDITTO_MIN)); + + VERIFY3U(spa_prop_get(zs->zs_spa, &props), ==, 0); + + if (zopt_verbose >= 6) + dump_nvlist(props, 4); + + nvlist_free(props); + + (void) rw_unlock(&zs->zs_name_lock); } + /* + * Test snapshot hold/release and deferred destroy. + */ void - ztest_dsl_prop_get_set(ztest_args_t *za) + ztest_dmu_snapshot_hold(ztest_ds_t *zd, uint64_t id) { - objset_t *os = za->za_os; - int i, inherit; - uint64_t value; - const char *prop, *valname; - char setpoint[MAXPATHLEN]; - char osname[MAXNAMELEN]; int error; + objset_t *os = zd->zd_os; + objset_t *origin; + char snapname[100]; + char fullname[100]; + char clonename[100]; + char tag[100]; + char osname[MAXNAMELEN]; (void) rw_rdlock(&ztest_shared->zs_name_lock); @@@ -3262,160 -4558,176 +4571,177 @@@ ztest_fault_inject(ztest_ds_t *zd, uint } /* - * Scrub the pool. + * Verify that DDT repair works as expected. 
*/ void - ztest_scrub(ztest_args_t *za) + ztest_ddt_repair(ztest_ds_t *zd, uint64_t id) { - spa_t *spa = za->za_spa; - - (void) spa_scrub(spa, POOL_SCRUB_EVERYTHING); - (void) poll(NULL, 0, 1000); /* wait a second, then force a restart */ - (void) spa_scrub(spa, POOL_SCRUB_EVERYTHING); - } + ztest_shared_t *zs = ztest_shared; + spa_t *spa = zs->zs_spa; + objset_t *os = zd->zd_os; + ztest_od_t od[1]; + uint64_t object, blocksize, txg, pattern, psize; + enum zio_checksum checksum = spa_dedup_checksum(spa); + dmu_buf_t *db; + dmu_tx_t *tx; + void *buf; + blkptr_t blk; + int copies = 2 * ZIO_DEDUPDITTO_MIN; ++ int i; - /* - * Rename the pool to a different name and then rename it back. - */ - void - ztest_spa_rename(ztest_args_t *za) - { - char *oldname, *newname; - int error; - spa_t *spa; + blocksize = ztest_random_blocksize(); + blocksize = MIN(blocksize, 2048); /* because we write so many */ - (void) rw_wrlock(&ztest_shared->zs_name_lock); + ztest_od_init(&od[0], id, FTAG, 0, DMU_OT_UINT64_OTHER, blocksize, 0); - oldname = za->za_pool; - newname = umem_alloc(strlen(oldname) + 5, UMEM_NOFAIL); - (void) strcpy(newname, oldname); - (void) strcat(newname, "_tmp"); + if (ztest_object_init(zd, od, sizeof (od), B_FALSE) != 0) + return; /* - * Do the rename + * Take the name lock as writer to prevent anyone else from changing + * the pool and dataset properies we need to maintain during this test. 
*/ - error = spa_rename(oldname, newname); - if (error) - fatal(0, "spa_rename('%s', '%s') = %d", oldname, - newname, error); + (void) rw_wrlock(&zs->zs_name_lock); - /* - * Try to open it under the old name, which shouldn't exist - */ - error = spa_open(oldname, &spa, FTAG); - if (error != ENOENT) - fatal(0, "spa_open('%s') = %d", oldname, error); + if (ztest_dsl_prop_set_uint64(zd->zd_name, ZFS_PROP_DEDUP, checksum, + B_FALSE) != 0 || + ztest_dsl_prop_set_uint64(zd->zd_name, ZFS_PROP_COPIES, 1, + B_FALSE) != 0) { + (void) rw_unlock(&zs->zs_name_lock); + return; + } + + object = od[0].od_object; + blocksize = od[0].od_blocksize; + pattern = spa_guid(spa) ^ dmu_objset_fsid_guid(os); + + ASSERT(object != 0); + + tx = dmu_tx_create(os); + dmu_tx_hold_write(tx, object, 0, copies * blocksize); + txg = ztest_tx_assign(tx, TXG_WAIT, FTAG); + if (txg == 0) { + (void) rw_unlock(&zs->zs_name_lock); + return; + } /* - * Open it under the new name and make sure it's still the same spa_t. + * Write all the copies of our block. */ - error = spa_open(newname, &spa, FTAG); - if (error != 0) - fatal(0, "spa_open('%s') = %d", newname, error); - for (int i = 0; i < copies; i++) { ++ for (i = 0; i < copies; i++) { + uint64_t offset = i * blocksize; + VERIFY(dmu_buf_hold(os, object, offset, FTAG, &db, + DMU_READ_NO_PREFETCH) == 0); + ASSERT(db->db_offset == offset); + ASSERT(db->db_size == blocksize); + ASSERT(ztest_pattern_match(db->db_data, db->db_size, pattern) || + ztest_pattern_match(db->db_data, db->db_size, 0ULL)); + dmu_buf_will_fill(db, tx); + ztest_pattern_set(db->db_data, db->db_size, pattern); + dmu_buf_rele(db, FTAG); + } - ASSERT(spa == za->za_spa); - spa_close(spa, FTAG); + dmu_tx_commit(tx); + txg_wait_synced(spa_get_dsl(spa), txg); /* - * Rename it back to the original + * Find out what block we got. 
*/ - error = spa_rename(newname, oldname); - if (error) - fatal(0, "spa_rename('%s', '%s') = %d", newname, - oldname, error); + VERIFY(dmu_buf_hold(os, object, 0, FTAG, &db, + DMU_READ_NO_PREFETCH) == 0); + blk = *((dmu_buf_impl_t *)db)->db_blkptr; + dmu_buf_rele(db, FTAG); /* - * Make sure it can still be opened + * Damage the block. Dedup-ditto will save us when we read it later. */ - error = spa_open(oldname, &spa, FTAG); - if (error != 0) - fatal(0, "spa_open('%s') = %d", oldname, error); + psize = BP_GET_PSIZE(&blk); + buf = zio_buf_alloc(psize); + ztest_pattern_set(buf, psize, ~pattern); - ASSERT(spa == za->za_spa); - spa_close(spa, FTAG); + (void) zio_wait(zio_rewrite(NULL, spa, 0, &blk, + buf, psize, NULL, NULL, ZIO_PRIORITY_SYNC_WRITE, + ZIO_FLAG_CANFAIL | ZIO_FLAG_INDUCE_DAMAGE, NULL)); - umem_free(newname, strlen(newname) + 1); + zio_buf_free(buf, psize); - (void) rw_unlock(&ztest_shared->zs_name_lock); + (void) rw_unlock(&zs->zs_name_lock); } - /* - * Completely obliterate one disk. + * Scrub the pool. */ - static void - ztest_obliterate_one_disk(uint64_t vdev) + /* ARGSUSED */ + void + ztest_scrub(ztest_ds_t *zd, uint64_t id) { - int fd; - char dev_name[MAXPATHLEN], copy_name[MAXPATHLEN]; - size_t fsize; + ztest_shared_t *zs = ztest_shared; + spa_t *spa = zs->zs_spa; - if (zopt_maxfaults < 2) - return; + (void) spa_scan(spa, POOL_SCAN_SCRUB); + (void) poll(NULL, 0, 100); /* wait a moment, then force a restart */ + (void) spa_scan(spa, POOL_SCAN_SCRUB); + } - (void) sprintf(dev_name, ztest_dev_template, zopt_dir, zopt_pool, vdev); - (void) snprintf(copy_name, MAXPATHLEN, "%s.old", dev_name); + /* + * Rename the pool to a different name and then rename it back. 
+ */ + /* ARGSUSED */ + void + ztest_spa_rename(ztest_ds_t *zd, uint64_t id) + { + ztest_shared_t *zs = ztest_shared; + char *oldname, *newname; + spa_t *spa; - fd = open(dev_name, O_RDWR); + (void) rw_wrlock(&zs->zs_name_lock); - if (fd == -1) - fatal(1, "can't open %s", dev_name); + oldname = zs->zs_pool; + newname = umem_alloc(strlen(oldname) + 5, UMEM_NOFAIL); + (void) strcpy(newname, oldname); + (void) strcat(newname, "_tmp"); /* - * Determine the size. + * Do the rename */ - fsize = lseek(fd, 0, SEEK_END); - - (void) close(fd); + VERIFY3U(0, ==, spa_rename(oldname, newname)); /* - * Rename the old device to dev_name.old (useful for debugging). + * Try to open it under the old name, which shouldn't exist */ - VERIFY(rename(dev_name, copy_name) == 0); + VERIFY3U(ENOENT, ==, spa_open(oldname, &spa, FTAG)); /* - * Create a new one. + * Open it under the new name and make sure it's still the same spa_t. */ - VERIFY((fd = open(dev_name, O_RDWR | O_CREAT | O_TRUNC, 0666)) >= 0); - VERIFY(ftruncate(fd, fsize) == 0); - (void) close(fd); - } + VERIFY3U(0, ==, spa_open(newname, &spa, FTAG)); - static void - ztest_replace_one_disk(spa_t *spa, uint64_t vdev) - { - char dev_name[MAXPATHLEN]; - nvlist_t *root; - int error; - uint64_t guid; - vdev_t *vd; + ASSERT(spa == zs->zs_spa); + spa_close(spa, FTAG); - (void) sprintf(dev_name, ztest_dev_template, zopt_dir, zopt_pool, vdev); + /* + * Rename it back to the original + */ + VERIFY3U(0, ==, spa_rename(newname, oldname)); /* - * Build the nvlist describing dev_name. 
+ * Make sure it can still be opened */ - root = make_vdev_root(dev_name, NULL, 0, 0, 0, 0, 0, 1); + VERIFY3U(0, ==, spa_open(oldname, &spa, FTAG)); - spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); - if ((vd = vdev_lookup_by_path(spa->spa_root_vdev, dev_name)) == NULL) - guid = 0; - else - guid = vd->vdev_guid; - spa_config_exit(spa, SCL_VDEV, FTAG); - error = spa_vdev_attach(spa, guid, root, B_TRUE); - if (error != 0 && - error != EBUSY && - error != ENOTSUP && - error != ENODEV && - error != EDOM) - fatal(0, "spa_vdev_attach(in-place) = %d", error); + ASSERT(spa == zs->zs_spa); + spa_close(spa, FTAG); - nvlist_free(root); + umem_free(newname, strlen(newname) + 1); + + (void) rw_unlock(&zs->zs_name_lock); } + /* + * Verify pool integrity by running zdb. + */ static void - ztest_verify_blocks(char *pool) + ztest_run_zdb(char *pool) { int status; char zdb[MAXPATHLEN + MAXNAMELEN + 20]; @@@ -3597,6 -4896,45 +4910,46 @@@ ztest_resume_thread(void *arg return (NULL); } + static void * + ztest_deadman_thread(void *arg) + { + ztest_shared_t *zs = arg; + int grace = 300; + hrtime_t delta; + + delta = (zs->zs_thread_stop - zs->zs_thread_start) / NANOSEC + grace; + + (void) poll(NULL, 0, (int)(1000 * delta)); + + fatal(0, "failed to complete within %d seconds of deadline", grace); + + return (NULL); + } + + static void + ztest_execute(ztest_info_t *zi, uint64_t id) + { + ztest_shared_t *zs = ztest_shared; + ztest_ds_t *zd = &zs->zs_zd[id % zopt_datasets]; + hrtime_t functime = gethrtime(); ++ int i; + - for (int i = 0; i < zi->zi_iters; i++) ++ for (i = 0; i < zi->zi_iters; i++) + zi->zi_func(zd, id); + + functime = gethrtime() - functime; + + atomic_add_64(&zi->zi_call_count, 1); + atomic_add_64(&zi->zi_call_time, functime); + + if (zopt_verbose >= 4) { + Dl_info dli; + (void) dladdr((void *)zi->zi_func, &dli); + (void) printf("%6.2f sec in %s\n", + (double)functime / NANOSEC, dli.dli_sname); + } + } + static void * ztest_thread(void *arg) { @@@ -3610,69 -4948,157 
+4963,159 @@@ /* * See if it's time to force a crash. */ - if (now > za->za_kill) { - zs->zs_alloc = spa_get_alloc(za->za_spa); - zs->zs_space = spa_get_space(za->za_spa); - (void) kill(getpid(), SIGKILL); - } + if (now > zs->zs_thread_kill) + ztest_kill(zs); /* - * Pick a random function. + * If we're getting ENOSPC with some regularity, stop. */ - f = ztest_random(ZTEST_FUNCS); - zi = &zs->zs_info[f]; + if (zs->zs_enospc_count > 10) + break; /* - * Decide whether to call it, based on the requested frequency. + * Pick a random function to execute. */ - if (zi->zi_call_target == 0 || - (double)zi->zi_call_total / zi->zi_call_target > - (double)(now - zs->zs_start_time) / (zopt_time * NANOSEC)) - continue; + zi = &zs->zs_info[ztest_random(ZTEST_FUNCS)]; + call_next = zi->zi_call_next; + + if (now >= call_next && + atomic_cas_64(&zi->zi_call_next, call_next, call_next + + ztest_random(2 * zi->zi_interval[0] + 1)) == call_next) + ztest_execute(zi, id); + } - atomic_add_64(&zi->zi_calls, 1); - atomic_add_64(&zi->zi_call_total, 1); + return (NULL); + } - za->za_diroff = (za->za_instance * ZTEST_FUNCS + f) * - ZTEST_DIRSIZE; - za->za_diroff_shared = (1ULL << 63); + static void + ztest_dataset_name(char *dsname, char *pool, int d) + { + (void) snprintf(dsname, MAXNAMELEN, "%s/ds_%d", pool, d); + } - for (i = 0; i < zi->zi_iters; i++) - zi->zi_func(za); + static void + ztest_dataset_destroy(ztest_shared_t *zs, int d) + { + char name[MAXNAMELEN]; ++ int t; - functime = gethrtime() - now; + ztest_dataset_name(name, zs->zs_pool, d); - atomic_add_64(&zi->zi_call_time, functime); + if (zopt_verbose >= 3) + (void) printf("Destroying %s to free up space\n", name); - if (zopt_verbose >= 4) { - Dl_info dli; - (void) dladdr((void *)zi->zi_func, &dli); - (void) printf("%6.2f sec in %s\n", - (double)functime / NANOSEC, dli.dli_sname); - } + /* + * Cleanup any non-standard clones and snapshots. 
In general, + * ztest thread t operates on dataset (t % zopt_datasets), + * so there may be more than one thing to clean up. + */ - for (int t = d; t < zopt_threads; t += zopt_datasets) ++ for (t = d; t < zopt_threads; t += zopt_datasets) + ztest_dsl_dataset_cleanup(name, t); - /* - * If we're getting ENOSPC with some regularity, stop. - */ - if (zs->zs_enospc_count > 10) - break; + (void) dmu_objset_find(name, ztest_objset_destroy_cb, NULL, + DS_FIND_SNAPSHOTS | DS_FIND_CHILDREN); + } + + static void + ztest_dataset_dirobj_verify(ztest_ds_t *zd) + { + uint64_t usedobjs, dirobjs, scratch; + + /* + * ZTEST_DIROBJ is the object directory for the entire dataset. + * Therefore, the number of objects in use should equal the + * number of ZTEST_DIROBJ entries, +1 for ZTEST_DIROBJ itself. + * If not, we have an object leak. + * + * Note that we can only check this in ztest_dataset_open(), + * when the open-context and syncing-context values agree. + * That's because zap_count() returns the open-context value, + * while dmu_objset_space() returns the rootbp fill count. 
+ */ + VERIFY3U(0, ==, zap_count(zd->zd_os, ZTEST_DIROBJ, &dirobjs)); + dmu_objset_space(zd->zd_os, &scratch, &scratch, &usedobjs, &scratch); + ASSERT3U(dirobjs + 1, ==, usedobjs); + } + + static int + ztest_dataset_open(ztest_shared_t *zs, int d) + { + ztest_ds_t *zd = &zs->zs_zd[d]; + uint64_t committed_seq = zd->zd_seq; + objset_t *os; + zilog_t *zilog; + char name[MAXNAMELEN]; + int error; + + ztest_dataset_name(name, zs->zs_pool, d); + + (void) rw_rdlock(&zs->zs_name_lock); + + error = ztest_dataset_create(name); + if (error == ENOSPC) { + (void) rw_unlock(&zs->zs_name_lock); + ztest_record_enospc(FTAG); + return (error); } + ASSERT(error == 0 || error == EEXIST); - return (NULL); + VERIFY3U(dmu_objset_hold(name, zd, &os), ==, 0); + (void) rw_unlock(&zs->zs_name_lock); + + ztest_zd_init(zd, os); + + zilog = zd->zd_zilog; + + if (zilog->zl_header->zh_claim_lr_seq != 0 && + zilog->zl_header->zh_claim_lr_seq < committed_seq) + fatal(0, "missing log records: claimed %llu < committed %llu", + zilog->zl_header->zh_claim_lr_seq, committed_seq); + + ztest_dataset_dirobj_verify(zd); + + zil_replay(os, zd, ztest_replay_vector); + + ztest_dataset_dirobj_verify(zd); + + if (zopt_verbose >= 6) + (void) printf("%s replay %llu blocks, %llu records, seq %llu\n", + zd->zd_name, + (u_longlong_t)zilog->zl_parse_blk_count, + (u_longlong_t)zilog->zl_parse_lr_count, + (u_longlong_t)zilog->zl_replaying_seq); + + zilog = zil_open(os, ztest_get_data); + + if (zilog->zl_replaying_seq != 0 && + zilog->zl_replaying_seq < committed_seq) + fatal(0, "missing log records: replayed %llu < committed %llu", + zilog->zl_replaying_seq, committed_seq); + + return (0); + } + + static void + ztest_dataset_close(ztest_shared_t *zs, int d) + { + ztest_ds_t *zd = &zs->zs_zd[d]; + + zil_close(zd->zd_zilog); + dmu_objset_rele(zd->zd_os, zd); + + ztest_zd_fini(zd); } /* * Kick off threads to run tests on all datasets in parallel. 
*/ static void - ztest_run(char *pool) + ztest_run(ztest_shared_t *zs) { - int t, d, error; - ztest_shared_t *zs = ztest_shared; - ztest_args_t *za; + thread_t *tid; spa_t *spa; - char name[100]; thread_t resume_tid; + int error; ++ int t, d; ztest_exiting = B_FALSE; @@@ -3775,91 -5180,92 +5197,92 @@@ if (zopt_verbose >= 4) (void) printf("starting main threads...\n"); - za[0].za_start = gethrtime(); - za[0].za_stop = za[0].za_start + zopt_passtime * NANOSEC; - za[0].za_stop = MIN(za[0].za_stop, zs->zs_stop_time); - za[0].za_kill = za[0].za_stop; - if (ztest_random(100) < zopt_killrate) - za[0].za_kill -= ztest_random(zopt_passtime * NANOSEC); - + /* + * Kick off all the tests that run in parallel. + */ - for (int t = 0; t < zopt_threads; t++) { + for (t = 0; t < zopt_threads; t++) { - d = t % zopt_datasets; - - (void) strcpy(za[t].za_pool, pool); - za[t].za_os = za[d].za_os; - za[t].za_spa = spa; - za[t].za_zilog = za[d].za_zilog; - za[t].za_instance = t; - za[t].za_random = ztest_random(-1ULL); - za[t].za_start = za[0].za_start; - za[t].za_stop = za[0].za_stop; - za[t].za_kill = za[0].za_kill; - - if (t < zopt_datasets) { - int test_future = FALSE; - (void) rw_rdlock(&ztest_shared->zs_name_lock); - (void) snprintf(name, 100, "%s/%s_%d", pool, pool, d); - error = dmu_objset_create(name, DMU_OST_OTHER, NULL, 0, - ztest_create_cb, NULL); - if (error == EEXIST) { - test_future = TRUE; - } else if (error == ENOSPC) { - zs->zs_enospc_count++; - (void) rw_unlock(&ztest_shared->zs_name_lock); - break; - } else if (error != 0) { - fatal(0, "dmu_objset_create(%s) = %d", - name, error); - } - error = dmu_objset_open(name, DMU_OST_OTHER, - DS_MODE_USER, &za[d].za_os); - if (error) - fatal(0, "dmu_objset_open('%s') = %d", - name, error); - (void) rw_unlock(&ztest_shared->zs_name_lock); - if (test_future) - ztest_dmu_check_future_leak(&za[t]); - zil_replay(za[d].za_os, za[d].za_os, - ztest_replay_vector); - za[d].za_zilog = zil_open(za[d].za_os, NULL); - } - - 
VERIFY(thr_create(0, 0, ztest_thread, &za[t], THR_BOUND, - &za[t].za_thread) == 0); + if (t < zopt_datasets && ztest_dataset_open(zs, t) != 0) + return; + VERIFY(thr_create(0, 0, ztest_thread, (void *)(uintptr_t)t, + THR_BOUND, &tid[t]) == 0); } - while (--t >= 0) { - VERIFY(thr_join(za[t].za_thread, NULL, NULL) == 0); - if (t < zopt_datasets) { - zil_close(za[t].za_zilog); - dmu_objset_close(za[t].za_os); - } + /* + * Wait for all of the tests to complete. We go in reverse order + * so we don't close datasets while threads are still using them. + */ - for (int t = zopt_threads - 1; t >= 0; t--) { ++ for (t = zopt_threads - 1; t >= 0; t--) { + VERIFY(thr_join(tid[t], NULL, NULL) == 0); + if (t < zopt_datasets) + ztest_dataset_close(zs, t); } - if (zopt_verbose >= 3) - show_pool_stats(spa); - txg_wait_synced(spa_get_dsl(spa), 0); - zs->zs_alloc = spa_get_alloc(spa); - zs->zs_space = spa_get_space(spa); + zs->zs_alloc = metaslab_class_get_alloc(spa_normal_class(spa)); + zs->zs_space = metaslab_class_get_space(spa_normal_class(spa)); + + umem_free(tid, zopt_threads * sizeof (thread_t)); + + /* Kill the resume thread */ + ztest_exiting = B_TRUE; + VERIFY(thr_join(resume_tid, NULL, NULL) == 0); + ztest_resume(spa); + + /* + * Right before closing the pool, kick off a bunch of async I/O; + * spa_close() should wait for it to complete. + */ + for (uint64_t object = 1; object < 50; object++) + dmu_prefetch(spa->spa_meta_objset, object, 0, 1ULL << 20); + + spa_close(spa, FTAG); /* - * If we had out-of-space errors, destroy a random objset. + * Verify that we can loop over all pools. 
*/ - if (zs->zs_enospc_count != 0) { - (void) rw_rdlock(&ztest_shared->zs_name_lock); - d = (int)ztest_random(zopt_datasets); - (void) snprintf(name, 100, "%s/%s_%d", pool, pool, d); - if (zopt_verbose >= 3) - (void) printf("Destroying %s to free up space\n", name); + mutex_enter(&spa_namespace_lock); + for (spa = spa_next(NULL); spa != NULL; spa = spa_next(spa)) + if (zopt_verbose > 3) + (void) printf("spa_next: found %s\n", spa_name(spa)); + mutex_exit(&spa_namespace_lock); + + /* + * Verify that we can export the pool and reimport it under a + * different name. + */ + if (ztest_random(2) == 0) { + char name[MAXNAMELEN]; + (void) snprintf(name, MAXNAMELEN, "%s_import", zs->zs_pool); + ztest_spa_import_export(zs->zs_pool, name); + ztest_spa_import_export(name, zs->zs_pool); + } + + kernel_fini(); + } + + static void + ztest_freeze(ztest_shared_t *zs) + { + ztest_ds_t *zd = &zs->zs_zd[0]; + spa_t *spa; + int numloops = 0; + + if (zopt_verbose >= 3) + (void) printf("testing spa_freeze()...\n"); - /* Cleanup any non-standard clones and snapshots */ - ztest_dsl_dataset_cleanup(name, za[d].za_instance); + kernel_init(FREAD | FWRITE); + VERIFY3U(0, ==, spa_open(zs->zs_pool, &spa, FTAG)); + VERIFY3U(0, ==, ztest_dataset_open(zs, 0)); - (void) dmu_objset_find(name, ztest_destroy_cb, &za[d], - DS_FIND_SNAPSHOTS | DS_FIND_CHILDREN); - (void) rw_unlock(&ztest_shared->zs_name_lock); + /* + * Force the first log block to be transactionally allocated. + * We have to do this before we freeze the pool -- otherwise + * the log chain won't be anchored. 
+ */ + while (BP_IS_HOLE(&zd->zd_zilog->zl_header->zh_log)) { + ztest_dmu_object_alloc_free(zd, 0); + zil_commit(zd->zd_zilog, UINT64_MAX, 0); } txg_wait_synced(spa_get_dsl(spa), 0); @@@ -3957,6 -5413,7 +5430,8 @@@ main(int argc, char **argv ztest_info_t *zi; char timebuf[100]; char numbuf[6]; + spa_t *spa; ++ int i, f; (void) setvbuf(stdout, NULL, _IOLBF, 0); @@@ -3991,26 -5450,24 +5468,24 @@@ bzero(zs, sizeof (ztest_shared_t)); if (zopt_verbose >= 3 && zopt_init != 1) (void) printf("ztest_init(), pass %d\n", i); - ztest_init(zopt_pool); + zs->zs_pool = zopt_pool; + ztest_init(zs); } - /* - * Initialize the call targets for each function. - */ + zs->zs_pool = zopt_pool; + zs->zs_proc_start = gethrtime(); + zs->zs_proc_stop = zs->zs_proc_start + zopt_time * NANOSEC; + - for (int f = 0; f < ZTEST_FUNCS; f++) { + for (f = 0; f < ZTEST_FUNCS; f++) { zi = &zs->zs_info[f]; - *zi = ztest_info[f]; - - if (*zi->zi_interval == 0) - zi->zi_call_target = UINT64_MAX; + if (zs->zs_proc_start + zi->zi_interval[0] > zs->zs_proc_stop) + zi->zi_call_next = UINT64_MAX; else - zi->zi_call_target = zopt_time / *zi->zi_interval; + zi->zi_call_next = zs->zs_proc_start + + ztest_random(2 * zi->zi_interval[0] + 1); } - zs->zs_start_time = gethrtime(); - zs->zs_stop_time = zs->zs_start_time + zopt_time * NANOSEC; - /* * Run the tests in a loop. These tests include fault injection * to verify that self-healing data works, and forced crashes @@@ -4024,9 -5480,9 +5498,9 @@@ /* * Initialize the workload counters for each function. */ - for (int f = 0; f < ZTEST_FUNCS; f++) { + for (f = 0; f < ZTEST_FUNCS; f++) { zi = &zs->zs_info[f]; - zi->zi_calls = 0; + zi->zi_call_count = 0; zi->zi_call_time = 0; } diff --cc lib/libzfs/libzfs_import.c index d67776889,fd3044b1d..95632d938 --- a/lib/libzfs/libzfs_import.c +++ b/lib/libzfs/libzfs_import.c @@@ -403,6 -405,21 +405,23 @@@ refresh_config(libzfs_handle_t *hdl, nv return (nvl); } + /* + * Determine if the vdev id is a hole in the namespace. 
+ */ + boolean_t + vdev_is_hole(uint64_t *hole_array, uint_t holes, uint_t id) + { - for (int c = 0; c < holes; c++) { ++ int c; ++ ++ for (c = 0; c < holes; c++) { + + /* Top-level is a hole */ + if (hole_array[c] == id) + return (B_TRUE); + } + return (B_FALSE); + } + /* * Convert our list of pools into the definitive set of configurations. We * start by picking the best config for each toplevel vdev. Once that's done, diff --cc module/zfs/dbuf.c index e9a8aab49,42ae43997..22e7188bc --- a/module/zfs/dbuf.c +++ b/module/zfs/dbuf.c @@@ -109,14 -106,12 +106,16 @@@ dmu_buf_impl_t dbuf_find(dnode_t *dn, uint8_t level, uint64_t blkid) { dbuf_hash_table_t *h = &dbuf_hash_table; - objset_impl_t *os = dn->dn_objset; - uint64_t obj, hv, idx; + objset_t *os = dn->dn_objset; + uint64_t obj = dn->dn_object; + uint64_t hv = DBUF_HASH(os, obj, level, blkid); + uint64_t idx = hv & h->hash_table_mask; dmu_buf_impl_t *db; + obj = dn->dn_object; + hv = DBUF_HASH(os, obj, level, blkid); + idx = hv & h->hash_table_mask; + mutex_enter(DBUF_HASH_MUTEX(h, idx)); for (db = h->hash_table[idx]; db != NULL; db = db->db_hash_next) { if (DBUF_EQUAL(db, os, obj, level, blkid)) { @@@ -142,16 -137,14 +141,16 @@@ static dmu_buf_impl_t dbuf_hash_insert(dmu_buf_impl_t *db) { dbuf_hash_table_t *h = &dbuf_hash_table; - objset_impl_t *os = db->db_objset; + objset_t *os = db->db_objset; uint64_t obj = db->db.db_object; int level = db->db_level; - uint64_t blkid = db->db_blkid; - uint64_t hv = DBUF_HASH(os, obj, level, blkid); - uint64_t idx = hv & h->hash_table_mask; + uint64_t blkid, hv, idx; dmu_buf_impl_t *dbf; + blkid = db->db_blkid; + hv = DBUF_HASH(os, obj, level, blkid); + idx = hv & h->hash_table_mask; + mutex_enter(DBUF_HASH_MUTEX(h, idx)); for (dbf = h->hash_table[idx]; dbf != NULL; dbf = dbf->db_hash_next) { if (DBUF_EQUAL(dbf, os, obj, level, blkid)) { diff --cc module/zfs/ddt.c index 000000000,926b4df9a..cd4e8476c mode 000000,100644..100644 --- a/module/zfs/ddt.c +++ b/module/zfs/ddt.c @@@ 
-1,0 -1,1140 +1,1155 @@@ + /* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + + /* + * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved. + */ + + #include + #include + #include + #include + #include + #include + #include + #include + #include + #include + #include + #include + + static const ddt_ops_t *ddt_ops[DDT_TYPES] = { + &ddt_zap_ops, + }; + + static const char *ddt_class_name[DDT_CLASSES] = { + "ditto", + "duplicate", + "unique", + }; + + static void + ddt_object_create(ddt_t *ddt, enum ddt_type type, enum ddt_class class, + dmu_tx_t *tx) + { + spa_t *spa = ddt->ddt_spa; + objset_t *os = ddt->ddt_os; + uint64_t *objectp = &ddt->ddt_object[type][class]; + boolean_t prehash = zio_checksum_table[ddt->ddt_checksum].ci_dedup; + char name[DDT_NAMELEN]; + + ddt_object_name(ddt, type, class, name); + + ASSERT(*objectp == 0); + VERIFY(ddt_ops[type]->ddt_op_create(os, objectp, tx, prehash) == 0); + ASSERT(*objectp != 0); + + VERIFY(zap_add(os, DMU_POOL_DIRECTORY_OBJECT, name, + sizeof (uint64_t), 1, objectp, tx) == 0); + + VERIFY(zap_add(os, spa->spa_ddt_stat_object, name, + sizeof (uint64_t), sizeof (ddt_histogram_t) / sizeof (uint64_t), + 
&ddt->ddt_histogram[type][class], tx) == 0); + } + + static void + ddt_object_destroy(ddt_t *ddt, enum ddt_type type, enum ddt_class class, + dmu_tx_t *tx) + { + spa_t *spa = ddt->ddt_spa; + objset_t *os = ddt->ddt_os; + uint64_t *objectp = &ddt->ddt_object[type][class]; + char name[DDT_NAMELEN]; + + ddt_object_name(ddt, type, class, name); + + ASSERT(*objectp != 0); + ASSERT(ddt_object_count(ddt, type, class) == 0); + ASSERT(ddt_histogram_empty(&ddt->ddt_histogram[type][class])); + VERIFY(zap_remove(os, DMU_POOL_DIRECTORY_OBJECT, name, tx) == 0); + VERIFY(zap_remove(os, spa->spa_ddt_stat_object, name, tx) == 0); + VERIFY(ddt_ops[type]->ddt_op_destroy(os, *objectp, tx) == 0); + bzero(&ddt->ddt_object_stats[type][class], sizeof (ddt_object_t)); + + *objectp = 0; + } + + static int + ddt_object_load(ddt_t *ddt, enum ddt_type type, enum ddt_class class) + { + ddt_object_t *ddo = &ddt->ddt_object_stats[type][class]; + dmu_object_info_t doi; + char name[DDT_NAMELEN]; + int error; + + ddt_object_name(ddt, type, class, name); + + error = zap_lookup(ddt->ddt_os, DMU_POOL_DIRECTORY_OBJECT, name, + sizeof (uint64_t), 1, &ddt->ddt_object[type][class]); + + if (error) + return (error); + + error = zap_lookup(ddt->ddt_os, ddt->ddt_spa->spa_ddt_stat_object, name, + sizeof (uint64_t), sizeof (ddt_histogram_t) / sizeof (uint64_t), + &ddt->ddt_histogram[type][class]); + + /* + * Seed the cached statistics. 
+ */ + VERIFY(ddt_object_info(ddt, type, class, &doi) == 0); + + ddo->ddo_count = ddt_object_count(ddt, type, class); + ddo->ddo_dspace = doi.doi_physical_blocks_512 << 9; + ddo->ddo_mspace = doi.doi_fill_count * doi.doi_data_block_size; + + ASSERT(error == 0); + return (error); + } + + static void + ddt_object_sync(ddt_t *ddt, enum ddt_type type, enum ddt_class class, + dmu_tx_t *tx) + { + ddt_object_t *ddo = &ddt->ddt_object_stats[type][class]; + dmu_object_info_t doi; + char name[DDT_NAMELEN]; + + ddt_object_name(ddt, type, class, name); + + VERIFY(zap_update(ddt->ddt_os, ddt->ddt_spa->spa_ddt_stat_object, name, + sizeof (uint64_t), sizeof (ddt_histogram_t) / sizeof (uint64_t), + &ddt->ddt_histogram[type][class], tx) == 0); + + /* + * Cache DDT statistics; this is the only time they'll change. + */ + VERIFY(ddt_object_info(ddt, type, class, &doi) == 0); + + ddo->ddo_count = ddt_object_count(ddt, type, class); + ddo->ddo_dspace = doi.doi_physical_blocks_512 << 9; + ddo->ddo_mspace = doi.doi_fill_count * doi.doi_data_block_size; + } + + static int + ddt_object_lookup(ddt_t *ddt, enum ddt_type type, enum ddt_class class, + ddt_entry_t *dde) + { + if (!ddt_object_exists(ddt, type, class)) + return (ENOENT); + + return (ddt_ops[type]->ddt_op_lookup(ddt->ddt_os, + ddt->ddt_object[type][class], dde)); + } + + static void + ddt_object_prefetch(ddt_t *ddt, enum ddt_type type, enum ddt_class class, + ddt_entry_t *dde) + { + if (!ddt_object_exists(ddt, type, class)) + return; + + ddt_ops[type]->ddt_op_prefetch(ddt->ddt_os, + ddt->ddt_object[type][class], dde); + } + + int + ddt_object_update(ddt_t *ddt, enum ddt_type type, enum ddt_class class, + ddt_entry_t *dde, dmu_tx_t *tx) + { + ASSERT(ddt_object_exists(ddt, type, class)); + + return (ddt_ops[type]->ddt_op_update(ddt->ddt_os, + ddt->ddt_object[type][class], dde, tx)); + } + + static int + ddt_object_remove(ddt_t *ddt, enum ddt_type type, enum ddt_class class, + ddt_entry_t *dde, dmu_tx_t *tx) + { + 
ASSERT(ddt_object_exists(ddt, type, class)); + + return (ddt_ops[type]->ddt_op_remove(ddt->ddt_os, + ddt->ddt_object[type][class], dde, tx)); + } + + int + ddt_object_walk(ddt_t *ddt, enum ddt_type type, enum ddt_class class, + uint64_t *walk, ddt_entry_t *dde) + { + ASSERT(ddt_object_exists(ddt, type, class)); + + return (ddt_ops[type]->ddt_op_walk(ddt->ddt_os, + ddt->ddt_object[type][class], dde, walk)); + } + + uint64_t + ddt_object_count(ddt_t *ddt, enum ddt_type type, enum ddt_class class) + { + ASSERT(ddt_object_exists(ddt, type, class)); + + return (ddt_ops[type]->ddt_op_count(ddt->ddt_os, + ddt->ddt_object[type][class])); + } + + int + ddt_object_info(ddt_t *ddt, enum ddt_type type, enum ddt_class class, + dmu_object_info_t *doi) + { + if (!ddt_object_exists(ddt, type, class)) + return (ENOENT); + + return (dmu_object_info(ddt->ddt_os, ddt->ddt_object[type][class], + doi)); + } + + boolean_t + ddt_object_exists(ddt_t *ddt, enum ddt_type type, enum ddt_class class) + { + return (!!ddt->ddt_object[type][class]); + } + + void + ddt_object_name(ddt_t *ddt, enum ddt_type type, enum ddt_class class, + char *name) + { + (void) sprintf(name, DMU_POOL_DDT, + zio_checksum_table[ddt->ddt_checksum].ci_name, + ddt_ops[type]->ddt_op_name, ddt_class_name[class]); + } + + void + ddt_bp_fill(const ddt_phys_t *ddp, blkptr_t *bp, uint64_t txg) + { ++ int d; + ASSERT(txg != 0); + - for (int d = 0; d < SPA_DVAS_PER_BP; d++) ++ for (d = 0; d < SPA_DVAS_PER_BP; d++) + bp->blk_dva[d] = ddp->ddp_dva[d]; + BP_SET_BIRTH(bp, txg, ddp->ddp_phys_birth); + } + + void + ddt_bp_create(enum zio_checksum checksum, + const ddt_key_t *ddk, const ddt_phys_t *ddp, blkptr_t *bp) + { + BP_ZERO(bp); + + if (ddp != NULL) + ddt_bp_fill(ddp, bp, ddp->ddp_phys_birth); + + bp->blk_cksum = ddk->ddk_cksum; + bp->blk_fill = 1; + + BP_SET_LSIZE(bp, DDK_GET_LSIZE(ddk)); + BP_SET_PSIZE(bp, DDK_GET_PSIZE(ddk)); + BP_SET_COMPRESS(bp, DDK_GET_COMPRESS(ddk)); + BP_SET_CHECKSUM(bp, checksum); + BP_SET_TYPE(bp, 
DMU_OT_DEDUP); + BP_SET_LEVEL(bp, 0); + BP_SET_DEDUP(bp, 0); + BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER); + } + + void + ddt_key_fill(ddt_key_t *ddk, const blkptr_t *bp) + { + ddk->ddk_cksum = bp->blk_cksum; + ddk->ddk_prop = 0; + + DDK_SET_LSIZE(ddk, BP_GET_LSIZE(bp)); + DDK_SET_PSIZE(ddk, BP_GET_PSIZE(bp)); + DDK_SET_COMPRESS(ddk, BP_GET_COMPRESS(bp)); + } + + void + ddt_phys_fill(ddt_phys_t *ddp, const blkptr_t *bp) + { ++ int d; + ASSERT(ddp->ddp_phys_birth == 0); + - for (int d = 0; d < SPA_DVAS_PER_BP; d++) ++ for (d = 0; d < SPA_DVAS_PER_BP; d++) + ddp->ddp_dva[d] = bp->blk_dva[d]; + ddp->ddp_phys_birth = BP_PHYSICAL_BIRTH(bp); + } + + void + ddt_phys_clear(ddt_phys_t *ddp) + { + bzero(ddp, sizeof (*ddp)); + } + + void + ddt_phys_addref(ddt_phys_t *ddp) + { + ddp->ddp_refcnt++; + } + + void + ddt_phys_decref(ddt_phys_t *ddp) + { + ASSERT((int64_t)ddp->ddp_refcnt > 0); + ddp->ddp_refcnt--; + } + + void + ddt_phys_free(ddt_t *ddt, ddt_key_t *ddk, ddt_phys_t *ddp, uint64_t txg) + { + blkptr_t blk; + + ddt_bp_create(ddt->ddt_checksum, ddk, ddp, &blk); + ddt_phys_clear(ddp); + zio_free(ddt->ddt_spa, txg, &blk); + } + + ddt_phys_t * + ddt_phys_select(const ddt_entry_t *dde, const blkptr_t *bp) + { + ddt_phys_t *ddp = (ddt_phys_t *)dde->dde_phys; ++ int p; + - for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) { ++ for (p = 0; p < DDT_PHYS_TYPES; p++, ddp++) { + if (DVA_EQUAL(BP_IDENTITY(bp), &ddp->ddp_dva[0]) && + BP_PHYSICAL_BIRTH(bp) == ddp->ddp_phys_birth) + return (ddp); + } + return (NULL); + } + + uint64_t + ddt_phys_total_refcnt(const ddt_entry_t *dde) + { + uint64_t refcnt = 0; ++ int p; + - for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) ++ for (p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) + refcnt += dde->dde_phys[p].ddp_refcnt; + + return (refcnt); + } + + static void + ddt_stat_generate(ddt_t *ddt, ddt_entry_t *dde, ddt_stat_t *dds) + { + spa_t *spa = ddt->ddt_spa; + ddt_phys_t *ddp = dde->dde_phys; + ddt_key_t *ddk = &dde->dde_key; + uint64_t 
lsize = DDK_GET_LSIZE(ddk); + uint64_t psize = DDK_GET_PSIZE(ddk); ++ int p, d; + + bzero(dds, sizeof (*dds)); + - for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) { ++ for (p = 0; p < DDT_PHYS_TYPES; p++, ddp++) { + uint64_t dsize = 0; + uint64_t refcnt = ddp->ddp_refcnt; + + if (ddp->ddp_phys_birth == 0) + continue; + - for (int d = 0; d < SPA_DVAS_PER_BP; d++) ++ for (d = 0; d < SPA_DVAS_PER_BP; d++) + dsize += dva_get_dsize_sync(spa, &ddp->ddp_dva[d]); + + dds->dds_blocks += 1; + dds->dds_lsize += lsize; + dds->dds_psize += psize; + dds->dds_dsize += dsize; + + dds->dds_ref_blocks += refcnt; + dds->dds_ref_lsize += lsize * refcnt; + dds->dds_ref_psize += psize * refcnt; + dds->dds_ref_dsize += dsize * refcnt; + } + } + + void + ddt_stat_add(ddt_stat_t *dst, const ddt_stat_t *src, uint64_t neg) + { + const uint64_t *s = (const uint64_t *)src; + uint64_t *d = (uint64_t *)dst; + uint64_t *d_end = (uint64_t *)(dst + 1); + + ASSERT(neg == 0 || neg == -1ULL); /* add or subtract */ + + while (d < d_end) + *d++ += (*s++ ^ neg) - neg; + } + + static void + ddt_stat_update(ddt_t *ddt, ddt_entry_t *dde, uint64_t neg) + { + ddt_stat_t dds; + ddt_histogram_t *ddh; + int bucket; + + ddt_stat_generate(ddt, dde, &dds); + + bucket = highbit(dds.dds_ref_blocks) - 1; + ASSERT(bucket >= 0); + + ddh = &ddt->ddt_histogram[dde->dde_type][dde->dde_class]; + + ddt_stat_add(&ddh->ddh_stat[bucket], &dds, neg); + } + + void + ddt_histogram_add(ddt_histogram_t *dst, const ddt_histogram_t *src) + { - for (int h = 0; h < 64; h++) ++ int h; ++ ++ for (h = 0; h < 64; h++) + ddt_stat_add(&dst->ddh_stat[h], &src->ddh_stat[h], 0); + } + + void + ddt_histogram_stat(ddt_stat_t *dds, const ddt_histogram_t *ddh) + { ++ int h; ++ + bzero(dds, sizeof (*dds)); + - for (int h = 0; h < 64; h++) ++ for (h = 0; h < 64; h++) + ddt_stat_add(dds, &ddh->ddh_stat[h], 0); + } + + boolean_t + ddt_histogram_empty(const ddt_histogram_t *ddh) + { + const uint64_t *s = (const uint64_t *)ddh; + const uint64_t *s_end = 
(const uint64_t *)(ddh + 1); + + while (s < s_end) + if (*s++ != 0) + return (B_FALSE); + + return (B_TRUE); + } + + void + ddt_get_dedup_object_stats(spa_t *spa, ddt_object_t *ddo_total) + { + /* Sum the statistics we cached in ddt_object_sync(). */ + for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) { + ddt_t *ddt = spa->spa_ddt[c]; + for (enum ddt_type type = 0; type < DDT_TYPES; type++) { + for (enum ddt_class class = 0; class < DDT_CLASSES; + class++) { + ddt_object_t *ddo = + &ddt->ddt_object_stats[type][class]; + ddo_total->ddo_count += ddo->ddo_count; + ddo_total->ddo_dspace += ddo->ddo_dspace; + ddo_total->ddo_mspace += ddo->ddo_mspace; + } + } + } + + /* ... and compute the averages. */ + if (ddo_total->ddo_count != 0) { + ddo_total->ddo_dspace /= ddo_total->ddo_count; + ddo_total->ddo_mspace /= ddo_total->ddo_count; + } else { + ASSERT(ddo_total->ddo_dspace == 0); + ASSERT(ddo_total->ddo_mspace == 0); + } + } + + void + ddt_get_dedup_histogram(spa_t *spa, ddt_histogram_t *ddh) + { + for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) { + ddt_t *ddt = spa->spa_ddt[c]; + for (enum ddt_type type = 0; type < DDT_TYPES; type++) { + for (enum ddt_class class = 0; class < DDT_CLASSES; + class++) { + ddt_histogram_add(ddh, + &ddt->ddt_histogram_cache[type][class]); + } + } + } + } + + void + ddt_get_dedup_stats(spa_t *spa, ddt_stat_t *dds_total) + { + ddt_histogram_t *ddh_total; + + ddh_total = kmem_zalloc(sizeof (ddt_histogram_t), KM_SLEEP); + ddt_get_dedup_histogram(spa, ddh_total); + ddt_histogram_stat(dds_total, ddh_total); + kmem_free(ddh_total, sizeof (ddt_histogram_t)); + } + + uint64_t + ddt_get_dedup_dspace(spa_t *spa) + { + ddt_stat_t dds_total = { 0 }; + + ddt_get_dedup_stats(spa, &dds_total); + return (dds_total.dds_ref_dsize - dds_total.dds_dsize); + } + + uint64_t + ddt_get_pool_dedup_ratio(spa_t *spa) + { + ddt_stat_t dds_total = { 0 }; + + ddt_get_dedup_stats(spa, &dds_total); + if (dds_total.dds_dsize == 0) + return 
(100); + + return (dds_total.dds_ref_dsize * 100 / dds_total.dds_dsize); + } + + int + ddt_ditto_copies_needed(ddt_t *ddt, ddt_entry_t *dde, ddt_phys_t *ddp_willref) + { + spa_t *spa = ddt->ddt_spa; + uint64_t total_refcnt = 0; + uint64_t ditto = spa->spa_dedup_ditto; + int total_copies = 0; + int desired_copies = 0; ++ int p; + - for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) { ++ for (p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) { + ddt_phys_t *ddp = &dde->dde_phys[p]; + zio_t *zio = dde->dde_lead_zio[p]; + uint64_t refcnt = ddp->ddp_refcnt; /* committed refs */ + if (zio != NULL) + refcnt += zio->io_parent_count; /* pending refs */ + if (ddp == ddp_willref) + refcnt++; /* caller's ref */ + if (refcnt != 0) { + total_refcnt += refcnt; + total_copies += p; + } + } + + if (ditto == 0 || ditto > UINT32_MAX) + ditto = UINT32_MAX; + + if (total_refcnt >= 1) + desired_copies++; + if (total_refcnt >= ditto) + desired_copies++; + if (total_refcnt >= ditto * ditto) + desired_copies++; + + return (MAX(desired_copies, total_copies) - total_copies); + } + + int + ddt_ditto_copies_present(ddt_entry_t *dde) + { + ddt_phys_t *ddp = &dde->dde_phys[DDT_PHYS_DITTO]; + dva_t *dva = ddp->ddp_dva; + int copies = 0 - DVA_GET_GANG(dva); ++ int d; + - for (int d = 0; d < SPA_DVAS_PER_BP; d++, dva++) ++ for (d = 0; d < SPA_DVAS_PER_BP; d++, dva++) + if (DVA_IS_VALID(dva)) + copies++; + + ASSERT(copies >= 0 && copies < SPA_DVAS_PER_BP); + + return (copies); + } + + size_t + ddt_compress(void *src, uchar_t *dst, size_t s_len, size_t d_len) + { + uchar_t *version = dst++; + int cpfunc = ZIO_COMPRESS_ZLE; + zio_compress_info_t *ci = &zio_compress_table[cpfunc]; + size_t c_len; + + ASSERT(d_len >= s_len + 1); /* no compression plus version byte */ + + c_len = ci->ci_compress(src, dst, s_len, d_len - 1, ci->ci_level); + + if (c_len == s_len) { + cpfunc = ZIO_COMPRESS_OFF; + bcopy(src, dst, s_len); + } + + *version = (ZFS_HOST_BYTEORDER & DDT_COMPRESS_BYTEORDER_MASK) | cpfunc; + + 
return (c_len + 1); + } + + void + ddt_decompress(uchar_t *src, void *dst, size_t s_len, size_t d_len) + { + uchar_t version = *src++; + int cpfunc = version & DDT_COMPRESS_FUNCTION_MASK; + zio_compress_info_t *ci = &zio_compress_table[cpfunc]; + + if (ci->ci_decompress != NULL) + (void) ci->ci_decompress(src, dst, s_len, d_len, ci->ci_level); + else + bcopy(src, dst, d_len); + + if ((version ^ ZFS_HOST_BYTEORDER) & DDT_COMPRESS_BYTEORDER_MASK) + byteswap_uint64_array(dst, d_len); + } + + ddt_t * + ddt_select_by_checksum(spa_t *spa, enum zio_checksum c) + { + return (spa->spa_ddt[c]); + } + + ddt_t * + ddt_select(spa_t *spa, const blkptr_t *bp) + { + return (spa->spa_ddt[BP_GET_CHECKSUM(bp)]); + } + + void + ddt_enter(ddt_t *ddt) + { + mutex_enter(&ddt->ddt_lock); + } + + void + ddt_exit(ddt_t *ddt) + { + mutex_exit(&ddt->ddt_lock); + } + + static ddt_entry_t * + ddt_alloc(const ddt_key_t *ddk) + { + ddt_entry_t *dde; + + dde = kmem_zalloc(sizeof (ddt_entry_t), KM_SLEEP); + cv_init(&dde->dde_cv, NULL, CV_DEFAULT, NULL); + + dde->dde_key = *ddk; + + return (dde); + } + + static void + ddt_free(ddt_entry_t *dde) + { + ASSERT(!dde->dde_loading); ++ int p; + - for (int p = 0; p < DDT_PHYS_TYPES; p++) ++ for (p = 0; p < DDT_PHYS_TYPES; p++) + ASSERT(dde->dde_lead_zio[p] == NULL); + + if (dde->dde_repair_data != NULL) + zio_buf_free(dde->dde_repair_data, + DDK_GET_PSIZE(&dde->dde_key)); + + cv_destroy(&dde->dde_cv); + kmem_free(dde, sizeof (*dde)); + } + + void + ddt_remove(ddt_t *ddt, ddt_entry_t *dde) + { + ASSERT(MUTEX_HELD(&ddt->ddt_lock)); + + avl_remove(&ddt->ddt_tree, dde); + ddt_free(dde); + } + + ddt_entry_t * + ddt_lookup(ddt_t *ddt, const blkptr_t *bp, boolean_t add) + { + ddt_entry_t *dde, dde_search; + enum ddt_type type; + enum ddt_class class; + avl_index_t where; + int error; + + ASSERT(MUTEX_HELD(&ddt->ddt_lock)); + + ddt_key_fill(&dde_search.dde_key, bp); + + dde = avl_find(&ddt->ddt_tree, &dde_search, &where); + if (dde == NULL) { + if (!add) + return 
(NULL); + dde = ddt_alloc(&dde_search.dde_key); + avl_insert(&ddt->ddt_tree, dde, where); + } + + while (dde->dde_loading) + cv_wait(&dde->dde_cv, &ddt->ddt_lock); + + if (dde->dde_loaded) + return (dde); + + dde->dde_loading = B_TRUE; + + ddt_exit(ddt); + + error = ENOENT; + + for (type = 0; type < DDT_TYPES; type++) { + for (class = 0; class < DDT_CLASSES; class++) { + error = ddt_object_lookup(ddt, type, class, dde); + if (error != ENOENT) + break; + } + if (error != ENOENT) + break; + } + + ASSERT(error == 0 || error == ENOENT); + + ddt_enter(ddt); + + ASSERT(dde->dde_loaded == B_FALSE); + ASSERT(dde->dde_loading == B_TRUE); + + dde->dde_type = type; /* will be DDT_TYPES if no entry found */ + dde->dde_class = class; /* will be DDT_CLASSES if no entry found */ + dde->dde_loaded = B_TRUE; + dde->dde_loading = B_FALSE; + + if (error == 0) + ddt_stat_update(ddt, dde, -1ULL); + + cv_broadcast(&dde->dde_cv); + + return (dde); + } + + void + ddt_prefetch(spa_t *spa, const blkptr_t *bp) + { + ddt_t *ddt; + ddt_entry_t dde; + + if (!BP_GET_DEDUP(bp)) + return; + + /* + * We remove the DDT once it's empty and only prefetch dedup blocks + * when there are entries in the DDT. Thus no locking is required + * as the DDT can't disappear on us. 
+ */ + ddt = ddt_select(spa, bp); + ddt_key_fill(&dde.dde_key, bp); + + for (enum ddt_type type = 0; type < DDT_TYPES; type++) { + for (enum ddt_class class = 0; class < DDT_CLASSES; class++) { + ddt_object_prefetch(ddt, type, class, &dde); + } + } + } + + int + ddt_entry_compare(const void *x1, const void *x2) + { + const ddt_entry_t *dde1 = x1; + const ddt_entry_t *dde2 = x2; + const uint64_t *u1 = (const uint64_t *)&dde1->dde_key; + const uint64_t *u2 = (const uint64_t *)&dde2->dde_key; ++ int i; + - for (int i = 0; i < DDT_KEY_WORDS; i++) { ++ for (i = 0; i < DDT_KEY_WORDS; i++) { + if (u1[i] < u2[i]) + return (-1); + if (u1[i] > u2[i]) + return (1); + } + + return (0); + } + + static ddt_t * + ddt_table_alloc(spa_t *spa, enum zio_checksum c) + { + ddt_t *ddt; + + ddt = kmem_zalloc(sizeof (*ddt), KM_SLEEP); + + mutex_init(&ddt->ddt_lock, NULL, MUTEX_DEFAULT, NULL); + avl_create(&ddt->ddt_tree, ddt_entry_compare, + sizeof (ddt_entry_t), offsetof(ddt_entry_t, dde_node)); + avl_create(&ddt->ddt_repair_tree, ddt_entry_compare, + sizeof (ddt_entry_t), offsetof(ddt_entry_t, dde_node)); + ddt->ddt_checksum = c; + ddt->ddt_spa = spa; + ddt->ddt_os = spa->spa_meta_objset; + + return (ddt); + } + + static void + ddt_table_free(ddt_t *ddt) + { + ASSERT(avl_numnodes(&ddt->ddt_tree) == 0); + ASSERT(avl_numnodes(&ddt->ddt_repair_tree) == 0); + avl_destroy(&ddt->ddt_tree); + avl_destroy(&ddt->ddt_repair_tree); + mutex_destroy(&ddt->ddt_lock); + kmem_free(ddt, sizeof (*ddt)); + } + + void + ddt_create(spa_t *spa) + { + spa->spa_dedup_checksum = ZIO_DEDUPCHECKSUM; + + for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) + spa->spa_ddt[c] = ddt_table_alloc(spa, c); + } + + int + ddt_load(spa_t *spa) + { + int error; + + ddt_create(spa); + + error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_DDT_STATS, sizeof (uint64_t), 1, + &spa->spa_ddt_stat_object); + + if (error) + return (error == ENOENT ? 
0 : error); + + for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) { + ddt_t *ddt = spa->spa_ddt[c]; + for (enum ddt_type type = 0; type < DDT_TYPES; type++) { + for (enum ddt_class class = 0; class < DDT_CLASSES; + class++) { + error = ddt_object_load(ddt, type, class); + if (error != 0 && error != ENOENT) + return (error); + } + } + + /* + * Seed the cached histograms. + */ + bcopy(ddt->ddt_histogram, &ddt->ddt_histogram_cache, + sizeof (ddt->ddt_histogram)); + } + + return (0); + } + + void + ddt_unload(spa_t *spa) + { + for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) { + if (spa->spa_ddt[c]) { + ddt_table_free(spa->spa_ddt[c]); + spa->spa_ddt[c] = NULL; + } + } + } + + boolean_t + ddt_class_contains(spa_t *spa, enum ddt_class max_class, const blkptr_t *bp) + { + ddt_t *ddt; + ddt_entry_t dde; + + if (!BP_GET_DEDUP(bp)) + return (B_FALSE); + + if (max_class == DDT_CLASS_UNIQUE) + return (B_TRUE); + + ddt = spa->spa_ddt[BP_GET_CHECKSUM(bp)]; + + ddt_key_fill(&dde.dde_key, bp); + + for (enum ddt_type type = 0; type < DDT_TYPES; type++) + for (enum ddt_class class = 0; class <= max_class; class++) + if (ddt_object_lookup(ddt, type, class, &dde) == 0) + return (B_TRUE); + + return (B_FALSE); + } + + ddt_entry_t * + ddt_repair_start(ddt_t *ddt, const blkptr_t *bp) + { + ddt_key_t ddk; + ddt_entry_t *dde; + + ddt_key_fill(&ddk, bp); + + dde = ddt_alloc(&ddk); + + for (enum ddt_type type = 0; type < DDT_TYPES; type++) { + for (enum ddt_class class = 0; class < DDT_CLASSES; class++) { + /* + * We can only do repair if there are multiple copies + * of the block. For anything in the UNIQUE class, + * there's definitely only one copy, so don't even try. 
+ */ + if (class != DDT_CLASS_UNIQUE && + ddt_object_lookup(ddt, type, class, dde) == 0) + return (dde); + } + } + + bzero(dde->dde_phys, sizeof (dde->dde_phys)); + + return (dde); + } + + void + ddt_repair_done(ddt_t *ddt, ddt_entry_t *dde) + { + avl_index_t where; + + ddt_enter(ddt); + + if (dde->dde_repair_data != NULL && spa_writeable(ddt->ddt_spa) && + avl_find(&ddt->ddt_repair_tree, dde, &where) == NULL) + avl_insert(&ddt->ddt_repair_tree, dde, where); + else + ddt_free(dde); + + ddt_exit(ddt); + } + + static void + ddt_repair_entry_done(zio_t *zio) + { + ddt_entry_t *rdde = zio->io_private; + + ddt_free(rdde); + } + + static void + ddt_repair_entry(ddt_t *ddt, ddt_entry_t *dde, ddt_entry_t *rdde, zio_t *rio) + { + ddt_phys_t *ddp = dde->dde_phys; + ddt_phys_t *rddp = rdde->dde_phys; + ddt_key_t *ddk = &dde->dde_key; + ddt_key_t *rddk = &rdde->dde_key; + zio_t *zio; + blkptr_t blk; ++ int p; + + zio = zio_null(rio, rio->io_spa, NULL, + ddt_repair_entry_done, rdde, rio->io_flags); + - for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++, rddp++) { ++ for (p = 0; p < DDT_PHYS_TYPES; p++, ddp++, rddp++) { + if (ddp->ddp_phys_birth == 0 || + ddp->ddp_phys_birth != rddp->ddp_phys_birth || + bcmp(ddp->ddp_dva, rddp->ddp_dva, sizeof (ddp->ddp_dva))) + continue; + ddt_bp_create(ddt->ddt_checksum, ddk, ddp, &blk); + zio_nowait(zio_rewrite(zio, zio->io_spa, 0, &blk, + rdde->dde_repair_data, DDK_GET_PSIZE(rddk), NULL, NULL, + ZIO_PRIORITY_SYNC_WRITE, ZIO_DDT_CHILD_FLAGS(zio), NULL)); + } + + zio_nowait(zio); + } + + static void + ddt_repair_table(ddt_t *ddt, zio_t *rio) + { + spa_t *spa = ddt->ddt_spa; + ddt_entry_t *dde, *rdde_next, *rdde; + avl_tree_t *t = &ddt->ddt_repair_tree; + blkptr_t blk; + + if (spa_sync_pass(spa) > 1) + return; + + ddt_enter(ddt); + for (rdde = avl_first(t); rdde != NULL; rdde = rdde_next) { + rdde_next = AVL_NEXT(t, rdde); + avl_remove(&ddt->ddt_repair_tree, rdde); + ddt_exit(ddt); + ddt_bp_create(ddt->ddt_checksum, &rdde->dde_key, NULL, &blk); + 
dde = ddt_repair_start(ddt, &blk); + ddt_repair_entry(ddt, dde, rdde, rio); + ddt_repair_done(ddt, dde); + ddt_enter(ddt); + } + ddt_exit(ddt); + } + + static void + ddt_sync_entry(ddt_t *ddt, ddt_entry_t *dde, dmu_tx_t *tx, uint64_t txg) + { + dsl_pool_t *dp = ddt->ddt_spa->spa_dsl_pool; + ddt_phys_t *ddp = dde->dde_phys; + ddt_key_t *ddk = &dde->dde_key; + enum ddt_type otype = dde->dde_type; + enum ddt_type ntype = DDT_TYPE_CURRENT; + enum ddt_class oclass = dde->dde_class; + enum ddt_class nclass; + uint64_t total_refcnt = 0; ++ int p; + + ASSERT(dde->dde_loaded); + ASSERT(!dde->dde_loading); + - for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) { ++ for (p = 0; p < DDT_PHYS_TYPES; p++, ddp++) { + ASSERT(dde->dde_lead_zio[p] == NULL); + ASSERT((int64_t)ddp->ddp_refcnt >= 0); + if (ddp->ddp_phys_birth == 0) { + ASSERT(ddp->ddp_refcnt == 0); + continue; + } + if (p == DDT_PHYS_DITTO) { + if (ddt_ditto_copies_needed(ddt, dde, NULL) == 0) + ddt_phys_free(ddt, ddk, ddp, txg); + continue; + } + if (ddp->ddp_refcnt == 0) + ddt_phys_free(ddt, ddk, ddp, txg); + total_refcnt += ddp->ddp_refcnt; + } + + if (dde->dde_phys[DDT_PHYS_DITTO].ddp_phys_birth != 0) + nclass = DDT_CLASS_DITTO; + else if (total_refcnt > 1) + nclass = DDT_CLASS_DUPLICATE; + else + nclass = DDT_CLASS_UNIQUE; + + if (otype != DDT_TYPES && + (otype != ntype || oclass != nclass || total_refcnt == 0)) { + VERIFY(ddt_object_remove(ddt, otype, oclass, dde, tx) == 0); + ASSERT(ddt_object_lookup(ddt, otype, oclass, dde) == ENOENT); + } + + if (total_refcnt != 0) { + dde->dde_type = ntype; + dde->dde_class = nclass; + ddt_stat_update(ddt, dde, 0); + if (!ddt_object_exists(ddt, ntype, nclass)) + ddt_object_create(ddt, ntype, nclass, tx); + VERIFY(ddt_object_update(ddt, ntype, nclass, dde, tx) == 0); + + /* + * If the class changes, the order that we scan this bp + * changes. If it decreases, we could miss it, so + * scan it right now. 
(This covers both class changing + * while we are doing ddt_walk(), and when we are + * traversing.) + */ + if (nclass < oclass) { + dsl_scan_ddt_entry(dp->dp_scan, + ddt->ddt_checksum, dde, tx); + } + } + } + + static void + ddt_sync_table(ddt_t *ddt, dmu_tx_t *tx, uint64_t txg) + { + spa_t *spa = ddt->ddt_spa; + ddt_entry_t *dde; + void *cookie = NULL; + + if (avl_numnodes(&ddt->ddt_tree) == 0) + return; + + ASSERT(spa->spa_uberblock.ub_version >= SPA_VERSION_DEDUP); + + if (spa->spa_ddt_stat_object == 0) { + spa->spa_ddt_stat_object = zap_create(ddt->ddt_os, + DMU_OT_DDT_STATS, DMU_OT_NONE, 0, tx); + VERIFY(zap_add(ddt->ddt_os, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_DDT_STATS, sizeof (uint64_t), 1, + &spa->spa_ddt_stat_object, tx) == 0); + } + + while ((dde = avl_destroy_nodes(&ddt->ddt_tree, &cookie)) != NULL) { + ddt_sync_entry(ddt, dde, tx, txg); + ddt_free(dde); + } + + for (enum ddt_type type = 0; type < DDT_TYPES; type++) { + for (enum ddt_class class = 0; class < DDT_CLASSES; class++) { + if (!ddt_object_exists(ddt, type, class)) + continue; + ddt_object_sync(ddt, type, class, tx); + if (ddt_object_count(ddt, type, class) == 0) + ddt_object_destroy(ddt, type, class, tx); + } + } + + bcopy(ddt->ddt_histogram, &ddt->ddt_histogram_cache, + sizeof (ddt->ddt_histogram)); + } + + void + ddt_sync(spa_t *spa, uint64_t txg) + { + dmu_tx_t *tx; + zio_t *rio = zio_root(spa, NULL, NULL, + ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE); + + ASSERT(spa_syncing_txg(spa) == txg); + + tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); + + for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) { + ddt_t *ddt = spa->spa_ddt[c]; + if (ddt == NULL) + continue; + ddt_sync_table(ddt, tx, txg); + ddt_repair_table(ddt, rio); + } + + (void) zio_wait(rio); + + dmu_tx_commit(tx); + } + + int + ddt_walk(spa_t *spa, ddt_bookmark_t *ddb, ddt_entry_t *dde) + { + do { + do { + do { + ddt_t *ddt = spa->spa_ddt[ddb->ddb_checksum]; + int error = ENOENT; + if (ddt_object_exists(ddt, 
ddb->ddb_type, + ddb->ddb_class)) { + error = ddt_object_walk(ddt, + ddb->ddb_type, ddb->ddb_class, + &ddb->ddb_cursor, dde); + } + dde->dde_type = ddb->ddb_type; + dde->dde_class = ddb->ddb_class; + if (error == 0) + return (0); + if (error != ENOENT) + return (error); + ddb->ddb_cursor = 0; + } while (++ddb->ddb_checksum < ZIO_CHECKSUM_FUNCTIONS); + ddb->ddb_checksum = 0; + } while (++ddb->ddb_type < DDT_TYPES); + ddb->ddb_type = 0; + } while (++ddb->ddb_class < DDT_CLASSES); + + return (ENOENT); + } diff --cc module/zfs/dmu.c index d86468202,5b87c81c6..ad7a8f74f --- a/module/zfs/dmu.c +++ b/module/zfs/dmu.c @@@ -1148,6 -1519,8 +1519,9 @@@ dmu_offset_next(objset_t *os, uint64_t void dmu_object_info_from_dnode(dnode_t *dn, dmu_object_info_t *doi) { + dnode_phys_t *dnp; ++ int i; + rw_enter(&dn->dn_struct_rwlock, RW_READER); mutex_enter(&dn->dn_mtx); @@@ -1157,12 -1535,11 +1536,11 @@@ doi->doi_indirection = dn->dn_nlevels; doi->doi_checksum = dn->dn_checksum; doi->doi_compress = dn->dn_compress; - doi->doi_physical_blks = (DN_USED_BYTES(dn->dn_phys) + - SPA_MINBLOCKSIZE/2) >> SPA_MINBLOCKSHIFT; - doi->doi_max_block_offset = dn->dn_phys->dn_maxblkid; - doi->doi_type = dn->dn_type; - doi->doi_bonus_size = dn->dn_bonuslen; - doi->doi_bonus_type = dn->dn_bonustype; + doi->doi_physical_blocks_512 = (DN_USED_BYTES(dnp) + 256) >> 9; + doi->doi_max_offset = (dnp->dn_maxblkid + 1) * dn->dn_datablksz; + doi->doi_fill_count = 0; - for (int i = 0; i < dnp->dn_nblkptr; i++) ++ for (i = 0; i < dnp->dn_nblkptr; i++) + doi->doi_fill_count += dnp->dn_blkptr[i].blk_fill; mutex_exit(&dn->dn_mtx); rw_exit(&dn->dn_struct_rwlock); diff --cc module/zfs/dmu_objset.c index 8bb6ce2e3,690e6ecde..2ff085e44 --- a/module/zfs/dmu_objset.c +++ b/module/zfs/dmu_objset.c @@@ -452,16 -502,12 +502,13 @@@ dmu_objset_evict_dbufs(objset_t *os } void - dmu_objset_evict(dsl_dataset_t *ds, void *arg) + dmu_objset_evict(objset_t *os) { - objset_impl_t *osi = arg; - objset_t os; - int i; + dsl_dataset_t *ds 
= os->os_dsl_dataset; ++ int t; - for (i = 0; i < TXG_SIZE; i++) { - ASSERT(list_head(&osi->os_dirty_dnodes[i]) == NULL); - ASSERT(list_head(&osi->os_free_dnodes[i]) == NULL); - } - for (int t = 0; t < TXG_SIZE; t++) ++ for (t = 0; t < TXG_SIZE; t++) + ASSERT(!dmu_objset_is_dirty(os, t)); if (ds) { if (!dsl_dataset_is_snapshot(ds)) { @@@ -888,13 -949,10 +950,12 @@@ dmu_objset_sync_dnodes(list_t *list, li /* ARGSUSED */ static void - ready(zio_t *zio, arc_buf_t *abuf, void *arg) + dmu_objset_write_ready(zio_t *zio, arc_buf_t *abuf, void *arg) { + int i; + blkptr_t *bp = zio->io_bp; - blkptr_t *bp_orig = &zio->io_bp_orig; - objset_impl_t *os = arg; + objset_t *os = arg; dnode_phys_t *dnp = &os->os_phys->os_meta_dnode; ASSERT(bp == os->os_rootbp); @@@ -908,16 -966,26 +969,26 @@@ * dnode and user/group accounting objects). */ bp->blk_fill = 0; - for (int i = 0; i < dnp->dn_nblkptr; i++) + for (i = 0; i < dnp->dn_nblkptr; i++) bp->blk_fill += dnp->dn_blkptr[i].blk_fill; + } + + /* ARGSUSED */ + static void + dmu_objset_write_done(zio_t *zio, arc_buf_t *abuf, void *arg) + { + blkptr_t *bp = zio->io_bp; + blkptr_t *bp_orig = &zio->io_bp_orig; + objset_t *os = arg; if (zio->io_flags & ZIO_FLAG_IO_REWRITE) { - ASSERT(DVA_EQUAL(BP_IDENTITY(bp), BP_IDENTITY(bp_orig))); + ASSERT(BP_EQUAL(bp, bp_orig)); } else { - if (zio->io_bp_orig.blk_birth == os->os_synctx->tx_txg) - (void) dsl_dataset_block_kill(os->os_dsl_dataset, - &zio->io_bp_orig, zio, os->os_synctx); - dsl_dataset_block_born(os->os_dsl_dataset, bp, os->os_synctx); + dsl_dataset_t *ds = os->os_dsl_dataset; + dmu_tx_t *tx = os->os_synctx; + + (void) dsl_dataset_block_kill(ds, bp_orig, tx, B_TRUE); + dsl_dataset_block_born(ds, bp, tx); } } diff --cc module/zfs/dmu_tx.c index c6fbeeef0,5fc062c16..32dbea622 --- a/module/zfs/dmu_tx.c +++ b/module/zfs/dmu_tx.c @@@ -203,6 -216,6 +216,7 @@@ dmu_tx_count_write(dmu_tx_hold_t *txh, uint64_t start, end, i; int min_bs, max_bs, min_ibs, max_ibs, epbs, bits; int err = 0; ++ int l; if 
(len == 0) return; @@@ -289,11 -302,11 +303,11 @@@ * If this write is not off the end of the file * we need to account for overwrites/unref. */ - if (start <= dn->dn_maxblkid) - bzero(last, sizeof (dmu_buf_impl_t *) * DN_MAX_LEVELS); + if (start <= dn->dn_maxblkid) { - for (int l = 0; l < DN_MAX_LEVELS; l++) ++ for (l = 0; l < DN_MAX_LEVELS; l++) + history[l] = -1ULL; + } while (start <= dn->dn_maxblkid) { - spa_t *spa = txh->txh_tx->tx_pool->dp_spa; - dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset; dmu_buf_impl_t *db; rw_enter(&dn->dn_struct_rwlock, RW_READER); diff --cc module/zfs/dsl_dataset.c index 58fc78684,ddd83576c..2e1fff35a --- a/module/zfs/dsl_dataset.c +++ b/module/zfs/dsl_dataset.c @@@ -77,16 -87,14 +87,16 @@@ parent_delta(dsl_dataset_t *ds, int64_ } void - dsl_dataset_block_born(dsl_dataset_t *ds, blkptr_t *bp, dmu_tx_t *tx) + dsl_dataset_block_born(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx) { - int used = bp_get_dsize_sync(tx->tx_pool->dp_spa, bp); - int compressed = BP_GET_PSIZE(bp); - int uncompressed = BP_GET_UCSIZE(bp); + int used, compressed, uncompressed; int64_t delta; + used = bp_get_dasize(tx->tx_pool->dp_spa, bp); + compressed = BP_GET_PSIZE(bp); + uncompressed = BP_GET_UCSIZE(bp); + - dprintf_bp(bp, "born, ds=%p\n", ds); + dprintf_bp(bp, "ds=%p", ds); ASSERT(dmu_tx_is_syncing(tx)); /* It could have been compressed away to nothing */ diff --cc module/zfs/dsl_scan.c index 000000000,23c37c7cc..e402dde7c mode 000000,100644..100644 --- a/module/zfs/dsl_scan.c +++ b/module/zfs/dsl_scan.c @@@ -1,0 -1,1739 +1,1741 @@@ + /* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. 
+ * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + /* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + */ + + #include + #include + #include + #include + #include + #include + #include + #include + #include + #include + #include + #include + #include + #include + #include + #include + #include + #include + #include + #include + #include + #include + #ifdef _KERNEL + #include + #endif + + typedef int (scan_cb_t)(dsl_pool_t *, const blkptr_t *, const zbookmark_t *); + + static scan_cb_t dsl_scan_defrag_cb; + static scan_cb_t dsl_scan_scrub_cb; + static scan_cb_t dsl_scan_remove_cb; + static dsl_syncfunc_t dsl_scan_cancel_sync; + static void dsl_scan_sync_state(dsl_scan_t *, dmu_tx_t *tx); + + int zfs_scan_min_time_ms = 1000; /* min millisecs to scrub per txg */ + int zfs_free_min_time_ms = 1000; /* min millisecs to free per txg */ + int zfs_resilver_min_time_ms = 3000; /* min millisecs to resilver per txg */ + boolean_t zfs_no_scrub_io = B_FALSE; /* set to disable scrub i/o */ + boolean_t zfs_no_scrub_prefetch = B_FALSE; /* set to disable scrub prefetching */ + enum ddt_class zfs_scrub_ddt_class_max = DDT_CLASS_DUPLICATE; + int dsl_scan_delay_completion = B_FALSE; /* set to delay scan completion */ + + #define DSL_SCAN_IS_SCRUB_RESILVER(scn) \ + ((scn)->scn_phys.scn_func == POOL_SCAN_SCRUB || \ + (scn)->scn_phys.scn_func == POOL_SCAN_RESILVER) + + extern int zfs_txg_timeout; + + /* the order has to match pool_scan_type */ + static scan_cb_t *scan_funcs[POOL_SCAN_FUNCS] = { + NULL, + dsl_scan_scrub_cb, 
/* POOL_SCAN_SCRUB */ + dsl_scan_scrub_cb, /* POOL_SCAN_RESILVER */ + }; + + int + dsl_scan_init(dsl_pool_t *dp, uint64_t txg) + { + int err; + dsl_scan_t *scn; + spa_t *spa = dp->dp_spa; + uint64_t f; + + scn = dp->dp_scan = kmem_zalloc(sizeof (dsl_scan_t), KM_SLEEP); + scn->scn_dp = dp; + + err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, + "scrub_func", sizeof (uint64_t), 1, &f); + if (err == 0) { + /* + * There was an old-style scrub in progress. Restart a + * new-style scrub from the beginning. + */ + scn->scn_restart_txg = txg; + zfs_dbgmsg("old-style scrub was in progress; " + "restarting new-style scrub in txg %llu", + scn->scn_restart_txg); + + /* + * Load the queue obj from the old location so that it + * can be freed by dsl_scan_done(). + */ + (void) zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, + "scrub_queue", sizeof (uint64_t), 1, + &scn->scn_phys.scn_queue_obj); + } else { + err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_SCAN, sizeof (uint64_t), SCAN_PHYS_NUMINTS, + &scn->scn_phys); + if (err == ENOENT) + return (0); + else if (err) + return (err); + + if (scn->scn_phys.scn_state == DSS_SCANNING && + spa_prev_software_version(dp->dp_spa) < SPA_VERSION_SCAN) { + /* + * A new-type scrub was in progress on an old + * pool, and the pool was accessed by old + * software. Restart from the beginning, since + * the old software may have changed the pool in + * the meantime. 
+ */ + scn->scn_restart_txg = txg; + zfs_dbgmsg("new-style scrub was modified " + "by old software; restarting in txg %llu", + scn->scn_restart_txg); + } + } + + spa_scan_stat_init(spa); + return (0); + } + + void + dsl_scan_fini(dsl_pool_t *dp) + { + if (dp->dp_scan) { + kmem_free(dp->dp_scan, sizeof (dsl_scan_t)); + dp->dp_scan = NULL; + } + } + + /* ARGSUSED */ + static int + dsl_scan_setup_check(void *arg1, void *arg2, dmu_tx_t *tx) + { + dsl_scan_t *scn = arg1; + + if (scn->scn_phys.scn_state == DSS_SCANNING) + return (EBUSY); + + return (0); + } + + /* ARGSUSED */ + static void + dsl_scan_setup_sync(void *arg1, void *arg2, dmu_tx_t *tx) + { + dsl_scan_t *scn = arg1; + pool_scan_func_t *funcp = arg2; + dmu_object_type_t ot = 0; + dsl_pool_t *dp = scn->scn_dp; + spa_t *spa = dp->dp_spa; + + ASSERT(scn->scn_phys.scn_state != DSS_SCANNING); + ASSERT(*funcp > POOL_SCAN_NONE && *funcp < POOL_SCAN_FUNCS); + bzero(&scn->scn_phys, sizeof (scn->scn_phys)); + scn->scn_phys.scn_func = *funcp; + scn->scn_phys.scn_state = DSS_SCANNING; + scn->scn_phys.scn_min_txg = 0; + scn->scn_phys.scn_max_txg = tx->tx_txg; + scn->scn_phys.scn_ddt_class_max = DDT_CLASSES - 1; /* the entire DDT */ + scn->scn_phys.scn_start_time = gethrestime_sec(); + scn->scn_phys.scn_errors = 0; + scn->scn_phys.scn_to_examine = spa->spa_root_vdev->vdev_stat.vs_alloc; + scn->scn_restart_txg = 0; + spa_scan_stat_init(spa); + + if (DSL_SCAN_IS_SCRUB_RESILVER(scn)) { + scn->scn_phys.scn_ddt_class_max = zfs_scrub_ddt_class_max; + + /* rewrite all disk labels */ + vdev_config_dirty(spa->spa_root_vdev); + + if (vdev_resilver_needed(spa->spa_root_vdev, + &scn->scn_phys.scn_min_txg, &scn->scn_phys.scn_max_txg)) { + spa_event_notify(spa, NULL, ESC_ZFS_RESILVER_START); + } else { + spa_event_notify(spa, NULL, ESC_ZFS_SCRUB_START); + } + + spa->spa_scrub_started = B_TRUE; + /* + * If this is an incremental scrub, limit the DDT scrub phase + * to just the auto-ditto class (for correctness); the rest + * of the scrub 
should go faster using top-down pruning. + */ + if (scn->scn_phys.scn_min_txg > TXG_INITIAL) + scn->scn_phys.scn_ddt_class_max = DDT_CLASS_DITTO; + + } + + /* back to the generic stuff */ + + if (dp->dp_blkstats == NULL) { + dp->dp_blkstats = + kmem_alloc(sizeof (zfs_all_blkstats_t), KM_SLEEP); + } + bzero(dp->dp_blkstats, sizeof (zfs_all_blkstats_t)); + + if (spa_version(spa) < SPA_VERSION_DSL_SCRUB) + ot = DMU_OT_ZAP_OTHER; + + scn->scn_phys.scn_queue_obj = zap_create(dp->dp_meta_objset, + ot ? ot : DMU_OT_SCAN_QUEUE, DMU_OT_NONE, 0, tx); + + dsl_scan_sync_state(scn, tx); + + spa_history_log_internal(LOG_POOL_SCAN, spa, tx, + "func=%u mintxg=%llu maxtxg=%llu", + *funcp, scn->scn_phys.scn_min_txg, scn->scn_phys.scn_max_txg); + } + + /* ARGSUSED */ + static void + dsl_scan_done(dsl_scan_t *scn, boolean_t complete, dmu_tx_t *tx) + { + static const char *old_names[] = { + "scrub_bookmark", + "scrub_ddt_bookmark", + "scrub_ddt_class_max", + "scrub_queue", + "scrub_min_txg", + "scrub_max_txg", + "scrub_func", + "scrub_errors", + NULL + }; + + dsl_pool_t *dp = scn->scn_dp; + spa_t *spa = dp->dp_spa; + int i; + + /* Remove any remnants of an old-style scrub. */ + for (i = 0; old_names[i]; i++) { + (void) zap_remove(dp->dp_meta_objset, + DMU_POOL_DIRECTORY_OBJECT, old_names[i], tx); + } + + if (scn->scn_phys.scn_queue_obj != 0) { + VERIFY(0 == dmu_object_free(dp->dp_meta_objset, + scn->scn_phys.scn_queue_obj, tx)); + scn->scn_phys.scn_queue_obj = 0; + } + + /* + * If we were "restarted" from a stopped state, don't bother + * with anything else. 
+ */ + if (scn->scn_phys.scn_state != DSS_SCANNING) + return; + + if (complete) + scn->scn_phys.scn_state = DSS_FINISHED; + else + scn->scn_phys.scn_state = DSS_CANCELED; + + spa_history_log_internal(LOG_POOL_SCAN_DONE, spa, tx, + "complete=%u", complete); + + if (DSL_SCAN_IS_SCRUB_RESILVER(scn)) { + mutex_enter(&spa->spa_scrub_lock); + while (spa->spa_scrub_inflight > 0) { + cv_wait(&spa->spa_scrub_io_cv, + &spa->spa_scrub_lock); + } + mutex_exit(&spa->spa_scrub_lock); + spa->spa_scrub_started = B_FALSE; + spa->spa_scrub_active = B_FALSE; + + /* + * If the scrub/resilver completed, update all DTLs to + * reflect this. Whether it succeeded or not, vacate + * all temporary scrub DTLs. + */ + vdev_dtl_reassess(spa->spa_root_vdev, tx->tx_txg, + complete ? scn->scn_phys.scn_max_txg : 0, B_TRUE); + if (complete) { + spa_event_notify(spa, NULL, scn->scn_phys.scn_min_txg ? + ESC_ZFS_RESILVER_FINISH : ESC_ZFS_SCRUB_FINISH); + } + spa_errlog_rotate(spa); + + /* + * We may have finished replacing a device. + * Let the async thread assess this and handle the detach. 
+ */ + spa_async_request(spa, SPA_ASYNC_RESILVER_DONE); + } + + scn->scn_phys.scn_end_time = gethrestime_sec(); + } + + /* ARGSUSED */ + static int + dsl_scan_cancel_check(void *arg1, void *arg2, dmu_tx_t *tx) + { + dsl_scan_t *scn = arg1; + + if (scn->scn_phys.scn_state != DSS_SCANNING) + return (ENOENT); + return (0); + } + + /* ARGSUSED */ + static void + dsl_scan_cancel_sync(void *arg1, void *arg2, dmu_tx_t *tx) + { + dsl_scan_t *scn = arg1; + + dsl_scan_done(scn, B_FALSE, tx); + dsl_scan_sync_state(scn, tx); + } + + int + dsl_scan_cancel(dsl_pool_t *dp) + { + boolean_t complete = B_FALSE; + int err; + + err = dsl_sync_task_do(dp, dsl_scan_cancel_check, + dsl_scan_cancel_sync, dp->dp_scan, &complete, 3); + return (err); + } + + static void dsl_scan_visitbp(blkptr_t *bp, + const zbookmark_t *zb, dnode_phys_t *dnp, arc_buf_t *pbuf, + dsl_dataset_t *ds, dsl_scan_t *scn, dmu_objset_type_t ostype, + dmu_tx_t *tx); + static void dsl_scan_visitdnode(dsl_scan_t *, dsl_dataset_t *ds, + dmu_objset_type_t ostype, + dnode_phys_t *dnp, arc_buf_t *buf, uint64_t object, dmu_tx_t *tx); + + void + dsl_free(dsl_pool_t *dp, uint64_t txg, const blkptr_t *bp) + { + zio_free(dp->dp_spa, txg, bp); + } + + void + dsl_free_sync(zio_t *pio, dsl_pool_t *dp, uint64_t txg, const blkptr_t *bpp) + { + ASSERT(dsl_pool_sync_context(dp)); + zio_nowait(zio_free_sync(pio, dp->dp_spa, txg, bpp, pio->io_flags)); + } + + int + dsl_read(zio_t *pio, spa_t *spa, const blkptr_t *bpp, arc_buf_t *pbuf, + arc_done_func_t *done, void *private, int priority, int zio_flags, + uint32_t *arc_flags, const zbookmark_t *zb) + { + return (arc_read(pio, spa, bpp, pbuf, done, private, + priority, zio_flags, arc_flags, zb)); + } + + int + dsl_read_nolock(zio_t *pio, spa_t *spa, const blkptr_t *bpp, + arc_done_func_t *done, void *private, int priority, int zio_flags, + uint32_t *arc_flags, const zbookmark_t *zb) + { + return (arc_read_nolock(pio, spa, bpp, done, private, + priority, zio_flags, arc_flags, zb)); + } + + 
static boolean_t + bookmark_is_zero(const zbookmark_t *zb) + { + return (zb->zb_objset == 0 && zb->zb_object == 0 && + zb->zb_level == 0 && zb->zb_blkid == 0); + } + + /* dnp is the dnode for zb1->zb_object */ + static boolean_t + bookmark_is_before(const dnode_phys_t *dnp, const zbookmark_t *zb1, + const zbookmark_t *zb2) + { + uint64_t zb1nextL0, zb2thisobj; + + ASSERT(zb1->zb_objset == zb2->zb_objset); + ASSERT(zb2->zb_level == 0); + + /* + * A bookmark in the deadlist is considered to be after + * everything else. + */ + if (zb2->zb_object == DMU_DEADLIST_OBJECT) + return (B_TRUE); + + /* The objset_phys_t isn't before anything. */ + if (dnp == NULL) + return (B_FALSE); + + zb1nextL0 = (zb1->zb_blkid + 1) << + ((zb1->zb_level) * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT)); + + zb2thisobj = zb2->zb_object ? zb2->zb_object : + zb2->zb_blkid << (DNODE_BLOCK_SHIFT - DNODE_SHIFT); + + if (zb1->zb_object == DMU_META_DNODE_OBJECT) { + uint64_t nextobj = zb1nextL0 * + (dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT) >> DNODE_SHIFT; + return (nextobj <= zb2thisobj); + } + + if (zb1->zb_object < zb2thisobj) + return (B_TRUE); + if (zb1->zb_object > zb2thisobj) + return (B_FALSE); + if (zb2->zb_object == DMU_META_DNODE_OBJECT) + return (B_FALSE); + return (zb1nextL0 <= zb2->zb_blkid); + } + + static uint64_t + dsl_scan_ds_maxtxg(dsl_dataset_t *ds) + { + uint64_t smt = ds->ds_dir->dd_pool->dp_scan->scn_phys.scn_max_txg; + if (dsl_dataset_is_snapshot(ds)) + return (MIN(smt, ds->ds_phys->ds_creation_txg)); + return (smt); + } + + static void + dsl_scan_sync_state(dsl_scan_t *scn, dmu_tx_t *tx) + { + VERIFY(0 == zap_update(scn->scn_dp->dp_meta_objset, + DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_SCAN, sizeof (uint64_t), SCAN_PHYS_NUMINTS, + &scn->scn_phys, tx)); + } + + static boolean_t + dsl_scan_check_pause(dsl_scan_t *scn, const zbookmark_t *zb) + { + uint64_t elapsed_nanosecs; + int mintime; + + /* we never skip user/group accounting objects */ + if (zb && (int64_t)zb->zb_object < 0) 
+ return (B_FALSE); + + if (scn->scn_pausing) + return (B_TRUE); /* we're already pausing */ + + if (!bookmark_is_zero(&scn->scn_phys.scn_bookmark)) + return (B_FALSE); /* we're resuming */ + + /* We only know how to resume from level-0 blocks. */ + if (zb && zb->zb_level != 0) + return (B_FALSE); + + mintime = (scn->scn_phys.scn_func == POOL_SCAN_RESILVER) ? + zfs_resilver_min_time_ms : zfs_scan_min_time_ms; + elapsed_nanosecs = gethrtime() - scn->scn_sync_start_time; + if (elapsed_nanosecs / NANOSEC > zfs_txg_timeout || + (elapsed_nanosecs / MICROSEC > mintime && + txg_sync_waiting(scn->scn_dp)) || + spa_shutting_down(scn->scn_dp->dp_spa)) { + if (zb) { + dprintf("pausing at bookmark %llx/%llx/%llx/%llx\n", + (longlong_t)zb->zb_objset, + (longlong_t)zb->zb_object, + (longlong_t)zb->zb_level, + (longlong_t)zb->zb_blkid); + scn->scn_phys.scn_bookmark = *zb; + } + dprintf("pausing at DDT bookmark %llx/%llx/%llx/%llx\n", + (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_class, + (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_type, + (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_checksum, + (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_cursor); + scn->scn_pausing = B_TRUE; + return (B_TRUE); + } + return (B_FALSE); + } + + typedef struct zil_scan_arg { + dsl_pool_t *zsa_dp; + zil_header_t *zsa_zh; + } zil_scan_arg_t; + + /* ARGSUSED */ + static int + dsl_scan_zil_block(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg) + { + zil_scan_arg_t *zsa = arg; + dsl_pool_t *dp = zsa->zsa_dp; + dsl_scan_t *scn = dp->dp_scan; + zil_header_t *zh = zsa->zsa_zh; + zbookmark_t zb; + + if (bp->blk_birth <= scn->scn_phys.scn_cur_min_txg) + return (0); + + /* + * One block ("stubby") can be allocated a long time ago; we + * want to visit that one because it has been allocated + * (on-disk) even if it hasn't been claimed (even though for + * scrub there's nothing to do to it). 
+ */ + if (claim_txg == 0 && bp->blk_birth >= spa_first_txg(dp->dp_spa)) + return (0); + + SET_BOOKMARK(&zb, zh->zh_log.blk_cksum.zc_word[ZIL_ZC_OBJSET], + ZB_ZIL_OBJECT, ZB_ZIL_LEVEL, bp->blk_cksum.zc_word[ZIL_ZC_SEQ]); + + VERIFY(0 == scan_funcs[scn->scn_phys.scn_func](dp, bp, &zb)); + return (0); + } + + /* ARGSUSED */ + static int + dsl_scan_zil_record(zilog_t *zilog, lr_t *lrc, void *arg, uint64_t claim_txg) + { + if (lrc->lrc_txtype == TX_WRITE) { + zil_scan_arg_t *zsa = arg; + dsl_pool_t *dp = zsa->zsa_dp; + dsl_scan_t *scn = dp->dp_scan; + zil_header_t *zh = zsa->zsa_zh; + lr_write_t *lr = (lr_write_t *)lrc; + blkptr_t *bp = &lr->lr_blkptr; + zbookmark_t zb; + + if (bp->blk_birth <= scn->scn_phys.scn_cur_min_txg) + return (0); + + /* + * birth can be < claim_txg if this record's txg is + * already txg sync'ed (but this log block contains + * other records that are not synced) + */ + if (claim_txg == 0 || bp->blk_birth < claim_txg) + return (0); + + SET_BOOKMARK(&zb, zh->zh_log.blk_cksum.zc_word[ZIL_ZC_OBJSET], + lr->lr_foid, ZB_ZIL_LEVEL, + lr->lr_offset / BP_GET_LSIZE(bp)); + + VERIFY(0 == scan_funcs[scn->scn_phys.scn_func](dp, bp, &zb)); + } + return (0); + } + + static void + dsl_scan_zil(dsl_pool_t *dp, zil_header_t *zh) + { + uint64_t claim_txg = zh->zh_claim_txg; + zil_scan_arg_t zsa = { dp, zh }; + zilog_t *zilog; + + /* + * We only want to visit blocks that have been claimed but not yet + * replayed (or, in read-only mode, blocks that *would* be claimed). 
+ */ + if (claim_txg == 0 && spa_writeable(dp->dp_spa)) + return; + + zilog = zil_alloc(dp->dp_meta_objset, zh); + + (void) zil_parse(zilog, dsl_scan_zil_block, dsl_scan_zil_record, &zsa, + claim_txg); + + zil_free(zilog); + } + + /* ARGSUSED */ + static void + dsl_scan_prefetch(dsl_scan_t *scn, arc_buf_t *buf, blkptr_t *bp, + uint64_t objset, uint64_t object, uint64_t blkid) + { + zbookmark_t czb; + uint32_t flags = ARC_NOWAIT | ARC_PREFETCH; + + if (zfs_no_scrub_prefetch) + return; + + if (BP_IS_HOLE(bp) || bp->blk_birth <= scn->scn_phys.scn_min_txg || + (BP_GET_LEVEL(bp) == 0 && BP_GET_TYPE(bp) != DMU_OT_DNODE)) + return; + + SET_BOOKMARK(&czb, objset, object, BP_GET_LEVEL(bp), blkid); + + /* + * XXX need to make sure all of these arc_read() prefetches are + * done before setting xlateall (similar to dsl_read()) + */ + (void) arc_read(scn->scn_zio_root, scn->scn_dp->dp_spa, bp, + buf, NULL, NULL, ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, + &flags, &czb); + } + + static boolean_t + dsl_scan_check_resume(dsl_scan_t *scn, const dnode_phys_t *dnp, + const zbookmark_t *zb) + { + /* + * We never skip over user/group accounting objects (obj<0) + */ + if (!bookmark_is_zero(&scn->scn_phys.scn_bookmark) && + (int64_t)zb->zb_object >= 0) { + /* + * If we already visited this bp & everything below (in + * a prior txg sync), don't bother doing it again. + */ + if (bookmark_is_before(dnp, zb, &scn->scn_phys.scn_bookmark)) + return (B_TRUE); + + /* + * If we found the block we're trying to resume from, or + * we went past it to a different object, zero it out to + * indicate that it's OK to start checking for pausing + * again. 
+ */ + if (bcmp(zb, &scn->scn_phys.scn_bookmark, sizeof (*zb)) == 0 || + zb->zb_object > scn->scn_phys.scn_bookmark.zb_object) { + dprintf("resuming at %llx/%llx/%llx/%llx\n", + (longlong_t)zb->zb_objset, + (longlong_t)zb->zb_object, + (longlong_t)zb->zb_level, + (longlong_t)zb->zb_blkid); + bzero(&scn->scn_phys.scn_bookmark, sizeof (*zb)); + } + } + return (B_FALSE); + } + + /* + * Return nonzero on i/o error. + * Return new buf to write out in *bufp. + */ + static int + dsl_scan_recurse(dsl_scan_t *scn, dsl_dataset_t *ds, dmu_objset_type_t ostype, + dnode_phys_t *dnp, const blkptr_t *bp, + const zbookmark_t *zb, dmu_tx_t *tx, arc_buf_t **bufp) + { + dsl_pool_t *dp = scn->scn_dp; + int err; + + if (BP_GET_LEVEL(bp) > 0) { + uint32_t flags = ARC_WAIT; + int i; + blkptr_t *cbp; + int epb = BP_GET_LSIZE(bp) >> SPA_BLKPTRSHIFT; + + err = arc_read_nolock(NULL, dp->dp_spa, bp, + arc_getbuf_func, bufp, + ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb); + if (err) { + scn->scn_phys.scn_errors++; + return (err); + } + for (i = 0, cbp = (*bufp)->b_data; i < epb; i++, cbp++) { + dsl_scan_prefetch(scn, *bufp, cbp, zb->zb_objset, + zb->zb_object, zb->zb_blkid * epb + i); + } + for (i = 0, cbp = (*bufp)->b_data; i < epb; i++, cbp++) { + zbookmark_t czb; + + SET_BOOKMARK(&czb, zb->zb_objset, zb->zb_object, + zb->zb_level - 1, + zb->zb_blkid * epb + i); + dsl_scan_visitbp(cbp, &czb, dnp, + *bufp, ds, scn, ostype, tx); + } + } else if (BP_GET_TYPE(bp) == DMU_OT_USERGROUP_USED) { + uint32_t flags = ARC_WAIT; + + err = arc_read_nolock(NULL, dp->dp_spa, bp, + arc_getbuf_func, bufp, + ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb); + if (err) { + scn->scn_phys.scn_errors++; + return (err); + } + } else if (BP_GET_TYPE(bp) == DMU_OT_DNODE) { + uint32_t flags = ARC_WAIT; + dnode_phys_t *cdnp; + int i, j; + int epb = BP_GET_LSIZE(bp) >> DNODE_SHIFT; + + err = arc_read_nolock(NULL, dp->dp_spa, bp, + arc_getbuf_func, bufp, + ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, 
zb); + if (err) { + scn->scn_phys.scn_errors++; + return (err); + } + for (i = 0, cdnp = (*bufp)->b_data; i < epb; i++, cdnp++) { + for (j = 0; j < cdnp->dn_nblkptr; j++) { + blkptr_t *cbp = &cdnp->dn_blkptr[j]; + dsl_scan_prefetch(scn, *bufp, cbp, + zb->zb_objset, zb->zb_blkid * epb + i, j); + } + } + for (i = 0, cdnp = (*bufp)->b_data; i < epb; i++, cdnp++) { + dsl_scan_visitdnode(scn, ds, ostype, + cdnp, *bufp, zb->zb_blkid * epb + i, tx); + } + + } else if (BP_GET_TYPE(bp) == DMU_OT_OBJSET) { + uint32_t flags = ARC_WAIT; + objset_phys_t *osp; + + err = arc_read_nolock(NULL, dp->dp_spa, bp, + arc_getbuf_func, bufp, + ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb); + if (err) { + scn->scn_phys.scn_errors++; + return (err); + } + + osp = (*bufp)->b_data; + + if (DSL_SCAN_IS_SCRUB_RESILVER(scn)) + dsl_scan_zil(dp, &osp->os_zil_header); + + dsl_scan_visitdnode(scn, ds, osp->os_type, + &osp->os_meta_dnode, *bufp, DMU_META_DNODE_OBJECT, tx); + + if (OBJSET_BUF_HAS_USERUSED(*bufp)) { + /* + * We also always visit user/group accounting + * objects, and never skip them, even if we are + * pausing. This is necessary so that the space + * deltas from this txg get integrated. + */ + dsl_scan_visitdnode(scn, ds, osp->os_type, + &osp->os_groupused_dnode, *bufp, + DMU_GROUPUSED_OBJECT, tx); + dsl_scan_visitdnode(scn, ds, osp->os_type, + &osp->os_userused_dnode, *bufp, + DMU_USERUSED_OBJECT, tx); + } + } + + return (0); + } + + static void + dsl_scan_visitdnode(dsl_scan_t *scn, dsl_dataset_t *ds, + dmu_objset_type_t ostype, dnode_phys_t *dnp, arc_buf_t *buf, + uint64_t object, dmu_tx_t *tx) + { + int j; + + for (j = 0; j < dnp->dn_nblkptr; j++) { + zbookmark_t czb; + + SET_BOOKMARK(&czb, ds ? ds->ds_object : 0, object, + dnp->dn_nlevels - 1, j); + dsl_scan_visitbp(&dnp->dn_blkptr[j], + &czb, dnp, buf, ds, scn, ostype, tx); + } + + if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) { + zbookmark_t czb; + SET_BOOKMARK(&czb, ds ? 
ds->ds_object : 0, object, + 0, DMU_SPILL_BLKID); + dsl_scan_visitbp(&dnp->dn_spill, + &czb, dnp, buf, ds, scn, ostype, tx); + } + } + + /* + * The arguments are in this order because mdb can only print the + * first 5; we want them to be useful. + */ + static void + dsl_scan_visitbp(blkptr_t *bp, const zbookmark_t *zb, + dnode_phys_t *dnp, arc_buf_t *pbuf, + dsl_dataset_t *ds, dsl_scan_t *scn, dmu_objset_type_t ostype, + dmu_tx_t *tx) + { + dsl_pool_t *dp = scn->scn_dp; + arc_buf_t *buf = NULL; + blkptr_t bp_toread = *bp; + + /* ASSERT(pbuf == NULL || arc_released(pbuf)); */ + + if (dsl_scan_check_pause(scn, zb)) + return; + + if (dsl_scan_check_resume(scn, dnp, zb)) + return; + + if (bp->blk_birth == 0) + return; + + scn->scn_visited_this_txg++; + + dprintf_bp(bp, + "visiting ds=%p/%llu zb=%llx/%llx/%llx/%llx buf=%p bp=%p", + ds, ds ? ds->ds_object : 0, + zb->zb_objset, zb->zb_object, zb->zb_level, zb->zb_blkid, + pbuf, bp); + + if (bp->blk_birth <= scn->scn_phys.scn_cur_min_txg) + return; + + if (BP_GET_TYPE(bp) != DMU_OT_USERGROUP_USED) { + /* + * For non-user-accounting blocks, we need to read the + * new bp (from a deleted snapshot, found in + * check_existing_xlation). If we used the old bp, + * pointers inside this block from before we resumed + * would be untranslated. + * + * For user-accounting blocks, we need to read the old + * bp, because we will apply the entire space delta to + * it (original untranslated -> translations from + * deleted snap -> now). + */ + bp_toread = *bp; + } + + if (dsl_scan_recurse(scn, ds, ostype, dnp, &bp_toread, zb, tx, + &buf) != 0) + return; + + /* + * If dsl_scan_ddt() has already visited this block, it will have + * already done any translations or scrubbing, so don't call the + * callback again. 
+ */ + if (ddt_class_contains(dp->dp_spa, + scn->scn_phys.scn_ddt_class_max, bp)) { + ASSERT(buf == NULL); + return; + } + + /* + * If this block is from the future (after cur_max_txg), then we + * are doing this on behalf of a deleted snapshot, and we will + * revisit the future block on the next pass of this dataset. + * Don't scan it now unless we need to because something + * under it was modified. + */ + if (bp->blk_birth <= scn->scn_phys.scn_cur_max_txg) { + scan_funcs[scn->scn_phys.scn_func](dp, bp, zb); + } + if (buf) + (void) arc_buf_remove_ref(buf, &buf); + } + + static void + dsl_scan_visit_rootbp(dsl_scan_t *scn, dsl_dataset_t *ds, blkptr_t *bp, + dmu_tx_t *tx) + { + zbookmark_t zb; + + SET_BOOKMARK(&zb, ds ? ds->ds_object : DMU_META_OBJSET, + ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID); + dsl_scan_visitbp(bp, &zb, NULL, NULL, + ds, scn, DMU_OST_NONE, tx); + + dprintf_ds(ds, "finished scan%s", ""); + } + + void + dsl_scan_ds_destroyed(dsl_dataset_t *ds, dmu_tx_t *tx) + { + dsl_pool_t *dp = ds->ds_dir->dd_pool; + dsl_scan_t *scn = dp->dp_scan; + uint64_t mintxg; + + if (scn->scn_phys.scn_state != DSS_SCANNING) + return; + + if (scn->scn_phys.scn_bookmark.zb_objset == ds->ds_object) { + if (dsl_dataset_is_snapshot(ds)) { + /* Note, scn_cur_{min,max}_txg stays the same. 
*/ + scn->scn_phys.scn_bookmark.zb_objset = + ds->ds_phys->ds_next_snap_obj; + zfs_dbgmsg("destroying ds %llu; currently traversing; " + "reset zb_objset to %llu", + (u_longlong_t)ds->ds_object, + (u_longlong_t)ds->ds_phys->ds_next_snap_obj); + scn->scn_phys.scn_flags |= DSF_VISIT_DS_AGAIN; + } else { + SET_BOOKMARK(&scn->scn_phys.scn_bookmark, + ZB_DESTROYED_OBJSET, 0, 0, 0); + zfs_dbgmsg("destroying ds %llu; currently traversing; " + "reset bookmark to -1,0,0,0", + (u_longlong_t)ds->ds_object); + } + } else if (zap_lookup_int_key(dp->dp_meta_objset, + scn->scn_phys.scn_queue_obj, ds->ds_object, &mintxg) == 0) { + ASSERT3U(ds->ds_phys->ds_num_children, <=, 1); + VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset, + scn->scn_phys.scn_queue_obj, ds->ds_object, tx)); + if (dsl_dataset_is_snapshot(ds)) { + /* + * We keep the same mintxg; it could be > + * ds_creation_txg if the previous snapshot was + * deleted too. + */ + VERIFY(zap_add_int_key(dp->dp_meta_objset, + scn->scn_phys.scn_queue_obj, + ds->ds_phys->ds_next_snap_obj, mintxg, tx) == 0); + zfs_dbgmsg("destroying ds %llu; in queue; " + "replacing with %llu", + (u_longlong_t)ds->ds_object, + (u_longlong_t)ds->ds_phys->ds_next_snap_obj); + } else { + zfs_dbgmsg("destroying ds %llu; in queue; removing", + (u_longlong_t)ds->ds_object); + } + } else { + zfs_dbgmsg("destroying ds %llu; ignoring", + (u_longlong_t)ds->ds_object); + } + + /* + * dsl_scan_sync() should be called after this, and should sync + * out our changed state, but just to be safe, do it here. 
+ */ + dsl_scan_sync_state(scn, tx); + } + + void + dsl_scan_ds_snapshotted(dsl_dataset_t *ds, dmu_tx_t *tx) + { + dsl_pool_t *dp = ds->ds_dir->dd_pool; + dsl_scan_t *scn = dp->dp_scan; + uint64_t mintxg; + + if (scn->scn_phys.scn_state != DSS_SCANNING) + return; + + ASSERT(ds->ds_phys->ds_prev_snap_obj != 0); + + if (scn->scn_phys.scn_bookmark.zb_objset == ds->ds_object) { + scn->scn_phys.scn_bookmark.zb_objset = + ds->ds_phys->ds_prev_snap_obj; + zfs_dbgmsg("snapshotting ds %llu; currently traversing; " + "reset zb_objset to %llu", + (u_longlong_t)ds->ds_object, + (u_longlong_t)ds->ds_phys->ds_prev_snap_obj); + } else if (zap_lookup_int_key(dp->dp_meta_objset, + scn->scn_phys.scn_queue_obj, ds->ds_object, &mintxg) == 0) { + VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset, + scn->scn_phys.scn_queue_obj, ds->ds_object, tx)); + VERIFY(zap_add_int_key(dp->dp_meta_objset, + scn->scn_phys.scn_queue_obj, + ds->ds_phys->ds_prev_snap_obj, mintxg, tx) == 0); + zfs_dbgmsg("snapshotting ds %llu; in queue; " + "replacing with %llu", + (u_longlong_t)ds->ds_object, + (u_longlong_t)ds->ds_phys->ds_prev_snap_obj); + } + dsl_scan_sync_state(scn, tx); + } + + void + dsl_scan_ds_clone_swapped(dsl_dataset_t *ds1, dsl_dataset_t *ds2, dmu_tx_t *tx) + { + dsl_pool_t *dp = ds1->ds_dir->dd_pool; + dsl_scan_t *scn = dp->dp_scan; + uint64_t mintxg; + + if (scn->scn_phys.scn_state != DSS_SCANNING) + return; + + if (scn->scn_phys.scn_bookmark.zb_objset == ds1->ds_object) { + scn->scn_phys.scn_bookmark.zb_objset = ds2->ds_object; + zfs_dbgmsg("clone_swap ds %llu; currently traversing; " + "reset zb_objset to %llu", + (u_longlong_t)ds1->ds_object, + (u_longlong_t)ds2->ds_object); + } else if (scn->scn_phys.scn_bookmark.zb_objset == ds2->ds_object) { + scn->scn_phys.scn_bookmark.zb_objset = ds1->ds_object; + zfs_dbgmsg("clone_swap ds %llu; currently traversing; " + "reset zb_objset to %llu", + (u_longlong_t)ds2->ds_object, + (u_longlong_t)ds1->ds_object); + } + + if 
(zap_lookup_int_key(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj, + ds1->ds_object, &mintxg) == 0) { + int err; + + ASSERT3U(mintxg, ==, ds1->ds_phys->ds_prev_snap_txg); + ASSERT3U(mintxg, ==, ds2->ds_phys->ds_prev_snap_txg); + VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset, + scn->scn_phys.scn_queue_obj, ds1->ds_object, tx)); + err = zap_add_int_key(dp->dp_meta_objset, + scn->scn_phys.scn_queue_obj, ds2->ds_object, mintxg, tx); + VERIFY(err == 0 || err == EEXIST); + if (err == EEXIST) { + /* Both were there to begin with */ + VERIFY(0 == zap_add_int_key(dp->dp_meta_objset, + scn->scn_phys.scn_queue_obj, + ds1->ds_object, mintxg, tx)); + } + zfs_dbgmsg("clone_swap ds %llu; in queue; " + "replacing with %llu", + (u_longlong_t)ds1->ds_object, + (u_longlong_t)ds2->ds_object); + } else if (zap_lookup_int_key(dp->dp_meta_objset, + scn->scn_phys.scn_queue_obj, ds2->ds_object, &mintxg) == 0) { + ASSERT3U(mintxg, ==, ds1->ds_phys->ds_prev_snap_txg); + ASSERT3U(mintxg, ==, ds2->ds_phys->ds_prev_snap_txg); + VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset, + scn->scn_phys.scn_queue_obj, ds2->ds_object, tx)); + VERIFY(0 == zap_add_int_key(dp->dp_meta_objset, + scn->scn_phys.scn_queue_obj, ds1->ds_object, mintxg, tx)); + zfs_dbgmsg("clone_swap ds %llu; in queue; " + "replacing with %llu", + (u_longlong_t)ds2->ds_object, + (u_longlong_t)ds1->ds_object); + } + + dsl_scan_sync_state(scn, tx); + } + + struct enqueue_clones_arg { + dmu_tx_t *tx; + uint64_t originobj; + }; + + /* ARGSUSED */ + static int + enqueue_clones_cb(spa_t *spa, uint64_t dsobj, const char *dsname, void *arg) + { + struct enqueue_clones_arg *eca = arg; + dsl_dataset_t *ds; + int err; + dsl_pool_t *dp = spa->spa_dsl_pool; + dsl_scan_t *scn = dp->dp_scan; + + err = dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds); + if (err) + return (err); + + if (ds->ds_dir->dd_phys->dd_origin_obj == eca->originobj) { + while (ds->ds_phys->ds_prev_snap_obj != eca->originobj) { + dsl_dataset_t *prev; + err = 
dsl_dataset_hold_obj(dp, + ds->ds_phys->ds_prev_snap_obj, FTAG, &prev); + + dsl_dataset_rele(ds, FTAG); + if (err) + return (err); + ds = prev; + } + VERIFY(zap_add_int_key(dp->dp_meta_objset, + scn->scn_phys.scn_queue_obj, ds->ds_object, + ds->ds_phys->ds_prev_snap_txg, eca->tx) == 0); + } + dsl_dataset_rele(ds, FTAG); + return (0); + } + + static void + dsl_scan_visitds(dsl_scan_t *scn, uint64_t dsobj, dmu_tx_t *tx) + { + dsl_pool_t *dp = scn->scn_dp; + dsl_dataset_t *ds; + + VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds)); + + /* + * Iterate over the bps in this ds. + */ + dmu_buf_will_dirty(ds->ds_dbuf, tx); + dsl_scan_visit_rootbp(scn, ds, &ds->ds_phys->ds_bp, tx); + + char *dsname = kmem_alloc(ZFS_MAXNAMELEN, KM_SLEEP); + dsl_dataset_name(ds, dsname); + zfs_dbgmsg("scanned dataset %llu (%s) with min=%llu max=%llu; " + "pausing=%u", + (longlong_t)dsobj, dsname, + (longlong_t)scn->scn_phys.scn_cur_min_txg, + (longlong_t)scn->scn_phys.scn_cur_max_txg, + (int)scn->scn_pausing); + kmem_free(dsname, ZFS_MAXNAMELEN); + + if (scn->scn_pausing) + goto out; + + /* + * We've finished this pass over this dataset. + */ + + /* + * If we did not completely visit this dataset, do another pass. + */ + if (scn->scn_phys.scn_flags & DSF_VISIT_DS_AGAIN) { + zfs_dbgmsg("incomplete pass; visiting again"); + scn->scn_phys.scn_flags &= ~DSF_VISIT_DS_AGAIN; + VERIFY(zap_add_int_key(dp->dp_meta_objset, + scn->scn_phys.scn_queue_obj, ds->ds_object, + scn->scn_phys.scn_cur_max_txg, tx) == 0); + goto out; + } + + /* + * Add descendent datasets to work queue. 
+ */ + if (ds->ds_phys->ds_next_snap_obj != 0) { + VERIFY(zap_add_int_key(dp->dp_meta_objset, + scn->scn_phys.scn_queue_obj, ds->ds_phys->ds_next_snap_obj, + ds->ds_phys->ds_creation_txg, tx) == 0); + } + if (ds->ds_phys->ds_num_children > 1) { + boolean_t usenext = B_FALSE; + if (ds->ds_phys->ds_next_clones_obj != 0) { + uint64_t count; + /* + * A bug in a previous version of the code could + * cause upgrade_clones_cb() to not set + * ds_next_snap_obj when it should, leading to a + * missing entry. Therefore we can only use the + * next_clones_obj when its count is correct. + */ + int err = zap_count(dp->dp_meta_objset, + ds->ds_phys->ds_next_clones_obj, &count); + if (err == 0 && + count == ds->ds_phys->ds_num_children - 1) + usenext = B_TRUE; + } + + if (usenext) { + VERIFY(zap_join_key(dp->dp_meta_objset, + ds->ds_phys->ds_next_clones_obj, + scn->scn_phys.scn_queue_obj, + ds->ds_phys->ds_creation_txg, tx) == 0); + } else { + struct enqueue_clones_arg eca; + eca.tx = tx; + eca.originobj = ds->ds_object; + + (void) dmu_objset_find_spa(ds->ds_dir->dd_pool->dp_spa, + NULL, enqueue_clones_cb, &eca, DS_FIND_CHILDREN); + } + } + + out: + dsl_dataset_rele(ds, FTAG); + } + + /* ARGSUSED */ + static int + enqueue_cb(spa_t *spa, uint64_t dsobj, const char *dsname, void *arg) + { + dmu_tx_t *tx = arg; + dsl_dataset_t *ds; + int err; + dsl_pool_t *dp = spa->spa_dsl_pool; + dsl_scan_t *scn = dp->dp_scan; + + err = dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds); + if (err) + return (err); + + while (ds->ds_phys->ds_prev_snap_obj != 0) { + dsl_dataset_t *prev; + err = dsl_dataset_hold_obj(dp, ds->ds_phys->ds_prev_snap_obj, + FTAG, &prev); + if (err) { + dsl_dataset_rele(ds, FTAG); + return (err); + } + + /* + * If this is a clone, we don't need to worry about it for now. 
+ */ + if (prev->ds_phys->ds_next_snap_obj != ds->ds_object) { + dsl_dataset_rele(ds, FTAG); + dsl_dataset_rele(prev, FTAG); + return (0); + } + dsl_dataset_rele(ds, FTAG); + ds = prev; + } + + VERIFY(zap_add_int_key(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj, + ds->ds_object, ds->ds_phys->ds_prev_snap_txg, tx) == 0); + dsl_dataset_rele(ds, FTAG); + return (0); + } + + /* + * Scrub/dedup interaction. + * + * If there are N references to a deduped block, we don't want to scrub it + * N times -- ideally, we should scrub it exactly once. + * + * We leverage the fact that the dde's replication class (enum ddt_class) + * is ordered from highest replication class (DDT_CLASS_DITTO) to lowest + * (DDT_CLASS_UNIQUE) so that we may walk the DDT in that order. + * + * To prevent excess scrubbing, the scrub begins by walking the DDT + * to find all blocks with refcnt > 1, and scrubs each of these once. + * Since there are two replication classes which contain blocks with + * refcnt > 1, we scrub the highest replication class (DDT_CLASS_DITTO) first. + * Finally the top-down scrub begins, only visiting blocks with refcnt == 1. + * + * There would be nothing more to say if a block's refcnt couldn't change + * during a scrub, but of course it can so we must account for changes + * in a block's replication class. + * + * Here's an example of what can occur: + * + * If a block has refcnt > 1 during the DDT scrub phase, but has refcnt == 1 + * when visited during the top-down scrub phase, it will be scrubbed twice. + * This negates our scrub optimization, but is otherwise harmless. + * + * If a block has refcnt == 1 during the DDT scrub phase, but has refcnt > 1 + * on each visit during the top-down scrub phase, it will never be scrubbed. 
+ * To catch this, ddt_sync_entry() notifies the scrub code whenever a block's + * reference class transitions to a higher level (i.e DDT_CLASS_UNIQUE to + * DDT_CLASS_DUPLICATE); if it transitions from refcnt == 1 to refcnt > 1 + * while a scrub is in progress, it scrubs the block right then. + */ + static void + dsl_scan_ddt(dsl_scan_t *scn, dmu_tx_t *tx) + { + ddt_bookmark_t *ddb = &scn->scn_phys.scn_ddt_bookmark; + ddt_entry_t dde = { 0 }; + int error; + uint64_t n = 0; + + while ((error = ddt_walk(scn->scn_dp->dp_spa, ddb, &dde)) == 0) { + ddt_t *ddt; + + if (ddb->ddb_class > scn->scn_phys.scn_ddt_class_max) + break; + dprintf("visiting ddb=%llu/%llu/%llu/%llx\n", + (longlong_t)ddb->ddb_class, + (longlong_t)ddb->ddb_type, + (longlong_t)ddb->ddb_checksum, + (longlong_t)ddb->ddb_cursor); + + /* There should be no pending changes to the dedup table */ + ddt = scn->scn_dp->dp_spa->spa_ddt[ddb->ddb_checksum]; + ASSERT(avl_first(&ddt->ddt_tree) == NULL); + + dsl_scan_ddt_entry(scn, ddb->ddb_checksum, &dde, tx); + n++; + + if (dsl_scan_check_pause(scn, NULL)) + break; + } + + zfs_dbgmsg("scanned %llu ddt entries with class_max = %u; pausing=%u", + (longlong_t)n, (int)scn->scn_phys.scn_ddt_class_max, + (int)scn->scn_pausing); + + ASSERT(error == 0 || error == ENOENT); + ASSERT(error != ENOENT || + ddb->ddb_class > scn->scn_phys.scn_ddt_class_max); + } + + /* ARGSUSED */ + void + dsl_scan_ddt_entry(dsl_scan_t *scn, enum zio_checksum checksum, + ddt_entry_t *dde, dmu_tx_t *tx) + { + const ddt_key_t *ddk = &dde->dde_key; + ddt_phys_t *ddp = dde->dde_phys; + blkptr_t bp; + zbookmark_t zb = { 0 }; ++ int p; + + if (scn->scn_phys.scn_state != DSS_SCANNING) + return; + - for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) { ++ for (p = 0; p < DDT_PHYS_TYPES; p++, ddp++) { + if (ddp->ddp_phys_birth == 0 || + ddp->ddp_phys_birth > scn->scn_phys.scn_cur_max_txg) + continue; + ddt_bp_create(checksum, ddk, ddp, &bp); + + scn->scn_visited_this_txg++; + 
scan_funcs[scn->scn_phys.scn_func](scn->scn_dp, &bp, &zb); + } + } + + static void + dsl_scan_visit(dsl_scan_t *scn, dmu_tx_t *tx) + { + dsl_pool_t *dp = scn->scn_dp; + zap_cursor_t zc; + zap_attribute_t za; + + if (scn->scn_phys.scn_ddt_bookmark.ddb_class <= + scn->scn_phys.scn_ddt_class_max) { + scn->scn_phys.scn_cur_min_txg = scn->scn_phys.scn_min_txg; + scn->scn_phys.scn_cur_max_txg = scn->scn_phys.scn_max_txg; + dsl_scan_ddt(scn, tx); + if (scn->scn_pausing) + return; + } + + if (scn->scn_phys.scn_bookmark.zb_objset == DMU_META_OBJSET) { + /* First do the MOS & ORIGIN */ + + scn->scn_phys.scn_cur_min_txg = scn->scn_phys.scn_min_txg; + scn->scn_phys.scn_cur_max_txg = scn->scn_phys.scn_max_txg; + dsl_scan_visit_rootbp(scn, NULL, + &dp->dp_meta_rootbp, tx); + spa_set_rootblkptr(dp->dp_spa, &dp->dp_meta_rootbp); + if (scn->scn_pausing) + return; + + if (spa_version(dp->dp_spa) < SPA_VERSION_DSL_SCRUB) { + VERIFY(0 == dmu_objset_find_spa(dp->dp_spa, + NULL, enqueue_cb, tx, DS_FIND_CHILDREN)); + } else { + dsl_scan_visitds(scn, + dp->dp_origin_snap->ds_object, tx); + } + ASSERT(!scn->scn_pausing); + } else if (scn->scn_phys.scn_bookmark.zb_objset != + ZB_DESTROYED_OBJSET) { + /* + * If we were paused, continue from here. Note if the + * ds we were paused on was deleted, the zb_objset may + * be -1, so we will skip this and find a new objset + * below. + */ + dsl_scan_visitds(scn, scn->scn_phys.scn_bookmark.zb_objset, tx); + if (scn->scn_pausing) + return; + } + + /* + * In case we were paused right at the end of the ds, zero the + * bookmark so we don't think that we're still trying to resume. 
+ */ + bzero(&scn->scn_phys.scn_bookmark, sizeof (zbookmark_t)); + + /* keep pulling things out of the zap-object-as-queue */ + while (zap_cursor_init(&zc, dp->dp_meta_objset, + scn->scn_phys.scn_queue_obj), + zap_cursor_retrieve(&zc, &za) == 0) { + dsl_dataset_t *ds; + uint64_t dsobj; + + dsobj = strtonum(za.za_name, NULL); + VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset, + scn->scn_phys.scn_queue_obj, dsobj, tx)); + + /* Set up min/max txg */ + VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds)); + if (za.za_first_integer != 0) { + scn->scn_phys.scn_cur_min_txg = + MAX(scn->scn_phys.scn_min_txg, + za.za_first_integer); + } else { + scn->scn_phys.scn_cur_min_txg = + MAX(scn->scn_phys.scn_min_txg, + ds->ds_phys->ds_prev_snap_txg); + } + scn->scn_phys.scn_cur_max_txg = dsl_scan_ds_maxtxg(ds); + dsl_dataset_rele(ds, FTAG); + + dsl_scan_visitds(scn, dsobj, tx); + zap_cursor_fini(&zc); + if (scn->scn_pausing) + return; + } + zap_cursor_fini(&zc); + } + + static int + dsl_scan_free_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) + { + dsl_scan_t *scn = arg; + uint64_t elapsed_nanosecs; + + elapsed_nanosecs = gethrtime() - scn->scn_sync_start_time; + + if (elapsed_nanosecs / NANOSEC > zfs_txg_timeout || + (elapsed_nanosecs / MICROSEC > zfs_free_min_time_ms && + txg_sync_waiting(scn->scn_dp)) || + spa_shutting_down(scn->scn_dp->dp_spa)) + return (ERESTART); + + zio_nowait(zio_free_sync(scn->scn_zio_root, scn->scn_dp->dp_spa, + dmu_tx_get_txg(tx), bp, 0)); + dsl_dir_diduse_space(tx->tx_pool->dp_free_dir, DD_USED_HEAD, + -bp_get_dsize_sync(scn->scn_dp->dp_spa, bp), + -BP_GET_PSIZE(bp), -BP_GET_UCSIZE(bp), tx); + scn->scn_visited_this_txg++; + return (0); + } + + boolean_t + dsl_scan_active(dsl_scan_t *scn) + { + spa_t *spa = scn->scn_dp->dp_spa; + uint64_t used = 0, comp, uncomp; + + if (spa->spa_load_state != SPA_LOAD_NONE) + return (B_FALSE); + if (spa_shutting_down(spa)) + return (B_FALSE); + + if (scn->scn_phys.scn_state == DSS_SCANNING) + return (B_TRUE); + 
+ if (spa_version(scn->scn_dp->dp_spa) >= SPA_VERSION_DEADLISTS) { + (void) bpobj_space(&scn->scn_dp->dp_free_bpobj, + &used, &comp, &uncomp); + } + return (used != 0); + } + + void + dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx) + { + dsl_scan_t *scn = dp->dp_scan; + spa_t *spa = dp->dp_spa; + int err; + + /* + * Check for scn_restart_txg before checking spa_load_state, so + * that we can restart an old-style scan while the pool is being + * imported (see dsl_scan_init). + */ + if (scn->scn_restart_txg != 0 && + scn->scn_restart_txg <= tx->tx_txg) { + pool_scan_func_t func = POOL_SCAN_SCRUB; + dsl_scan_done(scn, B_FALSE, tx); + if (vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL)) + func = POOL_SCAN_RESILVER; + zfs_dbgmsg("restarting scan func=%u txg=%llu", + func, tx->tx_txg); + dsl_scan_setup_sync(scn, &func, tx); + } + + + if (!dsl_scan_active(scn) || + spa_sync_pass(dp->dp_spa) > 1) + return; + + scn->scn_visited_this_txg = 0; + scn->scn_pausing = B_FALSE; + scn->scn_sync_start_time = gethrtime(); + spa->spa_scrub_active = B_TRUE; + + /* + * First process the free list. If we pause the free, don't do + * any scanning. This ensures that there is no free list when + * we are scanning, so the scan code doesn't have to worry about + * traversing it. + */ + if (spa_version(dp->dp_spa) >= SPA_VERSION_DEADLISTS) { + scn->scn_zio_root = zio_root(dp->dp_spa, NULL, + NULL, ZIO_FLAG_MUSTSUCCEED); + err = bpobj_iterate(&dp->dp_free_bpobj, + dsl_scan_free_cb, scn, tx); + VERIFY3U(0, ==, zio_wait(scn->scn_zio_root)); + if (scn->scn_visited_this_txg) { + zfs_dbgmsg("freed %llu blocks in %llums from " + "free_bpobj txg %llu", + (longlong_t)scn->scn_visited_this_txg, + (longlong_t) + (gethrtime() - scn->scn_sync_start_time) / MICROSEC, + (longlong_t)tx->tx_txg); + scn->scn_visited_this_txg = 0; + /* + * Re-sync the ddt so that we can further modify + * it when doing bprewrite. 
+ */ + ddt_sync(spa, tx->tx_txg); + } + if (err == ERESTART) + return; + } + + if (scn->scn_phys.scn_state != DSS_SCANNING) + return; + + + if (scn->scn_phys.scn_ddt_bookmark.ddb_class <= + scn->scn_phys.scn_ddt_class_max) { + zfs_dbgmsg("doing scan sync txg %llu; " + "ddt bm=%llu/%llu/%llu/%llx", + (longlong_t)tx->tx_txg, + (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_class, + (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_type, + (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_checksum, + (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_cursor); + ASSERT(scn->scn_phys.scn_bookmark.zb_objset == 0); + ASSERT(scn->scn_phys.scn_bookmark.zb_object == 0); + ASSERT(scn->scn_phys.scn_bookmark.zb_level == 0); + ASSERT(scn->scn_phys.scn_bookmark.zb_blkid == 0); + } else { + zfs_dbgmsg("doing scan sync txg %llu; bm=%llu/%llu/%llu/%llu", + (longlong_t)tx->tx_txg, + (longlong_t)scn->scn_phys.scn_bookmark.zb_objset, + (longlong_t)scn->scn_phys.scn_bookmark.zb_object, + (longlong_t)scn->scn_phys.scn_bookmark.zb_level, + (longlong_t)scn->scn_phys.scn_bookmark.zb_blkid); + } + + scn->scn_zio_root = zio_root(dp->dp_spa, NULL, + NULL, ZIO_FLAG_CANFAIL); + dsl_scan_visit(scn, tx); + (void) zio_wait(scn->scn_zio_root); + scn->scn_zio_root = NULL; + + zfs_dbgmsg("visited %llu blocks in %llums", + (longlong_t)scn->scn_visited_this_txg, + (longlong_t)(gethrtime() - scn->scn_sync_start_time) / MICROSEC); + + if (!scn->scn_pausing) { + /* finished with scan. */ + zfs_dbgmsg("finished scan txg %llu", (longlong_t)tx->tx_txg); + dsl_scan_done(scn, B_TRUE, tx); + } + + if (DSL_SCAN_IS_SCRUB_RESILVER(scn)) { + mutex_enter(&spa->spa_scrub_lock); + while (spa->spa_scrub_inflight > 0) { + cv_wait(&spa->spa_scrub_io_cv, + &spa->spa_scrub_lock); + } + mutex_exit(&spa->spa_scrub_lock); + } + + dsl_scan_sync_state(scn, tx); + } + + /* + * This will start a new scan, or restart an existing one. 
+ */ + void + dsl_resilver_restart(dsl_pool_t *dp, uint64_t txg) + { + if (txg == 0) { + dmu_tx_t *tx; + tx = dmu_tx_create_dd(dp->dp_mos_dir); + VERIFY(0 == dmu_tx_assign(tx, TXG_WAIT)); + + txg = dmu_tx_get_txg(tx); + dp->dp_scan->scn_restart_txg = txg; + dmu_tx_commit(tx); + } else { + dp->dp_scan->scn_restart_txg = txg; + } + zfs_dbgmsg("restarting resilver txg=%llu", txg); + } + + boolean_t + dsl_scan_resilvering(dsl_pool_t *dp) + { + return (dp->dp_scan->scn_phys.scn_state == DSS_SCANNING && + dp->dp_scan->scn_phys.scn_func == POOL_SCAN_RESILVER); + } + + /* + * scrub consumers + */ + + static void + count_block(zfs_all_blkstats_t *zab, const blkptr_t *bp) + { + int i; + + /* + * If we resume after a reboot, zab will be NULL; don't record + * incomplete stats in that case. + */ + if (zab == NULL) + return; + + for (i = 0; i < 4; i++) { + int l = (i < 2) ? BP_GET_LEVEL(bp) : DN_MAX_LEVELS; + int t = (i & 1) ? BP_GET_TYPE(bp) : DMU_OT_TOTAL; + zfs_blkstat_t *zb = &zab->zab_type[l][t]; + int equal; + + zb->zb_count++; + zb->zb_asize += BP_GET_ASIZE(bp); + zb->zb_lsize += BP_GET_LSIZE(bp); + zb->zb_psize += BP_GET_PSIZE(bp); + zb->zb_gangs += BP_COUNT_GANG(bp); + + switch (BP_GET_NDVAS(bp)) { + case 2: + if (DVA_GET_VDEV(&bp->blk_dva[0]) == + DVA_GET_VDEV(&bp->blk_dva[1])) + zb->zb_ditto_2_of_2_samevdev++; + break; + case 3: + equal = (DVA_GET_VDEV(&bp->blk_dva[0]) == + DVA_GET_VDEV(&bp->blk_dva[1])) + + (DVA_GET_VDEV(&bp->blk_dva[0]) == + DVA_GET_VDEV(&bp->blk_dva[2])) + + (DVA_GET_VDEV(&bp->blk_dva[1]) == + DVA_GET_VDEV(&bp->blk_dva[2])); + if (equal == 1) + zb->zb_ditto_2_of_3_samevdev++; + else if (equal == 3) + zb->zb_ditto_3_of_3_samevdev++; + break; + } + } + } + + static void + dsl_scan_scrub_done(zio_t *zio) + { + spa_t *spa = zio->io_spa; + + zio_data_buf_free(zio->io_data, zio->io_size); + + mutex_enter(&spa->spa_scrub_lock); + spa->spa_scrub_inflight--; + cv_broadcast(&spa->spa_scrub_io_cv); + + if (zio->io_error && (zio->io_error != ECKSUM || + 
!(zio->io_flags & ZIO_FLAG_SPECULATIVE))) { + spa->spa_dsl_pool->dp_scan->scn_phys.scn_errors++; + } + mutex_exit(&spa->spa_scrub_lock); + } + + static int + dsl_scan_scrub_cb(dsl_pool_t *dp, + const blkptr_t *bp, const zbookmark_t *zb) + { + dsl_scan_t *scn = dp->dp_scan; + size_t size = BP_GET_PSIZE(bp); + spa_t *spa = dp->dp_spa; + uint64_t phys_birth = BP_PHYSICAL_BIRTH(bp); + boolean_t needs_io; + int zio_flags = ZIO_FLAG_SCRUB_THREAD | ZIO_FLAG_RAW | ZIO_FLAG_CANFAIL; + int zio_priority; ++ int d; + + if (phys_birth <= scn->scn_phys.scn_min_txg || + phys_birth >= scn->scn_phys.scn_max_txg) + return (0); + + count_block(dp->dp_blkstats, bp); + + ASSERT(DSL_SCAN_IS_SCRUB_RESILVER(scn)); + if (scn->scn_phys.scn_func == POOL_SCAN_SCRUB) { + zio_flags |= ZIO_FLAG_SCRUB; + zio_priority = ZIO_PRIORITY_SCRUB; + needs_io = B_TRUE; + } else if (scn->scn_phys.scn_func == POOL_SCAN_RESILVER) { + zio_flags |= ZIO_FLAG_RESILVER; + zio_priority = ZIO_PRIORITY_RESILVER; + needs_io = B_FALSE; + } + + /* If it's an intent log block, failure is expected. */ + if (zb->zb_level == ZB_ZIL_LEVEL) + zio_flags |= ZIO_FLAG_SPECULATIVE; + - for (int d = 0; d < BP_GET_NDVAS(bp); d++) { ++ for (d = 0; d < BP_GET_NDVAS(bp); d++) { + vdev_t *vd = vdev_lookup_top(spa, + DVA_GET_VDEV(&bp->blk_dva[d])); + + /* + * Keep track of how much data we've examined so that + * zpool(1M) status can make useful progress reports. + */ + scn->scn_phys.scn_examined += DVA_GET_ASIZE(&bp->blk_dva[d]); + spa->spa_scan_pass_exam += DVA_GET_ASIZE(&bp->blk_dva[d]); + + /* if it's a resilver, this may not be in the target range */ + if (!needs_io) { + if (DVA_GET_GANG(&bp->blk_dva[d])) { + /* + * Gang members may be spread across multiple + * vdevs, so the best estimate we have is the + * scrub range, which has already been checked. + * XXX -- it would be better to change our + * allocation policy to ensure that all + * gang members reside on the same vdev. 
+ */ + needs_io = B_TRUE; + } else { + needs_io = vdev_dtl_contains(vd, DTL_PARTIAL, + phys_birth, 1); + } + } + } + + if (needs_io && !zfs_no_scrub_io) { + void *data = zio_data_buf_alloc(size); + + mutex_enter(&spa->spa_scrub_lock); + while (spa->spa_scrub_inflight >= spa->spa_scrub_maxinflight) + cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock); + spa->spa_scrub_inflight++; + mutex_exit(&spa->spa_scrub_lock); + + zio_nowait(zio_read(NULL, spa, bp, data, size, + dsl_scan_scrub_done, NULL, zio_priority, + zio_flags, zb)); + } + + /* do not relocate this block */ + return (0); + } + + int + dsl_scan(dsl_pool_t *dp, pool_scan_func_t func) + { + spa_t *spa = dp->dp_spa; + + /* + * Purge all vdev caches and probe all devices. We do this here + * rather than in sync context because this requires a writer lock + * on the spa_config lock, which we can't do from sync context. The + * spa_scrub_reopen flag indicates that vdev_open() should not + * attempt to start another scrub. + */ + spa_vdev_state_enter(spa, SCL_NONE); + spa->spa_scrub_reopen = B_TRUE; + vdev_reopen(spa->spa_root_vdev); + spa->spa_scrub_reopen = B_FALSE; + (void) spa_vdev_state_exit(spa, NULL, 0); + + return (dsl_sync_task_do(dp, dsl_scan_setup_check, + dsl_scan_setup_sync, dp->dp_scan, &func, 0)); + } diff --cc module/zfs/include/sys/spa.h index 0a4d55097,41a40300e..86fe01553 --- a/module/zfs/include/sys/spa.h +++ b/module/zfs/include/sys/spa.h @@@ -309,6 -338,66 +338,67 @@@ typedef struct blkptr #define BP_SPRINTF_LEN 320 + /* + * This macro allows code sharing between zfs, libzpool, and mdb. + * 'func' is either snprintf() or mdb_snprintf(). + * 'ws' (whitespace) can be ' ' for single-line format, '\n' for multi-line. 
+ */ + #define SPRINTF_BLKPTR(func, ws, buf, bp, type, checksum, compress) \ + { \ + static const char *copyname[] = \ + { "zero", "single", "double", "triple" }; \ + int size = BP_SPRINTF_LEN; \ + int len = 0; \ + int copies = 0; \ ++ int d; \ + \ + if (bp == NULL) { \ + len = func(buf + len, size - len, ""); \ + } else if (BP_IS_HOLE(bp)) { \ + len = func(buf + len, size - len, ""); \ + } else { \ - for (int d = 0; d < BP_GET_NDVAS(bp); d++) { \ ++ for (d = 0; d < BP_GET_NDVAS(bp); d++) { \ + const dva_t *dva = &bp->blk_dva[d]; \ + if (DVA_IS_VALID(dva)) \ + copies++; \ + len += func(buf + len, size - len, \ + "DVA[%d]=<%llu:%llx:%llx>%c", d, \ + (u_longlong_t)DVA_GET_VDEV(dva), \ + (u_longlong_t)DVA_GET_OFFSET(dva), \ + (u_longlong_t)DVA_GET_ASIZE(dva), \ + ws); \ + } \ + if (BP_IS_GANG(bp) && \ + DVA_GET_ASIZE(&bp->blk_dva[2]) <= \ + DVA_GET_ASIZE(&bp->blk_dva[1]) / 2) \ + copies--; \ + len += func(buf + len, size - len, \ + "[L%llu %s] %s %s %s %s %s %s%c" \ + "size=%llxL/%llxP birth=%lluL/%lluP fill=%llu%c" \ + "cksum=%llx:%llx:%llx:%llx", \ + (u_longlong_t)BP_GET_LEVEL(bp), \ + type, \ + checksum, \ + compress, \ + BP_GET_BYTEORDER(bp) == 0 ? "BE" : "LE", \ + BP_IS_GANG(bp) ? "gang" : "contiguous", \ + BP_GET_DEDUP(bp) ? 
"dedup" : "unique", \ + copyname[copies], \ + ws, \ + (u_longlong_t)BP_GET_LSIZE(bp), \ + (u_longlong_t)BP_GET_PSIZE(bp), \ + (u_longlong_t)bp->blk_birth, \ + (u_longlong_t)BP_PHYSICAL_BIRTH(bp), \ + (u_longlong_t)bp->blk_fill, \ + ws, \ + (u_longlong_t)bp->blk_cksum.zc_word[0], \ + (u_longlong_t)bp->blk_cksum.zc_word[1], \ + (u_longlong_t)bp->blk_cksum.zc_word[2], \ + (u_longlong_t)bp->blk_cksum.zc_word[3]); \ + } \ + ASSERT(len < size); \ + } + #include #define BP_GET_BUFC_TYPE(bp) \ diff --cc module/zfs/metaslab.c index 987617ffe,17b4b12c4..1722a53fc --- a/module/zfs/metaslab.c +++ b/module/zfs/metaslab.c @@@ -495,10 -730,9 +730,10 @@@ voi metaslab_fini(metaslab_t *msp) { metaslab_group_t *mg = msp->ms_group; + int t; - vdev_space_update(mg->mg_vd, -msp->ms_map.sm_size, - -msp->ms_smo.smo_alloc, B_TRUE); + vdev_space_update(mg->mg_vd, + -msp->ms_smo.smo_alloc, 0, -msp->ms_map.sm_size); metaslab_group_remove(mg, msp); @@@ -512,6 -746,11 +747,11 @@@ space_map_destroy(&msp->ms_freemap[t]); } - for (int t = 0; t < TXG_DEFER_SIZE; t++) ++ for (t = 0; t < TXG_DEFER_SIZE; t++) + space_map_destroy(&msp->ms_defermap[t]); + + ASSERT3S(msp->ms_deferspace, ==, 0); + mutex_exit(&msp->ms_lock); mutex_destroy(&msp->ms_lock); @@@ -574,17 -846,35 +847,36 @@@ metaslab_prefetch(metaslab_group_t *mg static int metaslab_activate(metaslab_t *msp, uint64_t activation_weight, uint64_t size) { + metaslab_group_t *mg = msp->ms_group; space_map_t *sm = &msp->ms_map; space_map_ops_t *sm_ops = msp->ms_group->mg_class->mc_ops; ++ int t; ASSERT(MUTEX_HELD(&msp->ms_lock)); if ((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0) { - int error = space_map_load(sm, sm_ops, SM_FREE, &msp->ms_smo, - msp->ms_group->mg_vd->vdev_spa->spa_meta_objset); - if (error) { - metaslab_group_sort(msp->ms_group, msp, 0); - return (error); + space_map_load_wait(sm); + if (!sm->sm_loaded) { + int error = space_map_load(sm, sm_ops, SM_FREE, + &msp->ms_smo, + spa_meta_objset(msp->ms_group->mg_vd->vdev_spa)); + if (error) 
{ + metaslab_group_sort(msp->ms_group, msp, 0); + return (error); + } - for (int t = 0; t < TXG_DEFER_SIZE; t++) ++ for (t = 0; t < TXG_DEFER_SIZE; t++) + space_map_walk(&msp->ms_defermap[t], + space_map_claim, sm); + + } + + /* + * Track the bonus area as we activate new metaslabs. + */ + if (sm->sm_start > mg->mg_bonus_area) { + mutex_enter(&mg->mg_lock); + mg->mg_bonus_area = sm->sm_start; + mutex_exit(&mg->mg_lock); } /* @@@ -632,9 -922,11 +924,12 @@@ metaslab_sync(metaslab_t *msp, uint64_ space_map_obj_t *smo = &msp->ms_smo_syncing; dmu_buf_t *db; dmu_tx_t *tx; + int t; - tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg); + ASSERT(!vd->vdev_ishole); + + if (allocmap->sm_space == 0 && freemap->sm_space == 0) + return; /* * The only state that can actually be changing concurrently with @@@ -683,7 -977,11 +980,11 @@@ space_map_walk(sm, space_map_remove, allocmap); space_map_walk(freed_map, space_map_remove, allocmap); - for (int t = 0; t < TXG_DEFER_SIZE; t++) ++ for (t = 0; t < TXG_DEFER_SIZE; t++) + space_map_walk(&msp->ms_defermap[t], + space_map_remove, allocmap); + - for (int t = 1; t < TXG_CONCURRENT_STATES; t++) + for (t = 1; t < TXG_CONCURRENT_STATES; t++) space_map_walk(&msp->ms_allocmap[(txg + t) & TXG_MASK], space_map_remove, allocmap); @@@ -717,10 -1015,13 +1018,14 @@@ metaslab_sync_done(metaslab_t *msp, uin space_map_obj_t *smosync = &msp->ms_smo_syncing; space_map_t *sm = &msp->ms_map; space_map_t *freed_map = &msp->ms_freemap[TXG_CLEAN(txg) & TXG_MASK]; + space_map_t *defer_map = &msp->ms_defermap[txg % TXG_DEFER_SIZE]; metaslab_group_t *mg = msp->ms_group; vdev_t *vd = mg->mg_vd; + int64_t alloc_delta, defer_delta; + int t; + ASSERT(!vd->vdev_ishole); + mutex_enter(&msp->ms_lock); /* @@@ -734,10 -1035,18 +1039,18 @@@ space_map_create(&msp->ms_freemap[t], sm->sm_start, sm->sm_size, sm->sm_shift, sm->sm_lock); } - vdev_space_update(vd, sm->sm_size, 0, B_TRUE); + - for (int t = 0; t < TXG_DEFER_SIZE; t++) ++ for (t = 0; t < TXG_DEFER_SIZE; t++) + 
space_map_create(&msp->ms_defermap[t], sm->sm_start, + sm->sm_size, sm->sm_shift, sm->sm_lock); + + vdev_space_update(vd, 0, 0, sm->sm_size); } - vdev_space_update(vd, 0, smosync->smo_alloc - smo->smo_alloc, B_TRUE); + alloc_delta = smosync->smo_alloc - smo->smo_alloc; + defer_delta = freed_map->sm_space - defer_map->sm_space; + + vdev_space_update(vd, alloc_delta + defer_delta, defer_delta, 0); ASSERT(msp->ms_allocmap[txg & TXG_MASK].sm_space == 0); ASSERT(msp->ms_freemap[txg & TXG_MASK].sm_space == 0); @@@ -773,6 -1095,32 +1099,33 @@@ mutex_exit(&msp->ms_lock); } + void + metaslab_sync_reassess(metaslab_group_t *mg) + { + vdev_t *vd = mg->mg_vd; ++ int m; + + /* + * Re-evaluate all metaslabs which have lower offsets than the + * bonus area. + */ - for (int m = 0; m < vd->vdev_ms_count; m++) { ++ for (m = 0; m < vd->vdev_ms_count; m++) { + metaslab_t *msp = vd->vdev_ms[m]; + + if (msp->ms_map.sm_start > mg->mg_bonus_area) + break; + + mutex_enter(&msp->ms_lock); + metaslab_group_sort(mg, msp, metaslab_weight(msp)); + mutex_exit(&msp->ms_lock); + } + + /* + * Prefetch the next potential metaslabs + */ + metaslab_prefetch(mg); + } + static uint64_t metaslab_distance(metaslab_t *msp, dva_t *dva) { @@@ -1154,9 -1517,10 +1522,10 @@@ metaslab_alloc(spa_t *spa, metaslab_cla { dva_t *dva = bp->blk_dva; dva_t *hintdva = hintbp->blk_dva; - int error = 0; + int d, error = 0; ASSERT(bp->blk_birth == 0); + ASSERT(BP_PHYSICAL_BIRTH(bp) == 0); spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER); @@@ -1195,10 -1559,10 +1564,10 @@@ voi metaslab_free(spa_t *spa, const blkptr_t *bp, uint64_t txg, boolean_t now) { const dva_t *dva = bp->blk_dva; - int ndvas = BP_GET_NDVAS(bp); + int d, ndvas = BP_GET_NDVAS(bp); ASSERT(!BP_IS_HOLE(bp)); - ASSERT(!now || bp->blk_birth >= spa->spa_syncing_txg); + ASSERT(!now || bp->blk_birth >= spa_syncing_txg(spa)); spa_config_enter(spa, SCL_FREE, FTAG, RW_READER); diff --cc module/zfs/spa.c index 705dda4df,d7c5de0d3..b0236e49f --- a/module/zfs/spa.c +++ 
b/module/zfs/spa.c @@@ -565,27 -601,58 +601,60 @@@ spa_get_errlists(spa_t *spa, avl_tree_ offsetof(spa_error_entry_t, se_avl)); } - /* - * Activate an uninitialized pool. - */ - static void - spa_activate(spa_t *spa, int mode) + static taskq_t * + spa_taskq_create(spa_t *spa, const char *name, enum zti_modes mode, + uint_t value) { - int t, q; + uint_t flags = TASKQ_PREPOPULATE; + boolean_t batch = B_FALSE; - ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED); + switch (mode) { + case zti_mode_null: + return (NULL); /* no taskq needed */ - spa->spa_state = POOL_STATE_ACTIVE; - spa->spa_mode = mode; + case zti_mode_fixed: + ASSERT3U(value, >=, 1); + value = MAX(value, 1); + break; + + case zti_mode_batch: + batch = B_TRUE; + flags |= TASKQ_THREADS_CPU_PCT; + value = zio_taskq_batch_pct; + break; + + case zti_mode_online_percent: + flags |= TASKQ_THREADS_CPU_PCT; + break; + + default: + panic("unrecognized mode for %s taskq (%u:%u) in " + "spa_activate()", + name, mode, value); + break; + } - spa->spa_normal_class = metaslab_class_create(zfs_metaslab_ops); - spa->spa_log_class = metaslab_class_create(zfs_metaslab_ops); + if (zio_taskq_sysdc && spa->spa_proc != &p0) { + if (batch) + flags |= TASKQ_DC_BATCH; + + return (taskq_create_sysdc(name, value, 50, INT_MAX, + spa->spa_proc, zio_taskq_basedc, flags)); + } + return (taskq_create_proc(name, value, maxclsyspri, 50, INT_MAX, + spa->spa_proc, flags)); + } + + static void + spa_create_zio_taskqs(spa_t *spa) + { - for (int t = 0; t < ZIO_TYPES; t++) { - for (int q = 0; q < ZIO_TASKQ_TYPES; q++) { ++ int t, q; + + for (t = 0; t < ZIO_TYPES; t++) { - const zio_taskq_info_t *ztip = &zio_taskqs[t]; + for (q = 0; q < ZIO_TASKQ_TYPES; q++) { - enum zti_modes mode = ztip->zti_nthreads[q].zti_mode; - uint_t value = ztip->zti_nthreads[q].zti_value; + const zio_taskq_info_t *ztip = &zio_taskqs[t][q]; + enum zti_modes mode = ztip->zti_mode; + uint_t value = ztip->zti_value; char name[32]; (void) snprintf(name, sizeof (name), @@@ 
-660,9 -814,10 +818,10 @@@ spa_deactivate(spa_t *spa list_destroy(&spa->spa_config_dirty_list); list_destroy(&spa->spa_state_dirty_list); - for (int t = 0; t < ZIO_TYPES; t++) { - for (int q = 0; q < ZIO_TASKQ_TYPES; q++) { + for (t = 0; t < ZIO_TYPES; t++) { + for (q = 0; q < ZIO_TASKQ_TYPES; q++) { - taskq_destroy(spa->spa_zio_taskq[t][q]); + if (spa->spa_zio_taskq[t][q] != NULL) + taskq_destroy(spa->spa_zio_taskq[t][q]); spa->spa_zio_taskq[t][q] = NULL; } } @@@ -1106,27 -1288,23 +1295,24 @@@ spa_check_removed(vdev_t *vd * that the label does not contain the most up-to-date information. */ void - spa_load_log_state(spa_t *spa) + spa_load_log_state(spa_t *spa, nvlist_t *nv) { - nvlist_t *nv, *nvroot, **child; - uint64_t is_log; - uint_t children; - vdev_t *rvd = spa->spa_root_vdev; + vdev_t *ovd, *rvd = spa->spa_root_vdev; + int c; - VERIFY(load_nvlist(spa, spa->spa_config_object, &nv) == 0); - VERIFY(nvlist_lookup_nvlist(nv, ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); - VERIFY(nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, - &child, &children) == 0); - - for (c = 0; c < children; c++) { - vdev_t *tvd = rvd->vdev_child[c]; + /* + * Load the original root vdev tree from the passed config. + */ + spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); + VERIFY(spa_config_parse(spa, &ovd, nv, NULL, 0, VDEV_ALLOC_LOAD) == 0); - if (nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_LOG, - &is_log) == 0 && is_log) - vdev_load_log_state(tvd, child[c]); - for (int c = 0; c < rvd->vdev_children; c++) { ++ for (c = 0; c < rvd->vdev_children; c++) { + vdev_t *cvd = rvd->vdev_child[c]; + if (cvd->vdev_islog) + vdev_load_log_state(cvd, ovd->vdev_child[c]); } - nvlist_free(nv); + vdev_free(ovd); + spa_config_exit(spa, SCL_ALL, FTAG); } /* @@@ -1149,181 -1327,481 +1335,486 @@@ spa_check_logs(spa_t *spa return (0); } - /* - * Load an existing storage pool, using the pool's builtin spa_config as a - * source of configuration information. 
- */ - static int - spa_load(spa_t *spa, nvlist_t *config, spa_load_state_t state, int mosconfig) + static boolean_t + spa_passivate_log(spa_t *spa) { - int error = 0; - nvlist_t *nvroot = NULL; - vdev_t *rvd; - uberblock_t *ub = &spa->spa_uberblock; - uint64_t config_cache_txg = spa->spa_config_txg; - uint64_t pool_guid; - uint64_t version; - uint64_t autoreplace = 0; - int orig_mode = spa->spa_mode; - char *ereport = FM_EREPORT_ZFS_POOL; + vdev_t *rvd = spa->spa_root_vdev; + boolean_t slog_found = B_FALSE; ++ int c; - /* - * If this is an untrusted config, access the pool in read-only mode. - * This prevents things like resilvering recently removed devices. - */ - if (!mosconfig) - spa->spa_mode = FREAD; + ASSERT(spa_config_held(spa, SCL_ALLOC, RW_WRITER)); - ASSERT(MUTEX_HELD(&spa_namespace_lock)); + if (!spa_has_slogs(spa)) + return (B_FALSE); - spa->spa_load_state = state; - for (int c = 0; c < rvd->vdev_children; c++) { ++ for (c = 0; c < rvd->vdev_children; c++) { + vdev_t *tvd = rvd->vdev_child[c]; + metaslab_group_t *mg = tvd->vdev_mg; - if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot) || - nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid)) { - error = EINVAL; - goto out; + if (tvd->vdev_islog) { + metaslab_group_passivate(mg); + slog_found = B_TRUE; + } } - /* - * Versioning wasn't explicitly added to the label until later, so if - * it's not present treat it as the initial version. 
- */ - if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, &version) != 0) - version = SPA_VERSION_INITIAL; + return (slog_found); + } - (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, - &spa->spa_config_txg); + static void + spa_activate_log(spa_t *spa) + { + vdev_t *rvd = spa->spa_root_vdev; ++ int c; - if ((state == SPA_LOAD_IMPORT || state == SPA_LOAD_TRYIMPORT) && - spa_guid_exists(pool_guid, 0)) { - error = EEXIST; - goto out; - } + ASSERT(spa_config_held(spa, SCL_ALLOC, RW_WRITER)); - spa->spa_load_guid = pool_guid; - for (int c = 0; c < rvd->vdev_children; c++) { ++ for (c = 0; c < rvd->vdev_children; c++) { + vdev_t *tvd = rvd->vdev_child[c]; + metaslab_group_t *mg = tvd->vdev_mg; - /* - * Create "The Godfather" zio to hold all async IOs - */ - spa->spa_async_zio_root = zio_root(spa, NULL, NULL, - ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_GODFATHER); + if (tvd->vdev_islog) + metaslab_group_activate(mg); + } + } - /* - * Parse the configuration into a vdev tree. We explicitly set the - * value that will be returned by spa_version() since parsing the - * configuration requires knowing the version number. - */ - spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); - spa->spa_ubsync.ub_version = version; - error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, VDEV_ALLOC_LOAD); - spa_config_exit(spa, SCL_ALL, FTAG); + int + spa_offline_log(spa_t *spa) + { + int error = 0; - if (error != 0) - goto out; + if ((error = dmu_objset_find(spa_name(spa), zil_vdev_offline, + NULL, DS_FIND_CHILDREN)) == 0) { - ASSERT(spa->spa_root_vdev == rvd); - ASSERT(spa_guid(spa) == pool_guid); + /* + * We successfully offlined the log device, sync out the + * current txg so that the "stubby" block can be removed + * by zil_sync(). + */ + txg_wait_synced(spa->spa_dsl_pool, 0); + } + return (error); + } - /* - * Try to open all vdevs, loading each label in the process. 
- */ - spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); - error = vdev_open(rvd); - spa_config_exit(spa, SCL_ALL, FTAG); - if (error != 0) - goto out; + static void + spa_aux_check_removed(spa_aux_vdev_t *sav) + { - for (int i = 0; i < sav->sav_count; i++) ++ int i; + - /* - * We need to validate the vdev labels against the configuration that - * we have in hand, which is dependent on the setting of mosconfig. If - * mosconfig is true then we're validating the vdev labels based on - * that config. Otherwise, we're validating against the cached config - * (zpool.cache) that was read when we loaded the zfs module, and then - * later we will recursively call spa_load() and validate against - * the vdev config. - */ - spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); - error = vdev_validate(rvd); - spa_config_exit(spa, SCL_ALL, FTAG); - if (error != 0) - goto out; ++ for (i = 0; i < sav->sav_count; i++) + spa_check_removed(sav->sav_vdevs[i]); + } - if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) { - error = ENXIO; - goto out; - } + void + spa_claim_notify(zio_t *zio) + { + spa_t *spa = zio->io_spa; - /* - * Find the best uberblock. - */ - vdev_uberblock_load(NULL, rvd, ub); + if (zio->io_error) + return; - /* - * If we weren't able to find a single valid uberblock, return failure. - */ - if (ub->ub_txg == 0) { - vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, - VDEV_AUX_CORRUPT_DATA); - error = ENXIO; - goto out; - } + mutex_enter(&spa->spa_props_lock); /* any mutex will do */ + if (spa->spa_claim_max_txg < zio->io_bp->blk_birth) + spa->spa_claim_max_txg = zio->io_bp->blk_birth; + mutex_exit(&spa->spa_props_lock); + } - /* - * If the pool is newer than the code, we can't open it. 
- */ - if (ub->ub_version > SPA_VERSION) { - vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, - VDEV_AUX_VERSION_NEWER); - error = ENOTSUP; - goto out; - } + typedef struct spa_load_error { + uint64_t sle_meta_count; + uint64_t sle_data_count; + } spa_load_error_t; - /* - * If the vdev guid sum doesn't match the uberblock, we have an - * incomplete configuration. - */ - if (rvd->vdev_guid_sum != ub->ub_guid_sum && mosconfig) { - vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, - VDEV_AUX_BAD_GUID_SUM); - error = ENXIO; - goto out; - } + static void + spa_load_verify_done(zio_t *zio) + { + blkptr_t *bp = zio->io_bp; + spa_load_error_t *sle = zio->io_private; + dmu_object_type_t type = BP_GET_TYPE(bp); + int error = zio->io_error; - /* - * Initialize internal SPA structures. - */ - spa->spa_state = POOL_STATE_ACTIVE; - spa->spa_ubsync = spa->spa_uberblock; - spa->spa_first_txg = spa_last_synced_txg(spa) + 1; - error = dsl_pool_open(spa, spa->spa_first_txg, &spa->spa_dsl_pool); if (error) { - vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, - VDEV_AUX_CORRUPT_DATA); - goto out; + if ((BP_GET_LEVEL(bp) != 0 || dmu_ot[type].ot_metadata) && + type != DMU_OT_INTENT_LOG) + atomic_add_64(&sle->sle_meta_count, 1); + else + atomic_add_64(&sle->sle_data_count, 1); } - spa->spa_meta_objset = spa->spa_dsl_pool->dp_meta_objset; + zio_data_buf_free(zio->io_data, zio->io_size); + } - if (zap_lookup(spa->spa_meta_objset, - DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG, - sizeof (uint64_t), 1, &spa->spa_config_object) != 0) { - vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, - VDEV_AUX_CORRUPT_DATA); - error = EIO; - goto out; - } + /*ARGSUSED*/ + static int + spa_load_verify_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, + arc_buf_t *pbuf, const zbookmark_t *zb, const dnode_phys_t *dnp, void *arg) + { + if (bp != NULL) { + zio_t *rio = arg; + size_t size = BP_GET_PSIZE(bp); + void *data = zio_data_buf_alloc(size); - if (!mosconfig) { - nvlist_t *newconfig; - uint64_t hostid; + 
zio_nowait(zio_read(rio, spa, bp, data, size, + spa_load_verify_done, rio->io_private, ZIO_PRIORITY_SCRUB, + ZIO_FLAG_SPECULATIVE | ZIO_FLAG_CANFAIL | + ZIO_FLAG_SCRUB | ZIO_FLAG_RAW, zb)); + } + return (0); + } - if (load_nvlist(spa, spa->spa_config_object, &newconfig) != 0) { - vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, - VDEV_AUX_CORRUPT_DATA); - error = EIO; - goto out; + static int + spa_load_verify(spa_t *spa) + { + zio_t *rio; + spa_load_error_t sle = { 0 }; + zpool_rewind_policy_t policy; + boolean_t verify_ok = B_FALSE; + int error; + + zpool_get_rewind_policy(spa->spa_config, &policy); + + if (policy.zrp_request & ZPOOL_NEVER_REWIND) + return (0); + + rio = zio_root(spa, NULL, &sle, + ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE); + + error = traverse_pool(spa, spa->spa_verify_min_txg, + TRAVERSE_PRE | TRAVERSE_PREFETCH, spa_load_verify_cb, rio); + + (void) zio_wait(rio); + + spa->spa_load_meta_errors = sle.sle_meta_count; + spa->spa_load_data_errors = sle.sle_data_count; + + if (!error && sle.sle_meta_count <= policy.zrp_maxmeta && + sle.sle_data_count <= policy.zrp_maxdata) { + verify_ok = B_TRUE; + spa->spa_load_txg = spa->spa_uberblock.ub_txg; + spa->spa_load_txg_ts = spa->spa_uberblock.ub_timestamp; + } else { + spa->spa_load_max_txg = spa->spa_uberblock.ub_txg; + } + + if (error) { + if (error != ENXIO && error != EIO) + error = EIO; + return (error); + } + + return (verify_ok ? 0 : EIO); + } + + /* + * Find a value in the pool props object. + */ + static void + spa_prop_find(spa_t *spa, zpool_prop_t prop, uint64_t *val) + { + (void) zap_lookup(spa->spa_meta_objset, spa->spa_pool_props_object, + zpool_prop_to_name(prop), sizeof (uint64_t), 1, val); + } + + /* + * Find a value in the pool directory object. 
+ */ + static int + spa_dir_prop(spa_t *spa, const char *name, uint64_t *val) + { + return (zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, + name, sizeof (uint64_t), 1, val)); + } + + static int + spa_vdev_err(vdev_t *vdev, vdev_aux_t aux, int err) + { + vdev_set_state(vdev, B_TRUE, VDEV_STATE_CANT_OPEN, aux); + return (err); + } + + /* + * Fix up config after a partly-completed split. This is done with the + * ZPOOL_CONFIG_SPLIT nvlist. Both the splitting pool and the split-off + * pool have that entry in their config, but only the splitting one contains + * a list of all the guids of the vdevs that are being split off. + * + * This function determines what to do with that list: either rejoin + * all the disks to the pool, or complete the splitting process. To attempt + * the rejoin, each disk that is offlined is marked online again, and + * we do a reopen() call. If the vdev label for every disk that was + * marked online indicates it was successfully split off (VDEV_AUX_SPLIT_POOL) + * then we call vdev_split() on each disk, and complete the split. + * + * Otherwise we leave the config alone, with all the vdevs in place in + * the original pool. + */ + static void + spa_try_repair(spa_t *spa, nvlist_t *config) + { + uint_t extracted; + uint64_t *glist; + uint_t i, gcount; + nvlist_t *nvl; + vdev_t **vd; + boolean_t attempt_reopen; + + if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_SPLIT, &nvl) != 0) + return; + + /* check that the config is complete */ + if (nvlist_lookup_uint64_array(nvl, ZPOOL_CONFIG_SPLIT_LIST, + &glist, &gcount) != 0) + return; + + vd = kmem_zalloc(gcount * sizeof (vdev_t *), KM_SLEEP); + + /* attempt to online all the vdevs & validate */ + attempt_reopen = B_TRUE; + for (i = 0; i < gcount; i++) { + if (glist[i] == 0) /* vdev is hole */ + continue; + + vd[i] = spa_lookup_by_guid(spa, glist[i], B_FALSE); + if (vd[i] == NULL) { + /* + * Don't bother attempting to reopen the disks; + * just do the split. 
+ */ + attempt_reopen = B_FALSE; + } else { + /* attempt to re-online it */ + vd[i]->vdev_offline = B_FALSE; + } + } + + if (attempt_reopen) { + vdev_reopen(spa->spa_root_vdev); + + /* check each device to see what state it's in */ + for (extracted = 0, i = 0; i < gcount; i++) { + if (vd[i] != NULL && + vd[i]->vdev_stat.vs_aux != VDEV_AUX_SPLIT_POOL) + break; + ++extracted; } + } + + /* + * If every disk has been moved to the new pool, or if we never + * even attempted to look at them, then we split them off for + * good. + */ + if (!attempt_reopen || gcount == extracted) { + for (i = 0; i < gcount; i++) + if (vd[i] != NULL) + vdev_split(vd[i]); + vdev_reopen(spa->spa_root_vdev); + } + + kmem_free(vd, gcount * sizeof (vdev_t *)); + } + + static int + spa_load(spa_t *spa, spa_load_state_t state, spa_import_type_t type, + boolean_t mosconfig) + { + nvlist_t *config = spa->spa_config; + char *ereport = FM_EREPORT_ZFS_POOL; + int error; + uint64_t pool_guid; + nvlist_t *nvl; + + if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid)) + return (EINVAL); - if (!spa_is_root(spa) && nvlist_lookup_uint64(newconfig, + /* + * Versioning wasn't explicitly added to the label until later, so if + * it's not present treat it as the initial version. 
+ */ + if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, + &spa->spa_ubsync.ub_version) != 0) + spa->spa_ubsync.ub_version = SPA_VERSION_INITIAL; + + (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, + &spa->spa_config_txg); + + if ((state == SPA_LOAD_IMPORT || state == SPA_LOAD_TRYIMPORT) && + spa_guid_exists(pool_guid, 0)) { + error = EEXIST; + } else { + spa->spa_load_guid = pool_guid; + + if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_SPLIT, + &nvl) == 0) { + VERIFY(nvlist_dup(nvl, &spa->spa_config_splitting, + KM_SLEEP) == 0); + } + + error = spa_load_impl(spa, pool_guid, config, state, type, + mosconfig, &ereport); + } + + spa->spa_minref = refcount_count(&spa->spa_refcount); + if (error && error != EBADF) + zfs_ereport_post(ereport, spa, NULL, NULL, 0, 0); + spa->spa_load_state = error ? SPA_LOAD_ERROR : SPA_LOAD_NONE; + spa->spa_ena = 0; + + return (error); + } + + /* + * Load an existing storage pool, using the pool's builtin spa_config as a + * source of configuration information. + */ + static int + spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config, + spa_load_state_t state, spa_import_type_t type, boolean_t mosconfig, + char **ereport) + { + int error = 0; + nvlist_t *nvroot = NULL; + vdev_t *rvd; + uberblock_t *ub = &spa->spa_uberblock; + uint64_t config_cache_txg = spa->spa_config_txg; + int orig_mode = spa->spa_mode; + int parse; + uint64_t obj; ++ int c; + + /* + * If this is an untrusted config, access the pool in read-only mode. + * This prevents things like resilvering recently removed devices. + */ + if (!mosconfig) + spa->spa_mode = FREAD; + + ASSERT(MUTEX_HELD(&spa_namespace_lock)); + + spa->spa_load_state = state; + + if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot)) + return (EINVAL); + + parse = (type == SPA_IMPORT_EXISTING ? 
+ VDEV_ALLOC_LOAD : VDEV_ALLOC_SPLIT); + + /* + * Create "The Godfather" zio to hold all async IOs + */ + spa->spa_async_zio_root = zio_root(spa, NULL, NULL, + ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_GODFATHER); + + /* + * Parse the configuration into a vdev tree. We explicitly set the + * value that will be returned by spa_version() since parsing the + * configuration requires knowing the version number. + */ + spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); + error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, parse); + spa_config_exit(spa, SCL_ALL, FTAG); + + if (error != 0) + return (error); + + ASSERT(spa->spa_root_vdev == rvd); + + if (type != SPA_IMPORT_ASSEMBLE) { + ASSERT(spa_guid(spa) == pool_guid); + } + + /* + * Try to open all vdevs, loading each label in the process. + */ + spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); + error = vdev_open(rvd); + spa_config_exit(spa, SCL_ALL, FTAG); + if (error != 0) + return (error); + + /* + * We need to validate the vdev labels against the configuration that + * we have in hand, which is dependent on the setting of mosconfig. If + * mosconfig is true then we're validating the vdev labels based on + * that config. Otherwise, we're validating against the cached config + * (zpool.cache) that was read when we loaded the zfs module, and then + * later we will recursively call spa_load() and validate against + * the vdev config. + * + * If we're assembling a new pool that's been split off from an + * existing pool, the labels haven't yet been updated so we skip + * validation for now. + */ + if (type != SPA_IMPORT_ASSEMBLE) { + spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); + error = vdev_validate(rvd); + spa_config_exit(spa, SCL_ALL, FTAG); + + if (error != 0) + return (error); + + if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) + return (ENXIO); + } + + /* + * Find the best uberblock. 
+ */ + vdev_uberblock_load(NULL, rvd, ub); + + /* + * If we weren't able to find a single valid uberblock, return failure. + */ + if (ub->ub_txg == 0) + return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, ENXIO)); + + /* + * If the pool is newer than the code, we can't open it. + */ + if (ub->ub_version > SPA_VERSION) + return (spa_vdev_err(rvd, VDEV_AUX_VERSION_NEWER, ENOTSUP)); + + /* + * If the vdev guid sum doesn't match the uberblock, we have an + * incomplete configuration. + */ + if (mosconfig && type != SPA_IMPORT_ASSEMBLE && + rvd->vdev_guid_sum != ub->ub_guid_sum) + return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM, ENXIO)); + + if (type != SPA_IMPORT_ASSEMBLE && spa->spa_config_splitting) { + spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); + spa_try_repair(spa, config); + spa_config_exit(spa, SCL_ALL, FTAG); + nvlist_free(spa->spa_config_splitting); + spa->spa_config_splitting = NULL; + } + + /* + * Initialize internal SPA structures. + */ + spa->spa_state = POOL_STATE_ACTIVE; + spa->spa_ubsync = spa->spa_uberblock; + spa->spa_verify_min_txg = spa->spa_extreme_rewind ? + TXG_INITIAL - 1 : spa_last_synced_txg(spa) - TXG_DEFER_SIZE - 1; + spa->spa_first_txg = spa->spa_last_ubsync_txg ? 
+ spa->spa_last_ubsync_txg : spa_last_synced_txg(spa) + 1; + spa->spa_claim_max_txg = spa->spa_first_txg; + spa->spa_prev_software_version = ub->ub_software_version; + + error = dsl_pool_open(spa, spa->spa_first_txg, &spa->spa_dsl_pool); + if (error) + return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); + spa->spa_meta_objset = spa->spa_dsl_pool->dp_meta_objset; + + if (spa_dir_prop(spa, DMU_POOL_CONFIG, &spa->spa_config_object) != 0) + return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); + + if (!mosconfig) { + uint64_t hostid; + nvlist_t *policy = NULL, *nvconfig; + + if (load_nvlist(spa, spa->spa_config_object, &nvconfig) != 0) + return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); + + if (!spa_is_root(spa) && nvlist_lookup_uint64(nvconfig, ZPOOL_CONFIG_HOSTID, &hostid) == 0) { char *hostname; unsigned long myhostid = 0; @@@ -1549,15 -1985,50 +1998,51 @@@ * Check the state of the root vdev. If it can't be opened, it * indicates one or more toplevel vdevs are faulted. */ - if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) { - error = ENXIO; - goto out; + if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) + return (ENXIO); + + /* + * Load the DDTs (dedup tables). + */ + error = ddt_load(spa); + if (error != 0) + return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); + + spa_update_dspace(spa); + + if (state != SPA_LOAD_TRYIMPORT) { + error = spa_load_verify(spa); + if (error) + return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, + error)); } - if (spa_writeable(spa)) { + /* + * Load the intent log state and check log integrity. If we're + * assembling a pool from a split, the log is not transferred over. 
+ */ + if (type != SPA_IMPORT_ASSEMBLE) { + nvlist_t *nvconfig; + + if (load_nvlist(spa, spa->spa_config_object, &nvconfig) != 0) + return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); + + VERIFY(nvlist_lookup_nvlist(nvconfig, ZPOOL_CONFIG_VDEV_TREE, + &nvroot) == 0); + spa_load_log_state(spa, nvroot); + nvlist_free(nvconfig); + + if (spa_check_logs(spa)) { + *ereport = FM_EREPORT_ZFS_LOG_REPLAY; + return (spa_vdev_err(rvd, VDEV_AUX_BAD_LOG, ENXIO)); + } + } + + if (spa_writeable(spa) && (state == SPA_LOAD_RECOVER || + spa->spa_load_max_txg == UINT64_MAX)) { dmu_tx_t *tx; int need_update = B_FALSE; + int c; ASSERT(state != SPA_LOAD_TRYIMPORT); @@@ -1571,53 -2047,142 +2061,142 @@@ zil_claim, tx, DS_FIND_CHILDREN); dmu_tx_commit(tx); - spa->spa_log_state = SPA_LOG_GOOD; - spa->spa_sync_on = B_TRUE; - txg_sync_start(spa->spa_dsl_pool); + spa->spa_claiming = B_FALSE; + + spa_set_log_state(spa, SPA_LOG_GOOD); + spa->spa_sync_on = B_TRUE; + txg_sync_start(spa->spa_dsl_pool); + + /* + * Wait for all claims to sync. We sync up to the highest + * claimed log block birth time so that claimed log blocks + * don't appear to be from the future. spa_claim_max_txg + * will have been set for us by either zil_check_log_chain() + * (invoked from spa_check_logs()) or zil_claim() above. + */ + txg_wait_synced(spa->spa_dsl_pool, spa->spa_claim_max_txg); + + /* + * If the config cache is stale, or we have uninitialized + * metaslabs (see spa_vdev_add()), then update the config. + * + * If spa_load_verbatim is true, trust the current + * in-core spa_config and update the disk labels. 
+ */ + if (config_cache_txg != spa->spa_config_txg || + state == SPA_LOAD_IMPORT || spa->spa_load_verbatim || + state == SPA_LOAD_RECOVER) + need_update = B_TRUE; + - for (int c = 0; c < rvd->vdev_children; c++) ++ for (c = 0; c < rvd->vdev_children; c++) + if (rvd->vdev_child[c]->vdev_ms_array == 0) + need_update = B_TRUE; + + /* + * Update the config cache asychronously in case we're the + * root pool, in which case the config cache isn't writable yet. + */ + if (need_update) + spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE); + + /* + * Check all DTLs to see if anything needs resilvering. + */ + if (!dsl_scan_resilvering(spa->spa_dsl_pool) && + vdev_resilver_needed(rvd, NULL, NULL)) + spa_async_request(spa, SPA_ASYNC_RESILVER); + + /* + * Delete any inconsistent datasets. + */ + (void) dmu_objset_find(spa_name(spa), + dsl_destroy_inconsistent, NULL, DS_FIND_CHILDREN); + + /* + * Clean up any stale temporary dataset userrefs. + */ + dsl_pool_clean_tmp_userrefs(spa->spa_dsl_pool); + } + + return (0); + } + + static int + spa_load_retry(spa_t *spa, spa_load_state_t state, int mosconfig) + { + spa_unload(spa); + spa_deactivate(spa); + + spa->spa_load_max_txg--; + + spa_activate(spa, spa_mode_global); + spa_async_suspend(spa); + + return (spa_load(spa, state, SPA_IMPORT_EXISTING, mosconfig)); + } + + static int + spa_load_best(spa_t *spa, spa_load_state_t state, int mosconfig, + uint64_t max_request, int rewind_flags) + { + nvlist_t *config = NULL; + int load_error, rewind_error; + uint64_t safe_rewind_txg; + uint64_t min_txg; + + if (spa->spa_load_txg && state == SPA_LOAD_RECOVER) { + spa->spa_load_max_txg = spa->spa_load_txg; + spa_set_log_state(spa, SPA_LOG_CLEAR); + } else { + spa->spa_load_max_txg = max_request; + } - /* - * Wait for all claims to sync. 
- */ - txg_wait_synced(spa->spa_dsl_pool, 0); + load_error = rewind_error = spa_load(spa, state, SPA_IMPORT_EXISTING, + mosconfig); + if (load_error == 0) + return (0); - /* - * If the config cache is stale, or we have uninitialized - * metaslabs (see spa_vdev_add()), then update the config. - * - * If spa_load_verbatim is true, trust the current - * in-core spa_config and update the disk labels. - */ - if (config_cache_txg != spa->spa_config_txg || - state == SPA_LOAD_IMPORT || spa->spa_load_verbatim) - need_update = B_TRUE; + if (spa->spa_root_vdev != NULL) + config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); - for (c = 0; c < rvd->vdev_children; c++) - if (rvd->vdev_child[c]->vdev_ms_array == 0) - need_update = B_TRUE; + spa->spa_last_ubsync_txg = spa->spa_uberblock.ub_txg; + spa->spa_last_ubsync_txg_ts = spa->spa_uberblock.ub_timestamp; - /* - * Update the config cache asychronously in case we're the - * root pool, in which case the config cache isn't writable yet. - */ - if (need_update) - spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE); + if (rewind_flags & ZPOOL_NEVER_REWIND) { + nvlist_free(config); + return (load_error); + } - /* - * Check all DTLs to see if anything needs resilvering. - */ - if (vdev_resilver_needed(rvd, NULL, NULL)) - spa_async_request(spa, SPA_ASYNC_RESILVER); + /* Price of rolling back is discarding txgs, including log */ + if (state == SPA_LOAD_RECOVER) + spa_set_log_state(spa, SPA_LOG_CLEAR); + + spa->spa_load_max_txg = spa->spa_last_ubsync_txg; + safe_rewind_txg = spa->spa_last_ubsync_txg - TXG_DEFER_SIZE; + min_txg = (rewind_flags & ZPOOL_EXTREME_REWIND) ? 
+ TXG_INITIAL : safe_rewind_txg; + + /* + * Continue as long as we're finding errors, we're still within + * the acceptable rewind range, and we're still finding uberblocks + */ + while (rewind_error && spa->spa_uberblock.ub_txg >= min_txg && + spa->spa_uberblock.ub_txg <= spa->spa_load_max_txg) { + if (spa->spa_load_max_txg < safe_rewind_txg) + spa->spa_extreme_rewind = B_TRUE; + rewind_error = spa_load_retry(spa, state, mosconfig); } - error = 0; - out: - spa->spa_minref = refcount_count(&spa->spa_refcount); - if (error && error != EBADF) - zfs_ereport_post(ereport, spa, NULL, NULL, 0, 0); - spa->spa_load_state = SPA_LOAD_NONE; - spa->spa_ena = 0; + if (config) + spa_rewind_data_to_nvlist(spa, config); - return (error); + spa->spa_extreme_rewind = B_FALSE; + spa->spa_load_max_txg = UINT64_MAX; + + if (config && (rewind_error || state != SPA_LOAD_RECOVER)) + spa_config_set(spa, config); + + return (state == SPA_LOAD_RECOVER ? rewind_error : load_error); } /* @@@ -2950,10 -3575,20 +3592,20 @@@ spa_vdev_add(spa_t *spa, nvlist_t *nvro /* * Transfer each new top-level vdev from vd to rvd. */ - for (int c = 0; c < vd->vdev_children; c++) { + for (c = 0; c < vd->vdev_children; c++) { + + /* + * Set the vdev id to the first hole, if one exists. + */ + for (id = 0; id < rvd->vdev_children; id++) { + if (rvd->vdev_child[id]->vdev_ishole) { + vdev_free(rvd->vdev_child[id]); + break; + } + } tvd = vd->vdev_child[c]; vdev_remove_child(vd, tvd); - tvd->vdev_id = rvd->vdev_children; + tvd->vdev_id = id; vdev_add_child(rvd, tvd); vdev_config_dirty(tvd); } @@@ -3201,7 -3841,7 +3858,8 @@@ spa_vdev_detach(spa_t *spa, uint64_t gu boolean_t unspare = B_FALSE; uint64_t unspare_guid; size_t len; + char *vdpath; + int t; txg = spa_vdev_enter(spa); @@@ -3329,77 -3969,366 +3987,366 @@@ (void) spa_vdev_remove(spa, unspare_guid, B_TRUE); } - /* - * If the parent mirror/replacing vdev only has one child, - * the parent is no longer needed. Remove it from the tree. 
- */ - if (pvd->vdev_children == 1) - vdev_remove_parent(cvd); + /* + * If the parent mirror/replacing vdev only has one child, + * the parent is no longer needed. Remove it from the tree. + */ + if (pvd->vdev_children == 1) + vdev_remove_parent(cvd); + + /* + * We don't set tvd until now because the parent we just removed + * may have been the previous top-level vdev. + */ + tvd = cvd->vdev_top; + ASSERT(tvd->vdev_parent == rvd); + + /* + * Reevaluate the parent vdev state. + */ + vdev_propagate_state(cvd); + + /* + * If the 'autoexpand' property is set on the pool then automatically + * try to expand the size of the pool. For example if the device we + * just detached was smaller than the others, it may be possible to + * add metaslabs (i.e. grow the pool). We need to reopen the vdev + * first so that we can obtain the updated sizes of the leaf vdevs. + */ + if (spa->spa_autoexpand) { + vdev_reopen(tvd); + vdev_expand(tvd, txg); + } + + vdev_config_dirty(tvd); + + /* + * Mark vd's DTL as dirty in this txg. vdev_dtl_sync() will see that + * vd->vdev_detached is set and free vd's DTL object in syncing context. + * But first make sure we're not on any *other* txg's DTL list, to + * prevent vd from being accessed after it's freed. + */ + vdpath = spa_strdup(vd->vdev_path); - for (int t = 0; t < TXG_SIZE; t++) ++ for (t = 0; t < TXG_SIZE; t++) + (void) txg_list_remove_this(&tvd->vdev_dtl_list, vd, t); + vd->vdev_detached = B_TRUE; + vdev_dirty(tvd, VDD_DTL, vd, txg); + + spa_event_notify(spa, vd, ESC_ZFS_VDEV_REMOVE); + + error = spa_vdev_exit(spa, vd, txg, 0); + + spa_history_log_internal(LOG_POOL_VDEV_DETACH, spa, NULL, + "vdev=%s", vdpath); + spa_strfree(vdpath); + + /* + * If this was the removal of the original device in a hot spare vdev, + * then we want to go through and remove the device from the hot spare + * list of every other pool. 
+ */ + if (unspare) { + spa_t *myspa = spa; + spa = NULL; + mutex_enter(&spa_namespace_lock); + while ((spa = spa_next(spa)) != NULL) { + if (spa->spa_state != POOL_STATE_ACTIVE) + continue; + if (spa == myspa) + continue; + spa_open_ref(spa, FTAG); + mutex_exit(&spa_namespace_lock); + (void) spa_vdev_remove(spa, unspare_guid, + B_TRUE); + mutex_enter(&spa_namespace_lock); + spa_close(spa, FTAG); + } + mutex_exit(&spa_namespace_lock); + } + + return (error); + } + + /* + * Split a set of devices from their mirrors, and create a new pool from them. + */ + int + spa_vdev_split_mirror(spa_t *spa, char *newname, nvlist_t *config, + nvlist_t *props, boolean_t exp) + { + int error = 0; + uint64_t txg, *glist; + spa_t *newspa; + uint_t c, children, lastlog; + nvlist_t **child, *nvl, *tmp; + dmu_tx_t *tx; + char *altroot = NULL; + vdev_t *rvd, **vml = NULL; /* vdev modify list */ + boolean_t activate_slog; + + if (!spa_writeable(spa)) + return (EROFS); + + txg = spa_vdev_enter(spa); + + /* clear the log and flush everything up to now */ + activate_slog = spa_passivate_log(spa); + (void) spa_vdev_config_exit(spa, NULL, txg, 0, FTAG); + error = spa_offline_log(spa); + txg = spa_vdev_config_enter(spa); + + if (activate_slog) + spa_activate_log(spa); + + if (error != 0) + return (spa_vdev_exit(spa, NULL, txg, error)); + + /* check new spa name before going any further */ + if (spa_lookup(newname) != NULL) + return (spa_vdev_exit(spa, NULL, txg, EEXIST)); + + /* + * scan through all the children to ensure they're all mirrors + */ + if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvl) != 0 || + nvlist_lookup_nvlist_array(nvl, ZPOOL_CONFIG_CHILDREN, &child, + &children) != 0) + return (spa_vdev_exit(spa, NULL, txg, EINVAL)); + + /* first, check to ensure we've got the right child count */ + rvd = spa->spa_root_vdev; + lastlog = 0; + for (c = 0; c < rvd->vdev_children; c++) { + vdev_t *vd = rvd->vdev_child[c]; + + /* don't count the holes & logs as children */ + if 
(vd->vdev_islog || vd->vdev_ishole) { + if (lastlog == 0) + lastlog = c; + continue; + } + + lastlog = 0; + } + if (children != (lastlog != 0 ? lastlog : rvd->vdev_children)) + return (spa_vdev_exit(spa, NULL, txg, EINVAL)); + + /* next, ensure no spare or cache devices are part of the split */ + if (nvlist_lookup_nvlist(nvl, ZPOOL_CONFIG_SPARES, &tmp) == 0 || + nvlist_lookup_nvlist(nvl, ZPOOL_CONFIG_L2CACHE, &tmp) == 0) + return (spa_vdev_exit(spa, NULL, txg, EINVAL)); + + vml = kmem_zalloc(children * sizeof (vdev_t *), KM_SLEEP); + glist = kmem_zalloc(children * sizeof (uint64_t), KM_SLEEP); + + /* then, loop over each vdev and validate it */ + for (c = 0; c < children; c++) { + uint64_t is_hole = 0; + + (void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_HOLE, + &is_hole); + + if (is_hole != 0) { + if (spa->spa_root_vdev->vdev_child[c]->vdev_ishole || + spa->spa_root_vdev->vdev_child[c]->vdev_islog) { + continue; + } else { + error = EINVAL; + break; + } + } + + /* which disk is going to be split? 
*/ + if (nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_GUID, + &glist[c]) != 0) { + error = EINVAL; + break; + } + + /* look it up in the spa */ + vml[c] = spa_lookup_by_guid(spa, glist[c], B_FALSE); + if (vml[c] == NULL) { + error = ENODEV; + break; + } + + /* make sure there's nothing stopping the split */ + if (vml[c]->vdev_parent->vdev_ops != &vdev_mirror_ops || + vml[c]->vdev_islog || + vml[c]->vdev_ishole || + vml[c]->vdev_isspare || + vml[c]->vdev_isl2cache || + !vdev_writeable(vml[c]) || + vml[c]->vdev_children != 0 || + vml[c]->vdev_state != VDEV_STATE_HEALTHY || + c != spa->spa_root_vdev->vdev_child[c]->vdev_id) { + error = EINVAL; + break; + } + + if (vdev_dtl_required(vml[c])) { + error = EBUSY; + break; + } + + /* we need certain info from the top level */ + VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_METASLAB_ARRAY, + vml[c]->vdev_top->vdev_ms_array) == 0); + VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_METASLAB_SHIFT, + vml[c]->vdev_top->vdev_ms_shift) == 0); + VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_ASIZE, + vml[c]->vdev_top->vdev_asize) == 0); + VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_ASHIFT, + vml[c]->vdev_top->vdev_ashift) == 0); + } + + if (error != 0) { + kmem_free(vml, children * sizeof (vdev_t *)); + kmem_free(glist, children * sizeof (uint64_t)); + return (spa_vdev_exit(spa, NULL, txg, error)); + } + + /* stop writers from using the disks */ + for (c = 0; c < children; c++) { + if (vml[c] != NULL) + vml[c]->vdev_offline = B_TRUE; + } + vdev_reopen(spa->spa_root_vdev); /* - * We don't set tvd until now because the parent we just removed - * may have been the previous top-level vdev. + * Temporarily record the splitting vdevs in the spa config. This + * will disappear once the config is regenerated. 
*/ - tvd = cvd->vdev_top; - ASSERT(tvd->vdev_parent == rvd); + VERIFY(nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP) == 0); + VERIFY(nvlist_add_uint64_array(nvl, ZPOOL_CONFIG_SPLIT_LIST, + glist, children) == 0); + kmem_free(glist, children * sizeof (uint64_t)); - /* - * Reevaluate the parent vdev state. - */ - vdev_propagate_state(cvd); + mutex_enter(&spa->spa_props_lock); + VERIFY(nvlist_add_nvlist(spa->spa_config, ZPOOL_CONFIG_SPLIT, + nvl) == 0); + mutex_exit(&spa->spa_props_lock); + spa->spa_config_splitting = nvl; + vdev_config_dirty(spa->spa_root_vdev); + + /* configure and create the new pool */ + VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, newname) == 0); + VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE, + exp ? POOL_STATE_EXPORTED : POOL_STATE_ACTIVE) == 0); + VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_VERSION, + spa_version(spa)) == 0); + VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_TXG, + spa->spa_config_txg) == 0); + VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_GUID, + spa_generate_guid(NULL)) == 0); + (void) nvlist_lookup_string(props, + zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot); - /* - * If the 'autoexpand' property is set on the pool then automatically - * try to expand the size of the pool. For example if the device we - * just detached was smaller than the others, it may be possible to - * add metaslabs (i.e. grow the pool). We need to reopen the vdev - * first so that we can obtain the updated sizes of the leaf vdevs. 
- */ - if (spa->spa_autoexpand) { - vdev_reopen(tvd); - vdev_expand(tvd, txg); + /* add the new pool to the namespace */ + newspa = spa_add(newname, config, altroot); + newspa->spa_config_txg = spa->spa_config_txg; + spa_set_log_state(newspa, SPA_LOG_CLEAR); + + /* release the spa config lock, retaining the namespace lock */ + spa_vdev_config_exit(spa, NULL, txg, 0, FTAG); + + if (zio_injection_enabled) + zio_handle_panic_injection(spa, FTAG, 1); + + spa_activate(newspa, spa_mode_global); + spa_async_suspend(newspa); + + /* create the new pool from the disks of the original pool */ + error = spa_load(newspa, SPA_LOAD_IMPORT, SPA_IMPORT_ASSEMBLE, B_TRUE); + if (error) + goto out; + + /* if that worked, generate a real config for the new pool */ + if (newspa->spa_root_vdev != NULL) { + VERIFY(nvlist_alloc(&newspa->spa_config_splitting, + NV_UNIQUE_NAME, KM_SLEEP) == 0); + VERIFY(nvlist_add_uint64(newspa->spa_config_splitting, + ZPOOL_CONFIG_SPLIT_GUID, spa_guid(spa)) == 0); + spa_config_set(newspa, spa_config_generate(newspa, NULL, -1ULL, + B_TRUE)); } - vdev_config_dirty(tvd); + /* set the props */ + if (props != NULL) { + spa_configfile_set(newspa, props, B_FALSE); + error = spa_prop_set(newspa, props); + if (error) + goto out; + } - /* - * Mark vd's DTL as dirty in this txg. vdev_dtl_sync() will see that - * vd->vdev_detached is set and free vd's DTL object in syncing context. - * But first make sure we're not on any *other* txg's DTL list, to - * prevent vd from being accessed after it's freed. 
- */ - for (t = 0; t < TXG_SIZE; t++) - (void) txg_list_remove_this(&tvd->vdev_dtl_list, vd, t); - vd->vdev_detached = B_TRUE; - vdev_dirty(tvd, VDD_DTL, vd, txg); + /* flush everything */ + txg = spa_vdev_config_enter(newspa); + vdev_config_dirty(newspa->spa_root_vdev); + (void) spa_vdev_config_exit(newspa, NULL, txg, 0, FTAG); - spa_event_notify(spa, vd, ESC_ZFS_VDEV_REMOVE); + if (zio_injection_enabled) + zio_handle_panic_injection(spa, FTAG, 2); - error = spa_vdev_exit(spa, vd, txg, 0); + spa_async_resume(newspa); - /* - * If this was the removal of the original device in a hot spare vdev, - * then we want to go through and remove the device from the hot spare - * list of every other pool. - */ - if (unspare) { - spa_t *myspa = spa; - spa = NULL; - mutex_enter(&spa_namespace_lock); - while ((spa = spa_next(spa)) != NULL) { - if (spa->spa_state != POOL_STATE_ACTIVE) - continue; - if (spa == myspa) - continue; - spa_open_ref(spa, FTAG); - mutex_exit(&spa_namespace_lock); - (void) spa_vdev_remove(spa, unspare_guid, B_TRUE); - mutex_enter(&spa_namespace_lock); - spa_close(spa, FTAG); + /* finally, update the original pool's config */ + txg = spa_vdev_config_enter(spa); + tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); + error = dmu_tx_assign(tx, TXG_WAIT); + if (error != 0) + dmu_tx_abort(tx); + for (c = 0; c < children; c++) { + if (vml[c] != NULL) { + vdev_split(vml[c]); + if (error == 0) + spa_history_log_internal(LOG_POOL_VDEV_DETACH, + spa, tx, "vdev=%s", + vml[c]->vdev_path); + vdev_free(vml[c]); } - mutex_exit(&spa_namespace_lock); } + vdev_config_dirty(spa->spa_root_vdev); + spa->spa_config_splitting = NULL; + nvlist_free(nvl); + if (error == 0) + dmu_tx_commit(tx); + (void) spa_vdev_exit(spa, NULL, txg, 0); + + if (zio_injection_enabled) + zio_handle_panic_injection(spa, FTAG, 3); + + /* split is complete; log a history record */ + spa_history_log_internal(LOG_POOL_SPLIT, newspa, NULL, + "split new pool %s from pool %s", newname, spa_name(spa)); + + 
kmem_free(vml, children * sizeof (vdev_t *)); + + /* if we're not going to mount the filesystems in userland, export */ + if (exp) + error = spa_export_common(newname, POOL_STATE_EXPORTED, NULL, + B_FALSE, B_FALSE); + + return (error); + + out: + spa_unload(newspa); + spa_deactivate(newspa); + spa_remove(newspa); + + txg = spa_vdev_config_enter(spa); + + /* re-online all offlined disks */ + for (c = 0; c < children; c++) { + if (vml[c] != NULL) + vml[c]->vdev_offline = B_FALSE; + } + vdev_reopen(spa->spa_root_vdev); + + nvlist_free(spa->spa_config_splitting); + spa->spa_config_splitting = NULL; + (void) spa_vdev_exit(spa, NULL, txg, error); + kmem_free(vml, children * sizeof (vdev_t *)); return (error); } @@@ -3685,12 -4755,21 +4777,23 @@@ spa_scan(spa_t *spa, pool_scan_func_t f static void spa_async_remove(spa_t *spa, vdev_t *vd) { + int c; + if (vd->vdev_remove_wanted) { - vd->vdev_remove_wanted = 0; + vd->vdev_remove_wanted = B_FALSE; + vd->vdev_delayed_close = B_FALSE; vdev_set_state(vd, B_FALSE, VDEV_STATE_REMOVED, VDEV_AUX_NONE); - vdev_clear(spa, vd); + + /* + * We want to clear the stats, but we don't want to do a full + * vdev_clear() as that will cause us to throw away + * degraded/faulted state as well as attempt to reopen the + * device, all of which is a waste. + */ + vd->vdev_stat.vs_read_errors = 0; + vd->vdev_stat.vs_write_errors = 0; + vd->vdev_stat.vs_checksum_errors = 0; + vdev_state_dirty(vd->vdev_top); } @@@ -3701,10 -4780,8 +4804,10 @@@ static void spa_async_probe(spa_t *spa, vdev_t *vd) { + int c; + if (vd->vdev_probe_wanted) { - vd->vdev_probe_wanted = 0; + vd->vdev_probe_wanted = B_FALSE; vdev_reopen(vd); /* vdev_open() does the actual probe */ } @@@ -3785,11 -4860,11 +4887,11 @@@ spa_async_thread(spa_t *spa * See if any devices need to be marked REMOVED. 
*/ if (tasks & SPA_ASYNC_REMOVE) { - spa_vdev_state_enter(spa); + spa_vdev_state_enter(spa, SCL_NONE); spa_async_remove(spa, spa->spa_root_vdev); - for (int i = 0; i < spa->spa_l2cache.sav_count; i++) + for (i = 0; i < spa->spa_l2cache.sav_count; i++) spa_async_remove(spa, spa->spa_l2cache.sav_vdevs[i]); - for (int i = 0; i < spa->spa_spares.sav_count; i++) + for (i = 0; i < spa->spa_spares.sav_count; i++) spa_async_remove(spa, spa->spa_spares.sav_vdevs[i]); (void) spa_vdev_state_exit(spa, NULL, 0); } @@@ -4146,9 -5247,7 +5274,8 @@@ spa_sync(spa_t *spa, uint64_t txg vdev_t *rvd = spa->spa_root_vdev; vdev_t *vd; dmu_tx_t *tx; - int dirty_vdevs; int error; + int c; /* * Lock out configuration changes. diff --cc module/zfs/spa_misc.c index 88ae172b4,52af7fcb7..20946c4e7 --- a/module/zfs/spa_misc.c +++ b/module/zfs/spa_misc.c @@@ -433,6 -424,6 +433,7 @@@ spa_add(const char *name, nvlist_t *con { spa_t *spa; spa_config_dirent_t *dp; ++ int t; ASSERT(MUTEX_HELD(&spa_namespace_lock)); @@@ -450,6 -444,9 +454,9 @@@ cv_init(&spa->spa_scrub_io_cv, NULL, CV_DEFAULT, NULL); cv_init(&spa->spa_suspend_cv, NULL, CV_DEFAULT, NULL); - for (int t = 0; t < TXG_SIZE; t++) ++ for (t = 0; t < TXG_SIZE; t++) + bplist_create(&spa->spa_free_bplist[t]); + (void) strlcpy(spa->spa_name, name, sizeof (spa->spa_name)); spa->spa_state = POOL_STATE_UNINITIALIZED; spa->spa_freeze_txg = UINT64_MAX; @@@ -492,6 -493,6 +503,7 @@@ voi spa_remove(spa_t *spa) { spa_config_dirent_t *dp; ++ int t; ASSERT(MUTEX_HELD(&spa_namespace_lock)); ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED); @@@ -519,7 -522,11 +533,11 @@@ spa_config_lock_destroy(spa); - for (int t = 0; t < TXG_SIZE; t++) ++ for (t = 0; t < TXG_SIZE; t++) + bplist_destroy(&spa->spa_free_bplist[t]); + cv_destroy(&spa->spa_async_cv); + cv_destroy(&spa->spa_proc_cv); cv_destroy(&spa->spa_scrub_io_cv); cv_destroy(&spa->spa_suspend_cv); @@@ -1302,24 -1419,52 +1430,54 @@@ spa_max_replication(spa_t *spa return (MIN(SPA_DVAS_PER_BP, 
spa_max_replication_override)); } + int + spa_prev_software_version(spa_t *spa) + { + return (spa->spa_prev_software_version); + } + uint64_t - bp_get_dasize(spa_t *spa, const blkptr_t *bp) + dva_get_dsize_sync(spa_t *spa, const dva_t *dva) { - int sz = 0, i; + uint64_t asize = DVA_GET_ASIZE(dva); + uint64_t dsize = asize; - if (!spa->spa_deflate) - return (BP_GET_ASIZE(bp)); + ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0); - spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); - for (i = 0; i < SPA_DVAS_PER_BP; i++) { - vdev_t *vd = - vdev_lookup_top(spa, DVA_GET_VDEV(&bp->blk_dva[i])); - if (vd) - sz += (DVA_GET_ASIZE(&bp->blk_dva[i]) >> - SPA_MINBLOCKSHIFT) * vd->vdev_deflate_ratio; + if (asize != 0 && spa->spa_deflate) { + vdev_t *vd = vdev_lookup_top(spa, DVA_GET_VDEV(dva)); + dsize = (asize >> SPA_MINBLOCKSHIFT) * vd->vdev_deflate_ratio; } + + return (dsize); + } + + uint64_t + bp_get_dsize_sync(spa_t *spa, const blkptr_t *bp) + { + uint64_t dsize = 0; ++ int d; + - for (int d = 0; d < SPA_DVAS_PER_BP; d++) ++ for (d = 0; d < SPA_DVAS_PER_BP; d++) + dsize += dva_get_dsize_sync(spa, &bp->blk_dva[d]); + + return (dsize); + } + + uint64_t + bp_get_dsize(spa_t *spa, const blkptr_t *bp) + { + uint64_t dsize = 0; ++ int d; + + spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); + - for (int d = 0; d < SPA_DVAS_PER_BP; d++) ++ for (d = 0; d < SPA_DVAS_PER_BP; d++) + dsize += dva_get_dsize_sync(spa, &bp->blk_dva[d]); + spa_config_exit(spa, SCL_VDEV, FTAG); - return (sz); + + return (dsize); } /* diff --cc module/zfs/vdev.c index cb4a3e252,a61f29b8e..e4c1a7707 --- a/module/zfs/vdev.c +++ b/module/zfs/vdev.c @@@ -1016,13 -1058,35 +1067,38 @@@ vdev_open_child(void *arg vd->vdev_open_thread = NULL; } + boolean_t + vdev_uses_zvols(vdev_t *vd) + { ++ int c; ++ + if (vd->vdev_path && strncmp(vd->vdev_path, ZVOL_DIR, + strlen(ZVOL_DIR)) == 0) + return (B_TRUE); - for (int c = 0; c < vd->vdev_children; c++) ++ for (c = 0; c < vd->vdev_children; c++) + if 
(vdev_uses_zvols(vd->vdev_child[c])) + return (B_TRUE); + return (B_FALSE); + } + void vdev_open_children(vdev_t *vd) { taskq_t *tq; int children = vd->vdev_children; + int c; + /* + * in order to handle pools on top of zvols, do the opens + * in a single thread so that the same thread holds the + * spa_namespace_lock + */ + if (vdev_uses_zvols(vd)) { - for (int c = 0; c < children; c++) ++ for (c = 0; c < children; c++) + vd->vdev_child[c]->vdev_open_error = + vdev_open(vd->vdev_child[c]); + return; + } tq = taskq_create("vdev_open", children, minclsyspri, children, children, TASKQ_PREPOPULATE); @@@ -1090,10 -1177,16 +1190,16 @@@ vdev_open(vdev_t *vd vdev_set_state(vd, B_TRUE, VDEV_STATE_DEGRADED, VDEV_AUX_ERR_EXCEEDED); } else { - vd->vdev_state = VDEV_STATE_HEALTHY; + vdev_set_state(vd, B_TRUE, VDEV_STATE_HEALTHY, 0); } + /* + * For hole or missing vdevs we just return success. + */ + if (vd->vdev_ishole || vd->vdev_ops == &vdev_missing_ops) + return (0); + - for (int c = 0; c < vd->vdev_children; c++) { + for (c = 0; c < vd->vdev_children; c++) { if (vd->vdev_child[c]->vdev_state != VDEV_STATE_HEALTHY) { vdev_set_state(vd, B_TRUE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE); @@@ -1200,11 -1293,10 +1306,11 @@@ vdev_validate(vdev_t *vd { spa_t *spa = vd->vdev_spa; nvlist_t *label; - uint64_t guid, top_guid; + uint64_t guid = 0, top_guid; uint64_t state; + int c; - for (int c = 0; c < vd->vdev_children; c++) + for (c = 0; c < vd->vdev_children; c++) if (vdev_validate(vd->vdev_child[c]) != 0) return (EBADF); @@@ -1308,6 -1431,41 +1445,43 @@@ vdev_close(vdev_t *vd vd->vdev_stat.vs_aux = VDEV_AUX_NONE; } + void + vdev_hold(vdev_t *vd) + { + spa_t *spa = vd->vdev_spa; ++ int c; + + ASSERT(spa_is_root(spa)); + if (spa->spa_state == POOL_STATE_UNINITIALIZED) + return; + - for (int c = 0; c < vd->vdev_children; c++) ++ for (c = 0; c < vd->vdev_children; c++) + vdev_hold(vd->vdev_child[c]); + + if (vd->vdev_ops->vdev_op_leaf) + vd->vdev_ops->vdev_op_hold(vd); + } + + void + 
vdev_rele(vdev_t *vd) + { + spa_t *spa = vd->vdev_spa; ++ int c; + + ASSERT(spa_is_root(spa)); - for (int c = 0; c < vd->vdev_children; c++) ++ for (c = 0; c < vd->vdev_children; c++) + vdev_rele(vd->vdev_child[c]); + + if (vd->vdev_ops->vdev_op_leaf) + vd->vdev_ops->vdev_op_rele(vd); + } + + /* + * Reopen all interior vdevs and any unopened leaves. We don't actually + * reopen leaf vdevs which had previously been opened as they might deadlock + * on the spa_config_lock. Instead we only obtain the leaf's physical size. + * If the leaf has never been opened then open it, as usual. + */ void vdev_reopen(vdev_t *vd) { @@@ -1545,7 -1708,9 +1724,9 @@@ vdev_dtl_reassess(vdev_t *vd, uint64_t } mutex_enter(&vd->vdev_dtl_lock); - for (int t = 0; t < DTL_TYPES; t++) { + for (t = 0; t < DTL_TYPES; t++) { + /* account for child's outage in parent's missing map */ + int s = (t == DTL_MISSING) ? DTL_OUTAGE: t; if (t == DTL_SCRUB) continue; /* leaf vdevs only */ if (t == DTL_PARTIAL) @@@ -1555,10 -1720,10 +1736,10 @@@ else minref = vd->vdev_children; /* any kind of mirror */ space_map_ref_create(&reftree); - for (int c = 0; c < vd->vdev_children; c++) { + for (c = 0; c < vd->vdev_children; c++) { vdev_t *cvd = vd->vdev_child[c]; mutex_enter(&cvd->vdev_dtl_lock); - space_map_ref_add_map(&reftree, &cvd->vdev_dtl[t], 1); + space_map_ref_add_map(&reftree, &cvd->vdev_dtl[s], 1); mutex_exit(&cvd->vdev_dtl_lock); } space_map_ref_generate_map(&reftree, &vd->vdev_dtl[t], minref); @@@ -1803,6 -1969,42 +1988,43 @@@ vdev_validate_aux(vdev_t *vd return (0); } + void + vdev_remove(vdev_t *vd, uint64_t txg) + { + spa_t *spa = vd->vdev_spa; + objset_t *mos = spa->spa_meta_objset; + dmu_tx_t *tx; ++ int m; + + tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg); + + if (vd->vdev_dtl_smo.smo_object) { + ASSERT3U(vd->vdev_dtl_smo.smo_alloc, ==, 0); + (void) dmu_object_free(mos, vd->vdev_dtl_smo.smo_object, tx); + vd->vdev_dtl_smo.smo_object = 0; + } + + if (vd->vdev_ms != NULL) { - for (int m = 0; m 
< vd->vdev_ms_count; m++) { ++ for (m = 0; m < vd->vdev_ms_count; m++) { + metaslab_t *msp = vd->vdev_ms[m]; + + if (msp == NULL || msp->ms_smo.smo_object == 0) + continue; + + ASSERT3U(msp->ms_smo.smo_alloc, ==, 0); + (void) dmu_object_free(mos, msp->ms_smo.smo_object, tx); + msp->ms_smo.smo_object = 0; + } + } + + if (vd->vdev_ms_array) { + (void) dmu_object_free(mos, vd->vdev_ms_array, tx); + vd->vdev_ms_array = 0; + vd->vdev_ms_shift = 0; + } + dmu_tx_commit(tx); + } + void vdev_sync_done(vdev_t *vd, uint64_t txg) { @@@ -2201,6 -2484,19 +2506,20 @@@ vdev_clear_stats(vdev_t *vd mutex_exit(&vd->vdev_stat_lock); } + void + vdev_scan_stat_init(vdev_t *vd) + { + vdev_stat_t *vs = &vd->vdev_stat; ++ int c; + - for (int c = 0; c < vd->vdev_children; c++) ++ for (c = 0; c < vd->vdev_children; c++) + vdev_scan_stat_init(vd->vdev_child[c]); + + mutex_enter(&vd->vdev_stat_lock); + vs->vs_scan_processed = 0; + mutex_exit(&vd->vdev_stat_lock); + } + void vdev_stat_update(zio_t *zio, uint64_t psize) { @@@ -2536,12 -2827,17 +2850,18 @@@ vdev_propagate_state(vdev_t *vd int degraded = 0, faulted = 0; int corrupted = 0; vdev_t *child; + int c; if (vd->vdev_children > 0) { - for (int c = 0; c < vd->vdev_children; c++) { + for (c = 0; c < vd->vdev_children; c++) { child = vd->vdev_child[c]; + /* + * Don't factor holes into the decision. + */ + if (child->vdev_ishole) + continue; + if (!vdev_readable(child) || (!vdev_writeable(child) && spa_writeable(spa))) { /* @@@ -2737,24 -3039,24 +3065,25 @@@ vdev_is_bootable(vdev_t *vd return (B_TRUE); } + /* + * Load the state from the original vdev tree (ovd) which + * we've retrieved from the MOS config object. If the original + * vdev was offline then we transfer that state to the device + * in the current vdev tree (nvd). 
+ */ void - vdev_load_log_state(vdev_t *vd, nvlist_t *nv) + vdev_load_log_state(vdev_t *nvd, vdev_t *ovd) { - uint_t children; - nvlist_t **child; - uint64_t val; - spa_t *spa = vd->vdev_spa; + spa_t *spa = nvd->vdev_spa; + int c; - if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, - &child, &children) == 0) { - for (c = 0; c < children; c++) - vdev_load_log_state(vd->vdev_child[c], child[c]); - } + ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL); + ASSERT3U(nvd->vdev_guid, ==, ovd->vdev_guid); - if (vd->vdev_ops->vdev_op_leaf && nvlist_lookup_uint64(nv, - ZPOOL_CONFIG_OFFLINE, &val) == 0 && val) { - for (int c = 0; c < nvd->vdev_children; c++) ++ for (c = 0; c < nvd->vdev_children; c++) + vdev_load_log_state(nvd->vdev_child[c], ovd->vdev_child[c]); + if (nvd->vdev_ops->vdev_op_leaf && ovd->vdev_offline) { /* * It would be nice to call vdev_offline() * directly but the pool isn't fully loaded and diff --cc module/zfs/zio.c index 6efce7705,88d80af4e..520639e06 --- a/module/zfs/zio.c +++ b/module/zfs/zio.c @@@ -904,10 -991,10 +992,10 @@@ zio_write_bp_init(zio_t *zio * spa_sync() to allocate new blocks, but force rewrites after that. * There should only be a handful of blocks after pass 1 in any case. 
*/ - if (bp->blk_birth == zio->io_txg && BP_GET_PSIZE(bp) == csize && + if (bp->blk_birth == zio->io_txg && BP_GET_PSIZE(bp) == psize && pass > SYNC_PASS_REWRITE) { - uint32_t gang_stages = zio->io_pipeline & ZIO_GANG_STAGES; - ASSERT(csize != 0); - ASSERT(psize != 0); + enum zio_stage gang_stages = zio->io_pipeline & ZIO_GANG_STAGES; ++ ASSERT(psize != 0); zio->io_pipeline = ZIO_REWRITE_PIPELINE | gang_stages; zio->io_flags |= ZIO_FLAG_IO_REWRITE; } else { @@@ -1113,24 -1234,13 +1237,13 @@@ zio_reexecute(zio_t *pio pio->io_pipeline = pio->io_orig_pipeline; pio->io_reexecute = 0; pio->io_error = 0; - for (int w = 0; w < ZIO_WAIT_TYPES; w++) + for (w = 0; w < ZIO_WAIT_TYPES; w++) pio->io_state[w] = 0; - for (int c = 0; c < ZIO_CHILD_TYPES; c++) + for (c = 0; c < ZIO_CHILD_TYPES; c++) pio->io_child_error[c] = 0; - if (IO_IS_ALLOCATING(pio)) { - /* - * Remember the failed bp so that the io_ready() callback - * can update its accounting upon reexecution. The block - * was already freed in zio_done(); we indicate this with - * a fill count of -1 so that zio_free() knows to skip it. - */ - blkptr_t *bp = pio->io_bp; - ASSERT(bp->blk_birth == 0 || bp->blk_birth == pio->io_txg); - bp->blk_fill = BLK_FILL_ALREADY_FREED; - pio->io_bp_orig = *bp; - BP_ZERO(bp); - } + if (IO_IS_ALLOCATING(pio)) + BP_ZERO(pio->io_bp); /* * As we reexecute pio's children, new children could be created. 
@@@ -1416,10 -1530,9 +1535,10 @@@ zio_gang_tree_assemble_done(zio_t *zio zio_t *gio = zio->io_gang_leader; zio_gang_node_t *gn = zio->io_private; blkptr_t *bp = zio->io_bp; + int g; ASSERT(gio == zio_unique_parent(zio)); - ASSERT(zio_walk_children(zio) == NULL); + ASSERT(zio->io_child_count == 0); if (zio->io_error) return; @@@ -1429,9 -1542,9 +1548,9 @@@ ASSERT(zio->io_data == gn->gn_gbh); ASSERT(zio->io_size == SPA_GANGBLOCKSIZE); - ASSERT(gn->gn_gbh->zg_tail.zbt_magic == ZBT_MAGIC); + ASSERT(gn->gn_gbh->zg_tail.zec_magic == ZEC_MAGIC); - for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) { + for (g = 0; g < SPA_GBH_NBLKPTRS; g++) { blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g]; if (!BP_IS_GANG(gbp)) continue; @@@ -1457,9 -1569,9 +1576,9 @@@ zio_gang_tree_issue(zio_t *pio, zio_gan zio = zio_gang_issue_func[gio->io_type](pio, bp, gn, data); if (gn != NULL) { - ASSERT(gn->gn_gbh->zg_tail.zbt_magic == ZBT_MAGIC); + ASSERT(gn->gn_gbh->zg_tail.zec_magic == ZEC_MAGIC); - for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) { + for (g = 0; g < SPA_GBH_NBLKPTRS; g++) { blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g]; if (BP_IS_HOLE(gbp)) continue; @@@ -1554,13 -1665,13 +1673,13 @@@ zio_write_gang_block(zio_t *pio uint64_t txg = pio->io_txg; uint64_t resid = pio->io_size; uint64_t lsize; - int ndvas = gio->io_prop.zp_ndvas; - int gbh_ndvas = MIN(ndvas + 1, spa_max_replication(spa)); + int copies = gio->io_prop.zp_copies; + int gbh_copies = MIN(copies + 1, spa_max_replication(spa)); zio_prop_t zp; - int error; + int g, error; - error = metaslab_alloc(spa, spa->spa_normal_class, SPA_GANGBLOCKSIZE, - bp, gbh_ndvas, txg, pio == gio ? NULL : gio->io_bp, + error = metaslab_alloc(spa, spa_normal_class(spa), SPA_GANGBLOCKSIZE, + bp, gbh_copies, txg, pio == gio ? 
NULL : gio->io_bp, METASLAB_HINTBP_FAVOR | METASLAB_GANG_HEADER); if (error) { pio->io_error = error; @@@ -1617,10 -1730,378 +1738,380 @@@ /* * ========================================================================== - * Allocate and free blocks + * Dedup * ========================================================================== */ + static void + zio_ddt_child_read_done(zio_t *zio) + { + blkptr_t *bp = zio->io_bp; + ddt_entry_t *dde = zio->io_private; + ddt_phys_t *ddp; + zio_t *pio = zio_unique_parent(zio); + + mutex_enter(&pio->io_lock); + ddp = ddt_phys_select(dde, bp); + if (zio->io_error == 0) + ddt_phys_clear(ddp); /* this ddp doesn't need repair */ + if (zio->io_error == 0 && dde->dde_repair_data == NULL) + dde->dde_repair_data = zio->io_data; + else + zio_buf_free(zio->io_data, zio->io_size); + mutex_exit(&pio->io_lock); + } + + static int + zio_ddt_read_start(zio_t *zio) + { + blkptr_t *bp = zio->io_bp; ++ int p; + + ASSERT(BP_GET_DEDUP(bp)); + ASSERT(BP_GET_PSIZE(bp) == zio->io_size); + ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); + + if (zio->io_child_error[ZIO_CHILD_DDT]) { + ddt_t *ddt = ddt_select(zio->io_spa, bp); + ddt_entry_t *dde = ddt_repair_start(ddt, bp); + ddt_phys_t *ddp = dde->dde_phys; + ddt_phys_t *ddp_self = ddt_phys_select(dde, bp); + blkptr_t blk; + + ASSERT(zio->io_vsd == NULL); + zio->io_vsd = dde; + + if (ddp_self == NULL) + return (ZIO_PIPELINE_CONTINUE); + - for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) { ++ for (p = 0; p < DDT_PHYS_TYPES; p++, ddp++) { + if (ddp->ddp_phys_birth == 0 || ddp == ddp_self) + continue; + ddt_bp_create(ddt->ddt_checksum, &dde->dde_key, ddp, + &blk); + zio_nowait(zio_read(zio, zio->io_spa, &blk, + zio_buf_alloc(zio->io_size), zio->io_size, + zio_ddt_child_read_done, dde, zio->io_priority, + ZIO_DDT_CHILD_FLAGS(zio) | ZIO_FLAG_DONT_PROPAGATE, + &zio->io_bookmark)); + } + return (ZIO_PIPELINE_CONTINUE); + } + + zio_nowait(zio_read(zio, zio->io_spa, bp, + zio->io_data, zio->io_size, NULL, NULL, 
zio->io_priority, + ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark)); + + return (ZIO_PIPELINE_CONTINUE); + } + static int + zio_ddt_read_done(zio_t *zio) + { + blkptr_t *bp = zio->io_bp; + + if (zio_wait_for_children(zio, ZIO_CHILD_DDT, ZIO_WAIT_DONE)) + return (ZIO_PIPELINE_STOP); + + ASSERT(BP_GET_DEDUP(bp)); + ASSERT(BP_GET_PSIZE(bp) == zio->io_size); + ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); + + if (zio->io_child_error[ZIO_CHILD_DDT]) { + ddt_t *ddt = ddt_select(zio->io_spa, bp); + ddt_entry_t *dde = zio->io_vsd; + if (ddt == NULL) { + ASSERT(spa_load_state(zio->io_spa) != SPA_LOAD_NONE); + return (ZIO_PIPELINE_CONTINUE); + } + if (dde == NULL) { + zio->io_stage = ZIO_STAGE_DDT_READ_START >> 1; + zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_FALSE); + return (ZIO_PIPELINE_STOP); + } + if (dde->dde_repair_data != NULL) { + bcopy(dde->dde_repair_data, zio->io_data, zio->io_size); + zio->io_child_error[ZIO_CHILD_DDT] = 0; + } + ddt_repair_done(ddt, dde); + zio->io_vsd = NULL; + } + + ASSERT(zio->io_vsd == NULL); + + return (ZIO_PIPELINE_CONTINUE); + } + + static boolean_t + zio_ddt_collision(zio_t *zio, ddt_t *ddt, ddt_entry_t *dde) + { + spa_t *spa = zio->io_spa; ++ int p; + + /* + * Note: we compare the original data, not the transformed data, + * because when zio->io_bp is an override bp, we will not have + * pushed the I/O transforms. That's an important optimization + * because otherwise we'd compress/encrypt all dmu_sync() data twice. 
+ */ - for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) { ++ for (p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) { + zio_t *lio = dde->dde_lead_zio[p]; + + if (lio != NULL) { + return (lio->io_orig_size != zio->io_orig_size || + bcmp(zio->io_orig_data, lio->io_orig_data, + zio->io_orig_size) != 0); + } + } + - for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) { ++ for (p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) { + ddt_phys_t *ddp = &dde->dde_phys[p]; + + if (ddp->ddp_phys_birth != 0) { + arc_buf_t *abuf = NULL; + uint32_t aflags = ARC_WAIT; + blkptr_t blk = *zio->io_bp; + int error; + + ddt_bp_fill(ddp, &blk, ddp->ddp_phys_birth); + + ddt_exit(ddt); + + error = arc_read_nolock(NULL, spa, &blk, + arc_getbuf_func, &abuf, ZIO_PRIORITY_SYNC_READ, + ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE, + &aflags, &zio->io_bookmark); + + if (error == 0) { + if (arc_buf_size(abuf) != zio->io_orig_size || + bcmp(abuf->b_data, zio->io_orig_data, + zio->io_orig_size) != 0) + error = EEXIST; + VERIFY(arc_buf_remove_ref(abuf, &abuf) == 1); + } + + ddt_enter(ddt); + return (error != 0); + } + } + + return (B_FALSE); + } + + static void + zio_ddt_child_write_ready(zio_t *zio) + { + int p = zio->io_prop.zp_copies; + ddt_t *ddt = ddt_select(zio->io_spa, zio->io_bp); + ddt_entry_t *dde = zio->io_private; + ddt_phys_t *ddp = &dde->dde_phys[p]; + zio_t *pio; + + if (zio->io_error) + return; + + ddt_enter(ddt); + + ASSERT(dde->dde_lead_zio[p] == zio); + + ddt_phys_fill(ddp, zio->io_bp); + + while ((pio = zio_walk_parents(zio)) != NULL) + ddt_bp_fill(ddp, pio->io_bp, zio->io_txg); + + ddt_exit(ddt); + } + + static void + zio_ddt_child_write_done(zio_t *zio) + { + int p = zio->io_prop.zp_copies; + ddt_t *ddt = ddt_select(zio->io_spa, zio->io_bp); + ddt_entry_t *dde = zio->io_private; + ddt_phys_t *ddp = &dde->dde_phys[p]; + + ddt_enter(ddt); + + ASSERT(ddp->ddp_refcnt == 0); + ASSERT(dde->dde_lead_zio[p] == zio); + dde->dde_lead_zio[p] = NULL; + + if (zio->io_error == 0) { + 
while (zio_walk_parents(zio) != NULL) + ddt_phys_addref(ddp); + } else { + ddt_phys_clear(ddp); + } + + ddt_exit(ddt); + } + + static void + zio_ddt_ditto_write_done(zio_t *zio) + { + int p = DDT_PHYS_DITTO; + zio_prop_t *zp = &zio->io_prop; + blkptr_t *bp = zio->io_bp; + ddt_t *ddt = ddt_select(zio->io_spa, bp); + ddt_entry_t *dde = zio->io_private; + ddt_phys_t *ddp = &dde->dde_phys[p]; + ddt_key_t *ddk = &dde->dde_key; + + ddt_enter(ddt); + + ASSERT(ddp->ddp_refcnt == 0); + ASSERT(dde->dde_lead_zio[p] == zio); + dde->dde_lead_zio[p] = NULL; + + if (zio->io_error == 0) { + ASSERT(ZIO_CHECKSUM_EQUAL(bp->blk_cksum, ddk->ddk_cksum)); + ASSERT(zp->zp_copies < SPA_DVAS_PER_BP); + ASSERT(zp->zp_copies == BP_GET_NDVAS(bp) - BP_IS_GANG(bp)); + if (ddp->ddp_phys_birth != 0) + ddt_phys_free(ddt, ddk, ddp, zio->io_txg); + ddt_phys_fill(ddp, bp); + } + + ddt_exit(ddt); + } + + static int + zio_ddt_write(zio_t *zio) + { + spa_t *spa = zio->io_spa; + blkptr_t *bp = zio->io_bp; + uint64_t txg = zio->io_txg; + zio_prop_t *zp = &zio->io_prop; + int p = zp->zp_copies; + int ditto_copies; + zio_t *cio = NULL; + zio_t *dio = NULL; + ddt_t *ddt = ddt_select(spa, bp); + ddt_entry_t *dde; + ddt_phys_t *ddp; + + ASSERT(BP_GET_DEDUP(bp)); + ASSERT(BP_GET_CHECKSUM(bp) == zp->zp_checksum); + ASSERT(BP_IS_HOLE(bp) || zio->io_bp_override); + + ddt_enter(ddt); + dde = ddt_lookup(ddt, bp, B_TRUE); + ddp = &dde->dde_phys[p]; + + if (zp->zp_dedup_verify && zio_ddt_collision(zio, ddt, dde)) { + /* + * If we're using a weak checksum, upgrade to a strong checksum + * and try again. If we're already using a strong checksum, + * we can't resolve it, so just convert to an ordinary write. + * (And automatically e-mail a paper to Nature?) 
+ */ + if (!zio_checksum_table[zp->zp_checksum].ci_dedup) { + zp->zp_checksum = spa_dedup_checksum(spa); + zio_pop_transforms(zio); + zio->io_stage = ZIO_STAGE_OPEN; + BP_ZERO(bp); + } else { + zp->zp_dedup = 0; + } + zio->io_pipeline = ZIO_WRITE_PIPELINE; + ddt_exit(ddt); + return (ZIO_PIPELINE_CONTINUE); + } + + ditto_copies = ddt_ditto_copies_needed(ddt, dde, ddp); + ASSERT(ditto_copies < SPA_DVAS_PER_BP); + + if (ditto_copies > ddt_ditto_copies_present(dde) && + dde->dde_lead_zio[DDT_PHYS_DITTO] == NULL) { + zio_prop_t czp = *zp; + + czp.zp_copies = ditto_copies; + + /* + * If we arrived here with an override bp, we won't have run + * the transform stack, so we won't have the data we need to + * generate a child i/o. So, toss the override bp and restart. + * This is safe, because using the override bp is just an + * optimization; and it's rare, so the cost doesn't matter. + */ + if (zio->io_bp_override) { + zio_pop_transforms(zio); + zio->io_stage = ZIO_STAGE_OPEN; + zio->io_pipeline = ZIO_WRITE_PIPELINE; + zio->io_bp_override = NULL; + BP_ZERO(bp); + ddt_exit(ddt); + return (ZIO_PIPELINE_CONTINUE); + } + + dio = zio_write(zio, spa, txg, bp, zio->io_orig_data, + zio->io_orig_size, &czp, NULL, + zio_ddt_ditto_write_done, dde, zio->io_priority, + ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark); + + zio_push_transform(dio, zio->io_data, zio->io_size, 0, NULL); + dde->dde_lead_zio[DDT_PHYS_DITTO] = dio; + } + + if (ddp->ddp_phys_birth != 0 || dde->dde_lead_zio[p] != NULL) { + if (ddp->ddp_phys_birth != 0) + ddt_bp_fill(ddp, bp, txg); + if (dde->dde_lead_zio[p] != NULL) + zio_add_child(zio, dde->dde_lead_zio[p]); + else + ddt_phys_addref(ddp); + } else if (zio->io_bp_override) { + ASSERT(bp->blk_birth == txg); + ASSERT(BP_EQUAL(bp, zio->io_bp_override)); + ddt_phys_fill(ddp, bp); + ddt_phys_addref(ddp); + } else { + cio = zio_write(zio, spa, txg, bp, zio->io_orig_data, + zio->io_orig_size, zp, zio_ddt_child_write_ready, + zio_ddt_child_write_done, dde, 
zio->io_priority, + ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark); + + zio_push_transform(cio, zio->io_data, zio->io_size, 0, NULL); + dde->dde_lead_zio[p] = cio; + } + + ddt_exit(ddt); + + if (cio) + zio_nowait(cio); + if (dio) + zio_nowait(dio); + + return (ZIO_PIPELINE_CONTINUE); + } + + ddt_entry_t *freedde; /* for debugging */ + + static int + zio_ddt_free(zio_t *zio) + { + spa_t *spa = zio->io_spa; + blkptr_t *bp = zio->io_bp; + ddt_t *ddt = ddt_select(spa, bp); + ddt_entry_t *dde; + ddt_phys_t *ddp; + + ASSERT(BP_GET_DEDUP(bp)); + ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); + + ddt_enter(ddt); + freedde = dde = ddt_lookup(ddt, bp, B_TRUE); + ddp = ddt_phys_select(dde, bp); + ddt_phys_decref(ddp); + ddt_exit(ddt); + + return (ZIO_PIPELINE_CONTINUE); + } + + /* + * ========================================================================== + * Allocate and free blocks + * ========================================================================== + */ static int zio_dva_allocate(zio_t *zio) { @@@ -1680,40 -2161,14 +2171,14 @@@ zio_dva_claim(zio_t *zio static void zio_dva_unallocate(zio_t *zio, zio_gang_node_t *gn, blkptr_t *bp) { - spa_t *spa = zio->io_spa; - boolean_t now = !(zio->io_flags & ZIO_FLAG_IO_REWRITE); - int g; - ASSERT(bp->blk_birth == zio->io_txg || BP_IS_HOLE(bp)); - - if (zio->io_bp == bp && !now) { - /* - * This is a rewrite for sync-to-convergence. - * We can't do a metaslab_free(NOW) because bp wasn't allocated - * during this sync pass, which means that metaslab_sync() - * already committed the allocation. - */ - ASSERT(DVA_EQUAL(BP_IDENTITY(bp), - BP_IDENTITY(&zio->io_bp_orig))); - ASSERT(spa_sync_pass(spa) > 1); - - if (BP_IS_GANG(bp) && gn == NULL) { - /* - * This is a gang leader whose gang header(s) we - * couldn't read now, so defer the free until later. - * The block should still be intact because without - * the headers, we'd never even start the rewrite. 
- */ - bplist_enqueue_deferred(&spa->spa_sync_bplist, bp); - return; - } - } + ASSERT(zio->io_bp_override == NULL); if (!BP_IS_HOLE(bp)) - metaslab_free(spa, bp, bp->blk_birth, now); + metaslab_free(zio->io_spa, bp, bp->blk_birth, B_TRUE); if (gn != NULL) { - for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) { + for (g = 0; g < SPA_GBH_NBLKPTRS; g++) { zio_dva_unallocate(zio, gn->gn_child[g], &gn->gn_gbh->zg_blkptr[g]); }