OpenZFS 9166 - zfs storage pool checkpoint
author     Serapheim Dimitropoulos <serapheim.dimitro@delphix.com>
           Fri, 16 Dec 2016 22:11:29 +0000 (14:11 -0800)
committer  Brian Behlendorf <behlendorf1@llnl.gov>
           Tue, 26 Jun 2018 17:07:42 +0000 (10:07 -0700)
Details about the motivation for this feature and its usage can
be found in this blog post:

    https://sdimitro.github.io/post/zpool-checkpoint/

A lightning talk about this feature can be found here:
https://www.youtube.com/watch?v=fPQA8K40jAM

Implementation details can be found in the big block comment of
spa_checkpoint.c.

Side-changes that are relevant to this commit but not explained
elsewhere:

* renames members of "struct metaslab" trees to be shorter without
  losing meaning

* space_map_{alloc,truncate}() accept a block size as a
  parameter. The reason is that in the current state all space
  maps that we allocate through the DMU use a global tunable
  (space_map_blksz) which defaults to 4KB. This is fine for
  metaslab space maps in terms of bandwidth since they are
  scattered all over the disk. But for other space maps this
  default is probably not what we want. Examples are device
  removal's vdev_obsolete_sm or vdev_checkpoint_sm from this
  change. Both of these have a 1:1 relationship with each
  top-level vdev and could benefit from a bigger block size
  (see the sketch after this list).

Porting notes:

* The part of dsl_scan_sync() which handles async destroys has
  been moved into the new dsl_process_async_destroys() function.

* Remove "VERIFY(!(flags & FWRITE))" in "kernel.c" so zhack can write
  to block device backed pools.

* ZTS:
  * Fix get_txg() in zpool_sync_001_pos due to "checkpoint_txg".

  * Don't use large dd block sizes on /dev/urandom under Linux in
    checkpoint_capacity.

  * Adopt Delphix-OS's setting of 4 (spa_asize_inflation =
    SPA_DVAS_PER_BP + 1) for the checkpoint_capacity test to speed
    up its attempts to fill the pool.

  * Create the base and nested pools with sync=disabled to speed up
    the "setup" phase.

  * Clear labels in test pool between checkpoint tests to avoid
    duplicate pool issues.

  * The import_rewind_device_replaced test has been marked as "known
    to fail" for the reasons listed in its DISCLAIMER.

  * New module parameters:

      zfs_spa_discard_memory_limit
      zfs_remove_max_bytes_pause (not documented - debugging only)
      vdev_max_ms_count (formerly metaslabs_per_vdev)
      vdev_min_ms_count

Authored by: Serapheim Dimitropoulos <serapheim.dimitro@delphix.com>
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed by: John Kennedy <john.kennedy@delphix.com>
Reviewed by: Dan Kimmel <dan.kimmel@delphix.com>
Reviewed by: Brian Behlendorf <behlendorf1@llnl.gov>
Approved by: Richard Lowe <richlowe@richlowe.net>
Ported-by: Tim Chase <tim@chase2k.com>
Signed-off-by: Tim Chase <tim@chase2k.com>
OpenZFS-issue: https://illumos.org/issues/9166
OpenZFS-commit: https://github.com/openzfs/openzfs/commit/7159fdb8
Closes #7570

115 files changed:
cmd/zdb/zdb.c
cmd/zdb/zdb_il.c
cmd/zpool/zpool_main.c
cmd/ztest/ztest.c
configure.ac
include/libzfs.h
include/libzfs_core.h
include/sys/Makefile.am
include/sys/dmu.h
include/sys/dsl_dir.h
include/sys/dsl_pool.h
include/sys/dsl_synctask.h
include/sys/fs/zfs.h
include/sys/metaslab.h
include/sys/metaslab_impl.h
include/sys/range_tree.h
include/sys/spa.h
include/sys/spa_checkpoint.h [new file with mode: 0644]
include/sys/spa_impl.h
include/sys/space_map.h
include/sys/uberblock_impl.h
include/sys/vdev.h
include/sys/vdev_impl.h
include/sys/vdev_removal.h
include/sys/zio.h
include/sys/zthr.h
include/zfeature_common.h
lib/libzfs/libzfs_pool.c
lib/libzfs/libzfs_util.c
lib/libzfs_core/libzfs_core.c
lib/libzpool/Makefile.am
lib/libzpool/kernel.c
man/man5/zfs-module-parameters.5
man/man5/zpool-features.5
man/man8/zdb.8
man/man8/zpool.8
module/zcommon/zfeature_common.c
module/zcommon/zpool_prop.c
module/zfs/Makefile.in
module/zfs/dmu_traverse.c
module/zfs/dnode.c
module/zfs/dnode_sync.c
module/zfs/dsl_dataset.c
module/zfs/dsl_destroy.c
module/zfs/dsl_dir.c
module/zfs/dsl_pool.c
module/zfs/dsl_scan.c
module/zfs/dsl_synctask.c
module/zfs/dsl_userhold.c
module/zfs/metaslab.c
module/zfs/range_tree.c
module/zfs/spa.c
module/zfs/spa_checkpoint.c [new file with mode: 0644]
module/zfs/spa_misc.c
module/zfs/space_map.c
module/zfs/uberblock.c
module/zfs/vdev.c
module/zfs/vdev_indirect.c
module/zfs/vdev_label.c
module/zfs/vdev_removal.c
module/zfs/zcp.c
module/zfs/zcp_synctask.c
module/zfs/zfs_ioctl.c
module/zfs/zil.c
module/zfs/zio.c
module/zfs/zthr.c
tests/runfiles/linux.run
tests/runfiles/longevity.run [new file with mode: 0644]
tests/zfs-tests/cmd/Makefile.am
tests/zfs-tests/cmd/randwritecomp/.gitignore [new file with mode: 0644]
tests/zfs-tests/cmd/randwritecomp/Makefile.am [new file with mode: 0644]
tests/zfs-tests/cmd/randwritecomp/randwritecomp.c [new file with mode: 0644]
tests/zfs-tests/include/commands.cfg
tests/zfs-tests/include/libtest.shlib
tests/zfs-tests/tests/functional/Makefile.am
tests/zfs-tests/tests/functional/cli_root/zdb/zdb_001_neg.ksh
tests/zfs-tests/tests/functional/cli_root/zpool_get/zpool_get.cfg
tests/zfs-tests/tests/functional/cli_root/zpool_import/import_rewind_config_changed.ksh
tests/zfs-tests/tests/functional/cli_root/zpool_sync/zpool_sync_001_pos.ksh
tests/zfs-tests/tests/functional/pool_checkpoint/Makefile.am [new file with mode: 0644]
tests/zfs-tests/tests/functional/pool_checkpoint/checkpoint_after_rewind.ksh [new file with mode: 0755]
tests/zfs-tests/tests/functional/pool_checkpoint/checkpoint_big_rewind.ksh [new file with mode: 0755]
tests/zfs-tests/tests/functional/pool_checkpoint/checkpoint_capacity.ksh [new file with mode: 0755]
tests/zfs-tests/tests/functional/pool_checkpoint/checkpoint_conf_change.ksh [new file with mode: 0755]
tests/zfs-tests/tests/functional/pool_checkpoint/checkpoint_discard.ksh [new file with mode: 0755]
tests/zfs-tests/tests/functional/pool_checkpoint/checkpoint_discard_busy.ksh [new file with mode: 0755]
tests/zfs-tests/tests/functional/pool_checkpoint/checkpoint_discard_many.ksh [new file with mode: 0755]
tests/zfs-tests/tests/functional/pool_checkpoint/checkpoint_indirect.ksh [new file with mode: 0755]
tests/zfs-tests/tests/functional/pool_checkpoint/checkpoint_invalid.ksh [new file with mode: 0755]
tests/zfs-tests/tests/functional/pool_checkpoint/checkpoint_lun_expsz.ksh [new file with mode: 0755]
tests/zfs-tests/tests/functional/pool_checkpoint/checkpoint_open.ksh [new file with mode: 0755]
tests/zfs-tests/tests/functional/pool_checkpoint/checkpoint_removal.ksh [new file with mode: 0755]
tests/zfs-tests/tests/functional/pool_checkpoint/checkpoint_rewind.ksh [new file with mode: 0755]
tests/zfs-tests/tests/functional/pool_checkpoint/checkpoint_ro_rewind.ksh [new file with mode: 0755]
tests/zfs-tests/tests/functional/pool_checkpoint/checkpoint_sm_scale.ksh [new file with mode: 0755]
tests/zfs-tests/tests/functional/pool_checkpoint/checkpoint_twice.ksh [new file with mode: 0755]
tests/zfs-tests/tests/functional/pool_checkpoint/checkpoint_vdev_add.ksh [new file with mode: 0755]
tests/zfs-tests/tests/functional/pool_checkpoint/checkpoint_zdb.ksh [new file with mode: 0755]
tests/zfs-tests/tests/functional/pool_checkpoint/checkpoint_zhack_feat.ksh [new file with mode: 0755]
tests/zfs-tests/tests/functional/pool_checkpoint/cleanup.ksh [new file with mode: 0755]
tests/zfs-tests/tests/functional/pool_checkpoint/pool_checkpoint.kshlib [new file with mode: 0644]
tests/zfs-tests/tests/functional/pool_checkpoint/setup.ksh [new file with mode: 0755]
tests/zfs-tests/tests/functional/removal/removal.kshlib
tests/zfs-tests/tests/functional/removal/removal_remap_deadlists.ksh
tests/zfs-tests/tests/functional/removal/removal_reservation.ksh
tests/zfs-tests/tests/functional/removal/removal_with_add.ksh
tests/zfs-tests/tests/functional/removal/removal_with_create_fs.ksh
tests/zfs-tests/tests/functional/removal/removal_with_export.ksh
tests/zfs-tests/tests/functional/removal/removal_with_remap.ksh
tests/zfs-tests/tests/functional/removal/removal_with_remove.ksh
tests/zfs-tests/tests/functional/removal/removal_with_scrub.ksh
tests/zfs-tests/tests/functional/removal/removal_with_send.ksh
tests/zfs-tests/tests/functional/removal/removal_with_send_recv.ksh
tests/zfs-tests/tests/functional/removal/removal_with_snapshot.ksh
tests/zfs-tests/tests/functional/removal/removal_with_zdb.ksh

index faac43c792aa05b38f24fce422df16fea40c5c46..d1e77cce7f58b043657c15fe12f84542e30943ab 100644 (file)
@@ -131,7 +131,7 @@ static void
 usage(void)
 {
        (void) fprintf(stderr,
-           "Usage:\t%s [-AbcdDFGhiLMPsvX] [-e [-V] [-p <path> ...]] "
+           "Usage:\t%s [-AbcdDFGhikLMPsvX] [-e [-V] [-p <path> ...]] "
            "[-I <inflight I/Os>]\n"
            "\t\t[-o <var>=<value>]... [-t <txg>] [-U <cache>] [-x <dumpdir>]\n"
            "\t\t[<poolname> [<object> ...]]\n"
@@ -168,6 +168,8 @@ usage(void)
        (void) fprintf(stderr, "        -h pool history\n");
        (void) fprintf(stderr, "        -i intent logs\n");
        (void) fprintf(stderr, "        -l read label contents\n");
+       (void) fprintf(stderr, "        -k examine the checkpointed state "
+           "of the pool\n");
        (void) fprintf(stderr, "        -L disable leak tracking (do not "
            "load spacemaps)\n");
        (void) fprintf(stderr, "        -m metaslabs\n");
@@ -730,6 +732,22 @@ get_prev_obsolete_spacemap_refcount(spa_t *spa)
        return (0);
 }
 
+static int
+get_checkpoint_refcount(vdev_t *vd)
+{
+       int refcount = 0;
+
+       if (vd->vdev_top == vd && vd->vdev_top_zap != 0 &&
+           zap_contains(spa_meta_objset(vd->vdev_spa),
+           vd->vdev_top_zap, VDEV_TOP_ZAP_POOL_CHECKPOINT_SM) == 0)
+               refcount++;
+
+       for (uint64_t c = 0; c < vd->vdev_children; c++)
+               refcount += get_checkpoint_refcount(vd->vdev_child[c]);
+
+       return (refcount);
+}
+
 static int
 verify_spacemap_refcounts(spa_t *spa)
 {
@@ -743,6 +761,7 @@ verify_spacemap_refcounts(spa_t *spa)
        actual_refcount += get_metaslab_refcount(spa->spa_root_vdev);
        actual_refcount += get_obsolete_refcount(spa->spa_root_vdev);
        actual_refcount += get_prev_obsolete_spacemap_refcount(spa);
+       actual_refcount += get_checkpoint_refcount(spa->spa_root_vdev);
 
        if (expected_refcount != actual_refcount) {
                (void) printf("space map refcount mismatch: expected %lld != "
@@ -816,8 +835,8 @@ static void
 dump_metaslab_stats(metaslab_t *msp)
 {
        char maxbuf[32];
-       range_tree_t *rt = msp->ms_tree;
-       avl_tree_t *t = &msp->ms_size_tree;
+       range_tree_t *rt = msp->ms_allocatable;
+       avl_tree_t *t = &msp->ms_allocatable_by_size;
        int free_pct = range_tree_space(rt) * 100 / msp->ms_size;
 
        /* max sure nicenum has enough space */
@@ -853,7 +872,7 @@ dump_metaslab(metaslab_t *msp)
                metaslab_load_wait(msp);
                if (!msp->ms_loaded) {
                        VERIFY0(metaslab_load(msp));
-                       range_tree_stat_verify(msp->ms_tree);
+                       range_tree_stat_verify(msp->ms_allocatable);
                }
                dump_metaslab_stats(msp);
                metaslab_unload(msp);
@@ -2420,6 +2439,8 @@ dump_uberblock(uberblock_t *ub, const char *header, const char *footer)
                snprintf_blkptr(blkbuf, sizeof (blkbuf), &ub->ub_rootbp);
                (void) printf("\trootbp = %s\n", blkbuf);
        }
+       (void) printf("\tcheckpoint_txg = %llu\n",
+           (u_longlong_t)ub->ub_checkpoint_txg);
        (void) printf("%s", footer ? footer : "");
 }
 
@@ -3129,6 +3150,7 @@ static const char *zdb_ot_extname[] = {
 typedef struct zdb_cb {
        zdb_blkstats_t  zcb_type[ZB_TOTAL + 1][ZDB_OT_TOTAL + 1];
        uint64_t        zcb_removing_size;
+       uint64_t        zcb_checkpoint_size;
        uint64_t        zcb_dedup_asize;
        uint64_t        zcb_dedup_blocks;
        uint64_t        zcb_embedded_blocks[NUM_BP_EMBEDDED_TYPES];
@@ -3229,7 +3251,7 @@ zdb_count_block(zdb_cb_t *zcb, zilog_t *zilog, const blkptr_t *bp,
        }
 
        VERIFY3U(zio_wait(zio_claim(NULL, zcb->zcb_spa,
-           refcnt ? 0 : spa_first_txg(zcb->zcb_spa),
+           refcnt ? 0 : spa_min_claim_txg(zcb->zcb_spa),
            bp, NULL, NULL, ZIO_FLAG_CANFAIL)), ==, 0);
 }
 
@@ -3388,7 +3410,7 @@ claim_segment_impl_cb(uint64_t inner_offset, vdev_t *vd, uint64_t offset,
        ASSERT(vdev_is_concrete(vd));
 
        VERIFY0(metaslab_claim_impl(vd, offset, size,
-           spa_first_txg(vd->vdev_spa)));
+           spa_min_claim_txg(vd->vdev_spa)));
 }
 
 static void
@@ -3453,70 +3475,6 @@ zdb_claim_removing(spa_t *spa, zdb_cb_t *zcb)
        spa_config_exit(spa, SCL_CONFIG, FTAG);
 }
 
-/*
- * vm_idxp is an in-out parameter which (for indirect vdevs) is the
- * index in vim_entries that has the first entry in this metaslab.  On
- * return, it will be set to the first entry after this metaslab.
- */
-static void
-zdb_leak_init_ms(metaslab_t *msp, uint64_t *vim_idxp)
-{
-       metaslab_group_t *mg = msp->ms_group;
-       vdev_t *vd = mg->mg_vd;
-       vdev_t *rvd = vd->vdev_spa->spa_root_vdev;
-
-       mutex_enter(&msp->ms_lock);
-       metaslab_unload(msp);
-
-       /*
-        * We don't want to spend the CPU manipulating the size-ordered
-        * tree, so clear the range_tree ops.
-        */
-       msp->ms_tree->rt_ops = NULL;
-
-       (void) fprintf(stderr,
-           "\rloading vdev %llu of %llu, metaslab %llu of %llu ...",
-           (longlong_t)vd->vdev_id,
-           (longlong_t)rvd->vdev_children,
-           (longlong_t)msp->ms_id,
-           (longlong_t)vd->vdev_ms_count);
-
-       /*
-        * For leak detection, we overload the metaslab ms_tree to
-        * contain allocated segments instead of free segments. As a
-        * result, we can't use the normal metaslab_load/unload
-        * interfaces.
-        */
-       if (vd->vdev_ops == &vdev_indirect_ops) {
-               vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;
-               for (; *vim_idxp < vdev_indirect_mapping_num_entries(vim);
-                   (*vim_idxp)++) {
-                       vdev_indirect_mapping_entry_phys_t *vimep =
-                           &vim->vim_entries[*vim_idxp];
-                       uint64_t ent_offset = DVA_MAPPING_GET_SRC_OFFSET(vimep);
-                       uint64_t ent_len = DVA_GET_ASIZE(&vimep->vimep_dst);
-                       ASSERT3U(ent_offset, >=, msp->ms_start);
-                       if (ent_offset >= msp->ms_start + msp->ms_size)
-                               break;
-
-                       /*
-                        * Mappings do not cross metaslab boundaries,
-                        * because we create them by walking the metaslabs.
-                        */
-                       ASSERT3U(ent_offset + ent_len, <=,
-                           msp->ms_start + msp->ms_size);
-                       range_tree_add(msp->ms_tree, ent_offset, ent_len);
-               }
-       } else if (msp->ms_sm != NULL) {
-               VERIFY0(space_map_load(msp->ms_sm, msp->ms_tree, SM_ALLOC));
-       }
-
-       if (!msp->ms_loaded) {
-               msp->ms_loaded = B_TRUE;
-       }
-       mutex_exit(&msp->ms_lock);
-}
-
 /* ARGSUSED */
 static int
 increment_indirect_mapping_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
@@ -3615,11 +3573,246 @@ zdb_ddt_leak_init(spa_t *spa, zdb_cb_t *zcb)
        ASSERT(error == ENOENT);
 }
 
+typedef struct checkpoint_sm_exclude_entry_arg {
+       vdev_t *cseea_vd;
+       uint64_t cseea_checkpoint_size;
+} checkpoint_sm_exclude_entry_arg_t;
+
+static int
+checkpoint_sm_exclude_entry_cb(maptype_t type, uint64_t offset, uint64_t size,
+    void *arg)
+{
+       checkpoint_sm_exclude_entry_arg_t *cseea = arg;
+       vdev_t *vd = cseea->cseea_vd;
+       metaslab_t *ms = vd->vdev_ms[offset >> vd->vdev_ms_shift];
+       uint64_t end = offset + size;
+
+       ASSERT(type == SM_FREE);
+
+       /*
+        * Since the vdev_checkpoint_sm exists in the vdev level
+        * and the ms_sm space maps exist in the metaslab level,
+        * an entry in the checkpoint space map could theoretically
+        * cross the boundaries of the metaslab that it belongs.
+        *
+        * In reality, because of the way that we populate and
+        * manipulate the checkpoint's space maps currently,
+        * there shouldn't be any entries that cross metaslabs.
+        * Hence the assertion below.
+        *
+        * That said, there is no fundamental requirement that
+        * the checkpoint's space map entries should not cross
+        * metaslab boundaries. So if needed we could add code
+        * that handles metaslab-crossing segments in the future.
+        */
+       VERIFY3U(offset, >=, ms->ms_start);
+       VERIFY3U(end, <=, ms->ms_start + ms->ms_size);
+
+       /*
+        * By removing the entry from the allocated segments we
+        * also verify that the entry is there to begin with.
+        */
+       mutex_enter(&ms->ms_lock);
+       range_tree_remove(ms->ms_allocatable, offset, size);
+       mutex_exit(&ms->ms_lock);
+
+       cseea->cseea_checkpoint_size += size;
+       return (0);
+}
+
+static void
+zdb_leak_init_vdev_exclude_checkpoint(vdev_t *vd, zdb_cb_t *zcb)
+{
+       spa_t *spa = vd->vdev_spa;
+       space_map_t *checkpoint_sm = NULL;
+       uint64_t checkpoint_sm_obj;
+
+       /*
+        * If there is no vdev_top_zap, we are in a pool whose
+        * version predates the pool checkpoint feature.
+        */
+       if (vd->vdev_top_zap == 0)
+               return;
+
+       /*
+        * If there is no reference of the vdev_checkpoint_sm in
+        * the vdev_top_zap, then one of the following scenarios
+        * is true:
+        *
+        * 1] There is no checkpoint
+        * 2] There is a checkpoint, but no checkpointed blocks
+        *    have been freed yet
+        * 3] The current vdev is indirect
+        *
+        * In these cases we return immediately.
+        */
+       if (zap_contains(spa_meta_objset(spa), vd->vdev_top_zap,
+           VDEV_TOP_ZAP_POOL_CHECKPOINT_SM) != 0)
+               return;
+
+       VERIFY0(zap_lookup(spa_meta_objset(spa), vd->vdev_top_zap,
+           VDEV_TOP_ZAP_POOL_CHECKPOINT_SM, sizeof (uint64_t), 1,
+           &checkpoint_sm_obj));
+
+       checkpoint_sm_exclude_entry_arg_t cseea;
+       cseea.cseea_vd = vd;
+       cseea.cseea_checkpoint_size = 0;
+
+       VERIFY0(space_map_open(&checkpoint_sm, spa_meta_objset(spa),
+           checkpoint_sm_obj, 0, vd->vdev_asize, vd->vdev_ashift));
+       space_map_update(checkpoint_sm);
+
+       VERIFY0(space_map_iterate(checkpoint_sm,
+           checkpoint_sm_exclude_entry_cb, &cseea));
+       space_map_close(checkpoint_sm);
+
+       zcb->zcb_checkpoint_size += cseea.cseea_checkpoint_size;
+}
+
+static void
+zdb_leak_init_exclude_checkpoint(spa_t *spa, zdb_cb_t *zcb)
+{
+       vdev_t *rvd = spa->spa_root_vdev;
+       for (uint64_t c = 0; c < rvd->vdev_children; c++) {
+               ASSERT3U(c, ==, rvd->vdev_child[c]->vdev_id);
+               zdb_leak_init_vdev_exclude_checkpoint(rvd->vdev_child[c], zcb);
+       }
+}
+
+static void
+load_concrete_ms_allocatable_trees(spa_t *spa, maptype_t maptype)
+{
+       vdev_t *rvd = spa->spa_root_vdev;
+       for (uint64_t i = 0; i < rvd->vdev_children; i++) {
+               vdev_t *vd = rvd->vdev_child[i];
+
+               ASSERT3U(i, ==, vd->vdev_id);
+
+               if (vd->vdev_ops == &vdev_indirect_ops)
+                       continue;
+
+               for (uint64_t m = 0; m < vd->vdev_ms_count; m++) {
+                       metaslab_t *msp = vd->vdev_ms[m];
+
+                       (void) fprintf(stderr,
+                           "\rloading concrete vdev %llu, "
+                           "metaslab %llu of %llu ...",
+                           (longlong_t)vd->vdev_id,
+                           (longlong_t)msp->ms_id,
+                           (longlong_t)vd->vdev_ms_count);
+
+                       mutex_enter(&msp->ms_lock);
+                       metaslab_unload(msp);
+
+                       /*
+                        * We don't want to spend the CPU manipulating the
+                        * size-ordered tree, so clear the range_tree ops.
+                        */
+                       msp->ms_allocatable->rt_ops = NULL;
+
+                       if (msp->ms_sm != NULL) {
+                               VERIFY0(space_map_load(msp->ms_sm,
+                                   msp->ms_allocatable, maptype));
+                       }
+                       if (!msp->ms_loaded)
+                               msp->ms_loaded = B_TRUE;
+                       mutex_exit(&msp->ms_lock);
+               }
+       }
+}
+
+/*
+ * vm_idxp is an in-out parameter which (for indirect vdevs) is the
+ * index in vim_entries that has the first entry in this metaslab.
+ * On return, it will be set to the first entry after this metaslab.
+ */
+static void
+load_indirect_ms_allocatable_tree(vdev_t *vd, metaslab_t *msp,
+    uint64_t *vim_idxp)
+{
+       vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;
+
+       mutex_enter(&msp->ms_lock);
+       metaslab_unload(msp);
+
+       /*
+        * We don't want to spend the CPU manipulating the
+        * size-ordered tree, so clear the range_tree ops.
+        */
+       msp->ms_allocatable->rt_ops = NULL;
+
+       for (; *vim_idxp < vdev_indirect_mapping_num_entries(vim);
+           (*vim_idxp)++) {
+               vdev_indirect_mapping_entry_phys_t *vimep =
+                   &vim->vim_entries[*vim_idxp];
+               uint64_t ent_offset = DVA_MAPPING_GET_SRC_OFFSET(vimep);
+               uint64_t ent_len = DVA_GET_ASIZE(&vimep->vimep_dst);
+               ASSERT3U(ent_offset, >=, msp->ms_start);
+               if (ent_offset >= msp->ms_start + msp->ms_size)
+                       break;
+
+               /*
+                * Mappings do not cross metaslab boundaries,
+                * because we create them by walking the metaslabs.
+                */
+               ASSERT3U(ent_offset + ent_len, <=,
+                   msp->ms_start + msp->ms_size);
+               range_tree_add(msp->ms_allocatable, ent_offset, ent_len);
+       }
+
+       if (!msp->ms_loaded)
+               msp->ms_loaded = B_TRUE;
+       mutex_exit(&msp->ms_lock);
+}
+
+static void
+zdb_leak_init_prepare_indirect_vdevs(spa_t *spa, zdb_cb_t *zcb)
+{
+       vdev_t *rvd = spa->spa_root_vdev;
+       for (uint64_t c = 0; c < rvd->vdev_children; c++) {
+               vdev_t *vd = rvd->vdev_child[c];
+
+               ASSERT3U(c, ==, vd->vdev_id);
+
+               if (vd->vdev_ops != &vdev_indirect_ops)
+                       continue;
+
+               /*
+                * Note: we don't check for mapping leaks on
+                * removing vdevs because their ms_allocatable's
+                * are used to look for leaks in allocated space.
+                */
+               zcb->zcb_vd_obsolete_counts[c] = zdb_load_obsolete_counts(vd);
+
+               /*
+                * Normally, indirect vdevs don't have any
+                * metaslabs.  We want to set them up for
+                * zio_claim().
+                */
+               VERIFY0(vdev_metaslab_init(vd, 0));
+
+               vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;
+               uint64_t vim_idx = 0;
+               for (uint64_t m = 0; m < vd->vdev_ms_count; m++) {
+
+                       (void) fprintf(stderr,
+                           "\rloading indirect vdev %llu, "
+                           "metaslab %llu of %llu ...",
+                           (longlong_t)vd->vdev_id,
+                           (longlong_t)vd->vdev_ms[m]->ms_id,
+                           (longlong_t)vd->vdev_ms_count);
+
+                       load_indirect_ms_allocatable_tree(vd, vd->vdev_ms[m],
+                           &vim_idx);
+               }
+               ASSERT3U(vim_idx, ==, vdev_indirect_mapping_num_entries(vim));
+       }
+}
+
 static void
 zdb_leak_init(spa_t *spa, zdb_cb_t *zcb)
 {
        zcb->zcb_spa = spa;
-       uint64_t c;
 
        if (!dump_opt['L']) {
                dsl_pool_t *dp = spa->spa_dsl_pool;
@@ -3627,7 +3820,7 @@ zdb_leak_init(spa_t *spa, zdb_cb_t *zcb)
 
                /*
                 * We are going to be changing the meaning of the metaslab's
-                * ms_tree.  Ensure that the allocator doesn't try to
+                * ms_allocatable.  Ensure that the allocator doesn't try to
                 * use the tree.
                 */
                spa->spa_normal_class->mc_ops = &zdb_metaslab_ops;
@@ -3637,38 +3830,37 @@ zdb_leak_init(spa_t *spa, zdb_cb_t *zcb)
                    umem_zalloc(rvd->vdev_children * sizeof (uint32_t *),
                    UMEM_NOFAIL);
 
-               for (c = 0; c < rvd->vdev_children; c++) {
-                       vdev_t *vd = rvd->vdev_child[c];
-                       uint64_t vim_idx = 0;
-
-                       ASSERT3U(c, ==, vd->vdev_id);
-
-                       /*
-                        * Note: we don't check for mapping leaks on
-                        * removing vdevs because their ms_tree's are
-                        * used to look for leaks in allocated space.
-                        */
-                       if (vd->vdev_ops == &vdev_indirect_ops) {
-                               zcb->zcb_vd_obsolete_counts[c] =
-                                   zdb_load_obsolete_counts(vd);
+               /*
+                * For leak detection, we overload the ms_allocatable trees
+                * to contain allocated segments instead of free segments.
+                * As a result, we can't use the normal metaslab_load/unload
+                * interfaces.
+                */
+               zdb_leak_init_prepare_indirect_vdevs(spa, zcb);
+               load_concrete_ms_allocatable_trees(spa, SM_ALLOC);
 
-                               /*
-                                * Normally, indirect vdevs don't have any
-                                * metaslabs.  We want to set them up for
-                                * zio_claim().
-                                */
-                               VERIFY0(vdev_metaslab_init(vd, 0));
-                       }
+               /*
+                * On load_concrete_ms_allocatable_trees() we loaded all the
+                * allocated entries from the ms_sm to the ms_allocatable for
+                * each metaslab. If the pool has a checkpoint or is in the
+                * middle of discarding a checkpoint, some of these blocks
+                * may have been freed but their ms_sm may not have been
+                * updated because they are referenced by the checkpoint. In
+                * order to avoid false-positives during leak-detection, we
+                * go through the vdev's checkpoint space map and exclude all
+                * its entries from their relevant ms_allocatable.
+                *
+                * We also aggregate the space held by the checkpoint and add
+                * it to zcb_checkpoint_size.
+                *
+                * Note that at this point we are also verifying that all the
+                * entries on the checkpoint_sm are marked as allocated in
+                * the ms_sm of their relevant metaslab.
+                * [see comment in checkpoint_sm_exclude_entry_cb()]
+                */
+               zdb_leak_init_exclude_checkpoint(spa, zcb);
 
-                       for (uint64_t m = 0; m < vd->vdev_ms_count; m++) {
-                               zdb_leak_init_ms(vd->vdev_ms[m], &vim_idx);
-                       }
-                       if (vd->vdev_ops == &vdev_indirect_ops) {
-                               ASSERT3U(vim_idx, ==,
-                                   vdev_indirect_mapping_num_entries(
-                                   vd->vdev_indirect_mapping));
-                       }
-               }
+               /* for cleaner progress output */
                (void) fprintf(stderr, "\n");
 
                if (bpobj_is_open(&dp->dp_obsolete_bpobj)) {
@@ -3677,12 +3869,16 @@ zdb_leak_init(spa_t *spa, zdb_cb_t *zcb)
                        (void) bpobj_iterate_nofree(&dp->dp_obsolete_bpobj,
                            increment_indirect_mapping_cb, zcb, NULL);
                }
+       } else {
+               /*
+                * If leak tracing is disabled, we still need to consider
+                * any checkpointed space in our space verification.
+                */
+               zcb->zcb_checkpoint_size += spa_get_checkpoint_space(spa);
        }
 
        spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
-
        zdb_ddt_leak_init(spa, zcb);
-
        spa_config_exit(spa, SCL_CONFIG, FTAG);
 }
 
@@ -3709,7 +3905,7 @@ zdb_check_for_obsolete_leaks(vdev_t *vd, zdb_cb_t *zcb)
                for (uint64_t inner_offset = 0;
                    inner_offset < DVA_GET_ASIZE(&vimep->vimep_dst);
                    inner_offset += 1 << vd->vdev_ashift) {
-                       if (range_tree_contains(msp->ms_tree,
+                       if (range_tree_contains(msp->ms_allocatable,
                            offset + inner_offset, 1 << vd->vdev_ashift)) {
                                obsolete_bytes += 1 << vd->vdev_ashift;
                        }
@@ -3775,23 +3971,23 @@ zdb_leak_fini(spa_t *spa, zdb_cb_t *zcb)
                                ASSERT3P(mg, ==, msp->ms_group);
 
                                /*
-                                * The ms_tree has been overloaded to
-                                * contain allocated segments. Now that we
-                                * finished traversing all blocks, any
-                                * block that remains in the ms_tree
+                                * ms_allocatable has been overloaded
+                                * to contain allocated segments. Now that
+                                * we finished traversing all blocks, any
+                                * block that remains in the ms_allocatable
                                 * represents an allocated block that we
                                 * did not claim during the traversal.
                                 * Claimed blocks would have been removed
-                                * from the ms_tree.  For indirect vdevs,
-                                * space remaining in the tree represents
-                                * parts of the mapping that are not
-                                * referenced, which is not a bug.
+                                * from the ms_allocatable.  For indirect
+                                * vdevs, space remaining in the tree
+                                * represents parts of the mapping that are
+                                * not referenced, which is not a bug.
                                 */
                                if (vd->vdev_ops == &vdev_indirect_ops) {
-                                       range_tree_vacate(msp->ms_tree,
+                                       range_tree_vacate(msp->ms_allocatable,
                                            NULL, NULL);
                                } else {
-                                       range_tree_vacate(msp->ms_tree,
+                                       range_tree_vacate(msp->ms_allocatable,
                                            zdb_leak, vd);
                                }
 
@@ -3923,7 +4119,7 @@ dump_block_stats(spa_t *spa)
 
        total_alloc = norm_alloc + metaslab_class_get_alloc(spa_log_class(spa));
        total_found = tzb->zb_asize - zcb.zcb_dedup_asize +
-           zcb.zcb_removing_size;
+           zcb.zcb_removing_size + zcb.zcb_checkpoint_size;
 
        if (total_found == total_alloc) {
                if (!dump_opt['L'])
@@ -4332,6 +4528,390 @@ verify_device_removal_feature_counts(spa_t *spa)
        return (ret);
 }
 
+#define        BOGUS_SUFFIX "_CHECKPOINTED_UNIVERSE"
+/*
+ * Import the checkpointed state of the pool specified by the target
+ * parameter as readonly. The function also accepts a pool config
+ * as an optional parameter, else it attempts to infer the config by
+ * the name of the target pool.
+ *
+ * Note that the checkpointed state's pool name will be the name of
+ * the original pool with the above suffix appened to it. In addition,
+ * if the target is not a pool name (e.g. a path to a dataset) then
+ * the new_path parameter is populated with the updated path to
+ * reflect the fact that we are looking into the checkpointed state.
+ *
+ * The function returns a newly-allocated copy of the name of the
+ * pool containing the checkpointed state. When this copy is no
+ * longer needed it should be freed with free(3C). Same thing
+ * applies to the new_path parameter if allocated.
+ */
+static char *
+import_checkpointed_state(char *target, nvlist_t *cfg, char **new_path)
+{
+       int error = 0;
+       char *poolname, *bogus_name = NULL;
+
+       /* If the target is not a pool, the extract the pool name */
+       char *path_start = strchr(target, '/');
+       if (path_start != NULL) {
+               size_t poolname_len = path_start - target;
+               poolname = strndup(target, poolname_len);
+       } else {
+               poolname = target;
+       }
+
+       if (cfg == NULL) {
+               error = spa_get_stats(poolname, &cfg, NULL, 0);
+               if (error != 0) {
+                       fatal("Tried to read config of pool \"%s\" but "
+                           "spa_get_stats() failed with error %d\n",
+                           poolname, error);
+               }
+       }
+
+       if (asprintf(&bogus_name, "%s%s", poolname, BOGUS_SUFFIX) == -1)
+               return (NULL);
+       fnvlist_add_string(cfg, ZPOOL_CONFIG_POOL_NAME, bogus_name);
+
+       error = spa_import(bogus_name, cfg, NULL,
+           ZFS_IMPORT_MISSING_LOG | ZFS_IMPORT_CHECKPOINT);
+       if (error != 0) {
+               fatal("Tried to import pool \"%s\" but spa_import() failed "
+                   "with error %d\n", bogus_name, error);
+       }
+
+       if (new_path != NULL && path_start != NULL) {
+               if (asprintf(new_path, "%s%s", bogus_name, path_start) == -1) {
+                       if (path_start != NULL)
+                               free(poolname);
+                       return (NULL);
+               }
+       }
+
+       if (target != poolname)
+               free(poolname);
+
+       return (bogus_name);
+}
+
+typedef struct verify_checkpoint_sm_entry_cb_arg {
+       vdev_t *vcsec_vd;
+
+       /* the following fields are only used for printing progress */
+       uint64_t vcsec_entryid;
+       uint64_t vcsec_num_entries;
+} verify_checkpoint_sm_entry_cb_arg_t;
+
+#define        ENTRIES_PER_PROGRESS_UPDATE 10000
+
+static int
+verify_checkpoint_sm_entry_cb(maptype_t type, uint64_t offset, uint64_t size,
+    void *arg)
+{
+       verify_checkpoint_sm_entry_cb_arg_t *vcsec = arg;
+       vdev_t *vd = vcsec->vcsec_vd;
+       metaslab_t *ms = vd->vdev_ms[offset >> vd->vdev_ms_shift];
+       uint64_t end = offset + size;
+
+       ASSERT(type == SM_FREE);
+
+       if ((vcsec->vcsec_entryid % ENTRIES_PER_PROGRESS_UPDATE) == 0) {
+               (void) fprintf(stderr,
+                   "\rverifying vdev %llu, space map entry %llu of %llu ...",
+                   (longlong_t)vd->vdev_id,
+                   (longlong_t)vcsec->vcsec_entryid,
+                   (longlong_t)vcsec->vcsec_num_entries);
+       }
+       vcsec->vcsec_entryid++;
+
+       /*
+        * See comment in checkpoint_sm_exclude_entry_cb()
+        */
+       VERIFY3U(offset, >=, ms->ms_start);
+       VERIFY3U(end, <=, ms->ms_start + ms->ms_size);
+
+       /*
+        * The entries in the vdev_checkpoint_sm should be marked as
+        * allocated in the checkpointed state of the pool, therefore
+        * their respective ms_allocateable trees should not contain them.
+        */
+       mutex_enter(&ms->ms_lock);
+       range_tree_verify(ms->ms_allocatable, offset, size);
+       mutex_exit(&ms->ms_lock);
+
+       return (0);
+}
+
+/*
+ * Verify that all segments in the vdev_checkpoint_sm are allocated
+ * according to the checkpoint's ms_sm (i.e. are not in the checkpoint's
+ * ms_allocatable).
+ *
+ * Do so by comparing the checkpoint space maps (vdev_checkpoint_sm) of
+ * each vdev in the current state of the pool to the metaslab space maps
+ * (ms_sm) of the checkpointed state of the pool.
+ *
+ * Note that the function changes the state of the ms_allocatable
+ * trees of the current spa_t. The entries of these ms_allocatable
+ * trees are cleared out and then repopulated from with the free
+ * entries of their respective ms_sm space maps.
+ */
+static void
+verify_checkpoint_vdev_spacemaps(spa_t *checkpoint, spa_t *current)
+{
+       vdev_t *ckpoint_rvd = checkpoint->spa_root_vdev;
+       vdev_t *current_rvd = current->spa_root_vdev;
+
+       load_concrete_ms_allocatable_trees(checkpoint, SM_FREE);
+
+       for (uint64_t c = 0; c < ckpoint_rvd->vdev_children; c++) {
+               vdev_t *ckpoint_vd = ckpoint_rvd->vdev_child[c];
+               vdev_t *current_vd = current_rvd->vdev_child[c];
+
+               space_map_t *checkpoint_sm = NULL;
+               uint64_t checkpoint_sm_obj;
+
+               if (ckpoint_vd->vdev_ops == &vdev_indirect_ops) {
+                       /*
+                        * Since we don't allow device removal in a pool
+                        * that has a checkpoint, we expect that all removed
+                        * vdevs were removed from the pool before the
+                        * checkpoint.
+                        */
+                       ASSERT3P(current_vd->vdev_ops, ==, &vdev_indirect_ops);
+                       continue;
+               }
+
+               /*
+                * If the checkpoint space map doesn't exist, then nothing
+                * here is checkpointed so there's nothing to verify.
+                */
+               if (current_vd->vdev_top_zap == 0 ||
+                   zap_contains(spa_meta_objset(current),
+                   current_vd->vdev_top_zap,
+                   VDEV_TOP_ZAP_POOL_CHECKPOINT_SM) != 0)
+                       continue;
+
+               VERIFY0(zap_lookup(spa_meta_objset(current),
+                   current_vd->vdev_top_zap, VDEV_TOP_ZAP_POOL_CHECKPOINT_SM,
+                   sizeof (uint64_t), 1, &checkpoint_sm_obj));
+
+               VERIFY0(space_map_open(&checkpoint_sm, spa_meta_objset(current),
+                   checkpoint_sm_obj, 0, current_vd->vdev_asize,
+                   current_vd->vdev_ashift));
+               space_map_update(checkpoint_sm);
+
+               verify_checkpoint_sm_entry_cb_arg_t vcsec;
+               vcsec.vcsec_vd = ckpoint_vd;
+               vcsec.vcsec_entryid = 0;
+               vcsec.vcsec_num_entries =
+                   space_map_length(checkpoint_sm) / sizeof (uint64_t);
+               VERIFY0(space_map_iterate(checkpoint_sm,
+                   verify_checkpoint_sm_entry_cb, &vcsec));
+               dump_spacemap(current->spa_meta_objset, checkpoint_sm);
+               space_map_close(checkpoint_sm);
+       }
+
+       /*
+        * If we've added vdevs since we took the checkpoint, ensure
+        * that their checkpoint space maps are empty.
+        */
+       if (ckpoint_rvd->vdev_children < current_rvd->vdev_children) {
+               for (uint64_t c = ckpoint_rvd->vdev_children;
+                   c < current_rvd->vdev_children; c++) {
+                       vdev_t *current_vd = current_rvd->vdev_child[c];
+                       ASSERT3P(current_vd->vdev_checkpoint_sm, ==, NULL);
+               }
+       }
+
+       /* for cleaner progress output */
+       (void) fprintf(stderr, "\n");
+}
+
+/*
+ * Verifies that all space that's allocated in the checkpoint is
+ * still allocated in the current version, by checking that everything
+ * in checkpoint's ms_allocatable (which is actually allocated, not
+ * allocatable/free) is not present in current's ms_allocatable.
+ *
+ * Note that the function changes the state of the ms_allocatable
+ * trees of both spas when called. The entries of all ms_allocatable
+ * trees are cleared out and then repopulated from their respective
+ * ms_sm space maps. In the checkpointed state we load the allocated
+ * entries, and in the current state we load the free entries.
+ */
+static void
+verify_checkpoint_ms_spacemaps(spa_t *checkpoint, spa_t *current)
+{
+       vdev_t *ckpoint_rvd = checkpoint->spa_root_vdev;
+       vdev_t *current_rvd = current->spa_root_vdev;
+
+       load_concrete_ms_allocatable_trees(checkpoint, SM_ALLOC);
+       load_concrete_ms_allocatable_trees(current, SM_FREE);
+
+       for (uint64_t i = 0; i < ckpoint_rvd->vdev_children; i++) {
+               vdev_t *ckpoint_vd = ckpoint_rvd->vdev_child[i];
+               vdev_t *current_vd = current_rvd->vdev_child[i];
+
+               if (ckpoint_vd->vdev_ops == &vdev_indirect_ops) {
+                       /*
+                        * See comment in verify_checkpoint_vdev_spacemaps()
+                        */
+                       ASSERT3P(current_vd->vdev_ops, ==, &vdev_indirect_ops);
+                       continue;
+               }
+
+               for (uint64_t m = 0; m < ckpoint_vd->vdev_ms_count; m++) {
+                       metaslab_t *ckpoint_msp = ckpoint_vd->vdev_ms[m];
+                       metaslab_t *current_msp = current_vd->vdev_ms[m];
+
+                       (void) fprintf(stderr,
+                           "\rverifying vdev %llu of %llu, "
+                           "metaslab %llu of %llu ...",
+                           (longlong_t)current_vd->vdev_id,
+                           (longlong_t)current_rvd->vdev_children,
+                           (longlong_t)current_vd->vdev_ms[m]->ms_id,
+                           (longlong_t)current_vd->vdev_ms_count);
+
+                       /*
+                        * We walk through the ms_allocatable trees that
+                        * are loaded with the allocated blocks from the
+                        * ms_sm spacemaps of the checkpoint. For each
+                        * one of these ranges we ensure that none of them
+                        * exists in the ms_allocatable trees of the
+                        * current state which are loaded with the ranges
+                        * that are currently free.
+                        *
+                        * This way we ensure that none of the blocks that
+                        * are part of the checkpoint were freed by mistake.
+                        */
+                       range_tree_walk(ckpoint_msp->ms_allocatable,
+                           (range_tree_func_t *)range_tree_verify,
+                           current_msp->ms_allocatable);
+               }
+       }
+
+       /* for cleaner progress output */
+       (void) fprintf(stderr, "\n");
+}
+
+static void
+verify_checkpoint_blocks(spa_t *spa)
+{
+       spa_t *checkpoint_spa;
+       char *checkpoint_pool;
+       nvlist_t *config = NULL;
+       int error = 0;
+
+       /*
+        * We import the checkpointed state of the pool (under a different
+        * name) so we can do verification on it against the current state
+        * of the pool.
+        */
+       checkpoint_pool = import_checkpointed_state(spa->spa_name, config,
+           NULL);
+       ASSERT(strcmp(spa->spa_name, checkpoint_pool) != 0);
+
+       error = spa_open(checkpoint_pool, &checkpoint_spa, FTAG);
+       if (error != 0) {
+               fatal("Tried to open pool \"%s\" but spa_open() failed with "
+                   "error %d\n", checkpoint_pool, error);
+       }
+
+       /*
+        * Ensure that ranges in the checkpoint space maps of each vdev
+        * are allocated according to the checkpointed state's metaslab
+        * space maps.
+        */
+       verify_checkpoint_vdev_spacemaps(checkpoint_spa, spa);
+
+       /*
+        * Ensure that allocated ranges in the checkpoint's metaslab
+        * space maps remain allocated in the metaslab space maps of
+        * the current state.
+        */
+       verify_checkpoint_ms_spacemaps(checkpoint_spa, spa);
+
+       /*
+        * Once we are done, we get rid of the checkpointed state.
+        */
+       spa_close(checkpoint_spa, FTAG);
+       free(checkpoint_pool);
+}
+
+static void
+dump_leftover_checkpoint_blocks(spa_t *spa)
+{
+       vdev_t *rvd = spa->spa_root_vdev;
+
+       for (uint64_t i = 0; i < rvd->vdev_children; i++) {
+               vdev_t *vd = rvd->vdev_child[i];
+
+               space_map_t *checkpoint_sm = NULL;
+               uint64_t checkpoint_sm_obj;
+
+               if (vd->vdev_top_zap == 0)
+                       continue;
+
+               if (zap_contains(spa_meta_objset(spa), vd->vdev_top_zap,
+                   VDEV_TOP_ZAP_POOL_CHECKPOINT_SM) != 0)
+                       continue;
+
+               VERIFY0(zap_lookup(spa_meta_objset(spa), vd->vdev_top_zap,
+                   VDEV_TOP_ZAP_POOL_CHECKPOINT_SM,
+                   sizeof (uint64_t), 1, &checkpoint_sm_obj));
+
+               VERIFY0(space_map_open(&checkpoint_sm, spa_meta_objset(spa),
+                   checkpoint_sm_obj, 0, vd->vdev_asize, vd->vdev_ashift));
+               space_map_update(checkpoint_sm);
+               dump_spacemap(spa->spa_meta_objset, checkpoint_sm);
+               space_map_close(checkpoint_sm);
+       }
+}
+
+static int
+verify_checkpoint(spa_t *spa)
+{
+       uberblock_t checkpoint;
+       int error;
+
+       if (!spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT))
+               return (0);
+
+       error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+           DMU_POOL_ZPOOL_CHECKPOINT, sizeof (uint64_t),
+           sizeof (uberblock_t) / sizeof (uint64_t), &checkpoint);
+
+       if (error == ENOENT) {
+               /*
+                * If the feature is active but the uberblock is missing
+                * then we must be in the middle of discarding the
+                * checkpoint.
+                */
+               (void) printf("\nPartially discarded checkpoint "
+                   "state found:\n");
+               dump_leftover_checkpoint_blocks(spa);
+               return (0);
+       } else if (error != 0) {
+               (void) printf("lookup error %d when looking for "
+                   "checkpointed uberblock in MOS\n", error);
+               return (error);
+       }
+       dump_uberblock(&checkpoint, "\nCheckpointed uberblock found:\n", "\n");
+
+       if (checkpoint.ub_checkpoint_txg == 0) {
+               (void) printf("\nub_checkpoint_txg not set in checkpointed "
+                   "uberblock\n");
+               error = 3;
+       }
+
+       if (error == 0)
+               verify_checkpoint_blocks(spa);
+
+       return (error);
+}
+
 static void
 dump_zpool(spa_t *spa)
 {
@@ -4435,6 +5015,9 @@ dump_zpool(spa_t *spa)
        if (dump_opt['h'])
                dump_history(spa);
 
+       if (rc == 0 && !dump_opt['L'])
+               rc = verify_checkpoint(spa);
+
        if (rc != 0) {
                dump_debug_buffer();
                exit(rc);
@@ -4879,6 +5462,7 @@ main(int argc, char **argv)
        int rewind = ZPOOL_NEVER_REWIND;
        char *spa_config_path_env;
        boolean_t target_is_spa = B_TRUE;
+       nvlist_t *cfg = NULL;
 
        (void) setrlimit(RLIMIT_NOFILE, &rl);
        (void) enable_extended_FILE_stdio(-1, -1);
@@ -4895,7 +5479,7 @@ main(int argc, char **argv)
                spa_config_path = spa_config_path_env;
 
        while ((c = getopt(argc, argv,
-           "AbcCdDeEFGhiI:lLmMo:Op:PqRsSt:uU:vVx:X")) != -1) {
+           "AbcCdDeEFGhiI:klLmMo:Op:PqRsSt:uU:vVx:X")) != -1) {
                switch (c) {
                case 'b':
                case 'c':
@@ -4920,6 +5504,7 @@ main(int argc, char **argv)
                case 'A':
                case 'e':
                case 'F':
+               case 'k':
                case 'L':
                case 'P':
                case 'q':
@@ -5029,7 +5614,7 @@ main(int argc, char **argv)
                verbose = MAX(verbose, 1);
 
        for (c = 0; c < 256; c++) {
-               if (dump_all && strchr("AeEFlLOPRSX", c) == NULL)
+               if (dump_all && strchr("AeEFklLOPRSX", c) == NULL)
                        dump_opt[c] = 1;
                if (dump_opt[c])
                        dump_opt[c] += verbose;
@@ -5081,6 +5666,17 @@ main(int argc, char **argv)
        error = 0;
        target = argv[0];
 
+       char *checkpoint_pool = NULL;
+       char *checkpoint_target = NULL;
+       if (dump_opt['k']) {
+               checkpoint_pool = import_checkpointed_state(target, cfg,
+                   &checkpoint_target);
+
+               if (checkpoint_target != NULL)
+                       target = checkpoint_target;
+
+       }
+
        if (strpbrk(target, "/@") != NULL) {
                size_t targetlen;
 
@@ -5097,7 +5693,6 @@ main(int argc, char **argv)
 
        if (dump_opt['e']) {
                importargs_t args = { 0 };
-               nvlist_t *cfg = NULL;
 
                args.paths = nsearch;
                args.path = searchdirs;
@@ -5121,6 +5716,7 @@ main(int argc, char **argv)
                                (void) printf("\nConfiguration for import:\n");
                                dump_nvlist(cfg, 8);
                        }
+
                        error = spa_import(target_pool, cfg, NULL,
                            flags | ZFS_IMPORT_SKIP_MMP);
                }
@@ -5130,7 +5726,18 @@ main(int argc, char **argv)
                free(target_pool);
 
        if (error == 0) {
-               if (target_is_spa || dump_opt['R']) {
+               if (dump_opt['k'] && (target_is_spa || dump_opt['R'])) {
+                       ASSERT(checkpoint_pool != NULL);
+                       ASSERT(checkpoint_target == NULL);
+
+                       error = spa_open(checkpoint_pool, &spa, FTAG);
+                       if (error != 0) {
+                               fatal("Tried to open pool \"%s\" but "
+                                   "spa_open() failed with error %d\n",
+                                   checkpoint_pool, error);
+                       }
+
+               } else if (target_is_spa || dump_opt['R']) {
                        /*
                         * Disable the activity check to allow examination of
                         * active pools.
@@ -5216,6 +5823,12 @@ main(int argc, char **argv)
                        zdb_read_block(argv[i], spa);
        }
 
+       if (dump_opt['k']) {
+               free(checkpoint_pool);
+               if (!target_is_spa)
+                       free(checkpoint_target);
+       }
+
        if (os != NULL)
                close_objset(os, FTAG);
        else
index 2db9c9c0c1c0ee0110c3c17becfec16e97117123..c12178effae0db50425645e1029841a003f195b9 100644 (file)
@@ -25,7 +25,7 @@
  */
 
 /*
- * Copyright (c) 2013, 2016 by Delphix. All rights reserved.
+ * Copyright (c) 2013, 2017 by Delphix. All rights reserved.
  */
 
 /*
@@ -42,6 +42,7 @@
 #include <sys/resource.h>
 #include <sys/zil.h>
 #include <sys/zil_impl.h>
+#include <sys/spa_impl.h>
 #include <sys/abd.h>
 
 #include "zdb.h"
@@ -166,7 +167,7 @@ zil_prt_rec_write(zilog_t *zilog, int txtype, void *arg)
        if (lr->lr_common.lrc_reclen == sizeof (lr_write_t)) {
                (void) printf("%shas blkptr, %s\n", tab_prefix,
                    !BP_IS_HOLE(bp) &&
-                   bp->blk_birth >= spa_first_txg(zilog->zl_spa) ?
+                   bp->blk_birth >= spa_min_claim_txg(zilog->zl_spa) ?
                    "will claim" : "won't claim");
                print_log_bp(bp, tab_prefix);
 
@@ -361,7 +362,7 @@ print_log_block(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg)
 
        if (claim_txg != 0)
                claim = "already claimed";
-       else if (bp->blk_birth >= spa_first_txg(zilog->zl_spa))
+       else if (bp->blk_birth >= spa_min_claim_txg(zilog->zl_spa))
                claim = "will claim";
        else
                claim = "won't claim";
@@ -416,6 +417,11 @@ dump_intent_log(zilog_t *zilog)
        for (i = 0; i < TX_MAX_TYPE; i++)
                zil_rec_info[i].zri_count = 0;
 
+       /* see comment in zil_claim() or zil_check_log_chain() */
+       if (zilog->zl_spa->spa_uberblock.ub_checkpoint_txg != 0 &&
+           zh->zh_claim_txg == 0)
+               return;
+
        if (verbose >= 2) {
                (void) printf("\n");
                (void) zil_parse(zilog, print_log_block, print_log_record, NULL,
index ad25a2f6ed97d04dac4f8c53aee0f4004f0676cb..5aca0d32594a398db81faeec115d44f6bd55c7b6 100644 (file)
@@ -36,6 +36,7 @@
 #include <dirent.h>
 #include <errno.h>
 #include <fcntl.h>
+#include <getopt.h>
 #include <libgen.h>
 #include <libintl.h>
 #include <libuutil.h>
@@ -76,6 +77,8 @@ static int zpool_do_add(int, char **);
 static int zpool_do_remove(int, char **);
 static int zpool_do_labelclear(int, char **);
 
+static int zpool_do_checkpoint(int, char **);
+
 static int zpool_do_list(int, char **);
 static int zpool_do_iostat(int, char **);
 static int zpool_do_status(int, char **);
@@ -131,6 +134,7 @@ typedef enum {
        HELP_ATTACH,
        HELP_CLEAR,
        HELP_CREATE,
+       HELP_CHECKPOINT,
        HELP_DESTROY,
        HELP_DETACH,
        HELP_EXPORT,
@@ -254,6 +258,8 @@ static zpool_command_t command_table[] = {
        { NULL },
        { "labelclear", zpool_do_labelclear,    HELP_LABELCLEAR         },
        { NULL },
+       { "checkpoint", zpool_do_checkpoint,    HELP_CHECKPOINT         },
+       { NULL },
        { "list",       zpool_do_list,          HELP_LIST               },
        { "iostat",     zpool_do_iostat,        HELP_IOSTAT             },
        { "status",     zpool_do_status,        HELP_STATUS             },
@@ -306,6 +312,8 @@ get_usage(zpool_help_t idx)
                return (gettext("\tcreate [-fnd] [-o property=value] ... \n"
                    "\t    [-O file-system-property=value] ... \n"
                    "\t    [-m mountpoint] [-R root] <pool> <vdev> ...\n"));
+       case HELP_CHECKPOINT:
+               return (gettext("\tcheckpoint [--discard] <pool> ...\n"));
        case HELP_DESTROY:
                return (gettext("\tdestroy [-f] <pool>\n"));
        case HELP_DETACH:
@@ -316,15 +324,13 @@ get_usage(zpool_help_t idx)
                return (gettext("\thistory [-il] [<pool>] ...\n"));
        case HELP_IMPORT:
                return (gettext("\timport [-d dir] [-D]\n"
-                   "\timport [-d dir | -c cachefile] [-F [-n]] [-l] "
-                   "<pool | id>\n"
                    "\timport [-o mntopts] [-o property=value] ... \n"
                    "\t    [-d dir | -c cachefile] [-D] [-l] [-f] [-m] [-N] "
                    "[-R root] [-F [-n]] -a\n"
                    "\timport [-o mntopts] [-o property=value] ... \n"
                    "\t    [-d dir | -c cachefile] [-D] [-l] [-f] [-m] [-N] "
                    "[-R root] [-F [-n]]\n"
-                   "\t    <pool | id> [newpool]\n"));
+                   "\t    [--rewind-to-checkpoint] <pool | id> [newpool]\n"));
        case HELP_IOSTAT:
                return (gettext("\tiostat [[[-c [script1,script2,...]"
                    "[-lq]]|[-rw]] [-T d | u] [-ghHLpPvy]\n"
@@ -2453,6 +2459,79 @@ do_import(nvlist_t *config, const char *newname, const char *mntopts,
        return (ret);
 }
 
+/*
+ * zpool checkpoint <pool>
+ *       checkpoint --discard <pool>
+ *
+ *       -d         Discard the checkpoint from a checkpointed
+ *       --discard  pool.
+ *
+ * Checkpoints the specified pool, by taking a "snapshot" of its
+ * current state. A pool can only have one checkpoint at a time.
+ */
+int
+zpool_do_checkpoint(int argc, char **argv)
+{
+       boolean_t discard;
+       char *pool;
+       zpool_handle_t *zhp;
+       int c, err;
+
+       struct option long_options[] = {
+               {"discard", no_argument, NULL, 'd'},
+               {0, 0, 0, 0}
+       };
+
+       discard = B_FALSE;
+       while ((c = getopt_long(argc, argv, ":d", long_options, NULL)) != -1) {
+               switch (c) {
+               case 'd':
+                       discard = B_TRUE;
+                       break;
+               case '?':
+                       (void) fprintf(stderr, gettext("invalid option '%c'\n"),
+                           optopt);
+                       usage(B_FALSE);
+               }
+       }
+
+       argc -= optind;
+       argv += optind;
+
+       if (argc < 1) {
+               (void) fprintf(stderr, gettext("missing pool argument\n"));
+               usage(B_FALSE);
+       }
+
+       if (argc > 1) {
+               (void) fprintf(stderr, gettext("too many arguments\n"));
+               usage(B_FALSE);
+       }
+
+       pool = argv[0];
+
+       if ((zhp = zpool_open(g_zfs, pool)) == NULL) {
+               /* As a special case, check for use of '/' in the name */
+               if (strchr(pool, '/') != NULL)
+                       (void) fprintf(stderr, gettext("'zpool checkpoint' "
+                           "doesn't work on datasets. To save the state "
+                           "of a dataset from a specific point in time "
+                           "please use 'zfs snapshot'\n"));
+               return (1);
+       }
+
+       if (discard)
+               err = (zpool_discard_checkpoint(zhp) != 0);
+       else
+               err = (zpool_checkpoint(zhp) != 0);
+
+       zpool_close(zhp);
+
+       return (err);
+}
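
For reference, the command forms wired up above (the pool name "tank" is purely
illustrative), matching the usage strings added to get_usage():

    zpool checkpoint tank                        take a checkpoint of the pool
    zpool checkpoint --discard tank              or -d: discard the checkpoint
    zpool import --rewind-to-checkpoint tank     on import, revert to the checkpoint
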
+
+#define        CHECKPOINT_OPT  1024
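
CHECKPOINT_OPT is deliberately chosen above the range of any single-character
option value so that getopt_long() can report the long-only
--rewind-to-checkpoint flag without colliding with the short-option string; it
is used exactly that way in the zpool_do_import() option table below. A
minimal, self-contained sketch of the pattern (the program and names here are
illustrative, not part of this change):

    #include <getopt.h>
    #include <stdio.h>

    #define REWIND_OPT 1024                 /* no single-character alias */

    int
    main(int argc, char **argv)
    {
            struct option longopts[] = {
                    {"rewind-to-checkpoint", no_argument, NULL, REWIND_OPT},
                    {0, 0, 0, 0}
            };
            int c;

            while ((c = getopt_long(argc, argv, ":d", longopts, NULL)) != -1) {
                    if (c == REWIND_OPT)            /* long-only flag seen */
                            (void) printf("rewind requested\n");
                    else if (c == 'd')              /* short option still works */
                            (void) printf("-d given\n");
            }
            return (0);
    }
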
+
 /*
  * zpool import [-d dir] [-D]
  *       import [-o mntopts] [-o prop=value] ... [-R root] [-D] [-l]
@@ -2499,6 +2578,9 @@ do_import(nvlist_t *config, const char *newname, const char *mntopts,
  *      -s     Scan using the default search path, the libblkid cache will
  *             not be consulted.
  *
+ *       --rewind-to-checkpoint
+ *             Import the pool and revert back to the checkpoint.
+ *
  * The import command scans for pools to import, and import pools based on pool
  * name and GUID.  The pool can also be renamed as part of the import process.
  */
@@ -2534,8 +2616,14 @@ zpool_do_import(int argc, char **argv)
        importargs_t idata = { 0 };
        char *endptr;
 
+       struct option long_options[] = {
+               {"rewind-to-checkpoint", no_argument, NULL, CHECKPOINT_OPT},
+               {0, 0, 0, 0}
+       };
+
        /* check options */
-       while ((c = getopt(argc, argv, ":aCc:d:DEfFlmnNo:R:stT:VX")) != -1) {
+       while ((c = getopt_long(argc, argv, ":aCc:d:DEfFlmnNo:R:stT:VX",
+           long_options, NULL)) != -1) {
                switch (c) {
                case 'a':
                        do_all = B_TRUE;
@@ -2622,6 +2710,9 @@ zpool_do_import(int argc, char **argv)
                case 'X':
                        xtreme_rewind = B_TRUE;
                        break;
+               case CHECKPOINT_OPT:
+                       flags |= ZFS_IMPORT_CHECKPOINT;
+                       break;
                case ':':
                        (void) fprintf(stderr, gettext("missing argument for "
                            "'%c' option\n"), optopt);
@@ -4977,6 +5068,7 @@ print_one_column(zpool_prop_t prop, uint64_t value, boolean_t scripted,
 
        switch (prop) {
        case ZPOOL_PROP_EXPANDSZ:
+       case ZPOOL_PROP_CHECKPOINT:
                if (value == 0)
                        (void) strlcpy(propval, "-", sizeof (propval));
                else
@@ -5064,6 +5156,8 @@ print_list_stats(zpool_handle_t *zhp, const char *name, nvlist_t *nv,
                    toplevel, format);
                print_one_column(ZPOOL_PROP_FREE, vs->vs_space - vs->vs_alloc,
                    scripted, toplevel, format);
+               print_one_column(ZPOOL_PROP_CHECKPOINT,
+                   vs->vs_checkpoint_space, scripted, toplevel, format);
                print_one_column(ZPOOL_PROP_EXPANDSZ, vs->vs_esize, scripted,
                    B_TRUE, format);
                print_one_column(ZPOOL_PROP_FRAGMENTATION,
@@ -5187,8 +5281,8 @@ zpool_do_list(int argc, char **argv)
        int ret = 0;
        list_cbdata_t cb = { 0 };
        static char default_props[] =
-           "name,size,allocated,free,expandsize,fragmentation,capacity,"
-           "dedupratio,health,altroot";
+           "name,size,allocated,free,checkpoint,expandsize,fragmentation,"
+           "capacity,dedupratio,health,altroot";
        char *props = default_props;
        float interval = 0;
        unsigned long count = 0;
@@ -5990,6 +6084,32 @@ typedef struct scrub_cbdata {
        pool_scrub_cmd_t cb_scrub_cmd;
 } scrub_cbdata_t;
 
+static boolean_t
+zpool_has_checkpoint(zpool_handle_t *zhp)
+{
+       nvlist_t *config, *nvroot;
+
+       config = zpool_get_config(zhp, NULL);
+
+       if (config != NULL) {
+               pool_checkpoint_stat_t *pcs = NULL;
+               uint_t c;
+
+               nvroot = fnvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE);
+               (void) nvlist_lookup_uint64_array(nvroot,
+                   ZPOOL_CONFIG_CHECKPOINT_STATS, (uint64_t **)&pcs, &c);
+
+               if (pcs == NULL || pcs->pcs_state == CS_NONE)
+                       return (B_FALSE);
+
+               assert(pcs->pcs_state == CS_CHECKPOINT_EXISTS ||
+                   pcs->pcs_state == CS_CHECKPOINT_DISCARDING);
+               return (B_TRUE);
+       }
+
+       return (B_FALSE);
+}
+
 int
 scrub_callback(zpool_handle_t *zhp, void *data)
 {
@@ -6007,6 +6127,13 @@ scrub_callback(zpool_handle_t *zhp, void *data)
 
        err = zpool_scan(zhp, cb->cb_type, cb->cb_scrub_cmd);
 
+       if (err == 0 && zpool_has_checkpoint(zhp) &&
+           cb->cb_type == POOL_SCAN_SCRUB) {
+               (void) printf(gettext("warning: will not scrub state that "
+                   "belongs to the checkpoint of pool '%s'\n"),
+                   zpool_get_name(zhp));
+       }
+
        return (err != 0);
 }
 
@@ -6211,6 +6338,40 @@ print_scan_status(pool_scan_stat_t *ps)
        }
 }
 
+/*
+ * As we don't scrub checkpointed blocks, we want to warn the
+ * user that we skipped scanning some blocks if a checkpoint exists
+ * or existed at any time during the scan.
+ */
+static void
+print_checkpoint_scan_warning(pool_scan_stat_t *ps, pool_checkpoint_stat_t *pcs)
+{
+       if (ps == NULL || pcs == NULL)
+               return;
+
+       if (pcs->pcs_state == CS_NONE ||
+           pcs->pcs_state == CS_CHECKPOINT_DISCARDING)
+               return;
+
+       assert(pcs->pcs_state == CS_CHECKPOINT_EXISTS);
+
+       if (ps->pss_state == DSS_NONE)
+               return;
+
+       if ((ps->pss_state == DSS_FINISHED || ps->pss_state == DSS_CANCELED) &&
+           ps->pss_end_time < pcs->pcs_start_time)
+               return;
+
+       if (ps->pss_state == DSS_FINISHED || ps->pss_state == DSS_CANCELED) {
+               (void) printf(gettext("    scan warning: skipped blocks "
+                   "that are only referenced by the checkpoint.\n"));
+       } else {
+               assert(ps->pss_state == DSS_SCANNING);
+               (void) printf(gettext("    scan warning: skipping blocks "
+                   "that are only referenced by the checkpoint.\n"));
+       }
+}
+
 /*
  * Print out detailed removal status.
  */
@@ -6316,6 +6477,39 @@ print_removal_status(zpool_handle_t *zhp, pool_removal_stat_t *prs)
        }
 }
 
+static void
+print_checkpoint_status(pool_checkpoint_stat_t *pcs)
+{
+       time_t start;
+       char space_buf[7];
+
+       if (pcs == NULL || pcs->pcs_state == CS_NONE)
+               return;
+
+       (void) printf(gettext("checkpoint: "));
+
+       start = pcs->pcs_start_time;
+       zfs_nicenum(pcs->pcs_space, space_buf, sizeof (space_buf));
+
+       if (pcs->pcs_state == CS_CHECKPOINT_EXISTS) {
+               char *date = ctime(&start);
+
+               /*
+                * ctime() adds a newline at the end of the generated
+                * string, thus the weird format specifier and the
+                * strlen() call used to chop it off from the output.
+                */
+               (void) printf(gettext("created %.*s, consumes %s\n"),
+                   (int)(strlen(date) - 1), date, space_buf);
+               return;
+       }
+
+       assert(pcs->pcs_state == CS_CHECKPOINT_DISCARDING);
+
+       (void) printf(gettext("discarding, %s remaining.\n"),
+           space_buf);
+}
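
The ctime() trimming idiom used in print_checkpoint_status() can be seen in
isolation below; this is only an illustrative sketch, with a made-up size
string standing in for zfs_nicenum() output:

    #include <stdio.h>
    #include <string.h>
    #include <time.h>

    int
    main(void)
    {
            time_t now = time(NULL);
            char *date = ctime(&now);       /* always ends in '\n' */

            /* %.*s with strlen() - 1 drops the trailing newline */
            (void) printf("checkpoint: created %.*s, consumes 1.2M\n",
                (int)(strlen(date) - 1), date);
            return (0);
    }
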
+
 static void
 print_error_log(zpool_handle_t *zhp)
 {
@@ -6741,16 +6935,21 @@ status_callback(zpool_handle_t *zhp, void *data)
                uint64_t nerr;
                nvlist_t **spares, **l2cache;
                uint_t nspares, nl2cache;
+               pool_checkpoint_stat_t *pcs = NULL;
                pool_scan_stat_t *ps = NULL;
                pool_removal_stat_t *prs = NULL;
 
+               (void) nvlist_lookup_uint64_array(nvroot,
+                   ZPOOL_CONFIG_CHECKPOINT_STATS, (uint64_t **)&pcs, &c);
                (void) nvlist_lookup_uint64_array(nvroot,
                    ZPOOL_CONFIG_SCAN_STATS, (uint64_t **)&ps, &c);
-               print_scan_status(ps);
-
                (void) nvlist_lookup_uint64_array(nvroot,
                    ZPOOL_CONFIG_REMOVAL_STATS, (uint64_t **)&prs, &c);
+
+               print_scan_status(ps);
+               print_checkpoint_scan_warning(ps, pcs);
                print_removal_status(zhp, prs);
+               print_checkpoint_status(pcs);
 
                cbp->cb_namewidth = max_width(zhp, nvroot, 0, 0,
                    cbp->cb_name_flags | VDEV_NAME_TYPE_ID);
index 83a8b4f3ccef30c0e8d31be5478190d066f14271..78ad7e8de0a06a7833cddd8a7b52435dc9ab4dd3 100644 (file)
@@ -345,6 +345,7 @@ ztest_func_t ztest_reguid;
 ztest_func_t ztest_spa_upgrade;
 ztest_func_t ztest_device_removal;
 ztest_func_t ztest_remap_blocks;
+ztest_func_t ztest_spa_checkpoint_create_discard;
 ztest_func_t ztest_fletcher;
 ztest_func_t ztest_fletcher_incr;
 ztest_func_t ztest_verify_dnode_bt;
@@ -397,6 +398,7 @@ ztest_info_t ztest_info[] = {
        ZTI_INIT(ztest_vdev_aux_add_remove, 1, &ztest_opts.zo_vdevtime),
        ZTI_INIT(ztest_device_removal, 1, &zopt_sometimes),
        ZTI_INIT(ztest_remap_blocks, 1, &zopt_sometimes),
+       ZTI_INIT(ztest_spa_checkpoint_create_discard, 1, &zopt_rarely),
        ZTI_INIT(ztest_fletcher, 1, &zopt_rarely),
        ZTI_INIT(ztest_fletcher_incr, 1, &zopt_rarely),
        ZTI_INIT(ztest_verify_dnode_bt, 1, &zopt_sometimes),
@@ -446,6 +448,7 @@ static ztest_ds_t *ztest_ds;
 
 static kmutex_t ztest_vdev_lock;
 static boolean_t ztest_device_removal_active = B_FALSE;
+static kmutex_t ztest_checkpoint_lock;
 
 /*
  * The ztest_name_lock protects the pool and dataset namespace used by
@@ -2864,6 +2867,62 @@ ztest_spa_upgrade(ztest_ds_t *zd, uint64_t id)
        mutex_exit(&ztest_vdev_lock);
 }
 
+static void
+ztest_spa_checkpoint(spa_t *spa)
+{
+       ASSERT(MUTEX_HELD(&ztest_checkpoint_lock));
+
+       int error = spa_checkpoint(spa->spa_name);
+
+       switch (error) {
+       case 0:
+       case ZFS_ERR_DEVRM_IN_PROGRESS:
+       case ZFS_ERR_DISCARDING_CHECKPOINT:
+       case ZFS_ERR_CHECKPOINT_EXISTS:
+               break;
+       case ENOSPC:
+               ztest_record_enospc(FTAG);
+               break;
+       default:
+               fatal(0, "spa_checkpoint(%s) = %d", spa->spa_name, error);
+       }
+}
+
+static void
+ztest_spa_discard_checkpoint(spa_t *spa)
+{
+       ASSERT(MUTEX_HELD(&ztest_checkpoint_lock));
+
+       int error = spa_checkpoint_discard(spa->spa_name);
+
+       switch (error) {
+       case 0:
+       case ZFS_ERR_DISCARDING_CHECKPOINT:
+       case ZFS_ERR_NO_CHECKPOINT:
+               break;
+       default:
+               fatal(0, "spa_discard_checkpoint(%s) = %d",
+                   spa->spa_name, error);
+       }
+
+}
+
+/* ARGSUSED */
+void
+ztest_spa_checkpoint_create_discard(ztest_ds_t *zd, uint64_t id)
+{
+       spa_t *spa = ztest_spa;
+
+       mutex_enter(&ztest_checkpoint_lock);
+       if (ztest_random(2) == 0) {
+               ztest_spa_checkpoint(spa);
+       } else {
+               ztest_spa_discard_checkpoint(spa);
+       }
+       mutex_exit(&ztest_checkpoint_lock);
+}
+
+
 static vdev_t *
 vdev_lookup_by_path(vdev_t *vd, const char *path)
 {
@@ -2953,6 +3012,8 @@ ztest_vdev_add_remove(ztest_ds_t *zd, uint64_t id)
                case EEXIST:    /* Generic zil_reset() error */
                case EBUSY:     /* Replay required */
                case EACCES:    /* Crypto key not loaded */
+               case ZFS_ERR_CHECKPOINT_EXISTS:
+               case ZFS_ERR_DISCARDING_CHECKPOINT:
                        break;
                default:
                        fatal(0, "spa_vdev_remove() = %d", error);
@@ -2971,10 +3032,15 @@ ztest_vdev_add_remove(ztest_ds_t *zd, uint64_t id)
                error = spa_vdev_add(spa, nvroot);
                nvlist_free(nvroot);
 
-               if (error == ENOSPC)
+               switch (error) {
+               case 0:
+                       break;
+               case ENOSPC:
                        ztest_record_enospc("spa_vdev_add");
-               else if (error != 0)
+                       break;
+               default:
                        fatal(0, "spa_vdev_add() = %d", error);
+               }
        }
 
        mutex_exit(&ztest_vdev_lock);
@@ -3048,8 +3114,13 @@ ztest_vdev_aux_add_remove(ztest_ds_t *zd, uint64_t id)
                nvlist_t *nvroot = make_vdev_root(NULL, aux, NULL,
                    (ztest_opts.zo_vdev_size * 5) / 4, 0, 0, 0, 0, 1);
                error = spa_vdev_add(spa, nvroot);
-               if (error != 0)
+
+               switch (error) {
+               case 0:
+                       break;
+               default:
                        fatal(0, "spa_vdev_add(%p) = %d", nvroot, error);
+               }
                nvlist_free(nvroot);
        } else {
                /*
@@ -3061,8 +3132,16 @@ ztest_vdev_aux_add_remove(ztest_ds_t *zd, uint64_t id)
                        (void) vdev_online(spa, guid, 0, NULL);
 
                error = spa_vdev_remove(spa, guid, B_FALSE);
-               if (error != 0 && error != EBUSY)
+
+               switch (error) {
+               case 0:
+               case EBUSY:
+               case ZFS_ERR_CHECKPOINT_EXISTS:
+               case ZFS_ERR_DISCARDING_CHECKPOINT:
+                       break;
+               default:
                        fatal(0, "spa_vdev_remove(%llu) = %d", guid, error);
+               }
        }
 
        mutex_exit(&ztest_vdev_lock);
@@ -3166,7 +3245,6 @@ ztest_split_pool(ztest_ds_t *zd, uint64_t id)
                --zs->zs_mirrors;
        }
        mutex_exit(&ztest_vdev_lock);
-
 }
 
 /*
@@ -3271,7 +3349,8 @@ ztest_vdev_attach_detach(ztest_ds_t *zd, uint64_t id)
                spa_config_exit(spa, SCL_ALL, FTAG);
                error = spa_vdev_detach(spa, oldguid, pguid, B_FALSE);
                if (error != 0 && error != ENODEV && error != EBUSY &&
-                   error != ENOTSUP)
+                   error != ENOTSUP && error != ZFS_ERR_CHECKPOINT_EXISTS &&
+                   error != ZFS_ERR_DISCARDING_CHECKPOINT)
                        fatal(0, "detach (%s) returned %d", oldpath, error);
                goto out;
        }
@@ -3362,6 +3441,10 @@ ztest_vdev_attach_detach(ztest_ds_t *zd, uint64_t id)
        if (error == EOVERFLOW || error == EBUSY)
                expected_error = error;
 
+       if (error == ZFS_ERR_CHECKPOINT_EXISTS ||
+           error == ZFS_ERR_DISCARDING_CHECKPOINT)
+               expected_error = error;
+
        /* XXX workaround 6690467 */
        if (error != expected_error && expected_error != EBUSY) {
                fatal(0, "attach (%s %llu, %s %llu, %d) "
@@ -3556,6 +3639,7 @@ ztest_vdev_LUN_growth(ztest_ds_t *zd, uint64_t id)
        uint64_t top;
        uint64_t old_class_space, new_class_space, old_ms_count, new_ms_count;
 
+       mutex_enter(&ztest_checkpoint_lock);
        mutex_enter(&ztest_vdev_lock);
        spa_config_enter(spa, SCL_STATE, spa, RW_READER);
 
@@ -3566,8 +3650,9 @@ ztest_vdev_LUN_growth(ztest_ds_t *zd, uint64_t id)
         * when the device removal completes).
         */
        if (ztest_device_removal_active) {
-               spa_config_exit(spa, SCL_STATE, FTAG);
+               spa_config_exit(spa, SCL_STATE, spa);
                mutex_exit(&ztest_vdev_lock);
+               mutex_exit(&ztest_checkpoint_lock);
                return;
        }
 
@@ -3597,6 +3682,7 @@ ztest_vdev_LUN_growth(ztest_ds_t *zd, uint64_t id)
            psize == 0 || psize >= 4 * ztest_opts.zo_vdev_size) {
                spa_config_exit(spa, SCL_STATE, spa);
                mutex_exit(&ztest_vdev_lock);
+               mutex_exit(&ztest_checkpoint_lock);
                return;
        }
        ASSERT(psize > 0);
@@ -3622,6 +3708,7 @@ ztest_vdev_LUN_growth(ztest_ds_t *zd, uint64_t id)
                }
                spa_config_exit(spa, SCL_STATE, spa);
                mutex_exit(&ztest_vdev_lock);
+               mutex_exit(&ztest_checkpoint_lock);
                return;
        }
 
@@ -3656,6 +3743,7 @@ ztest_vdev_LUN_growth(ztest_ds_t *zd, uint64_t id)
                }
                spa_config_exit(spa, SCL_STATE, spa);
                mutex_exit(&ztest_vdev_lock);
+               mutex_exit(&ztest_checkpoint_lock);
                return;
        }
 
@@ -3686,6 +3774,7 @@ ztest_vdev_LUN_growth(ztest_ds_t *zd, uint64_t id)
 
        spa_config_exit(spa, SCL_STATE, spa);
        mutex_exit(&ztest_vdev_lock);
+       mutex_exit(&ztest_checkpoint_lock);
 }
 
 /*
@@ -5722,7 +5811,7 @@ ztest_fault_inject(ztest_ds_t *zd, uint64_t id)
         */
        fd = open(pathrand, O_RDWR);
 
-       if (fd == -1)   /* we hit a gap in the device namespace */
+       if (fd == -1) /* we hit a gap in the device namespace */
                goto out;
 
        fsize = lseek(fd, 0, SEEK_END);
@@ -6645,6 +6734,7 @@ ztest_run(ztest_shared_t *zs)
         * Initialize parent/child shared state.
         */
        mutex_init(&ztest_vdev_lock, NULL, MUTEX_DEFAULT, NULL);
+       mutex_init(&ztest_checkpoint_lock, NULL, MUTEX_DEFAULT, NULL);
        VERIFY0(pthread_rwlock_init(&ztest_name_lock, NULL));
 
        zs->zs_thread_start = gethrtime();
@@ -6696,7 +6786,7 @@ ztest_run(ztest_shared_t *zs)
        spa->spa_deadman_failmode = ZIO_FAILURE_MODE_PANIC;
 
        /*
-        * Verify that we can safely inquire about about any object,
+        * Verify that we can safely inquire about any object,
         * whether it's allocated or not.  To make it interesting,
         * we probe a 5-wide window around each power of two.
         * This hits all edge cases, including zero and the max.
@@ -6804,6 +6894,7 @@ ztest_run(ztest_shared_t *zs)
        mutex_destroy(&zcl.zcl_callbacks_lock);
        (void) pthread_rwlock_destroy(&ztest_name_lock);
        mutex_destroy(&ztest_vdev_lock);
+       mutex_destroy(&ztest_checkpoint_lock);
 }
 
 static void
@@ -6953,6 +7044,7 @@ ztest_import(ztest_shared_t *zs)
        int error;
 
        mutex_init(&ztest_vdev_lock, NULL, MUTEX_DEFAULT, NULL);
+       mutex_init(&ztest_checkpoint_lock, NULL, MUTEX_DEFAULT, NULL);
        VERIFY0(pthread_rwlock_init(&ztest_name_lock, NULL));
 
        kernel_init(FREAD | FWRITE);
@@ -6984,6 +7076,7 @@ ztest_import(ztest_shared_t *zs)
 
        (void) pthread_rwlock_destroy(&ztest_name_lock);
        mutex_destroy(&ztest_vdev_lock);
+       mutex_destroy(&ztest_checkpoint_lock);
 }
 
 /*
@@ -6998,6 +7091,7 @@ ztest_init(ztest_shared_t *zs)
        int i;
 
        mutex_init(&ztest_vdev_lock, NULL, MUTEX_DEFAULT, NULL);
+       mutex_init(&ztest_checkpoint_lock, NULL, MUTEX_DEFAULT, NULL);
        VERIFY0(pthread_rwlock_init(&ztest_name_lock, NULL));
 
        kernel_init(FREAD | FWRITE);
@@ -7049,6 +7143,7 @@ ztest_init(ztest_shared_t *zs)
 
        (void) pthread_rwlock_destroy(&ztest_name_lock);
        mutex_destroy(&ztest_vdev_lock);
+       mutex_destroy(&ztest_checkpoint_lock);
 }
 
 static void
index 91bfbf4ca56466b3b1a2088be6ff4607c47790cd..9a67472bfc435db4d2746d089f859e5b46dd730b 100644 (file)
@@ -177,6 +177,7 @@ AC_CONFIG_FILES([
        tests/zfs-tests/cmd/mmapwrite/Makefile
        tests/zfs-tests/cmd/nvlist_to_lua/Makefile
        tests/zfs-tests/cmd/randfree_file/Makefile
+       tests/zfs-tests/cmd/randwritecomp/Makefile
        tests/zfs-tests/cmd/readmmap/Makefile
        tests/zfs-tests/cmd/rename_dir/Makefile
        tests/zfs-tests/cmd/rm_lnkcnt_zero_file/Makefile
@@ -291,6 +292,7 @@ AC_CONFIG_FILES([
        tests/zfs-tests/tests/functional/nopwrite/Makefile
        tests/zfs-tests/tests/functional/online_offline/Makefile
        tests/zfs-tests/tests/functional/pool_names/Makefile
+       tests/zfs-tests/tests/functional/pool_checkpoint/Makefile
        tests/zfs-tests/tests/functional/poolversion/Makefile
        tests/zfs-tests/tests/functional/privilege/Makefile
        tests/zfs-tests/tests/functional/projectquota/Makefile
index b9896315885fd3342ccc95f408d8248ce06a3f4b..c0c0f3c3cf67c9c31d34ad80126306fa58dc89d1 100644 (file)
@@ -152,6 +152,11 @@ typedef enum zfs_error {
        EZFS_ACTIVE_POOL,       /* pool is imported on a different system */
        EZFS_CRYPTOFAILED,      /* failed to setup encryption */
        EZFS_NO_PENDING,        /* cannot cancel, no operation is pending */
+       EZFS_CHECKPOINT_EXISTS, /* checkpoint exists */
+       EZFS_DISCARDING_CHECKPOINT,     /* currently discarding a checkpoint */
+       EZFS_NO_CHECKPOINT,     /* pool has no checkpoint */
+       EZFS_DEVRM_IN_PROGRESS, /* a device is currently being removed */
+       EZFS_VDEV_TOO_BIG,      /* a device is too big to be used */
        EZFS_UNKNOWN
 } zfs_error_t;
 
@@ -457,6 +462,8 @@ extern int zfs_ioctl(libzfs_handle_t *, int, struct zfs_cmd *);
 extern int zpool_get_physpath(zpool_handle_t *, char *, size_t);
 extern void zpool_explain_recover(libzfs_handle_t *, const char *, int,
     nvlist_t *);
+extern int zpool_checkpoint(zpool_handle_t *);
+extern int zpool_discard_checkpoint(zpool_handle_t *);
 
 /*
  * Basic handle manipulations.  These functions do not create or destroy the
index 5af0e1e7571be3c0b4a5dd06fe57df0960781685..4ca9b254c26b94d1e1d3aed4dc48698c98b781b1 100644 (file)
@@ -110,6 +110,9 @@ int lzc_channel_program_nosync(const char *, const char *, uint64_t,
 int lzc_sync(const char *, nvlist_t *, nvlist_t **);
 int lzc_reopen(const char *, boolean_t);
 
+int lzc_pool_checkpoint(const char *);
+int lzc_pool_checkpoint_discard(const char *);
+
 #ifdef __cplusplus
 }
 #endif
index 50c21831d2fd0cd9293eb9d593562a15d013ae6d..d64133ceb98470aef540cb0055539b352dc5b23b 100644 (file)
@@ -13,6 +13,7 @@ COMMON_H = \
        $(top_srcdir)/include/sys/bptree.h \
        $(top_srcdir)/include/sys/bqueue.h \
        $(top_srcdir)/include/sys/cityhash.h \
+       $(top_srcdir)/include/sys/spa_checkpoint.h \
        $(top_srcdir)/include/sys/dbuf.h \
        $(top_srcdir)/include/sys/ddt.h \
        $(top_srcdir)/include/sys/dmu.h \
index 45259a7fc5472249307354253b544d2330915dbe..d95c09bb931e09f2bada1691f7269964e0af8492 100644 (file)
@@ -366,6 +366,7 @@ typedef struct dmu_buf {
 #define        DMU_POOL_REMOVING               "com.delphix:removing"
 #define        DMU_POOL_OBSOLETE_BPOBJ         "com.delphix:obsolete_bpobj"
 #define        DMU_POOL_CONDENSING_INDIRECT    "com.delphix:condensing_indirect"
+#define        DMU_POOL_ZPOOL_CHECKPOINT       "com.delphix:zpool_checkpoint"
 
 /*
  * Allocate an object from this objset.  The range of object numbers
index 8a346e902dc84aa342c10d3636ba0b95ee0b582c..86bc2dd8759eaa0516c54b3118753a5eead118aa 100644 (file)
@@ -20,7 +20,7 @@
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012, 2016 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2017 by Delphix. All rights reserved.
  * Copyright (c) 2014, Joyent, Inc. All rights reserved.
  * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
  */
@@ -138,6 +138,7 @@ uint64_t dsl_dir_create_sync(dsl_pool_t *dp, dsl_dir_t *pds,
     const char *name, dmu_tx_t *tx);
 
 uint64_t dsl_dir_get_used(dsl_dir_t *dd);
+uint64_t dsl_dir_get_compressed(dsl_dir_t *dd);
 uint64_t dsl_dir_get_quota(dsl_dir_t *dd);
 uint64_t dsl_dir_get_reservation(dsl_dir_t *dd);
 uint64_t dsl_dir_get_compressratio(dsl_dir_t *dd);
index c60e4bf9d039963f7902118326f63a4da19caaef..01870e8670466190692cf4810dead71350d04a06 100644 (file)
@@ -38,6 +38,7 @@
 #include <sys/bpobj.h>
 #include <sys/bptree.h>
 #include <sys/rrwlock.h>
+#include <sys/dsl_synctask.h>
 #include <sys/mmp.h>
 
 #ifdef __cplusplus
@@ -128,6 +129,7 @@ typedef struct dsl_pool {
        txg_list_t dp_dirty_zilogs;
        txg_list_t dp_dirty_dirs;
        txg_list_t dp_sync_tasks;
+       txg_list_t dp_early_sync_tasks;
        taskq_t *dp_sync_taskq;
        taskq_t *dp_zil_clean_taskq;
 
@@ -151,7 +153,9 @@ dsl_pool_t *dsl_pool_create(spa_t *spa, nvlist_t *zplprops,
 void dsl_pool_sync(dsl_pool_t *dp, uint64_t txg);
 void dsl_pool_sync_done(dsl_pool_t *dp, uint64_t txg);
 int dsl_pool_sync_context(dsl_pool_t *dp);
-uint64_t dsl_pool_adjustedsize(dsl_pool_t *dp, boolean_t netfree);
+uint64_t dsl_pool_adjustedsize(dsl_pool_t *dp, zfs_space_check_t slop_policy);
+uint64_t dsl_pool_unreserved_space(dsl_pool_t *dp,
+    zfs_space_check_t slop_policy);
 void dsl_pool_dirty_space(dsl_pool_t *dp, int64_t space, dmu_tx_t *tx);
 void dsl_pool_undirty_space(dsl_pool_t *dp, int64_t space, uint64_t txg);
 void dsl_free(dsl_pool_t *dp, uint64_t txg, const blkptr_t *bpp);
@@ -162,6 +166,8 @@ void dsl_pool_upgrade_clones(dsl_pool_t *dp, dmu_tx_t *tx);
 void dsl_pool_upgrade_dir_clones(dsl_pool_t *dp, dmu_tx_t *tx);
 void dsl_pool_mos_diduse_space(dsl_pool_t *dp,
     int64_t used, int64_t comp, int64_t uncomp);
+void dsl_pool_ckpoint_diduse_space(dsl_pool_t *dp,
+    int64_t used, int64_t comp, int64_t uncomp);
 boolean_t dsl_pool_need_dirty_delay(dsl_pool_t *dp);
 void dsl_pool_config_enter(dsl_pool_t *dp, void *tag);
 void dsl_pool_config_enter_prio(dsl_pool_t *dp, void *tag);
index 6139303c1564847cbff4526325b9eb19063775e1..da6c7a40daca6da31832aaa110f80c7a52c2e123 100644 (file)
@@ -20,7 +20,7 @@
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2017 by Delphix. All rights reserved.
  */
 
 #ifndef        _SYS_DSL_SYNCTASK_H
@@ -57,14 +57,41 @@ typedef enum zfs_space_check {
        ZFS_SPACE_CHECK_RESERVED,
 
        /*
-        * No space check is performed.  Only operations which we expect to
-        * result in a net reduction in space should use this
-        * (e.g. "zfs destroy". Setting quotas & reservations also uses
-        * this because it needs to circumvent the quota/reservation checks).
+        * Space check allows use of three quarters of the slop space.
+        * If there is less than 0.8% free space, the operation will
+        * fail.
+        */
+       ZFS_SPACE_CHECK_EXTRA_RESERVED,
+
+       /*
+        * In almost all cases a "zfs destroy" is expected to result in
+        * a net reduction of space. The exception is when the pool has
+        * a checkpoint, in which case the space freed by "zfs destroy"
+        * does not actually free anything internally. Thus, destroys
+        * start failing after three quarters of the slop space is used.
+        */
+       ZFS_SPACE_CHECK_DESTROY = ZFS_SPACE_CHECK_EXTRA_RESERVED,
+
+       /*
+        * A channel program can run a "zfs destroy" as part of its
+        * script and therefore has the same space_check policy when
+        * being evaluated.
+        */
+       ZFS_SPACE_CHECK_ZCP_EVAL = ZFS_SPACE_CHECK_DESTROY,
+
+       /*
+        * No space check is performed. This level of space check should
+        * be used cautiously, as operations that use it can run even when
+        * less than 0.8% of the pool's capacity is left. In that scenario,
+        * if there is a checkpoint, async destroys are suspended and any
+        * kind of freeing can potentially consume space instead of
+        * freeing it.
         *
         * See also the comments above spa_slop_shift.
         */
        ZFS_SPACE_CHECK_NONE,
+
+       ZFS_SPACE_CHECK_DISCARD_CHECKPOINT = ZFS_SPACE_CHECK_NONE,
+
 } zfs_space_check_t;
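
For a sense of scale on the thresholds above: assuming the default
spa_slop_shift of 5, the slop space is 1/32 of the pool (roughly 3.1%), so the
ZFS_SPACE_CHECK_EXTRA_RESERVED policies start failing operations once about
three quarters of that slop has been consumed, i.e. when roughly 3.1% / 4 ≈
0.8% of the pool remains free — the same 0.8% figure quoted in the comments.
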
 
 typedef struct dsl_sync_task {
@@ -85,6 +112,10 @@ int dsl_sync_task(const char *, dsl_checkfunc_t *,
     dsl_syncfunc_t *, void *, int, zfs_space_check_t);
 void dsl_sync_task_nowait(struct dsl_pool *, dsl_syncfunc_t *,
     void *, int, zfs_space_check_t, dmu_tx_t *);
+int dsl_early_sync_task(const char *, dsl_checkfunc_t *,
+    dsl_syncfunc_t *, void *, int, zfs_space_check_t);
+void dsl_early_sync_task_nowait(struct dsl_pool *, dsl_syncfunc_t *,
+    void *, int, zfs_space_check_t, dmu_tx_t *);
 
 #ifdef __cplusplus
 }
index 870618ecb886c7c44fbd9c32b80d3fda1bdf747e..0ee9b00bd98d93fbd348aad888bdd03b0a90bbd0 100644 (file)
@@ -239,6 +239,7 @@ typedef enum {
        ZPOOL_PROP_TNAME,
        ZPOOL_PROP_MAXDNODESIZE,
        ZPOOL_PROP_MULTIHOST,
+       ZPOOL_PROP_CHECKPOINT,
        ZPOOL_NUM_PROPS
 } zpool_prop_t;
 
@@ -616,6 +617,7 @@ typedef struct zpool_load_policy {
 #define        ZPOOL_CONFIG_DTL                "DTL"
 #define        ZPOOL_CONFIG_SCAN_STATS         "scan_stats"    /* not stored on disk */
 #define        ZPOOL_CONFIG_REMOVAL_STATS      "removal_stats" /* not stored on disk */
+#define        ZPOOL_CONFIG_CHECKPOINT_STATS   "checkpoint_stats" /* not on disk */
 #define        ZPOOL_CONFIG_VDEV_STATS         "vdev_stats"    /* not stored on disk */
 #define        ZPOOL_CONFIG_INDIRECT_SIZE      "indirect_size" /* not stored on disk */
 
@@ -752,6 +754,8 @@ typedef struct zpool_load_policy {
        "com.delphix:indirect_obsolete_sm"
 #define        VDEV_TOP_ZAP_OBSOLETE_COUNTS_ARE_PRECISE \
        "com.delphix:obsolete_counts_are_precise"
+#define        VDEV_TOP_ZAP_POOL_CHECKPOINT_SM \
+       "com.delphix:pool_checkpoint_sm"
 
 /*
  * This is needed in userland to report the minimum necessary device size.
@@ -861,6 +865,18 @@ typedef enum pool_scrub_cmd {
        POOL_SCRUB_FLAGS_END
 } pool_scrub_cmd_t;
 
+typedef enum {
+       CS_NONE,
+       CS_CHECKPOINT_EXISTS,
+       CS_CHECKPOINT_DISCARDING,
+       CS_NUM_STATES
+} checkpoint_state_t;
+
+typedef struct pool_checkpoint_stat {
+       uint64_t pcs_state;             /* checkpoint_state_t */
+       uint64_t pcs_start_time;        /* time checkpoint/discard started */
+       uint64_t pcs_space;             /* checkpointed space */
+} pool_checkpoint_stat_t;
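
Note that pool_checkpoint_stat_t is all uint64_t fields: like the existing scan
and removal stats, it is shipped to userland verbatim as a uint64 array under
ZPOOL_CONFIG_CHECKPOINT_STATS, which is how the nvlist_lookup_uint64_array()
calls added to zpool_main.c read it back.
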
 
 /*
  * ZIO types.  Needed to interpret vdev statistics below.
@@ -958,7 +974,7 @@ typedef struct vdev_stat {
        uint64_t        vs_scan_removing;       /* removing?    */
        uint64_t        vs_scan_processed;      /* scan processed bytes */
        uint64_t        vs_fragmentation;       /* device fragmentation */
-
+       uint64_t        vs_checkpoint_space;    /* checkpoint-consumed space */
 } vdev_stat_t;
 
 /*
@@ -1144,6 +1160,8 @@ typedef enum zfs_ioc {
        ZFS_IOC_UNLOAD_KEY,
        ZFS_IOC_CHANGE_KEY,
        ZFS_IOC_REMAP,
+       ZFS_IOC_POOL_CHECKPOINT,
+       ZFS_IOC_POOL_DISCARD_CHECKPOINT,
 
        /*
         * Linux - 3/64 numbers reserved.
@@ -1166,6 +1184,22 @@ typedef enum zfs_ioc {
  */
 #define        BLKZNAME                _IOR(0x12, 125, char[ZFS_MAX_DATASET_NAME_LEN])
 
+/*
+ * ZFS-specific error codes used for returning descriptive errors
+ * to the userland through zfs ioctls.
+ *
+ * The enum implicitly includes all the error codes from errno.h.
+ * New code should use and extend this enum for errors that are
+ * not described precisely by generic errno codes.
+ */
+typedef enum {
+       ZFS_ERR_CHECKPOINT_EXISTS = 1024,
+       ZFS_ERR_DISCARDING_CHECKPOINT,
+       ZFS_ERR_NO_CHECKPOINT,
+       ZFS_ERR_DEVRM_IN_PROGRESS,
+       ZFS_ERR_VDEV_TOO_BIG
+} zfs_errno_t;
+
 /*
  * Internal SPA load state.  Used by FMA diagnosis engine.
  */
@@ -1235,6 +1269,7 @@ typedef enum {
 #define        ZFS_IMPORT_TEMP_NAME    0x10
 #define        ZFS_IMPORT_SKIP_MMP     0x20
 #define        ZFS_IMPORT_LOAD_KEYS    0x40
+#define        ZFS_IMPORT_CHECKPOINT   0x80
 
 /*
  * Channel program argument/return nvlist keys and defaults.
index fdcf6c71be4a7c2e65a3ed4b696ff1d067f70c10..282ec231c91980c22d21bbbea6167b3463b10fac 100644 (file)
@@ -20,7 +20,7 @@
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2011, 2016 by Delphix. All rights reserved.
+ * Copyright (c) 2011, 2017 by Delphix. All rights reserved.
  */
 
 #ifndef _SYS_METASLAB_H
@@ -70,8 +70,8 @@ int metaslab_alloc(spa_t *, metaslab_class_t *, uint64_t,
 int metaslab_alloc_dva(spa_t *, metaslab_class_t *, uint64_t,
     dva_t *, int, dva_t *, uint64_t, int, zio_alloc_list_t *);
 void metaslab_free(spa_t *, const blkptr_t *, uint64_t, boolean_t);
-void metaslab_free_concrete(vdev_t *, uint64_t, uint64_t, uint64_t);
-void metaslab_free_dva(spa_t *, const dva_t *, uint64_t);
+void metaslab_free_concrete(vdev_t *, uint64_t, uint64_t, boolean_t);
+void metaslab_free_dva(spa_t *, const dva_t *, boolean_t);
 void metaslab_free_impl_cb(uint64_t, vdev_t *, uint64_t, uint64_t, void *);
 void metaslab_unalloc_dva(spa_t *, const dva_t *, uint64_t);
 int metaslab_claim(spa_t *, const blkptr_t *, uint64_t);
index 76f670a4d43f565dafc5bb87d1a4a3273240f9b4..dafd2b2310ad69c61b0db6cf7996c6d6390b3c1a 100644 (file)
@@ -24,7 +24,7 @@
  */
 
 /*
- * Copyright (c) 2011, 2016 by Delphix. All rights reserved.
+ * Copyright (c) 2011, 2017 by Delphix. All rights reserved.
  */
 
 #ifndef _SYS_METASLAB_IMPL_H
@@ -255,16 +255,16 @@ struct metaslab_group {
 
 /*
  * Each metaslab maintains a set of in-core trees to track metaslab
- * operations.  The in-core free tree (ms_tree) contains the list of
+ * operations.  The in-core free tree (ms_allocatable) contains the list of
  * free segments which are eligible for allocation.  As blocks are
- * allocated, the allocated segment are removed from the ms_tree and
- * added to a per txg allocation tree (ms_alloctree).  As blocks are
- * freed, they are added to the free tree (ms_freeingtree).  These trees
+ * allocated, the allocated segments are removed from the ms_allocatable and
+ * added to a per txg allocation tree (ms_allocating).  As blocks are
+ * freed, they are added to the free tree (ms_freeing).  These trees
  * allow us to process all allocations and frees in syncing context
  * where it is safe to update the on-disk space maps.  An additional set
  * of in-core trees is maintained to track deferred frees
- * (ms_defertree).  Once a block is freed it will move from the
- * ms_freedtree to the ms_defertree.  A deferred free means that a block
+ * (ms_defer).  Once a block is freed it will move from the
+ * ms_freed to the ms_defer tree.  A deferred free means that a block
  * has been freed but cannot be used by the pool until TXG_DEFER_SIZE
  * transactions groups later.  For example, a block that is freed in txg
  * 50 will not be available for reallocation until txg 52 (50 +
@@ -278,14 +278,14 @@ struct metaslab_group {
  *      ALLOCATE
  *         |
  *         V
- *    free segment (ms_tree) -----> ms_alloctree[4] ----> (write to space map)
+ *    free segment (ms_allocatable) -> ms_allocating[4] -> (write to space map)
  *         ^
- *         |                           ms_freeingtree <--- FREE
- *         |                                 |
- *         |                                 v
- *         |                           ms_freedtree
- *         |                                 |
- *         +-------- ms_defertree[2] <-------+---------> (write to space map)
+ *         |                        ms_freeing <--- FREE
+ *         |                             |
+ *         |                             v
+ *         |                         ms_freed
+ *         |                             |
+ *         +-------- ms_defer[2] <-------+-------> (write to space map)
  *
  *
  * Each metaslab's space is tracked in a single space map in the MOS,
@@ -296,8 +296,8 @@ struct metaslab_group {
  * To load the in-core free tree we read the space map from disk.  This
  * object contains a series of alloc and free records that are combined
  * to make up the list of all free segments in this metaslab.  These
- * segments are represented in-core by the ms_tree and are stored in an
- * AVL tree.
+ * segments are represented in-core by the ms_allocatable and are stored
+ * in an AVL tree.
  *
  * As the space map grows (as a result of the appends) it will
  * eventually become space-inefficient.  When the metaslab's in-core
@@ -317,20 +317,22 @@ struct metaslab {
        uint64_t        ms_size;
        uint64_t        ms_fragmentation;
 
-       range_tree_t    *ms_alloctree[TXG_SIZE];
-       range_tree_t    *ms_tree;
+       range_tree_t    *ms_allocating[TXG_SIZE];
+       range_tree_t    *ms_allocatable;
 
        /*
         * The following range trees are accessed only from syncing context.
         * ms_free*tree only have entries while syncing, and are empty
         * between syncs.
         */
-       range_tree_t    *ms_freeingtree; /* to free this syncing txg */
-       range_tree_t    *ms_freedtree; /* already freed this syncing txg */
-       range_tree_t    *ms_defertree[TXG_DEFER_SIZE];
+       range_tree_t    *ms_freeing;    /* to free this syncing txg */
+       range_tree_t    *ms_freed;      /* already freed this syncing txg */
+       range_tree_t    *ms_defer[TXG_DEFER_SIZE];
+       range_tree_t    *ms_checkpointing; /* to add to the checkpoint */
 
        boolean_t       ms_condensing;  /* condensing? */
        boolean_t       ms_condense_wanted;
+       uint64_t        ms_condense_checked_txg;
 
        /*
         * We must hold both ms_lock and ms_group->mg_lock in order to
@@ -356,11 +358,12 @@ struct metaslab {
        /*
         * The metaslab block allocators can optionally use a size-ordered
         * range tree and/or an array of LBAs. Not all allocators use
-        * this functionality. The ms_size_tree should always contain the
-        * same number of segments as the ms_tree. The only difference
-        * is that the ms_size_tree is ordered by segment sizes.
+        * this functionality. The ms_allocatable_by_size should always
+        * contain the same number of segments as the ms_allocatable. The
+        * only difference is that the ms_allocatable_by_size is ordered by
+        * segment sizes.
         */
-       avl_tree_t      ms_size_tree;
+       avl_tree_t      ms_allocatable_by_size;
        uint64_t        ms_lbas[MAX_LBAS];
 
        metaslab_group_t *ms_group;     /* metaslab group               */
index 9eef762de4094a7f12b304e8d8c895cf1449175c..7f79786f56dd86aaccbae41a430577ddbca1521c 100644 (file)
@@ -24,7 +24,7 @@
  */
 
 /*
- * Copyright (c) 2013, 2015 by Delphix. All rights reserved.
+ * Copyright (c) 2013, 2017 by Delphix. All rights reserved.
  */
 
 #ifndef _SYS_RANGE_TREE_H
index e8578be9a332aeb7e776b987ac95cbe16ee5795d..b6483e11b2cfe3fa63acc3335e86778abf5fb084 100644 (file)
@@ -747,6 +747,8 @@ extern int spa_import(char *pool, nvlist_t *config, nvlist_t *props,
     uint64_t flags);
 extern nvlist_t *spa_tryimport(nvlist_t *tryconfig);
 extern int spa_destroy(char *pool);
+extern int spa_checkpoint(const char *pool);
+extern int spa_checkpoint_discard(const char *pool);
 extern int spa_export(char *pool, nvlist_t **oldconfig, boolean_t force,
     boolean_t hardforce);
 extern int spa_reset(char *pool);
@@ -965,6 +967,7 @@ extern spa_load_state_t spa_load_state(spa_t *spa);
 extern uint64_t spa_freeze_txg(spa_t *spa);
 extern uint64_t spa_get_worst_case_asize(spa_t *spa, uint64_t lsize);
 extern uint64_t spa_get_dspace(spa_t *spa);
+extern uint64_t spa_get_checkpoint_space(spa_t *spa);
 extern uint64_t spa_get_slop_space(spa_t *spa);
 extern void spa_update_dspace(spa_t *spa);
 extern uint64_t spa_version(spa_t *spa);
@@ -1016,6 +1019,10 @@ extern boolean_t spa_writeable(spa_t *spa);
 extern boolean_t spa_has_pending_synctask(spa_t *spa);
 extern int spa_maxblocksize(spa_t *spa);
 extern int spa_maxdnodesize(spa_t *spa);
+extern boolean_t spa_has_checkpoint(spa_t *spa);
+extern boolean_t spa_importing_readonly_checkpoint(spa_t *spa);
+extern boolean_t spa_suspend_async_destroy(spa_t *spa);
+extern uint64_t spa_min_claim_txg(spa_t *spa);
 extern void zfs_blkptr_verify(spa_t *spa, const blkptr_t *bp);
 extern boolean_t zfs_dva_valid(spa_t *spa, const dva_t *dva,
     const blkptr_t *bp);
@@ -1027,6 +1034,7 @@ extern uint64_t spa_get_last_removal_txg(spa_t *spa);
 extern boolean_t spa_trust_config(spa_t *spa);
 extern uint64_t spa_missing_tvds_allowed(spa_t *spa);
 extern void spa_set_missing_tvds(spa_t *spa, uint64_t missing);
+extern boolean_t spa_top_vdevs_spacemap_addressable(spa_t *spa);
 extern boolean_t spa_multihost(spa_t *spa);
 extern unsigned long spa_get_hostid(void);
 
diff --git a/include/sys/spa_checkpoint.h b/include/sys/spa_checkpoint.h
new file mode 100644 (file)
index 0000000..a5c8560
--- /dev/null
@@ -0,0 +1,44 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2017 by Delphix. All rights reserved.
+ */
+
+#ifndef _SYS_SPA_CHECKPOINT_H
+#define        _SYS_SPA_CHECKPOINT_H
+
+#include <sys/zthr.h>
+
+typedef struct spa_checkpoint_info {
+       uint64_t sci_timestamp; /* when checkpointed uberblock was synced  */
+       uint64_t sci_dspace;    /* disk space used by checkpoint in bytes */
+} spa_checkpoint_info_t;
+
+int spa_checkpoint(const char *);
+int spa_checkpoint_discard(const char *);
+
+boolean_t spa_checkpoint_discard_thread_check(void *, zthr_t *);
+int spa_checkpoint_discard_thread(void *, zthr_t *);
+
+int spa_checkpoint_get_stats(spa_t *, pool_checkpoint_stat_t *);
+
+#endif /* _SYS_SPA_CHECKPOINT_H */
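
spa_checkpoint_get_stats() is the kernel-side counterpart of the
ZPOOL_CONFIG_CHECKPOINT_STATS lookup added to zpool_main.c. A rough sketch of
how a caller could publish the stats into a config nvlist — the function name
and surrounding context here are illustrative assumptions, not the actual
implementation:

    /* assumes kernel headers providing spa_t, nvlist_t and fnvlist_*() */
    static void
    publish_checkpoint_stats(spa_t *spa, nvlist_t *nvroot)
    {
            pool_checkpoint_stat_t pcs;

            if (spa_checkpoint_get_stats(spa, &pcs) == 0) {
                    /* pack the all-uint64_t struct as a uint64 array */
                    fnvlist_add_uint64_array(nvroot,
                        ZPOOL_CONFIG_CHECKPOINT_STATS, (uint64_t *)&pcs,
                        sizeof (pcs) / sizeof (uint64_t));
            }
    }
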
index 6abb631577fca6f90eaa9d83b0930d7f61780dc3..8d2a31961d66581d11e67511fe4f688e6346fcc0 100644 (file)
@@ -32,6 +32,7 @@
 #define        _SYS_SPA_IMPL_H
 
 #include <sys/spa.h>
+#include <sys/spa_checkpoint.h>
 #include <sys/vdev.h>
 #include <sys/vdev_removal.h>
 #include <sys/metaslab.h>
@@ -284,6 +285,10 @@ struct spa {
        spa_condensing_indirect_t       *spa_condensing_indirect;
        zthr_t          *spa_condense_zthr;     /* zthr doing condense. */
 
+       uint64_t        spa_checkpoint_txg;     /* the txg of the checkpoint */
+       spa_checkpoint_info_t spa_checkpoint_info; /* checkpoint accounting */
+       zthr_t          *spa_checkpoint_discard_zthr;
+
        char            *spa_root;              /* alternate root directory */
        uint64_t        spa_ena;                /* spa-wide ereport ENA */
        int             spa_last_open_failed;   /* error if last open failed */
index 457300d053285a0158066b6d2c69bc66660a9457..98b87269cb6ce37fd38e2bf2621dea70e6091642 100644 (file)
@@ -24,7 +24,7 @@
  */
 
 /*
- * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2017 by Delphix. All rights reserved.
  */
 
 #ifndef _SYS_SPACE_MAP_H
@@ -57,7 +57,7 @@ extern "C" {
 typedef struct space_map_phys {
        uint64_t        smp_object;     /* on-disk space map object */
        uint64_t        smp_objsize;    /* size of the object */
-       uint64_t        smp_alloc;      /* space allocated from the map */
+       int64_t         smp_alloc;      /* space allocated from the map */
        uint64_t        smp_pad[5];     /* reserved */
 
        /*
@@ -82,7 +82,7 @@ typedef struct space_map {
        uint64_t        sm_size;        /* size of map */
        uint8_t         sm_shift;       /* unit shift */
        uint64_t        sm_length;      /* synced length */
-       uint64_t        sm_alloc;       /* synced space allocated */
+       int64_t         sm_alloc;       /* synced space allocated */
        objset_t        *sm_os;         /* objset for this map */
        uint64_t        sm_object;      /* object id for this map */
        uint32_t        sm_blksz;       /* block size for space map */
@@ -140,6 +140,8 @@ typedef int (*sm_cb_t)(maptype_t type, uint64_t offset, uint64_t size,
 
 int space_map_load(space_map_t *sm, range_tree_t *rt, maptype_t maptype);
 int space_map_iterate(space_map_t *sm, sm_cb_t callback, void *arg);
+int space_map_incremental_destroy(space_map_t *sm, sm_cb_t callback, void *arg,
+    dmu_tx_t *tx);
 
 void space_map_histogram_clear(space_map_t *sm);
 void space_map_histogram_add(space_map_t *sm, range_tree_t *rt,
@@ -153,8 +155,8 @@ uint64_t space_map_length(space_map_t *sm);
 
 void space_map_write(space_map_t *sm, range_tree_t *rt, maptype_t maptype,
     dmu_tx_t *tx);
-void space_map_truncate(space_map_t *sm, dmu_tx_t *tx);
-uint64_t space_map_alloc(objset_t *os, dmu_tx_t *tx);
+void space_map_truncate(space_map_t *sm, int blocksize, dmu_tx_t *tx);
+uint64_t space_map_alloc(objset_t *os, int blocksize, dmu_tx_t *tx);
 void space_map_free(space_map_t *sm, dmu_tx_t *tx);
 void space_map_free_obj(objset_t *os, uint64_t smobj, dmu_tx_t *tx);
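
space_map_alloc() and space_map_truncate() now take an explicit blocksize, so
callers are no longer tied to the global 4K default. A hedged usage sketch
(the spa, sm and tx variables are assumed to exist in the caller;
vdev_standard_sm_blksz is the tunable declared in vdev_impl.h later in this
change):

    /* allocate a space map object with a caller-chosen block size */
    uint64_t smobj = space_map_alloc(spa_meta_objset(spa),
        vdev_standard_sm_blksz, tx);

    /* ... and keep the same block size when resetting it later */
    space_map_truncate(sm, vdev_standard_sm_blksz, tx);
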
 
index 08eeabdda9d689149c636f1bfea9abaf927c7557..113df7c61d7d67b2d1c4d6664a86d808dd9ff93e 100644 (file)
@@ -20,7 +20,7 @@
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2017 by Delphix. All rights reserved.
+ * Copyright (c) 2016, 2017 by Delphix. All rights reserved.
  */
 
 #ifndef _SYS_UBERBLOCK_IMPL_H
@@ -61,6 +61,28 @@ struct uberblock {
        uint64_t        ub_mmp_magic;   /* MMP_MAGIC                    */
        uint64_t        ub_mmp_delay;   /* nanosec since last MMP write */
        uint64_t        ub_mmp_seq;     /* reserved for sequence number */
+
+       /*
+        * ub_checkpoint_txg indicates two things about the current uberblock:
+        *
+        * 1] If it is not zero then this uberblock is a checkpoint. If it is
+        *    zero, then this uberblock is not a checkpoint.
+        *
+        * 2] On checkpointed uberblocks, the value of ub_checkpoint_txg is
+        *    the ub_txg that the uberblock had at the time we moved it to
+        *    the MOS config.
+        *
+        * The field is set when we checkpoint the uberblock and continues to
+        * hold that value even after we've rewound (unlike the ub_txg that
+        * is reset to a higher value).
+        *
+        * Besides checks used to determine whether we are reopening the
+        * pool from a checkpointed uberblock [see spa_ld_select_uberblock()],
+        * the value of the field is used to determine which ZIL blocks have
+        * been allocated according to the ms_sm when we are rewinding to a
+        * checkpoint. Specifically, if blk_birth > ub_checkpoint_txg, then
+        * the ZIL block is not allocated [see uses of spa_min_claim_txg()].
+        */
        uint64_t        ub_checkpoint_txg;
 };
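
The allocation rule described in the comment — a ZIL block born after
ub_checkpoint_txg cannot have been allocated according to the checkpointed
space maps — boils down to a simple predicate. The sketch below is
illustrative only (the helper name is made up; the real code expresses this
through spa_min_claim_txg(), as noted above):

    /* claimable only if the block already existed when the checkpoint was taken */
    static boolean_t
    zil_block_was_allocated(const uberblock_t *ub, const blkptr_t *bp)
    {
            if (ub->ub_checkpoint_txg != 0 &&
                bp->blk_birth > ub->ub_checkpoint_txg)
                    return (B_FALSE);
            return (B_TRUE);
    }
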
 
index 161e30ae7f4784b1e9020252f6e79b8e1572cf5d..6d31d61b586947c6a084a1858db1337fa9402a63 100644 (file)
@@ -21,7 +21,7 @@
 
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2011, 2016 by Delphix. All rights reserved.
+ * Copyright (c) 2011, 2017 by Delphix. All rights reserved.
  */
 
 #ifndef _SYS_VDEV_H
@@ -81,7 +81,7 @@ extern uint64_t vdev_create_link_zap(vdev_t *vd, dmu_tx_t *tx);
 extern void vdev_construct_zaps(vdev_t *vd, dmu_tx_t *tx);
 extern void vdev_destroy_spacemaps(vdev_t *vd, dmu_tx_t *tx);
 extern void vdev_indirect_mark_obsolete(vdev_t *vd, uint64_t offset,
-    uint64_t size, uint64_t txg);
+    uint64_t size);
 extern void spa_vdev_indirect_mark_obsolete(spa_t *spa, uint64_t vdev,
     uint64_t offset, uint64_t size, dmu_tx_t *tx);
 
@@ -122,6 +122,7 @@ extern boolean_t vdev_readable(vdev_t *vd);
 extern boolean_t vdev_writeable(vdev_t *vd);
 extern boolean_t vdev_allocatable(vdev_t *vd);
 extern boolean_t vdev_accessible(vdev_t *vd, zio_t *zio);
+extern boolean_t vdev_is_spacemap_addressable(vdev_t *vd);
 
 extern void vdev_cache_init(vdev_t *vd);
 extern void vdev_cache_fini(vdev_t *vd);
index e28994613533ad0b220dee3a544d4ce53df4bcf5..c22087307ceb5527fbfde1fc4c0b913266769fb7 100644 (file)
@@ -20,7 +20,7 @@
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2011, 2015 by Delphix. All rights reserved.
+ * Copyright (c) 2011, 2017 by Delphix. All rights reserved.
  */
 
 #ifndef _SYS_VDEV_IMPL_H
@@ -236,6 +236,9 @@ struct vdev {
        kmutex_t        vdev_queue_lock; /* protects vdev_queue_depth   */
        uint64_t        vdev_top_zap;
 
+       /* pool checkpoint related */
+       space_map_t     *vdev_checkpoint_sm;    /* contains reserved blocks */
+
        /*
         * Values stored in the config for an indirect or removing vdev.
         */
@@ -469,6 +472,7 @@ extern void vdev_set_min_asize(vdev_t *vd);
 /*
  * Global variables
  */
+extern int vdev_standard_sm_blksz;
 /* zdb uses this tunable, so it must be declared here to make lint happy. */
 extern int zfs_vdev_cache_size;
 
@@ -481,6 +485,11 @@ extern void spa_condense_indirect_start_sync(vdev_t *vd, dmu_tx_t *tx);
 extern int vdev_obsolete_sm_object(vdev_t *vd);
 extern boolean_t vdev_obsolete_counts_are_precise(vdev_t *vd);
 
+/*
+ * Other miscellaneous functions
+ */
+int vdev_checkpoint_sm_object(vdev_t *vd);
+
 #ifdef __cplusplus
 }
 #endif
index bec2cea33c29bc387c36ab65dfb1544e93b39ab4..3962237afdabcc574e0d3fc980949e18ddfe9a17 100644 (file)
@@ -14,7 +14,7 @@
  */
 
 /*
- * Copyright (c) 2014, 2015 by Delphix. All rights reserved.
+ * Copyright (c) 2014, 2017 by Delphix. All rights reserved.
  */
 
 #ifndef _SYS_VDEV_REMOVAL_H
@@ -79,7 +79,7 @@ extern void spa_condense_fini(spa_t *);
 extern void spa_start_indirect_condensing_thread(spa_t *);
 extern void spa_vdev_condense_suspend(spa_t *);
 extern int spa_vdev_remove(spa_t *, uint64_t, boolean_t);
-extern void free_from_removing_vdev(vdev_t *, uint64_t, uint64_t, uint64_t);
+extern void free_from_removing_vdev(vdev_t *, uint64_t, uint64_t);
 extern int spa_removal_get_stats(spa_t *, pool_removal_stat_t *);
 extern void svr_sync(spa_t *spa, dmu_tx_t *tx);
 extern void spa_vdev_remove_suspend(spa_t *);
index 25c12fbcc8992b0e536b463e77bd0e972e6ad41a..6c0c682a8f07ece4618e215490f8178455a66bb0 100644 (file)
@@ -566,7 +566,6 @@ extern zio_t *zio_free_sync(zio_t *pio, spa_t *spa, uint64_t txg,
 
 extern int zio_alloc_zil(spa_t *spa, objset_t *os, uint64_t txg,
     blkptr_t *new_bp, uint64_t size, boolean_t *slog);
-extern void zio_free_zil(spa_t *spa, uint64_t txg, blkptr_t *bp);
 extern void zio_flush(zio_t *zio, vdev_t *vd);
 extern void zio_shrink(zio_t *zio, uint64_t size);
 
index 6bfb6b6c0dfbb3fc6ae4a03f8753c95dab23c0e2..62da2eea811cd75a8fa145a3119755654af4abb0 100644 (file)
@@ -13,7 +13,6 @@
  * CDDL HEADER END
  */
 
-
 /*
  * Copyright (c) 2017 by Delphix. All rights reserved.
  */
index 13670c8e51233246298ee4e1be4559d0fd89773a..c59b800d3c4571c29bea2d6b5b3bae3ce08f2ea6 100644 (file)
@@ -20,7 +20,7 @@
  */
 
 /*
- * Copyright (c) 2011, 2015 by Delphix. All rights reserved.
+ * Copyright (c) 2011, 2017 by Delphix. All rights reserved.
  * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
  * Copyright (c) 2013, Joyent, Inc. All rights reserved.
  */
@@ -61,6 +61,7 @@ typedef enum spa_feature {
        SPA_FEATURE_PROJECT_QUOTA,
        SPA_FEATURE_DEVICE_REMOVAL,
        SPA_FEATURE_OBSOLETE_COUNTS,
+       SPA_FEATURE_POOL_CHECKPOINT,
        SPA_FEATURES
 } spa_feature_t;
 
index ef98b25bc071d136f56979ad42a77d5abb88706d..e8c0cdfe4e386f4114bf5a4ab6a64385db7ad6e2 100644 (file)
@@ -350,6 +350,7 @@ zpool_get_prop(zpool_handle_t *zhp, zpool_prop_t prop, char *buf,
                        break;
 
                case ZPOOL_PROP_EXPANDSZ:
+               case ZPOOL_PROP_CHECKPOINT:
                        if (intval == 0) {
                                (void) strlcpy(buf, "-", len);
                        } else if (literal) {
@@ -1379,6 +1380,48 @@ zpool_destroy(zpool_handle_t *zhp, const char *log_str)
        return (0);
 }
 
+/*
+ * Create a checkpoint in the given pool.
+ */
+int
+zpool_checkpoint(zpool_handle_t *zhp)
+{
+       libzfs_handle_t *hdl = zhp->zpool_hdl;
+       char msg[1024];
+       int error;
+
+       error = lzc_pool_checkpoint(zhp->zpool_name);
+       if (error != 0) {
+               (void) snprintf(msg, sizeof (msg), dgettext(TEXT_DOMAIN,
+                   "cannot checkpoint '%s'"), zhp->zpool_name);
+               (void) zpool_standard_error(hdl, error, msg);
+               return (-1);
+       }
+
+       return (0);
+}
+
+/*
+ * Discard the checkpoint from the given pool.
+ */
+int
+zpool_discard_checkpoint(zpool_handle_t *zhp)
+{
+       libzfs_handle_t *hdl = zhp->zpool_hdl;
+       char msg[1024];
+       int error;
+
+       error = lzc_pool_checkpoint_discard(zhp->zpool_name);
+       if (error != 0) {
+               (void) snprintf(msg, sizeof (msg), dgettext(TEXT_DOMAIN,
+                   "cannot discard checkpoint in '%s'"), zhp->zpool_name);
+               (void) zpool_standard_error(hdl, error, msg);
+               return (-1);
+       }
+
+       return (0);
+}
+
 /*
  * Add the given vdevs to the pool.  The caller must have already performed the
  * necessary verification to ensure that the vdev specification is well-formed.
index 855c72fab3ff78997d2de597af76469b46c7388b..a19b34415dbd81cd3febea95ad8859b6c26f192a 100644 (file)
@@ -22,7 +22,7 @@
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2018, Joyent, Inc. All rights reserved.
- * Copyright (c) 2011, 2014 by Delphix. All rights reserved.
+ * Copyright (c) 2011, 2017 by Delphix. All rights reserved.
  * Copyright 2016 Igor Kozhukhov <ikozhukhov@gmail.com>
  * Copyright (c) 2017 Datto Inc.
  */
@@ -264,6 +264,17 @@ libzfs_error_description(libzfs_handle_t *hdl)
        case EZFS_NO_PENDING:
                return (dgettext(TEXT_DOMAIN, "operation is not "
                    "in progress"));
+       case EZFS_CHECKPOINT_EXISTS:
+               return (dgettext(TEXT_DOMAIN, "checkpoint exists"));
+       case EZFS_DISCARDING_CHECKPOINT:
+               return (dgettext(TEXT_DOMAIN, "currently discarding "
+                   "checkpoint"));
+       case EZFS_NO_CHECKPOINT:
+               return (dgettext(TEXT_DOMAIN, "checkpoint does not exist"));
+       case EZFS_DEVRM_IN_PROGRESS:
+               return (dgettext(TEXT_DOMAIN, "device removal in progress"));
+       case EZFS_VDEV_TOO_BIG:
+               return (dgettext(TEXT_DOMAIN, "device exceeds supported size"));
        case EZFS_ACTIVE_POOL:
                return (dgettext(TEXT_DOMAIN, "pool is imported on a "
                    "different host"));
@@ -530,7 +541,21 @@ zpool_standard_error_fmt(libzfs_handle_t *hdl, int error, const char *fmt, ...)
        case EREMOTEIO:
                zfs_verror(hdl, EZFS_ACTIVE_POOL, fmt, ap);
                break;
-
+       case ZFS_ERR_CHECKPOINT_EXISTS:
+               zfs_verror(hdl, EZFS_CHECKPOINT_EXISTS, fmt, ap);
+               break;
+       case ZFS_ERR_DISCARDING_CHECKPOINT:
+               zfs_verror(hdl, EZFS_DISCARDING_CHECKPOINT, fmt, ap);
+               break;
+       case ZFS_ERR_NO_CHECKPOINT:
+               zfs_verror(hdl, EZFS_NO_CHECKPOINT, fmt, ap);
+               break;
+       case ZFS_ERR_DEVRM_IN_PROGRESS:
+               zfs_verror(hdl, EZFS_DEVRM_IN_PROGRESS, fmt, ap);
+               break;
+       case ZFS_ERR_VDEV_TOO_BIG:
+               zfs_verror(hdl, EZFS_VDEV_TOO_BIG, fmt, ap);
+               break;
        default:
                zfs_error_aux(hdl, strerror(error));
                zfs_verror(hdl, EZFS_UNKNOWN, fmt, ap);
index 5a46042a4599d3c9086f2e2df4c426d86323dc8f..ab2b705193217b6150736b12f2201e4063579b34 100644 (file)
@@ -1142,6 +1142,74 @@ lzc_channel_program(const char *pool, const char *program, uint64_t instrlimit,
            memlimit, argnvl, outnvl));
 }
 
+/*
+ * Creates a checkpoint for the specified pool.
+ *
+ * If this function returns 0 the pool was successfully checkpointed.
+ *
+ * This method may also return:
+ *
+ * ZFS_ERR_CHECKPOINT_EXISTS
+ *     The pool already has a checkpoint. A pool can have at most
+ *     one checkpoint at any given time.
+ *
+ * ZFS_ERR_DISCARDING_CHECKPOINT
+ *     ZFS is in the middle of discarding a checkpoint for this pool.
+ *     The pool can be checkpointed again once the discard is done.
+ *
+ * ZFS_ERR_DEVRM_IN_PROGRESS
+ *     A vdev is currently being removed. The pool cannot be
+ *     checkpointed until the device removal is done.
+ *
+ * ZFS_ERR_VDEV_TOO_BIG
+ *     One or more top-level vdevs exceed the maximum vdev size
+ *     supported for this feature.
+ */
+int
+lzc_pool_checkpoint(const char *pool)
+{
+       int error;
+
+       nvlist_t *result = NULL;
+       nvlist_t *args = fnvlist_alloc();
+
+       error = lzc_ioctl(ZFS_IOC_POOL_CHECKPOINT, pool, args, &result);
+
+       fnvlist_free(args);
+       fnvlist_free(result);
+
+       return (error);
+}
+
+/*
+ * Discards the checkpoint from the specified pool.
+ *
+ * If this function returns 0 the checkpoint was successfully discarded.
+ *
+ * This method may also return:
+ *
+ * ZFS_ERR_NO_CHECKPOINT
+ *     The pool does not have a checkpoint.
+ *
+ * ZFS_ERR_DISCARDING_CHECKPOINT
+ *     ZFS is already in the middle of discarding the checkpoint.
+ */
+int
+lzc_pool_checkpoint_discard(const char *pool)
+{
+       int error;
+
+       nvlist_t *result = NULL;
+       nvlist_t *args = fnvlist_alloc();
+
+       error = lzc_ioctl(ZFS_IOC_POOL_DISCARD_CHECKPOINT, pool, args, &result);
+
+       fnvlist_free(args);
+       fnvlist_free(result);
+
+       return (error);
+}
+
 /*
  * Executes a read-only channel program.
  *
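As a usage illustration of the two libzfs_core entry points added above, the
following userland sketch checkpoints a pool and later discards the checkpoint.
The pool name, error handling, and program structure are illustrative
assumptions and not part of this commit:

    #include <stdio.h>
    #include <string.h>
    #include <libzfs_core.h>

    int
    main(void)
    {
            const char *pool = "tank";      /* assumed pool name */
            int error;

            if (libzfs_core_init() != 0) {
                    fprintf(stderr, "libzfs_core_init failed\n");
                    return (1);
            }

            /* Take a pool-wide checkpoint before a risky procedure. */
            error = lzc_pool_checkpoint(pool);
            if (error != 0)
                    fprintf(stderr, "checkpoint failed: %s\n", strerror(error));

            /* ... perform the procedure, then drop the checkpoint ... */
            error = lzc_pool_checkpoint_discard(pool);
            if (error != 0)
                    fprintf(stderr, "discard failed: %s\n", strerror(error));

            libzfs_core_fini();
            return (error == 0 ? 0 : 1);
    }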
index f2a7a00a85d9b65d81416fce92a5480891fa1150..58e3b6eec7d212242d13da610d25734cc3673b2e 100644 (file)
@@ -92,6 +92,7 @@ KERNEL_C = \
        skein_zfs.c \
        spa.c \
        spa_boot.c \
+       spa_checkpoint.c \
        spa_config.c \
        spa_errlog.c \
        spa_history.c \
index 1eaa51b9f86eaf127543a182f4666864a37b1a03..341548ac300a1077cd809e3003f79bfecbc809e5 100644 (file)
@@ -497,8 +497,6 @@ vn_open(char *path, int x1, int flags, int mode, vnode_t **vpp, int x2, int x3)
 #ifdef __linux__
                flags |= O_DIRECT;
 #endif
-               /* We shouldn't be writing to block devices in userspace */
-               VERIFY(!(flags & FWRITE));
        }
 
        if (flags & FCREAT)
index fe58e6e599ac8d5c62c76597708615c3537291db..9125b5b1f789b804e3a02e394ae4461b29401728 100644 (file)
@@ -293,7 +293,7 @@ Use \fB1\fR for yes (default) and \fB0\fR for no.
 .sp
 .ne 2
 .na
-\fBmetaslabs_per_vdev\fR (int)
+\fBvdev_max_ms_count\fR (int)
 .ad
 .RS 12n
 When a vdev is added, it will be divided into approximately (but no more than) this number of metaslabs.
@@ -301,6 +301,17 @@ When a vdev is added, it will be divided into approximately (but no more than) t
 Default value: \fB200\fR.
 .RE
 
+.sp
+.ne 2
+.na
+\fBvdev_min_ms_count\fR (int)
+.ad
+.RS 12n
+Minimum number of metaslabs to create in a top-level vdev.
+.sp
+Default value: \fB16\fR.
+.RE
+
 .sp
 .ne 2
 .na
@@ -2100,6 +2111,18 @@ Flushing of data to disk is done in passes. Defer frees starting in this pass
 Default value: \fB2\fR.
 .RE
 
+.sp
+.ne 2
+.na
+\fBzfs_spa_discard_memory_limit\fR (int)
+.ad
+.RS 12n
+Maximum memory used for prefetching a checkpoint's space map on each
+vdev while discarding the checkpoint.
+.sp
+Default value: \fB16,777,216\fR.
+.RE
+
 .sp
 .ne 2
 .na
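For reference, on Linux the module parameters documented above can typically be
read and tuned at runtime through sysfs; the values shown are illustrative only:

    # cat /sys/module/zfs/parameters/zfs_spa_discard_memory_limit
    16777216
    # echo 33554432 > /sys/module/zfs/parameters/zfs_spa_discard_memory_limit
    # cat /sys/module/zfs/parameters/vdev_max_ms_count
    200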
index ce34a05a22fa7724db41652b922ce7295fb55031..7cad9a27bbe0d5119de1a0b4948e54a889575e53 100644 (file)
@@ -1,5 +1,5 @@
 '\" te
-.\" Copyright (c) 2013, 2016 by Delphix. All rights reserved.
+.\" Copyright (c) 2013, 2017 by Delphix. All rights reserved.
 .\" Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
 .\" Copyright (c) 2014, Joyent, Inc. All rights reserved.
 .\" The contents of this file are subject to the terms of the Common Development
@@ -484,6 +484,24 @@ used on a top-level vdev, and will never return to being \fBenabled\fR.
 .sp
 .ne 2
 .na
+\fB\fBzpool_checkpoint\fR\fR
+.ad
+.RS 4n
+.TS
+l l .
+GUID   com.delphix:zpool_checkpoint
+READ\-ONLY COMPATIBLE  yes
+DEPENDENCIES   none
+.TE
+
+This feature enables the "zpool checkpoint" subcommand, which
+checkpoints the state of the pool at the time it is issued so that
+the pool can later be rewound back to that state or the checkpoint
+discarded.
+
+This feature becomes \fBactive\fR when the "zpool checkpoint" command
+is used to checkpoint the pool.
+The feature only returns to being \fBenabled\fR when the pool
+is rewound or the checkpoint has been discarded.
 \fB\fBlarge_blocks\fR\fR
 .ad
 .RS 4n
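Like other pool features, the state of this one can be queried through its
feature property; a hypothetical pool named tank is assumed:

    # zpool get feature@zpool_checkpoint tank

The reported value moves from enabled to active once "zpool checkpoint" is run,
and back to enabled after the pool is rewound or the checkpoint is discarded,
as described above.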
index 0ce4d852d890ea82cee0a32d3c6ecf1c0e5ebdf9..f00d9c0cfef730f371f66b8f96df0185867b51f1 100644 (file)
@@ -23,7 +23,7 @@
 .Nd display zpool debugging and consistency information
 .Sh SYNOPSIS
 .Nm
-.Op Fl AbcdDFGhiLMPsvX
+.Op Fl AbcdDFGhikLMPsvX
 .Op Fl e Oo Fl V Oc Op Fl p Ar path ...
 .Op Fl I Ar inflight I/Os
 .Oo Fl o Ar var Ns = Ns Ar value Oc Ns ...
@@ -172,6 +172,9 @@ Display information about intent log
 .Pq ZIL
 entries relating to each dataset.
 If specified multiple times, display counts of each intent log transaction type.
+.It Fl k
+Examine the checkpointed state of the pool.
+Note that the on-disk format of the pool is not reverted to the checkpointed state.
 .It Fl l Ar device
 Read the vdev labels from the specified device.
 .Nm Fl l
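A hypothetical invocation of the new -k flag (pool name assumed) that examines
the checkpointed state without modifying the pool:

    # zdb -k tank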
index 2280518091cd98c967021fbc5d424dd396c888ca..ddcba2e7206f94b541dd40f006f6684455d6594b 100644 (file)
 .Oo Fl o Ar property Ns = Ns Ar value Oc
 .Ar pool device new_device
 .Nm
+.Cm checkpoint
+.Op Fl d, -discard
+.Ar pool
+.Nm
 .Cm clear
 .Ar pool
 .Op Ar device
@@ -93,6 +97,7 @@
 .Fl a
 .Op Fl DflmN
 .Op Fl F Oo Fl n Oc Oo Fl T Oc Oo Fl X Oc
+.Op Fl -rewind-to-checkpoint
 .Op Fl c Ar cachefile Ns | Ns Fl d Ar dir Ns | Ns device
 .Op Fl o Ar mntopts
 .Oo Fl o Ar property Ns = Ns Ar value Oc Ns ...
 .Cm import
 .Op Fl Dflm
 .Op Fl F Oo Fl n Oc Oo Fl T Oc Oo Fl X Oc
+.Op Fl -rewind-to-checkpoint
 .Op Fl c Ar cachefile Ns | Ns Fl d Ar dir Ns | Ns device
 .Op Fl o Ar mntopts
 .Oo Fl o Ar property Ns = Ns Ar value Oc Ns ...
@@ -470,6 +476,50 @@ configuration.
 .Pp
 The content of the cache devices is considered volatile, as is the case with
 other system caches.
+.Ss Pool checkpoint
+Before starting critical procedures that include destructive actions (e.g.
+.Nm zfs Cm destroy
+), an administrator can checkpoint the pool's state and, in the case of a
+mistake or failure, rewind the entire pool back to the checkpoint.
+Otherwise, the checkpoint can be discarded when the procedure has completed
+successfully.
+.Pp
+A pool checkpoint can be thought of as a pool-wide snapshot and should be used
+with care as it contains every part of the pool's state, from properties to vdev
+configuration.
+Thus, while a pool has a checkpoint, certain operations are not allowed;
+specifically, vdev removal/attach/detach, mirror splitting, and
+changing the pool's guid are all prohibited.
+Adding a new vdev is supported, but in the case of a rewind it will have to be
+added again.
+Finally, users of this feature should keep in mind that scrubs in a pool that
+has a checkpoint do not repair checkpointed data.
+.Pp
+To create a checkpoint for a pool:
+.Bd -literal
+# zpool checkpoint pool
+.Ed
+.Pp
+To later rewind to the checkpointed state, first export the pool and
+then rewind it during import:
+.Bd -literal
+# zpool export pool
+# zpool import --rewind-to-checkpoint pool
+.Ed
+.Pp
+To discard the checkpoint from a pool:
+.Bd -literal
+# zpool checkpoint -d pool
+.Ed
+.Pp
+Dataset reservations (controlled by the
+.Nm reservation
+or
+.Nm refreservation
+zfs properties) may be unenforceable while a checkpoint exists, because the
+checkpoint is allowed to consume the dataset's reservation.
+Finally, data that is part of the checkpoint but has been freed in the
+current state of the pool won't be scanned during a scrub.
 .Ss Properties
 Each pool has several properties associated with it.
 Some properties are read-only statistics while others are configurable and
@@ -856,6 +906,39 @@ supported at the moment is ashift.
 .El
 .It Xo
 .Nm
+.Cm checkpoint
+.Op Fl d, -discard
+.Ar pool
+.Xc
+Checkpoints the current state of
+.Ar pool
+, which can later be restored by
+.Nm zpool Cm import --rewind-to-checkpoint .
+The existence of a checkpoint in a pool prohibits the following
+.Nm zpool
+commands:
+.Cm remove ,
+.Cm attach ,
+.Cm detach ,
+.Cm split ,
+and
+.Cm reguid .
+In addition, it may break reservation boundaries if the pool lacks free
+space.
+The
+.Nm zpool Cm status
+command indicates the existence of a checkpoint or the progress of discarding a
+checkpoint from a pool.
+The
+.Nm zpool Cm list
+command reports how much space the checkpoint takes from the pool.
+.Bl -tag -width Ds
+.It Fl d, -discard
+Discards an existing checkpoint from
+.Ar pool .
+.El
+.It Xo
+.Nm
 .Cm clear
 .Ar pool
 .Op Ar device
@@ -1290,6 +1373,16 @@ and the
 .Sy altroot
 property to
 .Ar root .
+.It Fl -rewind-to-checkpoint
+Rewinds the pool to the checkpointed state.
+Once the pool is imported with this flag, there is no way to undo the rewind.
+All changes and data that were written after the checkpoint are lost!
+The only exception is when the
+.Sy readonly
+mounting option is enabled.
+In this case, the checkpointed state of the pool is opened and an
+administrator can see how the pool would look if they were
+to fully rewind.
 .It Fl s
 Scan using the default search path, the libblkid cache will not be
 consulted. A custom search path may be specified by setting the
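Tying the checkpoint-related additions to this man page together, one plausible
workflow (pool name and procedure illustrative) is to inspect the checkpointed
state read-only before committing to a destructive rewind:

    # zpool checkpoint tank
    (run the risky administrative procedure)
    # zpool export tank
    # zpool import -o readonly=on --rewind-to-checkpoint tank
    (inspect the checkpointed state, then export again)
    # zpool export tank
    # zpool import --rewind-to-checkpoint tank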
index ea1bccf5092939f894989613a3673fe097dd1246..b010c88434505d13919b7fd27c6c9cf588c00a65 100644 (file)
@@ -20,7 +20,7 @@
  */
 
 /*
- * Copyright (c) 2011, 2015 by Delphix. All rights reserved.
+ * Copyright (c) 2011, 2017 by Delphix. All rights reserved.
  * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
  * Copyright (c) 2013, Joyent, Inc. All rights reserved.
  * Copyright (c) 2014, Nexenta Systems, Inc. All rights reserved.
@@ -210,6 +210,11 @@ zpool_feature_init(void)
            hole_birth_deps);
        }
 
+       zfeature_register(SPA_FEATURE_POOL_CHECKPOINT,
+           "com.delphix:zpool_checkpoint", "zpool_checkpoint",
+           "Pool state can be checkpointed, allowing rewind later.",
+           ZFEATURE_FLAG_READONLY_COMPAT, NULL);
+
        zfeature_register(SPA_FEATURE_EXTENSIBLE_DATASET,
            "com.delphix:extensible_dataset", "extensible_dataset",
            "Enhanced dataset functionality, used by other features.",
index bc38eca7de904c719bf2a5c2450f19b72a072a5e..dc0bb59bc8da193d4c965e7037f3221d4445c952 100644 (file)
@@ -21,7 +21,7 @@
 /*
  * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
- * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2017 by Delphix. All rights reserved.
  */
 
 #include <sys/zio.h>
@@ -79,6 +79,8 @@ zpool_prop_init(void)
            ZFS_TYPE_POOL, "<size>", "FREE");
        zprop_register_number(ZPOOL_PROP_FREEING, "freeing", 0, PROP_READONLY,
            ZFS_TYPE_POOL, "<size>", "FREEING");
+       zprop_register_number(ZPOOL_PROP_CHECKPOINT, "checkpoint", 0,
+           PROP_READONLY, ZFS_TYPE_POOL, "<size>", "CKPOINT");
        zprop_register_number(ZPOOL_PROP_LEAKED, "leaked", 0, PROP_READONLY,
            ZFS_TYPE_POOL, "<size>", "LEAKED");
        zprop_register_number(ZPOOL_PROP_ALLOCATED, "allocated", 0,
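With the new read-only checkpoint property registered above, the space held by
a checkpoint can be listed alongside the usual columns; the pool name is
illustrative:

    # zpool list -o name,size,allocated,checkpoint tank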
index 1c2187c56cea6127b26652fe9509419392a73592..d8d1e3a23925a4b84ba3e0fd463c62c79e5194b6 100644 (file)
@@ -68,6 +68,7 @@ $(MODULE)-objs += sha256.o
 $(MODULE)-objs += skein_zfs.o
 $(MODULE)-objs += spa.o
 $(MODULE)-objs += spa_boot.o
+$(MODULE)-objs += spa_checkpoint.o
 $(MODULE)-objs += spa_config.o
 $(MODULE)-objs += spa_errlog.o
 $(MODULE)-objs += spa_history.o
index a5f468ac8db0785c04797c7fca2e7c4ec6b9ee2d..f0b5356180a871b7fbfb3f2e54ed133551899351 100644 (file)
@@ -20,7 +20,7 @@
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012, 2016 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
  */
 
 #include <sys/zfs_context.h>
@@ -81,8 +81,8 @@ traverse_zil_block(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg)
        if (BP_IS_HOLE(bp))
                return (0);
 
-       if (claim_txg == 0 && bp->blk_birth >= spa_first_txg(td->td_spa))
-               return (0);
+       if (claim_txg == 0 && bp->blk_birth >= spa_min_claim_txg(td->td_spa))
+               return (-1);
 
        SET_BOOKMARK(&zb, td->td_objset, ZB_ZIL_OBJECT, ZB_ZIL_LEVEL,
            bp->blk_cksum.zc_word[ZIL_ZC_SEQ]);
@@ -121,20 +121,17 @@ static void
 traverse_zil(traverse_data_t *td, zil_header_t *zh)
 {
        uint64_t claim_txg = zh->zh_claim_txg;
-       zilog_t *zilog;
 
        /*
         * We only want to visit blocks that have been claimed but not yet
-        * replayed; plus, in read-only mode, blocks that are already stable.
+        * replayed; plus blocks that are already stable in read-only mode.
         */
        if (claim_txg == 0 && spa_writeable(td->td_spa))
                return;
 
-       zilog = zil_alloc(spa_get_dsl(td->td_spa)->dp_meta_objset, zh);
-
+       zilog_t *zilog = zil_alloc(spa_get_dsl(td->td_spa)->dp_meta_objset, zh);
        (void) zil_parse(zilog, traverse_zil_block, traverse_zil_record, td,
            claim_txg, !(td->td_flags & TRAVERSE_NO_DECRYPT));
-
        zil_free(zilog);
 }
 
index ab687f7cc7ff07c39fb1a531de17186dfa07f6fa..fddad607d09d6e06b73a9ba5d290e5e77e886b60 100644 (file)
@@ -1284,6 +1284,8 @@ dnode_hold_impl(objset_t *os, uint64_t object, int flag, int slots,
            (spa_is_root(os->os_spa) &&
            spa_config_held(os->os_spa, SCL_STATE, RW_WRITER)));
 
+       ASSERT((flag & DNODE_MUST_BE_ALLOCATED) || (flag & DNODE_MUST_BE_FREE));
+
        if (object == DMU_USERUSED_OBJECT || object == DMU_GROUPUSED_OBJECT ||
            object == DMU_PROJECTUSED_OBJECT) {
                if (object == DMU_USERUSED_OBJECT)
@@ -1519,7 +1521,8 @@ dnode_hold_impl(objset_t *os, uint64_t object, int flag, int slots,
                mutex_exit(&dn->dn_mtx);
                dnode_slots_rele(dnc, idx, slots);
                dbuf_rele(db, FTAG);
-               return (SET_ERROR(type == DMU_OT_NONE ? ENOENT : EEXIST));
+               return (SET_ERROR((flag & DNODE_MUST_BE_ALLOCATED) ?
+                   ENOENT : EEXIST));
        }
 
        if (refcount_add(&dn->dn_holds, tag) == 1)
index 96e7bccc9217570cd1c92d27bdb150eac59c3475..044031e4f7b11740ca9002d7666abece5cd9fa32 100644 (file)
@@ -644,7 +644,7 @@ dnode_sync(dnode_t *dn, dmu_tx_t *tx)
                    dn->dn_maxblkid == 0 || list_head(list) != NULL ||
                    dn->dn_next_blksz[txgoff] >> SPA_MINBLOCKSHIFT ==
                    dnp->dn_datablkszsec ||
-                   range_tree_space(dn->dn_free_ranges[txgoff]) != 0);
+                   !range_tree_is_empty(dn->dn_free_ranges[txgoff]));
                dnp->dn_datablkszsec =
                    dn->dn_next_blksz[txgoff] >> SPA_MINBLOCKSHIFT;
                dn->dn_next_blksz[txgoff] = 0;
index 9db6d1e0be0fd8c4978b5d6ecf7d948a2f0da6ce..bb9b4a1c78cba16aeb1677019694c02b816de576 100644 (file)
@@ -46,6 +46,7 @@
 #include <sys/zfs_context.h>
 #include <sys/zfs_ioctl.h>
 #include <sys/spa.h>
+#include <sys/spa_impl.h>
 #include <sys/vdev.h>
 #include <sys/zfs_znode.h>
 #include <sys/zfs_onexit.h>
@@ -208,7 +209,9 @@ int
 dsl_dataset_block_kill(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx,
     boolean_t async)
 {
-       int used = bp_get_dsize_sync(tx->tx_pool->dp_spa, bp);
+       spa_t *spa = dmu_tx_pool(tx)->dp_spa;
+
+       int used = bp_get_dsize_sync(spa, bp);
        int compressed = BP_GET_PSIZE(bp);
        int uncompressed = BP_GET_UCSIZE(bp);
 
@@ -3821,7 +3824,8 @@ dsl_dataset_set_refquota(const char *dsname, zprop_source_t source,
        ddsqra.ddsqra_value = refquota;
 
        return (dsl_sync_task(dsname, dsl_dataset_set_refquota_check,
-           dsl_dataset_set_refquota_sync, &ddsqra, 0, ZFS_SPACE_CHECK_NONE));
+           dsl_dataset_set_refquota_sync, &ddsqra, 0,
+           ZFS_SPACE_CHECK_EXTRA_RESERVED));
 }
 
 static int
@@ -3936,8 +3940,8 @@ dsl_dataset_set_refreservation(const char *dsname, zprop_source_t source,
        ddsqra.ddsqra_value = refreservation;
 
        return (dsl_sync_task(dsname, dsl_dataset_set_refreservation_check,
-           dsl_dataset_set_refreservation_sync, &ddsqra,
-           0, ZFS_SPACE_CHECK_NONE));
+           dsl_dataset_set_refreservation_sync, &ddsqra, 0,
+           ZFS_SPACE_CHECK_EXTRA_RESERVED));
 }
 
 /*
index b3296ceee3b377db19ff002cfb134d3a68667a43..b450073cca890b79fbd1d1de712c97719873a761 100644 (file)
@@ -1036,7 +1036,7 @@ dsl_destroy_head(const char *name)
 
                error = dsl_sync_task(name, dsl_destroy_head_check,
                    dsl_destroy_head_begin_sync, &ddha,
-                   0, ZFS_SPACE_CHECK_NONE);
+                   0, ZFS_SPACE_CHECK_DESTROY);
                if (error != 0)
                        return (error);
 
@@ -1062,7 +1062,7 @@ dsl_destroy_head(const char *name)
        }
 
        return (dsl_sync_task(name, dsl_destroy_head_check,
-           dsl_destroy_head_sync, &ddha, 0, ZFS_SPACE_CHECK_NONE));
+           dsl_destroy_head_sync, &ddha, 0, ZFS_SPACE_CHECK_DESTROY));
 }
 
 /*
index 36abfe0241b046c6f822a8a5d631cf1d92b8bdeb..519c94b64a7f07222db5c8bde5420ec43b1a7a0a 100644 (file)
@@ -20,7 +20,7 @@
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012, 2016 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2017 by Delphix. All rights reserved.
  * Copyright (c) 2013 Martin Matuska. All rights reserved.
  * Copyright (c) 2014 Joyent, Inc. All rights reserved.
  * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
@@ -942,14 +942,14 @@ dsl_dir_create_sync(dsl_pool_t *dp, dsl_dir_t *pds, const char *name,
        ddobj = dmu_object_alloc(mos, DMU_OT_DSL_DIR, 0,
            DMU_OT_DSL_DIR, sizeof (dsl_dir_phys_t), tx);
        if (pds) {
-               VERIFY(0 == zap_add(mos, dsl_dir_phys(pds)->dd_child_dir_zapobj,
+               VERIFY0(zap_add(mos, dsl_dir_phys(pds)->dd_child_dir_zapobj,
                    name, sizeof (uint64_t), 1, &ddobj, tx));
        } else {
                /* it's the root dir */
-               VERIFY(0 == zap_add(mos, DMU_POOL_DIRECTORY_OBJECT,
+               VERIFY0(zap_add(mos, DMU_POOL_DIRECTORY_OBJECT,
                    DMU_POOL_ROOT_DATASET, sizeof (uint64_t), 1, &ddobj, tx));
        }
-       VERIFY(0 == dmu_bonus_hold(mos, ddobj, FTAG, &dbuf));
+       VERIFY0(dmu_bonus_hold(mos, ddobj, FTAG, &dbuf));
        dmu_buf_will_dirty(dbuf, tx);
        ddphys = dbuf->db_data;
 
@@ -987,6 +987,12 @@ dsl_dir_get_used(dsl_dir_t *dd)
        return (dsl_dir_phys(dd)->dd_used_bytes);
 }
 
+uint64_t
+dsl_dir_get_compressed(dsl_dir_t *dd)
+{
+       return (dsl_dir_phys(dd)->dd_compressed_bytes);
+}
+
 uint64_t
 dsl_dir_get_quota(dsl_dir_t *dd)
 {
@@ -1215,7 +1221,8 @@ dsl_dir_space_available(dsl_dir_t *dd,
                used += dsl_dir_space_towrite(dd);
 
        if (dd->dd_parent == NULL) {
-               uint64_t poolsize = dsl_pool_adjustedsize(dd->dd_pool, FALSE);
+               uint64_t poolsize = dsl_pool_adjustedsize(dd->dd_pool,
+                   ZFS_SPACE_CHECK_NORMAL);
                quota = MIN(quota, poolsize);
        }
 
@@ -1326,11 +1333,12 @@ top_of_function:
         */
        uint64_t deferred = 0;
        if (dd->dd_parent == NULL) {
-               spa_t *spa = dd->dd_pool->dp_spa;
-               uint64_t poolsize = dsl_pool_adjustedsize(dd->dd_pool, netfree);
-               deferred = metaslab_class_get_deferred(spa_normal_class(spa));
-               if (poolsize - deferred < quota) {
-                       quota = poolsize - deferred;
+               uint64_t avail = dsl_pool_unreserved_space(dd->dd_pool,
+                   (netfree) ?
+                   ZFS_SPACE_CHECK_RESERVED : ZFS_SPACE_CHECK_NORMAL);
+
+               if (avail < quota) {
+                       quota = avail;
                        retval = ENOSPC;
                }
        }
@@ -1684,7 +1692,8 @@ dsl_dir_set_quota(const char *ddname, zprop_source_t source, uint64_t quota)
        ddsqra.ddsqra_value = quota;
 
        return (dsl_sync_task(ddname, dsl_dir_set_quota_check,
-           dsl_dir_set_quota_sync, &ddsqra, 0, ZFS_SPACE_CHECK_NONE));
+           dsl_dir_set_quota_sync, &ddsqra, 0,
+           ZFS_SPACE_CHECK_EXTRA_RESERVED));
 }
 
 int
@@ -1727,7 +1736,8 @@ dsl_dir_set_reservation_check(void *arg, dmu_tx_t *tx)
                avail = dsl_dir_space_available(dd->dd_parent,
                    NULL, 0, FALSE);
        } else {
-               avail = dsl_pool_adjustedsize(dd->dd_pool, B_FALSE) - used;
+               avail = dsl_pool_adjustedsize(dd->dd_pool,
+                   ZFS_SPACE_CHECK_NORMAL) - used;
        }
 
        if (MAX(used, newval) > MAX(used, dsl_dir_phys(dd)->dd_reserved)) {
@@ -1805,7 +1815,8 @@ dsl_dir_set_reservation(const char *ddname, zprop_source_t source,
        ddsqra.ddsqra_value = reservation;
 
        return (dsl_sync_task(ddname, dsl_dir_set_reservation_check,
-           dsl_dir_set_reservation_sync, &ddsqra, 0, ZFS_SPACE_CHECK_NONE));
+           dsl_dir_set_reservation_sync, &ddsqra, 0,
+           ZFS_SPACE_CHECK_EXTRA_RESERVED));
 }
 
 static dsl_dir_t *
index 1bb49c13ae15cced8c3b625235ee5002c8f8c926..e8f519b18b01d47a2e1c7fbf8a2fceeb91676c8d 100644 (file)
@@ -43,6 +43,8 @@
 #include <sys/zfs_znode.h>
 #include <sys/spa_impl.h>
 #include <sys/dsl_deadlist.h>
+#include <sys/vdev_impl.h>
+#include <sys/metaslab_impl.h>
 #include <sys/bptree.h>
 #include <sys/zfeature.h>
 #include <sys/zil_impl.h>
@@ -201,6 +203,8 @@ dsl_pool_open_impl(spa_t *spa, uint64_t txg)
            offsetof(dsl_dir_t, dd_dirty_link));
        txg_list_create(&dp->dp_sync_tasks, spa,
            offsetof(dsl_sync_task_t, dst_node));
+       txg_list_create(&dp->dp_early_sync_tasks, spa,
+           offsetof(dsl_sync_task_t, dst_node));
 
        dp->dp_sync_taskq = taskq_create("dp_sync_taskq",
            zfs_sync_taskq_batch_pct, minclsyspri, 1, INT_MAX,
@@ -385,6 +389,7 @@ dsl_pool_close(dsl_pool_t *dp)
        txg_list_destroy(&dp->dp_dirty_datasets);
        txg_list_destroy(&dp->dp_dirty_zilogs);
        txg_list_destroy(&dp->dp_sync_tasks);
+       txg_list_destroy(&dp->dp_early_sync_tasks);
        txg_list_destroy(&dp->dp_dirty_dirs);
 
        taskq_destroy(dp->dp_zil_clean_taskq);
@@ -574,6 +579,29 @@ dsl_pool_dirty_delta(dsl_pool_t *dp, int64_t delta)
                cv_signal(&dp->dp_spaceavail_cv);
 }
 
+#ifdef ZFS_DEBUG
+static boolean_t
+dsl_early_sync_task_verify(dsl_pool_t *dp, uint64_t txg)
+{
+       spa_t *spa = dp->dp_spa;
+       vdev_t *rvd = spa->spa_root_vdev;
+
+       for (uint64_t c = 0; c < rvd->vdev_children; c++) {
+               vdev_t *vd = rvd->vdev_child[c];
+               txg_list_t *tl = &vd->vdev_ms_list;
+               metaslab_t *ms;
+
+               for (ms = txg_list_head(tl, TXG_CLEAN(txg)); ms;
+                   ms = txg_list_next(tl, ms, TXG_CLEAN(txg))) {
+                       VERIFY(range_tree_is_empty(ms->ms_freeing));
+                       VERIFY(range_tree_is_empty(ms->ms_checkpointing));
+               }
+       }
+
+       return (B_TRUE);
+}
+#endif
+
 void
 dsl_pool_sync(dsl_pool_t *dp, uint64_t txg)
 {
@@ -589,6 +617,23 @@ dsl_pool_sync(dsl_pool_t *dp, uint64_t txg)
 
        tx = dmu_tx_create_assigned(dp, txg);
 
+       /*
+        * Run all early sync tasks before writing out any dirty blocks.
+        * For more info on early sync tasks see block comment in
+        * dsl_early_sync_task().
+        */
+       if (!txg_list_empty(&dp->dp_early_sync_tasks, txg)) {
+               dsl_sync_task_t *dst;
+
+               ASSERT3U(spa_sync_pass(dp->dp_spa), ==, 1);
+               while ((dst =
+                   txg_list_remove(&dp->dp_early_sync_tasks, txg)) != NULL) {
+                       ASSERT(dsl_early_sync_task_verify(dp, txg));
+                       dsl_sync_task_sync(dst, tx);
+               }
+               ASSERT(dsl_early_sync_task_verify(dp, txg));
+       }
+
        /*
         * Write out all dirty blocks of dirty datasets.
         */
@@ -744,22 +789,66 @@ dsl_pool_sync_context(dsl_pool_t *dp)
            taskq_member(dp->dp_sync_taskq, curthread));
 }
 
+/*
+ * This function returns the amount of allocatable space in the pool
+ * minus whatever space is currently reserved by ZFS for specific
+ * purposes. Specifically:
+ *
+ * 1] Any reserved SLOP space
+ * 2] Any space used by the checkpoint
+ * 3] Any space used for deferred frees
+ *
+ * The latter two are especially important because they are needed to
+ * rectify the SPA's and DMU's different understanding of how much space
+ * is used. This way the DMU is aware of the extra space tracked by the
+ * SPA without having to maintain a separate special dir (e.g. similar
+ * to $MOS, $FREEING, and $LEAKED).
+ *
+ * Note: By deferred frees here, we mean the frees that were deferred
+ * in spa_sync() after sync pass 1 (spa_deferred_bpobj), and not the
+ * segments placed in ms_defer trees during metaslab_sync_done().
+ */
 uint64_t
-dsl_pool_adjustedsize(dsl_pool_t *dp, boolean_t netfree)
+dsl_pool_adjustedsize(dsl_pool_t *dp, zfs_space_check_t slop_policy)
 {
-       uint64_t space, resv;
-
-       /*
-        * If we're trying to assess whether it's OK to do a free,
-        * cut the reservation in half to allow forward progress
-        * (e.g. make it possible to rm(1) files from a full pool).
-        */
-       space = spa_get_dspace(dp->dp_spa);
-       resv = spa_get_slop_space(dp->dp_spa);
-       if (netfree)
+       spa_t *spa = dp->dp_spa;
+       uint64_t space, resv, adjustedsize;
+       uint64_t spa_deferred_frees =
+           spa->spa_deferred_bpobj.bpo_phys->bpo_bytes;
+
+       space = spa_get_dspace(spa)
+           - spa_get_checkpoint_space(spa) - spa_deferred_frees;
+       resv = spa_get_slop_space(spa);
+
+       switch (slop_policy) {
+       case ZFS_SPACE_CHECK_NORMAL:
+               break;
+       case ZFS_SPACE_CHECK_RESERVED:
                resv >>= 1;
+               break;
+       case ZFS_SPACE_CHECK_EXTRA_RESERVED:
+               resv >>= 2;
+               break;
+       case ZFS_SPACE_CHECK_NONE:
+               resv = 0;
+               break;
+       default:
+               panic("invalid slop policy value: %d", slop_policy);
+               break;
+       }
+       adjustedsize = (space >= resv) ? (space - resv) : 0;
 
-       return (space - resv);
+       return (adjustedsize);
+}
+
+uint64_t
+dsl_pool_unreserved_space(dsl_pool_t *dp, zfs_space_check_t slop_policy)
+{
+       uint64_t poolsize = dsl_pool_adjustedsize(dp, slop_policy);
+       uint64_t deferred =
+           metaslab_class_get_deferred(spa_normal_class(dp->dp_spa));
+       uint64_t quota = (poolsize >= deferred) ? (poolsize - deferred) : 0;
+       return (quota);
 }
 
 boolean_t
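To make the slop-policy handling in dsl_pool_adjustedsize() above concrete,
here is a worked example with made-up numbers: suppose space (dspace minus
checkpoint space and deferred frees) is 100 GiB and the slop reservation is
3.2 GiB. The adjusted size is then 96.8 GiB under ZFS_SPACE_CHECK_NORMAL,
98.4 GiB under ZFS_SPACE_CHECK_RESERVED (slop halved to 1.6 GiB), 99.2 GiB
under ZFS_SPACE_CHECK_EXTRA_RESERVED (slop quartered to 0.8 GiB), and the full
100 GiB under ZFS_SPACE_CHECK_NONE.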
index 2c349474618907b1986d470281d711cc51d956ab..986dccdea738083e890491d677bf850637faf8db 100644 (file)
@@ -733,7 +733,7 @@ dsl_scan(dsl_pool_t *dp, pool_scan_func_t func)
        }
 
        return (dsl_sync_task(spa_name(spa), dsl_scan_setup_check,
-           dsl_scan_setup_sync, &func, 0, ZFS_SPACE_CHECK_NONE));
+           dsl_scan_setup_sync, &func, 0, ZFS_SPACE_CHECK_EXTRA_RESERVED));
 }
 
 /* ARGSUSED */
@@ -810,13 +810,23 @@ dsl_scan_done(dsl_scan_t *scn, boolean_t complete, dmu_tx_t *tx)
                 * If the scrub/resilver completed, update all DTLs to
                 * reflect this.  Whether it succeeded or not, vacate
                 * all temporary scrub DTLs.
+                *
+                * As the scrub does not currently support traversing
+                * data that have been freed but are part of a checkpoint,
+                * we don't mark the scrub as done in the DTLs as faults
+                * may still exist in those vdevs.
                 */
-               vdev_dtl_reassess(spa->spa_root_vdev, tx->tx_txg,
-                   complete ? scn->scn_phys.scn_max_txg : 0, B_TRUE);
-               if (complete) {
+               if (complete &&
+                   !spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) {
+                       vdev_dtl_reassess(spa->spa_root_vdev, tx->tx_txg,
+                           scn->scn_phys.scn_max_txg, B_TRUE);
+
                        spa_event_notify(spa, NULL, NULL,
                            scn->scn_phys.scn_min_txg ?
                            ESC_ZFS_RESILVER_FINISH : ESC_ZFS_SCRUB_FINISH);
+               } else {
+                       vdev_dtl_reassess(spa->spa_root_vdev, tx->tx_txg,
+                           0, B_TRUE);
                }
                spa_errlog_rotate(spa);
 
@@ -1217,7 +1227,7 @@ dsl_scan_zil_block(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg)
         * (on-disk) even if it hasn't been claimed (even though for
         * scrub there's nothing to do to it).
         */
-       if (claim_txg == 0 && bp->blk_birth >= spa_first_txg(dp->dp_spa))
+       if (claim_txg == 0 && bp->blk_birth >= spa_min_claim_txg(dp->dp_spa))
                return (0);
 
        SET_BOOKMARK(&zb, zh->zh_log.blk_cksum.zc_word[ZIL_ZC_OBJSET],
@@ -1268,11 +1278,13 @@ dsl_scan_zil(dsl_pool_t *dp, zil_header_t *zh)
        zil_scan_arg_t zsa = { dp, zh };
        zilog_t *zilog;
 
+       ASSERT(spa_writeable(dp->dp_spa));
+
        /*
         * We only want to visit blocks that have been claimed but not yet
         * replayed (or, in read-only mode, blocks that *would* be claimed).
         */
-       if (claim_txg == 0 && spa_writeable(dp->dp_spa))
+       if (claim_txg == 0)
                return;
 
        zilog = zil_alloc(dp->dp_meta_objset, zh);
@@ -3004,79 +3016,16 @@ dsl_scan_need_resilver(spa_t *spa, const dva_t *dva, size_t psize,
        return (B_TRUE);
 }
 
-/*
- * This is the primary entry point for scans that is called from syncing
- * context. Scans must happen entirely during syncing context so that we
- * cna guarantee that blocks we are currently scanning will not change out
- * from under us. While a scan is active, this function controls how quickly
- * transaction groups proceed, instead of the normal handling provided by
- * txg_sync_thread().
- */
-void
-dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx)
+static int
+dsl_process_async_destroys(dsl_pool_t *dp, dmu_tx_t *tx)
 {
-       int err = 0;
        dsl_scan_t *scn = dp->dp_scan;
        spa_t *spa = dp->dp_spa;
-       state_sync_type_t sync_type = SYNC_OPTIONAL;
-
-       /*
-        * Check for scn_restart_txg before checking spa_load_state, so
-        * that we can restart an old-style scan while the pool is being
-        * imported (see dsl_scan_init).
-        */
-       if (dsl_scan_restarting(scn, tx)) {
-               pool_scan_func_t func = POOL_SCAN_SCRUB;
-               dsl_scan_done(scn, B_FALSE, tx);
-               if (vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL))
-                       func = POOL_SCAN_RESILVER;
-               zfs_dbgmsg("restarting scan func=%u txg=%llu",
-                   func, (longlong_t)tx->tx_txg);
-               dsl_scan_setup_sync(&func, tx);
-       }
-
-       /*
-        * Only process scans in sync pass 1.
-        */
-       if (spa_sync_pass(spa) > 1)
-               return;
-
-       /*
-        * If the spa is shutting down, then stop scanning. This will
-        * ensure that the scan does not dirty any new data during the
-        * shutdown phase.
-        */
-       if (spa_shutting_down(spa))
-               return;
-
-       /*
-        * If the scan is inactive due to a stalled async destroy, try again.
-        */
-       if (!scn->scn_async_stalled && !dsl_scan_active(scn))
-               return;
+       int err = 0;
 
-       /* reset scan statistics */
-       scn->scn_visited_this_txg = 0;
-       scn->scn_holes_this_txg = 0;
-       scn->scn_lt_min_this_txg = 0;
-       scn->scn_gt_max_this_txg = 0;
-       scn->scn_ddt_contained_this_txg = 0;
-       scn->scn_objsets_visited_this_txg = 0;
-       scn->scn_avg_seg_size_this_txg = 0;
-       scn->scn_segs_this_txg = 0;
-       scn->scn_avg_zio_size_this_txg = 0;
-       scn->scn_zios_this_txg = 0;
-       scn->scn_suspending = B_FALSE;
-       scn->scn_sync_start_time = gethrtime();
-       spa->spa_scrub_active = B_TRUE;
+       if (spa_suspend_async_destroy(spa))
+               return (0);
 
-       /*
-        * First process the async destroys.  If we suspend, don't do
-        * any scrubbing or resilvering.  This ensures that there are no
-        * async destroys while we are scanning, so the scan code doesn't
-        * have to worry about traversing it.  It is also faster to free the
-        * blocks than to scrub them.
-        */
        if (zfs_free_bpobj_enabled &&
            spa_version(spa) >= SPA_VERSION_DEADLISTS) {
                scn->scn_is_bptree = B_FALSE;
@@ -3152,7 +3101,7 @@ dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx)
                ddt_sync(spa, tx->tx_txg);
        }
        if (err != 0)
-               return;
+               return (err);
        if (dp->dp_free_dir != NULL && !scn->scn_async_destroying &&
            zfs_free_leak_on_eio &&
            (dsl_dir_phys(dp->dp_free_dir)->dd_used_bytes != 0 ||
@@ -3205,6 +3154,85 @@ dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx)
                if (bpobj_is_empty(&dp->dp_obsolete_bpobj))
                        dsl_pool_destroy_obsolete_bpobj(dp, tx);
        }
+       return (0);
+}
+
+/*
+ * This is the primary entry point for scans that is called from syncing
+ * context. Scans must happen entirely during syncing context so that we
+ * can guarantee that blocks we are currently scanning will not change out
+ * from under us. While a scan is active, this function controls how quickly
+ * transaction groups proceed, instead of the normal handling provided by
+ * txg_sync_thread().
+ */
+void
+dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx)
+{
+       int err = 0;
+       dsl_scan_t *scn = dp->dp_scan;
+       spa_t *spa = dp->dp_spa;
+       state_sync_type_t sync_type = SYNC_OPTIONAL;
+
+       /*
+        * Check for scn_restart_txg before checking spa_load_state, so
+        * that we can restart an old-style scan while the pool is being
+        * imported (see dsl_scan_init).
+        */
+       if (dsl_scan_restarting(scn, tx)) {
+               pool_scan_func_t func = POOL_SCAN_SCRUB;
+               dsl_scan_done(scn, B_FALSE, tx);
+               if (vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL))
+                       func = POOL_SCAN_RESILVER;
+               zfs_dbgmsg("restarting scan func=%u txg=%llu",
+                   func, (longlong_t)tx->tx_txg);
+               dsl_scan_setup_sync(&func, tx);
+       }
+
+       /*
+        * Only process scans in sync pass 1.
+        */
+       if (spa_sync_pass(spa) > 1)
+               return;
+
+       /*
+        * If the spa is shutting down, then stop scanning. This will
+        * ensure that the scan does not dirty any new data during the
+        * shutdown phase.
+        */
+       if (spa_shutting_down(spa))
+               return;
+
+       /*
+        * If the scan is inactive due to a stalled async destroy, try again.
+        */
+       if (!scn->scn_async_stalled && !dsl_scan_active(scn))
+               return;
+
+       /* reset scan statistics */
+       scn->scn_visited_this_txg = 0;
+       scn->scn_holes_this_txg = 0;
+       scn->scn_lt_min_this_txg = 0;
+       scn->scn_gt_max_this_txg = 0;
+       scn->scn_ddt_contained_this_txg = 0;
+       scn->scn_objsets_visited_this_txg = 0;
+       scn->scn_avg_seg_size_this_txg = 0;
+       scn->scn_segs_this_txg = 0;
+       scn->scn_avg_zio_size_this_txg = 0;
+       scn->scn_zios_this_txg = 0;
+       scn->scn_suspending = B_FALSE;
+       scn->scn_sync_start_time = gethrtime();
+       spa->spa_scrub_active = B_TRUE;
+
+       /*
+        * First process the async destroys.  If we suspend, don't do
+        * any scrubbing or resilvering.  This ensures that there are no
+        * async destroys while we are scanning, so the scan code doesn't
+        * have to worry about traversing it.  It is also faster to free the
+        * blocks than to scrub them.
+        */
+       err = dsl_process_async_destroys(dp, tx);
+       if (err != 0)
+               return;
 
        if (!dsl_scan_is_running(scn) || dsl_scan_is_paused_scrub(scn))
                return;
index d8eb10d37685bf14e4f4d81415ed61997e95b9b0..b63ce5cad90c0cd6a16362992f95c8c8a085209b 100644 (file)
@@ -20,7 +20,7 @@
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2017 by Delphix. All rights reserved.
  */
 
 #include <sys/dmu.h>
@@ -39,33 +39,10 @@ dsl_null_checkfunc(void *arg, dmu_tx_t *tx)
        return (0);
 }
 
-/*
- * Called from open context to perform a callback in syncing context.  Waits
- * for the operation to complete.
- *
- * The checkfunc will be called from open context as a preliminary check
- * which can quickly fail.  If it succeeds, it will be called again from
- * syncing context.  The checkfunc should generally be designed to work
- * properly in either context, but if necessary it can check
- * dmu_tx_is_syncing(tx).
- *
- * The synctask infrastructure enforces proper locking strategy with respect
- * to the dp_config_rwlock -- the lock will always be held when the callbacks
- * are called.  It will be held for read during the open-context (preliminary)
- * call to the checkfunc, and then held for write from syncing context during
- * the calls to the check and sync funcs.
- *
- * A dataset or pool name can be passed as the first argument.  Typically,
- * the check func will hold, check the return value of the hold, and then
- * release the dataset.  The sync func will VERIFYO(hold()) the dataset.
- * This is safe because no changes can be made between the check and sync funcs,
- * and the sync func will only be called if the check func successfully opened
- * the dataset.
- */
-int
-dsl_sync_task(const char *pool, dsl_checkfunc_t *checkfunc,
+static int
+dsl_sync_task_common(const char *pool, dsl_checkfunc_t *checkfunc,
     dsl_syncfunc_t *syncfunc, void *arg,
-    int blocks_modified, zfs_space_check_t space_check)
+    int blocks_modified, zfs_space_check_t space_check, boolean_t early)
 {
        spa_t *spa;
        dmu_tx_t *tx;
@@ -102,7 +79,9 @@ top:
                return (err);
        }
 
-       VERIFY(txg_list_add_tail(&dp->dp_sync_tasks, &dst, dst.dst_txg));
+       txg_list_t *task_list = (early) ?
+           &dp->dp_early_sync_tasks : &dp->dp_sync_tasks;
+       VERIFY(txg_list_add_tail(task_list, &dst, dst.dst_txg));
 
        dmu_tx_commit(tx);
 
@@ -117,9 +96,64 @@ top:
        return (dst.dst_error);
 }
 
-void
-dsl_sync_task_nowait(dsl_pool_t *dp, dsl_syncfunc_t *syncfunc, void *arg,
-    int blocks_modified, zfs_space_check_t space_check, dmu_tx_t *tx)
+/*
+ * Called from open context to perform a callback in syncing context.  Waits
+ * for the operation to complete.
+ *
+ * The checkfunc will be called from open context as a preliminary check
+ * which can quickly fail.  If it succeeds, it will be called again from
+ * syncing context.  The checkfunc should generally be designed to work
+ * properly in either context, but if necessary it can check
+ * dmu_tx_is_syncing(tx).
+ *
+ * The synctask infrastructure enforces proper locking strategy with respect
+ * to the dp_config_rwlock -- the lock will always be held when the callbacks
+ * are called.  It will be held for read during the open-context (preliminary)
+ * call to the checkfunc, and then held for write from syncing context during
+ * the calls to the check and sync funcs.
+ *
+ * A dataset or pool name can be passed as the first argument.  Typically,
+ * the check func will hold, check the return value of the hold, and then
+ * release the dataset.  The sync func will VERIFY0(hold()) the dataset.
+ * This is safe because no changes can be made between the check and sync funcs,
+ * and the sync func will only be called if the check func successfully opened
+ * the dataset.
+ */
+int
+dsl_sync_task(const char *pool, dsl_checkfunc_t *checkfunc,
+    dsl_syncfunc_t *syncfunc, void *arg,
+    int blocks_modified, zfs_space_check_t space_check)
+{
+       return (dsl_sync_task_common(pool, checkfunc, syncfunc, arg,
+           blocks_modified, space_check, B_FALSE));
+}
+
+/*
+ * An early synctask works exactly like a standard synctask, with one important
+ * difference in the way it is handled during syncing context. Standard
+ * synctasks run after we've written out all the dirty blocks of dirty
+ * datasets. Early synctasks are executed before writing out any dirty data,
+ * and thus before standard synctasks.
+ *
+ * For that reason, early synctasks can affect the process of writing dirty
+ * changes to disk for the txg in which they run and should be used with
+ * caution.
+ * In addition, early synctasks should not dirty any metaslabs, as this would
+ * invalidate the precondition/invariant for subsequent early synctasks.
+ * [see dsl_pool_sync() and dsl_early_sync_task_verify()]
+ */
+int
+dsl_early_sync_task(const char *pool, dsl_checkfunc_t *checkfunc,
+    dsl_syncfunc_t *syncfunc, void *arg,
+    int blocks_modified, zfs_space_check_t space_check)
+{
+       return (dsl_sync_task_common(pool, checkfunc, syncfunc, arg,
+           blocks_modified, space_check, B_TRUE));
+}
+
+static void
+dsl_sync_task_nowait_common(dsl_pool_t *dp, dsl_syncfunc_t *syncfunc, void *arg,
+    int blocks_modified, zfs_space_check_t space_check, dmu_tx_t *tx,
+    boolean_t early)
 {
        dsl_sync_task_t *dst = kmem_zalloc(sizeof (*dst), KM_SLEEP);
 
@@ -133,7 +167,25 @@ dsl_sync_task_nowait(dsl_pool_t *dp, dsl_syncfunc_t *syncfunc, void *arg,
        dst->dst_error = 0;
        dst->dst_nowaiter = B_TRUE;
 
-       VERIFY(txg_list_add_tail(&dp->dp_sync_tasks, dst, dst->dst_txg));
+       txg_list_t *task_list = (early) ?
+           &dp->dp_early_sync_tasks : &dp->dp_sync_tasks;
+       VERIFY(txg_list_add_tail(task_list, dst, dst->dst_txg));
+}
+
+void
+dsl_sync_task_nowait(dsl_pool_t *dp, dsl_syncfunc_t *syncfunc, void *arg,
+    int blocks_modified, zfs_space_check_t space_check, dmu_tx_t *tx)
+{
+       dsl_sync_task_nowait_common(dp, syncfunc, arg,
+           blocks_modified, space_check, tx, B_FALSE);
+}
+
+void
+dsl_early_sync_task_nowait(dsl_pool_t *dp, dsl_syncfunc_t *syncfunc, void *arg,
+    int blocks_modified, zfs_space_check_t space_check, dmu_tx_t *tx)
+{
+       dsl_sync_task_nowait_common(dp, syncfunc, arg,
+           blocks_modified, space_check, tx, B_TRUE);
 }
 
 /*
@@ -160,12 +212,12 @@ dsl_sync_task_sync(dsl_sync_task_t *dst, dmu_tx_t *tx)
         * (arc_tempreserve, dsl_pool_tempreserve).
         */
        if (dst->dst_space_check != ZFS_SPACE_CHECK_NONE) {
-               uint64_t quota = dsl_pool_adjustedsize(dp,
-                   dst->dst_space_check == ZFS_SPACE_CHECK_RESERVED) -
-                   metaslab_class_get_deferred(spa_normal_class(dp->dp_spa));
+               uint64_t quota = dsl_pool_unreserved_space(dp,
+                   dst->dst_space_check);
                uint64_t used = dsl_dir_phys(dp->dp_root_dir)->dd_used_bytes;
+
                /* MOS space is triple-dittoed, so we multiply by 3. */
-               if (dst->dst_space > 0 && used + dst->dst_space * 3 > quota) {
+               if (used + dst->dst_space * 3 > quota) {
                        dst->dst_error = SET_ERROR(ENOSPC);
                        if (dst->dst_nowaiter)
                                kmem_free(dst, sizeof (*dst));
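To illustrate how a consumer might use the new early-synctask entry points
above, here is a minimal in-kernel caller sketch; the task names and bodies are
hypothetical and the usual DSL headers are assumed:

    #include <sys/dmu.h>
    #include <sys/dsl_pool.h>
    #include <sys/dsl_synctask.h>

    /* Open-context precheck; called again from syncing context on success. */
    static int
    my_task_check(void *arg, dmu_tx_t *tx)
    {
            return (0);
    }

    /* Runs in syncing context before any dirty blocks are written. */
    static void
    my_task_sync(void *arg, dmu_tx_t *tx)
    {
            /* must not dirty any metaslabs (see the comment above) */
    }

    static int
    my_task(const char *pool)
    {
            return (dsl_early_sync_task(pool, my_task_check, my_task_sync,
                NULL, 0, ZFS_SPACE_CHECK_EXTRA_RESERVED));
    }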
index b5a684f0bc6231cf5cc9aefa5c4618348f4938a9..c80b35d488056f0fba68a909976084c4b04d7778 100644 (file)
@@ -20,7 +20,7 @@
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2017 by Delphix. All rights reserved.
  * Copyright (c) 2013 Steven Hartland. All rights reserved.
  */
 
@@ -604,7 +604,8 @@ dsl_dataset_user_release_impl(nvlist_t *holds, nvlist_t *errlist,
            KM_SLEEP));
 
        error = dsl_sync_task(pool, dsl_dataset_user_release_check,
-           dsl_dataset_user_release_sync, &ddura, 0, ZFS_SPACE_CHECK_NONE);
+           dsl_dataset_user_release_sync, &ddura, 0,
+           ZFS_SPACE_CHECK_EXTRA_RESERVED);
        fnvlist_free(ddura.ddura_todelete);
        fnvlist_free(ddura.ddura_chkholds);
 
index c11e459e0f0bfd9290359ec9ac61270ece95cef0..76fa99e8b1f664f592dccb6687ee9ab50d9e4c32 100644 (file)
@@ -34,6 +34,7 @@
 #include <sys/spa_impl.h>
 #include <sys/zfeature.h>
 #include <sys/vdev_indirect_mapping.h>
+#include <sys/zap.h>
 
 #define        WITH_DF_BLOCK_ALLOCATOR
 
@@ -53,6 +54,14 @@ unsigned long metaslab_aliquot = 512 << 10;
  */
 unsigned long metaslab_force_ganging = SPA_MAXBLOCKSIZE + 1;
 
+/*
+ * Since we can touch multiple metaslabs (and their respective space maps)
+ * with each transaction group, we benefit from having a smaller space map
+ * block size, as it allows us to issue more I/O operations scattered
+ * around the disk.
+ */
+int zfs_metaslab_sm_blksz = (1 << 12);
+
 /*
  * The in-core space map representation is more compact than its on-disk form.
  * The zfs_condense_pct determines how much more compact the in-core
@@ -211,7 +220,7 @@ uint64_t metaslab_trace_max_entries = 5000;
 
 static uint64_t metaslab_weight(metaslab_t *);
 static void metaslab_set_fragmentation(metaslab_t *);
-static void metaslab_free_impl(vdev_t *, uint64_t, uint64_t, uint64_t);
+static void metaslab_free_impl(vdev_t *, uint64_t, uint64_t, boolean_t);
 static void metaslab_check_free_impl(vdev_t *, uint64_t, uint64_t);
 
 #ifdef _METASLAB_TRACING
@@ -484,11 +493,11 @@ metaslab_verify_space(metaslab_t *msp, uint64_t txg)
         */
        for (int t = 0; t < TXG_CONCURRENT_STATES; t++) {
                allocated +=
-                   range_tree_space(msp->ms_alloctree[(txg + t) & TXG_MASK]);
+                   range_tree_space(msp->ms_allocating[(txg + t) & TXG_MASK]);
        }
 
-       msp_free_space = range_tree_space(msp->ms_tree) + allocated +
-           msp->ms_deferspace + range_tree_space(msp->ms_freedtree);
+       msp_free_space = range_tree_space(msp->ms_allocatable) + allocated +
+           msp->ms_deferspace + range_tree_space(msp->ms_freed);
 
        VERIFY3U(sm_free_space, ==, msp_free_space);
 }
@@ -1021,7 +1030,7 @@ metaslab_rangesize_compare(const void *x1, const void *x2)
 uint64_t
 metaslab_block_maxsize(metaslab_t *msp)
 {
-       avl_tree_t *t = &msp->ms_size_tree;
+       avl_tree_t *t = &msp->ms_allocatable_by_size;
        range_seg_t *rs;
 
        if (t == NULL || (rs = avl_last(t)) == NULL)
@@ -1101,7 +1110,7 @@ metaslab_ff_alloc(metaslab_t *msp, uint64_t size)
         */
        uint64_t align = size & -size;
        uint64_t *cursor = &msp->ms_lbas[highbit64(align) - 1];
-       avl_tree_t *t = &msp->ms_tree->rt_root;
+       avl_tree_t *t = &msp->ms_allocatable->rt_root;
 
        return (metaslab_block_picker(t, cursor, size, align));
 }
@@ -1134,13 +1143,14 @@ metaslab_df_alloc(metaslab_t *msp, uint64_t size)
         */
        uint64_t align = size & -size;
        uint64_t *cursor = &msp->ms_lbas[highbit64(align) - 1];
-       range_tree_t *rt = msp->ms_tree;
+       range_tree_t *rt = msp->ms_allocatable;
        avl_tree_t *t = &rt->rt_root;
        uint64_t max_size = metaslab_block_maxsize(msp);
        int free_pct = range_tree_space(rt) * 100 / msp->ms_size;
 
        ASSERT(MUTEX_HELD(&msp->ms_lock));
-       ASSERT3U(avl_numnodes(t), ==, avl_numnodes(&msp->ms_size_tree));
+       ASSERT3U(avl_numnodes(t), ==,
+           avl_numnodes(&msp->ms_allocatable_by_size));
 
        if (max_size < size)
                return (-1ULL);
@@ -1151,7 +1161,7 @@ metaslab_df_alloc(metaslab_t *msp, uint64_t size)
         */
        if (max_size < metaslab_df_alloc_threshold ||
            free_pct < metaslab_df_free_pct) {
-               t = &msp->ms_size_tree;
+               t = &msp->ms_allocatable_by_size;
                *cursor = 0;
        }
 
@@ -1178,8 +1188,8 @@ metaslab_ops_t *zfs_metaslab_ops = &metaslab_df_ops;
 static uint64_t
 metaslab_cf_alloc(metaslab_t *msp, uint64_t size)
 {
-       range_tree_t *rt = msp->ms_tree;
-       avl_tree_t *t = &msp->ms_size_tree;
+       range_tree_t *rt = msp->ms_allocatable;
+       avl_tree_t *t = &msp->ms_allocatable_by_size;
        uint64_t *cursor = &msp->ms_lbas[0];
        uint64_t *cursor_end = &msp->ms_lbas[1];
        uint64_t offset = 0;
@@ -1192,7 +1202,7 @@ metaslab_cf_alloc(metaslab_t *msp, uint64_t size)
        if ((*cursor + size) > *cursor_end) {
                range_seg_t *rs;
 
-               rs = avl_last(&msp->ms_size_tree);
+               rs = avl_last(&msp->ms_allocatable_by_size);
                if (rs == NULL || (rs->rs_end - rs->rs_start) < size)
                        return (-1ULL);
 
@@ -1232,7 +1242,7 @@ uint64_t metaslab_ndf_clump_shift = 4;
 static uint64_t
 metaslab_ndf_alloc(metaslab_t *msp, uint64_t size)
 {
-       avl_tree_t *t = &msp->ms_tree->rt_root;
+       avl_tree_t *t = &msp->ms_allocatable->rt_root;
        avl_index_t where;
        range_seg_t *rs, rsearch;
        uint64_t hbit = highbit64(size);
@@ -1240,7 +1250,8 @@ metaslab_ndf_alloc(metaslab_t *msp, uint64_t size)
        uint64_t max_size = metaslab_block_maxsize(msp);
 
        ASSERT(MUTEX_HELD(&msp->ms_lock));
-       ASSERT3U(avl_numnodes(t), ==, avl_numnodes(&msp->ms_size_tree));
+       ASSERT3U(avl_numnodes(t), ==,
+           avl_numnodes(&msp->ms_allocatable_by_size));
 
        if (max_size < size)
                return (-1ULL);
@@ -1250,7 +1261,7 @@ metaslab_ndf_alloc(metaslab_t *msp, uint64_t size)
 
        rs = avl_find(t, &rsearch, &where);
        if (rs == NULL || (rs->rs_end - rs->rs_start) < size) {
-               t = &msp->ms_size_tree;
+               t = &msp->ms_allocatable_by_size;
 
                rsearch.rs_start = 0;
                rsearch.rs_end = MIN(max_size,
@@ -1316,13 +1327,15 @@ metaslab_load(metaslab_t *msp)
 
        /*
         * If the space map has not been allocated yet, then treat
-        * all the space in the metaslab as free and add it to the
-        * ms_tree.
+        * all the space in the metaslab as free and add it to ms_allocatable.
         */
-       if (msp->ms_sm != NULL)
-               error = space_map_load(msp->ms_sm, msp->ms_tree, SM_FREE);
-       else
-               range_tree_add(msp->ms_tree, msp->ms_start, msp->ms_size);
+       if (msp->ms_sm != NULL) {
+               error = space_map_load(msp->ms_sm, msp->ms_allocatable,
+                   SM_FREE);
+       } else {
+               range_tree_add(msp->ms_allocatable,
+                   msp->ms_start, msp->ms_size);
+       }
 
        success = (error == 0);
 
@@ -1333,9 +1346,16 @@ metaslab_load(metaslab_t *msp)
                ASSERT3P(msp->ms_group, !=, NULL);
                msp->ms_loaded = B_TRUE;
 
-               for (int t = 0; t < TXG_DEFER_SIZE; t++) {
-                       range_tree_walk(msp->ms_defertree[t],
-                           range_tree_remove, msp->ms_tree);
+               /*
+                * If the metaslab already has a spacemap, then we need to
+                * remove all segments from the defer tree; otherwise, the
+                * metaslab is completely empty and we can skip this.
+                */
+               if (msp->ms_sm != NULL) {
+                       for (int t = 0; t < TXG_DEFER_SIZE; t++) {
+                               range_tree_walk(msp->ms_defer[t],
+                                   range_tree_remove, msp->ms_allocatable);
+                       }
                }
                msp->ms_max_size = metaslab_block_maxsize(msp);
        }
@@ -1347,7 +1367,7 @@ void
 metaslab_unload(metaslab_t *msp)
 {
        ASSERT(MUTEX_HELD(&msp->ms_lock));
-       range_tree_vacate(msp->ms_tree, NULL, NULL);
+       range_tree_vacate(msp->ms_allocatable, NULL, NULL);
        msp->ms_loaded = B_FALSE;
        msp->ms_weight &= ~METASLAB_ACTIVE_MASK;
        msp->ms_max_size = 0;
@@ -1393,8 +1413,8 @@ metaslab_init(metaslab_group_t *mg, uint64_t id, uint64_t object, uint64_t txg,
         * addition of new space; and for debugging, it ensures that we'd
         * data fault on any attempt to use this metaslab before it's ready.
         */
-       ms->ms_tree = range_tree_create_impl(&rt_avl_ops, &ms->ms_size_tree,
-           metaslab_rangesize_compare, 0);
+       ms->ms_allocatable = range_tree_create_impl(&rt_avl_ops,
+           &ms->ms_allocatable_by_size, metaslab_rangesize_compare, 0);
        metaslab_group_add(mg, ms);
 
        metaslab_set_fragmentation(ms);
@@ -1446,20 +1466,21 @@ metaslab_fini(metaslab_t *msp)
        space_map_close(msp->ms_sm);
 
        metaslab_unload(msp);
-       range_tree_destroy(msp->ms_tree);
-       range_tree_destroy(msp->ms_freeingtree);
-       range_tree_destroy(msp->ms_freedtree);
+       range_tree_destroy(msp->ms_allocatable);
+       range_tree_destroy(msp->ms_freeing);
+       range_tree_destroy(msp->ms_freed);
 
        for (int t = 0; t < TXG_SIZE; t++) {
-               range_tree_destroy(msp->ms_alloctree[t]);
+               range_tree_destroy(msp->ms_allocating[t]);
        }
 
        for (int t = 0; t < TXG_DEFER_SIZE; t++) {
-               range_tree_destroy(msp->ms_defertree[t]);
+               range_tree_destroy(msp->ms_defer[t]);
        }
-
        ASSERT0(msp->ms_deferspace);
 
+       range_tree_destroy(msp->ms_checkpointing);
+
        mutex_exit(&msp->ms_lock);
        cv_destroy(&msp->ms_load_cv);
        mutex_destroy(&msp->ms_lock);
@@ -1679,7 +1700,7 @@ metaslab_weight_from_range_tree(metaslab_t *msp)
                int max_idx = SPACE_MAP_HISTOGRAM_SIZE + shift - 1;
 
                segments <<= 1;
-               segments += msp->ms_tree->rt_histogram[i];
+               segments += msp->ms_allocatable->rt_histogram[i];
 
                /*
                 * The range tree provides more precision than the space map
@@ -1895,7 +1916,7 @@ metaslab_passivate(metaslab_t *msp, uint64_t weight)
         */
        ASSERT(!WEIGHT_IS_SPACEBASED(msp->ms_weight) ||
            size >= SPA_MINBLOCKSIZE ||
-           range_tree_space(msp->ms_tree) == 0);
+           range_tree_space(msp->ms_allocatable) == 0);
        ASSERT0(weight & METASLAB_ACTIVE_MASK);
 
        msp->ms_activation_weight = 0;
@@ -2028,18 +2049,37 @@ metaslab_should_condense(metaslab_t *msp)
        range_seg_t *rs;
        uint64_t size, entries, segsz, object_size, optimal_size, record_size;
        dmu_object_info_t doi;
-       uint64_t vdev_blocksize = 1ULL << msp->ms_group->mg_vd->vdev_ashift;
+       vdev_t *vd = msp->ms_group->mg_vd;
+       uint64_t vdev_blocksize = 1 << vd->vdev_ashift;
+       uint64_t current_txg = spa_syncing_txg(vd->vdev_spa);
 
        ASSERT(MUTEX_HELD(&msp->ms_lock));
        ASSERT(msp->ms_loaded);
 
        /*
-        * Use the ms_size_tree range tree, which is ordered by size, to
-        * obtain the largest segment in the free tree. We always condense
-        * metaslabs that are empty and metaslabs for which a condense
-        * request has been made.
+        * Allocations and frees in early passes are generally more space
+        * efficient (in terms of blocks described in space map entries)
+        * than the ones in later passes (e.g. we don't compress after
+        * sync pass 5) and condensing a metaslab multiple times in a txg
+        * could degrade performance.
+        *
+        * Thus we prefer condensing each metaslab at most once every txg at
+        * the earliest sync pass possible. If a metaslab is eligible for
+        * condensing again after being considered for condensing within the
+        * same txg, it will hopefully be dirty in the next txg where it will
+        * be condensed at an earlier pass.
+        */
+       if (msp->ms_condense_checked_txg == current_txg)
+               return (B_FALSE);
+       msp->ms_condense_checked_txg = current_txg;
+
+       /*
+        * Use the ms_allocatable_by_size range tree, which is ordered by
+        * size, to obtain the largest segment in the free tree. We always
+        * condense metaslabs that are empty and metaslabs for which a
+        * condense request has been made.
         */
-       rs = avl_last(&msp->ms_size_tree);
+       rs = avl_last(&msp->ms_allocatable_by_size);
        if (rs == NULL || msp->ms_condense_wanted)
                return (B_TRUE);
 
@@ -2053,7 +2093,8 @@ metaslab_should_condense(metaslab_t *msp)
        entries = size / (MIN(size, SM_RUN_MAX));
        segsz = entries * sizeof (uint64_t);
 
-       optimal_size = sizeof (uint64_t) * avl_numnodes(&msp->ms_tree->rt_root);
+       optimal_size =
+           sizeof (uint64_t) * avl_numnodes(&msp->ms_allocatable->rt_root);
        object_size = space_map_length(msp->ms_sm);
 
        dmu_object_info_from_db(sm->sm_dbuf, &doi);
@@ -2076,7 +2117,6 @@ metaslab_condense(metaslab_t *msp, uint64_t txg, dmu_tx_t *tx)
        space_map_t *sm = msp->ms_sm;
 
        ASSERT(MUTEX_HELD(&msp->ms_lock));
-       ASSERT3U(spa_sync_pass(msp->ms_group->mg_vd->vdev_spa), ==, 1);
        ASSERT(msp->ms_loaded);
 
 
@@ -2084,7 +2124,8 @@ metaslab_condense(metaslab_t *msp, uint64_t txg, dmu_tx_t *tx)
            "spa %s, smp size %llu, segments %lu, forcing condense=%s", txg,
            msp->ms_id, msp, msp->ms_group->mg_vd->vdev_id,
            msp->ms_group->mg_vd->vdev_spa->spa_name,
-           space_map_length(msp->ms_sm), avl_numnodes(&msp->ms_tree->rt_root),
+           space_map_length(msp->ms_sm),
+           avl_numnodes(&msp->ms_allocatable->rt_root),
            msp->ms_condense_wanted ? "TRUE" : "FALSE");
 
        msp->ms_condense_wanted = B_FALSE;
@@ -2099,20 +2140,16 @@ metaslab_condense(metaslab_t *msp, uint64_t txg, dmu_tx_t *tx)
        condense_tree = range_tree_create(NULL, NULL);
        range_tree_add(condense_tree, msp->ms_start, msp->ms_size);
 
-       /*
-        * Remove what's been freed in this txg from the condense_tree.
-        * Since we're in sync_pass 1, we know that all the frees from
-        * this txg are in the freeingtree.
-        */
-       range_tree_walk(msp->ms_freeingtree, range_tree_remove, condense_tree);
+       range_tree_walk(msp->ms_freeing, range_tree_remove, condense_tree);
+       range_tree_walk(msp->ms_freed, range_tree_remove, condense_tree);
 
        for (int t = 0; t < TXG_DEFER_SIZE; t++) {
-               range_tree_walk(msp->ms_defertree[t],
+               range_tree_walk(msp->ms_defer[t],
                    range_tree_remove, condense_tree);
        }
 
        for (int t = 1; t < TXG_CONCURRENT_STATES; t++) {
-               range_tree_walk(msp->ms_alloctree[(txg + t) & TXG_MASK],
+               range_tree_walk(msp->ms_allocating[(txg + t) & TXG_MASK],
                    range_tree_remove, condense_tree);
        }
 
@@ -2122,13 +2159,13 @@ metaslab_condense(metaslab_t *msp, uint64_t txg, dmu_tx_t *tx)
         * metaslab's ms_condensing flag to ensure that
         * allocations on this metaslab do not occur while we're
         * in the middle of committing it to disk. This is only critical
-        * for the ms_tree as all other range trees use per txg
+        * for ms_allocatable as all other range trees use per txg
         * views of their content.
         */
        msp->ms_condensing = B_TRUE;
 
        mutex_exit(&msp->ms_lock);
-       space_map_truncate(sm, tx);
+       space_map_truncate(sm, zfs_metaslab_sm_blksz, tx);
 
        /*
         * While we would ideally like to create a space map representation
@@ -2144,7 +2181,7 @@ metaslab_condense(metaslab_t *msp, uint64_t txg, dmu_tx_t *tx)
        range_tree_vacate(condense_tree, NULL, NULL);
        range_tree_destroy(condense_tree);
 
-       space_map_write(sm, msp->ms_tree, SM_FREE, tx);
+       space_map_write(sm, msp->ms_allocatable, SM_FREE, tx);
        mutex_enter(&msp->ms_lock);
        msp->ms_condensing = B_FALSE;
 }
@@ -2159,7 +2196,7 @@ metaslab_sync(metaslab_t *msp, uint64_t txg)
        vdev_t *vd = mg->mg_vd;
        spa_t *spa = vd->vdev_spa;
        objset_t *mos = spa_meta_objset(spa);
-       range_tree_t *alloctree = msp->ms_alloctree[txg & TXG_MASK];
+       range_tree_t *alloctree = msp->ms_allocating[txg & TXG_MASK];
        dmu_tx_t *tx;
        uint64_t object = space_map_object(msp->ms_sm);
 
@@ -2168,23 +2205,24 @@ metaslab_sync(metaslab_t *msp, uint64_t txg)
        /*
         * This metaslab has just been added so there's no work to do now.
         */
-       if (msp->ms_freeingtree == NULL) {
+       if (msp->ms_freeing == NULL) {
                ASSERT3P(alloctree, ==, NULL);
                return;
        }
 
        ASSERT3P(alloctree, !=, NULL);
-       ASSERT3P(msp->ms_freeingtree, !=, NULL);
-       ASSERT3P(msp->ms_freedtree, !=, NULL);
+       ASSERT3P(msp->ms_freeing, !=, NULL);
+       ASSERT3P(msp->ms_freed, !=, NULL);
+       ASSERT3P(msp->ms_checkpointing, !=, NULL);
 
        /*
-        * Normally, we don't want to process a metaslab if there
-        * are no allocations or frees to perform. However, if the metaslab
-        * is being forced to condense and it's loaded, we need to let it
-        * through.
+        * Normally, we don't want to process a metaslab if there are no
+        * allocations or frees to perform. However, if the metaslab is being
+        * forced to condense and it's loaded, we need to let it through.
         */
-       if (range_tree_space(alloctree) == 0 &&
-           range_tree_space(msp->ms_freeingtree) == 0 &&
+       if (range_tree_is_empty(alloctree) &&
+           range_tree_is_empty(msp->ms_freeing) &&
+           range_tree_is_empty(msp->ms_checkpointing) &&
            !(msp->ms_loaded && msp->ms_condense_wanted))
                return;
 
@@ -2193,10 +2231,10 @@ metaslab_sync(metaslab_t *msp, uint64_t txg)
 
        /*
         * The only state that can actually be changing concurrently with
-        * metaslab_sync() is the metaslab's ms_tree.  No other thread can
-        * be modifying this txg's alloctree, freeingtree, freedtree, or
-        * space_map_phys_t.  We drop ms_lock whenever we could call
-        * into the DMU, because the DMU can call down to us
+        * metaslab_sync() is the metaslab's ms_allocatable.  No other
+        * thread can be modifying this txg's alloc, freeing,
+        * freed, or space_map_phys_t.  We drop ms_lock whenever we
+        * could call into the DMU, because the DMU can call down to us
         * (e.g. via zio_free()) at any time.
         *
         * The spa_vdev_remove_thread() can be reading metaslab state
@@ -2204,13 +2242,12 @@ metaslab_sync(metaslab_t *msp, uint64_t txg)
         * that the ms_lock is insufficient for this, because it is dropped
         * by space_map_write().
         */
-
        tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg);
 
        if (msp->ms_sm == NULL) {
                uint64_t new_object;
 
-               new_object = space_map_alloc(mos, tx);
+               new_object = space_map_alloc(mos, zfs_metaslab_sm_blksz, tx);
                VERIFY3U(new_object, !=, 0);
 
                VERIFY0(space_map_open(&msp->ms_sm, mos, new_object,
@@ -2218,6 +2255,28 @@ metaslab_sync(metaslab_t *msp, uint64_t txg)
                ASSERT(msp->ms_sm != NULL);
        }
 
+       if (!range_tree_is_empty(msp->ms_checkpointing) &&
+           vd->vdev_checkpoint_sm == NULL) {
+               ASSERT(spa_has_checkpoint(spa));
+
+               uint64_t new_object = space_map_alloc(mos,
+                   vdev_standard_sm_blksz, tx);
+               VERIFY3U(new_object, !=, 0);
+
+               VERIFY0(space_map_open(&vd->vdev_checkpoint_sm,
+                   mos, new_object, 0, vd->vdev_asize, vd->vdev_ashift));
+               ASSERT3P(vd->vdev_checkpoint_sm, !=, NULL);
+
+               /*
+                * We save the space map object as an entry in vdev_top_zap
+                * so it can be retrieved when the pool is reopened after an
+                * export or through zdb.
+                */
+               VERIFY0(zap_add(vd->vdev_spa->spa_meta_objset,
+                   vd->vdev_top_zap, VDEV_TOP_ZAP_POOL_CHECKPOINT_SM,
+                   sizeof (new_object), 1, &new_object, tx));
+       }
+
        mutex_enter(&msp->ms_sync_lock);
        mutex_enter(&msp->ms_lock);
 
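For orientation only, here is a hedged sketch (not part of this commit; the helper name is hypothetical) of how the object recorded under VDEV_TOP_ZAP_POOL_CHECKPOINT_SM above could be read back from the vdev's top-level ZAP when the pool is reopened. It assumes the usual ZFS kernel headers (sys/vdev_impl.h, sys/zap.h) and omits the surrounding load-path error handling:

    static int
    lookup_checkpoint_sm_object(vdev_t *vd, uint64_t *objp)
    {
            if (vd->vdev_top_zap == 0) {
                    /* No per-vdev ZAP, hence no checkpoint space map. */
                    *objp = 0;
                    return (0);
            }
            return (zap_lookup(vd->vdev_spa->spa_meta_objset, vd->vdev_top_zap,
                VDEV_TOP_ZAP_POOL_CHECKPOINT_SM, sizeof (uint64_t), 1, objp));
    }
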
@@ -2230,16 +2289,40 @@ metaslab_sync(metaslab_t *msp, uint64_t txg)
        metaslab_class_histogram_verify(mg->mg_class);
        metaslab_group_histogram_remove(mg, msp);
 
-       if (msp->ms_loaded && spa_sync_pass(spa) == 1 &&
-           metaslab_should_condense(msp)) {
+       if (msp->ms_loaded && metaslab_should_condense(msp)) {
                metaslab_condense(msp, txg, tx);
        } else {
                mutex_exit(&msp->ms_lock);
                space_map_write(msp->ms_sm, alloctree, SM_ALLOC, tx);
-               space_map_write(msp->ms_sm, msp->ms_freeingtree, SM_FREE, tx);
+               space_map_write(msp->ms_sm, msp->ms_freeing, SM_FREE, tx);
                mutex_enter(&msp->ms_lock);
        }
 
+       if (!range_tree_is_empty(msp->ms_checkpointing)) {
+               ASSERT(spa_has_checkpoint(spa));
+               ASSERT3P(vd->vdev_checkpoint_sm, !=, NULL);
+
+               /*
+                * Since we are doing writes to disk and the ms_checkpointing
+                * tree won't be changing during that time, we drop the
+                * ms_lock while writing to the checkpoint space map.
+                */
+               mutex_exit(&msp->ms_lock);
+               space_map_write(vd->vdev_checkpoint_sm,
+                   msp->ms_checkpointing, SM_FREE, tx);
+               mutex_enter(&msp->ms_lock);
+               space_map_update(vd->vdev_checkpoint_sm);
+
+               spa->spa_checkpoint_info.sci_dspace +=
+                   range_tree_space(msp->ms_checkpointing);
+               vd->vdev_stat.vs_checkpoint_space +=
+                   range_tree_space(msp->ms_checkpointing);
+               ASSERT3U(vd->vdev_stat.vs_checkpoint_space, ==,
+                   -vd->vdev_checkpoint_sm->sm_alloc);
+
+               range_tree_vacate(msp->ms_checkpointing, NULL, NULL);
+       }
+
        if (msp->ms_loaded) {
                /*
                 * When the space map is loaded, we have an accurate
@@ -2248,7 +2331,7 @@ metaslab_sync(metaslab_t *msp, uint64_t txg)
                 * it first before updating it.
                 */
                space_map_histogram_clear(msp->ms_sm);
-               space_map_histogram_add(msp->ms_sm, msp->ms_tree, tx);
+               space_map_histogram_add(msp->ms_sm, msp->ms_allocatable, tx);
 
                /*
                 * Since we've cleared the histogram we need to add back
@@ -2257,7 +2340,7 @@ metaslab_sync(metaslab_t *msp, uint64_t txg)
                 * to accurately reflect all free space even if some space
                 * is not yet available for allocation (i.e. deferred).
                 */
-               space_map_histogram_add(msp->ms_sm, msp->ms_freedtree, tx);
+               space_map_histogram_add(msp->ms_sm, msp->ms_freed, tx);
 
                /*
                 * Add back any deferred free space that has not been
@@ -2268,7 +2351,7 @@ metaslab_sync(metaslab_t *msp, uint64_t txg)
                 */
                for (int t = 0; t < TXG_DEFER_SIZE; t++) {
                        space_map_histogram_add(msp->ms_sm,
-                           msp->ms_defertree[t], tx);
+                           msp->ms_defer[t], tx);
                }
        }
 
@@ -2279,7 +2362,7 @@ metaslab_sync(metaslab_t *msp, uint64_t txg)
         * then we will lose some accuracy but will correct it the next
         * time we load the space map.
         */
-       space_map_histogram_add(msp->ms_sm, msp->ms_freeingtree, tx);
+       space_map_histogram_add(msp->ms_sm, msp->ms_freeing, tx);
 
        metaslab_group_histogram_add(mg, msp);
        metaslab_group_histogram_verify(mg);
@@ -2287,21 +2370,23 @@ metaslab_sync(metaslab_t *msp, uint64_t txg)
 
        /*
         * For sync pass 1, we avoid traversing this txg's free range tree
-        * and instead will just swap the pointers for freeingtree and
-        * freedtree. We can safely do this since the freed_tree is
+        * and instead will just swap the pointers for freeing and
+        * freed. We can safely do this since the freed_tree is
         * guaranteed to be empty on the initial pass.
         */
        if (spa_sync_pass(spa) == 1) {
-               range_tree_swap(&msp->ms_freeingtree, &msp->ms_freedtree);
+               range_tree_swap(&msp->ms_freeing, &msp->ms_freed);
        } else {
-               range_tree_vacate(msp->ms_freeingtree,
-                   range_tree_add, msp->ms_freedtree);
+               range_tree_vacate(msp->ms_freeing,
+                   range_tree_add, msp->ms_freed);
        }
        range_tree_vacate(alloctree, NULL, NULL);
 
-       ASSERT0(range_tree_space(msp->ms_alloctree[txg & TXG_MASK]));
-       ASSERT0(range_tree_space(msp->ms_alloctree[TXG_CLEAN(txg) & TXG_MASK]));
-       ASSERT0(range_tree_space(msp->ms_freeingtree));
+       ASSERT0(range_tree_space(msp->ms_allocating[txg & TXG_MASK]));
+       ASSERT0(range_tree_space(msp->ms_allocating[TXG_CLEAN(txg)
+           & TXG_MASK]));
+       ASSERT0(range_tree_space(msp->ms_freeing));
+       ASSERT0(range_tree_space(msp->ms_checkpointing));
 
        mutex_exit(&msp->ms_lock);
 
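The checkpoint bookkeeping above keeps three counters in step: the pool-wide spa_checkpoint_info.sci_dspace, each vdev's vs_checkpoint_space, and (per vdev) the negated sm_alloc of the FREE-only checkpoint space map. A hedged editorial sketch of a consistency check over the top-level vdevs (assumes the usual spa/vdev headers; not part of the commit):

    static void
    verify_checkpoint_accounting(spa_t *spa)
    {
            vdev_t *rvd = spa->spa_root_vdev;
            uint64_t total = 0;

            for (uint64_t c = 0; c < rvd->vdev_children; c++) {
                    vdev_t *vd = rvd->vdev_child[c];

                    if (vd->vdev_checkpoint_sm == NULL)
                            continue;       /* no checkpointed frees on this vdev yet */

                    /* FREE-only map, so -sm_alloc is the checkpointed space. */
                    ASSERT3U(vd->vdev_stat.vs_checkpoint_space, ==,
                        -vd->vdev_checkpoint_sm->sm_alloc);
                    total += vd->vdev_stat.vs_checkpoint_space;
            }
            ASSERT3U(total, ==, spa->spa_checkpoint_info.sci_dspace);
    }
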
@@ -2336,29 +2421,34 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg)
         * If this metaslab is just becoming available, initialize its
         * range trees and add its capacity to the vdev.
         */
-       if (msp->ms_freedtree == NULL) {
+       if (msp->ms_freed == NULL) {
                for (int t = 0; t < TXG_SIZE; t++) {
-                       ASSERT(msp->ms_alloctree[t] == NULL);
+                       ASSERT(msp->ms_allocating[t] == NULL);
 
-                       msp->ms_alloctree[t] = range_tree_create(NULL, NULL);
+                       msp->ms_allocating[t] = range_tree_create(NULL, NULL);
                }
 
-               ASSERT3P(msp->ms_freeingtree, ==, NULL);
-               msp->ms_freeingtree = range_tree_create(NULL, NULL);
+               ASSERT3P(msp->ms_freeing, ==, NULL);
+               msp->ms_freeing = range_tree_create(NULL, NULL);
 
-               ASSERT3P(msp->ms_freedtree, ==, NULL);
-               msp->ms_freedtree = range_tree_create(NULL, NULL);
+               ASSERT3P(msp->ms_freed, ==, NULL);
+               msp->ms_freed = range_tree_create(NULL, NULL);
 
                for (int t = 0; t < TXG_DEFER_SIZE; t++) {
-                       ASSERT(msp->ms_defertree[t] == NULL);
+                       ASSERT(msp->ms_defer[t] == NULL);
 
-                       msp->ms_defertree[t] = range_tree_create(NULL, NULL);
+                       msp->ms_defer[t] = range_tree_create(NULL, NULL);
                }
 
+               ASSERT3P(msp->ms_checkpointing, ==, NULL);
+               msp->ms_checkpointing = range_tree_create(NULL, NULL);
+
                vdev_space_update(vd, 0, 0, msp->ms_size);
        }
+       ASSERT0(range_tree_space(msp->ms_freeing));
+       ASSERT0(range_tree_space(msp->ms_checkpointing));
 
-       defer_tree = &msp->ms_defertree[txg % TXG_DEFER_SIZE];
+       defer_tree = &msp->ms_defer[txg % TXG_DEFER_SIZE];
 
        uint64_t free_space = metaslab_class_get_space(spa_normal_class(spa)) -
            metaslab_class_get_alloc(spa_normal_class(spa));
@@ -2369,7 +2459,7 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg)
        defer_delta = 0;
        alloc_delta = space_map_alloc_delta(msp->ms_sm);
        if (defer_allowed) {
-               defer_delta = range_tree_space(msp->ms_freedtree) -
+               defer_delta = range_tree_space(msp->ms_freed) -
                    range_tree_space(*defer_tree);
        } else {
                defer_delta -= range_tree_space(*defer_tree);
@@ -2385,19 +2475,19 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg)
 
        /*
         * Move the frees from the defer_tree back to the free
-        * range tree (if it's loaded). Swap the freed_tree and the
-        * defer_tree -- this is safe to do because we've just emptied out
-        * the defer_tree.
+        * range tree (if it's loaded). Swap the freed_tree and
+        * the defer_tree -- this is safe to do because we've
+        * just emptied out the defer_tree.
         */
        range_tree_vacate(*defer_tree,
-           msp->ms_loaded ? range_tree_add : NULL, msp->ms_tree);
+           msp->ms_loaded ? range_tree_add : NULL, msp->ms_allocatable);
        if (defer_allowed) {
-               range_tree_swap(&msp->ms_freedtree, defer_tree);
+               range_tree_swap(&msp->ms_freed, defer_tree);
        } else {
-               range_tree_vacate(msp->ms_freedtree,
-                   msp->ms_loaded ? range_tree_add : NULL, msp->ms_tree);
+               range_tree_vacate(msp->ms_freed,
+                   msp->ms_loaded ? range_tree_add : NULL,
+                   msp->ms_allocatable);
        }
-
        space_map_update(msp->ms_sm);
 
        msp->ms_deferspace += defer_delta;
@@ -2426,16 +2516,17 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg)
 
                for (int t = 1; t < TXG_CONCURRENT_STATES; t++) {
                        VERIFY0(range_tree_space(
-                           msp->ms_alloctree[(txg + t) & TXG_MASK]));
+                           msp->ms_allocating[(txg + t) & TXG_MASK]));
                }
 
                if (!metaslab_debug_unload)
                        metaslab_unload(msp);
        }
 
-       ASSERT0(range_tree_space(msp->ms_alloctree[txg & TXG_MASK]));
-       ASSERT0(range_tree_space(msp->ms_freeingtree));
-       ASSERT0(range_tree_space(msp->ms_freedtree));
+       ASSERT0(range_tree_space(msp->ms_allocating[txg & TXG_MASK]));
+       ASSERT0(range_tree_space(msp->ms_freeing));
+       ASSERT0(range_tree_space(msp->ms_freed));
+       ASSERT0(range_tree_space(msp->ms_checkpointing));
 
        mutex_exit(&msp->ms_lock);
 }
@@ -2666,7 +2757,7 @@ static uint64_t
 metaslab_block_alloc(metaslab_t *msp, uint64_t size, uint64_t txg)
 {
        uint64_t start;
-       range_tree_t *rt = msp->ms_tree;
+       range_tree_t *rt = msp->ms_allocatable;
        metaslab_class_t *mc = msp->ms_group->mg_class;
 
        VERIFY(!msp->ms_condensing);
@@ -2681,10 +2772,10 @@ metaslab_block_alloc(metaslab_t *msp, uint64_t size, uint64_t txg)
                VERIFY3U(range_tree_space(rt) - size, <=, msp->ms_size);
                range_tree_remove(rt, start, size);
 
-               if (range_tree_space(msp->ms_alloctree[txg & TXG_MASK]) == 0)
+               if (range_tree_is_empty(msp->ms_allocating[txg & TXG_MASK]))
                        vdev_dirty(mg->mg_vd, VDD_METASLAB, msp, txg);
 
-               range_tree_add(msp->ms_alloctree[txg & TXG_MASK], start, size);
+               range_tree_add(msp->ms_allocating[txg & TXG_MASK], start, size);
 
                /* Track the last successful allocation */
                msp->ms_alloc_txg = txg;
@@ -3183,12 +3274,11 @@ next:
 
 void
 metaslab_free_concrete(vdev_t *vd, uint64_t offset, uint64_t asize,
-    uint64_t txg)
+    boolean_t checkpoint)
 {
        metaslab_t *msp;
-       ASSERTV(spa_t *spa = vd->vdev_spa);
+       spa_t *spa = vd->vdev_spa;
 
-       ASSERT3U(txg, ==, spa->spa_syncing_txg);
        ASSERT(vdev_is_concrete(vd));
        ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0);
        ASSERT3U(offset >> vd->vdev_ms_shift, <, vd->vdev_ms_count);
@@ -3202,11 +3292,19 @@ metaslab_free_concrete(vdev_t *vd, uint64_t offset, uint64_t asize,
        VERIFY0(P2PHASE(asize, 1ULL << vd->vdev_ashift));
 
        metaslab_check_free_impl(vd, offset, asize);
+
        mutex_enter(&msp->ms_lock);
-       if (range_tree_space(msp->ms_freeingtree) == 0) {
-               vdev_dirty(vd, VDD_METASLAB, msp, txg);
+       if (range_tree_is_empty(msp->ms_freeing) &&
+           range_tree_is_empty(msp->ms_checkpointing)) {
+               vdev_dirty(vd, VDD_METASLAB, msp, spa_syncing_txg(spa));
+       }
+
+       if (checkpoint) {
+               ASSERT(spa_has_checkpoint(spa));
+               range_tree_add(msp->ms_checkpointing, offset, asize);
+       } else {
+               range_tree_add(msp->ms_freeing, offset, asize);
        }
-       range_tree_add(msp->ms_freeingtree, offset, asize);
        mutex_exit(&msp->ms_lock);
 }
 
@@ -3215,23 +3313,25 @@ void
 metaslab_free_impl_cb(uint64_t inner_offset, vdev_t *vd, uint64_t offset,
     uint64_t size, void *arg)
 {
-       uint64_t *txgp = arg;
+       boolean_t *checkpoint = arg;
+
+       ASSERT3P(checkpoint, !=, NULL);
 
        if (vd->vdev_ops->vdev_op_remap != NULL)
-               vdev_indirect_mark_obsolete(vd, offset, size, *txgp);
+               vdev_indirect_mark_obsolete(vd, offset, size);
        else
-               metaslab_free_impl(vd, offset, size, *txgp);
+               metaslab_free_impl(vd, offset, size, *checkpoint);
 }
 
 static void
 metaslab_free_impl(vdev_t *vd, uint64_t offset, uint64_t size,
-    uint64_t txg)
+    boolean_t checkpoint)
 {
        spa_t *spa = vd->vdev_spa;
 
        ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0);
 
-       if (txg > spa_freeze_txg(spa))
+       if (spa_syncing_txg(spa) > spa_freeze_txg(spa))
                return;
 
        if (spa->spa_vdev_removal != NULL &&
@@ -3243,13 +3343,13 @@ metaslab_free_impl(vdev_t *vd, uint64_t offset, uint64_t size,
                 * an indirect vdev (in open context), and then (in syncing
                 * context) clear spa_vdev_removal.
                 */
-               free_from_removing_vdev(vd, offset, size, txg);
+               free_from_removing_vdev(vd, offset, size);
        } else if (vd->vdev_ops->vdev_op_remap != NULL) {
-               vdev_indirect_mark_obsolete(vd, offset, size, txg);
+               vdev_indirect_mark_obsolete(vd, offset, size);
                vd->vdev_ops->vdev_op_remap(vd, offset, size,
-                   metaslab_free_impl_cb, &txg);
+                   metaslab_free_impl_cb, &checkpoint);
        } else {
-               metaslab_free_concrete(vd, offset, size, txg);
+               metaslab_free_concrete(vd, offset, size, checkpoint);
        }
 }
 
@@ -3426,26 +3526,25 @@ metaslab_unalloc_dva(spa_t *spa, const dva_t *dva, uint64_t txg)
        msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];
 
        mutex_enter(&msp->ms_lock);
-       range_tree_remove(msp->ms_alloctree[txg & TXG_MASK],
+       range_tree_remove(msp->ms_allocating[txg & TXG_MASK],
            offset, size);
 
        VERIFY(!msp->ms_condensing);
        VERIFY3U(offset, >=, msp->ms_start);
        VERIFY3U(offset + size, <=, msp->ms_start + msp->ms_size);
-       VERIFY3U(range_tree_space(msp->ms_tree) + size, <=,
+       VERIFY3U(range_tree_space(msp->ms_allocatable) + size, <=,
            msp->ms_size);
        VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift));
        VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift));
-       range_tree_add(msp->ms_tree, offset, size);
+       range_tree_add(msp->ms_allocatable, offset, size);
        mutex_exit(&msp->ms_lock);
 }
 
 /*
- * Free the block represented by DVA in the context of the specified
- * transaction group.
+ * Free the block represented by the given DVA.
  */
 void
-metaslab_free_dva(spa_t *spa, const dva_t *dva, uint64_t txg)
+metaslab_free_dva(spa_t *spa, const dva_t *dva, boolean_t checkpoint)
 {
        uint64_t vdev = DVA_GET_VDEV(dva);
        uint64_t offset = DVA_GET_OFFSET(dva);
@@ -3459,7 +3558,7 @@ metaslab_free_dva(spa_t *spa, const dva_t *dva, uint64_t txg)
                size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);
        }
 
-       metaslab_free_impl(vd, offset, size, txg);
+       metaslab_free_impl(vd, offset, size, checkpoint);
 }
 
 /*
@@ -3529,7 +3628,8 @@ metaslab_claim_concrete(vdev_t *vd, uint64_t offset, uint64_t size,
        if ((txg != 0 && spa_writeable(spa)) || !msp->ms_loaded)
                error = metaslab_activate(msp, METASLAB_WEIGHT_SECONDARY);
 
-       if (error == 0 && !range_tree_contains(msp->ms_tree, offset, size))
+       if (error == 0 &&
+           !range_tree_contains(msp->ms_allocatable, offset, size))
                error = SET_ERROR(ENOENT);
 
        if (error || txg == 0) {        /* txg == 0 indicates dry run */
@@ -3540,13 +3640,15 @@ metaslab_claim_concrete(vdev_t *vd, uint64_t offset, uint64_t size,
        VERIFY(!msp->ms_condensing);
        VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift));
        VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift));
-       VERIFY3U(range_tree_space(msp->ms_tree) - size, <=, msp->ms_size);
-       range_tree_remove(msp->ms_tree, offset, size);
+       VERIFY3U(range_tree_space(msp->ms_allocatable) - size, <=,
+           msp->ms_size);
+       range_tree_remove(msp->ms_allocatable, offset, size);
 
        if (spa_writeable(spa)) {       /* don't dirty if we're zdb(1M) */
-               if (range_tree_space(msp->ms_alloctree[txg & TXG_MASK]) == 0)
+               if (range_tree_is_empty(msp->ms_allocating[txg & TXG_MASK]))
                        vdev_dirty(vd, VDD_METASLAB, msp, txg);
-               range_tree_add(msp->ms_alloctree[txg & TXG_MASK], offset, size);
+               range_tree_add(msp->ms_allocating[txg & TXG_MASK],
+                   offset, size);
        }
 
        mutex_exit(&msp->ms_lock);
@@ -3691,13 +3793,41 @@ metaslab_free(spa_t *spa, const blkptr_t *bp, uint64_t txg, boolean_t now)
        ASSERT(!BP_IS_HOLE(bp));
        ASSERT(!now || bp->blk_birth >= spa_syncing_txg(spa));
 
+       /*
+        * If we have a checkpoint for the pool we need to make sure that
+        * the blocks that we free that are part of the checkpoint won't be
+        * reused until the checkpoint is discarded or we revert to it.
+        *
+        * The checkpoint flag is passed down the metaslab_free code path
+        * and is set whenever we want to add a block to the checkpoint's
+        * accounting. That is, we "checkpoint" blocks that existed at the
+        * time the checkpoint was created and are therefore referenced by
+        * the checkpointed uberblock.
+        *
+        * Note that we don't checkpoint any blocks if the current
+        * syncing txg <= spa_checkpoint_txg. We want these frees to sync
+        * normally as they will be referenced by the checkpointed uberblock.
+        */
+       boolean_t checkpoint = B_FALSE;
+       if (bp->blk_birth <= spa->spa_checkpoint_txg &&
+           spa_syncing_txg(spa) > spa->spa_checkpoint_txg) {
+               /*
+                * At this point, if the block is part of the checkpoint
+                * there is no way it was created in the current txg.
+                */
+               ASSERT(!now);
+               ASSERT3U(spa_syncing_txg(spa), ==, txg);
+               checkpoint = B_TRUE;
+       }
+
        spa_config_enter(spa, SCL_FREE, FTAG, RW_READER);
 
        for (int d = 0; d < ndvas; d++) {
                if (now) {
                        metaslab_unalloc_dva(spa, &dva[d], txg);
                } else {
-                       metaslab_free_dva(spa, &dva[d], txg);
+                       ASSERT3U(txg, ==, spa_syncing_txg(spa));
+                       metaslab_free_dva(spa, &dva[d], checkpoint);
                }
        }
 
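The birth-time comparison above is the entire policy for routing a free. As a minimal editorial sketch (standalone, hypothetical helper name, not part of the commit):

    #include <stdbool.h>
    #include <stdint.h>

    /*
     * A freed block is charged to the checkpoint only if it was born at or
     * before the checkpointed txg (the checkpointed uberblock references it)
     * and the free syncs in a strictly later txg.
     */
    static bool
    free_goes_to_checkpoint(uint64_t blk_birth, uint64_t checkpoint_txg,
        uint64_t syncing_txg)
    {
            return (blk_birth <= checkpoint_txg && syncing_txg > checkpoint_txg);
    }
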
@@ -3818,12 +3948,13 @@ metaslab_check_free_impl(vdev_t *vd, uint64_t offset, uint64_t size)
 
        mutex_enter(&msp->ms_lock);
        if (msp->ms_loaded)
-               range_tree_verify(msp->ms_tree, offset, size);
+               range_tree_verify(msp->ms_allocatable, offset, size);
 
-       range_tree_verify(msp->ms_freeingtree, offset, size);
-       range_tree_verify(msp->ms_freedtree, offset, size);
+       range_tree_verify(msp->ms_freeing, offset, size);
+       range_tree_verify(msp->ms_checkpointing, offset, size);
+       range_tree_verify(msp->ms_freed, offset, size);
        for (int j = 0; j < TXG_DEFER_SIZE; j++)
-               range_tree_verify(msp->ms_defertree[j], offset, size);
+               range_tree_verify(msp->ms_defer[j], offset, size);
        mutex_exit(&msp->ms_lock);
 }
 
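Taken together, the metaslab.c hunks above rename the per-metaslab range trees and add one for checkpointed frees. The following is an editorial summary sketch, not the actual struct metaslab definition; the per-field comments paraphrase the flow shown in the hunks above:

    struct metaslab_trees_sketch {
            range_tree_t *ms_allocating[TXG_SIZE];  /* per-txg allocations -> ms_sm ALLOC */
            range_tree_t *ms_freeing;               /* this txg's frees -> ms_sm FREE */
            range_tree_t *ms_freed;                 /* frees awaiting deferral, rotated into ms_defer[] */
            range_tree_t *ms_defer[TXG_DEFER_SIZE]; /* deferred frees -> ms_allocatable */
            range_tree_t *ms_allocatable;           /* allocatable space, ordered by offset */
            avl_tree_t    ms_allocatable_by_size;   /* same segments, ordered by size */
            range_tree_t *ms_checkpointing;         /* checkpointed frees -> vdev_checkpoint_sm */
    };
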
index 448d00c1e5d43f0c3af6c3f31ce2339099e9dd4f..2181a92df5e3e18e759b06065ce29d16f7ab8ffa 100644
@@ -23,7 +23,7 @@
  * Use is subject to license terms.
  */
 /*
- * Copyright (c) 2013, 2015 by Delphix. All rights reserved.
+ * Copyright (c) 2013, 2017 by Delphix. All rights reserved.
  */
 
 #include <sys/zfs_context.h>
index cdc03e66cd779d688ee5bc2b4193a5277a889ac4..8ab7c3428f2f2139e06700b4cd5c4f63f102d4bd 100644
@@ -153,8 +153,7 @@ const zio_taskq_info_t zio_taskqs[ZIO_TYPES][ZIO_TASKQ_TYPES] = {
 static void spa_sync_version(void *arg, dmu_tx_t *tx);
 static void spa_sync_props(void *arg, dmu_tx_t *tx);
 static boolean_t spa_has_active_shared_spare(spa_t *spa);
-static int spa_load_impl(spa_t *spa, spa_import_type_t type, char **ereport,
-    boolean_t reloading);
+static int spa_load_impl(spa_t *spa, spa_import_type_t type, char **ereport);
 static void spa_vdev_resilver_done(spa_t *spa);
 
 uint_t         zio_taskq_batch_pct = 75;       /* 1 thread per cpu in pset */
@@ -216,6 +215,7 @@ unsigned long       zfs_max_missing_tvds = 0;
  * and we get a chance to retrieve the trusted config.
  */
 uint64_t       zfs_max_missing_tvds_cachefile = SPA_DVAS_PER_BP - 1;
+
 /*
  * In the case where config was assembled by scanning device paths (/dev/dsks
  * by default) we are less tolerant since all the existing devices should have
@@ -223,6 +223,11 @@ uint64_t   zfs_max_missing_tvds_cachefile = SPA_DVAS_PER_BP - 1;
  */
 uint64_t       zfs_max_missing_tvds_scan = 0;
 
+/*
+ * Debugging aid that pauses spa_sync() towards the end.
+ */
+boolean_t      zfs_pause_spa_sync = B_FALSE;
+
 /*
  * ==========================================================================
  * SPA properties routines
@@ -274,6 +279,8 @@ spa_prop_get_config(spa_t *spa, nvlist_t **nvp)
                spa_prop_add_list(*nvp, ZPOOL_PROP_ALLOCATED, NULL, alloc, src);
                spa_prop_add_list(*nvp, ZPOOL_PROP_FREE, NULL,
                    size - alloc, src);
+               spa_prop_add_list(*nvp, ZPOOL_PROP_CHECKPOINT, NULL,
+                   spa->spa_checkpoint_info.sci_dspace, src);
 
                spa_prop_add_list(*nvp, ZPOOL_PROP_FRAGMENTATION, NULL,
                    metaslab_class_fragmentation(mc), src);
@@ -811,6 +818,12 @@ spa_change_guid_check(void *arg, dmu_tx_t *tx)
        vdev_t *rvd = spa->spa_root_vdev;
        uint64_t vdev_state;
 
+       if (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) {
+               int error = (spa_has_checkpoint(spa)) ?
+                   ZFS_ERR_CHECKPOINT_EXISTS : ZFS_ERR_DISCARDING_CHECKPOINT;
+               return (SET_ERROR(error));
+       }
+
        spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
        vdev_state = rvd->vdev_state;
        spa_config_exit(spa, SCL_STATE, FTAG);
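
The same guard, ZFS_ERR_CHECKPOINT_EXISTS while a checkpoint exists versus ZFS_ERR_DISCARDING_CHECKPOINT while one is being discarded, recurs later in this file in spa_vdev_attach(), spa_vdev_detach() and spa_vdev_split_mirror(). As an editorial sketch only (hypothetical helper, not how the commit factors it), it amounts to:

    static int
    spa_checkpoint_blocker_error(spa_t *spa)
    {
            /* Topology changes are refused while the checkpoint feature is active. */
            if (!spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT))
                    return (0);
            return (SET_ERROR(spa_has_checkpoint(spa) ?
                ZFS_ERR_CHECKPOINT_EXISTS : ZFS_ERR_DISCARDING_CHECKPOINT));
    }
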
@@ -1452,6 +1465,12 @@ spa_unload(spa_t *spa)
                spa->spa_condense_zthr = NULL;
        }
 
+       if (spa->spa_checkpoint_discard_zthr != NULL) {
+               ASSERT(!zthr_isrunning(spa->spa_checkpoint_discard_zthr));
+               zthr_destroy(spa->spa_checkpoint_discard_zthr);
+               spa->spa_checkpoint_discard_zthr = NULL;
+       }
+
        spa_condense_fini(spa);
 
        bpobj_close(&spa->spa_deferred_bpobj);
@@ -1535,6 +1554,18 @@ spa_load_spares(spa_t *spa)
        int i;
        vdev_t *vd, *tvd;
 
+#ifndef _KERNEL
+       /*
+        * zdb opens both the current state of the pool and the
+        * checkpointed state (if present), with a different spa_t.
+        *
+        * As spare vdevs are shared among open pools, we skip loading
+        * them when we load the checkpointed state of the pool.
+        */
+       if (!spa_writeable(spa))
+               return;
+#endif
+
        ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
 
        /*
@@ -1654,6 +1685,19 @@ spa_load_l2cache(spa_t *spa)
        vdev_t *vd, **oldvdevs, **newvdevs;
        spa_aux_vdev_t *sav = &spa->spa_l2cache;
 
+#ifndef _KERNEL
+       /*
+        * zdb opens both the current state of the pool and the
+        * checkpointed state (if present), with a different spa_t.
+        *
+        * As L2 caches are part of the ARC which is shared among open
+        * pools, we skip loading them when we load the checkpointed
+        * state of the pool.
+        */
+       if (!spa_writeable(spa))
+               return;
+#endif
+
        ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
 
        oldvdevs = sav->sav_vdevs;
@@ -2206,6 +2250,11 @@ spa_spawn_aux_threads(spa_t *spa)
        ASSERT(MUTEX_HELD(&spa_namespace_lock));
 
        spa_start_indirect_condensing_thread(spa);
+
+       ASSERT3P(spa->spa_checkpoint_discard_zthr, ==, NULL);
+       spa->spa_checkpoint_discard_zthr =
+           zthr_create(spa_checkpoint_discard_thread_check,
+           spa_checkpoint_discard_thread, spa);
 }
 
 /*
@@ -2299,7 +2348,7 @@ spa_load(spa_t *spa, spa_load_state_t state, spa_import_type_t type)
        spa->spa_load_state = state;
 
        gethrestime(&spa->spa_loaded_ts);
-       error = spa_load_impl(spa, type, &ereport, B_FALSE);
+       error = spa_load_impl(spa, type, &ereport);
 
        /*
         * Don't count references from objsets that are already closed
@@ -2606,8 +2655,25 @@ spa_ld_parse_config(spa_t *spa, spa_import_type_t type)
                return (SET_ERROR(EINVAL));
        }
 
-       if ((spa->spa_load_state == SPA_LOAD_IMPORT || spa->spa_load_state ==
-           SPA_LOAD_TRYIMPORT) && spa_guid_exists(pool_guid, 0)) {
+       /*
+        * If we are doing an import, ensure that the pool is not already
+        * imported by checking if its pool guid already exists in the
+        * spa namespace.
+        *
+        * The only case in which we allow an already imported pool to be
+        * imported again is when the pool is checkpointed and we want to
+        * look at its checkpointed state from userland tools like zdb.
+        */
+#ifdef _KERNEL
+       if ((spa->spa_load_state == SPA_LOAD_IMPORT ||
+           spa->spa_load_state == SPA_LOAD_TRYIMPORT) &&
+           spa_guid_exists(pool_guid, 0)) {
+#else
+       if ((spa->spa_load_state == SPA_LOAD_IMPORT ||
+           spa->spa_load_state == SPA_LOAD_TRYIMPORT) &&
+           spa_guid_exists(pool_guid, 0) &&
+           !spa_importing_readonly_checkpoint(spa)) {
+#endif
                spa_load_failed(spa, "a pool with guid %llu is already open",
                    (u_longlong_t)pool_guid);
                return (SET_ERROR(EEXIST));
@@ -2766,6 +2832,19 @@ spa_ld_validate_vdevs(spa_t *spa)
        return (0);
 }
 
+static void
+spa_ld_select_uberblock_done(spa_t *spa, uberblock_t *ub)
+{
+       spa->spa_state = POOL_STATE_ACTIVE;
+       spa->spa_ubsync = spa->spa_uberblock;
+       spa->spa_verify_min_txg = spa->spa_extreme_rewind ?
+           TXG_INITIAL - 1 : spa_last_synced_txg(spa) - TXG_DEFER_SIZE - 1;
+       spa->spa_first_txg = spa->spa_last_ubsync_txg ?
+           spa->spa_last_ubsync_txg : spa_last_synced_txg(spa) + 1;
+       spa->spa_claim_max_txg = spa->spa_first_txg;
+       spa->spa_prev_software_version = ub->ub_software_version;
+}
+
 static int
 spa_ld_select_uberblock(spa_t *spa, spa_import_type_t type)
 {
@@ -2774,6 +2853,29 @@ spa_ld_select_uberblock(spa_t *spa, spa_import_type_t type)
        uberblock_t *ub = &spa->spa_uberblock;
        boolean_t activity_check = B_FALSE;
 
+       /*
+        * If we are opening the checkpointed state of the pool by
+        * rewinding to it, at this point we will have written the
+        * checkpointed uberblock to the vdev labels, so searching
+        * the labels will find the right uberblock.  However, if
+        * we are opening the checkpointed state read-only, we have
+        * not modified the labels. Therefore, we must ignore the
+        * labels and continue using the spa_uberblock that was set
+        * by spa_ld_checkpoint_rewind.
+        *
+        * Note that it would be fine to ignore the labels when
+        * rewinding (opening writeable) as well. However, if we
+        * crash just after writing the labels, we will end up
+        * searching the labels. Doing so in the common case means
+        * that this code path gets exercised normally, rather than
+        * just in the edge case.
+        */
+       if (ub->ub_checkpoint_txg != 0 &&
+           spa_importing_readonly_checkpoint(spa)) {
+               spa_ld_select_uberblock_done(spa, ub);
+               return (0);
+       }
+
        /*
         * Find the best uberblock.
         */
@@ -2905,14 +3007,7 @@ spa_ld_select_uberblock(spa_t *spa, spa_import_type_t type)
        /*
         * Initialize internal SPA structures.
         */
-       spa->spa_state = POOL_STATE_ACTIVE;
-       spa->spa_ubsync = spa->spa_uberblock;
-       spa->spa_verify_min_txg = spa->spa_extreme_rewind ?
-           TXG_INITIAL - 1 : spa_last_synced_txg(spa) - TXG_DEFER_SIZE - 1;
-       spa->spa_first_txg = spa->spa_last_ubsync_txg ?
-           spa->spa_last_ubsync_txg : spa_last_synced_txg(spa) + 1;
-       spa->spa_claim_max_txg = spa->spa_first_txg;
-       spa->spa_prev_software_version = ub->ub_software_version;
+       spa_ld_select_uberblock_done(spa, ub);
 
        return (0);
 }
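
Uberblock selection from the labels prefers the highest txg, with newer timestamps breaking ties; that is why spa_ld_checkpoint_rewind() further down bumps both fields of the checkpointed uberblock before writing it out, so it wins the next selection. A simplified editorial sketch of the ordering (the real vdev_uberblock_compare() has additional tie-breakers; this is not the committed code):

    #include <stdint.h>

    struct ub_sketch {
            uint64_t ub_txg;
            uint64_t ub_timestamp;
    };

    /* Returns <0 if a is "worse" than b, >0 if better, 0 if equivalent. */
    static int
    ub_compare_sketch(const struct ub_sketch *a, const struct ub_sketch *b)
    {
            if (a->ub_txg != b->ub_txg)
                    return (a->ub_txg < b->ub_txg ? -1 : 1);
            if (a->ub_timestamp != b->ub_timestamp)
                    return (a->ub_timestamp < b->ub_timestamp ? -1 : 1);
            return (0);
    }
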
@@ -2935,7 +3030,7 @@ spa_ld_open_rootbp(spa_t *spa)
 }
 
 static int
-spa_ld_load_trusted_config(spa_t *spa, spa_import_type_t type,
+spa_ld_trusted_config(spa_t *spa, spa_import_type_t type,
     boolean_t reloading)
 {
        vdev_t *mrvd, *rvd = spa->spa_root_vdev;
@@ -3609,7 +3704,7 @@ spa_ld_claim_log_blocks(spa_t *spa)
 
 static void
 spa_ld_check_for_config_update(spa_t *spa, uint64_t config_cache_txg,
-    boolean_t reloading)
+    boolean_t update_config_cache)
 {
        vdev_t *rvd = spa->spa_root_vdev;
        int need_update = B_FALSE;
@@ -3621,7 +3716,7 @@ spa_ld_check_for_config_update(spa_t *spa, uint64_t config_cache_txg,
         * If this is a verbatim import, trust the current
         * in-core spa_config and update the disk labels.
         */
-       if (reloading || config_cache_txg != spa->spa_config_txg ||
+       if (update_config_cache || config_cache_txg != spa->spa_config_txg ||
            spa->spa_load_state == SPA_LOAD_IMPORT ||
            spa->spa_load_state == SPA_LOAD_RECOVER ||
            (spa->spa_import_flags & ZFS_IMPORT_VERBATIM))
@@ -3657,18 +3752,38 @@ spa_ld_prepare_for_reload(spa_t *spa)
        spa->spa_async_suspended = async_suspended;
 }
 
-/*
- * Load an existing storage pool, using the config provided. This config
- * describes which vdevs are part of the pool and is later validated against
- * partial configs present in each vdev's label and an entire copy of the
- * config stored in the MOS.
- */
 static int
-spa_load_impl(spa_t *spa, spa_import_type_t type, char **ereport,
-    boolean_t reloading)
+spa_ld_read_checkpoint_txg(spa_t *spa)
+{
+       uberblock_t checkpoint;
+       int error = 0;
+
+       ASSERT0(spa->spa_checkpoint_txg);
+       ASSERT(MUTEX_HELD(&spa_namespace_lock));
+
+       error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+           DMU_POOL_ZPOOL_CHECKPOINT, sizeof (uint64_t),
+           sizeof (uberblock_t) / sizeof (uint64_t), &checkpoint);
+
+       if (error == ENOENT)
+               return (0);
+
+       if (error != 0)
+               return (error);
+
+       ASSERT3U(checkpoint.ub_txg, !=, 0);
+       ASSERT3U(checkpoint.ub_checkpoint_txg, !=, 0);
+       ASSERT3U(checkpoint.ub_timestamp, !=, 0);
+       spa->spa_checkpoint_txg = checkpoint.ub_txg;
+       spa->spa_checkpoint_info.sci_timestamp = checkpoint.ub_timestamp;
+
+       return (0);
+}
+
+static int
+spa_ld_mos_init(spa_t *spa, spa_import_type_t type)
 {
        int error = 0;
-       boolean_t missing_feat_write = B_FALSE;
 
        ASSERT(MUTEX_HELD(&spa_namespace_lock));
        ASSERT(spa->spa_config_source != SPA_CONFIG_SRC_NONE);
@@ -3684,11 +3799,6 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, char **ereport,
        if (type != SPA_IMPORT_ASSEMBLE)
                spa->spa_trust_config = B_FALSE;
 
-       if (reloading)
-               spa_load_note(spa, "RELOADING");
-       else
-               spa_load_note(spa, "LOADING");
-
        /*
         * Parse the config provided to create a vdev tree.
         */
@@ -3721,11 +3831,11 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, char **ereport,
        }
 
        /*
-        * Read vdev labels to find the best uberblock (i.e. latest, unless
-        * spa_load_max_txg is set) and store it in spa_uberblock. We get the
-        * list of features required to read blkptrs in the MOS from the vdev
-        * label with the best uberblock and verify that our version of zfs
-        * supports them all.
+        * Read all vdev labels to find the best uberblock (i.e. latest,
+        * unless spa_load_max_txg is set) and store it in spa_uberblock. We
+        * get the list of features required to read blkptrs in the MOS from
+        * the vdev label with the best uberblock and verify that our version
+        * of zfs supports them all.
         */
        error = spa_ld_select_uberblock(spa, type);
        if (error != 0)
@@ -3740,23 +3850,211 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, char **ereport,
        if (error != 0)
                return (error);
 
+       return (0);
+}
+
+static int
+spa_ld_checkpoint_rewind(spa_t *spa)
+{
+       uberblock_t checkpoint;
+       int error = 0;
+
+       ASSERT(MUTEX_HELD(&spa_namespace_lock));
+       ASSERT(spa->spa_import_flags & ZFS_IMPORT_CHECKPOINT);
+
+       error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+           DMU_POOL_ZPOOL_CHECKPOINT, sizeof (uint64_t),
+           sizeof (uberblock_t) / sizeof (uint64_t), &checkpoint);
+
+       if (error != 0) {
+               spa_load_failed(spa, "unable to retrieve checkpointed "
+                   "uberblock from the MOS config [error=%d]", error);
+
+               if (error == ENOENT)
+                       error = ZFS_ERR_NO_CHECKPOINT;
+
+               return (error);
+       }
+
+       ASSERT3U(checkpoint.ub_txg, <, spa->spa_uberblock.ub_txg);
+       ASSERT3U(checkpoint.ub_txg, ==, checkpoint.ub_checkpoint_txg);
+
+       /*
+        * We need to update the txg and timestamp of the checkpointed
+        * uberblock to be higher than the latest one. This ensures that
+        * the checkpointed uberblock is selected if we were to close and
+        * reopen the pool right after we've written it in the vdev labels.
+        * (also see block comment in vdev_uberblock_compare)
+        */
+       checkpoint.ub_txg = spa->spa_uberblock.ub_txg + 1;
+       checkpoint.ub_timestamp = gethrestime_sec();
+
+       /*
+        * Set current uberblock to be the checkpointed uberblock.
+        */
+       spa->spa_uberblock = checkpoint;
+
+       /*
+        * If we are doing a normal rewind, then the pool is open for
+        * writing and we sync the "updated" checkpointed uberblock to
+        * disk. Once this is done, we've basically rewound the whole
+        * pool and there is no way back.
+        *
+        * There are cases when we don't want to attempt to sync the
+        * checkpointed uberblock to disk because we are opening a
+        * pool as read-only. Specifically, verifying the checkpointed
+        * state with zdb, and importing the checkpointed state to get
+        * a "preview" of its content.
+        */
+       if (spa_writeable(spa)) {
+               vdev_t *rvd = spa->spa_root_vdev;
+
+               spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
+               vdev_t *svd[SPA_SYNC_MIN_VDEVS] = { NULL };
+               int svdcount = 0;
+               int children = rvd->vdev_children;
+               int c0 = spa_get_random(children);
+
+               for (int c = 0; c < children; c++) {
+                       vdev_t *vd = rvd->vdev_child[(c0 + c) % children];
+
+                       /* Stop when revisiting the first vdev */
+                       if (c > 0 && svd[0] == vd)
+                               break;
+
+                       if (vd->vdev_ms_array == 0 || vd->vdev_islog ||
+                           !vdev_is_concrete(vd))
+                               continue;
+
+                       svd[svdcount++] = vd;
+                       if (svdcount == SPA_SYNC_MIN_VDEVS)
+                               break;
+               }
+               error = vdev_config_sync(svd, svdcount, spa->spa_first_txg);
+               if (error == 0)
+                       spa->spa_last_synced_guid = rvd->vdev_guid;
+               spa_config_exit(spa, SCL_ALL, FTAG);
+
+               if (error != 0) {
+                       spa_load_failed(spa, "failed to write checkpointed "
+                           "uberblock to the vdev labels [error=%d]", error);
+                       return (error);
+               }
+       }
+
+       return (0);
+}
+
+static int
+spa_ld_mos_with_trusted_config(spa_t *spa, spa_import_type_t type,
+    boolean_t *update_config_cache)
+{
+       int error;
+
+       /*
+        * Parse the config for pool, open and validate vdevs,
+        * select an uberblock, and use that uberblock to open
+        * the MOS.
+        */
+       error = spa_ld_mos_init(spa, type);
+       if (error != 0)
+               return (error);
+
        /*
         * Retrieve the trusted config stored in the MOS and use it to create
         * a new, exact version of the vdev tree, then reopen all vdevs.
         */
-       error = spa_ld_load_trusted_config(spa, type, reloading);
+       error = spa_ld_trusted_config(spa, type, B_FALSE);
        if (error == EAGAIN) {
-               VERIFY(!reloading);
+               if (update_config_cache != NULL)
+                       *update_config_cache = B_TRUE;
+
                /*
                 * Redo the loading process with the trusted config if it is
                 * too different from the untrusted config.
                 */
                spa_ld_prepare_for_reload(spa);
-               return (spa_load_impl(spa, type, ereport, B_TRUE));
+               spa_load_note(spa, "RELOADING");
+               error = spa_ld_mos_init(spa, type);
+               if (error != 0)
+                       return (error);
+
+               error = spa_ld_trusted_config(spa, type, B_TRUE);
+               if (error != 0)
+                       return (error);
+
        } else if (error != 0) {
                return (error);
        }
 
+       return (0);
+}
+
+/*
+ * Load an existing storage pool, using the config provided. This config
+ * describes which vdevs are part of the pool and is later validated against
+ * partial configs present in each vdev's label and an entire copy of the
+ * config stored in the MOS.
+ */
+static int
+spa_load_impl(spa_t *spa, spa_import_type_t type, char **ereport)
+{
+       int error = 0;
+       boolean_t missing_feat_write = B_FALSE;
+       boolean_t checkpoint_rewind =
+           (spa->spa_import_flags & ZFS_IMPORT_CHECKPOINT);
+       boolean_t update_config_cache = B_FALSE;
+
+       ASSERT(MUTEX_HELD(&spa_namespace_lock));
+       ASSERT(spa->spa_config_source != SPA_CONFIG_SRC_NONE);
+
+       spa_load_note(spa, "LOADING");
+
+       error = spa_ld_mos_with_trusted_config(spa, type, &update_config_cache);
+       if (error != 0)
+               return (error);
+
+       /*
+        * If we are rewinding to the checkpoint then we need to repeat
+        * everything we've done so far in this function but this time
+        * selecting the checkpointed uberblock and using that to open
+        * the MOS.
+        */
+       if (checkpoint_rewind) {
+               /*
+                * If we are rewinding to the checkpoint, update the config
+                * cache anyway.
+                */
+               update_config_cache = B_TRUE;
+
+               /*
+                * Extract the checkpointed uberblock from the current MOS
+                * and use this as the pool's uberblock from now on. If the
+                * pool is imported as writeable we also write the checkpoint
+                * uberblock to the labels, making the rewind permanent.
+                */
+               error = spa_ld_checkpoint_rewind(spa);
+               if (error != 0)
+                       return (error);
+
+               /*
+                * Redo the loading process again with the
+                * checkpointed uberblock.
+                */
+               spa_ld_prepare_for_reload(spa);
+               spa_load_note(spa, "LOADING checkpointed uberblock");
+               error = spa_ld_mos_with_trusted_config(spa, type, NULL);
+               if (error != 0)
+                       return (error);
+       }
+
+       /*
+        * Retrieve the checkpoint txg if the pool has a checkpoint.
+        */
+       error = spa_ld_read_checkpoint_txg(spa);
+       if (error != 0)
+               return (error);
+
        /*
         * Retrieve the mapping of indirect vdevs. Those vdevs were removed
         * from the pool and their contents were re-mapped to other vdevs. Note
@@ -3859,6 +4157,16 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, char **ereport,
 
                ASSERT(spa->spa_load_state != SPA_LOAD_TRYIMPORT);
 
+               /*
+                * In case of a checkpoint rewind, log the original txg
+                * of the checkpointed uberblock.
+                */
+               if (checkpoint_rewind) {
+                       spa_history_log_internal(spa, "checkpoint rewind",
+                           NULL, "rewound state to txg=%llu",
+                           (u_longlong_t)spa->spa_uberblock.ub_checkpoint_txg);
+               }
+
                /*
                 * Traverse the ZIL and claim all blocks.
                 */
@@ -3886,7 +4194,7 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, char **ereport,
                 * and the cachefile (by default /etc/zfs/zpool.cache).
                 */
                spa_ld_check_for_config_update(spa, config_cache_txg,
-                   reloading);
+                   update_config_cache);
 
                /*
                 * Check all DTLs to see if anything needs resilvering.
@@ -3970,6 +4278,15 @@ spa_load_best(spa_t *spa, spa_load_state_t state, uint64_t max_request,
        load_error = rewind_error = spa_load(spa, state, SPA_IMPORT_EXISTING);
        if (load_error == 0)
                return (0);
+       if (load_error == ZFS_ERR_NO_CHECKPOINT) {
+               /*
+                * When attempting checkpoint-rewind on a pool with no
+                * checkpoint, we should not attempt to load uberblocks
+                * from previous txgs when spa_load fails.
+                */
+               ASSERT(spa->spa_import_flags & ZFS_IMPORT_CHECKPOINT);
+               return (load_error);
+       }
 
        if (spa->spa_root_vdev != NULL)
                config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);
@@ -5564,6 +5881,13 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing)
 
        oldvd = spa_lookup_by_guid(spa, guid, B_FALSE);
 
+       ASSERT(MUTEX_HELD(&spa_namespace_lock));
+       if (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) {
+               error = (spa_has_checkpoint(spa)) ?
+                   ZFS_ERR_CHECKPOINT_EXISTS : ZFS_ERR_DISCARDING_CHECKPOINT;
+               return (spa_vdev_exit(spa, NULL, txg, error));
+       }
+
        if (spa->spa_vdev_removal != NULL)
                return (spa_vdev_exit(spa, NULL, txg, EBUSY));
 
@@ -5776,6 +6100,27 @@ spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done)
 
        vd = spa_lookup_by_guid(spa, guid, B_FALSE);
 
+       /*
+        * Besides being called directly from userland through the ioctl
+        * interface, spa_vdev_detach() can potentially be called at the
+        * end of spa_vdev_resilver_done().
+        *
+        * In the regular case, when we have a checkpoint this shouldn't
+        * happen as we never empty the DTLs of a vdev during the scrub
+        * [see comment in dsl_scan_done()]. Thus spa_vdev_resilver_done()
+        * should never get here when we have a checkpoint.
+        *
+        * That said, even if the pool is checkpointed exactly as
+        * spa_vdev_resilver_done() calls this function, everything should
+        * be fine as the resilver will return right away.
+        */
+       ASSERT(MUTEX_HELD(&spa_namespace_lock));
+       if (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) {
+               error = (spa_has_checkpoint(spa)) ?
+                   ZFS_ERR_CHECKPOINT_EXISTS : ZFS_ERR_DISCARDING_CHECKPOINT;
+               return (spa_vdev_exit(spa, NULL, txg, error));
+       }
+
        if (vd == NULL)
                return (spa_vdev_exit(spa, NULL, txg, ENODEV));
 
@@ -6014,6 +6359,13 @@ spa_vdev_split_mirror(spa_t *spa, char *newname, nvlist_t *config,
 
        txg = spa_vdev_enter(spa);
 
+       ASSERT(MUTEX_HELD(&spa_namespace_lock));
+       if (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) {
+               error = (spa_has_checkpoint(spa)) ?
+                   ZFS_ERR_CHECKPOINT_EXISTS : ZFS_ERR_DISCARDING_CHECKPOINT;
+               return (spa_vdev_exit(spa, NULL, txg, error));
+       }
+
        /* clear the log and flush everything up to now */
        activate_slog = spa_passivate_log(spa);
        (void) spa_vdev_config_exit(spa, NULL, txg, 0, FTAG);
@@ -6665,6 +7017,10 @@ spa_async_suspend(spa_t *spa)
        zthr_t *condense_thread = spa->spa_condense_zthr;
        if (condense_thread != NULL && zthr_isrunning(condense_thread))
                VERIFY0(zthr_cancel(condense_thread));
+
+       zthr_t *discard_thread = spa->spa_checkpoint_discard_zthr;
+       if (discard_thread != NULL && zthr_isrunning(discard_thread))
+               VERIFY0(zthr_cancel(discard_thread));
 }
 
 void
@@ -6679,6 +7035,10 @@ spa_async_resume(spa_t *spa)
        zthr_t *condense_thread = spa->spa_condense_zthr;
        if (condense_thread != NULL && !zthr_isrunning(condense_thread))
                zthr_resume(condense_thread);
+
+       zthr_t *discard_thread = spa->spa_checkpoint_discard_zthr;
+       if (discard_thread != NULL && !zthr_isrunning(discard_thread))
+               zthr_resume(discard_thread);
 }
 
 static boolean_t
@@ -7454,6 +7814,8 @@ spa_sync(spa_t *spa, uint64_t txg)
                                    txg));
                                ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg));
                                ASSERT(txg_list_empty(&dp->dp_sync_tasks, txg));
+                               ASSERT(txg_list_empty(&dp->dp_early_sync_tasks,
+                                   txg));
                                break;
                        }
                        spa_sync_deferred_frees(spa, tx);
@@ -7499,16 +7861,22 @@ spa_sync(spa_t *spa, uint64_t txg)
                spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
 
                if (list_is_empty(&spa->spa_config_dirty_list)) {
-                       vdev_t *svd[SPA_SYNC_MIN_VDEVS];
+                       vdev_t *svd[SPA_SYNC_MIN_VDEVS] = { NULL };
                        int svdcount = 0;
                        int children = rvd->vdev_children;
                        int c0 = spa_get_random(children);
 
                        for (int c = 0; c < children; c++) {
                                vd = rvd->vdev_child[(c0 + c) % children];
+
+                               /* Stop when revisiting the first vdev */
+                               if (c > 0 && svd[0] == vd)
+                                       break;
+
                                if (vd->vdev_ms_array == 0 || vd->vdev_islog ||
                                    !vdev_is_concrete(vd))
                                        continue;
+
                                svd[svdcount++] = vd;
                                if (svdcount == SPA_SYNC_MIN_VDEVS)
                                        break;
@@ -7572,6 +7940,9 @@ spa_sync(spa_t *spa, uint64_t txg)
        ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg));
        ASSERT(txg_list_empty(&spa->spa_vdev_txg_list, txg));
 
+       while (zfs_pause_spa_sync)
+               delay(1);
+
        spa->spa_sync_pass = 0;
 
        /*
diff --git a/module/zfs/spa_checkpoint.c b/module/zfs/spa_checkpoint.c
new file mode 100644
index 0000000..5446588
--- /dev/null
+++ b/module/zfs/spa_checkpoint.c
@@ -0,0 +1,638 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2017 by Delphix. All rights reserved.
+ */
+
+/*
+ * Storage Pool Checkpoint
+ *
+ * A storage pool checkpoint can be thought of as a pool-wide snapshot or
+ * a stable version of extreme rewind that guarantees no blocks from the
+ * checkpointed state will have been overwritten. It remembers the entire
+ * state of the storage pool (e.g. snapshots, dataset names, etc.) from the
+ * point that it was taken and the user can rewind back to that point even if
+ * they applied destructive operations on their datasets or even enabled new
+ * zpool on-disk features. If a pool has a checkpoint that is no longer
+ * needed, the user can discard it.
+ *
+ * == On disk data structures used ==
+ *
+ * - The pool has a new feature flag and a new entry in the MOS. The feature
+ *   flag is set to active when we create the checkpoint and remains active
+ *   until the checkpoint is fully discarded. The entry in the MOS config
+ *   (DMU_POOL_ZPOOL_CHECKPOINT) is populated with the uberblock that
+ *   references the state of the pool when we take the checkpoint. The entry
+ *   remains populated until we start discarding the checkpoint or we rewind
+ *   back to it.
+ *
+ * - Each vdev contains a vdev-wide space map while the pool has a checkpoint,
+ *   which persists until the checkpoint is fully discarded. The space map
+ *   contains entries that have been freed in the current state of the pool
+ *   but we want to keep around in case we decide to rewind to the checkpoint.
+ *   [see vdev_checkpoint_sm]
+ *
+ * - Each metaslab's ms_sm space map behaves the same as without the
+ *   checkpoint, with the only exception being the scenario when we free
+ *   blocks that belong to the checkpoint. In this case, these blocks remain
+ *   ALLOCATED in the metaslab's space map and they are added as FREE in the
+ *   vdev's checkpoint space map.
+ *
+ * - Each uberblock has a field (ub_checkpoint_txg) which holds the txg that
+ *   the uberblock was checkpointed. For normal uberblocks this field is 0.
+ *
+ * == Overview of operations ==
+ *
+ * - To create a checkpoint, we first wait for the current TXG to be synced,
+ *   so we can use the most recently synced uberblock (spa_ubsync) as the
+ *   checkpointed uberblock. Then we use an early synctask to place that
+ *   uberblock in the MOS config, increment the feature flag for the
+ *   checkpoint (marking it active), and set spa_checkpoint_txg (see its use
+ *   below) to the TXG of the checkpointed uberblock. We use an early
+ *   synctask for these operations to ensure that no blocks were dirtied
+ *   between the current TXG and the TXG of the checkpointed uberblock
+ *   (i.e. the previous txg).
+ *
+ * - When a checkpoint exists, we need to ensure that the blocks that
+ *   belong to the checkpoint are freed but never reused. This means that
+ *   these blocks should never end up in the ms_allocatable or the ms_freeing
+ *   trees of a metaslab. Therefore, whenever there is a checkpoint the new
+ *   ms_checkpointing tree is used in addition to the aforementioned ones.
+ *
+ *   Whenever a block is freed and we find out that it is referenced by the
+ *   checkpoint (we find out by comparing its birth to spa_checkpoint_txg),
+ *   we place it in the ms_checkpointing tree instead of the ms_freeing tree.
+ *   This way, we divide the blocks that are being freed into checkpointed
+ *   and not-checkpointed blocks (a simplified sketch of this routing follows
+ *   this comment block).
+ *
+ *   In order to persist these frees, we write the extents from the
+ *   ms_freeing tree to the ms_sm as usual, and the extents from the
+ *   ms_checkpointing tree to the vdev_checkpoint_sm. This way, these
+ *   checkpointed extents will remain allocated in the metaslab's ms_sm space
+ *   map, and therefore won't be reused [see metaslab_sync()]. In addition,
+ *   when we discard the checkpoint, we can find the entries that have
+ *   actually been freed in vdev_checkpoint_sm.
+ *   [see spa_checkpoint_discard_thread_sync()]
+ *
+ * - To discard the checkpoint we use an early synctask to delete the
+ *   checkpointed uberblock from the MOS config, set spa_checkpoint_txg to 0,
+ *   and wake up the discarding zthr thread (an open-context async thread).
+ *   We use an early synctask to ensure that the operation happens before any
+ *   new data end up in the checkpoint's data structures.
+ *
+ *   Once the synctask is done and the discarding zthr is awake, we discard
+ *   the checkpointed data over multiple TXGs by having the zthr prefetch
+ *   entries from vdev_checkpoint_sm and then start a synctask that places
+ *   them as free blocks into their respective ms_allocatable and ms_sm
+ *   structures.
+ *   [see spa_checkpoint_discard_thread()]
+ *
+ *   When there are no entries left in the vdev_checkpoint_sm of all
+ *   top-level vdevs, a final synctask runs that decrements the feature flag.
+ *
+ * - To rewind to the checkpoint, we first use the current uberblock and
+ *   open the MOS so we can access the checkpointed uberblock from the MOS
+ *   config. After we retrieve the checkpointed uberblock, we use it as the
+ *   current uberblock for the pool by writing it to disk with an updated
+ *   TXG, opening its version of the MOS, and moving on as usual from there.
+ *   [see spa_ld_checkpoint_rewind()]
+ *
+ *   An important note on rewinding to the checkpoint has to do with how we
+ *   handle ZIL blocks. In the scenario of a rewind, we clear out any ZIL
+ *   blocks that have not been claimed by the time we took the checkpoint
+ *   as they should no longer be valid.
+ *   [see comment in zil_claim()]
+ *
+ * == Miscellaneous information ==
+ *
+ * - In the hypothetical event that we take a checkpoint, remove a vdev,
+ *   and attempt to rewind, the rewind would fail as the checkpointed
+ *   uberblock would reference data in the removed device. For this reason
+ *   and others of similar nature, we disallow the following operations that
+ *   can change the config:
+ *     vdev removal and attach/detach, mirror splitting, and pool reguid.
+ *
+ * - As most of the checkpoint logic is implemented in the SPA and doesn't
+ *   distinguish datasets when it comes to space accounting, having a
+ *   checkpoint can potentially break the boundaries set by dataset
+ *   reservations.
+ */
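/*
 * Illustrative sketch, not part of this change: the routing decision
 * described above (blocks born at or before spa_checkpoint_txg go to
 * ms_checkpointing, everything else to ms_freeing) lives in the metaslab
 * code and is not shown in this excerpt. The standalone model below uses
 * made-up types purely to make the birth-txg comparison concrete.
 */
#include <stdint.h>
#include <stdio.h>

/* Simplified stand-in for the pool state. */
typedef struct {
	uint64_t spa_checkpoint_txg;	/* 0 means no checkpoint exists */
} model_spa_t;

typedef enum { TO_MS_FREEING, TO_MS_CHECKPOINTING } free_target_t;

/*
 * Blocks whose birth txg is at or before the checkpointed txg still
 * belong to the checkpoint, so they must stay out of ms_freeing (and
 * therefore out of ms_allocatable) until the checkpoint is discarded.
 */
static free_target_t
route_free(const model_spa_t *spa, uint64_t blk_birth_txg)
{
	if (spa->spa_checkpoint_txg != 0 &&
	    blk_birth_txg <= spa->spa_checkpoint_txg)
		return (TO_MS_CHECKPOINTING);
	return (TO_MS_FREEING);
}

int
main(void)
{
	model_spa_t spa = { .spa_checkpoint_txg = 100 };

	/* Born before the checkpoint: preserved for a possible rewind. */
	printf("birth  90 -> %s\n", route_free(&spa, 90) ==
	    TO_MS_CHECKPOINTING ? "ms_checkpointing" : "ms_freeing");

	/* Born after the checkpoint: an ordinary free. */
	printf("birth 150 -> %s\n", route_free(&spa, 150) ==
	    TO_MS_CHECKPOINTING ? "ms_checkpointing" : "ms_freeing");
	return (0);
}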
+
+#include <sys/dmu_tx.h>
+#include <sys/dsl_dir.h>
+#include <sys/dsl_synctask.h>
+#include <sys/metaslab_impl.h>
+#include <sys/spa.h>
+#include <sys/spa_impl.h>
+#include <sys/spa_checkpoint.h>
+#include <sys/vdev_impl.h>
+#include <sys/zap.h>
+#include <sys/zfeature.h>
+
+/*
+ * The following parameter limits the amount of memory to be used for the
+ * prefetching of the checkpoint space map done on each vdev while
+ * discarding the checkpoint.
+ *
+ * The reason it exists is because top-level vdevs with long checkpoint
+ * space maps can potentially take up a lot of memory depending on the
+ * amount of checkpointed data that has been freed within them while
+ * the pool had a checkpoint.
+ */
+unsigned long zfs_spa_discard_memory_limit = 16 * 1024 * 1024;
+
+int
+spa_checkpoint_get_stats(spa_t *spa, pool_checkpoint_stat_t *pcs)
+{
+       if (!spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT))
+               return (SET_ERROR(ZFS_ERR_NO_CHECKPOINT));
+
+       bzero(pcs, sizeof (pool_checkpoint_stat_t));
+
+       int error = zap_contains(spa_meta_objset(spa),
+           DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ZPOOL_CHECKPOINT);
+       ASSERT(error == 0 || error == ENOENT);
+
+       if (error == ENOENT)
+               pcs->pcs_state = CS_CHECKPOINT_DISCARDING;
+       else
+               pcs->pcs_state = CS_CHECKPOINT_EXISTS;
+
+       pcs->pcs_space = spa->spa_checkpoint_info.sci_dspace;
+       pcs->pcs_start_time = spa->spa_checkpoint_info.sci_timestamp;
+
+       return (0);
+}
+
+static void
+spa_checkpoint_discard_complete_sync(void *arg, dmu_tx_t *tx)
+{
+       spa_t *spa = arg;
+
+       spa->spa_checkpoint_info.sci_timestamp = 0;
+
+       spa_feature_decr(spa, SPA_FEATURE_POOL_CHECKPOINT, tx);
+
+       spa_history_log_internal(spa, "spa discard checkpoint", tx,
+           "finished discarding checkpointed state from the pool");
+}
+
+typedef struct spa_checkpoint_discard_sync_callback_arg {
+       vdev_t *sdc_vd;
+       uint64_t sdc_txg;
+       uint64_t sdc_entry_limit;
+} spa_checkpoint_discard_sync_callback_arg_t;
+
+static int
+spa_checkpoint_discard_sync_callback(maptype_t type, uint64_t offset,
+    uint64_t size, void *arg)
+{
+       spa_checkpoint_discard_sync_callback_arg_t *sdc = arg;
+       vdev_t *vd = sdc->sdc_vd;
+       metaslab_t *ms = vd->vdev_ms[offset >> vd->vdev_ms_shift];
+       uint64_t end = offset + size;
+
+       if (sdc->sdc_entry_limit == 0)
+               return (EINTR);
+
+       /*
+        * Since the space map is not condensed, we know that
+        * none of its entries crosses the boundaries of
+        * its respective metaslab.
+        *
+        * That said, there is no fundamental requirement that
+        * the checkpoint's space map entries should not cross
+        * metaslab boundaries. So if needed we could add code
+        * that handles metaslab-crossing segments in the future.
+        */
+       VERIFY3U(type, ==, SM_FREE);
+       VERIFY3U(offset, >=, ms->ms_start);
+       VERIFY3U(end, <=, ms->ms_start + ms->ms_size);
+
+       /*
+        * At this point we should not be processing any
+        * other frees concurrently, so the lock is technically
+        * unnecessary. We use the lock anyway though to
+        * potentially save ourselves from future headaches.
+        */
+       mutex_enter(&ms->ms_lock);
+       if (range_tree_is_empty(ms->ms_freeing))
+               vdev_dirty(vd, VDD_METASLAB, ms, sdc->sdc_txg);
+       range_tree_add(ms->ms_freeing, offset, size);
+       mutex_exit(&ms->ms_lock);
+
+       ASSERT3U(vd->vdev_spa->spa_checkpoint_info.sci_dspace, >=, size);
+       ASSERT3U(vd->vdev_stat.vs_checkpoint_space, >=, size);
+
+       vd->vdev_spa->spa_checkpoint_info.sci_dspace -= size;
+       vd->vdev_stat.vs_checkpoint_space -= size;
+       sdc->sdc_entry_limit--;
+
+       return (0);
+}
+
+#ifdef ZFS_DEBUG
+static void
+spa_checkpoint_accounting_verify(spa_t *spa)
+{
+       vdev_t *rvd = spa->spa_root_vdev;
+       uint64_t ckpoint_sm_space_sum = 0;
+       uint64_t vs_ckpoint_space_sum = 0;
+
+       for (uint64_t c = 0; c < rvd->vdev_children; c++) {
+               vdev_t *vd = rvd->vdev_child[c];
+
+               if (vd->vdev_checkpoint_sm != NULL) {
+                       ckpoint_sm_space_sum +=
+                           -vd->vdev_checkpoint_sm->sm_alloc;
+                       vs_ckpoint_space_sum +=
+                           vd->vdev_stat.vs_checkpoint_space;
+                       ASSERT3U(ckpoint_sm_space_sum, ==,
+                           vs_ckpoint_space_sum);
+               } else {
+                       ASSERT0(vd->vdev_stat.vs_checkpoint_space);
+               }
+       }
+       ASSERT3U(spa->spa_checkpoint_info.sci_dspace, ==, ckpoint_sm_space_sum);
+}
+#endif
+
+static void
+spa_checkpoint_discard_thread_sync(void *arg, dmu_tx_t *tx)
+{
+       vdev_t *vd = arg;
+       int error;
+
+       /*
+        * The space map callback is applied only to non-debug entries.
+        * Because the number of debug entries is less than or equal to the
+        * number of non-debug entries, we want to ensure that we only
+        * read what we prefetched from open-context.
+        *
+        * Thus, we set the maximum entries that the space map callback
+        * will be applied to be half the entries that could fit in the
+        * imposed memory limit.
+        */
+       uint64_t max_entry_limit =
+           (zfs_spa_discard_memory_limit / sizeof (uint64_t)) >> 1;
+
+       uint64_t entries_in_sm =
+           space_map_length(vd->vdev_checkpoint_sm) / sizeof (uint64_t);
+
+       /*
+        * Iterate from the end of the space map towards the beginning,
+        * placing its entries on ms_freeing and removing them from the
+        * space map. The iteration stops if one of the following
+        * conditions is true:
+        *
+        * 1] We reached the beginning of the space map. At this point
+        *    the space map should be completely empty and
+        *    space_map_incremental_destroy should have returned 0.
+        *    The next step would be to free and close the space map
+        *    and remove its entry from its vdev's top zap. This allows
+        *    spa_checkpoint_discard_thread() to move on to the next vdev.
+        *
+        * 2] We reached the memory limit (amount of memory used to hold
+        *    space map entries in memory) and space_map_incremental_destroy
+        *    returned EINTR. This means that there are entries remaining
+        *    in the space map that will be cleared in a future invocation
+        *    of this function by spa_checkpoint_discard_thread().
+        */
+       spa_checkpoint_discard_sync_callback_arg_t sdc;
+       sdc.sdc_vd = vd;
+       sdc.sdc_txg = tx->tx_txg;
+       sdc.sdc_entry_limit = MIN(entries_in_sm, max_entry_limit);
+
+       uint64_t entries_before = entries_in_sm;
+
+       error = space_map_incremental_destroy(vd->vdev_checkpoint_sm,
+           spa_checkpoint_discard_sync_callback, &sdc, tx);
+
+       uint64_t entries_after =
+           space_map_length(vd->vdev_checkpoint_sm) / sizeof (uint64_t);
+
+#ifdef ZFS_DEBUG
+       spa_checkpoint_accounting_verify(vd->vdev_spa);
+#endif
+
+       zfs_dbgmsg("discarding checkpoint: txg %llu, vdev id %d, "
+           "deleted %llu entries - %llu entries are left",
+           tx->tx_txg, vd->vdev_id, (entries_before - entries_after),
+           entries_after);
+
+       if (error != EINTR) {
+               if (error != 0) {
+                       zfs_panic_recover("zfs: error %d was returned "
+                           "while incrementally destroying the checkpoint "
+                           "space map of vdev %llu\n",
+                           error, vd->vdev_id);
+               }
+               ASSERT0(entries_after);
+               ASSERT0(vd->vdev_checkpoint_sm->sm_alloc);
+               ASSERT0(vd->vdev_checkpoint_sm->sm_length);
+
+               space_map_free(vd->vdev_checkpoint_sm, tx);
+               space_map_close(vd->vdev_checkpoint_sm);
+               vd->vdev_checkpoint_sm = NULL;
+
+               VERIFY0(zap_remove(vd->vdev_spa->spa_meta_objset,
+                   vd->vdev_top_zap, VDEV_TOP_ZAP_POOL_CHECKPOINT_SM, tx));
+       }
+}
+
+static boolean_t
+spa_checkpoint_discard_is_done(spa_t *spa)
+{
+       vdev_t *rvd = spa->spa_root_vdev;
+
+       ASSERT(!spa_has_checkpoint(spa));
+       ASSERT(spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT));
+
+       for (uint64_t c = 0; c < rvd->vdev_children; c++) {
+               if (rvd->vdev_child[c]->vdev_checkpoint_sm != NULL)
+                       return (B_FALSE);
+               ASSERT0(rvd->vdev_child[c]->vdev_stat.vs_checkpoint_space);
+       }
+
+       return (B_TRUE);
+}
+
+/* ARGSUSED */
+boolean_t
+spa_checkpoint_discard_thread_check(void *arg, zthr_t *zthr)
+{
+       spa_t *spa = arg;
+
+       if (!spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT))
+               return (B_FALSE);
+
+       if (spa_has_checkpoint(spa))
+               return (B_FALSE);
+
+       return (B_TRUE);
+}
+
+int
+spa_checkpoint_discard_thread(void *arg, zthr_t *zthr)
+{
+       spa_t *spa = arg;
+       vdev_t *rvd = spa->spa_root_vdev;
+
+       for (uint64_t c = 0; c < rvd->vdev_children; c++) {
+               vdev_t *vd = rvd->vdev_child[c];
+
+               while (vd->vdev_checkpoint_sm != NULL) {
+                       space_map_t *checkpoint_sm = vd->vdev_checkpoint_sm;
+                       int numbufs;
+                       dmu_buf_t **dbp;
+
+                       if (zthr_iscancelled(zthr))
+                               return (0);
+
+                       ASSERT3P(vd->vdev_ops, !=, &vdev_indirect_ops);
+
+                       uint64_t size = MIN(space_map_length(checkpoint_sm),
+                           zfs_spa_discard_memory_limit);
+                       uint64_t offset =
+                           space_map_length(checkpoint_sm) - size;
+
+                       /*
+                        * Ensure that the part of the space map that will
+                        * be destroyed by the synctask is prefetched in
+                        * memory before the synctask runs.
+                        */
+                       int error = dmu_buf_hold_array_by_bonus(
+                           checkpoint_sm->sm_dbuf, offset, size,
+                           B_TRUE, FTAG, &numbufs, &dbp);
+                       if (error != 0) {
+                               zfs_panic_recover("zfs: error %d was returned "
+                                   "while prefetching checkpoint space map "
+                                   "entries of vdev %llu\n",
+                                   error, vd->vdev_id);
+                       }
+
+                       VERIFY0(dsl_sync_task(spa->spa_name, NULL,
+                           spa_checkpoint_discard_thread_sync, vd,
+                           0, ZFS_SPACE_CHECK_NONE));
+
+                       dmu_buf_rele_array(dbp, numbufs, FTAG);
+               }
+       }
+
+       VERIFY(spa_checkpoint_discard_is_done(spa));
+       VERIFY0(spa->spa_checkpoint_info.sci_dspace);
+       VERIFY0(dsl_sync_task(spa->spa_name, NULL,
+           spa_checkpoint_discard_complete_sync, spa,
+           0, ZFS_SPACE_CHECK_NONE));
+
+       return (0);
+}
+
+/* ARGSUSED */
+static int
+spa_checkpoint_check(void *arg, dmu_tx_t *tx)
+{
+       spa_t *spa = dmu_tx_pool(tx)->dp_spa;
+
+       if (!spa_feature_is_enabled(spa, SPA_FEATURE_POOL_CHECKPOINT))
+               return (SET_ERROR(ENOTSUP));
+
+       if (!spa_top_vdevs_spacemap_addressable(spa))
+               return (SET_ERROR(ZFS_ERR_VDEV_TOO_BIG));
+
+       if (spa->spa_vdev_removal != NULL)
+               return (SET_ERROR(ZFS_ERR_DEVRM_IN_PROGRESS));
+
+       if (spa->spa_checkpoint_txg != 0)
+               return (SET_ERROR(ZFS_ERR_CHECKPOINT_EXISTS));
+
+       if (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT))
+               return (SET_ERROR(ZFS_ERR_DISCARDING_CHECKPOINT));
+
+       return (0);
+}
+
+/* ARGSUSED */
+static void
+spa_checkpoint_sync(void *arg, dmu_tx_t *tx)
+{
+       dsl_pool_t *dp = dmu_tx_pool(tx);
+       spa_t *spa = dp->dp_spa;
+       uberblock_t checkpoint = spa->spa_ubsync;
+
+       /*
+        * At this point, there should not be a checkpoint in the MOS.
+        */
+       ASSERT3U(zap_contains(spa_meta_objset(spa), DMU_POOL_DIRECTORY_OBJECT,
+           DMU_POOL_ZPOOL_CHECKPOINT), ==, ENOENT);
+
+       ASSERT0(spa->spa_checkpoint_info.sci_timestamp);
+       ASSERT0(spa->spa_checkpoint_info.sci_dspace);
+
+       /*
+        * Since the checkpointed uberblock is the one that just got synced
+        * (we use spa_ubsync), its txg must be one less than the txg
+        * that we are currently syncing.
+        */
+       ASSERT3U(checkpoint.ub_txg, ==, spa->spa_syncing_txg - 1);
+
+       /*
+        * Once the checkpoint is in place, we need to ensure that none of
+        * its blocks will be marked for reuse after it has been freed.
+        * When there is a checkpoint and a block is freed, we compare its
+        * birth txg to the txg of the checkpointed uberblock to see if the
+        * block is part of the checkpoint or not. Therefore, we have to set
+        * spa_checkpoint_txg before any frees happen in this txg (which is
+        * why this is done as an early_synctask as explained in the comment
+        * in spa_checkpoint()).
+        */
+       spa->spa_checkpoint_txg = checkpoint.ub_txg;
+       spa->spa_checkpoint_info.sci_timestamp = checkpoint.ub_timestamp;
+
+       checkpoint.ub_checkpoint_txg = checkpoint.ub_txg;
+       VERIFY0(zap_add(spa->spa_dsl_pool->dp_meta_objset,
+           DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ZPOOL_CHECKPOINT,
+           sizeof (uint64_t), sizeof (uberblock_t) / sizeof (uint64_t),
+           &checkpoint, tx));
+
+       /*
+        * Increment the feature refcount and thus activate the feature.
+        * Note that the feature will be deactivated when we've
+        * completely discarded all checkpointed state (both vdev
+        * space maps and uberblock).
+        */
+       spa_feature_incr(spa, SPA_FEATURE_POOL_CHECKPOINT, tx);
+
+       spa_history_log_internal(spa, "spa checkpoint", tx,
+           "checkpointed uberblock txg=%llu", checkpoint.ub_txg);
+}
+
+/*
+ * Create a checkpoint for the pool.
+ */
+int
+spa_checkpoint(const char *pool)
+{
+       int error;
+       spa_t *spa;
+
+       error = spa_open(pool, &spa, FTAG);
+       if (error != 0)
+               return (error);
+
+       mutex_enter(&spa->spa_vdev_top_lock);
+
+       /*
+        * Wait for the currently syncing txg to finish so the latest synced
+        * uberblock (spa_ubsync) has all the changes that we expect
+        * to see if we were to revert later to the checkpoint. In other
+        * words we want the checkpointed uberblock to include/reference
+        * all the changes that were pending at the time that we issued
+        * the checkpoint command.
+        */
+       txg_wait_synced(spa_get_dsl(spa), 0);
+
+       /*
+        * As the checkpointed uberblock references blocks from the previous
+        * txg (spa_ubsync) we want to ensure that we are not freeing any of
+        * these blocks in the same txg that the following synctask will
+        * run. Thus, we run it as an early synctask, so the dirty changes
+        * that are synced to disk afterwards during zios and other synctasks
+        * do not reuse checkpointed blocks.
+        */
+       error = dsl_early_sync_task(pool, spa_checkpoint_check,
+           spa_checkpoint_sync, NULL, 0, ZFS_SPACE_CHECK_NORMAL);
+
+       mutex_exit(&spa->spa_vdev_top_lock);
+
+       spa_close(spa, FTAG);
+       return (error);
+}
+
+/* ARGSUSED */
+static int
+spa_checkpoint_discard_check(void *arg, dmu_tx_t *tx)
+{
+       spa_t *spa = dmu_tx_pool(tx)->dp_spa;
+
+       if (!spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT))
+               return (SET_ERROR(ZFS_ERR_NO_CHECKPOINT));
+
+       if (spa->spa_checkpoint_txg == 0)
+               return (SET_ERROR(ZFS_ERR_DISCARDING_CHECKPOINT));
+
+       VERIFY0(zap_contains(spa_meta_objset(spa),
+           DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ZPOOL_CHECKPOINT));
+
+       return (0);
+}
+
+/* ARGSUSED */
+static void
+spa_checkpoint_discard_sync(void *arg, dmu_tx_t *tx)
+{
+       spa_t *spa = dmu_tx_pool(tx)->dp_spa;
+
+       VERIFY0(zap_remove(spa_meta_objset(spa), DMU_POOL_DIRECTORY_OBJECT,
+           DMU_POOL_ZPOOL_CHECKPOINT, tx));
+
+       spa->spa_checkpoint_txg = 0;
+
+       zthr_wakeup(spa->spa_checkpoint_discard_zthr);
+
+       spa_history_log_internal(spa, "spa discard checkpoint", tx,
+           "started discarding checkpointed state from the pool");
+}
+
+/*
+ * Discard the checkpoint from a pool.
+ */
+int
+spa_checkpoint_discard(const char *pool)
+{
+       /*
+        * Similarly to spa_checkpoint(), we want our synctask to run
+        * before any pending dirty data are written to disk so they
+        * won't end up in the checkpoint's data structures (e.g.
+        * ms_checkpointing and vdev_checkpoint_sm) and re-create any
+        * space maps that the discarding open-context thread has
+        * deleted.
+        * [see spa_checkpoint_discard_sync and spa_checkpoint_discard_thread]
+        */
+       return (dsl_early_sync_task(pool, spa_checkpoint_discard_check,
+           spa_checkpoint_discard_sync, NULL, 0,
+           ZFS_SPACE_CHECK_DISCARD_CHECKPOINT));
+}
+
+#if defined(_KERNEL)
+EXPORT_SYMBOL(spa_checkpoint_get_stats);
+EXPORT_SYMBOL(spa_checkpoint_discard_thread);
+EXPORT_SYMBOL(spa_checkpoint_discard_thread_check);
+
+/* BEGIN CSTYLED */
+module_param(zfs_spa_discard_memory_limit, ulong, 0644);
+MODULE_PARM_DESC(zfs_spa_discard_memory_limit,
+    "Maximum memory for prefetching checkpoint space "
+    "map per top-level vdev while discarding checkpoint");
+/* END CSTYLED */
+#endif
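/*
 * Illustrative sketch, not part of this change: spa_checkpoint() and
 * spa_checkpoint_discard() above are reached from user space through the
 * libzfs_core wrappers that this commit also adds (see libzfs_core.h in
 * the file list). Assuming those wrappers keep the simple
 * "int lzc_pool_checkpoint(const char *)" and
 * "int lzc_pool_checkpoint_discard(const char *)" shape, a minimal
 * consumer could look like the following.
 */
#include <stdio.h>
#include <string.h>
#include <libzfs_core.h>

int
main(int argc, char **argv)
{
	int err;

	if (argc != 3) {
		(void) fprintf(stderr, "usage: %s create|discard <pool>\n",
		    argv[0]);
		return (2);
	}

	/* Open /dev/zfs so the lzc_* ioctl wrappers can be used. */
	if (libzfs_core_init() != 0) {
		(void) fprintf(stderr, "cannot initialize libzfs_core\n");
		return (1);
	}

	if (strcmp(argv[1], "create") == 0)
		err = lzc_pool_checkpoint(argv[2]);
	else
		err = lzc_pool_checkpoint_discard(argv[2]);

	if (err != 0)
		(void) fprintf(stderr, "checkpoint operation failed: %d\n",
		    err);

	libzfs_core_fini();
	return (err != 0);
}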
index 6c7e2f55cd2a88065dfd0d1fdddd642adb914621..9410fab0763cdb32b43c5428c0a844561b0f5efc 100644 (file)
@@ -357,12 +357,15 @@ int spa_asize_inflation = 24;
  * These are the operations that call dsl_pool_adjustedsize() with the netfree
  * argument set to TRUE.
  *
+ * Operations that are almost guaranteed to free up space in the absence of
+ * a pool checkpoint can use up to three quarters of the slop space
+ * (e.g. zfs destroy).
+ *
  * A very restricted set of operations are always permitted, regardless of
  * the amount of free space.  These are the operations that call
- * dsl_sync_task(ZFS_SPACE_CHECK_NONE), e.g. "zfs destroy".  If these
- * operations result in a net increase in the amount of space used,
- * it is possible to run the pool completely out of space, causing it to
- * be permanently read-only.
+ * dsl_sync_task(ZFS_SPACE_CHECK_NONE). If these operations result in a net
+ * increase in the amount of space used, it is possible to run the pool
+ * completely out of space, causing it to be permanently read-only.
  *
  * Note that on very small pools, the slop space will be larger than
  * 3.2%, in an effort to have it be at least spa_min_slop (128MB),
@@ -1718,6 +1721,12 @@ spa_get_dspace(spa_t *spa)
        return (spa->spa_dspace);
 }
 
+uint64_t
+spa_get_checkpoint_space(spa_t *spa)
+{
+       return (spa->spa_checkpoint_info.sci_dspace);
+}
+
 void
 spa_update_dspace(spa_t *spa)
 {
@@ -2065,7 +2074,8 @@ spa_writeable(spa_t *spa)
 boolean_t
 spa_has_pending_synctask(spa_t *spa)
 {
-       return (!txg_all_lists_empty(&spa->spa_dsl_pool->dp_sync_tasks));
+       return (!txg_all_lists_empty(&spa->spa_dsl_pool->dp_sync_tasks) ||
+           !txg_all_lists_empty(&spa->spa_dsl_pool->dp_early_sync_tasks));
 }
 
 int
@@ -2293,6 +2303,63 @@ spa_state_to_name(spa_t *spa)
        return ("UNKNOWN");
 }
 
+boolean_t
+spa_top_vdevs_spacemap_addressable(spa_t *spa)
+{
+       vdev_t *rvd = spa->spa_root_vdev;
+       for (uint64_t c = 0; c < rvd->vdev_children; c++) {
+               if (!vdev_is_spacemap_addressable(rvd->vdev_child[c]))
+                       return (B_FALSE);
+       }
+       return (B_TRUE);
+}
+
+boolean_t
+spa_has_checkpoint(spa_t *spa)
+{
+       return (spa->spa_checkpoint_txg != 0);
+}
+
+boolean_t
+spa_importing_readonly_checkpoint(spa_t *spa)
+{
+       return ((spa->spa_import_flags & ZFS_IMPORT_CHECKPOINT) &&
+           spa->spa_mode == FREAD);
+}
+
+uint64_t
+spa_min_claim_txg(spa_t *spa)
+{
+       uint64_t checkpoint_txg = spa->spa_uberblock.ub_checkpoint_txg;
+
+       if (checkpoint_txg != 0)
+               return (checkpoint_txg + 1);
+
+       return (spa->spa_first_txg);
+}
+
+/*
+ * If there is a checkpoint, async destroys may consume more space from
+ * the pool instead of freeing it. In an attempt to save the pool from
+ * getting suspended when it is about to run out of space, we stop
+ * processing async destroys.
+ */
+boolean_t
+spa_suspend_async_destroy(spa_t *spa)
+{
+       dsl_pool_t *dp = spa_get_dsl(spa);
+
+       uint64_t unreserved = dsl_pool_unreserved_space(dp,
+           ZFS_SPACE_CHECK_EXTRA_RESERVED);
+       uint64_t used = dsl_dir_phys(dp->dp_root_dir)->dd_used_bytes;
+       uint64_t avail = (unreserved > used) ? (unreserved - used) : 0;
+
+       if (spa_has_checkpoint(spa) && avail == 0)
+               return (B_TRUE);
+
+       return (B_FALSE);
+}
+
 #if defined(_KERNEL)
 
 #include <linux/mod_compat.h>
@@ -2446,6 +2513,11 @@ EXPORT_SYMBOL(spa_trust_config);
 EXPORT_SYMBOL(spa_missing_tvds_allowed);
 EXPORT_SYMBOL(spa_set_missing_tvds);
 EXPORT_SYMBOL(spa_state_to_name);
+EXPORT_SYMBOL(spa_importing_readonly_checkpoint);
+EXPORT_SYMBOL(spa_min_claim_txg);
+EXPORT_SYMBOL(spa_suspend_async_destroy);
+EXPORT_SYMBOL(spa_has_checkpoint);
+EXPORT_SYMBOL(spa_top_vdevs_spacemap_addressable);
 
 /* BEGIN CSTYLED */
 module_param(zfs_flags, uint, 0644);
index d84dd7583592e630fdc0b5f3ede7a2b225351b22..0e5a4b97657cfd434472159c9ff2bfce4835a621 100644 (file)
@@ -23,7 +23,7 @@
  * Use is subject to license terms.
  */
 /*
- * Copyright (c) 2012, 2016 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2017 by Delphix. All rights reserved.
  */
 
 #include <sys/zfs_context.h>
 #include <sys/zfeature.h>
 
 /*
+ * Note on space map block size:
+ *
  * The data for a given space map can be kept on blocks of any size.
  * Larger blocks entail fewer i/o operations, but they also cause the
  * DMU to keep more data in-core, and also to waste more i/o bandwidth
  * when only a few blocks have changed since the last transaction group.
  */
-int space_map_blksz = (1 << 12);
 
 /*
  * Iterate through the space map, invoking the callback on each (non-debug)
@@ -105,6 +106,137 @@ space_map_iterate(space_map_t *sm, sm_cb_t callback, void *arg)
        return (error);
 }
 
+/*
+ * Note: This function performs destructive actions - specifically
+ * it deletes entries from the end of the space map. Thus, callers
+ * should ensure that they are holding the appropriate locks for
+ * the space map that they provide.
+ */
+int
+space_map_incremental_destroy(space_map_t *sm, sm_cb_t callback, void *arg,
+    dmu_tx_t *tx)
+{
+       uint64_t bufsize, len;
+       uint64_t *entry_map;
+       int error = 0;
+
+       len = space_map_length(sm);
+       bufsize = MAX(sm->sm_blksz, SPA_MINBLOCKSIZE);
+       entry_map = zio_buf_alloc(bufsize);
+
+       dmu_buf_will_dirty(sm->sm_dbuf, tx);
+
+       /*
+        * Since we can't move the starting offset of the space map
+        * (e.g. there are on-disk references pointing to it), we destroy
+        * its entries incrementally starting from the end.
+        *
+        * The logic that follows is basically the same as the one used
+        * in space_map_iterate() but it traverses the space map
+        * backwards:
+        *
+        * 1] We figure out the size of the buffer that we want to use
+        *    to read the on-disk space map entries.
+        * 2] We figure out the offset at the end of the space map where
+        *    we will start reading entries into our buffer.
+        * 3] We read the on-disk entries into the buffer.
+        * 4] We iterate over the entries from end to beginning calling
+        *    the callback function on each one. As we move from entry
+        *    to entry we decrease the size of the space map, effectively
+        *    deleting each entry.
+        * 5] If there are no more entries in the space map or the
+        *    callback returns a value other than 0, we stop iterating
+        *    over the space map. If there are entries remaining and
+        *    the callback returned zero we go back to step [1].
+        */
+       uint64_t offset = 0, size = 0;
+       while (len > 0 && error == 0) {
+               size = MIN(bufsize, len);
+
+               VERIFY(P2PHASE(size, sizeof (uint64_t)) == 0);
+               VERIFY3U(size, >, 0);
+               ASSERT3U(sm->sm_blksz, !=, 0);
+
+               offset = len - size;
+
+               IMPLY(bufsize > len, offset == 0);
+               IMPLY(bufsize == len, offset == 0);
+               IMPLY(bufsize < len, offset > 0);
+
+               EQUIV(size == len, offset == 0);
+               IMPLY(size < len, bufsize < len);
+
+               dprintf("object=%llu  offset=%llx  size=%llx\n",
+                   space_map_object(sm), offset, size);
+
+               error = dmu_read(sm->sm_os, space_map_object(sm),
+                   offset, size, entry_map, DMU_READ_PREFETCH);
+               if (error != 0)
+                       break;
+
+               uint64_t num_entries = size / sizeof (uint64_t);
+
+               ASSERT3U(num_entries, >, 0);
+
+               while (num_entries > 0) {
+                       uint64_t e, entry_offset, entry_size;
+                       maptype_t type;
+
+                       e = entry_map[num_entries - 1];
+
+                       ASSERT3U(num_entries, >, 0);
+                       ASSERT0(error);
+
+                       if (SM_DEBUG_DECODE(e)) {
+                               sm->sm_phys->smp_objsize -= sizeof (uint64_t);
+                               space_map_update(sm);
+                               len -= sizeof (uint64_t);
+                               num_entries--;
+                               continue;
+                       }
+
+                       type = SM_TYPE_DECODE(e);
+                       entry_offset = (SM_OFFSET_DECODE(e) << sm->sm_shift) +
+                           sm->sm_start;
+                       entry_size = SM_RUN_DECODE(e) << sm->sm_shift;
+
+                       VERIFY0(P2PHASE(entry_offset, 1ULL << sm->sm_shift));
+                       VERIFY0(P2PHASE(entry_size, 1ULL << sm->sm_shift));
+                       VERIFY3U(entry_offset, >=, sm->sm_start);
+                       VERIFY3U(entry_offset + entry_size, <=,
+                           sm->sm_start + sm->sm_size);
+
+                       error = callback(type, entry_offset, entry_size, arg);
+                       if (error != 0)
+                               break;
+
+                       if (type == SM_ALLOC)
+                               sm->sm_phys->smp_alloc -= entry_size;
+                       else
+                               sm->sm_phys->smp_alloc += entry_size;
+
+                       sm->sm_phys->smp_objsize -= sizeof (uint64_t);
+                       space_map_update(sm);
+                       len -= sizeof (uint64_t);
+                       num_entries--;
+               }
+               IMPLY(error == 0, num_entries == 0);
+               EQUIV(offset == 0 && error == 0, len == 0 && num_entries == 0);
+       }
+
+       if (len == 0) {
+               ASSERT0(error);
+               ASSERT0(offset);
+               ASSERT0(sm->sm_length);
+               ASSERT0(sm->sm_phys->smp_objsize);
+               ASSERT0(sm->sm_alloc);
+       }
+
+       zio_buf_free(entry_map, bufsize);
+       return (error);
+}
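/*
 * Illustrative sketch, not part of this change:
 * spa_checkpoint_discard_sync_callback() earlier in this diff is the real
 * consumer of space_map_incremental_destroy(). The standalone model below
 * replays the same back-to-front pattern over a toy array so steps 1-5 of
 * the comment above can be traced: call the callback on the last entry,
 * shrink the map, and let the callback bound the work by returning EINTR.
 */
#include <errno.h>
#include <stdint.h>
#include <stdio.h>

/* One decoded (non-debug) entry of a toy space map. */
typedef struct { uint64_t offset; uint64_t size; } toy_entry_t;
typedef int (*toy_cb_t)(uint64_t offset, uint64_t size, void *arg);

/* Destroy the toy map from its end until it is empty or the callback stops. */
static int
toy_incremental_destroy(toy_entry_t *map, uint64_t *len, toy_cb_t cb, void *arg)
{
	while (*len > 0) {
		int err = cb(map[*len - 1].offset, map[*len - 1].size, arg);
		if (err != 0)
			return (err);
		(*len)--;	/* the entry is now gone from the map */
	}
	return (0);
}

/* Budget-limited callback: asks to be resumed later once its budget is spent. */
static int
budget_cb(uint64_t offset, uint64_t size, void *arg)
{
	int *budget = arg;

	if (*budget == 0)
		return (EINTR);
	(*budget)--;
	printf("freed [%llu, %llu)\n", (unsigned long long)offset,
	    (unsigned long long)(offset + size));
	return (0);
}

int
main(void)
{
	toy_entry_t map[] = { { 0, 4096 }, { 8192, 4096 }, { 20480, 8192 } };
	uint64_t len = 3;
	int budget = 2;

	/* The first pass stops early; a later pass finishes the job. */
	while (toy_incremental_destroy(map, &len, budget_cb, &budget) == EINTR)
		budget = 2;	/* the "next invocation" gets a fresh budget */
	printf("entries left: %llu\n", (unsigned long long)len);
	return (0);
}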
+
 typedef struct space_map_load_arg {
        space_map_t     *smla_sm;
        range_tree_t    *smla_rt;
@@ -279,7 +411,7 @@ space_map_write(space_map_t *sm, range_tree_t *rt, maptype_t maptype,
         */
        sm->sm_phys->smp_object = sm->sm_object;
 
-       if (range_tree_space(rt) == 0) {
+       if (range_tree_is_empty(rt)) {
                VERIFY3U(sm->sm_object, ==, sm->sm_phys->smp_object);
                return;
        }
@@ -418,7 +550,7 @@ space_map_close(space_map_t *sm)
 }
 
 void
-space_map_truncate(space_map_t *sm, dmu_tx_t *tx)
+space_map_truncate(space_map_t *sm, int blocksize, dmu_tx_t *tx)
 {
        objset_t *os = sm->sm_os;
        spa_t *spa = dmu_objset_spa(os);
@@ -440,7 +572,7 @@ space_map_truncate(space_map_t *sm, dmu_tx_t *tx)
         */
        if ((spa_feature_is_enabled(spa, SPA_FEATURE_SPACEMAP_HISTOGRAM) &&
            doi.doi_bonus_size != sizeof (space_map_phys_t)) ||
-           doi.doi_data_block_size != space_map_blksz) {
+           doi.doi_data_block_size != blocksize) {
                zfs_dbgmsg("txg %llu, spa %s, sm %p, reallocating "
                    "object[%llu]: old bonus %u, old blocksz %u",
                    dmu_tx_get_txg(tx), spa_name(spa), sm, sm->sm_object,
@@ -449,7 +581,7 @@ space_map_truncate(space_map_t *sm, dmu_tx_t *tx)
                space_map_free(sm, tx);
                dmu_buf_rele(sm->sm_dbuf, sm);
 
-               sm->sm_object = space_map_alloc(sm->sm_os, tx);
+               sm->sm_object = space_map_alloc(sm->sm_os, blocksize, tx);
                VERIFY0(space_map_open_impl(sm));
        } else {
                VERIFY0(dmu_free_range(os, space_map_object(sm), 0, -1ULL, tx));
@@ -482,7 +614,7 @@ space_map_update(space_map_t *sm)
 }
 
 uint64_t
-space_map_alloc(objset_t *os, dmu_tx_t *tx)
+space_map_alloc(objset_t *os, int blocksize, dmu_tx_t *tx)
 {
        spa_t *spa = dmu_objset_spa(os);
        uint64_t object;
@@ -496,8 +628,7 @@ space_map_alloc(objset_t *os, dmu_tx_t *tx)
                bonuslen = SPACE_MAP_SIZE_V0;
        }
 
-       object = dmu_object_alloc(os,
-           DMU_OT_SPACE_MAP, space_map_blksz,
+       object = dmu_object_alloc(os, DMU_OT_SPACE_MAP, blocksize,
            DMU_OT_SPACE_MAP_HEADER, bonuslen, tx);
 
        return (object);
index c1e85bdce7600684e810dd3cddd390822b8bf808..3b85260764d0fd5bd0abb388e1fffe754aa0724e 100644 (file)
@@ -20,7 +20,7 @@
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2013, 2014 by Delphix. All rights reserved.
+ * Copyright (c) 2013, 2017 by Delphix. All rights reserved.
  */
 
 #include <sys/zfs_context.h>
@@ -60,6 +60,7 @@ uberblock_update(uberblock_t *ub, vdev_t *rvd, uint64_t txg, uint64_t mmp_delay)
        ub->ub_mmp_magic = MMP_MAGIC;
        ub->ub_mmp_delay = spa_multihost(rvd->vdev_spa) ? mmp_delay : 0;
        ub->ub_mmp_seq = 0;
+       ub->ub_checkpoint_txg = 0;
 
        return (ub->ub_rootbp.blk_birth == txg);
 }
index 37bb5a0c5c5b235d2024639b14dad6944ccf744d..cf1bf2837f18a23cb3dda490730aaf0b441b7471 100644 (file)
 #include <sys/zvol.h>
 #include <sys/zfs_ratelimit.h>
 
+/* maximum number of metaslabs per top-level vdev */
+int vdev_max_ms_count = 200;
+
+/* minimum number of metaslabs per top-level vdev */
+int vdev_min_ms_count = 16;
+
+/* see comment in vdev_metaslab_set_size() */
+int vdev_default_ms_shift = 29;
+
+int vdev_validate_skip = B_FALSE;
+
 /*
- * When a vdev is added, it will be divided into approximately (but no
- * more than) this number of metaslabs.
+ * Since the DTL space map of a vdev is not expected to have a lot of
+ * entries, we default its block size to 4K.
  */
-int metaslabs_per_vdev = 200;
+int vdev_dtl_sm_blksz = (1 << 12);
 
 /*
  * Rate limit delay events to this many IO delays per second.
@@ -74,7 +85,12 @@ unsigned int zfs_checksums_per_second = 20;
  */
 int zfs_scan_ignore_errors = 0;
 
-int vdev_validate_skip = B_FALSE;
+/*
+ * vdev-wide space maps that have lots of entries written to them at
+ * the end of each transaction can benefit from a higher I/O bandwidth
+ * (e.g. vdev_obsolete_sm), thus we default their block size to 128K.
+ */
+int vdev_standard_sm_blksz = (1 << 17);
 
 /*PRINTFLIKE2*/
 void
@@ -926,6 +942,9 @@ vdev_top_transfer(vdev_t *svd, vdev_t *tvd)
        if (tvd->vdev_mg != NULL)
                tvd->vdev_mg->mg_vd = tvd;
 
+       tvd->vdev_checkpoint_sm = svd->vdev_checkpoint_sm;
+       svd->vdev_checkpoint_sm = NULL;
+
        tvd->vdev_stat.vs_alloc = svd->vdev_stat.vs_alloc;
        tvd->vdev_stat.vs_space = svd->vdev_stat.vs_space;
        tvd->vdev_stat.vs_dspace = svd->vdev_stat.vs_dspace;
@@ -1169,6 +1188,21 @@ vdev_metaslab_init(vdev_t *vd, uint64_t txg)
 void
 vdev_metaslab_fini(vdev_t *vd)
 {
+       if (vd->vdev_checkpoint_sm != NULL) {
+               ASSERT(spa_feature_is_active(vd->vdev_spa,
+                   SPA_FEATURE_POOL_CHECKPOINT));
+               space_map_close(vd->vdev_checkpoint_sm);
+               /*
+                * Even though we close the space map, we need to set its
+                * pointer to NULL. The reason is that vdev_metaslab_fini()
+                * may be called multiple times for certain operations
+                * (e.g. when destroying a pool) so we need to ensure that
+                * this clause never executes twice. This logic is similar
+                * to the one used for the vdev_ms clause below.
+                */
+               vd->vdev_checkpoint_sm = NULL;
+       }
+
        if (vd->vdev_ms != NULL) {
                uint64_t count = vd->vdev_ms_count;
 
@@ -2095,11 +2129,39 @@ vdev_create(vdev_t *vd, uint64_t txg, boolean_t isreplacing)
 void
 vdev_metaslab_set_size(vdev_t *vd)
 {
+       uint64_t asize = vd->vdev_asize;
+       uint64_t ms_shift = 0;
+
        /*
-        * Aim for roughly metaslabs_per_vdev (default 200) metaslabs per vdev.
+        * For vdevs that are bigger than 8G the metaslab size varies in
+        * a way that the number of metaslabs increases in powers of two,
+        * linearly in terms of vdev_asize, starting from 16 metaslabs.
+        * So for vdev_asize of 8G we get 16 metaslabs, for 16G, we get 32,
+        * and so on, until we hit the maximum metaslab count limit
+        * [vdev_max_ms_count] from which point the metaslab count stays
+        * the same.
         */
-       vd->vdev_ms_shift = highbit64(vd->vdev_asize / metaslabs_per_vdev);
-       vd->vdev_ms_shift = MAX(vd->vdev_ms_shift, SPA_MAXBLOCKSHIFT);
+       ms_shift = vdev_default_ms_shift;
+
+       if ((asize >> ms_shift) < vdev_min_ms_count) {
+               /*
+                * For devices that are less than 8G we want to have
+                * exactly 16 metaslabs. We don't want fewer, as integer
+                * division rounds down and fewer metaslabs mean more
+                * wasted space. We don't want more as these vdevs are
+                * small and in the likely event that we are running
+                * out of space, the SPA will have a hard time finding
+                * space due to fragmentation.
+                */
+               ms_shift = highbit64(asize / vdev_min_ms_count);
+               ms_shift = MAX(ms_shift, SPA_MAXBLOCKSHIFT);
+
+       } else if ((asize >> ms_shift) > vdev_max_ms_count) {
+               ms_shift = highbit64(asize / vdev_max_ms_count);
+       }
+
+       vd->vdev_ms_shift = ms_shift;
+       ASSERT3U(vd->vdev_ms_shift, >=, SPA_MAXBLOCKSHIFT);
 }
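/*
 * Illustrative sketch, not part of this change: a standalone model of the
 * decision above, with highbit64() and the tunables re-declared locally so
 * the thresholds can be checked. It reports 16 metaslabs of 512M for an 8G
 * vdev, 200 of 512M at 100G, and 128 of 8G at 1T once the count would
 * otherwise exceed vdev_max_ms_count.
 */
#include <stdint.h>
#include <stdio.h>

#define	MODEL_SPA_MAXBLOCKSHIFT	24	/* 16M, as in the real headers */

static const int model_min_ms_count = 16;
static const int model_max_ms_count = 200;
static const int model_default_ms_shift = 29;	/* 512M metaslabs */

/* Position of the highest set bit, 1-based (0 for x == 0). */
static int
model_highbit64(uint64_t x)
{
	int h = 0;

	while (x != 0) {
		h++;
		x >>= 1;
	}
	return (h);
}

/* Same decision logic as vdev_metaslab_set_size(), minus the vdev_t. */
static uint64_t
model_ms_shift(uint64_t asize)
{
	uint64_t ms_shift = model_default_ms_shift;

	if ((asize >> ms_shift) < model_min_ms_count) {
		ms_shift = model_highbit64(asize / model_min_ms_count);
		if (ms_shift < MODEL_SPA_MAXBLOCKSHIFT)
			ms_shift = MODEL_SPA_MAXBLOCKSHIFT;
	} else if ((asize >> ms_shift) > model_max_ms_count) {
		ms_shift = model_highbit64(asize / model_max_ms_count);
	}
	return (ms_shift);
}

int
main(void)
{
	uint64_t sizes[] = { 1ULL << 33, 100ULL << 30, 1ULL << 40 };

	for (int i = 0; i < 3; i++) {
		uint64_t shift = model_ms_shift(sizes[i]);

		printf("asize %4llu GiB -> %3llu metaslabs of %4llu MiB\n",
		    (unsigned long long)(sizes[i] >> 30),
		    (unsigned long long)(sizes[i] >> shift),
		    (unsigned long long)(1ULL << (shift - 20)));
	}
	return (0);
}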
 
 void
@@ -2204,7 +2266,7 @@ vdev_dtl_contains(vdev_t *vd, vdev_dtl_type_t t, uint64_t txg, uint64_t size)
                return (B_FALSE);
 
        mutex_enter(&vd->vdev_dtl_lock);
-       if (range_tree_space(rt) != 0)
+       if (!range_tree_is_empty(rt))
                dirty = range_tree_contains(rt, txg, size);
        mutex_exit(&vd->vdev_dtl_lock);
 
@@ -2218,7 +2280,7 @@ vdev_dtl_empty(vdev_t *vd, vdev_dtl_type_t t)
        boolean_t empty;
 
        mutex_enter(&vd->vdev_dtl_lock);
-       empty = (range_tree_space(rt) == 0);
+       empty = range_tree_is_empty(rt);
        mutex_exit(&vd->vdev_dtl_lock);
 
        return (empty);
@@ -2292,7 +2354,7 @@ vdev_dtl_should_excise(vdev_t *vd)
                return (B_FALSE);
 
        if (vd->vdev_resilver_txg == 0 ||
-           range_tree_space(vd->vdev_dtl[DTL_MISSING]) == 0)
+           range_tree_is_empty(vd->vdev_dtl[DTL_MISSING]))
                return (B_TRUE);
 
        /*
@@ -2396,8 +2458,8 @@ vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, int scrub_done)
                 * the top level so that we persist the change.
                 */
                if (vd->vdev_resilver_txg != 0 &&
-                   range_tree_space(vd->vdev_dtl[DTL_MISSING]) == 0 &&
-                   range_tree_space(vd->vdev_dtl[DTL_OUTAGE]) == 0) {
+                   range_tree_is_empty(vd->vdev_dtl[DTL_MISSING]) &&
+                   range_tree_is_empty(vd->vdev_dtl[DTL_OUTAGE])) {
                        vd->vdev_resilver_txg = 0;
                        vdev_config_dirty(vd->vdev_top);
                }
@@ -2557,7 +2619,7 @@ vdev_dtl_sync(vdev_t *vd, uint64_t txg)
        if (vd->vdev_dtl_sm == NULL) {
                uint64_t new_object;
 
-               new_object = space_map_alloc(mos, tx);
+               new_object = space_map_alloc(mos, vdev_dtl_sm_blksz, tx);
                VERIFY3U(new_object, !=, 0);
 
                VERIFY0(space_map_open(&vd->vdev_dtl_sm, mos, new_object,
@@ -2571,7 +2633,7 @@ vdev_dtl_sync(vdev_t *vd, uint64_t txg)
        range_tree_walk(rt, range_tree_add, rtsync);
        mutex_exit(&vd->vdev_dtl_lock);
 
-       space_map_truncate(vd->vdev_dtl_sm, tx);
+       space_map_truncate(vd->vdev_dtl_sm, vdev_dtl_sm_blksz, tx);
        space_map_write(vd->vdev_dtl_sm, rtsync, SM_ALLOC, tx);
        range_tree_vacate(rtsync, NULL, NULL);
 
@@ -2642,7 +2704,7 @@ vdev_resilver_needed(vdev_t *vd, uint64_t *minp, uint64_t *maxp)
 
        if (vd->vdev_children == 0) {
                mutex_enter(&vd->vdev_dtl_lock);
-               if (range_tree_space(vd->vdev_dtl[DTL_MISSING]) != 0 &&
+               if (!range_tree_is_empty(vd->vdev_dtl[DTL_MISSING]) &&
                    vdev_writeable(vd)) {
 
                        thismin = vdev_dtl_min(vd);
@@ -2670,6 +2732,28 @@ vdev_resilver_needed(vdev_t *vd, uint64_t *minp, uint64_t *maxp)
        return (needed);
 }
 
+/*
+ * Gets the checkpoint space map object from the vdev's ZAP.
+ * Returns the space map object, or 0 if it wasn't in the ZAP
+ * or the ZAP doesn't exist yet.
+ */
+int
+vdev_checkpoint_sm_object(vdev_t *vd)
+{
+       ASSERT0(spa_config_held(vd->vdev_spa, SCL_ALL, RW_WRITER));
+       if (vd->vdev_top_zap == 0) {
+               return (0);
+       }
+
+       uint64_t sm_obj = 0;
+       int err = zap_lookup(spa_meta_objset(vd->vdev_spa), vd->vdev_top_zap,
+           VDEV_TOP_ZAP_POOL_CHECKPOINT_SM, sizeof (uint64_t), 1, &sm_obj);
+
+       VERIFY(err == 0 || err == ENOENT);
+
+       return (sm_obj);
+}
+
 int
 vdev_load(vdev_t *vd)
 {
@@ -2705,6 +2789,35 @@ vdev_load(vdev_t *vd)
                            VDEV_AUX_CORRUPT_DATA);
                        return (error);
                }
+
+               uint64_t checkpoint_sm_obj = vdev_checkpoint_sm_object(vd);
+               if (checkpoint_sm_obj != 0) {
+                       objset_t *mos = spa_meta_objset(vd->vdev_spa);
+                       ASSERT(vd->vdev_asize != 0);
+                       ASSERT3P(vd->vdev_checkpoint_sm, ==, NULL);
+
+                       if ((error = space_map_open(&vd->vdev_checkpoint_sm,
+                           mos, checkpoint_sm_obj, 0, vd->vdev_asize,
+                           vd->vdev_ashift))) {
+                               vdev_dbgmsg(vd, "vdev_load: space_map_open "
+                                   "failed for checkpoint spacemap (obj %llu) "
+                                   "[error=%d]",
+                                   (u_longlong_t)checkpoint_sm_obj, error);
+                               return (error);
+                       }
+                       ASSERT3P(vd->vdev_checkpoint_sm, !=, NULL);
+                       space_map_update(vd->vdev_checkpoint_sm);
+
+                       /*
+                        * Since the checkpoint_sm contains free entries
+                        * exclusively we can use sm_alloc to indicate the
+                        * cumulative checkpointed space that has been freed.
+                        */
+                       vd->vdev_stat.vs_checkpoint_space =
+                           -vd->vdev_checkpoint_sm->sm_alloc;
+                       vd->vdev_spa->spa_checkpoint_info.sci_dspace +=
+                           vd->vdev_stat.vs_checkpoint_space;
+               }
        }
 
        /*
@@ -2722,7 +2835,7 @@ vdev_load(vdev_t *vd)
        if (obsolete_sm_object != 0) {
                objset_t *mos = vd->vdev_spa->spa_meta_objset;
                ASSERT(vd->vdev_asize != 0);
-               ASSERT(vd->vdev_obsolete_sm == NULL);
+               ASSERT3P(vd->vdev_obsolete_sm, ==, NULL);
 
                if ((error = space_map_open(&vd->vdev_obsolete_sm, mos,
                    obsolete_sm_object, 0, vd->vdev_asize, 0))) {
@@ -2848,6 +2961,12 @@ vdev_remove_empty(vdev_t *vd, uint64_t txg)
                        mutex_exit(&msp->ms_lock);
                }
 
+               if (vd->vdev_checkpoint_sm != NULL) {
+                       ASSERT(spa_has_checkpoint(spa));
+                       space_map_close(vd->vdev_checkpoint_sm);
+                       vd->vdev_checkpoint_sm = NULL;
+               }
+
                metaslab_group_histogram_verify(mg);
                metaslab_class_histogram_verify(mg->mg_class);
                for (int i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++)
@@ -3181,6 +3300,17 @@ top:
 
                        error = spa_reset_logs(spa);
 
+                       /*
+                        * If the log device was successfully reset but has
+                        * checkpointed data, do not offline it.
+                        */
+                       if (error == 0 &&
+                           tvd->vdev_checkpoint_sm != NULL) {
+                               ASSERT3U(tvd->vdev_checkpoint_sm->sm_alloc,
+                                   !=, 0);
+                               error = ZFS_ERR_CHECKPOINT_EXISTS;
+                       }
+
                        spa_vdev_state_enter(spa, SCL_ALLOC);
 
                        /*
@@ -3419,6 +3549,23 @@ vdev_get_child_stat_ex(vdev_t *cvd, vdev_stat_ex_t *vsx, vdev_stat_ex_t *cvsx)
 
 }
 
+boolean_t
+vdev_is_spacemap_addressable(vdev_t *vd)
+{
+       /*
+        * Assuming 47 bits of the space map entry are dedicated to the entry's
+        * offset (see description in space_map.h), we calculate the maximum
+        * address that can be described by a space map entry for the given
+        * device.
+        */
+       uint64_t shift = vd->vdev_ashift + 47;
+
+       if (shift >= 63) /* detect potential overflow */
+               return (B_TRUE);
+
+       return (vd->vdev_asize < (1ULL << shift));
+}
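/*
 * Worked example, not part of this change: with 47 bits of offset per
 * space map entry, an ashift=9 vdev remains spacemap-addressable as long
 * as its asize is below 1ULL << (9 + 47), i.e. 64 PiB. Once ashift reaches
 * 16, the computed shift is 63 or more and the function returns B_TRUE
 * outright rather than risk shifting 1ULL out of range.
 */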
+
 /*
  * Get statistics for the given vdev.
  */
@@ -4243,11 +4390,15 @@ EXPORT_SYMBOL(vdev_online);
 EXPORT_SYMBOL(vdev_offline);
 EXPORT_SYMBOL(vdev_clear);
 /* BEGIN CSTYLED */
-module_param(metaslabs_per_vdev, int, 0644);
-MODULE_PARM_DESC(metaslabs_per_vdev,
+module_param(vdev_max_ms_count, int, 0644);
+MODULE_PARM_DESC(vdev_max_ms_count,
        "Divide added vdev into approximately (but no more than) this number "
        "of metaslabs");
 
+module_param(vdev_min_ms_count, int, 0644);
+MODULE_PARM_DESC(vdev_min_ms_count,
+       "Minimum number of metaslabs per top-level vdev");
+
 module_param(zfs_delays_per_second, uint, 0644);
 MODULE_PARM_DESC(zfs_delays_per_second, "Rate limit delay events to this many "
        "IO delays per second");
index a93e412589b487c0bd5bd90bf8a279d52a971186..b14b153b2aaf152470ecfd1d69920af50fd411bb 100644 (file)
@@ -298,14 +298,13 @@ static const zio_vsd_ops_t vdev_indirect_vsd_ops = {
 };
 
 /*
- * Mark the given offset and size as being obsolete in the given txg.
+ * Mark the given offset and size as being obsolete.
  */
 void
-vdev_indirect_mark_obsolete(vdev_t *vd, uint64_t offset, uint64_t size,
-    uint64_t txg)
+vdev_indirect_mark_obsolete(vdev_t *vd, uint64_t offset, uint64_t size)
 {
        spa_t *spa = vd->vdev_spa;
-       ASSERT3U(spa_syncing_txg(spa), ==, txg);
+
        ASSERT3U(vd->vdev_indirect_config.vic_mapping_object, !=, 0);
        ASSERT(vd->vdev_removing || vd->vdev_ops == &vdev_indirect_ops);
        ASSERT(size > 0);
@@ -316,7 +315,7 @@ vdev_indirect_mark_obsolete(vdev_t *vd, uint64_t offset, uint64_t size,
                mutex_enter(&vd->vdev_obsolete_lock);
                range_tree_add(vd->vdev_obsolete_segments, offset, size);
                mutex_exit(&vd->vdev_obsolete_lock);
-               vdev_dirty(vd, 0, NULL, txg);
+               vdev_dirty(vd, 0, NULL, spa_syncing_txg(spa));
        }
 }
 
@@ -334,7 +333,7 @@ spa_vdev_indirect_mark_obsolete(spa_t *spa, uint64_t vdev_id, uint64_t offset,
 
        /* The DMU can only remap indirect vdevs. */
        ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops);
-       vdev_indirect_mark_obsolete(vd, offset, size, dmu_tx_get_txg(tx));
+       vdev_indirect_mark_obsolete(vd, offset, size);
 }
 
 static spa_condensing_indirect_t *
@@ -727,7 +726,8 @@ spa_condense_indirect_thread(void *arg, zthr_t *zthr)
                return (0);
 
        VERIFY0(dsl_sync_task(spa_name(spa), NULL,
-           spa_condense_indirect_complete_sync, sci, 0, ZFS_SPACE_CHECK_NONE));
+           spa_condense_indirect_complete_sync, sci, 0,
+           ZFS_SPACE_CHECK_EXTRA_RESERVED));
 
        return (0);
 }
@@ -804,7 +804,8 @@ vdev_indirect_sync_obsolete(vdev_t *vd, dmu_tx_t *tx)
 
        if (vdev_obsolete_sm_object(vd) == 0) {
                uint64_t obsolete_sm_object =
-                   space_map_alloc(spa->spa_meta_objset, tx);
+                   space_map_alloc(spa->spa_meta_objset,
+                   vdev_standard_sm_blksz, tx);
 
                ASSERT(vd->vdev_top_zap != 0);
                VERIFY0(zap_add(vd->vdev_spa->spa_meta_objset, vd->vdev_top_zap,
index 7ea8da1e6ed3a4fd3d8a3d8cf7b0c73814332e4a..29d7d651bf0d651c50b52b1e382f6c54a55f2e00 100644 (file)
@@ -21,7 +21,7 @@
 
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012, 2016 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
  */
 
 /*
@@ -352,6 +352,37 @@ vdev_config_generate_stats(vdev_t *vd, nvlist_t *nv)
        kmem_free(vsx, sizeof (*vsx));
 }
 
+static void
+root_vdev_actions_getprogress(vdev_t *vd, nvlist_t *nvl)
+{
+       spa_t *spa = vd->vdev_spa;
+
+       if (vd != spa->spa_root_vdev)
+               return;
+
+       /* provide either current or previous scan information */
+       pool_scan_stat_t ps;
+       if (spa_scan_get_stats(spa, &ps) == 0) {
+               fnvlist_add_uint64_array(nvl,
+                   ZPOOL_CONFIG_SCAN_STATS, (uint64_t *)&ps,
+                   sizeof (pool_scan_stat_t) / sizeof (uint64_t));
+       }
+
+       pool_removal_stat_t prs;
+       if (spa_removal_get_stats(spa, &prs) == 0) {
+               fnvlist_add_uint64_array(nvl,
+                   ZPOOL_CONFIG_REMOVAL_STATS, (uint64_t *)&prs,
+                   sizeof (prs) / sizeof (uint64_t));
+       }
+
+       pool_checkpoint_stat_t pcs;
+       if (spa_checkpoint_get_stats(spa, &pcs) == 0) {
+               fnvlist_add_uint64_array(nvl,
+                   ZPOOL_CONFIG_CHECKPOINT_STATS, (uint64_t *)&pcs,
+                   sizeof (pcs) / sizeof (uint64_t));
+       }
+}
+
 /*
  * Generate the nvlist representing this vdev's config.
  */
@@ -474,20 +505,7 @@ vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats,
        if (getstats) {
                vdev_config_generate_stats(vd, nv);
 
-               /* provide either current or previous scan information */
-               pool_scan_stat_t ps;
-               if (spa_scan_get_stats(spa, &ps) == 0) {
-                       fnvlist_add_uint64_array(nv,
-                           ZPOOL_CONFIG_SCAN_STATS, (uint64_t *)&ps,
-                           sizeof (pool_scan_stat_t) / sizeof (uint64_t));
-               }
-
-               pool_removal_stat_t prs;
-               if (spa_removal_get_stats(spa, &prs) == 0) {
-                       fnvlist_add_uint64_array(nv,
-                           ZPOOL_CONFIG_REMOVAL_STATS, (uint64_t *)&prs,
-                           sizeof (prs) / sizeof (uint64_t));
-               }
+               root_vdev_actions_getprogress(vd, nv);
 
                /*
                 * Note: this can be called from open context
@@ -1525,11 +1543,10 @@ vdev_config_sync(vdev_t **svd, int svdcount, uint64_t txg)
 {
        spa_t *spa = svd[0]->vdev_spa;
        uberblock_t *ub = &spa->spa_uberblock;
-       vdev_t *vd;
-       zio_t *zio;
        int error = 0;
        int flags = ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL;
 
+       ASSERT(svdcount != 0);
 retry:
        /*
         * Normally, we don't want to try too hard to write every label and
@@ -1571,9 +1588,10 @@ retry:
         * written in this txg will be committed to stable storage
         * before any uberblock that references them.
         */
-       zio = zio_root(spa, NULL, NULL, flags);
+       zio_t *zio = zio_root(spa, NULL, NULL, flags);
 
-       for (vd = txg_list_head(&spa->spa_vdev_txg_list, TXG_CLEAN(txg)); vd;
+       for (vdev_t *vd =
+           txg_list_head(&spa->spa_vdev_txg_list, TXG_CLEAN(txg)); vd != NULL;
            vd = txg_list_next(&spa->spa_vdev_txg_list, vd, TXG_CLEAN(txg)))
                zio_flush(zio, vd);
 
@@ -1588,8 +1606,14 @@ retry:
         * the new labels to disk to ensure that all even-label updates
         * are committed to stable storage before the uberblock update.
         */
-       if ((error = vdev_label_sync_list(spa, 0, txg, flags)) != 0)
+       if ((error = vdev_label_sync_list(spa, 0, txg, flags)) != 0) {
+               if ((flags & ZIO_FLAG_TRYHARD) != 0) {
+                       zfs_dbgmsg("vdev_label_sync_list() returned error %d "
+                           "for pool '%s' when syncing out the even labels "
+                           "of dirty vdevs", error, spa_name(spa));
+               }
                goto retry;
+       }
 
        /*
         * Sync the uberblocks to all vdevs in svd[].
@@ -1606,8 +1630,13 @@ retry:
         *      been successfully committed) will be valid with respect
         *      to the new uberblocks.
         */
-       if ((error = vdev_uberblock_sync_list(svd, svdcount, ub, flags)) != 0)
+       if ((error = vdev_uberblock_sync_list(svd, svdcount, ub, flags)) != 0) {
+               if ((flags & ZIO_FLAG_TRYHARD) != 0) {
+                       zfs_dbgmsg("vdev_uberblock_sync_list() returned error "
+                           "%d for pool '%s'", error, spa_name(spa));
+               }
                goto retry;
+       }
 
        if (spa_multihost(spa))
                mmp_update_uberblock(spa, ub);
@@ -1622,8 +1651,14 @@ retry:
         * to disk to ensure that all odd-label updates are committed to
         * stable storage before the next transaction group begins.
         */
-       if ((error = vdev_label_sync_list(spa, 1, txg, flags)) != 0)
+       if ((error = vdev_label_sync_list(spa, 1, txg, flags)) != 0) {
+               if ((flags & ZIO_FLAG_TRYHARD) != 0) {
+                       zfs_dbgmsg("vdev_label_sync_list() returned error %d "
+                           "for pool '%s' when syncing out the odd labels of "
+                           "dirty vdevs", error, spa_name(spa));
+               }
                goto retry;
+       }
 
        return (0);
 }
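The zfs_dbgmsg() calls added above only fire on the ZIO_FLAG_TRYHARD retry path, so repeated label or uberblock sync failures leave a trace in the ZFS debug log. A sketch for reading that log on Linux, assuming the standard kstat location:

    echo 1 > /sys/module/zfs/parameters/zfs_dbgmsg_enable
    grep -E 'vdev_(label|uberblock)_sync_list' /proc/spl/kstat/zfs/dbgmsg
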
index f9084e8cf653b7cf874be6bb021598b52145889c..f2bdd63898d5ccb3a25014cc3e0aac148c4baa6e 100644 (file)
@@ -117,6 +117,12 @@ int zfs_remove_max_segment = SPA_MAXBLOCKSIZE;
  */
 int vdev_removal_max_span = 32 * 1024;
 
+/*
+ * This is used by the test suite so that it can ensure that certain
+ * actions happen while in the middle of a removal.
+ */
+unsigned long zfs_remove_max_bytes_pause = -1UL;
+
 #define        VDEV_REMOVAL_ZAP_OBJS   "lzap"
 
 static void spa_vdev_remove_thread(void *arg);
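A rough test-style sketch of how zfs_remove_max_bytes_pause might be exercised, using the suite's set_tunable64 helper; the 10MB threshold and the $REMOVEDISK name are hypothetical:

    log_must set_tunable64 zfs_remove_max_bytes_pause $((10 * 1024 * 1024))
    log_must zpool remove $TESTPOOL $REMOVEDISK
    # ... the removal thread parks in its delay loop once ~10MB have been copied ...
    log_must set_tunable64 zfs_remove_max_bytes_pause 18446744073709551615  # -1UL, let it resume
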
@@ -286,11 +292,11 @@ vdev_remove_initiate_sync(void *arg, dmu_tx_t *tx)
                 * be copied.
                 */
                spa->spa_removing_phys.sr_to_copy -=
-                   range_tree_space(ms->ms_freeingtree);
+                   range_tree_space(ms->ms_freeing);
 
-               ASSERT0(range_tree_space(ms->ms_freedtree));
+               ASSERT0(range_tree_space(ms->ms_freed));
                for (int t = 0; t < TXG_SIZE; t++)
-                       ASSERT0(range_tree_space(ms->ms_alloctree[t]));
+                       ASSERT0(range_tree_space(ms->ms_allocating[t]));
        }
 
        /*
@@ -467,19 +473,18 @@ spa_restart_removal(spa_t *spa)
  * and we correctly free already-copied data.
  */
 void
-free_from_removing_vdev(vdev_t *vd, uint64_t offset, uint64_t size,
-    uint64_t txg)
+free_from_removing_vdev(vdev_t *vd, uint64_t offset, uint64_t size)
 {
        spa_t *spa = vd->vdev_spa;
        spa_vdev_removal_t *svr = spa->spa_vdev_removal;
        vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;
+       uint64_t txg = spa_syncing_txg(spa);
        uint64_t max_offset_yet = 0;
 
        ASSERT(vd->vdev_indirect_config.vic_mapping_object != 0);
        ASSERT3U(vd->vdev_indirect_config.vic_mapping_object, ==,
            vdev_indirect_mapping_object(vim));
        ASSERT3U(vd->vdev_id, ==, svr->svr_vdev_id);
-       ASSERT3U(spa_syncing_txg(spa), ==, txg);
 
        mutex_enter(&svr->svr_lock);
 
@@ -494,8 +499,13 @@ free_from_removing_vdev(vdev_t *vd, uint64_t offset, uint64_t size,
         * held, so that the remove_thread can not load this metaslab and then
         * visit this offset between the time that we metaslab_free_concrete()
         * and when we check to see if it has been visited.
+        *
+        * Note: The checkpoint flag is set to false as having/taking
+        * a checkpoint and removing a device can't happen at the same
+        * time.
         */
-       metaslab_free_concrete(vd, offset, size, txg);
+       ASSERT(!spa_has_checkpoint(spa));
+       metaslab_free_concrete(vd, offset, size, B_FALSE);
 
        uint64_t synced_size = 0;
        uint64_t synced_offset = 0;
@@ -627,16 +637,17 @@ free_from_removing_vdev(vdev_t *vd, uint64_t offset, uint64_t size,
         * of this free.
         */
        if (synced_size > 0) {
-               vdev_indirect_mark_obsolete(vd, synced_offset, synced_size,
-                   txg);
+               vdev_indirect_mark_obsolete(vd, synced_offset, synced_size);
+
                /*
                 * Note: this can only be called from syncing context,
                 * and the vdev_indirect_mapping is only changed from the
                 * sync thread, so we don't need svr_lock while doing
                 * metaslab_free_impl_cb.
                 */
+               boolean_t checkpoint = B_FALSE;
                vdev_indirect_ops.vdev_op_remap(vd, synced_offset, synced_size,
-                   metaslab_free_impl_cb, &txg);
+                   metaslab_free_impl_cb, &checkpoint);
        }
 }
 
@@ -684,10 +695,10 @@ static void
 free_mapped_segment_cb(void *arg, uint64_t offset, uint64_t size)
 {
        vdev_t *vd = arg;
-       vdev_indirect_mark_obsolete(vd, offset, size,
-           vd->vdev_spa->spa_syncing_txg);
+       vdev_indirect_mark_obsolete(vd, offset, size);
+       boolean_t checkpoint = B_FALSE;
        vdev_indirect_ops.vdev_op_remap(vd, offset, size,
-           metaslab_free_impl_cb, &vd->vdev_spa->spa_syncing_txg);
+           metaslab_free_impl_cb, &checkpoint);
 }
 
 /*
@@ -1363,7 +1374,7 @@ spa_vdev_remove_thread(void *arg)
                 * Assert nothing in flight -- ms_*tree is empty.
                 */
                for (int i = 0; i < TXG_SIZE; i++) {
-                       ASSERT0(range_tree_space(msp->ms_alloctree[i]));
+                       ASSERT0(range_tree_space(msp->ms_allocating[i]));
                }
 
                /*
@@ -1393,7 +1404,7 @@ spa_vdev_remove_thread(void *arg)
                            SM_ALLOC));
                        space_map_close(sm);
 
-                       range_tree_walk(msp->ms_freeingtree,
+                       range_tree_walk(msp->ms_freeing,
                            range_tree_remove, svr->svr_allocd_segs);
 
                        /*
@@ -1412,7 +1423,7 @@ spa_vdev_remove_thread(void *arg)
                    msp->ms_id);
 
                while (!svr->svr_thread_exit &&
-                   range_tree_space(svr->svr_allocd_segs) != 0) {
+                   !range_tree_is_empty(svr->svr_allocd_segs)) {
 
                        mutex_exit(&svr->svr_lock);
 
@@ -1427,6 +1438,19 @@ spa_vdev_remove_thread(void *arg)
                         */
                        spa_config_exit(spa, SCL_CONFIG, FTAG);
 
+                       /*
+                        * This delay will pause the removal around the point
+                        * specified by zfs_remove_max_bytes_pause. We use this
+                        * solely in the test suite or while debugging.
+                        */
+                       uint64_t bytes_copied =
+                           spa->spa_removing_phys.sr_copied;
+                       for (int i = 0; i < TXG_SIZE; i++)
+                               bytes_copied += svr->svr_bytes_done[i];
+                       while (zfs_remove_max_bytes_pause <= bytes_copied &&
+                           !svr->svr_thread_exit)
+                               delay(hz);
+
                        mutex_enter(&vca.vca_lock);
                        while (vca.vca_outstanding_bytes >
                            zfs_remove_max_copy_bytes) {
@@ -1567,10 +1591,10 @@ spa_vdev_remove_cancel_sync(void *arg, dmu_tx_t *tx)
                 * Assert nothing in flight -- ms_*tree is empty.
                 */
                for (int i = 0; i < TXG_SIZE; i++)
-                       ASSERT0(range_tree_space(msp->ms_alloctree[i]));
+                       ASSERT0(range_tree_space(msp->ms_allocating[i]));
                for (int i = 0; i < TXG_DEFER_SIZE; i++)
-                       ASSERT0(range_tree_space(msp->ms_defertree[i]));
-               ASSERT0(range_tree_space(msp->ms_freedtree));
+                       ASSERT0(range_tree_space(msp->ms_defer[i]));
+               ASSERT0(range_tree_space(msp->ms_freed));
 
                if (msp->ms_sm != NULL) {
                        /*
@@ -1586,7 +1610,7 @@ spa_vdev_remove_cancel_sync(void *arg, dmu_tx_t *tx)
                        mutex_enter(&svr->svr_lock);
                        VERIFY0(space_map_load(msp->ms_sm,
                            svr->svr_allocd_segs, SM_ALLOC));
-                       range_tree_walk(msp->ms_freeingtree,
+                       range_tree_walk(msp->ms_freeing,
                            range_tree_remove, svr->svr_allocd_segs);
 
                        /*
@@ -1662,7 +1686,8 @@ spa_vdev_remove_cancel(spa_t *spa)
        uint64_t vdid = spa->spa_vdev_removal->svr_vdev_id;
 
        int error = dsl_sync_task(spa->spa_name, spa_vdev_remove_cancel_check,
-           spa_vdev_remove_cancel_sync, NULL, 0, ZFS_SPACE_CHECK_NONE);
+           spa_vdev_remove_cancel_sync, NULL, 0,
+           ZFS_SPACE_CHECK_EXTRA_RESERVED);
 
        if (error == 0) {
                spa_config_enter(spa, SCL_ALLOC | SCL_VDEV, FTAG, RW_WRITER);
@@ -1999,6 +2024,17 @@ spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare)
        if (!locked)
                txg = spa_vdev_enter(spa);
 
+       ASSERT(MUTEX_HELD(&spa_namespace_lock));
+       if (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) {
+               error = (spa_has_checkpoint(spa)) ?
+                   ZFS_ERR_CHECKPOINT_EXISTS : ZFS_ERR_DISCARDING_CHECKPOINT;
+
+               if (!locked)
+                       return (spa_vdev_exit(spa, NULL, txg, error));
+
+               return (error);
+       }
+
        vd = spa_lookup_by_guid(spa, guid, B_FALSE);
 
        if (spa->spa_spares.sav_vdevs != NULL &&
@@ -2111,6 +2147,13 @@ module_param(vdev_removal_max_span, int, 0644);
 MODULE_PARM_DESC(vdev_removal_max_span,
        "Largest span of free chunks a remap segment can span");
 
+/* BEGIN CSTYLED */
+module_param(zfs_remove_max_bytes_pause, ulong, 0644);
+MODULE_PARM_DESC(zfs_remove_max_bytes_pause,
+       "Pause device removal after this many bytes are copied "
+       "(debug use only - causes removal to hang)");
+/* END CSTYLED */
+
 EXPORT_SYMBOL(free_from_removing_vdev);
 EXPORT_SYMBOL(spa_removal_get_stats);
 EXPORT_SYMBOL(spa_remove_init);
index dad09da50d2abbd72bf91a697136a258b62bba3a..4751386217185710d8c38d31f3303453fa5f9c64 100644 (file)
@@ -1142,7 +1142,7 @@ zcp_eval(const char *poolname, const char *program, boolean_t sync,
 
        if (sync) {
                err = dsl_sync_task(poolname, NULL,
-                   zcp_eval_sync, &evalargs, 0, ZFS_SPACE_CHECK_NONE);
+                   zcp_eval_sync, &evalargs, 0, ZFS_SPACE_CHECK_ZCP_EVAL);
                if (err != 0)
                        zcp_pool_error(&evalargs, poolname);
        } else {
index 196a3d4b754cc5b36defe9aefc940d299a22066f..e089666f201812a74397c2795e4ae3586217c61c 100644 (file)
@@ -110,7 +110,7 @@ static zcp_synctask_info_t zcp_synctask_destroy_info = {
            {.za_name = "defer", .za_lua_type = LUA_TBOOLEAN},
            {NULL, 0}
        },
-       .space_check = ZFS_SPACE_CHECK_NONE,
+       .space_check = ZFS_SPACE_CHECK_DESTROY,
        .blocks_modified = 0
 };
 
@@ -303,10 +303,9 @@ zcp_synctask_wrapper(lua_State *state)
        zcp_parse_args(state, info->name, info->pargs, info->kwargs);
 
        err = 0;
-       if (info->space_check != ZFS_SPACE_CHECK_NONE && funcspace > 0) {
-               uint64_t quota = dsl_pool_adjustedsize(dp,
-                   info->space_check == ZFS_SPACE_CHECK_RESERVED) -
-                   metaslab_class_get_deferred(spa_normal_class(dp->dp_spa));
+       if (info->space_check != ZFS_SPACE_CHECK_NONE) {
+               uint64_t quota = dsl_pool_unreserved_space(dp,
+                   info->space_check);
                uint64_t used = dsl_dir_phys(dp->dp_root_dir)->dd_used_bytes +
                    ri->zri_space_used;
 
index f95b77db72eecf1f9a0ce75e0248cd4c6ac9d8ed..e70207aa50ea4a871b4a5a2e3001b0ae9f77cdc3 100644 (file)
@@ -3730,6 +3730,29 @@ zfs_ioc_channel_program(const char *poolname, nvlist_t *innvl,
            nvarg, outnvl));
 }
 
+/*
+ * innvl: unused
+ * outnvl: empty
+ */
+/* ARGSUSED */
+static int
+zfs_ioc_pool_checkpoint(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl)
+{
+       return (spa_checkpoint(poolname));
+}
+
+/*
+ * innvl: unused
+ * outnvl: empty
+ */
+/* ARGSUSED */
+static int
+zfs_ioc_pool_discard_checkpoint(const char *poolname, nvlist_t *innvl,
+    nvlist_t *outnvl)
+{
+       return (spa_checkpoint_discard(poolname));
+}
+
 /*
  * inputs:
  * zc_name             name of dataset to destroy
@@ -6422,6 +6445,15 @@ zfs_ioctl_init(void)
            POOL_NAME, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE,
            B_TRUE);
 
+       zfs_ioctl_register("zpool_checkpoint", ZFS_IOC_POOL_CHECKPOINT,
+           zfs_ioc_pool_checkpoint, zfs_secpolicy_config, POOL_NAME,
+           POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE);
+
+       zfs_ioctl_register("zpool_discard_checkpoint",
+           ZFS_IOC_POOL_DISCARD_CHECKPOINT, zfs_ioc_pool_discard_checkpoint,
+           zfs_secpolicy_config, POOL_NAME,
+           POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE);
+
        /* IOCTLS that use the legacy function signature */
 
        zfs_ioctl_register_legacy(ZFS_IOC_POOL_FREEZE, zfs_ioc_pool_freeze,
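The two ioctls registered above are the kernel half of the new checkpoint subcommands. A quick usage sketch, with the commands taken from the tests added later in this change:

    log_must zpool checkpoint $TESTPOOL                        # ZFS_IOC_POOL_CHECKPOINT
    log_must zpool checkpoint -d $TESTPOOL                     # ZFS_IOC_POOL_DISCARD_CHECKPOINT
    log_must zpool export $TESTPOOL
    log_must zpool import --rewind-to-checkpoint $TESTPOOL
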
index e8adc6d9986483275ea9f0f97f75e48d7ecb70df..d0b1c1d14896b5eb5ad2a38b6cd795013827d5a3 100644 (file)
@@ -29,6 +29,7 @@
 
 #include <sys/zfs_context.h>
 #include <sys/spa.h>
+#include <sys/spa_impl.h>
 #include <sys/dmu.h>
 #include <sys/zap.h>
 #include <sys/arc.h>
@@ -430,6 +431,35 @@ done:
        return (error);
 }
 
+/* ARGSUSED */
+static int
+zil_clear_log_block(zilog_t *zilog, blkptr_t *bp, void *tx, uint64_t first_txg)
+{
+       ASSERT(!BP_IS_HOLE(bp));
+
+       /*
+        * As we call this function from the context of a rewind to a
+        * checkpoint, each ZIL block whose txg is later than the txg
+        * that we rewind to is invalid. Thus, we return -1 so
+        * zil_parse() doesn't attempt to read it.
+        */
+       if (bp->blk_birth >= first_txg)
+               return (-1);
+
+       if (zil_bp_tree_add(zilog, bp) != 0)
+               return (0);
+
+       zio_free(zilog->zl_spa, first_txg, bp);
+       return (0);
+}
+
+/* ARGSUSED */
+static int
+zil_noop_log_record(zilog_t *zilog, lr_t *lrc, void *tx, uint64_t first_txg)
+{
+       return (0);
+}
+
 static int
 zil_claim_log_block(zilog_t *zilog, blkptr_t *bp, void *tx, uint64_t first_txg)
 {
@@ -476,7 +506,7 @@ zil_claim_log_record(zilog_t *zilog, lr_t *lrc, void *tx, uint64_t first_txg)
 static int
 zil_free_log_block(zilog_t *zilog, blkptr_t *bp, void *tx, uint64_t claim_txg)
 {
-       zio_free_zil(zilog->zl_spa, dmu_tx_get_txg(tx), bp);
+       zio_free(zilog->zl_spa, dmu_tx_get_txg(tx), bp);
 
        return (0);
 }
@@ -662,7 +692,7 @@ zil_create(zilog_t *zilog)
                txg = dmu_tx_get_txg(tx);
 
                if (!BP_IS_HOLE(&blk)) {
-                       zio_free_zil(zilog->zl_spa, txg, &blk);
+                       zio_free(zilog->zl_spa, txg, &blk);
                        BP_ZERO(&blk);
                }
 
@@ -767,8 +797,8 @@ int
 zil_claim(dsl_pool_t *dp, dsl_dataset_t *ds, void *txarg)
 {
        dmu_tx_t *tx = txarg;
-       uint64_t first_txg = dmu_tx_get_txg(tx);
        zilog_t *zilog;
+       uint64_t first_txg;
        zil_header_t *zh;
        objset_t *os;
        int error;
@@ -790,10 +820,43 @@ zil_claim(dsl_pool_t *dp, dsl_dataset_t *ds, void *txarg)
 
        zilog = dmu_objset_zil(os);
        zh = zil_header_in_syncing_context(zilog);
+       ASSERT3U(tx->tx_txg, ==, spa_first_txg(zilog->zl_spa));
+       first_txg = spa_min_claim_txg(zilog->zl_spa);
 
-       if (spa_get_log_state(zilog->zl_spa) == SPA_LOG_CLEAR) {
-               if (!BP_IS_HOLE(&zh->zh_log))
-                       zio_free_zil(zilog->zl_spa, first_txg, &zh->zh_log);
+       /*
+        * If the spa_log_state is not set to be cleared, check whether
+        * the current uberblock is a checkpointed one and whether the
+        * current ZIL header has already been claimed before moving on.
+        *
+        * If the current uberblock is a checkpointed uberblock then
+        * one of the following scenarios took place:
+        *
+        * 1] We are currently rewinding to the checkpoint of the pool.
+        * 2] We crashed in the middle of a checkpoint rewind but we
+        *    did manage to write the checkpointed uberblock to the
+        *    vdev labels, so when we tried to import the pool again
+        *    the checkpointed uberblock was selected from the import
+        *    procedure.
+        *
+        * In both cases we want to zero out all the ZIL blocks, except
+        * the ones that have been claimed at the time of the checkpoint
+        * (their zh_claim_txg != 0). The reason is that these blocks
+        * may be corrupted since we may have reused their locations on
+        * disk after we took the checkpoint.
+        *
+        * We could try to set spa_log_state to SPA_LOG_CLEAR earlier
+        * when we first figure out whether the current uberblock is
+        * checkpointed or not. Unfortunately, that would discard all
+        * the logs, including the ones that are claimed, and we would
+        * leak space.
+        */
+       if (spa_get_log_state(zilog->zl_spa) == SPA_LOG_CLEAR ||
+           (zilog->zl_spa->spa_uberblock.ub_checkpoint_txg != 0 &&
+           zh->zh_claim_txg == 0)) {
+               if (!BP_IS_HOLE(&zh->zh_log)) {
+                       (void) zil_parse(zilog, zil_clear_log_block,
+                           zil_noop_log_record, tx, first_txg, B_FALSE);
+               }
                BP_ZERO(&zh->zh_log);
                if (os->os_encrypted)
                        os->os_next_write_raw[tx->tx_txg & TXG_MASK] = B_TRUE;
@@ -802,6 +865,12 @@ zil_claim(dsl_pool_t *dp, dsl_dataset_t *ds, void *txarg)
                return (0);
        }
 
+       /*
+        * If we are not rewinding and opening the pool normally, then
+        * the min_claim_txg should be equal to the first txg of the pool.
+        */
+       ASSERT3U(first_txg, ==, spa_first_txg(zilog->zl_spa));
+
        /*
         * Claim all log blocks if we haven't already done so, and remember
         * the highest claimed sequence number.  This ensures that if we can
@@ -855,16 +924,17 @@ zil_check_log_chain(dsl_pool_t *dp, dsl_dataset_t *ds, void *tx)
        zilog = dmu_objset_zil(os);
        bp = (blkptr_t *)&zilog->zl_header->zh_log;
 
-       /*
-        * Check the first block and determine if it's on a log device
-        * which may have been removed or faulted prior to loading this
-        * pool.  If so, there's no point in checking the rest of the log
-        * as its content should have already been synced to the pool.
-        */
        if (!BP_IS_HOLE(bp)) {
                vdev_t *vd;
                boolean_t valid = B_TRUE;
 
+               /*
+                * Check the first block and determine if it's on a log device
+                * which may have been removed or faulted prior to loading this
+                * pool.  If so, there's no point in checking the rest of the
+                * log as its content should have already been synced to the
+                * pool.
+                */
                spa_config_enter(os->os_spa, SCL_STATE, FTAG, RW_READER);
                vd = vdev_lookup_top(os->os_spa, DVA_GET_VDEV(&bp->blk_dva[0]));
                if (vd->vdev_islog && vdev_is_dead(vd))
@@ -873,6 +943,18 @@ zil_check_log_chain(dsl_pool_t *dp, dsl_dataset_t *ds, void *tx)
 
                if (!valid)
                        return (0);
+
+               /*
+                * Check whether the current uberblock is checkpointed (e.g.
+                * we are rewinding) and whether the current header has been
+                * claimed or not. If it hasn't then skip verifying it. We
+                * do this because its ZIL blocks may be part of the pool's
+                * state before the rewind, which is no longer valid.
+                */
+               zil_header_t *zh = zil_header_in_syncing_context(zilog);
+               if (zilog->zl_spa->spa_uberblock.ub_checkpoint_txg != 0 &&
+                   zh->zh_claim_txg == 0)
+                       return (0);
        }
 
        /*
@@ -883,8 +965,8 @@ zil_check_log_chain(dsl_pool_t *dp, dsl_dataset_t *ds, void *tx)
         * which will update spa_max_claim_txg.  See spa_load() for details.
         */
        error = zil_parse(zilog, zil_claim_log_block, zil_claim_log_record, tx,
-           zilog->zl_header->zh_claim_txg ? -1ULL : spa_first_txg(os->os_spa),
-           B_FALSE);
+           zilog->zl_header->zh_claim_txg ? -1ULL :
+           spa_min_claim_txg(os->os_spa), B_FALSE);
 
        return ((error == ECKSUM || error == ENOENT) ? 0 : error);
 }
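At the user level, the ZIL changes above mean that synchronous writes issued after a checkpoint must not reappear after a rewind. A rough test-style sketch with hypothetical dataset and file names, in the style of the pool_checkpoint tests below:

    log_must zfs create $TESTPOOL/fs
    log_must zpool checkpoint $TESTPOOL
    log_must dd if=/dev/urandom of=/$TESTPOOL/fs/after_ckpt bs=128k count=8 conv=fsync
    log_must zpool export $TESTPOOL
    log_must zpool import --rewind-to-checkpoint $TESTPOOL
    log_mustnot [ -f /$TESTPOOL/fs/after_ckpt ]   # post-checkpoint writes are gone
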
index 8a495988b2adc385b9abdbc973505b903f8277a3..9a98d4fc0a405dd995a0fe4e01e6aeda2eaf26f9 100644 (file)
@@ -1147,8 +1147,9 @@ zio_claim(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
         * starts allocating blocks -- so that nothing is allocated twice.
         * If txg == 0 we just verify that the block is claimable.
         */
-       ASSERT3U(spa->spa_uberblock.ub_rootbp.blk_birth, <, spa_first_txg(spa));
-       ASSERT(txg == spa_first_txg(spa) || txg == 0);
+       ASSERT3U(spa->spa_uberblock.ub_rootbp.blk_birth, <,
+           spa_min_claim_txg(spa));
+       ASSERT(txg == spa_min_claim_txg(spa) || txg == 0);
        ASSERT(!BP_GET_DEDUP(bp) || !spa_writeable(spa));       /* zdb(1M) */
 
        zio = zio_create(pio, spa, txg, bp, NULL, BP_GET_PSIZE(bp),
@@ -3457,18 +3458,6 @@ zio_alloc_zil(spa_t *spa, objset_t *os, uint64_t txg, blkptr_t *new_bp,
        return (error);
 }
 
-/*
- * Free an intent log block.
- */
-void
-zio_free_zil(spa_t *spa, uint64_t txg, blkptr_t *bp)
-{
-       ASSERT(BP_GET_TYPE(bp) == DMU_OT_INTENT_LOG);
-       ASSERT(!BP_IS_GANG(bp));
-
-       zio_free(spa, txg, bp);
-}
-
 /*
  * ==========================================================================
  * Read and write to physical devices
index dc0f6d98349597daf489578c919c21d223dffafc..1c4a8e02cbbeafad76c582dd3343aca6ddb06b7b 100644 (file)
@@ -235,8 +235,6 @@ zthr_destroy(zthr_t *t)
 void
 zthr_wakeup(zthr_t *t)
 {
-       ASSERT3P(t->zthr_thread, !=, NULL);
-
        mutex_enter(&t->zthr_lock);
        cv_broadcast(&t->zthr_cv);
        mutex_exit(&t->zthr_lock);
index 3d3ef0afa655e53097529efcff3859e87285c4ed..bd301e3288df2faaa9558ceb98a55ab381a0e4e7 100644 (file)
@@ -627,6 +627,17 @@ tests = ['online_offline_001_pos', 'online_offline_002_neg',
     'online_offline_003_neg']
 tags = ['functional', 'online_offline']
 
+[tests/functional/pool_checkpoint]
+tests = ['checkpoint_after_rewind', 'checkpoint_big_rewind',
+    'checkpoint_capacity', 'checkpoint_conf_change', 'checkpoint_discard',
+    'checkpoint_discard_busy', 'checkpoint_discard_many',
+    'checkpoint_indirect', 'checkpoint_invalid', 'checkpoint_lun_expsz',
+    'checkpoint_open', 'checkpoint_removal', 'checkpoint_rewind',
+    'checkpoint_ro_rewind', 'checkpoint_sm_scale', 'checkpoint_twice',
+    'checkpoint_vdev_add', 'checkpoint_zdb', 'checkpoint_zhack_feat']
+tags = ['functional', 'pool_checkpoint']
+timeout = 1800
+
 [tests/functional/pool_names]
 tests = ['pool_names_001_pos', 'pool_names_002_neg']
 pre =
diff --git a/tests/runfiles/longevity.run b/tests/runfiles/longevity.run
new file mode 100644 (file)
index 0000000..fde2ef6
--- /dev/null
@@ -0,0 +1,23 @@
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source.  A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright (c) 2017 by Delphix. All rights reserved.
+#
+
+[DEFAULT]
+quiet = False
+user = root
+timeout = 10800
+outputdir = /var/tmp/test_results
+
+[/opt/zfs-tests/tests/longevity]
+tests = ['slop_space_test']
index 0673fcbf62179a2ea1d679203eb8156c81ff6a23..751836ae4a30952179079d111c98ed28b9e21ead 100644 (file)
@@ -18,6 +18,7 @@ SUBDIRS = \
        mmapwrite \
        nvlist_to_lua \
        randfree_file \
+       randwritecomp \
        readmmap \
        rename_dir \
        rm_lnkcnt_zero_file \
diff --git a/tests/zfs-tests/cmd/randwritecomp/.gitignore b/tests/zfs-tests/cmd/randwritecomp/.gitignore
new file mode 100644 (file)
index 0000000..fb231c6
--- /dev/null
@@ -0,0 +1 @@
+/randwritecomp
diff --git a/tests/zfs-tests/cmd/randwritecomp/Makefile.am b/tests/zfs-tests/cmd/randwritecomp/Makefile.am
new file mode 100644 (file)
index 0000000..0002291
--- /dev/null
@@ -0,0 +1,9 @@
+include $(top_srcdir)/config/Rules.am
+
+pkgexecdir = $(datadir)/@PACKAGE@/zfs-tests/bin
+
+DEFAULT_INCLUDES += \
+       -I$(top_srcdir)/include
+
+pkgexec_PROGRAMS = randwritecomp
+randwritecomp_SOURCES = randwritecomp.c
diff --git a/tests/zfs-tests/cmd/randwritecomp/randwritecomp.c b/tests/zfs-tests/cmd/randwritecomp/randwritecomp.c
new file mode 100644 (file)
index 0000000..708d5ee
--- /dev/null
@@ -0,0 +1,194 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source.  A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright (c) 2017 by Delphix. All rights reserved.
+ */
+
+/*
+ * The following is defined so the source can use
+ * lrand48() and srand48().
+ */
+#define        __EXTENSIONS__
+
+#include <stdint.h>
+#include <string.h>
+#include "../file_common.h"
+
+/*
+ * The following sample was derived from real-world data
+ * of a production Oracle database.
+ */
+static uint64_t size_distribution[] = {
+       0,
+       1499018,
+       352084,
+       1503485,
+       4206227,
+       5626657,
+       5387001,
+       3733756,
+       2233094,
+       874652,
+       238635,
+       81434,
+       33357,
+       13106,
+       2009,
+       1,
+       23660,
+};
+
+
+static uint64_t distribution_n;
+
+static uint8_t randbuf[BLOCKSZ];
+
+static void
+rwc_pwrite(int fd, const void *buf, size_t nbytes, off_t offset)
+{
+       size_t nleft = nbytes;
+       ssize_t nwrite = 0;
+
+       nwrite = pwrite(fd, buf, nbytes, offset);
+       if (nwrite < 0) {
+               perror("pwrite");
+               exit(EXIT_FAILURE);
+       }
+
+       nleft -= nwrite;
+       if (nleft != 0) {
+               (void) fprintf(stderr, "warning: pwrite: "
+                   "wrote %zu out of %zu bytes\n",
+                   (nbytes - nleft), nbytes);
+       }
+}
+
+static void
+fillbuf(char *buf)
+{
+       uint64_t rv = lrand48() % distribution_n;
+       uint64_t sum = 0;
+
+       uint64_t i;
+       for (i = 0;
+           i < sizeof (size_distribution) / sizeof (size_distribution[0]);
+           i++) {
+               sum += size_distribution[i];
+               if (rv < sum)
+                       break;
+       }
+
+       bcopy(randbuf, buf, BLOCKSZ);
+       if (i == 0)
+               bzero(buf, BLOCKSZ - 10);
+       else if (i < 16)
+               bzero(buf, BLOCKSZ - i * 512 + 256);
+       /*LINTED: E_BAD_PTR_CAST_ALIGN*/
+       ((uint32_t *)buf)[0] = lrand48();
+}
+
+static void
+exit_usage(void)
+{
+       (void) printf("usage: ");
+       (void) printf("randwritecomp [-s] <file> [nwrites]\n");
+       exit(EXIT_FAILURE);
+}
+
+static void
+sequential_writes(int fd, char *buf, uint64_t nblocks, int64_t n)
+{
+       for (int64_t i = 0; n == -1 || i < n; i++) {
+               fillbuf(buf);
+
+               static uint64_t j = 0;
+               if (j == 0)
+                       j = lrand48() % nblocks;
+               rwc_pwrite(fd, buf, BLOCKSZ, j * BLOCKSZ);
+               j++;
+               if (j >= nblocks)
+                       j = 0;
+       }
+}
+
+static void
+random_writes(int fd, char *buf, uint64_t nblocks, int64_t n)
+{
+       for (int64_t i = 0; n == -1 || i < n; i++) {
+               fillbuf(buf);
+               rwc_pwrite(fd, buf, BLOCKSZ, (lrand48() % nblocks) * BLOCKSZ);
+       }
+}
+
+int
+main(int argc, char *argv[])
+{
+       int fd, err;
+       char *filename = NULL;
+       char buf[BLOCKSZ];
+       struct stat ss;
+       uint64_t nblocks;
+       int64_t n = -1;
+       int sequential = 0;
+
+       if (argc < 2)
+               exit_usage();
+
+       argv++;
+       if (strcmp("-s", argv[0]) == 0) {
+               sequential = 1;
+               argv++;
+       }
+
+       if (argv[0] == NULL)
+               exit_usage();
+       else
+               filename = argv[0];
+
+       argv++;
+       if (argv[0] != NULL)
+               n = strtoull(argv[0], NULL, 0);
+
+       fd = open(filename, O_RDWR|O_CREAT, 0666);
+       err = fstat(fd, &ss);
+       if (err != 0) {
+               (void) fprintf(stderr,
+                   "error: fstat returned error code %d\n", err);
+               exit(EXIT_FAILURE);
+       }
+
+       nblocks = ss.st_size / BLOCKSZ;
+       if (nblocks == 0) {
+               (void) fprintf(stderr, "error: "
+                   "file is too small (min allowed size is %d bytes)\n",
+                   BLOCKSZ);
+               exit(EXIT_FAILURE);
+       }
+
+       srand48(getpid());
+       for (int i = 0; i < BLOCKSZ; i++)
+               randbuf[i] = lrand48();
+
+       distribution_n = 0;
+       for (uint64_t i = 0;
+           i < sizeof (size_distribution) / sizeof (size_distribution[0]);
+           i++) {
+               distribution_n += size_distribution[i];
+       }
+
+       if (sequential)
+               sequential_writes(fd, buf, nblocks, n);
+       else
+               random_writes(fd, buf, nblocks, n);
+
+       return (0);
+}
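As used elsewhere in this change (see checkpoint_discard_many.ksh below), randwritecomp takes a pre-created file and an optional write count; a minimal sketch:

    log_must mkfile -n 100M $FS2FILE
    log_must randwritecomp $FS2FILE 100       # 100 random-offset, compressible writes
    log_must randwritecomp -s $FS2FILE 100    # same, but at sequential offsets
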
index 50eb6bd6bc1a94e60420d92deaa52f0697d4854a..a4417b519c2056d396219a5b2c7ff29c8195dc5a 100644 (file)
@@ -169,6 +169,7 @@ export ZFSTEST_FILES='chg_usr_exec
     mmapwrite
     nvlist_to_lua
     randfree_file
+    randwritecomp
     readmmap
     rename_dir
     rm_lnkcnt_zero_file
index 11ca81985c159af68056a3032f2bec4473b166e2..73b3978942479821c955655738fb94c5e1feba85 100644 (file)
@@ -22,7 +22,7 @@
 #
 # Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 # Use is subject to license terms.
-# Copyright (c) 2012, 2016 by Delphix. All rights reserved.
+# Copyright (c) 2012, 2017 by Delphix. All rights reserved.
 # Copyright 2016 Nexenta Systems, Inc.
 # Copyright (c) 2017 Lawrence Livermore National Security, LLC.
 # Copyright (c) 2017 Datto Inc.
@@ -3525,3 +3525,21 @@ function mdb_set_uint32
 
        return 0
 }
+
+#
+# Set global scalar integer variable to a hex value using mdb.
+# Note: Target should have CTF data loaded.
+#
+function mdb_ctf_set_int
+{
+       typeset variable=$1
+       typeset value=$2
+
+       mdb -kw -e "$variable/z $value" > /dev/null
+       if [[ $? -ne 0 ]]; then
+               echo "Failed to set '$variable' to '$value' in mdb."
+               return 1
+       fi
+
+       return 0
+}
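A hypothetical invocation of the helper above; the variable name and value are for illustration only, and the target kernel must have CTF data loaded as noted:

    log_must mdb_ctf_set_int zfs_spa_discard_memory_limit 0x1000000   # 16M
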
index 95d3aec97bea27b5950636002b23bfbe35081e03..5e877c1bf3abc74ad1f0055dd05b5c8ef48030d5 100644 (file)
@@ -42,6 +42,7 @@ SUBDIRS = \
        no_space \
        nopwrite \
        online_offline \
+       pool_checkpoint \
        pool_names \
        poolversion \
        privilege \
index 2a2a329f30a8d9a898fc90b0d297e671acb9ceb0..a5f827b5642ff1c6139559cdc3844884eac292ce 100755 (executable)
@@ -26,7 +26,7 @@
 #
 
 #
-# Copyright (c) 2012, 2016 by Delphix. All rights reserved.
+# Copyright (c) 2012, 2017 by Delphix. All rights reserved.
 #
 
 . $STF_SUITE/include/libtest.shlib
@@ -56,7 +56,7 @@ set -A args "create" "add" "destroy" "import fakepool" \
     "add mirror fakepool" "add raidz fakepool" \
     "add raidz1 fakepool" "add raidz2 fakepool" \
     "setvprop" "blah blah" "-%" "--?" "-*" "-=" \
-    "-a" "-f" "-g" "-h" "-j" "-k" "-m" "-n" "-o" "-p" \
+    "-a" "-f" "-g" "-h" "-j" "-m" "-n" "-o" "-p" \
     "-p /tmp" "-r" "-t" "-w" "-x" "-y" "-z" \
     "-D" "-E" "-G" "-H" "-I" "-J" "-K" "-M" \
     "-N" "-Q" "-R" "-S" "-T" "-W" "-Y" "-Z"
index 2ea82f0f6979f44cf20b562070abb0d4eb705edb..fb389cb1024a1b251f222699dfb76b93a4fccff5 100644 (file)
@@ -41,6 +41,7 @@ typeset -a properties=(
     "delegation"
     "autoreplace"
     "cachefile"
+    "checkpoint"
     "failmode"
     "listsnapshots"
     "autoexpand"
@@ -72,6 +73,7 @@ typeset -a properties=(
     "feature@edonr"
     "feature@device_removal"
     "feature@obsolete_counts"
+    "feature@zpool_checkpoint"
 )
 
 # Additional properties added for Linux.
index ddce864a64a352ca751266b3c62313641e47d6ac..82900f4ee346b857a077acd560ce7fb9bfcc5a01 100755 (executable)
@@ -12,7 +12,7 @@
 #
 
 #
-# Copyright (c) 2016 by Delphix. All rights reserved.
+# Copyright (c) 2017 by Delphix. All rights reserved.
 #
 
 . $STF_SUITE/tests/functional/cli_root/zpool_import/zpool_import.kshlib
 #      4. Take a snapshot to make sure old blocks are not overwritten.
 #      5. Perform zpool add/attach/detach/remove operation.
 #      6. Change device paths if requested and re-import pool.
-#      7. Overwrite the files.
-#      8. Export the pool.
-#      9. Verify that we can rewind the pool to the noted txg.
-#      10. Verify that the files are readable and retain their old data.
+#      7. Checkpoint the pool as one last attempt to preserve old blocks.
+#      8. Overwrite the files.
+#      9. Export the pool.
+#      10. Verify that we can rewind the pool to the noted txg.
+#      11. Verify that the files are readable and retain their old data.
 #
 # DISCLAIMER:
 #      This test can fail since nothing guarantees that old MOS blocks aren't
@@ -47,6 +48,7 @@ function custom_cleanup
 {
        set_vdev_validate_skip 0
        cleanup
+       log_must set_tunable64 vdev_min_ms_count 16
 }
 
 log_onexit custom_cleanup
@@ -76,8 +78,8 @@ function test_common
        #
        # Perform config change operations
        #
-       if [[ -n $addvdev ]]; then
-               log_must zpool add -f $TESTPOOL1 $addvdev
+       if [[ -n $addvdevs ]]; then
+               log_must zpool add -f $TESTPOOL1 $addvdevs
        fi
        if [[ -n $attachargs ]]; then
                log_must zpool attach $TESTPOOL1 $attachargs
@@ -104,6 +106,22 @@ function test_common
                zpool import -d $DEVICE_DIR $TESTPOOL1
        fi
 
+       #
+       # In an attempt to leave MOS data untouched so that extreme
+       # rewind is successful during import, we checkpoint the
+       # pool and hope that this MOS data is part of the
+       # checkpoint (i.e. it stays around). If this goes as
+       # expected, then extreme rewind should rewind back even
+       # further than the time that we took the checkpoint.
+       #
+       # Note that ideally we would want to take a checkpoint
+       # right after we record the txg we plan to rewind to.
+       # But since we can't attach, detach or remove devices
+       # while a checkpoint exists, we take it after the
+       # operation that changes the config.
+       #
+       log_must zpool checkpoint $TESTPOOL1
+
        log_must overwrite_data $TESTPOOL1 ""
 
        log_must zpool export $TESTPOOL1
@@ -188,6 +206,10 @@ is_linux && log_must set_tunable32 zfs_txg_history 100
 # Make the devices bigger to reduce chances of overwriting MOS metadata.
 increase_device_sizes $(( FILE_SIZE * 4 ))
 
+# Increase the number of metaslabs for small pools temporarily to
+# reduce the chance of reusing a metaslab that holds old MOS metadata.
+log_must set_tunable64 vdev_min_ms_count 150
+
 # Part of the rewind test is to see how it reacts to path changes
 typeset pathstochange="$VDEV0 $VDEV1 $VDEV2 $VDEV3"
 
index 4761bacffd8ca3f817e945567ec1a66bdc720ce9..e72ca2157f108fa734e4be0359565a259e485fac 100755 (executable)
@@ -33,7 +33,7 @@
 verify_runnable "global"
 
 function get_txg {
-       typeset -i txg=$(zdb -u $1 | sed -n 's/^.*txg = \(.*\)$/\1/p')
+       typeset -i txg=$(zdb -u $1 | sed -n 's/^[       ][      ]*txg = \(.*\)$/\1/p')
        echo $txg
 }
 
diff --git a/tests/zfs-tests/tests/functional/pool_checkpoint/Makefile.am b/tests/zfs-tests/tests/functional/pool_checkpoint/Makefile.am
new file mode 100644 (file)
index 0000000..cc1c118
--- /dev/null
@@ -0,0 +1,26 @@
+pkgdatadir = $(datadir)/@PACKAGE@/zfs-tests/tests/functional/pool_checkpoint
+dist_pkgdata_SCRIPTS = \
+       cleanup.ksh \
+       setup.ksh \
+       checkpoint_after_rewind.ksh \
+       checkpoint_big_rewind.ksh \
+       checkpoint_capacity.ksh \
+       checkpoint_conf_change.ksh \
+       checkpoint_discard_busy.ksh \
+       checkpoint_discard.ksh \
+       checkpoint_discard_many.ksh \
+       checkpoint_indirect.ksh \
+       checkpoint_invalid.ksh \
+       checkpoint_lun_expsz.ksh \
+       checkpoint_open.ksh \
+       checkpoint_removal.ksh \
+       checkpoint_rewind.ksh \
+       checkpoint_ro_rewind.ksh \
+       checkpoint_sm_scale.ksh \
+       checkpoint_twice.ksh \
+       checkpoint_vdev_add.ksh \
+       checkpoint_zdb.ksh \
+       checkpoint_zhack_feat.ksh
+
+dist_pkgdata_DATA = \
+       pool_checkpoint.kshlib
diff --git a/tests/zfs-tests/tests/functional/pool_checkpoint/checkpoint_after_rewind.ksh b/tests/zfs-tests/tests/functional/pool_checkpoint/checkpoint_after_rewind.ksh
new file mode 100755 (executable)
index 0000000..c1dec30
--- /dev/null
@@ -0,0 +1,55 @@
+#!/bin/ksh -p
+
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source.  A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright (c) 2017 by Delphix. All rights reserved.
+#
+
+. $STF_SUITE/tests/functional/pool_checkpoint/pool_checkpoint.kshlib
+
+#
+# DESCRIPTION:
+#      Ensure that we can checkpoint a pool that we just rewound.
+#
+# STRATEGY:
+#      1. Create pool
+#      2. Populate it
+#      3. Take checkpoint
+#      4. Modify data (include at least one destructive change)
+#      5. Rewind to checkpoint
+#      6. Verify that the data before the checkpoint are present
+#         and the data after the checkpoint are gone
+#      7. Take another checkpoint
+#      8. Change state again
+#      9. Verify the state at that time
+#
+
+verify_runnable "global"
+
+setup_test_pool
+log_onexit cleanup_test_pool
+
+populate_test_pool
+log_must zpool checkpoint $TESTPOOL
+test_change_state_after_checkpoint
+
+log_must zpool export $TESTPOOL
+log_must zpool import --rewind-to-checkpoint $TESTPOOL
+test_verify_pre_checkpoint_state
+
+log_must zpool checkpoint $TESTPOOL
+test_change_state_after_checkpoint
+
+test_verify_post_checkpoint_state
+
+log_pass "Checkpoint a pool that we just rewound."
diff --git a/tests/zfs-tests/tests/functional/pool_checkpoint/checkpoint_big_rewind.ksh b/tests/zfs-tests/tests/functional/pool_checkpoint/checkpoint_big_rewind.ksh
new file mode 100755 (executable)
index 0000000..f915d2a
--- /dev/null
@@ -0,0 +1,57 @@
+#!/bin/ksh -p
+
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source.  A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright (c) 2017, 2018 by Delphix. All rights reserved.
+#
+
+. $STF_SUITE/tests/functional/pool_checkpoint/pool_checkpoint.kshlib
+
+#
+# DESCRIPTION:
+#      Rewind to checkpoint on a stressed pool. We basically try to
+#      fragment the pool before and after taking a checkpoint and
+#      see if zdb finds any checksum or other errors that imply that
+#      blocks from the checkpoint have been reused.
+#
+# STRATEGY:
+#      1. Import pool that's slightly fragmented
+#      2. Take checkpoint
+#      3. Apply a destructive action and do more random writes
+#      4. Run zdb on both current and checkpointed data and make
+#         sure that zdb returns with no errors
+#      5. Rewind to checkpoint
+#      6. Run zdb again
+#
+
+verify_runnable "global"
+
+setup_nested_pool_state
+log_onexit cleanup_nested_pools
+
+log_must zpool checkpoint $NESTEDPOOL
+
+#
+# Destroy one dataset, modify an existing one and create a
+# new one. Do more random writes in an attempt to raise
+# more fragmentation. Then verify both current and checkpointed
+# states.
+#
+fragment_after_checkpoint_and_verify
+
+log_must zpool export $NESTEDPOOL
+log_must zpool import -d $FILEDISKDIR --rewind-to-checkpoint $NESTEDPOOL
+
+log_must zdb $NESTEDPOOL
+
+log_pass "Rewind to checkpoint on a stressed pool."
diff --git a/tests/zfs-tests/tests/functional/pool_checkpoint/checkpoint_capacity.ksh b/tests/zfs-tests/tests/functional/pool_checkpoint/checkpoint_capacity.ksh
new file mode 100755 (executable)
index 0000000..c473451
--- /dev/null
@@ -0,0 +1,92 @@
+#!/bin/ksh -p
+
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source.  A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright (c) 2017 by Delphix. All rights reserved.
+#
+
+. $STF_SUITE/tests/functional/pool_checkpoint/pool_checkpoint.kshlib
+
+#
+# DESCRIPTION:
+#      Ensure that we don't reuse checkpointed blocks when the
+#      pool hits ENOSPC errors because of the slop space limit.
+#      This test also ensures that the DSL layer correctly takes
+#      into account the space used by the checkpoint when deciding
+#      whether to allow operations based on the reserved slop
+#      space.
+#
+# STRATEGY:
+#      1. Create pool with one disk of 1G size
+#      2. Create a file with 700M of random data,
+#         leaving ~200M of free capacity in the pool.
+#      3. Checkpoint the pool
+#      4. Remove the file. All of its blocks should stay around
+#         in ZFS as they are part of the checkpoint.
+#      5. Create a new empty file and attempt to write ~300M
+#         of data to it. This should fail, as the reserved
+#         SLOP space for the pool should be ~128M, and we should
+#         be hitting that limit and getting ENOSPC.
+#      6. Use zdb to traverse and checksum all the checkpointed
+#         data to ensure its integrity.
+#      7. Export the pool and rewind to ensure that everything
+#         is actually there as expected.
+#
+
+function test_cleanup
+{
+       poolexists $NESTEDPOOL && destroy_pool $NESTEDPOOL
+       log_must set_tunable32 spa_asize_inflation 24
+       cleanup_test_pool
+}
+
+verify_runnable "global"
+
+setup_test_pool
+log_onexit test_cleanup
+log_must set_tunable32 spa_asize_inflation 4
+
+log_must zfs create $DISKFS
+
+log_must mkfile $FILEDISKSIZE $FILEDISK1
+log_must zpool create $NESTEDPOOL $FILEDISK1
+
+log_must zfs create -o compression=lz4 -o recordsize=8k $NESTEDFS0
+log_must dd if=/dev/urandom of=$NESTEDFS0FILE bs=1M count=700
+FILE0INTRO=$(head -c 100 $NESTEDFS0FILE)
+
+log_must zpool checkpoint $NESTEDPOOL
+log_must rm $NESTEDFS0FILE
+
+#
+# only for debugging purposes
+#
+log_must zpool list $NESTEDPOOL
+
+log_mustnot dd if=/dev/urandom of=$NESTEDFS0FILE bs=1M count=300
+
+#
+# only for debugging purposes
+#
+log_must zpool list $NESTEDPOOL
+
+log_must zdb -kc $NESTEDPOOL
+
+log_must zpool export $NESTEDPOOL
+log_must zpool import -d $FILEDISKDIR --rewind-to-checkpoint $NESTEDPOOL
+
+log_must [ "$(head -c 100 $NESTEDFS0FILE)" = "$FILE0INTRO" ]
+
+log_must zdb $NESTEDPOOL
+
+log_pass "Do not reuse checkpointed space at low capacity."
diff --git a/tests/zfs-tests/tests/functional/pool_checkpoint/checkpoint_conf_change.ksh b/tests/zfs-tests/tests/functional/pool_checkpoint/checkpoint_conf_change.ksh
new file mode 100755 (executable)
index 0000000..4f78310
--- /dev/null
@@ -0,0 +1,43 @@
+#!/bin/ksh -p
+
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source.  A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright (c) 2017 by Delphix. All rights reserved.
+#
+
+. $STF_SUITE/tests/functional/pool_checkpoint/pool_checkpoint.kshlib
+
+#
+# DESCRIPTION:
+#      It shouldn't be possible to change pool's vdev config when
+#      it has a checkpoint.
+#
+# STRATEGY:
+#      1. Create pool and take checkpoint
+#      2. Attempt to change guid
+#      3. Attempt to attach/replace/remove device
+#
+
+verify_runnable "global"
+
+setup_test_pool
+log_onexit cleanup_test_pool
+
+log_must zpool checkpoint $TESTPOOL
+
+log_mustnot zpool reguid $TESTPOOL
+log_mustnot zpool attach -f $TESTPOOL $TESTDISK $EXTRATESTDISK
+log_mustnot zpool replace $TESTPOOL $TESTDISK $EXTRATESTDISK
+log_mustnot zpool remove $TESTPOOL $TESTDISK
+
+log_pass "Cannot change pool's config when pool has checkpoint."
diff --git a/tests/zfs-tests/tests/functional/pool_checkpoint/checkpoint_discard.ksh b/tests/zfs-tests/tests/functional/pool_checkpoint/checkpoint_discard.ksh
new file mode 100755 (executable)
index 0000000..efd46a6
--- /dev/null
@@ -0,0 +1,53 @@
+#!/bin/ksh -p
+
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source.  A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright (c) 2017 by Delphix. All rights reserved.
+#
+
+. $STF_SUITE/tests/functional/pool_checkpoint/pool_checkpoint.kshlib
+
+#
+# DESCRIPTION:
+#      Ensure that we can discard the checkpoint from a pool.
+#
+# STRATEGY:
+#      1. Create pool
+#      2. Populate it
+#      3. Take checkpoint
+#      4. Modify data (include at least one destructive change)
+#      5. Discard checkpoint
+#      6. Export and attempt to rewind. Rewinding should fail
+#      7. Import pool normally and verify state
+#
+
+verify_runnable "global"
+
+setup_test_pool
+log_onexit cleanup_test_pool
+
+populate_test_pool
+
+log_must zpool checkpoint $TESTPOOL
+
+test_change_state_after_checkpoint
+
+log_must zpool checkpoint -d $TESTPOOL
+
+log_must zpool export $TESTPOOL
+log_mustnot zpool import --rewind-to-checkpoint $TESTPOOL
+
+log_must zpool import $TESTPOOL
+test_verify_post_checkpoint_state
+
+log_pass "Discard checkpoint from pool."
diff --git a/tests/zfs-tests/tests/functional/pool_checkpoint/checkpoint_discard_busy.ksh b/tests/zfs-tests/tests/functional/pool_checkpoint/checkpoint_discard_busy.ksh
new file mode 100755 (executable)
index 0000000..54dcd59
--- /dev/null
@@ -0,0 +1,106 @@
+#!/bin/ksh -p
+
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source.  A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright (c) 2017, 2018 by Delphix. All rights reserved.
+#
+
+. $STF_SUITE/tests/functional/pool_checkpoint/pool_checkpoint.kshlib
+
+#
+# DESCRIPTION:
+#      Discard checkpoint on a stressed pool. Ensure that we can
+#      export and import the pool while discarding, but cannot run any
+#      operations that have to do with the checkpoint or change the
+#      pool's config.
+#
+# STRATEGY:
+#      1. Import a pool that's slightly fragmented
+#      2. Take checkpoint
+#      3. Do more random writes to "free" checkpointed blocks
+#      4. Start discarding checkpoint
+#      5. Export pool while discarding checkpoint
+#      6. Attempt to rewind (should fail)
+#      7. Import pool and ensure that discard is still running
+#      8. Attempt to run checkpoint commands, or commands that
+#         change the pool's config (should fail)
+#
+
+verify_runnable "global"
+
+function test_cleanup
+{
+       # reset memory limit to 16M
+       set_tunable64 zfs_spa_discard_memory_limit 16777216
+       cleanup_nested_pools
+}
+
+setup_nested_pool_state
+log_onexit test_cleanup
+
+#
+# Force discard to happen slower so it spans over
+# multiple txgs.
+#
+# Set memory limit to 128 bytes. Assuming that we
+# use 64-bit words for encoding space map entries,
+# ZFS will discard 8 non-debug entries per txg
+# (so at most 16 space map entries in debug-builds
+# due to debug entries).
+#
+# That should give us more than enough txgs to be
+# discarding the checkpoint for a long time, since with
+# the current setup the checkpoint space maps should
+# have tens of thousands of entries.
+#
+set_tunable64 zfs_spa_discard_memory_limit 128
+
+log_must zpool checkpoint $NESTEDPOOL
+
+fragment_after_checkpoint_and_verify
+
+log_must zpool checkpoint -d $NESTEDPOOL
+
+log_must zpool export $NESTEDPOOL
+
+#
+# Verify on-disk state while pool is exported
+#
+log_must zdb -e -p $FILEDISKDIR $NESTEDPOOL
+
+#
+# Attempt to rewind on a pool that is discarding
+# a checkpoint.
+#
+log_mustnot zpool import -d $FILEDISKDIR --rewind-to-checkpoint $NESTEDPOOL
+
+log_must zpool import -d $FILEDISKDIR $NESTEDPOOL
+
+#
+# Discarding should continue after import, so
+# all the following operations should fail.
+#
+log_mustnot zpool checkpoint $NESTEDPOOL
+log_mustnot zpool checkpoint -d $NESTEDPOOL
+log_mustnot zpool remove $NESTEDPOOL $FILEDISK1
+log_mustnot zpool reguid $NESTEDPOOL
+
+# reset memory limit to 16M
+set_tunable64 zfs_spa_discard_memory_limit 16777216
+
+nested_wait_discard_finish
+
+log_must zdb $NESTEDPOOL
+
+log_pass "Can export/import but not rewind/checkpoint/discard or " \
+    "change pool's config while discarding."
diff --git a/tests/zfs-tests/tests/functional/pool_checkpoint/checkpoint_discard_many.ksh b/tests/zfs-tests/tests/functional/pool_checkpoint/checkpoint_discard_many.ksh
new file mode 100755 (executable)
index 0000000..cf0cf6c
--- /dev/null
@@ -0,0 +1,52 @@
+#!/bin/ksh -p
+
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source.  A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright (c) 2017 by Delphix. All rights reserved.
+#
+
+. $STF_SUITE/tests/functional/pool_checkpoint/pool_checkpoint.kshlib
+
+#
+# DESCRIPTION:
+#      Take a checkpoint and discard checkpointed data twice. The
+#      idea is to ensure that the background discard zfs thread is
+#      always running and works as expected.
+#
+# STRATEGY:
+#      1. Create pool
+#      2. Populate it and then take a checkpoint
+#      3. Do some changes afterwards, and then discard checkpoint
+#      4. Repeat steps 2 and 3
+#
+
+verify_runnable "global"
+
+setup_test_pool
+log_onexit cleanup_test_pool
+
+populate_test_pool
+log_must zpool checkpoint $TESTPOOL
+test_change_state_after_checkpoint
+log_must zpool checkpoint -d $TESTPOOL
+test_wait_discard_finish
+
+log_must mkfile -n 100M $FS2FILE
+log_must randwritecomp $FS2FILE 100
+log_must zpool checkpoint $TESTPOOL
+
+log_must randwritecomp $FS2FILE 100
+log_must zpool checkpoint -d $TESTPOOL
+test_wait_discard_finish
+
+log_pass "Background discarding works as expected."
diff --git a/tests/zfs-tests/tests/functional/pool_checkpoint/checkpoint_indirect.ksh b/tests/zfs-tests/tests/functional/pool_checkpoint/checkpoint_indirect.ksh
new file mode 100755 (executable)
index 0000000..aa14d8e
--- /dev/null
@@ -0,0 +1,59 @@
+#!/bin/ksh -p
+
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source.  A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright (c) 2017, 2018 by Delphix. All rights reserved.
+#
+
+. $STF_SUITE/tests/functional/pool_checkpoint/pool_checkpoint.kshlib
+
+#
+# DESCRIPTION:
+#      Ensure that checkpoint plays well with indirect mappings
+#      and blocks.
+#
+# STRATEGY:
+#      1. Import pool that's slightly fragmented
+#      2. Introduce indirection by removing and re-adding devices
+#      3. Take checkpoint
+#      4. Apply a destructive action and do more random writes
+#      5. Run zdb on both current and checkpointed data and make
+#         sure that zdb returns with no errors
+#
+
+verify_runnable "global"
+
+setup_nested_pool_state
+log_onexit cleanup_nested_pools
+
+#
+# Remove and re-add all disks.
+#
+introduce_indirection
+
+#
+# Display fragmentation after removals
+#
+log_must zpool list -v
+
+log_must zpool checkpoint $NESTEDPOOL
+
+#
+# Destroy one dataset, modify an existing one and create a
+# new one. Do more random writes in an attempt to raise
+# more fragmentation. Then verify both current and checkpointed
+# states.
+#
+fragment_after_checkpoint_and_verify
+
+log_pass "Running correctly on indirect setups with a checkpoint."
diff --git a/tests/zfs-tests/tests/functional/pool_checkpoint/checkpoint_invalid.ksh b/tests/zfs-tests/tests/functional/pool_checkpoint/checkpoint_invalid.ksh
new file mode 100755 (executable)
index 0000000..c10f055
--- /dev/null
@@ -0,0 +1,80 @@
+#!/bin/ksh -p
+
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source.  A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright (c) 2017 by Delphix. All rights reserved.
+#
+
+. $STF_SUITE/tests/functional/pool_checkpoint/pool_checkpoint.kshlib
+
+#
+# DESCRIPTION:
+#      Try each 'zpool checkpoint' and relevant 'zpool import' with
+#      invalid inputs to ensure it returns an error. That includes:
+#              * A non-existent pool name or no pool name at all is supplied
+#              * Pool supplied for discarding or rewinding but the pool
+#                does not have a checkpoint
+#              * A dataset or a file/directory is supplied instead of a pool
+#
+# STRATEGY:
+#      1. Create an array of parameters for the different scenarios
+#      2. For each parameter, execute the scenario's sub-command
+#      3. Verify that an error was returned
+#
+
+verify_runnable "global"
+
+setup_test_pool
+log_onexit cleanup_test_pool
+populate_test_pool
+
+#
+# Argument groups below. Note that all_args also includes
+# an empty string as "run command with no argument".
+#
+set -A all_args "" "-d" "--discard"
+
+#
+# Target groups below. Note that invalid_targets includes
+# an empty string as "do not supply a pool name".
+#
+set -A invalid_targets "" "iDontExist" "$FS0" "$FS0FILE"
+non_checkpointed="$TESTPOOL"
+
+#
+# Scenario 1
+# Trying all checkpoint args with all invalid targets
+#
+typeset -i i=0
+while (( i < ${#invalid_targets[*]} )); do
+       typeset -i j=0
+       while (( j < ${#all_args[*]} )); do
+               log_mustnot zpool checkpoint ${all_args[j]} \
+                       ${invalid_targets[i]}
+               ((j = j + 1))
+       done
+       ((i = i + 1))
+done
+
+#
+# Scenario 2
+# If the pool does not have a checkpoint, neither 'zpool checkpoint -d'
+# nor rewind on import should work with it.
+#
+log_mustnot zpool checkpoint -d $non_checkpointed
+log_must zpool export $non_checkpointed
+log_mustnot zpool import --rewind-to-checkpoint $non_checkpointed
+log_must zpool import $non_checkpointed
+
+log_pass "Badly formed checkpoint related commands with " \
+       "invalid inputs fail as expected."
diff --git a/tests/zfs-tests/tests/functional/pool_checkpoint/checkpoint_lun_expsz.ksh b/tests/zfs-tests/tests/functional/pool_checkpoint/checkpoint_lun_expsz.ksh
new file mode 100755 (executable)
index 0000000..59f6408
--- /dev/null
@@ -0,0 +1,61 @@
+#!/bin/ksh -p
+
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source.  A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright (c) 2017 by Delphix. All rights reserved.
+#
+
+. $STF_SUITE/tests/functional/pool_checkpoint/pool_checkpoint.kshlib
+
+#
+# DESCRIPTION:
+#      Ensure that we can expand a device while the pool has a
+#      checkpoint, but that in the case of a rewind the device
+#      reverts to its previous size.
+#
+# STRATEGY:
+#      1. Create pool
+#      2. Populate it
+#      3. Take checkpoint
+#      4. Expand the device and modify some data
+#         (include at least one destructive change)
+#      5. Rewind to checkpoint
+#      6. Verify that we rewound successfully and check if the
+#         device shows up expanded in the vdev list
+#
+
+verify_runnable "global"
+
+EXPSZ=2G
+
+setup_nested_pools
+log_onexit cleanup_nested_pools
+
+populate_nested_pool
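+
+#
+# The SIZE column of 'zpool list -v' is captured for $FILEDISK1 before
+# the expansion, after it, and again after the rewind, so the values
+# can be compared.
+#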
+INITSZ=$(zpool list -v | grep "$FILEDISK1" | awk '{print $2}')
+log_must zpool checkpoint $NESTEDPOOL
+
+log_must truncate -s $EXPSZ $FILEDISK1
+log_must zpool online -e $NESTEDPOOL $FILEDISK1
+NEWSZ=$(zpool list -v | grep "$FILEDISK1" | awk '{print $2}')
+nested_change_state_after_checkpoint
+log_mustnot [ "$INITSZ" = "$NEWSZ" ]
+
+log_must zpool export $NESTEDPOOL
+log_must zpool import -d $FILEDISKDIR --rewind-to-checkpoint $NESTEDPOOL
+
+nested_verify_pre_checkpoint_state
+FINSZ=$(zpool list -v | grep "$FILEDISK1" | awk '{print $2}')
+log_must [ "$INITSZ" = "$FINSZ" ]
+
+log_pass "LUN expansion rewinded correctly."
diff --git a/tests/zfs-tests/tests/functional/pool_checkpoint/checkpoint_open.ksh b/tests/zfs-tests/tests/functional/pool_checkpoint/checkpoint_open.ksh
new file mode 100755 (executable)
index 0000000..018478a
--- /dev/null
@@ -0,0 +1,48 @@
+#!/bin/ksh -p
+
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source.  A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright (c) 2017 by Delphix. All rights reserved.
+#
+
+. $STF_SUITE/tests/functional/pool_checkpoint/pool_checkpoint.kshlib
+
+#
+# DESCRIPTION:
+#      Ensure that we can open a checkpointed pool.
+#
+# STRATEGY:
+#      1. Create pool
+#      2. Populate it
+#      3. Take checkpoint
+#      4. Modify data (include at least one destructive change) 
+#      5. Export and import pool
+#      6. Verify that the pool was opened with the most current
+#         data and not the checkpointed state.
+#
+
+verify_runnable "global"
+
+setup_test_pool
+log_onexit cleanup_test_pool
+
+populate_test_pool
+log_must zpool checkpoint $TESTPOOL
+test_change_state_after_checkpoint
+
+log_must zpool export $TESTPOOL
+log_must zpool import $TESTPOOL
+
+test_verify_post_checkpoint_state
+
+log_pass "Open a checkpointed pool."
diff --git a/tests/zfs-tests/tests/functional/pool_checkpoint/checkpoint_removal.ksh b/tests/zfs-tests/tests/functional/pool_checkpoint/checkpoint_removal.ksh
new file mode 100755 (executable)
index 0000000..ad96d5d
--- /dev/null
@@ -0,0 +1,72 @@
+#!/bin/ksh -p
+
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source.  A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright (c) 2017 by Delphix. All rights reserved.
+#
+
+. $STF_SUITE/tests/functional/pool_checkpoint/pool_checkpoint.kshlib
+
+#
+# DESCRIPTION:
+#      Attempt to take a checkpoint while a removal is
+#      in progress. The attempt should fail.
+#
+# STRATEGY:
+#      1. Create pool with one disk
+#      2. Create a big file in the pool, so when the disk
+#         is later removed, it will give us enough of a
+#         time window to attempt the checkpoint while the
+#         removal takes place
+#      3. Add a second disk where all the data will be moved
+#         to when the first disk is removed.
+#      4. Start removal of first disk
+#      5. Attempt to checkpoint (attempt should fail)
+#
+
+verify_runnable "global"
+
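+#
+# Callback run by attempt_during_removal() while the removal is still
+# in progress; taking a checkpoint at that point is expected to fail.
+#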
+function callback
+{
+       log_mustnot zpool checkpoint $TESTPOOL
+       return 0
+}
+
+#
+# Create pool
+#
+setup_test_pool
+log_onexit cleanup_test_pool
+populate_test_pool
+
+#
+# Create big empty file and do some writes at random
+# offsets to ensure that it takes up space. Note that
+# the implicitly created filesystem ($FS0) does not
+# have compression enabled.
+#
+log_must mkfile $BIGFILESIZE $FS0FILE
+log_must randwritecomp $FS0FILE 1000
+
+#
+# Add second disk
+#
+log_must zpool add $TESTPOOL $EXTRATESTDISK
+
+#
+# Remove disk and attempt to take checkpoint
+#
+log_must attempt_during_removal $TESTPOOL $TESTDISK callback
+log_must zpool status $TESTPOOL
+
+log_pass "Attempting to checkpoint during removal fails as expected."
diff --git a/tests/zfs-tests/tests/functional/pool_checkpoint/checkpoint_rewind.ksh b/tests/zfs-tests/tests/functional/pool_checkpoint/checkpoint_rewind.ksh
new file mode 100755 (executable)
index 0000000..2a2bb2d
--- /dev/null
@@ -0,0 +1,49 @@
+#!/bin/ksh -p
+
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source.  A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright (c) 2017 by Delphix. All rights reserved.
+#
+
+. $STF_SUITE/tests/functional/pool_checkpoint/pool_checkpoint.kshlib
+
+#
+# DESCRIPTION:
+#      Ensure that we can rewind on a checkpointed pool.
+#
+# STRATEGY:
+#      1. Create pool
+#      2. Populate it
+#      3. Take checkpoint
+#      4. Modify data (include at least one destructive change) 
+#      5. Rewind to checkpoint
+#      6. Verify that the data from before the checkpoint is present
+#         and the data from after the checkpoint is gone.
+#
+
+verify_runnable "global"
+
+setup_test_pool
+log_onexit cleanup_test_pool
+populate_test_pool
+
+log_must zpool checkpoint $TESTPOOL
+
+test_change_state_after_checkpoint
+
+log_must zpool export $TESTPOOL
+log_must zpool import --rewind-to-checkpoint $TESTPOOL
+
+test_verify_pre_checkpoint_state
+
+log_pass "Rewind on a checkpointed pool."
diff --git a/tests/zfs-tests/tests/functional/pool_checkpoint/checkpoint_ro_rewind.ksh b/tests/zfs-tests/tests/functional/pool_checkpoint/checkpoint_ro_rewind.ksh
new file mode 100755 (executable)
index 0000000..fd74166
--- /dev/null
@@ -0,0 +1,57 @@
+#!/bin/ksh -p
+
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source.  A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright (c) 2017 by Delphix. All rights reserved.
+#
+
+. $STF_SUITE/tests/functional/pool_checkpoint/pool_checkpoint.kshlib
+
+#
+# DESCRIPTION:
+#      Ensure that we can open the checkpointed state of a pool
+#      as read-only.
+#
+# STRATEGY:
+#      1. Create pool
+#      2. Populate it
+#      3. Take checkpoint
+#      4. Modify data (include at least one destructive change) 
+#      5. Export and import the checkpointed state as readonly
+#      6. Verify that we can see the checkpointed state and not
+#         the actual current state.
+#      7. Export and import the current state
+#      8. Verify that we can see the current state and not the
+#         checkpointed state.
+#
+
+verify_runnable "global"
+
+setup_test_pool
+log_onexit cleanup_test_pool
+
+populate_test_pool
+log_must zpool checkpoint $TESTPOOL
+test_change_state_after_checkpoint
+
+log_must zpool export $TESTPOOL
+log_must zpool import -o readonly=on --rewind-to-checkpoint $TESTPOOL
+
+test_verify_pre_checkpoint_state "ro-check"
+
+log_must zpool export $TESTPOOL
+log_must zpool import $TESTPOOL
+
+test_verify_post_checkpoint_state
+
+log_pass "Open checkpointed state of the pool as read-only pool."
diff --git a/tests/zfs-tests/tests/functional/pool_checkpoint/checkpoint_sm_scale.ksh b/tests/zfs-tests/tests/functional/pool_checkpoint/checkpoint_sm_scale.ksh
new file mode 100755 (executable)
index 0000000..5247d60
--- /dev/null
@@ -0,0 +1,74 @@
+#!/bin/ksh -p
+
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source.  A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright (c) 2017 by Delphix. All rights reserved.
+#
+
+. $STF_SUITE/tests/functional/pool_checkpoint/pool_checkpoint.kshlib
+
+#
+# DESCRIPTION:
+#      The maximum address that can be described by the current space
+#      map design (assuming the minimum 512-byte addressable storage)
+#      limits the maximum allocatable space of any top-level vdev to
+#      64PB whenever a vdev-wide space map is used.
+#
+#      Since a vdev-wide space map is introduced for the checkpoint
+#      we want to ensure that we cannot checkpoint a pool that has a
+#      top-level vdev with more than 64PB of allocatable space.
+#
+#      Note: Since this is a pool created from file-based vdevs we
+#            are guaranteed that vdev_ashift is SPA_MINBLOCKSHIFT
+#            [which is currently 9 and (1 << 9) = 512], so the numbers
+#            work out for this test.
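+#            (For reference: the limit follows from the 47-bit offset
+#            field of a space map entry, since 2^47 offsets * 512-byte
+#            sectors = 2^56 bytes = 64PB.)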
+#
+# STRATEGY:
+#      1. Create pool with a disk of exactly 64PB
+#         (so ~63.5PB of allocatable space)
+#      2. Ensure that you can checkpoint it
+#      3. Create pool with a disk of exactly 65PB
+#         (so ~64.5PB of allocatable space)
+#      4. Ensure we fail trying to checkpoint it
+#
+
+verify_runnable "global"
+
+TESTPOOL1=testpool1
+TESTPOOL2=testpool2
+
+DISK64PB=/$DISKFS/disk64PB
+DISK65PB=/$DISKFS/disk65PB
+
+function test_cleanup
+{
+       poolexists $TESTPOOL1 && destroy_pool $TESTPOOL1
+       poolexists $TESTPOOL2 && destroy_pool $TESTPOOL2
+       log_must rm -f $DISK64PB $DISK65PB
+       cleanup_test_pool
+}
+
+setup_test_pool
+log_onexit test_cleanup
+
+log_must zfs create $DISKFS
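+
+#
+# The backing files are created sparse (mkfile -n) so no real
+# petabyte-scale storage is needed for these vdevs.
+#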
+log_must mkfile -n $((64 * 1024 * 1024))g $DISK64PB
+log_must mkfile -n $((65 * 1024 * 1024))g $DISK65PB
+
+log_must zpool create $TESTPOOL1 $DISK64PB
+log_must zpool create $TESTPOOL2 $DISK65PB
+
+log_must zpool checkpoint $TESTPOOL1
+log_mustnot zpool checkpoint $TESTPOOL2
+
+log_pass "Attempting to checkpoint a pool with a vdev that's more than 64PB."
diff --git a/tests/zfs-tests/tests/functional/pool_checkpoint/checkpoint_twice.ksh b/tests/zfs-tests/tests/functional/pool_checkpoint/checkpoint_twice.ksh
new file mode 100755 (executable)
index 0000000..3f1076b
--- /dev/null
@@ -0,0 +1,40 @@
+#!/bin/ksh -p
+
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source.  A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright (c) 2017 by Delphix. All rights reserved.
+#
+
+. $STF_SUITE/tests/functional/pool_checkpoint/pool_checkpoint.kshlib
+
+#
+# DESCRIPTION:
+#      Attempt to take a checkpoint for an already
+#      checkpointed pool. The attempt should fail.
+#
+# STRATEGY:
+#      1. Create pool
+#      2. Checkpoint it
+#      3. Attempt to checkpoint it again (should fail).
+#
+
+verify_runnable "global"
+
+setup_test_pool
+log_onexit cleanup_test_pool
+
+log_must zpool checkpoint $TESTPOOL
+log_mustnot zpool checkpoint $TESTPOOL
+
+log_pass "Attempting to checkpoint an already checkpointed " \
+       "pool fails as expected."
diff --git a/tests/zfs-tests/tests/functional/pool_checkpoint/checkpoint_vdev_add.ksh b/tests/zfs-tests/tests/functional/pool_checkpoint/checkpoint_vdev_add.ksh
new file mode 100755 (executable)
index 0000000..efb69b7
--- /dev/null
@@ -0,0 +1,63 @@
+#!/bin/ksh -p
+
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source.  A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright (c) 2017 by Delphix. All rights reserved.
+#
+
+. $STF_SUITE/tests/functional/pool_checkpoint/pool_checkpoint.kshlib
+
+#
+# DESCRIPTION:
+#      Ensure that we can add a device while the pool has a
+#      checkpoint but in the case of a rewind that device does
+#      checkpoint, but that in the case of a rewind the device does
+#
+# STRATEGY:
+#      1. Create pool
+#      2. Populate it
+#      3. Take checkpoint
+#      4. Add device and modify data
+#         (include at least one destructive change) 
+#      5. Rewind to checkpoint
+#      6. Verify that we rewound successfully and check if the
+#         device shows up in the vdev list
+#
+
+verify_runnable "global"
+
+setup_test_pool
+log_onexit cleanup_test_pool
+
+populate_test_pool
+
+log_must zpool checkpoint $TESTPOOL
+log_must zpool add $TESTPOOL $EXTRATESTDISK
+
+#
+# Ensure that the vdev shows up
+#
+log_must eval "zpool list -v $TESTPOOL | grep $EXTRATESTDISK"
+test_change_state_after_checkpoint
+
+log_must zpool export $TESTPOOL
+log_must zpool import --rewind-to-checkpoint $TESTPOOL
+
+test_verify_pre_checkpoint_state
+
+#
+# Ensure that the vdev doesn't show up after the rewind
+#
+log_mustnot eval "zpool list -v $TESTPOOL | grep $EXTRATESTDISK"
+
+log_pass "Add device in checkpointed pool."
diff --git a/tests/zfs-tests/tests/functional/pool_checkpoint/checkpoint_zdb.ksh b/tests/zfs-tests/tests/functional/pool_checkpoint/checkpoint_zdb.ksh
new file mode 100755 (executable)
index 0000000..50c45b5
--- /dev/null
@@ -0,0 +1,80 @@
+#!/bin/ksh -p
+
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source.  A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright (c) 2017 by Delphix. All rights reserved.
+#
+
+. $STF_SUITE/tests/functional/pool_checkpoint/pool_checkpoint.kshlib
+
+#
+# DESCRIPTION:
+#      Ensure that checkpoint verification within zdb works as
+#      we expect.
+#
+# STRATEGY:
+#      1. Create pool
+#      2. Populate it
+#      3. Take checkpoint
+#      4. Modify data (include at least one destructive change) 
+#      5. Verify zdb finds checkpoint when run on current state
+#      6. Verify zdb finds old dataset when run on checkpointed
+#         state
+#      7. Discard checkpoint
+#      8. Verify zdb does not find the checkpoint anymore in the
+#         current state.
+#      9. Verify that zdb cannot find the checkpointed state
+#         anymore when trying to open it for verification.
+#
+
+verify_runnable "global"
+
+#
+# When asked to examine the checkpointed state, zdb imports it as a
+# separate pool with a different name, alongside the pool holding
+# the current state. The name of this temporary pool is the
+# name of the actual pool with the suffix below appended to it.
+#
+CHECKPOINT_SUFFIX="_CHECKPOINTED_UNIVERSE"
+CHECKPOINTED_FS1=$TESTPOOL$CHECKPOINT_SUFFIX/$TESTFS1
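+
+#
+# $CHECKPOINTED_FS1 above is the name under which the dataset $FS1
+# (destroyed after the checkpoint is taken) is expected to show up when
+# zdb examines the checkpointed state with -k.
+#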
+
+setup_test_pool
+log_onexit cleanup_test_pool
+
+populate_test_pool
+log_must zpool checkpoint $TESTPOOL
+
+test_change_state_after_checkpoint
+
+zdb $TESTPOOL | grep "Checkpointed uberblock found" || \
+       log_fail "zdb could not find checkpointed uberblock"
+
+zdb -k $TESTPOOL | grep "Checkpointed uberblock found" && \
+       log_fail "zdb found checkpointed uberblock in checkpointed state"
+
+zdb $TESTPOOL | grep "Dataset $FS1" && \
+       log_fail "zdb found destroyed dataset in current state"
+
+zdb -k $TESTPOOL | grep "Dataset $CHECKPOINTED_FS1" || \
+       log_fail "zdb could not find destroyed dataset in checkpoint"
+
+log_must zpool checkpoint -d $TESTPOOL
+
+zdb $TESTPOOL | grep "Checkpointed uberblock found" && \
+       log_fail "zdb found checkpointed uberblock after discarding " \
+       "the checkpoint"
+
+zdb -k $TESTPOOL && \
+       log_fail "zdb opened checkpointed state that was discarded"
+
+log_pass "zdb can analyze checkpointed pools."
diff --git a/tests/zfs-tests/tests/functional/pool_checkpoint/checkpoint_zhack_feat.ksh b/tests/zfs-tests/tests/functional/pool_checkpoint/checkpoint_zhack_feat.ksh
new file mode 100755 (executable)
index 0000000..815fc85
--- /dev/null
@@ -0,0 +1,66 @@
+#!/bin/ksh -p
+
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source.  A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright (c) 2017 by Delphix. All rights reserved.
+#
+
+. $STF_SUITE/tests/functional/pool_checkpoint/pool_checkpoint.kshlib
+
+#
+# DESCRIPTION:
+#      Ensure that we can rewind to a checkpointed state taken
+#      before a readonly-compatible feature was introduced.
+#
+# STRATEGY:
+#      1. Create pool
+#      2. Populate it
+#      3. Take checkpoint
+#      4. Modify data (include at least one destructive change) 
+#      5. Export pool
+#      6. Introduce a new feature in the pool which is unsupported
+#         but readonly-compatible and increment its reference
+#         count so it is marked active.
+#      7. Verify that the pool can't be opened writable, but we
+#         can rewind to the checkpoint (before the feature was 
+#         introduced) if we want to.
+#
+
+verify_runnable "global"
+
+#
+# Clear all labels from all vdevs so zhack
+# doesn't get confused
+#
+for disk in ${DISKS[@]}; do
+       zpool labelclear -f $disk
+done
+
+setup_test_pool
+log_onexit cleanup_test_pool
+
+populate_test_pool
+log_must zpool checkpoint $TESTPOOL
+test_change_state_after_checkpoint
+
+log_must zpool export $TESTPOOL
+
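+#
+# 'zhack feature enable -r' injects the feature as readonly-compatible
+# and 'zhack feature ref' bumps its reference count so that it is
+# considered active on the pool.
+#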
+log_must zhack feature enable -r $TESTPOOL 'com.company:future_feature'
+log_must zhack feature ref $TESTPOOL 'com.company:future_feature'
+
+log_mustnot zpool import $TESTPOOL
+log_must zpool import --rewind-to-checkpoint $TESTPOOL
+
+test_verify_pre_checkpoint_state
+
+log_pass "Rewind to checkpoint from unsupported pool feature."
diff --git a/tests/zfs-tests/tests/functional/pool_checkpoint/cleanup.ksh b/tests/zfs-tests/tests/functional/pool_checkpoint/cleanup.ksh
new file mode 100755 (executable)
index 0000000..5fa03d7
--- /dev/null
@@ -0,0 +1,23 @@
+#!/bin/ksh -p
+
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source.  A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright (c) 2018 by Delphix. All rights reserved.
+#
+
+. $STF_SUITE/tests/functional/pool_checkpoint/pool_checkpoint.kshlib
+
+verify_runnable "global"
+
+test_group_destroy_saved_pool
+log_pass
diff --git a/tests/zfs-tests/tests/functional/pool_checkpoint/pool_checkpoint.kshlib b/tests/zfs-tests/tests/functional/pool_checkpoint/pool_checkpoint.kshlib
new file mode 100644 (file)
index 0000000..54c3aff
--- /dev/null
@@ -0,0 +1,393 @@
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source.  A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright (c) 2017, 2018 by Delphix. All rights reserved.
+#
+
+. $STF_SUITE/include/libtest.shlib
+. $STF_SUITE/tests/functional/removal/removal.kshlib
+
+#
+# In general all the tests related to the pool checkpoint can
+# be divided into two categories: tests that verify features
+# provided by the checkpoint (e.g. checkpoint_rewind) and tests
+# that stress-test the checkpoint (e.g. checkpoint_big_rewind).
+#
+# For the first group we don't really care about the size of
+# the pool or the individual file sizes within the filesystems.
+# This is why these tests run directly on pools that use a
+# "real disk vdev" (meaning not a file based one). These tests
+# use the $TESTPOOL pool that is created on top of $TESTDISK.
+# This pool is referred to as the "test pool" and thus all
+# the tests of this group use the testpool-related functions of
+# this file (not the nested_pools ones).
+#
+# For the second group we generally try to bring the pool to its
+# limits by increasing fragmentation, filling all allocatable
+# space, attempting to use vdevs that the checkpoint spacemap
+# cannot represent, etc. For these tests we need to control
+# almost all parameters of the pool and the vdevs that back it,
+# so we build that pool on file-based vdevs that we carefully
+# create within the $TESTPOOL pool. Thus most of these tests, in
+# order to create this nested pool structure, generally start
+# like this:
+# 1] We create the test pool ($TESTPOOL).
+# 2] We create a filesystem and we populate it with files of
+#    some predetermined size.
+# 3] We use those files as vdevs for the pool that the test
+#    will use ($NESTEDPOOL).
+# 4] Go on and let the test run and operate on $NESTEDPOOL.
+#
+
+#
+# These disks are used to back $TESTPOOL
+#
+TESTDISK="$(echo $DISKS | cut -d' ' -f1)"
+EXTRATESTDISK="$(echo $DISKS | cut -d' ' -f2)"
+
+FS0=$TESTPOOL/$TESTFS
+FS1=$TESTPOOL/$TESTFS1
+FS2=$TESTPOOL/$TESTFS2
+
+FS0FILE=/$FS0/$TESTFILE0
+FS1FILE=/$FS1/$TESTFILE1
+FS2FILE=/$FS2/$TESTFILE2
+
+#
+# The following are created within $TESTPOOL and
+# will be used to back $NESTEDPOOL
+#
+DISKFS=$TESTPOOL/disks
+FILEDISKDIR=/$DISKFS
+FILEDISK1=/$DISKFS/dsk1
+FILEDISK2=/$DISKFS/dsk2
+FILEDISKS="$FILEDISK1 $FILEDISK2"
+
+#
+# $NESTEDPOOL related variables
+#
+NESTEDPOOL=nestedpool
+NESTEDFS0=$NESTEDPOOL/$TESTFS
+NESTEDFS1=$NESTEDPOOL/$TESTFS1
+NESTEDFS2=$NESTEDPOOL/$TESTFS2
+NESTEDFS0FILE=/$NESTEDFS0/$TESTFILE0
+NESTEDFS1FILE=/$NESTEDFS1/$TESTFILE1
+NESTEDFS2FILE=/$NESTEDFS2/$TESTFILE2
+
+#
+# In the tests that stress-test the pool (second category
+# mentioned above), some need to drive fragmentation to a high
+# percentage in a relatively short period of time. In order
+# to do that we set the following
+# parameters:
+#
+# * We use two disks of 1G each, to create a pool of size 2G.
+#   The point is that 2G is neither small nor large, and we also
+#   want to have 2 disks to introduce indirect vdevs in our
+#   setup.
+# * We enable compression and set the record size of all
+#   filesystems to 8K. The point of compression is to
+#   ensure that we are not filling up the whole pool (that's
+#   what checkpoint_capacity is for), and the specific
+#   record size is set to match the block size of randwritecomp,
+#   which is used to increase fragmentation by writing to
+#   files.
+# * Two big files of 512M each are always present, which
+#   should account for 40%~50% of capacity by the end of each
+#   test, with fragmentation around 50~60%.
+# * At each file we attempt to do enough random writes to
+#   touch every offset twice on average.
+#
+# Note that the number of random writes per file is based
+# on the following calculation:
+#
+# ((512M / 8K) * 3) * 2 = ~400000
+#
+# Given that the file is 512M and one write is 8K, we would
+# need (512M / 8K) writes to go through the whole file.
+# Assuming, though, that each write has a compression ratio of
+# 3, we want 3 times that number to cover the same amount of
+# space. Finally, we multiply that by 2 since our goal is to
+# touch each offset twice on average.
+#
+# Examples of those tests are checkpoint_big_rewind and
+# checkpoint_discard_busy.
+#
+FILEDISKSIZE=1g
+DISKSIZE=1g
+BIGFILESIZE=512M
+RANDOMWRITES=400000
+
+
+#
+# Assumes setup_test_pool has been called beforehand.
+#
+function setup_nested_pool
+{
+       log_must zfs create $DISKFS
+
+       log_must truncate -s $DISKSIZE $FILEDISK1
+       log_must truncate -s $DISKSIZE $FILEDISK2
+
+       log_must zpool create -O sync=disabled $NESTEDPOOL $FILEDISKS
+}
+
+function setup_test_pool
+{
+       log_must zpool create -O sync=disabled $TESTPOOL "$TESTDISK"
+}
+
+function setup_nested_pools
+{
+       setup_test_pool
+       setup_nested_pool
+}
+
+function cleanup_nested_pool
+{
+       log_must zpool destroy $NESTEDPOOL
+       log_must rm -f $FILEDISKS
+}
+
+function cleanup_test_pool
+{
+       log_must zpool destroy $TESTPOOL
+       zpool labelclear -f "$TESTDISK"
+}
+
+function cleanup_nested_pools
+{
+       cleanup_nested_pool
+       cleanup_test_pool
+}
+
+#
+# Remove and re-add each vdev to ensure that data is
+# moved between disks and indirect mappings are created
+#
+function introduce_indirection
+{
+       for disk in ${FILEDISKS[@]}; do
+               log_must zpool remove $NESTEDPOOL $disk
+               log_must wait_for_removal $NESTEDPOOL
+               log_mustnot vdevs_in_pool $NESTEDPOOL $disk
+               log_must zpool add $NESTEDPOOL $disk
+       done
+}
+
+FILECONTENTS0="Can't wait to be checkpointed!"
+FILECONTENTS1="Can't wait to be checkpointed too!"
+NEWFILECONTENTS0="I survived after the checkpoint!"
+NEWFILECONTENTS2="I was born after the checkpoint!"
+
+function populate_test_pool
+{
+       log_must zfs create -o compression=lz4 -o recordsize=8k $FS0
+       log_must zfs create -o compression=lz4 -o recordsize=8k $FS1
+
+       echo $FILECONTENTS0 > $FS0FILE
+       echo $FILECONTENTS1 > $FS1FILE
+}
+
+function populate_nested_pool
+{
+       log_must zfs create -o compression=lz4 -o recordsize=8k $NESTEDFS0
+       log_must zfs create -o compression=lz4 -o recordsize=8k $NESTEDFS1
+
+       echo $FILECONTENTS0 > $NESTEDFS0FILE
+       echo $FILECONTENTS1 > $NESTEDFS1FILE
+}
+
+function test_verify_pre_checkpoint_state
+{
+       log_must zfs list $FS0
+       log_must zfs list $FS1
+       log_must [ "$(cat $FS0FILE)" = "$FILECONTENTS0" ]
+       log_must [ "$(cat $FS1FILE)" = "$FILECONTENTS1" ]
+
+       #
+       # If we've opened the checkpointed state of the
+       # pool as read-only without rewinding on-disk we
+       # can't really use zdb on it.
+       #
+       if [[ "$1" != "ro-check" ]] ; then
+               log_must zdb $TESTPOOL
+       fi
+
+       #
+       # Ensure post-checkpoint state is not present
+       #
+       log_mustnot zfs list $FS2
+       log_mustnot [ "$(cat $FS0FILE)" = "$NEWFILECONTENTS0" ]
+}
+
+function nested_verify_pre_checkpoint_state
+{
+       log_must zfs list $NESTEDFS0
+       log_must zfs list $NESTEDFS1
+       log_must [ "$(cat $NESTEDFS0FILE)" = "$FILECONTENTS0" ]
+       log_must [ "$(cat $NESTEDFS1FILE)" = "$FILECONTENTS1" ]
+
+       #
+       # If we've opened the checkpointed state of the
+       # pool as read-only without rewinding on-disk we
+       # can't really use zdb on it.
+       #
+       if [[ "$1" != "ro-check" ]] ; then
+               log_must zdb $NESTEDPOOL
+       fi
+
+       #
+       # Ensure post-checkpoint state is not present
+       #
+       log_mustnot zfs list $NESTEDFS2
+       log_mustnot [ "$(cat $NESTEDFS0FILE)" = "$NEWFILECONTENTS0" ]
+}
+
+function test_change_state_after_checkpoint
+{
+       log_must zfs destroy $FS1
+       log_must zfs create -o compression=lz4 -o recordsize=8k $FS2
+
+       echo $NEWFILECONTENTS0 > $FS0FILE
+       echo $NEWFILECONTENTS2 > $FS2FILE
+}
+
+function nested_change_state_after_checkpoint
+{
+       log_must zfs destroy $NESTEDFS1
+       log_must zfs create -o compression=lz4 -o recordsize=8k $NESTEDFS2
+
+       echo $NEWFILECONTENTS0 > $NESTEDFS0FILE
+       echo $NEWFILECONTENTS2 > $NESTEDFS2FILE
+}
+
+function test_verify_post_checkpoint_state
+{
+       log_must zfs list $FS0
+       log_must zfs list $FS2
+       log_must [ "$(cat $FS0FILE)" = "$NEWFILECONTENTS0" ]
+       log_must [ "$(cat $FS2FILE)" = "$NEWFILECONTENTS2" ]
+
+       log_must zdb $TESTPOOL
+
+       #
+       # Ensure pre-checkpointed state that was removed post-checkpoint
+       # is not present
+       #
+       log_mustnot zfs list $FS1
+       log_mustnot [ "$(cat $FS0FILE)" = "$FILECONTENTS0" ]
+}
+
+function fragment_before_checkpoint
+{
+       populate_nested_pool
+       log_must mkfile -n $BIGFILESIZE $NESTEDFS0FILE
+       log_must mkfile -n $BIGFILESIZE $NESTEDFS1FILE
+       log_must randwritecomp $NESTEDFS0FILE $RANDOMWRITES
+       log_must randwritecomp $NESTEDFS1FILE $RANDOMWRITES
+
+       #
+       # Display fragmentation on test log
+       #
+       log_must zpool list -v
+}
+
+function fragment_after_checkpoint_and_verify
+{
+       log_must zfs destroy $NESTEDFS1
+       log_must zfs create -o compression=lz4 -o recordsize=8k $NESTEDFS2
+       log_must mkfile -n $BIGFILESIZE $NESTEDFS2FILE
+       log_must randwritecomp $NESTEDFS0FILE $RANDOMWRITES
+       log_must randwritecomp $NESTEDFS2FILE $RANDOMWRITES
+
+       #
+       # Display fragmentation on test log
+       #
+       log_must zpool list -v
+
+       log_must zdb $NESTEDPOOL
+       log_must zdb -kc $NESTEDPOOL
+}
+
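+#
+# 'zpool status' reports a "checkpoint:" line for as long as a
+# checkpoint exists (including while one is being discarded), so
+# poll until that line disappears.
+#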
+function wait_discard_finish
+{
+       typeset pool="$1"
+
+       typeset status
+       status=$(zpool status $pool | grep "checkpoint:")
+       while [ "" != "$status" ]; do
+               sleep 5
+               status=$(zpool status $pool | grep "checkpoint:")
+       done
+}
+
+function test_wait_discard_finish
+{
+       wait_discard_finish $TESTPOOL
+}
+
+function nested_wait_discard_finish
+{
+       wait_discard_finish $NESTEDPOOL
+}
+
+#
+# Creating the setup for the second group of tests mentioned in
+# block comment of this file can take some time as we are doing
+# random writes to raise capacity and fragmentation before taking
+# the checkpoint. Thus we create this setup once and save the
+# disks of the nested pool in a temporary directory where we can
+# reuse them for each test that requires that setup.
+#
+SAVEDPOOLDIR="/var/tmp/ckpoint_saved_pool"
+
+function test_group_premake_nested_pools
+{
+       setup_nested_pools
+
+       #
+       # Populate and fragment the pool.
+       #
+       fragment_before_checkpoint
+
+       #
+       # Export and save the pool for other tests.
+       #
+       log_must zpool export $NESTEDPOOL
+       log_must mkdir $SAVEDPOOLDIR
+       log_must cp $FILEDISKS $SAVEDPOOLDIR
+
+       #
+       # Reimport pool to be destroyed by
+       # cleanup_nested_pools function
+       #
+       log_must zpool import -d $FILEDISKDIR $NESTEDPOOL
+}
+
+function test_group_destroy_saved_pool
+{
+       log_must rm -rf $SAVEDPOOLDIR
+}
+
+#
+# Recreate nested pool setup from saved pool.
+#
+function setup_nested_pool_state
+{
+       setup_test_pool
+
+       log_must zfs create $DISKFS
+       log_must cp $SAVEDPOOLDIR/* $FILEDISKDIR
+
+       log_must zpool import -d $FILEDISKDIR $NESTEDPOOL
+}
diff --git a/tests/zfs-tests/tests/functional/pool_checkpoint/setup.ksh b/tests/zfs-tests/tests/functional/pool_checkpoint/setup.ksh
new file mode 100755 (executable)
index 0000000..118400c
--- /dev/null
@@ -0,0 +1,25 @@
+#!/bin/ksh -p
+
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source.  A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright (c) 2018 by Delphix. All rights reserved.
+#
+
+. $STF_SUITE/tests/functional/pool_checkpoint/pool_checkpoint.kshlib
+
+verify_runnable "global"
+
+test_group_premake_nested_pools
+log_onexit cleanup_nested_pools
+
+log_pass "Successfully saved pool to be reused for tests in the group."
index 54a2fb3bd1c36867595953a23fe13494acb4b853..7aa3835854b5a6fa338e4c7851f721bb321cb43f 100644 (file)
 #
 
 #
-# Copyright (c) 2014, 2016 by Delphix. All rights reserved.
+# Copyright (c) 2014, 2017 by Delphix. All rights reserved.
 #
 
 export REMOVEDISK=${DISKS%% *}
 export NOTREMOVEDISK=${DISKS##* }
 
 #
-# Waits for the pool to finish a removal. If an optional callback is given,
-# execute it every 0.5s.
+# Waits for the pool to finish a removal.
 #
-# Example usage:
-#
-#    wait_for_removal $TESTPOOL dd if=/dev/urandom of=/$TESTPOOL/file count=1
-#
-function wait_for_removal # pool [callback args]
+function wait_for_removal # pool
 {
        typeset pool=$1
        typeset callback=$2
 
-       [[ -n $callback ]] && shift 2
-
        while is_pool_removing $pool; do
-               [[ -z $callback ]] || log_must $callback "$@"
-               sleep 0.5
+               sleep 1
        done
 
        #
@@ -50,6 +42,52 @@ function wait_for_removal # pool [callback args]
        return 0
 }
 
+#
+# Removes the specified disk from its respective pool and
+# runs the callback while the removal is in progress.
+#
+# This function is mainly used to test how other operations
+# interact with device removal. After the callback is done,
+# the removal is unpaused and we wait for it to finish.
+#
+# Example usage:
+#
+#    attempt_during_removal $TESTPOOL $DISK dd if=/dev/urandom \
+#        of=/$TESTPOOL/file count=1
+#
+function attempt_during_removal # pool disk callback [args]
+{
+       typeset pool=$1
+       typeset disk=$2
+       typeset callback=$3
+
+       shift 3
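+
+       #
+       # zfs_remove_max_bytes_pause is a debugging-only tunable; with it
+       # set to 0 the removal pauses almost as soon as it starts, so the
+       # callback below runs while the removal is still in progress.
+       #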
+       set_tunable64 zfs_remove_max_bytes_pause 0
+
+       log_must zpool remove $pool $disk
+
+       #
+       # We want to make sure that the removal started
+       # before issuing the callback.
+       #
+       sync
+       log_must is_pool_removing $pool
+
+       log_must $callback "$@"
+
+       #
+       # Ensure that, as expected, the removal still hasn't
+       # finished.
+       #
+       log_must is_pool_removing $pool
+
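+       #
+       # Raise the tunable back to UINT64_MAX so the removal can resume
+       # and run to completion.
+       #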
+       set_tunable64 zfs_remove_max_bytes_pause 18446744073709551615
+
+       log_must wait_for_removal $pool
+       log_mustnot vdevs_in_pool $pool $disk
+       return 0
+}
+
 function indirect_vdev_mapping_size # pool
 {
        typeset pool=$1
@@ -70,22 +108,6 @@ function random_write # file write_size
            bs=$block_size count=1 seek=$((RANDOM % nblocks)) >/dev/null 2>&1
 }
 
-_test_removal_with_operation_count=0
-function _test_removal_with_operation_cb # real_callback
-{
-       typeset real_callback=$1
-
-       $real_callback $_test_removal_with_operation_count || \
-           log_fail $real_callback "failed after" \
-               $_test_removal_with_operation_count "iterations"
-
-       (( _test_removal_with_operation_count++ ))
-
-       log_note "Callback called $((_test_removal_with_operation_count)) times"
-
-       return 0
-}
-
 function start_random_writer # file
 {
        typeset file=$1
@@ -99,17 +121,8 @@ function start_random_writer # file
        ) &
 }
 
-#
-# The callback should be a function that takes as input the number of
-# iterations and the given arguments.
-#
-function test_removal_with_operation # callback [count]
+function test_removal_with_operation # callback [args]
 {
-       typeset operation=$1
-       typeset count=$2
-
-       [[ -n $count ]] || count=0
-
        #
        # To ensure that the removal takes a while, we fragment the pool
        # by writing random blocks and continue to do during the removal.
@@ -122,29 +135,12 @@ function test_removal_with_operation # callback [count]
        start_random_writer $TESTDIR/$TESTFILE0 1g
        killpid=$!
 
-       log_must zpool remove $TESTPOOL $REMOVEDISK
-       log_must wait_for_removal $TESTPOOL \
-           _test_removal_with_operation_cb $operation
+       log_must attempt_during_removal $TESTPOOL $REMOVEDISK "$@"
        log_mustnot vdevs_in_pool $TESTPOOL $REMOVEDISK
        log_must zdb -cd $TESTPOOL
 
        kill $killpid
        wait
-
-       #
-       # We would love to assert that the callback happened *during* the
-       # removal, but we don't have the ability to be confident of that
-       # (via limiting bandwidth, etc.) yet. Instead, we try again.
-       #
-       if (( $_test_removal_with_operation_count <= 1 )); then
-               (( count <= 5 )) || log_fail "Attempted test too many times."
-
-               log_note "Callback only called" \
-                   $_test_removal_with_operation_count \
-                   "times, trying again."
-               default_setup_noexit "$DISKS"
-               test_removal_with_operation $operation $((count + 1))
-       fi
 }
 
 #
index 5b5be66b3343fec798b5eeb1ffcce57fae07a129..6c630f2f5355639d9f37a3f5aa4e9dacafc64744 100755 (executable)
@@ -15,7 +15,7 @@
 #
 
 #
-# Copyright (c) 2015, 2016 by Delphix. All rights reserved.
+# Copyright (c) 2015, 2017 by Delphix. All rights reserved.
 #
 
 . $STF_SUITE/include/libtest.shlib
@@ -34,11 +34,10 @@ log_must zfs snapshot $TESTPOOL/$TESTFS@snap-pre2
 log_must dd if=/dev/zero of=$TESTDIR/file bs=1024k count=100 \
     conv=notrunc seek=200
 
-log_must zpool remove $TESTPOOL $REMOVEDISK
 if is_linux; then
-       log_must wait_for_removal $TESTPOOL
+       log_must attempt_during_removal $TESTPOOL $REMOVEDISK zdb -cd $TESTPOOL
 else
-       log_must wait_for_removal $TESTPOOL zdb -cd $TESTPOOL
+       log_must attempt_during_removal $TESTPOOL $REMOVEDISK
 fi
 log_mustnot vdevs_in_pool $TESTPOOL $REMOVEDISK
 log_must zdb -cd $TESTPOOL
index b57f1777ce0c203fee84bdc5f79a01ad2cb30053..bf0c202ecbf291036e5947cc875d6a735d8a7438 100755 (executable)
@@ -15,7 +15,7 @@
 #
 
 #
-# Copyright (c) 2014, 2016 by Delphix. All rights reserved.
+# Copyright (c) 2014, 2017 by Delphix. All rights reserved.
 #
 
 . $STF_SUITE/include/libtest.shlib
@@ -48,12 +48,8 @@ log_must file_write -o create -f $TESTDIR/$TESTFILE1 -b $((2**20)) -c $((2**9))
 #
 start_random_writer $TESTDIR/$TESTFILE1
 
-callback_count=0
 function callback
 {
-       (( callback_count++ ))
-       (( callback_count == 1 )) || return 0
-
        # Attempt to write more than the new pool will be able to handle.
        file_write -o create -f $TESTDIR/$TESTFILE2 -b $((2**20)) -c $((2**9))
        zret=$?
@@ -62,7 +58,6 @@ function callback
        (( $zret == $ENOSPC )) || log_fail "Did not get ENOSPC during removal."
 }
 
-log_must zpool remove $TESTPOOL $REMOVEDISK
-log_must wait_for_removal $TESTPOOL callback
+log_must attempt_during_removal $TESTPOOL $REMOVEDISK callback
 
 log_pass "Removal properly sets reservation."
index e719a5ecc8fa460f254b14462d4d1bea895f1a42..7ec6c8675074ce88d0e88b236ee8a6956f5921b9 100755 (executable)
@@ -15,7 +15,7 @@
 #
 
 #
-# Copyright (c) 2014, 2016 by Delphix. All rights reserved.
+# Copyright (c) 2014, 2017 by Delphix. All rights reserved.
 #
 
 . $STF_SUITE/include/libtest.shlib
@@ -36,14 +36,10 @@ log_onexit cleanup
 
 function callback
 {
-       typeset count=$1
-       if ((count == 0)); then
-               log_mustnot zpool attach -f $TESTPOOL $TMPDIR/dsk1 $TMPDIR/dsk2
-               log_mustnot zpool add -f $TESTPOOL \
-                   raidz $TMPDIR/dsk1 $TMPDIR/dsk2
-               log_must zpool add -f $TESTPOOL $TMPDIR/dsk1
-       fi
-
+       log_mustnot zpool attach -f $TESTPOOL $TMPDIR/dsk1 $TMPDIR/dsk2
+       log_mustnot zpool add -f $TESTPOOL \
+           raidz $TMPDIR/dsk1 $TMPDIR/dsk2
+       log_must zpool add -f $TESTPOOL $TMPDIR/dsk1
        return 0
 }
 
index 403428290e83bc385b6975d2525b9b63922029c8..0872fd9faf4083d3685320e80dd8c016054ab2f1 100755 (executable)
@@ -15,7 +15,7 @@
 #
 
 #
-# Copyright (c) 2014, 2016 by Delphix. All rights reserved.
+# Copyright (c) 2014, 2017 by Delphix. All rights reserved.
 #
 
 . $STF_SUITE/include/libtest.shlib
@@ -26,11 +26,8 @@ log_onexit default_cleanup_noexit
 
 function callback
 {
-       typeset count=$1
-       if ((count == 0)); then
-               log_must zfs create $TESTPOOL/$TESTFS1
-               log_must zfs destroy $TESTPOOL/$TESTFS1
-       fi
+       log_must zfs create $TESTPOOL/$TESTFS1
+       log_must zfs destroy $TESTPOOL/$TESTFS1
        return 0
 }
 
index 38d6d53d4e459a43f4aed7ae071d40838e6fbc42..0ec358aadba939c6ade5fb4b6345664ab3d9c7f7 100755 (executable)
@@ -15,7 +15,7 @@
 #
 
 #
-# Copyright (c) 2014, 2016 by Delphix. All rights reserved.
+# Copyright (c) 2014, 2017 by Delphix. All rights reserved.
 #
 
 . $STF_SUITE/include/libtest.shlib
 default_setup_noexit "$DISKS"
 log_onexit default_cleanup_noexit
 
-function callback # count
+function callback
 {
-       typeset count=$1
-       if ((count == 0)); then
-               is_linux && test_removal_with_operation_kill
-               log_must zpool export $TESTPOOL
-
-               #
-               # We are concurrently starting dd processes that will
-               # create files in $TESTDIR.  These could cause the import
-               # to fail because it can't mount on the filesystem on a
-               # non-empty directory.  Therefore, remove the directory
-               # so that the dd process will fail.
-               #
-               log_must rm -rf $TESTDIR
-
-               log_must zpool import $TESTPOOL
-       fi
+       is_linux && test_removal_with_operation_kill
+       log_must zpool export $TESTPOOL
+
+       #
+       # We are concurrently starting dd processes that will
+       # create files in $TESTDIR.  These could cause the import
+       # to fail because it can't mount on the filesystem on a
+       # non-empty directory.  Therefore, remove the directory
+       # so that the dd process will fail.
+       #
+       log_must rm -rf $TESTDIR
+
+       log_must zpool import $TESTPOOL
        return 0
 }
 
index 63050a6479bc02f10e31e7c5708ecb1c8c3f30c4..d3a53e40b30ddaf23a198ff27bab4d661be35721 100755 (executable)
@@ -15,7 +15,7 @@
 #
 
 #
-# Copyright (c) 2015, 2016 by Delphix. All rights reserved.
+# Copyright (c) 2015, 2017 by Delphix. All rights reserved.
 #
 
 . $STF_SUITE/include/libtest.shlib
 default_setup_noexit "$DISKS"
 log_onexit default_cleanup_noexit
 
-function callback
-{
-       typeset count=$1
-       if ((count == 0)); then
-               zfs remap $TESTPOOL/$TESTFS
-       fi
-       return 0
-}
-
-test_removal_with_operation callback
+test_removal_with_operation zfs remap $TESTPOOL/$TESTFS
 
 log_pass "Can remap a filesystem during removal"
index fef7c293b943f5431a749079cbfc532b106836d7..df7bc671994a4b91d6461cbc2451ae7537b0ebbf 100755 (executable)
@@ -15,7 +15,7 @@
 #
 
 #
-# Copyright (c) 2014, 2016 by Delphix. All rights reserved.
+# Copyright (c) 2014, 2017 by Delphix. All rights reserved.
 #
 
 . $STF_SUITE/include/libtest.shlib
 default_setup_noexit "$DISKS"
 log_onexit default_cleanup_noexit
 
-function callback # count
+function callback
 {
-       typeset count=$1
-       if ((count == 0)); then
-               log_mustnot zpool remove $TESTPOOL $NOTREMOVEDISK
-       fi
+       log_mustnot zpool remove $TESTPOOL $NOTREMOVEDISK
        return 0
 }
 
index 33eb41bf2acd8634e628a6b30cc1652f88f5d78f..d96c1ce9de1573bef7be0f8192cfa63ed435c393 100755 (executable)
@@ -15,7 +15,7 @@
 #
 
 #
-# Copyright (c) 2014, 2016 by Delphix. All rights reserved.
+# Copyright (c) 2014, 2017 by Delphix. All rights reserved.
 #
 
 . $STF_SUITE/include/libtest.shlib
 default_setup_noexit "$DISKS"
 log_onexit default_cleanup_noexit
 
-function callback
-{
-       typeset count=$1
-       if ((count == 0)); then
-               log_must zpool scrub $TESTPOOL
-       fi
-       return 0
-}
-
-test_removal_with_operation callback
+test_removal_with_operation zpool scrub $TESTPOOL
 
 log_pass "Can use scrub during removal"
index c5a92505c4e7834179f42b8af8e6fae557c47d8e..59e66aca5256ea7546d6a8e02785a927a810dd1e 100755 (executable)
@@ -15,7 +15,7 @@
 #
 
 #
-# Copyright (c) 2014, 2016 by Delphix. All rights reserved.
+# Copyright (c) 2014, 2017 by Delphix. All rights reserved.
 #
 
 . $STF_SUITE/include/libtest.shlib
@@ -26,12 +26,9 @@ log_onexit default_cleanup_noexit
 
 function callback
 {
-       typeset count=$1
-       if ((count == 0)); then
-               create_snapshot $TESTPOOL/$TESTFS $TESTSNAP
-               log_must ksh -c \
-                   "zfs send $TESTPOOL/$TESTFS@$TESTSNAP >/dev/null"
-       fi
+       create_snapshot $TESTPOOL/$TESTFS $TESTSNAP
+       log_must ksh -c \
+           "zfs send $TESTPOOL/$TESTFS@$TESTSNAP >/dev/null"
        return 0
 }
 
index c7d1c8a89c40d0dc05ba437ec799d510ae3f1295..c4b5f7e7686f84b97908c4746aab86afed30848a 100755 (executable)
@@ -15,7 +15,7 @@
 #
 
 #
-# Copyright (c) 2014, 2016 by Delphix. All rights reserved.
+# Copyright (c) 2014, 2017 by Delphix. All rights reserved.
 #
 
 . $STF_SUITE/include/libtest.shlib
@@ -26,13 +26,10 @@ log_onexit default_cleanup_noexit
 
 function callback
 {
-       typeset count=$1
-       if ((count == 0)); then
-               create_snapshot $TESTPOOL/$TESTFS $TESTSNAP
-               log_must ksh -o pipefail -c \
-                   "zfs send $TESTPOOL/$TESTFS@$TESTSNAP | \
-                   zfs recv $TESTPOOL/$TESTFS1"
-       fi
+       create_snapshot $TESTPOOL/$TESTFS $TESTSNAP
+       log_must ksh -o pipefail -c \
+           "zfs send $TESTPOOL/$TESTFS@$TESTSNAP | \
+           zfs recv $TESTPOOL/$TESTFS1"
        return 0
 }
 
index 7fe36a94f15e28a588d37c2e8edb088a0ab4ef18..a4ec8ddfa6fa4b69e4973925336ea21620db63e9 100755 (executable)
@@ -15,7 +15,7 @@
 #
 
 #
-# Copyright (c) 2014 by Delphix. All rights reserved.
+# Copyright (c) 2014, 2017 by Delphix. All rights reserved.
 #
 
 . $STF_SUITE/include/libtest.shlib
@@ -26,11 +26,8 @@ log_onexit default_cleanup_noexit
 
 function callback
 {
-       typeset count=$1
-       if ((count == 0)); then
-               create_snapshot $TESTPOOL/$TESTFS $TESTSNAP
-               destroy_snapshot $TESTPOOL/$TESTFS@$TESTSNAP
-       fi
+       create_snapshot $TESTPOOL/$TESTFS $TESTSNAP
+       destroy_snapshot $TESTPOOL/$TESTFS@$TESTSNAP
        return 0
 }
 
index 1f609273c44f3a2990f6bb190487f0f783212d31..5c469259a9efcf0b9d2fc33de60f2b44817a5350 100755 (executable)
@@ -15,7 +15,7 @@
 #
 
 #
-# Copyright (c) 2014, 2016 by Delphix. All rights reserved.
+# Copyright (c) 2014, 2017 by Delphix. All rights reserved.
 #
 
 . $STF_SUITE/include/libtest.shlib