Add TRIM support
authorBrian Behlendorf <behlendorf1@llnl.gov>
Fri, 29 Mar 2019 16:13:20 +0000 (09:13 -0700)
committerGitHub <noreply@github.com>
Fri, 29 Mar 2019 16:13:20 +0000 (09:13 -0700)
UNMAP/TRIM support is a frequently-requested feature to help
prevent performance from degrading on SSDs and on various other
SAN-like storage back-ends.  By issuing UNMAP/TRIM commands for
sectors which are no longer allocated, the underlying device can
often manage itself more efficiently.

This TRIM implementation is modeled on the `zpool initialize`
feature which writes a pattern to all unallocated space in the
pool.  The new `zpool trim` command uses the same vdev_xlate()
code to calculate what sectors are unallocated, the same per-
vdev TRIM thread model and locking, and the same basic CLI for
a consistent user experience.  The core difference is that
instead of writing a pattern it will issue UNMAP/TRIM commands
for those extents.
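
As a usage sketch only (the pool and device names below are
illustrative, not part of this change), a manual TRIM might be
invoked as:

    # TRIM all unallocated space in the pool
    zpool trim tank

    # TRIM a single device, limited to roughly 100M/s
    zpool trim -r 100M tank sda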

The zio pipeline was updated to accommodate this by adding a new
ZIO_TYPE_TRIM type and associated spa taskq.  This new type makes
it straightforward to add the platform-specific TRIM/UNMAP calls
to vdev_disk.c and vdev_file.c.  These new ZIO_TYPE_TRIM zios are
handled largely the same way as ZIO_TYPE_READs or ZIO_TYPE_WRITEs.
This makes it possible to largely avoid changing the pipeline; the
one exception is that TRIM zios may exceed the 16M block size
limit since they contain no data.

In addition to the manual `zpool trim` command, a background
automatic TRIM was added and is controlled by the 'autotrim'
property.  It relies on the exact same infrastructure as the
manual TRIM.  However, instead of relying on the extents in a
metaslab's ms_allocatable range tree, a ms_trim tree is kept
per metaslab.  When 'autotrim=on', ranges added back to the
ms_allocatable tree are also added to the ms_trim tree.  The
ms_trim tree is then periodically consumed by an autotrim
thread which systematically walks a top level vdev's metaslabs.
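
For example, enabling and checking the new property might look
like this (the pool name 'tank' is illustrative):

    zpool set autotrim=on tank
    zpool get autotrim tank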

Since the automatic TRIM will skip ranges it considers too small,
there is value in occasionally running a full `zpool trim`.  This
may occur when the freed blocks are small and not enough time
was allowed to aggregate them.  An automatic TRIM and a manual
`zpool trim` may be run concurrently, in which case the automatic
TRIM will yield to the manual TRIM.
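
A sketch of suspending, resuming, and monitoring a manual TRIM
(again with an illustrative pool name):

    zpool trim -s tank     # suspend the in-progress TRIM
    zpool trim tank        # restart it later with no flags
    zpool status -t tank   # display per-vdev TRIM status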

Reviewed-by: Jorgen Lundman <lundman@lundman.net>
Reviewed-by: Tim Chase <tim@chase2k.com>
Reviewed-by: Matt Ahrens <mahrens@delphix.com>
Reviewed-by: George Wilson <george.wilson@delphix.com>
Reviewed-by: Serapheim Dimitropoulos <serapheim@delphix.com>
Contributions-by: Saso Kiselkov <saso.kiselkov@nexenta.com>
Contributions-by: Tim Chase <tim@chase2k.com>
Contributions-by: Chunwei Chen <tuxoko@gmail.com>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #8419
Closes #598

91 files changed:
cmd/zpool/zpool_main.c
cmd/ztest/ztest.c
config/kernel-blk-queue-discard.m4 [new file with mode: 0644]
config/kernel.m4
configure.ac
include/libzfs.h
include/libzfs_core.h
include/linux/blkdev_compat.h
include/spl/sys/Makefile.am
include/spl/sys/dkioc_free_util.h [deleted file]
include/sys/Makefile.am
include/sys/fs/zfs.h
include/sys/metaslab.h
include/sys/metaslab_impl.h
include/sys/spa.h
include/sys/spa_impl.h
include/sys/sysevent/eventdefs.h
include/sys/txg.h
include/sys/vdev.h
include/sys/vdev_impl.h
include/sys/vdev_initialize.h
include/sys/vdev_trim.h [new file with mode: 0644]
include/sys/zfs_context.h
include/sys/zfs_debug.h
include/sys/zio.h
include/sys/zio_impl.h
include/sys/zio_priority.h
lib/libzfs/libzfs_pool.c
lib/libzfs/libzfs_util.c
lib/libzfs_core/libzfs_core.c
lib/libzpool/Makefile.am
man/man5/zfs-module-parameters.5
man/man8/zpool.8
module/zcommon/zpool_prop.c
module/zfs/Makefile.in
module/zfs/dmu.c
module/zfs/metaslab.c
module/zfs/spa.c
module/zfs/spa_misc.c
module/zfs/spa_stats.c
module/zfs/txg.c
module/zfs/vdev.c
module/zfs/vdev_disk.c
module/zfs/vdev_file.c
module/zfs/vdev_initialize.c
module/zfs/vdev_label.c
module/zfs/vdev_queue.c
module/zfs/vdev_raidz.c
module/zfs/vdev_removal.c
module/zfs/vdev_trim.c [new file with mode: 0644]
module/zfs/zfs_ioctl.c
module/zfs/zfs_sysfs.c
module/zfs/zio.c
tests/runfiles/linux.run
tests/test-runner/bin/zts-report.py
tests/zfs-tests/cmd/libzfs_input_check/libzfs_input_check.c
tests/zfs-tests/include/libtest.shlib
tests/zfs-tests/tests/functional/Makefile.am
tests/zfs-tests/tests/functional/cli_root/Makefile.am
tests/zfs-tests/tests/functional/cli_root/zpool_get/zpool_get.cfg
tests/zfs-tests/tests/functional/cli_root/zpool_trim/Makefile.am [new file with mode: 0644]
tests/zfs-tests/tests/functional/cli_root/zpool_trim/cleanup.ksh [new file with mode: 0755]
tests/zfs-tests/tests/functional/cli_root/zpool_trim/setup.ksh [new file with mode: 0755]
tests/zfs-tests/tests/functional/cli_root/zpool_trim/zpool_trim.kshlib [new file with mode: 0644]
tests/zfs-tests/tests/functional/cli_root/zpool_trim/zpool_trim_attach_detach_add_remove.ksh [new file with mode: 0755]
tests/zfs-tests/tests/functional/cli_root/zpool_trim/zpool_trim_import_export.ksh [new file with mode: 0755]
tests/zfs-tests/tests/functional/cli_root/zpool_trim/zpool_trim_multiple.ksh [new file with mode: 0755]
tests/zfs-tests/tests/functional/cli_root/zpool_trim/zpool_trim_neg.ksh [new file with mode: 0755]
tests/zfs-tests/tests/functional/cli_root/zpool_trim/zpool_trim_offline_export_import_online.ksh [new file with mode: 0755]
tests/zfs-tests/tests/functional/cli_root/zpool_trim/zpool_trim_online_offline.ksh [new file with mode: 0755]
tests/zfs-tests/tests/functional/cli_root/zpool_trim/zpool_trim_partial.ksh [new file with mode: 0755]
tests/zfs-tests/tests/functional/cli_root/zpool_trim/zpool_trim_rate.ksh [new file with mode: 0755]
tests/zfs-tests/tests/functional/cli_root/zpool_trim/zpool_trim_rate_neg.ksh [new file with mode: 0755]
tests/zfs-tests/tests/functional/cli_root/zpool_trim/zpool_trim_secure.ksh [new file with mode: 0755]
tests/zfs-tests/tests/functional/cli_root/zpool_trim/zpool_trim_split.ksh [new file with mode: 0755]
tests/zfs-tests/tests/functional/cli_root/zpool_trim/zpool_trim_start_and_cancel_neg.ksh [new file with mode: 0755]
tests/zfs-tests/tests/functional/cli_root/zpool_trim/zpool_trim_start_and_cancel_pos.ksh [new file with mode: 0755]
tests/zfs-tests/tests/functional/cli_root/zpool_trim/zpool_trim_suspend_resume.ksh [new file with mode: 0755]
tests/zfs-tests/tests/functional/cli_root/zpool_trim/zpool_trim_unsupported_vdevs.ksh [new file with mode: 0755]
tests/zfs-tests/tests/functional/cli_root/zpool_trim/zpool_trim_verify_checksums.ksh [new file with mode: 0755]
tests/zfs-tests/tests/functional/cli_root/zpool_trim/zpool_trim_verify_trimmed.ksh [new file with mode: 0755]
tests/zfs-tests/tests/functional/trim/Makefile.am [new file with mode: 0644]
tests/zfs-tests/tests/functional/trim/autotrim_config.ksh [new file with mode: 0755]
tests/zfs-tests/tests/functional/trim/autotrim_integrity.ksh [new file with mode: 0755]
tests/zfs-tests/tests/functional/trim/autotrim_trim_integrity.ksh [new file with mode: 0755]
tests/zfs-tests/tests/functional/trim/cleanup.ksh [new file with mode: 0755]
tests/zfs-tests/tests/functional/trim/setup.ksh [new file with mode: 0755]
tests/zfs-tests/tests/functional/trim/trim.cfg [new file with mode: 0644]
tests/zfs-tests/tests/functional/trim/trim.kshlib [new file with mode: 0644]
tests/zfs-tests/tests/functional/trim/trim_config.ksh [new file with mode: 0755]
tests/zfs-tests/tests/functional/trim/trim_integrity.ksh [new file with mode: 0755]

index 3607656e0d5fe09bed454b9bfb2b2d481b79ebdd..f4670fd6235be8e7ca81486578617876f8a40344 100644 (file)
@@ -100,6 +100,7 @@ static int zpool_do_split(int, char **);
 static int zpool_do_initialize(int, char **);
 static int zpool_do_scrub(int, char **);
 static int zpool_do_resilver(int, char **);
+static int zpool_do_trim(int, char **);
 
 static int zpool_do_import(int, char **);
 static int zpool_do_export(int, char **);
@@ -154,6 +155,7 @@ typedef enum {
        HELP_INITIALIZE,
        HELP_SCRUB,
        HELP_RESILVER,
+       HELP_TRIM,
        HELP_STATUS,
        HELP_UPGRADE,
        HELP_EVENTS,
@@ -193,7 +195,7 @@ enum iostat_type {
  * of all the nvlists a flag requires.  Also specifies the order in
  * which data gets printed in zpool iostat.
  */
-static const char *vsx_type_to_nvlist[IOS_COUNT][11] = {
+static const char *vsx_type_to_nvlist[IOS_COUNT][13] = {
        [IOS_L_HISTO] = {
            ZPOOL_CONFIG_VDEV_TOT_R_LAT_HISTO,
            ZPOOL_CONFIG_VDEV_TOT_W_LAT_HISTO,
@@ -204,12 +206,14 @@ static const char *vsx_type_to_nvlist[IOS_COUNT][11] = {
            ZPOOL_CONFIG_VDEV_ASYNC_R_LAT_HISTO,
            ZPOOL_CONFIG_VDEV_ASYNC_W_LAT_HISTO,
            ZPOOL_CONFIG_VDEV_SCRUB_LAT_HISTO,
+           ZPOOL_CONFIG_VDEV_TRIM_LAT_HISTO,
            NULL},
        [IOS_LATENCY] = {
            ZPOOL_CONFIG_VDEV_TOT_R_LAT_HISTO,
            ZPOOL_CONFIG_VDEV_TOT_W_LAT_HISTO,
            ZPOOL_CONFIG_VDEV_DISK_R_LAT_HISTO,
            ZPOOL_CONFIG_VDEV_DISK_W_LAT_HISTO,
+           ZPOOL_CONFIG_VDEV_TRIM_LAT_HISTO,
            NULL},
        [IOS_QUEUES] = {
            ZPOOL_CONFIG_VDEV_SYNC_R_ACTIVE_QUEUE,
@@ -217,6 +221,7 @@ static const char *vsx_type_to_nvlist[IOS_COUNT][11] = {
            ZPOOL_CONFIG_VDEV_ASYNC_R_ACTIVE_QUEUE,
            ZPOOL_CONFIG_VDEV_ASYNC_W_ACTIVE_QUEUE,
            ZPOOL_CONFIG_VDEV_SCRUB_ACTIVE_QUEUE,
+           ZPOOL_CONFIG_VDEV_TRIM_ACTIVE_QUEUE,
            NULL},
        [IOS_RQ_HISTO] = {
            ZPOOL_CONFIG_VDEV_SYNC_IND_R_HISTO,
@@ -229,6 +234,8 @@ static const char *vsx_type_to_nvlist[IOS_COUNT][11] = {
            ZPOOL_CONFIG_VDEV_ASYNC_AGG_W_HISTO,
            ZPOOL_CONFIG_VDEV_IND_SCRUB_HISTO,
            ZPOOL_CONFIG_VDEV_AGG_SCRUB_HISTO,
+           ZPOOL_CONFIG_VDEV_IND_TRIM_HISTO,
+           ZPOOL_CONFIG_VDEV_AGG_TRIM_HISTO,
            NULL},
 };
 
@@ -281,8 +288,9 @@ static zpool_command_t command_table[] = {
        { "split",      zpool_do_split,         HELP_SPLIT              },
        { NULL },
        { "initialize", zpool_do_initialize,    HELP_INITIALIZE         },
-       { "scrub",      zpool_do_scrub,         HELP_SCRUB              },
        { "resilver",   zpool_do_resilver,      HELP_RESILVER           },
+       { "scrub",      zpool_do_scrub,         HELP_SCRUB              },
+       { "trim",       zpool_do_trim,          HELP_TRIM               },
        { NULL },
        { "import",     zpool_do_import,        HELP_IMPORT             },
        { "export",     zpool_do_export,        HELP_EXPORT             },
@@ -370,6 +378,9 @@ get_usage(zpool_help_t idx)
                return (gettext("\tscrub [-s | -p] <pool> ...\n"));
        case HELP_RESILVER:
                return (gettext("\tresilver <pool> ...\n"));
+       case HELP_TRIM:
+               return (gettext("\ttrim [-dp] [-r <rate>] [-c | -s] <pool> "
+                   "[<device> ...]\n"));
        case HELP_STATUS:
                return (gettext("\tstatus [-c [script1,script2,...]] "
                    "[-igLpPsvxD]  [-T d|u] [pool] ... \n"
@@ -410,8 +421,12 @@ zpool_collect_leaves(zpool_handle_t *zhp, nvlist_t *nvroot, nvlist_t *res)
            &child, &children);
 
        if (children == 0) {
-               char *path = zpool_vdev_name(g_zfs, zhp, nvroot, B_FALSE);
-               fnvlist_add_boolean(res, path);
+               char *path = zpool_vdev_name(g_zfs, zhp, nvroot,
+                   VDEV_NAME_PATH);
+
+               if (strcmp(path, VDEV_TYPE_INDIRECT) != 0)
+                       fnvlist_add_boolean(res, path);
+
                free(path);
                return;
        }
@@ -529,11 +544,11 @@ zpool_do_initialize(int argc, char **argv)
                {0, 0, 0, 0}
        };
 
-       pool_initialize_func_t cmd_type = POOL_INITIALIZE_DO;
+       pool_initialize_func_t cmd_type = POOL_INITIALIZE_START;
        while ((c = getopt_long(argc, argv, "cs", long_options, NULL)) != -1) {
                switch (c) {
                case 'c':
-                       if (cmd_type != POOL_INITIALIZE_DO &&
+                       if (cmd_type != POOL_INITIALIZE_START &&
                            cmd_type != POOL_INITIALIZE_CANCEL) {
                                (void) fprintf(stderr, gettext("-c cannot be "
                                    "combined with other options\n"));
@@ -542,7 +557,7 @@ zpool_do_initialize(int argc, char **argv)
                        cmd_type = POOL_INITIALIZE_CANCEL;
                        break;
                case 's':
-                       if (cmd_type != POOL_INITIALIZE_DO &&
+                       if (cmd_type != POOL_INITIALIZE_START &&
                            cmd_type != POOL_INITIALIZE_SUSPEND) {
                                (void) fprintf(stderr, gettext("-s cannot be "
                                    "combined with other options\n"));
@@ -585,8 +600,7 @@ zpool_do_initialize(int argc, char **argv)
                    ZPOOL_CONFIG_VDEV_TREE);
                zpool_collect_leaves(zhp, nvroot, vdevs);
        } else {
-               int i;
-               for (i = 1; i < argc; i++) {
+               for (int i = 1; i < argc; i++) {
                        fnvlist_add_boolean(vdevs, argv[i]);
                }
        }
@@ -1801,6 +1815,7 @@ typedef struct status_cbdata {
        boolean_t       cb_print_status;
        boolean_t       cb_print_slow_ios;
        boolean_t       cb_print_vdev_init;
+       boolean_t       cb_print_vdev_trim;
        vdev_cmd_data_list_t    *vcdl;
 } status_cbdata_t;
 
@@ -1869,6 +1884,109 @@ zpool_print_cmd(vdev_cmd_data_list_t *vcdl, const char *pool, char *path)
        }
 }
 
+/*
+ * Print vdev initialization status for leaves
+ */
+static void
+print_status_initialize(vdev_stat_t *vs, boolean_t verbose)
+{
+       if (verbose) {
+               if ((vs->vs_initialize_state == VDEV_INITIALIZE_ACTIVE ||
+                   vs->vs_initialize_state == VDEV_INITIALIZE_SUSPENDED ||
+                   vs->vs_initialize_state == VDEV_INITIALIZE_COMPLETE) &&
+                   !vs->vs_scan_removing) {
+                       char zbuf[1024];
+                       char tbuf[256];
+                       struct tm zaction_ts;
+
+                       time_t t = vs->vs_initialize_action_time;
+                       int initialize_pct = 100;
+                       if (vs->vs_initialize_state !=
+                           VDEV_INITIALIZE_COMPLETE) {
+                               initialize_pct = (vs->vs_initialize_bytes_done *
+                                   100 / (vs->vs_initialize_bytes_est + 1));
+                       }
+
+                       (void) localtime_r(&t, &zaction_ts);
+                       (void) strftime(tbuf, sizeof (tbuf), "%c", &zaction_ts);
+
+                       switch (vs->vs_initialize_state) {
+                       case VDEV_INITIALIZE_SUSPENDED:
+                               (void) snprintf(zbuf, sizeof (zbuf), ", %s %s",
+                                   gettext("suspended, started at"), tbuf);
+                               break;
+                       case VDEV_INITIALIZE_ACTIVE:
+                               (void) snprintf(zbuf, sizeof (zbuf), ", %s %s",
+                                   gettext("started at"), tbuf);
+                               break;
+                       case VDEV_INITIALIZE_COMPLETE:
+                               (void) snprintf(zbuf, sizeof (zbuf), ", %s %s",
+                                   gettext("completed at"), tbuf);
+                               break;
+                       }
+
+                       (void) printf(gettext("  (%d%% initialized%s)"),
+                           initialize_pct, zbuf);
+               } else {
+                       (void) printf(gettext("  (uninitialized)"));
+               }
+       } else if (vs->vs_initialize_state == VDEV_INITIALIZE_ACTIVE) {
+               (void) printf(gettext("  (initializing)"));
+       }
+}
+
+/*
+ * Print vdev TRIM status for leaves
+ */
+static void
+print_status_trim(vdev_stat_t *vs, boolean_t verbose)
+{
+       if (verbose) {
+               if ((vs->vs_trim_state == VDEV_TRIM_ACTIVE ||
+                   vs->vs_trim_state == VDEV_TRIM_SUSPENDED ||
+                   vs->vs_trim_state == VDEV_TRIM_COMPLETE) &&
+                   !vs->vs_scan_removing) {
+                       char zbuf[1024];
+                       char tbuf[256];
+                       struct tm zaction_ts;
+
+                       time_t t = vs->vs_trim_action_time;
+                       int trim_pct = 100;
+                       if (vs->vs_trim_state != VDEV_TRIM_COMPLETE) {
+                               trim_pct = (vs->vs_trim_bytes_done *
+                                   100 / (vs->vs_trim_bytes_est + 1));
+                       }
+
+                       (void) localtime_r(&t, &zaction_ts);
+                       (void) strftime(tbuf, sizeof (tbuf), "%c", &zaction_ts);
+
+                       switch (vs->vs_trim_state) {
+                       case VDEV_TRIM_SUSPENDED:
+                               (void) snprintf(zbuf, sizeof (zbuf), ", %s %s",
+                                   gettext("suspended, started at"), tbuf);
+                               break;
+                       case VDEV_TRIM_ACTIVE:
+                               (void) snprintf(zbuf, sizeof (zbuf), ", %s %s",
+                                   gettext("started at"), tbuf);
+                               break;
+                       case VDEV_TRIM_COMPLETE:
+                               (void) snprintf(zbuf, sizeof (zbuf), ", %s %s",
+                                   gettext("completed at"), tbuf);
+                               break;
+                       }
+
+                       (void) printf(gettext("  (%d%% trimmed%s)"),
+                           trim_pct, zbuf);
+               } else if (vs->vs_trim_notsup) {
+                       (void) printf(gettext("  (trim unsupported)"));
+               } else {
+                       (void) printf(gettext("  (untrimmed)"));
+               }
+       } else if (vs->vs_trim_state == VDEV_TRIM_ACTIVE) {
+               (void) printf(gettext("  (trimming)"));
+       }
+}
+
 /*
  * Print out configuration state as requested by status_callback.
  */
@@ -2049,52 +2167,10 @@ print_status_config(zpool_handle_t *zhp, status_cbdata_t *cb, const char *name,
                }
        }
 
-       /* Optionally display vdev initialization status for leaves */
-       if (cb->cb_print_vdev_init && children == 0) {
-               if ((vs->vs_initialize_state == VDEV_INITIALIZE_ACTIVE ||
-                   vs->vs_initialize_state == VDEV_INITIALIZE_SUSPENDED ||
-                   vs->vs_initialize_state == VDEV_INITIALIZE_COMPLETE) &&
-                   !vs->vs_scan_removing) {
-                       char zbuf[1024];
-                       char tbuf[256];
-                       struct tm zaction_ts;
-
-                       time_t t = vs->vs_initialize_action_time;
-                       int initialize_pct = 100;
-                       if (vs->vs_initialize_state !=
-                           VDEV_INITIALIZE_COMPLETE) {
-                               initialize_pct = (vs->vs_initialize_bytes_done *
-                                   100 / (vs->vs_initialize_bytes_est + 1));
-                       }
-
-                       (void) localtime_r(&t, &zaction_ts);
-                       (void) strftime(tbuf, sizeof (tbuf), "%c", &zaction_ts);
-
-                       switch (vs->vs_initialize_state) {
-                       case VDEV_INITIALIZE_SUSPENDED:
-                               (void) snprintf(zbuf, sizeof (zbuf),
-                                   ", suspended, started at %s", tbuf);
-                               break;
-                       case VDEV_INITIALIZE_ACTIVE:
-                               (void) snprintf(zbuf, sizeof (zbuf),
-                                   ", started at %s", tbuf);
-                               break;
-                       case VDEV_INITIALIZE_COMPLETE:
-                               (void) snprintf(zbuf, sizeof (zbuf),
-                                   ", completed at %s", tbuf);
-                               break;
-                       }
-
-                       (void) printf(gettext("  (%d%% initialized%s)"),
-                           initialize_pct, zbuf);
-               } else {
-                       (void) printf(gettext("  (uninitialized)"));
-               }
-       } else {
-               if (vs->vs_initialize_state == VDEV_INITIALIZE_ACTIVE &&
-                   children == 0) {
-                       (void) printf(gettext("  (initializing)"));
-               }
+       /* Display vdev initialization and trim status for leaves */
+       if (children == 0) {
+               print_status_initialize(vs, cb->cb_print_vdev_init);
+               print_status_trim(vs, cb->cb_print_vdev_trim);
        }
 
        (void) printf("\n");
@@ -3378,22 +3454,22 @@ typedef struct name_and_columns {
        unsigned int columns;   /* Center name to this number of columns */
 } name_and_columns_t;
 
-#define        IOSTAT_MAX_LABELS       11      /* Max number of labels on one line */
+#define        IOSTAT_MAX_LABELS       13      /* Max number of labels on one line */
 
 static const name_and_columns_t iostat_top_labels[][IOSTAT_MAX_LABELS] =
 {
        [IOS_DEFAULT] = {{"capacity", 2}, {"operations", 2}, {"bandwidth", 2},
            {NULL}},
        [IOS_LATENCY] = {{"total_wait", 2}, {"disk_wait", 2}, {"syncq_wait", 2},
-           {"asyncq_wait", 2}, {"scrub"}},
+           {"asyncq_wait", 2}, {"scrub", 1}, {"trim", 1}, {NULL}},
        [IOS_QUEUES] = {{"syncq_read", 2}, {"syncq_write", 2},
            {"asyncq_read", 2}, {"asyncq_write", 2}, {"scrubq_read", 2},
-           {NULL}},
+           {"trimq_write", 2}, {NULL}},
        [IOS_L_HISTO] = {{"total_wait", 2}, {"disk_wait", 2}, {"syncq_wait", 2},
            {"asyncq_wait", 2}, {NULL}},
        [IOS_RQ_HISTO] = {{"sync_read", 2}, {"sync_write", 2},
-           {"async_read", 2}, {"async_write", 2}, {"scrub", 2}, {NULL}},
-
+           {"async_read", 2}, {"async_write", 2}, {"scrub", 2},
+           {"trim", 2}, {NULL}},
 };
 
 /* Shorthand - if "columns" field not set, default to 1 column */
@@ -3402,13 +3478,14 @@ static const name_and_columns_t iostat_bottom_labels[][IOSTAT_MAX_LABELS] =
        [IOS_DEFAULT] = {{"alloc"}, {"free"}, {"read"}, {"write"}, {"read"},
            {"write"}, {NULL}},
        [IOS_LATENCY] = {{"read"}, {"write"}, {"read"}, {"write"}, {"read"},
-           {"write"}, {"read"}, {"write"}, {"wait"}, {NULL}},
+           {"write"}, {"read"}, {"write"}, {"wait"}, {"wait"}, {NULL}},
        [IOS_QUEUES] = {{"pend"}, {"activ"}, {"pend"}, {"activ"}, {"pend"},
-           {"activ"}, {"pend"}, {"activ"}, {"pend"}, {"activ"}, {NULL}},
+           {"activ"}, {"pend"}, {"activ"}, {"pend"}, {"activ"},
+           {"pend"}, {"activ"}, {NULL}},
        [IOS_L_HISTO] = {{"read"}, {"write"}, {"read"}, {"write"}, {"read"},
-           {"write"}, {"read"}, {"write"}, {"scrub"}, {NULL}},
+           {"write"}, {"read"}, {"write"}, {"scrub"}, {"trim"}, {NULL}},
        [IOS_RQ_HISTO] = {{"ind"}, {"agg"}, {"ind"}, {"agg"}, {"ind"}, {"agg"},
-           {"ind"}, {"agg"}, {"ind"}, {"agg"}, {NULL}},
+           {"ind"}, {"agg"}, {"ind"}, {"agg"}, {"ind"}, {"agg"}, {NULL}},
 };
 
 static const char *histo_to_title[] = {
@@ -3467,6 +3544,8 @@ default_column_width(iostat_cbdata_t *cb, enum iostat_type type)
                [IOS_DEFAULT] = 15, /* 1PB capacity */
                [IOS_LATENCY] = 10, /* 1B ns = 10sec */
                [IOS_QUEUES] = 6,   /* 1M queue entries */
+               [IOS_L_HISTO] = 10, /* 1B ns = 10sec */
+               [IOS_RQ_HISTO] = 6, /* 1M queue entries */
        };
 
        if (cb->cb_literal)
@@ -3489,7 +3568,7 @@ print_iostat_labels(iostat_cbdata_t *cb, unsigned int force_column_width,
     const name_and_columns_t labels[][IOSTAT_MAX_LABELS])
 {
        int i, idx, s;
-       unsigned int text_start, rw_column_width, spaces_to_end;
+       int text_start, rw_column_width, spaces_to_end;
        uint64_t flags = cb->cb_flags;
        uint64_t f;
        unsigned int column_width = force_column_width;
@@ -3513,8 +3592,10 @@ print_iostat_labels(iostat_cbdata_t *cb, unsigned int force_column_width,
                        rw_column_width = (column_width * columns) +
                            (2 * (columns - 1));
 
-                       text_start = (int)((rw_column_width)/columns -
-                           slen/columns);
+                       text_start = (int)((rw_column_width) / columns -
+                           slen / columns);
+                       if (text_start < 0)
+                               text_start = 0;
 
                        printf("  ");   /* Two spaces between columns */
 
@@ -3526,9 +3607,11 @@ print_iostat_labels(iostat_cbdata_t *cb, unsigned int force_column_width,
 
                        /* Print space after label to end of column */
                        spaces_to_end = rw_column_width - text_start - slen;
+                       if (spaces_to_end < 0)
+                               spaces_to_end = 0;
+
                        for (s = 0; s < spaces_to_end; s++)
                                printf(" ");
-
                }
        }
 }
@@ -4032,6 +4115,8 @@ print_iostat_queues(iostat_cbdata_t *cb, nvlist_t *oldnv,
                ZPOOL_CONFIG_VDEV_ASYNC_W_ACTIVE_QUEUE,
                ZPOOL_CONFIG_VDEV_SCRUB_PEND_QUEUE,
                ZPOOL_CONFIG_VDEV_SCRUB_ACTIVE_QUEUE,
+               ZPOOL_CONFIG_VDEV_TRIM_PEND_QUEUE,
+               ZPOOL_CONFIG_VDEV_TRIM_ACTIVE_QUEUE,
        };
 
        struct stat_array *nva;
@@ -4070,6 +4155,7 @@ print_iostat_latency(iostat_cbdata_t *cb, nvlist_t *oldnv,
                ZPOOL_CONFIG_VDEV_ASYNC_R_LAT_HISTO,
                ZPOOL_CONFIG_VDEV_ASYNC_W_LAT_HISTO,
                ZPOOL_CONFIG_VDEV_SCRUB_LAT_HISTO,
+               ZPOOL_CONFIG_VDEV_TRIM_LAT_HISTO,
        };
        struct stat_array *nva;
 
@@ -6695,6 +6781,126 @@ zpool_do_resilver(int argc, char **argv)
        return (for_each_pool(argc, argv, B_TRUE, NULL, scrub_callback, &cb));
 }
 
+/*
+ * zpool trim [-d] [-r <rate>] [-c | -s] <pool> [<device> ...]
+ *
+ *     -c              Cancel. Ends any in-progress trim.
+ *     -d              Secure trim.  Requires kernel and device support.
+ *     -r <rate>       Sets the TRIM rate in bytes (per second). Supports
+ *                     adding a multiplier suffix such as 'k' or 'm'.
+ *     -s              Suspend. TRIM can then be restarted with no flags.
+ */
+int
+zpool_do_trim(int argc, char **argv)
+{
+       struct option long_options[] = {
+               {"cancel",      no_argument,            NULL,   'c'},
+               {"secure",      no_argument,            NULL,   'd'},
+               {"rate",        required_argument,      NULL,   'r'},
+               {"suspend",     no_argument,            NULL,   's'},
+               {0, 0, 0, 0}
+       };
+
+       pool_trim_func_t cmd_type = POOL_TRIM_START;
+       uint64_t rate = 0;
+       boolean_t secure = B_FALSE;
+
+       int c;
+       while ((c = getopt_long(argc, argv, "cdr:s", long_options, NULL))
+           != -1) {
+               switch (c) {
+               case 'c':
+                       if (cmd_type != POOL_TRIM_START &&
+                           cmd_type != POOL_TRIM_CANCEL) {
+                               (void) fprintf(stderr, gettext("-c cannot be "
+                                   "combined with other options\n"));
+                               usage(B_FALSE);
+                       }
+                       cmd_type = POOL_TRIM_CANCEL;
+                       break;
+               case 'd':
+                       if (cmd_type != POOL_TRIM_START) {
+                               (void) fprintf(stderr, gettext("-d cannot be "
+                                   "combined with the -c or -s options\n"));
+                               usage(B_FALSE);
+                       }
+                       secure = B_TRUE;
+                       break;
+               case 'r':
+                       if (cmd_type != POOL_TRIM_START) {
+                               (void) fprintf(stderr, gettext("-r cannot be "
+                                   "combined with the -c or -s options\n"));
+                               usage(B_FALSE);
+                       }
+                       if (zfs_nicestrtonum(NULL, optarg, &rate) == -1) {
+                               (void) fprintf(stderr,
+                                   gettext("invalid value for rate\n"));
+                               usage(B_FALSE);
+                       }
+                       break;
+               case 's':
+                       if (cmd_type != POOL_TRIM_START &&
+                           cmd_type != POOL_TRIM_SUSPEND) {
+                               (void) fprintf(stderr, gettext("-s cannot be "
+                                   "combined with other options\n"));
+                               usage(B_FALSE);
+                       }
+                       cmd_type = POOL_TRIM_SUSPEND;
+                       break;
+               case '?':
+                       if (optopt != 0) {
+                               (void) fprintf(stderr,
+                                   gettext("invalid option '%c'\n"), optopt);
+                       } else {
+                               (void) fprintf(stderr,
+                                   gettext("invalid option '%s'\n"),
+                                   argv[optind - 1]);
+                       }
+                       usage(B_FALSE);
+               }
+       }
+
+       argc -= optind;
+       argv += optind;
+
+       if (argc < 1) {
+               (void) fprintf(stderr, gettext("missing pool name argument\n"));
+               usage(B_FALSE);
+               return (-1);
+       }
+
+       char *poolname = argv[0];
+       zpool_handle_t *zhp = zpool_open(g_zfs, poolname);
+       if (zhp == NULL)
+               return (-1);
+
+       trimflags_t trim_flags = {
+               .secure = secure,
+               .rate = rate,
+       };
+
+       nvlist_t *vdevs = fnvlist_alloc();
+       if (argc == 1) {
+               /* no individual leaf vdevs specified, so add them all */
+               nvlist_t *config = zpool_get_config(zhp, NULL);
+               nvlist_t *nvroot = fnvlist_lookup_nvlist(config,
+                   ZPOOL_CONFIG_VDEV_TREE);
+               zpool_collect_leaves(zhp, nvroot, vdevs);
+               trim_flags.fullpool = B_TRUE;
+       } else {
+               trim_flags.fullpool = B_FALSE;
+               for (int i = 1; i < argc; i++) {
+                       fnvlist_add_boolean(vdevs, argv[i]);
+               }
+       }
+
+       int error = zpool_trim(zhp, cmd_type, vdevs, &trim_flags);
+
+       fnvlist_free(vdevs);
+       zpool_close(zhp);
+
+       return (error);
+}
 
 /*
  * Print out detailed scrub status.
@@ -7551,7 +7757,7 @@ status_callback(zpool_handle_t *zhp, void *data)
 }
 
 /*
- * zpool status [-c [script1,script2,...]] [-igLpPsvx] [-T d|u] [pool] ...
+ * zpool status [-c [script1,script2,...]] [-igLpPstvx] [-T d|u] [pool] ...
  *              [interval [count]]
  *
  *     -c CMD  For each vdev, run command CMD
@@ -7564,6 +7770,7 @@ status_callback(zpool_handle_t *zhp, void *data)
  *     -v      Display complete error logs
  *     -x      Display only pools with potential problems
  *     -D      Display dedup status (undocumented)
+ *     -t      Display vdev TRIM status.
  *     -T      Display a timestamp in date(1) or Unix format
  *
  * Describes the health status of all pools or some subset.
@@ -7579,7 +7786,7 @@ zpool_do_status(int argc, char **argv)
        char *cmd = NULL;
 
        /* check options */
-       while ((c = getopt(argc, argv, "c:igLpPsvxDT:")) != -1) {
+       while ((c = getopt(argc, argv, "c:igLpPsvxDtT:")) != -1) {
                switch (c) {
                case 'c':
                        if (cmd != NULL) {
@@ -7632,6 +7839,9 @@ zpool_do_status(int argc, char **argv)
                case 'D':
                        cb.cb_dedup_stats = B_TRUE;
                        break;
+               case 't':
+                       cb.cb_print_vdev_trim = B_TRUE;
+                       break;
                case 'T':
                        get_timestamp_arg(*optarg);
                        break;
index 8cc6a6ff95bc326675dd2fcaf670efee2f0a6eb3..a1ab56bd2f52dd6290adb1fa32c091c4ef884155 100644 (file)
 #include <sys/vdev_impl.h>
 #include <sys/vdev_file.h>
 #include <sys/vdev_initialize.h>
+#include <sys/vdev_trim.h>
 #include <sys/spa_impl.h>
 #include <sys/metaslab_impl.h>
 #include <sys/dsl_prop.h>
@@ -374,6 +375,7 @@ ztest_func_t ztest_spa_upgrade;
 ztest_func_t ztest_device_removal;
 ztest_func_t ztest_spa_checkpoint_create_discard;
 ztest_func_t ztest_initialize;
+ztest_func_t ztest_trim;
 ztest_func_t ztest_fletcher;
 ztest_func_t ztest_fletcher_incr;
 ztest_func_t ztest_verify_dnode_bt;
@@ -427,6 +429,7 @@ ztest_info_t ztest_info[] = {
        ZTI_INIT(ztest_device_removal, 1, &zopt_sometimes),
        ZTI_INIT(ztest_spa_checkpoint_create_discard, 1, &zopt_rarely),
        ZTI_INIT(ztest_initialize, 1, &zopt_sometimes),
+       ZTI_INIT(ztest_trim, 1, &zopt_sometimes),
        ZTI_INIT(ztest_fletcher, 1, &zopt_rarely),
        ZTI_INIT(ztest_fletcher_incr, 1, &zopt_rarely),
        ZTI_INIT(ztest_verify_dnode_bt, 1, &zopt_sometimes),
@@ -4897,7 +4900,7 @@ ztest_dmu_read_write_zcopy(ztest_ds_t *zd, uint64_t id)
                        umem_free(bigcheck, bigsize);
                }
                if (i == 2) {
-                       txg_wait_open(dmu_objset_pool(os), 0);
+                       txg_wait_open(dmu_objset_pool(os), 0, B_TRUE);
                } else if (i == 3) {
                        txg_wait_synced(dmu_objset_pool(os), 0);
                }
@@ -5574,6 +5577,8 @@ ztest_spa_prop_get_set(ztest_ds_t *zd, uint64_t id)
        (void) ztest_spa_prop_set_uint64(ZPOOL_PROP_DEDUPDITTO,
            ZIO_DEDUPDITTO_MIN + ztest_random(ZIO_DEDUPDITTO_MIN));
 
+       (void) ztest_spa_prop_set_uint64(ZPOOL_PROP_AUTOTRIM, ztest_random(2));
+
        VERIFY0(spa_prop_get(ztest_spa, &props));
 
        if (ztest_opts.zo_verbose >= 6)
@@ -6484,7 +6489,7 @@ ztest_initialize(ztest_ds_t *zd, uint64_t id)
                        (void) printf("\n");
                }
                break;
-       case POOL_INITIALIZE_DO:
+       case POOL_INITIALIZE_START:
                if (ztest_opts.zo_verbose >= 4) {
                        (void) printf("Start initialize %s", path);
                        if (active && error == 0)
@@ -6507,6 +6512,82 @@ ztest_initialize(ztest_ds_t *zd, uint64_t id)
        mutex_exit(&ztest_vdev_lock);
 }
 
+/* ARGSUSED */
+void
+ztest_trim(ztest_ds_t *zd, uint64_t id)
+{
+       spa_t *spa = ztest_spa;
+       int error = 0;
+
+       mutex_enter(&ztest_vdev_lock);
+
+       spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
+
+       /* Random leaf vdev */
+       vdev_t *rand_vd = ztest_random_concrete_vdev_leaf(spa->spa_root_vdev);
+       if (rand_vd == NULL) {
+               spa_config_exit(spa, SCL_VDEV, FTAG);
+               mutex_exit(&ztest_vdev_lock);
+               return;
+       }
+
+       /*
+        * The random vdev we've selected may change as soon as we
+        * drop the spa_config_lock. We create local copies of things
+        * we're interested in.
+        */
+       uint64_t guid = rand_vd->vdev_guid;
+       char *path = strdup(rand_vd->vdev_path);
+       boolean_t active = rand_vd->vdev_trim_thread != NULL;
+
+       zfs_dbgmsg("vd %p, guid %llu", rand_vd, guid);
+       spa_config_exit(spa, SCL_VDEV, FTAG);
+
+       uint64_t cmd = ztest_random(POOL_TRIM_FUNCS);
+       uint64_t rate = 1 << ztest_random(30);
+       boolean_t partial = (ztest_random(5) > 0);
+       boolean_t secure = (ztest_random(5) > 0);
+
+       nvlist_t *vdev_guids = fnvlist_alloc();
+       nvlist_t *vdev_errlist = fnvlist_alloc();
+       fnvlist_add_uint64(vdev_guids, path, guid);
+       error = spa_vdev_trim(spa, vdev_guids, cmd, rate, partial,
+           secure, vdev_errlist);
+       fnvlist_free(vdev_guids);
+       fnvlist_free(vdev_errlist);
+
+       switch (cmd) {
+       case POOL_TRIM_CANCEL:
+               if (ztest_opts.zo_verbose >= 4) {
+                       (void) printf("Cancel TRIM %s", path);
+                       if (!active)
+                               (void) printf(" failed (no TRIM active)");
+                       (void) printf("\n");
+               }
+               break;
+       case POOL_TRIM_START:
+               if (ztest_opts.zo_verbose >= 4) {
+                       (void) printf("Start TRIM %s", path);
+                       if (active && error == 0)
+                               (void) printf(" failed (already active)");
+                       else if (error != 0)
+                               (void) printf(" failed (error %d)", error);
+                       (void) printf("\n");
+               }
+               break;
+       case POOL_TRIM_SUSPEND:
+               if (ztest_opts.zo_verbose >= 4) {
+                       (void) printf("Suspend TRIM %s", path);
+                       if (!active)
+                               (void) printf(" failed (no TRIM active)");
+                       (void) printf("\n");
+               }
+               break;
+       }
+       free(path);
+       mutex_exit(&ztest_vdev_lock);
+}
+
 /*
  * Verify pool integrity by running zdb.
  */
diff --git a/config/kernel-blk-queue-discard.m4 b/config/kernel-blk-queue-discard.m4
new file mode 100644 (file)
index 0000000..addbba8
--- /dev/null
@@ -0,0 +1,65 @@
+dnl #
+dnl # 2.6.32 - 4.x API,
+dnl #   blk_queue_discard()
+dnl #
+AC_DEFUN([ZFS_AC_KERNEL_BLK_QUEUE_DISCARD], [
+       AC_MSG_CHECKING([whether blk_queue_discard() is available])
+       ZFS_LINUX_TRY_COMPILE([
+               #include <linux/blkdev.h>
+       ],[
+               struct request_queue *q __attribute__ ((unused)) = NULL;
+               int value __attribute__ ((unused));
+
+               value = blk_queue_discard(q);
+       ],[
+               AC_MSG_RESULT(yes)
+               AC_DEFINE(HAVE_BLK_QUEUE_DISCARD, 1,
+                   [blk_queue_discard() is available])
+       ],[
+               AC_MSG_RESULT(no)
+       ])
+])
+
+dnl #
+dnl # 4.8 - 4.x API,
+dnl #   blk_queue_secure_erase()
+dnl #
+dnl # 2.6.36 - 4.7 API,
+dnl #   blk_queue_secdiscard()
+dnl #
+dnl # 2.6.x - 2.6.35 API,
+dnl #   Unsupported by kernel
+dnl #
+AC_DEFUN([ZFS_AC_KERNEL_BLK_QUEUE_SECURE_ERASE], [
+       AC_MSG_CHECKING([whether blk_queue_secure_erase() is available])
+       ZFS_LINUX_TRY_COMPILE([
+               #include <linux/blkdev.h>
+       ],[
+               struct request_queue *q __attribute__ ((unused)) = NULL;
+               int value __attribute__ ((unused));
+
+               value = blk_queue_secure_erase(q);
+       ],[
+               AC_MSG_RESULT(yes)
+               AC_DEFINE(HAVE_BLK_QUEUE_SECURE_ERASE, 1,
+                   [blk_queue_secure_erase() is available])
+       ],[
+               AC_MSG_RESULT(no)
+
+               AC_MSG_CHECKING([whether blk_queue_secdiscard() is available])
+               ZFS_LINUX_TRY_COMPILE([
+                       #include <linux/blkdev.h>
+               ],[
+                       struct request_queue *q __attribute__ ((unused)) = NULL;
+                       int value __attribute__ ((unused));
+
+                       value = blk_queue_secdiscard(q);
+               ],[
+                       AC_MSG_RESULT(yes)
+                       AC_DEFINE(HAVE_BLK_QUEUE_SECDISCARD, 1,
+                           [blk_queue_secdiscard() is available])
+               ],[
+                       AC_MSG_RESULT(no)
+               ])
+       ])
+])
index e4d0e3393b65071c3982d97e0ba4598ea521f2dc..f7d657e0ca3160a1809434812a338c332bb0d700 100644 (file)
@@ -164,6 +164,8 @@ AC_DEFUN([ZFS_AC_CONFIG_KERNEL], [
        ZFS_AC_KERNEL_IN_COMPAT_SYSCALL
        ZFS_AC_KERNEL_KTIME_GET_COARSE_REAL_TS64
        ZFS_AC_KERNEL_TOTALRAM_PAGES_FUNC
+       ZFS_AC_KERNEL_BLK_QUEUE_DISCARD
+       ZFS_AC_KERNEL_BLK_QUEUE_SECURE_ERASE
 
        AS_IF([test "$LINUX_OBJ" != "$LINUX"], [
                KERNEL_MAKE="$KERNEL_MAKE O=$LINUX_OBJ"
index 3d9fd2848e48576dc5af5789d2c7f219187e9744..db614084e37e3edbeef28f0430e85f41a0b46ac8 100644 (file)
@@ -263,6 +263,7 @@ AC_CONFIG_FILES([
        tests/zfs-tests/tests/functional/cli_root/zpool_split/Makefile
        tests/zfs-tests/tests/functional/cli_root/zpool_status/Makefile
        tests/zfs-tests/tests/functional/cli_root/zpool_sync/Makefile
+       tests/zfs-tests/tests/functional/cli_root/zpool_trim/Makefile
        tests/zfs-tests/tests/functional/cli_root/zpool_upgrade/Makefile
        tests/zfs-tests/tests/functional/cli_root/zpool_upgrade/blockfiles/Makefile
        tests/zfs-tests/tests/functional/cli_user/Makefile
@@ -328,6 +329,7 @@ AC_CONFIG_FILES([
        tests/zfs-tests/tests/functional/alloc_class/Makefile
        tests/zfs-tests/tests/functional/threadsappend/Makefile
        tests/zfs-tests/tests/functional/tmpfile/Makefile
+       tests/zfs-tests/tests/functional/trim/Makefile
        tests/zfs-tests/tests/functional/truncate/Makefile
        tests/zfs-tests/tests/functional/user_namespace/Makefile
        tests/zfs-tests/tests/functional/userquota/Makefile
index 3405bb99bb4fd4abe144ef5efa597cb30862abf3..b604f1194dc68940b6a2ffe39b80cef2b7b38b14 100644 (file)
@@ -143,6 +143,9 @@ typedef enum zfs_error {
        EZFS_INITIALIZING,      /* currently initializing */
        EZFS_NO_INITIALIZE,     /* no active initialize */
        EZFS_WRONG_PARENT,      /* invalid parent dataset (e.g ZVOL) */
+       EZFS_TRIMMING,          /* currently trimming */
+       EZFS_NO_TRIM,           /* no active trim */
+       EZFS_TRIM_NOTSUP,       /* device does not support trim */
        EZFS_UNKNOWN
 } zfs_error_t;
 
@@ -253,12 +256,26 @@ typedef struct splitflags {
        int name_flags;
 } splitflags_t;
 
+typedef struct trimflags {
+       /* requested vdevs are for the entire pool */
+       boolean_t fullpool;
+
+       /* request a secure trim, requires support from device */
+       boolean_t secure;
+
+       /* trim at the requested rate in bytes/second */
+       uint64_t rate;
+} trimflags_t;
+
 /*
  * Functions to manipulate pool and vdev state
  */
 extern int zpool_scan(zpool_handle_t *, pool_scan_func_t, pool_scrub_cmd_t);
 extern int zpool_initialize(zpool_handle_t *, pool_initialize_func_t,
     nvlist_t *);
+extern int zpool_trim(zpool_handle_t *, pool_trim_func_t, nvlist_t *,
+    trimflags_t *);
+
 extern int zpool_clear(zpool_handle_t *, const char *, nvlist_t *);
 extern int zpool_reguid(zpool_handle_t *);
 extern int zpool_reopen_one(zpool_handle_t *, void *);
index 264ce3fa02b0cbfc7bd2db91e4057098adcdb7a8..74a64d10777d8515524731cc95111b48fcae4fd6 100644 (file)
@@ -64,6 +64,8 @@ int lzc_unload_key(const char *);
 int lzc_change_key(const char *, uint64_t, nvlist_t *, uint8_t *, uint_t);
 int lzc_initialize(const char *, pool_initialize_func_t, nvlist_t *,
     nvlist_t **);
+int lzc_trim(const char *, pool_trim_func_t, uint64_t, boolean_t,
+    nvlist_t *, nvlist_t **);
 
 int lzc_snaprange_space(const char *, const char *, uint64_t *);
 
index 274552d5dc485c00aca50776c0c48f12d6ff9c25..084ea61ccc9a61b0384c6ea4833ab7e17fd29828 100644 (file)
@@ -608,6 +608,36 @@ blk_queue_discard_granularity(struct request_queue *q, unsigned int dg)
 #define        blk_queue_discard_granularity(x, dg)    ((void)0)
 #endif /* HAVE_DISCARD_GRANULARITY */
 
+/*
+ * 2.6.32 - 4.x API,
+ *   blk_queue_discard()
+ */
+#if !defined(HAVE_BLK_QUEUE_DISCARD)
+#define        blk_queue_discard(q)                    (0);
+#endif
+
+/*
+ * 4.8 - 4.x API,
+ *   blk_queue_secure_erase()
+ *
+ * 2.6.36 - 4.7 API,
+ *   blk_queue_secdiscard()
+ *
+ * 2.6.x - 2.6.35 API,
+ *   Unsupported by kernel
+ */
+static inline int
+blk_queue_discard_secure(struct request_queue *q)
+{
+#if defined(HAVE_BLK_QUEUE_SECURE_ERASE)
+       return (blk_queue_secure_erase(q));
+#elif defined(HAVE_BLK_QUEUE_SECDISCARD)
+       return (blk_queue_secdiscard(q));
+#else
+       return (0);
+#endif
+}
+
 /*
  * Default Linux IO Scheduler,
  * Setting the scheduler to noop will allow the Linux IO scheduler to
index e596ff3732f43ac31cca25e3077102298ef1224b..3b5b2755a2f2a070a08f8f05242ad495645ec471 100644 (file)
@@ -11,7 +11,6 @@ KERNEL_H = \
        $(top_srcdir)/include/spl/sys/ctype.h \
        $(top_srcdir)/include/spl/sys/debug.h \
        $(top_srcdir)/include/spl/sys/disp.h \
-       $(top_srcdir)/include/spl/sys/dkioc_free_util.h \
        $(top_srcdir)/include/spl/sys/dkio.h \
        $(top_srcdir)/include/spl/sys/errno.h \
        $(top_srcdir)/include/spl/sys/fcntl.h \
diff --git a/include/spl/sys/dkioc_free_util.h b/include/spl/sys/dkioc_free_util.h
deleted file mode 100644 (file)
index d519b2f..0000000
+++ /dev/null
@@ -1,58 +0,0 @@
-/*
- *  Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC.
- *  Copyright (C) 2007 The Regents of the University of California.
- *  Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
- *  Written by Brian Behlendorf <behlendorf1@llnl.gov>.
- *  UCRL-CODE-235197
- *
- *  This file is part of the SPL, Solaris Porting Layer.
- *  For details, see <http://zfsonlinux.org/>.
- *
- *  The SPL is free software; you can redistribute it and/or modify it
- *  under the terms of the GNU General Public License as published by the
- *  Free Software Foundation; either version 2 of the License, or (at your
- *  option) any later version.
- *
- *  The SPL is distributed in the hope that it will be useful, but WITHOUT
- *  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- *  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
- *  for more details.
- *
- *  You should have received a copy of the GNU General Public License along
- *  with the SPL.  If not, see <http://www.gnu.org/licenses/>.
- */
-
-#ifndef _SPL_DKIOC_UTIL_H
-#define        _SPL_DKIOC_UTIL_H
-
-#include <sys/dkio.h>
-
-typedef struct dkioc_free_list_ext_s {
-       uint64_t                dfle_start;
-       uint64_t                dfle_length;
-} dkioc_free_list_ext_t;
-
-typedef struct dkioc_free_list_s {
-       uint64_t                dfl_flags;
-       uint64_t                dfl_num_exts;
-       int64_t                 dfl_offset;
-
-       /*
-        * N.B. this is only an internal debugging API! This is only called
-        * from debug builds of sd for pre-release checking. Remove before GA!
-        */
-       void                    (*dfl_ck_func)(uint64_t, uint64_t, void *);
-       void                    *dfl_ck_arg;
-
-       dkioc_free_list_ext_t   dfl_exts[1];
-} dkioc_free_list_t;
-
-static inline void dfl_free(dkioc_free_list_t *dfl) {
-       vmem_free(dfl, DFL_SZ(dfl->dfl_num_exts));
-}
-
-static inline dkioc_free_list_t *dfl_alloc(uint64_t dfl_num_exts, int flags) {
-       return (vmem_zalloc(DFL_SZ(dfl_num_exts), flags));
-}
-
-#endif /* _SPL_DKIOC_UTIL_H */
index e6c82d113ccfa312ffc4e4c298d96c6ed911148a..31ffdfb4a772cf8da46c41a8410e2d9504a853e4 100644 (file)
@@ -100,6 +100,7 @@ COMMON_H = \
        $(top_srcdir)/include/sys/vdev_raidz.h \
        $(top_srcdir)/include/sys/vdev_raidz_impl.h \
        $(top_srcdir)/include/sys/vdev_removal.h \
+       $(top_srcdir)/include/sys/vdev_trim.h \
        $(top_srcdir)/include/sys/xvattr.h \
        $(top_srcdir)/include/sys/zap.h \
        $(top_srcdir)/include/sys/zap_impl.h \
index e49a58f43a2880c4ada5e884d1305c1b723b1963..bdc25ee9ffe971e0c4a8704102a1f95b9d718255 100644 (file)
@@ -244,6 +244,7 @@ typedef enum {
        ZPOOL_PROP_MULTIHOST,
        ZPOOL_PROP_CHECKPOINT,
        ZPOOL_PROP_LOAD_GUID,
+       ZPOOL_PROP_AUTOTRIM,
        ZPOOL_NUM_PROPS
 } zpool_prop_t;
 
@@ -635,6 +636,7 @@ typedef struct zpool_load_policy {
 #define        ZPOOL_CONFIG_VDEV_ASYNC_R_ACTIVE_QUEUE  "vdev_async_r_active_queue"
 #define        ZPOOL_CONFIG_VDEV_ASYNC_W_ACTIVE_QUEUE  "vdev_async_w_active_queue"
 #define        ZPOOL_CONFIG_VDEV_SCRUB_ACTIVE_QUEUE    "vdev_async_scrub_active_queue"
+#define        ZPOOL_CONFIG_VDEV_TRIM_ACTIVE_QUEUE     "vdev_async_trim_active_queue"
 
 /* Queue sizes */
 #define        ZPOOL_CONFIG_VDEV_SYNC_R_PEND_QUEUE     "vdev_sync_r_pend_queue"
@@ -642,6 +644,7 @@ typedef struct zpool_load_policy {
 #define        ZPOOL_CONFIG_VDEV_ASYNC_R_PEND_QUEUE    "vdev_async_r_pend_queue"
 #define        ZPOOL_CONFIG_VDEV_ASYNC_W_PEND_QUEUE    "vdev_async_w_pend_queue"
 #define        ZPOOL_CONFIG_VDEV_SCRUB_PEND_QUEUE      "vdev_async_scrub_pend_queue"
+#define        ZPOOL_CONFIG_VDEV_TRIM_PEND_QUEUE       "vdev_async_trim_pend_queue"
 
 /* Latency read/write histogram stats */
 #define        ZPOOL_CONFIG_VDEV_TOT_R_LAT_HISTO       "vdev_tot_r_lat_histo"
@@ -653,6 +656,7 @@ typedef struct zpool_load_policy {
 #define        ZPOOL_CONFIG_VDEV_ASYNC_R_LAT_HISTO     "vdev_async_r_lat_histo"
 #define        ZPOOL_CONFIG_VDEV_ASYNC_W_LAT_HISTO     "vdev_async_w_lat_histo"
 #define        ZPOOL_CONFIG_VDEV_SCRUB_LAT_HISTO       "vdev_scrub_histo"
+#define        ZPOOL_CONFIG_VDEV_TRIM_LAT_HISTO        "vdev_trim_histo"
 
 /* Request size histograms */
 #define        ZPOOL_CONFIG_VDEV_SYNC_IND_R_HISTO      "vdev_sync_ind_r_histo"
@@ -660,11 +664,13 @@ typedef struct zpool_load_policy {
 #define        ZPOOL_CONFIG_VDEV_ASYNC_IND_R_HISTO     "vdev_async_ind_r_histo"
 #define        ZPOOL_CONFIG_VDEV_ASYNC_IND_W_HISTO     "vdev_async_ind_w_histo"
 #define        ZPOOL_CONFIG_VDEV_IND_SCRUB_HISTO       "vdev_ind_scrub_histo"
+#define        ZPOOL_CONFIG_VDEV_IND_TRIM_HISTO        "vdev_ind_trim_histo"
 #define        ZPOOL_CONFIG_VDEV_SYNC_AGG_R_HISTO      "vdev_sync_agg_r_histo"
 #define        ZPOOL_CONFIG_VDEV_SYNC_AGG_W_HISTO      "vdev_sync_agg_w_histo"
 #define        ZPOOL_CONFIG_VDEV_ASYNC_AGG_R_HISTO     "vdev_async_agg_r_histo"
 #define        ZPOOL_CONFIG_VDEV_ASYNC_AGG_W_HISTO     "vdev_async_agg_w_histo"
 #define        ZPOOL_CONFIG_VDEV_AGG_SCRUB_HISTO       "vdev_agg_scrub_histo"
+#define        ZPOOL_CONFIG_VDEV_AGG_TRIM_HISTO        "vdev_agg_trim_histo"
 
 /* Number of slow IOs */
 #define        ZPOOL_CONFIG_VDEV_SLOW_IOS              "vdev_slow_ios"
@@ -777,6 +783,7 @@ typedef struct zpool_load_policy {
 #define        VDEV_ALLOC_BIAS_SPECIAL         "special"
 #define        VDEV_ALLOC_BIAS_DEDUP           "dedup"
 
+/* vdev initialize state */
 #define        VDEV_LEAF_ZAP_INITIALIZE_LAST_OFFSET    \
        "com.delphix:next_offset_to_initialize"
 #define        VDEV_LEAF_ZAP_INITIALIZE_STATE  \
@@ -784,6 +791,20 @@ typedef struct zpool_load_policy {
 #define        VDEV_LEAF_ZAP_INITIALIZE_ACTION_TIME    \
        "com.delphix:vdev_initialize_action_time"
 
+/* vdev TRIM state */
+#define        VDEV_LEAF_ZAP_TRIM_LAST_OFFSET  \
+       "org.zfsonlinux:next_offset_to_trim"
+#define        VDEV_LEAF_ZAP_TRIM_STATE        \
+       "org.zfsonlinux:vdev_trim_state"
+#define        VDEV_LEAF_ZAP_TRIM_ACTION_TIME  \
+       "org.zfsonlinux:vdev_trim_action_time"
+#define        VDEV_LEAF_ZAP_TRIM_RATE         \
+       "org.zfsonlinux:vdev_trim_rate"
+#define        VDEV_LEAF_ZAP_TRIM_PARTIAL      \
+       "org.zfsonlinux:vdev_trim_partial"
+#define        VDEV_LEAF_ZAP_TRIM_SECURE       \
+       "org.zfsonlinux:vdev_trim_secure"
+
 /*
  * This is needed in userland to report the minimum necessary device size.
  */
@@ -915,6 +936,7 @@ typedef enum zio_type {
        ZIO_TYPE_FREE,
        ZIO_TYPE_CLAIM,
        ZIO_TYPE_IOCTL,
+       ZIO_TYPE_TRIM,
        ZIO_TYPES
 } zio_type_t;
 
@@ -982,8 +1004,14 @@ typedef enum zpool_errata {
 
 /*
  * Vdev statistics.  Note: all fields should be 64-bit because this
- * is passed between kernel and userland as an nvlist uint64 array.
+ * is passed between kernel and user land as an nvlist uint64 array.
+ *
+ * The vs_ops[] and vs_bytes[] arrays must always be an array size of 6 in
+ * order to keep subsequent members at their known fixed offsets.  When
+ * adding a new field it must be added to the end the structure.
  */
+#define        VS_ZIO_TYPES    6
+
 typedef struct vdev_stat {
        hrtime_t        vs_timestamp;           /* time since vdev load */
        uint64_t        vs_state;               /* vdev state           */
@@ -993,8 +1021,8 @@ typedef struct vdev_stat {
        uint64_t        vs_dspace;              /* deflated capacity    */
        uint64_t        vs_rsize;               /* replaceable dev size */
        uint64_t        vs_esize;               /* expandable dev size */
-       uint64_t        vs_ops[ZIO_TYPES];      /* operation count      */
-       uint64_t        vs_bytes[ZIO_TYPES];    /* bytes read/written   */
+       uint64_t        vs_ops[VS_ZIO_TYPES];   /* operation count      */
+       uint64_t        vs_bytes[VS_ZIO_TYPES]; /* bytes read/written   */
        uint64_t        vs_read_errors;         /* read errors          */
        uint64_t        vs_write_errors;        /* write errors         */
        uint64_t        vs_checksum_errors;     /* checksum errors      */
@@ -1010,6 +1038,12 @@ typedef struct vdev_stat {
        uint64_t        vs_checkpoint_space;    /* checkpoint-consumed space */
        uint64_t        vs_resilver_deferred;   /* resilver deferred    */
        uint64_t        vs_slow_ios;            /* slow IOs */
+       uint64_t        vs_trim_errors;         /* trimming errors      */
+       uint64_t        vs_trim_notsup;         /* supported by device */
+       uint64_t        vs_trim_bytes_done;     /* bytes trimmed */
+       uint64_t        vs_trim_bytes_est;      /* total bytes to trim */
+       uint64_t        vs_trim_state;          /* vdev_trim_state_t */
+       uint64_t        vs_trim_action_time;    /* time_t */
 } vdev_stat_t;
 
 /*
@@ -1068,12 +1102,22 @@ typedef struct vdev_stat_ex {
  * Initialize functions.
  */
 typedef enum pool_initialize_func {
-       POOL_INITIALIZE_DO,
+       POOL_INITIALIZE_START,
        POOL_INITIALIZE_CANCEL,
        POOL_INITIALIZE_SUSPEND,
        POOL_INITIALIZE_FUNCS
 } pool_initialize_func_t;
 
+/*
+ * TRIM functions.
+ */
+typedef enum pool_trim_func {
+       POOL_TRIM_START,
+       POOL_TRIM_CANCEL,
+       POOL_TRIM_SUSPEND,
+       POOL_TRIM_FUNCS
+} pool_trim_func_t;
+
 /*
  * DDT statistics.  Note: all fields should be 64-bit because this
  * is passed between kernel and userland as an nvlist uint64 array.
@@ -1126,6 +1170,14 @@ typedef enum {
        VDEV_INITIALIZE_COMPLETE
 } vdev_initializing_state_t;
 
+typedef enum {
+       VDEV_TRIM_NONE,
+       VDEV_TRIM_ACTIVE,
+       VDEV_TRIM_CANCELED,
+       VDEV_TRIM_SUSPENDED,
+       VDEV_TRIM_COMPLETE,
+} vdev_trim_state_t;
+
 /*
  * nvlist name constants. Facilitate restricting snapshot iteration range for
  * the "list next snapshot" ioctl
@@ -1224,6 +1276,7 @@ typedef enum zfs_ioc {
        ZFS_IOC_POOL_CHECKPOINT,                /* 0x5a4d */
        ZFS_IOC_POOL_DISCARD_CHECKPOINT,        /* 0x5a4e */
        ZFS_IOC_POOL_INITIALIZE,                /* 0x5a4f */
+       ZFS_IOC_POOL_TRIM,                      /* 0x5a50 */
 
        /*
         * Linux - 3/64 numbers reserved.
@@ -1326,6 +1379,14 @@ typedef enum {
 #define        ZPOOL_INITIALIZE_COMMAND        "initialize_command"
 #define        ZPOOL_INITIALIZE_VDEVS          "initialize_vdevs"
 
+/*
+ * The following are names used when invoking ZFS_IOC_POOL_TRIM.
+ */
+#define        ZPOOL_TRIM_COMMAND              "trim_command"
+#define        ZPOOL_TRIM_VDEVS                "trim_vdevs"
+#define        ZPOOL_TRIM_RATE                 "trim_rate"
+#define        ZPOOL_TRIM_SECURE               "trim_secure"
+
 /*
  * Flags for ZFS_IOC_VDEV_SET_STATE
  */
index a513a647039c6e660565a200fad9f65e1e210f96..2790d06c71d26327067a65e23236588994414897 100644 (file)
@@ -120,6 +120,8 @@ void metaslab_group_alloc_decrement(spa_t *, uint64_t, void *, int, int,
     boolean_t);
 void metaslab_group_alloc_verify(spa_t *, const blkptr_t *, void *, int);
 void metaslab_recalculate_weight_and_sort(metaslab_t *);
+void metaslab_disable(metaslab_t *);
+void metaslab_enable(metaslab_t *, boolean_t);
 
 #ifdef __cplusplus
 }
index 676c5dd46bf30bed045996bc6f07f055041b370c..ca1104c1481977349be8b2ae8f95c4e41f243e20 100644 (file)
@@ -69,7 +69,7 @@ typedef enum trace_alloc_type {
        TRACE_ENOSPC            = -6ULL,
        TRACE_CONDENSING        = -7ULL,
        TRACE_VDEV_ERROR        = -8ULL,
-       TRACE_INITIALIZING      = -9ULL
+       TRACE_DISABLED          = -9ULL,
 } trace_alloc_type_t;
 
 #define        METASLAB_WEIGHT_PRIMARY         (1ULL << 63)
@@ -272,10 +272,10 @@ struct metaslab_group {
        uint64_t                mg_fragmentation;
        uint64_t                mg_histogram[RANGE_TREE_HISTOGRAM_SIZE];
 
-       int                     mg_ms_initializing;
-       boolean_t               mg_initialize_updating;
-       kmutex_t                mg_ms_initialize_lock;
-       kcondvar_t              mg_ms_initialize_cv;
+       int                     mg_ms_disabled;
+       boolean_t               mg_disabled_updating;
+       kmutex_t                mg_ms_disabled_lock;
+       kcondvar_t              mg_ms_disabled_cv;
 };
 
 /*
@@ -389,11 +389,24 @@ struct metaslab {
        range_tree_t    *ms_defer[TXG_DEFER_SIZE];
        range_tree_t    *ms_checkpointing; /* to add to the checkpoint */
 
+       /*
+        * The ms_trim tree is the set of allocatable segments which are
+        * eligible for trimming. (When the metaslab is loaded, it's a
+        * subset of ms_allocatable.)  It's kept in-core as long as the
+        * autotrim property is set and is not vacated when the metaslab
+        * is unloaded.  Its purpose is to aggregate freed ranges to
+        * facilitate efficient trimming.
+        */
+       range_tree_t    *ms_trim;
+
        boolean_t       ms_condensing;  /* condensing? */
        boolean_t       ms_condense_wanted;
        uint64_t        ms_condense_checked_txg;
 
-       uint64_t        ms_initializing; /* leaves initializing this ms */
+       /*
+        * The number of consumers which have disabled the metaslab.
+        */
+       uint64_t        ms_disabled;
 
        /*
         * We must always hold the ms_lock when modifying ms_loaded
index febf0e8f241b8f929491392c6e9c44da101d7814..343977b30a3829b8d1d15ac9dc6a15b60e24d567 100644 (file)
@@ -738,6 +738,24 @@ typedef enum spa_import_type {
        SPA_IMPORT_ASSEMBLE
 } spa_import_type_t;
 
+/*
+ * Send TRIM commands in-line during normal pool operation while deleting.
+ *     OFF: no
+ *     ON: yes
+ */
+typedef enum {
+       SPA_AUTOTRIM_OFF = 0,   /* default */
+       SPA_AUTOTRIM_ON
+} spa_autotrim_t;
+
+/*
+ * Reason TRIM command was issued, used internally for accounting purposes.
+ */
+typedef enum trim_type {
+       TRIM_TYPE_MANUAL = 0,
+       TRIM_TYPE_AUTO = 1,
+} trim_type_t;
+
 /* state manipulation functions */
 extern int spa_open(const char *pool, spa_t **, void *tag);
 extern int spa_open_rewind(const char *pool, spa_t **, void *tag,
@@ -764,15 +782,17 @@ extern void spa_inject_delref(spa_t *spa);
 extern void spa_scan_stat_init(spa_t *spa);
 extern int spa_scan_get_stats(spa_t *spa, pool_scan_stat_t *ps);
 
-#define        SPA_ASYNC_CONFIG_UPDATE 0x01
-#define        SPA_ASYNC_REMOVE        0x02
-#define        SPA_ASYNC_PROBE         0x04
-#define        SPA_ASYNC_RESILVER_DONE 0x08
-#define        SPA_ASYNC_RESILVER      0x10
-#define        SPA_ASYNC_AUTOEXPAND    0x20
-#define        SPA_ASYNC_REMOVE_DONE   0x40
-#define        SPA_ASYNC_REMOVE_STOP   0x80
-#define        SPA_ASYNC_INITIALIZE_RESTART    0x100
+#define        SPA_ASYNC_CONFIG_UPDATE                 0x01
+#define        SPA_ASYNC_REMOVE                        0x02
+#define        SPA_ASYNC_PROBE                         0x04
+#define        SPA_ASYNC_RESILVER_DONE                 0x08
+#define        SPA_ASYNC_RESILVER                      0x10
+#define        SPA_ASYNC_AUTOEXPAND                    0x20
+#define        SPA_ASYNC_REMOVE_DONE                   0x40
+#define        SPA_ASYNC_REMOVE_STOP                   0x80
+#define        SPA_ASYNC_INITIALIZE_RESTART            0x100
+#define        SPA_ASYNC_TRIM_RESTART                  0x200
+#define        SPA_ASYNC_AUTOTRIM_RESTART              0x400
 
 /*
  * Controls the behavior of spa_vdev_remove().
@@ -790,6 +810,8 @@ extern int spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare);
 extern boolean_t spa_vdev_remove_active(spa_t *spa);
 extern int spa_vdev_initialize(spa_t *spa, nvlist_t *nv, uint64_t cmd_type,
     nvlist_t *vdev_errlist);
+extern int spa_vdev_trim(spa_t *spa, nvlist_t *nv, uint64_t cmd_type,
+    uint64_t rate, boolean_t partial, boolean_t secure, nvlist_t *vdev_errlist);
 extern int spa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath);
 extern int spa_vdev_setfru(spa_t *spa, uint64_t guid, const char *newfru);
 extern int spa_vdev_split_mirror(spa_t *spa, char *newname, nvlist_t *config,
@@ -887,6 +909,7 @@ typedef struct spa_stats {
        spa_history_kstat_t     io_history;
        spa_history_list_t      mmp_history;
        spa_history_kstat_t     state;          /* pool state */
+       spa_history_kstat_t     iostats;
 } spa_stats_t;
 
 typedef enum txg_state {
@@ -905,6 +928,22 @@ typedef struct txg_stat {
        uint64_t                ndirty;
 } txg_stat_t;
 
+/* Assorted pool IO kstats */
+typedef struct spa_iostats {
+       kstat_named_t   trim_extents_written;
+       kstat_named_t   trim_bytes_written;
+       kstat_named_t   trim_extents_skipped;
+       kstat_named_t   trim_bytes_skipped;
+       kstat_named_t   trim_extents_failed;
+       kstat_named_t   trim_bytes_failed;
+       kstat_named_t   autotrim_extents_written;
+       kstat_named_t   autotrim_bytes_written;
+       kstat_named_t   autotrim_extents_skipped;
+       kstat_named_t   autotrim_bytes_skipped;
+       kstat_named_t   autotrim_extents_failed;
+       kstat_named_t   autotrim_bytes_failed;
+} spa_iostats_t;
+
 extern void spa_stats_init(spa_t *spa);
 extern void spa_stats_destroy(spa_t *spa);
 extern void spa_read_history_add(spa_t *spa, const zbookmark_phys_t *zb,
@@ -922,6 +961,10 @@ extern int spa_mmp_history_set(spa_t *spa, uint64_t mmp_kstat_id, int io_error,
 extern void spa_mmp_history_add(spa_t *spa, uint64_t txg, uint64_t timestamp,
     uint64_t mmp_delay, vdev_t *vd, int label, uint64_t mmp_kstat_id,
     int error);
+extern void spa_iostats_trim_add(spa_t *spa, trim_type_t type,
+    uint64_t extents_written, uint64_t bytes_written,
+    uint64_t extents_skipped, uint64_t bytes_skipped,
+    uint64_t extents_failed, uint64_t bytes_failed);
 
 /* Pool configuration locks */
 extern int spa_config_tryenter(spa_t *spa, int locks, void *tag, krw_t rw);
@@ -1005,6 +1048,7 @@ extern objset_t *spa_meta_objset(spa_t *spa);
 extern uint64_t spa_deadman_synctime(spa_t *spa);
 extern uint64_t spa_deadman_ziotime(spa_t *spa);
 extern uint64_t spa_dirty_data(spa_t *spa);
+extern spa_autotrim_t spa_get_autotrim(spa_t *spa);
 
 /* Miscellaneous support routines */
 extern void spa_load_failed(spa_t *spa, const char *fmt, ...);
index c3aaad6117005104f94ea7e099140174f5e04a22..66032d9aad7aec235b865c6507dd91cc320242f1 100644 (file)
@@ -378,6 +378,7 @@ struct spa {
        uint64_t        spa_deadman_ziotime;    /* deadman zio expiration */
        uint64_t        spa_all_vdev_zaps;      /* ZAP of per-vd ZAP obj #s */
        spa_avz_action_t        spa_avz_action; /* destroy/rebuild AVZ? */
+       uint64_t        spa_autotrim;           /* automatic background trim? */
        uint64_t        spa_errata;             /* errata issues detected */
        spa_stats_t     spa_stats;              /* assorted spa statistics */
        spa_keystore_t  spa_keystore;           /* loaded crypto keys */
index aa13bd5052c7fbdc4b3903037e9250704b8b6634..2067b355afb489477806956e453e427435874ea5 100644 (file)
@@ -118,6 +118,11 @@ extern "C" {
 #define        ESC_ZFS_BOOTFS_VDEV_ATTACH      "bootfs_vdev_attach"
 #define        ESC_ZFS_POOL_REGUID             "pool_reguid"
 #define        ESC_ZFS_HISTORY_EVENT           "history_event"
+#define        ESC_ZFS_TRIM_START              "trim_start"
+#define        ESC_ZFS_TRIM_FINISH             "trim_finish"
+#define        ESC_ZFS_TRIM_CANCEL             "trim_cancel"
+#define        ESC_ZFS_TRIM_RESUME             "trim_resume"
+#define        ESC_ZFS_TRIM_SUSPEND            "trim_suspend"
 
 /*
  * datalink subclass definitions.
index ed0e7297c2415f18c73a2852dee56f5487c2e65a..760d5208bf4a89cf59066c09f882fe4291b3e7b0 100644 (file)
@@ -90,10 +90,11 @@ extern void txg_wait_synced(struct dsl_pool *dp, uint64_t txg);
 /*
  * Wait until the given transaction group, or one after it, is
  * the open transaction group.  Try to make this happen as soon
- * as possible (eg. kick off any necessary syncs immediately).
- * If txg == 0, wait for the next open txg.
+ * as possible (eg. kick off any necessary syncs immediately) when
+ * should_quiesce is set.  If txg == 0, wait for the next open txg.
  */
-extern void txg_wait_open(struct dsl_pool *dp, uint64_t txg);
+extern void txg_wait_open(struct dsl_pool *dp, uint64_t txg,
+    boolean_t should_quiesce);
 
 /*
  * Returns TRUE if we are "backed up" waiting for the syncing
index 2091892b27dae9af49420a4227ad2de4bcbf0444..67ca0d11614740f5153e5d6910e0111fe0732c11 100644 (file)
@@ -95,6 +95,8 @@ extern void vdev_metaslab_set_size(vdev_t *);
 extern void vdev_expand(vdev_t *vd, uint64_t txg);
 extern void vdev_split(vdev_t *vd);
 extern void vdev_deadman(vdev_t *vd, char *tag);
+extern void vdev_xlate(vdev_t *vd, const range_seg_t *logical_rs,
+    range_seg_t *physical_rs);
 
 extern void vdev_get_stats_ex(vdev_t *vd, vdev_stat_t *vs, vdev_stat_ex_t *vsx);
 extern void vdev_get_stats(vdev_t *vd, vdev_stat_t *vs);
index c115a5e100f0841f59d7d3dcc62bce4d0fc71163..f6f7bbb4b28452f1c1e9f1b617ad336667bbe819 100644 (file)
@@ -145,6 +145,7 @@ struct vdev_queue {
        avl_tree_t      vq_active_tree;
        avl_tree_t      vq_read_offset_tree;
        avl_tree_t      vq_write_offset_tree;
+       avl_tree_t      vq_trim_offset_tree;
        uint64_t        vq_last_offset;
        hrtime_t        vq_io_complete_ts; /* time last i/o completed */
        hrtime_t        vq_io_delta_ts;
@@ -260,6 +261,7 @@ struct vdev {
        /* pool checkpoint related */
        space_map_t     *vdev_checkpoint_sm;    /* contains reserved blocks */
 
+       /* Initialize related */
        boolean_t       vdev_initialize_exit_wanted;
        vdev_initializing_state_t       vdev_initialize_state;
        list_node_t     vdev_initialize_node;
@@ -274,10 +276,34 @@ struct vdev {
        uint64_t        vdev_initialize_bytes_done;
        time_t          vdev_initialize_action_time;    /* start and end time */
 
-       /* for limiting outstanding I/Os */
+       /* TRIM related */
+       boolean_t       vdev_trim_exit_wanted;
+       boolean_t       vdev_autotrim_exit_wanted;
+       vdev_trim_state_t       vdev_trim_state;
+       list_node_t     vdev_trim_node;
+       kmutex_t        vdev_autotrim_lock;
+       kcondvar_t      vdev_autotrim_cv;
+       kthread_t       *vdev_autotrim_thread;
+       /* Protects vdev_trim_thread and vdev_trim_state. */
+       kmutex_t        vdev_trim_lock;
+       kcondvar_t      vdev_trim_cv;
+       kthread_t       *vdev_trim_thread;
+       uint64_t        vdev_trim_offset[TXG_SIZE];
+       uint64_t        vdev_trim_last_offset;
+       uint64_t        vdev_trim_bytes_est;
+       uint64_t        vdev_trim_bytes_done;
+       uint64_t        vdev_trim_rate;         /* requested rate (bytes/sec) */
+       uint64_t        vdev_trim_partial;      /* requested partial TRIM */
+       uint64_t        vdev_trim_secure;       /* requested secure TRIM */
+       time_t          vdev_trim_action_time;  /* start and end time */
+
+       /* for limiting outstanding I/Os (initialize and TRIM) */
        kmutex_t        vdev_initialize_io_lock;
        kcondvar_t      vdev_initialize_io_cv;
        uint64_t        vdev_initialize_inflight;
+       kmutex_t        vdev_trim_io_lock;
+       kcondvar_t      vdev_trim_io_cv;
+       uint64_t        vdev_trim_inflight[2];
 
        /*
         * Values stored in the config for an indirect or removing vdev.
@@ -343,6 +369,8 @@ struct vdev {
        uint64_t        vdev_not_present; /* not present during import  */
        uint64_t        vdev_unspare;   /* unspare when resilvering done */
        boolean_t       vdev_nowritecache; /* true if flushwritecache failed */
+       boolean_t       vdev_has_trim;  /* TRIM is supported            */
+       boolean_t       vdev_has_securetrim; /* secure TRIM is supported */
        boolean_t       vdev_checkremove; /* temporary online test      */
        boolean_t       vdev_forcefault; /* force online fault          */
        boolean_t       vdev_splitting; /* split or repair in progress  */
index 319fb9bc083368b6a3c094ae5686df47a0224fca..81d39ebebcb260dafdf56336bbaa410a98ffeae2 100644 (file)
@@ -39,8 +39,6 @@ extern void vdev_initialize_stop_all(vdev_t *vd,
     vdev_initializing_state_t tgt_state);
 extern void vdev_initialize_stop_wait(spa_t *spa, list_t *vd_list);
 extern void vdev_initialize_restart(vdev_t *vd);
-extern void vdev_xlate(vdev_t *vd, const range_seg_t *logical_rs,
-    range_seg_t *physical_rs);
 
 #ifdef __cplusplus
 }
diff --git a/include/sys/vdev_trim.h b/include/sys/vdev_trim.h
new file mode 100644 (file)
index 0000000..1e54017
--- /dev/null
@@ -0,0 +1,52 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2019 Lawrence Livermore National Security, LLC.
+ */
+
+#ifndef _SYS_VDEV_TRIM_H
+#define        _SYS_VDEV_TRIM_H
+
+#include <sys/spa.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+extern unsigned int zfs_trim_metaslab_skip;
+
+extern void vdev_trim(vdev_t *vd, uint64_t rate, boolean_t partial,
+    boolean_t secure);
+extern void vdev_trim_stop(vdev_t *vd, vdev_trim_state_t tgt, list_t *vd_list);
+extern void vdev_trim_stop_all(vdev_t *vd, vdev_trim_state_t tgt_state);
+extern void vdev_trim_stop_wait(spa_t *spa, list_t *vd_list);
+extern void vdev_trim_restart(vdev_t *vd);
+extern void vdev_autotrim(spa_t *spa);
+extern void vdev_autotrim_stop_all(spa_t *spa);
+extern void vdev_autotrim_stop_wait(vdev_t *vd);
+extern void vdev_autotrim_restart(spa_t *spa);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_VDEV_TRIM_H */
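
A rough, hedged sketch of how a caller might start a manual TRIM on one leaf
vdev using the declarations above (modeled on the analogous initialize path;
the locking shown assumes vdev_trim_lock protects vdev_trim_thread and
vdev_trim_state as documented in vdev_impl.h, and a rate of 0 is taken to mean
unrestricted; this is not code from this commit):

        /*
         * Sketch only: start a manual TRIM of a single leaf vdev.  Assumes
         * the caller already holds the appropriate spa config locks, as the
         * initialize path does.
         */
        mutex_enter(&vd->vdev_trim_lock);
        if (vd->vdev_trim_thread == NULL && !vd->vdev_top->vdev_removing)
                vdev_trim(vd, 0, B_FALSE, B_FALSE); /* rate=0, partial/secure off */
        mutex_exit(&vd->vdev_trim_lock);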
index 260b8a458d73e92279ce694ad4737ad48827a7a8..87ddde30a3bfe074e88348593b22648a4cb9902c 100644 (file)
@@ -579,6 +579,8 @@ typedef struct vsecattr {
 
 #define        CRCREAT         0
 
+#define        F_FREESP        11
+
 extern int fop_getattr(vnode_t *vp, vattr_t *vap);
 
 #define        VOP_CLOSE(vp, f, c, o, cr, ct)  vn_close(vp)
@@ -587,6 +589,16 @@ extern int fop_getattr(vnode_t *vp, vattr_t *vap);
 
 #define        VOP_FSYNC(vp, f, cr, ct)        fsync((vp)->v_fd)
 
+#if defined(HAVE_FILE_FALLOCATE) && \
+       defined(FALLOC_FL_PUNCH_HOLE) && \
+       defined(FALLOC_FL_KEEP_SIZE)
+#define        VOP_SPACE(vp, cmd, flck, fl, off, cr, ct) \
+       fallocate((vp)->v_fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, \
+           (flck)->l_start, (flck)->l_len)
+#else
+#define        VOP_SPACE(vp, cmd, flck, fl, off, cr, ct) (0)
+#endif
+
 #define        VN_RELE(vp)     vn_close(vp)
 
 extern int vn_open(char *path, int x1, int oflags, int mode, vnode_t **vpp,
index 7564ae0e432e0689a83215032df54d95c61ca6e3..7968a01cd4d3f04af9c8dd0b45f02ee7d12b7ea8 100644 (file)
@@ -54,6 +54,7 @@ extern int zfs_dbgmsg_enable;
 #define        ZFS_DEBUG_METASLAB_VERIFY       (1 << 8)
 #define        ZFS_DEBUG_SET_ERROR             (1 << 9)
 #define        ZFS_DEBUG_INDIRECT_REMAP        (1 << 10)
+#define        ZFS_DEBUG_TRIM                  (1 << 11)
 
 extern void __zfs_dbgmsg(char *buf);
 extern void __dprintf(boolean_t dprint, const char *file, const char *func,
index 4b7ad3e227e30d9bdf8faff4d107479c41599a10..e69bf9208039d4a45f30d3164e439384ccf51786 100644 (file)
@@ -416,6 +416,14 @@ typedef zio_t *zio_pipe_stage_t(zio_t *zio);
 #define        ZIO_REEXECUTE_NOW       0x01
 #define        ZIO_REEXECUTE_SUSPEND   0x02
 
+/*
+ * The io_trim flags are used to specify the type of TRIM to perform.  They
+ * only apply to ZIO_TYPE_TRIM zios and are distinct from io_flags.
+ */
+enum trim_flag {
+       ZIO_TRIM_SECURE         = 1 << 0,
+};
+
 typedef struct zio_alloc_list {
        list_t  zal_list;
        uint64_t zal_size;
@@ -434,6 +442,7 @@ struct zio {
        zio_prop_t      io_prop;
        zio_type_t      io_type;
        enum zio_child  io_child_type;
+       enum trim_flag  io_trim_flags;
        int             io_cmd;
        zio_priority_t  io_priority;
        uint8_t         io_reexecute;
@@ -549,6 +558,10 @@ extern zio_t *zio_claim(zio_t *pio, spa_t *spa, uint64_t txg,
 extern zio_t *zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd,
     zio_done_func_t *done, void *private, enum zio_flag flags);
 
+extern zio_t *zio_trim(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
+    zio_done_func_t *done, void *private, zio_priority_t priority,
+    enum zio_flag flags, enum trim_flag trim_flags);
+
 extern zio_t *zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset,
     uint64_t size, struct abd *data, int checksum,
     zio_done_func_t *done, void *private, zio_priority_t priority,
index 344048c6a6343ba8dfae63a9097d0b909e8a9958..fbbe06eb04f895f655dba2f65edc6da8d0af1d01 100644 (file)
@@ -250,6 +250,11 @@ enum zio_stage {
        ZIO_STAGE_VDEV_IO_START |               \
        ZIO_STAGE_VDEV_IO_ASSESS)
 
+#define        ZIO_TRIM_PIPELINE                       \
+       (ZIO_INTERLOCK_STAGES |                 \
+       ZIO_STAGE_ISSUE_ASYNC |                 \
+       ZIO_VDEV_IO_STAGES)
+
 #define        ZIO_BLOCKING_STAGES                     \
        (ZIO_STAGE_DVA_ALLOCATE |               \
        ZIO_STAGE_DVA_CLAIM |                   \
index d8e6a1745969fd199f8bd4b691fddae40b5655ad..0b422904ec5a4120f624a084ce0e778e3c91a81b 100644 (file)
@@ -30,6 +30,7 @@ typedef enum zio_priority {
        ZIO_PRIORITY_SCRUB,             /* asynchronous scrub/resilver reads */
        ZIO_PRIORITY_REMOVAL,           /* reads/writes for vdev removal */
        ZIO_PRIORITY_INITIALIZING,      /* initializing I/O */
+       ZIO_PRIORITY_TRIM,              /* trim I/O (discard) */
        ZIO_PRIORITY_NUM_QUEUEABLE,
        ZIO_PRIORITY_NOW,               /* non-queued i/os (e.g. free) */
 } zio_priority_t;
index f799471e435128e43ba16cdcac07afa723df7ed1..6c797d06b6ce834765858d90acc9a6ba53bff955 100644 (file)
@@ -2092,6 +2092,57 @@ zpool_import_props(libzfs_handle_t *hdl, nvlist_t *config, const char *newname,
        return (ret);
 }
 
+/*
+ * Translate vdev names to guids.  If a vdev_path is determined to be
+ * unsuitable then a vd_errlist is allocated and the vdev path and errno
+ * are added to it.
+ */
+static int
+zpool_translate_vdev_guids(zpool_handle_t *zhp, nvlist_t *vds,
+    nvlist_t *vdev_guids, nvlist_t *guids_to_paths, nvlist_t **vd_errlist)
+{
+       nvlist_t *errlist = NULL;
+       int error = 0;
+
+       for (nvpair_t *elem = nvlist_next_nvpair(vds, NULL); elem != NULL;
+           elem = nvlist_next_nvpair(vds, elem)) {
+               boolean_t spare, cache;
+
+               char *vd_path = nvpair_name(elem);
+               nvlist_t *tgt = zpool_find_vdev(zhp, vd_path, &spare, &cache,
+                   NULL);
+
+               if ((tgt == NULL) || cache || spare) {
+                       if (errlist == NULL) {
+                               errlist = fnvlist_alloc();
+                               error = EINVAL;
+                       }
+
+                       uint64_t err = (tgt == NULL) ? EZFS_NODEVICE :
+                           (spare ? EZFS_ISSPARE : EZFS_ISL2CACHE);
+                       fnvlist_add_int64(errlist, vd_path, err);
+                       continue;
+               }
+
+               uint64_t guid = fnvlist_lookup_uint64(tgt, ZPOOL_CONFIG_GUID);
+               fnvlist_add_uint64(vdev_guids, vd_path, guid);
+
+               char msg[MAXNAMELEN];
+               (void) snprintf(msg, sizeof (msg), "%llu", (u_longlong_t)guid);
+               fnvlist_add_string(guids_to_paths, msg, vd_path);
+       }
+
+       if (error != 0) {
+               verify(errlist != NULL);
+               if (vd_errlist != NULL)
+                       *vd_errlist = errlist;
+               else
+                       fnvlist_free(errlist);
+       }
+
+       return (error);
+}
+
 static int
 xlate_init_err(int err)
 {
@@ -2118,72 +2169,152 @@ zpool_initialize(zpool_handle_t *zhp, pool_initialize_func_t cmd_type,
     nvlist_t *vds)
 {
        char msg[1024];
-       libzfs_handle_t *hdl = zhp->zpool_hdl;
-
-       nvlist_t *errlist;
+       int err;
 
-       /* translate vdev names to guids */
        nvlist_t *vdev_guids = fnvlist_alloc();
        nvlist_t *guids_to_paths = fnvlist_alloc();
-       boolean_t spare, cache;
-       nvlist_t *tgt;
+       nvlist_t *vd_errlist = NULL;
+       nvlist_t *errlist;
        nvpair_t *elem;
 
-       for (elem = nvlist_next_nvpair(vds, NULL); elem != NULL;
-           elem = nvlist_next_nvpair(vds, elem)) {
-               char *vd_path = nvpair_name(elem);
-               tgt = zpool_find_vdev(zhp, vd_path, &spare, &cache, NULL);
+       err = zpool_translate_vdev_guids(zhp, vds, vdev_guids,
+           guids_to_paths, &vd_errlist);
 
-               if ((tgt == NULL) || cache || spare) {
-                       (void) snprintf(msg, sizeof (msg),
-                           dgettext(TEXT_DOMAIN, "cannot initialize '%s'"),
-                           vd_path);
-                       int err = (tgt == NULL) ? EZFS_NODEVICE :
-                           (spare ? EZFS_ISSPARE : EZFS_ISL2CACHE);
+       if (err == 0) {
+               err = lzc_initialize(zhp->zpool_name, cmd_type,
+                   vdev_guids, &errlist);
+               if (err == 0) {
                        fnvlist_free(vdev_guids);
                        fnvlist_free(guids_to_paths);
-                       return (zfs_error(hdl, err, msg));
+                       return (0);
                }
 
-               uint64_t guid = fnvlist_lookup_uint64(tgt, ZPOOL_CONFIG_GUID);
-               fnvlist_add_uint64(vdev_guids, vd_path, guid);
+               if (errlist != NULL) {
+                       vd_errlist = fnvlist_lookup_nvlist(errlist,
+                           ZPOOL_INITIALIZE_VDEVS);
+               }
 
-               (void) snprintf(msg, sizeof (msg), "%llu", (u_longlong_t)guid);
-               fnvlist_add_string(guids_to_paths, msg, vd_path);
+               (void) snprintf(msg, sizeof (msg),
+                   dgettext(TEXT_DOMAIN, "operation failed"));
+       } else {
+               verify(vd_errlist != NULL);
+       }
+
+       for (elem = nvlist_next_nvpair(vd_errlist, NULL); elem != NULL;
+           elem = nvlist_next_nvpair(vd_errlist, elem)) {
+               int64_t vd_error = xlate_init_err(fnvpair_value_int64(elem));
+               char *path;
+
+               if (nvlist_lookup_string(guids_to_paths, nvpair_name(elem),
+                   &path) != 0)
+                       path = nvpair_name(elem);
+
+               (void) zfs_error_fmt(zhp->zpool_hdl, vd_error,
+                   "cannot initialize '%s'", path);
        }
 
-       int err = lzc_initialize(zhp->zpool_name, cmd_type, vdev_guids,
-           &errlist);
        fnvlist_free(vdev_guids);
+       fnvlist_free(guids_to_paths);
 
-       if (err == 0) {
-               fnvlist_free(guids_to_paths);
-               return (0);
+       if (vd_errlist != NULL) {
+               fnvlist_free(vd_errlist);
+               return (-1);
+       }
+
+       return (zpool_standard_error(zhp->zpool_hdl, err, msg));
+}
+
+static int
+xlate_trim_err(int err)
+{
+       switch (err) {
+       case ENODEV:
+               return (EZFS_NODEVICE);
+       case EINVAL:
+       case EROFS:
+               return (EZFS_BADDEV);
+       case EBUSY:
+               return (EZFS_TRIMMING);
+       case ESRCH:
+               return (EZFS_NO_TRIM);
+       case EOPNOTSUPP:
+               return (EZFS_TRIM_NOTSUP);
        }
+       return (err);
+}
+
+/*
+ * Begin, suspend, or cancel the TRIM (discarding of all free blocks) for
+ * the given vdevs in the given pool.
+ */
+int
+zpool_trim(zpool_handle_t *zhp, pool_trim_func_t cmd_type, nvlist_t *vds,
+    trimflags_t *trim_flags)
+{
+       char msg[1024];
+       int err;
 
+       nvlist_t *vdev_guids = fnvlist_alloc();
+       nvlist_t *guids_to_paths = fnvlist_alloc();
        nvlist_t *vd_errlist = NULL;
-       if (errlist != NULL) {
-               vd_errlist = fnvlist_lookup_nvlist(errlist,
-                   ZPOOL_INITIALIZE_VDEVS);
+       nvlist_t *errlist;
+       nvpair_t *elem;
+
+       err = zpool_translate_vdev_guids(zhp, vds, vdev_guids,
+           guids_to_paths, &vd_errlist);
+       if (err == 0) {
+               err = lzc_trim(zhp->zpool_name, cmd_type, trim_flags->rate,
+                   trim_flags->secure, vdev_guids, &errlist);
+               if (err == 0) {
+                       fnvlist_free(vdev_guids);
+                       fnvlist_free(guids_to_paths);
+                       return (0);
+               }
+
+               if (errlist != NULL) {
+                       vd_errlist = fnvlist_lookup_nvlist(errlist,
+                           ZPOOL_TRIM_VDEVS);
+               }
+
+               (void) snprintf(msg, sizeof (msg),
+                   dgettext(TEXT_DOMAIN, "operation failed"));
+       } else {
+               verify(vd_errlist != NULL);
        }
 
-       (void) snprintf(msg, sizeof (msg),
-           dgettext(TEXT_DOMAIN, "operation failed"));
+       for (elem = nvlist_next_nvpair(vd_errlist, NULL);
+           elem != NULL; elem = nvlist_next_nvpair(vd_errlist, elem)) {
+               int64_t vd_error = xlate_trim_err(fnvpair_value_int64(elem));
+               char *path;
 
-       for (elem = nvlist_next_nvpair(vd_errlist, NULL); elem != NULL;
-           elem = nvlist_next_nvpair(vd_errlist, elem)) {
-               int64_t vd_error = xlate_init_err(fnvpair_value_int64(elem));
-               char *path = fnvlist_lookup_string(guids_to_paths,
-                   nvpair_name(elem));
-               (void) zfs_error_fmt(hdl, vd_error, "cannot initialize '%s'",
-                   path);
+               /*
+                * If only the pool was specified, and it was not a secure
+                * trim then suppress warnings for individual vdevs which
+                * do not support trimming.
+                */
+               if (vd_error == EZFS_TRIM_NOTSUP &&
+                   trim_flags->fullpool &&
+                   !trim_flags->secure) {
+                       continue;
+               }
+
+               if (nvlist_lookup_string(guids_to_paths, nvpair_name(elem),
+                   &path) != 0)
+                       path = nvpair_name(elem);
+
+               (void) zfs_error_fmt(zhp->zpool_hdl, vd_error,
+                   "cannot trim '%s'", path);
        }
 
+       fnvlist_free(vdev_guids);
        fnvlist_free(guids_to_paths);
-       if (vd_errlist != NULL)
+
+       if (vd_errlist != NULL) {
+               fnvlist_free(vd_errlist);
                return (-1);
+       }
 
-       return (zpool_standard_error(hdl, err, msg));
+       return (zpool_standard_error(zhp->zpool_hdl, err, msg));
 }
 
 /*
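
For orientation, a hedged sketch of how the new libzfs entry point might be
called.  The trimflags_t field names (fullpool, secure, rate) are those
referenced in zpool_trim() above; the pool handle zhp and the vdev path are
placeholders, and <libzfs.h> plus a zpool_open() call are assumed:

        /* Sketch only: trim a single named vdev at an unrestricted rate. */
        trimflags_t flags = { 0 };
        flags.fullpool = B_FALSE;       /* a specific vdev is named below */
        flags.secure = B_FALSE;
        flags.rate = 0;                 /* no rate limit requested */

        nvlist_t *vds = fnvlist_alloc();
        fnvlist_add_boolean(vds, "/dev/sda");   /* vdev path used as the key */

        int err = zpool_trim(zhp, POOL_TRIM_START, vds, &flags);
        fnvlist_free(vds);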
index 4ed88588092271061336b8d210fc37911cf674e7..23dcb11bdab48ef655b48ee86d7d8759a7eb8a3a 100644 (file)
@@ -292,6 +292,13 @@ libzfs_error_description(libzfs_handle_t *hdl)
                    "initialization"));
        case EZFS_WRONG_PARENT:
                return (dgettext(TEXT_DOMAIN, "invalid parent dataset"));
+       case EZFS_TRIMMING:
+               return (dgettext(TEXT_DOMAIN, "currently trimming"));
+       case EZFS_NO_TRIM:
+               return (dgettext(TEXT_DOMAIN, "there is no active trim"));
+       case EZFS_TRIM_NOTSUP:
+               return (dgettext(TEXT_DOMAIN, "trim operations are not "
+                   "supported by this device"));
        case EZFS_UNKNOWN:
                return (dgettext(TEXT_DOMAIN, "unknown error"));
        default:
index 6bbe76e0a426c4b405a2d158133d497639da6bd8..e03c194826c872e98afd87330b361b92f8e48c43 100644 (file)
@@ -1412,7 +1412,8 @@ lzc_reopen(const char *pool_name, boolean_t scrub_restart)
  *     - ENODEV if the device was not found
  *     - EINVAL if the devices is not a leaf or is not concrete (e.g. missing)
  *     - EROFS if the device is not writeable
- *     - EBUSY start requested but the device is already being initialized
+ *     - EBUSY start requested but the device is already being either
+ *             initialized or trimmed
  *     - ESRCH cancel/suspend requested but device is not being initialized
  *
  * If the errlist is empty, then return value will be:
@@ -1425,6 +1426,7 @@ lzc_initialize(const char *poolname, pool_initialize_func_t cmd_type,
     nvlist_t *vdevs, nvlist_t **errlist)
 {
        int error;
+
        nvlist_t *args = fnvlist_alloc();
        fnvlist_add_uint64(args, ZPOOL_INITIALIZE_COMMAND, (uint64_t)cmd_type);
        fnvlist_add_nvlist(args, ZPOOL_INITIALIZE_VDEVS, vdevs);
@@ -1435,3 +1437,45 @@ lzc_initialize(const char *poolname, pool_initialize_func_t cmd_type,
 
        return (error);
 }
+
+/*
+ * Changes TRIM state.
+ *
+ * vdevs should be a list of (<key>, guid) where guid is a uint64 vdev GUID.
+ * The key is ignored.
+ *
+ * If there are errors related to vdev arguments, per-vdev errors are returned
+ * in an nvlist with the key "vdevs". Each error is a (guid, errno) pair where
+ * guid is stringified with PRIu64, and errno is one of the following as
+ * an int64_t:
+ *     - ENODEV if the device was not found
+ *     - EINVAL if the device is not a leaf or is not concrete (e.g. missing)
+ *     - EROFS if the device is not writeable
+ *     - EBUSY start requested but the device is already being either trimmed
+ *             or initialized
+ *     - ESRCH cancel/suspend requested but device is not being trimmed
+ *     - EOPNOTSUPP if the device does not support TRIM (or secure TRIM)
+ *
+ * If the errlist is empty, then return value will be:
+ *     - EINVAL if one or more arguments was invalid
+ *     - Other spa_open failures
+ *     - 0 if the operation succeeded
+ */
+int
+lzc_trim(const char *poolname, pool_trim_func_t cmd_type, uint64_t rate,
+    boolean_t secure, nvlist_t *vdevs, nvlist_t **errlist)
+{
+       int error;
+
+       nvlist_t *args = fnvlist_alloc();
+       fnvlist_add_uint64(args, ZPOOL_TRIM_COMMAND, (uint64_t)cmd_type);
+       fnvlist_add_nvlist(args, ZPOOL_TRIM_VDEVS, vdevs);
+       fnvlist_add_uint64(args, ZPOOL_TRIM_RATE, rate);
+       fnvlist_add_boolean_value(args, ZPOOL_TRIM_SECURE, secure);
+
+       error = lzc_ioctl(ZFS_IOC_POOL_TRIM, poolname, args, errlist);
+
+       fnvlist_free(args);
+
+       return (error);
+}
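
A hedged usage sketch for the wrapper above; per its comment the nvpair names
are ignored and only the uint64 vdev GUID values matter.  The pool name and
GUID are placeholders, and libzfs_core_init() is assumed to have been called:

        /* Sketch only: start trimming one vdev, identified by GUID. */
        nvlist_t *vdevs = fnvlist_alloc();
        nvlist_t *errlist = NULL;
        fnvlist_add_uint64(vdevs, "vdev", 0x1234abcdULL); /* placeholder GUID */

        int err = lzc_trim("tank", POOL_TRIM_START, 0 /* rate */,
            B_FALSE /* secure */, vdevs, &errlist);

        fnvlist_free(vdevs);
        if (errlist != NULL)
                fnvlist_free(errlist);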
index e13bb0f58d39d072ca31b1512f4236400e940503..91f47503a3bb67939dc7854877883f7d96078bf0 100644 (file)
@@ -130,6 +130,7 @@ KERNEL_C = \
        vdev_raidz_math_ssse3.c \
        vdev_removal.c \
        vdev_root.c \
+       vdev_trim.c \
        zap.c \
        zap_leaf.c \
        zap_micro.c \
index c1994f34012245c91c9fa7b38e96f5330558391a..a1a586df1c164ce26d01504448f9a27be942e7ba 100644 (file)
@@ -14,7 +14,7 @@
 .\" CDDL HEADER, with the fields enclosed by brackets "[]" replaced with your
 .\" own identifying information:
 .\" Portions Copyright [yyyy] [name of copyright owner]
-.TH ZFS-MODULE-PARAMETERS 5 "Feb 8, 2019"
+.TH ZFS-MODULE-PARAMETERS 5 "Feb 15, 2019"
 .SH NAME
 zfs\-module\-parameters \- ZFS module parameters
 .SH DESCRIPTION
@@ -1532,6 +1532,30 @@ See the section "ZFS I/O SCHEDULER".
 Default value: \fB10\fR.
 .RE
 
+.sp
+.ne 2
+.na
+\fBzfs_vdev_trim_max_active\fR (int)
+.ad
+.RS 12n
+Maximum trim/discard I/Os active to each device.
+See the section "ZFS I/O SCHEDULER".
+.sp
+Default value: \fB2\fR.
+.RE
+
+.sp
+.ne 2
+.na
+\fBzfs_vdev_trim_min_active\fR (int)
+.ad
+.RS 12n
+Minimum trim/discard I/Os active to each device.
+See the section "ZFS I/O SCHEDULER".
+.sp
+Default value: \fB1\fR.
+.RE
+
 .sp
 .ne 2
 .na
@@ -1619,6 +1643,12 @@ _
 _
 512    ZFS_DEBUG_SET_ERROR
        Enable SET_ERROR and dprintf entries in the debug log.
+_
+1024   ZFS_DEBUG_INDIRECT_REMAP
+       Verify split blocks created by device removal.
+_
+2048   ZFS_DEBUG_TRIM
+       Verify TRIM ranges are always within the allocatable range tree.
 .TE
 .sp
 * Requires debug build.
@@ -2341,6 +2371,82 @@ value of 75% will create a maximum of one thread per cpu.
 Default value: \fB75\fR%.
 .RE
 
+.sp
+.ne 2
+.na
+\fBzfs_trim_extent_bytes_max\fR (unsigned int)
+.ad
+.RS 12n
+Maximum size of a TRIM command.  Ranges larger than this will be split into
+chunks no larger than \fBzfs_trim_extent_bytes_max\fR bytes before being
+issued to the device.
+.sp
+Default value: \fB134,217,728\fR.
+.RE
+
+.sp
+.ne 2
+.na
+\fBzfs_trim_extent_bytes_min\fR (unsigned int)
+.ad
+.RS 12n
+Minimum size of a TRIM command.  TRIM ranges smaller than this will be skipped
+unless they're part of a larger range which was broken into chunks.  This is
+done because it's common for these small TRIMs to negatively impact overall
+performance.  This value can be set to 0 to TRIM all unallocated space.
+.sp
+Default value: \fB32,768\fR.
+.RE
+
+.sp
+.ne 2
+.na
+\fBzfs_trim_metaslab_skip\fR (unsigned int)
+.ad
+.RS 12n
+Skip uninitialized metaslabs during the TRIM process.  This option is useful
+for pools constructed from large thinly-provisioned devices where TRIM
+operations are slow.  As a pool ages, an increasing fraction of the pool's
+metaslabs will be initialized, progressively degrading the usefulness of
+this option.  This setting is stored when starting a manual TRIM and will
+persist for the duration of the requested TRIM.
+.sp
+Default value: \fB0\fR.
+.RE
+
+.sp
+.ne 2
+.na
+\fBzfs_trim_queue_limit\fR (unsigned int)
+.ad
+.RS 12n
+Maximum number of queued TRIMs outstanding per leaf vdev.  The number of
+concurrent TRIM commands issued to the device is controlled by the
+\fBzfs_vdev_trim_min_active\fR and \fBzfs_vdev_trim_max_active\fR module
+options.
+.sp
+Default value: \fB10\fR.
+.RE
+
+.sp
+.ne 2
+.na
+\fBzfs_trim_txg_batch\fR (unsigned int)
+.ad
+.RS 12n
+The number of transaction groups worth of frees which should be aggregated
+before TRIM operations are issued to the device.  This setting represents a
+trade-off between issuing larger, more efficient TRIM operations and the
+delay before the recently trimmed space is available for use by the device.
+.sp
+Increasing this value will allow frees to be aggregated for a longer time.
+This will result in larger TRIM operations and potentially increased memory
+usage.  Decreasing this value will have the opposite effect.  The default
+value of 32 was determined to be a reasonable compromise.
+.sp
+Default value: \fB32\fR.
+.RE
+
 .sp
 .ne 2
 .na
@@ -2364,6 +2470,19 @@ Flush dirty data to disk at least every N seconds (maximum txg duration)
 Default value: \fB5\fR.
 .RE
 
+.sp
+.ne 2
+.na
+\fBzfs_vdev_aggregate_trim\fR (int)
+.ad
+.RS 12n
+Allow TRIM I/Os to be aggregated.  This is normally not helpful because
+the extents to be trimmed will already have been aggregated by the
+metaslab.  This option is provided for debugging and performance analysis.
+.sp
+Default value: \fB0\fR.
+.RE
+
 .sp
 .ne 2
 .na
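
Taken together, the two extent-size tunables above describe a simple
split-or-skip policy for each free extent.  An illustrative sketch only (not
code from this change; the helper name is hypothetical):

        /* Illustration only: split-or-skip policy for a free extent. */
        static uint64_t
        trim_commands_for_extent(uint64_t size, uint64_t extent_bytes_max,
            uint64_t extent_bytes_min)
        {
                if (size < extent_bytes_min)
                        return (0);     /* too small, skipped entirely */

                /* larger extents are issued as chunks of at most extent_bytes_max */
                return ((size + extent_bytes_max - 1) / extent_bytes_max);
        }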
index abc272ab323f797b7d3b52c4043dc57736899c37..fd478bdaf3bd892f36d8202f80c7e974661b7a3d 100644 (file)
 .Op Fl s | Fl p
 .Ar pool Ns ...
 .Nm
+.Cm trim
+.Op Fl d
+.Op Fl r Ar rate
+.Op Fl c | Fl s
+.Ar pool
+.Op Ar device Ns ...
+.Nm
 .Cm set
 .Ar property Ns = Ns Ar value
 .Ar pool
 .Nm
 .Cm status
 .Oo Fl c Ar SCRIPT Oc
-.Op Fl DigLpPsvx
+.Op Fl DigLpPstvx
 .Op Fl T Sy u Ns | Ns Sy d
 .Oo Ar pool Oc Ns ...
 .Op Ar interval Op Ar count
@@ -806,6 +813,28 @@ Any write requests that have yet to be committed to disk would be blocked.
 .It Sy panic
 Prints out a message to the console and generates a system crash dump.
 .El
+.It Sy autotrim Ns = Ns Sy on Ns | Ns Sy off
+When set to
+.Sy on
+space which has been recently freed, and is no longer allocated by the pool,
+will be periodically trimmed.  This allows block device vdevs which support
+BLKDISCARD, such as SSDs, or file vdevs on which the underlying file system
+supports hole-punching, to reclaim unused blocks.  The default setting for
+this property is
+.Sy off .
+.Pp
+Automatic TRIM does not immediately reclaim blocks after a free. Instead,
+it will optimistically delay, allowing smaller ranges to be aggregated into
+a few larger ones.  These can then be issued more efficiently to the storage.
+.Pp
+Be aware that automatic trimming of recently freed data blocks can put
+significant stress on the underlying storage devices.  This will vary
+depending on how well the specific device handles these commands.  For
+lower end devices it is often possible to achieve most of the benefits
+of automatic trimming by running an on-demand (manual) TRIM periodically
+using the
+.Nm zpool Cm trim
+command.
 .It Sy feature@ Ns Ar feature_name Ns = Ns Sy enabled
 The value of this property is the current state of
 .Ar feature_name .
@@ -1782,15 +1811,10 @@ the path. This can be used in conjunction with the
 .Fl L
 flag.
 .It Fl r
-Print request size histograms for the leaf ZIOs. This includes
-histograms of individual ZIOs (
-.Ar ind )
-and aggregate ZIOs (
-.Ar agg ).
-These stats can be useful for seeing how well the ZFS IO aggregator is
-working. Do not confuse these request size stats with the block layer
-requests; it's possible ZIOs can be broken up before being sent to the
-block device.
+Print request size histograms for the leaf vdev's IO. This includes
+histograms of individual IOs (ind) and aggregate IOs (agg). These stats
+can be useful for observing how well IO aggregation is working.  Note
+that TRIM IOs may exceed 16M, but will be counted as 16M.
 .It Fl v
 Verbose statistics Reports usage statistics for individual vdevs within the
 pool, in addition to the pool-wide statistics.
@@ -1829,6 +1853,8 @@ Average amount of time IO spent in asynchronous priority queues.
 Does not include disk time.
 .Ar scrub :
 Average queuing time in scrub queue. Does not include disk time.
+.Ar trim :
+Average queuing time in trim queue. Does not include disk time.
 .It Fl q
 Include active queue statistics. Each priority queue has both
 pending (
@@ -1846,6 +1872,8 @@ queues.
 Current number of entries in asynchronous priority queues.
 .Ar scrubq_read :
 Current number of entries in scrub queue.
+.Ar trimq_write :
+Current number of entries in trim queue.
 .Pp
 All queue statistics are instantaneous measurements of the number of
 entries in the queues. If you specify an interval, the measurements
@@ -2151,6 +2179,48 @@ restarted from the beginning. Any drives that were scheduled for a deferred
 resilver will be added to the new one.
 .It Xo
 .Nm
+.Cm trim
+.Op Fl d
+.Op Fl c | Fl s
+.Ar pool
+.Op Ar device Ns ...
+.Xc
+Initiates an immediate on-demand TRIM operation for all of the free space in
+a pool.  This operation informs the underlying storage devices of all blocks
+in the pool which are no longer allocated and allows thinly provisioned
+devices to reclaim the space.
+.Pp
+A manual on-demand TRIM operation can be initiated irrespective of the
+.Sy autotrim
+pool property setting.  See the documentation for the
+.Sy autotrim
+property above for the types of vdev devices which can be trimmed.
+.Bl -tag -width Ds
+.It Fl d -secure
+Causes a secure TRIM to be initiated.  When performing a secure TRIM, the
+device guarantees that data stored on the trimmed blocks has been erased.
+This requires support from the device and is not supported by all SSDs.
+.It Fl r -rate Ar rate
+Controls the rate at which the TRIM operation progresses.  Without this
+option TRIM is executed as quickly as possible. The rate, expressed in bytes
+per second, is applied on a per-vdev basis and may be set differently for
+each leaf vdev.
+.It Fl c, -cancel
+Cancel trimming on the specified devices, or all eligible devices if none
+are specified.
+If one or more target devices are invalid or are not currently being
+trimmed, the command will fail and no cancellation will occur on any device.
+.It Fl s -suspend
+Suspend trimming on the specified devices, or all eligible devices if none
+are specified.
+If one or more target devices are invalid or are not currently being
+trimmed, the command will fail and no suspension will occur on any device.
+Trimming can then be resumed by running
+.Nm zpool Cm trim
+with no flags on the relevant target devices.
+.El
+.It Xo
+.Nm
 .Cm set
 .Ar property Ns = Ns Ar value
 .Ar pool
@@ -2238,7 +2308,7 @@ and automatically import it.
 .Nm
 .Cm status
 .Op Fl c Op Ar SCRIPT1 Ns Oo , Ns Ar SCRIPT2 Oc Ns ...
-.Op Fl DigLpPsvx
+.Op Fl DigLpPstvx
 .Op Fl T Sy u Ns | Ns Sy d
 .Oo Ar pool Oc Ns ...
 .Op Ar interval Op Ar count
@@ -2295,6 +2365,8 @@ didn't complete in \fBzio_slow_io_ms\fR milliseconds (default 30 seconds).
 This does not necessarily mean the IOs failed to complete, just took an
 unreasonably long amount of time.  This may indicate a problem with the
 underlying storage.
+.It Fl t
+Display vdev TRIM status.
 .It Fl T Sy u Ns | Ns Sy d
 Display a time stamp.
 Specify
index 2d577793753e8a06685d0a6f7a1c093727d4b8d6..ac1c42b3f07bc4c650bdaf44846da1ce276d93c2 100644 (file)
@@ -130,6 +130,9 @@ zpool_prop_init(void)
        zprop_register_index(ZPOOL_PROP_FAILUREMODE, "failmode",
            ZIO_FAILURE_MODE_WAIT, PROP_DEFAULT, ZFS_TYPE_POOL,
            "wait | continue | panic", "FAILMODE", failuremode_table);
+       zprop_register_index(ZPOOL_PROP_AUTOTRIM, "autotrim",
+           SPA_AUTOTRIM_OFF, PROP_DEFAULT, ZFS_TYPE_POOL,
+           "on | off", "AUTOTRIM", boolean_table);
 
        /* hidden properties */
        zprop_register_hidden(ZPOOL_PROP_NAME, "name", PROP_TYPE_STRING,
index 193bdc5105fe84b99e99ddd526002707e3442402..b2460f0d65760eb56fd7a630f1e12824fccd7677 100644 (file)
@@ -99,6 +99,7 @@ $(MODULE)-objs += vdev_raidz_math.o
 $(MODULE)-objs += vdev_raidz_math_scalar.o
 $(MODULE)-objs += vdev_removal.o
 $(MODULE)-objs += vdev_root.o
+$(MODULE)-objs += vdev_trim.o
 $(MODULE)-objs += zap.o
 $(MODULE)-objs += zap_leaf.o
 $(MODULE)-objs += zap_micro.o
index 219703231250641ba73ebc645a7456a2cec2f2d5..18328042c8f910b44707bbb178ab9a947c36ab5e 100644 (file)
@@ -842,7 +842,7 @@ dmu_free_long_range_impl(objset_t *os, dnode_t *dn, uint64_t offset,
                if (dirty_frees_threshold != 0 &&
                    long_free_dirty_all_txgs >= dirty_frees_threshold) {
                        DMU_TX_STAT_BUMP(dmu_tx_dirty_frees_delay);
-                       txg_wait_open(dp, 0);
+                       txg_wait_open(dp, 0, B_TRUE);
                        continue;
                }
 
index 8380897a9dcc50a6f01b083145510c60247c5b19..06d8383f09c93baea3762242672f7514fee3d89b 100644 (file)
@@ -181,7 +181,6 @@ int metaslab_lba_weighting_enabled = B_TRUE;
  */
 int metaslab_bias_enabled = B_TRUE;
 
-
 /*
  * Enable/disable remapping of indirect DVAs to their concrete vdevs.
  */
@@ -219,6 +218,12 @@ boolean_t metaslab_trace_enabled = B_TRUE;
 uint64_t metaslab_trace_max_entries = 5000;
 #endif
 
+/*
+ * Maximum number of metaslabs per group that can be disabled
+ * simultaneously.
+ */
+int max_disabled_ms = 3;
+
 static uint64_t metaslab_weight(metaslab_t *);
 static void metaslab_set_fragmentation(metaslab_t *);
 static void metaslab_free_impl(vdev_t *, uint64_t, uint64_t, boolean_t);
@@ -652,8 +657,8 @@ metaslab_group_create(metaslab_class_t *mc, vdev_t *vd, int allocators)
 
        mg = kmem_zalloc(sizeof (metaslab_group_t), KM_SLEEP);
        mutex_init(&mg->mg_lock, NULL, MUTEX_DEFAULT, NULL);
-       mutex_init(&mg->mg_ms_initialize_lock, NULL, MUTEX_DEFAULT, NULL);
-       cv_init(&mg->mg_ms_initialize_cv, NULL, CV_DEFAULT, NULL);
+       mutex_init(&mg->mg_ms_disabled_lock, NULL, MUTEX_DEFAULT, NULL);
+       cv_init(&mg->mg_ms_disabled_cv, NULL, CV_DEFAULT, NULL);
        mg->mg_primaries = kmem_zalloc(allocators * sizeof (metaslab_t *),
            KM_SLEEP);
        mg->mg_secondaries = kmem_zalloc(allocators * sizeof (metaslab_t *),
@@ -700,8 +705,8 @@ metaslab_group_destroy(metaslab_group_t *mg)
        kmem_free(mg->mg_secondaries, mg->mg_allocators *
            sizeof (metaslab_t *));
        mutex_destroy(&mg->mg_lock);
-       mutex_destroy(&mg->mg_ms_initialize_lock);
-       cv_destroy(&mg->mg_ms_initialize_cv);
+       mutex_destroy(&mg->mg_ms_disabled_lock);
+       cv_destroy(&mg->mg_ms_disabled_cv);
 
        for (int i = 0; i < mg->mg_allocators; i++) {
                zfs_refcount_destroy(&mg->mg_alloc_queue_depth[i]);
@@ -1846,8 +1851,10 @@ metaslab_init(metaslab_group_t *mg, uint64_t id, uint64_t object, uint64_t txg,
         */
        ms->ms_allocatable = range_tree_create_impl(&rt_avl_ops,
            &ms->ms_allocatable_by_size, metaslab_rangesize_compare, 0);
-       metaslab_group_add(mg, ms);
 
+       ms->ms_trim = range_tree_create(NULL, NULL);
+
+       metaslab_group_add(mg, ms);
        metaslab_set_fragmentation(ms);
 
        /*
@@ -1921,6 +1928,9 @@ metaslab_fini(metaslab_t *msp)
        for (int t = 0; t < TXG_SIZE; t++)
                ASSERT(!txg_list_member(&vd->vdev_ms_list, msp, t));
 
+       range_tree_vacate(msp->ms_trim, NULL, NULL);
+       range_tree_destroy(msp->ms_trim);
+
        mutex_exit(&msp->ms_lock);
        cv_destroy(&msp->ms_load_cv);
        mutex_destroy(&msp->ms_lock);
@@ -2727,6 +2737,7 @@ metaslab_sync(metaslab_t *msp, uint64_t txg)
        ASSERT3P(msp->ms_freeing, !=, NULL);
        ASSERT3P(msp->ms_freed, !=, NULL);
        ASSERT3P(msp->ms_checkpointing, !=, NULL);
+       ASSERT3P(msp->ms_trim, !=, NULL);
 
        /*
         * Normally, we don't want to process a metaslab if there are no
@@ -2999,6 +3010,24 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg)
         */
        metaslab_load_wait(msp);
 
+       /*
+        * When auto-trimming is enabled, free ranges which are added to
+        * ms_allocatable are also added to ms_trim.  The ms_trim tree is
+        * periodically consumed by the vdev_autotrim_thread() which issues
+        * trims for all ranges and then vacates the tree.  The ms_trim tree
+        * can be discarded at any time with the sole consequence of recent
+        * frees not being trimmed.
+        */
+       if (spa_get_autotrim(spa) == SPA_AUTOTRIM_ON) {
+               range_tree_walk(*defer_tree, range_tree_add, msp->ms_trim);
+               if (!defer_allowed) {
+                       range_tree_walk(msp->ms_freed, range_tree_add,
+                           msp->ms_trim);
+               }
+       } else {
+               range_tree_vacate(msp->ms_trim, NULL, NULL);
+       }
+
        /*
         * Move the frees from the defer_tree back to the free
         * range tree (if it's loaded). Swap the freed_tree and
@@ -3047,7 +3076,7 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg)
         * from it in 'metaslab_unload_delay' txgs, then unload it.
         */
        if (msp->ms_loaded &&
-           msp->ms_initializing == 0 &&
+           msp->ms_disabled == 0 &&
            msp->ms_selected_txg + metaslab_unload_delay < txg) {
 
                for (int t = 1; t < TXG_CONCURRENT_STATES; t++) {
@@ -3330,7 +3359,7 @@ metaslab_block_alloc(metaslab_t *msp, uint64_t size, uint64_t txg)
        metaslab_class_t *mc = msp->ms_group->mg_class;
 
        VERIFY(!msp->ms_condensing);
-       VERIFY0(msp->ms_initializing);
+       VERIFY0(msp->ms_disabled);
 
        start = mc->mc_ops->msop_alloc(msp, size);
        if (start != -1ULL) {
@@ -3341,6 +3370,7 @@ metaslab_block_alloc(metaslab_t *msp, uint64_t size, uint64_t txg)
                VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift));
                VERIFY3U(range_tree_space(rt) - size, <=, msp->ms_size);
                range_tree_remove(rt, start, size);
+               range_tree_clear(msp->ms_trim, start, size);
 
                if (range_tree_is_empty(msp->ms_allocating[txg & TXG_MASK]))
                        vdev_dirty(mg->mg_vd, VDD_METASLAB, msp, txg);
@@ -3391,10 +3421,10 @@ find_valid_metaslab(metaslab_group_t *mg, uint64_t activation_weight,
                }
 
                /*
-                * If the selected metaslab is condensing or being
-                * initialized, skip it.
+                * If the selected metaslab is condensing or disabled,
+                * skip it.
                 */
-               if (msp->ms_condensing || msp->ms_initializing > 0)
+               if (msp->ms_condensing || msp->ms_disabled > 0)
                        continue;
 
                *was_active = msp->ms_allocator != -1;
@@ -3566,9 +3596,9 @@ metaslab_group_alloc_normal(metaslab_group_t *mg, zio_alloc_list_t *zal,
                            ~METASLAB_ACTIVE_MASK);
                        mutex_exit(&msp->ms_lock);
                        continue;
-               } else if (msp->ms_initializing > 0) {
+               } else if (msp->ms_disabled > 0) {
                        metaslab_trace_add(zal, mg, msp, asize, d,
-                           TRACE_INITIALIZING, allocator);
+                           TRACE_DISABLED, allocator);
                        metaslab_passivate(msp, msp->ms_weight &
                            ~METASLAB_ACTIVE_MASK);
                        mutex_exit(&msp->ms_lock);
@@ -4294,6 +4324,7 @@ metaslab_claim_concrete(vdev_t *vd, uint64_t offset, uint64_t size,
        VERIFY3U(range_tree_space(msp->ms_allocatable) - size, <=,
            msp->ms_size);
        range_tree_remove(msp->ms_allocatable, offset, size);
+       range_tree_clear(msp->ms_trim, offset, size);
 
        if (spa_writeable(spa)) {       /* don't dirty if we're zdb(1M) */
                if (range_tree_is_empty(msp->ms_allocating[txg & TXG_MASK]))
@@ -4606,6 +4637,7 @@ metaslab_check_free_impl(vdev_t *vd, uint64_t offset, uint64_t size)
                    offset, size);
        }
 
+       range_tree_verify_not_present(msp->ms_trim, offset, size);
        range_tree_verify_not_present(msp->ms_freeing, offset, size);
        range_tree_verify_not_present(msp->ms_checkpointing, offset, size);
        range_tree_verify_not_present(msp->ms_freed, offset, size);
@@ -4637,6 +4669,89 @@ metaslab_check_free(spa_t *spa, const blkptr_t *bp)
        spa_config_exit(spa, SCL_VDEV, FTAG);
 }
 
+static void
+metaslab_group_disable_wait(metaslab_group_t *mg)
+{
+       ASSERT(MUTEX_HELD(&mg->mg_ms_disabled_lock));
+       while (mg->mg_disabled_updating) {
+               cv_wait(&mg->mg_ms_disabled_cv, &mg->mg_ms_disabled_lock);
+       }
+}
+
+static void
+metaslab_group_disabled_increment(metaslab_group_t *mg)
+{
+       ASSERT(MUTEX_HELD(&mg->mg_ms_disabled_lock));
+       ASSERT(mg->mg_disabled_updating);
+
+       while (mg->mg_ms_disabled >= max_disabled_ms) {
+               cv_wait(&mg->mg_ms_disabled_cv, &mg->mg_ms_disabled_lock);
+       }
+       mg->mg_ms_disabled++;
+       ASSERT3U(mg->mg_ms_disabled, <=, max_disabled_ms);
+}
+
+/*
+ * Mark the metaslab as disabled to prevent any allocations on this metaslab.
+ * We must also track how many metaslabs are currently disabled within a
+ * metaslab group and limit them to prevent allocation failures from
+ * occurring because all metaslabs are disabled.
+ */
+void
+metaslab_disable(metaslab_t *msp)
+{
+       ASSERT(!MUTEX_HELD(&msp->ms_lock));
+       metaslab_group_t *mg = msp->ms_group;
+
+       mutex_enter(&mg->mg_ms_disabled_lock);
+
+       /*
+        * To keep an accurate count of how many threads have disabled
+        * a specific metaslab group, we only allow one thread to mark
+        * the metaslab group at a time. This ensures that the value of
+        * ms_disabled will be accurate when we decide to mark a metaslab
+        * group as disabled. To do this we force all other threads
+        * to wait until the metaslab group's mg_disabled_updating flag is no
+        * longer set.
+        */
+       metaslab_group_disable_wait(mg);
+       mg->mg_disabled_updating = B_TRUE;
+       if (msp->ms_disabled == 0) {
+               metaslab_group_disabled_increment(mg);
+       }
+       mutex_enter(&msp->ms_lock);
+       msp->ms_disabled++;
+       mutex_exit(&msp->ms_lock);
+
+       mg->mg_disabled_updating = B_FALSE;
+       cv_broadcast(&mg->mg_ms_disabled_cv);
+       mutex_exit(&mg->mg_ms_disabled_lock);
+}
+
+void
+metaslab_enable(metaslab_t *msp, boolean_t sync)
+{
+       metaslab_group_t *mg = msp->ms_group;
+       spa_t *spa = mg->mg_vd->vdev_spa;
+
+       /*
+        * Wait for the outstanding IO to be synced to prevent newly
+        * allocated blocks from being overwritten.  This is used by
+        * initialize and TRIM, which are modifying unallocated space.
+        */
+       if (sync)
+               txg_wait_synced(spa_get_dsl(spa), 0);
+
+       mutex_enter(&mg->mg_ms_disabled_lock);
+       mutex_enter(&msp->ms_lock);
+       if (--msp->ms_disabled == 0) {
+               mg->mg_ms_disabled--;
+               cv_broadcast(&mg->mg_ms_disabled_cv);
+       }
+       mutex_exit(&msp->ms_lock);
+       mutex_exit(&mg->mg_ms_disabled_lock);
+}
+
 #if defined(_KERNEL)
 /* BEGIN CSTYLED */
 module_param(metaslab_aliquot, ulong, 0644);
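
The metaslab_disable()/metaslab_enable() pair added above is the fence the
TRIM and initialize threads use while operating on unallocated space; a
hedged sketch of the intended consumer pattern (not code from this commit):

        /* Sketch only: fence a metaslab while trimming its tracked free ranges. */
        metaslab_disable(msp);

        /* ... issue TRIM zios for the ranges recorded in msp->ms_trim ... */

        metaslab_enable(msp, B_TRUE);   /* B_TRUE: wait for outstanding IO to sync */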
index 71744139e7cd032fb6c4277cc477b327421a6669..5392e35478704cbdff0f40cf0d7a2af92aba317d 100644 (file)
@@ -57,6 +57,7 @@
 #include <sys/vdev_indirect_mapping.h>
 #include <sys/vdev_indirect_births.h>
 #include <sys/vdev_initialize.h>
+#include <sys/vdev_trim.h>
 #include <sys/vdev_disk.h>
 #include <sys/metaslab.h>
 #include <sys/metaslab_impl.h>
@@ -132,7 +133,7 @@ static const char *const zio_taskq_types[ZIO_TASKQ_TYPES] = {
  * number of threads assigned to their taskqs using the ZTI_N(#) or ZTI_ONE
  * macros. Other operations process a large amount of data; the ZTI_BATCH
  * macro causes us to create a taskq oriented for throughput. Some operations
- * are so high frequency and short-lived that the taskq itself can become a a
+ * are so high frequency and short-lived that the taskq itself can become a
  * point of lock contention. The ZTI_P(#, #) macro indicates that we need an
  * additional degree of parallelism specified by the number of threads per-
  * taskq and the number of taskqs; when dispatching an event in this case, the
@@ -150,6 +151,7 @@ const zio_taskq_info_t zio_taskqs[ZIO_TYPES][ZIO_TASKQ_TYPES] = {
        { ZTI_P(12, 8), ZTI_NULL,       ZTI_ONE,        ZTI_NULL }, /* FREE */
        { ZTI_ONE,      ZTI_NULL,       ZTI_ONE,        ZTI_NULL }, /* CLAIM */
        { ZTI_ONE,      ZTI_NULL,       ZTI_ONE,        ZTI_NULL }, /* IOCTL */
+       { ZTI_N(4),     ZTI_NULL,       ZTI_ONE,        ZTI_NULL }, /* TRIM */
 };
 
 static void spa_sync_version(void *arg, dmu_tx_t *tx);
@@ -554,6 +556,7 @@ spa_prop_validate(spa_t *spa, nvlist_t *props)
                case ZPOOL_PROP_AUTOREPLACE:
                case ZPOOL_PROP_LISTSNAPS:
                case ZPOOL_PROP_AUTOEXPAND:
+               case ZPOOL_PROP_AUTOTRIM:
                        error = nvpair_value_uint64(elem, &intval);
                        if (!error && intval > 1)
                                error = SET_ERROR(EINVAL);
@@ -1442,8 +1445,10 @@ spa_unload(spa_t *spa)
        spa_async_suspend(spa);
 
        if (spa->spa_root_vdev) {
-               vdev_initialize_stop_all(spa->spa_root_vdev,
-                   VDEV_INITIALIZE_ACTIVE);
+               vdev_t *root_vdev = spa->spa_root_vdev;
+               vdev_initialize_stop_all(root_vdev, VDEV_INITIALIZE_ACTIVE);
+               vdev_trim_stop_all(root_vdev, VDEV_TRIM_ACTIVE);
+               vdev_autotrim_stop_all(spa);
        }
 
        /*
@@ -3585,7 +3590,7 @@ spa_ld_get_props(spa_t *spa)
                spa_prop_find(spa, ZPOOL_PROP_MULTIHOST, &spa->spa_multihost);
                spa_prop_find(spa, ZPOOL_PROP_DEDUPDITTO,
                    &spa->spa_dedup_ditto);
-
+               spa_prop_find(spa, ZPOOL_PROP_AUTOTRIM, &spa->spa_autotrim);
                spa->spa_autoreplace = (autoreplace != 0);
        }
 
@@ -4336,6 +4341,8 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, char **ereport)
 
                spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
                vdev_initialize_restart(spa->spa_root_vdev);
+               vdev_trim_restart(spa->spa_root_vdev);
+               vdev_autotrim_restart(spa);
                spa_config_exit(spa, SCL_CONFIG, FTAG);
        }
 
@@ -5338,6 +5345,7 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props,
        spa->spa_failmode = zpool_prop_default_numeric(ZPOOL_PROP_FAILUREMODE);
        spa->spa_autoexpand = zpool_prop_default_numeric(ZPOOL_PROP_AUTOEXPAND);
        spa->spa_multihost = zpool_prop_default_numeric(ZPOOL_PROP_MULTIHOST);
+       spa->spa_autotrim = zpool_prop_default_numeric(ZPOOL_PROP_AUTOTRIM);
 
        if (props != NULL) {
                spa_configfile_set(spa, props, B_FALSE);
@@ -5746,14 +5754,16 @@ spa_export_common(char *pool, int new_state, nvlist_t **oldconfig,
 
                /*
                 * We're about to export or destroy this pool. Make sure
-                * we stop all initializtion activity here before we
-                * set the spa_final_txg. This will ensure that all
+                * we stop all initialization and trim activity here before
+                * we set the spa_final_txg. This will ensure that all
                 * dirty data resulting from the initialization is
                 * committed to disk before we unload the pool.
                 */
                if (spa->spa_root_vdev != NULL) {
-                       vdev_initialize_stop_all(spa->spa_root_vdev,
-                           VDEV_INITIALIZE_ACTIVE);
+                       vdev_t *rvd = spa->spa_root_vdev;
+                       vdev_initialize_stop_all(rvd, VDEV_INITIALIZE_ACTIVE);
+                       vdev_trim_stop_all(rvd, VDEV_TRIM_ACTIVE);
+                       vdev_autotrim_stop_all(spa);
                }
 
                /*
@@ -6376,7 +6386,6 @@ spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done)
                vdev_remove_parent(cvd);
        }
 
-
        /*
         * We don't set tvd until now because the parent we just removed
         * may have been the previous top-level vdev.
@@ -6490,7 +6499,7 @@ spa_vdev_initialize_impl(spa_t *spa, uint64_t guid, uint64_t cmd_type,
         * a previous initialization process which has completed but
         * the thread is not exited.
         */
-       if (cmd_type == POOL_INITIALIZE_DO &&
+       if (cmd_type == POOL_INITIALIZE_START &&
            (vd->vdev_initialize_thread != NULL ||
            vd->vdev_top->vdev_removing)) {
                mutex_exit(&vd->vdev_initialize_lock);
@@ -6507,7 +6516,7 @@ spa_vdev_initialize_impl(spa_t *spa, uint64_t guid, uint64_t cmd_type,
        }
 
        switch (cmd_type) {
-       case POOL_INITIALIZE_DO:
+       case POOL_INITIALIZE_START:
                vdev_initialize(vd);
                break;
        case POOL_INITIALIZE_CANCEL:
@@ -6571,6 +6580,126 @@ spa_vdev_initialize(spa_t *spa, nvlist_t *nv, uint64_t cmd_type,
        return (total_errors);
 }
 
+static int
+spa_vdev_trim_impl(spa_t *spa, uint64_t guid, uint64_t cmd_type,
+    uint64_t rate, boolean_t partial, boolean_t secure, list_t *vd_list)
+{
+       ASSERT(MUTEX_HELD(&spa_namespace_lock));
+
+       spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER);
+
+       /* Look up vdev and ensure it's a leaf. */
+       vdev_t *vd = spa_lookup_by_guid(spa, guid, B_FALSE);
+       if (vd == NULL || vd->vdev_detached) {
+               spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
+               return (SET_ERROR(ENODEV));
+       } else if (!vd->vdev_ops->vdev_op_leaf || !vdev_is_concrete(vd)) {
+               spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
+               return (SET_ERROR(EINVAL));
+       } else if (!vdev_writeable(vd)) {
+               spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
+               return (SET_ERROR(EROFS));
+       } else if (!vd->vdev_has_trim) {
+               spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
+               return (SET_ERROR(EOPNOTSUPP));
+       } else if (secure && !vd->vdev_has_securetrim) {
+               spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
+               return (SET_ERROR(EOPNOTSUPP));
+       }
+       mutex_enter(&vd->vdev_trim_lock);
+       spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
+
+       /*
+        * When we activate a TRIM action we check to see if the
+        * vdev_trim_thread is NULL. We do this instead of using the
+        * vdev_trim_state since there might be a previous TRIM process
+        * which has completed but the thread has not exited.
+        */
+       if (cmd_type == POOL_TRIM_START &&
+           (vd->vdev_trim_thread != NULL || vd->vdev_top->vdev_removing)) {
+               mutex_exit(&vd->vdev_trim_lock);
+               return (SET_ERROR(EBUSY));
+       } else if (cmd_type == POOL_TRIM_CANCEL &&
+           (vd->vdev_trim_state != VDEV_TRIM_ACTIVE &&
+           vd->vdev_trim_state != VDEV_TRIM_SUSPENDED)) {
+               mutex_exit(&vd->vdev_trim_lock);
+               return (SET_ERROR(ESRCH));
+       } else if (cmd_type == POOL_TRIM_SUSPEND &&
+           vd->vdev_trim_state != VDEV_TRIM_ACTIVE) {
+               mutex_exit(&vd->vdev_trim_lock);
+               return (SET_ERROR(ESRCH));
+       }
+
+       switch (cmd_type) {
+       case POOL_TRIM_START:
+               vdev_trim(vd, rate, partial, secure);
+               break;
+       case POOL_TRIM_CANCEL:
+               vdev_trim_stop(vd, VDEV_TRIM_CANCELED, vd_list);
+               break;
+       case POOL_TRIM_SUSPEND:
+               vdev_trim_stop(vd, VDEV_TRIM_SUSPENDED, vd_list);
+               break;
+       default:
+               panic("invalid cmd_type %llu", (unsigned long long)cmd_type);
+       }
+       mutex_exit(&vd->vdev_trim_lock);
+
+       return (0);
+}
+
+/*
+ * Initiates a manual TRIM for the requested vdevs. This kicks off individual
+ * TRIM threads for each child vdev.  These threads pass over all of the free
+ * space in the vdev's metaslabs and issue TRIM commands for that space.
+ */
+int
+spa_vdev_trim(spa_t *spa, nvlist_t *nv, uint64_t cmd_type, uint64_t rate,
+    boolean_t partial, boolean_t secure, nvlist_t *vdev_errlist)
+{
+       int total_errors = 0;
+       list_t vd_list;
+
+       list_create(&vd_list, sizeof (vdev_t),
+           offsetof(vdev_t, vdev_trim_node));
+
+       /*
+        * We hold the namespace lock through the whole function
+        * to prevent any changes to the pool while we're starting or
+        * stopping TRIM. The config and state locks are held so that
+        * we can properly assess the vdev state before we commit to
+        * the TRIM operation.
+        */
+       mutex_enter(&spa_namespace_lock);
+
+       for (nvpair_t *pair = nvlist_next_nvpair(nv, NULL);
+           pair != NULL; pair = nvlist_next_nvpair(nv, pair)) {
+               uint64_t vdev_guid = fnvpair_value_uint64(pair);
+
+               int error = spa_vdev_trim_impl(spa, vdev_guid, cmd_type,
+                   rate, partial, secure, &vd_list);
+               if (error != 0) {
+                       char guid_as_str[MAXNAMELEN];
+
+                       (void) snprintf(guid_as_str, sizeof (guid_as_str),
+                           "%llu", (unsigned long long)vdev_guid);
+                       fnvlist_add_int64(vdev_errlist, guid_as_str, error);
+                       total_errors++;
+               }
+       }
+
+       /* Wait for all TRIM threads to stop. */
+       vdev_trim_stop_wait(spa, &vd_list);
+
+       /* Sync out the TRIM state */
+       txg_wait_synced(spa->spa_dsl_pool, 0);
+       mutex_exit(&spa_namespace_lock);
+
+       list_destroy(&vd_list);
+
+       return (total_errors);
+}
+
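As a rough illustration of how this entry point is driven, the sketch below
shows a hypothetical caller starting a manual TRIM on a single vdev by GUID.
spa_vdev_trim() only reads the uint64 values from the nvlist, so the pair name
is arbitrary; the helper name is not part of ZFS.

static int
example_start_manual_trim(spa_t *spa, uint64_t vdev_guid, uint64_t rate)
{
        nvlist_t *vds = fnvlist_alloc();
        nvlist_t *vdev_errlist = fnvlist_alloc();

        fnvlist_add_uint64(vds, "vdev", vdev_guid);

        /* Plain (non-secure) TRIM; partial trim not requested. */
        int errors = spa_vdev_trim(spa, vds, POOL_TRIM_START, rate,
            B_FALSE, B_FALSE, vdev_errlist);

        fnvlist_free(vds);
        fnvlist_free(vdev_errlist);

        /* Number of vdevs that failed; per-vdev errnos were in vdev_errlist. */
        return (errors);
}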
 /*
  * Split a set of devices from their mirrors, and create a new pool from them.
  */
@@ -6780,24 +6909,36 @@ spa_vdev_split_mirror(spa_t *spa, char *newname, nvlist_t *config,
        spa_async_suspend(newspa);
 
        /*
-        * Temporarily stop the initializing activity. We set the state to
-        * ACTIVE so that we know to resume the initializing once the split
-        * has completed.
+        * Temporarily stop the initializing and TRIM activity.  We set the
+        * state to ACTIVE so that we know to resume initializing or TRIM
+        * once the split has completed.
         */
-       list_t vd_list;
-       list_create(&vd_list, sizeof (vdev_t),
+       list_t vd_initialize_list;
+       list_create(&vd_initialize_list, sizeof (vdev_t),
            offsetof(vdev_t, vdev_initialize_node));
 
+       list_t vd_trim_list;
+       list_create(&vd_trim_list, sizeof (vdev_t),
+           offsetof(vdev_t, vdev_trim_node));
+
        for (c = 0; c < children; c++) {
                if (vml[c] != NULL) {
                        mutex_enter(&vml[c]->vdev_initialize_lock);
-                       vdev_initialize_stop(vml[c], VDEV_INITIALIZE_ACTIVE,
-                           &vd_list);
+                       vdev_initialize_stop(vml[c],
+                           VDEV_INITIALIZE_ACTIVE, &vd_initialize_list);
                        mutex_exit(&vml[c]->vdev_initialize_lock);
+
+                       mutex_enter(&vml[c]->vdev_trim_lock);
+                       vdev_trim_stop(vml[c], VDEV_TRIM_ACTIVE, &vd_trim_list);
+                       mutex_exit(&vml[c]->vdev_trim_lock);
                }
        }
-       vdev_initialize_stop_wait(spa, &vd_list);
-       list_destroy(&vd_list);
+
+       vdev_initialize_stop_wait(spa, &vd_initialize_list);
+       vdev_trim_stop_wait(spa, &vd_trim_list);
+
+       list_destroy(&vd_initialize_list);
+       list_destroy(&vd_trim_list);
 
        newspa->spa_config_source = SPA_CONFIG_SRC_SPLIT;
 
@@ -6899,8 +7040,10 @@ out:
                        vml[c]->vdev_offline = B_FALSE;
        }
 
-       /* restart initializing disks as necessary */
+       /* restart initializing or trimming disks as necessary */
        spa_async_request(spa, SPA_ASYNC_INITIALIZE_RESTART);
+       spa_async_request(spa, SPA_ASYNC_TRIM_RESTART);
+       spa_async_request(spa, SPA_ASYNC_AUTOTRIM_RESTART);
 
        vdev_reopen(spa->spa_root_vdev);
 
@@ -7283,6 +7426,22 @@ spa_async_thread(void *arg)
                mutex_exit(&spa_namespace_lock);
        }
 
+       if (tasks & SPA_ASYNC_TRIM_RESTART) {
+               mutex_enter(&spa_namespace_lock);
+               spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
+               vdev_trim_restart(spa->spa_root_vdev);
+               spa_config_exit(spa, SCL_CONFIG, FTAG);
+               mutex_exit(&spa_namespace_lock);
+       }
+
+       if (tasks & SPA_ASYNC_AUTOTRIM_RESTART) {
+               mutex_enter(&spa_namespace_lock);
+               spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
+               vdev_autotrim_restart(spa);
+               spa_config_exit(spa, SCL_CONFIG, FTAG);
+               mutex_exit(&spa_namespace_lock);
+       }
+
        /*
         * Let the world know that we're done.
         */
@@ -7782,6 +7941,11 @@ spa_sync_props(void *arg, dmu_tx_t *tx)
                        case ZPOOL_PROP_FAILUREMODE:
                                spa->spa_failmode = intval;
                                break;
+                       case ZPOOL_PROP_AUTOTRIM:
+                               spa->spa_autotrim = intval;
+                               spa_async_request(spa,
+                                   SPA_ASYNC_AUTOTRIM_RESTART);
+                               break;
                        case ZPOOL_PROP_AUTOEXPAND:
                                spa->spa_autoexpand = intval;
                                if (tx->tx_txg != TXG_INITIAL)
index 71221b21bcaeafe1579993a6be2a1728449d8333..b3a4a7b124665d2019f7436a027fee84d87cdb75 100644 (file)
@@ -39,6 +39,7 @@
 #include <sys/zil.h>
 #include <sys/vdev_impl.h>
 #include <sys/vdev_initialize.h>
+#include <sys/vdev_trim.h>
 #include <sys/vdev_file.h>
 #include <sys/vdev_raidz.h>
 #include <sys/metaslab.h>
@@ -1128,6 +1129,9 @@ spa_vdev_enter(spa_t *spa)
 {
        mutex_enter(&spa->spa_vdev_top_lock);
        mutex_enter(&spa_namespace_lock);
+
+       vdev_autotrim_stop_all(spa);
+
        return (spa_vdev_config_enter(spa));
 }
 
@@ -1204,8 +1208,17 @@ spa_vdev_config_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error, char *tag)
                        vdev_initialize_stop(vd, VDEV_INITIALIZE_CANCELED,
                            NULL);
                        mutex_exit(&vd->vdev_initialize_lock);
+
+                       mutex_enter(&vd->vdev_trim_lock);
+                       vdev_trim_stop(vd, VDEV_TRIM_CANCELED, NULL);
+                       mutex_exit(&vd->vdev_trim_lock);
                }
 
+               /*
+                * The vdev may be both a leaf and top-level device.
+                */
+               vdev_autotrim_stop_wait(vd);
+
                spa_config_enter(spa, SCL_ALL, spa, RW_WRITER);
                vdev_free(vd);
                spa_config_exit(spa, SCL_ALL, spa);
@@ -1227,6 +1240,8 @@ spa_vdev_config_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error, char *tag)
 int
 spa_vdev_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error)
 {
+       vdev_autotrim_restart(spa);
+
        spa_vdev_config_exit(spa, vd, txg, error, FTAG);
        mutex_exit(&spa_namespace_lock);
        mutex_exit(&spa->spa_vdev_top_lock);
@@ -1923,6 +1938,12 @@ spa_deadman_synctime(spa_t *spa)
        return (spa->spa_deadman_synctime);
 }
 
+spa_autotrim_t
+spa_get_autotrim(spa_t *spa)
+{
+       return (spa->spa_autotrim);
+}
+
 uint64_t
 spa_deadman_ziotime(spa_t *spa)
 {
index e01d2d19879c2ead5c04197db048a1b21196642e..3b51250c6832ae3b7b145aaaf0a85caedca5c725 100644 (file)
@@ -887,6 +887,105 @@ spa_health_destroy(spa_t *spa)
        mutex_destroy(&shk->lock);
 }
 
+static spa_iostats_t spa_iostats_template = {
+       { "trim_extents_written",               KSTAT_DATA_UINT64 },
+       { "trim_bytes_written",                 KSTAT_DATA_UINT64 },
+       { "trim_extents_skipped",               KSTAT_DATA_UINT64 },
+       { "trim_bytes_skipped",                 KSTAT_DATA_UINT64 },
+       { "trim_extents_failed",                KSTAT_DATA_UINT64 },
+       { "trim_bytes_failed",                  KSTAT_DATA_UINT64 },
+       { "autotrim_extents_written",           KSTAT_DATA_UINT64 },
+       { "autotrim_bytes_written",             KSTAT_DATA_UINT64 },
+       { "autotrim_extents_skipped",           KSTAT_DATA_UINT64 },
+       { "autotrim_bytes_skipped",             KSTAT_DATA_UINT64 },
+       { "autotrim_extents_failed",            KSTAT_DATA_UINT64 },
+       { "autotrim_bytes_failed",              KSTAT_DATA_UINT64 },
+};
+
+#define        SPA_IOSTATS_ADD(stat, val) \
+    atomic_add_64(&iostats->stat.value.ui64, (val));
+
+void
+spa_iostats_trim_add(spa_t *spa, trim_type_t type,
+    uint64_t extents_written, uint64_t bytes_written,
+    uint64_t extents_skipped, uint64_t bytes_skipped,
+    uint64_t extents_failed, uint64_t bytes_failed)
+{
+       spa_history_kstat_t *shk = &spa->spa_stats.iostats;
+       kstat_t *ksp = shk->kstat;
+       spa_iostats_t *iostats;
+
+       if (ksp == NULL)
+               return;
+
+       iostats = ksp->ks_data;
+       if (type == TRIM_TYPE_MANUAL) {
+               SPA_IOSTATS_ADD(trim_extents_written, extents_written);
+               SPA_IOSTATS_ADD(trim_bytes_written, bytes_written);
+               SPA_IOSTATS_ADD(trim_extents_skipped, extents_skipped);
+               SPA_IOSTATS_ADD(trim_bytes_skipped, bytes_skipped);
+               SPA_IOSTATS_ADD(trim_extents_failed, extents_failed);
+               SPA_IOSTATS_ADD(trim_bytes_failed, bytes_failed);
+       } else {
+               SPA_IOSTATS_ADD(autotrim_extents_written, extents_written);
+               SPA_IOSTATS_ADD(autotrim_bytes_written, bytes_written);
+               SPA_IOSTATS_ADD(autotrim_extents_skipped, extents_skipped);
+               SPA_IOSTATS_ADD(autotrim_bytes_skipped, bytes_skipped);
+               SPA_IOSTATS_ADD(autotrim_extents_failed, extents_failed);
+               SPA_IOSTATS_ADD(autotrim_bytes_failed, bytes_failed);
+       }
+}
+
+int
+spa_iostats_update(kstat_t *ksp, int rw)
+{
+       if (rw == KSTAT_WRITE) {
+               memcpy(ksp->ks_data, &spa_iostats_template,
+                   sizeof (spa_iostats_t));
+       }
+
+       return (0);
+}
+
+static void
+spa_iostats_init(spa_t *spa)
+{
+       spa_history_kstat_t *shk = &spa->spa_stats.iostats;
+
+       mutex_init(&shk->lock, NULL, MUTEX_DEFAULT, NULL);
+
+       char *name = kmem_asprintf("zfs/%s", spa_name(spa));
+       kstat_t *ksp = kstat_create(name, 0, "iostats", "misc",
+           KSTAT_TYPE_NAMED, sizeof (spa_iostats_t) / sizeof (kstat_named_t),
+           KSTAT_FLAG_VIRTUAL);
+
+       shk->kstat = ksp;
+       if (ksp) {
+               int size = sizeof (spa_iostats_t);
+               ksp->ks_lock = &shk->lock;
+               ksp->ks_private = spa;
+               ksp->ks_update = spa_iostats_update;
+               ksp->ks_data = kmem_alloc(size, KM_SLEEP);
+               memcpy(ksp->ks_data, &spa_iostats_template, size);
+               kstat_install(ksp);
+       }
+
+       strfree(name);
+}
+
+static void
+spa_iostats_destroy(spa_t *spa)
+{
+       spa_history_kstat_t *shk = &spa->spa_stats.iostats;
+       kstat_t *ksp = shk->kstat;
+       if (ksp) {
+               kmem_free(ksp->ks_data, sizeof (spa_iostats_t));
+               kstat_delete(ksp);
+       }
+
+       mutex_destroy(&shk->lock);
+}
+
 void
 spa_stats_init(spa_t *spa)
 {
@@ -896,11 +995,13 @@ spa_stats_init(spa_t *spa)
        spa_io_history_init(spa);
        spa_mmp_history_init(spa);
        spa_state_init(spa);
+       spa_iostats_init(spa);
 }
 
 void
 spa_stats_destroy(spa_t *spa)
 {
+       spa_iostats_destroy(spa);
        spa_health_destroy(spa);
        spa_tx_assign_destroy(spa);
        spa_txg_history_destroy(spa);
index db0f60cd15a344a0f2526918abab2126c35ce3f4..b3f895302e636297c684deada04d41ac8e6119fe 100644 (file)
@@ -694,8 +694,12 @@ txg_wait_synced(dsl_pool_t *dp, uint64_t txg)
        mutex_exit(&tx->tx_sync_lock);
 }
 
+/*
+ * Wait for the specified open transaction group.  Set should_quiesce
+ * when the current open txg should be quiesced immediately.
+ */
 void
-txg_wait_open(dsl_pool_t *dp, uint64_t txg)
+txg_wait_open(dsl_pool_t *dp, uint64_t txg, boolean_t should_quiesce)
 {
        tx_state_t *tx = &dp->dp_tx;
 
@@ -705,7 +709,7 @@ txg_wait_open(dsl_pool_t *dp, uint64_t txg)
        ASSERT3U(tx->tx_threads, ==, 2);
        if (txg == 0)
                txg = tx->tx_open_txg + 1;
-       if (tx->tx_quiesce_txg_waiting < txg)
+       if (tx->tx_quiesce_txg_waiting < txg && should_quiesce)
                tx->tx_quiesce_txg_waiting = txg;
        dprintf("txg=%llu quiesce_txg=%llu sync_txg=%llu\n",
            txg, tx->tx_quiesce_txg_waiting, tx->tx_sync_txg_waiting);
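A small sketch of the new third argument (helper name hypothetical): passing
B_TRUE preserves the historical behavior, while B_FALSE waits for a txg to
open without forcing the currently open txg to quiesce.

static void
example_wait_for_open_txg(dsl_pool_t *dp)
{
        /* Equivalent to the old two-argument txg_wait_open(dp, 0). */
        txg_wait_open(dp, 0, B_TRUE);

        /* Wait only; do not push the currently open txg to quiesce. */
        txg_wait_open(dp, 0, B_FALSE);
}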
index ae1c2bcecb46caaea839079c84e63e2c69465db4..085ae687315041ec5a3291a487cce882a6afc68e 100644 (file)
@@ -51,6 +51,7 @@
 #include <sys/dsl_scan.h>
 #include <sys/abd.h>
 #include <sys/vdev_initialize.h>
+#include <sys/vdev_trim.h>
 #include <sys/zvol.h>
 #include <sys/zfs_ratelimit.h>
 
@@ -543,6 +544,7 @@ vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops)
        list_link_init(&vd->vdev_state_dirty_node);
        list_link_init(&vd->vdev_initialize_node);
        list_link_init(&vd->vdev_leaf_node);
+       list_link_init(&vd->vdev_trim_node);
        mutex_init(&vd->vdev_dtl_lock, NULL, MUTEX_NOLOCKDEP, NULL);
        mutex_init(&vd->vdev_stat_lock, NULL, MUTEX_DEFAULT, NULL);
        mutex_init(&vd->vdev_probe_lock, NULL, MUTEX_DEFAULT, NULL);
@@ -551,6 +553,12 @@ vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops)
        mutex_init(&vd->vdev_initialize_io_lock, NULL, MUTEX_DEFAULT, NULL);
        cv_init(&vd->vdev_initialize_cv, NULL, CV_DEFAULT, NULL);
        cv_init(&vd->vdev_initialize_io_cv, NULL, CV_DEFAULT, NULL);
+       mutex_init(&vd->vdev_trim_lock, NULL, MUTEX_DEFAULT, NULL);
+       mutex_init(&vd->vdev_autotrim_lock, NULL, MUTEX_DEFAULT, NULL);
+       mutex_init(&vd->vdev_trim_io_lock, NULL, MUTEX_DEFAULT, NULL);
+       cv_init(&vd->vdev_trim_cv, NULL, CV_DEFAULT, NULL);
+       cv_init(&vd->vdev_autotrim_cv, NULL, CV_DEFAULT, NULL);
+       cv_init(&vd->vdev_trim_io_cv, NULL, CV_DEFAULT, NULL);
 
        for (int t = 0; t < DTL_TYPES; t++) {
                vd->vdev_dtl[t] = range_tree_create(NULL, NULL);
@@ -875,7 +883,10 @@ void
 vdev_free(vdev_t *vd)
 {
        spa_t *spa = vd->vdev_spa;
+
        ASSERT3P(vd->vdev_initialize_thread, ==, NULL);
+       ASSERT3P(vd->vdev_trim_thread, ==, NULL);
+       ASSERT3P(vd->vdev_autotrim_thread, ==, NULL);
 
        /*
         * Scan queues are normally destroyed at the end of a scan. If the
@@ -906,7 +917,6 @@ vdev_free(vdev_t *vd)
 
        ASSERT(vd->vdev_child == NULL);
        ASSERT(vd->vdev_guid_sum == vd->vdev_guid);
-       ASSERT(vd->vdev_initialize_thread == NULL);
 
        /*
         * Discard allocation state.
@@ -988,6 +998,12 @@ vdev_free(vdev_t *vd)
        mutex_destroy(&vd->vdev_initialize_io_lock);
        cv_destroy(&vd->vdev_initialize_io_cv);
        cv_destroy(&vd->vdev_initialize_cv);
+       mutex_destroy(&vd->vdev_trim_lock);
+       mutex_destroy(&vd->vdev_autotrim_lock);
+       mutex_destroy(&vd->vdev_trim_io_lock);
+       cv_destroy(&vd->vdev_trim_cv);
+       cv_destroy(&vd->vdev_autotrim_cv);
+       cv_destroy(&vd->vdev_trim_io_cv);
 
        zfs_ratelimit_fini(&vd->vdev_delay_rl);
        zfs_ratelimit_fini(&vd->vdev_checksum_rl);
@@ -3475,6 +3491,16 @@ vdev_online(spa_t *spa, uint64_t guid, uint64_t flags, vdev_state_t *newstate)
        }
        mutex_exit(&vd->vdev_initialize_lock);
 
+       /* Restart trimming if necessary */
+       mutex_enter(&vd->vdev_trim_lock);
+       if (vdev_writeable(vd) &&
+           vd->vdev_trim_thread == NULL &&
+           vd->vdev_trim_state == VDEV_TRIM_ACTIVE) {
+               (void) vdev_trim(vd, vd->vdev_trim_rate, vd->vdev_trim_partial,
+                   vd->vdev_trim_secure);
+       }
+       mutex_exit(&vd->vdev_trim_lock);
+
        if (wasoffline ||
            (oldstate < VDEV_STATE_DEGRADED &&
            vd->vdev_state >= VDEV_STATE_DEGRADED))
@@ -3745,8 +3771,7 @@ vdev_accessible(vdev_t *vd, zio_t *zio)
 static void
 vdev_get_child_stat(vdev_t *cvd, vdev_stat_t *vs, vdev_stat_t *cvs)
 {
-       int t;
-       for (t = 0; t < ZIO_TYPES; t++) {
+       for (int t = 0; t < VS_ZIO_TYPES; t++) {
                vs->vs_ops[t] += cvs->vs_ops[t];
                vs->vs_bytes[t] += cvs->vs_bytes[t];
        }
@@ -3873,7 +3898,7 @@ vdev_get_stats_ex(vdev_t *vd, vdev_stat_t *vs, vdev_stat_ex_t *vsx)
                        vs->vs_rsize += VDEV_LABEL_START_SIZE +
                            VDEV_LABEL_END_SIZE;
                        /*
-                        * Report intializing progress. Since we don't
+                        * Report initializing progress. Since we don't
                         * have the initializing locks held, this is only
                         * an estimate (although a fairly accurate one).
                         */
@@ -3884,9 +3909,20 @@ vdev_get_stats_ex(vdev_t *vd, vdev_stat_t *vs, vdev_stat_ex_t *vsx)
                        vs->vs_initialize_state = vd->vdev_initialize_state;
                        vs->vs_initialize_action_time =
                            vd->vdev_initialize_action_time;
+
+                       /*
+                        * Report manual TRIM progress. Since we don't have
+                        * the manual TRIM locks held, this is only an
+                        * estimate (although a fairly accurate one).
+                        */
+                       vs->vs_trim_notsup = !vd->vdev_has_trim;
+                       vs->vs_trim_bytes_done = vd->vdev_trim_bytes_done;
+                       vs->vs_trim_bytes_est = vd->vdev_trim_bytes_est;
+                       vs->vs_trim_state = vd->vdev_trim_state;
+                       vs->vs_trim_action_time = vd->vdev_trim_action_time;
                }
                /*
-                * Report expandable space on top-level, non-auxillary devices
+                * Report expandable space on top-level, non-auxiliary devices
                 * only. The expandable space is reported in terms of metaslab
                 * sized units since that determines how much space the pool
                 * can expand.
@@ -4004,9 +4040,18 @@ vdev_stat_update(zio_t *zio, uint64_t psize)
                 */
                if (vd->vdev_ops->vdev_op_leaf &&
                    (zio->io_priority < ZIO_PRIORITY_NUM_QUEUEABLE)) {
+                       zio_type_t vs_type = type;
+
+                       /*
+                        * TRIM ops and bytes are reported to user space as
+                        * ZIO_TYPE_IOCTL.  This is done to preserve the
+                        * vdev_stat_t structure layout for user space.
+                        */
+                       if (type == ZIO_TYPE_TRIM)
+                               vs_type = ZIO_TYPE_IOCTL;
 
-                       vs->vs_ops[type]++;
-                       vs->vs_bytes[type] += psize;
+                       vs->vs_ops[vs_type]++;
+                       vs->vs_bytes[vs_type] += psize;
 
                        if (flags & ZIO_FLAG_DELEGATED) {
                                vsx->vsx_agg_histo[zio->io_priority]
@@ -4104,7 +4149,8 @@ vdev_deflated_space(vdev_t *vd, int64_t space)
 }
 
 /*
- * Update the in-core space usage stats for this vdev and the root vdev.
+ * Update the in-core space usage stats for this vdev, its metaslab class,
+ * and the root vdev.
  */
 void
 vdev_space_update(vdev_t *vd, int64_t alloc_delta, int64_t defer_delta,
@@ -4650,12 +4696,56 @@ vdev_set_deferred_resilver(spa_t *spa, vdev_t *vd)
        spa->spa_resilver_deferred = B_TRUE;
 }
 
+/*
+ * Translate a logical range to the physical range for the specified vdev_t.
+ * This function is initially called with a leaf vdev and will walk each
+ * parent vdev until it reaches a top-level vdev. Once the top-level is
+ * reached the physical range is initialized and the recursive function
+ * begins to unwind. As it unwinds it calls the parent's vdev specific
+ * translation function to do the real conversion.
+ */
+void
+vdev_xlate(vdev_t *vd, const range_seg_t *logical_rs, range_seg_t *physical_rs)
+{
+       /*
+        * Walk up the vdev tree
+        */
+       if (vd != vd->vdev_top) {
+               vdev_xlate(vd->vdev_parent, logical_rs, physical_rs);
+       } else {
+               /*
+                * We've reached the top-level vdev, initialize the
+                * physical range to the logical range and start to
+                * unwind.
+                */
+               physical_rs->rs_start = logical_rs->rs_start;
+               physical_rs->rs_end = logical_rs->rs_end;
+               return;
+       }
+
+       vdev_t *pvd = vd->vdev_parent;
+       ASSERT3P(pvd, !=, NULL);
+       ASSERT3P(pvd->vdev_ops->vdev_op_xlate, !=, NULL);
+
+       /*
+        * As this recursive function unwinds, translate the logical
+        * range into its physical components by calling the
+        * vdev specific translate function.
+        */
+       range_seg_t intermediate = { { { 0, 0 } } };
+       pvd->vdev_ops->vdev_op_xlate(vd, physical_rs, &intermediate);
+
+       physical_rs->rs_start = intermediate.rs_start;
+       physical_rs->rs_end = intermediate.rs_end;
+}
+
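A brief sketch of how a caller uses the relocated function (the helper name
and the discarded results are placeholders): a logical extent on a leaf vdev
is translated into the physical offset and length used when issuing
initialize or TRIM I/O.

static void
example_xlate_extent(vdev_t *leaf_vd, uint64_t start, uint64_t size)
{
        range_seg_t logical_rs, physical_rs;

        logical_rs.rs_start = start;
        logical_rs.rs_end = start + size;

        /* Fills in physical_rs by recursing up to the top-level vdev. */
        vdev_xlate(leaf_vd, &logical_rs, &physical_rs);

        uint64_t phys_offset = physical_rs.rs_start;
        uint64_t phys_size = physical_rs.rs_end - physical_rs.rs_start;
        (void) phys_offset;
        (void) phys_size;
}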
 #if defined(_KERNEL)
 EXPORT_SYMBOL(vdev_fault);
 EXPORT_SYMBOL(vdev_degrade);
 EXPORT_SYMBOL(vdev_online);
 EXPORT_SYMBOL(vdev_offline);
 EXPORT_SYMBOL(vdev_clear);
+
 /* BEGIN CSTYLED */
 module_param(zfs_vdev_default_ms_count, int, 0644);
 MODULE_PARM_DESC(zfs_vdev_default_ms_count,
index 4ac08c86148e2d6027eb85ca531442e5a4939359..c2312e6fa3bf8ec96bea7554b43d6d02aacad7dd 100644 (file)
@@ -30,6 +30,7 @@
 #include <sys/spa_impl.h>
 #include <sys/vdev_disk.h>
 #include <sys/vdev_impl.h>
+#include <sys/vdev_trim.h>
 #include <sys/abd.h>
 #include <sys/fs/zfs.h>
 #include <sys/zio.h>
@@ -223,7 +224,7 @@ vdev_elevator_switch(vdev_t *v, char *elevator)
        strfree(argv[2]);
 #endif /* HAVE_ELEVATOR_CHANGE */
        if (error) {
-               zfs_dbgmsg("Unable to set \"%s\" scheduler for %s (%s): %d\n",
+               zfs_dbgmsg("Unable to set \"%s\" scheduler for %s (%s): %d",
                    elevator, v->vdev_path, device, error);
        }
 }
@@ -322,7 +323,7 @@ vdev_disk_open(vdev_t *v, uint64_t *psize, uint64_t *max_psize,
 
        if (IS_ERR(bdev)) {
                int error = -PTR_ERR(bdev);
-               vdev_dbgmsg(v, "open error=%d count=%d\n", error, count);
+               vdev_dbgmsg(v, "open error=%d count=%d", error, count);
                vd->vd_bdev = NULL;
                v->vdev_tsd = vd;
                rw_exit(&vd->vd_lock);
@@ -333,14 +334,22 @@ vdev_disk_open(vdev_t *v, uint64_t *psize, uint64_t *max_psize,
                rw_exit(&vd->vd_lock);
        }
 
+       struct request_queue *q = bdev_get_queue(vd->vd_bdev);
+
        /*  Determine the physical block size */
        block_size = vdev_bdev_block_size(vd->vd_bdev);
 
        /* Clear the nowritecache bit, causes vdev_reopen() to try again. */
        v->vdev_nowritecache = B_FALSE;
 
+       /* Set when device reports it supports TRIM. */
+       v->vdev_has_trim = !!blk_queue_discard(q);
+
+       /* Set when device reports it supports secure TRIM. */
+       v->vdev_has_securetrim = !!blk_queue_discard_secure(q);
+
        /* Inform the ZIO pipeline that we are non-rotational */
-       v->vdev_nonrot = blk_queue_nonrot(bdev_get_queue(vd->vd_bdev));
+       v->vdev_nonrot = blk_queue_nonrot(q);
 
        /* Physical volume size in bytes for the partition */
        *psize = bdev_capacity(vd->vd_bdev);
@@ -728,6 +737,7 @@ vdev_disk_io_start(zio_t *zio)
 {
        vdev_t *v = zio->io_vd;
        vdev_disk_t *vd = v->vdev_tsd;
+       unsigned long trim_flags = 0;
        int rw, flags, error;
 
        /*
@@ -813,6 +823,19 @@ vdev_disk_io_start(zio_t *zio)
 #endif
                break;
 
+       case ZIO_TYPE_TRIM:
+#if defined(BLKDEV_DISCARD_SECURE)
+               if (zio->io_trim_flags & ZIO_TRIM_SECURE)
+                       trim_flags |= BLKDEV_DISCARD_SECURE;
+#endif
+               zio->io_error = -blkdev_issue_discard(vd->vd_bdev,
+                   zio->io_offset >> 9, zio->io_size >> 9, GFP_NOFS,
+                   trim_flags);
+
+               rw_exit(&vd->vd_lock);
+               zio_interrupt(zio);
+               return;
+
        default:
                rw_exit(&vd->vd_lock);
                zio->io_error = SET_ERROR(ENOTSUP);
index 3551898e0781430f20d8c9ffd43d7069c6dfe163..c04f40ca452ce5fe9cfb24b43be635729006eb99 100644 (file)
 #include <sys/spa_impl.h>
 #include <sys/vdev_file.h>
 #include <sys/vdev_impl.h>
+#include <sys/vdev_trim.h>
 #include <sys/zio.h>
 #include <sys/fs/zfs.h>
 #include <sys/fm/fs/zfs.h>
 #include <sys/abd.h>
+#include <sys/fcntl.h>
+#include <sys/vnode.h>
 
 /*
  * Virtual device vector for files.
@@ -60,9 +63,24 @@ vdev_file_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize,
        vattr_t vattr;
        int error;
 
-       /* Rotational optimizations only make sense on block devices */
+       /*
+        * Rotational optimizations only make sense on block devices.
+        */
        vd->vdev_nonrot = B_TRUE;
 
+       /*
+        * Allow TRIM on file based vdevs.  This may not always be supported,
+        * since it depends on your kernel version and underlying filesystem
+        * type, but it is always safe to attempt.
+        */
+       vd->vdev_has_trim = B_TRUE;
+
+       /*
+        * Disable secure TRIM on file based vdevs.  There is no way to
+        * request this behavior from the underlying filesystem.
+        */
+       vd->vdev_has_securetrim = B_FALSE;
+
        /*
         * We must have a pathname, and it must be absolute.
         */
@@ -227,6 +245,21 @@ vdev_file_io_start(zio_t *zio)
                        zio->io_error = SET_ERROR(ENOTSUP);
                }
 
+               zio_execute(zio);
+               return;
+       } else if (zio->io_type == ZIO_TYPE_TRIM) {
+               struct flock flck;
+
+               ASSERT3U(zio->io_size, !=, 0);
+               bzero(&flck, sizeof (flck));
+               flck.l_type = F_FREESP;
+               flck.l_start = zio->io_offset;
+               flck.l_len = zio->io_size;
+               flck.l_whence = 0;
+
+               zio->io_error = VOP_SPACE(vf->vf_vnode, F_FREESP, &flck,
+                   0, 0, kcred, NULL);
+
                zio_execute(zio);
                return;
        }
index bca2db7a4838d8bcc20dde5ca250e8656eff768f..b1590132636bc72ca1ab2458eaee146b1bb53b42 100644 (file)
 #include <sys/zap.h>
 #include <sys/dmu_tx.h>
 
-/*
- * Maximum number of metaslabs per group that can be initialized
- * simultaneously.
- */
-int max_initialize_ms = 3;
-
 /*
  * Value that is written to disk during initialization.
  */
@@ -132,7 +126,7 @@ vdev_initialize_change_state(vdev_t *vd, vdev_initializing_state_t new_state)
        dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
        VERIFY0(dmu_tx_assign(tx, TXG_WAIT));
        dsl_sync_task_nowait(spa_get_dsl(spa), vdev_initialize_zap_update_sync,
-           guid, 2, ZFS_SPACE_CHECK_RESERVED, tx);
+           guid, 2, ZFS_SPACE_CHECK_NONE, tx);
 
        switch (new_state) {
        case VDEV_INITIALIZE_ACTIVE:
@@ -250,49 +244,6 @@ vdev_initialize_write(vdev_t *vd, uint64_t start, uint64_t size, abd_t *data)
        return (0);
 }
 
-/*
- * Translate a logical range to the physical range for the specified vdev_t.
- * This function is initially called with a leaf vdev and will walk each
- * parent vdev until it reaches a top-level vdev. Once the top-level is
- * reached the physical range is initialized and the recursive function
- * begins to unwind. As it unwinds it calls the parent's vdev specific
- * translation function to do the real conversion.
- */
-void
-vdev_xlate(vdev_t *vd, const range_seg_t *logical_rs, range_seg_t *physical_rs)
-{
-       /*
-        * Walk up the vdev tree
-        */
-       if (vd != vd->vdev_top) {
-               vdev_xlate(vd->vdev_parent, logical_rs, physical_rs);
-       } else {
-               /*
-                * We've reached the top-level vdev, initialize the
-                * physical range to the logical range and start to
-                * unwind.
-                */
-               physical_rs->rs_start = logical_rs->rs_start;
-               physical_rs->rs_end = logical_rs->rs_end;
-               return;
-       }
-
-       vdev_t *pvd = vd->vdev_parent;
-       ASSERT3P(pvd, !=, NULL);
-       ASSERT3P(pvd->vdev_ops->vdev_op_xlate, !=, NULL);
-
-       /*
-        * As this recursive function unwinds, translate the logical
-        * range into its physical components by calling the
-        * vdev specific translate function.
-        */
-       range_seg_t intermediate = { { { 0, 0 } } };
-       pvd->vdev_ops->vdev_op_xlate(vd, physical_rs, &intermediate);
-
-       physical_rs->rs_start = intermediate.rs_start;
-       physical_rs->rs_end = intermediate.rs_end;
-}
-
 /*
  * Callback to fill each ABD chunk with zfs_initialize_value. len must be
  * divisible by sizeof (uint64_t), and buf must be 8-byte aligned. The ABD
@@ -362,81 +313,6 @@ vdev_initialize_ranges(vdev_t *vd, abd_t *data)
        return (0);
 }
 
-static void
-vdev_initialize_mg_wait(metaslab_group_t *mg)
-{
-       ASSERT(MUTEX_HELD(&mg->mg_ms_initialize_lock));
-       while (mg->mg_initialize_updating) {
-               cv_wait(&mg->mg_ms_initialize_cv, &mg->mg_ms_initialize_lock);
-       }
-}
-
-static void
-vdev_initialize_mg_mark(metaslab_group_t *mg)
-{
-       ASSERT(MUTEX_HELD(&mg->mg_ms_initialize_lock));
-       ASSERT(mg->mg_initialize_updating);
-
-       while (mg->mg_ms_initializing >= max_initialize_ms) {
-               cv_wait(&mg->mg_ms_initialize_cv, &mg->mg_ms_initialize_lock);
-       }
-       mg->mg_ms_initializing++;
-       ASSERT3U(mg->mg_ms_initializing, <=, max_initialize_ms);
-}
-
-/*
- * Mark the metaslab as being initialized to prevent any allocations
- * on this metaslab. We must also track how many metaslabs are currently
- * being initialized within a metaslab group and limit them to prevent
- * allocation failures from occurring because all metaslabs are being
- * initialized.
- */
-static void
-vdev_initialize_ms_mark(metaslab_t *msp)
-{
-       ASSERT(!MUTEX_HELD(&msp->ms_lock));
-       metaslab_group_t *mg = msp->ms_group;
-
-       mutex_enter(&mg->mg_ms_initialize_lock);
-
-       /*
-        * To keep an accurate count of how many threads are initializing
-        * a specific metaslab group, we only allow one thread to mark
-        * the metaslab group at a time. This ensures that the value of
-        * ms_initializing will be accurate when we decide to mark a metaslab
-        * group as being initialized. To do this we force all other threads
-        * to wait till the metaslab's mg_initialize_updating flag is no
-        * longer set.
-        */
-       vdev_initialize_mg_wait(mg);
-       mg->mg_initialize_updating = B_TRUE;
-       if (msp->ms_initializing == 0) {
-               vdev_initialize_mg_mark(mg);
-       }
-       mutex_enter(&msp->ms_lock);
-       msp->ms_initializing++;
-       mutex_exit(&msp->ms_lock);
-
-       mg->mg_initialize_updating = B_FALSE;
-       cv_broadcast(&mg->mg_ms_initialize_cv);
-       mutex_exit(&mg->mg_ms_initialize_lock);
-}
-
-static void
-vdev_initialize_ms_unmark(metaslab_t *msp)
-{
-       ASSERT(!MUTEX_HELD(&msp->ms_lock));
-       metaslab_group_t *mg = msp->ms_group;
-       mutex_enter(&mg->mg_ms_initialize_lock);
-       mutex_enter(&msp->ms_lock);
-       if (--msp->ms_initializing == 0) {
-               mg->mg_ms_initializing--;
-               cv_broadcast(&mg->mg_ms_initialize_cv);
-       }
-       mutex_exit(&msp->ms_lock);
-       mutex_exit(&mg->mg_ms_initialize_lock);
-}
-
 static void
 vdev_initialize_calculate_progress(vdev_t *vd)
 {
@@ -535,9 +411,8 @@ vdev_initialize_load(vdev_t *vd)
        return (err);
 }
 
-
 /*
- * Convert the logical range into a physcial range and add it to our
+ * Convert the logical range into a physical range and add it to our
  * avl tree.
  */
 void
@@ -618,7 +493,8 @@ vdev_initialize_thread(void *arg)
                        ms_count = vd->vdev_top->vdev_ms_count;
                }
 
-               vdev_initialize_ms_mark(msp);
+               spa_config_exit(spa, SCL_CONFIG, FTAG);
+               metaslab_disable(msp);
                mutex_enter(&msp->ms_lock);
                VERIFY0(metaslab_load(msp));
 
@@ -626,16 +502,8 @@ vdev_initialize_thread(void *arg)
                    vd);
                mutex_exit(&msp->ms_lock);
 
-               spa_config_exit(spa, SCL_CONFIG, FTAG);
                error = vdev_initialize_ranges(vd, deadbeef);
-
-               /*
-                * Wait for the outstanding IO to be synced to prevent
-                * newly allocated blocks from being overwritten.
-                */
-               txg_wait_synced(spa_get_dsl(spa), 0);
-
-               vdev_initialize_ms_unmark(msp);
+               metaslab_enable(msp, B_TRUE);
                spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
 
                range_tree_vacate(vd->vdev_initialize_tree, NULL, NULL);
@@ -853,12 +721,11 @@ vdev_initialize_restart(vdev_t *vd)
 }
 
 #if defined(_KERNEL)
-EXPORT_SYMBOL(vdev_initialize_restart);
-EXPORT_SYMBOL(vdev_xlate);
 EXPORT_SYMBOL(vdev_initialize);
 EXPORT_SYMBOL(vdev_initialize_stop);
 EXPORT_SYMBOL(vdev_initialize_stop_all);
 EXPORT_SYMBOL(vdev_initialize_stop_wait);
+EXPORT_SYMBOL(vdev_initialize_restart);
 
 /* CSTYLED */
 module_param(zfs_initialize_value, ulong, 0644);
index a03722d05e2558235884a0ac2ac860d782ec38ec..a0e373b3dfc51850cb2ddd9f5f1cc79abbdf1636 100644 (file)
@@ -251,6 +251,9 @@ vdev_config_generate_stats(vdev_t *vd, nvlist_t *nv)
        fnvlist_add_uint64(nvx, ZPOOL_CONFIG_VDEV_SCRUB_ACTIVE_QUEUE,
            vsx->vsx_active_queue[ZIO_PRIORITY_SCRUB]);
 
+       fnvlist_add_uint64(nvx, ZPOOL_CONFIG_VDEV_TRIM_ACTIVE_QUEUE,
+           vsx->vsx_active_queue[ZIO_PRIORITY_TRIM]);
+
        /* ZIOs pending */
        fnvlist_add_uint64(nvx, ZPOOL_CONFIG_VDEV_SYNC_R_PEND_QUEUE,
            vsx->vsx_pend_queue[ZIO_PRIORITY_SYNC_READ]);
@@ -267,6 +270,9 @@ vdev_config_generate_stats(vdev_t *vd, nvlist_t *nv)
        fnvlist_add_uint64(nvx, ZPOOL_CONFIG_VDEV_SCRUB_PEND_QUEUE,
            vsx->vsx_pend_queue[ZIO_PRIORITY_SCRUB]);
 
+       fnvlist_add_uint64(nvx, ZPOOL_CONFIG_VDEV_TRIM_PEND_QUEUE,
+           vsx->vsx_pend_queue[ZIO_PRIORITY_TRIM]);
+
        /* Histograms */
        fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_TOT_R_LAT_HISTO,
            vsx->vsx_total_histo[ZIO_TYPE_READ],
@@ -304,6 +310,10 @@ vdev_config_generate_stats(vdev_t *vd, nvlist_t *nv)
            vsx->vsx_queue_histo[ZIO_PRIORITY_SCRUB],
            ARRAY_SIZE(vsx->vsx_queue_histo[ZIO_PRIORITY_SCRUB]));
 
+       fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_TRIM_LAT_HISTO,
+           vsx->vsx_queue_histo[ZIO_PRIORITY_TRIM],
+           ARRAY_SIZE(vsx->vsx_queue_histo[ZIO_PRIORITY_TRIM]));
+
        /* Request sizes */
        fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_SYNC_IND_R_HISTO,
            vsx->vsx_ind_histo[ZIO_PRIORITY_SYNC_READ],
@@ -325,6 +335,10 @@ vdev_config_generate_stats(vdev_t *vd, nvlist_t *nv)
            vsx->vsx_ind_histo[ZIO_PRIORITY_SCRUB],
            ARRAY_SIZE(vsx->vsx_ind_histo[ZIO_PRIORITY_SCRUB]));
 
+       fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_IND_TRIM_HISTO,
+           vsx->vsx_ind_histo[ZIO_PRIORITY_TRIM],
+           ARRAY_SIZE(vsx->vsx_ind_histo[ZIO_PRIORITY_TRIM]));
+
        fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_SYNC_AGG_R_HISTO,
            vsx->vsx_agg_histo[ZIO_PRIORITY_SYNC_READ],
            ARRAY_SIZE(vsx->vsx_agg_histo[ZIO_PRIORITY_SYNC_READ]));
@@ -345,6 +359,10 @@ vdev_config_generate_stats(vdev_t *vd, nvlist_t *nv)
            vsx->vsx_agg_histo[ZIO_PRIORITY_SCRUB],
            ARRAY_SIZE(vsx->vsx_agg_histo[ZIO_PRIORITY_SCRUB]));
 
+       fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_AGG_TRIM_HISTO,
+           vsx->vsx_agg_histo[ZIO_PRIORITY_TRIM],
+           ARRAY_SIZE(vsx->vsx_agg_histo[ZIO_PRIORITY_TRIM]));
+
        /* IO delays */
        fnvlist_add_uint64(nvx, ZPOOL_CONFIG_VDEV_SLOW_IOS, vs->vs_slow_ios);
 
index a1861d5f0b08be26f6c0c3de3c24766a17551f61..e74df76b753083761fd2a6e416b2a3f9b1595cdc 100644 (file)
@@ -156,6 +156,8 @@ uint32_t zfs_vdev_removal_min_active = 1;
 uint32_t zfs_vdev_removal_max_active = 2;
 uint32_t zfs_vdev_initializing_min_active = 1;
 uint32_t zfs_vdev_initializing_max_active = 1;
+uint32_t zfs_vdev_trim_min_active = 1;
+uint32_t zfs_vdev_trim_max_active = 2;
 
 /*
  * When the pool has less than zfs_vdev_async_write_active_min_dirty_percent
@@ -203,6 +205,12 @@ int zfs_vdev_queue_depth_pct = 300;
  */
 int zfs_vdev_def_queue_depth = 32;
 
+/*
+ * Allow TRIM I/Os to be aggregated.  This should normally not be needed since
+ * TRIM I/Os for extents up to zfs_trim_extent_bytes_max (128M) can be submitted
+ * by the TRIM code in vdev_trim.c.
+ */
+int zfs_vdev_aggregate_trim = 0;
 
 int
 vdev_queue_offset_compare(const void *x1, const void *x2)
@@ -227,11 +235,13 @@ vdev_queue_class_tree(vdev_queue_t *vq, zio_priority_t p)
 static inline avl_tree_t *
 vdev_queue_type_tree(vdev_queue_t *vq, zio_type_t t)
 {
-       ASSERT(t == ZIO_TYPE_READ || t == ZIO_TYPE_WRITE);
+       ASSERT(t == ZIO_TYPE_READ || t == ZIO_TYPE_WRITE || t == ZIO_TYPE_TRIM);
        if (t == ZIO_TYPE_READ)
                return (&vq->vq_read_offset_tree);
-       else
+       else if (t == ZIO_TYPE_WRITE)
                return (&vq->vq_write_offset_tree);
+       else
+               return (&vq->vq_trim_offset_tree);
 }
 
 int
@@ -266,6 +276,8 @@ vdev_queue_class_min_active(zio_priority_t p)
                return (zfs_vdev_removal_min_active);
        case ZIO_PRIORITY_INITIALIZING:
                return (zfs_vdev_initializing_min_active);
+       case ZIO_PRIORITY_TRIM:
+               return (zfs_vdev_trim_min_active);
        default:
                panic("invalid priority %u", p);
                return (0);
@@ -338,6 +350,8 @@ vdev_queue_class_max_active(spa_t *spa, zio_priority_t p)
                return (zfs_vdev_removal_max_active);
        case ZIO_PRIORITY_INITIALIZING:
                return (zfs_vdev_initializing_max_active);
+       case ZIO_PRIORITY_TRIM:
+               return (zfs_vdev_trim_max_active);
        default:
                panic("invalid priority %u", p);
                return (0);
@@ -398,19 +412,25 @@ vdev_queue_init(vdev_t *vd)
        avl_create(vdev_queue_type_tree(vq, ZIO_TYPE_WRITE),
            vdev_queue_offset_compare, sizeof (zio_t),
            offsetof(struct zio, io_offset_node));
+       avl_create(vdev_queue_type_tree(vq, ZIO_TYPE_TRIM),
+           vdev_queue_offset_compare, sizeof (zio_t),
+           offsetof(struct zio, io_offset_node));
 
        for (p = 0; p < ZIO_PRIORITY_NUM_QUEUEABLE; p++) {
                int (*compfn) (const void *, const void *);
 
                /*
-                * The synchronous i/o queues are dispatched in FIFO rather
+                * The synchronous/trim i/o queues are dispatched in FIFO rather
                 * than LBA order. This provides more consistent latency for
                 * these i/os.
                 */
-               if (p == ZIO_PRIORITY_SYNC_READ || p == ZIO_PRIORITY_SYNC_WRITE)
+               if (p == ZIO_PRIORITY_SYNC_READ ||
+                   p == ZIO_PRIORITY_SYNC_WRITE ||
+                   p == ZIO_PRIORITY_TRIM) {
                        compfn = vdev_queue_timestamp_compare;
-               else
+               } else {
                        compfn = vdev_queue_offset_compare;
+               }
                avl_create(vdev_queue_class_tree(vq, p), compfn,
                    sizeof (zio_t), offsetof(struct zio, io_queue_node));
        }
@@ -428,6 +448,7 @@ vdev_queue_fini(vdev_t *vd)
        avl_destroy(&vq->vq_active_tree);
        avl_destroy(vdev_queue_type_tree(vq, ZIO_TYPE_READ));
        avl_destroy(vdev_queue_type_tree(vq, ZIO_TYPE_WRITE));
+       avl_destroy(vdev_queue_type_tree(vq, ZIO_TYPE_TRIM));
 
        mutex_destroy(&vq->vq_lock);
 }
@@ -559,6 +580,13 @@ vdev_queue_aggregate(vdev_queue_t *vq, zio_t *zio)
        if (zio->io_flags & ZIO_FLAG_DONT_AGGREGATE || limit == 0)
                return (NULL);
 
+       /*
+        * While TRIM commands could be aggregated based on offset this
+        * behavior is disabled until it's determined to be beneficial.
+        */
+       if (zio->io_type == ZIO_TYPE_TRIM && !zfs_vdev_aggregate_trim)
+               return (NULL);
+
        first = last = zio;
 
        if (zio->io_type == ZIO_TYPE_READ)
@@ -732,7 +760,7 @@ again:
         * For LBA-ordered queues (async / scrub / initializing), issue the
         * i/o which follows the most recently issued i/o in LBA (offset) order.
         *
-        * For FIFO queues (sync), issue the i/o with the lowest timestamp.
+        * For FIFO queues (sync/trim), issue the i/o with the lowest timestamp.
         */
        tree = vdev_queue_class_tree(vq, p);
        vq->vq_io_search.io_timestamp = 0;
@@ -783,19 +811,27 @@ vdev_queue_io(zio_t *zio)
         * not match the child's i/o type.  Fix it up here.
         */
        if (zio->io_type == ZIO_TYPE_READ) {
+               ASSERT(zio->io_priority != ZIO_PRIORITY_TRIM);
+
                if (zio->io_priority != ZIO_PRIORITY_SYNC_READ &&
                    zio->io_priority != ZIO_PRIORITY_ASYNC_READ &&
                    zio->io_priority != ZIO_PRIORITY_SCRUB &&
                    zio->io_priority != ZIO_PRIORITY_REMOVAL &&
-                   zio->io_priority != ZIO_PRIORITY_INITIALIZING)
+                   zio->io_priority != ZIO_PRIORITY_INITIALIZING) {
                        zio->io_priority = ZIO_PRIORITY_ASYNC_READ;
-       } else {
-               ASSERT(zio->io_type == ZIO_TYPE_WRITE);
+               }
+       } else if (zio->io_type == ZIO_TYPE_WRITE) {
+               ASSERT(zio->io_priority != ZIO_PRIORITY_TRIM);
+
                if (zio->io_priority != ZIO_PRIORITY_SYNC_WRITE &&
                    zio->io_priority != ZIO_PRIORITY_ASYNC_WRITE &&
                    zio->io_priority != ZIO_PRIORITY_REMOVAL &&
-                   zio->io_priority != ZIO_PRIORITY_INITIALIZING)
+                   zio->io_priority != ZIO_PRIORITY_INITIALIZING) {
                        zio->io_priority = ZIO_PRIORITY_ASYNC_WRITE;
+               }
+       } else {
+               ASSERT(zio->io_type == ZIO_TYPE_TRIM);
+               ASSERT(zio->io_priority == ZIO_PRIORITY_TRIM);
        }
 
        zio->io_flags |= ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE;
@@ -922,6 +958,9 @@ module_param(zfs_vdev_aggregation_limit_non_rotating, int, 0644);
 MODULE_PARM_DESC(zfs_vdev_aggregation_limit_non_rotating,
        "Max vdev I/O aggregation size for non-rotating media");
 
+module_param(zfs_vdev_aggregate_trim, int, 0644);
+MODULE_PARM_DESC(zfs_vdev_aggregate_trim, "Allow TRIM I/O to be aggregated");
+
 module_param(zfs_vdev_read_gap_limit, int, 0644);
 MODULE_PARM_DESC(zfs_vdev_read_gap_limit, "Aggregate read I/O over gap");
 
@@ -995,6 +1034,14 @@ module_param(zfs_vdev_sync_write_min_active, int, 0644);
 MODULE_PARM_DESC(zfs_vdev_sync_write_min_active,
        "Min active sync write I/Os per vdev");
 
+module_param(zfs_vdev_trim_max_active, int, 0644);
+MODULE_PARM_DESC(zfs_vdev_trim_max_active,
+       "Max active trim/discard I/Os per vdev");
+
+module_param(zfs_vdev_trim_min_active, int, 0644);
+MODULE_PARM_DESC(zfs_vdev_trim_min_active,
+       "Min active trim/discard I/Os per vdev");
+
 module_param(zfs_vdev_queue_depth_pct, int, 0644);
 MODULE_PARM_DESC(zfs_vdev_queue_depth_pct,
        "Queue depth percentage for each top-level vdev");
index d11287bdcf1df6e3c81b6b57c1a3a88836ff90c0..215cd1c12064a92f218a04bcdd8626f5287efa21 100644 (file)
@@ -37,7 +37,7 @@
 #include <sys/vdev_raidz_impl.h>
 
 #ifdef ZFS_DEBUG
-#include <sys/vdev_initialize.h>       /* vdev_xlate testing */
+#include <sys/vdev.h>  /* For vdev_xlate() in vdev_raidz_io_verify() */
 #endif
 
 /*
index 98bf2194a42e32d27b3fb30ce9825d15cc17ab56..99d67b7be85f1b66545f1b565508aab5a06d922d 100644 (file)
@@ -45,6 +45,7 @@
 #include <sys/vdev_indirect_mapping.h>
 #include <sys/abd.h>
 #include <sys/vdev_initialize.h>
+#include <sys/vdev_trim.h>
 #include <sys/trace_vdev.h>
 
 /*
@@ -1181,6 +1182,8 @@ vdev_remove_complete(spa_t *spa)
        txg = spa_vdev_enter(spa);
        vdev_t *vd = vdev_lookup_top(spa, spa->spa_vdev_removal->svr_vdev_id);
        ASSERT3P(vd->vdev_initialize_thread, ==, NULL);
+       ASSERT3P(vd->vdev_trim_thread, ==, NULL);
+       ASSERT3P(vd->vdev_autotrim_thread, ==, NULL);
 
        sysevent_t *ev = spa_event_create(spa, vd, NULL,
            ESC_ZFS_VDEV_REMOVE_DEV);
@@ -1869,8 +1872,10 @@ spa_vdev_remove_log(vdev_t *vd, uint64_t *txg)
 
        spa_vdev_config_exit(spa, NULL, *txg, 0, FTAG);
 
-       /* Stop initializing */
+       /* Stop initializing and TRIM */
        vdev_initialize_stop_all(vd, VDEV_INITIALIZE_CANCELED);
+       vdev_trim_stop_all(vd, VDEV_TRIM_CANCELED);
+       vdev_autotrim_stop_wait(vd);
 
        *txg = spa_vdev_config_enter(spa);
 
@@ -2051,11 +2056,13 @@ spa_vdev_remove_top(vdev_t *vd, uint64_t *txg)
        error = spa_reset_logs(spa);
 
        /*
-        * We stop any initializing that is currently in progress but leave
-        * the state as "active". This will allow the initializing to resume
-        * if the removal is canceled sometime later.
+        * We stop any initializing and TRIM that is currently in progress
+        * but leave the state as "active". This will allow the process to
+        * resume if the removal is canceled sometime later.
         */
        vdev_initialize_stop_all(vd, VDEV_INITIALIZE_ACTIVE);
+       vdev_trim_stop_all(vd, VDEV_TRIM_ACTIVE);
+       vdev_autotrim_stop_wait(vd);
 
        *txg = spa_vdev_config_enter(spa);
 
@@ -2069,6 +2076,8 @@ spa_vdev_remove_top(vdev_t *vd, uint64_t *txg)
        if (error != 0) {
                metaslab_group_activate(mg);
                spa_async_request(spa, SPA_ASYNC_INITIALIZE_RESTART);
+               spa_async_request(spa, SPA_ASYNC_TRIM_RESTART);
+               spa_async_request(spa, SPA_ASYNC_AUTOTRIM_RESTART);
                return (error);
        }
 
diff --git a/module/zfs/vdev_trim.c b/module/zfs/vdev_trim.c
new file mode 100644 (file)
index 0000000..5ad47cc
--- /dev/null
@@ -0,0 +1,1460 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2016 by Delphix. All rights reserved.
+ * Copyright (c) 2019 by Lawrence Livermore National Security, LLC.
+ */
+
+#include <sys/spa.h>
+#include <sys/spa_impl.h>
+#include <sys/txg.h>
+#include <sys/vdev_impl.h>
+#include <sys/vdev_trim.h>
+#include <sys/refcount.h>
+#include <sys/metaslab_impl.h>
+#include <sys/dsl_synctask.h>
+#include <sys/zap.h>
+#include <sys/dmu_tx.h>
+
+/*
+ * TRIM is a feature which is used to notify an SSD that some previously
+ * written space is no longer allocated by the pool.  This is useful because
+ * writes to an SSD must be performed to blocks which have first been erased.
+ * Ensuring the SSD always has a supply of erased blocks for new writes
+ * helps prevent performance from deteriorating.
+ *
+ * There are two supported TRIM methods; manual and automatic.
+ *
+ * Manual TRIM:
+ *
+ * A manual TRIM is initiated by running the 'zpool trim' command.  A single
+ * 'vdev_trim' thread is created for each leaf vdev, and it is responsible for
+ * managing that vdev's TRIM process.  This involves iterating over all the
+ * metaslabs, calculating the unallocated space ranges, and then issuing the
+ * required TRIM I/Os.
+ *
+ * While a metaslab is being actively trimmed, it is not eligible to perform
+ * new allocations.  After traversing all of the metaslabs, the thread is
+ * terminated.  Finally, both the requested options and current progress of
+ * the TRIM are regularly written to the pool.  This allows the TRIM to be
+ * suspended and resumed as needed.
+ *
+ * Automatic TRIM:
+ *
+ * An automatic TRIM is enabled by setting the 'autotrim' pool property
+ * to 'on'.  When enabled, a 'vdev_autotrim' thread is created for each
+ * top-level (not leaf) vdev in the pool.  These threads perform the same
+ * core TRIM process as a manual TRIM, but with a few key differences.
+ *
+ * 1) Automatic TRIM happens continuously in the background and operates
+ *    solely on recently freed blocks (ms_trim not ms_allocatable).
+ *
+ * 2) Each thread is associated with a top-level (not leaf) vdev.  This has
+ *    the benefit of simplifying the threading model, it makes it easier
+ *    to coordinate administrative commands, and it ensures only a single
+ *    metaslab is disabled at a time.  Unlike manual TRIM, this means each
+ *    'vdev_autotrim' thread is responsible for issuing TRIM I/Os for its
+ *    children.
+ *
+ * 3) There is no automatic TRIM progress information stored on disk, nor
+ *    is it reported by 'zpool status'.
+ *
+ * While the automatic TRIM process is highly effective it is more likely
+ * than a manual TRIM to encounter tiny ranges.  Ranges less than or equal to
+ * 'zfs_trim_extent_bytes_min' (32k) are considered too small to efficiently
+ * TRIM and are skipped.  This means small amounts of freed space may not
+ * be automatically trimmed.
+ *
+ * Furthermore, devices with attached hot spares and devices being actively
+ * replaced are skipped.  This is done to avoid adding additional stress to
+ * a potentially unhealthy device and to minimize the required rebuild time.
+ *
+ * For this reason it may be beneficial to occasionally manually TRIM a pool
+ * even when automatic TRIM is enabled.
+ */
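+
+/*
+ * For illustration, a minimal usage sketch of the two methods described
+ * above (the pool name 'tank' is a placeholder):
+ *
+ *   # zpool trim tank              Start a manual TRIM of every leaf vdev.
+ *   # zpool set autotrim=on tank   Enable background automatic TRIM.
+ *
+ * Suspend and cancel requests map to the POOL_TRIM_SUSPEND and
+ * POOL_TRIM_CANCEL commands handled by the 'zpool trim' ioctl added
+ * later in this change.
+ */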
+
+/*
+ * Maximum size of a TRIM I/O; larger ranges will be chunked into 128 MiB
+ * lengths.
+ */
+unsigned int zfs_trim_extent_bytes_max = 128 * 1024 * 1024;
+
+/*
+ * Minimum size of a TRIM I/O; extents smaller than 32 KiB will be skipped.
+ */
+unsigned int zfs_trim_extent_bytes_min = 32 * 1024;
+
+/*
+ * Skip uninitialized metaslabs during the TRIM process.  This option is
+ * useful for pools constructed from large thinly-provisioned devices where
+ * TRIM operations are slow.  As a pool ages, an increasing fraction of
+ * the pool's metaslabs will be initialized, progressively degrading the
+ * usefulness of this option.  This setting is stored when starting a
+ * manual TRIM and will persist for the duration of the requested TRIM.
+ */
+unsigned int zfs_trim_metaslab_skip = 0;
+
+/*
+ * Maximum number of queued TRIM I/Os per leaf vdev.  The number of
+ * concurrent TRIM I/Os issued to the device is controlled by the
+ * zfs_vdev_trim_min_active and zfs_vdev_trim_max_active module options.
+ */
+unsigned int zfs_trim_queue_limit = 10;
+
+/*
+ * The minimum number of transaction groups between automatic trims of a
+ * metaslab.  This setting represents a trade-off between issuing more
+ * efficient TRIM operations, by allowing them to be aggregated longer,
+ * and issuing them promptly so the trimmed space is available.  Note
+ * that this value is a minimum; metaslabs can be trimmed less frequently
+ * when there are a large number of ranges which need to be trimmed.
+ *
+ * Increasing this value will allow frees to be aggregated for a longer
+ * time.  This can result in larger TRIM operations and increased memory
+ * usage in order to track the ranges to be trimmed.  Decreasing this value
+ * has the opposite effect.  The default value of 32 was determined through
+ * testing to be a reasonable compromise.
+ */
+unsigned int zfs_trim_txg_batch = 32;
+
+/*
+ * trim_args is a control structure which describes how a leaf vdev
+ * should be trimmed.  The core elements are the vdev, the metaslab being
+ * trimmed and a range tree containing the extents to TRIM.  All provided
+ * ranges must be within the metaslab.
+ */
+typedef struct trim_args {
+       /*
+        * These fields are set by the caller of vdev_trim_ranges().
+        */
+       vdev_t          *trim_vdev;             /* Leaf vdev to TRIM */
+       metaslab_t      *trim_msp;              /* Disabled metaslab */
+       range_tree_t    *trim_tree;             /* TRIM ranges (in metaslab) */
+       trim_type_t     trim_type;              /* Manual or auto TRIM */
+       uint64_t        trim_extent_bytes_max;  /* Maximum TRIM I/O size */
+       uint64_t        trim_extent_bytes_min;  /* Minimum TRIM I/O size */
+       enum trim_flag  trim_flags;             /* TRIM flags (secure) */
+
+       /*
+        * These fields are updated by vdev_trim_ranges().
+        */
+       hrtime_t        trim_start_time;        /* Start time */
+       uint64_t        trim_bytes_done;        /* Bytes trimmed */
+} trim_args_t;
+
+/*
+ * Determines whether a vdev_trim_thread() should be stopped.
+ */
+static boolean_t
+vdev_trim_should_stop(vdev_t *vd)
+{
+       return (vd->vdev_trim_exit_wanted || !vdev_writeable(vd) ||
+           vd->vdev_detached || vd->vdev_top->vdev_removing);
+}
+
+/*
+ * Determines whether a vdev_autotrim_thread() should be stopped.
+ */
+static boolean_t
+vdev_autotrim_should_stop(vdev_t *tvd)
+{
+       return (tvd->vdev_autotrim_exit_wanted ||
+           !vdev_writeable(tvd) || tvd->vdev_removing ||
+           spa_get_autotrim(tvd->vdev_spa) == SPA_AUTOTRIM_OFF);
+}
+
+/*
+ * The sync task for updating the on-disk state of a manual TRIM.  This
+ * is scheduled by vdev_trim_change_state().
+ */
+static void
+vdev_trim_zap_update_sync(void *arg, dmu_tx_t *tx)
+{
+       /*
+        * We pass in the guid instead of the vdev_t since the vdev may
+        * have been freed prior to the sync task being processed.  This
+        * happens when a vdev is detached as we call spa_config_vdev_exit(),
+        * stop the trimming thread, schedule the sync task, and free
+        * the vdev. Later when the scheduled sync task is invoked, it would
+        * find that the vdev has been freed.
+        */
+       uint64_t guid = *(uint64_t *)arg;
+       uint64_t txg = dmu_tx_get_txg(tx);
+       kmem_free(arg, sizeof (uint64_t));
+
+       vdev_t *vd = spa_lookup_by_guid(tx->tx_pool->dp_spa, guid, B_FALSE);
+       if (vd == NULL || vd->vdev_top->vdev_removing || !vdev_is_concrete(vd))
+               return;
+
+       uint64_t last_offset = vd->vdev_trim_offset[txg & TXG_MASK];
+       vd->vdev_trim_offset[txg & TXG_MASK] = 0;
+
+       VERIFY3U(vd->vdev_leaf_zap, !=, 0);
+
+       objset_t *mos = vd->vdev_spa->spa_meta_objset;
+
+       if (last_offset > 0 || vd->vdev_trim_last_offset == UINT64_MAX) {
+
+               if (vd->vdev_trim_last_offset == UINT64_MAX)
+                       last_offset = 0;
+
+               vd->vdev_trim_last_offset = last_offset;
+               VERIFY0(zap_update(mos, vd->vdev_leaf_zap,
+                   VDEV_LEAF_ZAP_TRIM_LAST_OFFSET,
+                   sizeof (last_offset), 1, &last_offset, tx));
+       }
+
+       if (vd->vdev_trim_action_time > 0) {
+               uint64_t val = (uint64_t)vd->vdev_trim_action_time;
+               VERIFY0(zap_update(mos, vd->vdev_leaf_zap,
+                   VDEV_LEAF_ZAP_TRIM_ACTION_TIME, sizeof (val),
+                   1, &val, tx));
+       }
+
+       if (vd->vdev_trim_rate > 0) {
+               uint64_t rate = (uint64_t)vd->vdev_trim_rate;
+
+               if (rate == UINT64_MAX)
+                       rate = 0;
+
+               VERIFY0(zap_update(mos, vd->vdev_leaf_zap,
+                   VDEV_LEAF_ZAP_TRIM_RATE, sizeof (rate), 1, &rate, tx));
+       }
+
+       uint64_t partial = vd->vdev_trim_partial;
+       if (partial == UINT64_MAX)
+               partial = 0;
+
+       VERIFY0(zap_update(mos, vd->vdev_leaf_zap, VDEV_LEAF_ZAP_TRIM_PARTIAL,
+           sizeof (partial), 1, &partial, tx));
+
+       uint64_t secure = vd->vdev_trim_secure;
+       if (secure == UINT64_MAX)
+               secure = 0;
+
+       VERIFY0(zap_update(mos, vd->vdev_leaf_zap, VDEV_LEAF_ZAP_TRIM_SECURE,
+           sizeof (secure), 1, &secure, tx));
+
+
+       uint64_t trim_state = vd->vdev_trim_state;
+       VERIFY0(zap_update(mos, vd->vdev_leaf_zap, VDEV_LEAF_ZAP_TRIM_STATE,
+           sizeof (trim_state), 1, &trim_state, tx));
+}
+
+/*
+ * Update the on-disk state of a manual TRIM.  This is called to request
+ * that a TRIM be started/suspended/canceled, or to change one of the
+ * TRIM options (partial, secure, rate).
+ */
+static void
+vdev_trim_change_state(vdev_t *vd, vdev_trim_state_t new_state,
+    uint64_t rate, boolean_t partial, boolean_t secure)
+{
+       ASSERT(MUTEX_HELD(&vd->vdev_trim_lock));
+       spa_t *spa = vd->vdev_spa;
+
+       if (new_state == vd->vdev_trim_state)
+               return;
+
+       /*
+        * Copy the vd's guid, this will be freed by the sync task.
+        */
+       uint64_t *guid = kmem_zalloc(sizeof (uint64_t), KM_SLEEP);
+       *guid = vd->vdev_guid;
+
+       /*
+        * If we're suspending, then preserve the original start time.
+        */
+       if (vd->vdev_trim_state != VDEV_TRIM_SUSPENDED) {
+               vd->vdev_trim_action_time = gethrestime_sec();
+       }
+
+       /*
+        * If we're activating, then preserve the requested rate and trim
+        * method.  Setting the last offset and rate to UINT64_MAX serves as
+        * a sentinel to indicate they should be reset to their default values.
+        */
+       if (new_state == VDEV_TRIM_ACTIVE) {
+               if (vd->vdev_trim_state == VDEV_TRIM_COMPLETE ||
+                   vd->vdev_trim_state == VDEV_TRIM_CANCELED) {
+                       vd->vdev_trim_last_offset = UINT64_MAX;
+                       vd->vdev_trim_rate = UINT64_MAX;
+                       vd->vdev_trim_partial = UINT64_MAX;
+                       vd->vdev_trim_secure = UINT64_MAX;
+               }
+
+               if (rate != 0)
+                       vd->vdev_trim_rate = rate;
+
+               if (partial != 0)
+                       vd->vdev_trim_partial = partial;
+
+               if (secure != 0)
+                       vd->vdev_trim_secure = secure;
+       }
+
+       boolean_t resumed = !!(vd->vdev_trim_state == VDEV_TRIM_SUSPENDED);
+       vd->vdev_trim_state = new_state;
+
+       dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
+       VERIFY0(dmu_tx_assign(tx, TXG_WAIT));
+       dsl_sync_task_nowait(spa_get_dsl(spa), vdev_trim_zap_update_sync,
+           guid, 2, ZFS_SPACE_CHECK_NONE, tx);
+
+       switch (new_state) {
+       case VDEV_TRIM_ACTIVE:
+               spa_event_notify(spa, vd, NULL,
+                   resumed ? ESC_ZFS_TRIM_RESUME : ESC_ZFS_TRIM_START);
+               spa_history_log_internal(spa, "trim", tx,
+                   "vdev=%s activated", vd->vdev_path);
+               break;
+       case VDEV_TRIM_SUSPENDED:
+               spa_event_notify(spa, vd, NULL, ESC_ZFS_TRIM_SUSPEND);
+               spa_history_log_internal(spa, "trim", tx,
+                   "vdev=%s suspended", vd->vdev_path);
+               break;
+       case VDEV_TRIM_CANCELED:
+               spa_event_notify(spa, vd, NULL, ESC_ZFS_TRIM_CANCEL);
+               spa_history_log_internal(spa, "trim", tx,
+                   "vdev=%s canceled", vd->vdev_path);
+               break;
+       case VDEV_TRIM_COMPLETE:
+               spa_event_notify(spa, vd, NULL, ESC_ZFS_TRIM_FINISH);
+               spa_history_log_internal(spa, "trim", tx,
+                   "vdev=%s complete", vd->vdev_path);
+               break;
+       default:
+               panic("invalid state %llu", (unsigned long long)new_state);
+       }
+
+       dmu_tx_commit(tx);
+}
+
+/*
+ * The zio_done_func_t done callback for each manual TRIM issued.  It is
+ * responsible for updating the TRIM stats, reissuing failed TRIM I/Os,
+ * and limiting the number of in flight TRIM I/Os.
+ */
+static void
+vdev_trim_cb(zio_t *zio)
+{
+       vdev_t *vd = zio->io_vd;
+
+       mutex_enter(&vd->vdev_trim_io_lock);
+       if (zio->io_error == ENXIO && !vdev_writeable(vd)) {
+               /*
+                * The I/O failed because the vdev was unavailable; roll the
+                * last offset back. (This works because spa_sync waits on
+                * spa_txg_zio before it runs sync tasks.)
+                */
+               uint64_t *offset =
+                   &vd->vdev_trim_offset[zio->io_txg & TXG_MASK];
+               *offset = MIN(*offset, zio->io_offset);
+       } else {
+               if (zio->io_error != 0) {
+                       vd->vdev_stat.vs_trim_errors++;
+                       spa_iostats_trim_add(vd->vdev_spa, TRIM_TYPE_MANUAL,
+                           0, 0, 0, 0, 1, zio->io_orig_size);
+               } else {
+                       spa_iostats_trim_add(vd->vdev_spa, TRIM_TYPE_MANUAL,
+                           1, zio->io_orig_size, 0, 0, 0, 0);
+               }
+
+               vd->vdev_trim_bytes_done += zio->io_orig_size;
+       }
+
+       ASSERT3U(vd->vdev_trim_inflight[TRIM_TYPE_MANUAL], >, 0);
+       vd->vdev_trim_inflight[TRIM_TYPE_MANUAL]--;
+       cv_broadcast(&vd->vdev_trim_io_cv);
+       mutex_exit(&vd->vdev_trim_io_lock);
+
+       spa_config_exit(vd->vdev_spa, SCL_STATE_ALL, vd);
+}
+
+/*
+ * The zio_done_func_t done callback for each automatic TRIM issued.  It
+ * is responsible for updating the TRIM stats and limiting the number of
+ * in-flight TRIM I/Os.  Automatic TRIM I/Os are best effort and are
+ * never reissued on failure.
+ */
+static void
+vdev_autotrim_cb(zio_t *zio)
+{
+       vdev_t *vd = zio->io_vd;
+
+       mutex_enter(&vd->vdev_trim_io_lock);
+
+       if (zio->io_error != 0) {
+               vd->vdev_stat.vs_trim_errors++;
+               spa_iostats_trim_add(vd->vdev_spa, TRIM_TYPE_AUTO,
+                   0, 0, 0, 0, 1, zio->io_orig_size);
+       } else {
+               spa_iostats_trim_add(vd->vdev_spa, TRIM_TYPE_AUTO,
+                   1, zio->io_orig_size, 0, 0, 0, 0);
+       }
+
+       ASSERT3U(vd->vdev_trim_inflight[TRIM_TYPE_AUTO], >, 0);
+       vd->vdev_trim_inflight[TRIM_TYPE_AUTO]--;
+       cv_broadcast(&vd->vdev_trim_io_cv);
+       mutex_exit(&vd->vdev_trim_io_lock);
+
+       spa_config_exit(vd->vdev_spa, SCL_STATE_ALL, vd);
+}
+
+/*
+ * Returns the average TRIM rate in bytes/sec for ta->trim_vdev.
+ */
+static uint64_t
+vdev_trim_calculate_rate(trim_args_t *ta)
+{
+       return (ta->trim_bytes_done * 1000 /
+           (NSEC2MSEC(gethrtime() - ta->trim_start_time) + 1));
+}
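+
+/*
+ * For illustration, a worked example of the calculation above with
+ * hypothetical values: if 256 MiB have been trimmed and roughly 4
+ * seconds (4000 ms) have elapsed, then
+ *
+ *   rate = (256 * 1024 * 1024) * 1000 / (4000 + 1)
+ *        ~= 67,092,091 bytes/sec, or about 64 MiB/sec.
+ *
+ * The "+ 1" in the denominator avoids a division by zero when the
+ * TRIM has only just started.
+ */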
+
+/*
+ * Issues a physical TRIM and takes care of rate limiting (bytes/sec)
+ * and the number of concurrent TRIM I/Os.
+ */
+static int
+vdev_trim_range(trim_args_t *ta, uint64_t start, uint64_t size)
+{
+       vdev_t *vd = ta->trim_vdev;
+       spa_t *spa = vd->vdev_spa;
+
+       mutex_enter(&vd->vdev_trim_io_lock);
+
+       /*
+        * Limit manual TRIM I/Os to the requested rate.  This does not
+        * apply to automatic TRIM since no per-vdev rate can be specified.
+        */
+       if (ta->trim_type == TRIM_TYPE_MANUAL) {
+               while (vd->vdev_trim_rate != 0 && !vdev_trim_should_stop(vd) &&
+                   vdev_trim_calculate_rate(ta) > vd->vdev_trim_rate) {
+                       cv_timedwait_sig(&vd->vdev_trim_io_cv,
+                           &vd->vdev_trim_io_lock, ddi_get_lbolt() +
+                           MSEC_TO_TICK(10));
+               }
+       }
+       ta->trim_bytes_done += size;
+
+       /* Limit the number of in-flight TRIM I/Os */
+       while (vd->vdev_trim_inflight[0] + vd->vdev_trim_inflight[1] >=
+           zfs_trim_queue_limit) {
+               cv_wait(&vd->vdev_trim_io_cv, &vd->vdev_trim_io_lock);
+       }
+       vd->vdev_trim_inflight[ta->trim_type]++;
+       mutex_exit(&vd->vdev_trim_io_lock);
+
+       dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
+       VERIFY0(dmu_tx_assign(tx, TXG_WAIT));
+       uint64_t txg = dmu_tx_get_txg(tx);
+
+       spa_config_enter(spa, SCL_STATE_ALL, vd, RW_READER);
+       mutex_enter(&vd->vdev_trim_lock);
+
+       if (ta->trim_type == TRIM_TYPE_MANUAL &&
+           vd->vdev_trim_offset[txg & TXG_MASK] == 0) {
+               uint64_t *guid = kmem_zalloc(sizeof (uint64_t), KM_SLEEP);
+               *guid = vd->vdev_guid;
+
+               /* This is the first write of this txg. */
+               dsl_sync_task_nowait(spa_get_dsl(spa),
+                   vdev_trim_zap_update_sync, guid, 2,
+                   ZFS_SPACE_CHECK_RESERVED, tx);
+       }
+
+       /*
+        * We know the vdev_t will still be around since all consumers of
+        * vdev_free must stop the trimming first.
+        */
+       if ((ta->trim_type == TRIM_TYPE_MANUAL &&
+           vdev_trim_should_stop(vd)) ||
+           (ta->trim_type == TRIM_TYPE_AUTO &&
+           vdev_autotrim_should_stop(vd->vdev_top))) {
+               mutex_enter(&vd->vdev_trim_io_lock);
+               vd->vdev_trim_inflight[ta->trim_type]--;
+               mutex_exit(&vd->vdev_trim_io_lock);
+               spa_config_exit(vd->vdev_spa, SCL_STATE_ALL, vd);
+               mutex_exit(&vd->vdev_trim_lock);
+               dmu_tx_commit(tx);
+               return (SET_ERROR(EINTR));
+       }
+       mutex_exit(&vd->vdev_trim_lock);
+
+       if (ta->trim_type == TRIM_TYPE_MANUAL)
+               vd->vdev_trim_offset[txg & TXG_MASK] = start + size;
+
+       zio_nowait(zio_trim(spa->spa_txg_zio[txg & TXG_MASK], vd,
+           start, size, ta->trim_type == TRIM_TYPE_MANUAL ?
+           vdev_trim_cb : vdev_autotrim_cb, NULL,
+           ZIO_PRIORITY_TRIM, ZIO_FLAG_CANFAIL, ta->trim_flags));
+       /* vdev_trim_cb and vdev_autotrim_cb release SCL_STATE_ALL */
+
+       dmu_tx_commit(tx);
+
+       return (0);
+}
+
+/*
+ * Issues TRIM I/Os for all ranges in the provided ta->trim_tree range tree.
+ * Additional parameters describing how the TRIM should be performed must
+ * be set in the trim_args structure.  See the trim_args definition for
+ * additional information.
+ */
+static int
+vdev_trim_ranges(trim_args_t *ta)
+{
+       vdev_t *vd = ta->trim_vdev;
+       avl_tree_t *rt = &ta->trim_tree->rt_root;
+       uint64_t extent_bytes_max = ta->trim_extent_bytes_max;
+       uint64_t extent_bytes_min = ta->trim_extent_bytes_min;
+       spa_t *spa = vd->vdev_spa;
+
+       ta->trim_start_time = gethrtime();
+       ta->trim_bytes_done = 0;
+
+       for (range_seg_t *rs = avl_first(rt); rs != NULL;
+           rs = AVL_NEXT(rt, rs)) {
+               uint64_t size = rs->rs_end - rs->rs_start;
+
+               if (extent_bytes_min && size < extent_bytes_min) {
+                       spa_iostats_trim_add(spa, ta->trim_type,
+                           0, 0, 1, size, 0, 0);
+                       continue;
+               }
+
+               /* Split range into legally-sized physical chunks */
+               uint64_t writes_required = ((size - 1) / extent_bytes_max) + 1;
+
+               for (uint64_t w = 0; w < writes_required; w++) {
+                       int error;
+
+                       error = vdev_trim_range(ta, VDEV_LABEL_START_SIZE +
+                           rs->rs_start + (w * extent_bytes_max),
+                           MIN(size - (w * extent_bytes_max),
+                           extent_bytes_max));
+                       if (error != 0) {
+                               return (error);
+                       }
+               }
+       }
+
+       return (0);
+}
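+
+/*
+ * For illustration, a worked example of the chunking above with
+ * hypothetical values: with the default zfs_trim_extent_bytes_max of
+ * 128 MiB, a single 300 MiB extent yields
+ *
+ *   writes_required = ((300 MiB - 1) / 128 MiB) + 1 = 3
+ *
+ * and vdev_trim_range() is called three times, for 128 MiB, 128 MiB,
+ * and 44 MiB.  Extents smaller than zfs_trim_extent_bytes_min are not
+ * trimmed and are instead accounted as skipped in the TRIM statistics.
+ */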
+
+/*
+ * Calculates the completion percentage of a manual TRIM.
+ */
+static void
+vdev_trim_calculate_progress(vdev_t *vd)
+{
+       ASSERT(spa_config_held(vd->vdev_spa, SCL_CONFIG, RW_READER) ||
+           spa_config_held(vd->vdev_spa, SCL_CONFIG, RW_WRITER));
+       ASSERT(vd->vdev_leaf_zap != 0);
+
+       vd->vdev_trim_bytes_est = 0;
+       vd->vdev_trim_bytes_done = 0;
+
+       for (uint64_t i = 0; i < vd->vdev_top->vdev_ms_count; i++) {
+               metaslab_t *msp = vd->vdev_top->vdev_ms[i];
+               mutex_enter(&msp->ms_lock);
+
+               uint64_t ms_free = msp->ms_size -
+                   metaslab_allocated_space(msp);
+
+               if (vd->vdev_top->vdev_ops == &vdev_raidz_ops)
+                       ms_free /= vd->vdev_top->vdev_children;
+
+               /*
+                * Convert the metaslab range to a physical range
+                * on our vdev. We use this to determine if we are
+                * in the middle of this metaslab range.
+                */
+               range_seg_t logical_rs, physical_rs;
+               logical_rs.rs_start = msp->ms_start;
+               logical_rs.rs_end = msp->ms_start + msp->ms_size;
+               vdev_xlate(vd, &logical_rs, &physical_rs);
+
+               if (vd->vdev_trim_last_offset <= physical_rs.rs_start) {
+                       vd->vdev_trim_bytes_est += ms_free;
+                       mutex_exit(&msp->ms_lock);
+                       continue;
+               } else if (vd->vdev_trim_last_offset > physical_rs.rs_end) {
+                       vd->vdev_trim_bytes_done += ms_free;
+                       vd->vdev_trim_bytes_est += ms_free;
+                       mutex_exit(&msp->ms_lock);
+                       continue;
+               }
+
+               /*
+                * If we get here, we're in the middle of trimming this
+                * metaslab.  Load it and walk the free tree for more
+                * accurate progress estimation.
+                */
+               VERIFY0(metaslab_load(msp));
+
+               for (range_seg_t *rs = avl_first(&msp->ms_allocatable->rt_root);
+                   rs; rs = AVL_NEXT(&msp->ms_allocatable->rt_root, rs)) {
+                       logical_rs.rs_start = rs->rs_start;
+                       logical_rs.rs_end = rs->rs_end;
+                       vdev_xlate(vd, &logical_rs, &physical_rs);
+
+                       uint64_t size = physical_rs.rs_end -
+                           physical_rs.rs_start;
+                       vd->vdev_trim_bytes_est += size;
+                       if (vd->vdev_trim_last_offset >= physical_rs.rs_end) {
+                               vd->vdev_trim_bytes_done += size;
+                       } else if (vd->vdev_trim_last_offset >
+                           physical_rs.rs_start &&
+                           vd->vdev_trim_last_offset <=
+                           physical_rs.rs_end) {
+                               vd->vdev_trim_bytes_done +=
+                                   vd->vdev_trim_last_offset -
+                                   physical_rs.rs_start;
+                       }
+               }
+               mutex_exit(&msp->ms_lock);
+       }
+}
+
+/*
+ * Load from disk the vdev's manual TRIM information.  This includes the
+ * state, progress, and options provided when initiating the manual TRIM.
+ */
+static int
+vdev_trim_load(vdev_t *vd)
+{
+       int err = 0;
+       ASSERT(spa_config_held(vd->vdev_spa, SCL_CONFIG, RW_READER) ||
+           spa_config_held(vd->vdev_spa, SCL_CONFIG, RW_WRITER));
+       ASSERT(vd->vdev_leaf_zap != 0);
+
+       if (vd->vdev_trim_state == VDEV_TRIM_ACTIVE ||
+           vd->vdev_trim_state == VDEV_TRIM_SUSPENDED) {
+               err = zap_lookup(vd->vdev_spa->spa_meta_objset,
+                   vd->vdev_leaf_zap, VDEV_LEAF_ZAP_TRIM_LAST_OFFSET,
+                   sizeof (vd->vdev_trim_last_offset), 1,
+                   &vd->vdev_trim_last_offset);
+               if (err == ENOENT) {
+                       vd->vdev_trim_last_offset = 0;
+                       err = 0;
+               }
+
+               if (err == 0) {
+                       err = zap_lookup(vd->vdev_spa->spa_meta_objset,
+                           vd->vdev_leaf_zap, VDEV_LEAF_ZAP_TRIM_RATE,
+                           sizeof (vd->vdev_trim_rate), 1,
+                           &vd->vdev_trim_rate);
+                       if (err == ENOENT) {
+                               vd->vdev_trim_rate = 0;
+                               err = 0;
+                       }
+               }
+
+               if (err == 0) {
+                       err = zap_lookup(vd->vdev_spa->spa_meta_objset,
+                           vd->vdev_leaf_zap, VDEV_LEAF_ZAP_TRIM_PARTIAL,
+                           sizeof (vd->vdev_trim_partial), 1,
+                           &vd->vdev_trim_partial);
+                       if (err == ENOENT) {
+                               vd->vdev_trim_partial = 0;
+                               err = 0;
+                       }
+               }
+
+               if (err == 0) {
+                       err = zap_lookup(vd->vdev_spa->spa_meta_objset,
+                           vd->vdev_leaf_zap, VDEV_LEAF_ZAP_TRIM_SECURE,
+                           sizeof (vd->vdev_trim_secure), 1,
+                           &vd->vdev_trim_secure);
+                       if (err == ENOENT) {
+                               vd->vdev_trim_secure = 0;
+                               err = 0;
+                       }
+               }
+       }
+
+       vdev_trim_calculate_progress(vd);
+
+       return (err);
+}
+
+/*
+ * Convert the logical range into a physical range and add it to the
+ * range tree passed in the trim_args_t.
+ */
+static void
+vdev_trim_range_add(void *arg, uint64_t start, uint64_t size)
+{
+       trim_args_t *ta = arg;
+       vdev_t *vd = ta->trim_vdev;
+       range_seg_t logical_rs, physical_rs;
+       logical_rs.rs_start = start;
+       logical_rs.rs_end = start + size;
+
+       /*
+        * Every range to be trimmed must be part of ms_allocatable.
+        * When ZFS_DEBUG_TRIM is set, load the metaslab to verify this
+        * is always the case.
+        */
+       if (zfs_flags & ZFS_DEBUG_TRIM) {
+               metaslab_t *msp = ta->trim_msp;
+               VERIFY0(metaslab_load(msp));
+               VERIFY3B(msp->ms_loaded, ==, B_TRUE);
+               VERIFY(range_tree_find(msp->ms_allocatable, start, size));
+       }
+
+       ASSERT(vd->vdev_ops->vdev_op_leaf);
+       vdev_xlate(vd, &logical_rs, &physical_rs);
+
+       IMPLY(vd->vdev_top == vd,
+           logical_rs.rs_start == physical_rs.rs_start);
+       IMPLY(vd->vdev_top == vd,
+           logical_rs.rs_end == physical_rs.rs_end);
+
+       /*
+        * Only a manual TRIM will be traversing the vdev sequentially.
+        * For an automatic TRIM, all valid ranges should be added.
+        */
+       if (ta->trim_type == TRIM_TYPE_MANUAL) {
+
+               /* Only add segments that we have not visited yet */
+               if (physical_rs.rs_end <= vd->vdev_trim_last_offset)
+                       return;
+
+               /* Pick up where we left off mid-range. */
+               if (vd->vdev_trim_last_offset > physical_rs.rs_start) {
+                       ASSERT3U(physical_rs.rs_end, >,
+                           vd->vdev_trim_last_offset);
+                       physical_rs.rs_start = vd->vdev_trim_last_offset;
+               }
+       }
+
+       ASSERT3U(physical_rs.rs_end, >=, physical_rs.rs_start);
+
+       /*
+        * With raidz, it's possible that the logical range does not live on
+        * this leaf vdev.  We only add the physical range to this vdev's trim
+        * tree if it has a length greater than 0.
+        */
+       if (physical_rs.rs_end > physical_rs.rs_start) {
+               range_tree_add(ta->trim_tree, physical_rs.rs_start,
+                   physical_rs.rs_end - physical_rs.rs_start);
+       } else {
+               ASSERT3U(physical_rs.rs_end, ==, physical_rs.rs_start);
+       }
+}
+
+/*
+ * Each manual TRIM thread is responsible for trimming the unallocated
+ * space of a single leaf vdev.  This is accomplished by sequentially
+ * iterating over its top-level vdev's metaslabs and issuing TRIM I/O for
+ * the space described by its ms_allocatable.  While a metaslab is
+ * undergoing trimming it is not eligible for new allocations.
+ */
+static void
+vdev_trim_thread(void *arg)
+{
+       vdev_t *vd = arg;
+       spa_t *spa = vd->vdev_spa;
+       trim_args_t ta;
+       int error = 0;
+
+       /*
+        * The VDEV_LEAF_ZAP_TRIM_* entries may have been updated by
+        * vdev_trim().  Wait for the updated values to be reflected
+        * in the zap in order to start with the requested settings.
+        */
+       txg_wait_synced(spa_get_dsl(vd->vdev_spa), 0);
+
+       ASSERT(vdev_is_concrete(vd));
+       spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
+
+       vd->vdev_trim_last_offset = 0;
+       vd->vdev_trim_rate = 0;
+       vd->vdev_trim_partial = 0;
+       vd->vdev_trim_secure = 0;
+
+       VERIFY0(vdev_trim_load(vd));
+
+       ta.trim_vdev = vd;
+       ta.trim_extent_bytes_max = zfs_trim_extent_bytes_max;
+       ta.trim_extent_bytes_min = zfs_trim_extent_bytes_min;
+       ta.trim_tree = range_tree_create(NULL, NULL);
+       ta.trim_type = TRIM_TYPE_MANUAL;
+       ta.trim_flags = 0;
+
+       /*
+        * When a secure TRIM has been requested, infer that the intent
+        * is that everything must be trimmed.  Override the default
+        * minimum TRIM size to prevent ranges from being skipped.
+        */
+       if (vd->vdev_trim_secure) {
+               ta.trim_flags |= ZIO_TRIM_SECURE;
+               ta.trim_extent_bytes_min = SPA_MINBLOCKSIZE;
+       }
+
+       uint64_t ms_count = 0;
+       for (uint64_t i = 0; !vd->vdev_detached &&
+           i < vd->vdev_top->vdev_ms_count; i++) {
+               metaslab_t *msp = vd->vdev_top->vdev_ms[i];
+
+               /*
+                * If we've expanded the top-level vdev or it's our
+                * first pass, calculate our progress.
+                */
+               if (vd->vdev_top->vdev_ms_count != ms_count) {
+                       vdev_trim_calculate_progress(vd);
+                       ms_count = vd->vdev_top->vdev_ms_count;
+               }
+
+               spa_config_exit(spa, SCL_CONFIG, FTAG);
+               metaslab_disable(msp);
+               mutex_enter(&msp->ms_lock);
+               VERIFY0(metaslab_load(msp));
+
+               /*
+                * If a partial TRIM was requested, skip metaslabs which have
+                * never been initialized and thus have never been written.
+                */
+               if (msp->ms_sm == NULL && vd->vdev_trim_partial) {
+                       mutex_exit(&msp->ms_lock);
+                       metaslab_enable(msp, B_FALSE);
+                       spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
+                       vdev_trim_calculate_progress(vd);
+                       continue;
+               }
+
+               ta.trim_msp = msp;
+               range_tree_walk(msp->ms_allocatable, vdev_trim_range_add, &ta);
+               range_tree_vacate(msp->ms_trim, NULL, NULL);
+               mutex_exit(&msp->ms_lock);
+
+               error = vdev_trim_ranges(&ta);
+               metaslab_enable(msp, B_TRUE);
+               spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
+
+               range_tree_vacate(ta.trim_tree, NULL, NULL);
+               if (error != 0)
+                       break;
+       }
+
+       spa_config_exit(spa, SCL_CONFIG, FTAG);
+       mutex_enter(&vd->vdev_trim_io_lock);
+       while (vd->vdev_trim_inflight[0] > 0) {
+               cv_wait(&vd->vdev_trim_io_cv, &vd->vdev_trim_io_lock);
+       }
+       mutex_exit(&vd->vdev_trim_io_lock);
+
+       range_tree_destroy(ta.trim_tree);
+
+       mutex_enter(&vd->vdev_trim_lock);
+       if (!vd->vdev_trim_exit_wanted && vdev_writeable(vd)) {
+               vdev_trim_change_state(vd, VDEV_TRIM_COMPLETE,
+                   vd->vdev_trim_rate, vd->vdev_trim_partial,
+                   vd->vdev_trim_secure);
+       }
+       ASSERT(vd->vdev_trim_thread != NULL || vd->vdev_trim_inflight[0] == 0);
+
+       /*
+        * Drop the vdev_trim_lock while we sync out the txg since it's
+        * possible that a device might be trying to come online and must
+        * check to see if it needs to restart a trim. That thread will be
+        * holding the spa_config_lock which would prevent the txg_wait_synced
+        * from completing.
+        */
+       mutex_exit(&vd->vdev_trim_lock);
+       txg_wait_synced(spa_get_dsl(spa), 0);
+       mutex_enter(&vd->vdev_trim_lock);
+
+       vd->vdev_trim_thread = NULL;
+       cv_broadcast(&vd->vdev_trim_cv);
+       mutex_exit(&vd->vdev_trim_lock);
+}
+
+/*
+ * Initiates a manual TRIM for the vdev_t.  Callers must hold vdev_trim_lock;
+ * the vdev_t must be a leaf and cannot already be manually trimming.
+ */
+void
+vdev_trim(vdev_t *vd, uint64_t rate, boolean_t partial, boolean_t secure)
+{
+       ASSERT(MUTEX_HELD(&vd->vdev_trim_lock));
+       ASSERT(vd->vdev_ops->vdev_op_leaf);
+       ASSERT(vdev_is_concrete(vd));
+       ASSERT3P(vd->vdev_trim_thread, ==, NULL);
+       ASSERT(!vd->vdev_detached);
+       ASSERT(!vd->vdev_trim_exit_wanted);
+       ASSERT(!vd->vdev_top->vdev_removing);
+
+       vdev_trim_change_state(vd, VDEV_TRIM_ACTIVE, rate, partial, secure);
+       vd->vdev_trim_thread = thread_create(NULL, 0,
+           vdev_trim_thread, vd, 0, &p0, TS_RUN, maxclsyspri);
+}
+
+/*
+ * Wait for the trimming thread to be terminated (canceled or stopped).
+ */
+static void
+vdev_trim_stop_wait_impl(vdev_t *vd)
+{
+       ASSERT(MUTEX_HELD(&vd->vdev_trim_lock));
+
+       while (vd->vdev_trim_thread != NULL)
+               cv_wait(&vd->vdev_trim_cv, &vd->vdev_trim_lock);
+
+       ASSERT3P(vd->vdev_trim_thread, ==, NULL);
+       vd->vdev_trim_exit_wanted = B_FALSE;
+}
+
+/*
+ * Wait for the listed vdev TRIM threads to cleanly exit.
+ */
+void
+vdev_trim_stop_wait(spa_t *spa, list_t *vd_list)
+{
+       vdev_t *vd;
+
+       ASSERT(MUTEX_HELD(&spa_namespace_lock));
+
+       while ((vd = list_remove_head(vd_list)) != NULL) {
+               mutex_enter(&vd->vdev_trim_lock);
+               vdev_trim_stop_wait_impl(vd);
+               mutex_exit(&vd->vdev_trim_lock);
+       }
+}
+
+/*
+ * Stop trimming a device, with the resultant trimming state being tgt_state.
+ * For blocking behavior pass NULL for vd_list.  Otherwise, when a list_t is
+ * provided, the stopping vdev is inserted into the list.  Callers are then
+ * required to call vdev_trim_stop_wait() to block for all the trim threads
+ * to exit.  The caller must hold vdev_trim_lock and must not be writing to
+ * the spa config, as the trimming thread may try to enter the config as a
+ * reader before exiting.
+ */
+void
+vdev_trim_stop(vdev_t *vd, vdev_trim_state_t tgt_state, list_t *vd_list)
+{
+       ASSERT(!spa_config_held(vd->vdev_spa, SCL_CONFIG|SCL_STATE, RW_WRITER));
+       ASSERT(MUTEX_HELD(&vd->vdev_trim_lock));
+       ASSERT(vd->vdev_ops->vdev_op_leaf);
+       ASSERT(vdev_is_concrete(vd));
+
+       /*
+        * Allow cancel requests to proceed even if the trim thread has
+        * stopped.
+        */
+       if (vd->vdev_trim_thread == NULL && tgt_state != VDEV_TRIM_CANCELED)
+               return;
+
+       vdev_trim_change_state(vd, tgt_state, 0, 0, 0);
+       vd->vdev_trim_exit_wanted = B_TRUE;
+
+       if (vd_list == NULL) {
+               vdev_trim_stop_wait_impl(vd);
+       } else {
+               ASSERT(MUTEX_HELD(&spa_namespace_lock));
+               list_insert_tail(vd_list, vd);
+       }
+}
+
+/*
+ * Recursively requests that every concrete leaf vdev under 'vd' stop
+ * trimming, appending each to the provided vd_list.
+ */
+static void
+vdev_trim_stop_all_impl(vdev_t *vd, vdev_trim_state_t tgt_state,
+    list_t *vd_list)
+{
+       if (vd->vdev_ops->vdev_op_leaf && vdev_is_concrete(vd)) {
+               mutex_enter(&vd->vdev_trim_lock);
+               vdev_trim_stop(vd, tgt_state, vd_list);
+               mutex_exit(&vd->vdev_trim_lock);
+               return;
+       }
+
+       for (uint64_t i = 0; i < vd->vdev_children; i++) {
+               vdev_trim_stop_all_impl(vd->vdev_child[i], tgt_state,
+                   vd_list);
+       }
+}
+
+/*
+ * Convenience function to stop trimming of a vdev tree and set all trim
+ * thread pointers to NULL.
+ */
+void
+vdev_trim_stop_all(vdev_t *vd, vdev_trim_state_t tgt_state)
+{
+       spa_t *spa = vd->vdev_spa;
+       list_t vd_list;
+
+       ASSERT(MUTEX_HELD(&spa_namespace_lock));
+
+       list_create(&vd_list, sizeof (vdev_t),
+           offsetof(vdev_t, vdev_trim_node));
+
+       vdev_trim_stop_all_impl(vd, tgt_state, &vd_list);
+       vdev_trim_stop_wait(spa, &vd_list);
+
+       if (vd->vdev_spa->spa_sync_on) {
+               /* Make sure that our state has been synced to disk */
+               txg_wait_synced(spa_get_dsl(vd->vdev_spa), 0);
+       }
+
+       list_destroy(&vd_list);
+}
+
+/*
+ * Conditionally restarts a manual TRIM given its on-disk state.
+ */
+void
+vdev_trim_restart(vdev_t *vd)
+{
+       ASSERT(MUTEX_HELD(&spa_namespace_lock));
+       ASSERT(!spa_config_held(vd->vdev_spa, SCL_ALL, RW_WRITER));
+
+       if (vd->vdev_leaf_zap != 0) {
+               mutex_enter(&vd->vdev_trim_lock);
+               uint64_t trim_state = VDEV_TRIM_NONE;
+               int err = zap_lookup(vd->vdev_spa->spa_meta_objset,
+                   vd->vdev_leaf_zap, VDEV_LEAF_ZAP_TRIM_STATE,
+                   sizeof (trim_state), 1, &trim_state);
+               ASSERT(err == 0 || err == ENOENT);
+               vd->vdev_trim_state = trim_state;
+
+               uint64_t timestamp = 0;
+               err = zap_lookup(vd->vdev_spa->spa_meta_objset,
+                   vd->vdev_leaf_zap, VDEV_LEAF_ZAP_TRIM_ACTION_TIME,
+                   sizeof (timestamp), 1, &timestamp);
+               ASSERT(err == 0 || err == ENOENT);
+               vd->vdev_trim_action_time = (time_t)timestamp;
+
+               if (vd->vdev_trim_state == VDEV_TRIM_SUSPENDED ||
+                   vd->vdev_offline) {
+                       /* load progress for reporting, but don't resume */
+                       VERIFY0(vdev_trim_load(vd));
+               } else if (vd->vdev_trim_state == VDEV_TRIM_ACTIVE &&
+                   vdev_writeable(vd) && !vd->vdev_top->vdev_removing &&
+                   vd->vdev_trim_thread == NULL) {
+                       VERIFY0(vdev_trim_load(vd));
+                       vdev_trim(vd, vd->vdev_trim_rate,
+                           vd->vdev_trim_partial, vd->vdev_trim_secure);
+               }
+
+               mutex_exit(&vd->vdev_trim_lock);
+       }
+
+       for (uint64_t i = 0; i < vd->vdev_children; i++) {
+               vdev_trim_restart(vd->vdev_child[i]);
+       }
+}
+
+/*
+ * Used by the automatic TRIM when ZFS_DEBUG_TRIM is set to verify that
+ * every TRIM range is contained within ms_allocatable.
+ */
+static void
+vdev_trim_range_verify(void *arg, uint64_t start, uint64_t size)
+{
+       trim_args_t *ta = arg;
+       metaslab_t *msp = ta->trim_msp;
+
+       VERIFY3B(msp->ms_loaded, ==, B_TRUE);
+       VERIFY3U(msp->ms_disabled, >, 0);
+       VERIFY(range_tree_find(msp->ms_allocatable, start, size) != NULL);
+}
+
+/*
+ * Each automatic TRIM thread is responsible for managing the trimming of a
+ * top-level vdev in the pool.  No automatic TRIM state is maintained on-disk.
+ *
+ * N.B. This behavior is different from a manual TRIM where a thread
+ * is created for each leaf vdev, instead of each top-level vdev.
+ */
+static void
+vdev_autotrim_thread(void *arg)
+{
+       vdev_t *vd = arg;
+       spa_t *spa = vd->vdev_spa;
+       int shift = 0;
+
+       mutex_enter(&vd->vdev_autotrim_lock);
+       ASSERT3P(vd->vdev_top, ==, vd);
+       ASSERT3P(vd->vdev_autotrim_thread, !=, NULL);
+       mutex_exit(&vd->vdev_autotrim_lock);
+       spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
+
+       uint64_t extent_bytes_max = zfs_trim_extent_bytes_max;
+       uint64_t extent_bytes_min = zfs_trim_extent_bytes_min;
+
+       while (!vdev_autotrim_should_stop(vd)) {
+               int txgs_per_trim = MAX(zfs_trim_txg_batch, 1);
+               boolean_t issued_trim = B_FALSE;
+
+               /*
+                * All of the metaslabs are divided into groups of size
+                * num_metaslabs / zfs_trim_txg_batch.  Each of these groups
+                * is composed of metaslabs which are spread evenly over the
+                * device.
+                *
+                * For example, when zfs_trim_txg_batch = 32 (default) then
+                * group 0 will contain metaslabs 0, 32, 64, ...;
+                * group 1 will contain metaslabs 1, 33, 65, ...;
+                * group 2 will contain metaslabs 2, 34, 66, ...; and so on.
+                *
+                * On each pass through the while() loop one of these groups
+                * is selected.  This is accomplished by using a shift value
+                * to select the starting metaslab, then striding over the
+                * metaslabs using the zfs_trim_txg_batch size.  This is
+                * done to accomplish two things.
+                *
+                * 1) By dividing the metaslabs into groups, and making sure
+                *    that each group takes a minimum of one txg to process,
+                *    zfs_trim_txg_batch controls the minimum number of
+                *    txgs which must occur before a metaslab is revisited.
+                *
+                * 2) Selecting non-consecutive metaslabs distributes the
+                *    TRIM commands for a group evenly over the entire device.
+                *    This can be advantageous for certain types of devices.
+                */
+               for (uint64_t i = shift % txgs_per_trim; i < vd->vdev_ms_count;
+                   i += txgs_per_trim) {
+                       metaslab_t *msp = vd->vdev_ms[i];
+                       range_tree_t *trim_tree;
+
+                       spa_config_exit(spa, SCL_CONFIG, FTAG);
+                       metaslab_disable(msp);
+                       spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
+
+                       mutex_enter(&msp->ms_lock);
+
+                       /*
+                        * Skip the metaslab when it has never been allocated
+                        * or when there are no recent frees to trim.
+                        */
+                       if (msp->ms_sm == NULL ||
+                           range_tree_is_empty(msp->ms_trim)) {
+                               mutex_exit(&msp->ms_lock);
+                               metaslab_enable(msp, B_FALSE);
+                               continue;
+                       }
+
+                       /*
+                        * Skip the metaslab when it has already been disabled.
+                        * This may happen when a manual TRIM or initialize
+                        * operation is running concurrently.  In the case
+                        * of a manual TRIM, the ms_trim tree will have been
+                        * vacated.  Only ranges added after the manual TRIM
+                        * disabled the metaslab will be included in the tree.
+                        * These will be processed when the automatic TRIM
+                        * next revisits this metaslab.
+                        */
+                       if (msp->ms_disabled > 1) {
+                               mutex_exit(&msp->ms_lock);
+                               metaslab_enable(msp, B_FALSE);
+                               continue;
+                       }
+
+                       /*
+                        * Allocate an empty range tree which is swapped in
+                        * for the existing ms_trim tree while it is processed.
+                        */
+                       trim_tree = range_tree_create(NULL, NULL);
+                       range_tree_swap(&msp->ms_trim, &trim_tree);
+                       ASSERT(range_tree_is_empty(msp->ms_trim));
+
+                       /*
+                        * There are two cases when constructing the per-vdev
+                        * trim trees for a metaslab.  If the top-level vdev
+                        * has no children then it is also a leaf and should
+                        * be trimmed.  Otherwise our children are the leaves
+                        * and a trim tree should be constructed for each.
+                        */
+                       trim_args_t *tap;
+                       uint64_t children = vd->vdev_children;
+                       if (children == 0) {
+                               children = 1;
+                               tap = kmem_zalloc(sizeof (trim_args_t) *
+                                   children, KM_SLEEP);
+                               tap[0].trim_vdev = vd;
+                       } else {
+                               tap = kmem_zalloc(sizeof (trim_args_t) *
+                                   children, KM_SLEEP);
+
+                               for (uint64_t c = 0; c < children; c++) {
+                                       tap[c].trim_vdev = vd->vdev_child[c];
+                               }
+                       }
+
+                       for (uint64_t c = 0; c < children; c++) {
+                               trim_args_t *ta = &tap[c];
+                               vdev_t *cvd = ta->trim_vdev;
+
+                               ta->trim_msp = msp;
+                               ta->trim_extent_bytes_max = extent_bytes_max;
+                               ta->trim_extent_bytes_min = extent_bytes_min;
+                               ta->trim_type = TRIM_TYPE_AUTO;
+                               ta->trim_flags = 0;
+
+                               if (cvd->vdev_detached ||
+                                   !vdev_writeable(cvd) ||
+                                   !cvd->vdev_has_trim ||
+                                   cvd->vdev_trim_thread != NULL) {
+                                       continue;
+                               }
+
+                               /*
+                                * When a device has an attached hot spare, or
+                                * is being replaced, it will not be trimmed.
+                                * This is done to avoid adding additional
+                                * stress to a potentially unhealthy device,
+                                * and to minimize the required rebuild time.
+                                */
+                               if (!cvd->vdev_ops->vdev_op_leaf)
+                                       continue;
+
+                               ta->trim_tree = range_tree_create(NULL, NULL);
+                               range_tree_walk(trim_tree,
+                                   vdev_trim_range_add, ta);
+                       }
+
+                       mutex_exit(&msp->ms_lock);
+                       spa_config_exit(spa, SCL_CONFIG, FTAG);
+
+                       /*
+                        * Issue the TRIM I/Os for all ranges covered by the
+                        * TRIM trees.  These ranges are safe to TRIM because
+                        * no new allocations will be performed until the call
+                        * to metaslab_enable() below.
+                        */
+                       for (uint64_t c = 0; c < children; c++) {
+                               trim_args_t *ta = &tap[c];
+
+                               /*
+                                * Always yield to a manual TRIM if one has
+                                * been started for the child vdev.
+                                */
+                               if (ta->trim_tree == NULL ||
+                                   ta->trim_vdev->vdev_trim_thread != NULL) {
+                                       continue;
+                               }
+
+                               /*
+                                * After this point metaslab_enable() must be
+                                * called with the sync flag set.  This is done
+                                * here because vdev_trim_ranges() is allowed
+                                * to be interrupted (EINTR) before issuing all
+                                * of the required TRIM I/Os.
+                                */
+                               issued_trim = B_TRUE;
+
+                               int error = vdev_trim_ranges(ta);
+                               if (error)
+                                       break;
+                       }
+
+                       /*
+                        * Verify every range which was trimmed is still
+                        * contained within the ms_allocatable tree.
+                        */
+                       if (zfs_flags & ZFS_DEBUG_TRIM) {
+                               mutex_enter(&msp->ms_lock);
+                               VERIFY0(metaslab_load(msp));
+                               VERIFY3P(tap[0].trim_msp, ==, msp);
+                               range_tree_walk(trim_tree,
+                                   vdev_trim_range_verify, &tap[0]);
+                               mutex_exit(&msp->ms_lock);
+                       }
+
+                       range_tree_vacate(trim_tree, NULL, NULL);
+                       range_tree_destroy(trim_tree);
+
+                       metaslab_enable(msp, issued_trim);
+                       spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
+
+                       for (uint64_t c = 0; c < children; c++) {
+                               trim_args_t *ta = &tap[c];
+
+                               if (ta->trim_tree == NULL)
+                                       continue;
+
+                               range_tree_vacate(ta->trim_tree, NULL, NULL);
+                               range_tree_destroy(ta->trim_tree);
+                       }
+
+                       kmem_free(tap, sizeof (trim_args_t) * children);
+               }
+
+               spa_config_exit(spa, SCL_CONFIG, FTAG);
+
+               /*
+                * After completing the group of metaslabs wait for the next
+                * open txg.  This is done to make sure that a minimum of
+                * zfs_trim_txg_batch txgs will occur before these metaslabs
+                * are trimmed again.
+                */
+               txg_wait_open(spa_get_dsl(spa), 0, issued_trim);
+
+               shift++;
+               spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
+       }
+
+       for (uint64_t c = 0; c < vd->vdev_children; c++) {
+               vdev_t *cvd = vd->vdev_child[c];
+               mutex_enter(&cvd->vdev_trim_io_lock);
+
+               while (cvd->vdev_trim_inflight[1] > 0) {
+                       cv_wait(&cvd->vdev_trim_io_cv,
+                           &cvd->vdev_trim_io_lock);
+               }
+               mutex_exit(&cvd->vdev_trim_io_lock);
+       }
+
+       spa_config_exit(spa, SCL_CONFIG, FTAG);
+
+       /*
+        * When exiting because the autotrim property was set to off,
+        * abandon any unprocessed ms_trim ranges to reclaim the memory.
+        */
+       if (spa_get_autotrim(spa) == SPA_AUTOTRIM_OFF) {
+               for (uint64_t i = 0; i < vd->vdev_ms_count; i++) {
+                       metaslab_t *msp = vd->vdev_ms[i];
+
+                       mutex_enter(&msp->ms_lock);
+                       range_tree_vacate(msp->ms_trim, NULL, NULL);
+                       mutex_exit(&msp->ms_lock);
+               }
+       }
+
+       mutex_enter(&vd->vdev_autotrim_lock);
+       ASSERT(vd->vdev_autotrim_thread != NULL);
+       vd->vdev_autotrim_thread = NULL;
+       cv_broadcast(&vd->vdev_autotrim_cv);
+       mutex_exit(&vd->vdev_autotrim_lock);
+}
+
+/*
+ * Starts an autotrim thread, if needed, for each top-level vdev which can be
+ * trimmed.  A top-level vdev which has been evacuated will never be trimmed.
+ */
+void
+vdev_autotrim(spa_t *spa)
+{
+       vdev_t *root_vd = spa->spa_root_vdev;
+
+       for (uint64_t i = 0; i < root_vd->vdev_children; i++) {
+               vdev_t *tvd = root_vd->vdev_child[i];
+
+               mutex_enter(&tvd->vdev_autotrim_lock);
+               if (vdev_writeable(tvd) && !tvd->vdev_removing &&
+                   tvd->vdev_autotrim_thread == NULL) {
+                       ASSERT3P(tvd->vdev_top, ==, tvd);
+
+                       tvd->vdev_autotrim_thread = thread_create(NULL, 0,
+                           vdev_autotrim_thread, tvd, 0, &p0, TS_RUN,
+                           maxclsyspri);
+                       ASSERT(tvd->vdev_autotrim_thread != NULL);
+               }
+               mutex_exit(&tvd->vdev_autotrim_lock);
+       }
+}
+
+/*
+ * Wait for the vdev_autotrim_thread associated with the passed top-level
+ * vdev to be terminated (canceled or stopped).
+ */
+void
+vdev_autotrim_stop_wait(vdev_t *tvd)
+{
+       mutex_enter(&tvd->vdev_autotrim_lock);
+       if (tvd->vdev_autotrim_thread != NULL) {
+               tvd->vdev_autotrim_exit_wanted = B_TRUE;
+
+               while (tvd->vdev_autotrim_thread != NULL) {
+                       cv_wait(&tvd->vdev_autotrim_cv,
+                           &tvd->vdev_autotrim_lock);
+               }
+
+               ASSERT3P(tvd->vdev_autotrim_thread, ==, NULL);
+               tvd->vdev_autotrim_exit_wanted = B_FALSE;
+       }
+       mutex_exit(&tvd->vdev_autotrim_lock);
+}
+
+/*
+ * Wait for all of the vdev_autotrim_threads associated with the pool to
+ * be terminated (canceled or stopped).
+ */
+void
+vdev_autotrim_stop_all(spa_t *spa)
+{
+       vdev_t *root_vd = spa->spa_root_vdev;
+
+       for (uint64_t i = 0; i < root_vd->vdev_children; i++)
+               vdev_autotrim_stop_wait(root_vd->vdev_child[i]);
+}
+
+/*
+ * Conditionally restart all of the vdev_autotrim_threads for the pool.
+ */
+void
+vdev_autotrim_restart(spa_t *spa)
+{
+       ASSERT(MUTEX_HELD(&spa_namespace_lock));
+
+       if (spa->spa_autotrim)
+               vdev_autotrim(spa);
+}
+
+#if defined(_KERNEL)
+EXPORT_SYMBOL(vdev_trim);
+EXPORT_SYMBOL(vdev_trim_stop);
+EXPORT_SYMBOL(vdev_trim_stop_all);
+EXPORT_SYMBOL(vdev_trim_stop_wait);
+EXPORT_SYMBOL(vdev_trim_restart);
+EXPORT_SYMBOL(vdev_autotrim);
+EXPORT_SYMBOL(vdev_autotrim_stop_all);
+EXPORT_SYMBOL(vdev_autotrim_stop_wait);
+EXPORT_SYMBOL(vdev_autotrim_restart);
+
+/* BEGIN CSTYLED */
+module_param(zfs_trim_extent_bytes_max, uint, 0644);
+MODULE_PARM_DESC(zfs_trim_extent_bytes_max,
+    "Max size of TRIM commands, larger will be split");
+
+module_param(zfs_trim_extent_bytes_min, uint, 0644);
+MODULE_PARM_DESC(zfs_trim_extent_bytes_min,
+    "Min size of TRIM commands, smaller will be skipped");
+
+module_param(zfs_trim_metaslab_skip, uint, 0644);
+MODULE_PARM_DESC(zfs_trim_metaslab_skip,
+    "Skip metaslabs which have never been initialized");
+
+module_param(zfs_trim_txg_batch, uint, 0644);
+MODULE_PARM_DESC(zfs_trim_txg_batch,
+    "Min number of txgs to aggregate frees before issuing TRIM");
+
+module_param(zfs_trim_queue_limit, uint, 0644);
+MODULE_PARM_DESC(zfs_trim_queue_limit,
+    "Max queued TRIMs outstanding per leaf vdev");
+/* END CSTYLED */
+#endif
index 047193c61412ef6678143f724cf83085f4c96384..debe733dab7ca672be1be721080d3d114455260e 100644 (file)
 #include <sys/zfs_sysfs.h>
 #include <sys/vdev_impl.h>
 #include <sys/vdev_initialize.h>
+#include <sys/vdev_trim.h>
 
 #include <linux/miscdevice.h>
 #include <linux/slab.h>
@@ -3885,7 +3886,7 @@ zfs_ioc_destroy(zfs_cmd_t *zc)
 
 /*
  * innvl: {
- *     "initialize_command" -> POOL_INITIALIZE_{CANCEL|DO|SUSPEND} (uint64)
+ *     "initialize_command" -> POOL_INITIALIZE_{CANCEL|START|SUSPEND} (uint64)
  *     "initialize_vdevs": { -> guids to initialize (nvlist)
  *         "vdev_path_1": vdev_guid_1, (uint64),
  *         "vdev_path_2": vdev_guid_2, (uint64),
@@ -3919,7 +3920,7 @@ zfs_ioc_pool_initialize(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl)
        }
 
        if (!(cmd_type == POOL_INITIALIZE_CANCEL ||
-           cmd_type == POOL_INITIALIZE_DO ||
+           cmd_type == POOL_INITIALIZE_START ||
            cmd_type == POOL_INITIALIZE_SUSPEND)) {
                return (SET_ERROR(EINVAL));
        }
@@ -3957,6 +3958,91 @@ zfs_ioc_pool_initialize(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl)
        return (total_errors > 0 ? EINVAL : 0);
 }
 
+/*
+ * innvl: {
+ *     "trim_command" -> POOL_TRIM_{CANCEL|START|SUSPEND} (uint64)
+ *     "trim_vdevs": { -> guids to TRIM (nvlist)
+ *         "vdev_path_1": vdev_guid_1, (uint64),
+ *         "vdev_path_2": vdev_guid_2, (uint64),
+ *         ...
+ *     },
+ *     "trim_rate" -> Target TRIM rate in bytes/sec.
+ *     "trim_secure" -> Set to request a secure TRIM.
+ * }
+ *
+ * outnvl: {
+ *     "trim_vdevs": { -> TRIM errors (nvlist)
+ *         "vdev_path_1": errno, see function body for possible errnos (uint64)
+ *         "vdev_path_2": errno, ... (uint64)
+ *         ...
+ *     }
+ * }
+ *
+ * EINVAL is returned for an unknown command or if any of the provided vdev
+ * guids have been specified with a type other than uint64.
+ */
+static const zfs_ioc_key_t zfs_keys_pool_trim[] = {
+       {ZPOOL_TRIM_COMMAND,    DATA_TYPE_UINT64,               0},
+       {ZPOOL_TRIM_VDEVS,      DATA_TYPE_NVLIST,               0},
+       {ZPOOL_TRIM_RATE,       DATA_TYPE_UINT64,               ZK_OPTIONAL},
+       {ZPOOL_TRIM_SECURE,     DATA_TYPE_BOOLEAN_VALUE,        ZK_OPTIONAL},
+};
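+
+/*
+ * For illustration, a minimal sketch of how a caller might build the
+ * innvl described above using the fnvlist interfaces.  The vdev path
+ * and guid are placeholders:
+ *
+ *     nvlist_t *innvl = fnvlist_alloc();
+ *     nvlist_t *vdevs = fnvlist_alloc();
+ *
+ *     fnvlist_add_uint64(vdevs, "vdev_path_1", vdev_guid_1);
+ *     fnvlist_add_uint64(innvl, ZPOOL_TRIM_COMMAND, POOL_TRIM_START);
+ *     fnvlist_add_nvlist(innvl, ZPOOL_TRIM_VDEVS, vdevs);
+ *     fnvlist_add_uint64(innvl, ZPOOL_TRIM_RATE, 64 * 1024 * 1024);
+ *     fnvlist_add_boolean_value(innvl, ZPOOL_TRIM_SECURE, B_FALSE);
+ *
+ *     fnvlist_free(vdevs);
+ *     fnvlist_free(innvl);
+ */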
+
+static int
+zfs_ioc_pool_trim(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl)
+{
+       uint64_t cmd_type;
+       if (nvlist_lookup_uint64(innvl, ZPOOL_TRIM_COMMAND, &cmd_type) != 0)
+               return (SET_ERROR(EINVAL));
+
+       if (!(cmd_type == POOL_TRIM_CANCEL ||
+           cmd_type == POOL_TRIM_START ||
+           cmd_type == POOL_TRIM_SUSPEND)) {
+               return (SET_ERROR(EINVAL));
+       }
+
+       nvlist_t *vdev_guids;
+       if (nvlist_lookup_nvlist(innvl, ZPOOL_TRIM_VDEVS, &vdev_guids) != 0)
+               return (SET_ERROR(EINVAL));
+
+       for (nvpair_t *pair = nvlist_next_nvpair(vdev_guids, NULL);
+           pair != NULL; pair = nvlist_next_nvpair(vdev_guids, pair)) {
+               uint64_t vdev_guid;
+               if (nvpair_value_uint64(pair, &vdev_guid) != 0) {
+                       return (SET_ERROR(EINVAL));
+               }
+       }
+
+       /* Optional, defaults to maximum rate when not provided */
+       uint64_t rate;
+       if (nvlist_lookup_uint64(innvl, ZPOOL_TRIM_RATE, &rate) != 0)
+               rate = 0;
+
+       /* Optional, defaults to standard TRIM when not provided */
+       boolean_t secure;
+       if (nvlist_lookup_boolean_value(innvl, ZPOOL_TRIM_SECURE,
+           &secure) != 0) {
+               secure = B_FALSE;
+       }
+
+       spa_t *spa;
+       int error = spa_open(poolname, &spa, FTAG);
+       if (error != 0)
+               return (error);
+
+       nvlist_t *vdev_errlist = fnvlist_alloc();
+       int total_errors = spa_vdev_trim(spa, vdev_guids, cmd_type,
+           rate, !!zfs_trim_metaslab_skip, secure, vdev_errlist);
+
+       if (fnvlist_size(vdev_errlist) > 0)
+               fnvlist_add_nvlist(outnvl, ZPOOL_TRIM_VDEVS, vdev_errlist);
+
+       fnvlist_free(vdev_errlist);
+
+       spa_close(spa, FTAG);
+       return (total_errors > 0 ? EINVAL : 0);
+}
+
 /*
  * fsname is name of dataset to rollback (to most recent snapshot)
  *
@@ -6580,6 +6666,11 @@ zfs_ioctl_init(void)
            POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE,
            zfs_keys_pool_initialize, ARRAY_SIZE(zfs_keys_pool_initialize));
 
+       zfs_ioctl_register("trim", ZFS_IOC_POOL_TRIM,
+           zfs_ioc_pool_trim, zfs_secpolicy_config, POOL_NAME,
+           POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE,
+           zfs_keys_pool_trim, ARRAY_SIZE(zfs_keys_pool_trim));
+
        /* IOCTLS that use the legacy function signature */
 
        zfs_ioctl_register_legacy(ZFS_IOC_POOL_FREEZE, zfs_ioc_pool_freeze,
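For reference, here is a minimal userland sketch of building the input nvlist that the new zfs_ioc_pool_trim() handler expects, following the innvl layout documented above. The nvpair names and POOL_TRIM_START come from this patch; the vdev path and guid are placeholders, and error handling and the ioctl dispatch itself are omitted.

	nvlist_t *innvl = fnvlist_alloc();
	nvlist_t *vdevs = fnvlist_alloc();

	/* Request a standard TRIM of one vdev, capped at 100 MiB/s. */
	fnvlist_add_uint64(vdevs, "/dev/sda1", 0x1234abcd5678ef01ULL);
	fnvlist_add_uint64(innvl, ZPOOL_TRIM_COMMAND, POOL_TRIM_START);
	fnvlist_add_nvlist(innvl, ZPOOL_TRIM_VDEVS, vdevs);
	fnvlist_add_uint64(innvl, ZPOOL_TRIM_RATE, 100ULL << 20);	/* optional */
	fnvlist_add_boolean_value(innvl, ZPOOL_TRIM_SECURE, B_FALSE);	/* optional */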
index 87c4ac117c14199784583cace61a813f323243b5..ec8ae4216263799ce849358c4f653d8669a87fce 100644 (file)
@@ -358,7 +358,8 @@ pool_property_show(struct kobject *kobj, struct attribute *attr, char *buf)
  */
 static const char *zfs_features[]  = {
        /* --> Add new kernel features here (post ZoL 0.8.0) */
-       "vdev_initialize"
+       "initialize",
+       "trim",
 };
 
 #define        ZFS_FEATURE_COUNT       ARRAY_SIZE(zfs_features)
index 0912f607f258bf0ada16362151f73545878e4f8d..1915de417295de81f36f69bc24ee6c8afc554ead 100644 (file)
@@ -32,6 +32,7 @@
 #include <sys/txg.h>
 #include <sys/spa_impl.h>
 #include <sys/vdev_impl.h>
+#include <sys/vdev_trim.h>
 #include <sys/zio_impl.h>
 #include <sys/zio_compress.h>
 #include <sys/zio_checksum.h>
@@ -58,7 +59,7 @@ const char *zio_type_name[ZIO_TYPES] = {
         * Note: Linux kernel thread name length is limited
         * so these names will differ from upstream open zfs.
         */
-       "z_null", "z_rd", "z_wr", "z_fr", "z_cl", "z_ioctl"
+       "z_null", "z_rd", "z_wr", "z_fr", "z_cl", "z_ioctl", "z_trim"
 };
 
 int zio_dva_throttle_enabled = B_TRUE;
@@ -761,7 +762,7 @@ zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
 {
        zio_t *zio;
 
-       ASSERT3U(psize, <=, SPA_MAXBLOCKSIZE);
+       IMPLY(type != ZIO_TYPE_TRIM, psize <= SPA_MAXBLOCKSIZE);
        ASSERT(P2PHASE(psize, SPA_MINBLOCKSIZE) == 0);
        ASSERT(P2PHASE(offset, SPA_MINBLOCKSIZE) == 0);
 
@@ -1211,6 +1212,26 @@ zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd,
        return (zio);
 }
 
+zio_t *
+zio_trim(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
+    zio_done_func_t *done, void *private, zio_priority_t priority,
+    enum zio_flag flags, enum trim_flag trim_flags)
+{
+       zio_t *zio;
+
+       ASSERT0(vd->vdev_children);
+       ASSERT0(P2PHASE(offset, 1ULL << vd->vdev_ashift));
+       ASSERT0(P2PHASE(size, 1ULL << vd->vdev_ashift));
+       ASSERT3U(size, !=, 0);
+
+       zio = zio_create(pio, vd->vdev_spa, 0, NULL, NULL, size, size, done,
+           private, ZIO_TYPE_TRIM, priority, flags | ZIO_FLAG_PHYSICAL,
+           vd, offset, NULL, ZIO_STAGE_OPEN, ZIO_TRIM_PIPELINE);
+       zio->io_trim_flags = trim_flags;
+
+       return (zio);
+}
+
 zio_t *
 zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
     abd_t *data, int checksum, zio_done_func_t *done, void *private,
@@ -3562,7 +3583,6 @@ zio_alloc_zil(spa_t *spa, objset_t *os, uint64_t txg, blkptr_t *new_bp,
  * ==========================================================================
  */
 
-
 /*
  * Issue an I/O to the underlying vdev. Typically the issue pipeline
  * stops after this stage and will resume upon I/O completion.
@@ -3685,8 +3705,8 @@ zio_vdev_io_start(zio_t *zio)
                return (zio);
        }
 
-       if (vd->vdev_ops->vdev_op_leaf &&
-           (zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE)) {
+       if (vd->vdev_ops->vdev_op_leaf && (zio->io_type == ZIO_TYPE_READ ||
+           zio->io_type == ZIO_TYPE_WRITE || zio->io_type == ZIO_TYPE_TRIM)) {
 
                if (zio->io_type == ZIO_TYPE_READ && vdev_cache_read(zio))
                        return (zio);
@@ -3717,7 +3737,8 @@ zio_vdev_io_done(zio_t *zio)
                return (NULL);
        }
 
-       ASSERT(zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE);
+       ASSERT(zio->io_type == ZIO_TYPE_READ ||
+           zio->io_type == ZIO_TYPE_WRITE || zio->io_type == ZIO_TYPE_TRIM);
 
        if (zio->io_delay)
                zio->io_delay = gethrtime() - zio->io_delay;
@@ -3736,7 +3757,7 @@ zio_vdev_io_done(zio_t *zio)
                if (zio_injection_enabled && zio->io_error == 0)
                        zio->io_error = zio_handle_label_injection(zio, EIO);
 
-               if (zio->io_error) {
+               if (zio->io_error && zio->io_type != ZIO_TYPE_TRIM) {
                        if (!vdev_accessible(vd, zio)) {
                                zio->io_error = SET_ERROR(ENXIO);
                        } else {
@@ -3866,8 +3887,8 @@ zio_vdev_io_assess(zio_t *zio)
 
        /*
         * If a cache flush returns ENOTSUP or ENOTTY, we know that no future
-        * attempts will ever succeed. In this case we set a persistent bit so
-        * that we don't bother with it in the future.
+        * attempts will ever succeed. In this case we set a persistent
+        * boolean flag so that we don't bother with it in the future.
         */
        if ((zio->io_error == ENOTSUP || zio->io_error == ENOTTY) &&
            zio->io_type == ZIO_TYPE_IOCTL &&
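As a usage sketch for the new zio_trim() entry point above: a single aligned extent on a leaf vdev could be trimmed synchronously as shown below. ZIO_PRIORITY_TRIM and ZIO_FLAG_CANFAIL are assumed to be the priority and flag used for queued TRIM I/O, and the real callers live in vdev_trim.c, which is not part of this hunk.

	/*
	 * Illustrative only: offset and size must be aligned to the vdev's
	 * ashift (see the ASSERTs in zio_trim()), and trim_flags of 0
	 * requests a standard, non-secure TRIM.
	 */
	static int
	trim_one_extent(vdev_t *vd, uint64_t offset, uint64_t size)
	{
		zio_t *zio = zio_trim(NULL, vd, offset, size, NULL, NULL,
		    ZIO_PRIORITY_TRIM, ZIO_FLAG_CANFAIL, 0);

		return (zio_wait(zio));
	}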
index 211daaf27aa951b9d853ef73a1b58454e129e89e..aee1ad9ea85f78e5de6333d1b7c266a72b5872a3 100644 (file)
@@ -470,6 +470,17 @@ tags = ['functional', 'cli_root', 'zpool_status']
 tests = ['zpool_sync_001_pos', 'zpool_sync_002_neg']
 tags = ['functional', 'cli_root', 'zpool_sync']
 
+[tests/functional/cli_root/zpool_trim]
+tests = ['zpool_trim_attach_detach_add_remove',
+    'zpool_trim_import_export', 'zpool_trim_multiple', 'zpool_trim_neg',
+    'zpool_trim_offline_export_import_online', 'zpool_trim_online_offline',
+    'zpool_trim_partial', 'zpool_trim_rate', 'zpool_trim_rate_neg',
+    'zpool_trim_secure', 'zpool_trim_split', 'zpool_trim_start_and_cancel_neg',
+    'zpool_trim_start_and_cancel_pos', 'zpool_trim_suspend_resume',
+    'zpool_trim_unsupported_vdevs', 'zpool_trim_verify_checksums',
+    'zpool_trim_verify_trimmed']
+tags = ['functional', 'zpool_trim']
+
 [tests/functional/cli_root/zpool_upgrade]
 tests = ['zpool_upgrade_001_pos', 'zpool_upgrade_002_pos',
     'zpool_upgrade_003_pos', 'zpool_upgrade_004_pos',
@@ -839,6 +850,11 @@ tags = ['functional', 'threadsappend']
 tests = ['tmpfile_001_pos', 'tmpfile_002_pos', 'tmpfile_003_pos']
 tags = ['functional', 'tmpfile']
 
+[tests/functional/trim]
+tests = ['autotrim_integrity', 'autotrim_config', 'autotrim_trim_integrity',
+    'trim_integrity', 'trim_config']
+tags = ['functional', 'trim']
+
 [tests/functional/truncate]
 tests = ['truncate_001_pos', 'truncate_002_pos', 'truncate_timestamps']
 tags = ['functional', 'truncate']
index 953427bba476c9d62da22d7ca581f46a7c2cef87..d046c13a55ef022766dc0342c944b0b147bc7e82 100755 (executable)
@@ -128,6 +128,13 @@ enospc_reason = 'Exact free space reporting is not guaranteed'
 #
 fio_reason = 'Fio v2.3 or newer required'
 
+#
+# Some tests require that the DISKS provided support the discard operation.
+# Normally this is not an issue because loop back devices are used for DISKS
+# and they support discard (TRIM/UNMAP).
+#
+trim_reason = 'DISKS must support discard (TRIM/UNMAP)'
+
 #
 # Some tests are not applicable to Linux or need to be updated to operate
 # in the manner required by Linux.  Any tests which are skipped for this
@@ -235,6 +242,7 @@ maybe = {
         ['FAIL', rewind_reason],
     'cli_root/zpool_import/zpool_import_missing_003_pos': ['SKIP', '6839'],
     'cli_root/zpool_remove/setup': ['SKIP', disk_reason],
+    'cli_root/zpool_trim/setup': ['SKIP', trim_reason],
     'cli_root/zpool_upgrade/zpool_upgrade_004_pos': ['FAIL', '6141'],
     'cli_user/misc/arc_summary3_001_pos': ['SKIP', python_reason],
     'delegate/setup': ['SKIP', exec_reason],
@@ -267,6 +275,7 @@ maybe = {
     'snapused/snapused_004_pos': ['FAIL', '5513'],
     'tmpfile/setup': ['SKIP', tmpfile_reason],
     'threadsappend/threadsappend_001_pos': ['FAIL', '6136'],
+    'trim/setup': ['SKIP', trim_reason],
     'upgrade/upgrade_projectquota_001_pos': ['SKIP', project_id_reason],
     'user_namespace/setup': ['SKIP', user_ns_reason],
     'userquota/setup': ['SKIP', exec_reason],
index b927cd4e61da11816e9ccba43410531044f87bd3..8855a5358e3d784f1cdd799d275fd1bdbe3407e3 100644 (file)
@@ -169,7 +169,7 @@ lzc_ioctl_run(zfs_ioc_t ioc, const char *name, nvlist_t *innvl, int expected)
 }
 
 /*
- * Test each ioc for the folowing ioctl input errors:
+ * Test each ioc for the following ioctl input errors:
  *   ZFS_ERR_IOC_ARG_UNAVAIL   an input argument is not supported by kernel
  *   ZFS_ERR_IOC_ARG_REQUIRED  a required input argument is missing
  *   ZFS_ERR_IOC_ARG_BADTYPE   an input argument has an invalid type
@@ -650,7 +650,7 @@ test_vdev_initialize(const char *pool)
 
        fnvlist_add_uint64(vdev_guids, "path", 0xdeadbeefdeadbeef);
        fnvlist_add_uint64(required, ZPOOL_INITIALIZE_COMMAND,
-           POOL_INITIALIZE_DO);
+           POOL_INITIALIZE_START);
        fnvlist_add_nvlist(required, ZPOOL_INITIALIZE_VDEVS, vdev_guids);
 
        IOC_INPUT_TEST(ZFS_IOC_POOL_INITIALIZE, pool, required, NULL, EINVAL);
@@ -658,6 +658,25 @@ test_vdev_initialize(const char *pool)
        nvlist_free(required);
 }
 
+static void
+test_vdev_trim(const char *pool)
+{
+       nvlist_t *required = fnvlist_alloc();
+       nvlist_t *optional = fnvlist_alloc();
+       nvlist_t *vdev_guids = fnvlist_alloc();
+
+       fnvlist_add_uint64(vdev_guids, "path", 0xdeadbeefdeadbeef);
+       fnvlist_add_uint64(required, ZPOOL_TRIM_COMMAND, POOL_TRIM_START);
+       fnvlist_add_nvlist(required, ZPOOL_TRIM_VDEVS, vdev_guids);
+       fnvlist_add_uint64(optional, ZPOOL_TRIM_RATE, 1ULL << 30);
+       fnvlist_add_boolean_value(optional, ZPOOL_TRIM_SECURE, B_TRUE);
+
+       IOC_INPUT_TEST(ZFS_IOC_POOL_TRIM, pool, required, optional, EINVAL);
+       nvlist_free(vdev_guids);
+       nvlist_free(optional);
+       nvlist_free(required);
+}
+
 static int
 zfs_destroy(const char *dataset)
 {
@@ -749,6 +768,7 @@ zfs_ioc_input_tests(const char *pool)
        test_unload_key(dataset);
 
        test_vdev_initialize(pool);
+       test_vdev_trim(pool);
 
        /*
         * cleanup
@@ -888,6 +908,7 @@ validate_ioc_values(void)
            ZFS_IOC_BASE + 77 == ZFS_IOC_POOL_CHECKPOINT &&
            ZFS_IOC_BASE + 78 == ZFS_IOC_POOL_DISCARD_CHECKPOINT &&
            ZFS_IOC_BASE + 79 == ZFS_IOC_POOL_INITIALIZE &&
+           ZFS_IOC_BASE + 80 == ZFS_IOC_POOL_TRIM &&
            LINUX_IOC_BASE + 1 == ZFS_IOC_EVENTS_NEXT &&
            LINUX_IOC_BASE + 2 == ZFS_IOC_EVENTS_CLEAR &&
            LINUX_IOC_BASE + 3 == ZFS_IOC_EVENTS_SEEK);
index 045c122077729c02a082a8f03f78e7ce3070fa86..748a4f96dba979cebec244e2f4dbb665a4122fd0 100644 (file)
 # Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 # Use is subject to license terms.
 # Copyright (c) 2012, 2017 by Delphix. All rights reserved.
-# Copyright 2016 Nexenta Systems, Inc.
+# Copyright (c) 2017 by Tim Chase. All rights reserved.
+# Copyright (c) 2017 by Nexenta Systems, Inc. All rights reserved.
 # Copyright (c) 2017 Lawrence Livermore National Security, LLC.
 # Copyright (c) 2017 Datto Inc.
 # Copyright (c) 2017 Open-E, Inc. All Rights Reserved.
+# Use is subject to license terms.
 #
 
 . ${STF_TOOLS}/include/logapi.shlib
@@ -1092,14 +1094,14 @@ function fill_fs # destdir dirnum filenum bytes num_writes data
        typeset -i filenum=${3:-50}
        typeset -i bytes=${4:-8192}
        typeset -i num_writes=${5:-10240}
-       typeset -i data=${6:-0}
+       typeset data=${6:-0}
 
        typeset -i odirnum=1
        typeset -i idirnum=0
        typeset -i fn=0
        typeset -i retval=0
 
-       log_must mkdir -p $destdir/$idirnum
+       mkdir -p $destdir/$idirnum
        while (($odirnum > 0)); do
                if ((dirnum >= 0 && idirnum >= dirnum)); then
                        odirnum=0
@@ -1115,7 +1117,7 @@ function fill_fs # destdir dirnum filenum bytes num_writes data
                if (($fn >= $filenum)); then
                        fn=0
                        ((idirnum = idirnum + 1))
-                       log_must mkdir -p $destdir/$idirnum
+                       mkdir -p $destdir/$idirnum
                else
                        ((fn = fn + 1))
                fi
index 90f5e1821318c6fda21e85aef3e5259aa18abe7a..da27673ec94648a3e31fe289cc8735f847048f8c 100644 (file)
@@ -68,6 +68,7 @@ SUBDIRS = \
        sparse \
        threadsappend \
        tmpfile \
+       trim \
        truncate \
        upgrade \
        user_namespace \
index 625cf8579f821d23f4f230fc5183b0b39ed1ca6a..99f1257837c9837723542d13c11230fc223257bc 100644 (file)
@@ -59,4 +59,5 @@ SUBDIRS = \
        zpool_split \
        zpool_status \
        zpool_sync \
+       zpool_trim \
        zpool_upgrade
index b9ede86ede993eb3cdf6c0b3c214c4d77467e57c..fdcce8b56256ecd0cfc8eb5e43faa40f1487d1f4 100644 (file)
@@ -57,6 +57,7 @@ typeset -a properties=(
     "fragmentation"
     "leaked"
     "multihost"
+    "autotrim"
     "feature@async_destroy"
     "feature@empty_bpobj"
     "feature@lz4_compress"
diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_trim/Makefile.am b/tests/zfs-tests/tests/functional/cli_root/zpool_trim/Makefile.am
new file mode 100644 (file)
index 0000000..c357eef
--- /dev/null
@@ -0,0 +1,22 @@
+pkgdatadir = $(datadir)/@PACKAGE@/zfs-tests/tests/functional/cli_root/zpool_trim
+dist_pkgdata_SCRIPTS = \
+       setup.ksh \
+       cleanup.ksh \
+       zpool_trim.kshlib \
+       zpool_trim_attach_detach_add_remove.ksh \
+       zpool_trim_import_export.ksh \
+       zpool_trim_multiple.ksh \
+       zpool_trim_neg.ksh \
+       zpool_trim_offline_export_import_online.ksh \
+       zpool_trim_online_offline.ksh \
+       zpool_trim_partial.ksh \
+       zpool_trim_rate.ksh \
+       zpool_trim_rate_neg.ksh \
+       zpool_trim_secure.ksh \
+       zpool_trim_split.ksh \
+       zpool_trim_start_and_cancel_neg.ksh \
+       zpool_trim_start_and_cancel_pos.ksh \
+       zpool_trim_suspend_resume.ksh \
+       zpool_trim_unsupported_vdevs.ksh \
+       zpool_trim_verify_checksums.ksh \
+       zpool_trim_verify_trimmed.ksh
diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_trim/cleanup.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_trim/cleanup.ksh
new file mode 100755 (executable)
index 0000000..8ba8166
--- /dev/null
@@ -0,0 +1,34 @@
+#!/bin/ksh -p
+#
+# CDDL HEADER START
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source.  A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright (c) 2019 by Tim Chase. All rights reserved.
+# Copyright (c) 2019 Lawrence Livermore National Security, LLC.
+#
+
+. $STF_SUITE/include/libtest.shlib
+
+verify_runnable "global"
+
+if poolexists $TESTPOOL; then
+       destroy_pool $TESTPOOL
+fi
+
+if poolexists $TESTPOOL1; then
+       destroy_pool $TESTPOOL1
+fi
+
+default_cleanup
diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_trim/setup.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_trim/setup.ksh
new file mode 100755 (executable)
index 0000000..cdcf038
--- /dev/null
@@ -0,0 +1,37 @@
+#!/bin/ksh -p
+#
+# CDDL HEADER START
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source.  A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright (c) 2019 by Tim Chase. All rights reserved.
+# Copyright (c) 2019 Lawrence Livermore National Security, LLC.
+#
+
+. $STF_SUITE/include/libtest.shlib
+
+verify_runnable "global"
+
+DISK1=${DISKS%% *}
+
+typeset -i max_discard=0
+if [[ -b $DEV_RDSKDIR/$DISK1 ]]; then
+       max_discard=$(lsblk -Dbn $DEV_RDSKDIR/$DISK1 | awk '{ print $4; exit }')
+fi
+
+if test $max_discard -eq 0; then
+       log_unsupported "DISKS do not support discard (TRIM/UNMAP)"
+fi
+
+log_pass
diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_trim/zpool_trim.kshlib b/tests/zfs-tests/tests/functional/cli_root/zpool_trim/zpool_trim.kshlib
new file mode 100644 (file)
index 0000000..1c54c66
--- /dev/null
@@ -0,0 +1,43 @@
+#
+# CDDL HEADER START
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source.  A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright (c) 2016 by Delphix. All rights reserved.
+# Copyright (c) 2019 by Lawrence Livermore National Security, LLC.
+#
+
+function trim_prog_line # pool disk
+{
+       typeset pool="$1"
+       typeset disk="$2"
+       zpool status -t "$pool" | grep "$disk" | grep "[[:digit:]]* trimmed"
+}
+
+function trim_progress # pool disk
+{
+       trim_prog_line "$1" "$2" | sed 's/.*(\([0-9]\{1,\}\)% trimmed.*/\1/g'
+}
+
+function cleanup
+{
+       if poolexists $TESTPOOL; then
+               destroy_pool $TESTPOOL
+       fi
+
+       if poolexists $TESTPOOL1; then
+               destroy_pool $TESTPOOL1
+       fi
+}
+log_onexit cleanup
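Illustrative usage of the two helpers above, mirroring how the test scripts below consume them (a sketch, not part of the library): poll until a device reports 100% trimmed, then assert the status line reports completion.

	while [[ "$(trim_progress $TESTPOOL $DISK1)" -lt 100 ]]; do
		sleep 0.5
	done
	log_must eval "trim_prog_line $TESTPOOL $DISK1 | grep complete"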
diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_trim/zpool_trim_attach_detach_add_remove.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_trim/zpool_trim_attach_detach_add_remove.ksh
new file mode 100755 (executable)
index 0000000..e715de9
--- /dev/null
@@ -0,0 +1,64 @@
+#!/bin/ksh -p
+#
+# CDDL HEADER START
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source.  A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright (c) 2019 by Tim Chase. All rights reserved.
+# Copyright (c) 2019 Lawrence Livermore National Security, LLC.
+#
+
+. $STF_SUITE/include/libtest.shlib
+. $STF_SUITE/tests/functional/cli_root/zpool_trim/zpool_trim.kshlib
+
+#
+# DESCRIPTION:
+# Detaching/attaching, adding/removing data devices works with trimming.
+#
+# STRATEGY:
+# 1. Create a single-disk pool.
+# 2. Start trimming.
+# 3. Attach a second disk, ensure trimming continues.
+# 4. Detach the second disk, ensure trimming continues.
+# 5. Add a second disk, ensure trimming continues.
+# 6. Remove the first disk, ensure trimming stops.
+#
+
+DISK1="$(echo $DISKS | cut -d' ' -f1)"
+DISK2="$(echo $DISKS | cut -d' ' -f2)"
+
+log_must zpool create -f $TESTPOOL $DISK1
+
+log_must zpool trim -r 128M $TESTPOOL $DISK1
+progress="$(trim_progress $TESTPOOL $DISK1)"
+[[ -z "$progress" ]] && log_fail "Trim did not start"
+
+log_must zpool attach $TESTPOOL $DISK1 $DISK2
+new_progress="$(trim_progress $TESTPOOL $DISK1)"
+[[ "$progress" -le "$new_progress" ]] || \
+        log_fail "Lost trimming progress on demotion to child vdev"
+progress="$new_progress"
+
+log_must zpool detach $TESTPOOL $DISK2
+new_progress="$(trim_progress $TESTPOOL $DISK1)"
+[[ "$progress" -le "$new_progress" ]] || \
+        log_fail "Lost trimming progress on promotion to top vdev"
+progress="$new_progress"
+
+log_must zpool add $TESTPOOL $DISK2
+log_must zpool remove $TESTPOOL $DISK1
+[[ -z "$(trim_prog_line $TESTPOOL $DISK1)" ]] || \
+        log_fail "Trimming continued after initiating removal"
+
+log_pass "Trimming worked as expected across attach/detach and add/remove"
diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_trim/zpool_trim_import_export.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_trim/zpool_trim_import_export.ksh
new file mode 100755 (executable)
index 0000000..a624d36
--- /dev/null
@@ -0,0 +1,88 @@
+#!/bin/ksh -p
+#
+# CDDL HEADER START
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source.  A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright (c) 2019 by Tim Chase. All rights reserved.
+# Copyright (c) 2019 Lawrence Livermore National Security, LLC.
+#
+
+. $STF_SUITE/include/libtest.shlib
+. $STF_SUITE/tests/functional/cli_root/zpool_trim/zpool_trim.kshlib
+
+#
+# DESCRIPTION:
+# Trimming automatically resumes across import/export.
+#
+# STRATEGY:
+# 1. Create a one-disk pool.
+# 2. Start trimming and verify that trimming is active.
+# 3. Export the pool.
+# 4. Import the pool.
+# 5. Verify that trimming resumes and progress does not regress.
+# 6. Suspend trimming.
+# 7. Repeat steps 3-4.
+# 8. Verify that progress does not regress but trimming is still suspended.
+#
+
+function cleanup
+{
+       if poolexists $TESTPOOL; then
+               destroy_pool $TESTPOOL
+       fi
+
+       if [[ -d "$TESTDIR" ]]; then
+               rm -rf "$TESTDIR"
+       fi
+}
+log_onexit cleanup
+
+LARGEFILE="$TESTDIR/largefile"
+
+log_must mkdir "$TESTDIR"
+log_must truncate -s 10G "$LARGEFILE"
+log_must zpool create -f $TESTPOOL $LARGEFILE
+
+log_must zpool trim -r 256M $TESTPOOL
+sleep 2
+
+progress="$(trim_progress $TESTPOOL $LARGEFILE)"
+[[ -z "$progress" ]] && log_fail "Trimming did not start"
+
+log_must zpool export $TESTPOOL
+log_must zpool import -d $TESTDIR $TESTPOOL
+
+new_progress="$(trim_progress $TESTPOOL $LARGEFILE)"
+[[ -z "$new_progress" ]] && log_fail "Trimming did not restart after import"
+
+[[ "$progress" -le "$new_progress" ]] || \
+    log_fail "Trimming lost progress after import"
+log_mustnot eval "trim_prog_line $TESTPOOL $LARGEFILE | grep suspended"
+
+log_must zpool trim -s $TESTPOOL $LARGEFILE
+action_date="$(trim_prog_line $TESTPOOL $LARGEFILE | \
+    sed 's/.*ed at \(.*\)).*/\1/g')"
+log_must zpool export $TESTPOOL
+log_must zpool import -d $TESTDIR $TESTPOOL
+new_action_date=$(trim_prog_line $TESTPOOL $LARGEFILE | \
+    sed 's/.*ed at \(.*\)).*/\1/g')
+[[ "$action_date" != "$new_action_date" ]] && \
+    log_fail "Trimming action date did not persist across export/import"
+
+[[ "$new_progress" -le "$(trim_progress $TESTPOOL $LARGEFILE)" ]] || \
+       log_fail "Trimming lost progress after import"
+
+log_must eval "trim_prog_line $TESTPOOL $LARGEFILE | grep suspended"
+
+log_pass "Trimming retains state as expected across export/import"
diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_trim/zpool_trim_multiple.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_trim/zpool_trim_multiple.ksh
new file mode 100755 (executable)
index 0000000..e8236ff
--- /dev/null
@@ -0,0 +1,65 @@
+#!/bin/ksh -p
+#
+# CDDL HEADER START
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source.  A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright (c) 2019 by Tim Chase. All rights reserved.
+# Copyright (c) 2019 Lawrence Livermore National Security, LLC.
+#
+
+. $STF_SUITE/include/libtest.shlib
+. $STF_SUITE/tests/functional/cli_root/zpool_trim/zpool_trim.kshlib
+
+#
+# DESCRIPTION:
+# Trimming can be performed multiple times
+#
+# STRATEGY:
+# 1. Create a pool with a single disk.
+# 2. Trim the entire pool.
+# 3. Verify trimming is reset (status, offset, and action date).
+# 4. Repeat steps 2 and 3 with the existing pool.
+#
+
+DISK1=${DISKS%% *}
+
+log_must zpool create -f $TESTPOOL $DISK1
+
+typeset action_date="none"
+for n in {1..3}; do
+       log_must zpool trim -r 2G $TESTPOOL
+       log_mustnot eval "trim_prog_line $TESTPOOL $DISK1 | grep complete"
+
+       [[ "$(trim_progress $TESTPOOL $DISK1)" -lt "100" ]] ||
+           log_fail "Trimming progress wasn't reset"
+
+       new_action_date="$(trim_prog_line $TESTPOOL $DISK1 | \
+           sed 's/.*ed at \(.*\)).*/\1/g')"
+       [[ "$action_date" != "$new_action_date" ]] ||
+               log_fail "Trimming action date wasn't reset"
+       action_date=$new_action_date
+
+       while [[ "$(trim_progress $TESTPOOL $DISK1)" -lt "100" ]]; do
+               progress="$(trim_progress $TESTPOOL $DISK1)"
+               sleep 0.5
+               [[ "$progress" -le "$(trim_progress $TESTPOOL $DISK1)" ]] ||
+                   log_fail "Trimming progress regressed"
+       done
+
+       log_must eval "trim_prog_line $TESTPOOL $DISK1 | grep complete"
+       sleep 1
+done
+
+log_pass "Trimming multiple times performs as expected"
diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_trim/zpool_trim_neg.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_trim/zpool_trim_neg.ksh
new file mode 100755 (executable)
index 0000000..4ab2eb1
--- /dev/null
@@ -0,0 +1,53 @@
+#!/bin/ksh -p
+#
+# CDDL HEADER START
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source.  A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright (c) 2019 by Tim Chase. All rights reserved.
+# Copyright (c) 2019 Lawrence Livermore National Security, LLC.
+#
+
+. $STF_SUITE/include/libtest.shlib
+. $STF_SUITE/tests/functional/cli_root/zpool_trim/zpool_trim.kshlib
+
+#
+# DESCRIPTION:
+#      A badly formed parameter passed to 'zpool trim' should
+#      return an error.
+#
+# STRATEGY:
+#      1. Create an array containing bad 'zpool trim' parameters.
+#      2. For each element, execute the sub-command.
+#      3. Verify it returns an error.
+#
+
+DISK1=${DISKS%% *}
+DISK2="$(echo $DISKS | cut -d' ' -f2)"
+
+verify_runnable "global"
+
+set -A args "1" "-a" "-?" "--%" "-123456" "0.5" "-o" "-b" "-b no" "-z 2"
+
+log_assert "Execute 'zpool trim' using invalid parameters."
+
+log_must zpool create -f $TESTPOOL mirror $DISK1 $DISK2
+
+typeset -i i=0
+while [[ $i -lt ${#args[*]} ]]; do
+       log_mustnot zpool trim ${args[i]} $TESTPOOL
+       ((i = i + 1))
+done
+
+log_pass "Invalid parameters to 'zpool trim' fail as expected."
diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_trim/zpool_trim_offline_export_import_online.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_trim/zpool_trim_offline_export_import_online.ksh
new file mode 100755 (executable)
index 0000000..4f904d1
--- /dev/null
@@ -0,0 +1,62 @@
+#!/bin/ksh -p
+#
+# CDDL HEADER START
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source.  A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright (c) 2019 by Tim Chase. All rights reserved.
+# Copyright (c) 2019 Lawrence Livermore National Security, LLC.
+#
+
+. $STF_SUITE/include/libtest.shlib
+. $STF_SUITE/tests/functional/cli_root/zpool_trim/zpool_trim.kshlib
+
+#
+# DESCRIPTION:
+# Miscellaneous complex sequences of operations function as expected.
+#
+# STRATEGY:
+# 1. Create a pool with a two-way mirror.
+# 2. Start trimming, offline, export, import, online and verify that
+#    trimming state is preserved / trimming behaves as expected
+#    at each step.
+#
+
+DISK1="$(echo $DISKS | cut -d' ' -f1)"
+DISK2="$(echo $DISKS | cut -d' ' -f2)"
+
+log_must zpool create -f $TESTPOOL mirror $DISK1 $DISK2
+
+log_must zpool trim -r 128M $TESTPOOL $DISK1
+log_must zpool offline $TESTPOOL $DISK1
+progress="$(trim_progress $TESTPOOL $DISK1)"
+[[ -z "$progress" ]] && log_fail "Trimming did not start"
+log_mustnot eval "trim_prog_line $TESTPOOL $DISK1 | grep suspended"
+
+log_must zpool export $TESTPOOL
+log_must zpool import $TESTPOOL
+
+new_progress="$(trim_progress $TESTPOOL $DISK1)"
+[[ -z "$new_progress" ]] && log_fail "Trimming did not start after import"
+[[ "$new_progress" -ge "$progress" ]] || \
+    log_fail "Trimming lost progress after import"
+log_mustnot eval "trim_prog_line $TESTPOOL $DISK1 | grep suspended"
+
+log_must zpool online $TESTPOOL $DISK1
+new_progress="$(trim_progress $TESTPOOL $DISK1)"
+[[ "$new_progress" -ge "$progress" ]] || \
+    log_fail "Trimming lost progress after online"
+
+log_pass "Trimming behaves as expected at each step of:" \
+    "trim + offline + export + import + online"
diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_trim/zpool_trim_online_offline.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_trim/zpool_trim_online_offline.ksh
new file mode 100755 (executable)
index 0000000..681cd12
--- /dev/null
@@ -0,0 +1,70 @@
+#!/bin/ksh -p
+#
+# CDDL HEADER START
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source.  A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright (c) 2019 by Tim Chase. All rights reserved.
+# Copyright (c) 2019 Lawrence Livermore National Security, LLC.
+#
+
+. $STF_SUITE/include/libtest.shlib
+. $STF_SUITE/tests/functional/cli_root/zpool_trim/zpool_trim.kshlib
+
+#
+# DESCRIPTION:
+# Trimming automatically resumes across offline/online.
+#
+# STRATEGY:
+# 1. Create a pool with a two-way mirror.
+# 2. Start trimming one of the disks and verify that trimming is active.
+# 3. Offline the disk.
+# 4. Online the disk.
+# 5. Verify that trimming resumes and progress does not regress.
+# 6. Suspend trimming.
+# 7. Repeat steps 3-4 and verify that trimming does not resume.
+#
+
+DISK1=${DISKS%% *}
+DISK2="$(echo $DISKS | cut -d' ' -f2)"
+
+log_must zpool create -f $TESTPOOL mirror $DISK1 $DISK2
+log_must zpool trim -r 128M $TESTPOOL $DISK1
+
+log_must zpool offline $TESTPOOL $DISK1
+
+progress="$(trim_progress $TESTPOOL $DISK1)"
+[[ -z "$progress" ]] && log_fail "Trimming did not start"
+
+log_must zpool online $TESTPOOL $DISK1
+
+new_progress="$(trim_progress $TESTPOOL $DISK1)"
+[[ -z "$new_progress" ]] && \
+    log_fail "Trimming did not restart after onlining"
+[[ "$progress" -le "$new_progress" ]] || \
+    log_fail "Trimming lost progress after onlining"
+log_mustnot eval "trim_prog_line $TESTPOOL $DISK1 | grep suspended"
+
+log_must zpool trim -s $TESTPOOL $DISK1
+action_date="$(trim_prog_line $TESTPOOL $DISK1 | \
+    sed 's/.*ed at \(.*\)).*/\1/g')"
+log_must zpool offline $TESTPOOL $DISK1
+log_must zpool online $TESTPOOL $DISK1
+new_action_date=$(trim_prog_line $TESTPOOL $DISK1 | \
+    sed 's/.*ed at \(.*\)).*/\1/g')
+[[ "$action_date" != "$new_action_date" ]] && \
+    log_fail "Trimming action date did not persist across offline/online"
+log_must eval "trim_prog_line $TESTPOOL $DISK1 | grep suspended"
+
+log_pass "Trimming performs as expected across offline/online"
diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_trim/zpool_trim_partial.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_trim/zpool_trim_partial.ksh
new file mode 100755 (executable)
index 0000000..58e0ef7
--- /dev/null
@@ -0,0 +1,114 @@
+#!/bin/ksh -p
+#
+# CDDL HEADER START
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source.  A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright (c) 2019 by Tim Chase. All rights reserved.
+# Copyright (c) 2019 Lawrence Livermore National Security, LLC.
+#
+
+. $STF_SUITE/include/libtest.shlib
+. $STF_SUITE/tests/functional/cli_root/zpool_trim/zpool_trim.kshlib
+
+#
+# DESCRIPTION:
+#      Verify 'zpool trim' partial trim.
+#
+# STRATEGY:
+#      1. Create a pool on a single disk and mostly fill it.
+#      2. Expand the pool to create new unallocated metaslabs.
+#      3. Run 'zpool trim' to only TRIM allocated space maps.
+#      4. Verify the disk is at least 90% of its original size.
+#      5. Run 'zpool trim' to perform a full TRIM.
+#      6. Verify the disk is less than 10% of its original size.
+
+function cleanup
+{
+       if poolexists $TESTPOOL; then
+               destroy_pool $TESTPOOL
+       fi
+
+       if [[ -d "$TESTDIR" ]]; then
+               rm -rf "$TESTDIR"
+       fi
+
+       log_must set_tunable64 zfs_trim_metaslab_skip 0
+       log_must set_tunable64 zfs_trim_extent_bytes_min $trim_extent_bytes_min
+       log_must set_tunable64 zfs_vdev_min_ms_count $vdev_min_ms_count
+}
+log_onexit cleanup
+
+LARGESIZE=$((MINVDEVSIZE * 4))
+LARGEFILE="$TESTDIR/largefile"
+
+# The minimum number of metaslabs is increased in order to simulate the
+# behavior of partial trimming on a more typically sized 1TB disk.
+typeset vdev_min_ms_count=$(get_tunable zfs_vdev_min_ms_count)
+log_must set_tunable64 zfs_vdev_min_ms_count 64
+
+# Minimum trim size is decreased to verify all trim sizes.
+typeset trim_extent_bytes_min=$(get_tunable zfs_trim_extent_bytes_min)
+log_must set_tunable64 zfs_trim_extent_bytes_min 4096
+
+log_must mkdir "$TESTDIR"
+log_must truncate -s $LARGESIZE "$LARGEFILE"
+log_must zpool create $TESTPOOL "$LARGEFILE"
+log_must mkfile $(( floor(LARGESIZE * 0.80) )) /$TESTPOOL/file
+log_must zpool sync
+
+new_size=$(du -B1 "$LARGEFILE" | cut -f1)
+log_must test $new_size -le $LARGESIZE
+log_must test $new_size -gt $(( floor(LARGESIZE * 0.70) ))
+
+# Expand the pool to create new unallocated metaslabs.
+log_must zpool export $TESTPOOL
+log_must dd if=/dev/urandom of=$LARGEFILE conv=notrunc,nocreat \
+    seek=$((LARGESIZE / (1024 * 1024))) bs=$((1024 * 1024)) \
+    count=$((3 * LARGESIZE / (1024 * 1024)))
+log_must zpool import -d $TESTDIR $TESTPOOL
+log_must zpool online -e $TESTPOOL "$LARGEFILE"
+
+new_size=$(du -B1 "$LARGEFILE" | cut -f1)
+log_must test $new_size -gt $((4 * floor(LARGESIZE * 0.70) ))
+
+# Perform a partial trim; we expect it to skip most of the new metaslabs
+# which have never been used and therefore do not need to be trimmed.
+log_must set_tunable64 zfs_trim_metaslab_skip 1
+log_must zpool trim $TESTPOOL
+log_must set_tunable64 zfs_trim_metaslab_skip 0
+
+log_must zpool sync
+while [[ "$(trim_progress $TESTPOOL $LARGEFILE)" -lt "100" ]]; do
+       sleep 0.5
+done
+
+new_size=$(du -B1 "$LARGEFILE" | cut -f1)
+log_must test $new_size -gt $LARGESIZE
+
+# Perform a full trim; all metaslabs will be trimmed and the pool vdev
+# size will be reduced, but not down to its original size due to the
+# space usage of the new metaslabs.
+log_must zpool trim $TESTPOOL
+
+log_must zpool sync
+while [[ "$(trim_progress $TESTPOOL $LARGEFILE)" -lt "100" ]]; do
+       sleep 0.5
+done
+
+new_size=$(du -B1 "$LARGEFILE" | cut -f1)
+log_must test $new_size -le $(( 2 * LARGESIZE))
+log_must test $new_size -gt $(( floor(LARGESIZE * 0.70) ))
+
+log_pass "Manual 'zpool trim' successfully partially trimmed pool"
diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_trim/zpool_trim_rate.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_trim/zpool_trim_rate.ksh
new file mode 100755 (executable)
index 0000000..6b83a1e
--- /dev/null
@@ -0,0 +1,90 @@
+#!/bin/ksh -p
+#
+# CDDL HEADER START
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source.  A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright (c) 2019 by Tim Chase. All rights reserved.
+# Copyright (c) 2019 Lawrence Livermore National Security, LLC.
+#
+
+. $STF_SUITE/include/libtest.shlib
+. $STF_SUITE/tests/functional/cli_root/zpool_trim/zpool_trim.kshlib
+
+#
+# DESCRIPTION:
+#      Verify 'zpool trim -r <rate>' rate limiting.
+#
+# STRATEGY:
+#      1. Create a pool on a single disk.
+#      2. Manually TRIM the pool with rate limiting.
+#      3. Verify the TRIM can be suspended.
+#      4. Restart the TRIM and verify the rate is preserved.
+#
+# NOTE: The tolerances and delays used in the test below are intentionally
+# set be to fairly large since we are capping the maximum trim rate.  The
+# actual trim rate can be lower.  The critical thing is that the trim rate
+# is limited, the rate is preserved when resuming, and it can be changed.
+#
+
+function cleanup
+{
+       if poolexists $TESTPOOL; then
+               destroy_pool $TESTPOOL
+       fi
+
+       if [[ -d "$TESTDIR" ]]; then
+               rm -rf "$TESTDIR"
+       fi
+}
+log_onexit cleanup
+
+LARGEFILE="$TESTDIR/largefile"
+
+log_must mkdir "$TESTDIR"
+log_must truncate -s 10G "$LARGEFILE"
+log_must zpool create -f $TESTPOOL "$LARGEFILE"
+
+# Start trimming at 200M/s for 5 seconds (approximately 10% of the pool)
+log_must zpool trim -r 200M $TESTPOOL
+log_must sleep 4
+progress=$(trim_progress $TESTPOOL $LARGEFILE)
+log_must zpool trim -s $TESTPOOL
+log_must eval "trim_prog_line $TESTPOOL $LARGEFILE | grep suspended"
+log_must within_tolerance 10 $progress 5
+
+# Resuming trimming at 200M/s for 5 seconds (approximately 20% of the pool)
+log_must zpool trim $TESTPOOL
+log_must sleep 4
+progress=$(trim_progress $TESTPOOL $LARGEFILE)
+log_must zpool trim -s $TESTPOOL
+log_must eval "trim_prog_line $TESTPOOL $LARGEFILE | grep suspended"
+log_must within_tolerance 20 $progress 10
+
+# Increase trimming to 600M/s for 5 seconds (approximately 50% of the pool)
+log_must zpool trim -r 600M $TESTPOOL
+log_must sleep 4
+progress=$(trim_progress $TESTPOOL $LARGEFILE)
+log_must zpool trim -s $TESTPOOL
+log_must eval "trim_prog_line $TESTPOOL $LARGEFILE | grep suspended"
+log_must within_tolerance 50 $progress 15
+
+# Set maximum trim rate for 5 seconds (100% of the pool)
+log_must zpool trim -r 1T $TESTPOOL
+log_must sleep 4
+progress=$(trim_progress $TESTPOOL $LARGEFILE)
+log_must eval "trim_prog_line $TESTPOOL $LARGEFILE | grep complete"
+log_must within_tolerance 100 $progress 0
+
+log_pass "Manual TRIM rate throttles as expected"
diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_trim/zpool_trim_rate_neg.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_trim/zpool_trim_rate_neg.ksh
new file mode 100755 (executable)
index 0000000..11bd430
--- /dev/null
@@ -0,0 +1,53 @@
+#!/bin/ksh -p
+#
+# CDDL HEADER START
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source.  A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright (c) 2019 by Tim Chase. All rights reserved.
+# Copyright (c) 2019 Lawrence Livermore National Security, LLC.
+#
+
+. $STF_SUITE/include/libtest.shlib
+. $STF_SUITE/tests/functional/cli_root/zpool_trim/zpool_trim.kshlib
+
+#
+# DESCRIPTION:
+#      A badly formed parameter passed to 'zpool trim -r' should
+#      return an error.
+#
+# STRATEGY:
+#      1. Create an array containing bad 'zpool trim -r' parameters.
+#      2. For each element, execute the sub-command.
+#      3. Verify it returns an error.
+#
+
+DISK1=${DISKS%% *}
+DISK2="$(echo $DISKS | cut -d' ' -f2)"
+
+verify_runnable "global"
+
+set -A args "a" "--%" "10X" "yes" "-?" "z 99"
+
+log_assert "Execute 'zpool trim -r' using invalid parameters."
+
+log_must zpool create -f $TESTPOOL mirror $DISK1 $DISK2
+
+typeset -i i=0
+while [[ $i -lt ${#args[*]} ]]; do
+       log_mustnot zpool trim -r ${args[i]} $TESTPOOL
+        ((i = i + 1))
+done
+
+log_pass "Invalid parameters to 'zpool trim -r' fail as expected."
diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_trim/zpool_trim_secure.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_trim/zpool_trim_secure.ksh
new file mode 100755 (executable)
index 0000000..e97d09f
--- /dev/null
@@ -0,0 +1,59 @@
+#!/bin/ksh -p
+#
+# CDDL HEADER START
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source.  A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright (c) 2019 by Tim Chase. All rights reserved.
+# Copyright (c) 2019 Lawrence Livermore National Security, LLC.
+#
+
+. $STF_SUITE/include/libtest.shlib
+. $STF_SUITE/tests/functional/cli_root/zpool_trim/zpool_trim.kshlib
+
+#
+# DESCRIPTION:
+#      Verify 'zpool trim -d' secure trim.
+#
+# STRATEGY:
+#      1. Create a pool on a single file vdev.
+#      2. Run 'zpool trim -d' to securely TRIM allocated space maps.
+#      3. Verify it fails when using a file vdev.
+#
+# NOTE: Currently secure discard cannot be verified using file vdevs,
+# loopback, or scsi_debug devices, none of which support the feature.
+# It can only be tested using real SSDs which provide support.
+#
+
+function cleanup
+{
+       if poolexists $TESTPOOL; then
+               destroy_pool $TESTPOOL
+       fi
+
+       if [[ -d "$TESTDIR" ]]; then
+               rm -rf "$TESTDIR"
+       fi
+}
+log_onexit cleanup
+
+LARGESIZE=$((MINVDEVSIZE * 4))
+LARGEFILE="$TESTDIR/largefile"
+
+log_must mkdir "$TESTDIR"
+log_must truncate -s $LARGESIZE "$LARGEFILE"
+log_must zpool create $TESTPOOL "$LARGEFILE"
+log_mustnot zpool trim -d $TESTPOOL
+
+log_pass "Manual 'zpool trim -d' failed as expected for file vdevs"
diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_trim/zpool_trim_split.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_trim/zpool_trim_split.ksh
new file mode 100755 (executable)
index 0000000..450dc6f
--- /dev/null
@@ -0,0 +1,60 @@
+#!/bin/ksh -p
+#
+# CDDL HEADER START
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source.  A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright (c) 2019 by Tim Chase. All rights reserved.
+# Copyright (c) 2019 Lawrence Livermore National Security, LLC.
+#
+
+. $STF_SUITE/include/libtest.shlib
+. $STF_SUITE/tests/functional/cli_root/zpool_trim/zpool_trim.kshlib
+
+#
+# DESCRIPTION:
+# Trimming state is preserved across zpool split.
+#
+# STRATEGY:
+# 1. Create a pool with a two-way mirror.
+# 2. Start trimming both devices.
+# 3. Split the pool. Ensure trimming continues on the original.
+# 4. Import the new pool. Ensure trimming resumes on it.
+#
+
+DISK1="$(echo $DISKS | cut -d' ' -f1)"
+DISK2="$(echo $DISKS | cut -d' ' -f2)"
+POOL2="${TESTPOOL}_split"
+
+log_must zpool create -f $TESTPOOL mirror $DISK1 $DISK2
+
+log_must zpool trim $TESTPOOL $DISK1 $DISK2
+orig_prog1="$(trim_progress $TESTPOOL $DISK1)"
+orig_prog2="$(trim_progress $TESTPOOL $DISK2)"
+[[ -z "$orig_prog1" ]] && log_fail "Trimming did not start"
+
+log_must zpool split $TESTPOOL $TESTPOOL1 $DISK2
+
+# Ensure trimming continued as expected on the original pool.
+[[ "$(trim_progress $TESTPOOL $DISK1)" -ge "$orig_prog1" ]] || \
+        log_fail "Trimming lost progress on original pool"
+log_mustnot eval "trim_prog_line $TESTPOOL $DISK1 | grep suspended"
+
+log_must zpool import $TESTPOOL1
+
+[[ "$(trim_progress $TESTPOOL1 $DISK2)" -ge "$orig_prog2" ]] || \
+        log_fail "Trimming lost progress on split pool"
+log_mustnot eval "trim_prog_line $TESTPOOL1 $DISK1 | grep suspended"
+
+log_pass "Trimming behaves as expected on zpool split"
diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_trim/zpool_trim_start_and_cancel_neg.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_trim/zpool_trim_start_and_cancel_neg.ksh
new file mode 100755 (executable)
index 0000000..faf134f
--- /dev/null
@@ -0,0 +1,56 @@
+#!/bin/ksh -p
+#
+# CDDL HEADER START
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source.  A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright (c) 2019 by Tim Chase. All rights reserved.
+# Copyright (c) 2019 Lawrence Livermore National Security, LLC.
+#
+
+. $STF_SUITE/include/libtest.shlib
+. $STF_SUITE/tests/functional/cli_root/zpool_trim/zpool_trim.kshlib
+
+#
+# DESCRIPTION:
+# Cancelling and suspending trim doesn't work if not all specified vdevs
+# are being trimmed.
+#
+# STRATEGY:
+# 1. Create a three-disk pool.
+# 2. Start trimming and verify that trimming is active.
+# 3. Try to cancel and suspend trimming on the non-trimming disks.
+# 4. Try to re-trim the currently trimming disk.
+#
+
+DISK1=${DISKS%% *}
+DISK2="$(echo $DISKS | cut -d' ' -f2)"
+DISK3="$(echo $DISKS | cut -d' ' -f3)"
+
+log_must zpool list -v
+log_must zpool create -f $TESTPOOL $DISK1 $DISK2 $DISK3
+log_must zpool trim -r 128M $TESTPOOL $DISK1
+
+[[ -z "$(trim_progress $TESTPOOL $DISK1)" ]] && \
+    log_fail "Trim did not start"
+
+log_mustnot zpool trim -c $TESTPOOL $DISK2
+log_mustnot zpool trim -c $TESTPOOL $DISK2 $DISK3
+
+log_mustnot zpool trim -s $TESTPOOL $DISK2
+log_mustnot zpool trim -s $TESTPOOL $DISK2 $DISK3
+
+log_mustnot zpool trim $TESTPOOL $DISK1
+
+log_pass "Nonsensical trim operations fail"
diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_trim/zpool_trim_start_and_cancel_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_trim/zpool_trim_start_and_cancel_pos.ksh
new file mode 100755 (executable)
index 0000000..eaa4d90
--- /dev/null
@@ -0,0 +1,48 @@
+#!/bin/ksh -p
+#
+# CDDL HEADER START
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source.  A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright (c) 2019 by Tim Chase. All rights reserved.
+# Copyright (c) 2019 Lawrence Livermore National Security, LLC.
+#
+
+. $STF_SUITE/include/libtest.shlib
+. $STF_SUITE/tests/functional/cli_root/zpool_trim/zpool_trim.kshlib
+
+#
+# DESCRIPTION:
+# Starting and stopping a trim works.
+#
+# STRATEGY:
+# 1. Create a one-disk pool.
+# 2. Start trimming and verify that trimming is active.
+# 3. Cancel trimming and verify that trimming is not active.
+#
+
+DISK1=${DISKS%% *}
+
+log_must zpool create -f $TESTPOOL $DISK1
+log_must zpool trim -r 128M $TESTPOOL
+
+[[ -z "$(trim_progress $TESTPOOL $DISK1)" ]] && \
+    log_fail "Trim did not start"
+
+log_must zpool trim -c $TESTPOOL
+
+[[ -z "$(trim_progress $TESTPOOL $DISK1)" ]] || \
+    log_fail "Trim did not stop"
+
+log_pass "Trim start + cancel works"
diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_trim/zpool_trim_suspend_resume.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_trim/zpool_trim_suspend_resume.ksh
new file mode 100755 (executable)
index 0000000..553d911
--- /dev/null
@@ -0,0 +1,74 @@
+#!/bin/ksh -p
+#
+# CDDL HEADER START
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source.  A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright (c) 2019 by Tim Chase. All rights reserved.
+# Copyright (c) 2019 Lawrence Livermore National Security, LLC.
+#
+
+. $STF_SUITE/include/libtest.shlib
+. $STF_SUITE/tests/functional/cli_root/zpool_trim/zpool_trim.kshlib
+
+#
+# DESCRIPTION:
+# Suspending and resuming trimming works.
+#
+# STRATEGY:
+# 1. Create a one-disk pool.
+# 2. Start trimming and verify that trimming is active.
+# 3. Wait 3 seconds, then suspend trimming and verify that the progress
+#    reporting says so.
+# 4. Wait 3 seconds and ensure trimming progress doesn't advance.
+# 5. Restart trimming and verify that the progress doesn't regress.
+#
+
+function cleanup
+{
+       if poolexists $TESTPOOL; then
+               destroy_pool $TESTPOOL
+       fi
+
+       if [[ -d "$TESTDIR" ]]; then
+               rm -rf "$TESTDIR"
+       fi
+}
+log_onexit cleanup
+
+LARGEFILE="$TESTDIR/largefile"
+
+log_must mkdir "$TESTDIR"
+log_must truncate -s 10G "$LARGEFILE"
+log_must zpool create -f $TESTPOOL $LARGEFILE
+
+log_must zpool trim -r 256M $TESTPOOL
+sleep 2
+
+[[ -z "$(trim_progress $TESTPOOL $LARGEFILE)" ]] && \
+    log_fail "Trimming did not start"
+
+sleep 3
+log_must zpool trim -s $TESTPOOL
+log_must eval "trim_prog_line $TESTPOOL $LARGEFILE | grep suspended"
+progress="$(trim_progress $TESTPOOL $LARGEFILE)"
+
+sleep 3
+[[ "$progress" -eq "$(trim_progress $TESTPOOL $LARGEFILE)" ]] || \
+       log_fail "Trimming progress advanced while suspended"
+
+log_must zpool trim $TESTPOOL $LARGEFILE
+[[ "$progress" -le "$(trim_progress $TESTPOOL $LARGEFILE)" ]] ||
+       log_fail "Trimming progress regressed after resuming"
+
+log_pass "Suspend + resume trimming works as expected"
diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_trim/zpool_trim_unsupported_vdevs.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_trim/zpool_trim_unsupported_vdevs.ksh
new file mode 100755 (executable)
index 0000000..988745e
--- /dev/null
@@ -0,0 +1,70 @@
+#!/bin/ksh -p
+#
+# CDDL HEADER START
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source.  A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright (c) 2019 by Tim Chase. All rights reserved.
+# Copyright (c) 2019 Lawrence Livermore National Security, LLC.
+#
+
+. $STF_SUITE/include/libtest.shlib
+. $STF_SUITE/tests/functional/cli_root/zpool_trim/zpool_trim.kshlib
+
+#
+# DESCRIPTION:
+# Attempting to trim unsupported vdevs should fail.
+#
+# STRATEGY:
+# 1. Create a pool with the following configuration:
+#    root
+#      mirror
+#        vdev0
+#        vdev1 (offline)
+#      cache
+#        vdev2
+#      spare
+#        vdev3
+# 2. Try to trim vdev1, vdev2, and vdev3. Ensure that all 3 fail.
+#
+function cleanup
+{
+        if datasetexists $TESTPOOL; then
+                destroy_pool $TESTPOOL
+        fi
+        if [[ -d $TESTDIR ]]; then
+                log_must rm -rf $TESTDIR
+        fi
+}
+log_onexit cleanup
+
+log_must mkdir $TESTDIR
+set -A FDISKS
+for n in {0..2}; do
+        log_must mkfile $MINVDEVSIZE $TESTDIR/vdev$n
+        FDISKS+=("$TESTDIR/vdev$n")
+done
+FDISKS+=("${DISKS%% *}")
+
+log_must zpool create $TESTPOOL mirror ${FDISKS[0]} ${FDISKS[1]} \
+        spare ${FDISKS[2]} cache ${FDISKS[3]}
+
+log_must zpool offline $TESTPOOL ${FDISKS[1]}
+
+log_mustnot zpool trim $TESTPOOL mirror-0
+for n in {1..3}; do
+        log_mustnot zpool trim $TESTPOOL ${FDISKS[$n]}
+done
+
+log_pass "Attempting to trim failed on unsupported devices"
diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_trim/zpool_trim_verify_checksums.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_trim/zpool_trim_verify_checksums.ksh
new file mode 100755 (executable)
index 0000000..093dc3f
--- /dev/null
@@ -0,0 +1,69 @@
+#!/bin/ksh -p
+#
+# CDDL HEADER START
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source.  A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright (c) 2019 by Tim Chase. All rights reserved.
+# Copyright (c) 2019 Lawrence Livermore National Security, LLC.
+#
+
+. $STF_SUITE/include/libtest.shlib
+. $STF_SUITE/tests/functional/cli_root/zpool_trim/zpool_trim.kshlib
+
+#
+# DESCRIPTION:
+# Trimming does not cause file corruption.
+#
+# STRATEGY:
+# 1. Create a one-disk pool.
+# 2. Write data to the pool.
+# 3. Start trimming and verify that trimming is active.
+# 4. Write more data to the pool.
+# 5. Export the pool and use zdb to validate checksums.
+#
+
+function cleanup
+{
+       if poolexists $TESTPOOL; then
+               destroy_pool $TESTPOOL
+       fi
+
+       if [[ -d "$TESTDIR" ]]; then
+               rm -rf "$TESTDIR"
+       fi
+}
+log_onexit cleanup
+
+LARGESIZE=$((MINVDEVSIZE * 4))
+LARGEFILE="$TESTDIR/largefile"
+
+log_must mkdir "$TESTDIR"
+log_must truncate -s $LARGESIZE "$LARGEFILE"
+log_must zpool create $TESTPOOL "$LARGEFILE"
+
+log_must dd if=/dev/urandom of=/$TESTPOOL/file1 bs=1048576 count=64
+log_must zpool sync
+log_must zpool trim $TESTPOOL
+
+[[ -z "$(trim_progress $TESTPOOL $DISK1)" ]] && \
+    log_fail "Trimming did not start"
+
+log_must dd if=/dev/urandom of=/$TESTPOOL/file2 bs=1048576 count=64
+log_must zpool sync
+
+log_must zpool export $TESTPOOL
+log_must zdb -e -p "$TESTDIR" -cc $TESTPOOL
+
+log_pass "Trimming does not corrupt existing or new data"
diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_trim/zpool_trim_verify_trimmed.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_trim/zpool_trim_verify_trimmed.ksh
new file mode 100755 (executable)
index 0000000..a216d13
--- /dev/null
@@ -0,0 +1,81 @@
+#!/bin/ksh -p
+#
+# CDDL HEADER START
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source.  A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright (c) 2019 by Tim Chase. All rights reserved.
+# Copyright (c) 2019 Lawrence Livermore National Security, LLC.
+#
+
+. $STF_SUITE/include/libtest.shlib
+. $STF_SUITE/tests/functional/cli_root/zpool_initialize/zpool_initialize.kshlib
+. $STF_SUITE/tests/functional/cli_root/zpool_trim/zpool_trim.kshlib
+
+#
+# DESCRIPTION:
+# After trimming, the disk is actually trimmed.
+#
+# STRATEGY:
+# 1. Create a one-disk pool using a sparse file.
+# 2. Initialize the pool and verify the file vdev is no longer sparse.
+# 3. Trim the pool and verify the file vdev is again sparse.
+#
+
+function cleanup
+{
+       if poolexists $TESTPOOL; then
+               destroy_pool $TESTPOOL
+       fi
+
+	if [[ -d "$TESTDIR" ]]; then
+		rm -rf "$TESTDIR"
+	fi
+
+       log_must set_tunable64 zfs_trim_extent_bytes_min $trim_extent_bytes_min
+}
+log_onexit cleanup
+
+LARGESIZE=$((MINVDEVSIZE * 4))
+LARGEFILE="$TESTDIR/largefile"
+
+# Reduce trim size to allow for tighter tolerance below when checking.
+typeset trim_extent_bytes_min=$(get_tunable zfs_trim_extent_bytes_min)
+log_must set_tunable64 zfs_trim_extent_bytes_min 4096
+
+log_must mkdir "$TESTDIR"
+log_must truncate -s $LARGESIZE "$LARGEFILE"
+log_must zpool create $TESTPOOL "$LARGEFILE"
+
+original_size=$(du -B1 "$LARGEFILE" | cut -f1)
+
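+# Initializing writes to all unallocated space, so the sparse backing file is
+# expected to grow to roughly the full vdev size.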
+log_must zpool initialize $TESTPOOL
+
+while [[ "$(initialize_progress $TESTPOOL $LARGEFILE)" -lt "100" ]]; do
+        sleep 0.5
+done
+
+new_size=$(du -B1 "$LARGEFILE" | cut -f1)
+log_must within_tolerance $new_size $LARGESIZE $((128 * 1024 * 1024))
+
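+# Trimming discards the freed space again, so the backing file should shrink
+# back toward its original sparse size.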
+log_must zpool trim $TESTPOOL
+
+while [[ "$(trim_progress $TESTPOOL $LARGEFILE)" -lt "100" ]]; do
+        sleep 0.5
+done
+
+new_size=$(du -B1 "$LARGEFILE" | cut -f1)
+log_must within_tolerance $new_size $original_size $((128 * 1024 * 1024))
+
+log_pass "Trimmed appropriate amount of disk space"
diff --git a/tests/zfs-tests/tests/functional/trim/Makefile.am b/tests/zfs-tests/tests/functional/trim/Makefile.am
new file mode 100644 (file)
index 0000000..4f260a8
--- /dev/null
@@ -0,0 +1,11 @@
+pkgdatadir = $(datadir)/@PACKAGE@/zfs-tests/tests/functional/trim
+dist_pkgdata_SCRIPTS = \
+       setup.ksh \
+       cleanup.ksh \
+       trim.kshlib \
+       trim.cfg \
+       autotrim_integrity.ksh \
+       autotrim_config.ksh \
+       autotrim_trim_integrity.ksh \
+       trim_integrity.ksh \
+       trim_config.ksh
diff --git a/tests/zfs-tests/tests/functional/trim/autotrim_config.ksh b/tests/zfs-tests/tests/functional/trim/autotrim_config.ksh
new file mode 100755 (executable)
index 0000000..6ce396a
--- /dev/null
@@ -0,0 +1,103 @@
+#!/bin/ksh -p
+#
+# CDDL HEADER START
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source.  A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright (c) 2019 by Tim Chase. All rights reserved.
+# Copyright (c) 2019 Lawrence Livermore National Security, LLC.
+#
+
+. $STF_SUITE/include/libtest.shlib
+. $STF_SUITE/tests/functional/trim/trim.kshlib
+. $STF_SUITE/tests/functional/trim/trim.cfg
+
+#
+# DESCRIPTION:
+#	Check various pool geometries (stripe, mirror, raidz)
+#
+# STRATEGY:
+#      1. Create a pool on file vdevs to trim.
+#      2. Set 'autotrim=on' on pool.
+#      3. Fill the pool to a known percentage of capacity.
+#      4. Verify the vdevs contain 75% or more allocated blocks.
+#      5. Remove all files making it possible to trim the entire pool.
+#      6. Wait for auto trim to issue trim IOs for the free blocks.
+#      7. Verify the disks contain 30% or less allocated blocks.
+#	8. Repeat the test for striped, mirrored, and RAIDZ pools.
+
+verify_runnable "global"
+
+log_assert "Set 'autotrim=on' verify pool disks were trimmed"
+
+function cleanup
+{
+       if poolexists $TESTPOOL; then
+               destroy_pool $TESTPOOL
+       fi
+
+       log_must rm -f $TRIM_VDEVS
+
+       log_must set_tunable64 zfs_trim_extent_bytes_min $trim_extent_bytes_min
+       log_must set_tunable64 zfs_trim_txg_batch $trim_txg_batch
+       log_must set_tunable64 zfs_vdev_min_ms_count $vdev_min_ms_count
+}
+log_onexit cleanup
+
+# Minimum trim size is decreased to verify all trim sizes.
+typeset trim_extent_bytes_min=$(get_tunable zfs_trim_extent_bytes_min)
+log_must set_tunable64 zfs_trim_extent_bytes_min 4096
+
+# Reduced zfs_trim_txg_batch to make trimming more frequent.
+typeset trim_txg_batch=$(get_tunable zfs_trim_txg_batch)
+log_must set_tunable64 zfs_trim_txg_batch 8
+
+# Increased metaslabs to better simulate larger, more realistic devices.
+typeset vdev_min_ms_count=$(get_tunable zfs_vdev_min_ms_count)
+log_must set_tunable64 zfs_vdev_min_ms_count 32
+
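+# Vdevs are expected to exceed ~75% of their capacity when the pool is full
+# and to drop below ~30% once the freed space has been trimmed.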
+typeset VDEV_MAX_MB=$(( floor(4 * MINVDEVSIZE * 0.75 / 1024 / 1024) ))
+typeset VDEV_MIN_MB=$(( floor(4 * MINVDEVSIZE * 0.30 / 1024 / 1024) ))
+
+for type in "" "mirror" "raidz2"; do
+
+       if [[ "$type" = "" ]]; then
+               VDEVS="$TRIM_VDEV1"
+       elif [[ "$type" = "mirror" ]]; then
+               VDEVS="$TRIM_VDEV1 $TRIM_VDEV2"
+       else
+               VDEVS="$TRIM_VDEV1 $TRIM_VDEV2 $TRIM_VDEV3"
+       fi
+
+       log_must truncate -s $((4 * MINVDEVSIZE)) $VDEVS
+	log_must zpool create -f $TESTPOOL $type $VDEVS
+       log_must zpool set autotrim=on $TESTPOOL
+
+       typeset availspace=$(get_prop available $TESTPOOL)
+       typeset fill_mb=$(( floor(availspace * 0.90 / 1024 / 1024) ))
+
+       # Fill the pool, verify the vdevs are no longer sparse.
+       file_write -o create -f /$TESTPOOL/file -b 1048576 -c $fill_mb -d R
+       verify_vdevs "-gt" "$VDEV_MAX_MB" $VDEVS
+
+       # Remove the file, wait for trim, verify the vdevs are now sparse.
+       log_must rm /$TESTPOOL/file
+       wait_trim_io $TESTPOOL "ind" 64
+       verify_vdevs "-le" "$VDEV_MIN_MB" $VDEVS
+
+       log_must zpool destroy $TESTPOOL
+       log_must rm -f $VDEVS
+done
+
+log_pass "Auto trim successfully shrunk vdevs"
diff --git a/tests/zfs-tests/tests/functional/trim/autotrim_integrity.ksh b/tests/zfs-tests/tests/functional/trim/autotrim_integrity.ksh
new file mode 100755 (executable)
index 0000000..c7b3da7
--- /dev/null
@@ -0,0 +1,87 @@
+#!/bin/ksh -p
+#
+# CDDL HEADER START
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source.  A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright (c) 2019 by Tim Chase. All rights reserved.
+# Copyright (c) 2019 Lawrence Livermore National Security, LLC.
+#
+
+. $STF_SUITE/include/libtest.shlib
+. $STF_SUITE/tests/functional/trim/trim.kshlib
+. $STF_SUITE/tests/functional/trim/trim.cfg
+
+#
+# DESCRIPTION:
+#      Verify automatic trim pool data integrity.
+#
+# STRATEGY:
+#      1. Create a pool on sparse file vdevs to trim.
+#      2. Set autotrim=on to enable asynchronous pool trimming.
+#      3. Generate some interesting pool data which can be trimmed.
+#      4. Verify trim IOs of the expected type were issued for the pool.
+#      5. Verify data integrity of the pool after trim.
+#	6. Repeat the test for striped, mirrored, and RAIDZ pools.
+
+verify_runnable "global"
+
+log_assert "Set 'autotrim=on' pool property and verify pool data integrity"
+
+function cleanup
+{
+       if poolexists $TESTPOOL; then
+               destroy_pool $TESTPOOL
+       fi
+
+       log_must rm -f $TRIM_VDEVS
+
+       log_must set_tunable64 zfs_trim_extent_bytes_min $trim_extent_bytes_min
+       log_must set_tunable64 zfs_trim_txg_batch $trim_txg_batch
+}
+log_onexit cleanup
+
+# Minimum trim size is decreased to verify all trim sizes.
+typeset trim_extent_bytes_min=$(get_tunable zfs_trim_extent_bytes_min)
+log_must set_tunable64 zfs_trim_extent_bytes_min 4096
+
+# Reduced zfs_trim_txg_batch to make trimming more frequent.
+typeset trim_txg_batch=$(get_tunable zfs_trim_txg_batch)
+log_must set_tunable64 zfs_trim_txg_batch 8
+
+for type in "" "mirror" "raidz" "raidz2" "raidz3"; do
+       log_must truncate -s 1G $TRIM_VDEVS
+
+       log_must zpool create -f $TESTPOOL $type $TRIM_VDEVS
+       log_must zpool set autotrim=on $TESTPOOL
+
+       # Add and remove data from the pool in a random fashion in order
+       # to generate a variety of interesting ranges to be auto trimmed.
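+	# File sizes vary pseudo-randomly from 4 KiB to roughly 132 KiB so
+	# that extents of many different sizes are freed and trimmed.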
+       for n in {0..10}; do
+               dir="/$TESTPOOL/autotrim-$((RANDOM % 5))"
+               filesize=$((4096 + ((RANDOM * 691) % 131072) ))
+               log_must rm -rf $dir
+               log_must fill_fs $dir 10 10 $filesize 1 R
+               zpool sync
+       done
+       log_must du -hs /$TESTPOOL
+
+       verify_trim_io $TESTPOOL "ind" 10
+       verify_pool $TESTPOOL
+
+       log_must zpool destroy $TESTPOOL
+       log_must rm -f $TRIM_VDEVS
+done
+
+log_pass "Automatic trim successfully validated"
diff --git a/tests/zfs-tests/tests/functional/trim/autotrim_trim_integrity.ksh b/tests/zfs-tests/tests/functional/trim/autotrim_trim_integrity.ksh
new file mode 100755 (executable)
index 0000000..c0e850c
--- /dev/null
@@ -0,0 +1,93 @@
+#!/bin/ksh -p
+#
+# CDDL HEADER START
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source.  A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright (c) 2019 by Tim Chase. All rights reserved.
+# Copyright (c) 2019 Lawrence Livermore National Security, LLC.
+#
+
+. $STF_SUITE/include/libtest.shlib
+. $STF_SUITE/tests/functional/trim/trim.kshlib
+. $STF_SUITE/tests/functional/trim/trim.cfg
+
+#
+# DESCRIPTION:
+#      Verify automatic trim and manual trim coexist correctly.
+#
+# STRATEGY:
+#      1. Create a pool on sparse file vdevs to trim.
+#      2. Set autotrim=on to enable asynchronous pool trimming.
+#      3. Generate some interesting pool data which can be trimmed.
+#      4. While generating data issue manual trims.
+#	5. Verify trim IOs of the expected type were issued for the pool.
+#	6. Verify data integrity of the pool after trim.
+#	7. Repeat the test for striped, mirrored, and RAIDZ pools.
+
+verify_runnable "global"
+
+log_assert "Set 'autotrim=on', run 'zpool trim' and verify pool data integrity"
+
+function cleanup
+{
+       if poolexists $TESTPOOL; then
+               destroy_pool $TESTPOOL
+       fi
+
+       log_must rm -f $TRIM_VDEVS
+
+       log_must set_tunable64 zfs_trim_extent_bytes_min $trim_extent_bytes_min
+       log_must set_tunable64 zfs_trim_txg_batch $trim_txg_batch
+}
+log_onexit cleanup
+
+# Minimum trim size is decreased to verify all trim sizes.
+typeset trim_extent_bytes_min=$(get_tunable zfs_trim_extent_bytes_min)
+log_must set_tunable64 zfs_trim_extent_bytes_min 4096
+
+# Reduced zfs_trim_txg_batch to make trimming more frequent.
+typeset trim_txg_batch=$(get_tunable zfs_trim_txg_batch)
+log_must set_tunable64 zfs_trim_txg_batch 8
+
+for type in "" "mirror" "raidz" "raidz2" "raidz3"; do
+       log_must truncate -s 1G $TRIM_VDEVS
+
+       log_must zpool create -f $TESTPOOL $type $TRIM_VDEVS
+       log_must zpool set autotrim=on $TESTPOOL
+
+       # Add and remove data from the pool in a random fashion in order
+       # to generate a variety of interesting ranges to be auto trimmed.
+       for n in {0..10}; do
+               dir="/$TESTPOOL/autotrim-$((RANDOM % 5))"
+               filesize=$((4096 + ((RANDOM * 691) % 131072) ))
+               log_must rm -rf $dir
+               log_must fill_fs $dir 10 10 $filesize 1 R
+               zpool sync
+
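+		# On every fourth pass start a manual trim while autotrim is
+		# active and wait for it to complete before continuing.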
+               if [[ $((n % 4)) -eq 0 ]]; then
+                       log_must zpool trim $TESTPOOL
+                       wait_trim $TESTPOOL $TRIM_VDEVS
+               fi
+       done
+       log_must du -hs /$TESTPOOL
+
+       verify_trim_io $TESTPOOL "ind" 10
+       verify_pool $TESTPOOL
+
+       log_must zpool destroy $TESTPOOL
+       log_must rm -f $TRIM_VDEVS
+done
+
+log_pass "Automatic trim and manual trim coexistence successfully validated"
diff --git a/tests/zfs-tests/tests/functional/trim/cleanup.ksh b/tests/zfs-tests/tests/functional/trim/cleanup.ksh
new file mode 100755 (executable)
index 0000000..29d1404
--- /dev/null
@@ -0,0 +1,48 @@
+#!/bin/ksh -p
+#
+# CDDL HEADER START
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source.  A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright (c) 2019 by Tim Chase. All rights reserved.
+# Copyright (c) 2019 Lawrence Livermore National Security, LLC.
+#
+
+. $STF_SUITE/include/libtest.shlib
+
+verify_runnable "global"
+
+if poolexists $TESTPOOL; then
+       destroy_pool $TESTPOOL
+fi
+
+if poolexists $TESTPOOL1; then
+       destroy_pool $TESTPOOL1
+fi
+
+TRIM_DIR="$TEST_BASE_DIR"
+TRIM_VDEVS="$TRIM_DIR/trim-vdev1 $TRIM_DIR/trim-vdev2 \
+    $TRIM_DIR/trim-vdev3 $TRIM_DIR/trim-vdev4"
+
+rm -rf $TRIM_VDEVS
+
+default_cleanup
diff --git a/tests/zfs-tests/tests/functional/trim/setup.ksh b/tests/zfs-tests/tests/functional/trim/setup.ksh
new file mode 100755 (executable)
index 0000000..cdcf038
--- /dev/null
@@ -0,0 +1,37 @@
+#!/bin/ksh -p
+#
+# CDDL HEADER START
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source.  A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright (c) 2019 by Tim Chase. All rights reserved.
+# Copyright (c) 2019 Lawrence Livermore National Security, LLC.
+#
+
+. $STF_SUITE/include/libtest.shlib
+
+verify_runnable "global"
+
+DISK1=${DISKS%% *}
+
+typeset -i max_discard=0
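+# lsblk -D reports discard capabilities; the fourth column (DISC-MAX) is the
+# maximum discard size in bytes and is zero when discard is unsupported.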
+if [[ -b $DEV_RDSKDIR/$DISK1 ]]; then
+       max_discard=$(lsblk -Dbn $DEV_RDSKDIR/$DISK1 | awk '{ print $4; exit }')
+fi
+
+if test $max_discard -eq 0; then
+       log_unsupported "DISKS do not support discard (TRIM/UNMAP)"
+fi
+
+log_pass
diff --git a/tests/zfs-tests/tests/functional/trim/trim.cfg b/tests/zfs-tests/tests/functional/trim/trim.cfg
new file mode 100644 (file)
index 0000000..91adb76
--- /dev/null
@@ -0,0 +1,33 @@
+#!/bin/ksh -p
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright (c) 2019 by Tim Chase. All rights reserved.
+# Copyright (c) 2019 Lawrence Livermore National Security, LLC.
+#
+
+TRIM_DIR="$TEST_BASE_DIR"
+TRIM_VDEV1="$TRIM_DIR/trim-vdev1"
+TRIM_VDEV2="$TRIM_DIR/trim-vdev2"
+TRIM_VDEV3="$TRIM_DIR/trim-vdev3"
+TRIM_VDEV4="$TRIM_DIR/trim-vdev4"
+TRIM_VDEVS="$TRIM_VDEV1 $TRIM_VDEV2 $TRIM_VDEV3 $TRIM_VDEV4"
diff --git a/tests/zfs-tests/tests/functional/trim/trim.kshlib b/tests/zfs-tests/tests/functional/trim/trim.kshlib
new file mode 100644 (file)
index 0000000..02802d8
--- /dev/null
@@ -0,0 +1,154 @@
+#!/bin/ksh -p
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source.  A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright (c) 2019 by Tim Chase. All rights reserved.
+# Copyright (c) 2019 Lawrence Livermore National Security, LLC.
+#
+
+. $STF_SUITE/tests/functional/cli_root/zpool_trim/zpool_trim.kshlib
+
+#
+# Get the actual on-disk size in MB of the provided file.
+#
+function get_size_mb
+{
+       typeset rval=$(du --block-size 1048576 -s "$1" | awk '{print $1}')
+       echo -n "$rval"
+}
+
+#
+# Get the number of trim IOs issued for the pool (ind or agg).
+#
+function get_trim_io
+{
+	typeset pool="${1:-$TESTPOOL}"
+	typeset type="${2:-ind}"
+       typeset rval
+
+       # Sum the ind or agg columns of the trim request size histogram.
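+	# Columns 12 and 13 hold the individual and aggregated trim counts.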
+       case "$type" in
+       "ind")
+               rval=$(zpool iostat -pr $pool | awk \
+                   '$1 ~ /[0-9].*/ { sum += $12 } END { print sum }')
+               echo -n "$rval"
+               ;;
+       "agg")
+               rval=$(zpool iostat -pr $pool | awk \
+                   '$1 ~ /[0-9].*/ { sum += $13 } END { print sum }')
+               echo -n "$rval"
+               ;;
+       *)
+               log_fail "Type must be 'ind' or 'agg'"
+               ;;
+       esac
+}
+
+#
+# Verify that trim IOs were sent to devices in the pool.
+#
+function verify_trim_io
+{
+       typeset pool="${1:-$TESTPOOL}"
+       typeset type="${2:-ind}"
+       typeset min_trim_ios=${3:-100}
+       typeset ios
+
+       ios=$(get_trim_io $pool $type)
+       if [[ $ios -ge $min_trim_ios ]]; then
+               log_note "Issued $ios $type trim IOs for pool $pool"
+       else
+               log_fail "Too few trim IOs issued $ios/$min_trim_ios"
+       fi
+}
+
+#
+# Run N txgs which should be enough to trim the entire pool.
+#
+function wait_trim_io # pool type txgs
+{
+	typeset pool="${1:-$TESTPOOL}"
+	typeset type="${2:-ind}"
+       typeset txgs=${3:-10}
+       typeset timeout=120
+       typeset stop_time=$(( $(date +%s) + $timeout ))
+
+       typeset -i i=0
+       while [[ $i -lt $txgs ]]; do
+               if [ "$(date +%s)" -ge $stop_time ]; then
+                       log_fail "Exceeded trim time limit of ${timeout}s"
+                       return
+               fi
+
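+		# Each 'zpool sync -f' forces another txg, giving the autotrim
+		# thread an opportunity to issue more TRIM IOs.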
+               zpool sync -f
+               ((i = i + 1))
+       done
+
+       typeset ios=$(get_trim_io $pool $type)
+       log_note "Waited for $txgs txgs, $ios $type TRIM IOs"
+}
+
+#
+# Verify the size of each file vdev against a target value.
+#
+function verify_vdevs # op size vdevs
+{
+       typeset tgt_op=$1
+       typeset tgt_size=$2
+       shift 2
+       typeset vdevs=$@
+
+       for vdev in $vdevs; do
+               typeset size=$(get_size_mb $vdev)
+               if test $size $tgt_op $tgt_size; then
+                       log_note "Success $vdev is $size MB which is $tgt_op" \
+                           "than $tgt_size MB"
+               else
+                       log_fail "Failure $vdev is $size MB which is not" \
+                           "$tgt_op than $tgt_size MB"
+               fi
+       done
+}
+
+#
+# Wait for up to 120 seconds for trimming of the listed vdevs to complete.
+#
+function wait_trim # pool vdevs
+{
+       typeset stop_time=$(( $(date +%s) + 120 ))
+       typeset pool="$1"
+       shift
+       typeset vdevs=$@
+       typeset complete
+
+       while [[ $complete -eq 0 ]]; do
+               complete=1
+
+               for vdev in $vdevs; do
+                       if [[ "$(trim_progress $pool $vdev)" -lt "100" ]]; then
+                               complete=0
+                               break
+                       else
+                               log_must eval "trim_prog_line $pool $vdev | \
+                                   grep complete"
+                       fi
+               done
+
+               if [ "$(date +%s)" -ge $stop_time ]; then
+                       log_fail "Exceeded trim time limit of 120s"
+               fi
+
+               sleep 0.5
+       done
+
+       log_note "Pool completed trim successfully."
+}
diff --git a/tests/zfs-tests/tests/functional/trim/trim_config.ksh b/tests/zfs-tests/tests/functional/trim/trim_config.ksh
new file mode 100755 (executable)
index 0000000..e56bd62
--- /dev/null
@@ -0,0 +1,103 @@
+#!/bin/ksh -p
+#
+# CDDL HEADER START
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source.  A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright (c) 2019 by Tim Chase. All rights reserved.
+# Copyright (c) 2019 Lawrence Livermore National Security, LLC.
+#
+
+. $STF_SUITE/include/libtest.shlib
+. $STF_SUITE/tests/functional/trim/trim.kshlib
+. $STF_SUITE/tests/functional/trim/trim.cfg
+
+#
+# DESCRIPTION:
+#	Check various pool geometries (stripe, mirror, raidz)
+#
+# STRATEGY:
+#      1. Create a pool on file vdevs to trim.
+#      2. Fill the pool to a known percentage of capacity.
+#      3. Verify the vdevs contain 75% or more allocated blocks.
+#      4. Remove all files making it possible to trim the entire pool.
+#      5. Manually trim the pool.
+#      6. Wait for trim to issue trim IOs for the free blocks.
+#      7. Verify the disks contain 30% or less allocated blocks.
+#	8. Repeat the test for striped, mirrored, and RAIDZ pools.
+
+verify_runnable "global"
+
+log_assert "Run 'zpool trim' verify pool disks were trimmed"
+
+function cleanup
+{
+       if poolexists $TESTPOOL; then
+               destroy_pool $TESTPOOL
+       fi
+
+       log_must rm -f $TRIM_VDEVS
+
+       log_must set_tunable64 zfs_trim_extent_bytes_min $trim_extent_bytes_min
+       log_must set_tunable64 zfs_trim_txg_batch $trim_txg_batch
+       log_must set_tunable64 zfs_vdev_min_ms_count $vdev_min_ms_count
+}
+log_onexit cleanup
+
+# Minimum trim size is decreased to verify all trim sizes.
+typeset trim_extent_bytes_min=$(get_tunable zfs_trim_extent_bytes_min)
+log_must set_tunable64 zfs_trim_extent_bytes_min 4096
+
+# Reduced zfs_trim_txg_batch to make trimming more frequent.
+typeset trim_txg_batch=$(get_tunable zfs_trim_txg_batch)
+log_must set_tunable64 zfs_trim_txg_batch 8
+
+# Increased metaslabs to better simulate larger, more realistic devices.
+typeset vdev_min_ms_count=$(get_tunable zfs_vdev_min_ms_count)
+log_must set_tunable64 zfs_vdev_min_ms_count 32
+
+typeset VDEV_MAX_MB=$(( floor(4 * MINVDEVSIZE * 0.75 / 1024 / 1024) ))
+typeset VDEV_MIN_MB=$(( floor(4 * MINVDEVSIZE * 0.30 / 1024 / 1024) ))
+
+for type in "" "mirror" "raidz2"; do
+
+       if [[ "$type" = "" ]]; then
+               VDEVS="$TRIM_VDEV1"
+       elif [[ "$type" = "mirror" ]]; then
+               VDEVS="$TRIM_VDEV1 $TRIM_VDEV2"
+       else
+               VDEVS="$TRIM_VDEV1 $TRIM_VDEV2 $TRIM_VDEV3"
+       fi
+
+       log_must truncate -s $((4 * MINVDEVSIZE)) $VDEVS
+       log_must zpool create -f $TESTPOOL $type $VDEVS
+
+       typeset availspace=$(get_prop available $TESTPOOL)
+       typeset fill_mb=$(( floor(availspace * 0.90 / 1024 / 1024) ))
+
+       # Fill the pool, verify the vdevs are no longer sparse.
+       file_write -o create -f /$TESTPOOL/file -b 1048576 -c $fill_mb -d R
+       verify_vdevs "-gt" "$VDEV_MAX_MB" $VDEVS
+
+       # Remove the file, issue trim, verify the vdevs are now sparse.
+       log_must rm /$TESTPOOL/file
+       log_must zpool trim $TESTPOOL
+       wait_trim $TESTPOOL $VDEVS
+       verify_vdevs "-le" "$VDEV_MIN_MB" $VDEVS
+
+       log_must zpool destroy $TESTPOOL
+       log_must rm -f $VDEVS
+done
+
+log_pass "Manual trim successfully shrunk vdevs"
diff --git a/tests/zfs-tests/tests/functional/trim/trim_integrity.ksh b/tests/zfs-tests/tests/functional/trim/trim_integrity.ksh
new file mode 100755 (executable)
index 0000000..0bbc439
--- /dev/null
@@ -0,0 +1,89 @@
+#!/bin/ksh -p
+#
+# CDDL HEADER START
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source.  A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright (c) 2019 by Tim Chase. All rights reserved.
+# Copyright (c) 2019 Lawrence Livermore National Security, LLC.
+#
+
+. $STF_SUITE/include/libtest.shlib
+. $STF_SUITE/tests/functional/trim/trim.kshlib
+. $STF_SUITE/tests/functional/trim/trim.cfg
+
+#
+# DESCRIPTION:
+#      Verify manual trim pool data integrity.
+#
+# STRATEGY:
+#      1. Create a pool on sparse file vdevs to trim.
+#      2. Generate some interesting pool data which can be trimmed.
+#      3. Manually trim the pool.
+#      4. Verify trim IOs of the expected type were issued for the pool.
+#      5. Verify data integrity of the pool after trim.
+#	6. Repeat the test for striped, mirrored, and RAIDZ pools.
+
+verify_runnable "global"
+
+log_assert "Run 'zpool trim' and verify pool data integrity"
+
+function cleanup
+{
+       if poolexists $TESTPOOL; then
+               destroy_pool $TESTPOOL
+       fi
+
+       log_must rm -f $TRIM_VDEVS
+
+       log_must set_tunable64 zfs_trim_extent_bytes_min $trim_extent_bytes_min
+       log_must set_tunable64 zfs_trim_txg_batch $trim_txg_batch
+}
+log_onexit cleanup
+
+# Minimum trim size is decreased to verify all trim sizes.
+typeset trim_extent_bytes_min=$(get_tunable zfs_trim_extent_bytes_min)
+log_must set_tunable64 zfs_trim_extent_bytes_min 4096
+
+# Reduced zfs_trim_txg_batch to make trimming more frequent.
+typeset trim_txg_batch=$(get_tunable zfs_trim_txg_batch)
+log_must set_tunable64 zfs_trim_txg_batch 8
+
+for type in "" "mirror" "raidz" "raidz2" "raidz3"; do
+       log_must truncate -s 1G $TRIM_VDEVS
+
+       log_must zpool create -f $TESTPOOL $type $TRIM_VDEVS
+
+       # Add and remove data from the pool in a random fashion in order
+       # to generate a variety of interesting ranges to be manually trimmed.
+       for n in {0..10}; do
+               dir="/$TESTPOOL/trim-$((RANDOM % 5))"
+               filesize=$((4096 + ((RANDOM * 691) % 131072) ))
+               log_must rm -rf $dir
+               log_must fill_fs $dir 10 10 $filesize 1 R
+               zpool sync
+       done
+       log_must du -hs /$TESTPOOL
+
+       log_must zpool trim $TESTPOOL
+       wait_trim $TESTPOOL $TRIM_VDEVS
+
+       verify_trim_io $TESTPOOL "ind" 10
+       verify_pool $TESTPOOL
+
+       log_must zpool destroy $TESTPOOL
+       log_must rm -f $TRIM_VDEVS
+done
+
+log_pass "Manual trim successfully validated"