/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2011, 2017 by Delphix. All rights reserved.
+ * Copyright (c) 2011, 2018 by Delphix. All rights reserved.
* Copyright (c) 2014 Integros [integros.com]
* Copyright 2016 Nexenta Systems, Inc.
* Copyright (c) 2017, 2018 Lawrence Livermore National Security, LLC.
static void
print_vdev_metaslab_header(vdev_t *vd)
{
- (void) printf("\tvdev %10llu\n\t%-10s%5llu %-19s %-15s %-10s\n",
- (u_longlong_t)vd->vdev_id,
+ vdev_alloc_bias_t alloc_bias = vd->vdev_alloc_bias;
+ const char *bias_str;
+
+ bias_str = (alloc_bias == VDEV_BIAS_LOG || vd->vdev_islog) ?
+ VDEV_ALLOC_BIAS_LOG :
+ (alloc_bias == VDEV_BIAS_SPECIAL) ? VDEV_ALLOC_BIAS_SPECIAL :
+ (alloc_bias == VDEV_BIAS_DEDUP) ? VDEV_ALLOC_BIAS_DEDUP :
+ vd->vdev_islog ? "log" : "";
+
+ (void) printf("\tvdev %10llu %s\n"
+ "\t%-10s%5llu %-19s %-15s %-12s\n",
+ (u_longlong_t)vd->vdev_id, bias_str,
"metaslabs", (u_longlong_t)vd->vdev_ms_count,
"offset", "spacemap", "free");
- (void) printf("\t%15s %19s %15s %10s\n",
+ (void) printf("\t%15s %19s %15s %12s\n",
"---------------", "-------------------",
- "---------------", "-------------");
+ "---------------", "------------");
}
static void
vdev_t *tvd = rvd->vdev_child[c];
metaslab_group_t *mg = tvd->vdev_mg;
- if (mg->mg_class != mc)
+ if (mg == NULL || mg->mg_class != mc)
continue;
metaslab_group_histogram_verify(mg);
uint64_t zb_count;
uint64_t zb_gangs;
uint64_t zb_ditto_samevdev;
+ uint64_t zb_ditto_same_ms;
uint64_t zb_psize_histogram[PSIZE_HISTO_SIZE];
} zdb_blkstats_t;
uint32_t **zcb_vd_obsolete_counts;
} zdb_cb_t;
+/* Test whether two DVA offsets from the same vdev fall within the same metaslab */
+static boolean_t
+same_metaslab(spa_t *spa, uint64_t vdev, uint64_t off1, uint64_t off2)
+{
+ vdev_t *vd = vdev_lookup_top(spa, vdev);
+ uint64_t ms_shift = vd->vdev_ms_shift;
+
+ return ((off1 >> ms_shift) == (off2 >> ms_shift));
+}
+
static void
zdb_count_block(zdb_cb_t *zcb, zilog_t *zilog, const blkptr_t *bp,
dmu_object_type_t type)
if (zilog && zil_bp_tree_add(zilog, bp) != 0)
return;
+ spa_config_enter(zcb->zcb_spa, SCL_CONFIG, FTAG, RW_READER);
+
for (i = 0; i < 4; i++) {
int l = (i < 2) ? BP_GET_LEVEL(bp) : ZB_TOTAL;
int t = (i & 1) ? type : ZDB_OT_TOTAL;
switch (BP_GET_NDVAS(bp)) {
case 2:
if (DVA_GET_VDEV(&bp->blk_dva[0]) ==
- DVA_GET_VDEV(&bp->blk_dva[1]))
+ DVA_GET_VDEV(&bp->blk_dva[1])) {
zb->zb_ditto_samevdev++;
+
+ if (same_metaslab(zcb->zcb_spa,
+ DVA_GET_VDEV(&bp->blk_dva[0]),
+ DVA_GET_OFFSET(&bp->blk_dva[0]),
+ DVA_GET_OFFSET(&bp->blk_dva[1])))
+ zb->zb_ditto_same_ms++;
+ }
break;
case 3:
equal = (DVA_GET_VDEV(&bp->blk_dva[0]) ==
DVA_GET_VDEV(&bp->blk_dva[1])) +
(DVA_GET_VDEV(&bp->blk_dva[0]) ==
DVA_GET_VDEV(&bp->blk_dva[2])) +
(DVA_GET_VDEV(&bp->blk_dva[1]) ==
DVA_GET_VDEV(&bp->blk_dva[2]));
- if (equal != 0)
+ if (equal != 0) {
zb->zb_ditto_samevdev++;
+
+ if (DVA_GET_VDEV(&bp->blk_dva[0]) ==
+ DVA_GET_VDEV(&bp->blk_dva[1]) &&
+ same_metaslab(zcb->zcb_spa,
+ DVA_GET_VDEV(&bp->blk_dva[0]),
+ DVA_GET_OFFSET(&bp->blk_dva[0]),
+ DVA_GET_OFFSET(&bp->blk_dva[1])))
+ zb->zb_ditto_same_ms++;
+ else if (DVA_GET_VDEV(&bp->blk_dva[0]) ==
+ DVA_GET_VDEV(&bp->blk_dva[2]) &&
+ same_metaslab(zcb->zcb_spa,
+ DVA_GET_VDEV(&bp->blk_dva[0]),
+ DVA_GET_OFFSET(&bp->blk_dva[0]),
+ DVA_GET_OFFSET(&bp->blk_dva[2])))
+ zb->zb_ditto_same_ms++;
+ else if (DVA_GET_VDEV(&bp->blk_dva[1]) ==
+ DVA_GET_VDEV(&bp->blk_dva[2]) &&
+ same_metaslab(zcb->zcb_spa,
+ DVA_GET_VDEV(&bp->blk_dva[1]),
+ DVA_GET_OFFSET(&bp->blk_dva[1]),
+ DVA_GET_OFFSET(&bp->blk_dva[2])))
+ zb->zb_ditto_same_ms++;
+ }
break;
}
-
}
+ spa_config_exit(zcb->zcb_spa, SCL_CONFIG, FTAG);
+
if (BP_IS_EMBEDDED(bp)) {
zcb->zcb_embedded_blocks[BPE_GET_ETYPE(bp)]++;
zcb->zcb_embedded_histogram[BPE_GET_ETYPE(bp)]
flags |= TRAVERSE_PREFETCH_DATA;
zcb.zcb_totalasize = metaslab_class_get_alloc(spa_normal_class(spa));
+ zcb.zcb_totalasize += metaslab_class_get_alloc(spa_special_class(spa));
+ zcb.zcb_totalasize += metaslab_class_get_alloc(spa_dedup_class(spa));
zcb.zcb_start = zcb.zcb_lastprint = gethrtime();
err = traverse_pool(spa, 0, flags, zdb_blkptr_cb, &zcb);
norm_alloc = metaslab_class_get_alloc(spa_normal_class(spa));
norm_space = metaslab_class_get_space(spa_normal_class(spa));
- total_alloc = norm_alloc + metaslab_class_get_alloc(spa_log_class(spa));
+ total_alloc = norm_alloc +
+ metaslab_class_get_alloc(spa_log_class(spa)) +
+ metaslab_class_get_alloc(spa_special_class(spa)) +
+ metaslab_class_get_alloc(spa_dedup_class(spa));
total_found = tzb->zb_asize - zcb.zcb_dedup_asize +
zcb.zcb_removing_size + zcb.zcb_checkpoint_size;
return (2);
(void) printf("\n");
- (void) printf("\tbp count: %10llu\n",
+ (void) printf("\t%-16s %14llu\n", "bp count:",
(u_longlong_t)tzb->zb_count);
- (void) printf("\tganged count: %10llu\n",
+ (void) printf("\t%-16s %14llu\n", "ganged count:",
(longlong_t)tzb->zb_gangs);
- (void) printf("\tbp logical: %10llu avg: %6llu\n",
+ (void) printf("\t%-16s %14llu avg: %6llu\n", "bp logical:",
(u_longlong_t)tzb->zb_lsize,
(u_longlong_t)(tzb->zb_lsize / tzb->zb_count));
- (void) printf("\tbp physical: %10llu avg:"
- " %6llu compression: %6.2f\n",
- (u_longlong_t)tzb->zb_psize,
+ (void) printf("\t%-16s %14llu avg: %6llu compression: %6.2f\n",
+ "bp physical:", (u_longlong_t)tzb->zb_psize,
(u_longlong_t)(tzb->zb_psize / tzb->zb_count),
(double)tzb->zb_lsize / tzb->zb_psize);
- (void) printf("\tbp allocated: %10llu avg:"
- " %6llu compression: %6.2f\n",
- (u_longlong_t)tzb->zb_asize,
+ (void) printf("\t%-16s %14llu avg: %6llu compression: %6.2f\n",
+ "bp allocated:", (u_longlong_t)tzb->zb_asize,
(u_longlong_t)(tzb->zb_asize / tzb->zb_count),
(double)tzb->zb_lsize / tzb->zb_asize);
- (void) printf("\tbp deduped: %10llu ref>1:"
- " %6llu deduplication: %6.2f\n",
- (u_longlong_t)zcb.zcb_dedup_asize,
+ (void) printf("\t%-16s %14llu ref>1: %6llu deduplication: %6.2f\n",
+ "bp deduped:", (u_longlong_t)zcb.zcb_dedup_asize,
(u_longlong_t)zcb.zcb_dedup_blocks,
(double)zcb.zcb_dedup_asize / tzb->zb_asize + 1.0);
- (void) printf("\tSPA allocated: %10llu used: %5.2f%%\n",
+ (void) printf("\t%-16s %14llu used: %5.2f%%\n", "Normal class:",
(u_longlong_t)norm_alloc, 100.0 * norm_alloc / norm_space);
+ if (spa_special_class(spa)->mc_rotor != NULL) {
+ uint64_t alloc = metaslab_class_get_alloc(
+ spa_special_class(spa));
+ uint64_t space = metaslab_class_get_space(
+ spa_special_class(spa));
+
+ (void) printf("\t%-16s %14llu used: %5.2f%%\n",
+ "Special class", (u_longlong_t)alloc,
+ 100.0 * alloc / space);
+ }
+
+ if (spa_dedup_class(spa)->mc_rotor != NULL) {
+ uint64_t alloc = metaslab_class_get_alloc(
+ spa_dedup_class(spa));
+ uint64_t space = metaslab_class_get_space(
+ spa_dedup_class(spa));
+
+ (void) printf("\t%-16s %14llu used: %5.2f%%\n",
+ "Dedup class", (u_longlong_t)alloc,
+ 100.0 * alloc / space);
+ }
+
for (i = 0; i < NUM_BP_EMBEDDED_TYPES; i++) {
if (zcb.zcb_embedded_blocks[i] == 0)
continue;
(void) printf("\tDittoed blocks on same vdev: %llu\n",
(longlong_t)tzb->zb_ditto_samevdev);
}
+ if (tzb->zb_ditto_same_ms != 0) {
+ (void) printf("\tDittoed blocks in same metaslab: %llu\n",
+ (longlong_t)tzb->zb_ditto_same_ms);
+ }
for (uint64_t v = 0; v < spa->spa_root_vdev->vdev_children; v++) {
vdev_t *vd = spa->spa_root_vdev->vdev_child[v];
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright 2011 Nexenta Systems, Inc. All rights reserved.
- * Copyright (c) 2011, 2016 by Delphix. All rights reserved.
+ * Copyright (c) 2011, 2018 by Delphix. All rights reserved.
* Copyright (c) 2012 by Frederik Wessels. All rights reserved.
* Copyright (c) 2012 by Cyril Plisko. All rights reserved.
* Copyright (c) 2013 by Prasad Joshi (sTec). All rights reserved.
* Copyright 2016 Igor Kozhukhov <ikozhukhov@gmail.com>.
* Copyright (c) 2017 Datto Inc.
* Copyright (c) 2017 Open-E, Inc. All Rights Reserved.
+ * Copyright (c) 2017, Intel Corporation.
*/
#include <assert.h>
#define NCOMMAND (ARRAY_SIZE(command_table))
+#define VDEV_ALLOC_CLASS_LOGS "logs"
+
static zpool_command_t *current_command;
static char history_str[HIS_MAX_RECORD_LEN];
static boolean_t log_history = B_TRUE;
{
FILE *fp = cb;
- (void) fprintf(fp, "\t%-15s ", zpool_prop_to_name(prop));
+ (void) fprintf(fp, "\t%-19s ", zpool_prop_to_name(prop));
if (zpool_prop_readonly(prop))
(void) fprintf(fp, " NO ");
(void) fprintf(fp,
gettext("\nthe following properties are supported:\n"));
- (void) fprintf(fp, "\n\t%-15s %s %s\n\n",
+ (void) fprintf(fp, "\n\t%-19s %s %s\n\n",
"PROPERTY", "EDIT", "VALUES");
/* Iterate over all properties */
(void) zprop_iter(print_prop_cb, fp, B_FALSE, B_TRUE,
ZFS_TYPE_POOL);
- (void) fprintf(fp, "\t%-15s ", "feature@...");
+ (void) fprintf(fp, "\t%-19s ", "feature@...");
(void) fprintf(fp, "YES disabled | enabled | active\n");
(void) fprintf(fp, gettext("\nThe feature@ properties must be "
exit(requested ? 0 : 2);
}
-void
+/*
+ * Print a pool vdev config for dry runs.
+ */
+static void
print_vdev_tree(zpool_handle_t *zhp, const char *name, nvlist_t *nv, int indent,
- boolean_t print_logs, int name_flags)
+ const char *match, int name_flags)
{
nvlist_t **child;
uint_t c, children;
char *vname;
-
- if (name != NULL)
- (void) printf("\t%*s%s\n", indent, "", name);
+ boolean_t printed = B_FALSE;
if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
- &child, &children) != 0)
+ &child, &children) != 0) {
+ if (name != NULL)
+ (void) printf("\t%*s%s\n", indent, "", name);
return;
+ }
for (c = 0; c < children; c++) {
uint64_t is_log = B_FALSE;
+ char *class = "";
(void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_LOG,
&is_log);
- if ((is_log && !print_logs) || (!is_log && print_logs))
+ if (is_log)
+ class = VDEV_ALLOC_BIAS_LOG;
+ (void) nvlist_lookup_string(child[c],
+ ZPOOL_CONFIG_ALLOCATION_BIAS, &class);
+ if (strcmp(match, class) != 0)
continue;
+ if (!printed && name != NULL) {
+ (void) printf("\t%*s%s\n", indent, "", name);
+ printed = B_TRUE;
+ }
vname = zpool_vdev_name(g_zfs, zhp, child[c], name_flags);
- print_vdev_tree(zhp, vname, child[c], indent + 2,
- B_FALSE, name_flags);
+ print_vdev_tree(zhp, vname, child[c], indent + 2, "",
+ name_flags);
free(vname);
}
}
"configuration:\n"), zpool_get_name(zhp));
/* print original main pool and new tree */
- print_vdev_tree(zhp, poolname, poolnvroot, 0, B_FALSE,
+ print_vdev_tree(zhp, poolname, poolnvroot, 0, "",
+ name_flags | VDEV_NAME_TYPE_ID);
+ print_vdev_tree(zhp, NULL, nvroot, 0, "", name_flags);
+
+ /* print other classes: 'dedup', 'special', and 'log' */
+ print_vdev_tree(zhp, "dedup", poolnvroot, 0,
+ VDEV_ALLOC_BIAS_DEDUP, name_flags);
+ print_vdev_tree(zhp, NULL, nvroot, 0, VDEV_ALLOC_BIAS_DEDUP,
name_flags);
- print_vdev_tree(zhp, NULL, nvroot, 0, B_FALSE, name_flags);
- /* Do the same for the logs */
- if (num_logs(poolnvroot) > 0) {
- print_vdev_tree(zhp, "logs", poolnvroot, 0, B_TRUE,
- name_flags);
- print_vdev_tree(zhp, NULL, nvroot, 0, B_TRUE,
- name_flags);
- } else if (num_logs(nvroot) > 0) {
- print_vdev_tree(zhp, "logs", nvroot, 0, B_TRUE,
- name_flags);
- }
+ print_vdev_tree(zhp, "special", poolnvroot, 0,
+ VDEV_ALLOC_BIAS_SPECIAL, name_flags);
+ print_vdev_tree(zhp, NULL, nvroot, 0, VDEV_ALLOC_BIAS_SPECIAL,
+ name_flags);
+
+ print_vdev_tree(zhp, "logs", poolnvroot, 0, VDEV_ALLOC_BIAS_LOG,
+ name_flags);
+ print_vdev_tree(zhp, NULL, nvroot, 0, VDEV_ALLOC_BIAS_LOG,
+ name_flags);
/* Do the same for the caches */
if (nvlist_lookup_nvlist_array(poolnvroot, ZPOOL_CONFIG_L2CACHE,
(void) printf(gettext("would create '%s' with the "
"following layout:\n\n"), poolname);
- print_vdev_tree(NULL, poolname, nvroot, 0, B_FALSE, 0);
- if (num_logs(nvroot) > 0)
- print_vdev_tree(NULL, "logs", nvroot, 0, B_TRUE, 0);
+ print_vdev_tree(NULL, poolname, nvroot, 0, "", 0);
+ print_vdev_tree(NULL, "dedup", nvroot, 0,
+ VDEV_ALLOC_BIAS_DEDUP, 0);
+ print_vdev_tree(NULL, "special", nvroot, 0,
+ VDEV_ALLOC_BIAS_SPECIAL, 0);
+ print_vdev_tree(NULL, "logs", nvroot, 0,
+ VDEV_ALLOC_BIAS_LOG, 0);
ret = 0;
} else {
&ishole);
if (islog || ishole)
continue;
+ /* Only print normal classes here */
+ if (nvlist_exists(child[c], ZPOOL_CONFIG_ALLOCATION_BIAS))
+ continue;
+
vname = zpool_vdev_name(g_zfs, zhp, child[c],
cb->cb_name_flags | VDEV_NAME_TYPE_ID);
print_status_config(zhp, cb, vname, child[c], depth + 2,
&is_log);
if (is_log)
continue;
+ if (nvlist_exists(child[c], ZPOOL_CONFIG_ALLOCATION_BIAS))
+ continue;
vname = zpool_vdev_name(g_zfs, NULL, child[c],
cb->cb_name_flags | VDEV_NAME_TYPE_ID);
}
/*
- * Print log vdevs.
- * Logs are recorded as top level vdevs in the main pool child array
- * but with "is_log" set to 1. We use either print_status_config() or
- * print_import_config() to print the top level logs then any log
- * children (eg mirrored slogs) are printed recursively - which
- * works because only the top level vdev is marked "is_log"
+ * Print specialized class vdevs.
+ *
+ * These are recorded as top level vdevs in the main pool child array
+ * but with "is_log" set to 1 or an "alloc_bias" string. We use either
+ * print_status_config() or print_import_config() to print the top level
+ * class vdevs. Any of their children (e.g. mirrored slogs) are printed
+ * recursively, which works because only the top level vdev is marked.
*/
static void
-print_logs(zpool_handle_t *zhp, status_cbdata_t *cb, nvlist_t *nv)
+print_class_vdevs(zpool_handle_t *zhp, status_cbdata_t *cb, nvlist_t *nv,
+ const char *class)
{
uint_t c, children;
nvlist_t **child;
+ boolean_t printed = B_FALSE;
+
+ assert(zhp != NULL || !cb->cb_verbose);
if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, &child,
&children) != 0)
return;
- (void) printf(gettext("\tlogs\n"));
-
for (c = 0; c < children; c++) {
uint64_t is_log = B_FALSE;
- char *name;
+ char *bias = NULL;
+ char *type = NULL;
(void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_LOG,
&is_log);
- if (!is_log)
+
+ if (is_log) {
+ bias = VDEV_ALLOC_CLASS_LOGS;
+ } else {
+ (void) nvlist_lookup_string(child[c],
+ ZPOOL_CONFIG_ALLOCATION_BIAS, &bias);
+ (void) nvlist_lookup_string(child[c],
+ ZPOOL_CONFIG_TYPE, &type);
+ }
+
+ if (bias == NULL || strcmp(bias, class) != 0)
+ continue;
+ if (!is_log && strcmp(type, VDEV_TYPE_INDIRECT) == 0)
continue;
- name = zpool_vdev_name(g_zfs, zhp, child[c],
+
+ if (!printed) {
+ (void) printf("\t%s\t\n", gettext(class));
+ printed = B_TRUE;
+ }
+
+ char *name = zpool_vdev_name(g_zfs, zhp, child[c],
cb->cb_name_flags | VDEV_NAME_TYPE_ID);
if (cb->cb_print_status)
print_status_config(zhp, cb, name, child[c], 2,
cb.cb_namewidth = 10;
print_import_config(&cb, name, nvroot, 0);
- if (num_logs(nvroot) > 0)
- print_logs(NULL, &cb, nvroot);
+
+ print_class_vdevs(NULL, &cb, nvroot, VDEV_ALLOC_BIAS_DEDUP);
+ print_class_vdevs(NULL, &cb, nvroot, VDEV_ALLOC_BIAS_SPECIAL);
+ print_class_vdevs(NULL, &cb, nvroot, VDEV_ALLOC_CLASS_LOGS);
if (reason == ZPOOL_STATUS_BAD_GUID_SUM) {
(void) printf(gettext("\n\tAdditional devices are known to "
format, column_width, cb->cb_scripted);
}
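+/* Allocation class section names, in the order they are printed */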
+static const char *class_name[] = {
+ VDEV_ALLOC_BIAS_DEDUP,
+ VDEV_ALLOC_BIAS_SPECIAL,
+ VDEV_ALLOC_CLASS_LOGS
+};
+
/*
* Print out all the statistics for the given vdev. This can either be the
* toplevel configuration, or called recursively. If 'name' is NULL, then this
*
* Returns the number of stat lines printed.
*/
-unsigned int
+static unsigned int
print_vdev_stats(zpool_handle_t *zhp, const char *name, nvlist_t *oldnv,
nvlist_t *newnv, iostat_cbdata_t *cb, int depth)
{
children = MIN(oldchildren, children);
}
+ /*
+ * print normal top-level devices
+ */
for (c = 0; c < children; c++) {
uint64_t ishole = B_FALSE, islog = B_FALSE;
if (ishole || islog)
continue;
+ if (nvlist_exists(newchild[c], ZPOOL_CONFIG_ALLOCATION_BIAS))
+ continue;
+
vname = zpool_vdev_name(g_zfs, zhp, newchild[c],
cb->cb_name_flags);
ret += print_vdev_stats(zhp, vname, oldnv ? oldchild[c] : NULL,
}
/*
- * Log device section
+ * print all other top-level devices
*/
-
- if (num_logs(newnv) > 0) {
- if ((!(cb->cb_flags & IOS_ANYHISTO_M)) && !cb->cb_scripted &&
- !cb->cb_vdev_names) {
- print_iostat_dashes(cb, 0, "logs");
- }
- printf("\n");
+ for (uint_t n = 0; n < 3; n++) {
+ boolean_t printed = B_FALSE;
for (c = 0; c < children; c++) {
uint64_t islog = B_FALSE;
+ char *bias = NULL;
+ char *type = NULL;
+
(void) nvlist_lookup_uint64(newchild[c],
ZPOOL_CONFIG_IS_LOG, &islog);
-
if (islog) {
- vname = zpool_vdev_name(g_zfs, zhp, newchild[c],
- cb->cb_name_flags);
- ret += print_vdev_stats(zhp, vname, oldnv ?
- oldchild[c] : NULL, newchild[c],
- cb, depth + 2);
- free(vname);
+ bias = VDEV_ALLOC_CLASS_LOGS;
+ } else {
+ (void) nvlist_lookup_string(newchild[c],
+ ZPOOL_CONFIG_ALLOCATION_BIAS, &bias);
+ (void) nvlist_lookup_string(newchild[c],
+ ZPOOL_CONFIG_TYPE, &type);
}
- }
+ if (bias == NULL || strcmp(bias, class_name[n]) != 0)
+ continue;
+ if (!islog && strcmp(type, VDEV_TYPE_INDIRECT) == 0)
+ continue;
+ if (!printed) {
+ if ((!(cb->cb_flags & IOS_ANYHISTO_M)) &&
+ !cb->cb_scripted && !cb->cb_vdev_names) {
+ print_iostat_dashes(cb, 0,
+ class_name[n]);
+ }
+ printf("\n");
+ printed = B_TRUE;
+ }
+
+ vname = zpool_vdev_name(g_zfs, zhp, newchild[c],
+ cb->cb_name_flags);
+ ret += print_vdev_stats(zhp, vname, oldnv ?
+ oldchild[c] : NULL, newchild[c], cb, depth + 2);
+ free(vname);
+ }
}
/*
boolean_t cb_literal;
} list_cbdata_t;
+
/*
* Given a list of columns to display, output appropriate headers for each one.
*/
/*
* Given a pool and a list of properties, print out all the properties according
- * to the described layout.
+ * to the described layout. Used by zpool_do_list().
*/
static void
print_pool(zpool_handle_t *zhp, list_cbdata_t *cb)
}
break;
case ZPOOL_PROP_CAPACITY:
+ /* capacity value is in parts-per-10,000 (aka permyriad) */
if (format == ZFS_NICENUM_RAW)
(void) snprintf(propval, sizeof (propval), "%llu",
- (unsigned long long)value);
+ (unsigned long long)value / 100);
else
- (void) snprintf(propval, sizeof (propval), "%llu%%",
- (unsigned long long)value);
+ (void) snprintf(propval, sizeof (propval),
+ value < 1000 ? "%1.2f%%" : value < 10000 ?
+ "%2.1f%%" : "%3.0f%%", value / 100.0);
break;
default:
zfs_nicenum_format(value, propval, sizeof (propval), format);
(void) printf(" %*s", (int)width, propval);
}
+/*
+ * Print the default, static line of statistics per vdev.
+ * Not compatible with the '-o' <proplist> option.
+ */
void
print_list_stats(zpool_handle_t *zhp, const char *name, nvlist_t *nv,
list_cbdata_t *cb, int depth)
char *vname;
boolean_t scripted = cb->cb_scripted;
uint64_t islog = B_FALSE;
- boolean_t haslog = B_FALSE;
char *dashes = "%-*s - - - - - -\n";
verify(nvlist_lookup_uint64_array(nv, ZPOOL_CONFIG_VDEV_STATS,
(vs->vs_fragmentation != ZFS_FRAG_INVALID && toplevel),
format);
cap = (vs->vs_space == 0) ? 0 :
- (vs->vs_alloc * 100 / vs->vs_space);
+ (vs->vs_alloc * 10000 / vs->vs_space);
print_one_column(ZPOOL_PROP_CAPACITY, cap, scripted, toplevel,
format);
(void) printf("\n");
&child, &children) != 0)
return;
+ /* list the normal vdevs first */
for (c = 0; c < children; c++) {
uint64_t ishole = B_FALSE;
continue;
if (nvlist_lookup_uint64(child[c],
- ZPOOL_CONFIG_IS_LOG, &islog) == 0 && islog) {
- haslog = B_TRUE;
+ ZPOOL_CONFIG_IS_LOG, &islog) == 0 && islog)
+ continue;
+
+ if (nvlist_exists(child[c], ZPOOL_CONFIG_ALLOCATION_BIAS))
continue;
- }
vname = zpool_vdev_name(g_zfs, zhp, child[c],
cb->cb_name_flags);
free(vname);
}
- if (haslog == B_TRUE) {
- /* LINTED E_SEC_PRINTF_VAR_FMT */
- (void) printf(dashes, cb->cb_namewidth, "log");
+ /* list the classes: 'logs', 'dedup', and 'special' */
+ for (uint_t n = 0; n < 3; n++) {
+ boolean_t printed = B_FALSE;
+
for (c = 0; c < children; c++) {
+ char *bias = NULL;
+ char *type = NULL;
+
if (nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_LOG,
- &islog) != 0 || !islog)
+ &islog) == 0 && islog) {
+ bias = VDEV_ALLOC_CLASS_LOGS;
+ } else {
+ (void) nvlist_lookup_string(child[c],
+ ZPOOL_CONFIG_ALLOCATION_BIAS, &bias);
+ (void) nvlist_lookup_string(child[c],
+ ZPOOL_CONFIG_TYPE, &type);
+ }
+ if (bias == NULL || strcmp(bias, class_name[n]) != 0)
continue;
+ if (!islog && strcmp(type, VDEV_TYPE_INDIRECT) == 0)
+ continue;
+
+ if (!printed) {
+ /* LINTED E_SEC_PRINTF_VAR_FMT */
+ (void) printf(dashes, cb->cb_namewidth,
+ class_name[n]);
+ printed = B_TRUE;
+ }
vname = zpool_vdev_name(g_zfs, zhp, child[c],
cb->cb_name_flags);
print_list_stats(zhp, vname, child[c], cb, depth + 2);
}
}
-
/*
* Generic callback function to list a pool.
*/
config = zpool_get_config(zhp, NULL);
+ if (cbp->cb_verbose) {
+ verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
+ &nvroot) == 0);
+
+ cbp->cb_namewidth = max_width(zhp, nvroot, 0, 0,
+ cbp->cb_name_flags);
+ }
+
print_pool(zhp, cbp);
- if (!cbp->cb_verbose)
- return (0);
- verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
- &nvroot) == 0);
- print_list_stats(zhp, NULL, nvroot, cbp, 0);
+ if (cbp->cb_verbose)
+ print_list_stats(zhp, NULL, nvroot, cbp, 0);
return (0);
}
break;
case 'v':
cb.cb_verbose = B_TRUE;
+ cb.cb_namewidth = 8; /* 8 until precalc is available */
break;
case ':':
(void) fprintf(stderr, gettext("missing argument for "
if (flags.dryrun) {
(void) printf(gettext("would create '%s' with the "
"following layout:\n\n"), newpool);
- print_vdev_tree(NULL, newpool, config, 0, B_FALSE,
+ print_vdev_tree(NULL, newpool, config, 0, "",
flags.name_flags);
}
}
print_cmd_columns(cbp->vcdl, 0);
printf("\n");
+
print_status_config(zhp, cbp, zpool_get_name(zhp), nvroot, 0,
B_FALSE);
- if (num_logs(nvroot) > 0)
- print_logs(zhp, cbp, nvroot);
+ print_class_vdevs(zhp, cbp, nvroot, VDEV_ALLOC_BIAS_DEDUP);
+ print_class_vdevs(zhp, cbp, nvroot, VDEV_ALLOC_BIAS_SPECIAL);
+ print_class_vdevs(zhp, cbp, nvroot, VDEV_ALLOC_CLASS_LOGS);
+
if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE,
&l2cache, &nl2cache) == 0)
print_l2cache(zhp, cbp, l2cache, nl2cache);
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2013, 2015 by Delphix. All rights reserved.
- * Copyright (c) 2016 Intel Corporation.
+ * Copyright (c) 2013, 2018 by Delphix. All rights reserved.
+ * Copyright (c) 2016, 2017 Intel Corporation.
* Copyright 2016 Igor Kozhukhov <ikozhukhov@gmail.com>.
*/
verify(nvlist_add_string(vdev, ZPOOL_CONFIG_PATH, path) == 0);
verify(nvlist_add_string(vdev, ZPOOL_CONFIG_TYPE, type) == 0);
verify(nvlist_add_uint64(vdev, ZPOOL_CONFIG_IS_LOG, is_log) == 0);
+ if (is_log)
+ verify(nvlist_add_string(vdev, ZPOOL_CONFIG_ALLOCATION_BIAS,
+ VDEV_ALLOC_BIAS_LOG) == 0);
if (strcmp(type, VDEV_TYPE_DISK) == 0)
verify(nvlist_add_uint64(vdev, ZPOOL_CONFIG_WHOLE_DISK,
(uint64_t)wholedisk) == 0);
*
* Otherwise, make sure that the current spec (if there is one) and the new
* spec have consistent replication levels.
+ *
+ * If there is no current spec (i.e. we are creating), make sure the new
+ * spec has at least one general-purpose vdev.
*/
typedef struct replication_level {
char *zprl_type;
/*
* At this point, we have the replication of the last toplevel
- * vdev in 'rep'. Compare it to 'lastrep' to see if its
+ * vdev in 'rep'. Compare it to 'lastrep' to see if it is
* different.
*/
if (lastrep.zprl_type != NULL) {
return (VDEV_TYPE_LOG);
}
+ if (strcmp(type, VDEV_ALLOC_BIAS_SPECIAL) == 0 ||
+ strcmp(type, VDEV_ALLOC_BIAS_DEDUP) == 0) {
+ if (mindev != NULL)
+ *mindev = 1;
+ return (type);
+ }
+
if (strcmp(type, "cache") == 0) {
if (mindev != NULL)
*mindev = 1;
nvlist_t *nvroot, *nv, **top, **spares, **l2cache;
int t, toplevels, mindev, maxdev, nspares, nlogs, nl2cache;
const char *type;
- uint64_t is_log;
+ uint64_t is_log, is_special, is_dedup;
boolean_t seen_logs;
top = NULL;
nspares = 0;
nlogs = 0;
nl2cache = 0;
- is_log = B_FALSE;
+ is_log = is_special = is_dedup = B_FALSE;
seen_logs = B_FALSE;
nvroot = NULL;
"specified only once\n"));
goto spec_out;
}
- is_log = B_FALSE;
+ is_log = is_special = is_dedup = B_FALSE;
}
if (strcmp(type, VDEV_TYPE_LOG) == 0) {
}
seen_logs = B_TRUE;
is_log = B_TRUE;
+ is_special = B_FALSE;
+ is_dedup = B_FALSE;
argc--;
argv++;
/*
continue;
}
+ if (strcmp(type, VDEV_ALLOC_BIAS_SPECIAL) == 0) {
+ is_special = B_TRUE;
+ is_log = B_FALSE;
+ is_dedup = B_FALSE;
+ argc--;
+ argv++;
+ continue;
+ }
+
+ if (strcmp(type, VDEV_ALLOC_BIAS_DEDUP) == 0) {
+ is_dedup = B_TRUE;
+ is_log = B_FALSE;
+ is_special = B_FALSE;
+ argc--;
+ argv++;
+ continue;
+ }
+
if (strcmp(type, VDEV_TYPE_L2CACHE) == 0) {
if (l2cache != NULL) {
(void) fprintf(stderr,
"specified only once\n"));
goto spec_out;
}
- is_log = B_FALSE;
+ is_log = is_special = is_dedup = B_FALSE;
}
- if (is_log) {
+ if (is_log || is_special || is_dedup) {
if (strcmp(type, VDEV_TYPE_MIRROR) != 0) {
(void) fprintf(stderr,
gettext("invalid vdev "
- "specification: unsupported 'log' "
- "device: %s\n"), type);
+ "specification: unsupported '%s' "
+ "device: %s\n"), is_log ? "log" :
+ "special", type);
goto spec_out;
}
nlogs++;
nl2cache = children;
continue;
} else {
+ /* create a top-level vdev with children */
verify(nvlist_alloc(&nv, NV_UNIQUE_NAME,
0) == 0);
verify(nvlist_add_string(nv, ZPOOL_CONFIG_TYPE,
type) == 0);
verify(nvlist_add_uint64(nv,
ZPOOL_CONFIG_IS_LOG, is_log) == 0);
+ if (is_log)
+ verify(nvlist_add_string(nv,
+ ZPOOL_CONFIG_ALLOCATION_BIAS,
+ VDEV_ALLOC_BIAS_LOG) == 0);
+ if (is_special) {
+ verify(nvlist_add_string(nv,
+ ZPOOL_CONFIG_ALLOCATION_BIAS,
+ VDEV_ALLOC_BIAS_SPECIAL) == 0);
+ }
+ if (is_dedup) {
+ verify(nvlist_add_string(nv,
+ ZPOOL_CONFIG_ALLOCATION_BIAS,
+ VDEV_ALLOC_BIAS_DEDUP) == 0);
+ }
if (strcmp(type, VDEV_TYPE_RAIDZ) == 0) {
verify(nvlist_add_uint64(nv,
ZPOOL_CONFIG_NPARITY,
if (is_log)
nlogs++;
+ if (is_special) {
+ verify(nvlist_add_string(nv,
+ ZPOOL_CONFIG_ALLOCATION_BIAS,
+ VDEV_ALLOC_BIAS_SPECIAL) == 0);
+ }
+ if (is_dedup) {
+ verify(nvlist_add_string(nv,
+ ZPOOL_CONFIG_ALLOCATION_BIAS,
+ VDEV_ALLOC_BIAS_DEDUP) == 0);
+ }
argc--;
argv++;
}
return (newroot);
}
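+/*
+ * Count the normal (non-log, non-allocation-class) top-level vdevs.
+ */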
+static int
+num_normal_vdevs(nvlist_t *nvroot)
+{
+ nvlist_t **top;
+ uint_t t, toplevels, normal = 0;
+
+ verify(nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN,
+ &top, &toplevels) == 0);
+
+ for (t = 0; t < toplevels; t++) {
+ uint64_t log = B_FALSE;
+
+ (void) nvlist_lookup_uint64(top[t], ZPOOL_CONFIG_IS_LOG, &log);
+ if (log)
+ continue;
+ if (nvlist_exists(top[t], ZPOOL_CONFIG_ALLOCATION_BIAS))
+ continue;
+
+ normal++;
+ }
+
+ return (normal);
+}
+
/*
* Get and validate the contents of the given vdev specification. This ensures
* that the nvlist returned is well-formed, that all the devices exist, and that
return (NULL);
}
+ /*
+ * On pool create, the new vdev spec must have at least one normal vdev.
+ */
+ if (poolconfig == NULL && num_normal_vdevs(newroot) == 0) {
+ vdev_error(gettext("at least one general top-level vdev must "
+ "be specified\n"));
+ nvlist_free(newroot);
+ return (NULL);
+ }
+
/*
* Run through the vdev specification and label any whole disks found.
*/
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2011, 2017 by Delphix. All rights reserved.
+ * Copyright (c) 2011, 2018 by Delphix. All rights reserved.
* Copyright 2011 Nexenta Systems, Inc. All rights reserved.
* Copyright (c) 2013 Steven Hartland. All rights reserved.
* Copyright (c) 2014 Integros [integros.com]
* Copyright 2017 Joyent, Inc.
+ * Copyright (c) 2017, Intel Corporation.
*/
/*
static ztest_shared_hdr_t *ztest_shared_hdr;
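+
+/* Coverage states for allocation class vdevs in a ztest run (-C option) */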
+enum ztest_class_state {
+ ZTEST_VDEV_CLASS_OFF,
+ ZTEST_VDEV_CLASS_ON,
+ ZTEST_VDEV_CLASS_RND
+};
+
typedef struct ztest_shared_opts {
char zo_pool[ZFS_MAX_DATASET_NAME_LEN];
char zo_dir[ZFS_MAX_DATASET_NAME_LEN];
uint64_t zo_maxloops;
uint64_t zo_metaslab_force_ganging;
int zo_mmp_test;
+ int zo_special_vdevs;
} ztest_shared_opts_t;
static const ztest_shared_opts_t ztest_opts_defaults = {
.zo_time = 300, /* 5 minutes */
.zo_maxloops = 50, /* max loops during spa_freeze() */
.zo_metaslab_force_ganging = 32 << 10,
+ .zo_special_vdevs = ZTEST_VDEV_CLASS_RND,
};
extern uint64_t metaslab_force_ganging;
ztest_func_t ztest_vdev_attach_detach;
ztest_func_t ztest_vdev_LUN_growth;
ztest_func_t ztest_vdev_add_remove;
+ztest_func_t ztest_vdev_class_add;
ztest_func_t ztest_vdev_aux_add_remove;
ztest_func_t ztest_split_pool;
ztest_func_t ztest_reguid;
ZTI_INIT(ztest_vdev_attach_detach, 1, &zopt_sometimes),
ZTI_INIT(ztest_vdev_LUN_growth, 1, &zopt_rarely),
ZTI_INIT(ztest_vdev_add_remove, 1, &ztest_opts.zo_vdevtime),
+ ZTI_INIT(ztest_vdev_class_add, 1, &ztest_opts.zo_vdevtime),
ZTI_INIT(ztest_vdev_aux_add_remove, 1, &ztest_opts.zo_vdevtime),
ZTI_INIT(ztest_device_removal, 1, &zopt_sometimes),
ZTI_INIT(ztest_remap_blocks, 1, &zopt_sometimes),
"\t[-F freezeloops (default: %llu)] max loops in spa_freeze()\n"
"\t[-P passtime (default: %llu sec)] time per pass\n"
"\t[-B alt_ztest (default: <none>)] alternate ztest path\n"
+ "\t[-C vdev class state (default: random)] special=on|off|random\n"
"\t[-o variable=value] ... set global variable to an unsigned\n"
"\t 32-bit integer value\n"
"\t[-G dump zfs_dbgmsg buffer before exiting due to an error\n"
exit(requested ? 0 : 1);
}
+
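+/*
+ * Parse a name=value pair from a -C option argument (e.g. "special=on").
+ */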
+static void
+ztest_parse_name_value(const char *input, ztest_shared_opts_t *zo)
+{
+ char name[32];
+ char *value;
+ int state = ZTEST_VDEV_CLASS_RND;
+
+ (void) strlcpy(name, input, sizeof (name));
+
+ value = strchr(name, '=');
+ if (value == NULL) {
+ (void) fprintf(stderr, "missing value in property=value "
+ "'-C' argument (%s)\n", input);
+ usage(B_FALSE);
+ }
+ *(value) = '\0';
+ value++;
+
+ if (strcmp(value, "on") == 0) {
+ state = ZTEST_VDEV_CLASS_ON;
+ } else if (strcmp(value, "off") == 0) {
+ state = ZTEST_VDEV_CLASS_OFF;
+ } else if (strcmp(value, "random") == 0) {
+ state = ZTEST_VDEV_CLASS_RND;
+ } else {
+ (void) fprintf(stderr, "invalid property value '%s'\n", value);
+ usage(B_FALSE);
+ }
+
+ if (strcmp(name, "special") == 0) {
+ zo->zo_special_vdevs = state;
+ } else {
+ (void) fprintf(stderr, "invalid property name '%s'\n", name);
+ usage(B_FALSE);
+ }
+ if (zo->zo_verbose >= 3)
+ (void) printf("%s vdev state is '%s'\n", name, value);
+}
+
static void
process_options(int argc, char **argv)
{
bcopy(&ztest_opts_defaults, zo, sizeof (*zo));
while ((opt = getopt(argc, argv,
- "v:s:a:m:r:R:d:t:g:i:k:p:f:MVET:P:hF:B:o:G")) != EOF) {
+ "v:s:a:m:r:R:d:t:g:i:k:p:f:MVET:P:hF:B:C:o:G")) != EOF) {
value = 0;
switch (opt) {
case 'v':
case 'B':
(void) strlcpy(altdir, optarg, sizeof (altdir));
break;
+ case 'C':
+ ztest_parse_name_value(optarg, zo);
+ break;
case 'o':
if (set_global_var(optarg) != 0)
usage(B_FALSE);
static nvlist_t *
make_vdev_root(char *path, char *aux, char *pool, size_t size, uint64_t ashift,
- int log, int r, int m, int t)
+ const char *class, int r, int m, int t)
{
nvlist_t *root, **child;
int c;
+ boolean_t log;
ASSERT(t > 0);
+ log = (class != NULL && strcmp(class, "log") == 0);
+
child = umem_alloc(t * sizeof (nvlist_t *), UMEM_NOFAIL);
for (c = 0; c < t; c++) {
r, m);
VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_IS_LOG,
log) == 0);
+
+ if (class != NULL && class[0] != '\0') {
+ ASSERT(m > 1 || log); /* expecting a mirror */
+ VERIFY(nvlist_add_string(child[c],
+ ZPOOL_CONFIG_ALLOCATION_BIAS, class) == 0);
+ }
}
VERIFY(nvlist_alloc(&root, NV_UNIQUE_NAME, 0) == 0);
static int
ztest_random_blocksize(void)
{
+ ASSERT(ztest_spa->spa_max_ashift != 0);
+
/*
* Choose a block size >= the ashift.
* If the SPA supports new MAXBLOCKSIZE, test up to 1MB blocks.
/*
* Attempt to create using a bad file.
*/
- nvroot = make_vdev_root("/dev/bogus", NULL, NULL, 0, 0, 0, 0, 0, 1);
+ nvroot = make_vdev_root("/dev/bogus", NULL, NULL, 0, 0, NULL, 0, 0, 1);
VERIFY3U(ENOENT, ==,
spa_create("ztest_bad_file", nvroot, NULL, NULL, NULL));
nvlist_free(nvroot);
/*
* Attempt to create using a bad mirror.
*/
- nvroot = make_vdev_root("/dev/bogus", NULL, NULL, 0, 0, 0, 0, 2, 1);
+ nvroot = make_vdev_root("/dev/bogus", NULL, NULL, 0, 0, NULL, 0, 2, 1);
VERIFY3U(ENOENT, ==,
spa_create("ztest_bad_mirror", nvroot, NULL, NULL, NULL));
nvlist_free(nvroot);
* what's in the nvroot; we should fail with EEXIST.
*/
(void) pthread_rwlock_rdlock(&ztest_name_lock);
- nvroot = make_vdev_root("/dev/bogus", NULL, NULL, 0, 0, 0, 0, 0, 1);
+ nvroot = make_vdev_root("/dev/bogus", NULL, NULL, 0, 0, NULL, 0, 0, 1);
VERIFY3U(EEXIST, ==,
spa_create(zo->zo_pool, nvroot, NULL, NULL, NULL));
nvlist_free(nvroot);
(void) spa_destroy(name);
nvroot = make_vdev_root(NULL, NULL, name, ztest_opts.zo_vdev_size, 0,
- 0, ztest_opts.zo_raidz, ztest_opts.zo_mirrors, 1);
+ NULL, ztest_opts.zo_raidz, ztest_opts.zo_mirrors, 1);
/*
* If we're configuring a RAIDZ device then make sure that the
* If we have slogs then remove them 1/4 of the time.
*/
if (spa_has_slogs(spa) && ztest_random(4) == 0) {
+ metaslab_group_t *mg;
+
/*
- * Grab the guid from the head of the log class rotor.
+ * Find the first slog in the log allocation class.
*/
- guid = spa_log_class(spa)->mc_rotor->mg_vd->vdev_guid;
+ mg = spa_log_class(spa)->mc_rotor;
+ while (!mg->mg_vd->vdev_islog)
+ mg = mg->mg_next;
+
+ guid = mg->mg_vd->vdev_guid;
spa_config_exit(spa, SCL_VDEV, FTAG);
spa_config_exit(spa, SCL_VDEV, FTAG);
/*
* Make 1/4 of the devices be log devices.
*/
nvroot = make_vdev_root(NULL, NULL, NULL,
- ztest_opts.zo_vdev_size, 0,
- ztest_random(4) == 0, ztest_opts.zo_raidz,
- zs->zs_mirrors, 1);
+ ztest_opts.zo_vdev_size, 0, (ztest_random(4) == 0) ?
+ "log" : NULL, ztest_opts.zo_raidz, zs->zs_mirrors, 1);
error = spa_vdev_add(spa, nvroot);
nvlist_free(nvroot);
mutex_exit(&ztest_vdev_lock);
}
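+/*
+ * Verify that a dedicated dedup or special class vdev can be added to
+ * an existing pool of mirrors.
+ */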
+/* ARGSUSED */
+void
+ztest_vdev_class_add(ztest_ds_t *zd, uint64_t id)
+{
+ ztest_shared_t *zs = ztest_shared;
+ spa_t *spa = ztest_spa;
+ uint64_t leaves;
+ nvlist_t *nvroot;
+ const char *class = (ztest_random(2) == 0) ?
+ VDEV_ALLOC_BIAS_SPECIAL : VDEV_ALLOC_BIAS_DEDUP;
+ int error;
+
+ /*
+ * By default add a special vdev 50% of the time
+ */
+ if ((ztest_opts.zo_special_vdevs == ZTEST_VDEV_CLASS_OFF) ||
+ (ztest_opts.zo_special_vdevs == ZTEST_VDEV_CLASS_RND &&
+ ztest_random(2) == 0)) {
+ return;
+ }
+
+ mutex_enter(&ztest_vdev_lock);
+
+ /* Only test with mirrors */
+ if (zs->zs_mirrors < 2) {
+ mutex_exit(&ztest_vdev_lock);
+ return;
+ }
+
+ /* requires feature@allocation_classes */
+ if (!spa_feature_is_enabled(spa, SPA_FEATURE_ALLOCATION_CLASSES)) {
+ mutex_exit(&ztest_vdev_lock);
+ return;
+ }
+
+ leaves = MAX(zs->zs_mirrors + zs->zs_splits, 1) * ztest_opts.zo_raidz;
+
+ spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
+ ztest_shared->zs_vdev_next_leaf = find_vdev_hole(spa) * leaves;
+ spa_config_exit(spa, SCL_VDEV, FTAG);
+
+ nvroot = make_vdev_root(NULL, NULL, NULL, ztest_opts.zo_vdev_size, 0,
+ class, ztest_opts.zo_raidz, zs->zs_mirrors, 1);
+
+ error = spa_vdev_add(spa, nvroot);
+ nvlist_free(nvroot);
+
+ if (error == ENOSPC)
+ ztest_record_enospc("spa_vdev_add");
+ else if (error != 0)
+ fatal(0, "spa_vdev_add() = %d", error);
+
+ /*
+ * 50% of the time allow small blocks in the special class
+ */
+ if (error == 0 &&
+ spa_special_class(spa)->mc_groups == 1 && ztest_random(2) == 0) {
+ if (ztest_opts.zo_verbose >= 3)
+ (void) printf("Enabling special VDEV small blocks\n");
+ (void) ztest_dsl_prop_set_uint64(zd->zd_name,
+ ZFS_PROP_SPECIAL_SMALL_BLOCKS, 32768, B_FALSE);
+ }
+
+ mutex_exit(&ztest_vdev_lock);
+
+ if (ztest_opts.zo_verbose >= 3) {
+ metaslab_class_t *mc;
+
+ if (strcmp(class, VDEV_ALLOC_BIAS_SPECIAL) == 0)
+ mc = spa_special_class(spa);
+ else
+ mc = spa_dedup_class(spa);
+ (void) printf("Added a %s mirrored vdev (of %d)\n",
+ class, (int)mc->mc_groups);
+ }
+}
+
/*
* Verify that adding/removing aux devices (l2arc, hot spare) works as expected.
*/
* Add a new device.
*/
nvlist_t *nvroot = make_vdev_root(NULL, aux, NULL,
- (ztest_opts.zo_vdev_size * 5) / 4, 0, 0, 0, 0, 1);
+ (ztest_opts.zo_vdev_size * 5) / 4, 0, NULL, 0, 0, 1);
error = spa_vdev_add(spa, nvroot);
switch (error) {
* Locate this vdev.
*/
oldvd = rvd->vdev_child[top];
+
+ /* pick a child from the mirror */
if (zs->zs_mirrors >= 1) {
ASSERT(oldvd->vdev_ops == &vdev_mirror_ops);
ASSERT(oldvd->vdev_children >= zs->zs_mirrors);
oldvd = oldvd->vdev_child[leaf / ztest_opts.zo_raidz];
}
+
+ /* pick a child out of the raidz group */
if (ztest_opts.zo_raidz > 1) {
ASSERT(oldvd->vdev_ops == &vdev_raidz_ops);
ASSERT(oldvd->vdev_children == ztest_opts.zo_raidz);
* Build the nvlist describing newpath.
*/
root = make_vdev_root(newpath, NULL, NULL, newvd == NULL ? newsize : 0,
- ashift, 0, 0, 0, 1);
+ ashift, NULL, 0, 0, 1);
error = spa_vdev_attach(spa, oldguid, root, replacing);
return;
}
ASSERT(psize > 0);
- newsize = psize + psize / 8;
+ newsize = psize + MAX(psize / 8, SPA_MAXBLOCKSIZE);
ASSERT3U(newsize, >, psize);
if (ztest_opts.zo_verbose >= 6) {
nvlist_t *props;
VERIFY0(nvlist_alloc(&props, NV_UNIQUE_NAME, 0));
+
if (ztest_random(2) == 0)
return (props);
zs->zs_splits = 0;
zs->zs_mirrors = ztest_opts.zo_mirrors;
nvroot = make_vdev_root(NULL, NULL, NULL, ztest_opts.zo_vdev_size, 0,
- 0, ztest_opts.zo_raidz, zs->zs_mirrors, 1);
+ NULL, ztest_opts.zo_raidz, zs->zs_mirrors, 1);
props = make_random_props();
/*
tests/zfs-tests/tests/functional/snapshot/Makefile
tests/zfs-tests/tests/functional/snapused/Makefile
tests/zfs-tests/tests/functional/sparse/Makefile
+ tests/zfs-tests/tests/functional/alloc_class/Makefile
tests/zfs-tests/tests/functional/threadsappend/Makefile
tests/zfs-tests/tests/functional/tmpfile/Makefile
tests/zfs-tests/tests/functional/truncate/Makefile
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2011, 2017 by Delphix. All rights reserved.
+ * Copyright (c) 2011, 2018 by Delphix. All rights reserved.
* Copyright 2011 Nexenta Systems, Inc. All rights reserved.
* Copyright (c) 2012, Joyent, Inc. All rights reserved.
* Copyright 2014 HybridCluster. All rights reserved.
* Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
* Copyright 2013 Saso Kiselkov. All rights reserved.
+ * Copyright (c) 2017, Intel Corporation.
*/
/* Portions Copyright 2010 Robert Milkowski */
((ot) & DMU_OT_METADATA) : \
DMU_OT_IS_METADATA_IMPL(ot))
+#define DMU_OT_IS_DDT(ot) \
+ ((ot) == DMU_OT_DDT_ZAP)
+
+#define DMU_OT_IS_ZIL(ot) \
+ ((ot) == DMU_OT_INTENT_LOG)
+
+/* Note: ztest uses DMU_OT_UINT64_OTHER as a proxy for file blocks */
+#define DMU_OT_IS_FILE(ot) \
+ ((ot) == DMU_OT_PLAIN_FILE_CONTENTS || (ot) == DMU_OT_UINT64_OTHER)
+
#define DMU_OT_IS_ENCRYPTED(ot) (((ot) & DMU_OT_NEWTYPE) ? \
((ot) & DMU_OT_ENCRYPTED) : \
DMU_OT_IS_ENCRYPTED_IMPL(ot))
* values.
*
* The DMU_OTN_* types do not have entries in the dmu_ot table,
- * use the DMU_OT_IS_METDATA() and DMU_OT_BYTESWAP() macros instead
+ * use the DMU_OT_IS_METADATA() and DMU_OT_BYTESWAP() macros instead
* of indexing into dmu_ot directly (this works for both DMU_OT_* types
* and DMU_OTN_* types).
*/
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012, 2017 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
* Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
* Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
*/
uint64_t os_normalization;
uint64_t os_utf8only;
uint64_t os_casesensitivity;
+ /*
+ * The largest zpl file block allowed in the special class.
+ * Cached here instead of zfsvfs for easier access.
+ */
+ int os_zpl_special_smallblock;
/*
* Pointer is constant; the blkptr it points to is protected by
* Copyright (c) 2013, 2017 Joyent, Inc. All rights reserved.
* Copyright (c) 2014 Integros [integros.com]
* Copyright (c) 2017 Datto Inc.
+ * Copyright (c) 2017, Intel Corporation.
*/
/* Portions Copyright 2010 Robert Milkowski */
ZFS_PROP_KEY_GUID,
ZFS_PROP_KEYSTATUS,
ZFS_PROP_REMAPTXG, /* not exposed to the user */
+ ZFS_PROP_SPECIAL_SMALL_BLOCKS,
ZFS_NUM_PROPS
} zfs_prop_t;
#define ZPOOL_CONFIG_MMP_TXG "mmp_txg" /* not stored on disk */
#define ZPOOL_CONFIG_MMP_HOSTNAME "mmp_hostname" /* not stored on disk */
#define ZPOOL_CONFIG_MMP_HOSTID "mmp_hostid" /* not stored on disk */
+#define ZPOOL_CONFIG_ALLOCATION_BIAS "alloc_bias" /* not stored on disk */
/*
* The persistent vdev state is stored as separate values rather than a single
#define VDEV_TOP_ZAP_POOL_CHECKPOINT_SM \
"com.delphix:pool_checkpoint_sm"
+#define VDEV_TOP_ZAP_ALLOCATION_BIAS \
+ "org.zfsonlinux:allocation_bias"
+
+/* vdev metaslab allocation bias */
+#define VDEV_ALLOC_BIAS_LOG "log"
+#define VDEV_ALLOC_BIAS_SPECIAL "special"
+#define VDEV_ALLOC_BIAS_DEDUP "dedup"
+
/*
* This is needed in userland to report the minimum necessary device size.
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2011, 2018 by Delphix. All rights reserved.
+ * Copyright (c) 2017, Intel Corporation.
*/
#ifndef _SYS_METASLAB_H
void metaslab_sync_reassess(metaslab_group_t *);
uint64_t metaslab_block_maxsize(metaslab_t *);
+/*
+ * metaslab alloc flags
+ */
#define METASLAB_HINTBP_FAVOR 0x0
#define METASLAB_HINTBP_AVOID 0x1
#define METASLAB_GANG_HEADER 0x2
#define METASLAB_GANG_CHILD 0x4
#define METASLAB_ASYNC_ALLOC 0x8
#define METASLAB_DONT_THROTTLE 0x10
-#define METASLAB_FASTWRITE 0x20
+#define METASLAB_MUST_RESERVE 0x20
+#define METASLAB_FASTWRITE 0x40
int metaslab_alloc(spa_t *, metaslab_class_t *, uint64_t,
blkptr_t *, int, uint64_t, blkptr_t *, int, zio_alloc_list_t *, zio_t *,
zio_t *, int);
void metaslab_class_throttle_unreserve(metaslab_class_t *, int, int, zio_t *);
-void metaslab_class_space_update(metaslab_class_t *, int64_t, int64_t,
- int64_t, int64_t);
uint64_t metaslab_class_get_alloc(metaslab_class_t *);
uint64_t metaslab_class_get_space(metaslab_class_t *);
uint64_t metaslab_class_get_dspace(metaslab_class_t *);
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2011, 2014 by Delphix. All rights reserved.
+ * Copyright (c) 2011, 2018 by Delphix. All rights reserved.
* Copyright 2011 Nexenta Systems, Inc. All rights reserved.
* Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
* Copyright 2013 Saso Kiselkov. All rights reserved.
* Copyright (c) 2014 Integros [integros.com]
* Copyright 2017 Joyent, Inc.
* Copyright (c) 2017 Datto Inc.
+ * Copyright (c) 2017, Intel Corporation.
*/
#ifndef _SYS_SPA_H
extern boolean_t spa_deflate(spa_t *spa);
extern metaslab_class_t *spa_normal_class(spa_t *spa);
extern metaslab_class_t *spa_log_class(spa_t *spa);
+extern metaslab_class_t *spa_special_class(spa_t *spa);
+extern metaslab_class_t *spa_dedup_class(spa_t *spa);
+extern metaslab_class_t *spa_preferred_class(spa_t *spa, uint64_t size,
+ dmu_object_type_t objtype, uint_t level, uint_t special_smallblk);
+
extern void spa_evicting_os_register(spa_t *, objset_t *os);
extern void spa_evicting_os_deregister(spa_t *, objset_t *os);
extern void spa_evicting_os_wait(spa_t *spa);
extern boolean_t spa_top_vdevs_spacemap_addressable(spa_t *spa);
extern boolean_t spa_multihost(spa_t *spa);
extern unsigned long spa_get_hostid(void);
+extern void spa_activate_allocation_classes(spa_t *, dmu_tx_t *);
extern int spa_mode(spa_t *spa);
extern uint64_t zfs_strtonum(const char *str, char **nptr);
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2011, 2017 by Delphix. All rights reserved.
+ * Copyright (c) 2011, 2018 by Delphix. All rights reserved.
* Copyright 2011 Nexenta Systems, Inc. All rights reserved.
* Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
* Copyright 2013 Saso Kiselkov. All rights reserved.
* Copyright (c) 2016 Actifio, Inc. All rights reserved.
* Copyright (c) 2017 Datto Inc.
+ * Copyright (c) 2017, Intel Corporation.
*/
#ifndef _SYS_SPA_IMPL_H
boolean_t spa_is_initializing; /* true while opening pool */
metaslab_class_t *spa_normal_class; /* normal data class */
metaslab_class_t *spa_log_class; /* intent log data class */
+ metaslab_class_t *spa_special_class; /* special allocation class */
+ metaslab_class_t *spa_dedup_class; /* dedup allocation class */
uint64_t spa_first_txg; /* first txg after spa_open() */
uint64_t spa_final_txg; /* txg of export/destroy */
uint64_t spa_freeze_txg; /* freeze pool at this txg */
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2011, 2017 by Delphix. All rights reserved.
+ * Copyright (c) 2017, Intel Corporation.
*/
#ifndef _SYS_VDEV_H
extern void vdev_space_update(vdev_t *vd,
int64_t alloc_delta, int64_t defer_delta, int64_t space_delta);
+extern int64_t vdev_deflated_space(vdev_t *vd, int64_t space);
+
extern uint64_t vdev_psize_to_asize(vdev_t *vd, uint64_t psize);
extern int vdev_fault(spa_t *spa, uint64_t guid, vdev_aux_t aux);
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2011, 2018 by Delphix. All rights reserved.
+ * Copyright (c) 2017, Intel Corporation.
*/
#ifndef _SYS_VDEV_IMPL_H
kmutex_t vq_lock;
};
+typedef enum vdev_alloc_bias {
+ VDEV_BIAS_NONE,
+ VDEV_BIAS_LOG, /* dedicated to ZIL data (SLOG) */
+ VDEV_BIAS_SPECIAL, /* dedicated to ddt, metadata, and small blks */
+ VDEV_BIAS_DEDUP /* dedicated to dedup metadata */
+} vdev_alloc_bias_t;
+
/*
* On-disk indirect vdev state.
*
boolean_t vdev_ishole; /* is a hole in the namespace */
kmutex_t vdev_queue_lock; /* protects vdev_queue_depth */
uint64_t vdev_top_zap;
+ vdev_alloc_bias_t vdev_alloc_bias; /* metaslab allocation bias */
/* pool checkpoint related */
space_map_t *vdev_checkpoint_sm; /* contains reserved blocks */
uint8_t zp_salt[ZIO_DATA_SALT_LEN];
uint8_t zp_iv[ZIO_DATA_IV_LEN];
uint8_t zp_mac[ZIO_DATA_MAC_LEN];
+ uint32_t zp_zpl_smallblk; /* zpl block size cap for the special class */
} zio_prop_t;
typedef struct zio_cksum_report zio_cksum_report_t;
vdev_t *io_vd;
void *io_vsd;
const zio_vsd_ops_t *io_vsd_ops;
+ metaslab_class_t *io_metaslab_class; /* dva throttle class */
uint64_t io_offset;
hrtime_t io_timestamp; /* submitted at */
* Copyright (c) 2011, 2017 by Delphix. All rights reserved.
* Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
* Copyright (c) 2013, Joyent, Inc. All rights reserved.
+ * Copyright (c) 2017, Intel Corporation.
*/
#ifndef _ZFEATURE_COMMON_H
SPA_FEATURE_OBSOLETE_COUNTS,
SPA_FEATURE_POOL_CHECKPOINT,
SPA_FEATURE_SPACEMAP_V2,
+ SPA_FEATURE_ALLOCATION_CLASSES,
SPA_FEATURES
} spa_feature_t;
}
break;
}
+
+ case ZFS_PROP_SPECIAL_SMALL_BLOCKS:
+ if (zpool_hdl != NULL) {
+ char state[64] = "";
+
+ /*
+ * Issue a warning but do not fail so that
+ * tests for settable properties succeed.
+ */
+ if (zpool_prop_get_feature(zpool_hdl,
+ "feature@allocation_classes", state,
+ sizeof (state)) != 0 ||
+ strcmp(state, ZFS_FEATURE_ACTIVE) != 0) {
+ (void) fprintf(stderr, gettext(
+ "%s: property requires a special "
+ "device in the pool\n"), propname);
+ }
+ }
+ if (intval != 0 &&
+ (intval < SPA_MINBLOCKSIZE ||
+ intval > SPA_OLD_MAXBLOCKSIZE || !ISP2(intval))) {
+ zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
+ "invalid '%s=%d' property: must be zero or "
+ "a power of 2 from 512B to 128K"), propname,
+ intval);
+ (void) zfs_error(hdl, EZFS_BADPROP, errbuf);
+ goto error;
+ }
+ break;
+
case ZFS_PROP_MLSLABEL:
{
#ifdef HAVE_MLSLABEL
* Copyright 2016 Igor Kozhukhov <ikozhukhov@gmail.com>
* Copyright (c) 2018 Datto Inc.
* Copyright (c) 2017 Open-E, Inc. All Rights Reserved.
+ * Copyright (c) 2017, Intel Corporation.
*/
#include <ctype.h>
goto error;
}
break;
+
default:
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
"property '%s'(%d) not defined"), propname, prop);
return (zhp->zpool_state);
}
+/*
+ * Check if vdev list contains a special vdev
+ */
+static boolean_t
+zpool_has_special_vdev(nvlist_t *nvroot)
+{
+ nvlist_t **child;
+ uint_t children;
+
+ if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, &child,
+ &children) == 0) {
+ for (uint_t c = 0; c < children; c++) {
+ char *bias;
+
+ if (nvlist_lookup_string(child[c],
+ ZPOOL_CONFIG_ALLOCATION_BIAS, &bias) == 0 &&
+ strcmp(bias, VDEV_ALLOC_BIAS_SPECIAL) == 0) {
+ return (B_TRUE);
+ }
+ }
+ }
+ return (B_FALSE);
+}
+
/*
* Create the named pool, using the provided vdev list. It is assumed
* that the consumer has already validated the contents of the nvlist, so we
fsprops, zoned, NULL, NULL, B_TRUE, msg)) == NULL) {
goto create_failed;
}
+
+ if (nvlist_exists(zc_fsprops,
+ zfs_prop_to_name(ZFS_PROP_SPECIAL_SMALL_BLOCKS)) &&
+ !zpool_has_special_vdev(nvroot)) {
+ zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
+ "%s property requires a special vdev"),
+ zfs_prop_to_name(ZFS_PROP_SPECIAL_SMALL_BLOCKS));
+ (void) zfs_error(hdl, EZFS_BADPROP, msg);
+ goto create_failed;
+ }
+
if (!zc_props &&
(nvlist_alloc(&zc_props, NV_UNIQUE_NAME, 0) != 0)) {
goto create_failed;
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2016 by Delphix. All rights reserved.
* Copyright 2017 Jason King
+ * Copyright (c) 2017, Intel Corporation.
*/
#include <assert.h>
uint_t c, children;
char used[6], avail[6];
char rops[6], wops[6], rbytes[6], wbytes[6], rerr[6], werr[6], cerr[6];
- char *prefix = "";
v0 = umem_zalloc(sizeof (*v0), UMEM_NOFAIL);
}
if (desc != NULL) {
- (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_IS_LOG, &is_log);
-
- if (is_log)
- prefix = "log ";
+ char *suffix = "", *bias = NULL;
+ char bias_suffix[32];
+ (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_IS_LOG, &is_log);
+ (void) nvlist_lookup_string(nv, ZPOOL_CONFIG_ALLOCATION_BIAS,
+ &bias);
if (nvlist_lookup_uint64_array(nv, ZPOOL_CONFIG_VDEV_STATS,
(uint64_t **)&vs, &c) != 0)
vs = v0;
+ if (bias != NULL) {
+ (void) snprintf(bias_suffix, sizeof (bias_suffix),
+ " (%s)", bias);
+ suffix = bias_suffix;
+ } else if (is_log) {
+ suffix = " (log)";
+ }
+
sec = MAX(1, vs->vs_timestamp / NANOSEC);
nicenum(vs->vs_alloc, used, sizeof (used));
(void) printf("%*s%s%*s%*s%*s %5s %5s %5s %5s %5s %5s %5s\n",
indent, "",
- prefix,
- (int)(indent+strlen(prefix)-25-(vs->vs_space ? 0 : 12)),
desc,
+ (int)(indent+strlen(desc)-25-(vs->vs_space ? 0 : 12)),
+ suffix,
vs->vs_space ? 6 : 0, vs->vs_space ? used : "",
vs->vs_space ? 6 : 0, vs->vs_space ? avail : "",
rops, wops, rbytes, wbytes, rerr, werr, cerr);
Default value: \fB0\fR.
.RE
+.sp
+.ne 2
+.na
+\fBzfs_ddt_data_is_special\fR (int)
+.ad
+.RS 12n
+If enabled, ZFS will place DDT data into the special allocation class.
+.sp
+Default value: \fB1\fR.
+.RE
+
+.sp
+.ne 2
+.na
+\fBzfs_user_indirect_is_special\fR (int)
+.ad
+.RS 12n
+If enabled, ZFS will place user data (both file and zvol) indirect blocks
+into the special allocation class.
+.sp
+Default value: \fB1\fR.
+.RE
+
.sp
.ne 2
.na
.RE
+.sp
+.ne 2
+.na
+\fB\fBallocation_classes\fR\fR
+.ad
+.RS 4n
+.TS
+l l .
+GUID com.intel:allocation_classes
+READ\-ONLY COMPATIBLE yes
+DEPENDENCIES none
+.TE
+
+This feature enables support for separate allocation classes.
+
+This feature becomes \fBactive\fR when a dedicated allocation class vdev
+(dedup or special) is created with the \fBzpool create\fR or \fBzpool add\fR
+subcommands. With device removal, it can be returned to the \fBenabled\fR
+state if all the top-level vdevs from an allocation class are removed.
+
+.RE
+
.SH "SEE ALSO"
\fBzpool\fR(8)
.Po see
.Xr zpool-features 5
.Pc .
+.It Sy special_small_blocks Ns = Ns Em size
+This value represents the threshold block size for including small file
+blocks into the special allocation class. Valid values are zero or a
+power of two from 512B up to 128K. The default size is 0, which means no
+small file blocks will be allocated in the special class.
+.Pp
+Before setting this property, a special class vdev must be added to the
+pool. See
+.Xr zpool 8
+for more details on the special allocation class.
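+.Pp
+For example, assuming a pool named
+.Em tank
+with a special vdev already present, a dataset can opt in 16K and smaller
+file blocks as follows (pool and dataset names are illustrative):
+.Bd -literal
+# zfs set special_small_blocks=16K tank/data
+.Ed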
.It Sy mountpoint Ns = Ns Pa path Ns | Ns Sy none Ns | Ns Sy legacy
Controls the mount point used for this file system.
See the
For more information, see the
.Sx Intent Log
section.
+.It Sy dedup
+A device dedicated solely for allocating dedup data.
+The redundancy of this device should match the redundancy of the other normal
+devices in the pool. If more than one dedup device is specified, then
+allocations are load-balanced between devices.
+.It Sy special
+A device dedicated solely for allocating various kinds of internal metadata,
+and optionally small file data.
+The redundancy of this device should match the redundancy of the other normal
+devices in the pool. If more than one special device is specified, then
+allocations are load-balanced between devices.
+.Pp
+For more information on special allocations, see the
+.Sx Special Allocation Class
+section.
.It Sy cache
A device used to cache storage pool data.
A cache device cannot be configured as a mirror or raidz group.
checkpoint is allowed to consume the dataset's reservation.
Finally, data that is part of the checkpoint but has been freed in the
current state of the pool won't be scanned during a scrub.
+.Ss Special Allocation Class
+The allocations in the special class are dedicated to specific block types.
+By default this includes all metadata, the indirect blocks of user data, and
+any dedup data. The class can also be provisioned to accept a limited
+percentage of small file data blocks.
+.Pp
+A pool must always have at least one general (non-special) vdev before
+other devices can be assigned to the special class. If the special class
+becomes full, then allocations intended for it will spill back into the
+normal class.
+.Pp
+Dedup data can be excluded from the special class by setting the
+.Sy zfs_ddt_data_is_special
+zfs module parameter to false (0).
+.Pp
+Inclusion of small file blocks in the special class is opt-in. Each dataset
+can control the size of small file blocks allowed in the special class by
+setting the
+.Sy special_small_blocks
+dataset property. It defaults to zero, so you must opt in by setting it to a
+non-zero value. See
+.Xr zfs 8
+for more info on setting this property.
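+.Pp
+As a sketch, a pool with a mirrored special class vdev alongside a raidz
+data vdev could be created as follows (device names are illustrative):
+.Bd -literal
+# zpool create tank raidz sda sdb sdc special mirror sdd sde
+.Ed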
.Ss Properties
Each pool has several properties associated with it.
Some properties are read-only statistics while others are configurable and
*/
/*
- * Copyright (c) 2011, 2017 by Delphix. All rights reserved.
+ * Copyright (c) 2011, 2018 by Delphix. All rights reserved.
* Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
* Copyright (c) 2013, Joyent, Inc. All rights reserved.
* Copyright (c) 2014, Nexenta Systems, Inc. All rights reserved.
+ * Copyright (c) 2017, Intel Corporation.
*/
#ifndef _KERNEL
ZFEATURE_FLAG_READONLY_COMPAT | ZFEATURE_FLAG_PER_DATASET,
project_quota_deps);
}
+
+ {
+ zfeature_register(SPA_FEATURE_ALLOCATION_CLASSES,
+ "org.zfsonlinux:allocation_classes", "allocation_classes",
+ "Support for separate allocation classes.",
+ ZFEATURE_FLAG_READONLY_COMPAT, NULL);
+ }
}
#if defined(_KERNEL)
zprop_register_number(ZFS_PROP_RECORDSIZE, "recordsize",
SPA_OLD_MAXBLOCKSIZE, PROP_INHERIT,
ZFS_TYPE_FILESYSTEM, "512 to 1M, power of 2", "RECSIZE");
+ zprop_register_number(ZFS_PROP_SPECIAL_SMALL_BLOCKS,
+ "special_small_blocks", 0, PROP_INHERIT, ZFS_TYPE_FILESYSTEM,
+ "zero or 512 to 128K, power of 2", "SPECIAL_SMALL_BLOCKS");
/* hidden properties */
zprop_register_hidden(ZFS_PROP_NUMCLONES, "numclones", PROP_TYPE_NUMBER,
bzero(zp->zp_salt, ZIO_DATA_SALT_LEN);
bzero(zp->zp_iv, ZIO_DATA_IV_LEN);
bzero(zp->zp_mac, ZIO_DATA_MAC_LEN);
+ zp->zp_zpl_smallblk = DMU_OT_IS_FILE(zp->zp_type) ?
+ os->os_zpl_special_smallblock : 0;
ASSERT3U(zp->zp_compress, !=, ZIO_COMPRESS_INHERIT);
}
}
}
+static void
+smallblk_changed_cb(void *arg, uint64_t newval)
+{
+ objset_t *os = arg;
+
+ /*
+ * Inheritance and range checking should have been done by now.
+ */
+ ASSERT(newval <= SPA_OLD_MAXBLOCKSIZE);
+ ASSERT(ISP2(newval));
+
+ os->os_zpl_special_smallblock = newval;
+}
+
static void
logbias_changed_cb(void *arg, uint64_t newval)
{
zfs_prop_to_name(ZFS_PROP_DNODESIZE),
dnodesize_changed_cb, os);
}
+ if (err == 0) {
+ err = dsl_prop_register(ds,
+ zfs_prop_to_name(
+ ZFS_PROP_SPECIAL_SMALL_BLOCKS),
+ smallblk_changed_cb, os);
+ }
}
if (needlock)
dsl_pool_config_exit(dmu_objset_pool(os), FTAG);
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2011, 2015 by Delphix. All rights reserved.
+ * Copyright (c) 2011, 2018 by Delphix. All rights reserved.
* Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
+ * Copyright (c) 2017, Intel Corporation.
*/
#include <sys/zfs_context.h>
return (0);
}
-void
+static void
metaslab_class_space_update(metaslab_class_t *mc, int64_t alloc_delta,
int64_t defer_delta, int64_t space_delta, int64_t dspace_delta)
{
void
metaslab_class_histogram_verify(metaslab_class_t *mc)
{
- vdev_t *rvd = mc->mc_spa->spa_root_vdev;
+ spa_t *spa = mc->mc_spa;
+ vdev_t *rvd = spa->spa_root_vdev;
uint64_t *mc_hist;
int i;
for (int m = 0; m < vd->vdev_ms_count; m++) {
metaslab_t *msp = vd->vdev_ms[m];
- if (msp->ms_sm == NULL)
+ /* skip if not active or not a member */
+ if (msp->ms_sm == NULL || msp->ms_group != mg)
continue;
for (i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++)
if (msp->ms_fragmentation == ZFS_FRAG_INVALID)
continue;
+ if (msp->ms_group != mg)
+ continue;
valid_ms++;
fragmentation += msp->ms_fragmentation;
}
- if (valid_ms <= vd->vdev_ms_count / 2)
+ if (valid_ms <= mg->mg_vd->vdev_ms_count / 2)
return (ZFS_FRAG_INVALID);
fragmentation /= valid_ms;
* groups to select from. Otherwise, we always consider it eligible
* for allocations.
*/
- if (mc != spa_normal_class(spa) || mc->mc_groups <= 1)
+ if ((mc != spa_normal_class(spa) &&
+ mc != spa_special_class(spa) &&
+ mc != spa_dedup_class(spa)) ||
+ mc->mc_groups <= 1)
return (B_TRUE);
/*
msp->ms_max_size = 0;
}
+static void
+metaslab_space_update(vdev_t *vd, metaslab_class_t *mc, int64_t alloc_delta,
+ int64_t defer_delta, int64_t space_delta)
+{
+ vdev_space_update(vd, alloc_delta, defer_delta, space_delta);
+
+ ASSERT3P(vd->vdev_spa->spa_root_vdev, ==, vd->vdev_parent);
+ ASSERT(vd->vdev_ms_count != 0);
+
+ metaslab_class_space_update(mc, alloc_delta, defer_delta, space_delta,
+ vdev_deflated_space(vd, space_delta));
+}
+
int
metaslab_init(metaslab_group_t *mg, uint64_t id, uint64_t object, uint64_t txg,
metaslab_t **msp)
{
vdev_t *vd = mg->mg_vd;
- objset_t *mos = vd->vdev_spa->spa_meta_objset;
+ spa_t *spa = vd->vdev_spa;
+ objset_t *mos = spa->spa_meta_objset;
metaslab_t *ms;
int error;
/*
* If metaslab_debug_load is set and we're initializing a metaslab
- * that has an allocated space map object then load the its space
- * map so that can verify frees.
+ * that has an allocated space map object then load the space map
+ * so that we can verify frees.
*/
if (metaslab_debug_load && ms->ms_sm != NULL) {
mutex_enter(&ms->ms_lock);
metaslab_fini(metaslab_t *msp)
{
metaslab_group_t *mg = msp->ms_group;
+ vdev_t *vd = mg->mg_vd;
metaslab_group_remove(mg, msp);
mutex_enter(&msp->ms_lock);
VERIFY(msp->ms_group == NULL);
- vdev_space_update(mg->mg_vd, -space_map_allocated(msp->ms_sm),
- 0, -msp->ms_size);
+ metaslab_space_update(vd, mg->mg_class,
+ -space_map_allocated(msp->ms_sm), 0, -msp->ms_size);
+
space_map_close(msp->ms_sm);
metaslab_unload(msp);
+
range_tree_destroy(msp->ms_allocatable);
range_tree_destroy(msp->ms_freeing);
range_tree_destroy(msp->ms_freed);
ASSERT3P(msp->ms_checkpointing, ==, NULL);
msp->ms_checkpointing = range_tree_create(NULL, NULL);
- vdev_space_update(vd, 0, 0, msp->ms_size);
+ metaslab_space_update(vd, mg->mg_class, 0, 0, msp->ms_size);
}
ASSERT0(range_tree_space(msp->ms_freeing));
ASSERT0(range_tree_space(msp->ms_checkpointing));
defer_delta -= range_tree_space(*defer_tree);
}
- vdev_space_update(vd, alloc_delta + defer_delta, defer_delta, 0);
+ metaslab_space_update(vd, mg->mg_class, alloc_delta + defer_delta,
+ defer_delta, 0);
/*
* If there's a metaslab_load() in progress, wait for it to complete
spa_config_exit(spa, SCL_ALLOC, FTAG);
}
-static uint64_t
-metaslab_distance(metaslab_t *msp, dva_t *dva)
+/*
+ * When writing a ditto block (i.e. more than one DVA for a given BP) on
+ * the same vdev as an existing DVA of this BP, try to allocate it on
+ * a different metaslab from the existing DVAs (i.e. a unique metaslab).
+ */
+static boolean_t
+metaslab_is_unique(metaslab_t *msp, dva_t *dva)
{
- uint64_t ms_shift = msp->ms_group->mg_vd->vdev_ms_shift;
- uint64_t offset = DVA_GET_OFFSET(dva) >> ms_shift;
- uint64_t start = msp->ms_id;
+ uint64_t dva_ms_id;
+
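+ /* an empty DVA slot cannot conflict, so treat it as unique */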
+ if (DVA_GET_ASIZE(dva) == 0)
+ return (B_TRUE);
if (msp->ms_group->mg_vd->vdev_id != DVA_GET_VDEV(dva))
- return (1ULL << 63);
+ return (B_TRUE);
- if (offset < start)
- return ((start - offset) << ms_shift);
- if (offset > start)
- return ((offset - start) << ms_shift);
- return (0);
+ dva_ms_id = DVA_GET_OFFSET(dva) >> msp->ms_group->mg_vd->vdev_ms_shift;
+
+ return (msp->ms_id != dva_ms_id);
}
/*
*/
static metaslab_t *
find_valid_metaslab(metaslab_group_t *mg, uint64_t activation_weight,
- dva_t *dva, int d, uint64_t min_distance, uint64_t asize, int allocator,
+ dva_t *dva, int d, boolean_t want_unique, uint64_t asize, int allocator,
zio_alloc_list_t *zal, metaslab_t *search, boolean_t *was_active)
{
avl_index_t idx;
if (activation_weight == METASLAB_WEIGHT_PRIMARY || *was_active)
break;
- uint64_t target_distance = min_distance
- + (space_map_allocated(msp->ms_sm) != 0 ? 0 :
- min_distance >> 1);
-
for (i = 0; i < d; i++) {
- if (metaslab_distance(msp, &dva[i]) < target_distance)
- break;
+ if (want_unique &&
+ !metaslab_is_unique(msp, &dva[i]))
+ break; /* try another metaslab */
}
if (i == d)
break;
/* ARGSUSED */
static uint64_t
metaslab_group_alloc_normal(metaslab_group_t *mg, zio_alloc_list_t *zal,
- uint64_t asize, uint64_t txg, uint64_t min_distance, dva_t *dva, int d,
- int allocator)
+ uint64_t asize, uint64_t txg, boolean_t want_unique, dva_t *dva,
+ int d, int allocator)
{
metaslab_t *msp = NULL;
uint64_t offset = -1ULL;
was_active = B_TRUE;
} else {
msp = find_valid_metaslab(mg, activation_weight, dva, d,
- min_distance, asize, allocator, zal, search,
+ want_unique, asize, allocator, zal, search,
&was_active);
}
* metaslab.
*/
ASSERT(!metaslab_should_allocate(msp, asize));
+
mutex_exit(&msp->ms_lock);
}
mutex_exit(&msp->ms_lock);
static uint64_t
metaslab_group_alloc(metaslab_group_t *mg, zio_alloc_list_t *zal,
- uint64_t asize, uint64_t txg, uint64_t min_distance, dva_t *dva, int d,
- int allocator)
+ uint64_t asize, uint64_t txg, boolean_t want_unique, dva_t *dva,
+ int d, int allocator)
{
uint64_t offset;
ASSERT(mg->mg_initialized);
- offset = metaslab_group_alloc_normal(mg, zal, asize, txg,
- min_distance, dva, d, allocator);
+ offset = metaslab_group_alloc_normal(mg, zal, asize, txg, want_unique,
+ dva, d, allocator);
mutex_enter(&mg->mg_lock);
if (offset == -1ULL) {
return (offset);
}
-/*
- * If we have to write a ditto block (i.e. more than one DVA for a given BP)
- * on the same vdev as an existing DVA of this BP, then try to allocate it
- * at least (vdev_asize / (2 ^ ditto_same_vdev_distance_shift)) away from the
- * existing DVAs.
- */
-int ditto_same_vdev_distance_shift = 3;
-
/*
* Allocate a block for the specified i/o.
*/
/*
* For testing, make some blocks above a certain size be gang blocks.
+ * This will also test spilling from special to normal.
*/
if (psize >= metaslab_force_ganging && (ddi_get_lbolt() & 3) == 0) {
metaslab_trace_add(zal, NULL, NULL, psize, d, TRACE_FORCE_GANG,
} while ((fast_mg = fast_mg->mg_next) != mc->mc_rotor);
} else {
+ ASSERT(mc->mc_rotor != NULL);
mg = mc->mc_rotor;
}
ASSERT(mg->mg_class == mc);
- /*
- * If we don't need to try hard, then require that the
- * block be 1/8th of the device away from any other DVAs
- * in this BP. If we are trying hard, allow any offset
- * to be used (distance=0).
- */
- uint64_t distance = 0;
- if (!try_hard) {
- distance = vd->vdev_asize >>
- ditto_same_vdev_distance_shift;
- if (distance <= (1ULL << vd->vdev_ms_shift))
- distance = 0;
- }
-
uint64_t asize = vdev_psize_to_asize(vd, psize);
ASSERT(P2PHASE(asize, 1ULL << vd->vdev_ashift) == 0);
+ /*
+ * If we don't need to try hard, then require that the
+ * block be on a different metaslab from any other DVAs
+ * in this BP (unique=true). If we are trying hard, then
+ * allow any metaslab to be used (unique=false).
+ */
uint64_t offset = metaslab_group_alloc(mg, zal, asize, txg,
- distance, dva, d, allocator);
+ !try_hard, dva, d, allocator);
if (offset != -1ULL) {
/*
if (reserved_slots < max)
available_slots = max - reserved_slots;
- if (slots <= available_slots || GANG_ALLOCATION(flags)) {
+ if (slots <= available_slots || GANG_ALLOCATION(flags) ||
+ flags & METASLAB_MUST_RESERVE) {
/*
* We reserve the slots individually so that we can unreserve
* them individually when an I/O completes.
spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER);
- for (int d = 0; d < ndvas; d++)
- if ((error = metaslab_claim_dva(spa, &dva[d], txg)) != 0)
+ for (int d = 0; d < ndvas; d++) {
+ error = metaslab_claim_dva(spa, &dva[d], txg);
+ if (error != 0)
break;
+ }
spa_config_exit(spa, SCL_ALLOC, FTAG);
}
#if defined(_KERNEL)
-/* CSTYLED */
+/* BEGIN CSTYLED */
module_param(metaslab_aliquot, ulong, 0644);
MODULE_PARM_DESC(metaslab_aliquot,
"allocation granularity (a.k.a. stripe size)");
MODULE_PARM_DESC(zfs_metaslab_switch_threshold,
"segment-based metaslab selection maximum buckets before switching");
-/* CSTYLED */
module_param(metaslab_force_ganging, ulong, 0644);
MODULE_PARM_DESC(metaslab_force_ganging,
"blocks larger than this size are forced to be gang blocks");
+/* END CSTYLED */
+
#endif
* Copyright (c) 2016 Actifio, Inc. All rights reserved.
* Copyright (c) 2017 Datto Inc.
* Copyright 2017 Joyent, Inc.
+ * Copyright (c) 2017, Intel Corporation.
*/
/*
ASSERT(MUTEX_HELD(&spa->spa_props_lock));
if (rvd != NULL) {
- alloc = metaslab_class_get_alloc(spa_normal_class(spa));
- size = metaslab_class_get_space(spa_normal_class(spa));
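+ /* include the special and dedup classes in the pool space totals */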
+ alloc = metaslab_class_get_alloc(mc);
+ alloc += metaslab_class_get_alloc(spa_special_class(spa));
+ alloc += metaslab_class_get_alloc(spa_dedup_class(spa));
+
+ size = metaslab_class_get_space(mc);
+ size += metaslab_class_get_space(spa_special_class(spa));
+ size += metaslab_class_get_space(spa_dedup_class(spa));
+
spa_prop_add_list(*nvp, ZPOOL_PROP_NAME, spa_name(spa), 0, src);
spa_prop_add_list(*nvp, ZPOOL_PROP_SIZE, NULL, size, src);
spa_prop_add_list(*nvp, ZPOOL_PROP_ALLOCATED, NULL, alloc, src);
spa->spa_normal_class = metaslab_class_create(spa, zfs_metaslab_ops);
spa->spa_log_class = metaslab_class_create(spa, zfs_metaslab_ops);
+ spa->spa_special_class = metaslab_class_create(spa, zfs_metaslab_ops);
+ spa->spa_dedup_class = metaslab_class_create(spa, zfs_metaslab_ops);
/* Try to create a covering process */
mutex_enter(&spa->spa_proc_lock);
metaslab_class_destroy(spa->spa_log_class);
spa->spa_log_class = NULL;
+ metaslab_class_destroy(spa->spa_special_class);
+ spa->spa_special_class = NULL;
+
+ metaslab_class_destroy(spa->spa_dedup_class);
+ spa->spa_dedup_class = NULL;
+
/*
* If this was part of an import or the open otherwise failed, we may
* still have errors left in the queues. Empty them just in case.
char *poolname;
nvlist_t *nvl;
- if (nvlist_lookup_string(props, "tname", &poolname) != 0)
+ if (props == NULL ||
+ nvlist_lookup_string(props, "tname", &poolname) != 0)
poolname = (char *)pool;
/*
(error = vdev_create(rvd, txg, B_FALSE)) == 0 &&
(error = spa_validate_aux(spa, nvroot, txg,
VDEV_ALLOC_ADD)) == 0) {
- for (int c = 0; c < rvd->vdev_children; c++) {
- vdev_metaslab_set_size(rvd->vdev_child[c]);
- vdev_expand(rvd->vdev_child[c], txg);
+ /*
+ * Instantiate the metaslab groups (this will dirty the vdevs);
+ * we can no longer error exit past this point.
+ */
+ for (int c = 0; error == 0 && c < rvd->vdev_children; c++) {
+ vdev_t *vd = rvd->vdev_child[c];
+
+ vdev_metaslab_set_size(vd);
+ vdev_expand(vd, txg);
}
}
mutex_enter(&spa_namespace_lock);
old_space = metaslab_class_get_space(spa_normal_class(spa));
+ old_space += metaslab_class_get_space(spa_special_class(spa));
+ old_space += metaslab_class_get_space(spa_dedup_class(spa));
+
spa_config_update(spa, SPA_CONFIG_UPDATE_POOL);
+
new_space = metaslab_class_get_space(spa_normal_class(spa));
+ new_space += metaslab_class_get_space(spa_special_class(spa));
+ new_space += metaslab_class_get_space(spa_dedup_class(spa));
mutex_exit(&spa_namespace_lock);
/*
dsl_pool_t *dp = spa->spa_dsl_pool;
objset_t *mos = spa->spa_meta_objset;
bplist_t *free_bpl = &spa->spa_free_bplist[txg & TXG_MASK];
+ metaslab_class_t *normal = spa_normal_class(spa);
+ metaslab_class_t *special = spa_special_class(spa);
+ metaslab_class_t *dedup = spa_dedup_class(spa);
vdev_t *rvd = spa->spa_root_vdev;
vdev_t *vd;
dmu_tx_t *tx;
for (int c = 0; c < rvd->vdev_children; c++) {
vdev_t *tvd = rvd->vdev_child[c];
metaslab_group_t *mg = tvd->vdev_mg;
+ metaslab_class_t *mc;
+
+ if (mg == NULL || !metaslab_group_initialized(mg))
+ continue;
- if (mg == NULL || mg->mg_class != spa_normal_class(spa) ||
- !metaslab_group_initialized(mg))
+ mc = mg->mg_class;
+ if (mc != normal && mc != special && mc != dedup)
continue;
/*
}
slots_per_allocator += zfs_vdev_def_queue_depth;
}
- metaslab_class_t *mc = spa_normal_class(spa);
+
for (int i = 0; i < spa->spa_alloc_count; i++) {
- ASSERT0(refcount_count(&mc->mc_alloc_slots[i]));
- mc->mc_alloc_max_slots[i] = slots_per_allocator;
- }
- mc->mc_alloc_throttle_enabled = zio_dva_throttle_enabled;
+ ASSERT0(refcount_count(&normal->mc_alloc_slots[i]));
+ ASSERT0(refcount_count(&special->mc_alloc_slots[i]));
+ ASSERT0(refcount_count(&dedup->mc_alloc_slots[i]));
+ normal->mc_alloc_max_slots[i] = slots_per_allocator;
+ special->mc_alloc_max_slots[i] = slots_per_allocator;
+ dedup->mc_alloc_max_slots[i] = slots_per_allocator;
+ }
+ normal->mc_alloc_throttle_enabled = zio_dva_throttle_enabled;
+ special->mc_alloc_throttle_enabled = zio_dva_throttle_enabled;
+ dedup->mc_alloc_throttle_enabled = zio_dva_throttle_enabled;
for (int c = 0; c < rvd->vdev_children; c++) {
vdev_t *vd = rvd->vdev_child[c];
* Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
* Copyright 2013 Saso Kiselkov. All rights reserved.
* Copyright (c) 2017 Datto Inc.
+ * Copyright (c) 2017, Intel Corporation.
*/
#include <sys/zfs_context.h>
spa->spa_trust_config ? "trusted" : "untrusted", buf);
}
+/*
+ * By default, dedup and user data indirect blocks land in the special class.
+ */
+int zfs_ddt_data_is_special = B_TRUE;
+int zfs_user_indirect_is_special = B_TRUE;
+
+/*
+ * The percentage of special class space reserved exclusively for metadata.
+ * Once (100 - zfs_special_class_metadata_reserve_pct) percent of the class
+ * is allocated, only metadata is let into the class.
+ */
+int zfs_special_class_metadata_reserve_pct = 25;
+
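+/*
+ * For example, with 100G of special class space and the default 25%
+ * reserve, small file blocks are admitted while allocated space is
+ * under 75G; beyond that, only metadata is placed in the class.
+ */
+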
/*
* ==========================================================================
* SPA config locking
*/
ASSERT(metaslab_class_validate(spa_normal_class(spa)) == 0);
ASSERT(metaslab_class_validate(spa_log_class(spa)) == 0);
+ ASSERT(metaslab_class_validate(spa_special_class(spa)) == 0);
+ ASSERT(metaslab_class_validate(spa_dedup_class(spa)) == 0);
spa_config_exit(spa, SCL_ALL, spa);
return (val);
}
+void
+spa_activate_allocation_classes(spa_t *spa, dmu_tx_t *tx)
+{
+ /*
+ * We bump the feature refcount for each special vdev added to the pool
+ */
+ ASSERT(spa_feature_is_enabled(spa, SPA_FEATURE_ALLOCATION_CLASSES));
+ spa_feature_incr(spa, SPA_FEATURE_ALLOCATION_CLASSES, tx);
+}
+
/*
* ==========================================================================
* Accessor functions
return (spa->spa_log_class);
}
+metaslab_class_t *
+spa_special_class(spa_t *spa)
+{
+ return (spa->spa_special_class);
+}
+
+metaslab_class_t *
+spa_dedup_class(spa_t *spa)
+{
+ return (spa->spa_dedup_class);
+}
+
+/*
+ * Locate an appropriate allocation class
+ */
+metaslab_class_t *
+spa_preferred_class(spa_t *spa, uint64_t size, dmu_object_type_t objtype,
+ uint_t level, uint_t special_smallblk)
+{
+ if (DMU_OT_IS_ZIL(objtype)) {
+ if (spa->spa_log_class->mc_groups != 0)
+ return (spa_log_class(spa));
+ else
+ return (spa_normal_class(spa));
+ }
+
+ boolean_t has_special_class = spa->spa_special_class->mc_groups != 0;
+
+ if (DMU_OT_IS_DDT(objtype)) {
+ if (spa->spa_dedup_class->mc_groups != 0)
+ return (spa_dedup_class(spa));
+ else if (has_special_class && zfs_ddt_data_is_special)
+ return (spa_special_class(spa));
+ else
+ return (spa_normal_class(spa));
+ }
+
+ /* Indirect blocks for user data can land in special if allowed */
+ if (level > 0 && (DMU_OT_IS_FILE(objtype) || objtype == DMU_OT_ZVOL)) {
+ if (has_special_class && zfs_user_indirect_is_special)
+ return (spa_special_class(spa));
+ else
+ return (spa_normal_class(spa));
+ }
+
+ if (DMU_OT_IS_METADATA(objtype) || level > 0) {
+ if (has_special_class)
+ return (spa_special_class(spa));
+ else
+ return (spa_normal_class(spa));
+ }
+
+ /*
+ * Allow small file blocks in the special class in some cases (e.g.
+ * for the dRAID vdev feature), but always leave a reserve of
+ * zfs_special_class_metadata_reserve_pct exclusively for metadata.
+ */
+ if (DMU_OT_IS_FILE(objtype) &&
+ has_special_class && size < special_smallblk) {
+ metaslab_class_t *special = spa_special_class(spa);
+ uint64_t alloc = metaslab_class_get_alloc(special);
+ uint64_t space = metaslab_class_get_space(special);
+ uint64_t limit =
+ (space * (100 - zfs_special_class_metadata_reserve_pct))
+ / 100;
+
+ if (alloc < limit)
+ return (special);
+ }
+
+ return (spa_normal_class(spa));
+}
+
void
spa_evicting_os_register(spa_t *spa, objset_t *os)
{
EXPORT_SYMBOL(spa_deflate);
EXPORT_SYMBOL(spa_normal_class);
EXPORT_SYMBOL(spa_log_class);
+EXPORT_SYMBOL(spa_special_class);
+EXPORT_SYMBOL(spa_preferred_class);
EXPORT_SYMBOL(spa_max_replication);
EXPORT_SYMBOL(spa_prev_software_version);
EXPORT_SYMBOL(spa_get_failmode);
module_param(spa_slop_shift, int, 0644);
MODULE_PARM_DESC(spa_slop_shift, "Reserved free space in pool");
+
+module_param(zfs_ddt_data_is_special, int, 0644);
+MODULE_PARM_DESC(zfs_ddt_data_is_special,
+ "Place DDT data into the special class");
+
+module_param(zfs_user_indirect_is_special, int, 0644);
+MODULE_PARM_DESC(zfs_user_indirect_is_special,
+ "Place user data indirect blocks into the special class");
/* END CSTYLED */
#endif
* Copyright (c) 2014 Integros [integros.com]
* Copyright 2016 Toomas Soome <tsoome@me.com>
* Copyright 2017 Joyent, Inc.
+ * Copyright (c) 2017, Intel Corporation.
*/
#include <sys/zfs_context.h>
return (ops);
}
+/*
+ * Derive the enumerated allocation bias from string input.
+ * String origin is either the per-vdev zap or zpool(1M).
+ */
+static vdev_alloc_bias_t
+vdev_derive_alloc_bias(const char *bias)
+{
+ vdev_alloc_bias_t alloc_bias = VDEV_BIAS_NONE;
+
+ if (strcmp(bias, VDEV_ALLOC_BIAS_LOG) == 0)
+ alloc_bias = VDEV_BIAS_LOG;
+ else if (strcmp(bias, VDEV_ALLOC_BIAS_SPECIAL) == 0)
+ alloc_bias = VDEV_BIAS_SPECIAL;
+ else if (strcmp(bias, VDEV_ALLOC_BIAS_DEDUP) == 0)
+ alloc_bias = VDEV_BIAS_DEDUP;
+
+ return (alloc_bias);
+}
+
/*
* Default asize function: return the MAX of psize with the asize of
* all children. This is what's used by anything other than RAID-Z.
vdev_indirect_config_t *vic;
char *tmp = NULL;
int rc;
+ vdev_alloc_bias_t alloc_bias = VDEV_BIAS_NONE;
+ boolean_t top_level = (parent && !parent->vdev_parent);
ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
}
ASSERT(nparity != -1ULL);
+ /*
+ * If creating a top-level vdev, check for allocation classes input
+ */
+ if (top_level && alloctype == VDEV_ALLOC_ADD) {
+ char *bias;
+
+ if (nvlist_lookup_string(nv, ZPOOL_CONFIG_ALLOCATION_BIAS,
+ &bias) == 0) {
+ alloc_bias = vdev_derive_alloc_bias(bias);
+
+ /* spa_vdev_add() expects feature to be enabled */
+ if (spa->spa_load_state != SPA_LOAD_CREATE &&
+ !spa_feature_is_enabled(spa,
+ SPA_FEATURE_ALLOCATION_CLASSES)) {
+ return (SET_ERROR(ENOTSUP));
+ }
+ }
+ }
+
vd = vdev_alloc_common(spa, id, guid, ops);
vic = &vd->vdev_indirect_config;
vd->vdev_islog = islog;
vd->vdev_nparity = nparity;
+ if (top_level && alloc_bias != VDEV_BIAS_NONE)
+ vd->vdev_alloc_bias = alloc_bias;
if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &vd->vdev_path) == 0)
vd->vdev_path = spa_strdup(vd->vdev_path);
/*
* If we're a top-level vdev, try to load the allocation parameters.
*/
- if (parent && !parent->vdev_parent &&
+ if (top_level &&
(alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_SPLIT)) {
(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_METASLAB_ARRAY,
&vd->vdev_ms_array);
ASSERT0(vd->vdev_top_zap);
}
- if (parent && !parent->vdev_parent && alloctype != VDEV_ALLOC_ATTACH) {
+ if (top_level && alloctype != VDEV_ALLOC_ATTACH) {
ASSERT(alloctype == VDEV_ALLOC_LOAD ||
alloctype == VDEV_ALLOC_ADD ||
alloctype == VDEV_ALLOC_SPLIT ||
alloctype == VDEV_ALLOC_ROOTPOOL);
- vd->vdev_mg = metaslab_group_create(islog ?
- spa_log_class(spa) : spa_normal_class(spa), vd,
- spa->spa_alloc_count);
+ /* Note: metaslab_group_create() is now deferred */
}
if (vd->vdev_ops->vdev_op_leaf &&
tvd->vdev_checkpoint_sm = svd->vdev_checkpoint_sm;
svd->vdev_checkpoint_sm = NULL;
+ tvd->vdev_alloc_bias = svd->vdev_alloc_bias;
+ svd->vdev_alloc_bias = VDEV_BIAS_NONE;
+
tvd->vdev_stat.vs_alloc = svd->vdev_stat.vs_alloc;
tvd->vdev_stat.vs_space = svd->vdev_stat.vs_space;
tvd->vdev_stat.vs_dspace = svd->vdev_stat.vs_dspace;
vdev_free(mvd);
}
+static void
+vdev_metaslab_group_create(vdev_t *vd)
+{
+ spa_t *spa = vd->vdev_spa;
+
+ /*
+ * metaslab_group_create was delayed until allocation bias was available
+ */
+ if (vd->vdev_mg == NULL) {
+ metaslab_class_t *mc;
+
+ if (vd->vdev_islog && vd->vdev_alloc_bias == VDEV_BIAS_NONE)
+ vd->vdev_alloc_bias = VDEV_BIAS_LOG;
+
+ ASSERT3U(vd->vdev_islog, ==,
+ (vd->vdev_alloc_bias == VDEV_BIAS_LOG));
+
+ switch (vd->vdev_alloc_bias) {
+ case VDEV_BIAS_LOG:
+ mc = spa_log_class(spa);
+ break;
+ case VDEV_BIAS_SPECIAL:
+ mc = spa_special_class(spa);
+ break;
+ case VDEV_BIAS_DEDUP:
+ mc = spa_dedup_class(spa);
+ break;
+ default:
+ mc = spa_normal_class(spa);
+ }
+
+ vd->vdev_mg = metaslab_group_create(mc, vd,
+ spa->spa_alloc_count);
+
+ /*
+ * The spa ashift values currently only reflect the
+ * general vdev classes. Class destination is late
+ * binding, so ashift checking had to wait until now.
+ */
+ if (vd->vdev_top == vd && vd->vdev_ashift != 0 &&
+ mc == spa_normal_class(spa) && vd->vdev_aux == NULL) {
+ if (vd->vdev_ashift > spa->spa_max_ashift)
+ spa->spa_max_ashift = vd->vdev_ashift;
+ if (vd->vdev_ashift < spa->spa_min_ashift)
+ spa->spa_min_ashift = vd->vdev_ashift;
+ }
+ }
+}
+
int
vdev_metaslab_init(vdev_t *vd, uint64_t txg)
{
uint64_t newc = vd->vdev_asize >> vd->vdev_ms_shift;
metaslab_t **mspp;
int error;
+ boolean_t expanding = (oldc != 0);
ASSERT(txg == 0 || spa_config_held(spa, SCL_ALLOC, RW_WRITER));
mspp = vmem_zalloc(newc * sizeof (*mspp), KM_SLEEP);
- if (oldc != 0) {
+ if (expanding) {
bcopy(vd->vdev_ms, mspp, oldc * sizeof (*mspp));
vmem_free(vd->vdev_ms, oldc * sizeof (*mspp));
}
}
}
+#ifndef _KERNEL
+ /*
+ * To accommodate zdb_leak_init() fake indirect
+ * metaslabs, we allocate a metaslab group for
+ * indirect vdevs which normally don't have one.
+ */
+ if (vd->vdev_mg == NULL) {
+ ASSERT0(vdev_is_concrete(vd));
+ vdev_metaslab_group_create(vd);
+ }
+#endif
error = metaslab_init(vd->vdev_mg, m, object, txg,
&(vd->vdev_ms[m]));
if (error != 0) {
* the metaslabs since we want to ensure that no new
* allocations are performed on this device.
*/
- if (oldc == 0 && !vd->vdev_removing)
+ if (!expanding && !vd->vdev_removing) {
metaslab_group_activate(vd->vdev_mg);
+ }
if (txg == 0)
spa_config_exit(spa, SCL_ALLOC, FTAG);
/*
* Track the min and max ashift values for normal data devices.
+ *
+ * DJB - TBD these should perhaps be tracked per allocation class
+ * (e.g. spa_min_ashift is used to round up post compression buffers)
*/
if (vd->vdev_top == vd && vd->vdev_ashift != 0 &&
- !vd->vdev_islog && vd->vdev_aux == NULL) {
+ vd->vdev_alloc_bias == VDEV_BIAS_NONE &&
+ vd->vdev_aux == NULL) {
if (vd->vdev_ashift > spa->spa_max_ashift)
spa->spa_max_ashift = vd->vdev_ashift;
if (vd->vdev_ashift < spa->spa_min_ashift)
return (error);
}
+static void
+vdev_zap_allocation_data(vdev_t *vd, dmu_tx_t *tx)
+{
+ spa_t *spa = vd->vdev_spa;
+ objset_t *mos = spa->spa_meta_objset;
+ vdev_alloc_bias_t alloc_bias = vd->vdev_alloc_bias;
+ const char *string;
+
+ ASSERT(alloc_bias != VDEV_BIAS_NONE);
+
+ string =
+ (alloc_bias == VDEV_BIAS_LOG) ? VDEV_ALLOC_BIAS_LOG :
+ (alloc_bias == VDEV_BIAS_SPECIAL) ? VDEV_ALLOC_BIAS_SPECIAL :
+ (alloc_bias == VDEV_BIAS_DEDUP) ? VDEV_ALLOC_BIAS_DEDUP : NULL;
+
+ ASSERT(string != NULL);
+ VERIFY0(zap_add(mos, vd->vdev_top_zap, VDEV_TOP_ZAP_ALLOCATION_BIAS,
+ 1, strlen(string) + 1, string, tx));
+
+ if (alloc_bias == VDEV_BIAS_SPECIAL || alloc_bias == VDEV_BIAS_DEDUP) {
+ spa_activate_allocation_classes(spa, tx);
+ }
+}
+
void
vdev_destroy_unlink_zap(vdev_t *vd, uint64_t zapobj, dmu_tx_t *tx)
{
}
if (vd == vd->vdev_top && vd->vdev_top_zap == 0) {
vd->vdev_top_zap = vdev_create_link_zap(vd, tx);
+ if (vd->vdev_alloc_bias != VDEV_BIAS_NONE)
+ vdev_zap_allocation_data(vd, tx);
}
}
+
for (uint64_t i = 0; i < vd->vdev_children; i++) {
vdev_construct_zaps(vd->vdev_child[i], tx);
}
vdev_set_deflate_ratio(vd);
+ /*
+ * On spa_load path, grab the allocation bias from our zap
+ */
+ if (vd == vd->vdev_top && vd->vdev_top_zap != 0) {
+ spa_t *spa = vd->vdev_spa;
+ char bias_str[64];
+
+ if (zap_lookup(spa->spa_meta_objset, vd->vdev_top_zap,
+ VDEV_TOP_ZAP_ALLOCATION_BIAS, 1, sizeof (bias_str),
+ bias_str) == 0) {
+ ASSERT(vd->vdev_alloc_bias == VDEV_BIAS_NONE);
+ vd->vdev_alloc_bias = vdev_derive_alloc_bias(bias_str);
+ }
+ }
+
/*
* If this is a top-level vdev, initialize its metaslabs.
*/
if (vd == vd->vdev_top && vdev_is_concrete(vd)) {
+ vdev_metaslab_group_create(vd);
+
if (vd->vdev_ashift == 0 || vd->vdev_asize == 0) {
vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
VDEV_AUX_CORRUPT_DATA);
metaslab_group_histogram_verify(mg);
metaslab_class_histogram_verify(mg->mg_class);
+
for (int i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++)
ASSERT0(mg->mg_histogram[i]);
}
}
if (vd->vdev_aux == NULL && vd == vd->vdev_top &&
vdev_is_concrete(vd)) {
- vs->vs_fragmentation = vd->vdev_mg->mg_fragmentation;
+ vs->vs_fragmentation = (vd->vdev_mg != NULL) ?
+ vd->vdev_mg->mg_fragmentation : 0;
}
}
}
}
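+/*
+ * Translate a raw space delta into its deflated (dspace) equivalent,
+ * applying this vdev's deflate ratio per 512-byte unit.
+ */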
+int64_t
+vdev_deflated_space(vdev_t *vd, int64_t space)
+{
+ ASSERT((space & (SPA_MINBLOCKSIZE-1)) == 0);
+ ASSERT(vd->vdev_deflate_ratio != 0 || vd->vdev_isl2cache);
+
+ return ((space >> SPA_MINBLOCKSHIFT) * vd->vdev_deflate_ratio);
+}
+
/*
- * Update the in-core space usage stats for this vdev, its metaslab class,
- * and the root vdev.
+ * Update the in-core space usage stats for this vdev and the root vdev.
*/
void
vdev_space_update(vdev_t *vd, int64_t alloc_delta, int64_t defer_delta,
int64_t space_delta)
{
- int64_t dspace_delta = space_delta;
+ int64_t dspace_delta;
spa_t *spa = vd->vdev_spa;
vdev_t *rvd = spa->spa_root_vdev;
- metaslab_group_t *mg = vd->vdev_mg;
- metaslab_class_t *mc = mg ? mg->mg_class : NULL;
ASSERT(vd == vd->vdev_top);
* because the root vdev's psize-to-asize is simply the max of its
* childrens', thus not accurate enough for us.
*/
- ASSERT((dspace_delta & (SPA_MINBLOCKSIZE-1)) == 0);
- ASSERT(vd->vdev_deflate_ratio != 0 || vd->vdev_isl2cache);
- dspace_delta = (dspace_delta >> SPA_MINBLOCKSHIFT) *
- vd->vdev_deflate_ratio;
+ dspace_delta = vdev_deflated_space(vd, space_delta);
mutex_enter(&vd->vdev_stat_lock);
vd->vdev_stat.vs_alloc += alloc_delta;
vd->vdev_stat.vs_dspace += dspace_delta;
mutex_exit(&vd->vdev_stat_lock);
- if (mc == spa_normal_class(spa)) {
+ /* every class but log contributes to root space stats */
+ if (vd->vdev_mg != NULL && !vd->vdev_islog) {
mutex_enter(&rvd->vdev_stat_lock);
rvd->vdev_stat.vs_alloc += alloc_delta;
rvd->vdev_stat.vs_space += space_delta;
rvd->vdev_stat.vs_dspace += dspace_delta;
mutex_exit(&rvd->vdev_stat_lock);
}
-
- if (mc != NULL) {
- ASSERT(rvd == vd->vdev_parent);
- ASSERT(vd->vdev_ms_count != 0);
-
- metaslab_class_space_update(mc,
- alloc_delta, defer_delta, space_delta, dspace_delta);
- }
+ /* Note: metaslab_class_space_update moved to metaslab_space_update */
}
/*
vdev_set_deflate_ratio(vd);
- if ((vd->vdev_asize >> vd->vdev_ms_shift) > vd->vdev_ms_count) {
+ if ((vd->vdev_asize >> vd->vdev_ms_shift) > vd->vdev_ms_count &&
+ vdev_is_concrete(vd)) {
+ vdev_metaslab_group_create(vd);
VERIFY(vdev_metaslab_init(vd, txg) == 0);
vdev_config_dirty(vd);
}
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2012, 2018 by Delphix. All rights reserved.
+ * Copyright (c) 2017, Intel Corporation.
*/
/*
fnvlist_add_uint64(nv, ZPOOL_CONFIG_REMOVING,
vd->vdev_removing);
}
+
+ /* zpool command expects alloc class data */
+ if (getstats && vd->vdev_alloc_bias != VDEV_BIAS_NONE) {
+ const char *bias = NULL;
+
+ switch (vd->vdev_alloc_bias) {
+ case VDEV_BIAS_LOG:
+ bias = VDEV_ALLOC_BIAS_LOG;
+ break;
+ case VDEV_BIAS_SPECIAL:
+ bias = VDEV_ALLOC_BIAS_SPECIAL;
+ break;
+ case VDEV_BIAS_DEDUP:
+ bias = VDEV_ALLOC_BIAS_DEDUP;
+ break;
+ default:
+ ASSERT3U(vd->vdev_alloc_bias, ==,
+ VDEV_BIAS_NONE);
+ }
+ fnvlist_add_string(nv, ZPOOL_CONFIG_ALLOCATION_BIAS,
+ bias);
+ }
}
if (vd->vdev_dtl_sm != NULL) {
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2011, 2017 by Delphix. All rights reserved.
+ * Copyright (c) 2011, 2018 by Delphix. All rights reserved.
*/
#include <sys/zfs_context.h>
}
ASSERT3U(size, <=, maxalloc);
- int error = metaslab_alloc_dva(spa, mg->mg_class, size,
- &dst, 0, NULL, txg, 0, zal, 0);
+ /*
+ * An allocation class might not have any remaining vdevs or space
+ */
+ metaslab_class_t *mc = mg->mg_class;
+ if (mc != spa_normal_class(spa) && mc->mc_groups <= 1)
+ mc = spa_normal_class(spa);
+ int error = metaslab_alloc_dva(spa, mc, size, &dst, 0, NULL, txg, 0,
+ zal, 0);
+ if (error == ENOSPC && mc != spa_normal_class(spa)) {
+ error = metaslab_alloc_dva(spa, spa_normal_class(spa), size,
+ &dst, 0, NULL, txg, 0, zal, 0);
+ }
if (error != 0)
return (error);
if (!spa_feature_is_enabled(spa, SPA_FEATURE_DEVICE_REMOVAL))
return (SET_ERROR(ENOTSUP));
+ /* available space in the pool's normal class */
+ uint64_t available = dsl_dir_space_available(
+ spa->spa_dsl_pool->dp_root_dir, NULL, 0, B_TRUE);
+
+ metaslab_class_t *mc = vd->vdev_mg->mg_class;
+
+ /*
+ * When removing a vdev from an allocation class that has
+ * remaining vdevs, include available space from the class.
+ */
+ if (mc != spa_normal_class(spa) && mc->mc_groups > 1) {
+ uint64_t class_avail = metaslab_class_get_space(mc) -
+ metaslab_class_get_alloc(mc);
+
+ /* add class space, adjusted for overhead */
+ available += (class_avail * 94) / 100;
+ }
+
/*
* There has to be enough free space to remove the
* device and leave double the "slop" space (i.e. we
* must leave at least 3% of the pool free, in addition to
* the normal slop space).
*/
- if (dsl_dir_space_available(spa->spa_dsl_pool->dp_root_dir,
- NULL, 0, B_TRUE) <
- vd->vdev_stat.vs_dspace + spa_get_slop_space(spa)) {
+ if (available < vd->vdev_stat.vs_dspace + spa_get_slop_space(spa)) {
return (SET_ERROR(ENOSPC));
}
{
if (zfs_dbgmsg_kstat)
kstat_delete(zfs_dbgmsg_kstat);
-
+ /*
+ * TODO - decide how to make this permanent
+ */
+#ifdef _KERNEL
mutex_enter(&zfs_dbgmsgs_lock);
zfs_dbgmsg_purge(0);
mutex_exit(&zfs_dbgmsgs_lock);
mutex_destroy(&zfs_dbgmsgs_lock);
+#endif
}
void
}
break;
+ case ZFS_PROP_SPECIAL_SMALL_BLOCKS:
+ /*
+ * This property could require the allocation classes
+ * feature to be active in order to be set; however, we
+ * allow it so that tests of settable properties succeed.
+ * The CLI will issue a warning in this case.
+ */
+ break;
+
case ZFS_PROP_SHARESMB:
if (zpl_earlier_version(dsname, ZPL_VERSION_FUID))
return (SET_ERROR(ENOTSUP));
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2011, 2018 by Delphix. All rights reserved.
* Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved.
+ * Copyright (c) 2017, Intel Corporation.
*/
#include <sys/sysmacros.h>
zio->io_bookmark = *zb;
if (pio != NULL) {
+ if (zio->io_metaslab_class == NULL)
+ zio->io_metaslab_class = pio->io_metaslab_class;
if (zio->io_logical == NULL)
zio->io_logical = pio->io_logical;
if (zio->io_child_type == ZIO_CHILD_GANG)
*/
if (flags & ZIO_FLAG_IO_ALLOCATING &&
(vd != vd->vdev_top || (flags & ZIO_FLAG_IO_RETRY))) {
- ASSERTV(metaslab_class_t *mc = spa_normal_class(pio->io_spa));
-
- ASSERT(mc->mc_alloc_throttle_enabled);
+ ASSERT(pio->io_metaslab_class != NULL);
+ ASSERT(pio->io_metaslab_class->mc_alloc_throttle_enabled);
ASSERT(type == ZIO_TYPE_WRITE);
ASSERT(priority == ZIO_PRIORITY_ASYNC_WRITE);
ASSERT(!(flags & ZIO_FLAG_IO_REPAIR));
if (!BP_IS_HOLE(bp) && bp->blk_birth == zio->io_txg &&
BP_GET_PSIZE(bp) == psize &&
pass >= zfs_sync_pass_rewrite) {
- ASSERT(psize != 0);
+ VERIFY3U(psize, !=, 0);
enum zio_stage gang_stages = zio->io_pipeline & ZIO_GANG_STAGES;
+
zio->io_pipeline = ZIO_REWRITE_PIPELINE | gang_stages;
zio->io_flags |= ZIO_FLAG_IO_REWRITE;
} else {
* reserve then we throttle.
*/
ASSERT3U(zio->io_allocator, ==, allocator);
- if (!metaslab_class_throttle_reserve(spa_normal_class(spa),
+ if (!metaslab_class_throttle_reserve(zio->io_metaslab_class,
zio->io_prop.zp_copies, zio->io_allocator, zio, 0)) {
return (NULL);
}
{
spa_t *spa = zio->io_spa;
zio_t *nio;
+ metaslab_class_t *mc;
+
+ /* locate an appropriate allocation class */
+ mc = spa_preferred_class(spa, zio->io_size, zio->io_prop.zp_type,
+ zio->io_prop.zp_level, zio->io_prop.zp_zpl_smallblk);
if (zio->io_priority == ZIO_PRIORITY_SYNC_WRITE ||
- !spa_normal_class(zio->io_spa)->mc_alloc_throttle_enabled ||
+ !mc->mc_alloc_throttle_enabled ||
zio->io_child_type == ZIO_CHILD_GANG ||
zio->io_flags & ZIO_FLAG_NODATA) {
return (zio);
zio->io_allocator = cityhash4(bm->zb_objset, bm->zb_object,
bm->zb_level, bm->zb_blkid >> 20) % spa->spa_alloc_count;
mutex_enter(&spa->spa_alloc_locks[zio->io_allocator]);
-
ASSERT(zio->io_type == ZIO_TYPE_WRITE);
+ zio->io_metaslab_class = mc;
avl_add(&spa->spa_alloc_trees[zio->io_allocator], zio);
-
- nio = zio_io_to_allocate(zio->io_spa, zio->io_allocator);
+ nio = zio_io_to_allocate(spa, zio->io_allocator);
mutex_exit(&spa->spa_alloc_locks[zio->io_allocator]);
-
return (nio);
}
-void
+static void
zio_allocate_dispatch(spa_t *spa, int allocator)
{
zio_t *zio;
zio_dva_allocate(zio_t *zio)
{
spa_t *spa = zio->io_spa;
- metaslab_class_t *mc = spa_normal_class(spa);
+ metaslab_class_t *mc;
blkptr_t *bp = zio->io_bp;
int error;
int flags = 0;
if (zio->io_priority == ZIO_PRIORITY_ASYNC_WRITE)
flags |= METASLAB_ASYNC_ALLOC;
+ /*
+ * if not already chosen, locate an appropriate allocation class
+ */
+ mc = zio->io_metaslab_class;
+ if (mc == NULL) {
+ mc = spa_preferred_class(spa, zio->io_size,
+ zio->io_prop.zp_type, zio->io_prop.zp_level,
+ zio->io_prop.zp_zpl_smallblk);
+ zio->io_metaslab_class = mc;
+ }
+
error = metaslab_alloc(spa, mc, zio->io_size, bp,
zio->io_prop.zp_copies, zio->io_txg, NULL, flags,
&zio->io_alloc_list, zio, zio->io_allocator);
+ /*
+ * Fall back to the normal class when an alloc class is full.
+ */
+ if (error == ENOSPC && mc != spa_normal_class(spa)) {
+ /*
+ * If throttling, transfer reservation over to normal class.
+ * The io_allocator slot can remain the same even though we
+ * are switching classes.
+ */
+ if (mc->mc_alloc_throttle_enabled &&
+ (zio->io_flags & ZIO_FLAG_IO_ALLOCATING)) {
+ metaslab_class_throttle_unreserve(mc,
+ zio->io_prop.zp_copies, zio->io_allocator, zio);
+ zio->io_flags &= ~ZIO_FLAG_IO_ALLOCATING;
+
+ mc = spa_normal_class(spa);
+ VERIFY(metaslab_class_throttle_reserve(mc,
+ zio->io_prop.zp_copies, zio->io_allocator, zio,
+ flags | METASLAB_MUST_RESERVE));
+ } else {
+ mc = spa_normal_class(spa);
+ }
+ zio->io_metaslab_class = mc;
+
+ error = metaslab_alloc(spa, mc, zio->io_size, bp,
+ zio->io_prop.zp_copies, zio->io_txg, NULL, flags,
+ &zio->io_alloc_list, zio, zio->io_allocator);
+ }
+
if (error != 0) {
zfs_dbgmsg("%s: metaslab allocation failure: zio %p, "
"size %llu, error %d", spa_name(spa), zio, zio->io_size,
ASSERT(txg > spa_syncing_txg(spa));
metaslab_trace_init(&io_alloc_list);
+
+ /*
+ * Block pointer fields are useful to metaslabs for stats and debugging.
+ * Fill in the obvious ones before calling into metaslab_alloc().
+ */
+ BP_SET_TYPE(new_bp, DMU_OT_INTENT_LOG);
+ BP_SET_PSIZE(new_bp, size);
+ BP_SET_LEVEL(new_bp, 0);
+
/*
* When allocating a zil block, we don't have information about
* the final destination of the block except the objset it's part
if (zio->io_flags & ZIO_FLAG_IO_ALLOCATING) {
ASSERT(IO_IS_ALLOCATING(zio));
ASSERT(zio->io_priority == ZIO_PRIORITY_ASYNC_WRITE);
+ ASSERT(zio->io_metaslab_class != NULL);
+
/*
* We were unable to allocate anything, unreserve and
* issue the next I/O to allocate.
*/
metaslab_class_throttle_unreserve(
- spa_normal_class(zio->io_spa),
- zio->io_prop.zp_copies, zio->io_allocator, zio);
+ zio->io_metaslab_class, zio->io_prop.zp_copies,
+ zio->io_allocator, zio);
zio_allocate_dispatch(zio->io_spa, zio->io_allocator);
}
}
ASSERT(zio->io_logical != NULL);
ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REPAIR));
ASSERT0(zio->io_flags & ZIO_FLAG_NOPWRITE);
+ ASSERT(zio->io_metaslab_class != NULL);
mutex_enter(&pio->io_lock);
metaslab_group_alloc_decrement(zio->io_spa, vd->vdev_id, pio, flags,
pio->io_allocator, B_TRUE);
mutex_exit(&pio->io_lock);
- metaslab_class_throttle_unreserve(spa_normal_class(zio->io_spa),
- 1, pio->io_allocator, pio);
+ metaslab_class_throttle_unreserve(zio->io_metaslab_class, 1,
+ pio->io_allocator, pio);
/*
* Call into the pipeline to see if there is more work that
*/
const uint64_t psize = zio->io_size;
zio_t *pio, *pio_next;
- ASSERTV(metaslab_class_t *mc = spa_normal_class(zio->io_spa));
zio_link_t *zl = NULL;
/*
*/
if (zio->io_flags & ZIO_FLAG_IO_ALLOCATING &&
zio->io_child_type == ZIO_CHILD_VDEV) {
- ASSERT(mc->mc_alloc_throttle_enabled);
+ ASSERT(zio->io_metaslab_class != NULL);
+ ASSERT(zio->io_metaslab_class->mc_alloc_throttle_enabled);
zio_dva_throttle_done(zio);
}
ASSERT(zio->io_type == ZIO_TYPE_WRITE);
ASSERT(zio->io_priority == ZIO_PRIORITY_ASYNC_WRITE);
ASSERT(zio->io_bp != NULL);
+
metaslab_group_alloc_verify(zio->io_spa, zio->io_bp, zio,
zio->io_allocator);
- VERIFY(refcount_not_held(&mc->mc_alloc_slots[zio->io_allocator],
+ VERIFY(refcount_not_held(
+ &zio->io_metaslab_class->mc_alloc_slots[zio->io_allocator],
zio));
}
tests = ['posix_001_pos', 'posix_002_pos', 'posix_003_pos']
tags = ['functional', 'acl', 'posix']
+[tests/functional/alloc_class]
+tests = ['alloc_class_001_pos', 'alloc_class_002_neg', 'alloc_class_003_pos',
+ 'alloc_class_004_pos', 'alloc_class_005_pos', 'alloc_class_006_pos',
+ 'alloc_class_007_pos', 'alloc_class_008_pos', 'alloc_class_009_pos',
+ 'alloc_class_010_pos', 'alloc_class_011_neg', 'alloc_class_012_pos',
+ 'alloc_class_013_pos']
+tags = ['functional', 'alloc_class']
+
[tests/functional/arc]
tests = ['dbufstats_001_pos', 'dbufstats_002_pos']
tags = ['functional', 'arc']
SUBDIRS = \
acl \
+ alloc_class \
arc \
atime \
bootfs \
--- /dev/null
+pkgdatadir = $(datadir)/@PACKAGE@/zfs-tests/tests/functional/alloc_class
+dist_pkgdata_SCRIPTS = \
+ alloc_class.cfg \
+ alloc_class.kshlib \
+ setup.ksh \
+ cleanup.ksh \
+ alloc_class_001_pos.ksh \
+ alloc_class_002_neg.ksh \
+ alloc_class_003_pos.ksh \
+ alloc_class_004_pos.ksh \
+ alloc_class_005_pos.ksh \
+ alloc_class_006_pos.ksh \
+ alloc_class_007_pos.ksh \
+ alloc_class_008_pos.ksh \
+ alloc_class_009_pos.ksh \
+ alloc_class_010_pos.ksh \
+ alloc_class_011_neg.ksh \
+ alloc_class_012_pos.ksh \
+ alloc_class_013_pos.ksh
--- /dev/null
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright (c) 2017, Intel Corporation.
+# Copyright (c) 2018 by Delphix. All rights reserved.
+#
+
+. $STF_SUITE/include/libtest.shlib
+
+export ZPOOL_DISK0="$TEST_BASE_DIR/device-0"
+export ZPOOL_DISK1="$TEST_BASE_DIR/device-1"
+export ZPOOL_DISK2="$TEST_BASE_DIR/device-2"
+export ZPOOL_DISKS="${ZPOOL_DISK0} ${ZPOOL_DISK1} ${ZPOOL_DISK2}"
+
+export CLASS_DISK0="$TEST_BASE_DIR/device-3"
+export CLASS_DISK1="$TEST_BASE_DIR/device-4"
+export CLASS_DISK2="$TEST_BASE_DIR/device-5"
+export CLASS_DISK3="$TEST_BASE_DIR/device-6"
+export CLASS_DISKS="${CLASS_DISK0} ${CLASS_DISK1} ${CLASS_DISK2} ${CLASS_DISK3}"
+
+export ZPOOL_DEVSIZE=1G
+export CLASS_DEVSIZE=512M
--- /dev/null
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright (c) 2017, Intel Corporation.
+# Copyright (c) 2018 by Delphix. All rights reserved.
+#
+
+. $STF_SUITE/include/libtest.shlib
+. $STF_SUITE/tests/functional/alloc_class/alloc_class.cfg
+
+function disk_setup
+{
+ truncate -s $ZPOOL_DEVSIZE $ZPOOL_DISKS
+ truncate -s $CLASS_DEVSIZE $CLASS_DISKS
+}
+
+function disk_cleanup
+{
+	rm -f $ZPOOL_DISKS 2> /dev/null
+	rm -f $CLASS_DISKS 2> /dev/null
+}
+
+function cleanup
+{
+ if datasetexists $TESTPOOL ; then
+ zpool destroy -f $TESTPOOL 2> /dev/null
+ fi
+
+ disk_cleanup
+}
+
+#
+# Try zpool status/iostat for given pool
+#
+# $1 pool
+#
+function display_status
+{
+ typeset pool=$1
+
+ typeset -i ret=0
+ zpool status -xv $pool > /dev/null 2>&1
+ ret=$?
+
+ zpool iostat > /dev/null 2>&1
+ ((ret |= $?))
+
+ typeset mntpnt=$(get_prop mountpoint $pool)
+ dd if=/dev/random of=$mntpnt/testfile.$$ &
+ typeset pid=$!
+
+ zpool iostat -v 1 3 > /dev/null
+ ((ret |= $?))
+
+ kill -9 $pid
+
+ return $ret
+}
--- /dev/null
+#!/bin/ksh -p
+
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright (c) 2017, Intel Corporation.
+# Copyright (c) 2018 by Delphix. All rights reserved.
+#
+
+. $STF_SUITE/tests/functional/alloc_class/alloc_class.kshlib
+
+#
+# DESCRIPTION:
+# Creating a pool with a special device succeeds.
+#
+
+verify_runnable "global"
+
+claim="Creating a pool with a special device succeeds."
+
+log_assert $claim
+log_onexit cleanup
+
+log_must disk_setup
+log_must zpool create $TESTPOOL raidz $ZPOOL_DISKS special mirror \
+ $CLASS_DISK0 $CLASS_DISK1
+log_must display_status "$TESTPOOL"
+log_must zpool destroy -f "$TESTPOOL"
+
+log_pass $claim
--- /dev/null
+#!/bin/ksh -p
+
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright (c) 2017, Intel Corporation.
+# Copyright (c) 2018 by Delphix. All rights reserved.
+#
+
+. $STF_SUITE/tests/functional/alloc_class/alloc_class.kshlib
+
+#
+# DESCRIPTION:
+# Creating a pool fails when an invalid special device type is specified.
+#
+claim="Creating a pool with an invalid special device type fails."
+
+verify_runnable "global"
+
+log_assert $claim
+log_onexit cleanup
+
+log_must disk_setup
+
+log_mustnot zpool create $TESTPOOL raidz $ZPOOL_DISKS special $CLASS_DISK0
+log_mustnot display_status $TESTPOOL
+log_mustnot zpool destroy -f $TESTPOOL
+
+log_mustnot zpool create $TESTPOOL $ZPOOL_DISKS special mirror \
+ $CLASS_DISK0 $CLASS_DISK1
+log_mustnot display_status $TESTPOOL
+log_mustnot zpool destroy -f $TESTPOOL
+
+log_mustnot zpool create $TESTPOOL raidz $ZPOOL_DISKS special raidz \
+ $CLASS_DISK0 $CLASS_DISK1 $CLASS_DISK2
+log_mustnot display_status $TESTPOOL
+log_mustnot zpool destroy -f $TESTPOOL
+
+log_pass $claim
--- /dev/null
+#!/bin/ksh -p
+
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright (c) 2017, Intel Corporation.
+# Copyright (c) 2018 by Delphix. All rights reserved.
+#
+
+. $STF_SUITE/tests/functional/alloc_class/alloc_class.kshlib
+
+#
+# DESCRIPTION:
+# Adding a special device to a normal pool succeeds.
+#
+claim="Adding a special device to a normal pool succeeds."
+
+verify_runnable "global"
+
+log_assert $claim
+log_onexit cleanup
+
+log_must disk_setup
+
+for type in "" "mirror" "raidz"
+do
+ log_must zpool create $TESTPOOL $type $ZPOOL_DISKS
+
+ if [ "$type" = "mirror" ]; then
+ log_must zpool add $TESTPOOL special mirror \
+ $CLASS_DISK0 $CLASS_DISK1 $CLASS_DISK2
+ log_must zpool iostat -H $TESTPOOL $CLASS_DISK0
+ log_must zpool iostat -H $TESTPOOL $CLASS_DISK1
+ log_must zpool iostat -H $TESTPOOL $CLASS_DISK2
+ elif [ "$type" = "raidz" ]; then
+ log_must zpool add $TESTPOOL special mirror \
+ $CLASS_DISK0 $CLASS_DISK1
+ log_must zpool iostat -H $TESTPOOL $CLASS_DISK0
+ log_must zpool iostat -H $TESTPOOL $CLASS_DISK1
+ else
+ log_must zpool add $TESTPOOL special $CLASS_DISK0
+ log_must zpool iostat -H $TESTPOOL $CLASS_DISK0
+ fi
+
+ log_must zpool destroy -f $TESTPOOL
+done
+
+log_pass $claim
--- /dev/null
+#!/bin/ksh -p
+
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright (c) 2017, Intel Corporation.
+# Copyright (c) 2018 by Delphix. All rights reserved.
+#
+
+. $STF_SUITE/tests/functional/alloc_class/alloc_class.kshlib
+
+#
+# DESCRIPTION:
+# Checking that the allocation_classes feature flag is active after
+# creating a pool with a special device.
+#
+claim="allocation_classes feature flag is active after pool creation."
+
+verify_runnable "global"
+
+log_assert $claim
+log_onexit cleanup
+
+log_must disk_setup
+
+typeset ac_value
+typeset stype=""
+typeset sdisks=""
+
+for type in "" "mirror" "raidz"
+do
+ if [ "$type" = "mirror" ]; then
+ stype="mirror"
+ sdisks="${CLASS_DISK0} ${CLASS_DISK1} ${CLASS_DISK2}"
+ elif [ "$type" = "raidz" ]; then
+ stype="mirror"
+ sdisks="${CLASS_DISK0} ${CLASS_DISK1}"
+ else
+ stype=""
+ sdisks="${CLASS_DISK0}"
+ fi
+
+ log_must zpool create $TESTPOOL $type $ZPOOL_DISKS \
+ special $stype $sdisks
+
+ ac_value="$(zpool get all -H -o property,value | \
+	    egrep allocation_classes | awk '{print $2}')"
+ if [ "$ac_value" = "active" ]; then
+ log_note "feature@allocation_classes is active"
+ else
+ log_fail "feature@allocation_classes not active, \
+ status = $ac_value"
+ fi
+
+ log_must zpool destroy -f $TESTPOOL
+done
+
+log_pass $claim
--- /dev/null
+#!/bin/ksh -p
+
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright (c) 2017, Intel Corporation.
+# Copyright (c) 2018 by Delphix. All rights reserved.
+#
+
+. $STF_SUITE/tests/functional/alloc_class/alloc_class.kshlib
+
+#
+# DESCRIPTION:
+# Checking the allocation_classes feature flag value after a pool is
+# created (should be enabled) and after a special device is added to an
+# existing pool (should be active).
+#
+
+verify_runnable "global"
+
+log_assert "Values of allocation_classes feature flag correct."
+log_onexit cleanup
+
+log_must disk_setup
+
+typeset ac_value
+
+for type in "" "mirror" "raidz"
+do
+ if [ "$type" = "mirror" ]; then
+ log_must zpool create $TESTPOOL $type $ZPOOL_DISK0 $ZPOOL_DISK1
+ else
+ log_must zpool create $TESTPOOL $type $ZPOOL_DISKS
+ fi
+ ac_value="$(zpool get all -H -o property,value | \
+ egrep allocation_classes | awk '{print $2}')"
+ if [ "$ac_value" = "enabled" ]; then
+ log_note "feature@allocation_classes is enabled"
+ else
+ log_fail "feature@allocation_classes not enabled, \
+ status = $ac_value"
+ fi
+
+ if [ "$type" = "" ]; then
+ log_must zpool add $TESTPOOL special $CLASS_DISK0
+ else
+ log_must zpool add $TESTPOOL special mirror \
+ $CLASS_DISK0 $CLASS_DISK1
+ fi
+ ac_value="$(zpool get all -H -o property,value | \
+ egrep allocation_classes | awk '{print $2}')"
+ if [ "$ac_value" = "active" ]; then
+ log_note "feature@allocation_classes is active"
+ else
+ log_fail "feature@allocation_classes not active, \
+ status = $ac_value"
+ fi
+
+ log_must zpool destroy -f $TESTPOOL
+done
+
+log_pass "Values of allocation_classes feature flag correct."
--- /dev/null
+#!/bin/ksh -p
+
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright (c) 2017, Intel Corporation.
+# Copyright (c) 2018 by Delphix. All rights reserved.
+#
+
+. $STF_SUITE/tests/functional/alloc_class/alloc_class.kshlib
+
+#
+# DESCRIPTION:
+# Using the zpool split command to detach disks from a pool with a
+# mirrored special device and create a new pool with the detached disks.
+#
+claim="zpool split command succeeds with special devices present."
+
+verify_runnable "global"
+
+log_assert $claim
+log_onexit cleanup
+
+log_must disk_setup
+
+log_must zpool create $TESTPOOL \
+ mirror $ZPOOL_DISK0 $ZPOOL_DISK1 \
+ special mirror $CLASS_DISK0 $CLASS_DISK1
+log_must zpool split $TESTPOOL split_pool
+log_must zpool destroy -f $TESTPOOL
+
+log_pass $claim
--- /dev/null
+#!/bin/ksh -p
+
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright (c) 2017, Intel Corporation.
+# Copyright (c) 2018 by Delphix. All rights reserved.
+#
+
+. $STF_SUITE/tests/functional/alloc_class/alloc_class.kshlib
+
+#
+# DESCRIPTION:
+# Replacing a special device succeeds
+#
+claim="Replacing a special device is successful."
+
+verify_runnable "global"
+
+log_assert $claim
+log_onexit cleanup
+
+log_must disk_setup
+
+log_must zpool create $TESTPOOL raidz $ZPOOL_DISKS \
+ special mirror $CLASS_DISK0 $CLASS_DISK1
+log_must zpool replace $TESTPOOL $CLASS_DISK1 $CLASS_DISK2
+log_must sleep 10
+log_must zpool iostat -H $TESTPOOL $CLASS_DISK2
+log_must zpool destroy -f $TESTPOOL
+
+log_pass $claim
--- /dev/null
+#!/bin/ksh -p
+
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright (c) 2017, Intel Corporation.
+# Copyright (c) 2018 by Delphix. All rights reserved.
+#
+
+. $STF_SUITE/tests/functional/alloc_class/alloc_class.kshlib
+
+#
+# DESCRIPTION:
+# Adding an additional special device to a pool that already has one succeeds.
+#
+claim="Adding an additional special device succeeds."
+
+verify_runnable "global"
+
+log_assert $claim
+log_onexit cleanup
+
+log_must disk_setup
+
+typeset special_type=""
+typeset create_disks=""
+typeset added_disks=""
+
+for type in "" "raidz"
+do
+ if [ "$type" = "raidz" ]; then
+ special_type="mirror"
+ create_disks="${CLASS_DISK0} ${CLASS_DISK1}"
+ added_disks="${CLASS_DISK2} ${CLASS_DISK3}"
+ else
+ special_type=""
+ create_disks="${CLASS_DISK0}"
+ added_disks="${CLASS_DISK1}"
+ fi
+ log_must zpool create $TESTPOOL $type $ZPOOL_DISKS \
+ special $special_type $create_disks
+ log_must zpool add $TESTPOOL special $special_type $added_disks
+ log_must zpool iostat $TESTPOOL $added_disks
+ log_must zpool destroy -f $TESTPOOL
+done
+
+log_pass $claim
--- /dev/null
+#!/bin/ksh -p
+
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright (c) 2017, Intel Corporation.
+# Copyright (c) 2018 by Delphix. All rights reserved.
+#
+
+. $STF_SUITE/tests/functional/alloc_class/alloc_class.kshlib
+
+#
+# DESCRIPTION:
+# Importing and exporting a pool with a special device succeeds.
+#
+claim="Import/export of pool with special device mirror succeeds."
+
+verify_runnable "global"
+
+log_assert $claim
+log_onexit cleanup
+
+log_must disk_setup
+
+typeset stype=""
+typeset sdisks=""
+
+for type in "" "mirror" "raidz"
+do
+ if [ "$type" = "mirror" ]; then
+ stype="mirror"
+ sdisks="${CLASS_DISK0} ${CLASS_DISK1} ${CLASS_DISK2}"
+ elif [ "$type" = "raidz" ]; then
+ stype="mirror"
+ sdisks="${CLASS_DISK0} ${CLASS_DISK1}"
+ else
+ stype=""
+		sdisks="${CLASS_DISK0}"
+ fi
+
+ log_must zpool create $TESTPOOL $type $ZPOOL_DISKS \
+ special $stype $sdisks
+ log_must zpool export $TESTPOOL
+ log_must zpool import -d $TEST_BASE_DIR -s $TESTPOOL
+ log_must display_status $TESTPOOL
+ log_must zpool destroy -f $TESTPOOL
+done
+
+log_pass $claim
--- /dev/null
+#!/bin/ksh -p
+
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright (c) 2017, Intel Corporation.
+# Copyright (c) 2018 by Delphix. All rights reserved.
+#
+
+. $STF_SUITE/tests/functional/alloc_class/alloc_class.kshlib
+
+#
+# DESCRIPTION:
+# Setting the special_small_blocks property to a valid value succeeds.
+#
+
+verify_runnable "global"
+
+claim="Setting the special_small_blocks property to a valid value succeeds."
+
+log_assert $claim
+log_onexit cleanup
+
+log_must disk_setup
+
+log_must zpool create $TESTPOOL raidz $ZPOOL_DISKS special mirror \
+ $CLASS_DISK0 $CLASS_DISK1
+
+for value in 0 512 1024 2048 4096 8192 16384 32768 65536 131072
+do
+ log_must zfs set special_small_blocks=$value $TESTPOOL
+ ACTUAL=$(zfs get -p special_small_blocks $TESTPOOL | \
+ grep special_small_blocks | awk '{print $3}')
+ if [ "$ACTUAL" != "$value" ]
+ then
+ log_fail "v. $ACTUAL set for $TESTPOOL, expected v. $value!"
+ fi
+done
+
+log_must zpool destroy -f "$TESTPOOL"
+log_pass $claim
--- /dev/null
+#!/bin/ksh -p
+
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright (c) 2017, Intel Corporation.
+# Copyright (c) 2018 by Delphix. All rights reserved.
+#
+
+. $STF_SUITE/tests/functional/alloc_class/alloc_class.kshlib
+
+#
+# DESCRIPTION:
+# Setting the special_small_blocks property to invalid values fails.
+#
+
+verify_runnable "global"
+
+claim="Setting the special_small_blocks property to invalid values fails."
+
+log_assert $claim
+log_onexit cleanup
+
+log_must disk_setup
+log_must zpool create $TESTPOOL raidz $ZPOOL_DISKS special mirror \
+ $CLASS_DISK0 $CLASS_DISK1
+
+for value in 256 1025 262144
+do
+ log_mustnot zfs set special_small_blocks=$value $TESTPOOL
+done
+
+log_must zpool destroy -f "$TESTPOOL"
+log_pass $claim
--- /dev/null
+#!/bin/ksh -p
+
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright (c) 2018 by Delphix. All rights reserved.
+#
+
+. $STF_SUITE/tests/functional/alloc_class/alloc_class.kshlib
+
+#
+# DESCRIPTION:
+# Removing a special device from a pool succeeds.
+#
+
+verify_runnable "global"
+
+claim= "Removing a special device from a pool succeeds."
+
+log_assert $claim
+log_onexit cleanup
+
+#
+# Create a non-raidz pool so we can remove top-level vdevs
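+# Two standalone special vdevs are attached so that one can be removed
+# while the other continues to service the special class.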
+#
+log_must disk_setup
+log_must zpool create $TESTPOOL $ZPOOL_DISK0 $ZPOOL_DISK1 $ZPOOL_DISK2 \
+ special $CLASS_DISK0 special $CLASS_DISK1
+log_must display_status "$TESTPOOL"
+
+#
+# Generate some metadata and small blocks in the special class before removal
+#
+typeset -i i=1
+typeset -i blocks=25
+
+log_must zfs create -o special_small_blocks=32K -o recordsize=32K \
+ $TESTPOOL/$TESTFS
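+# With recordsize equal to special_small_blocks, file data as well as
+# metadata is allocated from the special class.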
+for i in 1 2 3 4; do
+ log_must dd if=/dev/urandom of=/$TESTPOOL/$TESTFS/testfile.$i bs=1M \
+ count=$blocks
+ ((blocks = blocks + 25))
+done
+log_must sync_pool $TESTPOOL
+log_must zpool list -v $TESTPOOL
+
+#
+# remove a special allocation vdev and force a remapping
+#
+log_must zpool remove $TESTPOOL $CLASS_DISK0
+log_must zfs remap $TESTPOOL/$TESTFS
+
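+#
+# Give the asynchronous removal and remap a moment to settle before
+# verifying the pool's block accounting with zdb.
+#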
+sleep 5
+log_must sync_pool $TESTPOOL
+sleep 1
+
+log_must zdb -bbcc $TESTPOOL
+log_must zpool destroy -f "$TESTPOOL"
+
+log_pass $claim
--- /dev/null
+#!/bin/ksh -p
+
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright (c) 2018 by Delphix. All rights reserved.
+#
+
+. $STF_SUITE/tests/functional/alloc_class/alloc_class.kshlib
+
+#
+# DESCRIPTION:
+# Removing a dedup device from a pool succeeds.
+#
+
+verify_runnable "global"
+
+claim= "Removing a dedup device from a pool succeeds."
+
+log_assert $claim
+log_onexit cleanup
+
+#
+# Create a non-raidz pool so we can remove top-level vdevs
+#
+log_must disk_setup
+log_must zpool create $TESTPOOL $ZPOOL_DISKS dedup $CLASS_DISK0
+log_must display_status "$TESTPOOL"
+
+#
+# Generate some dedup data in the dedup class before removal
+#
+
+log_must zfs create -o dedup=on -V 2G $TESTPOOL/$TESTVOL
+
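+# Creating a filesystem on the dedup-enabled zvol generates DDT entries,
+# which are allocated from the dedup class.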
+log_must eval "echo y | newfs $ZVOL_DEVDIR/$TESTPOOL/$TESTVOL >/dev/null 2>&1"
+
+log_must sync_pool $TESTPOOL
+log_must zpool list -v $TESTPOOL
+
+#
+# remove a dedup allocation vdev
+#
+log_must zpool remove $TESTPOOL $CLASS_DISK0
+
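+#
+# Wait for the asynchronous evacuation to settle before checking the
+# pool with zdb.
+#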
+sleep 5
+log_must sync_pool $TESTPOOL
+sleep 1
+
+log_must zdb -bbcc $TESTPOOL
+
+log_must zpool destroy -f "$TESTPOOL"
+
+log_pass $claim
--- /dev/null
+#!/bin/ksh -p
+
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright (c) 2017, Intel Corporation.
+# Copyright (c) 2018 by Delphix. All rights reserved.
+#
+
+. $STF_SUITE/include/libtest.shlib
+. $STF_SUITE/tests/functional/alloc_class/alloc_class.kshlib
+
+verify_runnable "global"
+
+pool_cleanup
+disk_cleanup
+
+log_pass
--- /dev/null
+#!/bin/ksh -p
+
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright (c) 2017, Intel Corporation.
+# Copyright (c) 2018 by Delphix. All rights reserved.
+#
+
+. $STF_SUITE/include/libtest.shlib
+. $STF_SUITE/tests/functional/alloc_class/alloc_class.kshlib
+
+verify_runnable "global"
+
+disk_cleanup
+
+log_pass
"feature@userobj_accounting"
"feature@encryption"
"feature@project_quota"
+ "feature@allocation_classes"
)
fi
# "l" - log
# "ll" - mirrored log
# "c" - cache
+# "sc" - special class
#
function pool_config # <vdev-type>
{
disks[l]="l1"
disks[ll]="l1 l2"
disks[c]="c1"
+ disks[sc]="sc1 sc2"
case $1 in
d|t) # single disk or stripe
vdev='' ;;
vdev='log mirror';;
c) # cache
vdev='cache';;
+ sc) # mirrored special class
+ vdev='special mirror';;
*)
log_fail "setup_pool: unsupported vdev type '$1'"
esac
# "good" and "bad" pool layouts
# first token is always used with "zpool create"
# second to last tokens, if any, are used with "zpool add"
-typeset -a goodconfs=("m" "m l" "m s" "m c" "m m" "m3" "m3 m3" "m m3 l s c")
+typeset -a goodconfs=("m" "m l" "m s" "m c" "m m" "m3" "m3 m3" "m m3 l s c" "m m sc")
typeset -a badconfs=("d" "z1" "z2" "z3" "m d" "m3 d" "m z1" "m z2" "m z3")
typeset FILEDEV_PREFIX="$TEST_BASE_DIR/filedev"
typeset altroot="$TESTDIR/altroot-$TESTPOOL2"