zio_t *mmp_zio_root; /* root of mmp write zios */
uint64_t mmp_kstat_id; /* unique id for next MMP write kstat */
int mmp_skip_error; /* reason for last skipped write */
+ vdev_t *mmp_last_leaf; /* last mmp write sent here */
+ uint64_t mmp_leaf_last_gen; /* spa_leaf_list_gen last seen by mmp */
} mmp_thread_t;
taskq_t *spa_prefetch_taskq; /* Taskq for prefetch threads */
uint64_t spa_multihost; /* multihost aware (mmp) */
mmp_thread_t spa_mmp; /* multihost mmp thread */
+ list_t spa_leaf_list; /* list of leaf vdevs */
+ uint64_t spa_leaf_list_gen; /* track leaf_list changes */
/*
* spa_refcount & spa_config_lock must be the last elements
hrtime_t vdev_mmp_pending; /* 0 if write finished */
uint64_t vdev_mmp_kstat_id; /* to find kstat entry */
uint64_t vdev_expansion_time; /* vdev's last expansion time */
+ list_node_t vdev_leaf_node; /* leaf vdev list */
/*
* For DTrace to work in userland (libzpool) context, these fields must
MMP_FAIL_WRITE_PENDING = (1 << 1),
} mmp_vdev_state_flag_t;
-static vdev_t *
-mmp_random_leaf_impl(vdev_t *vd, int *fail_mask)
-{
- int child_idx;
-
- if (vd->vdev_ops->vdev_op_leaf) {
- vdev_t *ret;
-
- if (!vdev_writeable(vd)) {
- *fail_mask |= MMP_FAIL_NOT_WRITABLE;
- ret = NULL;
- } else if (vd->vdev_mmp_pending != 0) {
- *fail_mask |= MMP_FAIL_WRITE_PENDING;
- ret = NULL;
- } else {
- ret = vd;
- }
-
- return (ret);
- }
-
- if (vd->vdev_children == 0)
- return (NULL);
-
- child_idx = spa_get_random(vd->vdev_children);
- for (int offset = vd->vdev_children; offset > 0; offset--) {
- vdev_t *leaf;
- vdev_t *child = vd->vdev_child[(child_idx + offset) %
- vd->vdev_children];
-
- leaf = mmp_random_leaf_impl(child, fail_mask);
- if (leaf)
- return (leaf);
- }
-
- return (NULL);
-}
-
/*
* Find a leaf vdev to write an MMP block to. It must not have an outstanding
* mmp write (if so a new write will also likely block). If there is no usable
- * leaf in the tree rooted at in_vd, a nonzero error value is returned, and
- * *out_vd is unchanged.
- *
- * The error value returned is a bit field.
- *
- * MMP_FAIL_WRITE_PENDING
- * If set, one or more leaf vdevs are writeable, but have an MMP write which has
- * not yet completed.
- *
- * MMP_FAIL_NOT_WRITABLE
- * If set, one or more vdevs are not writeable. The children of those vdevs
- * were not examined.
+ * leaf, a nonzero error value is returned. The error value returned is a bit
+ * field.
*
- * Assuming in_vd points to a tree, a random subtree will be chosen to start.
- * That subtree, and successive ones, will be walked until a usable leaf has
- * been found, or all subtrees have been examined (except that the children of
- * un-writeable vdevs are not examined).
- *
- * If the leaf vdevs in the tree are healthy, the distribution of returned leaf
- * vdevs will be even. If there are unhealthy leaves, the following leaves
- * (child_index % index_children) will be chosen more often.
+ * MMP_FAIL_WRITE_PENDING One or more leaf vdevs are writeable, but have an
+ * outstanding MMP write.
+ * MMP_FAIL_NOT_WRITABLE One or more leaf vdevs are not writeable.
+ *
+ * On success 0 is returned and the chosen leaf is stored in
+ * spa->spa_mmp.mmp_last_leaf; no vdev pointer is returned directly. When no
+ * leaf is usable, mmp_last_leaf is not advanced.
+ *
+ * Leaves are examined in spa_leaf_list order, beginning with the entry that
+ * follows mmp_last_leaf and wrapping from the tail back to the head, so the
+ * previously chosen leaf is the last candidate considered. If
+ * spa_leaf_list_gen differs from the generation recorded in
+ * mmp_leaf_last_gen, mmp_last_leaf is first reset to the head of the list and
+ * the new generation is recorded.
+ *
+ * The function asserts that the caller holds mmp_io_lock and SCL_STATE as
+ * reader, and that spa_leaf_list is initialized and not empty.
*/
static int
-mmp_random_leaf(vdev_t *in_vd, vdev_t **out_vd)
+mmp_next_leaf(spa_t *spa)
{
- int error_mask = 0;
- vdev_t *vd = mmp_random_leaf_impl(in_vd, &error_mask);
+ vdev_t *leaf;
+ vdev_t *starting_leaf;
+ int fail_mask = 0;
+
+ ASSERT(MUTEX_HELD(&spa->spa_mmp.mmp_io_lock));
+ ASSERT(spa_config_held(spa, SCL_STATE, RW_READER));
+ ASSERT(list_link_active(&spa->spa_leaf_list.list_head) == B_TRUE);
+ ASSERT(!list_is_empty(&spa->spa_leaf_list));
+
+ if (spa->spa_mmp.mmp_leaf_last_gen != spa->spa_leaf_list_gen) {
+ spa->spa_mmp.mmp_last_leaf = list_head(&spa->spa_leaf_list);
+ spa->spa_mmp.mmp_leaf_last_gen = spa->spa_leaf_list_gen;
+ }
+
+ leaf = spa->spa_mmp.mmp_last_leaf;
+ if (leaf == NULL)
+ leaf = list_head(&spa->spa_leaf_list);
+ starting_leaf = leaf;
- if (error_mask == 0)
- *out_vd = vd;
+ do {
+ leaf = list_next(&spa->spa_leaf_list, leaf);
+ if (leaf == NULL)
+ leaf = list_head(&spa->spa_leaf_list);
- return (error_mask);
+ if (!vdev_writeable(leaf)) {
+ fail_mask |= MMP_FAIL_NOT_WRITABLE;
+ } else if (leaf->vdev_mmp_pending != 0) {
+ fail_mask |= MMP_FAIL_WRITE_PENDING;
+ } else {
+ spa->spa_mmp.mmp_last_leaf = leaf;
+ return (0);
+ }
+ } while (leaf != starting_leaf);
+
+ ASSERT(fail_mask);
+
+ return (fail_mask);
}
/*
zfs_dbgmsg("SCL_STATE acquisition took %llu ns\n",
(u_longlong_t)lock_acquire_time);
- error = mmp_random_leaf(spa->spa_root_vdev, &vd);
-
mutex_enter(&mmp->mmp_io_lock);
+ error = mmp_next_leaf(spa);
+
/*
* spa_mmp_history has two types of entries:
* Issued MMP write: records time issued, error status, etc.
return;
}
+ vd = spa->spa_mmp.mmp_last_leaf;
mmp->mmp_skip_error = 0;
if (mmp->mmp_zio_root == NULL)
spa->spa_feat_refcount_cache[i] = SPA_FEATURE_DISABLED;
}
+ list_create(&spa->spa_leaf_list, sizeof (vdev_t),
+ offsetof(vdev_t, vdev_leaf_node));
+
return (spa);
}
sizeof (avl_tree_t));
list_destroy(&spa->spa_config_list);
+ list_destroy(&spa->spa_leaf_list);
nvlist_free(spa->spa_label_features);
nvlist_free(spa->spa_load_info);
*/
for (; pvd != NULL; pvd = pvd->vdev_parent)
pvd->vdev_guid_sum += cvd->vdev_guid_sum;
+
+ if (cvd->vdev_ops->vdev_op_leaf) {
+ list_insert_head(&cvd->vdev_spa->spa_leaf_list, cvd);
+ cvd->vdev_spa->spa_leaf_list_gen++;
+ }
}
void
pvd->vdev_children = 0;
}
+ if (cvd->vdev_ops->vdev_op_leaf) {
+ spa_t *spa = cvd->vdev_spa;
+ list_remove(&spa->spa_leaf_list, cvd);
+ spa->spa_leaf_list_gen++;
+ }
+
/*
* Walk up all ancestors to update guid sum.
*/
list_link_init(&vd->vdev_config_dirty_node);
list_link_init(&vd->vdev_state_dirty_node);
list_link_init(&vd->vdev_initialize_node);
+ list_link_init(&vd->vdev_leaf_node);
mutex_init(&vd->vdev_dtl_lock, NULL, MUTEX_NOLOCKDEP, NULL);
mutex_init(&vd->vdev_stat_lock, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&vd->vdev_probe_lock, NULL, MUTEX_DEFAULT, NULL);
vdev_remove_child(vd->vdev_parent, vd);
ASSERT(vd->vdev_parent == NULL);
+ ASSERT(!list_link_active(&vd->vdev_leaf_node));
/*
* Clean up vdev structure.
tests = ['mmp_on_thread', 'mmp_on_uberblocks', 'mmp_on_off', 'mmp_interval',
'mmp_active_import', 'mmp_inactive_import', 'mmp_exported_import',
'mmp_write_uberblocks', 'mmp_reset_interval', 'multihost_history',
- 'mmp_on_zdb']
+ 'mmp_on_zdb', 'mmp_write_distribution']
tags = ['functional', 'mmp']
[tests/functional/mount]
mmp_write_uberblocks.ksh \
mmp_reset_interval.ksh \
mmp_on_zdb.ksh \
+ mmp_write_distribution.ksh \
setup.ksh \
cleanup.ksh
--- /dev/null
+#!/bin/ksh -p
+#
+# CDDL HEADER START
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright (c) 2017 by Lawrence Livermore National Security, LLC.
+#
+
+# DESCRIPTION:
+# Verify MMP writes are distributed evenly among leaves
+#
+# STRATEGY:
+# 1. Create an asymmetric mirrored pool
+# 2. Enable multihost and multihost_history
+# 3. Delay for MMP writes to occur
+# 4. Verify the MMP writes are distributed evenly across leaf vdevs
+#
+
+. $STF_SUITE/include/libtest.shlib
+. $STF_SUITE/tests/functional/mmp/mmp.cfg
+. $STF_SUITE/tests/functional/mmp/mmp.kshlib
+
+verify_runnable "both"
+
+#
+# Undo everything the test may have set up. This runs via log_onexit, so it
+# can be entered after a failure at any step; each removal therefore has to
+# tolerate the object never having been created. Otherwise the first failing
+# log_must would end cleanup early and the hostid would be left set.
+#
+function cleanup
+{
+ # The pool exists only if Step 1 completed.
+ if zpool list $MMP_POOL >/dev/null 2>&1; then
+ log_must zpool destroy $MMP_POOL
+ fi
+ # -f: backing files and the history copy are absent after an early abort.
+ log_must rm -f $MMP_DIR/file.{0,1,2,3,4,5,6,7}
+ log_must rm -f $MMP_HISTORY_TMP
+ if [ -d $MMP_DIR ]; then
+ log_must rmdir $MMP_DIR
+ fi
+ log_must mmp_clear_hostid
+}
+
+log_assert "mmp writes are evenly distributed across leaf vdevs"
+log_onexit cleanup
+
+# Snapshot file for the kstat, and the kstat that lists issued MMP writes.
+MMP_HISTORY_TMP=$MMP_DIR/history
+MMP_HISTORY=/proc/spl/kstat/zfs/$MMP_POOL/multihost
+
+# Step 1
+# Two mirrors of different width (2 and 6 leaves) make the vdev tree
+# asymmetric.
+log_must mkdir -p $MMP_DIR
+log_must truncate -s 128M $MMP_DIR/file.{0,1,2,3,4,5,6,7}
+log_must zpool create -f $MMP_POOL mirror $MMP_DIR/file.{0,1} mirror $MMP_DIR/file.{2,3,4,5,6,7}
+
+# Step 2
+log_must mmp_set_hostid $HOSTID1
+log_must zpool set multihost=on $MMP_POOL
+# Presumably setting the history length to 0 first discards old entries
+# (verify against the tunable's documentation). Both calls are checked so
+# that a failure to enable history is reported here rather than as a
+# miscount in Step 4.
+log_must set_tunable64 zfs_multihost_history 0
+log_must set_tunable64 zfs_multihost_history 40
+
+# Step 3
+# default settings, every leaf written once/second
+sleep 4
+
+# Step 4
+# min_writes starts above any count that can be observed so the first leaf
+# always lowers it.
+typeset -i min_writes=999
+typeset -i max_writes=0
+typeset -i write_count
+# copy to get as close to a consistent view as possible
+cat $MMP_HISTORY > $MMP_HISTORY_TMP
+for x in $(seq 0 7); do
+ write_count=$(grep -c file.${x} $MMP_HISTORY_TMP)
+ if [ $write_count -lt $min_writes ]; then
+ min_writes=$write_count
+ fi
+ if [ $write_count -gt $max_writes ]; then
+ max_writes=$write_count
+ fi
+done
+log_note "mmp min_writes $min_writes max_writes $max_writes"
+
+# Every leaf must have been written at least once.
+if [ $min_writes -lt 1 ]; then
+ log_fail "mmp writes were not counted correctly"
+fi
+
+# The per-leaf counts may differ by at most one.
+if [ $((max_writes - min_writes)) -gt 1 ]; then
+ log_fail "mmp writes were not evenly distributed across leaf vdevs"
+fi
+
+log_pass "mmp writes were evenly distributed across leaf vdevs"