spa->spa_state = POOL_STATE_ACTIVE;
spa->spa_mode = mode;
- spa->spa_normal_class = metaslab_class_create();
- spa->spa_log_class = metaslab_class_create();
+ spa->spa_normal_class = metaslab_class_create(zfs_metaslab_ops);
+ spa->spa_log_class = metaslab_class_create(zfs_metaslab_ops);
- for (int t = 0; t < ZIO_TYPES; t++) {
+ for (t = 0; t < ZIO_TYPES; t++) {
+ const zio_taskq_info_t *ztip = &zio_taskqs[t];
- for (int q = 0; q < ZIO_TASKQ_TYPES; q++) {
+ for (q = 0; q < ZIO_TASKQ_TYPES; q++) {
- spa->spa_zio_taskq[t][q] = taskq_create("spa_zio",
- zio_taskq_threads[t][q], maxclsyspri, 50,
- INT_MAX, TASKQ_PREPOPULATE);
+ enum zti_modes mode = ztip->zti_nthreads[q].zti_mode;
+ uint_t value = ztip->zti_nthreads[q].zti_value;
+ char name[32];
+
+ (void) snprintf(name, sizeof (name),
+ "%s_%s", ztip->zti_name, zio_taskq_types[q]);
+
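+ /*
+ * zti_mode_tune defers to the zio_taskq_tune_mode/value
+ * tunables; if the tunable itself still says "tune", fall
+ * back to a percentage of online CPUs.
+ */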
+ if (mode == zti_mode_tune) {
+ mode = zio_taskq_tune_mode;
+ value = zio_taskq_tune_value;
+ if (mode == zti_mode_tune)
+ mode = zti_mode_online_percent;
+ }
+
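+ /*
+ * zti_mode_fixed creates the taskq with a fixed number of
+ * threads; zti_mode_online_percent passes the value as a
+ * percentage of online CPUs via TASKQ_THREADS_CPU_PCT.
+ */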
+ switch (mode) {
+ case zti_mode_fixed:
+ ASSERT3U(value, >=, 1);
+ value = MAX(value, 1);
+
+ spa->spa_zio_taskq[t][q] = taskq_create(name,
+ value, maxclsyspri, 50, INT_MAX,
+ TASKQ_PREPOPULATE);
+ break;
+
+ case zti_mode_online_percent:
+ spa->spa_zio_taskq[t][q] = taskq_create(name,
+ value, maxclsyspri, 50, INT_MAX,
+ TASKQ_PREPOPULATE | TASKQ_THREADS_CPU_PCT);
+ break;
+
+ case zti_mode_tune:
+ default:
+ panic("unrecognized mode for "
+ "zio_taskqs[%u]->zti_nthreads[%u] (%u:%u) "
+ "in spa_activate()",
+ t, q, mode, value);
+ break;
+ }
}
}
uint_t id, int atype)
{
nvlist_t **child;
- uint_t c, children;
+ uint_t children;
int error;
++ int c;
if ((error = vdev_alloc(spa, vdp, nv, parent, id, atype)) != 0)
return (error);
}
}
- for (int c = 0; c < children; c++) {
+ for (c = 0; c < children; c++) {
+ /*
+ * Load the slog device state from the config object since it's possible
+ * that the label does not contain the most up-to-date information.
+ */
+ void
+ spa_load_log_state(spa_t *spa)
+ {
+ nvlist_t *nv, *nvroot, **child;
+ uint64_t is_log;
+ uint_t children;
+ vdev_t *rvd = spa->spa_root_vdev;
++ int c;
+
+ VERIFY(load_nvlist(spa, spa->spa_config_object, &nv) == 0);
+ VERIFY(nvlist_lookup_nvlist(nv, ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0);
+ VERIFY(nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN,
+ &child, &children) == 0);
+
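+ /*
+ * Hand the saved state to each top-level vdev that the config
+ * marks as a log device.
+ */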
++ for (c = 0; c < children; c++) {
+ vdev_t *tvd = rvd->vdev_child[c];
+
+ if (nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_LOG,
+ &is_log) == 0 && is_log)
+ vdev_load_log_state(tvd, child[c]);
+ }
+ nvlist_free(nv);
+ }
+
/*
* Check for missing log devices
*/
nvlist_t **spares, **l2cache;
uint_t nspares, nl2cache;
uint64_t version;
++ int c;
/*
* If this pool already exists, return failure.
(error = vdev_create(rvd, txg, B_FALSE)) == 0 &&
(error = spa_validate_aux(spa, nvroot, txg,
VDEV_ALLOC_ADD)) == 0) {
- for (c = 0; c < rvd->vdev_children; c++)
- vdev_init(rvd->vdev_child[c], txg);
- vdev_config_dirty(rvd);
- for (int c = 0; c < rvd->vdev_children; c++) {
++ for (c = 0; c < rvd->vdev_children; c++) {
+ vdev_metaslab_set_size(rvd->vdev_child[c]);
+ vdev_expand(rvd->vdev_child[c], txg);
+ }
}
spa_config_exit(spa, SCL_ALL, FTAG);
return (0);
}
+ #ifdef _KERNEL
/*
- * Import the given pool into the system. We set up the necessary spa_t and
- * then call spa_load() to do the dirty work.
+ * Get the root pool information from the root disk, then import the root pool
+ * at system boot time.
*/
- static int
- spa_import_common(const char *pool, nvlist_t *config, nvlist_t *props,
- boolean_t isroot, boolean_t allowfaulted)
+ extern int vdev_disk_read_rootlabel(char *, char *, nvlist_t **);
+
+ static nvlist_t *
+ spa_generate_rootconf(char *devpath, char *devid, uint64_t *guid)
+ {
+ nvlist_t *config;
+ nvlist_t *nvtop, *nvroot;
+ uint64_t pgid;
+
+ if (vdev_disk_read_rootlabel(devpath, devid, &config) != 0)
+ return (NULL);
+
+ /*
+ * Add this top-level vdev to the child array.
+ */
+ VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
+ &nvtop) == 0);
+ VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID,
+ &pgid) == 0);
+ VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_GUID, guid) == 0);
+
+ /*
+ * Put this pool's top-level vdevs into a root vdev.
+ */
+ VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0);
+ VERIFY(nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE,
+ VDEV_TYPE_ROOT) == 0);
+ VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_ID, 0ULL) == 0);
+ VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_GUID, pgid) == 0);
+ VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN,
+ &nvtop, 1) == 0);
+
+ /*
+ * Replace the existing vdev_tree with the new root vdev in
+ * this pool's configuration (remove the old, add the new).
+ */
+ VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, nvroot) == 0);
+ nvlist_free(nvroot);
+ return (config);
+ }
+
+ /*
+ * Walk the vdev tree and see if we can find a device with "better"
+ * configuration. A configuration is "better" if the label on that
+ * device has a more recent txg.
+ */
+ static void
+ spa_alt_rootvdev(vdev_t *vd, vdev_t **avd, uint64_t *txg)
+ {
- for (int c = 0; c < vd->vdev_children; c++)
++ int c;
++
++ for (c = 0; c < vd->vdev_children; c++)
+ spa_alt_rootvdev(vd->vdev_child[c], avd, txg);
+
+ if (vd->vdev_ops->vdev_op_leaf) {
+ nvlist_t *label;
+ uint64_t label_txg;
+
+ if (vdev_disk_read_rootlabel(vd->vdev_physpath, vd->vdev_devid,
+ &label) != 0)
+ return;
+
+ VERIFY(nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_TXG,
+ &label_txg) == 0);
+
+ /*
+ * Do we have a better boot device?
+ */
+ if (label_txg > *txg) {
+ *txg = label_txg;
+ *avd = vd;
+ }
+ nvlist_free(label);
+ }
+ }
+
+ /*
+ * Import a root pool.
+ *
+ * For x86, devpath_list will consist of the devid and/or physpath name of
+ * the vdev (e.g. "id1,sd@SSEAGATE..." or "/pci@1f,0/ide@d/disk@0,0:a").
+ * The GRUB "findroot" command will return the vdev we should boot.
+ *
+ * For SPARC, devpath_list consists of the physpath name of the booting
+ * device, whether the root pool is a single-device pool or a mirrored pool,
+ * e.g.
+ *     "/pci@1f,0/ide@d/disk@0,0:a"
+ */
+ int
+ spa_import_rootpool(char *devpath, char *devid)
+ {
+ spa_t *spa;
+ vdev_t *rvd, *bvd, *avd = NULL;
+ nvlist_t *config, *nvtop;
+ uint64_t guid, txg;
+ char *pname;
+ int error;
+
+ /*
+ * Read the label from the boot device and generate a configuration.
+ */
+ if ((config = spa_generate_rootconf(devpath, devid, &guid)) == NULL) {
+ cmn_err(CE_NOTE, "Can not read the pool label from '%s'",
+ devpath);
+ return (EIO);
+ }
+
+ VERIFY(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME,
+ &pname) == 0);
+ VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, &txg) == 0);
+
+ mutex_enter(&spa_namespace_lock);
+ if ((spa = spa_lookup(pname)) != NULL) {
+ /*
+ * Remove the existing root pool from the namespace so that we
+ * can replace it with the correct config we just read in.
+ */
+ spa_remove(spa);
+ }
+
+ spa = spa_add(pname, NULL);
+ spa->spa_is_root = B_TRUE;
+
+ /*
+ * Build up a vdev tree based on the boot device's label config.
+ */
+ VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
+ &nvtop) == 0);
+ spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
+ error = spa_config_parse(spa, &rvd, nvtop, NULL, 0,
+ VDEV_ALLOC_ROOTPOOL);
+ spa_config_exit(spa, SCL_ALL, FTAG);
+ if (error) {
+ mutex_exit(&spa_namespace_lock);
+ nvlist_free(config);
+ cmn_err(CE_NOTE, "Can not parse the config for pool '%s'",
+ pname);
+ return (error);
+ }
+
+ /*
+ * Get the boot vdev.
+ */
+ if ((bvd = vdev_lookup_by_guid(rvd, guid)) == NULL) {
+ cmn_err(CE_NOTE, "Can not find the boot vdev for guid %llu",
+ (u_longlong_t)guid);
+ error = ENOENT;
+ goto out;
+ }
+
+ /*
+ * Determine if there is a better boot device.
+ */
+ avd = bvd;
+ spa_alt_rootvdev(rvd, &avd, &txg);
+ if (avd != bvd) {
+ cmn_err(CE_NOTE, "The boot device is 'degraded'. Please "
+ "try booting from '%s'", avd->vdev_path);
+ error = EINVAL;
+ goto out;
+ }
+
+ /*
+ * If the boot device is part of a spare vdev, ensure that
+ * we're booting off the active spare.
+ */
+ if (bvd->vdev_parent->vdev_ops == &vdev_spare_ops &&
+ !bvd->vdev_isspare) {
+ cmn_err(CE_NOTE, "The boot device is currently spared. Please "
+ "try booting from '%s'",
+ bvd->vdev_parent->vdev_child[1]->vdev_path);
+ error = EINVAL;
+ goto out;
+ }
+
+ VERIFY(nvlist_dup(config, &spa->spa_config, 0) == 0);
+ error = 0;
+ out:
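+ /*
+ * Free the temporary vdev tree; it was only needed to locate
+ * and validate the boot device.
+ */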
+ spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
+ vdev_free(rvd);
+ spa_config_exit(spa, SCL_ALL, FTAG);
+ mutex_exit(&spa_namespace_lock);
+
+ nvlist_free(config);
+ return (error);
+ }
+
+ #endif
+
+ /*
+ * Take a pool and insert it into the namespace as if it had been loaded at
+ * boot.
+ */
+ int
+ spa_import_verbatim(const char *pool, nvlist_t *config, nvlist_t *props)
{
spa_t *spa;
char *altroot = NULL;
spa_async_probe(spa, vd->vdev_child[c]);
}
+ static void
+ spa_async_autoexpand(spa_t *spa, vdev_t *vd)
+ {
+ sysevent_id_t eid;
+ nvlist_t *attr;
+ char *physpath;
++ int c;
+
+ if (!spa->spa_autoexpand)
+ return;
+
- for (int c = 0; c < vd->vdev_children; c++) {
++ for (c = 0; c < vd->vdev_children; c++) {
+ vdev_t *cvd = vd->vdev_child[c];
+ spa_async_autoexpand(spa, cvd);
+ }
+
+ if (!vd->vdev_ops->vdev_op_leaf || vd->vdev_physpath == NULL)
+ return;
+
+ physpath = kmem_zalloc(MAXPATHLEN, KM_SLEEP);
+ (void) snprintf(physpath, MAXPATHLEN, "/devices%s", vd->vdev_physpath);
+
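+ /*
+ * Post an ESC_DEV_DLE (dynamic LUN expansion) sysevent carrying
+ * the device's physical path so consumers can react to the size
+ * change.
+ */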
+ VERIFY(nvlist_alloc(&attr, NV_UNIQUE_NAME, KM_SLEEP) == 0);
+ VERIFY(nvlist_add_string(attr, DEV_PHYS_PATH, physpath) == 0);
+
+ (void) ddi_log_sysevent(zfs_dip, SUNW_VENDOR, EC_DEV_STATUS,
+ ESC_DEV_DLE, attr, &eid, DDI_SLEEP);
+
+ nvlist_free(attr);
+ kmem_free(physpath, MAXPATHLEN);
+ }
+
static void
spa_async_thread(spa_t *spa)
{
dmu_tx_t *tx;
int dirty_vdevs;
int error;
++ int c;
/*
* Lock out configuration changes.
int svdcount = 0;
int children = rvd->vdev_children;
int c0 = spa_get_random(children);
- int c;
- for (int c = 0; c < children; c++) {
+ for (c = 0; c < children; c++) {
vd = rvd->vdev_child[(c0 + c) % children];
if (vd->vdev_ms_array == 0 || vd->vdev_islog)
continue;
{
uint64_t asize = P2ROUNDUP(psize, 1ULL << vd->vdev_top->vdev_ashift);
uint64_t csize;
- uint64_t c;
++ int c;
- for (int c = 0; c < vd->vdev_children; c++) {
+ for (c = 0; c < vd->vdev_children; c++) {
csize = vdev_psize_to_asize(vd->vdev_child[c], psize);
asize = MAX(asize, csize);
}
}
/*
- * Get the replaceable or attachable device size.
- * If the parent is a mirror or raidz, the replaceable size is the minimum
- * psize of all its children. For the rest, just return our own psize.
- *
- * e.g.
- * psize rsize
- * root - -
- * mirror/raidz - -
- * disk1 20g 20g
- * disk2 40g 20g
- * disk3 80g 80g
+ * Get the minimum allocatable size. We define the allocatable size as
+ * the vdev's asize rounded to the nearest metaslab. This allows us to
+ * replace or attach devices which don't have the same physical size but
+ * can still satisfy the same number of allocations.
*/
uint64_t
- vdev_get_rsize(vdev_t *vd)
+ vdev_get_min_asize(vdev_t *vd)
{
- vdev_t *pvd, *cvd;
- uint64_t c, rsize;
+ vdev_t *pvd = vd->vdev_parent;
- pvd = vd->vdev_parent;
+ /*
+ * If our parent is NULL (inactive spare or cache) or is the root,
+ * just return our own asize.
+ */
+ if (pvd == NULL)
+ return (vd->vdev_asize);
/*
- * If our parent is NULL or the root, just return our own psize.
+ * The top-level vdev just returns the allocatable size rounded
+ * to the nearest metaslab.
*/
- if (pvd == NULL || pvd->vdev_parent == NULL)
- return (vd->vdev_psize);
+ if (vd == vd->vdev_top)
+ return (P2ALIGN(vd->vdev_asize, 1ULL << vd->vdev_ms_shift));
- rsize = 0;
+ /*
+ * The allocatable space for a raidz vdev is N * sizeof(smallest child),
+ * so each child must provide at least 1/Nth of the raidz vdev's min_asize.
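+ * (e.g. each child of a 4-child raidz with a 400G min_asize must
+ * supply at least 100G)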
+ */
+ if (pvd->vdev_ops == &vdev_raidz_ops)
+ return (pvd->vdev_min_asize / pvd->vdev_children);
- for (c = 0; c < pvd->vdev_children; c++) {
- cvd = pvd->vdev_child[c];
- rsize = MIN(rsize - 1, cvd->vdev_psize - 1) + 1;
- }
+ return (pvd->vdev_min_asize);
+ }
- return (rsize);
+ void
+ vdev_set_min_asize(vdev_t *vd)
+ {
++ int c;
+
+ vd->vdev_min_asize = vdev_get_min_asize(vd);
+
- for (int c = 0; c < vd->vdev_children; c++)
++ for (c = 0; c < vd->vdev_children; c++)
+ vdev_set_min_asize(vd->vdev_child[c]);
}
vdev_t *
vdev_t *
vdev_lookup_by_guid(vdev_t *vd, uint64_t guid)
{
- int c;
vdev_t *mvd;
++ int c;
if (vd->vdev_guid == guid)
return (vd);
{
vdev_t **newchild, *cvd;
int oldc = pvd->vdev_children;
- int newc, c;
+ int newc;
++ int c;
ASSERT(spa_config_held(pvd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL);
return (NULL);
}
- for (int l = 1; l < VDEV_LABELS; l++) {
+ for (l = 1; l < VDEV_LABELS; l++) {
zio_nowait(zio_read_phys(pio, vd,
vdev_label_offset(vd->vdev_psize, l,
- offsetof(vdev_label_t, vl_pad)),
- VDEV_SKIP_SIZE, zio_buf_alloc(VDEV_SKIP_SIZE),
+ offsetof(vdev_label_t, vl_pad2)),
+ VDEV_PAD_SIZE, zio_buf_alloc(VDEV_PAD_SIZE),
ZIO_CHECKSUM_OFF, vdev_probe_done, vps,
ZIO_PRIORITY_SYNC_READ, vps->vps_flags, B_TRUE));
}
uint64_t osize = 0;
uint64_t asize, psize;
uint64_t ashift = 0;
++ int c;
ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);
vd->vdev_state = VDEV_STATE_HEALTHY;
}
- for (c = 0; c < vd->vdev_children; c++)
- for (int c = 0; c < vd->vdev_children; c++) {
++ for (c = 0; c < vd->vdev_children; c++) {
if (vd->vdev_child[c]->vdev_state != VDEV_STATE_HEALTHY) {
vdev_set_state(vd, B_TRUE, VDEV_STATE_DEGRADED,
VDEV_AUX_NONE);
nvlist_t *label;
uint64_t guid, top_guid;
uint64_t state;
++ int c;
- for (int c = 0; c < vd->vdev_children; c++)
+ for (c = 0; c < vd->vdev_children; c++)
if (vdev_validate(vd->vdev_child[c]) != 0)
return (EBADF);
void
vdev_scrub_stat_update(vdev_t *vd, pool_scrub_type_t type, boolean_t complete)
{
- int c;
vdev_stat_t *vs = &vd->vdev_stat;
++ int c;
- for (int c = 0; c < vd->vdev_children; c++)
+ for (c = 0; c < vd->vdev_children; c++)
vdev_scrub_stat_update(vd->vdev_child[c], type, complete);
mutex_enter(&vd->vdev_stat_lock);
vdev_t *rvd = spa->spa_root_vdev;
int degraded = 0, faulted = 0;
int corrupted = 0;
- int c;
vdev_t *child;
++ int c;
if (vd->vdev_children > 0) {
- for (int c = 0; c < vd->vdev_children; c++) {
+ for (c = 0; c < vd->vdev_children; c++) {
child = vd->vdev_child[c];
if (!vdev_readable(child) ||
}
return (B_TRUE);
}
- for (int c = 0; c < children; c++)
+
+ void
+ vdev_load_log_state(vdev_t *vd, nvlist_t *nv)
+ {
+ uint_t children;
+ nvlist_t **child;
+ uint64_t val;
+ spa_t *spa = vd->vdev_spa;
++ int c;
+
+ if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
+ &child, &children) == 0) {
++ for (c = 0; c < children; c++)
+ vdev_load_log_state(vd->vdev_child[c], child[c]);
+ }
+
+ if (vd->vdev_ops->vdev_op_leaf && nvlist_lookup_uint64(nv,
+ ZPOOL_CONFIG_OFFLINE, &val) == 0 && val) {
+
+ /*
+ * It would be nice to call vdev_offline()
+ * directly but the pool isn't fully loaded and
+ * the txg threads have not been started yet.
+ */
+ spa_config_enter(spa, SCL_STATE_ALL, FTAG, RW_WRITER);
+ vd->vdev_offline = val;
+ vdev_reopen(vd->vdev_top);
+ spa_config_exit(spa, SCL_STATE_ALL, FTAG);
+ }
+ }
+
+ /*
+ * Expand a vdev if possible.
+ */
+ void
+ vdev_expand(vdev_t *vd, uint64_t txg)
+ {
+ ASSERT(vd->vdev_top == vd);
+ ASSERT(spa_config_held(vd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL);
+
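+ /*
+ * There is nothing to do until the device has grown by at least
+ * one whole metaslab; vdev_metaslab_init() then creates metaslabs
+ * for the newly usable space.
+ */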
+ if ((vd->vdev_asize >> vd->vdev_ms_shift) > vd->vdev_ms_count) {
+ VERIFY(vdev_metaslab_init(vd, txg) == 0);
+ vdev_config_dirty(vd);
+ }
+ }
nvlist_t *config = NULL;
vdev_phys_t *vp;
zio_t *zio;
- int l, flags =
- ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE;
+ int flags = ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL |
+ ZIO_FLAG_SPECULATIVE;
++ int l;
ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);
vp = zio_buf_alloc(sizeof (vdev_phys_t));
- for (int l = 0; l < VDEV_LABELS; l++) {
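+ /*
+ * A failed first pass comes back to the retry label with
+ * ZIO_FLAG_TRYHARD set to disable fail-fast on the device;
+ * the goto itself is outside this hunk.
+ */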
+ retry:
+ for (l = 0; l < VDEV_LABELS; l++) {
zio = zio_root(spa, NULL, NULL, flags);
/*
* Write everything in parallel.
*/
+ retry:
zio = zio_root(spa, NULL, NULL, flags);
- for (int l = 0; l < VDEV_LABELS; l++) {
+ for (l = 0; l < VDEV_LABELS; l++) {
vdev_label_write(zio, vd, l, vp,
offsetof(vdev_label_t, vl_vdev_phys),
sizeof (vdev_phys_t), NULL, NULL, flags);
- vdev_label_write(zio, vd, l, vb,
- offsetof(vdev_label_t, vl_boot_header),
- sizeof (vdev_boot_header_t), NULL, NULL, flags);
+ /*
+ * Skip the 1st padding area.
+ * Zero out the 2nd padding area, which might contain
+ * leftover data from a previous filesystem format.
+ */
+ vdev_label_write(zio, vd, l, pad2,
+ offsetof(vdev_label_t, vl_pad2),
+ VDEV_PAD_SIZE, NULL, NULL, flags);
- for (int n = 0; n < VDEV_UBERBLOCK_COUNT(vd); n++) {
+ for (n = 0; n < VDEV_UBERBLOCK_COUNT(vd); n++) {
vdev_label_write(zio, vd, l, ub,
VDEV_UBERBLOCK_OFFSET(vd, n),
VDEV_UBERBLOCK_SIZE(vd), NULL, NULL, flags);
{
spa_t *spa = vd->vdev_spa;
vdev_t *rvd = spa->spa_root_vdev;
- int flags =
- ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE;
+ int flags = ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL |
+ ZIO_FLAG_SPECULATIVE | ZIO_FLAG_TRYHARD;
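+ /*
+ * ZIO_FLAG_TRYHARD keeps these label writes from using fail-fast
+ * I/O, so transient device errors are retried rather than
+ * failing the config sync immediately.
+ */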
+ int c, l, n;
if (vd == rvd) {
ASSERT(zio == NULL);