diff --git a/module/zfs/vdev_label.c b/module/zfs/vdev_label.c
index 7f588ed6b..29d7d651b 100644
--- a/module/zfs/vdev_label.c
+++ b/module/zfs/vdev_label.c
@@ -21,7 +21,7 @@
 
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2013 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
  */
 
 /*
@@ -33,15 +33,15 @@
  * 1. Uniquely identify this device as part of a ZFS pool and confirm its
  *    identity within the pool.
  *
- * 2. Verify that all the devices given in a configuration are present 
+ * 2. Verify that all the devices given in a configuration are present
  *    within the pool.
  *
- * 3. Determine the uberblock for the pool. 
+ * 3. Determine the uberblock for the pool.
  *
- * 4. In case of an import operation, determine the configuration of the 
+ * 4. In case of an import operation, determine the configuration of the
  *    toplevel vdev of which it is a part.
  *
- * 5. If an import operation cannot find all the devices in the pool, 
+ * 5. If an import operation cannot find all the devices in the pool,
  *    provide enough information to the administrator to determine which
  *    devices are missing.
  *
@@ -77,9 +77,9 @@
  * In order to identify which labels are valid, the labels are written in the
  * following manner:
  *
- * 1. For each vdev, update 'L1' to the new label 
- * 2. Update the uberblock 
- * 3. For each vdev, update 'L2' to the new label 
+ * 1. For each vdev, update 'L1' to the new label
+ * 2. Update the uberblock
+ * 3. For each vdev, update 'L2' to the new label
  *
  * Given arbitrary failure, we can determine the correct label to use based on
  * the transaction group. If we fail after updating L1 but before updating the
@@ -117,19 +117,19 @@
  *
  * The nvlist describing the pool and vdev contains the following elements:
  *
- * 	version		ZFS on-disk version
- * 	name		Pool name
- * 	state		Pool state
- * 	txg		Transaction group in which this label was written
- * 	pool_guid	Unique identifier for this pool
- * 	vdev_tree	An nvlist describing vdev tree.
+ *	version		ZFS on-disk version
+ *	name		Pool name
+ *	state		Pool state
+ *	txg		Transaction group in which this label was written
+ *	pool_guid	Unique identifier for this pool
+ *	vdev_tree	An nvlist describing vdev tree.
  * features_for_read
  *	An nvlist of the features necessary for reading the MOS.
  *
  * Each leaf device label also contains the following:
  *
- * 	top_guid	Unique ID for top-level vdev in which this is contained
- * 	guid		Unique ID for the leaf vdev
+ *	top_guid	Unique ID for top-level vdev in which this is contained
+ *	guid		Unique ID for the leaf vdev
  *
  * The 'vs' configuration follows the format described in 'spa_config.c'.
 */
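[Editor's note] The header comment above describes how the correct label is chosen after an interrupted update: any label copy whose transaction group is newer than the uberblock's is ignored, and the newest remaining copy wins. A minimal, self-contained C sketch of that selection rule follows. It is not part of the patch; the txg values and the fixed array are hypothetical stand-ins for labels read from disk, and the comparison mirrors the "label_txg <= txg && label_txg > best_txg" test that vdev_label_read_config() applies later in this diff.

#include <stdio.h>
#include <stdint.h>

#define	VDEV_LABELS	4

int
main(void)
{
	/*
	 * Hypothetical crash: the first two label copies were rewritten
	 * for txg 1000, but the uberblock (txg 999) was not yet updated,
	 * so the txg-999 copies are still the ones to trust.
	 */
	uint64_t ub_txg = 999;
	uint64_t label_txg[VDEV_LABELS] = { 1000, 1000, 999, 999 };
	uint64_t best_txg = 0;
	int best = -1;

	for (int l = 0; l < VDEV_LABELS; l++) {
		/* Ignore copies newer than the uberblock; keep the newest rest. */
		if (label_txg[l] <= ub_txg && label_txg[l] > best_txg) {
			best_txg = label_txg[l];
			best = l;
		}
	}

	printf("using label %d (txg %llu)\n", best,
	    (unsigned long long)best_txg);
	return (0);
}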
@@ -143,8 +143,10 @@
 #include <sys/vdev_impl.h>
 #include <sys/uberblock_impl.h>
 #include <sys/metaslab.h>
+#include <sys/metaslab_impl.h>
 #include <sys/zio.h>
 #include <sys/dsl_scan.h>
+#include <sys/abd.h>
 #include <sys/fs/zfs.h>
 
 /*
@@ -178,11 +180,12 @@ vdev_label_number(uint64_t psize, uint64_t offset)
 }
 
 static void
-vdev_label_read(zio_t *zio, vdev_t *vd, int l, void *buf, uint64_t offset,
-	uint64_t size, zio_done_func_t *done, void *private, int flags)
+vdev_label_read(zio_t *zio, vdev_t *vd, int l, abd_t *buf, uint64_t offset,
+    uint64_t size, zio_done_func_t *done, void *private, int flags)
 {
-	ASSERT(spa_config_held(zio->io_spa, SCL_STATE_ALL, RW_WRITER) ==
-	    SCL_STATE_ALL);
+	ASSERT(
+	    spa_config_held(zio->io_spa, SCL_STATE, RW_READER) == SCL_STATE ||
+	    spa_config_held(zio->io_spa, SCL_STATE, RW_WRITER) == SCL_STATE);
 	ASSERT(flags & ZIO_FLAG_CONFIG_WRITER);
 
 	zio_nowait(zio_read_phys(zio, vd,
@@ -191,14 +194,13 @@ vdev_label_read(zio_t *zio, vdev_t *vd, int l, void *buf, uint64_t offset,
 	    ZIO_PRIORITY_SYNC_READ, flags, B_TRUE));
 }
 
-static void
-vdev_label_write(zio_t *zio, vdev_t *vd, int l, void *buf, uint64_t offset,
-	uint64_t size, zio_done_func_t *done, void *private, int flags)
+void
+vdev_label_write(zio_t *zio, vdev_t *vd, int l, abd_t *buf, uint64_t offset,
+    uint64_t size, zio_done_func_t *done, void *private, int flags)
 {
-	ASSERT(spa_config_held(zio->io_spa, SCL_ALL, RW_WRITER) == SCL_ALL ||
-	    (spa_config_held(zio->io_spa, SCL_CONFIG | SCL_STATE, RW_READER) ==
-	    (SCL_CONFIG | SCL_STATE) &&
-	    dsl_pool_sync_context(spa_get_dsl(zio->io_spa))));
+	ASSERT(
+	    spa_config_held(zio->io_spa, SCL_STATE, RW_READER) == SCL_STATE ||
+	    spa_config_held(zio->io_spa, SCL_STATE, RW_WRITER) == SCL_STATE);
 	ASSERT(flags & ZIO_FLAG_CONFIG_WRITER);
 
 	zio_nowait(zio_write_phys(zio, vd,
@@ -207,6 +209,180 @@ vdev_label_write(zio_t *zio, vdev_t *vd, int l, void *buf, uint64_t offset,
 	    ZIO_PRIORITY_SYNC_WRITE, flags, B_TRUE));
 }
 
+/*
+ * Generate the nvlist representing this vdev's stats
+ */
+void
+vdev_config_generate_stats(vdev_t *vd, nvlist_t *nv)
+{
+	nvlist_t *nvx;
+	vdev_stat_t *vs;
+	vdev_stat_ex_t *vsx;
+
+	vs = kmem_alloc(sizeof (*vs), KM_SLEEP);
+	vsx = kmem_alloc(sizeof (*vsx), KM_SLEEP);
+
+	vdev_get_stats_ex(vd, vs, vsx);
+	fnvlist_add_uint64_array(nv, ZPOOL_CONFIG_VDEV_STATS,
+	    (uint64_t *)vs, sizeof (*vs) / sizeof (uint64_t));
+
+	kmem_free(vs, sizeof (*vs));
+
+	/*
+	 * Add extended stats into a special extended stats nvlist. This keeps
+	 * all the extended stats nicely grouped together. The extended stats
+	 * nvlist is then added to the main nvlist.
+ */ + nvx = fnvlist_alloc(); + + /* ZIOs in flight to disk */ + fnvlist_add_uint64(nvx, ZPOOL_CONFIG_VDEV_SYNC_R_ACTIVE_QUEUE, + vsx->vsx_active_queue[ZIO_PRIORITY_SYNC_READ]); + + fnvlist_add_uint64(nvx, ZPOOL_CONFIG_VDEV_SYNC_W_ACTIVE_QUEUE, + vsx->vsx_active_queue[ZIO_PRIORITY_SYNC_WRITE]); + + fnvlist_add_uint64(nvx, ZPOOL_CONFIG_VDEV_ASYNC_R_ACTIVE_QUEUE, + vsx->vsx_active_queue[ZIO_PRIORITY_ASYNC_READ]); + + fnvlist_add_uint64(nvx, ZPOOL_CONFIG_VDEV_ASYNC_W_ACTIVE_QUEUE, + vsx->vsx_active_queue[ZIO_PRIORITY_ASYNC_WRITE]); + + fnvlist_add_uint64(nvx, ZPOOL_CONFIG_VDEV_SCRUB_ACTIVE_QUEUE, + vsx->vsx_active_queue[ZIO_PRIORITY_SCRUB]); + + /* ZIOs pending */ + fnvlist_add_uint64(nvx, ZPOOL_CONFIG_VDEV_SYNC_R_PEND_QUEUE, + vsx->vsx_pend_queue[ZIO_PRIORITY_SYNC_READ]); + + fnvlist_add_uint64(nvx, ZPOOL_CONFIG_VDEV_SYNC_W_PEND_QUEUE, + vsx->vsx_pend_queue[ZIO_PRIORITY_SYNC_WRITE]); + + fnvlist_add_uint64(nvx, ZPOOL_CONFIG_VDEV_ASYNC_R_PEND_QUEUE, + vsx->vsx_pend_queue[ZIO_PRIORITY_ASYNC_READ]); + + fnvlist_add_uint64(nvx, ZPOOL_CONFIG_VDEV_ASYNC_W_PEND_QUEUE, + vsx->vsx_pend_queue[ZIO_PRIORITY_ASYNC_WRITE]); + + fnvlist_add_uint64(nvx, ZPOOL_CONFIG_VDEV_SCRUB_PEND_QUEUE, + vsx->vsx_pend_queue[ZIO_PRIORITY_SCRUB]); + + /* Histograms */ + fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_TOT_R_LAT_HISTO, + vsx->vsx_total_histo[ZIO_TYPE_READ], + ARRAY_SIZE(vsx->vsx_total_histo[ZIO_TYPE_READ])); + + fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_TOT_W_LAT_HISTO, + vsx->vsx_total_histo[ZIO_TYPE_WRITE], + ARRAY_SIZE(vsx->vsx_total_histo[ZIO_TYPE_WRITE])); + + fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_DISK_R_LAT_HISTO, + vsx->vsx_disk_histo[ZIO_TYPE_READ], + ARRAY_SIZE(vsx->vsx_disk_histo[ZIO_TYPE_READ])); + + fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_DISK_W_LAT_HISTO, + vsx->vsx_disk_histo[ZIO_TYPE_WRITE], + ARRAY_SIZE(vsx->vsx_disk_histo[ZIO_TYPE_WRITE])); + + fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_SYNC_R_LAT_HISTO, + vsx->vsx_queue_histo[ZIO_PRIORITY_SYNC_READ], + ARRAY_SIZE(vsx->vsx_queue_histo[ZIO_PRIORITY_SYNC_READ])); + + fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_SYNC_W_LAT_HISTO, + vsx->vsx_queue_histo[ZIO_PRIORITY_SYNC_WRITE], + ARRAY_SIZE(vsx->vsx_queue_histo[ZIO_PRIORITY_SYNC_WRITE])); + + fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_ASYNC_R_LAT_HISTO, + vsx->vsx_queue_histo[ZIO_PRIORITY_ASYNC_READ], + ARRAY_SIZE(vsx->vsx_queue_histo[ZIO_PRIORITY_ASYNC_READ])); + + fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_ASYNC_W_LAT_HISTO, + vsx->vsx_queue_histo[ZIO_PRIORITY_ASYNC_WRITE], + ARRAY_SIZE(vsx->vsx_queue_histo[ZIO_PRIORITY_ASYNC_WRITE])); + + fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_SCRUB_LAT_HISTO, + vsx->vsx_queue_histo[ZIO_PRIORITY_SCRUB], + ARRAY_SIZE(vsx->vsx_queue_histo[ZIO_PRIORITY_SCRUB])); + + /* Request sizes */ + fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_SYNC_IND_R_HISTO, + vsx->vsx_ind_histo[ZIO_PRIORITY_SYNC_READ], + ARRAY_SIZE(vsx->vsx_ind_histo[ZIO_PRIORITY_SYNC_READ])); + + fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_SYNC_IND_W_HISTO, + vsx->vsx_ind_histo[ZIO_PRIORITY_SYNC_WRITE], + ARRAY_SIZE(vsx->vsx_ind_histo[ZIO_PRIORITY_SYNC_WRITE])); + + fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_ASYNC_IND_R_HISTO, + vsx->vsx_ind_histo[ZIO_PRIORITY_ASYNC_READ], + ARRAY_SIZE(vsx->vsx_ind_histo[ZIO_PRIORITY_ASYNC_READ])); + + fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_ASYNC_IND_W_HISTO, + vsx->vsx_ind_histo[ZIO_PRIORITY_ASYNC_WRITE], + ARRAY_SIZE(vsx->vsx_ind_histo[ZIO_PRIORITY_ASYNC_WRITE])); + + 
fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_IND_SCRUB_HISTO, + vsx->vsx_ind_histo[ZIO_PRIORITY_SCRUB], + ARRAY_SIZE(vsx->vsx_ind_histo[ZIO_PRIORITY_SCRUB])); + + fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_SYNC_AGG_R_HISTO, + vsx->vsx_agg_histo[ZIO_PRIORITY_SYNC_READ], + ARRAY_SIZE(vsx->vsx_agg_histo[ZIO_PRIORITY_SYNC_READ])); + + fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_SYNC_AGG_W_HISTO, + vsx->vsx_agg_histo[ZIO_PRIORITY_SYNC_WRITE], + ARRAY_SIZE(vsx->vsx_agg_histo[ZIO_PRIORITY_SYNC_WRITE])); + + fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_ASYNC_AGG_R_HISTO, + vsx->vsx_agg_histo[ZIO_PRIORITY_ASYNC_READ], + ARRAY_SIZE(vsx->vsx_agg_histo[ZIO_PRIORITY_ASYNC_READ])); + + fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_ASYNC_AGG_W_HISTO, + vsx->vsx_agg_histo[ZIO_PRIORITY_ASYNC_WRITE], + ARRAY_SIZE(vsx->vsx_agg_histo[ZIO_PRIORITY_ASYNC_WRITE])); + + fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_AGG_SCRUB_HISTO, + vsx->vsx_agg_histo[ZIO_PRIORITY_SCRUB], + ARRAY_SIZE(vsx->vsx_agg_histo[ZIO_PRIORITY_SCRUB])); + + /* Add extended stats nvlist to main nvlist */ + fnvlist_add_nvlist(nv, ZPOOL_CONFIG_VDEV_STATS_EX, nvx); + + fnvlist_free(nvx); + kmem_free(vsx, sizeof (*vsx)); +} + +static void +root_vdev_actions_getprogress(vdev_t *vd, nvlist_t *nvl) +{ + spa_t *spa = vd->vdev_spa; + + if (vd != spa->spa_root_vdev) + return; + + /* provide either current or previous scan information */ + pool_scan_stat_t ps; + if (spa_scan_get_stats(spa, &ps) == 0) { + fnvlist_add_uint64_array(nvl, + ZPOOL_CONFIG_SCAN_STATS, (uint64_t *)&ps, + sizeof (pool_scan_stat_t) / sizeof (uint64_t)); + } + + pool_removal_stat_t prs; + if (spa_removal_get_stats(spa, &prs) == 0) { + fnvlist_add_uint64_array(nvl, + ZPOOL_CONFIG_REMOVAL_STATS, (uint64_t *)&prs, + sizeof (prs) / sizeof (uint64_t)); + } + + pool_checkpoint_stat_t pcs; + if (spa_checkpoint_get_stats(spa, &pcs) == 0) { + fnvlist_add_uint64_array(nvl, + ZPOOL_CONFIG_CHECKPOINT_STATS, (uint64_t *)&pcs, + sizeof (pcs) / sizeof (uint64_t)); + } +} + /* * Generate the nvlist representing this vdev's config. 
*/ @@ -215,6 +391,7 @@ vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats, vdev_config_flag_t flags) { nvlist_t *nv = NULL; + vdev_indirect_config_t *vic = &vd->vdev_indirect_config; nv = fnvlist_alloc(); @@ -233,6 +410,10 @@ vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats, fnvlist_add_string(nv, ZPOOL_CONFIG_PHYS_PATH, vd->vdev_physpath); + if (vd->vdev_enc_sysfs_path != NULL) + fnvlist_add_string(nv, ZPOOL_CONFIG_VDEV_ENC_SYSFS_PATH, + vd->vdev_enc_sysfs_path); + if (vd->vdev_fru != NULL) fnvlist_add_string(nv, ZPOOL_CONFIG_FRU, vd->vdev_fru); @@ -262,7 +443,7 @@ vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats, fnvlist_add_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK, vd->vdev_wholedisk); - if (vd->vdev_not_present) + if (vd->vdev_not_present && !(flags & VDEV_CONFIG_MISSING)) fnvlist_add_uint64(nv, ZPOOL_CONFIG_NOT_PRESENT, 1); if (vd->vdev_isspare) @@ -278,9 +459,10 @@ vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats, fnvlist_add_uint64(nv, ZPOOL_CONFIG_ASIZE, vd->vdev_asize); fnvlist_add_uint64(nv, ZPOOL_CONFIG_IS_LOG, vd->vdev_islog); - if (vd->vdev_removing) + if (vd->vdev_removing) { fnvlist_add_uint64(nv, ZPOOL_CONFIG_REMOVING, vd->vdev_removing); + } } if (vd->vdev_dtl_sm != NULL) { @@ -288,22 +470,100 @@ vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats, space_map_object(vd->vdev_dtl_sm)); } + if (vic->vic_mapping_object != 0) { + fnvlist_add_uint64(nv, ZPOOL_CONFIG_INDIRECT_OBJECT, + vic->vic_mapping_object); + } + + if (vic->vic_births_object != 0) { + fnvlist_add_uint64(nv, ZPOOL_CONFIG_INDIRECT_BIRTHS, + vic->vic_births_object); + } + + if (vic->vic_prev_indirect_vdev != UINT64_MAX) { + fnvlist_add_uint64(nv, ZPOOL_CONFIG_PREV_INDIRECT_VDEV, + vic->vic_prev_indirect_vdev); + } + if (vd->vdev_crtxg) fnvlist_add_uint64(nv, ZPOOL_CONFIG_CREATE_TXG, vd->vdev_crtxg); + if (flags & VDEV_CONFIG_MOS) { + if (vd->vdev_leaf_zap != 0) { + ASSERT(vd->vdev_ops->vdev_op_leaf); + fnvlist_add_uint64(nv, ZPOOL_CONFIG_VDEV_LEAF_ZAP, + vd->vdev_leaf_zap); + } + + if (vd->vdev_top_zap != 0) { + ASSERT(vd == vd->vdev_top); + fnvlist_add_uint64(nv, ZPOOL_CONFIG_VDEV_TOP_ZAP, + vd->vdev_top_zap); + } + } + if (getstats) { - vdev_stat_t vs; - pool_scan_stat_t ps; - - vdev_get_stats(vd, &vs); - fnvlist_add_uint64_array(nv, ZPOOL_CONFIG_VDEV_STATS, - (uint64_t *)&vs, sizeof (vs) / sizeof (uint64_t)); - - /* provide either current or previous scan information */ - if (spa_scan_get_stats(spa, &ps) == 0) { - fnvlist_add_uint64_array(nv, - ZPOOL_CONFIG_SCAN_STATS, (uint64_t *)&ps, - sizeof (pool_scan_stat_t) / sizeof (uint64_t)); + vdev_config_generate_stats(vd, nv); + + root_vdev_actions_getprogress(vd, nv); + + /* + * Note: this can be called from open context + * (spa_get_stats()), so we need the rwlock to prevent + * the mapping from being changed by condensing. + */ + rw_enter(&vd->vdev_indirect_rwlock, RW_READER); + if (vd->vdev_indirect_mapping != NULL) { + ASSERT(vd->vdev_indirect_births != NULL); + vdev_indirect_mapping_t *vim = + vd->vdev_indirect_mapping; + fnvlist_add_uint64(nv, ZPOOL_CONFIG_INDIRECT_SIZE, + vdev_indirect_mapping_size(vim)); + } + rw_exit(&vd->vdev_indirect_rwlock); + if (vd->vdev_mg != NULL && + vd->vdev_mg->mg_fragmentation != ZFS_FRAG_INVALID) { + /* + * Compute approximately how much memory would be used + * for the indirect mapping if this device were to + * be removed. + * + * Note: If the frag metric is invalid, then not + * enough metaslabs have been converted to have + * histograms. 
+ */ + uint64_t seg_count = 0; + uint64_t to_alloc = vd->vdev_stat.vs_alloc; + + /* + * There are the same number of allocated segments + * as free segments, so we will have at least one + * entry per free segment. However, small free + * segments (smaller than vdev_removal_max_span) + * will be combined with adjacent allocated segments + * as a single mapping. + */ + for (int i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++) { + if (1ULL << (i + 1) < vdev_removal_max_span) { + to_alloc += + vd->vdev_mg->mg_histogram[i] << + (i + 1); + } else { + seg_count += + vd->vdev_mg->mg_histogram[i]; + } + } + + /* + * The maximum length of a mapping is + * zfs_remove_max_segment, so we need at least one entry + * per zfs_remove_max_segment of allocated data. + */ + seg_count += to_alloc / zfs_remove_max_segment; + + fnvlist_add_uint64(nv, ZPOOL_CONFIG_INDIRECT_SIZE, + seg_count * + sizeof (vdev_indirect_mapping_entry_phys_t)); } } @@ -361,6 +621,7 @@ vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats, if (vd->vdev_ishole) fnvlist_add_uint64(nv, ZPOOL_CONFIG_IS_HOLE, B_TRUE); + /* Set the reason why we're FAULTED/DEGRADED. */ switch (vd->vdev_stat.vs_aux) { case VDEV_AUX_ERR_EXCEEDED: aux = "err_exceeded"; @@ -371,8 +632,15 @@ vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats, break; } - if (aux != NULL) + if (aux != NULL && !vd->vdev_tmpoffline) { fnvlist_add_string(nv, ZPOOL_CONFIG_AUX_STATE, aux); + } else { + /* + * We're healthy - clear any previous AUX_STATE values. + */ + if (nvlist_exists(nv, ZPOOL_CONFIG_AUX_STATE)) + nvlist_remove_all(nv, ZPOOL_CONFIG_AUX_STATE); + } if (vd->vdev_splitting && vd->vdev_orig_guid != 0LL) { fnvlist_add_uint64(nv, ZPOOL_CONFIG_ORIG_GUID, @@ -401,8 +669,9 @@ vdev_top_config_generate(spa_t *spa, nvlist_t *config) for (c = 0, idx = 0; c < rvd->vdev_children; c++) { vdev_t *tvd = rvd->vdev_child[c]; - if (tvd->vdev_ishole) + if (tvd->vdev_ishole) { array[idx++] = c; + } } if (idx) { @@ -430,35 +699,35 @@ vdev_label_read_config(vdev_t *vd, uint64_t txg) spa_t *spa = vd->vdev_spa; nvlist_t *config = NULL; vdev_phys_t *vp; + abd_t *vp_abd; zio_t *zio; uint64_t best_txg = 0; + uint64_t label_txg = 0; int error = 0; int flags = ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE; - int l; ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL); if (!vdev_readable(vd)) return (NULL); - vp = zio_buf_alloc(sizeof (vdev_phys_t)); + vp_abd = abd_alloc_linear(sizeof (vdev_phys_t), B_TRUE); + vp = abd_to_buf(vp_abd); retry: - for (l = 0; l < VDEV_LABELS; l++) { + for (int l = 0; l < VDEV_LABELS; l++) { nvlist_t *label = NULL; zio = zio_root(spa, NULL, NULL, flags); - vdev_label_read(zio, vd, l, vp, + vdev_label_read(zio, vd, l, vp_abd, offsetof(vdev_label_t, vl_vdev_phys), sizeof (vdev_phys_t), NULL, NULL, flags); if (zio_wait(zio) == 0 && nvlist_unpack(vp->vp_nvlist, sizeof (vp->vp_nvlist), &label, 0) == 0) { - uint64_t label_txg = 0; - /* * Auxiliary vdevs won't have txg values in their * labels and newly added vdevs may not have been @@ -489,7 +758,16 @@ retry: goto retry; } - zio_buf_free(vp, sizeof (vdev_phys_t)); + /* + * We found a valid label but it didn't pass txg restrictions. 
+ */ + if (config == NULL && label_txg != 0) { + vdev_dbgmsg(vd, "label discarded as txg is too large " + "(%llu > %llu)", (u_longlong_t)label_txg, + (u_longlong_t)txg); + } + + abd_free(vp_abd); return (config); } @@ -625,20 +903,20 @@ vdev_label_init(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason) spa_t *spa = vd->vdev_spa; nvlist_t *label; vdev_phys_t *vp; - char *pad2; + abd_t *vp_abd; + abd_t *pad2; uberblock_t *ub; + abd_t *ub_abd; zio_t *zio; char *buf; size_t buflen; int error; uint64_t spare_guid = 0, l2cache_guid = 0; int flags = ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL; - int c, l; - vdev_t *pvd; ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); - for (c = 0; c < vd->vdev_children; c++) + for (int c = 0; c < vd->vdev_children; c++) if ((error = vdev_label_init(vd->vdev_child[c], crtxg, reason)) != 0) return (error); @@ -674,7 +952,7 @@ vdev_label_init(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason) vd->vdev_guid += guid_delta; - for (pvd = vd; pvd != NULL; pvd = pvd->vdev_parent) + for (vdev_t *pvd = vd; pvd != NULL; pvd = pvd->vdev_parent) pvd->vdev_guid_sum += guid_delta; /* @@ -694,7 +972,7 @@ vdev_label_init(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason) vd->vdev_guid += guid_delta; - for (pvd = vd; pvd != NULL; pvd = pvd->vdev_parent) + for (vdev_t *pvd = vd; pvd != NULL; pvd = pvd->vdev_parent) pvd->vdev_guid_sum += guid_delta; /* @@ -710,8 +988,9 @@ vdev_label_init(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason) /* * Initialize its label. */ - vp = zio_buf_alloc(sizeof (vdev_phys_t)); - bzero(vp, sizeof (vdev_phys_t)); + vp_abd = abd_alloc_linear(sizeof (vdev_phys_t), B_TRUE); + abd_zero(vp_abd, sizeof (vdev_phys_t)); + vp = abd_to_buf(vp_abd); /* * Generate a label describing the pool and our top-level vdev. @@ -771,22 +1050,23 @@ vdev_label_init(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason) error = nvlist_pack(label, &buf, &buflen, NV_ENCODE_XDR, KM_SLEEP); if (error != 0) { nvlist_free(label); - zio_buf_free(vp, sizeof (vdev_phys_t)); + abd_free(vp_abd); /* EFAULT means nvlist_pack ran out of room */ - return (error == EFAULT ? ENAMETOOLONG : EINVAL); + return (SET_ERROR(error == EFAULT ? ENAMETOOLONG : EINVAL)); } /* * Initialize uberblock template. */ - ub = zio_buf_alloc(VDEV_UBERBLOCK_RING); - bzero(ub, VDEV_UBERBLOCK_RING); - *ub = spa->spa_uberblock; + ub_abd = abd_alloc_linear(VDEV_UBERBLOCK_RING, B_TRUE); + abd_zero(ub_abd, VDEV_UBERBLOCK_RING); + abd_copy_from_buf(ub_abd, &spa->spa_uberblock, sizeof (uberblock_t)); + ub = abd_to_buf(ub_abd); ub->ub_txg = 0; /* Initialize the 2nd padding area. */ - pad2 = zio_buf_alloc(VDEV_PAD_SIZE); - bzero(pad2, VDEV_PAD_SIZE); + pad2 = abd_alloc_for_io(VDEV_PAD_SIZE, B_TRUE); + abd_zero(pad2, VDEV_PAD_SIZE); /* * Write everything in parallel. 
@@ -794,9 +1074,9 @@ vdev_label_init(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason) retry: zio = zio_root(spa, NULL, NULL, flags); - for (l = 0; l < VDEV_LABELS; l++) { + for (int l = 0; l < VDEV_LABELS; l++) { - vdev_label_write(zio, vd, l, vp, + vdev_label_write(zio, vd, l, vp_abd, offsetof(vdev_label_t, vl_vdev_phys), sizeof (vdev_phys_t), NULL, NULL, flags); @@ -809,7 +1089,7 @@ retry: offsetof(vdev_label_t, vl_pad2), VDEV_PAD_SIZE, NULL, NULL, flags); - vdev_label_write(zio, vd, l, ub, + vdev_label_write(zio, vd, l, ub_abd, offsetof(vdev_label_t, vl_uberblock), VDEV_UBERBLOCK_RING, NULL, NULL, flags); } @@ -822,9 +1102,9 @@ retry: } nvlist_free(label); - zio_buf_free(pad2, VDEV_PAD_SIZE); - zio_buf_free(ub, VDEV_UBERBLOCK_RING); - zio_buf_free(vp, sizeof (vdev_phys_t)); + abd_free(pad2); + abd_free(ub_abd); + abd_free(vp_abd); /* * If this vdev hasn't been previously identified as a spare, then we @@ -862,19 +1142,13 @@ retry: * among uberblocks with equal txg, choose the one with the latest timestamp. */ static int -vdev_uberblock_compare(uberblock_t *ub1, uberblock_t *ub2) +vdev_uberblock_compare(const uberblock_t *ub1, const uberblock_t *ub2) { - if (ub1->ub_txg < ub2->ub_txg) - return (-1); - if (ub1->ub_txg > ub2->ub_txg) - return (1); - - if (ub1->ub_timestamp < ub2->ub_timestamp) - return (-1); - if (ub1->ub_timestamp > ub2->ub_timestamp) - return (1); + int cmp = AVL_CMP(ub1->ub_txg, ub2->ub_txg); + if (likely(cmp)) + return (cmp); - return (0); + return (AVL_CMP(ub1->ub_timestamp, ub2->ub_timestamp)); } struct ubl_cbdata { @@ -888,7 +1162,7 @@ vdev_uberblock_load_done(zio_t *zio) vdev_t *vd = zio->io_vd; spa_t *spa = zio->io_spa; zio_t *rio = zio->io_private; - uberblock_t *ub = zio->io_data; + uberblock_t *ub = abd_to_buf(zio->io_abd); struct ubl_cbdata *cbp = rio->io_private; ASSERT3U(zio->io_size, ==, VDEV_UBERBLOCK_SIZE(vd)); @@ -909,24 +1183,22 @@ vdev_uberblock_load_done(zio_t *zio) mutex_exit(&rio->io_lock); } - zio_buf_free(zio->io_data, zio->io_size); + abd_free(zio->io_abd); } static void vdev_uberblock_load_impl(zio_t *zio, vdev_t *vd, int flags, struct ubl_cbdata *cbp) { - int c, l, n; - - for (c = 0; c < vd->vdev_children; c++) + for (int c = 0; c < vd->vdev_children; c++) vdev_uberblock_load_impl(zio, vd->vdev_child[c], flags, cbp); if (vd->vdev_ops->vdev_op_leaf && vdev_readable(vd)) { - for (l = 0; l < VDEV_LABELS; l++) { - for (n = 0; n < VDEV_UBERBLOCK_COUNT(vd); n++) { + for (int l = 0; l < VDEV_LABELS; l++) { + for (int n = 0; n < VDEV_UBERBLOCK_COUNT(vd); n++) { vdev_label_read(zio, vd, l, - zio_buf_alloc(VDEV_UBERBLOCK_SIZE(vd)), - VDEV_UBERBLOCK_OFFSET(vd, n), + abd_alloc_linear(VDEV_UBERBLOCK_SIZE(vd), + B_TRUE), VDEV_UBERBLOCK_OFFSET(vd, n), VDEV_UBERBLOCK_SIZE(vd), vdev_uberblock_load_done, zio, flags); } @@ -969,11 +1241,77 @@ vdev_uberblock_load(vdev_t *rvd, uberblock_t *ub, nvlist_t **config) * Search all labels on this vdev to find the configuration that * matches the txg for our uberblock. */ - if (cb.ubl_vd != NULL) + if (cb.ubl_vd != NULL) { + vdev_dbgmsg(cb.ubl_vd, "best uberblock found for spa %s. " + "txg %llu", spa->spa_name, (u_longlong_t)ub->ub_txg); + *config = vdev_label_read_config(cb.ubl_vd, ub->ub_txg); + if (*config == NULL && spa->spa_extreme_rewind) { + vdev_dbgmsg(cb.ubl_vd, "failed to read label config. 
" + "Trying again without txg restrictions."); + *config = vdev_label_read_config(cb.ubl_vd, UINT64_MAX); + } + if (*config == NULL) { + vdev_dbgmsg(cb.ubl_vd, "failed to read label config"); + } + } spa_config_exit(spa, SCL_ALL, FTAG); } +/* + * For use when a leaf vdev is expanded. + * The location of labels 2 and 3 changed, and at the new location the + * uberblock rings are either empty or contain garbage. The sync will write + * new configs there because the vdev is dirty, but expansion also needs the + * uberblock rings copied. Read them from label 0 which did not move. + * + * Since the point is to populate labels {2,3} with valid uberblocks, + * we zero uberblocks we fail to read or which are not valid. + */ + +static void +vdev_copy_uberblocks(vdev_t *vd) +{ + abd_t *ub_abd; + zio_t *write_zio; + int locks = (SCL_L2ARC | SCL_ZIO); + int flags = ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL | + ZIO_FLAG_SPECULATIVE; + + ASSERT(spa_config_held(vd->vdev_spa, SCL_STATE, RW_READER) == + SCL_STATE); + ASSERT(vd->vdev_ops->vdev_op_leaf); + + spa_config_enter(vd->vdev_spa, locks, FTAG, RW_READER); + + ub_abd = abd_alloc_linear(VDEV_UBERBLOCK_SIZE(vd), B_TRUE); + + write_zio = zio_root(vd->vdev_spa, NULL, NULL, flags); + for (int n = 0; n < VDEV_UBERBLOCK_COUNT(vd); n++) { + const int src_label = 0; + zio_t *zio; + + zio = zio_root(vd->vdev_spa, NULL, NULL, flags); + vdev_label_read(zio, vd, src_label, ub_abd, + VDEV_UBERBLOCK_OFFSET(vd, n), VDEV_UBERBLOCK_SIZE(vd), + NULL, NULL, flags); + + if (zio_wait(zio) || uberblock_verify(abd_to_buf(ub_abd))) + abd_zero(ub_abd, VDEV_UBERBLOCK_SIZE(vd)); + + for (int l = 2; l < VDEV_LABELS; l++) + vdev_label_write(write_zio, vd, l, ub_abd, + VDEV_UBERBLOCK_OFFSET(vd, n), + VDEV_UBERBLOCK_SIZE(vd), NULL, NULL, + flags | ZIO_FLAG_DONT_PROPAGATE); + } + (void) zio_wait(write_zio); + + spa_config_exit(vd->vdev_spa, locks, FTAG); + + abd_free(ub_abd); +} + /* * On success, increment root zio's count of good writes. * We only get credit for writes to known-visible vdevs; see spa_vdev_add(). @@ -984,20 +1322,20 @@ vdev_uberblock_sync_done(zio_t *zio) uint64_t *good_writes = zio->io_private; if (zio->io_error == 0 && zio->io_vd->vdev_top->vdev_ms_array != 0) - atomic_add_64(good_writes, 1); + atomic_inc_64(good_writes); } /* * Write the uberblock to all labels of all leaves of the specified vdev. */ static void -vdev_uberblock_sync(zio_t *zio, uberblock_t *ub, vdev_t *vd, int flags) +vdev_uberblock_sync(zio_t *zio, uint64_t *good_writes, + uberblock_t *ub, vdev_t *vd, int flags) { - uberblock_t *ubbuf; - int c, l, n; - - for (c = 0; c < vd->vdev_children; c++) - vdev_uberblock_sync(zio, ub, vd->vdev_child[c], flags); + for (uint64_t c = 0; c < vd->vdev_children; c++) { + vdev_uberblock_sync(zio, good_writes, + ub, vd->vdev_child[c], flags); + } if (!vd->vdev_ops->vdev_op_leaf) return; @@ -1005,19 +1343,28 @@ vdev_uberblock_sync(zio_t *zio, uberblock_t *ub, vdev_t *vd, int flags) if (!vdev_writeable(vd)) return; - n = ub->ub_txg & (VDEV_UBERBLOCK_COUNT(vd) - 1); + /* If the vdev was expanded, need to copy uberblock rings. */ + if (vd->vdev_state == VDEV_STATE_HEALTHY && + vd->vdev_copy_uberblocks == B_TRUE) { + vdev_copy_uberblocks(vd); + vd->vdev_copy_uberblocks = B_FALSE; + } + + int m = spa_multihost(vd->vdev_spa) ? 
MMP_BLOCKS_PER_LABEL : 0; + int n = ub->ub_txg % (VDEV_UBERBLOCK_COUNT(vd) - m); - ubbuf = zio_buf_alloc(VDEV_UBERBLOCK_SIZE(vd)); - bzero(ubbuf, VDEV_UBERBLOCK_SIZE(vd)); - *ubbuf = *ub; + /* Copy the uberblock_t into the ABD */ + abd_t *ub_abd = abd_alloc_for_io(VDEV_UBERBLOCK_SIZE(vd), B_TRUE); + abd_zero(ub_abd, VDEV_UBERBLOCK_SIZE(vd)); + abd_copy_from_buf(ub_abd, ub, sizeof (uberblock_t)); - for (l = 0; l < VDEV_LABELS; l++) - vdev_label_write(zio, vd, l, ubbuf, + for (int l = 0; l < VDEV_LABELS; l++) + vdev_label_write(zio, vd, l, ub_abd, VDEV_UBERBLOCK_OFFSET(vd, n), VDEV_UBERBLOCK_SIZE(vd), - vdev_uberblock_sync_done, zio->io_private, + vdev_uberblock_sync_done, good_writes, flags | ZIO_FLAG_DONT_PROPAGATE); - zio_buf_free(ubbuf, VDEV_UBERBLOCK_SIZE(vd)); + abd_free(ub_abd); } /* Sync the uberblocks to all vdevs in svd[] */ @@ -1027,12 +1374,11 @@ vdev_uberblock_sync_list(vdev_t **svd, int svdcount, uberblock_t *ub, int flags) spa_t *spa = svd[0]->vdev_spa; zio_t *zio; uint64_t good_writes = 0; - int v; - zio = zio_root(spa, NULL, &good_writes, flags); + zio = zio_root(spa, NULL, NULL, flags); - for (v = 0; v < svdcount; v++) - vdev_uberblock_sync(zio, ub, svd[v], flags); + for (int v = 0; v < svdcount; v++) + vdev_uberblock_sync(zio, &good_writes, ub, svd[v], flags); (void) zio_wait(zio); @@ -1043,8 +1389,11 @@ vdev_uberblock_sync_list(vdev_t **svd, int svdcount, uberblock_t *ub, int flags) */ zio = zio_root(spa, NULL, NULL, flags); - for (v = 0; v < svdcount; v++) - zio_flush(zio, svd[v]); + for (int v = 0; v < svdcount; v++) { + if (vdev_writeable(svd[v])) { + zio_flush(zio, svd[v]); + } + } (void) zio_wait(zio); @@ -1060,7 +1409,7 @@ vdev_label_sync_done(zio_t *zio) uint64_t *good_writes = zio->io_private; if (zio->io_error == 0) - atomic_add_64(good_writes, 1); + atomic_inc_64(good_writes); } /* @@ -1090,16 +1439,19 @@ vdev_label_sync_ignore_done(zio_t *zio) * Write all even or odd labels to all leaves of the specified vdev. 
*/ static void -vdev_label_sync(zio_t *zio, vdev_t *vd, int l, uint64_t txg, int flags) +vdev_label_sync(zio_t *zio, uint64_t *good_writes, + vdev_t *vd, int l, uint64_t txg, int flags) { nvlist_t *label; vdev_phys_t *vp; + abd_t *vp_abd; char *buf; size_t buflen; - int c; - for (c = 0; c < vd->vdev_children; c++) - vdev_label_sync(zio, vd->vdev_child[c], l, txg, flags); + for (int c = 0; c < vd->vdev_children; c++) { + vdev_label_sync(zio, good_writes, + vd->vdev_child[c], l, txg, flags); + } if (!vd->vdev_ops->vdev_op_leaf) return; @@ -1112,23 +1464,24 @@ vdev_label_sync(zio_t *zio, vdev_t *vd, int l, uint64_t txg, int flags) */ label = spa_config_generate(vd->vdev_spa, vd, txg, B_FALSE); - vp = zio_buf_alloc(sizeof (vdev_phys_t)); - bzero(vp, sizeof (vdev_phys_t)); + vp_abd = abd_alloc_linear(sizeof (vdev_phys_t), B_TRUE); + abd_zero(vp_abd, sizeof (vdev_phys_t)); + vp = abd_to_buf(vp_abd); buf = vp->vp_nvlist; buflen = sizeof (vp->vp_nvlist); if (!nvlist_pack(label, &buf, &buflen, NV_ENCODE_XDR, KM_SLEEP)) { for (; l < VDEV_LABELS; l += 2) { - vdev_label_write(zio, vd, l, vp, + vdev_label_write(zio, vd, l, vp_abd, offsetof(vdev_label_t, vl_vdev_phys), sizeof (vdev_phys_t), - vdev_label_sync_done, zio->io_private, + vdev_label_sync_done, good_writes, flags | ZIO_FLAG_DONT_PROPAGATE); } } - zio_buf_free(vp, sizeof (vdev_phys_t)); + abd_free(vp_abd); nvlist_free(label); } @@ -1147,16 +1500,15 @@ vdev_label_sync_list(spa_t *spa, int l, uint64_t txg, int flags) for (vd = list_head(dl); vd != NULL; vd = list_next(dl, vd)) { uint64_t *good_writes; - zio_t *vio; ASSERT(!vd->vdev_ishole); good_writes = kmem_zalloc(sizeof (uint64_t), KM_SLEEP); - vio = zio_null(zio, spa, NULL, + zio_t *vio = zio_null(zio, spa, NULL, (vd->vdev_islog || vd->vdev_aux != NULL) ? vdev_label_sync_ignore_done : vdev_label_sync_top_done, good_writes, flags); - vdev_label_sync(vio, vd, l, txg, flags); + vdev_label_sync(vio, good_writes, vd, l, txg, flags); zio_nowait(vio); } @@ -1187,15 +1539,15 @@ vdev_label_sync_list(spa_t *spa, int l, uint64_t txg, int flags) * at any time, you can just call it again, and it will resume its work. */ int -vdev_config_sync(vdev_t **svd, int svdcount, uint64_t txg, boolean_t tryhard) +vdev_config_sync(vdev_t **svd, int svdcount, uint64_t txg) { spa_t *spa = svd[0]->vdev_spa; uberblock_t *ub = &spa->spa_uberblock; - vdev_t *vd; - zio_t *zio; - int error; + int error = 0; int flags = ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL; + ASSERT(svdcount != 0); +retry: /* * Normally, we don't want to try too hard to write every label and * uberblock. If there is a flaky disk, we don't want the rest of the @@ -1203,8 +1555,11 @@ vdev_config_sync(vdev_t **svd, int svdcount, uint64_t txg, boolean_t tryhard) * single label out, we should retry with ZIO_FLAG_TRYHARD before * bailing out and declaring the pool faulted. */ - if (tryhard) + if (error != 0) { + if ((flags & ZIO_FLAG_TRYHARD) != 0) + return (error); flags |= ZIO_FLAG_TRYHARD; + } ASSERT(ub->ub_txg <= txg); @@ -1214,10 +1569,13 @@ vdev_config_sync(vdev_t **svd, int svdcount, uint64_t txg, boolean_t tryhard) * and the vdev configuration hasn't changed, * then there's nothing to do. 
*/ - if (ub->ub_txg < txg && - uberblock_update(ub, spa->spa_root_vdev, txg) == B_FALSE && - list_is_empty(&spa->spa_config_dirty_list)) - return (0); + if (ub->ub_txg < txg) { + boolean_t changed = uberblock_update(ub, spa->spa_root_vdev, + txg, spa->spa_mmp.mmp_delay); + + if (!changed && list_is_empty(&spa->spa_config_dirty_list)) + return (0); + } if (txg > spa_freeze_txg(spa)) return (0); @@ -1230,9 +1588,10 @@ vdev_config_sync(vdev_t **svd, int svdcount, uint64_t txg, boolean_t tryhard) * written in this txg will be committed to stable storage * before any uberblock that references them. */ - zio = zio_root(spa, NULL, NULL, flags); + zio_t *zio = zio_root(spa, NULL, NULL, flags); - for (vd = txg_list_head(&spa->spa_vdev_txg_list, TXG_CLEAN(txg)); vd; + for (vdev_t *vd = + txg_list_head(&spa->spa_vdev_txg_list, TXG_CLEAN(txg)); vd != NULL; vd = txg_list_next(&spa->spa_vdev_txg_list, vd, TXG_CLEAN(txg))) zio_flush(zio, vd); @@ -1247,8 +1606,14 @@ vdev_config_sync(vdev_t **svd, int svdcount, uint64_t txg, boolean_t tryhard) * the new labels to disk to ensure that all even-label updates * are committed to stable storage before the uberblock update. */ - if ((error = vdev_label_sync_list(spa, 0, txg, flags)) != 0) - return (error); + if ((error = vdev_label_sync_list(spa, 0, txg, flags)) != 0) { + if ((flags & ZIO_FLAG_TRYHARD) != 0) { + zfs_dbgmsg("vdev_label_sync_list() returned error %d " + "for pool '%s' when syncing out the even labels " + "of dirty vdevs", error, spa_name(spa)); + } + goto retry; + } /* * Sync the uberblocks to all vdevs in svd[]. @@ -1265,8 +1630,16 @@ vdev_config_sync(vdev_t **svd, int svdcount, uint64_t txg, boolean_t tryhard) * been successfully committed) will be valid with respect * to the new uberblocks. */ - if ((error = vdev_uberblock_sync_list(svd, svdcount, ub, flags)) != 0) - return (error); + if ((error = vdev_uberblock_sync_list(svd, svdcount, ub, flags)) != 0) { + if ((flags & ZIO_FLAG_TRYHARD) != 0) { + zfs_dbgmsg("vdev_uberblock_sync_list() returned error " + "%d for pool '%s'", error, spa_name(spa)); + } + goto retry; + } + + if (spa_multihost(spa)) + mmp_update_uberblock(spa, ub); /* * Sync out odd labels for every dirty vdev. If the system dies @@ -1278,5 +1651,14 @@ vdev_config_sync(vdev_t **svd, int svdcount, uint64_t txg, boolean_t tryhard) * to disk to ensure that all odd-label updates are committed to * stable storage before the next transaction group begins. */ - return (vdev_label_sync_list(spa, 1, txg, flags)); + if ((error = vdev_label_sync_list(spa, 1, txg, flags)) != 0) { + if ((flags & ZIO_FLAG_TRYHARD) != 0) { + zfs_dbgmsg("vdev_label_sync_list() returned error %d " + "for pool '%s' when syncing out the odd labels of " + "dirty vdevs", error, spa_name(spa)); + } + goto retry; + } + + return (0); }
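
[Editor's note] vdev_config_sync() above encodes the ordering described in the file's header comment — even labels, then the uberblocks, then odd labels — together with a retry pass: each phase is first attempted with failures allowed, and on any error the whole sequence is rerun once with ZIO_FLAG_TRYHARD before the error is returned. A minimal, self-contained sketch of that control flow follows; the phase function and flag value are stand-ins for the real ZFS routines, not the actual implementation.

#include <stdio.h>

#define	ZIO_FLAG_TRYHARD	0x1	/* stand-in for the real flag value */

/* Stand-in for vdev_label_sync_list()/vdev_uberblock_sync_list(). */
static int
sync_phase(const char *name, int flags)
{
	printf("syncing %s (flags=%#x)\n", name, flags);
	return (0);	/* pretend every write succeeded */
}

static int
config_sync(void)
{
	int error = 0;
	int flags = 0;

retry:
	/*
	 * On the first failure, escalate to ZIO_FLAG_TRYHARD and run the
	 * whole sequence again; a failure with TRYHARD already set is fatal.
	 */
	if (error != 0) {
		if ((flags & ZIO_FLAG_TRYHARD) != 0)
			return (error);
		flags |= ZIO_FLAG_TRYHARD;
	}

	if ((error = sync_phase("even labels", flags)) != 0)
		goto retry;
	if ((error = sync_phase("uberblocks", flags)) != 0)
		goto retry;
	if ((error = sync_phase("odd labels", flags)) != 0)
		goto retry;

	return (0);
}

int
main(void)
{
	return (config_sync());
}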