OpenZFS 9682 - page fault in dsl_async_clone_destroy() while opening pool

[zfs] / module / zfs / metaslab.c
diff --git a/module/zfs/metaslab.c b/module/zfs/metaslab.c

index 12fe5890a0837362776857103b0c0c11afaafad8..a117dc4460b8fd4d3cc901384515487250ef1cdc 100644 (file)
--- a/module/zfs/metaslab.c
+++ b/module/zfs/metaslab.c
@@ -20,8 +20,9 @@
   */
  /*
   * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2011, 2015 by Delphix. All rights reserved.
+ * Copyright (c) 2011, 2018 by Delphix. All rights reserved.
   * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
+ * Copyright (c) 2017, Intel Corporation.
   */
  
  #include <sys/zfs_context.h>
@@ -246,11 +247,11 @@ metaslab_class_create(spa_t *spa, metaslab_ops_t *ops)
         mc->mc_ops = ops;
         mutex_init(&mc->mc_lock, NULL, MUTEX_DEFAULT, NULL);
         mc->mc_alloc_slots = kmem_zalloc(spa->spa_alloc_count *
-           sizeof (refcount_t), KM_SLEEP);
+           sizeof (zfs_refcount_t), KM_SLEEP);
         mc->mc_alloc_max_slots = kmem_zalloc(spa->spa_alloc_count *
             sizeof (uint64_t), KM_SLEEP);
         for (int i = 0; i < spa->spa_alloc_count; i++)
-               refcount_create_tracked(&mc->mc_alloc_slots[i]);
+               zfs_refcount_create_tracked(&mc->mc_alloc_slots[i]);
  
         return (mc);
  }
@@ -265,9 +266,9 @@ metaslab_class_destroy(metaslab_class_t *mc)
         ASSERT(mc->mc_dspace == 0);
  
         for (int i = 0; i < mc->mc_spa->spa_alloc_count; i++)
-               refcount_destroy(&mc->mc_alloc_slots[i]);
+               zfs_refcount_destroy(&mc->mc_alloc_slots[i]);
         kmem_free(mc->mc_alloc_slots, mc->mc_spa->spa_alloc_count *
-           sizeof (refcount_t));
+           sizeof (zfs_refcount_t));
         kmem_free(mc->mc_alloc_max_slots, mc->mc_spa->spa_alloc_count *
             sizeof (uint64_t));
         mutex_destroy(&mc->mc_lock);
@@ -300,7 +301,7 @@ metaslab_class_validate(metaslab_class_t *mc)
         return (0);
  }
  
-void
+static void
  metaslab_class_space_update(metaslab_class_t *mc, int64_t alloc_delta,
      int64_t defer_delta, int64_t space_delta, int64_t dspace_delta)
  {
@@ -337,7 +338,8 @@ metaslab_class_get_dspace(metaslab_class_t *mc)
  void
  metaslab_class_histogram_verify(metaslab_class_t *mc)
  {
-       vdev_t *rvd = mc->mc_spa->spa_root_vdev;
+       spa_t *spa = mc->mc_spa;
+       vdev_t *rvd = spa->spa_root_vdev;
         uint64_t *mc_hist;
         int i;
  
@@ -646,12 +648,12 @@ metaslab_group_create(metaslab_class_t *mc, vdev_t *vd, int allocators)
         mg->mg_no_free_space = B_TRUE;
         mg->mg_allocators = allocators;
  
-       mg->mg_alloc_queue_depth = kmem_zalloc(allocators * sizeof (refcount_t),
-           KM_SLEEP);
+       mg->mg_alloc_queue_depth = kmem_zalloc(allocators *
+           sizeof (zfs_refcount_t), KM_SLEEP);
         mg->mg_cur_max_alloc_queue_depth = kmem_zalloc(allocators *
             sizeof (uint64_t), KM_SLEEP);
         for (int i = 0; i < allocators; i++) {
-               refcount_create_tracked(&mg->mg_alloc_queue_depth[i]);
+               zfs_refcount_create_tracked(&mg->mg_alloc_queue_depth[i]);
                 mg->mg_cur_max_alloc_queue_depth[i] = 0;
         }
  
@@ -681,11 +683,11 @@ metaslab_group_destroy(metaslab_group_t *mg)
         mutex_destroy(&mg->mg_lock);
  
         for (int i = 0; i < mg->mg_allocators; i++) {
-               refcount_destroy(&mg->mg_alloc_queue_depth[i]);
+               zfs_refcount_destroy(&mg->mg_alloc_queue_depth[i]);
                 mg->mg_cur_max_alloc_queue_depth[i] = 0;
         }
         kmem_free(mg->mg_alloc_queue_depth, mg->mg_allocators *
-           sizeof (refcount_t));
+           sizeof (zfs_refcount_t));
         kmem_free(mg->mg_cur_max_alloc_queue_depth, mg->mg_allocators *
             sizeof (uint64_t));
  
@@ -834,7 +836,8 @@ metaslab_group_histogram_verify(metaslab_group_t *mg)
         for (int m = 0; m < vd->vdev_ms_count; m++) {
                 metaslab_t *msp = vd->vdev_ms[m];
  
-               if (msp->ms_sm == NULL)
+               /* skip if not active or not a member */
+               if (msp->ms_sm == NULL || msp->ms_group != mg)
                         continue;
  
                 for (i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++)
@@ -967,12 +970,14 @@ metaslab_group_fragmentation(metaslab_group_t *mg)
  
                 if (msp->ms_fragmentation == ZFS_FRAG_INVALID)
                         continue;
+               if (msp->ms_group != mg)
+                       continue;
  
                 valid_ms++;
                 fragmentation += msp->ms_fragmentation;
         }
  
-       if (valid_ms <= vd->vdev_ms_count / 2)
+       if (valid_ms <= mg->mg_vd->vdev_ms_count / 2)
                 return (ZFS_FRAG_INVALID);
  
         fragmentation /= valid_ms;
@@ -992,7 +997,7 @@ metaslab_group_fragmentation(metaslab_group_t *mg)
   */
  static boolean_t
  metaslab_group_allocatable(metaslab_group_t *mg, metaslab_group_t *rotor,
-    uint64_t psize, int allocator)
+    uint64_t psize, int allocator, int d)
  {
         spa_t *spa = mg->mg_vd->vdev_spa;
         metaslab_class_t *mc = mg->mg_class;
@@ -1003,7 +1008,10 @@ metaslab_group_allocatable(metaslab_group_t *mg, metaslab_group_t *rotor,
          * groups to select from. Otherwise, we always consider it eligible
          * for allocations.
          */
-       if (mc != spa_normal_class(spa) || mc->mc_groups <= 1)
+       if ((mc != spa_normal_class(spa) &&
+           mc != spa_special_class(spa) &&
+           mc != spa_dedup_class(spa)) ||
+           mc->mc_groups <= 1)
                 return (B_TRUE);
  
         /*
@@ -1033,7 +1041,15 @@ metaslab_group_allocatable(metaslab_group_t *mg, metaslab_group_t *rotor,
                 if (mg->mg_no_free_space)
                         return (B_FALSE);
  
-               qdepth = refcount_count(&mg->mg_alloc_queue_depth[allocator]);
+               /*
+                * Relax allocation throttling for ditto blocks.  Due to
+                * random imbalances in allocation, throttling tends to push
+                * copies to whichever vdev looks a bit better at the moment.
+                */
+               qmax = qmax * (4 + d) / 4;
+
+               qdepth = zfs_refcount_count(
+                   &mg->mg_alloc_queue_depth[allocator]);
  
                 /*
                  * If this metaslab group is below its qmax or it's
@@ -1053,8 +1069,8 @@ metaslab_group_allocatable(metaslab_group_t *mg, metaslab_group_t *rotor,
                  */
                 for (mgp = mg->mg_next; mgp != rotor; mgp = mgp->mg_next) {
                         qmax = mgp->mg_cur_max_alloc_queue_depth[allocator];
-
-                       qdepth = refcount_count(
+                       qmax = qmax * (4 + d) / 4;
+                       qdepth = zfs_refcount_count(
                             &mgp->mg_alloc_queue_depth[allocator]);
  
                         /*
@@ -1459,12 +1475,26 @@ metaslab_unload(metaslab_t *msp)
         msp->ms_max_size = 0;
  }
  
+static void
+metaslab_space_update(vdev_t *vd, metaslab_class_t *mc, int64_t alloc_delta,
+    int64_t defer_delta, int64_t space_delta)
+{
+       vdev_space_update(vd, alloc_delta, defer_delta, space_delta);
+
+       ASSERT3P(vd->vdev_spa->spa_root_vdev, ==, vd->vdev_parent);
+       ASSERT(vd->vdev_ms_count != 0);
+
+       metaslab_class_space_update(mc, alloc_delta, defer_delta, space_delta,
+           vdev_deflated_space(vd, space_delta));
+}
+
  int
  metaslab_init(metaslab_group_t *mg, uint64_t id, uint64_t object, uint64_t txg,
      metaslab_t **msp)
  {
         vdev_t *vd = mg->mg_vd;
-       objset_t *mos = vd->vdev_spa->spa_meta_objset;
+       spa_t *spa = vd->vdev_spa;
+       objset_t *mos = spa->spa_meta_objset;
         metaslab_t *ms;
         int error;
  
@@ -1521,8 +1551,8 @@ metaslab_init(metaslab_group_t *mg, uint64_t id, uint64_t object, uint64_t txg,
  
         /*
          * If metaslab_debug_load is set and we're initializing a metaslab
-        * that has an allocated space map object then load the its space
-        * map so that can verify frees.
+        * that has an allocated space map object then load the space map
+        * so that we can verify frees.
          */
         if (metaslab_debug_load && ms->ms_sm != NULL) {
                 mutex_enter(&ms->ms_lock);
@@ -1544,16 +1574,19 @@ void
  metaslab_fini(metaslab_t *msp)
  {
         metaslab_group_t *mg = msp->ms_group;
+       vdev_t *vd = mg->mg_vd;
  
         metaslab_group_remove(mg, msp);
  
         mutex_enter(&msp->ms_lock);
         VERIFY(msp->ms_group == NULL);
-       vdev_space_update(mg->mg_vd, -space_map_allocated(msp->ms_sm),
-           0, -msp->ms_size);
+       metaslab_space_update(vd, mg->mg_class,
+           -space_map_allocated(msp->ms_sm), 0, -msp->ms_size);
+
         space_map_close(msp->ms_sm);
  
         metaslab_unload(msp);
+
         range_tree_destroy(msp->ms_allocatable);
         range_tree_destroy(msp->ms_freeing);
         range_tree_destroy(msp->ms_freed);
@@ -2576,7 +2609,7 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg)
                 ASSERT3P(msp->ms_checkpointing, ==, NULL);
                 msp->ms_checkpointing = range_tree_create(NULL, NULL);
  
-               vdev_space_update(vd, 0, 0, msp->ms_size);
+               metaslab_space_update(vd, mg->mg_class, 0, 0, msp->ms_size);
         }
         ASSERT0(range_tree_space(msp->ms_freeing));
         ASSERT0(range_tree_space(msp->ms_checkpointing));
@@ -2598,7 +2631,8 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg)
                 defer_delta -= range_tree_space(*defer_tree);
         }
  
-       vdev_space_update(vd, alloc_delta + defer_delta, defer_delta, 0);
+       metaslab_space_update(vd, mg->mg_class, alloc_delta + defer_delta,
+           defer_delta, 0);
  
         /*
          * If there's a metaslab_load() in progress, wait for it to complete
@@ -2697,21 +2731,25 @@ metaslab_sync_reassess(metaslab_group_t *mg)
         spa_config_exit(spa, SCL_ALLOC, FTAG);
  }
  
-static uint64_t
-metaslab_distance(metaslab_t *msp, dva_t *dva)
+/*
+ * When writing a ditto block (i.e. more than one DVA for a given BP) on
+ * the same vdev as an existing DVA of this BP, then try to allocate it
+ * on a different metaslab than existing DVAs (i.e. a unique metaslab).
+ */
+static boolean_t
+metaslab_is_unique(metaslab_t *msp, dva_t *dva)
  {
-       uint64_t ms_shift = msp->ms_group->mg_vd->vdev_ms_shift;
-       uint64_t offset = DVA_GET_OFFSET(dva) >> ms_shift;
-       uint64_t start = msp->ms_id;
+       uint64_t dva_ms_id;
+
+       if (DVA_GET_ASIZE(dva) == 0)
+               return (B_TRUE);
  
         if (msp->ms_group->mg_vd->vdev_id != DVA_GET_VDEV(dva))
-               return (1ULL << 63);
+               return (B_TRUE);
  
-       if (offset < start)
-               return ((start - offset) << ms_shift);
-       if (offset > start)
-               return ((offset - start) << ms_shift);
-       return (0);
+       dva_ms_id = DVA_GET_OFFSET(dva) >> msp->ms_group->mg_vd->vdev_ms_shift;
+
+       return (msp->ms_id != dva_ms_id);
  }
  
  /*
@@ -2868,7 +2906,7 @@ metaslab_group_alloc_increment(spa_t *spa, uint64_t vdev, void *tag, int flags,
         if (!mg->mg_class->mc_alloc_throttle_enabled)
                 return;
  
-       (void) refcount_add(&mg->mg_alloc_queue_depth[allocator], tag);
+       (void) zfs_refcount_add(&mg->mg_alloc_queue_depth[allocator], tag);
  }
  
  static void
@@ -2899,7 +2937,7 @@ metaslab_group_alloc_decrement(spa_t *spa, uint64_t vdev, void *tag, int flags,
         if (!mg->mg_class->mc_alloc_throttle_enabled)
                 return;
  
-       (void) refcount_remove(&mg->mg_alloc_queue_depth[allocator], tag);
+       (void) zfs_refcount_remove(&mg->mg_alloc_queue_depth[allocator], tag);
         if (io_complete)
                 metaslab_group_increment_qdepth(mg, allocator);
  }
@@ -2915,8 +2953,8 @@ metaslab_group_alloc_verify(spa_t *spa, const blkptr_t *bp, void *tag,
         for (int d = 0; d < ndvas; d++) {
                 uint64_t vdev = DVA_GET_VDEV(&dva[d]);
                 metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg;
-               VERIFY(refcount_not_held(&mg->mg_alloc_queue_depth[allocator],
-                   tag));
+               VERIFY(zfs_refcount_not_held(
+                   &mg->mg_alloc_queue_depth[allocator], tag));
         }
  #endif
  }
@@ -2971,7 +3009,7 @@ metaslab_block_alloc(metaslab_t *msp, uint64_t size, uint64_t txg)
   */
  static metaslab_t *
  find_valid_metaslab(metaslab_group_t *mg, uint64_t activation_weight,
-    dva_t *dva, int d, uint64_t min_distance, uint64_t asize, int allocator,
+    dva_t *dva, int d, boolean_t want_unique, uint64_t asize, int allocator,
      zio_alloc_list_t *zal, metaslab_t *search, boolean_t *was_active)
  {
         avl_index_t idx;
@@ -3005,13 +3043,10 @@ find_valid_metaslab(metaslab_group_t *mg, uint64_t activation_weight,
                 if (activation_weight == METASLAB_WEIGHT_PRIMARY || *was_active)
                         break;
  
-               uint64_t target_distance = min_distance
-                   + (space_map_allocated(msp->ms_sm) != 0 ? 0 :
-                   min_distance >> 1);
-
                 for (i = 0; i < d; i++) {
-                       if (metaslab_distance(msp, &dva[i]) < target_distance)
-                               break;
+                       if (want_unique &&
+                           !metaslab_is_unique(msp, &dva[i]))
+                               break;  /* try another metaslab */
                 }
                 if (i == d)
                         break;
@@ -3029,8 +3064,8 @@ find_valid_metaslab(metaslab_group_t *mg, uint64_t activation_weight,
  /* ARGSUSED */
  static uint64_t
  metaslab_group_alloc_normal(metaslab_group_t *mg, zio_alloc_list_t *zal,
-    uint64_t asize, uint64_t txg, uint64_t min_distance, dva_t *dva, int d,
-    int allocator)
+    uint64_t asize, uint64_t txg, boolean_t want_unique, dva_t *dva,
+    int d, int allocator)
  {
         metaslab_t *msp = NULL;
         uint64_t offset = -1ULL;
@@ -3084,7 +3119,7 @@ metaslab_group_alloc_normal(metaslab_group_t *mg, zio_alloc_list_t *zal,
                         was_active = B_TRUE;
                 } else {
                         msp = find_valid_metaslab(mg, activation_weight, dva, d,
-                           min_distance, asize, allocator, zal, search,
+                           want_unique, asize, allocator, zal, search,
                             &was_active);
                 }
  
@@ -3214,6 +3249,7 @@ next:
                  * metaslab.
                  */
                 ASSERT(!metaslab_should_allocate(msp, asize));
+
                 mutex_exit(&msp->ms_lock);
         }
         mutex_exit(&msp->ms_lock);
@@ -3223,14 +3259,14 @@ next:
  
  static uint64_t
  metaslab_group_alloc(metaslab_group_t *mg, zio_alloc_list_t *zal,
-    uint64_t asize, uint64_t txg, uint64_t min_distance, dva_t *dva, int d,
-    int allocator)
+    uint64_t asize, uint64_t txg, boolean_t want_unique, dva_t *dva,
+    int d, int allocator)
  {
         uint64_t offset;
         ASSERT(mg->mg_initialized);
  
-       offset = metaslab_group_alloc_normal(mg, zal, asize, txg,
-           min_distance, dva, d, allocator);
+       offset = metaslab_group_alloc_normal(mg, zal, asize, txg, want_unique,
+           dva, d, allocator);
  
         mutex_enter(&mg->mg_lock);
         if (offset == -1ULL) {
@@ -3257,14 +3293,6 @@ metaslab_group_alloc(metaslab_group_t *mg, zio_alloc_list_t *zal,
         return (offset);
  }
  
-/*
- * If we have to write a ditto block (i.e. more than one DVA for a given BP)
- * on the same vdev as an existing DVA of this BP, then try to allocate it
- * at least (vdev_asize / (2 ^ ditto_same_vdev_distance_shift)) away from the
- * existing DVAs.
- */
-int ditto_same_vdev_distance_shift = 3;
-
  /*
   * Allocate a block for the specified i/o.
   */
@@ -3281,6 +3309,7 @@ metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize,
  
         /*
          * For testing, make some blocks above a certain size be gang blocks.
+        * This will also test spilling from special to normal.
          */
         if (psize >= metaslab_force_ganging && (ddi_get_lbolt() & 3) == 0) {
                 metaslab_trace_add(zal, NULL, NULL, psize, d, TRACE_FORCE_GANG,
@@ -3341,6 +3370,7 @@ metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize,
                 } while ((fast_mg = fast_mg->mg_next) != mc->mc_rotor);
  
         } else {
+               ASSERT(mc->mc_rotor != NULL);
                 mg = mc->mc_rotor;
         }
  
@@ -3379,7 +3409,7 @@ top:
                  */
                 if (allocatable && !GANG_ALLOCATION(flags) && !try_hard) {
                         allocatable = metaslab_group_allocatable(mg, rotor,
-                           psize, allocator);
+                           psize, allocator, d);
                 }
  
                 if (!allocatable) {
@@ -3405,25 +3435,17 @@ top:
  
                 ASSERT(mg->mg_class == mc);
  
-               /*
-                * If we don't need to try hard, then require that the
-                * block be 1/8th of the device away from any other DVAs
-                * in this BP.  If we are trying hard, allow any offset
-                * to be used (distance=0).
-                */
-               uint64_t distance = 0;
-               if (!try_hard) {
-                       distance = vd->vdev_asize >>
-                           ditto_same_vdev_distance_shift;
-                       if (distance <= (1ULL << vd->vdev_ms_shift))
-                               distance = 0;
-               }
-
                 uint64_t asize = vdev_psize_to_asize(vd, psize);
                 ASSERT(P2PHASE(asize, 1ULL << vd->vdev_ashift) == 0);
  
+               /*
+                * If we don't need to try hard, then require that the
+                * block be on a different metaslab from any other DVAs
+                * in this BP (unique=true).  If we are trying hard, then
+                * allow any metaslab to be used (unique=false).
+                */
                 uint64_t offset = metaslab_group_alloc(mg, zal, asize, txg,
-                   distance, dva, d, allocator);
+                   !try_hard, dva, d, allocator);
  
                 if (offset != -1ULL) {
                         /*
@@ -3819,18 +3841,19 @@ metaslab_class_throttle_reserve(metaslab_class_t *mc, int slots, int allocator,
         mutex_enter(&mc->mc_lock);
  
         uint64_t reserved_slots =
-           refcount_count(&mc->mc_alloc_slots[allocator]);
+           zfs_refcount_count(&mc->mc_alloc_slots[allocator]);
         if (reserved_slots < max)
                 available_slots = max - reserved_slots;
  
-       if (slots <= available_slots || GANG_ALLOCATION(flags)) {
+       if (slots <= available_slots || GANG_ALLOCATION(flags) ||
+           flags & METASLAB_MUST_RESERVE) {
                 /*
                  * We reserve the slots individually so that we can unreserve
                  * them individually when an I/O completes.
                  */
                 for (int d = 0; d < slots; d++) {
                         reserved_slots =
-                           refcount_add(&mc->mc_alloc_slots[allocator],
+                           zfs_refcount_add(&mc->mc_alloc_slots[allocator],
                             zio);
                 }
                 zio->io_flags |= ZIO_FLAG_IO_ALLOCATING;
@@ -3848,7 +3871,7 @@ metaslab_class_throttle_unreserve(metaslab_class_t *mc, int slots,
         ASSERT(mc->mc_alloc_throttle_enabled);
         mutex_enter(&mc->mc_lock);
         for (int d = 0; d < slots; d++) {
-               (void) refcount_remove(&mc->mc_alloc_slots[allocator],
+               (void) zfs_refcount_remove(&mc->mc_alloc_slots[allocator],
                     zio);
         }
         mutex_exit(&mc->mc_lock);
@@ -4100,9 +4123,11 @@ metaslab_claim(spa_t *spa, const blkptr_t *bp, uint64_t txg)
  
         spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER);
  
-       for (int d = 0; d < ndvas; d++)
-               if ((error = metaslab_claim_dva(spa, &dva[d], txg)) != 0)
+       for (int d = 0; d < ndvas; d++) {
+               error = metaslab_claim_dva(spa, &dva[d], txg);
+               if (error != 0)
                         break;
+       }
  
         spa_config_exit(spa, SCL_ALLOC, FTAG);
  
@@ -4228,7 +4253,7 @@ metaslab_check_free(spa_t *spa, const blkptr_t *bp)
  }
  
  #if defined(_KERNEL)
-/* CSTYLED */
+/* BEGIN CSTYLED */
  module_param(metaslab_aliquot, ulong, 0644);
  MODULE_PARM_DESC(metaslab_aliquot,
         "allocation granularity (a.k.a. stripe size)");
@@ -4277,8 +4302,9 @@ module_param(zfs_metaslab_switch_threshold, int, 0644);
  MODULE_PARM_DESC(zfs_metaslab_switch_threshold,
         "segment-based metaslab selection maximum buckets before switching");
  
-/* CSTYLED */
  module_param(metaslab_force_ganging, ulong, 0644);
  MODULE_PARM_DESC(metaslab_force_ganging,
         "blocks larger than this size are forced to be gang blocks");
+/* END CSTYLED */
+
  #endif