granicus.if.org Git - zfs/blob - module/zfs/metaslab.c

   1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  23  * Copyright (c) 2011, 2016 by Delphix. All rights reserved.
  24  * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
  25  */
  26
  27 #include <sys/zfs_context.h>
  28 #include <sys/dmu.h>
  29 #include <sys/dmu_tx.h>
  30 #include <sys/space_map.h>
  31 #include <sys/metaslab_impl.h>
  32 #include <sys/vdev_impl.h>
  33 #include <sys/zio.h>
  34 #include <sys/spa_impl.h>
  35 #include <sys/zfeature.h>
  36 #include <sys/vdev_indirect_mapping.h>
  37
  38 #define WITH_DF_BLOCK_ALLOCATOR
  39
  40 #define GANG_ALLOCATION(flags) \
  41         ((flags) & (METASLAB_GANG_CHILD | METASLAB_GANG_HEADER))
  42
  43 /*
  44  * Metaslab granularity, in bytes. This is roughly similar to what would be
  45  * referred to as the "stripe size" in traditional RAID arrays. In normal
  46  * operation, we will try to write this amount of data to a top-level vdev
  47  * before moving on to the next one.
  48  */
  49 unsigned long metaslab_aliquot = 512 << 10;
  50
  51 /*
  52  * For testing, make some blocks above a certain size be gang blocks.
  53  */
  54 unsigned long metaslab_force_ganging = SPA_MAXBLOCKSIZE + 1;
  55
  56 /*
  57  * The in-core space map representation is more compact than its on-disk form.
  58  * The zfs_condense_pct determines how much more compact the in-core
  59  * space map representation must be before we compact it on-disk.
  60  * Values should be greater than or equal to 100.
  61  */
  62 int zfs_condense_pct = 200;
  63
  64 /*
  65  * Condensing a metaslab is not guaranteed to actually reduce the amount of
  66  * space used on disk. In particular, a space map uses data in increments of
  67  * MAX(1 << ashift, space_map_blksz), so a metaslab might use the
  68  * same number of blocks after condensing. Since the goal of condensing is to
  69  * reduce the number of IOPs required to read the space map, we only want to
  70  * condense when we can be sure we will reduce the number of blocks used by the
  71  * space map. Unfortunately, we cannot precisely compute whether or not this is
  72  * the case in metaslab_should_condense since we are holding ms_lock. Instead,
  73  * we apply the following heuristic: do not condense a spacemap unless the
  74  * uncondensed size consumes greater than zfs_metaslab_condense_block_threshold
  75  * blocks.
  76  */
  77 int zfs_metaslab_condense_block_threshold = 4;
  78
  79 /*
  80  * The zfs_mg_noalloc_threshold defines which metaslab groups should
  81  * be eligible for allocation. The value is defined as a percentage of
  82  * free space. Metaslab groups that have more free space than
  83  * zfs_mg_noalloc_threshold are always eligible for allocations. Once
  84  * a metaslab group's free space is less than or equal to the
  85  * zfs_mg_noalloc_threshold the allocator will avoid allocating to that
  86  * group unless all groups in the pool have reached zfs_mg_noalloc_threshold.
  87  * Once all groups in the pool reach zfs_mg_noalloc_threshold then all
  88  * groups are allowed to accept allocations. Gang blocks are always
  89  * eligible to allocate on any metaslab group. The default value of 0 means
  90  * no metaslab group will be excluded based on this criterion.
  91  */
  92 int zfs_mg_noalloc_threshold = 0;
  93
  94 /*
  95  * Metaslab groups are considered eligible for allocations if their
   96  * fragmentation metric (measured as a percentage) is less than or equal to
  97  * zfs_mg_fragmentation_threshold. If a metaslab group exceeds this threshold
  98  * then it will be skipped unless all metaslab groups within the metaslab
  99  * class have also crossed this threshold.
 100  */
 101 int zfs_mg_fragmentation_threshold = 85;
 102
 103 /*
 104  * Allow metaslabs to keep their active state as long as their fragmentation
 105  * percentage is less than or equal to zfs_metaslab_fragmentation_threshold. An
 106  * active metaslab that exceeds this threshold will no longer keep its active
 107  * status allowing better metaslabs to be selected.
 108  */
 109 int zfs_metaslab_fragmentation_threshold = 70;
 110
 111 /*
 112  * When set will load all metaslabs when pool is first opened.
 113  */
 114 int metaslab_debug_load = 0;
 115
 116 /*
 117  * When set will prevent metaslabs from being unloaded.
 118  */
 119 int metaslab_debug_unload = 0;
 120
 121 /*
  122  * Minimum size which forces the dynamic allocator to change
  123  * its allocation strategy.  Once the space map cannot satisfy
  124  * an allocation of this size then it switches to using a more
  125  * aggressive strategy (i.e. search by size rather than offset).
 126  */
 127 uint64_t metaslab_df_alloc_threshold = SPA_OLD_MAXBLOCKSIZE;
 128
 129 /*
 130  * The minimum free space, in percent, which must be available
 131  * in a space map to continue allocations in a first-fit fashion.
 132  * Once the space map's free space drops below this level we dynamically
 133  * switch to using best-fit allocations.
 134  */
 135 int metaslab_df_free_pct = 4;
 136
 137 /*
 138  * Percentage of all cpus that can be used by the metaslab taskq.
 139  */
 140 int metaslab_load_pct = 50;
 141
 142 /*
 143  * Determines how many txgs a metaslab may remain loaded without having any
 144  * allocations from it. As long as a metaslab continues to be used we will
 145  * keep it loaded.
 146  */
 147 int metaslab_unload_delay = TXG_SIZE * 2;
 148
 149 /*
 150  * Max number of metaslabs per group to preload.
 151  */
 152 int metaslab_preload_limit = SPA_DVAS_PER_BP;
 153
 154 /*
 155  * Enable/disable preloading of metaslab.
 156  */
 157 int metaslab_preload_enabled = B_TRUE;
 158
 159 /*
 160  * Enable/disable fragmentation weighting on metaslabs.
 161  */
 162 int metaslab_fragmentation_factor_enabled = B_TRUE;
 163
 164 /*
 165  * Enable/disable lba weighting (i.e. outer tracks are given preference).
 166  */
 167 int metaslab_lba_weighting_enabled = B_TRUE;
 168
 169 /*
 170  * Enable/disable metaslab group biasing.
 171  */
 172 int metaslab_bias_enabled = B_TRUE;
 173
 174
 175 /*
 176  * Enable/disable remapping of indirect DVAs to their concrete vdevs.
 177  */
 178 boolean_t zfs_remap_blkptr_enable = B_TRUE;
 179
 180 /*
 181  * Enable/disable segment-based metaslab selection.
 182  */
 183 int zfs_metaslab_segment_weight_enabled = B_TRUE;
 184
 185 /*
 186  * When using segment-based metaslab selection, we will continue
 187  * allocating from the active metaslab until we have exhausted
 188  * zfs_metaslab_switch_threshold of its buckets.
 189  */
 190 int zfs_metaslab_switch_threshold = 2;
 191
 192 /*
 193  * Internal switch to enable/disable the metaslab allocation tracing
 194  * facility.
 195  */
 196 #ifdef _METASLAB_TRACING
 197 boolean_t metaslab_trace_enabled = B_TRUE;
 198 #endif
 199
 200 /*
 201  * Maximum entries that the metaslab allocation tracing facility will keep
 202  * in a given list when running in non-debug mode. We limit the number
 203  * of entries in non-debug mode to prevent us from using up too much memory.
 204  * The limit should be sufficiently large that we don't expect any allocation
  205  * to ever exceed this value. In debug mode, the system will panic if this
 206  * limit is ever reached allowing for further investigation.
 207  */
 208 #ifdef _METASLAB_TRACING
 209 uint64_t metaslab_trace_max_entries = 5000;
 210 #endif
 211
 212 static uint64_t metaslab_weight(metaslab_t *);
 213 static void metaslab_set_fragmentation(metaslab_t *);
 214 static void metaslab_free_impl(vdev_t *, uint64_t, uint64_t, uint64_t);
 215 static void metaslab_check_free_impl(vdev_t *, uint64_t, uint64_t);
 216
 217 #ifdef _METASLAB_TRACING
 218 kmem_cache_t *metaslab_alloc_trace_cache;
 219 #endif
 220
 221 /*
 222  * ==========================================================================
 223  * Metaslab classes
 224  * ==========================================================================
 225  */
 226 metaslab_class_t *
 227 metaslab_class_create(spa_t *spa, metaslab_ops_t *ops)
 228 {
 229         metaslab_class_t *mc;
 230
 231         mc = kmem_zalloc(sizeof (metaslab_class_t), KM_SLEEP);
 232
 233         mc->mc_spa = spa;
 234         mc->mc_rotor = NULL;
 235         mc->mc_ops = ops;
 236         mutex_init(&mc->mc_lock, NULL, MUTEX_DEFAULT, NULL);
 237         refcount_create_tracked(&mc->mc_alloc_slots);
 238
 239         return (mc);
 240 }
 241
 242 void
 243 metaslab_class_destroy(metaslab_class_t *mc)
 244 {
 245         ASSERT(mc->mc_rotor == NULL);
 246         ASSERT(mc->mc_alloc == 0);
 247         ASSERT(mc->mc_deferred == 0);
 248         ASSERT(mc->mc_space == 0);
 249         ASSERT(mc->mc_dspace == 0);
 250
 251         refcount_destroy(&mc->mc_alloc_slots);
 252         mutex_destroy(&mc->mc_lock);
 253         kmem_free(mc, sizeof (metaslab_class_t));
 254 }
 255
 256 int
 257 metaslab_class_validate(metaslab_class_t *mc)
 258 {
 259         metaslab_group_t *mg;
 260         vdev_t *vd;
 261
 262         /*
 263          * Must hold one of the spa_config locks.
 264          */
 265         ASSERT(spa_config_held(mc->mc_spa, SCL_ALL, RW_READER) ||
 266             spa_config_held(mc->mc_spa, SCL_ALL, RW_WRITER));
 267
 268         if ((mg = mc->mc_rotor) == NULL)
 269                 return (0);
 270
 271         do {
 272                 vd = mg->mg_vd;
 273                 ASSERT(vd->vdev_mg != NULL);
 274                 ASSERT3P(vd->vdev_top, ==, vd);
 275                 ASSERT3P(mg->mg_class, ==, mc);
 276                 ASSERT3P(vd->vdev_ops, !=, &vdev_hole_ops);
 277         } while ((mg = mg->mg_next) != mc->mc_rotor);
 278
 279         return (0);
 280 }
 281
 282 void
 283 metaslab_class_space_update(metaslab_class_t *mc, int64_t alloc_delta,
 284     int64_t defer_delta, int64_t space_delta, int64_t dspace_delta)
 285 {
 286         atomic_add_64(&mc->mc_alloc, alloc_delta);
 287         atomic_add_64(&mc->mc_deferred, defer_delta);
 288         atomic_add_64(&mc->mc_space, space_delta);
 289         atomic_add_64(&mc->mc_dspace, dspace_delta);
 290 }
 291
 292 uint64_t
 293 metaslab_class_get_alloc(metaslab_class_t *mc)
 294 {
 295         return (mc->mc_alloc);
 296 }
 297
 298 uint64_t
 299 metaslab_class_get_deferred(metaslab_class_t *mc)
 300 {
 301         return (mc->mc_deferred);
 302 }
 303
 304 uint64_t
 305 metaslab_class_get_space(metaslab_class_t *mc)
 306 {
 307         return (mc->mc_space);
 308 }
 309
 310 uint64_t
 311 metaslab_class_get_dspace(metaslab_class_t *mc)
 312 {
 313         return (spa_deflate(mc->mc_spa) ? mc->mc_dspace : mc->mc_space);
 314 }
 315
 316 void
 317 metaslab_class_histogram_verify(metaslab_class_t *mc)
 318 {
 319         vdev_t *rvd = mc->mc_spa->spa_root_vdev;
 320         uint64_t *mc_hist;
 321         int i;
 322
 323         if ((zfs_flags & ZFS_DEBUG_HISTOGRAM_VERIFY) == 0)
 324                 return;
 325
 326         mc_hist = kmem_zalloc(sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE,
 327             KM_SLEEP);
 328
 329         for (int c = 0; c < rvd->vdev_children; c++) {
 330                 vdev_t *tvd = rvd->vdev_child[c];
 331                 metaslab_group_t *mg = tvd->vdev_mg;
 332
 333                 /*
 334                  * Skip any holes, uninitialized top-levels, or
 335                  * vdevs that are not in this metalab class.
 336                  */
 337                 if (!vdev_is_concrete(tvd) || tvd->vdev_ms_shift == 0 ||
 338                     mg->mg_class != mc) {
 339                         continue;
 340                 }
 341
 342                 for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++)
 343                         mc_hist[i] += mg->mg_histogram[i];
 344         }
 345
 346         for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++)
 347                 VERIFY3U(mc_hist[i], ==, mc->mc_histogram[i]);
 348
 349         kmem_free(mc_hist, sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE);
 350 }
 351
 352 /*
 353  * Calculate the metaslab class's fragmentation metric. The metric
 354  * is weighted based on the space contribution of each metaslab group.
 355  * The return value will be a number between 0 and 100 (inclusive), or
 356  * ZFS_FRAG_INVALID if the metric has not been set. See comment above the
 357  * zfs_frag_table for more information about the metric.
 358  */
 359 uint64_t
 360 metaslab_class_fragmentation(metaslab_class_t *mc)
 361 {
 362         vdev_t *rvd = mc->mc_spa->spa_root_vdev;
 363         uint64_t fragmentation = 0;
 364
 365         spa_config_enter(mc->mc_spa, SCL_VDEV, FTAG, RW_READER);
 366
 367         for (int c = 0; c < rvd->vdev_children; c++) {
 368                 vdev_t *tvd = rvd->vdev_child[c];
 369                 metaslab_group_t *mg = tvd->vdev_mg;
 370
 371                 /*
 372                  * Skip any holes, uninitialized top-levels,
 373                  * or vdevs that are not in this metalab class.
 374                  */
 375                 if (!vdev_is_concrete(tvd) || tvd->vdev_ms_shift == 0 ||
 376                     mg->mg_class != mc) {
 377                         continue;
 378                 }
 379
 380                 /*
 381                  * If a metaslab group does not contain a fragmentation
 382                  * metric then just bail out.
 383                  */
 384                 if (mg->mg_fragmentation == ZFS_FRAG_INVALID) {
 385                         spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG);
 386                         return (ZFS_FRAG_INVALID);
 387                 }
 388
 389                 /*
 390                  * Determine how much this metaslab_group is contributing
 391                  * to the overall pool fragmentation metric.
 392                  */
 393                 fragmentation += mg->mg_fragmentation *
 394                     metaslab_group_get_space(mg);
 395         }
 396         fragmentation /= metaslab_class_get_space(mc);
 397
 398         ASSERT3U(fragmentation, <=, 100);
 399         spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG);
 400         return (fragmentation);
 401 }
 402
 403 /*
 404  * Calculate the amount of expandable space that is available in
 405  * this metaslab class. If a device is expanded then its expandable
 406  * space will be the amount of allocatable space that is currently not
 407  * part of this metaslab class.
 408  */
 409 uint64_t
 410 metaslab_class_expandable_space(metaslab_class_t *mc)
 411 {
 412         vdev_t *rvd = mc->mc_spa->spa_root_vdev;
 413         uint64_t space = 0;
 414
 415         spa_config_enter(mc->mc_spa, SCL_VDEV, FTAG, RW_READER);
 416         for (int c = 0; c < rvd->vdev_children; c++) {
 417                 vdev_t *tvd = rvd->vdev_child[c];
 418                 metaslab_group_t *mg = tvd->vdev_mg;
 419
 420                 if (!vdev_is_concrete(tvd) || tvd->vdev_ms_shift == 0 ||
 421                     mg->mg_class != mc) {
 422                         continue;
 423                 }
 424
 425                 /*
 426                  * Calculate if we have enough space to add additional
 427                  * metaslabs. We report the expandable space in terms
 428                  * of the metaslab size since that's the unit of expansion.
 429                  */
 430                 space += P2ALIGN(tvd->vdev_max_asize - tvd->vdev_asize,
 431                     1ULL << tvd->vdev_ms_shift);
 432         }
 433         spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG);
 434         return (space);
 435 }
 436
 437 static int
 438 metaslab_compare(const void *x1, const void *x2)
 439 {
 440         const metaslab_t *m1 = (const metaslab_t *)x1;
 441         const metaslab_t *m2 = (const metaslab_t *)x2;
 442
 443         int cmp = AVL_CMP(m2->ms_weight, m1->ms_weight);
 444         if (likely(cmp))
 445                 return (cmp);
 446
 447         IMPLY(AVL_CMP(m1->ms_start, m2->ms_start) == 0, m1 == m2);
 448
 449         return (AVL_CMP(m1->ms_start, m2->ms_start));
 450 }
 451
 452 /*
 453  * Verify that the space accounting on disk matches the in-core range_trees.
 454  */
 455 void
 456 metaslab_verify_space(metaslab_t *msp, uint64_t txg)
 457 {
 458         spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
 459         uint64_t allocated = 0;
 460         uint64_t sm_free_space, msp_free_space;
 461
 462         ASSERT(MUTEX_HELD(&msp->ms_lock));
 463
 464         if ((zfs_flags & ZFS_DEBUG_METASLAB_VERIFY) == 0)
 465                 return;
 466
 467         /*
 468          * We can only verify the metaslab space when we're called
 469          * from syncing context with a loaded metaslab that has an allocated
 470          * space map. Calling this in non-syncing context does not
 471          * provide a consistent view of the metaslab since we're performing
 472          * allocations in the future.
 473          */
 474         if (txg != spa_syncing_txg(spa) || msp->ms_sm == NULL ||
 475             !msp->ms_loaded)
 476                 return;
 477
 478         sm_free_space = msp->ms_size - space_map_allocated(msp->ms_sm) -
 479             space_map_alloc_delta(msp->ms_sm);
 480
 481         /*
 482          * Account for future allocations since we would have already
 483          * deducted that space from the ms_freetree.
 484          */
 485         for (int t = 0; t < TXG_CONCURRENT_STATES; t++) {
 486                 allocated +=
 487                     range_tree_space(msp->ms_alloctree[(txg + t) & TXG_MASK]);
 488         }
 489
 490         msp_free_space = range_tree_space(msp->ms_tree) + allocated +
 491             msp->ms_deferspace + range_tree_space(msp->ms_freedtree);
 492
 493         VERIFY3U(sm_free_space, ==, msp_free_space);
 494 }
 495
 496 /*
 497  * ==========================================================================
 498  * Metaslab groups
 499  * ==========================================================================
 500  */
 501 /*
 502  * Update the allocatable flag and the metaslab group's capacity.
 503  * The allocatable flag is set to true if the capacity is below
 504  * the zfs_mg_noalloc_threshold or has a fragmentation value that is
 505  * greater than zfs_mg_fragmentation_threshold. If a metaslab group
 506  * transitions from allocatable to non-allocatable or vice versa then the
 507  * metaslab group's class is updated to reflect the transition.
 508  */
 509 static void
 510 metaslab_group_alloc_update(metaslab_group_t *mg)
 511 {
 512         vdev_t *vd = mg->mg_vd;
 513         metaslab_class_t *mc = mg->mg_class;
 514         vdev_stat_t *vs = &vd->vdev_stat;
 515         boolean_t was_allocatable;
 516         boolean_t was_initialized;
 517
 518         ASSERT(vd == vd->vdev_top);
 519         ASSERT3U(spa_config_held(mc->mc_spa, SCL_ALLOC, RW_READER), ==,
 520             SCL_ALLOC);
 521
 522         mutex_enter(&mg->mg_lock);
 523         was_allocatable = mg->mg_allocatable;
 524         was_initialized = mg->mg_initialized;
 525
 526         mg->mg_free_capacity = ((vs->vs_space - vs->vs_alloc) * 100) /
 527             (vs->vs_space + 1);
 528
 529         mutex_enter(&mc->mc_lock);
 530
 531         /*
 532          * If the metaslab group was just added then it won't
 533          * have any space until we finish syncing out this txg.
 534          * At that point we will consider it initialized and available
 535          * for allocations.  We also don't consider non-activated
 536          * metaslab groups (e.g. vdevs that are in the middle of being removed)
 537          * to be initialized, because they can't be used for allocation.
 538          */
 539         mg->mg_initialized = metaslab_group_initialized(mg);
 540         if (!was_initialized && mg->mg_initialized) {
 541                 mc->mc_groups++;
 542         } else if (was_initialized && !mg->mg_initialized) {
 543                 ASSERT3U(mc->mc_groups, >, 0);
 544                 mc->mc_groups--;
 545         }
 546         if (mg->mg_initialized)
 547                 mg->mg_no_free_space = B_FALSE;
 548
 549         /*
 550          * A metaslab group is considered allocatable if it has plenty
 551          * of free space or is not heavily fragmented. We only take
 552          * fragmentation into account if the metaslab group has a valid
 553          * fragmentation metric (i.e. a value between 0 and 100).
 554          */
 555         mg->mg_allocatable = (mg->mg_activation_count > 0 &&
 556             mg->mg_free_capacity > zfs_mg_noalloc_threshold &&
 557             (mg->mg_fragmentation == ZFS_FRAG_INVALID ||
 558             mg->mg_fragmentation <= zfs_mg_fragmentation_threshold));
 559
 560         /*
 561          * The mc_alloc_groups maintains a count of the number of
 562          * groups in this metaslab class that are still above the
 563          * zfs_mg_noalloc_threshold. This is used by the allocating
 564          * threads to determine if they should avoid allocations to
 565          * a given group. The allocator will avoid allocations to a group
 566          * if that group has reached or is below the zfs_mg_noalloc_threshold
 567          * and there are still other groups that are above the threshold.
 568          * When a group transitions from allocatable to non-allocatable or
 569          * vice versa we update the metaslab class to reflect that change.
 570          * When the mc_alloc_groups value drops to 0 that means that all
 571          * groups have reached the zfs_mg_noalloc_threshold making all groups
 572          * eligible for allocations. This effectively means that all devices
 573          * are balanced again.
 574          */
 575         if (was_allocatable && !mg->mg_allocatable)
 576                 mc->mc_alloc_groups--;
 577         else if (!was_allocatable && mg->mg_allocatable)
 578                 mc->mc_alloc_groups++;
 579         mutex_exit(&mc->mc_lock);
 580
 581         mutex_exit(&mg->mg_lock);
 582 }
 583
 584 metaslab_group_t *
 585 metaslab_group_create(metaslab_class_t *mc, vdev_t *vd)
 586 {
 587         metaslab_group_t *mg;
 588
 589         mg = kmem_zalloc(sizeof (metaslab_group_t), KM_SLEEP);
 590         mutex_init(&mg->mg_lock, NULL, MUTEX_DEFAULT, NULL);
 591         avl_create(&mg->mg_metaslab_tree, metaslab_compare,
 592             sizeof (metaslab_t), offsetof(struct metaslab, ms_group_node));
 593         mg->mg_vd = vd;
 594         mg->mg_class = mc;
 595         mg->mg_activation_count = 0;
 596         mg->mg_initialized = B_FALSE;
 597         mg->mg_no_free_space = B_TRUE;
 598         refcount_create_tracked(&mg->mg_alloc_queue_depth);
 599
 600         mg->mg_taskq = taskq_create("metaslab_group_taskq", metaslab_load_pct,
 601             maxclsyspri, 10, INT_MAX, TASKQ_THREADS_CPU_PCT | TASKQ_DYNAMIC);
 602
 603         return (mg);
 604 }
 605
 606 void
 607 metaslab_group_destroy(metaslab_group_t *mg)
 608 {
 609         ASSERT(mg->mg_prev == NULL);
 610         ASSERT(mg->mg_next == NULL);
 611         /*
 612          * We may have gone below zero with the activation count
 613          * either because we never activated in the first place or
 614          * because we're done, and possibly removing the vdev.
 615          */
 616         ASSERT(mg->mg_activation_count <= 0);
 617
 618         taskq_destroy(mg->mg_taskq);
 619         avl_destroy(&mg->mg_metaslab_tree);
 620         mutex_destroy(&mg->mg_lock);
 621         refcount_destroy(&mg->mg_alloc_queue_depth);
 622         kmem_free(mg, sizeof (metaslab_group_t));
 623 }
 624
 625 void
 626 metaslab_group_activate(metaslab_group_t *mg)
 627 {
 628         metaslab_class_t *mc = mg->mg_class;
 629         metaslab_group_t *mgprev, *mgnext;
 630
 631         ASSERT3U(spa_config_held(mc->mc_spa, SCL_ALLOC, RW_WRITER), !=, 0);
 632
 633         ASSERT(mc->mc_rotor != mg);
 634         ASSERT(mg->mg_prev == NULL);
 635         ASSERT(mg->mg_next == NULL);
 636         ASSERT(mg->mg_activation_count <= 0);
 637
 638         if (++mg->mg_activation_count <= 0)
 639                 return;
 640
 641         mg->mg_aliquot = metaslab_aliquot * MAX(1, mg->mg_vd->vdev_children);
 642         metaslab_group_alloc_update(mg);
 643
 644         if ((mgprev = mc->mc_rotor) == NULL) {
 645                 mg->mg_prev = mg;
 646                 mg->mg_next = mg;
 647         } else {
 648                 mgnext = mgprev->mg_next;
 649                 mg->mg_prev = mgprev;
 650                 mg->mg_next = mgnext;
 651                 mgprev->mg_next = mg;
 652                 mgnext->mg_prev = mg;
 653         }
 654         mc->mc_rotor = mg;
 655 }
 656
 657 /*
 658  * Passivate a metaslab group and remove it from the allocation rotor.
 659  * Callers must hold both the SCL_ALLOC and SCL_ZIO lock prior to passivating
 660  * a metaslab group. This function will momentarily drop spa_config_locks
 661  * that are lower than the SCL_ALLOC lock (see comment below).
 662  */
 663 void
 664 metaslab_group_passivate(metaslab_group_t *mg)
 665 {
 666         metaslab_class_t *mc = mg->mg_class;
 667         spa_t *spa = mc->mc_spa;
 668         metaslab_group_t *mgprev, *mgnext;
 669         int locks = spa_config_held(spa, SCL_ALL, RW_WRITER);
 670
 671         ASSERT3U(spa_config_held(spa, SCL_ALLOC | SCL_ZIO, RW_WRITER), ==,
 672             (SCL_ALLOC | SCL_ZIO));
 673
 674         if (--mg->mg_activation_count != 0) {
 675                 ASSERT(mc->mc_rotor != mg);
 676                 ASSERT(mg->mg_prev == NULL);
 677                 ASSERT(mg->mg_next == NULL);
 678                 ASSERT(mg->mg_activation_count < 0);
 679                 return;
 680         }
 681
 682         /*
 683          * The spa_config_lock is an array of rwlocks, ordered as
 684          * follows (from highest to lowest):
 685          *      SCL_CONFIG > SCL_STATE > SCL_L2ARC > SCL_ALLOC >
 686          *      SCL_ZIO > SCL_FREE > SCL_VDEV
 687          * (For more information about the spa_config_lock see spa_misc.c)
 688          * The higher the lock, the broader its coverage. When we passivate
 689          * a metaslab group, we must hold both the SCL_ALLOC and the SCL_ZIO
 690          * config locks. However, the metaslab group's taskq might be trying
 691          * to preload metaslabs so we must drop the SCL_ZIO lock and any
 692          * lower locks to allow the I/O to complete. At a minimum,
 693          * we continue to hold the SCL_ALLOC lock, which prevents any future
 694          * allocations from taking place and any changes to the vdev tree.
 695          */
 696         spa_config_exit(spa, locks & ~(SCL_ZIO - 1), spa);
 697         taskq_wait_outstanding(mg->mg_taskq, 0);
 698         spa_config_enter(spa, locks & ~(SCL_ZIO - 1), spa, RW_WRITER);
 699         metaslab_group_alloc_update(mg);
 700
 701         mgprev = mg->mg_prev;
 702         mgnext = mg->mg_next;
 703
 704         if (mg == mgnext) {
 705                 mc->mc_rotor = NULL;
 706         } else {
 707                 mc->mc_rotor = mgnext;
 708                 mgprev->mg_next = mgnext;
 709                 mgnext->mg_prev = mgprev;
 710         }
 711
 712         mg->mg_prev = NULL;
 713         mg->mg_next = NULL;
 714 }
 715
 716 boolean_t
 717 metaslab_group_initialized(metaslab_group_t *mg)
 718 {
 719         vdev_t *vd = mg->mg_vd;
 720         vdev_stat_t *vs = &vd->vdev_stat;
 721
 722         return (vs->vs_space != 0 && mg->mg_activation_count > 0);
 723 }
 724
 725 uint64_t
 726 metaslab_group_get_space(metaslab_group_t *mg)
 727 {
 728         return ((1ULL << mg->mg_vd->vdev_ms_shift) * mg->mg_vd->vdev_ms_count);
 729 }
 730
 731 void
 732 metaslab_group_histogram_verify(metaslab_group_t *mg)
 733 {
 734         uint64_t *mg_hist;
 735         vdev_t *vd = mg->mg_vd;
 736         uint64_t ashift = vd->vdev_ashift;
 737         int i;
 738
 739         if ((zfs_flags & ZFS_DEBUG_HISTOGRAM_VERIFY) == 0)
 740                 return;
 741
 742         mg_hist = kmem_zalloc(sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE,
 743             KM_SLEEP);
 744
 745         ASSERT3U(RANGE_TREE_HISTOGRAM_SIZE, >=,
 746             SPACE_MAP_HISTOGRAM_SIZE + ashift);
 747
 748         for (int m = 0; m < vd->vdev_ms_count; m++) {
 749                 metaslab_t *msp = vd->vdev_ms[m];
 750
 751                 if (msp->ms_sm == NULL)
 752                         continue;
 753
 754                 for (i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++)
 755                         mg_hist[i + ashift] +=
 756                             msp->ms_sm->sm_phys->smp_histogram[i];
 757         }
 758
 759         for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i ++)
 760                 VERIFY3U(mg_hist[i], ==, mg->mg_histogram[i]);
 761
 762         kmem_free(mg_hist, sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE);
 763 }
 764
 765 static void
 766 metaslab_group_histogram_add(metaslab_group_t *mg, metaslab_t *msp)
 767 {
 768         metaslab_class_t *mc = mg->mg_class;
 769         uint64_t ashift = mg->mg_vd->vdev_ashift;
 770
 771         ASSERT(MUTEX_HELD(&msp->ms_lock));
 772         if (msp->ms_sm == NULL)
 773                 return;
 774
 775         mutex_enter(&mg->mg_lock);
 776         for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) {
 777                 mg->mg_histogram[i + ashift] +=
 778                     msp->ms_sm->sm_phys->smp_histogram[i];
 779                 mc->mc_histogram[i + ashift] +=
 780                     msp->ms_sm->sm_phys->smp_histogram[i];
 781         }
 782         mutex_exit(&mg->mg_lock);
 783 }
 784
 785 void
 786 metaslab_group_histogram_remove(metaslab_group_t *mg, metaslab_t *msp)
 787 {
 788         metaslab_class_t *mc = mg->mg_class;
 789         uint64_t ashift = mg->mg_vd->vdev_ashift;
 790
 791         ASSERT(MUTEX_HELD(&msp->ms_lock));
 792         if (msp->ms_sm == NULL)
 793                 return;
 794
 795         mutex_enter(&mg->mg_lock);
 796         for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) {
 797                 ASSERT3U(mg->mg_histogram[i + ashift], >=,
 798                     msp->ms_sm->sm_phys->smp_histogram[i]);
 799                 ASSERT3U(mc->mc_histogram[i + ashift], >=,
 800                     msp->ms_sm->sm_phys->smp_histogram[i]);
 801
 802                 mg->mg_histogram[i + ashift] -=
 803                     msp->ms_sm->sm_phys->smp_histogram[i];
 804                 mc->mc_histogram[i + ashift] -=
 805                     msp->ms_sm->sm_phys->smp_histogram[i];
 806         }
 807         mutex_exit(&mg->mg_lock);
 808 }
 809
 810 static void
 811 metaslab_group_add(metaslab_group_t *mg, metaslab_t *msp)
 812 {
 813         ASSERT(msp->ms_group == NULL);
 814         mutex_enter(&mg->mg_lock);
 815         msp->ms_group = mg;
 816         msp->ms_weight = 0;
 817         avl_add(&mg->mg_metaslab_tree, msp);
 818         mutex_exit(&mg->mg_lock);
 819
 820         mutex_enter(&msp->ms_lock);
 821         metaslab_group_histogram_add(mg, msp);
 822         mutex_exit(&msp->ms_lock);
 823 }
 824
 825 static void
 826 metaslab_group_remove(metaslab_group_t *mg, metaslab_t *msp)
 827 {
 828         mutex_enter(&msp->ms_lock);
 829         metaslab_group_histogram_remove(mg, msp);
 830         mutex_exit(&msp->ms_lock);
 831
 832         mutex_enter(&mg->mg_lock);
 833         ASSERT(msp->ms_group == mg);
 834         avl_remove(&mg->mg_metaslab_tree, msp);
 835         msp->ms_group = NULL;
 836         mutex_exit(&mg->mg_lock);
 837 }
 838
 839 static void
 840 metaslab_group_sort(metaslab_group_t *mg, metaslab_t *msp, uint64_t weight)
 841 {
 842         /*
 843          * Although in principle the weight can be any value, in
 844          * practice we do not use values in the range [1, 511].
 845          */
 846         ASSERT(weight >= SPA_MINBLOCKSIZE || weight == 0);
 847         ASSERT(MUTEX_HELD(&msp->ms_lock));
 848
 849         mutex_enter(&mg->mg_lock);
 850         ASSERT(msp->ms_group == mg);
 851         avl_remove(&mg->mg_metaslab_tree, msp);
 852         msp->ms_weight = weight;
 853         avl_add(&mg->mg_metaslab_tree, msp);
 854         mutex_exit(&mg->mg_lock);
 855 }
 856
 857 /*
 858  * Calculate the fragmentation for a given metaslab group. We can use
 859  * a simple average here since all metaslabs within the group must have
 860  * the same size. The return value will be a value between 0 and 100
 861  * (inclusive), or ZFS_FRAG_INVALID if less than half of the metaslab in this
 862  * group have a fragmentation metric.
 863  */
 864 uint64_t
 865 metaslab_group_fragmentation(metaslab_group_t *mg)
 866 {
 867         vdev_t *vd = mg->mg_vd;
 868         uint64_t fragmentation = 0;
 869         uint64_t valid_ms = 0;
 870
 871         for (int m = 0; m < vd->vdev_ms_count; m++) {
 872                 metaslab_t *msp = vd->vdev_ms[m];
 873
 874                 if (msp->ms_fragmentation == ZFS_FRAG_INVALID)
 875                         continue;
 876
 877                 valid_ms++;
 878                 fragmentation += msp->ms_fragmentation;
 879         }
 880
 881         if (valid_ms <= vd->vdev_ms_count / 2)
 882                 return (ZFS_FRAG_INVALID);
 883
 884         fragmentation /= valid_ms;
 885         ASSERT3U(fragmentation, <=, 100);
 886         return (fragmentation);
 887 }
 888
 889 /*
 890  * Determine if a given metaslab group should skip allocations. A metaslab
 891  * group should avoid allocations if its free capacity is less than the
 892  * zfs_mg_noalloc_threshold or its fragmentation metric is greater than
 893  * zfs_mg_fragmentation_threshold and there is at least one metaslab group
 894  * that can still handle allocations. If the allocation throttle is enabled
 895  * then we skip allocations to devices that have reached their maximum
 896  * allocation queue depth unless the selected metaslab group is the only
 897  * eligible group remaining.
 898  */
 899 static boolean_t
 900 metaslab_group_allocatable(metaslab_group_t *mg, metaslab_group_t *rotor,
 901     uint64_t psize)
 902 {
 903         spa_t *spa = mg->mg_vd->vdev_spa;
 904         metaslab_class_t *mc = mg->mg_class;
 905
 906         /*
 907          * We can only consider skipping this metaslab group if it's
 908          * in the normal metaslab class and there are other metaslab
 909          * groups to select from. Otherwise, we always consider it eligible
 910          * for allocations.
 911          */
 912         if (mc != spa_normal_class(spa) || mc->mc_groups <= 1)
 913                 return (B_TRUE);
 914
 915         /*
 916          * If the metaslab group's mg_allocatable flag is set (see comments
 917          * in metaslab_group_alloc_update() for more information) and
 918          * the allocation throttle is disabled then allow allocations to this
 919          * device. However, if the allocation throttle is enabled then
 920          * check if we have reached our allocation limit (mg_alloc_queue_depth)
 921          * to determine if we should allow allocations to this metaslab group.
 922          * If all metaslab groups are no longer considered allocatable
 923          * (mc_alloc_groups == 0) or we're trying to allocate the smallest
 924          * gang block size then we allow allocations on this metaslab group
 925          * regardless of the mg_allocatable or throttle settings.
 926          */
 927         if (mg->mg_allocatable) {
 928                 metaslab_group_t *mgp;
 929                 int64_t qdepth;
 930                 uint64_t qmax = mg->mg_max_alloc_queue_depth;
 931
 932                 if (!mc->mc_alloc_throttle_enabled)
 933                         return (B_TRUE);
 934
 935                 /*
 936                  * If this metaslab group does not have any free space, then
 937                  * there is no point in looking further.
 938                  */
 939                 if (mg->mg_no_free_space)
 940                         return (B_FALSE);
 941
 942                 qdepth = refcount_count(&mg->mg_alloc_queue_depth);
 943
 944                 /*
 945                  * If this metaslab group is below its qmax or it's
 946                  * the only allocatable metasable group, then attempt
 947                  * to allocate from it.
 948                  */
 949                 if (qdepth < qmax || mc->mc_alloc_groups == 1)
 950                         return (B_TRUE);
 951                 ASSERT3U(mc->mc_alloc_groups, >, 1);
 952
 953                 /*
 954                  * Since this metaslab group is at or over its qmax, we
 955                  * need to determine if there are metaslab groups after this
 956                  * one that might be able to handle this allocation. This is
 957                  * racy since we can't hold the locks for all metaslab
 958                  * groups at the same time when we make this check.
 959                  */
 960                 for (mgp = mg->mg_next; mgp != rotor; mgp = mgp->mg_next) {
 961                         qmax = mgp->mg_max_alloc_queue_depth;
 962
 963                         qdepth = refcount_count(&mgp->mg_alloc_queue_depth);
 964
 965                         /*
 966                          * If there is another metaslab group that
 967                          * might be able to handle the allocation, then
 968                          * we return false so that we skip this group.
 969                          */
 970                         if (qdepth < qmax && !mgp->mg_no_free_space)
 971                                 return (B_FALSE);
 972                 }
 973
 974                 /*
 975                  * We didn't find another group to handle the allocation
 976                  * so we can't skip this metaslab group even though
 977                  * we are at or over our qmax.
 978                  */
 979                 return (B_TRUE);
 980
 981         } else if (mc->mc_alloc_groups == 0 || psize == SPA_MINBLOCKSIZE) {
 982                 return (B_TRUE);
 983         }
 984         return (B_FALSE);
 985 }
 986
 987 /*
 988  * ==========================================================================
 989  * Range tree callbacks
 990  * ==========================================================================
 991  */
 992
 993 /*
 994  * Comparison function for the private size-ordered tree. Tree is sorted
 995  * by size, larger sizes at the end of the tree.
 996  */
 997 static int
 998 metaslab_rangesize_compare(const void *x1, const void *x2)
 999 {
1000         const range_seg_t *r1 = x1;
1001         const range_seg_t *r2 = x2;
1002         uint64_t rs_size1 = r1->rs_end - r1->rs_start;
1003         uint64_t rs_size2 = r2->rs_end - r2->rs_start;
1004
1005         int cmp = AVL_CMP(rs_size1, rs_size2);
1006         if (likely(cmp))
1007                 return (cmp);
1008
1009         return (AVL_CMP(r1->rs_start, r2->rs_start));
1010 }
1011
1012 /*
1013  * ==========================================================================
1014  * Common allocator routines
1015  * ==========================================================================
1016  */
1017
1018 /*
1019  * Return the maximum contiguous segment within the metaslab.
1020  */
1021 uint64_t
1022 metaslab_block_maxsize(metaslab_t *msp)
1023 {
1024         avl_tree_t *t = &msp->ms_size_tree;
1025         range_seg_t *rs;
1026
1027         if (t == NULL || (rs = avl_last(t)) == NULL)
1028                 return (0ULL);
1029
1030         return (rs->rs_end - rs->rs_start);
1031 }
1032
1033 static range_seg_t *
1034 metaslab_block_find(avl_tree_t *t, uint64_t start, uint64_t size)
1035 {
1036         range_seg_t *rs, rsearch;
1037         avl_index_t where;
1038
1039         rsearch.rs_start = start;
1040         rsearch.rs_end = start + size;
1041
1042         rs = avl_find(t, &rsearch, &where);
1043         if (rs == NULL) {
1044                 rs = avl_nearest(t, where, AVL_AFTER);
1045         }
1046
1047         return (rs);
1048 }
1049
#if defined(WITH_FF_BLOCK_ALLOCATOR) || \
    defined(WITH_DF_BLOCK_ALLOCATOR) || \
    defined(WITH_CF_BLOCK_ALLOCATOR)
/*
 * This is a helper function that can be used by the allocator to find
 * a suitable block to allocate. Starting at the segment located by
 * metaslab_block_find() for *cursor, it walks forward through AVL tree 't'
 * looking for the first segment that can hold 'size' bytes at an offset
 * rounded up to 'align'.
 *
 * On success the chosen offset is returned and *cursor is advanced to the
 * end of the allocation. If nothing fits and the search did not start
 * from offset 0, the cursor is reset to 0 and the search is repeated once
 * from the beginning. Returns -1ULL when no suitable segment exists.
 */
static uint64_t
metaslab_block_picker(avl_tree_t *t, uint64_t *cursor, uint64_t size,
    uint64_t align)
{
        for (;;) {
                range_seg_t *rs;

                for (rs = metaslab_block_find(t, *cursor, size); rs != NULL;
                    rs = AVL_NEXT(t, rs)) {
                        uint64_t offset = P2ROUNDUP(rs->rs_start, align);

                        if (offset + size <= rs->rs_end) {
                                *cursor = offset + size;
                                return (offset);
                        }
                }

                /*
                 * If we know we've searched the whole map (*cursor == 0),
                 * give up. Otherwise, reset the cursor to the beginning
                 * and try again.
                 */
                if (*cursor == 0)
                        return (-1ULL);

                *cursor = 0;
        }
}
#endif /* WITH_FF/DF/CF_BLOCK_ALLOCATOR */
1085
#if defined(WITH_FF_BLOCK_ALLOCATOR)
/*
 * ==========================================================================
 * The first-fit block allocator
 * ==========================================================================
 */

/*
 * Allocate 'size' bytes from the metaslab's offset-ordered range tree,
 * using the per-alignment cursor in ms_lbas as the starting point of the
 * search. Returns the allocated offset, or -1ULL if nothing fits.
 */
static uint64_t
metaslab_ff_alloc(metaslab_t *msp, uint64_t size)
{
        avl_tree_t *offset_tree = &msp->ms_tree->rt_root;

        /*
         * Find the largest power of 2 block size that evenly divides the
         * requested size. This is used to try to allocate blocks with
         * similar alignment from the same area of the metaslab (i.e. same
         * cursor bucket), but it does not guarantee that blocks of other
         * sizes are kept out of that region.
         */
        uint64_t align = size & -size;
        uint64_t *cursor = &msp->ms_lbas[highbit64(align) - 1];

        return (metaslab_block_picker(offset_tree, cursor, size, align));
}

static metaslab_ops_t metaslab_ff_ops = {
        metaslab_ff_alloc
};

metaslab_ops_t *zfs_metaslab_ops = &metaslab_ff_ops;
#endif /* WITH_FF_BLOCK_ALLOCATOR */
1115
#if defined(WITH_DF_BLOCK_ALLOCATOR)
/*
 * ==========================================================================
 * Dynamic block allocator -
 * Uses the first fit allocation scheme until space gets low and then
 * adjusts to a best fit allocation method. Uses metaslab_df_alloc_threshold
 * and metaslab_df_free_pct to determine when to switch the allocation scheme.
 * ==========================================================================
 */

/*
 * Allocate 'size' bytes from the metaslab. Returns the allocated offset,
 * or -1ULL if the largest free segment is smaller than 'size' or no
 * suitable segment is found. The caller must hold msp->ms_lock.
 */
static uint64_t
metaslab_df_alloc(metaslab_t *msp, uint64_t size)
{
        /*
         * Find the largest power of 2 block size that evenly divides the
         * requested size. This is used to try to allocate blocks with
         * similar alignment from the same area of the metaslab (i.e. same
         * cursor bucket), but it does not guarantee that blocks of other
         * sizes are kept out of that region. Note that it only selects
         * the cursor; the search itself is done with an alignment of 1.
         */
        uint64_t align = size & -size;
        uint64_t *cursor = &msp->ms_lbas[highbit64(align) - 1];
        range_tree_t *rt = msp->ms_tree;
        avl_tree_t *search_tree = &rt->rt_root;
        uint64_t largest = metaslab_block_maxsize(msp);
        int pct_free = range_tree_space(rt) * 100 / msp->ms_size;

        ASSERT(MUTEX_HELD(&msp->ms_lock));
        ASSERT3U(avl_numnodes(search_tree), ==,
            avl_numnodes(&msp->ms_size_tree));

        /* Not even the largest free segment can satisfy the request. */
        if (largest < size)
                return (-1ULL);

        /*
         * If we're running low on space switch to using the size
         * sorted AVL tree (best-fit), searching it from the start.
         */
        if (largest < metaslab_df_alloc_threshold ||
            pct_free < metaslab_df_free_pct) {
                *cursor = 0;
                search_tree = &msp->ms_size_tree;
        }

        return (metaslab_block_picker(search_tree, cursor, size, 1ULL));
}

static metaslab_ops_t metaslab_df_ops = {
        metaslab_df_alloc
};

metaslab_ops_t *zfs_metaslab_ops = &metaslab_df_ops;
#endif /* WITH_DF_BLOCK_ALLOCATOR */
1167
#if defined(WITH_CF_BLOCK_ALLOCATOR)
/*
 * ==========================================================================
 * Cursor fit block allocator -
 * Select the largest region in the metaslab, set the cursor to the beginning
 * of the range and the cursor_end to the end of the range. As allocations
 * are made advance the cursor. Continue allocating from the cursor until
 * the range is exhausted and then find a new range.
 * ==========================================================================
 */

/*
 * Allocate 'size' bytes at the current cursor (ms_lbas[0]), which is
 * bounded by cursor_end (ms_lbas[1]). Returns the allocated offset, or
 * -1ULL if the current range is exhausted and the largest free segment
 * cannot hold 'size' bytes. The caller must hold msp->ms_lock.
 */
static uint64_t
metaslab_cf_alloc(metaslab_t *msp, uint64_t size)
{
        range_tree_t *rt = msp->ms_tree;
        avl_tree_t *t = &msp->ms_size_tree;
        uint64_t *cursor = &msp->ms_lbas[0];
        uint64_t *cursor_end = &msp->ms_lbas[1];
        uint64_t offset;

        ASSERT(MUTEX_HELD(&msp->ms_lock));
        ASSERT3U(avl_numnodes(t), ==, avl_numnodes(&rt->rt_root));

        ASSERT3U(*cursor_end, >=, *cursor);

        /*
         * The current range cannot satisfy the request; move the cursor
         * to the largest free segment, provided that it is big enough.
         */
        if ((*cursor + size) > *cursor_end) {
                range_seg_t *largest = avl_last(&msp->ms_size_tree);

                if (largest == NULL ||
                    (largest->rs_end - largest->rs_start) < size)
                        return (-1ULL);

                *cursor = largest->rs_start;
                *cursor_end = largest->rs_end;
        }

        /* Hand out the space at the cursor and advance past it. */
        offset = *cursor;
        *cursor = offset + size;

        return (offset);
}

static metaslab_ops_t metaslab_cf_ops = {
        metaslab_cf_alloc
};

metaslab_ops_t *zfs_metaslab_ops = &metaslab_cf_ops;
#endif /* WITH_CF_BLOCK_ALLOCATOR */
1215
#if defined(WITH_NDF_BLOCK_ALLOCATOR)
/*
 * ==========================================================================
 * New dynamic fit allocator -
 * Select a region that is large enough to allocate 2^metaslab_ndf_clump_shift
 * contiguous blocks. If no region is found then just use the largest segment
 * that remains.
 * ==========================================================================
 */

/*
 * Determines desired number of contiguous blocks (2^metaslab_ndf_clump_shift)
 * to request from the allocator.
 */
uint64_t metaslab_ndf_clump_shift = 4;

/*
 * Allocate 'size' bytes from the metaslab. The offset-ordered tree is
 * probed at the cursor kept for this size class first; if that does not
 * yield a segment of at least 'size' bytes, the size-ordered tree is
 * searched for a clump-sized segment instead. Returns the allocated
 * offset, or -1ULL on failure. The caller must hold msp->ms_lock.
 */
static uint64_t
metaslab_ndf_alloc(metaslab_t *msp, uint64_t size)
{
        avl_tree_t *t = &msp->ms_tree->rt_root;
        avl_index_t where;
        range_seg_t key;
        range_seg_t *rs;
        uint64_t hbit = highbit64(size);
        uint64_t *cursor = &msp->ms_lbas[hbit - 1];
        uint64_t largest = metaslab_block_maxsize(msp);

        ASSERT(MUTEX_HELD(&msp->ms_lock));
        ASSERT3U(avl_numnodes(t), ==, avl_numnodes(&msp->ms_size_tree));

        /* Not even the largest free segment can satisfy the request. */
        if (largest < size)
                return (-1ULL);

        /* First try the segment matching the cursor for this size class. */
        key.rs_start = *cursor;
        key.rs_end = *cursor + size;
        rs = avl_find(t, &key, &where);

        if (rs == NULL || (rs->rs_end - rs->rs_start) < size) {
                /*
                 * Fall back to the size-ordered tree, asking for a clump
                 * of blocks capped at the largest available segment.
                 */
                t = &msp->ms_size_tree;

                key.rs_start = 0;
                key.rs_end = MIN(largest,
                    1ULL << (hbit + metaslab_ndf_clump_shift));

                rs = avl_find(t, &key, &where);
                if (rs == NULL)
                        rs = avl_nearest(t, where, AVL_AFTER);
                ASSERT(rs != NULL);
        }

        if ((rs->rs_end - rs->rs_start) < size)
                return (-1ULL);

        *cursor = rs->rs_start + size;
        return (rs->rs_start);
}

static metaslab_ops_t metaslab_ndf_ops = {
        metaslab_ndf_alloc
};

metaslab_ops_t *zfs_metaslab_ops = &metaslab_ndf_ops;
#endif /* WITH_NDF_BLOCK_ALLOCATOR */
1277
1278
1279 /*
1280  * ==========================================================================
1281  * Metaslabs
1282  * ==========================================================================
1283  */
1284
1285 /*
1286  * Wait for any in-progress metaslab loads to complete.
1287  */
1288 void
1289 metaslab_load_wait(metaslab_t *msp)
1290 {
1291         ASSERT(MUTEX_HELD(&msp->ms_lock));
1292
1293         while (msp->ms_loading) {
1294                 ASSERT(!msp->ms_loaded);
1295                 cv_wait(&msp->ms_load_cv, &msp->ms_lock);
1296         }
1297 }
1298
1299 int
1300 metaslab_load(metaslab_t *msp)
1301 {
1302         int error = 0;
1303         boolean_t success = B_FALSE;
1304
1305         ASSERT(MUTEX_HELD(&msp->ms_lock));
1306         ASSERT(!msp->ms_loaded);
1307         ASSERT(!msp->ms_loading);
1308
1309         msp->ms_loading = B_TRUE;
1310         /*
1311          * Nobody else can manipulate a loading metaslab, so it's now safe
1312          * to drop the lock.  This way we don't have to hold the lock while
1313          * reading the spacemap from disk.
1314          */
1315         mutex_exit(&msp->ms_lock);
1316
1317         /*
1318          * If the space map has not been allocated yet, then treat
1319          * all the space in the metaslab as free and add it to the
1320          * ms_tree.
1321          */
1322         if (msp->ms_sm != NULL)
1323                 error = space_map_load(msp->ms_sm, msp->ms_tree, SM_FREE);
1324         else
1325                 range_tree_add(msp->ms_tree, msp->ms_start, msp->ms_size);
1326
1327         success = (error == 0);
1328
1329         mutex_enter(&msp->ms_lock);
1330         msp->ms_loading = B_FALSE;
1331
1332         if (success) {
1333                 ASSERT3P(msp->ms_group, !=, NULL);
1334                 msp->ms_loaded = B_TRUE;
1335
1336                 for (int t = 0; t < TXG_DEFER_SIZE; t++) {
1337                         range_tree_walk(msp->ms_defertree[t],
1338                             range_tree_remove, msp->ms_tree);
1339                 }
1340                 msp->ms_max_size = metaslab_block_maxsize(msp);
1341         }
1342         cv_broadcast(&msp->ms_load_cv);
1343         return (error);
1344 }
1345
1346 void
1347 metaslab_unload(metaslab_t *msp)
1348 {
1349         ASSERT(MUTEX_HELD(&msp->ms_lock));
1350         range_tree_vacate(msp->ms_tree, NULL, NULL);
1351         msp->ms_loaded = B_FALSE;
1352         msp->ms_weight &= ~METASLAB_ACTIVE_MASK;
1353         msp->ms_max_size = 0;
1354 }
1355
1356 int
1357 metaslab_init(metaslab_group_t *mg, uint64_t id, uint64_t object, uint64_t txg,
1358     metaslab_t **msp)
1359 {
1360         vdev_t *vd = mg->mg_vd;
1361         objset_t *mos = vd->vdev_spa->spa_meta_objset;
1362         metaslab_t *ms;
1363         int error;
1364
1365         ms = kmem_zalloc(sizeof (metaslab_t), KM_SLEEP);
1366         mutex_init(&ms->ms_lock, NULL, MUTEX_DEFAULT, NULL);
1367         mutex_init(&ms->ms_sync_lock, NULL, MUTEX_DEFAULT, NULL);
1368         cv_init(&ms->ms_load_cv, NULL, CV_DEFAULT, NULL);
1369         ms->ms_id = id;
1370         ms->ms_start = id << vd->vdev_ms_shift;
1371         ms->ms_size = 1ULL << vd->vdev_ms_shift;
1372
1373         /*
1374          * We only open space map objects that already exist. All others
1375          * will be opened when we finally allocate an object for it.
1376          */
1377         if (object != 0) {
1378                 error = space_map_open(&ms->ms_sm, mos, object, ms->ms_start,
1379                     ms->ms_size, vd->vdev_ashift);
1380
1381                 if (error != 0) {
1382                         kmem_free(ms, sizeof (metaslab_t));
1383                         return (error);
1384                 }
1385
1386                 ASSERT(ms->ms_sm != NULL);
1387         }
1388
1389         /*
1390          * We create the main range tree here, but we don't create the
1391          * other range trees until metaslab_sync_done().  This serves
1392          * two purposes: it allows metaslab_sync_done() to detect the
1393          * addition of new space; and for debugging, it ensures that we'd
1394          * data fault on any attempt to use this metaslab before it's ready.
1395          */
1396         ms->ms_tree = range_tree_create_impl(&rt_avl_ops, &ms->ms_size_tree,
1397             metaslab_rangesize_compare, 0);
1398         metaslab_group_add(mg, ms);
1399
1400         metaslab_set_fragmentation(ms);
1401
1402         /*
1403          * If we're opening an existing pool (txg == 0) or creating
1404          * a new one (txg == TXG_INITIAL), all space is available now.
1405          * If we're adding space to an existing pool, the new space
1406          * does not become available until after this txg has synced.
1407          * The metaslab's weight will also be initialized when we sync
1408          * out this txg. This ensures that we don't attempt to allocate
1409          * from it before we have initialized it completely.
1410          */
1411         if (txg <= TXG_INITIAL)
1412                 metaslab_sync_done(ms, 0);
1413
1414         /*
1415          * If metaslab_debug_load is set and we're initializing a metaslab
1416          * that has an allocated space map object then load the its space
1417          * map so that can verify frees.
1418          */
1419         if (metaslab_debug_load && ms->ms_sm != NULL) {
1420                 mutex_enter(&ms->ms_lock);
1421                 VERIFY0(metaslab_load(ms));
1422                 mutex_exit(&ms->ms_lock);
1423         }
1424
1425         if (txg != 0) {
1426                 vdev_dirty(vd, 0, NULL, txg);
1427                 vdev_dirty(vd, VDD_METASLAB, ms, txg);
1428         }
1429
1430         *msp = ms;
1431
1432         return (0);
1433 }
1434
1435 void
1436 metaslab_fini(metaslab_t *msp)
1437 {
1438         metaslab_group_t *mg = msp->ms_group;
1439
1440         metaslab_group_remove(mg, msp);
1441
1442         mutex_enter(&msp->ms_lock);
1443         VERIFY(msp->ms_group == NULL);
1444         vdev_space_update(mg->mg_vd, -space_map_allocated(msp->ms_sm),
1445             0, -msp->ms_size);
1446         space_map_close(msp->ms_sm);
1447
1448         metaslab_unload(msp);
1449         range_tree_destroy(msp->ms_tree);
1450         range_tree_destroy(msp->ms_freeingtree);
1451         range_tree_destroy(msp->ms_freedtree);
1452
1453         for (int t = 0; t < TXG_SIZE; t++) {
1454                 range_tree_destroy(msp->ms_alloctree[t]);
1455         }
1456
1457         for (int t = 0; t < TXG_DEFER_SIZE; t++) {
1458                 range_tree_destroy(msp->ms_defertree[t]);
1459         }
1460
1461         ASSERT0(msp->ms_deferspace);
1462
1463         mutex_exit(&msp->ms_lock);
1464         cv_destroy(&msp->ms_load_cv);
1465         mutex_destroy(&msp->ms_lock);
1466         mutex_destroy(&msp->ms_sync_lock);
1467
1468         kmem_free(msp, sizeof (metaslab_t));
1469 }
1470
#define FRAGMENTATION_TABLE_SIZE        17

/*
 * This table defines a segment size based fragmentation metric that will
 * allow each metaslab to derive its own fragmentation value. This is done
 * by calculating the space in each bucket of the spacemap histogram and
 * multiplying that by the fragmentation metric in this table. Doing
 * this for all buckets and dividing it by the total amount of free
 * space in this metaslab (i.e. the total free space in all buckets) gives
 * us the fragmentation metric. This means that a high fragmentation metric
 * equates to most of the free space being comprised of small segments.
 * Conversely, if the metric is low, then most of the free space is in
 * large segments. A 10% change in fragmentation equates to approximately
 * double the number of segments.
 *
 * This table defines 0% fragmented space using 16MB segments. Testing has
 * shown that segments that are greater than or equal to 16MB do not suffer
 * from drastic performance problems. Using this value, we derive the rest
 * of the table. Since the fragmentation value is never stored on disk, it
 * is possible to change these calculations in the future.
 *
 * The table has one more slot than the sixteen sizes from 512B to 16M;
 * that final slot is 0 as well and is spelled out explicitly below.
 */
int zfs_frag_table[FRAGMENTATION_TABLE_SIZE] = {
        100,    /* 512B */
        100,    /* 1K   */
        98,     /* 2K   */
        95,     /* 4K   */
        90,     /* 8K   */
        80,     /* 16K  */
        70,     /* 32K  */
        60,     /* 64K  */
        50,     /* 128K */
        40,     /* 256K */
        30,     /* 512K */
        20,     /* 1M   */
        15,     /* 2M   */
        10,     /* 4M   */
        5,      /* 8M   */
        0,      /* 16M  */
        0       /* 32M  */
};
1510
1511 /*
1512  * Calclate the metaslab's fragmentation metric. A return value
1513  * of ZFS_FRAG_INVALID means that the metaslab has not been upgraded and does
1514  * not support this metric. Otherwise, the return value should be in the
1515  * range [0, 100].
1516  */
1517 static void
1518 metaslab_set_fragmentation(metaslab_t *msp)
1519 {
1520         spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
1521         uint64_t fragmentation = 0;
1522         uint64_t total = 0;
1523         boolean_t feature_enabled = spa_feature_is_enabled(spa,
1524             SPA_FEATURE_SPACEMAP_HISTOGRAM);
1525
1526         if (!feature_enabled) {
1527                 msp->ms_fragmentation = ZFS_FRAG_INVALID;
1528                 return;
1529         }
1530
1531         /*
1532          * A null space map means that the entire metaslab is free
1533          * and thus is not fragmented.
1534          */
1535         if (msp->ms_sm == NULL) {
1536                 msp->ms_fragmentation = 0;
1537                 return;
1538         }
1539
1540         /*
1541          * If this metaslab's space map has not been upgraded, flag it
1542          * so that we upgrade next time we encounter it.
1543          */
1544         if (msp->ms_sm->sm_dbuf->db_size != sizeof (space_map_phys_t)) {
1545                 uint64_t txg = spa_syncing_txg(spa);
1546                 vdev_t *vd = msp->ms_group->mg_vd;
1547
1548                 /*
1549                  * If we've reached the final dirty txg, then we must
1550                  * be shutting down the pool. We don't want to dirty
1551                  * any data past this point so skip setting the condense
1552                  * flag. We can retry this action the next time the pool
1553                  * is imported.
1554                  */
1555                 if (spa_writeable(spa) && txg < spa_final_dirty_txg(spa)) {
1556                         msp->ms_condense_wanted = B_TRUE;
1557                         vdev_dirty(vd, VDD_METASLAB, msp, txg + 1);
1558                         zfs_dbgmsg("txg %llu, requesting force condense: "
1559                             "ms_id %llu, vdev_id %llu", txg, msp->ms_id,
1560                             vd->vdev_id);
1561                 }
1562                 msp->ms_fragmentation = ZFS_FRAG_INVALID;
1563                 return;
1564         }
1565
1566         for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) {
1567                 uint64_t space = 0;
1568                 uint8_t shift = msp->ms_sm->sm_shift;
1569
1570                 int idx = MIN(shift - SPA_MINBLOCKSHIFT + i,
1571                     FRAGMENTATION_TABLE_SIZE - 1);
1572
1573                 if (msp->ms_sm->sm_phys->smp_histogram[i] == 0)
1574                         continue;
1575
1576                 space = msp->ms_sm->sm_phys->smp_histogram[i] << (i + shift);
1577                 total += space;
1578
1579                 ASSERT3U(idx, <, FRAGMENTATION_TABLE_SIZE);
1580                 fragmentation += space * zfs_frag_table[idx];
1581         }
1582
1583         if (total > 0)
1584                 fragmentation /= total;
1585         ASSERT3U(fragmentation, <=, 100);
1586
1587         msp->ms_fragmentation = fragmentation;
1588 }
1589
1590 /*
1591  * Compute a weight -- a selection preference value -- for the given metaslab.
1592  * This is based on the amount of free space, the level of fragmentation,
1593  * the LBA range, and whether the metaslab is loaded.
1594  */
1595 static uint64_t
1596 metaslab_space_weight(metaslab_t *msp)
1597 {
1598         metaslab_group_t *mg = msp->ms_group;
1599         vdev_t *vd = mg->mg_vd;
1600         uint64_t weight, space;
1601
1602         ASSERT(MUTEX_HELD(&msp->ms_lock));
1603         ASSERT(!vd->vdev_removing);
1604
1605         /*
1606          * The baseline weight is the metaslab's free space.
1607          */
1608         space = msp->ms_size - space_map_allocated(msp->ms_sm);
1609
1610         if (metaslab_fragmentation_factor_enabled &&
1611             msp->ms_fragmentation != ZFS_FRAG_INVALID) {
1612                 /*
1613                  * Use the fragmentation information to inversely scale
1614                  * down the baseline weight. We need to ensure that we
1615                  * don't exclude this metaslab completely when it's 100%
1616                  * fragmented. To avoid this we reduce the fragmented value
1617                  * by 1.
1618                  */
1619                 space = (space * (100 - (msp->ms_fragmentation - 1))) / 100;
1620
1621                 /*
1622                  * If space < SPA_MINBLOCKSIZE, then we will not allocate from
1623                  * this metaslab again. The fragmentation metric may have
1624                  * decreased the space to something smaller than
1625                  * SPA_MINBLOCKSIZE, so reset the space to SPA_MINBLOCKSIZE
1626                  * so that we can consume any remaining space.
1627                  */
1628                 if (space > 0 && space < SPA_MINBLOCKSIZE)
1629                         space = SPA_MINBLOCKSIZE;
1630         }
1631         weight = space;
1632
1633         /*
1634          * Modern disks have uniform bit density and constant angular velocity.
1635          * Therefore, the outer recording zones are faster (higher bandwidth)
1636          * than the inner zones by the ratio of outer to inner track diameter,
1637          * which is typically around 2:1.  We account for this by assigning
1638          * higher weight to lower metaslabs (multiplier ranging from 2x to 1x).
1639          * In effect, this means that we'll select the metaslab with the most
1640          * free bandwidth rather than simply the one with the most free space.
1641          */
1642         if (!vd->vdev_nonrot && metaslab_lba_weighting_enabled) {
1643                 weight = 2 * weight - (msp->ms_id * weight) / vd->vdev_ms_count;
1644                 ASSERT(weight >= space && weight <= 2 * space);
1645         }
1646
1647         /*
1648          * If this metaslab is one we're actively using, adjust its
1649          * weight to make it preferable to any inactive metaslab so
1650          * we'll polish it off. If the fragmentation on this metaslab
1651          * has exceed our threshold, then don't mark it active.
1652          */
1653         if (msp->ms_loaded && msp->ms_fragmentation != ZFS_FRAG_INVALID &&
1654             msp->ms_fragmentation <= zfs_metaslab_fragmentation_threshold) {
1655                 weight |= (msp->ms_weight & METASLAB_ACTIVE_MASK);
1656         }
1657
1658         WEIGHT_SET_SPACEBASED(weight);
1659         return (weight);
1660 }
1661
1662 /*
1663  * Return the weight of the specified metaslab, according to the segment-based
1664  * weighting algorithm. The metaslab must be loaded. This function can
1665  * be called within a sync pass since it relies only on the metaslab's
1666  * range tree which is always accurate when the metaslab is loaded.
1667  */
1668 static uint64_t
1669 metaslab_weight_from_range_tree(metaslab_t *msp)
1670 {
1671         uint64_t weight = 0;
1672         uint32_t segments = 0;
1673
1674         ASSERT(msp->ms_loaded);
1675
1676         for (int i = RANGE_TREE_HISTOGRAM_SIZE - 1; i >= SPA_MINBLOCKSHIFT;
1677             i--) {
1678                 uint8_t shift = msp->ms_group->mg_vd->vdev_ashift;
1679                 int max_idx = SPACE_MAP_HISTOGRAM_SIZE + shift - 1;
1680
1681                 segments <<= 1;
1682                 segments += msp->ms_tree->rt_histogram[i];
1683
1684                 /*
1685                  * The range tree provides more precision than the space map
1686                  * and must be downgraded so that all values fit within the
1687                  * space map's histogram. This allows us to compare loaded
1688                  * vs. unloaded metaslabs to determine which metaslab is
1689                  * considered "best".
1690                  */
1691                 if (i > max_idx)
1692                         continue;
1693
1694                 if (segments != 0) {
1695                         WEIGHT_SET_COUNT(weight, segments);
1696                         WEIGHT_SET_INDEX(weight, i);
1697                         WEIGHT_SET_ACTIVE(weight, 0);
1698                         break;
1699                 }
1700         }
1701         return (weight);
1702 }
1703
1704 /*
1705  * Calculate the weight based on the on-disk histogram. This should only
1706  * be called after a sync pass has completely finished since the on-disk
1707  * information is updated in metaslab_sync().
1708  */
1709 static uint64_t
1710 metaslab_weight_from_spacemap(metaslab_t *msp)
1711 {
1712         uint64_t weight = 0;
1713
1714         for (int i = SPACE_MAP_HISTOGRAM_SIZE - 1; i >= 0; i--) {
1715                 if (msp->ms_sm->sm_phys->smp_histogram[i] != 0) {
1716                         WEIGHT_SET_COUNT(weight,
1717                             msp->ms_sm->sm_phys->smp_histogram[i]);
1718                         WEIGHT_SET_INDEX(weight, i +
1719                             msp->ms_sm->sm_shift);
1720                         WEIGHT_SET_ACTIVE(weight, 0);
1721                         break;
1722                 }
1723         }
1724         return (weight);
1725 }
1726
1727 /*
1728  * Compute a segment-based weight for the specified metaslab. The weight
1729  * is determined by highest bucket in the histogram. The information
1730  * for the highest bucket is encoded into the weight value.
1731  */
1732 static uint64_t
1733 metaslab_segment_weight(metaslab_t *msp)
1734 {
1735         metaslab_group_t *mg = msp->ms_group;
1736         uint64_t weight = 0;
1737         uint8_t shift = mg->mg_vd->vdev_ashift;
1738
1739         ASSERT(MUTEX_HELD(&msp->ms_lock));
1740
1741         /*
1742          * The metaslab is completely free.
1743          */
1744         if (space_map_allocated(msp->ms_sm) == 0) {
1745                 int idx = highbit64(msp->ms_size) - 1;
1746                 int max_idx = SPACE_MAP_HISTOGRAM_SIZE + shift - 1;
1747
1748                 if (idx < max_idx) {
1749                         WEIGHT_SET_COUNT(weight, 1ULL);
1750                         WEIGHT_SET_INDEX(weight, idx);
1751                 } else {
1752                         WEIGHT_SET_COUNT(weight, 1ULL << (idx - max_idx));
1753                         WEIGHT_SET_INDEX(weight, max_idx);
1754                 }
1755                 WEIGHT_SET_ACTIVE(weight, 0);
1756                 ASSERT(!WEIGHT_IS_SPACEBASED(weight));
1757
1758                 return (weight);
1759         }
1760
1761         ASSERT3U(msp->ms_sm->sm_dbuf->db_size, ==, sizeof (space_map_phys_t));
1762
1763         /*
1764          * If the metaslab is fully allocated then just make the weight 0.
1765          */
1766         if (space_map_allocated(msp->ms_sm) == msp->ms_size)
1767                 return (0);
1768         /*
1769          * If the metaslab is already loaded, then use the range tree to
1770          * determine the weight. Otherwise, we rely on the space map information
1771          * to generate the weight.
1772          */
1773         if (msp->ms_loaded) {
1774                 weight = metaslab_weight_from_range_tree(msp);
1775         } else {
1776                 weight = metaslab_weight_from_spacemap(msp);
1777         }
1778
1779         /*
1780          * If the metaslab was active the last time we calculated its weight
1781          * then keep it active. We want to consume the entire region that
1782          * is associated with this weight.
1783          */
1784         if (msp->ms_activation_weight != 0 && weight != 0)
1785                 WEIGHT_SET_ACTIVE(weight, WEIGHT_GET_ACTIVE(msp->ms_weight));
1786         return (weight);
1787 }
1788
1789 /*
1790  * Determine if we should attempt to allocate from this metaslab. If the
1791  * metaslab has a maximum size then we can quickly determine if the desired
1792  * allocation size can be satisfied. Otherwise, if we're using segment-based
1793  * weighting then we can determine the maximum allocation that this metaslab
1794  * can accommodate based on the index encoded in the weight. If we're using
1795  * space-based weights then rely on the entire weight (excluding the weight
1796  * type bit).
1797  */
1798 boolean_t
1799 metaslab_should_allocate(metaslab_t *msp, uint64_t asize)
1800 {
1801         boolean_t should_allocate;
1802
1803         if (msp->ms_max_size != 0)
1804                 return (msp->ms_max_size >= asize);
1805
1806         if (!WEIGHT_IS_SPACEBASED(msp->ms_weight)) {
1807                 /*
1808                  * The metaslab segment weight indicates segments in the
1809                  * range [2^i, 2^(i+1)), where i is the index in the weight.
1810                  * Since the asize might be in the middle of the range, we
1811                  * should attempt the allocation if asize < 2^(i+1).
1812                  */
1813                 should_allocate = (asize <
1814                     1ULL << (WEIGHT_GET_INDEX(msp->ms_weight) + 1));
1815         } else {
1816                 should_allocate = (asize <=
1817                     (msp->ms_weight & ~METASLAB_WEIGHT_TYPE));
1818         }
1819         return (should_allocate);
1820 }
1821 static uint64_t
1822 metaslab_weight(metaslab_t *msp)
1823 {
1824         vdev_t *vd = msp->ms_group->mg_vd;
1825         spa_t *spa = vd->vdev_spa;
1826         uint64_t weight;
1827
1828         ASSERT(MUTEX_HELD(&msp->ms_lock));
1829
1830         /*
1831          * If this vdev is in the process of being removed, there is nothing
1832          * for us to do here.
1833          */
1834         if (vd->vdev_removing)
1835                 return (0);
1836
1837         metaslab_set_fragmentation(msp);
1838
1839         /*
1840          * Update the maximum size if the metaslab is loaded. This will
1841          * ensure that we get an accurate maximum size if newly freed space
1842          * has been added back into the free tree.
1843          */
1844         if (msp->ms_loaded)
1845                 msp->ms_max_size = metaslab_block_maxsize(msp);
1846
1847         /*
1848          * Segment-based weighting requires space map histogram support.
1849          */
1850         if (zfs_metaslab_segment_weight_enabled &&
1851             spa_feature_is_enabled(spa, SPA_FEATURE_SPACEMAP_HISTOGRAM) &&
1852             (msp->ms_sm == NULL || msp->ms_sm->sm_dbuf->db_size ==
1853             sizeof (space_map_phys_t))) {
1854                 weight = metaslab_segment_weight(msp);
1855         } else {
1856                 weight = metaslab_space_weight(msp);
1857         }
1858         return (weight);
1859 }
1860
1861 static int
1862 metaslab_activate(metaslab_t *msp, uint64_t activation_weight)
1863 {
1864         ASSERT(MUTEX_HELD(&msp->ms_lock));
1865
1866         if ((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0) {
1867                 metaslab_load_wait(msp);
1868                 if (!msp->ms_loaded) {
1869                         int error = metaslab_load(msp);
1870                         if (error) {
1871                                 metaslab_group_sort(msp->ms_group, msp, 0);
1872                                 return (error);
1873                         }
1874                 }
1875
1876                 msp->ms_activation_weight = msp->ms_weight;
1877                 metaslab_group_sort(msp->ms_group, msp,
1878                     msp->ms_weight | activation_weight);
1879         }
1880         ASSERT(msp->ms_loaded);
1881         ASSERT(msp->ms_weight & METASLAB_ACTIVE_MASK);
1882
1883         return (0);
1884 }
1885
1886 static void
1887 metaslab_passivate(metaslab_t *msp, uint64_t weight)
1888 {
1889         ASSERTV(uint64_t size = weight & ~METASLAB_WEIGHT_TYPE);
1890
1891         /*
1892          * If size < SPA_MINBLOCKSIZE, then we will not allocate from
1893          * this metaslab again.  In that case, it had better be empty,
1894          * or we would be leaving space on the table.
1895          */
1896         ASSERT(!WEIGHT_IS_SPACEBASED(msp->ms_weight) ||
1897             size >= SPA_MINBLOCKSIZE ||
1898             range_tree_space(msp->ms_tree) == 0);
1899         ASSERT0(weight & METASLAB_ACTIVE_MASK);
1900
1901         msp->ms_activation_weight = 0;
1902         metaslab_group_sort(msp->ms_group, msp, weight);
1903         ASSERT((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0);
1904 }
1905
1906 /*
1907  * Segment-based metaslabs are activated once and remain active until
1908  * we either fail an allocation attempt (similar to space-based metaslabs)
1909  * or have exhausted the free space in zfs_metaslab_switch_threshold
1910  * buckets since the metaslab was activated. This function checks to see
1911  * if we've exhaused the zfs_metaslab_switch_threshold buckets in the
1912  * metaslab and passivates it proactively. This will allow us to select a
1913  * metaslab with a larger contiguous region, if any, remaining within this
1914  * metaslab group. If we're in sync pass > 1, then we continue using this
1915  * metaslab so that we don't dirty more block and cause more sync passes.
1916  */
1917 void
1918 metaslab_segment_may_passivate(metaslab_t *msp)
1919 {
1920         spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
1921
1922         if (WEIGHT_IS_SPACEBASED(msp->ms_weight) || spa_sync_pass(spa) > 1)
1923                 return;
1924
1925         /*
1926          * Since we are in the middle of a sync pass, the most accurate
1927          * information that is accessible to us is the in-core range tree
1928          * histogram; calculate the new weight based on that information.
1929          */
1930         uint64_t weight = metaslab_weight_from_range_tree(msp);
1931         int activation_idx = WEIGHT_GET_INDEX(msp->ms_activation_weight);
1932         int current_idx = WEIGHT_GET_INDEX(weight);
1933
1934         if (current_idx <= activation_idx - zfs_metaslab_switch_threshold)
1935                 metaslab_passivate(msp, weight);
1936 }
1937
1938 static void
1939 metaslab_preload(void *arg)
1940 {
1941         metaslab_t *msp = arg;
1942         spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
1943         fstrans_cookie_t cookie = spl_fstrans_mark();
1944
1945         ASSERT(!MUTEX_HELD(&msp->ms_group->mg_lock));
1946
1947         mutex_enter(&msp->ms_lock);
1948         metaslab_load_wait(msp);
1949         if (!msp->ms_loaded)
1950                 (void) metaslab_load(msp);
1951         msp->ms_selected_txg = spa_syncing_txg(spa);
1952         mutex_exit(&msp->ms_lock);
1953         spl_fstrans_unmark(cookie);
1954 }
1955
1956 static void
1957 metaslab_group_preload(metaslab_group_t *mg)
1958 {
1959         spa_t *spa = mg->mg_vd->vdev_spa;
1960         metaslab_t *msp;
1961         avl_tree_t *t = &mg->mg_metaslab_tree;
1962         int m = 0;
1963
1964         if (spa_shutting_down(spa) || !metaslab_preload_enabled) {
1965                 taskq_wait_outstanding(mg->mg_taskq, 0);
1966                 return;
1967         }
1968
1969         mutex_enter(&mg->mg_lock);
1970
1971         /*
1972          * Load the next potential metaslabs
1973          */
1974         for (msp = avl_first(t); msp != NULL; msp = AVL_NEXT(t, msp)) {
1975                 ASSERT3P(msp->ms_group, ==, mg);
1976
1977                 /*
1978                  * We preload only the maximum number of metaslabs specified
1979                  * by metaslab_preload_limit. If a metaslab is being forced
1980                  * to condense then we preload it too. This will ensure
1981                  * that force condensing happens in the next txg.
1982                  */
1983                 if (++m > metaslab_preload_limit && !msp->ms_condense_wanted) {
1984                         continue;
1985                 }
1986
1987                 VERIFY(taskq_dispatch(mg->mg_taskq, metaslab_preload,
1988                     msp, TQ_SLEEP) != TASKQID_INVALID);
1989         }
1990         mutex_exit(&mg->mg_lock);
1991 }
1992
1993 /*
1994  * Determine if the space map's on-disk footprint is past our tolerance
1995  * for inefficiency. We would like to use the following criteria to make
1996  * our decision:
1997  *
1998  * 1. The size of the space map object should not dramatically increase as a
1999  * result of writing out the free space range tree.
2000  *
2001  * 2. The minimal on-disk space map representation is zfs_condense_pct/100
2002  * times the size than the free space range tree representation
2003  * (i.e. zfs_condense_pct = 110 and in-core = 1MB, minimal = 1.1MB).
2004  *
2005  * 3. The on-disk size of the space map should actually decrease.
2006  *
2007  * Checking the first condition is tricky since we don't want to walk
2008  * the entire AVL tree calculating the estimated on-disk size. Instead we
2009  * use the size-ordered range tree in the metaslab and calculate the
2010  * size required to write out the largest segment in our free tree. If the
2011  * size required to represent that segment on disk is larger than the space
2012  * map object then we avoid condensing this map.
2013  *
2014  * To determine the second criterion we use a best-case estimate and assume
2015  * each segment can be represented on-disk as a single 64-bit entry. We refer
2016  * to this best-case estimate as the space map's minimal form.
2017  *
2018  * Unfortunately, we cannot compute the on-disk size of the space map in this
2019  * context because we cannot accurately compute the effects of compression, etc.
2020  * Instead, we apply the heuristic described in the block comment for
2021  * zfs_metaslab_condense_block_threshold - we only condense if the space used
2022  * is greater than a threshold number of blocks.
2023  */
2024 static boolean_t
2025 metaslab_should_condense(metaslab_t *msp)
2026 {
2027         space_map_t *sm = msp->ms_sm;
2028         range_seg_t *rs;
2029         uint64_t size, entries, segsz, object_size, optimal_size, record_size;
2030         dmu_object_info_t doi;
2031         uint64_t vdev_blocksize = 1ULL << msp->ms_group->mg_vd->vdev_ashift;
2032
2033         ASSERT(MUTEX_HELD(&msp->ms_lock));
2034         ASSERT(msp->ms_loaded);
2035
2036         /*
2037          * Use the ms_size_tree range tree, which is ordered by size, to
2038          * obtain the largest segment in the free tree. We always condense
2039          * metaslabs that are empty and metaslabs for which a condense
2040          * request has been made.
2041          */
2042         rs = avl_last(&msp->ms_size_tree);
2043         if (rs == NULL || msp->ms_condense_wanted)
2044                 return (B_TRUE);
2045
2046         /*
2047          * Calculate the number of 64-bit entries this segment would
2048          * require when written to disk. If this single segment would be
2049          * larger on-disk than the entire current on-disk structure, then
2050          * clearly condensing will increase the on-disk structure size.
2051          */
2052         size = (rs->rs_end - rs->rs_start) >> sm->sm_shift;
2053         entries = size / (MIN(size, SM_RUN_MAX));
2054         segsz = entries * sizeof (uint64_t);
2055
2056         optimal_size = sizeof (uint64_t) * avl_numnodes(&msp->ms_tree->rt_root);
2057         object_size = space_map_length(msp->ms_sm);
2058
2059         dmu_object_info_from_db(sm->sm_dbuf, &doi);
2060         record_size = MAX(doi.doi_data_block_size, vdev_blocksize);
2061
2062         return (segsz <= object_size &&
2063             object_size >= (optimal_size * zfs_condense_pct / 100) &&
2064             object_size > zfs_metaslab_condense_block_threshold * record_size);
2065 }
2066
2067 /*
2068  * Condense the on-disk space map representation to its minimized form.
2069  * The minimized form consists of a small number of allocations followed by
2070  * the entries of the free range tree.
2071  */
2072 static void
2073 metaslab_condense(metaslab_t *msp, uint64_t txg, dmu_tx_t *tx)
2074 {
2075         spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
2076         range_tree_t *condense_tree;
2077         space_map_t *sm = msp->ms_sm;
2078
2079         ASSERT(MUTEX_HELD(&msp->ms_lock));
2080         ASSERT3U(spa_sync_pass(spa), ==, 1);
2081         ASSERT(msp->ms_loaded);
2082
2083
2084         zfs_dbgmsg("condensing: txg %llu, msp[%llu] %p, vdev id %llu, "
2085             "spa %s, smp size %llu, segments %lu, forcing condense=%s", txg,
2086             msp->ms_id, msp, msp->ms_group->mg_vd->vdev_id,
2087             msp->ms_group->mg_vd->vdev_spa->spa_name,
2088             space_map_length(msp->ms_sm), avl_numnodes(&msp->ms_tree->rt_root),
2089             msp->ms_condense_wanted ? "TRUE" : "FALSE");
2090
2091         msp->ms_condense_wanted = B_FALSE;
2092
2093         /*
2094          * Create an range tree that is 100% allocated. We remove segments
2095          * that have been freed in this txg, any deferred frees that exist,
2096          * and any allocation in the future. Removing segments should be
2097          * a relatively inexpensive operation since we expect these trees to
2098          * have a small number of nodes.
2099          */
2100         condense_tree = range_tree_create(NULL, NULL);
2101         range_tree_add(condense_tree, msp->ms_start, msp->ms_size);
2102
2103         /*
2104          * Remove what's been freed in this txg from the condense_tree.
2105          * Since we're in sync_pass 1, we know that all the frees from
2106          * this txg are in the freeingtree.
2107          */
2108         range_tree_walk(msp->ms_freeingtree, range_tree_remove, condense_tree);
2109
2110         for (int t = 0; t < TXG_DEFER_SIZE; t++) {
2111                 range_tree_walk(msp->ms_defertree[t],
2112                     range_tree_remove, condense_tree);
2113         }
2114
2115         for (int t = 1; t < TXG_CONCURRENT_STATES; t++) {
2116                 range_tree_walk(msp->ms_alloctree[(txg + t) & TXG_MASK],
2117                     range_tree_remove, condense_tree);
2118         }
2119
2120         /*
2121          * We're about to drop the metaslab's lock thus allowing
2122          * other consumers to change it's content. Set the
2123          * metaslab's ms_condensing flag to ensure that
2124          * allocations on this metaslab do not occur while we're
2125          * in the middle of committing it to disk. This is only critical
2126          * for the ms_tree as all other range trees use per txg
2127          * views of their content.
2128          */
2129         msp->ms_condensing = B_TRUE;
2130
2131         mutex_exit(&msp->ms_lock);
2132         space_map_truncate(sm, tx);
2133
2134         /*
2135          * While we would ideally like to create a space map representation
2136          * that consists only of allocation records, doing so can be
2137          * prohibitively expensive because the in-core free tree can be
2138          * large, and therefore computationally expensive to subtract
2139          * from the condense_tree. Instead we sync out two trees, a cheap
2140          * allocation only tree followed by the in-core free tree. While not
2141          * optimal, this is typically close to optimal, and much cheaper to
2142          * compute.
2143          */
2144         space_map_write(sm, condense_tree, SM_ALLOC, tx);
2145         range_tree_vacate(condense_tree, NULL, NULL);
2146         range_tree_destroy(condense_tree);
2147
2148         space_map_write(sm, msp->ms_tree, SM_FREE, tx);
2149         mutex_enter(&msp->ms_lock);
2150         msp->ms_condensing = B_FALSE;
2151 }
2152
2153 /*
2154  * Write a metaslab to disk in the context of the specified transaction group.
2155  */
2156 void
2157 metaslab_sync(metaslab_t *msp, uint64_t txg)
2158 {
2159         metaslab_group_t *mg = msp->ms_group;
2160         vdev_t *vd = mg->mg_vd;
2161         spa_t *spa = vd->vdev_spa;
2162         objset_t *mos = spa_meta_objset(spa);
2163         range_tree_t *alloctree = msp->ms_alloctree[txg & TXG_MASK];
2164         dmu_tx_t *tx;
2165         uint64_t object = space_map_object(msp->ms_sm);
2166
2167         ASSERT(!vd->vdev_ishole);
2168
2169         /*
2170          * This metaslab has just been added so there's no work to do now.
2171          */
2172         if (msp->ms_freeingtree == NULL) {
2173                 ASSERT3P(alloctree, ==, NULL);
2174                 return;
2175         }
2176
2177         ASSERT3P(alloctree, !=, NULL);
2178         ASSERT3P(msp->ms_freeingtree, !=, NULL);
2179         ASSERT3P(msp->ms_freedtree, !=, NULL);
2180
2181         /*
2182          * Normally, we don't want to process a metaslab if there
2183          * are no allocations or frees to perform. However, if the metaslab
2184          * is being forced to condense and it's loaded, we need to let it
2185          * through.
2186          */
2187         if (range_tree_space(alloctree) == 0 &&
2188             range_tree_space(msp->ms_freeingtree) == 0 &&
2189             !(msp->ms_loaded && msp->ms_condense_wanted))
2190                 return;
2191
2192
2193         VERIFY(txg <= spa_final_dirty_txg(spa));
2194
2195         /*
2196          * The only state that can actually be changing concurrently with
2197          * metaslab_sync() is the metaslab's ms_tree.  No other thread can
2198          * be modifying this txg's alloctree, freeingtree, freedtree, or
2199          * space_map_phys_t.  We drop ms_lock whenever we could call
2200          * into the DMU, because the DMU can call down to us
2201          * (e.g. via zio_free()) at any time.
2202          *
2203          * The spa_vdev_remove_thread() can be reading metaslab state
2204          * concurrently, and it is locked out by the ms_sync_lock.  Note
2205          * that the ms_lock is insufficient for this, because it is dropped
2206          * by space_map_write().
2207          */
2208
2209         tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg);
2210
2211         if (msp->ms_sm == NULL) {
2212                 uint64_t new_object;
2213
2214                 new_object = space_map_alloc(mos, tx);
2215                 VERIFY3U(new_object, !=, 0);
2216
2217                 VERIFY0(space_map_open(&msp->ms_sm, mos, new_object,
2218                     msp->ms_start, msp->ms_size, vd->vdev_ashift));
2219                 ASSERT(msp->ms_sm != NULL);
2220         }
2221
2222         mutex_enter(&msp->ms_sync_lock);
2223         mutex_enter(&msp->ms_lock);
2224
2225         /*
2226          * Note: metaslab_condense() clears the space map's histogram.
2227          * Therefore we must verify and remove this histogram before
2228          * condensing.
2229          */
2230         metaslab_group_histogram_verify(mg);
2231         metaslab_class_histogram_verify(mg->mg_class);
2232         metaslab_group_histogram_remove(mg, msp);
2233
2234         if (msp->ms_loaded && spa_sync_pass(spa) == 1 &&
2235             metaslab_should_condense(msp)) {
2236                 metaslab_condense(msp, txg, tx);
2237         } else {
2238                 mutex_exit(&msp->ms_lock);
2239                 space_map_write(msp->ms_sm, alloctree, SM_ALLOC, tx);
2240                 space_map_write(msp->ms_sm, msp->ms_freeingtree, SM_FREE, tx);
2241                 mutex_enter(&msp->ms_lock);
2242         }
2243
2244         if (msp->ms_loaded) {
2245                 /*
2246                  * When the space map is loaded, we have an accurate
2247                  * histogram in the range tree. This gives us an opportunity
2248                  * to bring the space map's histogram up-to-date so we clear
2249                  * it first before updating it.
2250                  */
2251                 space_map_histogram_clear(msp->ms_sm);
2252                 space_map_histogram_add(msp->ms_sm, msp->ms_tree, tx);
2253
2254                 /*
2255                  * Since we've cleared the histogram we need to add back
2256                  * any free space that has already been processed, plus
2257                  * any deferred space. This allows the on-disk histogram
2258                  * to accurately reflect all free space even if some space
2259                  * is not yet available for allocation (i.e. deferred).
2260                  */
2261                 space_map_histogram_add(msp->ms_sm, msp->ms_freedtree, tx);
2262
2263                 /*
2264                  * Add back any deferred free space that has not been
2265                  * added back into the in-core free tree yet. This will
2266                  * ensure that we don't end up with a space map histogram
2267                  * that is completely empty unless the metaslab is fully
2268                  * allocated.
2269                  */
2270                 for (int t = 0; t < TXG_DEFER_SIZE; t++) {
2271                         space_map_histogram_add(msp->ms_sm,
2272                             msp->ms_defertree[t], tx);
2273                 }
2274         }
2275
2276         /*
2277          * Always add the free space from this sync pass to the space
2278          * map histogram. We want to make sure that the on-disk histogram
2279          * accounts for all free space. If the space map is not loaded,
2280          * then we will lose some accuracy but will correct it the next
2281          * time we load the space map.
2282          */
2283         space_map_histogram_add(msp->ms_sm, msp->ms_freeingtree, tx);
2284
2285         metaslab_group_histogram_add(mg, msp);
2286         metaslab_group_histogram_verify(mg);
2287         metaslab_class_histogram_verify(mg->mg_class);
2288
2289         /*
2290          * For sync pass 1, we avoid traversing this txg's free range tree
2291          * and instead will just swap the pointers for freeingtree and
2292          * freedtree. We can safely do this since the freed_tree is
2293          * guaranteed to be empty on the initial pass.
2294          */
2295         if (spa_sync_pass(spa) == 1) {
2296                 range_tree_swap(&msp->ms_freeingtree, &msp->ms_freedtree);
2297         } else {
2298                 range_tree_vacate(msp->ms_freeingtree,
2299                     range_tree_add, msp->ms_freedtree);
2300         }
2301         range_tree_vacate(alloctree, NULL, NULL);
2302
2303         ASSERT0(range_tree_space(msp->ms_alloctree[txg & TXG_MASK]));
2304         ASSERT0(range_tree_space(msp->ms_alloctree[TXG_CLEAN(txg) & TXG_MASK]));
2305         ASSERT0(range_tree_space(msp->ms_freeingtree));
2306
2307         mutex_exit(&msp->ms_lock);
2308
2309         if (object != space_map_object(msp->ms_sm)) {
2310                 object = space_map_object(msp->ms_sm);
2311                 dmu_write(mos, vd->vdev_ms_array, sizeof (uint64_t) *
2312                     msp->ms_id, sizeof (uint64_t), &object, tx);
2313         }
2314         mutex_exit(&msp->ms_sync_lock);
2315         dmu_tx_commit(tx);
2316 }
2317
2318 /*
2319  * Called after a transaction group has completely synced to mark
2320  * all of the metaslab's free space as usable.
2321  */
2322 void
2323 metaslab_sync_done(metaslab_t *msp, uint64_t txg)
2324 {
2325         metaslab_group_t *mg = msp->ms_group;
2326         vdev_t *vd = mg->mg_vd;
2327         spa_t *spa = vd->vdev_spa;
2328         range_tree_t **defer_tree;
2329         int64_t alloc_delta, defer_delta;
2330         boolean_t defer_allowed = B_TRUE;
2331
2332         ASSERT(!vd->vdev_ishole);
2333
2334         mutex_enter(&msp->ms_lock);
2335
2336         /*
2337          * If this metaslab is just becoming available, initialize its
2338          * range trees and add its capacity to the vdev.
2339          */
2340         if (msp->ms_freedtree == NULL) {
2341                 for (int t = 0; t < TXG_SIZE; t++) {
2342                         ASSERT(msp->ms_alloctree[t] == NULL);
2343
2344                         msp->ms_alloctree[t] = range_tree_create(NULL, NULL);
2345                 }
2346
2347                 ASSERT3P(msp->ms_freeingtree, ==, NULL);
2348                 msp->ms_freeingtree = range_tree_create(NULL, NULL);
2349
2350                 ASSERT3P(msp->ms_freedtree, ==, NULL);
2351                 msp->ms_freedtree = range_tree_create(NULL, NULL);
2352
2353                 for (int t = 0; t < TXG_DEFER_SIZE; t++) {
2354                         ASSERT(msp->ms_defertree[t] == NULL);
2355
2356                         msp->ms_defertree[t] = range_tree_create(NULL, NULL);
2357                 }
2358
2359                 vdev_space_update(vd, 0, 0, msp->ms_size);
2360         }
2361
2362         defer_tree = &msp->ms_defertree[txg % TXG_DEFER_SIZE];
2363
2364         uint64_t free_space = metaslab_class_get_space(spa_normal_class(spa)) -
2365             metaslab_class_get_alloc(spa_normal_class(spa));
2366         if (free_space <= spa_get_slop_space(spa) || vd->vdev_removing) {
2367                 defer_allowed = B_FALSE;
2368         }
2369
2370         defer_delta = 0;
2371         alloc_delta = space_map_alloc_delta(msp->ms_sm);
2372         if (defer_allowed) {
2373                 defer_delta = range_tree_space(msp->ms_freedtree) -
2374                     range_tree_space(*defer_tree);
2375         } else {
2376                 defer_delta -= range_tree_space(*defer_tree);
2377         }
2378
2379         vdev_space_update(vd, alloc_delta + defer_delta, defer_delta, 0);
2380
2381         /*
2382          * If there's a metaslab_load() in progress, wait for it to complete
2383          * so that we have a consistent view of the in-core space map.
2384          */
2385         metaslab_load_wait(msp);
2386
2387         /*
2388          * Move the frees from the defer_tree back to the free
2389          * range tree (if it's loaded). Swap the freed_tree and the
2390          * defer_tree -- this is safe to do because we've just emptied out
2391          * the defer_tree.
2392          */
2393         range_tree_vacate(*defer_tree,
2394             msp->ms_loaded ? range_tree_add : NULL, msp->ms_tree);
2395         if (defer_allowed) {
2396                 range_tree_swap(&msp->ms_freedtree, defer_tree);
2397         } else {
2398                 range_tree_vacate(msp->ms_freedtree,
2399                     msp->ms_loaded ? range_tree_add : NULL, msp->ms_tree);
2400         }
2401
2402         space_map_update(msp->ms_sm);
2403
2404         msp->ms_deferspace += defer_delta;
2405         ASSERT3S(msp->ms_deferspace, >=, 0);
2406         ASSERT3S(msp->ms_deferspace, <=, msp->ms_size);
2407         if (msp->ms_deferspace != 0) {
2408                 /*
2409                  * Keep syncing this metaslab until all deferred frees
2410                  * are back in circulation.
2411                  */
2412                 vdev_dirty(vd, VDD_METASLAB, msp, txg + 1);
2413         }
2414
2415         /*
2416          * Calculate the new weights before unloading any metaslabs.
2417          * This will give us the most accurate weighting.
2418          */
2419         metaslab_group_sort(mg, msp, metaslab_weight(msp));
2420
2421         /*
2422          * If the metaslab is loaded and we've not tried to load or allocate
2423          * from it in 'metaslab_unload_delay' txgs, then unload it.
2424          */
2425         if (msp->ms_loaded &&
2426             msp->ms_selected_txg + metaslab_unload_delay < txg) {
2427
2428                 for (int t = 1; t < TXG_CONCURRENT_STATES; t++) {
2429                         VERIFY0(range_tree_space(
2430                             msp->ms_alloctree[(txg + t) & TXG_MASK]));
2431                 }
2432
2433                 if (!metaslab_debug_unload)
2434                         metaslab_unload(msp);
2435         }
2436
2437         ASSERT0(range_tree_space(msp->ms_alloctree[txg & TXG_MASK]));
2438         ASSERT0(range_tree_space(msp->ms_freeingtree));
2439         ASSERT0(range_tree_space(msp->ms_freedtree));
2440
2441         mutex_exit(&msp->ms_lock);
2442 }
2443
2444 void
2445 metaslab_sync_reassess(metaslab_group_t *mg)
2446 {
2447         spa_t *spa = mg->mg_class->mc_spa;
2448
2449         spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER);
2450         metaslab_group_alloc_update(mg);
2451         mg->mg_fragmentation = metaslab_group_fragmentation(mg);
2452
2453         /*
2454          * Preload the next potential metaslabs but only on active
2455          * metaslab groups. We can get into a state where the metaslab
2456          * is no longer active since we dirty metaslabs as we remove a
2457          * a device, thus potentially making the metaslab group eligible
2458          * for preloading.
2459          */
2460         if (mg->mg_activation_count > 0) {
2461                 metaslab_group_preload(mg);
2462         }
2463         spa_config_exit(spa, SCL_ALLOC, FTAG);
2464 }
2465
2466 static uint64_t
2467 metaslab_distance(metaslab_t *msp, dva_t *dva)
2468 {
2469         uint64_t ms_shift = msp->ms_group->mg_vd->vdev_ms_shift;
2470         uint64_t offset = DVA_GET_OFFSET(dva) >> ms_shift;
2471         uint64_t start = msp->ms_id;
2472
2473         if (msp->ms_group->mg_vd->vdev_id != DVA_GET_VDEV(dva))
2474                 return (1ULL << 63);
2475
2476         if (offset < start)
2477                 return ((start - offset) << ms_shift);
2478         if (offset > start)
2479                 return ((offset - start) << ms_shift);
2480         return (0);
2481 }
2482
2483 /*
2484  * ==========================================================================
2485  * Metaslab allocation tracing facility
2486  * ==========================================================================
2487  */
2488 #ifdef _METASLAB_TRACING
2489 kstat_t *metaslab_trace_ksp;
2490 kstat_named_t metaslab_trace_over_limit;
2491
2492 void
2493 metaslab_alloc_trace_init(void)
2494 {
2495         ASSERT(metaslab_alloc_trace_cache == NULL);
2496         metaslab_alloc_trace_cache = kmem_cache_create(
2497             "metaslab_alloc_trace_cache", sizeof (metaslab_alloc_trace_t),
2498             0, NULL, NULL, NULL, NULL, NULL, 0);
2499         metaslab_trace_ksp = kstat_create("zfs", 0, "metaslab_trace_stats",
2500             "misc", KSTAT_TYPE_NAMED, 1, KSTAT_FLAG_VIRTUAL);
2501         if (metaslab_trace_ksp != NULL) {
2502                 metaslab_trace_ksp->ks_data = &metaslab_trace_over_limit;
2503                 kstat_named_init(&metaslab_trace_over_limit,
2504                     "metaslab_trace_over_limit", KSTAT_DATA_UINT64);
2505                 kstat_install(metaslab_trace_ksp);
2506         }
2507 }
2508
2509 void
2510 metaslab_alloc_trace_fini(void)
2511 {
2512         if (metaslab_trace_ksp != NULL) {
2513                 kstat_delete(metaslab_trace_ksp);
2514                 metaslab_trace_ksp = NULL;
2515         }
2516         kmem_cache_destroy(metaslab_alloc_trace_cache);
2517         metaslab_alloc_trace_cache = NULL;
2518 }
2519
2520 /*
2521  * Add an allocation trace element to the allocation tracing list.
2522  */
2523 static void
2524 metaslab_trace_add(zio_alloc_list_t *zal, metaslab_group_t *mg,
2525     metaslab_t *msp, uint64_t psize, uint32_t dva_id, uint64_t offset)
2526 {
2527         metaslab_alloc_trace_t *mat;
2528
2529         if (!metaslab_trace_enabled)
2530                 return;
2531
2532         /*
2533          * When the tracing list reaches its maximum we remove
2534          * the second element in the list before adding a new one.
2535          * By removing the second element we preserve the original
2536          * entry as a clue to what allocations steps have already been
2537          * performed.
2538          */
2539         if (zal->zal_size == metaslab_trace_max_entries) {
2540                 metaslab_alloc_trace_t *mat_next;
2541 #ifdef DEBUG
2542                 panic("too many entries in allocation list");
2543 #endif
2544                 atomic_inc_64(&metaslab_trace_over_limit.value.ui64);
2545                 zal->zal_size--;
2546                 mat_next = list_next(&zal->zal_list, list_head(&zal->zal_list));
2547                 list_remove(&zal->zal_list, mat_next);
2548                 kmem_cache_free(metaslab_alloc_trace_cache, mat_next);
2549         }
2550
2551         mat = kmem_cache_alloc(metaslab_alloc_trace_cache, KM_SLEEP);
2552         list_link_init(&mat->mat_list_node);
2553         mat->mat_mg = mg;
2554         mat->mat_msp = msp;
2555         mat->mat_size = psize;
2556         mat->mat_dva_id = dva_id;
2557         mat->mat_offset = offset;
2558         mat->mat_weight = 0;
2559
2560         if (msp != NULL)
2561                 mat->mat_weight = msp->ms_weight;
2562
2563         /*
2564          * The list is part of the zio so locking is not required. Only
2565          * a single thread will perform allocations for a given zio.
2566          */
2567         list_insert_tail(&zal->zal_list, mat);
2568         zal->zal_size++;
2569
2570         ASSERT3U(zal->zal_size, <=, metaslab_trace_max_entries);
2571 }
2572
2573 void
2574 metaslab_trace_init(zio_alloc_list_t *zal)
2575 {
2576         list_create(&zal->zal_list, sizeof (metaslab_alloc_trace_t),
2577             offsetof(metaslab_alloc_trace_t, mat_list_node));
2578         zal->zal_size = 0;
2579 }
2580
2581 void
2582 metaslab_trace_fini(zio_alloc_list_t *zal)
2583 {
2584         metaslab_alloc_trace_t *mat;
2585
2586         while ((mat = list_remove_head(&zal->zal_list)) != NULL)
2587                 kmem_cache_free(metaslab_alloc_trace_cache, mat);
2588         list_destroy(&zal->zal_list);
2589         zal->zal_size = 0;
2590 }
2591 #else
2592
2593 #define metaslab_trace_add(zal, mg, msp, psize, id, off)
2594
2595 void
2596 metaslab_alloc_trace_init(void)
2597 {
2598 }
2599
2600 void
2601 metaslab_alloc_trace_fini(void)
2602 {
2603 }
2604
2605 void
2606 metaslab_trace_init(zio_alloc_list_t *zal)
2607 {
2608 }
2609
2610 void
2611 metaslab_trace_fini(zio_alloc_list_t *zal)
2612 {
2613 }
2614
2615 #endif /* _METASLAB_TRACING */
2616
2617 /*
2618  * ==========================================================================
2619  * Metaslab block operations
2620  * ==========================================================================
2621  */
2622
2623 static void
2624 metaslab_group_alloc_increment(spa_t *spa, uint64_t vdev, void *tag, int flags)
2625 {
2626         if (!(flags & METASLAB_ASYNC_ALLOC) ||
2627             flags & METASLAB_DONT_THROTTLE)
2628                 return;
2629
2630         metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg;
2631         if (!mg->mg_class->mc_alloc_throttle_enabled)
2632                 return;
2633
2634         (void) refcount_add(&mg->mg_alloc_queue_depth, tag);
2635 }
2636
2637 void
2638 metaslab_group_alloc_decrement(spa_t *spa, uint64_t vdev, void *tag, int flags)
2639 {
2640         if (!(flags & METASLAB_ASYNC_ALLOC) ||
2641             flags & METASLAB_DONT_THROTTLE)
2642                 return;
2643
2644         metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg;
2645         if (!mg->mg_class->mc_alloc_throttle_enabled)
2646                 return;
2647
2648         (void) refcount_remove(&mg->mg_alloc_queue_depth, tag);
2649 }
2650
2651 void
2652 metaslab_group_alloc_verify(spa_t *spa, const blkptr_t *bp, void *tag)
2653 {
2654 #ifdef ZFS_DEBUG
2655         const dva_t *dva = bp->blk_dva;
2656         int ndvas = BP_GET_NDVAS(bp);
2657
2658         for (int d = 0; d < ndvas; d++) {
2659                 uint64_t vdev = DVA_GET_VDEV(&dva[d]);
2660                 metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg;
2661                 VERIFY(refcount_not_held(&mg->mg_alloc_queue_depth, tag));
2662         }
2663 #endif
2664 }
2665
2666 static uint64_t
2667 metaslab_block_alloc(metaslab_t *msp, uint64_t size, uint64_t txg)
2668 {
2669         uint64_t start;
2670         range_tree_t *rt = msp->ms_tree;
2671         metaslab_class_t *mc = msp->ms_group->mg_class;
2672
2673         VERIFY(!msp->ms_condensing);
2674
2675         start = mc->mc_ops->msop_alloc(msp, size);
2676         if (start != -1ULL) {
2677                 metaslab_group_t *mg = msp->ms_group;
2678                 vdev_t *vd = mg->mg_vd;
2679
2680                 VERIFY0(P2PHASE(start, 1ULL << vd->vdev_ashift));
2681                 VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift));
2682                 VERIFY3U(range_tree_space(rt) - size, <=, msp->ms_size);
2683                 range_tree_remove(rt, start, size);
2684
2685                 if (range_tree_space(msp->ms_alloctree[txg & TXG_MASK]) == 0)
2686                         vdev_dirty(mg->mg_vd, VDD_METASLAB, msp, txg);
2687
2688                 range_tree_add(msp->ms_alloctree[txg & TXG_MASK], start, size);
2689
2690                 /* Track the last successful allocation */
2691                 msp->ms_alloc_txg = txg;
2692                 metaslab_verify_space(msp, txg);
2693         }
2694
2695         /*
2696          * Now that we've attempted the allocation we need to update the
2697          * metaslab's maximum block size since it may have changed.
2698          */
2699         msp->ms_max_size = metaslab_block_maxsize(msp);
2700         return (start);
2701 }
2702
2703 static uint64_t
2704 metaslab_group_alloc_normal(metaslab_group_t *mg, zio_alloc_list_t *zal,
2705     uint64_t asize, uint64_t txg, uint64_t min_distance, dva_t *dva, int d)
2706 {
2707         metaslab_t *msp = NULL;
2708         uint64_t offset = -1ULL;
2709         uint64_t activation_weight;
2710         uint64_t target_distance;
2711         int i;
2712
2713         activation_weight = METASLAB_WEIGHT_PRIMARY;
2714         for (i = 0; i < d; i++) {
2715                 if (DVA_GET_VDEV(&dva[i]) == mg->mg_vd->vdev_id) {
2716                         activation_weight = METASLAB_WEIGHT_SECONDARY;
2717                         break;
2718                 }
2719         }
2720
2721         metaslab_t *search = kmem_alloc(sizeof (*search), KM_SLEEP);
2722         search->ms_weight = UINT64_MAX;
2723         search->ms_start = 0;
2724         for (;;) {
2725                 boolean_t was_active;
2726                 avl_tree_t *t = &mg->mg_metaslab_tree;
2727                 avl_index_t idx;
2728
2729                 mutex_enter(&mg->mg_lock);
2730
2731                 /*
2732                  * Find the metaslab with the highest weight that is less
2733                  * than what we've already tried.  In the common case, this
2734                  * means that we will examine each metaslab at most once.
2735                  * Note that concurrent callers could reorder metaslabs
2736                  * by activation/passivation once we have dropped the mg_lock.
2737                  * If a metaslab is activated by another thread, and we fail
2738                  * to allocate from the metaslab we have selected, we may
2739                  * not try the newly-activated metaslab, and instead activate
2740                  * another metaslab.  This is not optimal, but generally
2741                  * does not cause any problems (a possible exception being
2742                  * if every metaslab is completely full except for the
2743                  * the newly-activated metaslab which we fail to examine).
2744                  */
2745                 msp = avl_find(t, search, &idx);
2746                 if (msp == NULL)
2747                         msp = avl_nearest(t, idx, AVL_AFTER);
2748                 for (; msp != NULL; msp = AVL_NEXT(t, msp)) {
2749
2750                         if (!metaslab_should_allocate(msp, asize)) {
2751                                 metaslab_trace_add(zal, mg, msp, asize, d,
2752                                     TRACE_TOO_SMALL);
2753                                 continue;
2754                         }
2755
2756                         /*
2757                          * If the selected metaslab is condensing, skip it.
2758                          */
2759                         if (msp->ms_condensing)
2760                                 continue;
2761
2762                         was_active = msp->ms_weight & METASLAB_ACTIVE_MASK;
2763                         if (activation_weight == METASLAB_WEIGHT_PRIMARY)
2764                                 break;
2765
2766                         target_distance = min_distance +
2767                             (space_map_allocated(msp->ms_sm) != 0 ? 0 :
2768                             min_distance >> 1);
2769
2770                         for (i = 0; i < d; i++) {
2771                                 if (metaslab_distance(msp, &dva[i]) <
2772                                     target_distance)
2773                                         break;
2774                         }
2775                         if (i == d)
2776                                 break;
2777                 }
2778                 mutex_exit(&mg->mg_lock);
2779                 if (msp == NULL) {
2780                         kmem_free(search, sizeof (*search));
2781                         return (-1ULL);
2782                 }
2783                 search->ms_weight = msp->ms_weight;
2784                 search->ms_start = msp->ms_start + 1;
2785
2786                 mutex_enter(&msp->ms_lock);
2787
2788                 /*
2789                  * Ensure that the metaslab we have selected is still
2790                  * capable of handling our request. It's possible that
2791                  * another thread may have changed the weight while we
2792                  * were blocked on the metaslab lock. We check the
2793                  * active status first to see if we need to reselect
2794                  * a new metaslab.
2795                  */
2796                 if (was_active && !(msp->ms_weight & METASLAB_ACTIVE_MASK)) {
2797                         mutex_exit(&msp->ms_lock);
2798                         continue;
2799                 }
2800
2801                 if ((msp->ms_weight & METASLAB_WEIGHT_SECONDARY) &&
2802                     activation_weight == METASLAB_WEIGHT_PRIMARY) {
2803                         metaslab_passivate(msp,
2804                             msp->ms_weight & ~METASLAB_ACTIVE_MASK);
2805                         mutex_exit(&msp->ms_lock);
2806                         continue;
2807                 }
2808
2809                 if (metaslab_activate(msp, activation_weight) != 0) {
2810                         mutex_exit(&msp->ms_lock);
2811                         continue;
2812                 }
2813                 msp->ms_selected_txg = txg;
2814
2815                 /*
2816                  * Now that we have the lock, recheck to see if we should
2817                  * continue to use this metaslab for this allocation. The
2818                  * the metaslab is now loaded so metaslab_should_allocate() can
2819                  * accurately determine if the allocation attempt should
2820                  * proceed.
2821                  */
2822                 if (!metaslab_should_allocate(msp, asize)) {
2823                         /* Passivate this metaslab and select a new one. */
2824                         metaslab_trace_add(zal, mg, msp, asize, d,
2825                             TRACE_TOO_SMALL);
2826                         goto next;
2827                 }
2828
2829
2830                 /*
2831                  * If this metaslab is currently condensing then pick again as
2832                  * we can't manipulate this metaslab until it's committed
2833                  * to disk.
2834                  */
2835                 if (msp->ms_condensing) {
2836                         metaslab_trace_add(zal, mg, msp, asize, d,
2837                             TRACE_CONDENSING);
2838                         mutex_exit(&msp->ms_lock);
2839                         continue;
2840                 }
2841
2842                 offset = metaslab_block_alloc(msp, asize, txg);
2843                 metaslab_trace_add(zal, mg, msp, asize, d, offset);
2844
2845                 if (offset != -1ULL) {
2846                         /* Proactively passivate the metaslab, if needed */
2847                         metaslab_segment_may_passivate(msp);
2848                         break;
2849                 }
2850 next:
2851                 ASSERT(msp->ms_loaded);
2852
2853                 /*
2854                  * We were unable to allocate from this metaslab so determine
2855                  * a new weight for this metaslab. Now that we have loaded
2856                  * the metaslab we can provide a better hint to the metaslab
2857                  * selector.
2858                  *
2859                  * For space-based metaslabs, we use the maximum block size.
2860                  * This information is only available when the metaslab
2861                  * is loaded and is more accurate than the generic free
2862                  * space weight that was calculated by metaslab_weight().
2863                  * This information allows us to quickly compare the maximum
2864                  * available allocation in the metaslab to the allocation
2865                  * size being requested.
2866                  *
2867                  * For segment-based metaslabs, determine the new weight
2868                  * based on the highest bucket in the range tree. We
2869                  * explicitly use the loaded segment weight (i.e. the range
2870                  * tree histogram) since it contains the space that is
2871                  * currently available for allocation and is accurate
2872                  * even within a sync pass.
2873                  */
2874                 if (WEIGHT_IS_SPACEBASED(msp->ms_weight)) {
2875                         uint64_t weight = metaslab_block_maxsize(msp);
2876                         WEIGHT_SET_SPACEBASED(weight);
2877                         metaslab_passivate(msp, weight);
2878                 } else {
2879                         metaslab_passivate(msp,
2880                             metaslab_weight_from_range_tree(msp));
2881                 }
2882
2883                 /*
2884                  * We have just failed an allocation attempt, check
2885                  * that metaslab_should_allocate() agrees. Otherwise,
2886                  * we may end up in an infinite loop retrying the same
2887                  * metaslab.
2888                  */
2889                 ASSERT(!metaslab_should_allocate(msp, asize));
2890                 mutex_exit(&msp->ms_lock);
2891         }
2892         mutex_exit(&msp->ms_lock);
2893         kmem_free(search, sizeof (*search));
2894         return (offset);
2895 }
2896
2897 static uint64_t
2898 metaslab_group_alloc(metaslab_group_t *mg, zio_alloc_list_t *zal,
2899     uint64_t asize, uint64_t txg, uint64_t min_distance, dva_t *dva, int d)
2900 {
2901         uint64_t offset;
2902         ASSERT(mg->mg_initialized);
2903
2904         offset = metaslab_group_alloc_normal(mg, zal, asize, txg,
2905             min_distance, dva, d);
2906
2907         mutex_enter(&mg->mg_lock);
2908         if (offset == -1ULL) {
2909                 mg->mg_failed_allocations++;
2910                 metaslab_trace_add(zal, mg, NULL, asize, d,
2911                     TRACE_GROUP_FAILURE);
2912                 if (asize == SPA_GANGBLOCKSIZE) {
2913                         /*
2914                          * This metaslab group was unable to allocate
2915                          * the minimum gang block size so it must be out of
2916                          * space. We must notify the allocation throttle
2917                          * to start skipping allocation attempts to this
2918                          * metaslab group until more space becomes available.
2919                          * Note: this failure cannot be caused by the
2920                          * allocation throttle since the allocation throttle
2921                          * is only responsible for skipping devices and
2922                          * not failing block allocations.
2923                          */
2924                         mg->mg_no_free_space = B_TRUE;
2925                 }
2926         }
2927         mg->mg_allocations++;
2928         mutex_exit(&mg->mg_lock);
2929         return (offset);
2930 }
2931
2932 /*
2933  * If we have to write a ditto block (i.e. more than one DVA for a given BP)
2934  * on the same vdev as an existing DVA of this BP, then try to allocate it
2935  * at least (vdev_asize / (2 ^ ditto_same_vdev_distance_shift)) away from the
2936  * existing DVAs.
2937  */
2938 int ditto_same_vdev_distance_shift = 3;
2939
2940 /*
2941  * Allocate a block for the specified i/o.
2942  */
2943 int
2944 metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize,
2945     dva_t *dva, int d, dva_t *hintdva, uint64_t txg, int flags,
2946     zio_alloc_list_t *zal)
2947 {
2948         metaslab_group_t *mg, *fast_mg, *rotor;
2949         vdev_t *vd;
2950         boolean_t try_hard = B_FALSE;
2951
2952         ASSERT(!DVA_IS_VALID(&dva[d]));
2953
2954         /*
2955          * For testing, make some blocks above a certain size be gang blocks.
2956          */
2957         if (psize >= metaslab_force_ganging && (ddi_get_lbolt() & 3) == 0) {
2958                 metaslab_trace_add(zal, NULL, NULL, psize, d, TRACE_FORCE_GANG);
2959                 return (SET_ERROR(ENOSPC));
2960         }
2961
2962         /*
2963          * Start at the rotor and loop through all mgs until we find something.
2964          * Note that there's no locking on mc_rotor or mc_aliquot because
2965          * nothing actually breaks if we miss a few updates -- we just won't
2966          * allocate quite as evenly.  It all balances out over time.
2967          *
2968          * If we are doing ditto or log blocks, try to spread them across
2969          * consecutive vdevs.  If we're forced to reuse a vdev before we've
2970          * allocated all of our ditto blocks, then try and spread them out on
2971          * that vdev as much as possible.  If it turns out to not be possible,
2972          * gradually lower our standards until anything becomes acceptable.
2973          * Also, allocating on consecutive vdevs (as opposed to random vdevs)
2974          * gives us hope of containing our fault domains to something we're
2975          * able to reason about.  Otherwise, any two top-level vdev failures
2976          * will guarantee the loss of data.  With consecutive allocation,
2977          * only two adjacent top-level vdev failures will result in data loss.
2978          *
2979          * If we are doing gang blocks (hintdva is non-NULL), try to keep
2980          * ourselves on the same vdev as our gang block header.  That
2981          * way, we can hope for locality in vdev_cache, plus it makes our
2982          * fault domains something tractable.
2983          */
2984         if (hintdva) {
2985                 vd = vdev_lookup_top(spa, DVA_GET_VDEV(&hintdva[d]));
2986
2987                 /*
2988                  * It's possible the vdev we're using as the hint no
2989                  * longer exists or its mg has been closed (e.g. by
2990                  * device removal).  Consult the rotor when
2991                  * all else fails.
2992                  */
2993                 if (vd != NULL && vd->vdev_mg != NULL) {
2994                         mg = vd->vdev_mg;
2995
2996                         if (flags & METASLAB_HINTBP_AVOID &&
2997                             mg->mg_next != NULL)
2998                                 mg = mg->mg_next;
2999                 } else {
3000                         mg = mc->mc_rotor;
3001                 }
3002         } else if (d != 0) {
3003                 vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d - 1]));
3004                 mg = vd->vdev_mg->mg_next;
3005         } else if (flags & METASLAB_FASTWRITE) {
3006                 mg = fast_mg = mc->mc_rotor;
3007
3008                 do {
3009                         if (fast_mg->mg_vd->vdev_pending_fastwrite <
3010                             mg->mg_vd->vdev_pending_fastwrite)
3011                                 mg = fast_mg;
3012                 } while ((fast_mg = fast_mg->mg_next) != mc->mc_rotor);
3013
3014         } else {
3015                 mg = mc->mc_rotor;
3016         }
3017
3018         /*
3019          * If the hint put us into the wrong metaslab class, or into a
3020          * metaslab group that has been passivated, just follow the rotor.
3021          */
3022         if (mg->mg_class != mc || mg->mg_activation_count <= 0)
3023                 mg = mc->mc_rotor;
3024
3025         rotor = mg;
3026 top:
3027         do {
3028                 boolean_t allocatable;
3029
3030                 ASSERT(mg->mg_activation_count == 1);
3031                 vd = mg->mg_vd;
3032
3033                 /*
3034                  * Don't allocate from faulted devices.
3035                  */
3036                 if (try_hard) {
3037                         spa_config_enter(spa, SCL_ZIO, FTAG, RW_READER);
3038                         allocatable = vdev_allocatable(vd);
3039                         spa_config_exit(spa, SCL_ZIO, FTAG);
3040                 } else {
3041                         allocatable = vdev_allocatable(vd);
3042                 }
3043
3044                 /*
3045                  * Determine if the selected metaslab group is eligible
3046                  * for allocations. If we're ganging then don't allow
3047                  * this metaslab group to skip allocations since that would
3048                  * inadvertently return ENOSPC and suspend the pool
3049                  * even though space is still available.
3050                  */
3051                 if (allocatable && !GANG_ALLOCATION(flags) && !try_hard) {
3052                         allocatable = metaslab_group_allocatable(mg, rotor,
3053                             psize);
3054                 }
3055
3056                 if (!allocatable) {
3057                         metaslab_trace_add(zal, mg, NULL, psize, d,
3058                             TRACE_NOT_ALLOCATABLE);
3059                         goto next;
3060                 }
3061
3062                 ASSERT(mg->mg_initialized);
3063
3064                 /*
3065                  * Avoid writing single-copy data to a failing,
3066                  * non-redundant vdev, unless we've already tried all
3067                  * other vdevs.
3068                  */
3069                 if ((vd->vdev_stat.vs_write_errors > 0 ||
3070                     vd->vdev_state < VDEV_STATE_HEALTHY) &&
3071                     d == 0 && !try_hard && vd->vdev_children == 0) {
3072                         metaslab_trace_add(zal, mg, NULL, psize, d,
3073                             TRACE_VDEV_ERROR);
3074                         goto next;
3075                 }
3076
3077                 ASSERT(mg->mg_class == mc);
3078
3079                 /*
3080                  * If we don't need to try hard, then require that the
3081                  * block be 1/8th of the device away from any other DVAs
3082                  * in this BP.  If we are trying hard, allow any offset
3083                  * to be used (distance=0).
3084                  */
3085                 uint64_t distance = 0;
3086                 if (!try_hard) {
3087                         distance = vd->vdev_asize >>
3088                             ditto_same_vdev_distance_shift;
3089                         if (distance <= (1ULL << vd->vdev_ms_shift))
3090                                 distance = 0;
3091                 }
3092
3093                 uint64_t asize = vdev_psize_to_asize(vd, psize);
3094                 ASSERT(P2PHASE(asize, 1ULL << vd->vdev_ashift) == 0);
3095
3096                 uint64_t offset = metaslab_group_alloc(mg, zal, asize, txg,
3097                     distance, dva, d);
3098
3099                 if (offset != -1ULL) {
3100                         /*
3101                          * If we've just selected this metaslab group,
3102                          * figure out whether the corresponding vdev is
3103                          * over- or under-used relative to the pool,
3104                          * and set an allocation bias to even it out.
3105                          *
3106                          * Bias is also used to compensate for unequally
3107                          * sized vdevs so that space is allocated fairly.
3108                          */
3109                         if (mc->mc_aliquot == 0 && metaslab_bias_enabled) {
3110                                 vdev_stat_t *vs = &vd->vdev_stat;
3111                                 int64_t vs_free = vs->vs_space - vs->vs_alloc;
3112                                 int64_t mc_free = mc->mc_space - mc->mc_alloc;
3113                                 int64_t ratio;
3114
3115                                 /*
3116                                  * Calculate how much more or less we should
3117                                  * try to allocate from this device during
3118                                  * this iteration around the rotor.
3119                                  *
3120                                  * This basically introduces a zero-centered
3121                                  * bias towards the devices with the most
3122                                  * free space, while compensating for vdev
3123                                  * size differences.
3124                                  *
3125                                  * Examples:
3126                                  *  vdev V1 = 16M/128M
3127                                  *  vdev V2 = 16M/128M
3128                                  *  ratio(V1) = 100% ratio(V2) = 100%
3129                                  *
3130                                  *  vdev V1 = 16M/128M
3131                                  *  vdev V2 = 64M/128M
3132                                  *  ratio(V1) = 127% ratio(V2) =  72%
3133                                  *
3134                                  *  vdev V1 = 16M/128M
3135                                  *  vdev V2 = 64M/512M
3136                                  *  ratio(V1) =  40% ratio(V2) = 160%
3137                                  */
3138                                 ratio = (vs_free * mc->mc_alloc_groups * 100) /
3139                                     (mc_free + 1);
3140                                 mg->mg_bias = ((ratio - 100) *
3141                                     (int64_t)mg->mg_aliquot) / 100;
3142                         } else if (!metaslab_bias_enabled) {
3143                                 mg->mg_bias = 0;
3144                         }
3145
3146                         if ((flags & METASLAB_FASTWRITE) ||
3147                             atomic_add_64_nv(&mc->mc_aliquot, asize) >=
3148                             mg->mg_aliquot + mg->mg_bias) {
3149                                 mc->mc_rotor = mg->mg_next;
3150                                 mc->mc_aliquot = 0;
3151                         }
3152
3153                         DVA_SET_VDEV(&dva[d], vd->vdev_id);
3154                         DVA_SET_OFFSET(&dva[d], offset);
3155                         DVA_SET_GANG(&dva[d],
3156                             ((flags & METASLAB_GANG_HEADER) ? 1 : 0));
3157                         DVA_SET_ASIZE(&dva[d], asize);
3158
3159                         if (flags & METASLAB_FASTWRITE) {
3160                                 atomic_add_64(&vd->vdev_pending_fastwrite,
3161                                     psize);
3162                         }
3163
3164                         return (0);
3165                 }
3166 next:
3167                 mc->mc_rotor = mg->mg_next;
3168                 mc->mc_aliquot = 0;
3169         } while ((mg = mg->mg_next) != rotor);
3170
3171         /*
3172          * If we haven't tried hard, do so now.
3173          */
3174         if (!try_hard) {
3175                 try_hard = B_TRUE;
3176                 goto top;
3177         }
3178
3179         bzero(&dva[d], sizeof (dva_t));
3180
3181         metaslab_trace_add(zal, rotor, NULL, psize, d, TRACE_ENOSPC);
3182         return (SET_ERROR(ENOSPC));
3183 }
3184
3185 void
3186 metaslab_free_concrete(vdev_t *vd, uint64_t offset, uint64_t asize,
3187     uint64_t txg)
3188 {
3189         metaslab_t *msp;
3190         ASSERTV(spa_t *spa = vd->vdev_spa);
3191
3192         ASSERT3U(txg, ==, spa->spa_syncing_txg);
3193         ASSERT(vdev_is_concrete(vd));
3194         ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0);
3195         ASSERT3U(offset >> vd->vdev_ms_shift, <, vd->vdev_ms_count);
3196
3197         msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];
3198
3199         VERIFY(!msp->ms_condensing);
3200         VERIFY3U(offset, >=, msp->ms_start);
3201         VERIFY3U(offset + asize, <=, msp->ms_start + msp->ms_size);
3202         VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift));
3203         VERIFY0(P2PHASE(asize, 1ULL << vd->vdev_ashift));
3204
3205         metaslab_check_free_impl(vd, offset, asize);
3206         mutex_enter(&msp->ms_lock);
3207         if (range_tree_space(msp->ms_freeingtree) == 0) {
3208                 vdev_dirty(vd, VDD_METASLAB, msp, txg);
3209         }
3210         range_tree_add(msp->ms_freeingtree, offset, asize);
3211         mutex_exit(&msp->ms_lock);
3212 }
3213
3214 /* ARGSUSED */
3215 void
3216 metaslab_free_impl_cb(uint64_t inner_offset, vdev_t *vd, uint64_t offset,
3217     uint64_t size, void *arg)
3218 {
3219         uint64_t *txgp = arg;
3220
3221         if (vd->vdev_ops->vdev_op_remap != NULL)
3222                 vdev_indirect_mark_obsolete(vd, offset, size, *txgp);
3223         else
3224                 metaslab_free_impl(vd, offset, size, *txgp);
3225 }
3226
3227 static void
3228 metaslab_free_impl(vdev_t *vd, uint64_t offset, uint64_t size,
3229     uint64_t txg)
3230 {
3231         spa_t *spa = vd->vdev_spa;
3232
3233         ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0);
3234
3235         if (txg > spa_freeze_txg(spa))
3236                 return;
3237
3238         if (spa->spa_vdev_removal != NULL &&
3239             spa->spa_vdev_removal->svr_vdev_id == vd->vdev_id &&
3240             vdev_is_concrete(vd)) {
3241                 /*
3242                  * Note: we check if the vdev is concrete because when
3243                  * we complete the removal, we first change the vdev to be
3244                  * an indirect vdev (in open context), and then (in syncing
3245                  * context) clear spa_vdev_removal.
3246                  */
3247                 free_from_removing_vdev(vd, offset, size, txg);
3248         } else if (vd->vdev_ops->vdev_op_remap != NULL) {
3249                 vdev_indirect_mark_obsolete(vd, offset, size, txg);
3250                 vd->vdev_ops->vdev_op_remap(vd, offset, size,
3251                     metaslab_free_impl_cb, &txg);
3252         } else {
3253                 metaslab_free_concrete(vd, offset, size, txg);
3254         }
3255 }
3256
3257 typedef struct remap_blkptr_cb_arg {
3258         blkptr_t *rbca_bp;
3259         spa_remap_cb_t rbca_cb;
3260         vdev_t *rbca_remap_vd;
3261         uint64_t rbca_remap_offset;
3262         void *rbca_cb_arg;
3263 } remap_blkptr_cb_arg_t;
3264
3265 void
3266 remap_blkptr_cb(uint64_t inner_offset, vdev_t *vd, uint64_t offset,
3267     uint64_t size, void *arg)
3268 {
3269         remap_blkptr_cb_arg_t *rbca = arg;
3270         blkptr_t *bp = rbca->rbca_bp;
3271
3272         /* We can not remap split blocks. */
3273         if (size != DVA_GET_ASIZE(&bp->blk_dva[0]))
3274                 return;
3275         ASSERT0(inner_offset);
3276
3277         if (rbca->rbca_cb != NULL) {
3278                 /*
3279                  * At this point we know that we are not handling split
3280                  * blocks and we invoke the callback on the previous
3281                  * vdev which must be indirect.
3282                  */
3283                 ASSERT3P(rbca->rbca_remap_vd->vdev_ops, ==, &vdev_indirect_ops);
3284
3285                 rbca->rbca_cb(rbca->rbca_remap_vd->vdev_id,
3286                     rbca->rbca_remap_offset, size, rbca->rbca_cb_arg);
3287
3288                 /* set up remap_blkptr_cb_arg for the next call */
3289                 rbca->rbca_remap_vd = vd;
3290                 rbca->rbca_remap_offset = offset;
3291         }
3292
3293         /*
3294          * The phys birth time is that of dva[0].  This ensures that we know
3295          * when each dva was written, so that resilver can determine which
3296          * blocks need to be scrubbed (i.e. those written during the time
3297          * the vdev was offline).  It also ensures that the key used in
3298          * the ARC hash table is unique (i.e. dva[0] + phys_birth).  If
3299          * we didn't change the phys_birth, a lookup in the ARC for a
3300          * remapped BP could find the data that was previously stored at
3301          * this vdev + offset.
3302          */
3303         vdev_t *oldvd = vdev_lookup_top(vd->vdev_spa,
3304             DVA_GET_VDEV(&bp->blk_dva[0]));
3305         vdev_indirect_births_t *vib = oldvd->vdev_indirect_births;
3306         bp->blk_phys_birth = vdev_indirect_births_physbirth(vib,
3307             DVA_GET_OFFSET(&bp->blk_dva[0]), DVA_GET_ASIZE(&bp->blk_dva[0]));
3308
3309         DVA_SET_VDEV(&bp->blk_dva[0], vd->vdev_id);
3310         DVA_SET_OFFSET(&bp->blk_dva[0], offset);
3311 }
3312
3313 /*
3314  * If the block pointer contains any indirect DVAs, modify them to refer to
3315  * concrete DVAs.  Note that this will sometimes not be possible, leaving
3316  * the indirect DVA in place.  This happens if the indirect DVA spans multiple
3317  * segments in the mapping (i.e. it is a "split block").
3318  *
3319  * If the BP was remapped, calls the callback on the original dva (note the
3320  * callback can be called multiple times if the original indirect DVA refers
3321  * to another indirect DVA, etc).
3322  *
3323  * Returns TRUE if the BP was remapped.
3324  */
3325 boolean_t
3326 spa_remap_blkptr(spa_t *spa, blkptr_t *bp, spa_remap_cb_t callback, void *arg)
3327 {
3328         remap_blkptr_cb_arg_t rbca;
3329
3330         if (!zfs_remap_blkptr_enable)
3331                 return (B_FALSE);
3332
3333         if (!spa_feature_is_enabled(spa, SPA_FEATURE_OBSOLETE_COUNTS))
3334                 return (B_FALSE);
3335
3336         /*
3337          * Dedup BP's can not be remapped, because ddt_phys_select() depends
3338          * on DVA[0] being the same in the BP as in the DDT (dedup table).
3339          */
3340         if (BP_GET_DEDUP(bp))
3341                 return (B_FALSE);
3342
3343         /*
3344          * Gang blocks can not be remapped, because
3345          * zio_checksum_gang_verifier() depends on the DVA[0] that's in
3346          * the BP used to read the gang block header (GBH) being the same
3347          * as the DVA[0] that we allocated for the GBH.
3348          */
3349         if (BP_IS_GANG(bp))
3350                 return (B_FALSE);
3351
3352         /*
3353          * Embedded BP's have no DVA to remap.
3354          */
3355         if (BP_GET_NDVAS(bp) < 1)
3356                 return (B_FALSE);
3357
3358         /*
3359          * Note: we only remap dva[0].  If we remapped other dvas, we
3360          * would no longer know what their phys birth txg is.
3361          */
3362         dva_t *dva = &bp->blk_dva[0];
3363
3364         uint64_t offset = DVA_GET_OFFSET(dva);
3365         uint64_t size = DVA_GET_ASIZE(dva);
3366         vdev_t *vd = vdev_lookup_top(spa, DVA_GET_VDEV(dva));
3367
3368         if (vd->vdev_ops->vdev_op_remap == NULL)
3369                 return (B_FALSE);
3370
3371         rbca.rbca_bp = bp;
3372         rbca.rbca_cb = callback;
3373         rbca.rbca_remap_vd = vd;
3374         rbca.rbca_remap_offset = offset;
3375         rbca.rbca_cb_arg = arg;
3376
3377         /*
3378          * remap_blkptr_cb() will be called in order for each level of
3379          * indirection, until a concrete vdev is reached or a split block is
3380          * encountered. old_vd and old_offset are updated within the callback
3381          * as we go from the one indirect vdev to the next one (either concrete
3382          * or indirect again) in that order.
3383          */
3384         vd->vdev_ops->vdev_op_remap(vd, offset, size, remap_blkptr_cb, &rbca);
3385
3386         /* Check if the DVA wasn't remapped because it is a split block */
3387         if (DVA_GET_VDEV(&rbca.rbca_bp->blk_dva[0]) == vd->vdev_id)
3388                 return (B_FALSE);
3389
3390         return (B_TRUE);
3391 }
3392
3393 /*
3394  * Undo the allocation of a DVA which happened in the given transaction group.
3395  */
3396 void
3397 metaslab_unalloc_dva(spa_t *spa, const dva_t *dva, uint64_t txg)
3398 {
3399         metaslab_t *msp;
3400         vdev_t *vd;
3401         uint64_t vdev = DVA_GET_VDEV(dva);
3402         uint64_t offset = DVA_GET_OFFSET(dva);
3403         uint64_t size = DVA_GET_ASIZE(dva);
3404
3405         ASSERT(DVA_IS_VALID(dva));
3406         ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0);
3407
3408         if (txg > spa_freeze_txg(spa))
3409                 return;
3410
3411         if ((vd = vdev_lookup_top(spa, vdev)) == NULL || !DVA_IS_VALID(dva) ||
3412             (offset >> vd->vdev_ms_shift) >= vd->vdev_ms_count) {
3413                 zfs_panic_recover("metaslab_free_dva(): bad DVA %llu:%llu:%llu",
3414                     (u_longlong_t)vdev, (u_longlong_t)offset,
3415                     (u_longlong_t)size);
3416                 return;
3417         }
3418
3419         ASSERT(!vd->vdev_removing);
3420         ASSERT(vdev_is_concrete(vd));
3421         ASSERT0(vd->vdev_indirect_config.vic_mapping_object);
3422         ASSERT3P(vd->vdev_indirect_mapping, ==, NULL);
3423
3424         if (DVA_GET_GANG(dva))
3425                 size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);
3426
3427         msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];
3428
3429         mutex_enter(&msp->ms_lock);
3430         range_tree_remove(msp->ms_alloctree[txg & TXG_MASK],
3431             offset, size);
3432
3433         VERIFY(!msp->ms_condensing);
3434         VERIFY3U(offset, >=, msp->ms_start);
3435         VERIFY3U(offset + size, <=, msp->ms_start + msp->ms_size);
3436         VERIFY3U(range_tree_space(msp->ms_tree) + size, <=,
3437             msp->ms_size);
3438         VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift));
3439         VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift));
3440         range_tree_add(msp->ms_tree, offset, size);
3441         mutex_exit(&msp->ms_lock);
3442 }
3443
3444 /*
3445  * Free the block represented by DVA in the context of the specified
3446  * transaction group.
3447  */
3448 void
3449 metaslab_free_dva(spa_t *spa, const dva_t *dva, uint64_t txg)
3450 {
3451         uint64_t vdev = DVA_GET_VDEV(dva);
3452         uint64_t offset = DVA_GET_OFFSET(dva);
3453         uint64_t size = DVA_GET_ASIZE(dva);
3454         vdev_t *vd = vdev_lookup_top(spa, vdev);
3455
3456         ASSERT(DVA_IS_VALID(dva));
3457         ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0);
3458
3459         if (DVA_GET_GANG(dva)) {
3460                 size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);
3461         }
3462
3463         metaslab_free_impl(vd, offset, size, txg);
3464 }
3465
3466 /*
3467  * Reserve some allocation slots. The reservation system must be called
3468  * before we call into the allocator. If there aren't any available slots
3469  * then the I/O will be throttled until an I/O completes and its slots are
3470  * freed up. The function returns true if it was successful in placing
3471  * the reservation.
3472  */
3473 boolean_t
3474 metaslab_class_throttle_reserve(metaslab_class_t *mc, int slots, zio_t *zio,
3475     int flags)
3476 {
3477         uint64_t available_slots = 0;
3478         boolean_t slot_reserved = B_FALSE;
3479
3480         ASSERT(mc->mc_alloc_throttle_enabled);
3481         mutex_enter(&mc->mc_lock);
3482
3483         uint64_t reserved_slots = refcount_count(&mc->mc_alloc_slots);
3484         if (reserved_slots < mc->mc_alloc_max_slots)
3485                 available_slots = mc->mc_alloc_max_slots - reserved_slots;
3486
3487         if (slots <= available_slots || GANG_ALLOCATION(flags)) {
3488                 /*
3489                  * We reserve the slots individually so that we can unreserve
3490                  * them individually when an I/O completes.
3491                  */
3492                 for (int d = 0; d < slots; d++) {
3493                         reserved_slots = refcount_add(&mc->mc_alloc_slots, zio);
3494                 }
3495                 zio->io_flags |= ZIO_FLAG_IO_ALLOCATING;
3496                 slot_reserved = B_TRUE;
3497         }
3498
3499         mutex_exit(&mc->mc_lock);
3500         return (slot_reserved);
3501 }
3502
3503 void
3504 metaslab_class_throttle_unreserve(metaslab_class_t *mc, int slots, zio_t *zio)
3505 {
3506         ASSERT(mc->mc_alloc_throttle_enabled);
3507         mutex_enter(&mc->mc_lock);
3508         for (int d = 0; d < slots; d++) {
3509                 (void) refcount_remove(&mc->mc_alloc_slots, zio);
3510         }
3511         mutex_exit(&mc->mc_lock);
3512 }
3513
3514 static int
3515 metaslab_claim_concrete(vdev_t *vd, uint64_t offset, uint64_t size,
3516     uint64_t txg)
3517 {
3518         metaslab_t *msp;
3519         spa_t *spa = vd->vdev_spa;
3520         int error = 0;
3521
3522         if (offset >> vd->vdev_ms_shift >= vd->vdev_ms_count)
3523                 return (ENXIO);
3524
3525         ASSERT3P(vd->vdev_ms, !=, NULL);
3526         msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];
3527
3528         mutex_enter(&msp->ms_lock);
3529
3530         if ((txg != 0 && spa_writeable(spa)) || !msp->ms_loaded)
3531                 error = metaslab_activate(msp, METASLAB_WEIGHT_SECONDARY);
3532
3533         if (error == 0 && !range_tree_contains(msp->ms_tree, offset, size))
3534                 error = SET_ERROR(ENOENT);
3535
3536         if (error || txg == 0) {        /* txg == 0 indicates dry run */
3537                 mutex_exit(&msp->ms_lock);
3538                 return (error);
3539         }
3540
3541         VERIFY(!msp->ms_condensing);
3542         VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift));
3543         VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift));
3544         VERIFY3U(range_tree_space(msp->ms_tree) - size, <=, msp->ms_size);
3545         range_tree_remove(msp->ms_tree, offset, size);
3546
3547         if (spa_writeable(spa)) {       /* don't dirty if we're zdb(1M) */
3548                 if (range_tree_space(msp->ms_alloctree[txg & TXG_MASK]) == 0)
3549                         vdev_dirty(vd, VDD_METASLAB, msp, txg);
3550                 range_tree_add(msp->ms_alloctree[txg & TXG_MASK], offset, size);
3551         }
3552
3553         mutex_exit(&msp->ms_lock);
3554
3555         return (0);
3556 }
3557
3558 typedef struct metaslab_claim_cb_arg_t {
3559         uint64_t        mcca_txg;
3560         int             mcca_error;
3561 } metaslab_claim_cb_arg_t;
3562
3563 /* ARGSUSED */
3564 static void
3565 metaslab_claim_impl_cb(uint64_t inner_offset, vdev_t *vd, uint64_t offset,
3566     uint64_t size, void *arg)
3567 {
3568         metaslab_claim_cb_arg_t *mcca_arg = arg;
3569
3570         if (mcca_arg->mcca_error == 0) {
3571                 mcca_arg->mcca_error = metaslab_claim_concrete(vd, offset,
3572                     size, mcca_arg->mcca_txg);
3573         }
3574 }
3575
3576 int
3577 metaslab_claim_impl(vdev_t *vd, uint64_t offset, uint64_t size, uint64_t txg)
3578 {
3579         if (vd->vdev_ops->vdev_op_remap != NULL) {
3580                 metaslab_claim_cb_arg_t arg;
3581
3582                 /*
3583                  * Only zdb(1M) can claim on indirect vdevs.  This is used
3584                  * to detect leaks of mapped space (that are not accounted
3585                  * for in the obsolete counts, spacemap, or bpobj).
3586                  */
3587                 ASSERT(!spa_writeable(vd->vdev_spa));
3588                 arg.mcca_error = 0;
3589                 arg.mcca_txg = txg;
3590
3591                 vd->vdev_ops->vdev_op_remap(vd, offset, size,
3592                     metaslab_claim_impl_cb, &arg);
3593
3594                 if (arg.mcca_error == 0) {
3595                         arg.mcca_error = metaslab_claim_concrete(vd,
3596                             offset, size, txg);
3597                 }
3598                 return (arg.mcca_error);
3599         } else {
3600                 return (metaslab_claim_concrete(vd, offset, size, txg));
3601         }
3602 }
3603
3604 /*
3605  * Intent log support: upon opening the pool after a crash, notify the SPA
3606  * of blocks that the intent log has allocated for immediate write, but
3607  * which are still considered free by the SPA because the last transaction
3608  * group didn't commit yet.
3609  */
3610 static int
3611 metaslab_claim_dva(spa_t *spa, const dva_t *dva, uint64_t txg)
3612 {
3613         uint64_t vdev = DVA_GET_VDEV(dva);
3614         uint64_t offset = DVA_GET_OFFSET(dva);
3615         uint64_t size = DVA_GET_ASIZE(dva);
3616         vdev_t *vd;
3617
3618         if ((vd = vdev_lookup_top(spa, vdev)) == NULL) {
3619                 return (SET_ERROR(ENXIO));
3620         }
3621
3622         ASSERT(DVA_IS_VALID(dva));
3623
3624         if (DVA_GET_GANG(dva))
3625                 size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);
3626
3627         return (metaslab_claim_impl(vd, offset, size, txg));
3628 }
3629
3630 int
3631 metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, blkptr_t *bp,
3632     int ndvas, uint64_t txg, blkptr_t *hintbp, int flags,
3633     zio_alloc_list_t *zal, zio_t *zio)
3634 {
3635         dva_t *dva = bp->blk_dva;
3636         dva_t *hintdva = hintbp->blk_dva;
3637         int error = 0;
3638
3639         ASSERT(bp->blk_birth == 0);
3640         ASSERT(BP_PHYSICAL_BIRTH(bp) == 0);
3641
3642         spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER);
3643
3644         if (mc->mc_rotor == NULL) {     /* no vdevs in this class */
3645                 spa_config_exit(spa, SCL_ALLOC, FTAG);
3646                 return (SET_ERROR(ENOSPC));
3647         }
3648
3649         ASSERT(ndvas > 0 && ndvas <= spa_max_replication(spa));
3650         ASSERT(BP_GET_NDVAS(bp) == 0);
3651         ASSERT(hintbp == NULL || ndvas <= BP_GET_NDVAS(hintbp));
3652         ASSERT3P(zal, !=, NULL);
3653
3654         for (int d = 0; d < ndvas; d++) {
3655                 error = metaslab_alloc_dva(spa, mc, psize, dva, d, hintdva,
3656                     txg, flags, zal);
3657                 if (error != 0) {
3658                         for (d--; d >= 0; d--) {
3659                                 metaslab_unalloc_dva(spa, &dva[d], txg);
3660                                 metaslab_group_alloc_decrement(spa,
3661                                     DVA_GET_VDEV(&dva[d]), zio, flags);
3662                                 bzero(&dva[d], sizeof (dva_t));
3663                         }
3664                         spa_config_exit(spa, SCL_ALLOC, FTAG);
3665                         return (error);
3666                 } else {
3667                         /*
3668                          * Update the metaslab group's queue depth
3669                          * based on the newly allocated dva.
3670                          */
3671                         metaslab_group_alloc_increment(spa,
3672                             DVA_GET_VDEV(&dva[d]), zio, flags);
3673                 }
3674
3675         }
3676         ASSERT(error == 0);
3677         ASSERT(BP_GET_NDVAS(bp) == ndvas);
3678
3679         spa_config_exit(spa, SCL_ALLOC, FTAG);
3680
3681         BP_SET_BIRTH(bp, txg, 0);
3682
3683         return (0);
3684 }
3685
3686 void
3687 metaslab_free(spa_t *spa, const blkptr_t *bp, uint64_t txg, boolean_t now)
3688 {
3689         const dva_t *dva = bp->blk_dva;
3690         int ndvas = BP_GET_NDVAS(bp);
3691
3692         ASSERT(!BP_IS_HOLE(bp));
3693         ASSERT(!now || bp->blk_birth >= spa_syncing_txg(spa));
3694
3695         spa_config_enter(spa, SCL_FREE, FTAG, RW_READER);
3696
3697         for (int d = 0; d < ndvas; d++) {
3698                 if (now) {
3699                         metaslab_unalloc_dva(spa, &dva[d], txg);
3700                 } else {
3701                         metaslab_free_dva(spa, &dva[d], txg);
3702                 }
3703         }
3704
3705         spa_config_exit(spa, SCL_FREE, FTAG);
3706 }
3707
3708 int
3709 metaslab_claim(spa_t *spa, const blkptr_t *bp, uint64_t txg)
3710 {
3711         const dva_t *dva = bp->blk_dva;
3712         int ndvas = BP_GET_NDVAS(bp);
3713         int error = 0;
3714
3715         ASSERT(!BP_IS_HOLE(bp));
3716
3717         if (txg != 0) {
3718                 /*
3719                  * First do a dry run to make sure all DVAs are claimable,
3720                  * so we don't have to unwind from partial failures below.
3721                  */
3722                 if ((error = metaslab_claim(spa, bp, 0)) != 0)
3723                         return (error);
3724         }
3725
3726         spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER);
3727
3728         for (int d = 0; d < ndvas; d++)
3729                 if ((error = metaslab_claim_dva(spa, &dva[d], txg)) != 0)
3730                         break;
3731
3732         spa_config_exit(spa, SCL_ALLOC, FTAG);
3733
3734         ASSERT(error == 0 || txg == 0);
3735
3736         return (error);
3737 }
3738
3739 void
3740 metaslab_fastwrite_mark(spa_t *spa, const blkptr_t *bp)
3741 {
3742         const dva_t *dva = bp->blk_dva;
3743         int ndvas = BP_GET_NDVAS(bp);
3744         uint64_t psize = BP_GET_PSIZE(bp);
3745         int d;
3746         vdev_t *vd;
3747
3748         ASSERT(!BP_IS_HOLE(bp));
3749         ASSERT(!BP_IS_EMBEDDED(bp));
3750         ASSERT(psize > 0);
3751
3752         spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
3753
3754         for (d = 0; d < ndvas; d++) {
3755                 if ((vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d]))) == NULL)
3756                         continue;
3757                 atomic_add_64(&vd->vdev_pending_fastwrite, psize);
3758         }
3759
3760         spa_config_exit(spa, SCL_VDEV, FTAG);
3761 }
3762
3763 void
3764 metaslab_fastwrite_unmark(spa_t *spa, const blkptr_t *bp)
3765 {
3766         const dva_t *dva = bp->blk_dva;
3767         int ndvas = BP_GET_NDVAS(bp);
3768         uint64_t psize = BP_GET_PSIZE(bp);
3769         int d;
3770         vdev_t *vd;
3771
3772         ASSERT(!BP_IS_HOLE(bp));
3773         ASSERT(!BP_IS_EMBEDDED(bp));
3774         ASSERT(psize > 0);
3775
3776         spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
3777
3778         for (d = 0; d < ndvas; d++) {
3779                 if ((vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d]))) == NULL)
3780                         continue;
3781                 ASSERT3U(vd->vdev_pending_fastwrite, >=, psize);
3782                 atomic_sub_64(&vd->vdev_pending_fastwrite, psize);
3783         }
3784
3785         spa_config_exit(spa, SCL_VDEV, FTAG);
3786 }
3787
3788 /* ARGSUSED */
3789 static void
3790 metaslab_check_free_impl_cb(uint64_t inner, vdev_t *vd, uint64_t offset,
3791     uint64_t size, void *arg)
3792 {
3793         if (vd->vdev_ops == &vdev_indirect_ops)
3794                 return;
3795
3796         metaslab_check_free_impl(vd, offset, size);
3797 }
3798
3799 static void
3800 metaslab_check_free_impl(vdev_t *vd, uint64_t offset, uint64_t size)
3801 {
3802         metaslab_t *msp;
3803         ASSERTV(spa_t *spa = vd->vdev_spa);
3804
3805         if ((zfs_flags & ZFS_DEBUG_ZIO_FREE) == 0)
3806                 return;
3807
3808         if (vd->vdev_ops->vdev_op_remap != NULL) {
3809                 vd->vdev_ops->vdev_op_remap(vd, offset, size,
3810                     metaslab_check_free_impl_cb, NULL);
3811                 return;
3812         }
3813
3814         ASSERT(vdev_is_concrete(vd));
3815         ASSERT3U(offset >> vd->vdev_ms_shift, <, vd->vdev_ms_count);
3816         ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0);
3817
3818         msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];
3819
3820         mutex_enter(&msp->ms_lock);
3821         if (msp->ms_loaded)
3822                 range_tree_verify(msp->ms_tree, offset, size);
3823
3824         range_tree_verify(msp->ms_freeingtree, offset, size);
3825         range_tree_verify(msp->ms_freedtree, offset, size);
3826         for (int j = 0; j < TXG_DEFER_SIZE; j++)
3827                 range_tree_verify(msp->ms_defertree[j], offset, size);
3828         mutex_exit(&msp->ms_lock);
3829 }
3830
3831 void
3832 metaslab_check_free(spa_t *spa, const blkptr_t *bp)
3833 {
3834         if ((zfs_flags & ZFS_DEBUG_ZIO_FREE) == 0)
3835                 return;
3836
3837         spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
3838         for (int i = 0; i < BP_GET_NDVAS(bp); i++) {
3839                 uint64_t vdev = DVA_GET_VDEV(&bp->blk_dva[i]);
3840                 vdev_t *vd = vdev_lookup_top(spa, vdev);
3841                 uint64_t offset = DVA_GET_OFFSET(&bp->blk_dva[i]);
3842                 uint64_t size = DVA_GET_ASIZE(&bp->blk_dva[i]);
3843
3844                 if (DVA_GET_GANG(&bp->blk_dva[i]))
3845                         size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);
3846
3847                 ASSERT3P(vd, !=, NULL);
3848
3849                 metaslab_check_free_impl(vd, offset, size);
3850         }
3851         spa_config_exit(spa, SCL_VDEV, FTAG);
3852 }
3853
/*
 * Module parameter registrations for the metaslab allocator tunables.
 * Compiled only when both _KERNEL and HAVE_SPL are defined.  Each
 * module_param() is paired with a MODULE_PARM_DESC() giving a one-line
 * description of the tunable.
 *
 * NOTE(review): 0644 is presumably the sysfs permission mode of the
 * parameter (writable by owner, readable by all) -- confirm against the
 * Linux module_param() documentation.
 */
#if defined(_KERNEL) && defined(HAVE_SPL)
/* CSTYLED */
module_param(metaslab_aliquot, ulong, 0644);
MODULE_PARM_DESC(metaslab_aliquot,
	"allocation granularity (a.k.a. stripe size)");

module_param(metaslab_debug_load, int, 0644);
MODULE_PARM_DESC(metaslab_debug_load,
	"load all metaslabs when pool is first opened");

module_param(metaslab_debug_unload, int, 0644);
MODULE_PARM_DESC(metaslab_debug_unload,
	"prevent metaslabs from being unloaded");

module_param(metaslab_preload_enabled, int, 0644);
MODULE_PARM_DESC(metaslab_preload_enabled,
	"preload potential metaslabs during reassessment");

/* Metaslab group thresholds. */
module_param(zfs_mg_noalloc_threshold, int, 0644);
MODULE_PARM_DESC(zfs_mg_noalloc_threshold,
	"percentage of free space for metaslab group to allow allocation");

module_param(zfs_mg_fragmentation_threshold, int, 0644);
MODULE_PARM_DESC(zfs_mg_fragmentation_threshold,
	"fragmentation for metaslab group to allow allocation");

/* Per-metaslab fragmentation threshold. */
module_param(zfs_metaslab_fragmentation_threshold, int, 0644);
MODULE_PARM_DESC(zfs_metaslab_fragmentation_threshold,
	"fragmentation for metaslab to allow allocation");

/* Switches for the metaslab weighting and biasing heuristics. */
module_param(metaslab_fragmentation_factor_enabled, int, 0644);
MODULE_PARM_DESC(metaslab_fragmentation_factor_enabled,
	"use the fragmentation metric to prefer less fragmented metaslabs");

module_param(metaslab_lba_weighting_enabled, int, 0644);
MODULE_PARM_DESC(metaslab_lba_weighting_enabled,
	"prefer metaslabs with lower LBAs");

module_param(metaslab_bias_enabled, int, 0644);
MODULE_PARM_DESC(metaslab_bias_enabled,
	"enable metaslab group biasing");

/* Segment-based metaslab selection. */
module_param(zfs_metaslab_segment_weight_enabled, int, 0644);
MODULE_PARM_DESC(zfs_metaslab_segment_weight_enabled,
	"enable segment-based metaslab selection");

module_param(zfs_metaslab_switch_threshold, int, 0644);
MODULE_PARM_DESC(zfs_metaslab_switch_threshold,
	"segment-based metaslab selection maximum buckets before switching");

/* CSTYLED */
module_param(metaslab_force_ganging, ulong, 0644);
MODULE_PARM_DESC(metaslab_force_ganging,
	"blocks larger than this size are forced to be gang blocks");
#endif /* _KERNEL && HAVE_SPL */