granicus.if.org Git - zfs/blob - module/zfs/metaslab.c

   1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  23  * Copyright (c) 2011, 2019 by Delphix. All rights reserved.
  24  * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
  25  * Copyright (c) 2017, Intel Corporation.
  26  */
  27
  28 #include <sys/zfs_context.h>
  29 #include <sys/dmu.h>
  30 #include <sys/dmu_tx.h>
  31 #include <sys/space_map.h>
  32 #include <sys/metaslab_impl.h>
  33 #include <sys/vdev_impl.h>
  34 #include <sys/zio.h>
  35 #include <sys/spa_impl.h>
  36 #include <sys/zfeature.h>
  37 #include <sys/vdev_indirect_mapping.h>
  38 #include <sys/zap.h>
  39
  40 #define WITH_DF_BLOCK_ALLOCATOR /* NOTE(review): users are outside this excerpt */
  41 /* Nonzero when flags has METASLAB_GANG_CHILD or METASLAB_GANG_HEADER set. */
  42 #define GANG_ALLOCATION(flags) \
  43         ((flags) & (METASLAB_GANG_CHILD | METASLAB_GANG_HEADER))
  44
  45 /*
  46  * Metaslab granularity, in bytes. This is roughly similar to what would be
  47  * referred to as the "stripe size" in traditional RAID arrays. In normal
  48  * operation, we will try to write this amount of data to a top-level vdev
  49  * before moving on to the next one.
  50  */
  51 unsigned long metaslab_aliquot = 512 << 10; /* 512 KiB */
  52
  53 /*
  54  * For testing, make some blocks above a certain size be gang blocks.
  55  */
  56 unsigned long metaslab_force_ganging = SPA_MAXBLOCKSIZE + 1;
  57
  58 /*
  59  * In pools where the log space map feature is not enabled we touch
  60  * multiple metaslabs (and their respective space maps) with each
  61  * transaction group. Thus, we benefit from having a small space map
  62  * block size since it allows us to issue more I/O operations scattered
  63  * around the disk. So a sane default for the space map block size
  64  * is between 8K and 16K; the default below is 16K.
  65  */
  66 int zfs_metaslab_sm_blksz_no_log = (1 << 14); /* 16 KiB */
  67
  68 /*
  69  * When the log space map feature is enabled, we accumulate a lot of
  70  * changes per metaslab that are flushed once in a while so we benefit
  71  * from a bigger block size like 128K for the metaslab space maps.
  72  */
  73 int zfs_metaslab_sm_blksz_with_log = (1 << 17); /* 128 KiB */
  74
  75 /*
  76  * The in-core space map representation is more compact than its on-disk form.
  77  * The zfs_condense_pct determines how much more compact the in-core
  78  * space map representation must be before we compact it on-disk.
  79  * Values should be greater than or equal to 100.
  80  */
  81 int zfs_condense_pct = 200;
  82
  83 /*
  84  * Condensing a metaslab is not guaranteed to actually reduce the amount of
  85  * space used on disk. In particular, a space map uses data in increments of
  86  * MAX(1 << ashift, space_map_blksz), so a metaslab might use the
  87  * same number of blocks after condensing. Since the goal of condensing is to
  88  * reduce the number of IOPs required to read the space map, we only want to
  89  * condense when we can be sure we will reduce the number of blocks used by the
  90  * space map. Unfortunately, we cannot precisely compute whether or not this is
  91  * the case in metaslab_should_condense since we are holding ms_lock. Instead,
  92  * we apply the following heuristic: do not condense a spacemap unless the
  93  * uncondensed size consumes greater than zfs_metaslab_condense_block_threshold
  94  * blocks.
  95  */
  96 int zfs_metaslab_condense_block_threshold = 4;
  97
  98 /*
  99  * The zfs_mg_noalloc_threshold defines which metaslab groups should
 100  * be eligible for allocation. The value is defined as a percentage of
 101  * free space. Metaslab groups that have more free space than
 102  * zfs_mg_noalloc_threshold are always eligible for allocations. Once
 103  * a metaslab group's free space is less than or equal to the
 104  * zfs_mg_noalloc_threshold the allocator will avoid allocating to that
 105  * group unless all groups in the pool have reached zfs_mg_noalloc_threshold.
 106  * Once all groups in the pool reach zfs_mg_noalloc_threshold then all
 107  * groups are allowed to accept allocations. Gang blocks are always
 108  * eligible to allocate on any metaslab group. The default value of 0 means
 109  * no metaslab group will be excluded based on this criterion.
 110  */
 111 int zfs_mg_noalloc_threshold = 0;
 112
 113 /*
 114  * Metaslab groups are considered eligible for allocations if their
 115  * fragmentation metric (measured as a percentage) is less than or
 116  * equal to zfs_mg_fragmentation_threshold. If a metaslab group
 117  * exceeds this threshold then it will be skipped unless all metaslab
 118  * groups within the metaslab class have also crossed this threshold.
 119  *
 120  * This tunable was introduced to avoid edge cases where we continue
 121  * allocating from very fragmented disks in our pool while other, less
 122  * fragmented disks, exist. On the other hand, if all disks in the
 123  * pool are uniformly approaching the threshold, the threshold can
 124  * be a speed bump in performance, where we keep switching the disks
 125  * that we allocate from (e.g. we allocate some segments from disk A,
 126  * pushing it past the threshold, while freeing segments from disk
 127  * B, bringing its fragmentation below the threshold).
 128  *
 129  * Empirically, we've seen that our vdev selection for allocations is
 130  * good enough that fragmentation increases uniformly across all vdevs
 131  * the majority of the time. Thus we set the threshold percentage high
 132  * enough to avoid hitting the speed bump on pools that are being pushed
 133  * to the edge.
 134  */
 135 int zfs_mg_fragmentation_threshold = 95;
 136
 137 /*
 138  * Allow metaslabs to keep their active state as long as their fragmentation
 139  * percentage is less than or equal to zfs_metaslab_fragmentation_threshold. An
 140  * active metaslab that exceeds this threshold will no longer keep its active
 141  * status allowing better metaslabs to be selected.
 142  */
 143 int zfs_metaslab_fragmentation_threshold = 70;
 144
 145 /*
 146  * When set, all metaslabs are loaded when the pool is first opened.
 147  */
 148 int metaslab_debug_load = 0;
 149
 150 /*
 151  * When set, metaslabs are prevented from being unloaded.
 152  */
 153 int metaslab_debug_unload = 0;
 154
 155 /*
 156  * Minimum size which forces the dynamic allocator to change
 157  * its allocation strategy.  Once the space map cannot satisfy
 158  * an allocation of this size then it switches to using a more
 159  * aggressive strategy (i.e. search by size rather than offset).
 160  */
 161 uint64_t metaslab_df_alloc_threshold = SPA_OLD_MAXBLOCKSIZE;
 162
 163 /*
 164  * The minimum free space, in percent, which must be available
 165  * in a space map to continue allocations in a first-fit fashion.
 166  * Once the space map's free space drops below this level we dynamically
 167  * switch to using best-fit allocations.
 168  */
 169 int metaslab_df_free_pct = 4;
 170
 171 /*
 172  * Maximum distance to search forward from the last offset. Without this
 173  * limit, fragmented pools can see >100,000 iterations and
 174  * metaslab_block_picker() becomes the performance limiting factor on
 175  * high-performance storage.
 176  *
 177  * With the default setting of 16MB, we typically see less than 500
 178  * iterations, even with very fragmented, ashift=9 pools. The maximum number
 179  * of iterations possible is:
 180  *     metaslab_df_max_search / (2 * (1<<ashift))
 181  * With the default setting of 16MB this is 16*1024 (with ashift=9) or
 182  * 2048 (with ashift=12).
 183  */
 184 int metaslab_df_max_search = 16 * 1024 * 1024; /* 16 MiB */
 185
 186 /*
 187  * If we are not searching forward (due to metaslab_df_max_search,
 188  * metaslab_df_free_pct, or metaslab_df_alloc_threshold), this tunable
 189  * controls what segment is used.  If it is set, we will use the largest free
 190  * segment.  If it is not set, we will use a segment of exactly the requested
 191  * size (or larger).
 192  */
 193 int metaslab_df_use_largest_segment = B_FALSE;
 194
 195 /*
 196  * Percentage of all cpus that can be used by the metaslab taskq.
 197  */
 198 int metaslab_load_pct = 50;
 199
 200 /*
 201  * Determines how many txgs a metaslab may remain loaded without having any
 202  * allocations from it. As long as a metaslab continues to be used we will
 203  * keep it loaded.
 204  */
 205 int metaslab_unload_delay = TXG_SIZE * 2;
 206
 207 /*
 208  * Max number of metaslabs per group to preload.
 209  */
 210 int metaslab_preload_limit = SPA_DVAS_PER_BP;
 211
 212 /*
 213  * Enable/disable preloading of metaslabs.
 214  */
 215 int metaslab_preload_enabled = B_TRUE;
 216
 217 /*
 218  * Enable/disable fragmentation weighting on metaslabs.
 219  */
 220 int metaslab_fragmentation_factor_enabled = B_TRUE;
 221
 222 /*
 223  * Enable/disable lba weighting (i.e. outer tracks are given preference).
 224  */
 225 int metaslab_lba_weighting_enabled = B_TRUE;
 226
 227 /*
 228  * Enable/disable metaslab group biasing.
 229  */
 230 int metaslab_bias_enabled = B_TRUE;
 231
 232 /*
 233  * Enable/disable remapping of indirect DVAs to their concrete vdevs.
 234  */
 235 boolean_t zfs_remap_blkptr_enable = B_TRUE;
 236
 237 /*
 238  * Enable/disable segment-based metaslab selection.
 239  */
 240 int zfs_metaslab_segment_weight_enabled = B_TRUE;
 241
 242 /*
 243  * When using segment-based metaslab selection, we will continue
 244  * allocating from the active metaslab until we have exhausted
 245  * zfs_metaslab_switch_threshold of its buckets.
 246  */
 247 int zfs_metaslab_switch_threshold = 2;
 248
 249 /*
 250  * Internal switch to enable/disable the metaslab allocation tracing
 251  * facility.
 252  */
 253 #ifdef _METASLAB_TRACING
 254 boolean_t metaslab_trace_enabled = B_TRUE;
 255 #endif
 256
 257 /*
 258  * Maximum entries that the metaslab allocation tracing facility will keep
 259  * in a given list when running in non-debug mode. We limit the number
 260  * of entries in non-debug mode to prevent us from using up too much memory.
 261  * The limit should be sufficiently large that we don't expect any allocation
 262  * to ever exceed this value. In debug mode, the system will panic if this
 263  * limit is ever reached allowing for further investigation.
 264  */
 265 #ifdef _METASLAB_TRACING
 266 uint64_t metaslab_trace_max_entries = 5000;
 267 #endif
 268
 269 /*
 270  * Maximum number of metaslabs per group that can be disabled
 271  * simultaneously.
 272  */
 273 int max_disabled_ms = 3;
 274
 275 /*
 276  * Time (in seconds) to respect ms_max_size when the metaslab is not loaded.
 277  * To avoid 64-bit overflow, don't set above UINT32_MAX.
 278  */
 279 unsigned long zfs_metaslab_max_size_cache_sec = 3600; /* 1 hour */
 280
 281 /*
 282  * Maximum percentage of memory to use on storing loaded metaslabs. If loading
 283  * a metaslab would take it over this percentage, the oldest selected metaslab
 284  * is automatically unloaded.
 285  */
 286 int zfs_metaslab_mem_limit = 75;
 287
 288 static uint64_t metaslab_weight(metaslab_t *);
 289 static void metaslab_set_fragmentation(metaslab_t *);
 290 static void metaslab_free_impl(vdev_t *, uint64_t, uint64_t, boolean_t);
 291 static void metaslab_check_free_impl(vdev_t *, uint64_t, uint64_t);
 292 /* Prototypes above and below forward-declare file-local (static) helpers. */
 293 static void metaslab_passivate(metaslab_t *msp, uint64_t weight);
 294 static uint64_t metaslab_weight_from_range_tree(metaslab_t *msp);
 295 static void metaslab_flush_update(metaslab_t *, dmu_tx_t *);
 296 static unsigned int metaslab_idx_func(multilist_t *, void *);
 297 static void metaslab_evict(metaslab_t *, uint64_t);
 298 #ifdef _METASLAB_TRACING
 299 kmem_cache_t *metaslab_alloc_trace_cache; /* only built with tracing */
 300 #endif /* _METASLAB_TRACING */
 301
 302 /*
 303  * ==========================================================================
 304  * Metaslab classes
 305  * ==========================================================================
 306  */
 307 metaslab_class_t *
 308 metaslab_class_create(spa_t *spa, metaslab_ops_t *ops)
 309 {
 310         metaslab_class_t *mc;
 311
 312         mc = kmem_zalloc(sizeof (metaslab_class_t), KM_SLEEP);
 313
 314         mc->mc_spa = spa;
 315         mc->mc_rotor = NULL;
 316         mc->mc_ops = ops;
 317         mutex_init(&mc->mc_lock, NULL, MUTEX_DEFAULT, NULL);
 318         mc->mc_metaslab_txg_list = multilist_create(sizeof (metaslab_t),
 319             offsetof(metaslab_t, ms_class_txg_node), metaslab_idx_func);
 320         mc->mc_alloc_slots = kmem_zalloc(spa->spa_alloc_count *
 321             sizeof (zfs_refcount_t), KM_SLEEP);
 322         mc->mc_alloc_max_slots = kmem_zalloc(spa->spa_alloc_count *
 323             sizeof (uint64_t), KM_SLEEP);
 324         for (int i = 0; i < spa->spa_alloc_count; i++)
 325                 zfs_refcount_create_tracked(&mc->mc_alloc_slots[i]);
 326
 327         return (mc);
 328 }
 329
 330 void
 331 metaslab_class_destroy(metaslab_class_t *mc)
 332 {
 333         ASSERT(mc->mc_rotor == NULL);
 334         ASSERT(mc->mc_alloc == 0);
 335         ASSERT(mc->mc_deferred == 0);
 336         ASSERT(mc->mc_space == 0);
 337         ASSERT(mc->mc_dspace == 0);
 338
 339         for (int i = 0; i < mc->mc_spa->spa_alloc_count; i++)
 340                 zfs_refcount_destroy(&mc->mc_alloc_slots[i]);
 341         kmem_free(mc->mc_alloc_slots, mc->mc_spa->spa_alloc_count *
 342             sizeof (zfs_refcount_t));
 343         kmem_free(mc->mc_alloc_max_slots, mc->mc_spa->spa_alloc_count *
 344             sizeof (uint64_t));
 345         mutex_destroy(&mc->mc_lock);
 346         multilist_destroy(mc->mc_metaslab_txg_list);
 347         kmem_free(mc, sizeof (metaslab_class_t));
 348 }
 349
 350 int
 351 metaslab_class_validate(metaslab_class_t *mc)
 352 {
 353         metaslab_group_t *mg;
 354         vdev_t *vd;
 355
 356         /*
 357          * Must hold one of the spa_config locks.
 358          */
 359         ASSERT(spa_config_held(mc->mc_spa, SCL_ALL, RW_READER) ||
 360             spa_config_held(mc->mc_spa, SCL_ALL, RW_WRITER));
 361
 362         if ((mg = mc->mc_rotor) == NULL)
 363                 return (0);
 364
 365         do {
 366                 vd = mg->mg_vd;
 367                 ASSERT(vd->vdev_mg != NULL);
 368                 ASSERT3P(vd->vdev_top, ==, vd);
 369                 ASSERT3P(mg->mg_class, ==, mc);
 370                 ASSERT3P(vd->vdev_ops, !=, &vdev_hole_ops);
 371         } while ((mg = mg->mg_next) != mc->mc_rotor);
 372
 373         return (0);
 374 }
 375
 376 static void
 377 metaslab_class_space_update(metaslab_class_t *mc, int64_t alloc_delta,
 378     int64_t defer_delta, int64_t space_delta, int64_t dspace_delta)
 379 {
 380         atomic_add_64(&mc->mc_alloc, alloc_delta);
 381         atomic_add_64(&mc->mc_deferred, defer_delta);
 382         atomic_add_64(&mc->mc_space, space_delta);
 383         atomic_add_64(&mc->mc_dspace, dspace_delta);
 384 }
 385
 386 uint64_t
 387 metaslab_class_get_alloc(metaslab_class_t *mc)
 388 {
 389         return (mc->mc_alloc);
 390 }
 391
 392 uint64_t
 393 metaslab_class_get_deferred(metaslab_class_t *mc)
 394 {
 395         return (mc->mc_deferred);
 396 }
 397
 398 uint64_t
 399 metaslab_class_get_space(metaslab_class_t *mc)
 400 {
 401         return (mc->mc_space);
 402 }
 403
 404 uint64_t
 405 metaslab_class_get_dspace(metaslab_class_t *mc)
 406 {
 407         return (spa_deflate(mc->mc_spa) ? mc->mc_dspace : mc->mc_space);
 408 }
 409
 410 void
 411 metaslab_class_histogram_verify(metaslab_class_t *mc)
 412 {
 413         spa_t *spa = mc->mc_spa;
 414         vdev_t *rvd = spa->spa_root_vdev;
 415         uint64_t *mc_hist;
 416         int i;
 417
 418         if ((zfs_flags & ZFS_DEBUG_HISTOGRAM_VERIFY) == 0)
 419                 return;
 420
 421         mc_hist = kmem_zalloc(sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE,
 422             KM_SLEEP);
 423
 424         for (int c = 0; c < rvd->vdev_children; c++) {
 425                 vdev_t *tvd = rvd->vdev_child[c];
 426                 metaslab_group_t *mg = tvd->vdev_mg;
 427
 428                 /*
 429                  * Skip any holes, uninitialized top-levels, or
 430                  * vdevs that are not in this metalab class.
 431                  */
 432                 if (!vdev_is_concrete(tvd) || tvd->vdev_ms_shift == 0 ||
 433                     mg->mg_class != mc) {
 434                         continue;
 435                 }
 436
 437                 for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++)
 438                         mc_hist[i] += mg->mg_histogram[i];
 439         }
 440
 441         for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++)
 442                 VERIFY3U(mc_hist[i], ==, mc->mc_histogram[i]);
 443
 444         kmem_free(mc_hist, sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE);
 445 }
 446
 447 /*
 448  * Calculate the metaslab class's fragmentation metric. The metric
 449  * is weighted based on the space contribution of each metaslab group.
 450  * The return value will be a number between 0 and 100 (inclusive), or
 451  * ZFS_FRAG_INVALID if the metric has not been set. See comment above the
 452  * zfs_frag_table for more information about the metric.
 453  */
 454 uint64_t
 455 metaslab_class_fragmentation(metaslab_class_t *mc)
 456 {
 457         vdev_t *rvd = mc->mc_spa->spa_root_vdev;
 458         uint64_t fragmentation = 0;
 459
 460         spa_config_enter(mc->mc_spa, SCL_VDEV, FTAG, RW_READER);
 461
 462         for (int c = 0; c < rvd->vdev_children; c++) {
 463                 vdev_t *tvd = rvd->vdev_child[c];
 464                 metaslab_group_t *mg = tvd->vdev_mg;
 465
 466                 /*
 467                  * Skip any holes, uninitialized top-levels,
 468                  * or vdevs that are not in this metalab class.
 469                  */
 470                 if (!vdev_is_concrete(tvd) || tvd->vdev_ms_shift == 0 ||
 471                     mg->mg_class != mc) {
 472                         continue;
 473                 }
 474
 475                 /*
 476                  * If a metaslab group does not contain a fragmentation
 477                  * metric then just bail out.
 478                  */
 479                 if (mg->mg_fragmentation == ZFS_FRAG_INVALID) {
 480                         spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG);
 481                         return (ZFS_FRAG_INVALID);
 482                 }
 483
 484                 /*
 485                  * Determine how much this metaslab_group is contributing
 486                  * to the overall pool fragmentation metric.
 487                  */
 488                 fragmentation += mg->mg_fragmentation *
 489                     metaslab_group_get_space(mg);
 490         }
 491         fragmentation /= metaslab_class_get_space(mc);
 492
 493         ASSERT3U(fragmentation, <=, 100);
 494         spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG);
 495         return (fragmentation);
 496 }
 497
 498 /*
 499  * Calculate the amount of expandable space that is available in
 500  * this metaslab class. If a device is expanded then its expandable
 501  * space will be the amount of allocatable space that is currently not
 502  * part of this metaslab class.
 503  */
 504 uint64_t
 505 metaslab_class_expandable_space(metaslab_class_t *mc)
 506 {
 507         vdev_t *rvd = mc->mc_spa->spa_root_vdev;
 508         uint64_t space = 0;
 509
 510         spa_config_enter(mc->mc_spa, SCL_VDEV, FTAG, RW_READER);
 511         for (int c = 0; c < rvd->vdev_children; c++) {
 512                 vdev_t *tvd = rvd->vdev_child[c];
 513                 metaslab_group_t *mg = tvd->vdev_mg;
 514
 515                 if (!vdev_is_concrete(tvd) || tvd->vdev_ms_shift == 0 ||
 516                     mg->mg_class != mc) {
 517                         continue;
 518                 }
 519
 520                 /*
 521                  * Calculate if we have enough space to add additional
 522                  * metaslabs. We report the expandable space in terms
 523                  * of the metaslab size since that's the unit of expansion.
 524                  */
 525                 space += P2ALIGN(tvd->vdev_max_asize - tvd->vdev_asize,
 526                     1ULL << tvd->vdev_ms_shift);
 527         }
 528         spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG);
 529         return (space);
 530 }
 531
 532 void
 533 metaslab_class_evict_old(metaslab_class_t *mc, uint64_t txg)
 534 {
 535         multilist_t *ml = mc->mc_metaslab_txg_list;
 536         for (int i = 0; i < multilist_get_num_sublists(ml); i++) {
 537                 multilist_sublist_t *mls = multilist_sublist_lock(ml, i);
 538                 metaslab_t *msp = multilist_sublist_head(mls);
 539                 multilist_sublist_unlock(mls);
 540                 while (msp != NULL) {
 541                         mutex_enter(&msp->ms_lock);
 542                         /*
 543                          * Once we've hit a metaslab selected too recently to
 544                          * evict, we're done evicting for now.
 545                          */
 546                         if (msp->ms_selected_txg + metaslab_unload_delay >=
 547                             txg) {
 548                                 mutex_exit(&msp->ms_lock);
 549                                 break;
 550                         }
 551
 552                         /*
 553                          * If the metaslab has been removed from the list
 554                          * (which could happen if we were at the memory limit
 555                          * and it was evicted during this loop), then we can't
 556                          * proceed and we should restart the sublist.
 557                          */
 558                         if (!multilist_link_active(&msp->ms_class_txg_node)) {
 559                                 mutex_exit(&msp->ms_lock);
 560                                 i--;
 561                                 break;
 562                         }
 563                         mls = multilist_sublist_lock(ml, i);
 564                         metaslab_t *next_msp = multilist_sublist_next(mls, msp);
 565                         multilist_sublist_unlock(mls);
 566                         metaslab_evict(msp, txg);
 567                         mutex_exit(&msp->ms_lock);
 568                         msp = next_msp;
 569                 }
 570         }
 571 }
 572
 573 static int
 574 metaslab_compare(const void *x1, const void *x2)
 575 {
 576         const metaslab_t *m1 = (const metaslab_t *)x1;
 577         const metaslab_t *m2 = (const metaslab_t *)x2;
 578
 579         int sort1 = 0;
 580         int sort2 = 0;
 581         if (m1->ms_allocator != -1 && m1->ms_primary)
 582                 sort1 = 1;
 583         else if (m1->ms_allocator != -1 && !m1->ms_primary)
 584                 sort1 = 2;
 585         if (m2->ms_allocator != -1 && m2->ms_primary)
 586                 sort2 = 1;
 587         else if (m2->ms_allocator != -1 && !m2->ms_primary)
 588                 sort2 = 2;
 589
 590         /*
 591          * Sort inactive metaslabs first, then primaries, then secondaries. When
 592          * selecting a metaslab to allocate from, an allocator first tries its
 593          * primary, then secondary active metaslab. If it doesn't have active
 594          * metaslabs, or can't allocate from them, it searches for an inactive
 595          * metaslab to activate. If it can't find a suitable one, it will steal
 596          * a primary or secondary metaslab from another allocator.
 597          */
 598         if (sort1 < sort2)
 599                 return (-1);
 600         if (sort1 > sort2)
 601                 return (1);
 602
 603         int cmp = AVL_CMP(m2->ms_weight, m1->ms_weight);
 604         if (likely(cmp))
 605                 return (cmp);
 606
 607         IMPLY(AVL_CMP(m1->ms_start, m2->ms_start) == 0, m1 == m2);
 608
 609         return (AVL_CMP(m1->ms_start, m2->ms_start));
 610 }
 611
 612 /*
 613  * ==========================================================================
 614  * Metaslab groups
 615  * ==========================================================================
 616  */
 617 /*
 618  * Update the allocatable flag and the metaslab group's capacity.
 619  * The allocatable flag is set to true if the capacity is below
 620  * the zfs_mg_noalloc_threshold or has a fragmentation value that is
 621  * greater than zfs_mg_fragmentation_threshold. If a metaslab group
 622  * transitions from allocatable to non-allocatable or vice versa then the
 623  * metaslab group's class is updated to reflect the transition.
 624  */
 625 static void
 626 metaslab_group_alloc_update(metaslab_group_t *mg)
 627 {
 628         vdev_t *vd = mg->mg_vd;
 629         metaslab_class_t *mc = mg->mg_class;
 630         vdev_stat_t *vs = &vd->vdev_stat;
 631         boolean_t was_allocatable;
 632         boolean_t was_initialized;
 633
 634         ASSERT(vd == vd->vdev_top);
 635         ASSERT3U(spa_config_held(mc->mc_spa, SCL_ALLOC, RW_READER), ==,
 636             SCL_ALLOC);
 637
 638         mutex_enter(&mg->mg_lock);
 639         was_allocatable = mg->mg_allocatable;
 640         was_initialized = mg->mg_initialized;
 641
 642         mg->mg_free_capacity = ((vs->vs_space - vs->vs_alloc) * 100) /
 643             (vs->vs_space + 1);
 644
 645         mutex_enter(&mc->mc_lock);
 646
 647         /*
 648          * If the metaslab group was just added then it won't
 649          * have any space until we finish syncing out this txg.
 650          * At that point we will consider it initialized and available
 651          * for allocations.  We also don't consider non-activated
 652          * metaslab groups (e.g. vdevs that are in the middle of being removed)
 653          * to be initialized, because they can't be used for allocation.
 654          */
 655         mg->mg_initialized = metaslab_group_initialized(mg);
 656         if (!was_initialized && mg->mg_initialized) {
 657                 mc->mc_groups++;
 658         } else if (was_initialized && !mg->mg_initialized) {
 659                 ASSERT3U(mc->mc_groups, >, 0);
 660                 mc->mc_groups--;
 661         }
 662         if (mg->mg_initialized)
 663                 mg->mg_no_free_space = B_FALSE;
 664
 665         /*
 666          * A metaslab group is considered allocatable if it has plenty
 667          * of free space or is not heavily fragmented. We only take
 668          * fragmentation into account if the metaslab group has a valid
 669          * fragmentation metric (i.e. a value between 0 and 100).
 670          */
 671         mg->mg_allocatable = (mg->mg_activation_count > 0 &&
 672             mg->mg_free_capacity > zfs_mg_noalloc_threshold &&
 673             (mg->mg_fragmentation == ZFS_FRAG_INVALID ||
 674             mg->mg_fragmentation <= zfs_mg_fragmentation_threshold));
 675
 676         /*
 677          * The mc_alloc_groups maintains a count of the number of
 678          * groups in this metaslab class that are still above the
 679          * zfs_mg_noalloc_threshold. This is used by the allocating
 680          * threads to determine if they should avoid allocations to
 681          * a given group. The allocator will avoid allocations to a group
 682          * if that group has reached or is below the zfs_mg_noalloc_threshold
 683          * and there are still other groups that are above the threshold.
 684          * When a group transitions from allocatable to non-allocatable or
 685          * vice versa we update the metaslab class to reflect that change.
 686          * When the mc_alloc_groups value drops to 0 that means that all
 687          * groups have reached the zfs_mg_noalloc_threshold making all groups
 688          * eligible for allocations. This effectively means that all devices
 689          * are balanced again.
 690          */
 691         if (was_allocatable && !mg->mg_allocatable)
 692                 mc->mc_alloc_groups--;
 693         else if (!was_allocatable && mg->mg_allocatable)
 694                 mc->mc_alloc_groups++;
 695         mutex_exit(&mc->mc_lock);
 696
 697         mutex_exit(&mg->mg_lock);
 698 }
 699
 700 int
 701 metaslab_sort_by_flushed(const void *va, const void *vb)
 702 {
 703         const metaslab_t *a = va;
 704         const metaslab_t *b = vb;
 705
 706         int cmp = AVL_CMP(a->ms_unflushed_txg, b->ms_unflushed_txg);
 707         if (likely(cmp))
 708                 return (cmp);
 709
 710         uint64_t a_vdev_id = a->ms_group->mg_vd->vdev_id;
 711         uint64_t b_vdev_id = b->ms_group->mg_vd->vdev_id;
 712         cmp = AVL_CMP(a_vdev_id, b_vdev_id);
 713         if (cmp)
 714                 return (cmp);
 715
 716         return (AVL_CMP(a->ms_id, b->ms_id));
 717 }
 718
 719 metaslab_group_t *
 720 metaslab_group_create(metaslab_class_t *mc, vdev_t *vd, int allocators)
 721 {
 722         metaslab_group_t *mg;
 723
 724         mg = kmem_zalloc(sizeof (metaslab_group_t), KM_SLEEP);
 725         mutex_init(&mg->mg_lock, NULL, MUTEX_DEFAULT, NULL);
 726         mutex_init(&mg->mg_ms_disabled_lock, NULL, MUTEX_DEFAULT, NULL);
 727         cv_init(&mg->mg_ms_disabled_cv, NULL, CV_DEFAULT, NULL);
 728         mg->mg_primaries = kmem_zalloc(allocators * sizeof (metaslab_t *),
 729             KM_SLEEP);
 730         mg->mg_secondaries = kmem_zalloc(allocators * sizeof (metaslab_t *),
 731             KM_SLEEP);
 732         avl_create(&mg->mg_metaslab_tree, metaslab_compare,
 733             sizeof (metaslab_t), offsetof(metaslab_t, ms_group_node));
 734         mg->mg_vd = vd;
 735         mg->mg_class = mc;
 736         mg->mg_activation_count = 0;
 737         mg->mg_initialized = B_FALSE;
 738         mg->mg_no_free_space = B_TRUE;
 739         mg->mg_allocators = allocators;
 740
 741         mg->mg_alloc_queue_depth = kmem_zalloc(allocators *
 742             sizeof (zfs_refcount_t), KM_SLEEP);
 743         mg->mg_cur_max_alloc_queue_depth = kmem_zalloc(allocators *
 744             sizeof (uint64_t), KM_SLEEP);
 745         for (int i = 0; i < allocators; i++) {
 746                 zfs_refcount_create_tracked(&mg->mg_alloc_queue_depth[i]);
 747                 mg->mg_cur_max_alloc_queue_depth[i] = 0;
 748         }
 749
 750         mg->mg_taskq = taskq_create("metaslab_group_taskq", metaslab_load_pct,
 751             maxclsyspri, 10, INT_MAX, TASKQ_THREADS_CPU_PCT | TASKQ_DYNAMIC);
 752
 753         return (mg);
 754 }
 755
 756 void
 757 metaslab_group_destroy(metaslab_group_t *mg)
 758 {
 759         ASSERT(mg->mg_prev == NULL);
 760         ASSERT(mg->mg_next == NULL);
 761         /*
 762          * We may have gone below zero with the activation count
 763          * either because we never activated in the first place or
 764          * because we're done, and possibly removing the vdev.
 765          */
 766         ASSERT(mg->mg_activation_count <= 0);
 767
 768         taskq_destroy(mg->mg_taskq);
 769         avl_destroy(&mg->mg_metaslab_tree);
 770         kmem_free(mg->mg_primaries, mg->mg_allocators * sizeof (metaslab_t *));
 771         kmem_free(mg->mg_secondaries, mg->mg_allocators *
 772             sizeof (metaslab_t *));
 773         mutex_destroy(&mg->mg_lock);
 774         mutex_destroy(&mg->mg_ms_disabled_lock);
 775         cv_destroy(&mg->mg_ms_disabled_cv);
 776
 777         for (int i = 0; i < mg->mg_allocators; i++) {
 778                 zfs_refcount_destroy(&mg->mg_alloc_queue_depth[i]);
 779                 mg->mg_cur_max_alloc_queue_depth[i] = 0;
 780         }
 781         kmem_free(mg->mg_alloc_queue_depth, mg->mg_allocators *
 782             sizeof (zfs_refcount_t));
 783         kmem_free(mg->mg_cur_max_alloc_queue_depth, mg->mg_allocators *
 784             sizeof (uint64_t));
 785
 786         kmem_free(mg, sizeof (metaslab_group_t));
 787 }
 788
 789 void
 790 metaslab_group_activate(metaslab_group_t *mg)
 791 {
 792         metaslab_class_t *mc = mg->mg_class;
 793         metaslab_group_t *mgprev, *mgnext;
 794
 795         ASSERT3U(spa_config_held(mc->mc_spa, SCL_ALLOC, RW_WRITER), !=, 0);
 796
 797         ASSERT(mc->mc_rotor != mg);
 798         ASSERT(mg->mg_prev == NULL);
 799         ASSERT(mg->mg_next == NULL);
 800         ASSERT(mg->mg_activation_count <= 0);
 801
 802         if (++mg->mg_activation_count <= 0)
 803                 return;
 804
 805         mg->mg_aliquot = metaslab_aliquot * MAX(1, mg->mg_vd->vdev_children);
 806         metaslab_group_alloc_update(mg);
 807
 808         if ((mgprev = mc->mc_rotor) == NULL) {
 809                 mg->mg_prev = mg;
 810                 mg->mg_next = mg;
 811         } else {
 812                 mgnext = mgprev->mg_next;
 813                 mg->mg_prev = mgprev;
 814                 mg->mg_next = mgnext;
 815                 mgprev->mg_next = mg;
 816                 mgnext->mg_prev = mg;
 817         }
 818         mc->mc_rotor = mg;
 819 }
 820
 821 /*
 822  * Passivate a metaslab group and remove it from the allocation rotor.
 823  * Callers must hold both the SCL_ALLOC and SCL_ZIO lock prior to passivating
 824  * a metaslab group. This function will momentarily drop spa_config_locks
 825  * that are lower than the SCL_ALLOC lock (see comment below).
 826  */
 827 void
 828 metaslab_group_passivate(metaslab_group_t *mg)
 829 {
 830         metaslab_class_t *mc = mg->mg_class;
 831         spa_t *spa = mc->mc_spa;
 832         metaslab_group_t *mgprev, *mgnext;
 833         int locks = spa_config_held(spa, SCL_ALL, RW_WRITER);
 834
 835         ASSERT3U(spa_config_held(spa, SCL_ALLOC | SCL_ZIO, RW_WRITER), ==,
 836             (SCL_ALLOC | SCL_ZIO));
 837
 838         if (--mg->mg_activation_count != 0) {
 839                 ASSERT(mc->mc_rotor != mg);
 840                 ASSERT(mg->mg_prev == NULL);
 841                 ASSERT(mg->mg_next == NULL);
 842                 ASSERT(mg->mg_activation_count < 0);
 843                 return;
 844         }
 845
 846         /*
 847          * The spa_config_lock is an array of rwlocks, ordered as
 848          * follows (from highest to lowest):
 849          *      SCL_CONFIG > SCL_STATE > SCL_L2ARC > SCL_ALLOC >
 850          *      SCL_ZIO > SCL_FREE > SCL_VDEV
 851          * (For more information about the spa_config_lock see spa_misc.c)
 852          * The higher the lock, the broader its coverage. When we passivate
 853          * a metaslab group, we must hold both the SCL_ALLOC and the SCL_ZIO
 854          * config locks. However, the metaslab group's taskq might be trying
 855          * to preload metaslabs so we must drop the SCL_ZIO lock and any
 856          * lower locks to allow the I/O to complete. At a minimum,
 857          * we continue to hold the SCL_ALLOC lock, which prevents any future
 858          * allocations from taking place and any changes to the vdev tree.
 859          */
 860         spa_config_exit(spa, locks & ~(SCL_ZIO - 1), spa);
 861         taskq_wait_outstanding(mg->mg_taskq, 0);
 862         spa_config_enter(spa, locks & ~(SCL_ZIO - 1), spa, RW_WRITER);
 863         metaslab_group_alloc_update(mg);
 864         for (int i = 0; i < mg->mg_allocators; i++) {
 865                 metaslab_t *msp = mg->mg_primaries[i];
 866                 if (msp != NULL) {
 867                         mutex_enter(&msp->ms_lock);
 868                         metaslab_passivate(msp,
 869                             metaslab_weight_from_range_tree(msp));
 870                         mutex_exit(&msp->ms_lock);
 871                 }
 872                 msp = mg->mg_secondaries[i];
 873                 if (msp != NULL) {
 874                         mutex_enter(&msp->ms_lock);
 875                         metaslab_passivate(msp,
 876                             metaslab_weight_from_range_tree(msp));
 877                         mutex_exit(&msp->ms_lock);
 878                 }
 879         }
 880
 881         mgprev = mg->mg_prev;
 882         mgnext = mg->mg_next;
 883
 884         if (mg == mgnext) {
 885                 mc->mc_rotor = NULL;
 886         } else {
 887                 mc->mc_rotor = mgnext;
 888                 mgprev->mg_next = mgnext;
 889                 mgnext->mg_prev = mgprev;
 890         }
 891
 892         mg->mg_prev = NULL;
 893         mg->mg_next = NULL;
 894 }
 895
 896 boolean_t
 897 metaslab_group_initialized(metaslab_group_t *mg)
 898 {
 899         vdev_t *vd = mg->mg_vd;
 900         vdev_stat_t *vs = &vd->vdev_stat;
 901
 902         return (vs->vs_space != 0 && mg->mg_activation_count > 0);
 903 }
 904
 905 uint64_t
 906 metaslab_group_get_space(metaslab_group_t *mg)
 907 {
 908         return ((1ULL << mg->mg_vd->vdev_ms_shift) * mg->mg_vd->vdev_ms_count);
 909 }
 910
 911 void
 912 metaslab_group_histogram_verify(metaslab_group_t *mg)
 913 {
 914         uint64_t *mg_hist;
 915         vdev_t *vd = mg->mg_vd;
 916         uint64_t ashift = vd->vdev_ashift;
 917         int i;
 918
 919         if ((zfs_flags & ZFS_DEBUG_HISTOGRAM_VERIFY) == 0)
 920                 return;
 921
 922         mg_hist = kmem_zalloc(sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE,
 923             KM_SLEEP);
 924
 925         ASSERT3U(RANGE_TREE_HISTOGRAM_SIZE, >=,
 926             SPACE_MAP_HISTOGRAM_SIZE + ashift);
 927
 928         for (int m = 0; m < vd->vdev_ms_count; m++) {
 929                 metaslab_t *msp = vd->vdev_ms[m];
 930
 931                 /* skip if not active or not a member */
 932                 if (msp->ms_sm == NULL || msp->ms_group != mg)
 933                         continue;
 934
 935                 for (i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++)
 936                         mg_hist[i + ashift] +=
 937                             msp->ms_sm->sm_phys->smp_histogram[i];
 938         }
 939
 940         for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i ++)
 941                 VERIFY3U(mg_hist[i], ==, mg->mg_histogram[i]);
 942
 943         kmem_free(mg_hist, sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE);
 944 }
 945
 946 static void
 947 metaslab_group_histogram_add(metaslab_group_t *mg, metaslab_t *msp)
 948 {
 949         metaslab_class_t *mc = mg->mg_class;
 950         uint64_t ashift = mg->mg_vd->vdev_ashift;
 951
 952         ASSERT(MUTEX_HELD(&msp->ms_lock));
 953         if (msp->ms_sm == NULL)
 954                 return;
 955
 956         mutex_enter(&mg->mg_lock);
 957         for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) {
 958                 mg->mg_histogram[i + ashift] +=
 959                     msp->ms_sm->sm_phys->smp_histogram[i];
 960                 mc->mc_histogram[i + ashift] +=
 961                     msp->ms_sm->sm_phys->smp_histogram[i];
 962         }
 963         mutex_exit(&mg->mg_lock);
 964 }
 965
 966 void
 967 metaslab_group_histogram_remove(metaslab_group_t *mg, metaslab_t *msp)
 968 {
 969         metaslab_class_t *mc = mg->mg_class;
 970         uint64_t ashift = mg->mg_vd->vdev_ashift;
 971
 972         ASSERT(MUTEX_HELD(&msp->ms_lock));
 973         if (msp->ms_sm == NULL)
 974                 return;
 975
 976         mutex_enter(&mg->mg_lock);
 977         for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) {
 978                 ASSERT3U(mg->mg_histogram[i + ashift], >=,
 979                     msp->ms_sm->sm_phys->smp_histogram[i]);
 980                 ASSERT3U(mc->mc_histogram[i + ashift], >=,
 981                     msp->ms_sm->sm_phys->smp_histogram[i]);
 982
 983                 mg->mg_histogram[i + ashift] -=
 984                     msp->ms_sm->sm_phys->smp_histogram[i];
 985                 mc->mc_histogram[i + ashift] -=
 986                     msp->ms_sm->sm_phys->smp_histogram[i];
 987         }
 988         mutex_exit(&mg->mg_lock);
 989 }
 990
 991 static void
 992 metaslab_group_add(metaslab_group_t *mg, metaslab_t *msp)
 993 {
 994         ASSERT(msp->ms_group == NULL);
 995         mutex_enter(&mg->mg_lock);
 996         msp->ms_group = mg;
 997         msp->ms_weight = 0;
 998         avl_add(&mg->mg_metaslab_tree, msp);
 999         mutex_exit(&mg->mg_lock);
1000
1001         mutex_enter(&msp->ms_lock);
1002         metaslab_group_histogram_add(mg, msp);
1003         mutex_exit(&msp->ms_lock);
1004 }
1005
1006 static void
1007 metaslab_group_remove(metaslab_group_t *mg, metaslab_t *msp)
1008 {
1009         mutex_enter(&msp->ms_lock);
1010         metaslab_group_histogram_remove(mg, msp);
1011         mutex_exit(&msp->ms_lock);
1012
1013         mutex_enter(&mg->mg_lock);
1014         ASSERT(msp->ms_group == mg);
1015         avl_remove(&mg->mg_metaslab_tree, msp);
1016
1017         metaslab_class_t *mc = msp->ms_group->mg_class;
1018         multilist_sublist_t *mls =
1019             multilist_sublist_lock_obj(mc->mc_metaslab_txg_list, msp);
1020         if (multilist_link_active(&msp->ms_class_txg_node))
1021                 multilist_sublist_remove(mls, msp);
1022         multilist_sublist_unlock(mls);
1023
1024         msp->ms_group = NULL;
1025         mutex_exit(&mg->mg_lock);
1026 }
1027
1028 static void
1029 metaslab_group_sort_impl(metaslab_group_t *mg, metaslab_t *msp, uint64_t weight)
1030 {
1031         ASSERT(MUTEX_HELD(&msp->ms_lock));
1032         ASSERT(MUTEX_HELD(&mg->mg_lock));
1033         ASSERT(msp->ms_group == mg);
1034
1035         avl_remove(&mg->mg_metaslab_tree, msp);
1036         msp->ms_weight = weight;
1037         avl_add(&mg->mg_metaslab_tree, msp);
1038
1039 }
1040
1041 static void
1042 metaslab_group_sort(metaslab_group_t *mg, metaslab_t *msp, uint64_t weight)
1043 {
1044         /*
1045          * Although in principle the weight can be any value, in
1046          * practice we do not use values in the range [1, 511].
1047          */
1048         ASSERT(weight >= SPA_MINBLOCKSIZE || weight == 0);
1049         ASSERT(MUTEX_HELD(&msp->ms_lock));
1050
1051         mutex_enter(&mg->mg_lock);
1052         metaslab_group_sort_impl(mg, msp, weight);
1053         mutex_exit(&mg->mg_lock);
1054 }
1055
1056 /*
1057  * Calculate the fragmentation for a given metaslab group. We can use
1058  * a simple average here since all metaslabs within the group must have
1059  * the same size. The return value will be a value between 0 and 100
1060  * (inclusive), or ZFS_FRAG_INVALID if less than half of the metaslab in this
1061  * group have a fragmentation metric.
1062  */
1063 uint64_t
1064 metaslab_group_fragmentation(metaslab_group_t *mg)
1065 {
1066         vdev_t *vd = mg->mg_vd;
1067         uint64_t fragmentation = 0;
1068         uint64_t valid_ms = 0;
1069
1070         for (int m = 0; m < vd->vdev_ms_count; m++) {
1071                 metaslab_t *msp = vd->vdev_ms[m];
1072
1073                 if (msp->ms_fragmentation == ZFS_FRAG_INVALID)
1074                         continue;
1075                 if (msp->ms_group != mg)
1076                         continue;
1077
1078                 valid_ms++;
1079                 fragmentation += msp->ms_fragmentation;
1080         }
1081
1082         if (valid_ms <= mg->mg_vd->vdev_ms_count / 2)
1083                 return (ZFS_FRAG_INVALID);
1084
1085         fragmentation /= valid_ms;
1086         ASSERT3U(fragmentation, <=, 100);
1087         return (fragmentation);
1088 }
1089
1090 /*
1091  * Determine if a given metaslab group should skip allocations. A metaslab
1092  * group should avoid allocations if its free capacity is less than the
1093  * zfs_mg_noalloc_threshold or its fragmentation metric is greater than
1094  * zfs_mg_fragmentation_threshold and there is at least one metaslab group
1095  * that can still handle allocations. If the allocation throttle is enabled
1096  * then we skip allocations to devices that have reached their maximum
1097  * allocation queue depth unless the selected metaslab group is the only
1098  * eligible group remaining.
1099  */
1100 static boolean_t
1101 metaslab_group_allocatable(metaslab_group_t *mg, metaslab_group_t *rotor,
1102     uint64_t psize, int allocator, int d)
1103 {
1104         spa_t *spa = mg->mg_vd->vdev_spa;
1105         metaslab_class_t *mc = mg->mg_class;
1106
1107         /*
1108          * We can only consider skipping this metaslab group if it's
1109          * in the normal metaslab class and there are other metaslab
1110          * groups to select from. Otherwise, we always consider it eligible
1111          * for allocations.
1112          */
1113         if ((mc != spa_normal_class(spa) &&
1114             mc != spa_special_class(spa) &&
1115             mc != spa_dedup_class(spa)) ||
1116             mc->mc_groups <= 1)
1117                 return (B_TRUE);
1118
1119         /*
1120          * If the metaslab group's mg_allocatable flag is set (see comments
1121          * in metaslab_group_alloc_update() for more information) and
1122          * the allocation throttle is disabled then allow allocations to this
1123          * device. However, if the allocation throttle is enabled then
1124          * check if we have reached our allocation limit (mg_alloc_queue_depth)
1125          * to determine if we should allow allocations to this metaslab group.
1126          * If all metaslab groups are no longer considered allocatable
1127          * (mc_alloc_groups == 0) or we're trying to allocate the smallest
1128          * gang block size then we allow allocations on this metaslab group
1129          * regardless of the mg_allocatable or throttle settings.
1130          */
1131         if (mg->mg_allocatable) {
1132                 metaslab_group_t *mgp;
1133                 int64_t qdepth;
1134                 uint64_t qmax = mg->mg_cur_max_alloc_queue_depth[allocator];
1135
1136                 if (!mc->mc_alloc_throttle_enabled)
1137                         return (B_TRUE);
1138
1139                 /*
1140                  * If this metaslab group does not have any free space, then
1141                  * there is no point in looking further.
1142                  */
1143                 if (mg->mg_no_free_space)
1144                         return (B_FALSE);
1145
1146                 /*
1147                  * Relax allocation throttling for ditto blocks.  Due to
1148                  * random imbalances in allocation it tends to push copies
1149                  * to one vdev, that looks a bit better at the moment.
1150                  */
1151                 qmax = qmax * (4 + d) / 4;
1152
1153                 qdepth = zfs_refcount_count(
1154                     &mg->mg_alloc_queue_depth[allocator]);
1155
1156                 /*
1157                  * If this metaslab group is below its qmax or it's
1158                  * the only allocatable metasable group, then attempt
1159                  * to allocate from it.
1160                  */
1161                 if (qdepth < qmax || mc->mc_alloc_groups == 1)
1162                         return (B_TRUE);
1163                 ASSERT3U(mc->mc_alloc_groups, >, 1);
1164
1165                 /*
1166                  * Since this metaslab group is at or over its qmax, we
1167                  * need to determine if there are metaslab groups after this
1168                  * one that might be able to handle this allocation. This is
1169                  * racy since we can't hold the locks for all metaslab
1170                  * groups at the same time when we make this check.
1171                  */
1172                 for (mgp = mg->mg_next; mgp != rotor; mgp = mgp->mg_next) {
1173                         qmax = mgp->mg_cur_max_alloc_queue_depth[allocator];
1174                         qmax = qmax * (4 + d) / 4;
1175                         qdepth = zfs_refcount_count(
1176                             &mgp->mg_alloc_queue_depth[allocator]);
1177
1178                         /*
1179                          * If there is another metaslab group that
1180                          * might be able to handle the allocation, then
1181                          * we return false so that we skip this group.
1182                          */
1183                         if (qdepth < qmax && !mgp->mg_no_free_space)
1184                                 return (B_FALSE);
1185                 }
1186
1187                 /*
1188                  * We didn't find another group to handle the allocation
1189                  * so we can't skip this metaslab group even though
1190                  * we are at or over our qmax.
1191                  */
1192                 return (B_TRUE);
1193
1194         } else if (mc->mc_alloc_groups == 0 || psize == SPA_MINBLOCKSIZE) {
1195                 return (B_TRUE);
1196         }
1197         return (B_FALSE);
1198 }
1199
1200 /*
1201  * ==========================================================================
1202  * Range tree callbacks
1203  * ==========================================================================
1204  */
1205
1206 /*
1207  * Comparison function for the private size-ordered tree. Tree is sorted
1208  * by size, larger sizes at the end of the tree.
1209  */
1210 static int
1211 metaslab_rangesize_compare(const void *x1, const void *x2)
1212 {
1213         const range_seg_t *r1 = x1;
1214         const range_seg_t *r2 = x2;
1215         uint64_t rs_size1 = r1->rs_end - r1->rs_start;
1216         uint64_t rs_size2 = r2->rs_end - r2->rs_start;
1217
1218         int cmp = AVL_CMP(rs_size1, rs_size2);
1219         if (likely(cmp))
1220                 return (cmp);
1221
1222         return (AVL_CMP(r1->rs_start, r2->rs_start));
1223 }
1224
1225 /*
1226  * ==========================================================================
1227  * Common allocator routines
1228  * ==========================================================================
1229  */
1230
1231 /*
1232  * Return the maximum contiguous segment within the metaslab.
1233  */
1234 uint64_t
1235 metaslab_largest_allocatable(metaslab_t *msp)
1236 {
1237         avl_tree_t *t = &msp->ms_allocatable_by_size;
1238         range_seg_t *rs;
1239
1240         if (t == NULL)
1241                 return (0);
1242         rs = avl_last(t);
1243         if (rs == NULL)
1244                 return (0);
1245
1246         return (rs->rs_end - rs->rs_start);
1247 }
1248
1249 /*
1250  * Return the maximum contiguous segment within the unflushed frees of this
1251  * metaslab.
1252  */
1253 uint64_t
1254 metaslab_largest_unflushed_free(metaslab_t *msp)
1255 {
1256         ASSERT(MUTEX_HELD(&msp->ms_lock));
1257
1258         if (msp->ms_unflushed_frees == NULL)
1259                 return (0);
1260
1261         range_seg_t *rs = avl_last(&msp->ms_unflushed_frees_by_size);
1262         if (rs == NULL)
1263                 return (0);
1264
1265         /*
1266          * When a range is freed from the metaslab, that range is added to
1267          * both the unflushed frees and the deferred frees. While the block
1268          * will eventually be usable, if the metaslab were loaded the range
1269          * would not be added to the ms_allocatable tree until TXG_DEFER_SIZE
1270          * txgs had passed.  As a result, when attempting to estimate an upper
1271          * bound for the largest currently-usable free segment in the
1272          * metaslab, we need to not consider any ranges currently in the defer
1273          * trees. This algorithm approximates the largest available chunk in
1274          * the largest range in the unflushed_frees tree by taking the first
1275          * chunk.  While this may be a poor estimate, it should only remain so
1276          * briefly and should eventually self-correct as frees are no longer
1277          * deferred. Similar logic applies to the ms_freed tree. See
1278          * metaslab_load() for more details.
1279          *
1280          * There are two primary sources of innacuracy in this estimate. Both
1281          * are tolerated for performance reasons. The first source is that we
1282          * only check the largest segment for overlaps. Smaller segments may
1283          * have more favorable overlaps with the other trees, resulting in
1284          * larger usable chunks.  Second, we only look at the first chunk in
1285          * the largest segment; there may be other usable chunks in the
1286          * largest segment, but we ignore them.
1287          */
1288         uint64_t rstart = rs->rs_start;
1289         uint64_t rsize = rs->rs_end - rstart;
1290         for (int t = 0; t < TXG_DEFER_SIZE; t++) {
1291                 uint64_t start = 0;
1292                 uint64_t size = 0;
1293                 boolean_t found = range_tree_find_in(msp->ms_defer[t], rstart,
1294                     rsize, &start, &size);
1295                 if (found) {
1296                         if (rstart == start)
1297                                 return (0);
1298                         rsize = start - rstart;
1299                 }
1300         }
1301
1302         uint64_t start = 0;
1303         uint64_t size = 0;
1304         boolean_t found = range_tree_find_in(msp->ms_freed, rstart,
1305             rsize, &start, &size);
1306         if (found)
1307                 rsize = start - rstart;
1308
1309         return (rsize);
1310 }
1311
1312 static range_seg_t *
1313 metaslab_block_find(avl_tree_t *t, uint64_t start, uint64_t size)
1314 {
1315         range_seg_t *rs, rsearch;
1316         avl_index_t where;
1317
1318         rsearch.rs_start = start;
1319         rsearch.rs_end = start + size;
1320
1321         rs = avl_find(t, &rsearch, &where);
1322         if (rs == NULL) {
1323                 rs = avl_nearest(t, where, AVL_AFTER);
1324         }
1325
1326         return (rs);
1327 }
1328
#if defined(WITH_DF_BLOCK_ALLOCATOR) || \
    defined(WITH_CF_BLOCK_ALLOCATOR)
/*
 * This is a helper function that can be used by the allocator to find
 * a suitable block to allocate. Starting from the segment located via
 * *cursor, it walks the AVL tree forward looking for the first segment
 * that can hold size bytes, and gives up once a segment starts more than
 * max_search bytes past the first segment examined.
 *
 * On success the offset of the chosen block is returned and *cursor is
 * advanced past it.  On failure *cursor is reset to 0 and -1ULL is
 * returned.
 */
static uint64_t
metaslab_block_picker(avl_tree_t *t, uint64_t *cursor, uint64_t size,
    uint64_t max_search)
{
        range_seg_t *rs = metaslab_block_find(t, *cursor, size);

        if (rs != NULL) {
                const uint64_t first_found = rs->rs_start;

                for (; rs != NULL && rs->rs_start - first_found <= max_search;
                    rs = AVL_NEXT(t, rs)) {
                        uint64_t offset = rs->rs_start;

                        if (offset + size <= rs->rs_end) {
                                *cursor = offset + size;
                                return (offset);
                        }
                }
        }

        *cursor = 0;
        return (-1ULL);
}
#endif /* WITH_DF/CF_BLOCK_ALLOCATOR */
1359
#if defined(WITH_DF_BLOCK_ALLOCATOR)
/*
 * ==========================================================================
 * Dynamic Fit (df) block allocator
 *
 * Search for a free chunk of at least this size, starting from the last
 * offset (for this alignment of block) looking for up to
 * metaslab_df_max_search bytes (16MB).  If a large enough free chunk is not
 * found within 16MB, then return a free chunk of exactly the requested size (or
 * larger).
 *
 * If it seems like searching from the last offset will be unproductive, skip
 * that and just return a free chunk of exactly the requested size (or larger).
 * This is based on metaslab_df_alloc_threshold and metaslab_df_free_pct.  This
 * mechanism is probably not very useful and may be removed in the future.
 *
 * The behavior when not searching can be changed to return the largest free
 * chunk, instead of a free chunk of exactly the requested size, by setting
 * metaslab_df_use_largest_segment.
 * ==========================================================================
 */
static uint64_t
metaslab_df_alloc(metaslab_t *msp, uint64_t size)
{
        /*
         * Find the largest power of 2 block size that evenly divides the
         * requested size. This is used to try to allocate blocks with similar
         * alignment from the same area of the metaslab (i.e. same cursor
         * bucket), but it does not prevent allocations of other sizes from
         * landing in the same region.
         */
        uint64_t align = size & -size;
        uint64_t *cursor = &msp->ms_lbas[highbit64(align) - 1];
        range_tree_t *rt = msp->ms_allocatable;
        int free_pct = range_tree_space(rt) * 100 / msp->ms_size;

        ASSERT(MUTEX_HELD(&msp->ms_lock));
        ASSERT3U(avl_numnodes(&rt->rt_root), ==,
            avl_numnodes(&msp->ms_allocatable_by_size));

        /*
         * If we're running low on space, find a segment based on size,
         * rather than iterating based on offset.
         */
        boolean_t skip_search =
            (metaslab_largest_allocatable(msp) < metaslab_df_alloc_threshold ||
            free_pct < metaslab_df_free_pct);

        uint64_t offset = -1ULL;
        if (!skip_search) {
                offset = metaslab_block_picker(&rt->rt_root,
                    cursor, size, metaslab_df_max_search);
        }
        if (offset != -1ULL)
                return (offset);

        /* No luck (or no attempt) by offset: fall back to the size tree. */
        range_seg_t *rs;
        if (metaslab_df_use_largest_segment) {
                /* use largest free segment */
                rs = avl_last(&msp->ms_allocatable_by_size);
        } else {
                /* use segment of this size, or next largest */
                rs = metaslab_block_find(&msp->ms_allocatable_by_size,
                    0, size);
        }

        if (rs != NULL && rs->rs_start + size <= rs->rs_end) {
                offset = rs->rs_start;
                *cursor = offset + size;
        }

        return (offset);
}

/* Allocator operations table for the dynamic fit allocator. */
static metaslab_ops_t metaslab_df_ops = {
        metaslab_df_alloc
};

metaslab_ops_t *zfs_metaslab_ops = &metaslab_df_ops;
#endif /* WITH_DF_BLOCK_ALLOCATOR */
1438
#if defined(WITH_CF_BLOCK_ALLOCATOR)
/*
 * ==========================================================================
 * Cursor fit block allocator -
 * Select the largest region in the metaslab, set the cursor to the beginning
 * of the range and the cursor_end to the end of the range. As allocations
 * are made advance the cursor. Continue allocating from the cursor until
 * the range is exhausted and then find a new range.
 *
 * The cursor lives in ms_lbas[0] and the end of the current range in
 * ms_lbas[1].  Returns the allocated offset, or -1ULL if the largest free
 * segment cannot hold the request.
 * ==========================================================================
 */
static uint64_t
metaslab_cf_alloc(metaslab_t *msp, uint64_t size)
{
        range_tree_t *rt = msp->ms_allocatable;
        avl_tree_t *by_size = &msp->ms_allocatable_by_size;
        uint64_t *cursor = &msp->ms_lbas[0];
        uint64_t *cursor_end = &msp->ms_lbas[1];

        ASSERT(MUTEX_HELD(&msp->ms_lock));
        ASSERT3U(avl_numnodes(by_size), ==, avl_numnodes(&rt->rt_root));

        ASSERT3U(*cursor_end, >=, *cursor);

        /* Current range exhausted: move on to the largest free segment. */
        if ((*cursor + size) > *cursor_end) {
                range_seg_t *largest = avl_last(by_size);

                if (largest == NULL ||
                    (largest->rs_end - largest->rs_start) < size)
                        return (-1ULL);

                *cursor = largest->rs_start;
                *cursor_end = largest->rs_end;
        }

        uint64_t offset = *cursor;
        *cursor += size;

        return (offset);
}

/* Allocator operations table for the cursor fit allocator. */
static metaslab_ops_t metaslab_cf_ops = {
        metaslab_cf_alloc
};

metaslab_ops_t *zfs_metaslab_ops = &metaslab_cf_ops;
#endif /* WITH_CF_BLOCK_ALLOCATOR */
1486
#if defined(WITH_NDF_BLOCK_ALLOCATOR)
/*
 * ==========================================================================
 * New dynamic fit allocator -
 * Select a region that is large enough to allocate 2^metaslab_ndf_clump_shift
 * contiguous blocks. If no region is found then just use the largest segment
 * that remains.
 * ==========================================================================
 */

/*
 * Determines desired number of contiguous blocks (2^metaslab_ndf_clump_shift)
 * to request from the allocator.
 */
uint64_t metaslab_ndf_clump_shift = 4;

/*
 * Allocate size bytes from the metaslab.  The segment at the cursor for
 * this size class is tried first; if it is missing or too small, the
 * size-ordered tree is searched for a clump-sized segment (capped at the
 * largest allocatable segment).  Returns the allocated offset, or -1ULL if
 * nothing large enough is available.
 */
static uint64_t
metaslab_ndf_alloc(metaslab_t *msp, uint64_t size)
{
        avl_tree_t *by_offset = &msp->ms_allocatable->rt_root;
        avl_tree_t *by_size = &msp->ms_allocatable_by_size;
        avl_index_t where;
        range_seg_t probe;
        uint64_t hbit = highbit64(size);
        uint64_t *cursor = &msp->ms_lbas[hbit - 1];
        uint64_t max_size = metaslab_largest_allocatable(msp);

        ASSERT(MUTEX_HELD(&msp->ms_lock));
        ASSERT3U(avl_numnodes(by_offset), ==, avl_numnodes(by_size));

        if (max_size < size)
                return (-1ULL);

        /* First choice: the segment at this size class's cursor. */
        probe.rs_start = *cursor;
        probe.rs_end = *cursor + size;

        range_seg_t *rs = avl_find(by_offset, &probe, &where);
        if (rs == NULL || (rs->rs_end - rs->rs_start) < size) {
                /* Otherwise look for a clump-sized segment by size. */
                probe.rs_start = 0;
                probe.rs_end = MIN(max_size,
                    1ULL << (hbit + metaslab_ndf_clump_shift));

                rs = avl_find(by_size, &probe, &where);
                if (rs == NULL)
                        rs = avl_nearest(by_size, where, AVL_AFTER);
                ASSERT(rs != NULL);
        }

        if ((rs->rs_end - rs->rs_start) < size)
                return (-1ULL);

        *cursor = rs->rs_start + size;
        return (rs->rs_start);
}

/* Allocator operations table for the new dynamic fit allocator. */
static metaslab_ops_t metaslab_ndf_ops = {
        metaslab_ndf_alloc
};

metaslab_ops_t *zfs_metaslab_ops = &metaslab_ndf_ops;
#endif /* WITH_NDF_BLOCK_ALLOCATOR */
1549
1550
1551 /*
1552  * ==========================================================================
1553  * Metaslabs
1554  * ==========================================================================
1555  */
1556
1557 /*
1558  * Wait for any in-progress metaslab loads to complete.
1559  */
1560 void
1561 metaslab_load_wait(metaslab_t *msp)
1562 {
1563         ASSERT(MUTEX_HELD(&msp->ms_lock));
1564
1565         while (msp->ms_loading) {
1566                 ASSERT(!msp->ms_loaded);
1567                 cv_wait(&msp->ms_load_cv, &msp->ms_lock);
1568         }
1569 }
1570
1571 /*
1572  * Wait for any in-progress flushing to complete.
1573  */
1574 void
1575 metaslab_flush_wait(metaslab_t *msp)
1576 {
1577         ASSERT(MUTEX_HELD(&msp->ms_lock));
1578
1579         while (msp->ms_flushing)
1580                 cv_wait(&msp->ms_flush_cv, &msp->ms_lock);
1581 }
1582
1583 static unsigned int
1584 metaslab_idx_func(multilist_t *ml, void *arg)
1585 {
1586         metaslab_t *msp = arg;
1587         return (msp->ms_id % multilist_get_num_sublists(ml));
1588 }
1589
1590 uint64_t
1591 metaslab_allocated_space(metaslab_t *msp)
1592 {
1593         return (msp->ms_allocated_space);
1594 }
1595
1596 /*
1597  * Verify that the space accounting on disk matches the in-core range_trees.
1598  */
1599 static void
1600 metaslab_verify_space(metaslab_t *msp, uint64_t txg)
1601 {
1602         spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
1603         uint64_t allocating = 0;
1604         uint64_t sm_free_space, msp_free_space;
1605
1606         ASSERT(MUTEX_HELD(&msp->ms_lock));
1607         ASSERT(!msp->ms_condensing);
1608
1609         if ((zfs_flags & ZFS_DEBUG_METASLAB_VERIFY) == 0)
1610                 return;
1611
1612         /*
1613          * We can only verify the metaslab space when we're called
1614          * from syncing context with a loaded metaslab that has an
1615          * allocated space map. Calling this in non-syncing context
1616          * does not provide a consistent view of the metaslab since
1617          * we're performing allocations in the future.
1618          */
1619         if (txg != spa_syncing_txg(spa) || msp->ms_sm == NULL ||
1620             !msp->ms_loaded)
1621                 return;
1622
1623         /*
1624          * Even though the smp_alloc field can get negative,
1625          * when it comes to a metaslab's space map, that should
1626          * never be the case.
1627          */
1628         ASSERT3S(space_map_allocated(msp->ms_sm), >=, 0);
1629
1630         ASSERT3U(space_map_allocated(msp->ms_sm), >=,
1631             range_tree_space(msp->ms_unflushed_frees));
1632
1633         ASSERT3U(metaslab_allocated_space(msp), ==,
1634             space_map_allocated(msp->ms_sm) +
1635             range_tree_space(msp->ms_unflushed_allocs) -
1636             range_tree_space(msp->ms_unflushed_frees));
1637
1638         sm_free_space = msp->ms_size - metaslab_allocated_space(msp);
1639
1640         /*
1641          * Account for future allocations since we would have
1642          * already deducted that space from the ms_allocatable.
1643          */
1644         for (int t = 0; t < TXG_CONCURRENT_STATES; t++) {
1645                 allocating +=
1646                     range_tree_space(msp->ms_allocating[(txg + t) & TXG_MASK]);
1647         }
1648         ASSERT3U(allocating + msp->ms_allocated_this_txg, ==,
1649             msp->ms_allocating_total);
1650
1651         ASSERT3U(msp->ms_deferspace, ==,
1652             range_tree_space(msp->ms_defer[0]) +
1653             range_tree_space(msp->ms_defer[1]));
1654
1655         msp_free_space = range_tree_space(msp->ms_allocatable) + allocating +
1656             msp->ms_deferspace + range_tree_space(msp->ms_freed);
1657
1658         VERIFY3U(sm_free_space, ==, msp_free_space);
1659 }
1660
1661 static void
1662 metaslab_aux_histograms_clear(metaslab_t *msp)
1663 {
1664         /*
1665          * Auxiliary histograms are only cleared when resetting them,
1666          * which can only happen while the metaslab is loaded.
1667          */
1668         ASSERT(msp->ms_loaded);
1669
1670         bzero(msp->ms_synchist, sizeof (msp->ms_synchist));
1671         for (int t = 0; t < TXG_DEFER_SIZE; t++)
1672                 bzero(msp->ms_deferhist[t], sizeof (msp->ms_deferhist[t]));
1673 }
1674
1675 static void
1676 metaslab_aux_histogram_add(uint64_t *histogram, uint64_t shift,
1677     range_tree_t *rt)
1678 {
1679         /*
1680          * This is modeled after space_map_histogram_add(), so refer to that
1681          * function for implementation details. We want this to work like
1682          * the space map histogram, and not the range tree histogram, as we
1683          * are essentially constructing a delta that will be later subtracted
1684          * from the space map histogram.
1685          */
1686         int idx = 0;
1687         for (int i = shift; i < RANGE_TREE_HISTOGRAM_SIZE; i++) {
1688                 ASSERT3U(i, >=, idx + shift);
1689                 histogram[idx] += rt->rt_histogram[i] << (i - idx - shift);
1690
1691                 if (idx < SPACE_MAP_HISTOGRAM_SIZE - 1) {
1692                         ASSERT3U(idx + shift, ==, i);
1693                         idx++;
1694                         ASSERT3U(idx, <, SPACE_MAP_HISTOGRAM_SIZE);
1695                 }
1696         }
1697 }
1698
1699 /*
1700  * Called at every sync pass that the metaslab gets synced.
1701  *
1702  * The reason is that we want our auxiliary histograms to be updated
1703  * wherever the metaslab's space map histogram is updated. This way
1704  * we stay consistent on which parts of the metaslab space map's
1705  * histogram are currently not available for allocations (e.g because
1706  * they are in the defer, freed, and freeing trees).
1707  */
1708 static void
1709 metaslab_aux_histograms_update(metaslab_t *msp)
1710 {
1711         space_map_t *sm = msp->ms_sm;
1712         ASSERT(sm != NULL);
1713
1714         /*
1715          * This is similar to the metaslab's space map histogram updates
1716          * that take place in metaslab_sync(). The only difference is that
1717          * we only care about segments that haven't made it into the
1718          * ms_allocatable tree yet.
1719          */
1720         if (msp->ms_loaded) {
1721                 metaslab_aux_histograms_clear(msp);
1722
1723                 metaslab_aux_histogram_add(msp->ms_synchist,
1724                     sm->sm_shift, msp->ms_freed);
1725
1726                 for (int t = 0; t < TXG_DEFER_SIZE; t++) {
1727                         metaslab_aux_histogram_add(msp->ms_deferhist[t],
1728                             sm->sm_shift, msp->ms_defer[t]);
1729                 }
1730         }
1731
1732         metaslab_aux_histogram_add(msp->ms_synchist,
1733             sm->sm_shift, msp->ms_freeing);
1734 }
1735
1736 /*
1737  * Called every time we are done syncing (writing to) the metaslab,
1738  * i.e. at the end of each sync pass.
1739  * [see the comment in metaslab_impl.h for ms_synchist, ms_deferhist]
1740  */
1741 static void
1742 metaslab_aux_histograms_update_done(metaslab_t *msp, boolean_t defer_allowed)
1743 {
1744         spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
1745         space_map_t *sm = msp->ms_sm;
1746
1747         if (sm == NULL) {
1748                 /*
1749                  * We came here from metaslab_init() when creating/opening a
1750                  * pool, looking at a metaslab that hasn't had any allocations
1751                  * yet.
1752                  */
1753                 return;
1754         }
1755
1756         /*
1757          * This is similar to the actions that we take for the ms_freed
1758          * and ms_defer trees in metaslab_sync_done().
1759          */
1760         uint64_t hist_index = spa_syncing_txg(spa) % TXG_DEFER_SIZE;
1761         if (defer_allowed) {
1762                 bcopy(msp->ms_synchist, msp->ms_deferhist[hist_index],
1763                     sizeof (msp->ms_synchist));
1764         } else {
1765                 bzero(msp->ms_deferhist[hist_index],
1766                     sizeof (msp->ms_deferhist[hist_index]));
1767         }
1768         bzero(msp->ms_synchist, sizeof (msp->ms_synchist));
1769 }
1770
1771 /*
1772  * Ensure that the metaslab's weight and fragmentation are consistent
1773  * with the contents of the histogram (either the range tree's histogram
1774  * or the space map's depending whether the metaslab is loaded).
1775  */
1776 static void
1777 metaslab_verify_weight_and_frag(metaslab_t *msp)
1778 {
1779         ASSERT(MUTEX_HELD(&msp->ms_lock));
1780
1781         if ((zfs_flags & ZFS_DEBUG_METASLAB_VERIFY) == 0)
1782                 return;
1783
1784         /*
1785          * We can end up here from vdev_remove_complete(), in which case we
1786          * cannot do these assertions because we hold spa config locks and
1787          * thus we are not allowed to read from the DMU.
1788          *
1789          * We check if the metaslab group has been removed and if that's
1790          * the case we return immediately as that would mean that we are
1791          * here from the aforementioned code path.
1792          */
1793         if (msp->ms_group == NULL)
1794                 return;
1795
1796         /*
1797          * Devices being removed always return a weight of 0 and leave
1798          * fragmentation and ms_max_size as is - there is nothing for
1799          * us to verify here.
1800          */
1801         vdev_t *vd = msp->ms_group->mg_vd;
1802         if (vd->vdev_removing)
1803                 return;
1804
1805         /*
1806          * If the metaslab is dirty it probably means that we've done
1807          * some allocations or frees that have changed our histograms
1808          * and thus the weight.
1809          */
1810         for (int t = 0; t < TXG_SIZE; t++) {
1811                 if (txg_list_member(&vd->vdev_ms_list, msp, t))
1812                         return;
1813         }
1814
1815         /*
1816          * This verification checks that our in-memory state is consistent
1817          * with what's on disk. If the pool is read-only then there aren't
1818          * any changes and we just have the initially-loaded state.
1819          */
1820         if (!spa_writeable(msp->ms_group->mg_vd->vdev_spa))
1821                 return;
1822
1823         /* some extra verification for in-core tree if you can */
1824         if (msp->ms_loaded) {
1825                 range_tree_stat_verify(msp->ms_allocatable);
1826                 VERIFY(space_map_histogram_verify(msp->ms_sm,
1827                     msp->ms_allocatable));
1828         }
1829
1830         uint64_t weight = msp->ms_weight;
1831         uint64_t was_active = msp->ms_weight & METASLAB_ACTIVE_MASK;
1832         boolean_t space_based = WEIGHT_IS_SPACEBASED(msp->ms_weight);
1833         uint64_t frag = msp->ms_fragmentation;
1834         uint64_t max_segsize = msp->ms_max_size;
1835
1836         msp->ms_weight = 0;
1837         msp->ms_fragmentation = 0;
1838
1839         /*
1840          * This function is used for verification purposes. Regardless of
1841          * whether metaslab_weight() thinks this metaslab should be active or
1842          * not, we want to ensure that the actual weight (and therefore the
1843          * value of ms_weight) would be the same if it was to be recalculated
1844          * at this point.
1845          */
1846         msp->ms_weight = metaslab_weight(msp) | was_active;
1847
1848         VERIFY3U(max_segsize, ==, msp->ms_max_size);
1849
1850         /*
1851          * If the weight type changed then there is no point in doing
1852          * verification. Revert fields to their original values.
1853          */
1854         if ((space_based && !WEIGHT_IS_SPACEBASED(msp->ms_weight)) ||
1855             (!space_based && WEIGHT_IS_SPACEBASED(msp->ms_weight))) {
1856                 msp->ms_fragmentation = frag;
1857                 msp->ms_weight = weight;
1858                 return;
1859         }
1860
1861         VERIFY3U(msp->ms_fragmentation, ==, frag);
1862         VERIFY3U(msp->ms_weight, ==, weight);
1863 }
1864
1865 /*
1866  * If we're over the zfs_metaslab_mem_limit, select the loaded metaslab from
1867  * this class that was used longest ago, and attempt to unload it.  We don't
1868  * want to spend too much time in this loop to prevent performance
1869  * degredation, and we expect that most of the time this operation will
1870  * succeed. Between that and the normal unloading processing during txg sync,
1871  * we expect this to keep the metaslab memory usage under control.
1872  */
1873 static void
1874 metaslab_potentially_evict(metaslab_class_t *mc)
1875 {
1876 #ifdef _KERNEL
1877         uint64_t allmem = arc_all_memory();
1878         extern kmem_cache_t *range_seg_cache;
1879         uint64_t inuse = range_seg_cache->skc_obj_total;
1880         uint64_t size = range_seg_cache->skc_obj_size;
1881         int tries = 0;
1882         for (; allmem * zfs_metaslab_mem_limit / 100 < inuse * size &&
1883             tries < multilist_get_num_sublists(mc->mc_metaslab_txg_list) * 2;
1884             tries++) {
1885                 unsigned int idx = multilist_get_random_index(
1886                     mc->mc_metaslab_txg_list);
1887                 multilist_sublist_t *mls =
1888                     multilist_sublist_lock(mc->mc_metaslab_txg_list, idx);
1889                 metaslab_t *msp = multilist_sublist_head(mls);
1890                 multilist_sublist_unlock(mls);
1891                 while (msp != NULL && allmem * zfs_metaslab_mem_limit / 100 <
1892                     inuse * size) {
1893                         VERIFY3P(mls, ==, multilist_sublist_lock(
1894                             mc->mc_metaslab_txg_list, idx));
1895                         ASSERT3U(idx, ==,
1896                             metaslab_idx_func(mc->mc_metaslab_txg_list, msp));
1897
1898                         if (!multilist_link_active(&msp->ms_class_txg_node)) {
1899                                 multilist_sublist_unlock(mls);
1900                                 break;
1901                         }
1902                         metaslab_t *next_msp = multilist_sublist_next(mls, msp);
1903                         multilist_sublist_unlock(mls);
1904                         /*
1905                          * If the metaslab is currently loading there are two
1906                          * cases. If it's the metaslab we're evicting, we
1907                          * can't continue on or we'll panic when we attempt to
1908                          * recursively lock the mutex. If it's another
1909                          * metaslab that's loading, it can be safely skipped,
1910                          * since we know it's very new and therefore not a
1911                          * good eviction candidate. We check later once the
1912                          * lock is held that the metaslab is fully loaded
1913                          * before actually unloading it.
1914                          */
1915                         if (msp->ms_loading) {
1916                                 msp = next_msp;
1917                                 inuse = range_seg_cache->skc_obj_total;
1918                                 continue;
1919                         }
1920                         /*
1921                          * We can't unload metaslabs with no spacemap because
1922                          * they're not ready to be unloaded yet. We can't
1923                          * unload metaslabs with outstanding allocations
1924                          * because doing so could cause the metaslab's weight
1925                          * to decrease while it's unloaded, which violates an
1926                          * invariant that we use to prevent unnecessary
1927                          * loading. We also don't unload metaslabs that are
1928                          * currently active because they are high-weight
1929                          * metaslabs that are likely to be used in the near
1930                          * future.
1931                          */
1932                         mutex_enter(&msp->ms_lock);
1933                         if (msp->ms_allocator == -1 && msp->ms_sm != NULL &&
1934                             msp->ms_allocating_total == 0) {
1935                                 metaslab_unload(msp);
1936                         }
1937                         mutex_exit(&msp->ms_lock);
1938                         msp = next_msp;
1939                         inuse = range_seg_cache->skc_obj_total;
1940                 }
1941         }
1942 #endif
1943 }
1944
1945 static int
1946 metaslab_load_impl(metaslab_t *msp)
1947 {
1948         int error = 0;
1949
1950         ASSERT(MUTEX_HELD(&msp->ms_lock));
1951         ASSERT(msp->ms_loading);
1952         ASSERT(!msp->ms_condensing);
1953
1954         /*
1955          * We temporarily drop the lock to unblock other operations while we
1956          * are reading the space map. Therefore, metaslab_sync() and
1957          * metaslab_sync_done() can run at the same time as we do.
1958          *
1959          * If we are using the log space maps, metaslab_sync() can't write to
1960          * the metaslab's space map while we are loading as we only write to
1961          * it when we are flushing the metaslab, and that can't happen while
1962          * we are loading it.
1963          *
1964          * If we are not using log space maps though, metaslab_sync() can
1965          * append to the space map while we are loading. Therefore we load
1966          * only entries that existed when we started the load. Additionally,
1967          * metaslab_sync_done() has to wait for the load to complete because
1968          * there are potential races like metaslab_load() loading parts of the
1969          * space map that are currently being appended by metaslab_sync(). If
1970          * we didn't, the ms_allocatable would have entries that
1971          * metaslab_sync_done() would try to re-add later.
1972          *
1973          * That's why before dropping the lock we remember the synced length
1974          * of the metaslab and read up to that point of the space map,
1975          * ignoring entries appended by metaslab_sync() that happen after we
1976          * drop the lock.
1977          */
1978         uint64_t length = msp->ms_synced_length;
1979         mutex_exit(&msp->ms_lock);
1980
1981         hrtime_t load_start = gethrtime();
1982         if (msp->ms_sm != NULL) {
1983                 error = space_map_load_length(msp->ms_sm, msp->ms_allocatable,
1984                     SM_FREE, length);
1985         } else {
1986                 /*
1987                  * The space map has not been allocated yet, so treat
1988                  * all the space in the metaslab as free and add it to the
1989                  * ms_allocatable tree.
1990                  */
1991                 range_tree_add(msp->ms_allocatable,
1992                     msp->ms_start, msp->ms_size);
1993
1994                 if (msp->ms_freed != NULL) {
1995                         /*
1996                          * If the ms_sm doesn't exist, this means that this
1997                          * metaslab hasn't gone through metaslab_sync() and
1998                          * thus has never been dirtied. So we shouldn't
1999                          * expect any unflushed allocs or frees from previous
2000                          * TXGs.
2001                          *
2002                          * Note: ms_freed and all the other trees except for
2003                          * the ms_allocatable, can be NULL at this point only
2004                          * if this is a new metaslab of a vdev that just got
2005                          * expanded.
2006                          */
2007                         ASSERT(range_tree_is_empty(msp->ms_unflushed_allocs));
2008                         ASSERT(range_tree_is_empty(msp->ms_unflushed_frees));
2009                 }
2010         }
2011
2012         /*
2013          * We need to grab the ms_sync_lock to prevent metaslab_sync() from
2014          * changing the ms_sm (or log_sm) and the metaslab's range trees
2015          * while we are about to use them and populate the ms_allocatable.
2016          * The ms_lock is insufficient for this because metaslab_sync() doesn't
2017          * hold the ms_lock while writing the ms_checkpointing tree to disk.
2018          */
2019         mutex_enter(&msp->ms_sync_lock);
2020         mutex_enter(&msp->ms_lock);
2021
2022         ASSERT(!msp->ms_condensing);
2023         ASSERT(!msp->ms_flushing);
2024
2025         if (error != 0) {
2026                 mutex_exit(&msp->ms_sync_lock);
2027                 return (error);
2028         }
2029
2030         ASSERT3P(msp->ms_group, !=, NULL);
2031         msp->ms_loaded = B_TRUE;
2032
2033         /*
2034          * Apply all the unflushed changes to ms_allocatable right
2035          * away so any manipulations we do below have a clear view
2036          * of what is allocated and what is free.
2037          */
2038         range_tree_walk(msp->ms_unflushed_allocs,
2039             range_tree_remove, msp->ms_allocatable);
2040         range_tree_walk(msp->ms_unflushed_frees,
2041             range_tree_add, msp->ms_allocatable);
2042
2043         msp->ms_loaded = B_TRUE;
2044
2045         ASSERT3P(msp->ms_group, !=, NULL);
2046         spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
2047         if (spa_syncing_log_sm(spa) != NULL) {
2048                 ASSERT(spa_feature_is_enabled(spa,
2049                     SPA_FEATURE_LOG_SPACEMAP));
2050
2051                 /*
2052                  * If we use a log space map we add all the segments
2053                  * that are in ms_unflushed_frees so they are available
2054                  * for allocation.
2055                  *
2056                  * ms_allocatable needs to contain all free segments
2057                  * that are ready for allocations (thus not segments
2058                  * from ms_freeing, ms_freed, and the ms_defer trees).
2059                  * But if we grab the lock in this code path at a sync
2060                  * pass later that 1, then it also contains the
2061                  * segments of ms_freed (they were added to it earlier
2062                  * in this path through ms_unflushed_frees). So we
2063                  * need to remove all the segments that exist in
2064                  * ms_freed from ms_allocatable as they will be added
2065                  * later in metaslab_sync_done().
2066                  *
2067                  * When there's no log space map, the ms_allocatable
2068                  * correctly doesn't contain any segments that exist
2069                  * in ms_freed [see ms_synced_length].
2070                  */
2071                 range_tree_walk(msp->ms_freed,
2072                     range_tree_remove, msp->ms_allocatable);
2073         }
2074
2075         /*
2076          * If we are not using the log space map, ms_allocatable
2077          * contains the segments that exist in the ms_defer trees
2078          * [see ms_synced_length]. Thus we need to remove them
2079          * from ms_allocatable as they will be added again in
2080          * metaslab_sync_done().
2081          *
2082          * If we are using the log space map, ms_allocatable still
2083          * contains the segments that exist in the ms_defer trees.
2084          * Not because it read them through the ms_sm though. But
2085          * because these segments are part of ms_unflushed_frees
2086          * whose segments we add to ms_allocatable earlier in this
2087          * code path.
2088          */
2089         for (int t = 0; t < TXG_DEFER_SIZE; t++) {
2090                 range_tree_walk(msp->ms_defer[t],
2091                     range_tree_remove, msp->ms_allocatable);
2092         }
2093
2094         /*
2095          * Call metaslab_recalculate_weight_and_sort() now that the
2096          * metaslab is loaded so we get the metaslab's real weight.
2097          *
2098          * Unless this metaslab was created with older software and
2099          * has not yet been converted to use segment-based weight, we
2100          * expect the new weight to be better or equal to the weight
2101          * that the metaslab had while it was not loaded. This is
2102          * because the old weight does not take into account the
2103          * consolidation of adjacent segments between TXGs. [see
2104          * comment for ms_synchist and ms_deferhist[] for more info]
2105          */
2106         uint64_t weight = msp->ms_weight;
2107         uint64_t max_size = msp->ms_max_size;
2108         metaslab_recalculate_weight_and_sort(msp);
2109         if (!WEIGHT_IS_SPACEBASED(weight))
2110                 ASSERT3U(weight, <=, msp->ms_weight);
2111         msp->ms_max_size = metaslab_largest_allocatable(msp);
2112         ASSERT3U(max_size, <=, msp->ms_max_size);
2113         hrtime_t load_end = gethrtime();
2114                 msp->ms_load_time = load_end;
2115         if (zfs_flags & ZFS_DEBUG_LOG_SPACEMAP) {
2116                 zfs_dbgmsg("loading: txg %llu, spa %s, vdev_id %llu, "
2117                     "ms_id %llu, smp_length %llu, "
2118                     "unflushed_allocs %llu, unflushed_frees %llu, "
2119                     "freed %llu, defer %llu + %llu, "
2120                     "loading_time %lld ms, ms_max_size %llu, "
2121                     "max size error %llu",
2122                     spa_syncing_txg(spa), spa_name(spa),
2123                     msp->ms_group->mg_vd->vdev_id, msp->ms_id,
2124                     space_map_length(msp->ms_sm),
2125                     range_tree_space(msp->ms_unflushed_allocs),
2126                     range_tree_space(msp->ms_unflushed_frees),
2127                     range_tree_space(msp->ms_freed),
2128                     range_tree_space(msp->ms_defer[0]),
2129                     range_tree_space(msp->ms_defer[1]),
2130                     (longlong_t)((load_end - load_start) / 1000000),
2131                     msp->ms_max_size, msp->ms_max_size - max_size);
2132         }
2133
2134         metaslab_verify_space(msp, spa_syncing_txg(spa));
2135         mutex_exit(&msp->ms_sync_lock);
2136         return (0);
2137 }
2138
2139 int
2140 metaslab_load(metaslab_t *msp)
2141 {
2142         ASSERT(MUTEX_HELD(&msp->ms_lock));
2143
2144         /*
2145          * There may be another thread loading the same metaslab, if that's
2146          * the case just wait until the other thread is done and return.
2147          */
2148         metaslab_load_wait(msp);
2149         if (msp->ms_loaded)
2150                 return (0);
2151         VERIFY(!msp->ms_loading);
2152         ASSERT(!msp->ms_condensing);
2153
2154         /*
2155          * We set the loading flag BEFORE potentially dropping the lock to
2156          * wait for an ongoing flush (see ms_flushing below). This way other
2157          * threads know that there is already a thread that is loading this
2158          * metaslab.
2159          */
2160         msp->ms_loading = B_TRUE;
2161
2162         /*
2163          * Wait for any in-progress flushing to finish as we drop the ms_lock
2164          * both here (during space_map_load()) and in metaslab_flush() (when
2165          * we flush our changes to the ms_sm).
2166          */
2167         if (msp->ms_flushing)
2168                 metaslab_flush_wait(msp);
2169
2170         /*
2171          * In the possibility that we were waiting for the metaslab to be
2172          * flushed (where we temporarily dropped the ms_lock), ensure that
2173          * no one else loaded the metaslab somehow.
2174          */
2175         ASSERT(!msp->ms_loaded);
2176
2177         /*
2178          * If we're loading a metaslab in the normal class, consider evicting
2179          * another one to keep our memory usage under the limit defined by the
2180          * zfs_metaslab_mem_limit tunable.
2181          */
2182         if (spa_normal_class(msp->ms_group->mg_class->mc_spa) ==
2183             msp->ms_group->mg_class) {
2184                 metaslab_potentially_evict(msp->ms_group->mg_class);
2185         }
2186
2187         int error = metaslab_load_impl(msp);
2188
2189         ASSERT(MUTEX_HELD(&msp->ms_lock));
2190         msp->ms_loading = B_FALSE;
2191         cv_broadcast(&msp->ms_load_cv);
2192
2193         return (error);
2194 }
2195
2196 void
2197 metaslab_unload(metaslab_t *msp)
2198 {
2199         ASSERT(MUTEX_HELD(&msp->ms_lock));
2200
2201         /*
2202          * This can happen if a metaslab is selected for eviction (in
2203          * metaslab_potentially_evict) and then unloaded during spa_sync (via
2204          * metaslab_class_evict_old).
2205          */
2206         if (!msp->ms_loaded)
2207                 return;
2208
2209         range_tree_vacate(msp->ms_allocatable, NULL, NULL);
2210         msp->ms_loaded = B_FALSE;
2211         msp->ms_unload_time = gethrtime();
2212
2213         msp->ms_activation_weight = 0;
2214         msp->ms_weight &= ~METASLAB_ACTIVE_MASK;
2215
2216         if (msp->ms_group != NULL) {
2217                 metaslab_class_t *mc = msp->ms_group->mg_class;
2218                 multilist_sublist_t *mls =
2219                     multilist_sublist_lock_obj(mc->mc_metaslab_txg_list, msp);
2220                 if (multilist_link_active(&msp->ms_class_txg_node))
2221                         multilist_sublist_remove(mls, msp);
2222                 multilist_sublist_unlock(mls);
2223         }
2224
2225         /*
2226          * We explicitly recalculate the metaslab's weight based on its space
2227          * map (as it is now not loaded). We want unload metaslabs to always
2228          * have their weights calculated from the space map histograms, while
2229          * loaded ones have it calculated from their in-core range tree
2230          * [see metaslab_load()]. This way, the weight reflects the information
2231          * available in-core, whether it is loaded or not.
2232          *
2233          * If ms_group == NULL means that we came here from metaslab_fini(),
2234          * at which point it doesn't make sense for us to do the recalculation
2235          * and the sorting.
2236          */
2237         if (msp->ms_group != NULL)
2238                 metaslab_recalculate_weight_and_sort(msp);
2239 }
2240
2241 void
2242 metaslab_set_selected_txg(metaslab_t *msp, uint64_t txg)
2243 {
2244         ASSERT(MUTEX_HELD(&msp->ms_lock));
2245         metaslab_class_t *mc = msp->ms_group->mg_class;
2246         multilist_sublist_t *mls =
2247             multilist_sublist_lock_obj(mc->mc_metaslab_txg_list, msp);
2248         if (multilist_link_active(&msp->ms_class_txg_node))
2249                 multilist_sublist_remove(mls, msp);
2250         msp->ms_selected_txg = txg;
2251         multilist_sublist_insert_tail(mls, msp);
2252         multilist_sublist_unlock(mls);
2253 }
2254
2255 void
2256 metaslab_space_update(vdev_t *vd, metaslab_class_t *mc, int64_t alloc_delta,
2257     int64_t defer_delta, int64_t space_delta)
2258 {
2259         vdev_space_update(vd, alloc_delta, defer_delta, space_delta);
2260
2261         ASSERT3P(vd->vdev_spa->spa_root_vdev, ==, vd->vdev_parent);
2262         ASSERT(vd->vdev_ms_count != 0);
2263
2264         metaslab_class_space_update(mc, alloc_delta, defer_delta, space_delta,
2265             vdev_deflated_space(vd, space_delta));
2266 }
2267
2268 int
2269 metaslab_init(metaslab_group_t *mg, uint64_t id, uint64_t object,
2270     uint64_t txg, metaslab_t **msp)
2271 {
2272         vdev_t *vd = mg->mg_vd;
2273         spa_t *spa = vd->vdev_spa;
2274         objset_t *mos = spa->spa_meta_objset;
2275         metaslab_t *ms;
2276         int error;
2277
2278         ms = kmem_zalloc(sizeof (metaslab_t), KM_SLEEP);
2279         mutex_init(&ms->ms_lock, NULL, MUTEX_DEFAULT, NULL);
2280         mutex_init(&ms->ms_sync_lock, NULL, MUTEX_DEFAULT, NULL);
2281         cv_init(&ms->ms_load_cv, NULL, CV_DEFAULT, NULL);
2282         cv_init(&ms->ms_flush_cv, NULL, CV_DEFAULT, NULL);
2283         multilist_link_init(&ms->ms_class_txg_node);
2284
2285         ms->ms_id = id;
2286         ms->ms_start = id << vd->vdev_ms_shift;
2287         ms->ms_size = 1ULL << vd->vdev_ms_shift;
2288         ms->ms_allocator = -1;
2289         ms->ms_new = B_TRUE;
2290
2291         /*
2292          * We only open space map objects that already exist. All others
2293          * will be opened when we finally allocate an object for it.
2294          *
2295          * Note:
2296          * When called from vdev_expand(), we can't call into the DMU as
2297          * we are holding the spa_config_lock as a writer and we would
2298          * deadlock [see relevant comment in vdev_metaslab_init()]. in
2299          * that case, the object parameter is zero though, so we won't
2300          * call into the DMU.
2301          */
2302         if (object != 0) {
2303                 error = space_map_open(&ms->ms_sm, mos, object, ms->ms_start,
2304                     ms->ms_size, vd->vdev_ashift);
2305
2306                 if (error != 0) {
2307                         kmem_free(ms, sizeof (metaslab_t));
2308                         return (error);
2309                 }
2310
2311                 ASSERT(ms->ms_sm != NULL);
2312                 ms->ms_allocated_space = space_map_allocated(ms->ms_sm);
2313         }
2314
2315         /*
2316          * We create the ms_allocatable here, but we don't create the
2317          * other range trees until metaslab_sync_done().  This serves
2318          * two purposes: it allows metaslab_sync_done() to detect the
2319          * addition of new space; and for debugging, it ensures that
2320          * we'd data fault on any attempt to use this metaslab before
2321          * it's ready.
2322          */
2323         ms->ms_allocatable = range_tree_create_impl(&rt_avl_ops,
2324             &ms->ms_allocatable_by_size, metaslab_rangesize_compare, 0);
2325
2326         ms->ms_trim = range_tree_create(NULL, NULL);
2327
2328         metaslab_group_add(mg, ms);
2329         metaslab_set_fragmentation(ms);
2330
2331         /*
2332          * If we're opening an existing pool (txg == 0) or creating
2333          * a new one (txg == TXG_INITIAL), all space is available now.
2334          * If we're adding space to an existing pool, the new space
2335          * does not become available until after this txg has synced.
2336          * The metaslab's weight will also be initialized when we sync
2337          * out this txg. This ensures that we don't attempt to allocate
2338          * from it before we have initialized it completely.
2339          */
2340         if (txg <= TXG_INITIAL) {
2341                 metaslab_sync_done(ms, 0);
2342                 metaslab_space_update(vd, mg->mg_class,
2343                     metaslab_allocated_space(ms), 0, 0);
2344         }
2345
2346         if (txg != 0) {
2347                 vdev_dirty(vd, 0, NULL, txg);
2348                 vdev_dirty(vd, VDD_METASLAB, ms, txg);
2349         }
2350
2351         *msp = ms;
2352
2353         return (0);
2354 }
2355
2356 static void
2357 metaslab_fini_flush_data(metaslab_t *msp)
2358 {
2359         spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
2360
2361         if (metaslab_unflushed_txg(msp) == 0) {
2362                 ASSERT3P(avl_find(&spa->spa_metaslabs_by_flushed, msp, NULL),
2363                     ==, NULL);
2364                 return;
2365         }
2366         ASSERT(spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP));
2367
2368         mutex_enter(&spa->spa_flushed_ms_lock);
2369         avl_remove(&spa->spa_metaslabs_by_flushed, msp);
2370         mutex_exit(&spa->spa_flushed_ms_lock);
2371
2372         spa_log_sm_decrement_mscount(spa, metaslab_unflushed_txg(msp));
2373         spa_log_summary_decrement_mscount(spa, metaslab_unflushed_txg(msp));
2374 }
2375
2376 uint64_t
2377 metaslab_unflushed_changes_memused(metaslab_t *ms)
2378 {
2379         return ((range_tree_numsegs(ms->ms_unflushed_allocs) +
2380             range_tree_numsegs(ms->ms_unflushed_frees)) *
2381             sizeof (range_seg_t));
2382 }
2383
2384 void
2385 metaslab_fini(metaslab_t *msp)
2386 {
2387         metaslab_group_t *mg = msp->ms_group;
2388         vdev_t *vd = mg->mg_vd;
2389         spa_t *spa = vd->vdev_spa;
2390
2391         metaslab_fini_flush_data(msp);
2392
2393         metaslab_group_remove(mg, msp);
2394
2395         mutex_enter(&msp->ms_lock);
2396         VERIFY(msp->ms_group == NULL);
2397         metaslab_space_update(vd, mg->mg_class,
2398             -metaslab_allocated_space(msp), 0, -msp->ms_size);
2399
2400         space_map_close(msp->ms_sm);
2401         msp->ms_sm = NULL;
2402
2403         metaslab_unload(msp);
2404         range_tree_destroy(msp->ms_allocatable);
2405         range_tree_destroy(msp->ms_freeing);
2406         range_tree_destroy(msp->ms_freed);
2407
2408         ASSERT3U(spa->spa_unflushed_stats.sus_memused, >=,
2409             metaslab_unflushed_changes_memused(msp));
2410         spa->spa_unflushed_stats.sus_memused -=
2411             metaslab_unflushed_changes_memused(msp);
2412         range_tree_vacate(msp->ms_unflushed_allocs, NULL, NULL);
2413         range_tree_destroy(msp->ms_unflushed_allocs);
2414         range_tree_vacate(msp->ms_unflushed_frees, NULL, NULL);
2415         range_tree_destroy(msp->ms_unflushed_frees);
2416
2417         for (int t = 0; t < TXG_SIZE; t++) {
2418                 range_tree_destroy(msp->ms_allocating[t]);
2419         }
2420
2421         for (int t = 0; t < TXG_DEFER_SIZE; t++) {
2422                 range_tree_destroy(msp->ms_defer[t]);
2423         }
2424         ASSERT0(msp->ms_deferspace);
2425
2426         range_tree_destroy(msp->ms_checkpointing);
2427
2428         for (int t = 0; t < TXG_SIZE; t++)
2429                 ASSERT(!txg_list_member(&vd->vdev_ms_list, msp, t));
2430
2431         range_tree_vacate(msp->ms_trim, NULL, NULL);
2432         range_tree_destroy(msp->ms_trim);
2433
2434         mutex_exit(&msp->ms_lock);
2435         cv_destroy(&msp->ms_load_cv);
2436         cv_destroy(&msp->ms_flush_cv);
2437         mutex_destroy(&msp->ms_lock);
2438         mutex_destroy(&msp->ms_sync_lock);
2439         ASSERT3U(msp->ms_allocator, ==, -1);
2440
2441         kmem_free(msp, sizeof (metaslab_t));
2442 }
2443
#define	FRAGMENTATION_TABLE_SIZE	17

/*
 * Segment-size based fragmentation metric.  Each metaslab derives its own
 * fragmentation value from this table: the free space found in every
 * bucket of the space map histogram is multiplied by the table entry for
 * that bucket's segment size, the products are summed up, and the sum is
 * divided by the total free space of the metaslab (i.e. the free space in
 * all buckets).  A high result therefore means that most of the free space
 * consists of small segments, while a low result means that most of it is
 * in large segments.  A 10% change in fragmentation equates to
 * approximately double the number of segments.
 *
 * The table treats 16MB segments as 0% fragmented.  Testing has shown that
 * segments that are greater than or equal to 16MB do not suffer from
 * drastic performance problems, and the remaining entries are derived from
 * that value.  Since the fragmentation value is never stored on disk, it
 * is possible to change these calculations in the future.
 */
int zfs_frag_table[FRAGMENTATION_TABLE_SIZE] = {
	100,	/* 512B */
	100,	/* 1K   */
	98,	/* 2K   */
	95,	/* 4K   */
	90,	/* 8K   */
	80,	/* 16K  */
	70,	/* 32K  */
	60,	/* 64K  */
	50,	/* 128K */
	40,	/* 256K */
	30,	/* 512K */
	20,	/* 1M   */
	15,	/* 2M   */
	10,	/* 4M   */
	5,	/* 8M   */
	0,	/* 16M  */
	0	/* last slot; was implicitly zero-initialized before */
};
2483
2484 /*
2485  * Calculate the metaslab's fragmentation metric and set ms_fragmentation.
2486  * Setting this value to ZFS_FRAG_INVALID means that the metaslab has not
2487  * been upgraded and does not support this metric. Otherwise, the return
2488  * value should be in the range [0, 100].
2489  */
2490 static void
2491 metaslab_set_fragmentation(metaslab_t *msp)
2492 {
2493         spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
2494         uint64_t fragmentation = 0;
2495         uint64_t total = 0;
2496         boolean_t feature_enabled = spa_feature_is_enabled(spa,
2497             SPA_FEATURE_SPACEMAP_HISTOGRAM);
2498
2499         if (!feature_enabled) {
2500                 msp->ms_fragmentation = ZFS_FRAG_INVALID;
2501                 return;
2502         }
2503
2504         /*
2505          * A null space map means that the entire metaslab is free
2506          * and thus is not fragmented.
2507          */
2508         if (msp->ms_sm == NULL) {
2509                 msp->ms_fragmentation = 0;
2510                 return;
2511         }
2512
2513         /*
2514          * If this metaslab's space map has not been upgraded, flag it
2515          * so that we upgrade next time we encounter it.
2516          */
2517         if (msp->ms_sm->sm_dbuf->db_size != sizeof (space_map_phys_t)) {
2518                 uint64_t txg = spa_syncing_txg(spa);
2519                 vdev_t *vd = msp->ms_group->mg_vd;
2520
2521                 /*
2522                  * If we've reached the final dirty txg, then we must
2523                  * be shutting down the pool. We don't want to dirty
2524                  * any data past this point so skip setting the condense
2525                  * flag. We can retry this action the next time the pool
2526                  * is imported.
2527                  */
2528                 if (spa_writeable(spa) && txg < spa_final_dirty_txg(spa)) {
2529                         msp->ms_condense_wanted = B_TRUE;
2530                         vdev_dirty(vd, VDD_METASLAB, msp, txg + 1);
2531                         zfs_dbgmsg("txg %llu, requesting force condense: "
2532                             "ms_id %llu, vdev_id %llu", txg, msp->ms_id,
2533                             vd->vdev_id);
2534                 }
2535                 msp->ms_fragmentation = ZFS_FRAG_INVALID;
2536                 return;
2537         }
2538
2539         for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) {
2540                 uint64_t space = 0;
2541                 uint8_t shift = msp->ms_sm->sm_shift;
2542
2543                 int idx = MIN(shift - SPA_MINBLOCKSHIFT + i,
2544                     FRAGMENTATION_TABLE_SIZE - 1);
2545
2546                 if (msp->ms_sm->sm_phys->smp_histogram[i] == 0)
2547                         continue;
2548
2549                 space = msp->ms_sm->sm_phys->smp_histogram[i] << (i + shift);
2550                 total += space;
2551
2552                 ASSERT3U(idx, <, FRAGMENTATION_TABLE_SIZE);
2553                 fragmentation += space * zfs_frag_table[idx];
2554         }
2555
2556         if (total > 0)
2557                 fragmentation /= total;
2558         ASSERT3U(fragmentation, <=, 100);
2559
2560         msp->ms_fragmentation = fragmentation;
2561 }
2562
2563 /*
2564  * Compute a weight -- a selection preference value -- for the given metaslab.
2565  * This is based on the amount of free space, the level of fragmentation,
2566  * the LBA range, and whether the metaslab is loaded.
2567  */
2568 static uint64_t
2569 metaslab_space_weight(metaslab_t *msp)
2570 {
2571         metaslab_group_t *mg = msp->ms_group;
2572         vdev_t *vd = mg->mg_vd;
2573         uint64_t weight, space;
2574
2575         ASSERT(MUTEX_HELD(&msp->ms_lock));
2576         ASSERT(!vd->vdev_removing);
2577
2578         /*
2579          * The baseline weight is the metaslab's free space.
2580          */
2581         space = msp->ms_size - metaslab_allocated_space(msp);
2582
2583         if (metaslab_fragmentation_factor_enabled &&
2584             msp->ms_fragmentation != ZFS_FRAG_INVALID) {
2585                 /*
2586                  * Use the fragmentation information to inversely scale
2587                  * down the baseline weight. We need to ensure that we
2588                  * don't exclude this metaslab completely when it's 100%
2589                  * fragmented. To avoid this we reduce the fragmented value
2590                  * by 1.
2591                  */
2592                 space = (space * (100 - (msp->ms_fragmentation - 1))) / 100;
2593
2594                 /*
2595                  * If space < SPA_MINBLOCKSIZE, then we will not allocate from
2596                  * this metaslab again. The fragmentation metric may have
2597                  * decreased the space to something smaller than
2598                  * SPA_MINBLOCKSIZE, so reset the space to SPA_MINBLOCKSIZE
2599                  * so that we can consume any remaining space.
2600                  */
2601                 if (space > 0 && space < SPA_MINBLOCKSIZE)
2602                         space = SPA_MINBLOCKSIZE;
2603         }
2604         weight = space;
2605
2606         /*
2607          * Modern disks have uniform bit density and constant angular velocity.
2608          * Therefore, the outer recording zones are faster (higher bandwidth)
2609          * than the inner zones by the ratio of outer to inner track diameter,
2610          * which is typically around 2:1.  We account for this by assigning
2611          * higher weight to lower metaslabs (multiplier ranging from 2x to 1x).
2612          * In effect, this means that we'll select the metaslab with the most
2613          * free bandwidth rather than simply the one with the most free space.
2614          */
2615         if (!vd->vdev_nonrot && metaslab_lba_weighting_enabled) {
2616                 weight = 2 * weight - (msp->ms_id * weight) / vd->vdev_ms_count;
2617                 ASSERT(weight >= space && weight <= 2 * space);
2618         }
2619
2620         /*
2621          * If this metaslab is one we're actively using, adjust its
2622          * weight to make it preferable to any inactive metaslab so
2623          * we'll polish it off. If the fragmentation on this metaslab
2624          * has exceed our threshold, then don't mark it active.
2625          */
2626         if (msp->ms_loaded && msp->ms_fragmentation != ZFS_FRAG_INVALID &&
2627             msp->ms_fragmentation <= zfs_metaslab_fragmentation_threshold) {
2628                 weight |= (msp->ms_weight & METASLAB_ACTIVE_MASK);
2629         }
2630
2631         WEIGHT_SET_SPACEBASED(weight);
2632         return (weight);
2633 }
2634
2635 /*
2636  * Return the weight of the specified metaslab, according to the segment-based
2637  * weighting algorithm. The metaslab must be loaded. This function can
2638  * be called within a sync pass since it relies only on the metaslab's
2639  * range tree which is always accurate when the metaslab is loaded.
2640  */
2641 static uint64_t
2642 metaslab_weight_from_range_tree(metaslab_t *msp)
2643 {
2644         uint64_t weight = 0;
2645         uint32_t segments = 0;
2646
2647         ASSERT(msp->ms_loaded);
2648
2649         for (int i = RANGE_TREE_HISTOGRAM_SIZE - 1; i >= SPA_MINBLOCKSHIFT;
2650             i--) {
2651                 uint8_t shift = msp->ms_group->mg_vd->vdev_ashift;
2652                 int max_idx = SPACE_MAP_HISTOGRAM_SIZE + shift - 1;
2653
2654                 segments <<= 1;
2655                 segments += msp->ms_allocatable->rt_histogram[i];
2656
2657                 /*
2658                  * The range tree provides more precision than the space map
2659                  * and must be downgraded so that all values fit within the
2660                  * space map's histogram. This allows us to compare loaded
2661                  * vs. unloaded metaslabs to determine which metaslab is
2662                  * considered "best".
2663                  */
2664                 if (i > max_idx)
2665                         continue;
2666
2667                 if (segments != 0) {
2668                         WEIGHT_SET_COUNT(weight, segments);
2669                         WEIGHT_SET_INDEX(weight, i);
2670                         WEIGHT_SET_ACTIVE(weight, 0);
2671                         break;
2672                 }
2673         }
2674         return (weight);
2675 }
2676
2677 /*
2678  * Calculate the weight based on the on-disk histogram. Should be applied
2679  * only to unloaded metaslabs  (i.e no incoming allocations) in-order to
2680  * give results consistent with the on-disk state
2681  */
2682 static uint64_t
2683 metaslab_weight_from_spacemap(metaslab_t *msp)
2684 {
2685         space_map_t *sm = msp->ms_sm;
2686         ASSERT(!msp->ms_loaded);
2687         ASSERT(sm != NULL);
2688         ASSERT3U(space_map_object(sm), !=, 0);
2689         ASSERT3U(sm->sm_dbuf->db_size, ==, sizeof (space_map_phys_t));
2690
2691         /*
2692          * Create a joint histogram from all the segments that have made
2693          * it to the metaslab's space map histogram, that are not yet
2694          * available for allocation because they are still in the freeing
2695          * pipeline (e.g. freeing, freed, and defer trees). Then subtract
2696          * these segments from the space map's histogram to get a more
2697          * accurate weight.
2698          */
2699         uint64_t deferspace_histogram[SPACE_MAP_HISTOGRAM_SIZE] = {0};
2700         for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++)
2701                 deferspace_histogram[i] += msp->ms_synchist[i];
2702         for (int t = 0; t < TXG_DEFER_SIZE; t++) {
2703                 for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) {
2704                         deferspace_histogram[i] += msp->ms_deferhist[t][i];
2705                 }
2706         }
2707
2708         uint64_t weight = 0;
2709         for (int i = SPACE_MAP_HISTOGRAM_SIZE - 1; i >= 0; i--) {
2710                 ASSERT3U(sm->sm_phys->smp_histogram[i], >=,
2711                     deferspace_histogram[i]);
2712                 uint64_t count =
2713                     sm->sm_phys->smp_histogram[i] - deferspace_histogram[i];
2714                 if (count != 0) {
2715                         WEIGHT_SET_COUNT(weight, count);
2716                         WEIGHT_SET_INDEX(weight, i + sm->sm_shift);
2717                         WEIGHT_SET_ACTIVE(weight, 0);
2718                         break;
2719                 }
2720         }
2721         return (weight);
2722 }
2723
2724 /*
2725  * Compute a segment-based weight for the specified metaslab. The weight
2726  * is determined by highest bucket in the histogram. The information
2727  * for the highest bucket is encoded into the weight value.
2728  */
2729 static uint64_t
2730 metaslab_segment_weight(metaslab_t *msp)
2731 {
2732         metaslab_group_t *mg = msp->ms_group;
2733         uint64_t weight = 0;
2734         uint8_t shift = mg->mg_vd->vdev_ashift;
2735
2736         ASSERT(MUTEX_HELD(&msp->ms_lock));
2737
2738         /*
2739          * The metaslab is completely free.
2740          */
2741         if (metaslab_allocated_space(msp) == 0) {
2742                 int idx = highbit64(msp->ms_size) - 1;
2743                 int max_idx = SPACE_MAP_HISTOGRAM_SIZE + shift - 1;
2744
2745                 if (idx < max_idx) {
2746                         WEIGHT_SET_COUNT(weight, 1ULL);
2747                         WEIGHT_SET_INDEX(weight, idx);
2748                 } else {
2749                         WEIGHT_SET_COUNT(weight, 1ULL << (idx - max_idx));
2750                         WEIGHT_SET_INDEX(weight, max_idx);
2751                 }
2752                 WEIGHT_SET_ACTIVE(weight, 0);
2753                 ASSERT(!WEIGHT_IS_SPACEBASED(weight));
2754                 return (weight);
2755         }
2756
2757         ASSERT3U(msp->ms_sm->sm_dbuf->db_size, ==, sizeof (space_map_phys_t));
2758
2759         /*
2760          * If the metaslab is fully allocated then just make the weight 0.
2761          */
2762         if (metaslab_allocated_space(msp) == msp->ms_size)
2763                 return (0);
2764         /*
2765          * If the metaslab is already loaded, then use the range tree to
2766          * determine the weight. Otherwise, we rely on the space map information
2767          * to generate the weight.
2768          */
2769         if (msp->ms_loaded) {
2770                 weight = metaslab_weight_from_range_tree(msp);
2771         } else {
2772                 weight = metaslab_weight_from_spacemap(msp);
2773         }
2774
2775         /*
2776          * If the metaslab was active the last time we calculated its weight
2777          * then keep it active. We want to consume the entire region that
2778          * is associated with this weight.
2779          */
2780         if (msp->ms_activation_weight != 0 && weight != 0)
2781                 WEIGHT_SET_ACTIVE(weight, WEIGHT_GET_ACTIVE(msp->ms_weight));
2782         return (weight);
2783 }
2784
2785 /*
2786  * Determine if we should attempt to allocate from this metaslab. If the
2787  * metaslab is loaded, then we can determine if the desired allocation
2788  * can be satisfied by looking at the size of the maximum free segment
2789  * on that metaslab. Otherwise, we make our decision based on the metaslab's
2790  * weight. For segment-based weighting we can determine the maximum
2791  * allocation based on the index encoded in its value. For space-based
2792  * weights we rely on the entire weight (excluding the weight-type bit).
2793  */
2794 boolean_t
2795 metaslab_should_allocate(metaslab_t *msp, uint64_t asize, boolean_t try_hard)
2796 {
2797         /*
2798          * If the metaslab is loaded, ms_max_size is definitive and we can use
2799          * the fast check. If it's not, the ms_max_size is a lower bound (once
2800          * set), and we should use the fast check as long as we're not in
2801          * try_hard and it's been less than zfs_metaslab_max_size_cache_sec
2802          * seconds since the metaslab was unloaded.
2803          */
2804         if (msp->ms_loaded ||
2805             (msp->ms_max_size != 0 && !try_hard && gethrtime() <
2806             msp->ms_unload_time + SEC2NSEC(zfs_metaslab_max_size_cache_sec)))
2807                 return (msp->ms_max_size >= asize);
2808
2809         boolean_t should_allocate;
2810         if (!WEIGHT_IS_SPACEBASED(msp->ms_weight)) {
2811                 /*
2812                  * The metaslab segment weight indicates segments in the
2813                  * range [2^i, 2^(i+1)), where i is the index in the weight.
2814                  * Since the asize might be in the middle of the range, we
2815                  * should attempt the allocation if asize < 2^(i+1).
2816                  */
2817                 should_allocate = (asize <
2818                     1ULL << (WEIGHT_GET_INDEX(msp->ms_weight) + 1));
2819         } else {
2820                 should_allocate = (asize <=
2821                     (msp->ms_weight & ~METASLAB_WEIGHT_TYPE));
2822         }
2823
2824         return (should_allocate);
2825 }
2826 static uint64_t
2827 metaslab_weight(metaslab_t *msp)
2828 {
2829         vdev_t *vd = msp->ms_group->mg_vd;
2830         spa_t *spa = vd->vdev_spa;
2831         uint64_t weight;
2832
2833         ASSERT(MUTEX_HELD(&msp->ms_lock));
2834
2835         /*
2836          * If this vdev is in the process of being removed, there is nothing
2837          * for us to do here.
2838          */
2839         if (vd->vdev_removing)
2840                 return (0);
2841
2842         metaslab_set_fragmentation(msp);
2843
2844         /*
2845          * Update the maximum size. If the metaslab is loaded, this will
2846          * ensure that we get an accurate maximum size if newly freed space
2847          * has been added back into the free tree. If the metaslab is
2848          * unloaded, we check if there's a larger free segment in the
2849          * unflushed frees. This is a lower bound on the largest allocatable
2850          * segment size. Coalescing of adjacent entries may reveal larger
2851          * allocatable segments, but we aren't aware of those until loading
2852          * the space map into a range tree.
2853          */
2854         if (msp->ms_loaded) {
2855                 msp->ms_max_size = metaslab_largest_allocatable(msp);
2856         } else {
2857                 msp->ms_max_size = MAX(msp->ms_max_size,
2858                     metaslab_largest_unflushed_free(msp));
2859         }
2860
2861         /*
2862          * Segment-based weighting requires space map histogram support.
2863          */
2864         if (zfs_metaslab_segment_weight_enabled &&
2865             spa_feature_is_enabled(spa, SPA_FEATURE_SPACEMAP_HISTOGRAM) &&
2866             (msp->ms_sm == NULL || msp->ms_sm->sm_dbuf->db_size ==
2867             sizeof (space_map_phys_t))) {
2868                 weight = metaslab_segment_weight(msp);
2869         } else {
2870                 weight = metaslab_space_weight(msp);
2871         }
2872         return (weight);
2873 }
2874
2875 void
2876 metaslab_recalculate_weight_and_sort(metaslab_t *msp)
2877 {
2878         ASSERT(MUTEX_HELD(&msp->ms_lock));
2879
2880         /* note: we preserve the mask (e.g. indication of primary, etc..) */
2881         uint64_t was_active = msp->ms_weight & METASLAB_ACTIVE_MASK;
2882         metaslab_group_sort(msp->ms_group, msp,
2883             metaslab_weight(msp) | was_active);
2884 }
2885
2886 static int
2887 metaslab_activate_allocator(metaslab_group_t *mg, metaslab_t *msp,
2888     int allocator, uint64_t activation_weight)
2889 {
2890         ASSERT(MUTEX_HELD(&msp->ms_lock));
2891
2892         /*
2893          * If we're activating for the claim code, we don't want to actually
2894          * set the metaslab up for a specific allocator.
2895          */
2896         if (activation_weight == METASLAB_WEIGHT_CLAIM) {
2897                 ASSERT0(msp->ms_activation_weight);
2898                 msp->ms_activation_weight = msp->ms_weight;
2899                 metaslab_group_sort(mg, msp, msp->ms_weight |
2900                     activation_weight);
2901                 return (0);
2902         }
2903
2904         metaslab_t **arr = (activation_weight == METASLAB_WEIGHT_PRIMARY ?
2905             mg->mg_primaries : mg->mg_secondaries);
2906
2907         mutex_enter(&mg->mg_lock);
2908         if (arr[allocator] != NULL) {
2909                 mutex_exit(&mg->mg_lock);
2910                 return (EEXIST);
2911         }
2912
2913         arr[allocator] = msp;
2914         ASSERT3S(msp->ms_allocator, ==, -1);
2915         msp->ms_allocator = allocator;
2916         msp->ms_primary = (activation_weight == METASLAB_WEIGHT_PRIMARY);
2917
2918         ASSERT0(msp->ms_activation_weight);
2919         msp->ms_activation_weight = msp->ms_weight;
2920         metaslab_group_sort_impl(mg, msp,
2921             msp->ms_weight | activation_weight);
2922
2923         mutex_exit(&mg->mg_lock);
2924
2925         return (0);
2926 }
2927
2928 static int
2929 metaslab_activate(metaslab_t *msp, int allocator, uint64_t activation_weight)
2930 {
2931         ASSERT(MUTEX_HELD(&msp->ms_lock));
2932
2933         /*
2934          * The current metaslab is already activated for us so there
2935          * is nothing to do. Already activated though, doesn't mean
2936          * that this metaslab is activated for our allocator nor our
2937          * requested activation weight. The metaslab could have started
2938          * as an active one for our allocator but changed allocators
2939          * while we were waiting to grab its ms_lock or we stole it
2940          * [see find_valid_metaslab()]. This means that there is a
2941          * possibility of passivating a metaslab of another allocator
2942          * or from a different activation mask, from this thread.
2943          */
2944         if ((msp->ms_weight & METASLAB_ACTIVE_MASK) != 0) {
2945                 ASSERT(msp->ms_loaded);
2946                 return (0);
2947         }
2948
2949         int error = metaslab_load(msp);
2950         if (error != 0) {
2951                 metaslab_group_sort(msp->ms_group, msp, 0);
2952                 return (error);
2953         }
2954
2955         /*
2956          * When entering metaslab_load() we may have dropped the
2957          * ms_lock because we were loading this metaslab, or we
2958          * were waiting for another thread to load it for us. In
2959          * that scenario, we recheck the weight of the metaslab
2960          * to see if it was activated by another thread.
2961          *
2962          * If the metaslab was activated for another allocator or
2963          * it was activated with a different activation weight (e.g.
2964          * we wanted to make it a primary but it was activated as
2965          * secondary) we return error (EBUSY).
2966          *
2967          * If the metaslab was activated for the same allocator
2968          * and requested activation mask, skip activating it.
2969          */
2970         if ((msp->ms_weight & METASLAB_ACTIVE_MASK) != 0) {
2971                 if (msp->ms_allocator != allocator)
2972                         return (EBUSY);
2973
2974                 if ((msp->ms_weight & activation_weight) == 0)
2975                         return (SET_ERROR(EBUSY));
2976
2977                 EQUIV((activation_weight == METASLAB_WEIGHT_PRIMARY),
2978                     msp->ms_primary);
2979                 return (0);
2980         }
2981
2982         /*
2983          * If the metaslab has literally 0 space, it will have weight 0. In
2984          * that case, don't bother activating it. This can happen if the
2985          * metaslab had space during find_valid_metaslab, but another thread
2986          * loaded it and used all that space while we were waiting to grab the
2987          * lock.
2988          */
2989         if (msp->ms_weight == 0) {
2990                 ASSERT0(range_tree_space(msp->ms_allocatable));
2991                 return (SET_ERROR(ENOSPC));
2992         }
2993
2994         if ((error = metaslab_activate_allocator(msp->ms_group, msp,
2995             allocator, activation_weight)) != 0) {
2996                 return (error);
2997         }
2998
2999         ASSERT(msp->ms_loaded);
3000         ASSERT(msp->ms_weight & METASLAB_ACTIVE_MASK);
3001
3002         return (0);
3003 }
3004
3005 static void
3006 metaslab_passivate_allocator(metaslab_group_t *mg, metaslab_t *msp,
3007     uint64_t weight)
3008 {
3009         ASSERT(MUTEX_HELD(&msp->ms_lock));
3010         ASSERT(msp->ms_loaded);
3011
3012         if (msp->ms_weight & METASLAB_WEIGHT_CLAIM) {
3013                 metaslab_group_sort(mg, msp, weight);
3014                 return;
3015         }
3016
3017         mutex_enter(&mg->mg_lock);
3018         ASSERT3P(msp->ms_group, ==, mg);
3019         ASSERT3S(0, <=, msp->ms_allocator);
3020         ASSERT3U(msp->ms_allocator, <, mg->mg_allocators);
3021
3022         if (msp->ms_primary) {
3023                 ASSERT3P(mg->mg_primaries[msp->ms_allocator], ==, msp);
3024                 ASSERT(msp->ms_weight & METASLAB_WEIGHT_PRIMARY);
3025                 mg->mg_primaries[msp->ms_allocator] = NULL;
3026         } else {
3027                 ASSERT3P(mg->mg_secondaries[msp->ms_allocator], ==, msp);
3028                 ASSERT(msp->ms_weight & METASLAB_WEIGHT_SECONDARY);
3029                 mg->mg_secondaries[msp->ms_allocator] = NULL;
3030         }
3031         msp->ms_allocator = -1;
3032         metaslab_group_sort_impl(mg, msp, weight);
3033         mutex_exit(&mg->mg_lock);
3034 }
3035
3036 static void
3037 metaslab_passivate(metaslab_t *msp, uint64_t weight)
3038 {
3039         ASSERTV(uint64_t size = weight & ~METASLAB_WEIGHT_TYPE);
3040
3041         /*
3042          * If size < SPA_MINBLOCKSIZE, then we will not allocate from
3043          * this metaslab again.  In that case, it had better be empty,
3044          * or we would be leaving space on the table.
3045          */
3046         ASSERT(!WEIGHT_IS_SPACEBASED(msp->ms_weight) ||
3047             size >= SPA_MINBLOCKSIZE ||
3048             range_tree_space(msp->ms_allocatable) == 0);
3049         ASSERT0(weight & METASLAB_ACTIVE_MASK);
3050
3051         ASSERT(msp->ms_activation_weight != 0);
3052         msp->ms_activation_weight = 0;
3053         metaslab_passivate_allocator(msp->ms_group, msp, weight);
3054         ASSERT0(msp->ms_weight & METASLAB_ACTIVE_MASK);
3055 }
3056
3057 /*
3058  * Segment-based metaslabs are activated once and remain active until
3059  * we either fail an allocation attempt (similar to space-based metaslabs)
3060  * or have exhausted the free space in zfs_metaslab_switch_threshold
3061  * buckets since the metaslab was activated. This function checks to see
3062  * if we've exhaused the zfs_metaslab_switch_threshold buckets in the
3063  * metaslab and passivates it proactively. This will allow us to select a
3064  * metaslab with a larger contiguous region, if any, remaining within this
3065  * metaslab group. If we're in sync pass > 1, then we continue using this
3066  * metaslab so that we don't dirty more block and cause more sync passes.
3067  */
3068 void
3069 metaslab_segment_may_passivate(metaslab_t *msp)
3070 {
3071         spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
3072
3073         if (WEIGHT_IS_SPACEBASED(msp->ms_weight) || spa_sync_pass(spa) > 1)
3074                 return;
3075
3076         /*
3077          * Since we are in the middle of a sync pass, the most accurate
3078          * information that is accessible to us is the in-core range tree
3079          * histogram; calculate the new weight based on that information.
3080          */
3081         uint64_t weight = metaslab_weight_from_range_tree(msp);
3082         int activation_idx = WEIGHT_GET_INDEX(msp->ms_activation_weight);
3083         int current_idx = WEIGHT_GET_INDEX(weight);
3084
3085         if (current_idx <= activation_idx - zfs_metaslab_switch_threshold)
3086                 metaslab_passivate(msp, weight);
3087 }
3088
3089 static void
3090 metaslab_preload(void *arg)
3091 {
3092         metaslab_t *msp = arg;
3093         metaslab_class_t *mc = msp->ms_group->mg_class;
3094         spa_t *spa = mc->mc_spa;
3095         fstrans_cookie_t cookie = spl_fstrans_mark();
3096
3097         ASSERT(!MUTEX_HELD(&msp->ms_group->mg_lock));
3098
3099         mutex_enter(&msp->ms_lock);
3100         (void) metaslab_load(msp);
3101         metaslab_set_selected_txg(msp, spa_syncing_txg(spa));
3102         mutex_exit(&msp->ms_lock);
3103         spl_fstrans_unmark(cookie);
3104 }
3105
3106 static void
3107 metaslab_group_preload(metaslab_group_t *mg)
3108 {
3109         spa_t *spa = mg->mg_vd->vdev_spa;
3110         metaslab_t *msp;
3111         avl_tree_t *t = &mg->mg_metaslab_tree;
3112         int m = 0;
3113
3114         if (spa_shutting_down(spa) || !metaslab_preload_enabled) {
3115                 taskq_wait_outstanding(mg->mg_taskq, 0);
3116                 return;
3117         }
3118
3119         mutex_enter(&mg->mg_lock);
3120
3121         /*
3122          * Load the next potential metaslabs
3123          */
3124         for (msp = avl_first(t); msp != NULL; msp = AVL_NEXT(t, msp)) {
3125                 ASSERT3P(msp->ms_group, ==, mg);
3126
3127                 /*
3128                  * We preload only the maximum number of metaslabs specified
3129                  * by metaslab_preload_limit. If a metaslab is being forced
3130                  * to condense then we preload it too. This will ensure
3131                  * that force condensing happens in the next txg.
3132                  */
3133                 if (++m > metaslab_preload_limit && !msp->ms_condense_wanted) {
3134                         continue;
3135                 }
3136
3137                 VERIFY(taskq_dispatch(mg->mg_taskq, metaslab_preload,
3138                     msp, TQ_SLEEP) != TASKQID_INVALID);
3139         }
3140         mutex_exit(&mg->mg_lock);
3141 }
3142
3143 /*
3144  * Determine if the space map's on-disk footprint is past our tolerance for
3145  * inefficiency. We would like to use the following criteria to make our
3146  * decision:
3147  *
3148  * 1. Do not condense if the size of the space map object would dramatically
3149  *    increase as a result of writing out the free space range tree.
3150  *
3151  * 2. Condense if the on on-disk space map representation is at least
3152  *    zfs_condense_pct/100 times the size of the optimal representation
3153  *    (i.e. zfs_condense_pct = 110 and in-core = 1MB, optimal = 1.1MB).
3154  *
3155  * 3. Do not condense if the on-disk size of the space map does not actually
3156  *    decrease.
3157  *
3158  * Unfortunately, we cannot compute the on-disk size of the space map in this
3159  * context because we cannot accurately compute the effects of compression, etc.
3160  * Instead, we apply the heuristic described in the block comment for
3161  * zfs_metaslab_condense_block_threshold - we only condense if the space used
3162  * is greater than a threshold number of blocks.
3163  */
3164 static boolean_t
3165 metaslab_should_condense(metaslab_t *msp)
3166 {
3167         space_map_t *sm = msp->ms_sm;
3168         vdev_t *vd = msp->ms_group->mg_vd;
3169         uint64_t vdev_blocksize = 1 << vd->vdev_ashift;
3170
3171         ASSERT(MUTEX_HELD(&msp->ms_lock));
3172         ASSERT(msp->ms_loaded);
3173         ASSERT(sm != NULL);
3174         ASSERT3U(spa_sync_pass(vd->vdev_spa), ==, 1);
3175
3176         /*
3177          * We always condense metaslabs that are empty and metaslabs for
3178          * which a condense request has been made.
3179          */
3180         if (avl_is_empty(&msp->ms_allocatable_by_size) ||
3181             msp->ms_condense_wanted)
3182                 return (B_TRUE);
3183
3184         uint64_t record_size = MAX(sm->sm_blksz, vdev_blocksize);
3185         uint64_t object_size = space_map_length(sm);
3186         uint64_t optimal_size = space_map_estimate_optimal_size(sm,
3187             msp->ms_allocatable, SM_NO_VDEVID);
3188
3189         return (object_size >= (optimal_size * zfs_condense_pct / 100) &&
3190             object_size > zfs_metaslab_condense_block_threshold * record_size);
3191 }
3192
3193 /*
3194  * Condense the on-disk space map representation to its minimized form.
3195  * The minimized form consists of a small number of allocations followed
3196  * by the entries of the free range tree (ms_allocatable). The condensed
3197  * spacemap contains all the entries of previous TXGs (including those in
3198  * the pool-wide log spacemaps; thus this is effectively a superset of
3199  * metaslab_flush()), but this TXG's entries still need to be written.
3200  */
3201 static void
3202 metaslab_condense(metaslab_t *msp, dmu_tx_t *tx)
3203 {
3204         range_tree_t *condense_tree;
3205         space_map_t *sm = msp->ms_sm;
3206         uint64_t txg = dmu_tx_get_txg(tx);
3207         spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
3208
3209         ASSERT(MUTEX_HELD(&msp->ms_lock));
3210         ASSERT(msp->ms_loaded);
3211         ASSERT(msp->ms_sm != NULL);
3212
3213         /*
3214          * In order to condense the space map, we need to change it so it
3215          * only describes which segments are currently allocated and free.
3216          *
3217          * All the current free space resides in the ms_allocatable, all
3218          * the ms_defer trees, and all the ms_allocating trees. We ignore
3219          * ms_freed because it is empty because we're in sync pass 1. We
3220          * ignore ms_freeing because these changes are not yet reflected
3221          * in the spacemap (they will be written later this txg).
3222          *
3223          * So to truncate the space map to represent all the entries of
3224          * previous TXGs we do the following:
3225          *
3226          * 1] We create a range tree (condense tree) that is 100% allocated.
3227          * 2] We remove from it all segments found in the ms_defer trees
3228          *    as those segments are marked as free in the original space
3229          *    map. We do the same with the ms_allocating trees for the same
3230          *    reason. Removing these segments should be a relatively
3231          *    inexpensive operation since we expect these trees to have a
3232          *    small number of nodes.
3233          * 3] We vacate any unflushed allocs as they should already exist
3234          *    in the condense tree. Then we vacate any unflushed frees as
3235          *    they should already be part of ms_allocatable.
3236          * 4] At this point, we would ideally like to remove all segments
3237          *    in the ms_allocatable tree from the condense tree. This way
3238          *    we would write all the entries of the condense tree as the
3239          *    condensed space map, which would only contain allocated
3240          *    segments with everything else assumed to be freed.
3241          *
3242          *    Doing so can be prohibitively expensive as ms_allocatable can
3243          *    be large, and therefore computationally expensive to subtract
3244          *    from the condense_tree. Instead we first sync out the
3245          *    condense_tree and then the ms_allocatable, in the condensed
3246          *    space map. While this is not optimal, it is typically close to
3247          *    optimal and more importantly much cheaper to compute.
3248          *
3249          * 5] Finally, as both of the unflushed trees were written to our
3250          *    new and condensed metaslab space map, we basically flushed
3251          *    all the unflushed changes to disk, thus we call
3252          *    metaslab_flush_update().
3253          */
3254         ASSERT3U(spa_sync_pass(spa), ==, 1);
3255         ASSERT(range_tree_is_empty(msp->ms_freed)); /* since it is pass 1 */
3256
3257         zfs_dbgmsg("condensing: txg %llu, msp[%llu] %px, vdev id %llu, "
3258             "spa %s, smp size %llu, segments %lu, forcing condense=%s", txg,
3259             msp->ms_id, msp, msp->ms_group->mg_vd->vdev_id,
3260             spa->spa_name, space_map_length(msp->ms_sm),
3261             avl_numnodes(&msp->ms_allocatable->rt_root),
3262             msp->ms_condense_wanted ? "TRUE" : "FALSE");
3263
3264         msp->ms_condense_wanted = B_FALSE;
3265
3266         condense_tree = range_tree_create(NULL, NULL);
3267         range_tree_add(condense_tree, msp->ms_start, msp->ms_size);
3268
3269         for (int t = 0; t < TXG_DEFER_SIZE; t++) {
3270                 range_tree_walk(msp->ms_defer[t],
3271                     range_tree_remove, condense_tree);
3272         }
3273
3274         for (int t = 0; t < TXG_CONCURRENT_STATES; t++) {
3275                 range_tree_walk(msp->ms_allocating[(txg + t) & TXG_MASK],
3276                     range_tree_remove, condense_tree);
3277         }
3278
3279         ASSERT3U(spa->spa_unflushed_stats.sus_memused, >=,
3280             metaslab_unflushed_changes_memused(msp));
3281         spa->spa_unflushed_stats.sus_memused -=
3282             metaslab_unflushed_changes_memused(msp);
3283         range_tree_vacate(msp->ms_unflushed_allocs, NULL, NULL);
3284         range_tree_vacate(msp->ms_unflushed_frees, NULL, NULL);
3285
3286         /*
3287          * We're about to drop the metaslab's lock thus allowing other
3288          * consumers to change it's content. Set the metaslab's ms_condensing
3289          * flag to ensure that allocations on this metaslab do not occur
3290          * while we're in the middle of committing it to disk. This is only
3291          * critical for ms_allocatable as all other range trees use per TXG
3292          * views of their content.
3293          */
3294         msp->ms_condensing = B_TRUE;
3295
3296         mutex_exit(&msp->ms_lock);
3297         uint64_t object = space_map_object(msp->ms_sm);
3298         space_map_truncate(sm,
3299             spa_feature_is_enabled(spa, SPA_FEATURE_LOG_SPACEMAP) ?
3300             zfs_metaslab_sm_blksz_with_log : zfs_metaslab_sm_blksz_no_log, tx);
3301
3302         /*
3303          * space_map_truncate() may have reallocated the spacemap object.
3304          * If so, update the vdev_ms_array.
3305          */
3306         if (space_map_object(msp->ms_sm) != object) {
3307                 object = space_map_object(msp->ms_sm);
3308                 dmu_write(spa->spa_meta_objset,
3309                     msp->ms_group->mg_vd->vdev_ms_array, sizeof (uint64_t) *
3310                     msp->ms_id, sizeof (uint64_t), &object, tx);
3311         }
3312
3313         /*
3314          * Note:
3315          * When the log space map feature is enabled, each space map will
3316          * always have ALLOCS followed by FREES for each sync pass. This is
3317          * typically true even when the log space map feature is disabled,
3318          * except from the case where a metaslab goes through metaslab_sync()
3319          * and gets condensed. In that case the metaslab's space map will have
3320          * ALLOCS followed by FREES (due to condensing) followed by ALLOCS
3321          * followed by FREES (due to space_map_write() in metaslab_sync()) for
3322          * sync pass 1.
3323          */
3324         space_map_write(sm, condense_tree, SM_ALLOC, SM_NO_VDEVID, tx);
3325         space_map_write(sm, msp->ms_allocatable, SM_FREE, SM_NO_VDEVID, tx);
3326
3327         range_tree_vacate(condense_tree, NULL, NULL);
3328         range_tree_destroy(condense_tree);
3329         mutex_enter(&msp->ms_lock);
3330
3331         msp->ms_condensing = B_FALSE;
3332         metaslab_flush_update(msp, tx);
3333 }
3334
3335 /*
3336  * Called when the metaslab has been flushed (its own spacemap now reflects
3337  * all the contents of the pool-wide spacemap log). Updates the metaslab's
3338  * metadata and any pool-wide related log space map data (e.g. summary,
3339  * obsolete logs, etc..) to reflect that.
3340  */
3341 static void
3342 metaslab_flush_update(metaslab_t *msp, dmu_tx_t *tx)
3343 {
3344         metaslab_group_t *mg = msp->ms_group;
3345         spa_t *spa = mg->mg_vd->vdev_spa;
3346
3347         ASSERT(MUTEX_HELD(&msp->ms_lock));
3348
3349         ASSERT3U(spa_sync_pass(spa), ==, 1);
3350         ASSERT(range_tree_is_empty(msp->ms_unflushed_allocs));
3351         ASSERT(range_tree_is_empty(msp->ms_unflushed_frees));
3352
3353         /*
3354          * Just because a metaslab got flushed, that doesn't mean that
3355          * it will pass through metaslab_sync_done(). Thus, make sure to
3356          * update ms_synced_length here in case it doesn't.
3357          */
3358         msp->ms_synced_length = space_map_length(msp->ms_sm);
3359
3360         /*
3361          * We may end up here from metaslab_condense() without the
3362          * feature being active. In that case this is a no-op.
3363          */
3364         if (!spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP))
3365                 return;
3366
3367         ASSERT(spa_syncing_log_sm(spa) != NULL);
3368         ASSERT(msp->ms_sm != NULL);
3369         ASSERT(metaslab_unflushed_txg(msp) != 0);
3370         ASSERT3P(avl_find(&spa->spa_metaslabs_by_flushed, msp, NULL), ==, msp);
3371
3372         VERIFY3U(tx->tx_txg, <=, spa_final_dirty_txg(spa));
3373
3374         /* update metaslab's position in our flushing tree */
3375         uint64_t ms_prev_flushed_txg = metaslab_unflushed_txg(msp);
3376         mutex_enter(&spa->spa_flushed_ms_lock);
3377         avl_remove(&spa->spa_metaslabs_by_flushed, msp);
3378         metaslab_set_unflushed_txg(msp, spa_syncing_txg(spa), tx);
3379         avl_add(&spa->spa_metaslabs_by_flushed, msp);
3380         mutex_exit(&spa->spa_flushed_ms_lock);
3381
3382         /* update metaslab counts of spa_log_sm_t nodes */
3383         spa_log_sm_decrement_mscount(spa, ms_prev_flushed_txg);
3384         spa_log_sm_increment_current_mscount(spa);
3385
3386         /* cleanup obsolete logs if any */
3387         uint64_t log_blocks_before = spa_log_sm_nblocks(spa);
3388         spa_cleanup_old_sm_logs(spa, tx);
3389         uint64_t log_blocks_after = spa_log_sm_nblocks(spa);
3390         VERIFY3U(log_blocks_after, <=, log_blocks_before);
3391
3392         /* update log space map summary */
3393         uint64_t blocks_gone = log_blocks_before - log_blocks_after;
3394         spa_log_summary_add_flushed_metaslab(spa);
3395         spa_log_summary_decrement_mscount(spa, ms_prev_flushed_txg);
3396         spa_log_summary_decrement_blkcount(spa, blocks_gone);
3397 }
3398
3399 boolean_t
3400 metaslab_flush(metaslab_t *msp, dmu_tx_t *tx)
3401 {
3402         spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
3403
3404         ASSERT(MUTEX_HELD(&msp->ms_lock));
3405         ASSERT3U(spa_sync_pass(spa), ==, 1);
3406         ASSERT(spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP));
3407
3408         ASSERT(msp->ms_sm != NULL);
3409         ASSERT(metaslab_unflushed_txg(msp) != 0);
3410         ASSERT(avl_find(&spa->spa_metaslabs_by_flushed, msp, NULL) != NULL);
3411
3412         /*
3413          * There is nothing wrong with flushing the same metaslab twice, as
3414          * this codepath should work on that case. However, the current
3415          * flushing scheme makes sure to avoid this situation as we would be
3416          * making all these calls without having anything meaningful to write
3417          * to disk. We assert this behavior here.
3418          */
3419         ASSERT3U(metaslab_unflushed_txg(msp), <, dmu_tx_get_txg(tx));
3420
3421         /*
3422          * We can not flush while loading, because then we would
3423          * not load the ms_unflushed_{allocs,frees}.
3424          */
3425         if (msp->ms_loading)
3426                 return (B_FALSE);
3427
3428         metaslab_verify_space(msp, dmu_tx_get_txg(tx));
3429         metaslab_verify_weight_and_frag(msp);
3430
3431         /*
3432          * Metaslab condensing is effectively flushing. Therefore if the
3433          * metaslab can be condensed we can just condense it instead of
3434          * flushing it.
3435          *
3436          * Note that metaslab_condense() does call metaslab_flush_update()
3437          * so we can just return immediately after condensing. We also
3438          * don't need to care about setting ms_flushing or broadcasting
3439          * ms_flush_cv, even if we temporarily drop the ms_lock in
3440          * metaslab_condense(), as the metaslab is already loaded.
3441          */
3442         if (msp->ms_loaded && metaslab_should_condense(msp)) {
3443                 metaslab_group_t *mg = msp->ms_group;
3444
3445                 /*
3446                  * For all histogram operations below refer to the
3447                  * comments of metaslab_sync() where we follow a
3448                  * similar procedure.
3449                  */
3450                 metaslab_group_histogram_verify(mg);
3451                 metaslab_class_histogram_verify(mg->mg_class);
3452                 metaslab_group_histogram_remove(mg, msp);
3453
3454                 metaslab_condense(msp, tx);
3455
3456                 space_map_histogram_clear(msp->ms_sm);
3457                 space_map_histogram_add(msp->ms_sm, msp->ms_allocatable, tx);
3458                 ASSERT(range_tree_is_empty(msp->ms_freed));
3459                 for (int t = 0; t < TXG_DEFER_SIZE; t++) {
3460                         space_map_histogram_add(msp->ms_sm,
3461                             msp->ms_defer[t], tx);
3462                 }
3463                 metaslab_aux_histograms_update(msp);
3464
3465                 metaslab_group_histogram_add(mg, msp);
3466                 metaslab_group_histogram_verify(mg);
3467                 metaslab_class_histogram_verify(mg->mg_class);
3468
3469                 metaslab_verify_space(msp, dmu_tx_get_txg(tx));
3470
3471                 /*
3472                  * Since we recreated the histogram (and potentially
3473                  * the ms_sm too while condensing) ensure that the
3474                  * weight is updated too because we are not guaranteed
3475                  * that this metaslab is dirty and will go through
3476                  * metaslab_sync_done().
3477                  */
3478                 metaslab_recalculate_weight_and_sort(msp);
3479                 return (B_TRUE);
3480         }
3481
3482         msp->ms_flushing = B_TRUE;
3483         uint64_t sm_len_before = space_map_length(msp->ms_sm);
3484
3485         mutex_exit(&msp->ms_lock);
3486         space_map_write(msp->ms_sm, msp->ms_unflushed_allocs, SM_ALLOC,
3487             SM_NO_VDEVID, tx);
3488         space_map_write(msp->ms_sm, msp->ms_unflushed_frees, SM_FREE,
3489             SM_NO_VDEVID, tx);
3490         mutex_enter(&msp->ms_lock);
3491
3492         uint64_t sm_len_after = space_map_length(msp->ms_sm);
3493         if (zfs_flags & ZFS_DEBUG_LOG_SPACEMAP) {
3494                 zfs_dbgmsg("flushing: txg %llu, spa %s, vdev_id %llu, "
3495                     "ms_id %llu, unflushed_allocs %llu, unflushed_frees %llu, "
3496                     "appended %llu bytes", dmu_tx_get_txg(tx), spa_name(spa),
3497                     msp->ms_group->mg_vd->vdev_id, msp->ms_id,
3498                     range_tree_space(msp->ms_unflushed_allocs),
3499                     range_tree_space(msp->ms_unflushed_frees),
3500                     (sm_len_after - sm_len_before));
3501         }
3502
3503         ASSERT3U(spa->spa_unflushed_stats.sus_memused, >=,
3504             metaslab_unflushed_changes_memused(msp));
3505         spa->spa_unflushed_stats.sus_memused -=
3506             metaslab_unflushed_changes_memused(msp);
3507         range_tree_vacate(msp->ms_unflushed_allocs, NULL, NULL);
3508         range_tree_vacate(msp->ms_unflushed_frees, NULL, NULL);
3509
3510         metaslab_verify_space(msp, dmu_tx_get_txg(tx));
3511         metaslab_verify_weight_and_frag(msp);
3512
3513         metaslab_flush_update(msp, tx);
3514
3515         metaslab_verify_space(msp, dmu_tx_get_txg(tx));
3516         metaslab_verify_weight_and_frag(msp);
3517
3518         msp->ms_flushing = B_FALSE;
3519         cv_broadcast(&msp->ms_flush_cv);
3520         return (B_TRUE);
3521 }
3522
3523 /*
3524  * Write a metaslab to disk in the context of the specified transaction group.
3525  */
3526 void
3527 metaslab_sync(metaslab_t *msp, uint64_t txg)
3528 {
3529         metaslab_group_t *mg = msp->ms_group;
3530         vdev_t *vd = mg->mg_vd;
3531         spa_t *spa = vd->vdev_spa;
3532         objset_t *mos = spa_meta_objset(spa);
3533         range_tree_t *alloctree = msp->ms_allocating[txg & TXG_MASK];
3534         dmu_tx_t *tx;
3535
3536         ASSERT(!vd->vdev_ishole);
3537
3538         /*
3539          * This metaslab has just been added so there's no work to do now.
3540          */
3541         if (msp->ms_freeing == NULL) {
3542                 ASSERT3P(alloctree, ==, NULL);
3543                 return;
3544         }
3545
3546         ASSERT3P(alloctree, !=, NULL);
3547         ASSERT3P(msp->ms_freeing, !=, NULL);
3548         ASSERT3P(msp->ms_freed, !=, NULL);
3549         ASSERT3P(msp->ms_checkpointing, !=, NULL);
3550         ASSERT3P(msp->ms_trim, !=, NULL);
3551
3552         /*
3553          * Normally, we don't want to process a metaslab if there are no
3554          * allocations or frees to perform. However, if the metaslab is being
3555          * forced to condense and it's loaded, we need to let it through.
3556          */
3557         if (range_tree_is_empty(alloctree) &&
3558             range_tree_is_empty(msp->ms_freeing) &&
3559             range_tree_is_empty(msp->ms_checkpointing) &&
3560             !(msp->ms_loaded && msp->ms_condense_wanted))
3561                 return;
3562
3563
3564         VERIFY(txg <= spa_final_dirty_txg(spa));
3565
3566         /*
3567          * The only state that can actually be changing concurrently
3568          * with metaslab_sync() is the metaslab's ms_allocatable. No
3569          * other thread can be modifying this txg's alloc, freeing,
3570          * freed, or space_map_phys_t.  We drop ms_lock whenever we
3571          * could call into the DMU, because the DMU can call down to
3572          * us (e.g. via zio_free()) at any time.
3573          *
3574          * The spa_vdev_remove_thread() can be reading metaslab state
3575          * concurrently, and it is locked out by the ms_sync_lock.
3576          * Note that the ms_lock is insufficient for this, because it
3577          * is dropped by space_map_write().
3578          */
3579         tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg);
3580
3581         /*
3582          * Generate a log space map if one doesn't exist already.
3583          */
3584         spa_generate_syncing_log_sm(spa, tx);
3585
3586         if (msp->ms_sm == NULL) {
3587                 uint64_t new_object = space_map_alloc(mos,
3588                     spa_feature_is_enabled(spa, SPA_FEATURE_LOG_SPACEMAP) ?
3589                     zfs_metaslab_sm_blksz_with_log :
3590                     zfs_metaslab_sm_blksz_no_log, tx);
3591                 VERIFY3U(new_object, !=, 0);
3592
3593                 dmu_write(mos, vd->vdev_ms_array, sizeof (uint64_t) *
3594                     msp->ms_id, sizeof (uint64_t), &new_object, tx);
3595
3596                 VERIFY0(space_map_open(&msp->ms_sm, mos, new_object,
3597                     msp->ms_start, msp->ms_size, vd->vdev_ashift));
3598                 ASSERT(msp->ms_sm != NULL);
3599
3600                 ASSERT(range_tree_is_empty(msp->ms_unflushed_allocs));
3601                 ASSERT(range_tree_is_empty(msp->ms_unflushed_frees));
3602                 ASSERT0(metaslab_allocated_space(msp));
3603         }
3604
3605         if (metaslab_unflushed_txg(msp) == 0 &&
3606             spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)) {
3607                 ASSERT(spa_syncing_log_sm(spa) != NULL);
3608
3609                 metaslab_set_unflushed_txg(msp, spa_syncing_txg(spa), tx);
3610                 spa_log_sm_increment_current_mscount(spa);
3611                 spa_log_summary_add_flushed_metaslab(spa);
3612
3613                 ASSERT(msp->ms_sm != NULL);
3614                 mutex_enter(&spa->spa_flushed_ms_lock);
3615                 avl_add(&spa->spa_metaslabs_by_flushed, msp);
3616                 mutex_exit(&spa->spa_flushed_ms_lock);
3617
3618                 ASSERT(range_tree_is_empty(msp->ms_unflushed_allocs));
3619                 ASSERT(range_tree_is_empty(msp->ms_unflushed_frees));
3620         }
3621
3622         if (!range_tree_is_empty(msp->ms_checkpointing) &&
3623             vd->vdev_checkpoint_sm == NULL) {
3624                 ASSERT(spa_has_checkpoint(spa));
3625
3626                 uint64_t new_object = space_map_alloc(mos,
3627                     zfs_vdev_standard_sm_blksz, tx);
3628                 VERIFY3U(new_object, !=, 0);
3629
3630                 VERIFY0(space_map_open(&vd->vdev_checkpoint_sm,
3631                     mos, new_object, 0, vd->vdev_asize, vd->vdev_ashift));
3632                 ASSERT3P(vd->vdev_checkpoint_sm, !=, NULL);
3633
3634                 /*
3635                  * We save the space map object as an entry in vdev_top_zap
3636                  * so it can be retrieved when the pool is reopened after an
3637                  * export or through zdb.
3638                  */
3639                 VERIFY0(zap_add(vd->vdev_spa->spa_meta_objset,
3640                     vd->vdev_top_zap, VDEV_TOP_ZAP_POOL_CHECKPOINT_SM,
3641                     sizeof (new_object), 1, &new_object, tx));
3642         }
3643
3644         mutex_enter(&msp->ms_sync_lock);
3645         mutex_enter(&msp->ms_lock);
3646
3647         /*
3648          * Note: metaslab_condense() clears the space map's histogram.
3649          * Therefore we must verify and remove this histogram before
3650          * condensing.
3651          */
3652         metaslab_group_histogram_verify(mg);
3653         metaslab_class_histogram_verify(mg->mg_class);
3654         metaslab_group_histogram_remove(mg, msp);
3655
3656         if (spa->spa_sync_pass == 1 && msp->ms_loaded &&
3657             metaslab_should_condense(msp))
3658                 metaslab_condense(msp, tx);
3659
3660         /*
3661          * We'll be going to disk to sync our space accounting, thus we
3662          * drop the ms_lock during that time so allocations coming from
3663          * open-context (ZIL) for future TXGs do not block.
3664          */
3665         mutex_exit(&msp->ms_lock);
3666         space_map_t *log_sm = spa_syncing_log_sm(spa);
3667         if (log_sm != NULL) {
3668                 ASSERT(spa_feature_is_enabled(spa, SPA_FEATURE_LOG_SPACEMAP));
3669
3670                 space_map_write(log_sm, alloctree, SM_ALLOC,
3671                     vd->vdev_id, tx);
3672                 space_map_write(log_sm, msp->ms_freeing, SM_FREE,
3673                     vd->vdev_id, tx);
3674                 mutex_enter(&msp->ms_lock);
3675
3676                 ASSERT3U(spa->spa_unflushed_stats.sus_memused, >=,
3677                     metaslab_unflushed_changes_memused(msp));
3678                 spa->spa_unflushed_stats.sus_memused -=
3679                     metaslab_unflushed_changes_memused(msp);
3680                 range_tree_remove_xor_add(alloctree,
3681                     msp->ms_unflushed_frees, msp->ms_unflushed_allocs);
3682                 range_tree_remove_xor_add(msp->ms_freeing,
3683                     msp->ms_unflushed_allocs, msp->ms_unflushed_frees);
3684                 spa->spa_unflushed_stats.sus_memused +=
3685                     metaslab_unflushed_changes_memused(msp);
3686         } else {
3687                 ASSERT(!spa_feature_is_enabled(spa, SPA_FEATURE_LOG_SPACEMAP));
3688
3689                 space_map_write(msp->ms_sm, alloctree, SM_ALLOC,
3690                     SM_NO_VDEVID, tx);
3691                 space_map_write(msp->ms_sm, msp->ms_freeing, SM_FREE,
3692                     SM_NO_VDEVID, tx);
3693                 mutex_enter(&msp->ms_lock);
3694         }
3695
3696         msp->ms_allocated_space += range_tree_space(alloctree);
3697         ASSERT3U(msp->ms_allocated_space, >=,
3698             range_tree_space(msp->ms_freeing));
3699         msp->ms_allocated_space -= range_tree_space(msp->ms_freeing);
3700
3701         if (!range_tree_is_empty(msp->ms_checkpointing)) {
3702                 ASSERT(spa_has_checkpoint(spa));
3703                 ASSERT3P(vd->vdev_checkpoint_sm, !=, NULL);
3704
3705                 /*
3706                  * Since we are doing writes to disk and the ms_checkpointing
3707                  * tree won't be changing during that time, we drop the
3708                  * ms_lock while writing to the checkpoint space map, for the
3709                  * same reason mentioned above.
3710                  */
3711                 mutex_exit(&msp->ms_lock);
3712                 space_map_write(vd->vdev_checkpoint_sm,
3713                     msp->ms_checkpointing, SM_FREE, SM_NO_VDEVID, tx);
3714                 mutex_enter(&msp->ms_lock);
3715
3716                 spa->spa_checkpoint_info.sci_dspace +=
3717                     range_tree_space(msp->ms_checkpointing);
3718                 vd->vdev_stat.vs_checkpoint_space +=
3719                     range_tree_space(msp->ms_checkpointing);
3720                 ASSERT3U(vd->vdev_stat.vs_checkpoint_space, ==,
3721                     -space_map_allocated(vd->vdev_checkpoint_sm));
3722
3723                 range_tree_vacate(msp->ms_checkpointing, NULL, NULL);
3724         }
3725
3726         if (msp->ms_loaded) {
3727                 /*
3728                  * When the space map is loaded, we have an accurate
3729                  * histogram in the range tree. This gives us an opportunity
3730                  * to bring the space map's histogram up-to-date so we clear
3731                  * it first before updating it.
3732                  */
3733                 space_map_histogram_clear(msp->ms_sm);
3734                 space_map_histogram_add(msp->ms_sm, msp->ms_allocatable, tx);
3735
3736                 /*
3737                  * Since we've cleared the histogram we need to add back
3738                  * any free space that has already been processed, plus
3739                  * any deferred space. This allows the on-disk histogram
3740                  * to accurately reflect all free space even if some space
3741                  * is not yet available for allocation (i.e. deferred).
3742                  */
3743                 space_map_histogram_add(msp->ms_sm, msp->ms_freed, tx);
3744
3745                 /*
3746                  * Add back any deferred free space that has not been
3747                  * added back into the in-core free tree yet. This will
3748                  * ensure that we don't end up with a space map histogram
3749                  * that is completely empty unless the metaslab is fully
3750                  * allocated.
3751                  */
3752                 for (int t = 0; t < TXG_DEFER_SIZE; t++) {
3753                         space_map_histogram_add(msp->ms_sm,
3754                             msp->ms_defer[t], tx);
3755                 }
3756         }
3757
3758         /*
3759          * Always add the free space from this sync pass to the space
3760          * map histogram. We want to make sure that the on-disk histogram
3761          * accounts for all free space. If the space map is not loaded,
3762          * then we will lose some accuracy but will correct it the next
3763          * time we load the space map.
3764          */
3765         space_map_histogram_add(msp->ms_sm, msp->ms_freeing, tx);
3766         metaslab_aux_histograms_update(msp);
3767
3768         metaslab_group_histogram_add(mg, msp);
3769         metaslab_group_histogram_verify(mg);
3770         metaslab_class_histogram_verify(mg->mg_class);
3771
3772         /*
3773          * For sync pass 1, we avoid traversing this txg's free range tree
3774          * and instead will just swap the pointers for freeing and freed.
3775          * We can safely do this since the freed_tree is guaranteed to be
3776          * empty on the initial pass.
3777          *
3778          * Keep in mind that even if we are currently using a log spacemap
3779          * we want current frees to end up in the ms_allocatable (but not
3780          * get appended to the ms_sm) so their ranges can be reused as usual.
3781          */
3782         if (spa_sync_pass(spa) == 1) {
3783                 range_tree_swap(&msp->ms_freeing, &msp->ms_freed);
3784                 ASSERT0(msp->ms_allocated_this_txg);
3785         } else {
3786                 range_tree_vacate(msp->ms_freeing,
3787                     range_tree_add, msp->ms_freed);
3788         }
3789         msp->ms_allocated_this_txg += range_tree_space(alloctree);
3790         range_tree_vacate(alloctree, NULL, NULL);
3791
3792         ASSERT0(range_tree_space(msp->ms_allocating[txg & TXG_MASK]));
3793         ASSERT0(range_tree_space(msp->ms_allocating[TXG_CLEAN(txg)
3794             & TXG_MASK]));
3795         ASSERT0(range_tree_space(msp->ms_freeing));
3796         ASSERT0(range_tree_space(msp->ms_checkpointing));
3797
3798         mutex_exit(&msp->ms_lock);
3799
3800         /*
3801          * Verify that the space map object ID has been recorded in the
3802          * vdev_ms_array.
3803          */
3804         uint64_t object;
3805         VERIFY0(dmu_read(mos, vd->vdev_ms_array,
3806             msp->ms_id * sizeof (uint64_t), sizeof (uint64_t), &object, 0));
3807         VERIFY3U(object, ==, space_map_object(msp->ms_sm));
3808
3809         mutex_exit(&msp->ms_sync_lock);
3810         dmu_tx_commit(tx);
3811 }
3812
3813 static void
3814 metaslab_evict(metaslab_t *msp, uint64_t txg)
3815 {
3816         if (!msp->ms_loaded || msp->ms_disabled != 0)
3817                 return;
3818
3819         for (int t = 1; t < TXG_CONCURRENT_STATES; t++) {
3820                 VERIFY0(range_tree_space(
3821                     msp->ms_allocating[(txg + t) & TXG_MASK]));
3822         }
3823         if (msp->ms_allocator != -1)
3824                 metaslab_passivate(msp, msp->ms_weight & ~METASLAB_ACTIVE_MASK);
3825
3826         if (!metaslab_debug_unload)
3827                 metaslab_unload(msp);
3828 }
3829
3830 /*
3831  * Called after a transaction group has completely synced to mark
3832  * all of the metaslab's free space as usable.
3833  */
3834 void
3835 metaslab_sync_done(metaslab_t *msp, uint64_t txg)
3836 {
3837         metaslab_group_t *mg = msp->ms_group;
3838         vdev_t *vd = mg->mg_vd;
3839         spa_t *spa = vd->vdev_spa;
3840         range_tree_t **defer_tree;
3841         int64_t alloc_delta, defer_delta;
3842         boolean_t defer_allowed = B_TRUE;
3843
3844         ASSERT(!vd->vdev_ishole);
3845
3846         mutex_enter(&msp->ms_lock);
3847
3848         /*
3849          * If this metaslab is just becoming available, initialize its
3850          * range trees and add its capacity to the vdev.
3851          */
3852         if (msp->ms_freed == NULL) {
3853                 for (int t = 0; t < TXG_SIZE; t++) {
3854                         ASSERT(msp->ms_allocating[t] == NULL);
3855
3856                         msp->ms_allocating[t] = range_tree_create(NULL, NULL);
3857                 }
3858
3859                 ASSERT3P(msp->ms_freeing, ==, NULL);
3860                 msp->ms_freeing = range_tree_create(NULL, NULL);
3861
3862                 ASSERT3P(msp->ms_freed, ==, NULL);
3863                 msp->ms_freed = range_tree_create(NULL, NULL);
3864
3865                 for (int t = 0; t < TXG_DEFER_SIZE; t++) {
3866                         ASSERT3P(msp->ms_defer[t], ==, NULL);
3867                         msp->ms_defer[t] = range_tree_create(NULL, NULL);
3868                 }
3869
3870                 ASSERT3P(msp->ms_checkpointing, ==, NULL);
3871                 msp->ms_checkpointing = range_tree_create(NULL, NULL);
3872
3873                 ASSERT3P(msp->ms_unflushed_allocs, ==, NULL);
3874                 msp->ms_unflushed_allocs = range_tree_create(NULL, NULL);
3875                 ASSERT3P(msp->ms_unflushed_frees, ==, NULL);
3876                 msp->ms_unflushed_frees = range_tree_create_impl(&rt_avl_ops,
3877                     &msp->ms_unflushed_frees_by_size,
3878                     metaslab_rangesize_compare, 0);
3879
3880                 metaslab_space_update(vd, mg->mg_class, 0, 0, msp->ms_size);
3881         }
3882         ASSERT0(range_tree_space(msp->ms_freeing));
3883         ASSERT0(range_tree_space(msp->ms_checkpointing));
3884
3885         defer_tree = &msp->ms_defer[txg % TXG_DEFER_SIZE];
3886
3887         uint64_t free_space = metaslab_class_get_space(spa_normal_class(spa)) -
3888             metaslab_class_get_alloc(spa_normal_class(spa));
3889         if (free_space <= spa_get_slop_space(spa) || vd->vdev_removing) {
3890                 defer_allowed = B_FALSE;
3891         }
3892
3893         defer_delta = 0;
3894         alloc_delta = msp->ms_allocated_this_txg -
3895             range_tree_space(msp->ms_freed);
3896
3897         if (defer_allowed) {
3898                 defer_delta = range_tree_space(msp->ms_freed) -
3899                     range_tree_space(*defer_tree);
3900         } else {
3901                 defer_delta -= range_tree_space(*defer_tree);
3902         }
3903         metaslab_space_update(vd, mg->mg_class, alloc_delta + defer_delta,
3904             defer_delta, 0);
3905
3906         if (spa_syncing_log_sm(spa) == NULL) {
3907                 /*
3908                  * If there's a metaslab_load() in progress and we don't have
3909                  * a log space map, it means that we probably wrote to the
3910                  * metaslab's space map. If this is the case, we need to
3911                  * make sure that we wait for the load to complete so that we
3912                  * have a consistent view at the in-core side of the metaslab.
3913                  */
3914                 metaslab_load_wait(msp);
3915         } else {
3916                 ASSERT(spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP));
3917         }
3918
3919         /*
3920          * When auto-trimming is enabled, free ranges which are added to
3921          * ms_allocatable are also be added to ms_trim.  The ms_trim tree is
3922          * periodically consumed by the vdev_autotrim_thread() which issues
3923          * trims for all ranges and then vacates the tree.  The ms_trim tree
3924          * can be discarded at any time with the sole consequence of recent
3925          * frees not being trimmed.
3926          */
3927         if (spa_get_autotrim(spa) == SPA_AUTOTRIM_ON) {
3928                 range_tree_walk(*defer_tree, range_tree_add, msp->ms_trim);
3929                 if (!defer_allowed) {
3930                         range_tree_walk(msp->ms_freed, range_tree_add,
3931                             msp->ms_trim);
3932                 }
3933         } else {
3934                 range_tree_vacate(msp->ms_trim, NULL, NULL);
3935         }
3936
3937         /*
3938          * Move the frees from the defer_tree back to the free
3939          * range tree (if it's loaded). Swap the freed_tree and
3940          * the defer_tree -- this is safe to do because we've
3941          * just emptied out the defer_tree.
3942          */
3943         range_tree_vacate(*defer_tree,
3944             msp->ms_loaded ? range_tree_add : NULL, msp->ms_allocatable);
3945         if (defer_allowed) {
3946                 range_tree_swap(&msp->ms_freed, defer_tree);
3947         } else {
3948                 range_tree_vacate(msp->ms_freed,
3949                     msp->ms_loaded ? range_tree_add : NULL,
3950                     msp->ms_allocatable);
3951         }
3952
3953         msp->ms_synced_length = space_map_length(msp->ms_sm);
3954
3955         msp->ms_deferspace += defer_delta;
3956         ASSERT3S(msp->ms_deferspace, >=, 0);
3957         ASSERT3S(msp->ms_deferspace, <=, msp->ms_size);
3958         if (msp->ms_deferspace != 0) {
3959                 /*
3960                  * Keep syncing this metaslab until all deferred frees
3961                  * are back in circulation.
3962                  */
3963                 vdev_dirty(vd, VDD_METASLAB, msp, txg + 1);
3964         }
3965         metaslab_aux_histograms_update_done(msp, defer_allowed);
3966
3967         if (msp->ms_new) {
3968                 msp->ms_new = B_FALSE;
3969                 mutex_enter(&mg->mg_lock);
3970                 mg->mg_ms_ready++;
3971                 mutex_exit(&mg->mg_lock);
3972         }
3973
3974         /*
3975          * Re-sort metaslab within its group now that we've adjusted
3976          * its allocatable space.
3977          */
3978         metaslab_recalculate_weight_and_sort(msp);
3979
3980         ASSERT0(range_tree_space(msp->ms_allocating[txg & TXG_MASK]));
3981         ASSERT0(range_tree_space(msp->ms_freeing));
3982         ASSERT0(range_tree_space(msp->ms_freed));
3983         ASSERT0(range_tree_space(msp->ms_checkpointing));
3984         msp->ms_allocating_total -= msp->ms_allocated_this_txg;
3985         msp->ms_allocated_this_txg = 0;
3986         mutex_exit(&msp->ms_lock);
3987 }
3988
3989 void
3990 metaslab_sync_reassess(metaslab_group_t *mg)
3991 {
3992         spa_t *spa = mg->mg_class->mc_spa;
3993
3994         spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER);
3995         metaslab_group_alloc_update(mg);
3996         mg->mg_fragmentation = metaslab_group_fragmentation(mg);
3997
3998         /*
3999          * Preload the next potential metaslabs but only on active
4000          * metaslab groups. We can get into a state where the metaslab
4001          * is no longer active since we dirty metaslabs as we remove a
4002          * a device, thus potentially making the metaslab group eligible
4003          * for preloading.
4004          */
4005         if (mg->mg_activation_count > 0) {
4006                 metaslab_group_preload(mg);
4007         }
4008         spa_config_exit(spa, SCL_ALLOC, FTAG);
4009 }
4010
4011 /*
4012  * When writing a ditto block (i.e. more than one DVA for a given BP) on
4013  * the same vdev as an existing DVA of this BP, then try to allocate it
4014  * on a different metaslab than existing DVAs (i.e. a unique metaslab).
4015  */
4016 static boolean_t
4017 metaslab_is_unique(metaslab_t *msp, dva_t *dva)
4018 {
4019         uint64_t dva_ms_id;
4020
4021         if (DVA_GET_ASIZE(dva) == 0)
4022                 return (B_TRUE);
4023
4024         if (msp->ms_group->mg_vd->vdev_id != DVA_GET_VDEV(dva))
4025                 return (B_TRUE);
4026
4027         dva_ms_id = DVA_GET_OFFSET(dva) >> msp->ms_group->mg_vd->vdev_ms_shift;
4028
4029         return (msp->ms_id != dva_ms_id);
4030 }
4031
4032 /*
4033  * ==========================================================================
4034  * Metaslab allocation tracing facility
4035  * ==========================================================================
4036  */
4037 #ifdef _METASLAB_TRACING
4038 kstat_t *metaslab_trace_ksp;
4039 kstat_named_t metaslab_trace_over_limit;
4040
4041 void
4042 metaslab_alloc_trace_init(void)
4043 {
4044         ASSERT(metaslab_alloc_trace_cache == NULL);
4045         metaslab_alloc_trace_cache = kmem_cache_create(
4046             "metaslab_alloc_trace_cache", sizeof (metaslab_alloc_trace_t),
4047             0, NULL, NULL, NULL, NULL, NULL, 0);
4048         metaslab_trace_ksp = kstat_create("zfs", 0, "metaslab_trace_stats",
4049             "misc", KSTAT_TYPE_NAMED, 1, KSTAT_FLAG_VIRTUAL);
4050         if (metaslab_trace_ksp != NULL) {
4051                 metaslab_trace_ksp->ks_data = &metaslab_trace_over_limit;
4052                 kstat_named_init(&metaslab_trace_over_limit,
4053                     "metaslab_trace_over_limit", KSTAT_DATA_UINT64);
4054                 kstat_install(metaslab_trace_ksp);
4055         }
4056 }
4057
4058 void
4059 metaslab_alloc_trace_fini(void)
4060 {
4061         if (metaslab_trace_ksp != NULL) {
4062                 kstat_delete(metaslab_trace_ksp);
4063                 metaslab_trace_ksp = NULL;
4064         }
4065         kmem_cache_destroy(metaslab_alloc_trace_cache);
4066         metaslab_alloc_trace_cache = NULL;
4067 }
4068
4069 /*
4070  * Add an allocation trace element to the allocation tracing list.
4071  */
4072 static void
4073 metaslab_trace_add(zio_alloc_list_t *zal, metaslab_group_t *mg,
4074     metaslab_t *msp, uint64_t psize, uint32_t dva_id, uint64_t offset,
4075     int allocator)
4076 {
4077         metaslab_alloc_trace_t *mat;
4078
4079         if (!metaslab_trace_enabled)
4080                 return;
4081
4082         /*
4083          * When the tracing list reaches its maximum we remove
4084          * the second element in the list before adding a new one.
4085          * By removing the second element we preserve the original
4086          * entry as a clue to what allocations steps have already been
4087          * performed.
4088          */
4089         if (zal->zal_size == metaslab_trace_max_entries) {
4090                 metaslab_alloc_trace_t *mat_next;
4091 #ifdef DEBUG
4092                 panic("too many entries in allocation list");
4093 #endif
4094                 atomic_inc_64(&metaslab_trace_over_limit.value.ui64);
4095                 zal->zal_size--;
4096                 mat_next = list_next(&zal->zal_list, list_head(&zal->zal_list));
4097                 list_remove(&zal->zal_list, mat_next);
4098                 kmem_cache_free(metaslab_alloc_trace_cache, mat_next);
4099         }
4100
4101         mat = kmem_cache_alloc(metaslab_alloc_trace_cache, KM_SLEEP);
4102         list_link_init(&mat->mat_list_node);
4103         mat->mat_mg = mg;
4104         mat->mat_msp = msp;
4105         mat->mat_size = psize;
4106         mat->mat_dva_id = dva_id;
4107         mat->mat_offset = offset;
4108         mat->mat_weight = 0;
4109         mat->mat_allocator = allocator;
4110
4111         if (msp != NULL)
4112                 mat->mat_weight = msp->ms_weight;
4113
4114         /*
4115          * The list is part of the zio so locking is not required. Only
4116          * a single thread will perform allocations for a given zio.
4117          */
4118         list_insert_tail(&zal->zal_list, mat);
4119         zal->zal_size++;
4120
4121         ASSERT3U(zal->zal_size, <=, metaslab_trace_max_entries);
4122 }
4123
4124 void
4125 metaslab_trace_init(zio_alloc_list_t *zal)
4126 {
4127         list_create(&zal->zal_list, sizeof (metaslab_alloc_trace_t),
4128             offsetof(metaslab_alloc_trace_t, mat_list_node));
4129         zal->zal_size = 0;
4130 }
4131
4132 void
4133 metaslab_trace_fini(zio_alloc_list_t *zal)
4134 {
4135         metaslab_alloc_trace_t *mat;
4136
4137         while ((mat = list_remove_head(&zal->zal_list)) != NULL)
4138                 kmem_cache_free(metaslab_alloc_trace_cache, mat);
4139         list_destroy(&zal->zal_list);
4140         zal->zal_size = 0;
4141 }
4142 #else
4143
4144 #define metaslab_trace_add(zal, mg, msp, psize, id, off, alloc)
4145
4146 void
4147 metaslab_alloc_trace_init(void)
4148 {
4149 }
4150
4151 void
4152 metaslab_alloc_trace_fini(void)
4153 {
4154 }
4155
4156 void
4157 metaslab_trace_init(zio_alloc_list_t *zal)
4158 {
4159 }
4160
4161 void
4162 metaslab_trace_fini(zio_alloc_list_t *zal)
4163 {
4164 }
4165
4166 #endif /* _METASLAB_TRACING */
4167
4168 /*
4169  * ==========================================================================
4170  * Metaslab block operations
4171  * ==========================================================================
4172  */
4173
4174 static void
4175 metaslab_group_alloc_increment(spa_t *spa, uint64_t vdev, void *tag, int flags,
4176     int allocator)
4177 {
4178         if (!(flags & METASLAB_ASYNC_ALLOC) ||
4179             (flags & METASLAB_DONT_THROTTLE))
4180                 return;
4181
4182         metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg;
4183         if (!mg->mg_class->mc_alloc_throttle_enabled)
4184                 return;
4185
4186         (void) zfs_refcount_add(&mg->mg_alloc_queue_depth[allocator], tag);
4187 }
4188
4189 static void
4190 metaslab_group_increment_qdepth(metaslab_group_t *mg, int allocator)
4191 {
4192         uint64_t max = mg->mg_max_alloc_queue_depth;
4193         uint64_t cur = mg->mg_cur_max_alloc_queue_depth[allocator];
4194         while (cur < max) {
4195                 if (atomic_cas_64(&mg->mg_cur_max_alloc_queue_depth[allocator],
4196                     cur, cur + 1) == cur) {
4197                         atomic_inc_64(
4198                             &mg->mg_class->mc_alloc_max_slots[allocator]);
4199                         return;
4200                 }
4201                 cur = mg->mg_cur_max_alloc_queue_depth[allocator];
4202         }
4203 }
4204
4205 void
4206 metaslab_group_alloc_decrement(spa_t *spa, uint64_t vdev, void *tag, int flags,
4207     int allocator, boolean_t io_complete)
4208 {
4209         if (!(flags & METASLAB_ASYNC_ALLOC) ||
4210             (flags & METASLAB_DONT_THROTTLE))
4211                 return;
4212
4213         metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg;
4214         if (!mg->mg_class->mc_alloc_throttle_enabled)
4215                 return;
4216
4217         (void) zfs_refcount_remove(&mg->mg_alloc_queue_depth[allocator], tag);
4218         if (io_complete)
4219                 metaslab_group_increment_qdepth(mg, allocator);
4220 }
4221
4222 void
4223 metaslab_group_alloc_verify(spa_t *spa, const blkptr_t *bp, void *tag,
4224     int allocator)
4225 {
4226 #ifdef ZFS_DEBUG
4227         const dva_t *dva = bp->blk_dva;
4228         int ndvas = BP_GET_NDVAS(bp);
4229
4230         for (int d = 0; d < ndvas; d++) {
4231                 uint64_t vdev = DVA_GET_VDEV(&dva[d]);
4232                 metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg;
4233                 VERIFY(zfs_refcount_not_held(
4234                     &mg->mg_alloc_queue_depth[allocator], tag));
4235         }
4236 #endif
4237 }
4238
4239 static uint64_t
4240 metaslab_block_alloc(metaslab_t *msp, uint64_t size, uint64_t txg)
4241 {
4242         uint64_t start;
4243         range_tree_t *rt = msp->ms_allocatable;
4244         metaslab_class_t *mc = msp->ms_group->mg_class;
4245
4246         ASSERT(MUTEX_HELD(&msp->ms_lock));
4247         VERIFY(!msp->ms_condensing);
4248         VERIFY0(msp->ms_disabled);
4249
4250         start = mc->mc_ops->msop_alloc(msp, size);
4251         if (start != -1ULL) {
4252                 metaslab_group_t *mg = msp->ms_group;
4253                 vdev_t *vd = mg->mg_vd;
4254
4255                 VERIFY0(P2PHASE(start, 1ULL << vd->vdev_ashift));
4256                 VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift));
4257                 VERIFY3U(range_tree_space(rt) - size, <=, msp->ms_size);
4258                 range_tree_remove(rt, start, size);
4259                 range_tree_clear(msp->ms_trim, start, size);
4260
4261                 if (range_tree_is_empty(msp->ms_allocating[txg & TXG_MASK]))
4262                         vdev_dirty(mg->mg_vd, VDD_METASLAB, msp, txg);
4263
4264                 range_tree_add(msp->ms_allocating[txg & TXG_MASK], start, size);
4265                 msp->ms_allocating_total += size;
4266
4267                 /* Track the last successful allocation */
4268                 msp->ms_alloc_txg = txg;
4269                 metaslab_verify_space(msp, txg);
4270         }
4271
4272         /*
4273          * Now that we've attempted the allocation we need to update the
4274          * metaslab's maximum block size since it may have changed.
4275          */
4276         msp->ms_max_size = metaslab_largest_allocatable(msp);
4277         return (start);
4278 }
4279
4280 /*
4281  * Find the metaslab with the highest weight that is less than what we've
4282  * already tried.  In the common case, this means that we will examine each
4283  * metaslab at most once. Note that concurrent callers could reorder metaslabs
4284  * by activation/passivation once we have dropped the mg_lock. If a metaslab is
4285  * activated by another thread, and we fail to allocate from the metaslab we
4286  * have selected, we may not try the newly-activated metaslab, and instead
4287  * activate another metaslab.  This is not optimal, but generally does not cause
4288  * any problems (a possible exception being if every metaslab is completely full
4289  * except for the the newly-activated metaslab which we fail to examine).
4290  */
4291 static metaslab_t *
4292 find_valid_metaslab(metaslab_group_t *mg, uint64_t activation_weight,
4293     dva_t *dva, int d, boolean_t want_unique, uint64_t asize, int allocator,
4294     boolean_t try_hard, zio_alloc_list_t *zal, metaslab_t *search,
4295     boolean_t *was_active)
4296 {
4297         avl_index_t idx;
4298         avl_tree_t *t = &mg->mg_metaslab_tree;
4299         metaslab_t *msp = avl_find(t, search, &idx);
4300         if (msp == NULL)
4301                 msp = avl_nearest(t, idx, AVL_AFTER);
4302
4303         for (; msp != NULL; msp = AVL_NEXT(t, msp)) {
4304                 int i;
4305                 if (!metaslab_should_allocate(msp, asize, try_hard)) {
4306                         metaslab_trace_add(zal, mg, msp, asize, d,
4307                             TRACE_TOO_SMALL, allocator);
4308                         continue;
4309                 }
4310
4311                 /*
4312                  * If the selected metaslab is condensing or disabled,
4313                  * skip it.
4314                  */
4315                 if (msp->ms_condensing || msp->ms_disabled > 0)
4316                         continue;
4317
4318                 *was_active = msp->ms_allocator != -1;
4319                 /*
4320                  * If we're activating as primary, this is our first allocation
4321                  * from this disk, so we don't need to check how close we are.
4322                  * If the metaslab under consideration was already active,
4323                  * we're getting desperate enough to steal another allocator's
4324                  * metaslab, so we still don't care about distances.
4325                  */
4326                 if (activation_weight == METASLAB_WEIGHT_PRIMARY || *was_active)
4327                         break;
4328
4329                 for (i = 0; i < d; i++) {
4330                         if (want_unique &&
4331                             !metaslab_is_unique(msp, &dva[i]))
4332                                 break;  /* try another metaslab */
4333                 }
4334                 if (i == d)
4335                         break;
4336         }
4337
4338         if (msp != NULL) {
4339                 search->ms_weight = msp->ms_weight;
4340                 search->ms_start = msp->ms_start + 1;
4341                 search->ms_allocator = msp->ms_allocator;
4342                 search->ms_primary = msp->ms_primary;
4343         }
4344         return (msp);
4345 }
4346
4347 void
4348 metaslab_active_mask_verify(metaslab_t *msp)
4349 {
4350         ASSERT(MUTEX_HELD(&msp->ms_lock));
4351
4352         if ((zfs_flags & ZFS_DEBUG_METASLAB_VERIFY) == 0)
4353                 return;
4354
4355         if ((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0)
4356                 return;
4357
4358         if (msp->ms_weight & METASLAB_WEIGHT_PRIMARY) {
4359                 VERIFY0(msp->ms_weight & METASLAB_WEIGHT_SECONDARY);
4360                 VERIFY0(msp->ms_weight & METASLAB_WEIGHT_CLAIM);
4361                 VERIFY3S(msp->ms_allocator, !=, -1);
4362                 VERIFY(msp->ms_primary);
4363                 return;
4364         }
4365
4366         if (msp->ms_weight & METASLAB_WEIGHT_SECONDARY) {
4367                 VERIFY0(msp->ms_weight & METASLAB_WEIGHT_PRIMARY);
4368                 VERIFY0(msp->ms_weight & METASLAB_WEIGHT_CLAIM);
4369                 VERIFY3S(msp->ms_allocator, !=, -1);
4370                 VERIFY(!msp->ms_primary);
4371                 return;
4372         }
4373
4374         if (msp->ms_weight & METASLAB_WEIGHT_CLAIM) {
4375                 VERIFY0(msp->ms_weight & METASLAB_WEIGHT_PRIMARY);
4376                 VERIFY0(msp->ms_weight & METASLAB_WEIGHT_SECONDARY);
4377                 VERIFY3S(msp->ms_allocator, ==, -1);
4378                 return;
4379         }
4380 }
4381
4382 /* ARGSUSED */
4383 static uint64_t
4384 metaslab_group_alloc_normal(metaslab_group_t *mg, zio_alloc_list_t *zal,
4385     uint64_t asize, uint64_t txg, boolean_t want_unique, dva_t *dva, int d,
4386     int allocator, boolean_t try_hard)
4387 {
4388         metaslab_t *msp = NULL;
4389         uint64_t offset = -1ULL;
4390
4391         uint64_t activation_weight = METASLAB_WEIGHT_PRIMARY;
4392         for (int i = 0; i < d; i++) {
4393                 if (activation_weight == METASLAB_WEIGHT_PRIMARY &&
4394                     DVA_GET_VDEV(&dva[i]) == mg->mg_vd->vdev_id) {
4395                         activation_weight = METASLAB_WEIGHT_SECONDARY;
4396                 } else if (activation_weight == METASLAB_WEIGHT_SECONDARY &&
4397                     DVA_GET_VDEV(&dva[i]) == mg->mg_vd->vdev_id) {
4398                         activation_weight = METASLAB_WEIGHT_CLAIM;
4399                         break;
4400                 }
4401         }
4402
4403         /*
4404          * If we don't have enough metaslabs active to fill the entire array, we
4405          * just use the 0th slot.
4406          */
4407         if (mg->mg_ms_ready < mg->mg_allocators * 3)
4408                 allocator = 0;
4409
4410         ASSERT3U(mg->mg_vd->vdev_ms_count, >=, 2);
4411
4412         metaslab_t *search = kmem_alloc(sizeof (*search), KM_SLEEP);
4413         search->ms_weight = UINT64_MAX;
4414         search->ms_start = 0;
4415         /*
4416          * At the end of the metaslab tree are the already-active metaslabs,
4417          * first the primaries, then the secondaries. When we resume searching
4418          * through the tree, we need to consider ms_allocator and ms_primary so
4419          * we start in the location right after where we left off, and don't
4420          * accidentally loop forever considering the same metaslabs.
4421          */
4422         search->ms_allocator = -1;
4423         search->ms_primary = B_TRUE;
4424         for (;;) {
4425                 boolean_t was_active = B_FALSE;
4426
4427                 mutex_enter(&mg->mg_lock);
4428
4429                 if (activation_weight == METASLAB_WEIGHT_PRIMARY &&
4430                     mg->mg_primaries[allocator] != NULL) {
4431                         msp = mg->mg_primaries[allocator];
4432
4433                         /*
4434                          * Even though we don't hold the ms_lock for the
4435                          * primary metaslab, those fields should not
4436                          * change while we hold the mg_lock. Thus it is
4437                          * safe to make assertions on them.
4438                          */
4439                         ASSERT(msp->ms_primary);
4440                         ASSERT3S(msp->ms_allocator, ==, allocator);
4441                         ASSERT(msp->ms_loaded);
4442
4443                         was_active = B_TRUE;
4444                         ASSERT(msp->ms_weight & METASLAB_ACTIVE_MASK);
4445                 } else if (activation_weight == METASLAB_WEIGHT_SECONDARY &&
4446                     mg->mg_secondaries[allocator] != NULL) {
4447                         msp = mg->mg_secondaries[allocator];
4448
4449                         /*
4450                          * See comment above about the similar assertions
4451                          * for the primary metaslab.
4452                          */
4453                         ASSERT(!msp->ms_primary);
4454                         ASSERT3S(msp->ms_allocator, ==, allocator);
4455                         ASSERT(msp->ms_loaded);
4456
4457                         was_active = B_TRUE;
4458                         ASSERT(msp->ms_weight & METASLAB_ACTIVE_MASK);
4459                 } else {
4460                         msp = find_valid_metaslab(mg, activation_weight, dva, d,
4461                             want_unique, asize, allocator, try_hard, zal,
4462                             search, &was_active);
4463                 }
4464
4465                 mutex_exit(&mg->mg_lock);
4466                 if (msp == NULL) {
4467                         kmem_free(search, sizeof (*search));
4468                         return (-1ULL);
4469                 }
4470                 mutex_enter(&msp->ms_lock);
4471
4472                 metaslab_active_mask_verify(msp);
4473
4474                 /*
4475                  * This code is disabled because of issues with
4476                  * tracepoints in non-gpl kernel modules.
4477                  */
4478 #if 0
4479                 DTRACE_PROBE3(ms__activation__attempt,
4480                     metaslab_t *, msp, uint64_t, activation_weight,
4481                     boolean_t, was_active);
4482 #endif
4483
4484                 /*
4485                  * Ensure that the metaslab we have selected is still
4486                  * capable of handling our request. It's possible that
4487                  * another thread may have changed the weight while we
4488                  * were blocked on the metaslab lock. We check the
4489                  * active status first to see if we need to select
4490                  * a new metaslab.
4491                  */
4492                 if (was_active && !(msp->ms_weight & METASLAB_ACTIVE_MASK)) {
4493                         ASSERT3S(msp->ms_allocator, ==, -1);
4494                         mutex_exit(&msp->ms_lock);
4495                         continue;
4496                 }
4497
4498                 /*
4499                  * If the metaslab was activated for another allocator
4500                  * while we were waiting in the ms_lock above, or it's
4501                  * a primary and we're seeking a secondary (or vice versa),
4502                  * we go back and select a new metaslab.
4503                  */
4504                 if (!was_active && (msp->ms_weight & METASLAB_ACTIVE_MASK) &&
4505                     (msp->ms_allocator != -1) &&
4506                     (msp->ms_allocator != allocator || ((activation_weight ==
4507                     METASLAB_WEIGHT_PRIMARY) != msp->ms_primary))) {
4508                         ASSERT(msp->ms_loaded);
4509                         ASSERT((msp->ms_weight & METASLAB_WEIGHT_CLAIM) ||
4510                             msp->ms_allocator != -1);
4511                         mutex_exit(&msp->ms_lock);
4512                         continue;
4513                 }
4514
4515                 /*
4516                  * This metaslab was used for claiming regions allocated
4517                  * by the ZIL during pool import. Once these regions are
4518                  * claimed we don't need to keep the CLAIM bit set
4519                  * anymore. Passivate this metaslab to zero its activation
4520                  * mask.
4521                  */
4522                 if (msp->ms_weight & METASLAB_WEIGHT_CLAIM &&
4523                     activation_weight != METASLAB_WEIGHT_CLAIM) {
4524                         ASSERT(msp->ms_loaded);
4525                         ASSERT3S(msp->ms_allocator, ==, -1);
4526                         metaslab_passivate(msp, msp->ms_weight &
4527                             ~METASLAB_WEIGHT_CLAIM);
4528                         mutex_exit(&msp->ms_lock);
4529                         continue;
4530                 }
4531
4532                 metaslab_set_selected_txg(msp, txg);
4533
4534                 int activation_error =
4535                     metaslab_activate(msp, allocator, activation_weight);
4536                 metaslab_active_mask_verify(msp);
4537
4538                 /*
4539                  * If the metaslab was activated by another thread for
4540                  * another allocator or activation_weight (EBUSY), or it
4541                  * failed because another metaslab was assigned as primary
4542                  * for this allocator (EEXIST) we continue using this
4543                  * metaslab for our allocation, rather than going on to a
4544                  * worse metaslab (we waited for that metaslab to be loaded
4545                  * after all).
4546                  *
4547                  * If the activation failed due to an I/O error or ENOSPC we
4548                  * skip to the next metaslab.
4549                  */
4550                 boolean_t activated;
4551                 if (activation_error == 0) {
4552                         activated = B_TRUE;
4553                 } else if (activation_error == EBUSY ||
4554                     activation_error == EEXIST) {
4555                         activated = B_FALSE;
4556                 } else {
4557                         mutex_exit(&msp->ms_lock);
4558                         continue;
4559                 }
4560                 ASSERT(msp->ms_loaded);
4561
4562                 /*
4563                  * Now that we have the lock, recheck to see if we should
4564                  * continue to use this metaslab for this allocation. The
4565                  * metaslab is now loaded so metaslab_should_allocate()
4566                  * can accurately determine if the allocation attempt should
4567                  * proceed.
4568                  */
4569                 if (!metaslab_should_allocate(msp, asize, try_hard)) {
4570                         /* Passivate this metaslab and select a new one. */
4571                         metaslab_trace_add(zal, mg, msp, asize, d,
4572                             TRACE_TOO_SMALL, allocator);
4573                         goto next;
4574                 }
4575
4576                 /*
4577                  * If this metaslab is currently condensing then pick again
4578                  * as we can't manipulate this metaslab until it's committed
4579                  * to disk. If this metaslab is being initialized, we shouldn't
4580                  * allocate from it since the allocated region might be
4581                  * overwritten after allocation.
4582                  */
4583                 if (msp->ms_condensing) {
4584                         metaslab_trace_add(zal, mg, msp, asize, d,
4585                             TRACE_CONDENSING, allocator);
4586                         if (activated) {
4587                                 metaslab_passivate(msp, msp->ms_weight &
4588                                     ~METASLAB_ACTIVE_MASK);
4589                         }
4590                         mutex_exit(&msp->ms_lock);
4591                         continue;
4592                 } else if (msp->ms_disabled > 0) {
4593                         metaslab_trace_add(zal, mg, msp, asize, d,
4594                             TRACE_DISABLED, allocator);
4595                         if (activated) {
4596                                 metaslab_passivate(msp, msp->ms_weight &
4597                                     ~METASLAB_ACTIVE_MASK);
4598                         }
4599                         mutex_exit(&msp->ms_lock);
4600                         continue;
4601                 }
4602
4603                 offset = metaslab_block_alloc(msp, asize, txg);
4604                 metaslab_trace_add(zal, mg, msp, asize, d, offset, allocator);
4605
4606                 if (offset != -1ULL) {
4607                         /* Proactively passivate the metaslab, if needed */
4608                         if (activated)
4609                                 metaslab_segment_may_passivate(msp);
4610                         break;
4611                 }
4612 next:
4613                 ASSERT(msp->ms_loaded);
4614
4615                 /*
4616                  * This code is disabled out because of issues with
4617                  * tracepoints in non-gpl kernel modules.
4618                  */
4619 #if 0
4620                 DTRACE_PROBE2(ms__alloc__failure, metaslab_t *, msp,
4621                     uint64_t, asize);
4622 #endif
4623
4624                 /*
4625                  * We were unable to allocate from this metaslab so determine
4626                  * a new weight for this metaslab. Now that we have loaded
4627                  * the metaslab we can provide a better hint to the metaslab
4628                  * selector.
4629                  *
4630                  * For space-based metaslabs, we use the maximum block size.
4631                  * This information is only available when the metaslab
4632                  * is loaded and is more accurate than the generic free
4633                  * space weight that was calculated by metaslab_weight().
4634                  * This information allows us to quickly compare the maximum
4635                  * available allocation in the metaslab to the allocation
4636                  * size being requested.
4637                  *
4638                  * For segment-based metaslabs, determine the new weight
4639                  * based on the highest bucket in the range tree. We
4640                  * explicitly use the loaded segment weight (i.e. the range
4641                  * tree histogram) since it contains the space that is
4642                  * currently available for allocation and is accurate
4643                  * even within a sync pass.
4644                  */
4645                 uint64_t weight;
4646                 if (WEIGHT_IS_SPACEBASED(msp->ms_weight)) {
4647                         weight = metaslab_largest_allocatable(msp);
4648                         WEIGHT_SET_SPACEBASED(weight);
4649                 } else {
4650                         weight = metaslab_weight_from_range_tree(msp);
4651                 }
4652
4653                 if (activated) {
4654                         metaslab_passivate(msp, weight);
4655                 } else {
4656                         /*
4657                          * For the case where we use the metaslab that is
4658                          * active for another allocator we want to make
4659                          * sure that we retain the activation mask.
4660                          *
4661                          * Note that we could attempt to use something like
4662                          * metaslab_recalculate_weight_and_sort() that
4663                          * retains the activation mask here. That function
4664                          * uses metaslab_weight() to set the weight though
4665                          * which is not as accurate as the calculations
4666                          * above.
4667                          */
4668                         weight |= msp->ms_weight & METASLAB_ACTIVE_MASK;
4669                         metaslab_group_sort(mg, msp, weight);
4670                 }
4671                 metaslab_active_mask_verify(msp);
4672
4673                 /*
4674                  * We have just failed an allocation attempt, check
4675                  * that metaslab_should_allocate() agrees. Otherwise,
4676                  * we may end up in an infinite loop retrying the same
4677                  * metaslab.
4678                  */
4679                 ASSERT(!metaslab_should_allocate(msp, asize, try_hard));
4680
4681                 mutex_exit(&msp->ms_lock);
4682         }
4683         mutex_exit(&msp->ms_lock);
4684         kmem_free(search, sizeof (*search));
4685         return (offset);
4686 }
4687
4688 static uint64_t
4689 metaslab_group_alloc(metaslab_group_t *mg, zio_alloc_list_t *zal,
4690     uint64_t asize, uint64_t txg, boolean_t want_unique, dva_t *dva, int d,
4691     int allocator, boolean_t try_hard)
4692 {
4693         uint64_t offset;
4694         ASSERT(mg->mg_initialized);
4695
4696         offset = metaslab_group_alloc_normal(mg, zal, asize, txg, want_unique,
4697             dva, d, allocator, try_hard);
4698
4699         mutex_enter(&mg->mg_lock);
4700         if (offset == -1ULL) {
4701                 mg->mg_failed_allocations++;
4702                 metaslab_trace_add(zal, mg, NULL, asize, d,
4703                     TRACE_GROUP_FAILURE, allocator);
4704                 if (asize == SPA_GANGBLOCKSIZE) {
4705                         /*
4706                          * This metaslab group was unable to allocate
4707                          * the minimum gang block size so it must be out of
4708                          * space. We must notify the allocation throttle
4709                          * to start skipping allocation attempts to this
4710                          * metaslab group until more space becomes available.
4711                          * Note: this failure cannot be caused by the
4712                          * allocation throttle since the allocation throttle
4713                          * is only responsible for skipping devices and
4714                          * not failing block allocations.
4715                          */
4716                         mg->mg_no_free_space = B_TRUE;
4717                 }
4718         }
4719         mg->mg_allocations++;
4720         mutex_exit(&mg->mg_lock);
4721         return (offset);
4722 }
4723
4724 /*
4725  * Allocate a block for the specified i/o.
4726  */
4727 int
4728 metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize,
4729     dva_t *dva, int d, dva_t *hintdva, uint64_t txg, int flags,
4730     zio_alloc_list_t *zal, int allocator)
4731 {
4732         metaslab_group_t *mg, *fast_mg, *rotor;
4733         vdev_t *vd;
4734         boolean_t try_hard = B_FALSE;
4735
4736         ASSERT(!DVA_IS_VALID(&dva[d]));
4737
4738         /*
4739          * For testing, make some blocks above a certain size be gang blocks.
4740          * This will result in more split blocks when using device removal,
4741          * and a large number of split blocks coupled with ztest-induced
4742          * damage can result in extremely long reconstruction times.  This
4743          * will also test spilling from special to normal.
4744          */
4745         if (psize >= metaslab_force_ganging && (spa_get_random(100) < 3)) {
4746                 metaslab_trace_add(zal, NULL, NULL, psize, d, TRACE_FORCE_GANG,
4747                     allocator);
4748                 return (SET_ERROR(ENOSPC));
4749         }
4750
4751         /*
4752          * Start at the rotor and loop through all mgs until we find something.
4753          * Note that there's no locking on mc_rotor or mc_aliquot because
4754          * nothing actually breaks if we miss a few updates -- we just won't
4755          * allocate quite as evenly.  It all balances out over time.
4756          *
4757          * If we are doing ditto or log blocks, try to spread them across
4758          * consecutive vdevs.  If we're forced to reuse a vdev before we've
4759          * allocated all of our ditto blocks, then try and spread them out on
4760          * that vdev as much as possible.  If it turns out to not be possible,
4761          * gradually lower our standards until anything becomes acceptable.
4762          * Also, allocating on consecutive vdevs (as opposed to random vdevs)
4763          * gives us hope of containing our fault domains to something we're
4764          * able to reason about.  Otherwise, any two top-level vdev failures
4765          * will guarantee the loss of data.  With consecutive allocation,
4766          * only two adjacent top-level vdev failures will result in data loss.
4767          *
4768          * If we are doing gang blocks (hintdva is non-NULL), try to keep
4769          * ourselves on the same vdev as our gang block header.  That
4770          * way, we can hope for locality in vdev_cache, plus it makes our
4771          * fault domains something tractable.
4772          */
4773         if (hintdva) {
4774                 vd = vdev_lookup_top(spa, DVA_GET_VDEV(&hintdva[d]));
4775
4776                 /*
4777                  * It's possible the vdev we're using as the hint no
4778                  * longer exists or its mg has been closed (e.g. by
4779                  * device removal).  Consult the rotor when
4780                  * all else fails.
4781                  */
4782                 if (vd != NULL && vd->vdev_mg != NULL) {
4783                         mg = vd->vdev_mg;
4784
4785                         if (flags & METASLAB_HINTBP_AVOID &&
4786                             mg->mg_next != NULL)
4787                                 mg = mg->mg_next;
4788                 } else {
4789                         mg = mc->mc_rotor;
4790                 }
4791         } else if (d != 0) {
4792                 vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d - 1]));
4793                 mg = vd->vdev_mg->mg_next;
4794         } else if (flags & METASLAB_FASTWRITE) {
4795                 mg = fast_mg = mc->mc_rotor;
4796
4797                 do {
4798                         if (fast_mg->mg_vd->vdev_pending_fastwrite <
4799                             mg->mg_vd->vdev_pending_fastwrite)
4800                                 mg = fast_mg;
4801                 } while ((fast_mg = fast_mg->mg_next) != mc->mc_rotor);
4802
4803         } else {
4804                 ASSERT(mc->mc_rotor != NULL);
4805                 mg = mc->mc_rotor;
4806         }
4807
4808         /*
4809          * If the hint put us into the wrong metaslab class, or into a
4810          * metaslab group that has been passivated, just follow the rotor.
4811          */
4812         if (mg->mg_class != mc || mg->mg_activation_count <= 0)
4813                 mg = mc->mc_rotor;
4814
4815         rotor = mg;
4816 top:
4817         do {
4818                 boolean_t allocatable;
4819
4820                 ASSERT(mg->mg_activation_count == 1);
4821                 vd = mg->mg_vd;
4822
4823                 /*
4824                  * Don't allocate from faulted devices.
4825                  */
4826                 if (try_hard) {
4827                         spa_config_enter(spa, SCL_ZIO, FTAG, RW_READER);
4828                         allocatable = vdev_allocatable(vd);
4829                         spa_config_exit(spa, SCL_ZIO, FTAG);
4830                 } else {
4831                         allocatable = vdev_allocatable(vd);
4832                 }
4833
4834                 /*
4835                  * Determine if the selected metaslab group is eligible
4836                  * for allocations. If we're ganging then don't allow
4837                  * this metaslab group to skip allocations since that would
4838                  * inadvertently return ENOSPC and suspend the pool
4839                  * even though space is still available.
4840                  */
4841                 if (allocatable && !GANG_ALLOCATION(flags) && !try_hard) {
4842                         allocatable = metaslab_group_allocatable(mg, rotor,
4843                             psize, allocator, d);
4844                 }
4845
4846                 if (!allocatable) {
4847                         metaslab_trace_add(zal, mg, NULL, psize, d,
4848                             TRACE_NOT_ALLOCATABLE, allocator);
4849                         goto next;
4850                 }
4851
4852                 ASSERT(mg->mg_initialized);
4853
4854                 /*
4855                  * Avoid writing single-copy data to a failing,
4856                  * non-redundant vdev, unless we've already tried all
4857                  * other vdevs.
4858                  */
4859                 if ((vd->vdev_stat.vs_write_errors > 0 ||
4860                     vd->vdev_state < VDEV_STATE_HEALTHY) &&
4861                     d == 0 && !try_hard && vd->vdev_children == 0) {
4862                         metaslab_trace_add(zal, mg, NULL, psize, d,
4863                             TRACE_VDEV_ERROR, allocator);
4864                         goto next;
4865                 }
4866
4867                 ASSERT(mg->mg_class == mc);
4868
4869                 uint64_t asize = vdev_psize_to_asize(vd, psize);
4870                 ASSERT(P2PHASE(asize, 1ULL << vd->vdev_ashift) == 0);
4871
4872                 /*
4873                  * If we don't need to try hard, then require that the
4874                  * block be on an different metaslab from any other DVAs
4875                  * in this BP (unique=true).  If we are trying hard, then
4876                  * allow any metaslab to be used (unique=false).
4877                  */
4878                 uint64_t offset = metaslab_group_alloc(mg, zal, asize, txg,
4879                     !try_hard, dva, d, allocator, try_hard);
4880
4881                 if (offset != -1ULL) {
4882                         /*
4883                          * If we've just selected this metaslab group,
4884                          * figure out whether the corresponding vdev is
4885                          * over- or under-used relative to the pool,
4886                          * and set an allocation bias to even it out.
4887                          *
4888                          * Bias is also used to compensate for unequally
4889                          * sized vdevs so that space is allocated fairly.
4890                          */
4891                         if (mc->mc_aliquot == 0 && metaslab_bias_enabled) {
4892                                 vdev_stat_t *vs = &vd->vdev_stat;
4893                                 int64_t vs_free = vs->vs_space - vs->vs_alloc;
4894                                 int64_t mc_free = mc->mc_space - mc->mc_alloc;
4895                                 int64_t ratio;
4896
4897                                 /*
4898                                  * Calculate how much more or less we should
4899                                  * try to allocate from this device during
4900                                  * this iteration around the rotor.
4901                                  *
4902                                  * This basically introduces a zero-centered
4903                                  * bias towards the devices with the most
4904                                  * free space, while compensating for vdev
4905                                  * size differences.
4906                                  *
4907                                  * Examples:
4908                                  *  vdev V1 = 16M/128M
4909                                  *  vdev V2 = 16M/128M
4910                                  *  ratio(V1) = 100% ratio(V2) = 100%
4911                                  *
4912                                  *  vdev V1 = 16M/128M
4913                                  *  vdev V2 = 64M/128M
4914                                  *  ratio(V1) = 127% ratio(V2) =  72%
4915                                  *
4916                                  *  vdev V1 = 16M/128M
4917                                  *  vdev V2 = 64M/512M
4918                                  *  ratio(V1) =  40% ratio(V2) = 160%
4919                                  */
4920                                 ratio = (vs_free * mc->mc_alloc_groups * 100) /
4921                                     (mc_free + 1);
4922                                 mg->mg_bias = ((ratio - 100) *
4923                                     (int64_t)mg->mg_aliquot) / 100;
4924                         } else if (!metaslab_bias_enabled) {
4925                                 mg->mg_bias = 0;
4926                         }
4927
4928                         if ((flags & METASLAB_FASTWRITE) ||
4929                             atomic_add_64_nv(&mc->mc_aliquot, asize) >=
4930                             mg->mg_aliquot + mg->mg_bias) {
4931                                 mc->mc_rotor = mg->mg_next;
4932                                 mc->mc_aliquot = 0;
4933                         }
4934
4935                         DVA_SET_VDEV(&dva[d], vd->vdev_id);
4936                         DVA_SET_OFFSET(&dva[d], offset);
4937                         DVA_SET_GANG(&dva[d],
4938                             ((flags & METASLAB_GANG_HEADER) ? 1 : 0));
4939                         DVA_SET_ASIZE(&dva[d], asize);
4940
4941                         if (flags & METASLAB_FASTWRITE) {
4942                                 atomic_add_64(&vd->vdev_pending_fastwrite,
4943                                     psize);
4944                         }
4945
4946                         return (0);
4947                 }
4948 next:
4949                 mc->mc_rotor = mg->mg_next;
4950                 mc->mc_aliquot = 0;
4951         } while ((mg = mg->mg_next) != rotor);
4952
4953         /*
4954          * If we haven't tried hard, do so now.
4955          */
4956         if (!try_hard) {
4957                 try_hard = B_TRUE;
4958                 goto top;
4959         }
4960
4961         bzero(&dva[d], sizeof (dva_t));
4962
4963         metaslab_trace_add(zal, rotor, NULL, psize, d, TRACE_ENOSPC, allocator);
4964         return (SET_ERROR(ENOSPC));
4965 }
4966
4967 void
4968 metaslab_free_concrete(vdev_t *vd, uint64_t offset, uint64_t asize,
4969     boolean_t checkpoint)
4970 {
4971         metaslab_t *msp;
4972         spa_t *spa = vd->vdev_spa;
4973
4974         ASSERT(vdev_is_concrete(vd));
4975         ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0);
4976         ASSERT3U(offset >> vd->vdev_ms_shift, <, vd->vdev_ms_count);
4977
4978         msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];
4979
4980         VERIFY(!msp->ms_condensing);
4981         VERIFY3U(offset, >=, msp->ms_start);
4982         VERIFY3U(offset + asize, <=, msp->ms_start + msp->ms_size);
4983         VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift));
4984         VERIFY0(P2PHASE(asize, 1ULL << vd->vdev_ashift));
4985
4986         metaslab_check_free_impl(vd, offset, asize);
4987
4988         mutex_enter(&msp->ms_lock);
4989         if (range_tree_is_empty(msp->ms_freeing) &&
4990             range_tree_is_empty(msp->ms_checkpointing)) {
4991                 vdev_dirty(vd, VDD_METASLAB, msp, spa_syncing_txg(spa));
4992         }
4993
4994         if (checkpoint) {
4995                 ASSERT(spa_has_checkpoint(spa));
4996                 range_tree_add(msp->ms_checkpointing, offset, asize);
4997         } else {
4998                 range_tree_add(msp->ms_freeing, offset, asize);
4999         }
5000         mutex_exit(&msp->ms_lock);
5001 }
5002
5003 /* ARGSUSED */
5004 void
5005 metaslab_free_impl_cb(uint64_t inner_offset, vdev_t *vd, uint64_t offset,
5006     uint64_t size, void *arg)
5007 {
5008         boolean_t *checkpoint = arg;
5009
5010         ASSERT3P(checkpoint, !=, NULL);
5011
5012         if (vd->vdev_ops->vdev_op_remap != NULL)
5013                 vdev_indirect_mark_obsolete(vd, offset, size);
5014         else
5015                 metaslab_free_impl(vd, offset, size, *checkpoint);
5016 }
5017
5018 static void
5019 metaslab_free_impl(vdev_t *vd, uint64_t offset, uint64_t size,
5020     boolean_t checkpoint)
5021 {
5022         spa_t *spa = vd->vdev_spa;
5023
5024         ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0);
5025
5026         if (spa_syncing_txg(spa) > spa_freeze_txg(spa))
5027                 return;
5028
5029         if (spa->spa_vdev_removal != NULL &&
5030             spa->spa_vdev_removal->svr_vdev_id == vd->vdev_id &&
5031             vdev_is_concrete(vd)) {
5032                 /*
5033                  * Note: we check if the vdev is concrete because when
5034                  * we complete the removal, we first change the vdev to be
5035                  * an indirect vdev (in open context), and then (in syncing
5036                  * context) clear spa_vdev_removal.
5037                  */
5038                 free_from_removing_vdev(vd, offset, size);
5039         } else if (vd->vdev_ops->vdev_op_remap != NULL) {
5040                 vdev_indirect_mark_obsolete(vd, offset, size);
5041                 vd->vdev_ops->vdev_op_remap(vd, offset, size,
5042                     metaslab_free_impl_cb, &checkpoint);
5043         } else {
5044                 metaslab_free_concrete(vd, offset, size, checkpoint);
5045         }
5046 }
5047
5048 typedef struct remap_blkptr_cb_arg {
5049         blkptr_t *rbca_bp;
5050         spa_remap_cb_t rbca_cb;
5051         vdev_t *rbca_remap_vd;
5052         uint64_t rbca_remap_offset;
5053         void *rbca_cb_arg;
5054 } remap_blkptr_cb_arg_t;
5055
5056 void
5057 remap_blkptr_cb(uint64_t inner_offset, vdev_t *vd, uint64_t offset,
5058     uint64_t size, void *arg)
5059 {
5060         remap_blkptr_cb_arg_t *rbca = arg;
5061         blkptr_t *bp = rbca->rbca_bp;
5062
5063         /* We can not remap split blocks. */
5064         if (size != DVA_GET_ASIZE(&bp->blk_dva[0]))
5065                 return;
5066         ASSERT0(inner_offset);
5067
5068         if (rbca->rbca_cb != NULL) {
5069                 /*
5070                  * At this point we know that we are not handling split
5071                  * blocks and we invoke the callback on the previous
5072                  * vdev which must be indirect.
5073                  */
5074                 ASSERT3P(rbca->rbca_remap_vd->vdev_ops, ==, &vdev_indirect_ops);
5075
5076                 rbca->rbca_cb(rbca->rbca_remap_vd->vdev_id,
5077                     rbca->rbca_remap_offset, size, rbca->rbca_cb_arg);
5078
5079                 /* set up remap_blkptr_cb_arg for the next call */
5080                 rbca->rbca_remap_vd = vd;
5081                 rbca->rbca_remap_offset = offset;
5082         }
5083
5084         /*
5085          * The phys birth time is that of dva[0].  This ensures that we know
5086          * when each dva was written, so that resilver can determine which
5087          * blocks need to be scrubbed (i.e. those written during the time
5088          * the vdev was offline).  It also ensures that the key used in
5089          * the ARC hash table is unique (i.e. dva[0] + phys_birth).  If
5090          * we didn't change the phys_birth, a lookup in the ARC for a
5091          * remapped BP could find the data that was previously stored at
5092          * this vdev + offset.
5093          */
5094         vdev_t *oldvd = vdev_lookup_top(vd->vdev_spa,
5095             DVA_GET_VDEV(&bp->blk_dva[0]));
5096         vdev_indirect_births_t *vib = oldvd->vdev_indirect_births;
5097         bp->blk_phys_birth = vdev_indirect_births_physbirth(vib,
5098             DVA_GET_OFFSET(&bp->blk_dva[0]), DVA_GET_ASIZE(&bp->blk_dva[0]));
5099
5100         DVA_SET_VDEV(&bp->blk_dva[0], vd->vdev_id);
5101         DVA_SET_OFFSET(&bp->blk_dva[0], offset);
5102 }
5103
5104 /*
5105  * If the block pointer contains any indirect DVAs, modify them to refer to
5106  * concrete DVAs.  Note that this will sometimes not be possible, leaving
5107  * the indirect DVA in place.  This happens if the indirect DVA spans multiple
5108  * segments in the mapping (i.e. it is a "split block").
5109  *
5110  * If the BP was remapped, calls the callback on the original dva (note the
5111  * callback can be called multiple times if the original indirect DVA refers
5112  * to another indirect DVA, etc).
5113  *
5114  * Returns TRUE if the BP was remapped.
5115  */
5116 boolean_t
5117 spa_remap_blkptr(spa_t *spa, blkptr_t *bp, spa_remap_cb_t callback, void *arg)
5118 {
5119         remap_blkptr_cb_arg_t rbca;
5120
5121         if (!zfs_remap_blkptr_enable)
5122                 return (B_FALSE);
5123
5124         if (!spa_feature_is_enabled(spa, SPA_FEATURE_OBSOLETE_COUNTS))
5125                 return (B_FALSE);
5126
5127         /*
5128          * Dedup BP's can not be remapped, because ddt_phys_select() depends
5129          * on DVA[0] being the same in the BP as in the DDT (dedup table).
5130          */
5131         if (BP_GET_DEDUP(bp))
5132                 return (B_FALSE);
5133
5134         /*
5135          * Gang blocks can not be remapped, because
5136          * zio_checksum_gang_verifier() depends on the DVA[0] that's in
5137          * the BP used to read the gang block header (GBH) being the same
5138          * as the DVA[0] that we allocated for the GBH.
5139          */
5140         if (BP_IS_GANG(bp))
5141                 return (B_FALSE);
5142
5143         /*
5144          * Embedded BP's have no DVA to remap.
5145          */
5146         if (BP_GET_NDVAS(bp) < 1)
5147                 return (B_FALSE);
5148
5149         /*
5150          * Note: we only remap dva[0].  If we remapped other dvas, we
5151          * would no longer know what their phys birth txg is.
5152          */
5153         dva_t *dva = &bp->blk_dva[0];
5154
5155         uint64_t offset = DVA_GET_OFFSET(dva);
5156         uint64_t size = DVA_GET_ASIZE(dva);
5157         vdev_t *vd = vdev_lookup_top(spa, DVA_GET_VDEV(dva));
5158
5159         if (vd->vdev_ops->vdev_op_remap == NULL)
5160                 return (B_FALSE);
5161
5162         rbca.rbca_bp = bp;
5163         rbca.rbca_cb = callback;
5164         rbca.rbca_remap_vd = vd;
5165         rbca.rbca_remap_offset = offset;
5166         rbca.rbca_cb_arg = arg;
5167
5168         /*
5169          * remap_blkptr_cb() will be called in order for each level of
5170          * indirection, until a concrete vdev is reached or a split block is
5171          * encountered. old_vd and old_offset are updated within the callback
5172          * as we go from the one indirect vdev to the next one (either concrete
5173          * or indirect again) in that order.
5174          */
5175         vd->vdev_ops->vdev_op_remap(vd, offset, size, remap_blkptr_cb, &rbca);
5176
5177         /* Check if the DVA wasn't remapped because it is a split block */
5178         if (DVA_GET_VDEV(&rbca.rbca_bp->blk_dva[0]) == vd->vdev_id)
5179                 return (B_FALSE);
5180
5181         return (B_TRUE);
5182 }
5183
5184 /*
5185  * Undo the allocation of a DVA which happened in the given transaction group.
5186  */
5187 void
5188 metaslab_unalloc_dva(spa_t *spa, const dva_t *dva, uint64_t txg)
5189 {
5190         metaslab_t *msp;
5191         vdev_t *vd;
5192         uint64_t vdev = DVA_GET_VDEV(dva);
5193         uint64_t offset = DVA_GET_OFFSET(dva);
5194         uint64_t size = DVA_GET_ASIZE(dva);
5195
5196         ASSERT(DVA_IS_VALID(dva));
5197         ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0);
5198
5199         if (txg > spa_freeze_txg(spa))
5200                 return;
5201
5202         if ((vd = vdev_lookup_top(spa, vdev)) == NULL || !DVA_IS_VALID(dva) ||
5203             (offset >> vd->vdev_ms_shift) >= vd->vdev_ms_count) {
5204                 zfs_panic_recover("metaslab_free_dva(): bad DVA %llu:%llu:%llu",
5205                     (u_longlong_t)vdev, (u_longlong_t)offset,
5206                     (u_longlong_t)size);
5207                 return;
5208         }
5209
5210         ASSERT(!vd->vdev_removing);
5211         ASSERT(vdev_is_concrete(vd));
5212         ASSERT0(vd->vdev_indirect_config.vic_mapping_object);
5213         ASSERT3P(vd->vdev_indirect_mapping, ==, NULL);
5214
5215         if (DVA_GET_GANG(dva))
5216                 size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);
5217
5218         msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];
5219
5220         mutex_enter(&msp->ms_lock);
5221         range_tree_remove(msp->ms_allocating[txg & TXG_MASK],
5222             offset, size);
5223         msp->ms_allocating_total -= size;
5224
5225         VERIFY(!msp->ms_condensing);
5226         VERIFY3U(offset, >=, msp->ms_start);
5227         VERIFY3U(offset + size, <=, msp->ms_start + msp->ms_size);
5228         VERIFY3U(range_tree_space(msp->ms_allocatable) + size, <=,
5229             msp->ms_size);
5230         VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift));
5231         VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift));
5232         range_tree_add(msp->ms_allocatable, offset, size);
5233         mutex_exit(&msp->ms_lock);
5234 }
5235
5236 /*
5237  * Free the block represented by the given DVA.
5238  */
5239 void
5240 metaslab_free_dva(spa_t *spa, const dva_t *dva, boolean_t checkpoint)
5241 {
5242         uint64_t vdev = DVA_GET_VDEV(dva);
5243         uint64_t offset = DVA_GET_OFFSET(dva);
5244         uint64_t size = DVA_GET_ASIZE(dva);
5245         vdev_t *vd = vdev_lookup_top(spa, vdev);
5246
5247         ASSERT(DVA_IS_VALID(dva));
5248         ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0);
5249
5250         if (DVA_GET_GANG(dva)) {
5251                 size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);
5252         }
5253
5254         metaslab_free_impl(vd, offset, size, checkpoint);
5255 }
5256
5257 /*
5258  * Reserve some allocation slots. The reservation system must be called
5259  * before we call into the allocator. If there aren't any available slots
5260  * then the I/O will be throttled until an I/O completes and its slots are
5261  * freed up. The function returns true if it was successful in placing
5262  * the reservation.
5263  */
5264 boolean_t
5265 metaslab_class_throttle_reserve(metaslab_class_t *mc, int slots, int allocator,
5266     zio_t *zio, int flags)
5267 {
5268         uint64_t available_slots = 0;
5269         boolean_t slot_reserved = B_FALSE;
5270         uint64_t max = mc->mc_alloc_max_slots[allocator];
5271
5272         ASSERT(mc->mc_alloc_throttle_enabled);
5273         mutex_enter(&mc->mc_lock);
5274
5275         uint64_t reserved_slots =
5276             zfs_refcount_count(&mc->mc_alloc_slots[allocator]);
5277         if (reserved_slots < max)
5278                 available_slots = max - reserved_slots;
5279
5280         if (slots <= available_slots || GANG_ALLOCATION(flags) ||
5281             flags & METASLAB_MUST_RESERVE) {
5282                 /*
5283                  * We reserve the slots individually so that we can unreserve
5284                  * them individually when an I/O completes.
5285                  */
5286                 for (int d = 0; d < slots; d++) {
5287                         reserved_slots =
5288                             zfs_refcount_add(&mc->mc_alloc_slots[allocator],
5289                             zio);
5290                 }
5291                 zio->io_flags |= ZIO_FLAG_IO_ALLOCATING;
5292                 slot_reserved = B_TRUE;
5293         }
5294
5295         mutex_exit(&mc->mc_lock);
5296         return (slot_reserved);
5297 }
5298
5299 void
5300 metaslab_class_throttle_unreserve(metaslab_class_t *mc, int slots,
5301     int allocator, zio_t *zio)
5302 {
5303         ASSERT(mc->mc_alloc_throttle_enabled);
5304         mutex_enter(&mc->mc_lock);
5305         for (int d = 0; d < slots; d++) {
5306                 (void) zfs_refcount_remove(&mc->mc_alloc_slots[allocator],
5307                     zio);
5308         }
5309         mutex_exit(&mc->mc_lock);
5310 }
5311
5312 static int
5313 metaslab_claim_concrete(vdev_t *vd, uint64_t offset, uint64_t size,
5314     uint64_t txg)
5315 {
5316         metaslab_t *msp;
5317         spa_t *spa = vd->vdev_spa;
5318         int error = 0;
5319
5320         if (offset >> vd->vdev_ms_shift >= vd->vdev_ms_count)
5321                 return (SET_ERROR(ENXIO));
5322
5323         ASSERT3P(vd->vdev_ms, !=, NULL);
5324         msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];
5325
5326         mutex_enter(&msp->ms_lock);
5327
5328         if ((txg != 0 && spa_writeable(spa)) || !msp->ms_loaded) {
5329                 error = metaslab_activate(msp, 0, METASLAB_WEIGHT_CLAIM);
5330                 if (error == EBUSY) {
5331                         ASSERT(msp->ms_loaded);
5332                         ASSERT(msp->ms_weight & METASLAB_ACTIVE_MASK);
5333                         error = 0;
5334                 }
5335         }
5336
5337         if (error == 0 &&
5338             !range_tree_contains(msp->ms_allocatable, offset, size))
5339                 error = SET_ERROR(ENOENT);
5340
5341         if (error || txg == 0) {        /* txg == 0 indicates dry run */
5342                 mutex_exit(&msp->ms_lock);
5343                 return (error);
5344         }
5345
5346         VERIFY(!msp->ms_condensing);
5347         VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift));
5348         VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift));
5349         VERIFY3U(range_tree_space(msp->ms_allocatable) - size, <=,
5350             msp->ms_size);
5351         range_tree_remove(msp->ms_allocatable, offset, size);
5352         range_tree_clear(msp->ms_trim, offset, size);
5353
5354         if (spa_writeable(spa)) {       /* don't dirty if we're zdb(1M) */
5355                 metaslab_class_t *mc = msp->ms_group->mg_class;
5356                 multilist_sublist_t *mls =
5357                     multilist_sublist_lock_obj(mc->mc_metaslab_txg_list, msp);
5358                 if (!multilist_link_active(&msp->ms_class_txg_node)) {
5359                         msp->ms_selected_txg = txg;
5360                         multilist_sublist_insert_head(mls, msp);
5361                 }
5362                 multilist_sublist_unlock(mls);
5363
5364                 if (range_tree_is_empty(msp->ms_allocating[txg & TXG_MASK]))
5365                         vdev_dirty(vd, VDD_METASLAB, msp, txg);
5366                 range_tree_add(msp->ms_allocating[txg & TXG_MASK],
5367                     offset, size);
5368                 msp->ms_allocating_total += size;
5369         }
5370
5371         mutex_exit(&msp->ms_lock);
5372
5373         return (0);
5374 }
5375
/*
 * Argument handed to metaslab_claim_impl_cb() through vdev_op_remap() by
 * metaslab_claim_impl().
 */
typedef struct metaslab_claim_cb_arg_t {
        uint64_t        mcca_txg;       /* txg passed on to each claim */
        int             mcca_error;     /* first claim error seen, else 0 */
} metaslab_claim_cb_arg_t;
5380
5381 /* ARGSUSED */
5382 static void
5383 metaslab_claim_impl_cb(uint64_t inner_offset, vdev_t *vd, uint64_t offset,
5384     uint64_t size, void *arg)
5385 {
5386         metaslab_claim_cb_arg_t *mcca_arg = arg;
5387
5388         if (mcca_arg->mcca_error == 0) {
5389                 mcca_arg->mcca_error = metaslab_claim_concrete(vd, offset,
5390                     size, mcca_arg->mcca_txg);
5391         }
5392 }
5393
5394 int
5395 metaslab_claim_impl(vdev_t *vd, uint64_t offset, uint64_t size, uint64_t txg)
5396 {
5397         if (vd->vdev_ops->vdev_op_remap != NULL) {
5398                 metaslab_claim_cb_arg_t arg;
5399
5400                 /*
5401                  * Only zdb(1M) can claim on indirect vdevs.  This is used
5402                  * to detect leaks of mapped space (that are not accounted
5403                  * for in the obsolete counts, spacemap, or bpobj).
5404                  */
5405                 ASSERT(!spa_writeable(vd->vdev_spa));
5406                 arg.mcca_error = 0;
5407                 arg.mcca_txg = txg;
5408
5409                 vd->vdev_ops->vdev_op_remap(vd, offset, size,
5410                     metaslab_claim_impl_cb, &arg);
5411
5412                 if (arg.mcca_error == 0) {
5413                         arg.mcca_error = metaslab_claim_concrete(vd,
5414                             offset, size, txg);
5415                 }
5416                 return (arg.mcca_error);
5417         } else {
5418                 return (metaslab_claim_concrete(vd, offset, size, txg));
5419         }
5420 }
5421
5422 /*
5423  * Intent log support: upon opening the pool after a crash, notify the SPA
5424  * of blocks that the intent log has allocated for immediate write, but
5425  * which are still considered free by the SPA because the last transaction
5426  * group didn't commit yet.
5427  */
5428 static int
5429 metaslab_claim_dva(spa_t *spa, const dva_t *dva, uint64_t txg)
5430 {
5431         uint64_t vdev = DVA_GET_VDEV(dva);
5432         uint64_t offset = DVA_GET_OFFSET(dva);
5433         uint64_t size = DVA_GET_ASIZE(dva);
5434         vdev_t *vd;
5435
5436         if ((vd = vdev_lookup_top(spa, vdev)) == NULL) {
5437                 return (SET_ERROR(ENXIO));
5438         }
5439
5440         ASSERT(DVA_IS_VALID(dva));
5441
5442         if (DVA_GET_GANG(dva))
5443                 size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);
5444
5445         return (metaslab_claim_impl(vd, offset, size, txg));
5446 }
5447
5448 int
5449 metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, blkptr_t *bp,
5450     int ndvas, uint64_t txg, blkptr_t *hintbp, int flags,
5451     zio_alloc_list_t *zal, zio_t *zio, int allocator)
5452 {
5453         dva_t *dva = bp->blk_dva;
5454         dva_t *hintdva = (hintbp != NULL) ? hintbp->blk_dva : NULL;
5455         int error = 0;
5456
5457         ASSERT(bp->blk_birth == 0);
5458         ASSERT(BP_PHYSICAL_BIRTH(bp) == 0);
5459
5460         spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER);
5461
5462         if (mc->mc_rotor == NULL) {     /* no vdevs in this class */
5463                 spa_config_exit(spa, SCL_ALLOC, FTAG);
5464                 return (SET_ERROR(ENOSPC));
5465         }
5466
5467         ASSERT(ndvas > 0 && ndvas <= spa_max_replication(spa));
5468         ASSERT(BP_GET_NDVAS(bp) == 0);
5469         ASSERT(hintbp == NULL || ndvas <= BP_GET_NDVAS(hintbp));
5470         ASSERT3P(zal, !=, NULL);
5471
5472         for (int d = 0; d < ndvas; d++) {
5473                 error = metaslab_alloc_dva(spa, mc, psize, dva, d, hintdva,
5474                     txg, flags, zal, allocator);
5475                 if (error != 0) {
5476                         for (d--; d >= 0; d--) {
5477                                 metaslab_unalloc_dva(spa, &dva[d], txg);
5478                                 metaslab_group_alloc_decrement(spa,
5479                                     DVA_GET_VDEV(&dva[d]), zio, flags,
5480                                     allocator, B_FALSE);
5481                                 bzero(&dva[d], sizeof (dva_t));
5482                         }
5483                         spa_config_exit(spa, SCL_ALLOC, FTAG);
5484                         return (error);
5485                 } else {
5486                         /*
5487                          * Update the metaslab group's queue depth
5488                          * based on the newly allocated dva.
5489                          */
5490                         metaslab_group_alloc_increment(spa,
5491                             DVA_GET_VDEV(&dva[d]), zio, flags, allocator);
5492                 }
5493
5494         }
5495         ASSERT(error == 0);
5496         ASSERT(BP_GET_NDVAS(bp) == ndvas);
5497
5498         spa_config_exit(spa, SCL_ALLOC, FTAG);
5499
5500         BP_SET_BIRTH(bp, txg, 0);
5501
5502         return (0);
5503 }
5504
5505 void
5506 metaslab_free(spa_t *spa, const blkptr_t *bp, uint64_t txg, boolean_t now)
5507 {
5508         const dva_t *dva = bp->blk_dva;
5509         int ndvas = BP_GET_NDVAS(bp);
5510
5511         ASSERT(!BP_IS_HOLE(bp));
5512         ASSERT(!now || bp->blk_birth >= spa_syncing_txg(spa));
5513
5514         /*
5515          * If we have a checkpoint for the pool we need to make sure that
5516          * the blocks that we free that are part of the checkpoint won't be
5517          * reused until the checkpoint is discarded or we revert to it.
5518          *
5519          * The checkpoint flag is passed down the metaslab_free code path
5520          * and is set whenever we want to add a block to the checkpoint's
5521          * accounting. That is, we "checkpoint" blocks that existed at the
5522          * time the checkpoint was created and are therefore referenced by
5523          * the checkpointed uberblock.
5524          *
5525          * Note that, we don't checkpoint any blocks if the current
5526          * syncing txg <= spa_checkpoint_txg. We want these frees to sync
5527          * normally as they will be referenced by the checkpointed uberblock.
5528          */
5529         boolean_t checkpoint = B_FALSE;
5530         if (bp->blk_birth <= spa->spa_checkpoint_txg &&
5531             spa_syncing_txg(spa) > spa->spa_checkpoint_txg) {
5532                 /*
5533                  * At this point, if the block is part of the checkpoint
5534                  * there is no way it was created in the current txg.
5535                  */
5536                 ASSERT(!now);
5537                 ASSERT3U(spa_syncing_txg(spa), ==, txg);
5538                 checkpoint = B_TRUE;
5539         }
5540
5541         spa_config_enter(spa, SCL_FREE, FTAG, RW_READER);
5542
5543         for (int d = 0; d < ndvas; d++) {
5544                 if (now) {
5545                         metaslab_unalloc_dva(spa, &dva[d], txg);
5546                 } else {
5547                         ASSERT3U(txg, ==, spa_syncing_txg(spa));
5548                         metaslab_free_dva(spa, &dva[d], checkpoint);
5549                 }
5550         }
5551
5552         spa_config_exit(spa, SCL_FREE, FTAG);
5553 }
5554
5555 int
5556 metaslab_claim(spa_t *spa, const blkptr_t *bp, uint64_t txg)
5557 {
5558         const dva_t *dva = bp->blk_dva;
5559         int ndvas = BP_GET_NDVAS(bp);
5560         int error = 0;
5561
5562         ASSERT(!BP_IS_HOLE(bp));
5563
5564         if (txg != 0) {
5565                 /*
5566                  * First do a dry run to make sure all DVAs are claimable,
5567                  * so we don't have to unwind from partial failures below.
5568                  */
5569                 if ((error = metaslab_claim(spa, bp, 0)) != 0)
5570                         return (error);
5571         }
5572
5573         spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER);
5574
5575         for (int d = 0; d < ndvas; d++) {
5576                 error = metaslab_claim_dva(spa, &dva[d], txg);
5577                 if (error != 0)
5578                         break;
5579         }
5580
5581         spa_config_exit(spa, SCL_ALLOC, FTAG);
5582
5583         ASSERT(error == 0 || txg == 0);
5584
5585         return (error);
5586 }
5587
5588 void
5589 metaslab_fastwrite_mark(spa_t *spa, const blkptr_t *bp)
5590 {
5591         const dva_t *dva = bp->blk_dva;
5592         int ndvas = BP_GET_NDVAS(bp);
5593         uint64_t psize = BP_GET_PSIZE(bp);
5594         int d;
5595         vdev_t *vd;
5596
5597         ASSERT(!BP_IS_HOLE(bp));
5598         ASSERT(!BP_IS_EMBEDDED(bp));
5599         ASSERT(psize > 0);
5600
5601         spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
5602
5603         for (d = 0; d < ndvas; d++) {
5604                 if ((vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d]))) == NULL)
5605                         continue;
5606                 atomic_add_64(&vd->vdev_pending_fastwrite, psize);
5607         }
5608
5609         spa_config_exit(spa, SCL_VDEV, FTAG);
5610 }
5611
5612 void
5613 metaslab_fastwrite_unmark(spa_t *spa, const blkptr_t *bp)
5614 {
5615         const dva_t *dva = bp->blk_dva;
5616         int ndvas = BP_GET_NDVAS(bp);
5617         uint64_t psize = BP_GET_PSIZE(bp);
5618         int d;
5619         vdev_t *vd;
5620
5621         ASSERT(!BP_IS_HOLE(bp));
5622         ASSERT(!BP_IS_EMBEDDED(bp));
5623         ASSERT(psize > 0);
5624
5625         spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
5626
5627         for (d = 0; d < ndvas; d++) {
5628                 if ((vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d]))) == NULL)
5629                         continue;
5630                 ASSERT3U(vd->vdev_pending_fastwrite, >=, psize);
5631                 atomic_sub_64(&vd->vdev_pending_fastwrite, psize);
5632         }
5633
5634         spa_config_exit(spa, SCL_VDEV, FTAG);
5635 }
5636
5637 /* ARGSUSED */
5638 static void
5639 metaslab_check_free_impl_cb(uint64_t inner, vdev_t *vd, uint64_t offset,
5640     uint64_t size, void *arg)
5641 {
5642         if (vd->vdev_ops == &vdev_indirect_ops)
5643                 return;
5644
5645         metaslab_check_free_impl(vd, offset, size);
5646 }
5647
5648 static void
5649 metaslab_check_free_impl(vdev_t *vd, uint64_t offset, uint64_t size)
5650 {
5651         metaslab_t *msp;
5652         ASSERTV(spa_t *spa = vd->vdev_spa);
5653
5654         if ((zfs_flags & ZFS_DEBUG_ZIO_FREE) == 0)
5655                 return;
5656
5657         if (vd->vdev_ops->vdev_op_remap != NULL) {
5658                 vd->vdev_ops->vdev_op_remap(vd, offset, size,
5659                     metaslab_check_free_impl_cb, NULL);
5660                 return;
5661         }
5662
5663         ASSERT(vdev_is_concrete(vd));
5664         ASSERT3U(offset >> vd->vdev_ms_shift, <, vd->vdev_ms_count);
5665         ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0);
5666
5667         msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];
5668
5669         mutex_enter(&msp->ms_lock);
5670         if (msp->ms_loaded) {
5671                 range_tree_verify_not_present(msp->ms_allocatable,
5672                     offset, size);
5673         }
5674
5675         /*
5676          * Check all segments that currently exist in the freeing pipeline.
5677          *
5678          * It would intuitively make sense to also check the current allocating
5679          * tree since metaslab_unalloc_dva() exists for extents that are
5680          * allocated and freed in the same sync pass withing the same txg.
5681          * Unfortunately there are places (e.g. the ZIL) where we allocate a
5682          * segment but then we free part of it within the same txg
5683          * [see zil_sync()]. Thus, we don't call range_tree_verify() in the
5684          * current allocating tree.
5685          */
5686         range_tree_verify_not_present(msp->ms_freeing, offset, size);
5687         range_tree_verify_not_present(msp->ms_checkpointing, offset, size);
5688         range_tree_verify_not_present(msp->ms_freed, offset, size);
5689         for (int j = 0; j < TXG_DEFER_SIZE; j++)
5690                 range_tree_verify_not_present(msp->ms_defer[j], offset, size);
5691         range_tree_verify_not_present(msp->ms_trim, offset, size);
5692         mutex_exit(&msp->ms_lock);
5693 }
5694
5695 void
5696 metaslab_check_free(spa_t *spa, const blkptr_t *bp)
5697 {
5698         if ((zfs_flags & ZFS_DEBUG_ZIO_FREE) == 0)
5699                 return;
5700
5701         spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
5702         for (int i = 0; i < BP_GET_NDVAS(bp); i++) {
5703                 uint64_t vdev = DVA_GET_VDEV(&bp->blk_dva[i]);
5704                 vdev_t *vd = vdev_lookup_top(spa, vdev);
5705                 uint64_t offset = DVA_GET_OFFSET(&bp->blk_dva[i]);
5706                 uint64_t size = DVA_GET_ASIZE(&bp->blk_dva[i]);
5707
5708                 if (DVA_GET_GANG(&bp->blk_dva[i]))
5709                         size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);
5710
5711                 ASSERT3P(vd, !=, NULL);
5712
5713                 metaslab_check_free_impl(vd, offset, size);
5714         }
5715         spa_config_exit(spa, SCL_VDEV, FTAG);
5716 }
5717
5718 static void
5719 metaslab_group_disable_wait(metaslab_group_t *mg)
5720 {
5721         ASSERT(MUTEX_HELD(&mg->mg_ms_disabled_lock));
5722         while (mg->mg_disabled_updating) {
5723                 cv_wait(&mg->mg_ms_disabled_cv, &mg->mg_ms_disabled_lock);
5724         }
5725 }
5726
5727 static void
5728 metaslab_group_disabled_increment(metaslab_group_t *mg)
5729 {
5730         ASSERT(MUTEX_HELD(&mg->mg_ms_disabled_lock));
5731         ASSERT(mg->mg_disabled_updating);
5732
5733         while (mg->mg_ms_disabled >= max_disabled_ms) {
5734                 cv_wait(&mg->mg_ms_disabled_cv, &mg->mg_ms_disabled_lock);
5735         }
5736         mg->mg_ms_disabled++;
5737         ASSERT3U(mg->mg_ms_disabled, <=, max_disabled_ms);
5738 }
5739
5740 /*
5741  * Mark the metaslab as disabled to prevent any allocations on this metaslab.
5742  * We must also track how many metaslabs are currently disabled within a
5743  * metaslab group and limit them to prevent allocation failures from
5744  * occurring because all metaslabs are disabled.
5745  */
5746 void
5747 metaslab_disable(metaslab_t *msp)
5748 {
5749         ASSERT(!MUTEX_HELD(&msp->ms_lock));
5750         metaslab_group_t *mg = msp->ms_group;
5751
5752         mutex_enter(&mg->mg_ms_disabled_lock);
5753
5754         /*
5755          * To keep an accurate count of how many threads have disabled
5756          * a specific metaslab group, we only allow one thread to mark
5757          * the metaslab group at a time. This ensures that the value of
5758          * ms_disabled will be accurate when we decide to mark a metaslab
5759          * group as disabled. To do this we force all other threads
5760          * to wait till the metaslab's mg_disabled_updating flag is no
5761          * longer set.
5762          */
5763         metaslab_group_disable_wait(mg);
5764         mg->mg_disabled_updating = B_TRUE;
5765         if (msp->ms_disabled == 0) {
5766                 metaslab_group_disabled_increment(mg);
5767         }
5768         mutex_enter(&msp->ms_lock);
5769         msp->ms_disabled++;
5770         mutex_exit(&msp->ms_lock);
5771
5772         mg->mg_disabled_updating = B_FALSE;
5773         cv_broadcast(&mg->mg_ms_disabled_cv);
5774         mutex_exit(&mg->mg_ms_disabled_lock);
5775 }
5776
5777 void
5778 metaslab_enable(metaslab_t *msp, boolean_t sync, boolean_t unload)
5779 {
5780         metaslab_group_t *mg = msp->ms_group;
5781         spa_t *spa = mg->mg_vd->vdev_spa;
5782
5783         /*
5784          * Wait for the outstanding IO to be synced to prevent newly
5785          * allocated blocks from being overwritten.  This used by
5786          * initialize and TRIM which are modifying unallocated space.
5787          */
5788         if (sync)
5789                 txg_wait_synced(spa_get_dsl(spa), 0);
5790
5791         mutex_enter(&mg->mg_ms_disabled_lock);
5792         mutex_enter(&msp->ms_lock);
5793         if (--msp->ms_disabled == 0) {
5794                 mg->mg_ms_disabled--;
5795                 cv_broadcast(&mg->mg_ms_disabled_cv);
5796                 if (unload)
5797                         metaslab_unload(msp);
5798         }
5799         mutex_exit(&msp->ms_lock);
5800         mutex_exit(&mg->mg_ms_disabled_lock);
5801 }
5802
5803 static void
5804 metaslab_update_ondisk_flush_data(metaslab_t *ms, dmu_tx_t *tx)
5805 {
5806         vdev_t *vd = ms->ms_group->mg_vd;
5807         spa_t *spa = vd->vdev_spa;
5808         objset_t *mos = spa_meta_objset(spa);
5809
5810         ASSERT(spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP));
5811
5812         metaslab_unflushed_phys_t entry = {
5813                 .msp_unflushed_txg = metaslab_unflushed_txg(ms),
5814         };
5815         uint64_t entry_size = sizeof (entry);
5816         uint64_t entry_offset = ms->ms_id * entry_size;
5817
5818         uint64_t object = 0;
5819         int err = zap_lookup(mos, vd->vdev_top_zap,
5820             VDEV_TOP_ZAP_MS_UNFLUSHED_PHYS_TXGS, sizeof (uint64_t), 1,
5821             &object);
5822         if (err == ENOENT) {
5823                 object = dmu_object_alloc(mos, DMU_OTN_UINT64_METADATA,
5824                     SPA_OLD_MAXBLOCKSIZE, DMU_OT_NONE, 0, tx);
5825                 VERIFY0(zap_add(mos, vd->vdev_top_zap,
5826                     VDEV_TOP_ZAP_MS_UNFLUSHED_PHYS_TXGS, sizeof (uint64_t), 1,
5827                     &object, tx));
5828         } else {
5829                 VERIFY0(err);
5830         }
5831
5832         dmu_write(spa_meta_objset(spa), object, entry_offset, entry_size,
5833             &entry, tx);
5834 }
5835
5836 void
5837 metaslab_set_unflushed_txg(metaslab_t *ms, uint64_t txg, dmu_tx_t *tx)
5838 {
5839         spa_t *spa = ms->ms_group->mg_vd->vdev_spa;
5840
5841         if (!spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP))
5842                 return;
5843
5844         ms->ms_unflushed_txg = txg;
5845         metaslab_update_ondisk_flush_data(ms, tx);
5846 }
5847
5848 uint64_t
5849 metaslab_unflushed_txg(metaslab_t *ms)
5850 {
5851         return (ms->ms_unflushed_txg);
5852 }
5853
/*
 * Module parameters, declared only when building with _KERNEL defined.
 * Each tunable is registered with module_param() using mode 0644 and
 * described with MODULE_PARM_DESC().
 */
#if defined(_KERNEL)
/* BEGIN CSTYLED */
module_param(metaslab_aliquot, ulong, 0644);
MODULE_PARM_DESC(metaslab_aliquot,
        "allocation granularity (a.k.a. stripe size)");

module_param(metaslab_debug_load, int, 0644);
MODULE_PARM_DESC(metaslab_debug_load,
        "load all metaslabs when pool is first opened");

module_param(metaslab_debug_unload, int, 0644);
MODULE_PARM_DESC(metaslab_debug_unload,
        "prevent metaslabs from being unloaded");

module_param(metaslab_preload_enabled, int, 0644);
MODULE_PARM_DESC(metaslab_preload_enabled,
        "preload potential metaslabs during reassessment");

/* Metaslab group / metaslab eligibility thresholds. */
module_param(zfs_mg_noalloc_threshold, int, 0644);
MODULE_PARM_DESC(zfs_mg_noalloc_threshold,
        "percentage of free space for metaslab group to allow allocation");

module_param(zfs_mg_fragmentation_threshold, int, 0644);
MODULE_PARM_DESC(zfs_mg_fragmentation_threshold,
        "fragmentation for metaslab group to allow allocation");

module_param(zfs_metaslab_fragmentation_threshold, int, 0644);
MODULE_PARM_DESC(zfs_metaslab_fragmentation_threshold,
        "fragmentation for metaslab to allow allocation");

/* Metaslab weighting and selection switches. */
module_param(metaslab_fragmentation_factor_enabled, int, 0644);
MODULE_PARM_DESC(metaslab_fragmentation_factor_enabled,
        "use the fragmentation metric to prefer less fragmented metaslabs");

module_param(metaslab_lba_weighting_enabled, int, 0644);
MODULE_PARM_DESC(metaslab_lba_weighting_enabled,
        "prefer metaslabs with lower LBAs");

module_param(metaslab_bias_enabled, int, 0644);
MODULE_PARM_DESC(metaslab_bias_enabled,
        "enable metaslab group biasing");

module_param(zfs_metaslab_segment_weight_enabled, int, 0644);
MODULE_PARM_DESC(zfs_metaslab_segment_weight_enabled,
        "enable segment-based metaslab selection");

module_param(zfs_metaslab_switch_threshold, int, 0644);
MODULE_PARM_DESC(zfs_metaslab_switch_threshold,
        "segment-based metaslab selection maximum buckets before switching");

module_param(metaslab_force_ganging, ulong, 0644);
MODULE_PARM_DESC(metaslab_force_ganging,
        "blocks larger than this size are forced to be gang blocks");

/* Tunables named after the "df" allocator. */
module_param(metaslab_df_max_search, int, 0644);
MODULE_PARM_DESC(metaslab_df_max_search,
        "max distance (bytes) to search forward before using size tree");

module_param(metaslab_df_use_largest_segment, int, 0644);
MODULE_PARM_DESC(metaslab_df_use_largest_segment,
        "when looking in size tree, use largest segment instead of exact fit");

module_param(zfs_metaslab_max_size_cache_sec, ulong, 0644);
MODULE_PARM_DESC(zfs_metaslab_max_size_cache_sec,
        "how long to trust the cached max chunk size of a metaslab");

module_param(zfs_metaslab_mem_limit, int, 0644);
MODULE_PARM_DESC(zfs_metaslab_mem_limit,
        "percentage of memory that can be used to store metaslab range trees");
/* END CSTYLED */

#endif /* _KERNEL */