granicus.if.org Git - zfs/blob - module/zfs/arc.c

   1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  23  * Copyright (c) 2012, Joyent, Inc. All rights reserved.
  24  * Copyright (c) 2011, 2016 by Delphix. All rights reserved.
  25  * Copyright (c) 2014 by Saso Kiselkov. All rights reserved.
  26  * Copyright 2014 Nexenta Systems, Inc.  All rights reserved.
  27  */
  28
  29 /*
  30  * DVA-based Adjustable Replacement Cache
  31  *
  32  * While much of the theory of operation used here is
  33  * based on the self-tuning, low overhead replacement cache
  34  * presented by Megiddo and Modha at FAST 2003, there are some
  35  * significant differences:
  36  *
  37  * 1. The Megiddo and Modha model assumes any page is evictable.
  38  * Pages in its cache cannot be "locked" into memory.  This makes
  39  * the eviction algorithm simple: evict the last page in the list.
  40  * This also makes the performance characteristics easy to reason
  41  * about.  Our cache is not so simple.  At any given moment, some
  42  * subset of the blocks in the cache are un-evictable because we
  43  * have handed out a reference to them.  Blocks are only evictable
  44  * when there are no external references active.  This makes
  45  * eviction far more problematic:  we choose to evict the evictable
  46  * blocks that are the "lowest" in the list.
  47  *
  48  * There are times when it is not possible to evict the requested
  49  * space.  In these circumstances we are unable to adjust the cache
  50  * size.  To prevent the cache growing unbounded at these times we
  51  * implement a "cache throttle" that slows the flow of new data
  52  * into the cache until we can make space available.
  53  *
  54  * 2. The Megiddo and Modha model assumes a fixed cache size.
  55  * Pages are evicted when the cache is full and there is a cache
  56  * miss.  Our model has a variable sized cache.  It grows with
  57  * high use, but also tries to react to memory pressure from the
  58  * operating system: decreasing its size when system memory is
  59  * tight.
  60  *
  61  * 3. The Megiddo and Modha model assumes a fixed page size. All
  62  * elements of the cache are therefore exactly the same size.  So
  63  * when adjusting the cache size following a cache miss, it's simply
  64  * a matter of choosing a single page to evict.  In our model, we
  65  * have variable sized cache blocks (ranging from 512 bytes to
  66  * 128K bytes).  We therefore choose a set of blocks to evict to make
  67  * space for a cache miss that approximates as closely as possible
  68  * the space used by the new block.
  69  *
  70  * See also:  "ARC: A Self-Tuning, Low Overhead Replacement Cache"
  71  * by N. Megiddo & D. Modha, FAST 2003
  72  */
  73
  74 /*
  75  * The locking model:
  76  *
  77  * A new reference to a cache buffer can be obtained in two
  78  * ways: 1) via a hash table lookup using the DVA as a key,
  79  * or 2) via one of the ARC lists.  The arc_read() interface
  80  * uses method 1, while the internal arc algorithms for
  81  * adjusting the cache use method 2.  We therefore provide two
  82  * types of locks: 1) the hash table lock array, and 2) the
  83  * arc list locks.
  84  *
  85  * Buffers do not have their own mutexes, rather they rely on the
  86  * hash table mutexes for the bulk of their protection (i.e. most
  87  * fields in the arc_buf_hdr_t are protected by these mutexes).
  88  *
  89  * buf_hash_find() returns the appropriate mutex (held) when it
  90  * locates the requested buffer in the hash table.  It returns
  91  * NULL for the mutex if the buffer was not in the table.
  92  *
  93  * buf_hash_remove() expects the appropriate hash mutex to be
  94  * already held before it is invoked.
  95  *
  96  * Each arc state also has a mutex which is used to protect the
  97  * buffer list associated with the state.  When attempting to
  98  * obtain a hash table lock while holding an arc list lock you
  99  * must use: mutex_tryenter() to avoid deadlock.  Also note that
 100  * the active state mutex must be held before the ghost state mutex.
 101  *
 102  * Arc buffers may have an associated eviction callback function.
 103  * This function will be invoked prior to removing the buffer (e.g.
 104  * in arc_do_user_evicts()).  Note however that the data associated
 105  * with the buffer may be evicted prior to the callback.  The callback
 106  * must be made with *no locks held* (to prevent deadlock).  Additionally,
 107  * the users of callbacks must ensure that their private data is
 108  * protected from simultaneous callbacks from arc_clear_callback()
 109  * and arc_do_user_evicts().
 110  *
 111  * It is also possible to register a callback which is run when the
 112  * arc_meta_limit is reached and no buffers can be safely evicted.  In
 113  * this case the arc user should drop a reference on some arc buffers so
 114  * they can be reclaimed and the arc_meta_limit honored.  For example,
 115  * when using the ZPL each dentry holds a reference on a znode.  These
 116  * dentries must be pruned before the arc buffer holding the znode can
 117  * be safely evicted.
 118  *
 119  * Note that the majority of the performance stats are manipulated
 120  * with atomic operations.
 121  *
 122  * The L2ARC uses the l2ad_mtx on each vdev for the following:
 123  *
 124  *      - L2ARC buflist creation
 125  *      - L2ARC buflist eviction
 126  *      - L2ARC write completion, which walks L2ARC buflists
 127  *      - ARC header destruction, as it removes from L2ARC buflists
 128  *      - ARC header release, as it removes from L2ARC buflists
 129  */
 130
 131 #include <sys/spa.h>
 132 #include <sys/zio.h>
 133 #include <sys/zio_compress.h>
 134 #include <sys/zfs_context.h>
 135 #include <sys/arc.h>
 136 #include <sys/refcount.h>
 137 #include <sys/vdev.h>
 138 #include <sys/vdev_impl.h>
 139 #include <sys/dsl_pool.h>
 140 #include <sys/multilist.h>
 141 #ifdef _KERNEL
 142 #include <sys/vmsystm.h>
 143 #include <vm/anon.h>
 144 #include <sys/fs/swapnode.h>
 145 #include <sys/zpl.h>
 146 #include <linux/mm_compat.h>
 147 #endif
 148 #include <sys/callb.h>
 149 #include <sys/kstat.h>
 150 #include <sys/dmu_tx.h>
 151 #include <zfs_fletcher.h>
 152 #include <sys/arc_impl.h>
 153 #include <sys/trace_arc.h>
 154
 155 #ifndef _KERNEL
 156 /*
      * Defined only when _KERNEL is not.  Set with ZFS_DEBUG=watch to enable
      * watchpoints on frozen buffers; B_FALSE until then.
      */
 157 boolean_t arc_watch = B_FALSE;
 158 #endif
 159
 160 static kmutex_t         arc_reclaim_lock;
     /*
      * NOTE(review): the mutex above and the two condvars / exit flag below
      * share the arc_reclaim_ prefix and look like the synchronization state
      * of a reclaim thread; their users are outside this view -- confirm.
      */
 161 static kcondvar_t       arc_reclaim_thread_cv;
 162 static boolean_t        arc_reclaim_thread_exit;
 163 static kcondvar_t       arc_reclaim_waiters_cv;
 164
     /*
      * NOTE(review): presumably the matching mutex / condvar / exit-flag trio
      * for the thread that runs arc_do_user_evicts() (mentioned in the locking
      * model comment at the top of the file) -- confirm against its users.
      */
 165 static kmutex_t         arc_user_evicts_lock;
 166 static kcondvar_t       arc_user_evicts_cv;
 167 static boolean_t        arc_user_evicts_thread_exit;
 168
 169 /*
 170  * The number of headers to evict in arc_evict_state_impl() before
 171  * dropping the sublist lock and evicting from another sublist. A lower
 172  * value means we're more likely to evict the "correct" header (i.e. the
 173  * oldest header in the arc state), but comes with higher overhead
 174  * (i.e. more invocations of arc_evict_state_impl()).
 175  */
 176 int zfs_arc_evict_batch_limit = 10;
 177
 178 /*
 179  * The number of sublists used for each of the arc state lists. If this
 180  * is not set to a suitable value by the user, it will be configured to
 181  * the number of CPUs on the system in arc_init().
 182  */
 183 int zfs_arc_num_sublists_per_state = 0;
 184
 185 /* number of seconds before growing cache again */
 186 static int              arc_grow_retry = 5;
 187
 188 /* shift of arc_c for calculating overflow limit in arc_get_data_buf */
 189 int             zfs_arc_overflow_shift = 8;
 190
 191 /* shift of arc_c for calculating both min and max arc_p */
 192 static int              arc_p_min_shift = 4;
 193
 194 /* log2(fraction of arc to reclaim) */
 195 static int              arc_shrink_shift = 7;
 196
 197 /*
 198  * log2(fraction of ARC which must be free to allow growing).
 199  * I.e. if there is less than arc_c >> arc_no_grow_shift free memory,
 200  * when reading a new block into the ARC, we will evict an equal-sized block
 201  * from the ARC.
 202  *
 203  * This must be less than arc_shrink_shift, so that when we shrink the ARC,
 204  * we will still not allow it to grow (the defaults here satisfy this: 5 < 7).
 205  */
 206 int                     arc_no_grow_shift = 5;
 207
 208
 209 /*
 210  * minimum lifespan of a prefetch block in clock ticks
 211  * (initialized in arc_init())
 212  */
 213 static int              arc_min_prefetch_lifespan;
 214
 215 /*
 216  * If this percent of memory is free, don't throttle.
 217  */
 218 int arc_lotsfree_percent = 10;
 219
     /*
      * NOTE(review): arc_dead has no comment in the original and nothing in
      * this view reads or writes it; presumably a flag marking the ARC as
      * shut down -- confirm against its users.
      */
 220 static int arc_dead;
 221
 222 /*
 223  * The arc has filled available memory and has now warmed up.
 224  */
 225 static boolean_t arc_warm;
 226
 227 /*
 228  * These tunables are for performance analysis.
      *
      * NOTE(review): most of them default to 0 here; presumably 0 means
      * "not set, keep the built-in value" -- confirm in the code that
      * consumes them, which is outside this view.
 229  */
 230 unsigned long zfs_arc_max = 0;
 231 unsigned long zfs_arc_min = 0;
 232 unsigned long zfs_arc_meta_limit = 0;
 233 unsigned long zfs_arc_meta_min = 0;
 234 unsigned long zfs_arc_dnode_limit = 0;
 235 unsigned long zfs_arc_dnode_reduce_percent = 10;
 236 int zfs_arc_grow_retry = 0;
 237 int zfs_arc_shrink_shift = 0;
 238 int zfs_arc_p_min_shift = 0;
 239 int zfs_disable_dup_eviction = 0;
 240 int zfs_arc_average_blocksize = 8 * 1024; /* 8KB */
 241
 242 /*
 243  * ARC will evict meta buffers that exceed arc_meta_limit. This
 244  * tunable makes arc_meta_limit adjustable for different workloads.
 245  */
 246 unsigned long zfs_arc_meta_limit_percent = 75;
 247
 248 /*
 249  * Percentage that can be consumed by dnodes of ARC meta buffers.
 250  */
 251 unsigned long zfs_arc_dnode_limit_percent = 10;
 252
 253 /*
 254  * These tunables are Linux specific
 255  */
 256 unsigned long zfs_arc_sys_free = 0;
 257 int zfs_arc_min_prefetch_lifespan = 0;
 258 int zfs_arc_p_aggressive_disable = 1;
 259 int zfs_arc_p_dampener_disable = 1;
 260 int zfs_arc_meta_prune = 10000;
 261 int zfs_arc_meta_strategy = ARC_STRATEGY_META_BALANCED;
 262 int zfs_arc_meta_adjust_restarts = 4096;
 263 int zfs_arc_lotsfree_percent = 10; /* same default as arc_lotsfree_percent */
 264
 265 /*
      * The 6 states.
      *
      * NOTE(review): these are the state objects themselves; the lower-case
      * arc_anon ... arc_l2c_only pointers declared further down in this file
      * are presumably pointed at them by init code outside this view --
      * confirm.
      */
 266 static arc_state_t ARC_anon;
 267 static arc_state_t ARC_mru;
 268 static arc_state_t ARC_mru_ghost;
 269 static arc_state_t ARC_mfu;
 270 static arc_state_t ARC_mfu_ghost;
 271 static arc_state_t ARC_l2c_only;
 272
 273 typedef struct arc_stats {
             /*
              * One kstat_named_t per ARC statistic.  The arc_stats initializer
              * in this file is positional, so the order of the fields here must
              * stay in sync with the order of the names listed there.
              */
 274         kstat_named_t arcstat_hits;
 275         kstat_named_t arcstat_misses;
 276         kstat_named_t arcstat_demand_data_hits;
 277         kstat_named_t arcstat_demand_data_misses;
 278         kstat_named_t arcstat_demand_metadata_hits;
 279         kstat_named_t arcstat_demand_metadata_misses;
 280         kstat_named_t arcstat_prefetch_data_hits;
 281         kstat_named_t arcstat_prefetch_data_misses;
 282         kstat_named_t arcstat_prefetch_metadata_hits;
 283         kstat_named_t arcstat_prefetch_metadata_misses;
 284         kstat_named_t arcstat_mru_hits;
 285         kstat_named_t arcstat_mru_ghost_hits;
 286         kstat_named_t arcstat_mfu_hits;
 287         kstat_named_t arcstat_mfu_ghost_hits;
 288         kstat_named_t arcstat_deleted;
 289         /*
 290          * Number of buffers that could not be evicted because the hash lock
 291          * was held by another thread.  The lock may not necessarily be held
 292          * by something using the same buffer, since hash locks are shared
 293          * by multiple buffers.
 294          */
 295         kstat_named_t arcstat_mutex_miss;
 296         /*
 297          * Number of buffers skipped because they have I/O in progress, are
 298          * indirect prefetch buffers that have not lived long enough, or are
 299          * not from the spa we're trying to evict from.
 300          */
 301         kstat_named_t arcstat_evict_skip;
 302         /*
 303          * Number of times arc_evict_state() was unable to evict enough
 304          * buffers to reach its target amount.
 305          */
 306         kstat_named_t arcstat_evict_not_enough;
 307         kstat_named_t arcstat_evict_l2_cached;
 308         kstat_named_t arcstat_evict_l2_eligible;
 309         kstat_named_t arcstat_evict_l2_ineligible;
 310         kstat_named_t arcstat_evict_l2_skip;
 311         kstat_named_t arcstat_hash_elements;
 312         kstat_named_t arcstat_hash_elements_max;
 313         kstat_named_t arcstat_hash_collisions;
 314         kstat_named_t arcstat_hash_chains;
 315         kstat_named_t arcstat_hash_chain_max;
 316         kstat_named_t arcstat_p;
 317         kstat_named_t arcstat_c;
 318         kstat_named_t arcstat_c_min;
 319         kstat_named_t arcstat_c_max;
 320         kstat_named_t arcstat_size;
 321         /*
 322          * Number of bytes consumed by internal ARC structures necessary
 323          * for tracking purposes; these structures are not actually
 324          * backed by ARC buffers. This includes arc_buf_hdr_t structures
 325          * (allocated via arc_buf_hdr_t_full and arc_buf_hdr_t_l2only
 326          * caches), and arc_buf_t structures (allocated via arc_buf_t
 327          * cache).
 328          */
 329         kstat_named_t arcstat_hdr_size;
 330         /*
 331          * Number of bytes consumed by ARC buffers of type equal to
 332          * ARC_BUFC_DATA. This is generally consumed by buffers backing
 333          * on disk user data (e.g. plain file contents).
 334          */
 335         kstat_named_t arcstat_data_size;
 336         /*
 337          * Number of bytes consumed by ARC buffers of type equal to
 338          * ARC_BUFC_METADATA. This is generally consumed by buffers
 339          * backing on disk data that is used for internal ZFS
 340          * structures (e.g. ZAP, dnode, indirect blocks, etc).
 341          */
 342         kstat_named_t arcstat_metadata_size;
 343         /*
 344          * Number of bytes consumed by dmu_buf_impl_t objects.
 345          */
 346         kstat_named_t arcstat_dbuf_size;
 347         /*
 348          * Number of bytes consumed by dnode_t objects.
 349          */
 350         kstat_named_t arcstat_dnode_size;
 351         /*
 352          * Number of bytes consumed by bonus buffers.
 353          */
 354         kstat_named_t arcstat_bonus_size;
 355         /*
 356          * Total number of bytes consumed by ARC buffers residing in the
 357          * arc_anon state. This includes *all* buffers in the arc_anon
 358          * state; e.g. data, metadata, evictable, and unevictable buffers
 359          * are all included in this value.
 360          */
 361         kstat_named_t arcstat_anon_size;
 362         /*
 363          * Number of bytes consumed by ARC buffers that meet the
 364          * following criteria: backing buffers of type ARC_BUFC_DATA,
 365          * residing in the arc_anon state, and are eligible for eviction
 366          * (e.g. have no outstanding holds on the buffer).
 367          */
 368         kstat_named_t arcstat_anon_evictable_data;
 369         /*
 370          * Number of bytes consumed by ARC buffers that meet the
 371          * following criteria: backing buffers of type ARC_BUFC_METADATA,
 372          * residing in the arc_anon state, and are eligible for eviction
 373          * (e.g. have no outstanding holds on the buffer).
 374          */
 375         kstat_named_t arcstat_anon_evictable_metadata;
 376         /*
 377          * Total number of bytes consumed by ARC buffers residing in the
 378          * arc_mru state. This includes *all* buffers in the arc_mru
 379          * state; e.g. data, metadata, evictable, and unevictable buffers
 380          * are all included in this value.
 381          */
 382         kstat_named_t arcstat_mru_size;
 383         /*
 384          * Number of bytes consumed by ARC buffers that meet the
 385          * following criteria: backing buffers of type ARC_BUFC_DATA,
 386          * residing in the arc_mru state, and are eligible for eviction
 387          * (e.g. have no outstanding holds on the buffer).
 388          */
 389         kstat_named_t arcstat_mru_evictable_data;
 390         /*
 391          * Number of bytes consumed by ARC buffers that meet the
 392          * following criteria: backing buffers of type ARC_BUFC_METADATA,
 393          * residing in the arc_mru state, and are eligible for eviction
 394          * (e.g. have no outstanding holds on the buffer).
 395          */
 396         kstat_named_t arcstat_mru_evictable_metadata;
 397         /*
 398          * Total number of bytes that *would have been* consumed by ARC
 399          * buffers in the arc_mru_ghost state. The key thing to note
 400          * here, is the fact that this size doesn't actually indicate
 401          * RAM consumption. The ghost lists only consist of headers and
 402          * don't actually have ARC buffers linked off of these headers.
 403          * Thus, *if* the headers had associated ARC buffers, these
 404          * buffers *would have* consumed this number of bytes.
 405          */
 406         kstat_named_t arcstat_mru_ghost_size;
 407         /*
 408          * Number of bytes that *would have been* consumed by ARC
 409          * buffers that are eligible for eviction, of type
 410          * ARC_BUFC_DATA, and linked off the arc_mru_ghost state.
 411          */
 412         kstat_named_t arcstat_mru_ghost_evictable_data;
 413         /*
 414          * Number of bytes that *would have been* consumed by ARC
 415          * buffers that are eligible for eviction, of type
 416          * ARC_BUFC_METADATA, and linked off the arc_mru_ghost state.
 417          */
 418         kstat_named_t arcstat_mru_ghost_evictable_metadata;
 419         /*
 420          * Total number of bytes consumed by ARC buffers residing in the
 421          * arc_mfu state. This includes *all* buffers in the arc_mfu
 422          * state; e.g. data, metadata, evictable, and unevictable buffers
 423          * are all included in this value.
 424          */
 425         kstat_named_t arcstat_mfu_size;
 426         /*
 427          * Number of bytes consumed by ARC buffers that are eligible for
 428          * eviction, of type ARC_BUFC_DATA, and reside in the arc_mfu
 429          * state.
 430          */
 431         kstat_named_t arcstat_mfu_evictable_data;
 432         /*
 433          * Number of bytes consumed by ARC buffers that are eligible for
 434          * eviction, of type ARC_BUFC_METADATA, and reside in the
 435          * arc_mfu state.
 436          */
 437         kstat_named_t arcstat_mfu_evictable_metadata;
 438         /*
 439          * Total number of bytes that *would have been* consumed by ARC
 440          * buffers in the arc_mfu_ghost state. See the comment above
 441          * arcstat_mru_ghost_size for more details.
 442          */
 443         kstat_named_t arcstat_mfu_ghost_size;
 444         /*
 445          * Number of bytes that *would have been* consumed by ARC
 446          * buffers that are eligible for eviction, of type
 447          * ARC_BUFC_DATA, and linked off the arc_mfu_ghost state.
 448          */
 449         kstat_named_t arcstat_mfu_ghost_evictable_data;
 450         /*
 451          * Number of bytes that *would have been* consumed by ARC
 452          * buffers that are eligible for eviction, of type
 453          * ARC_BUFC_METADATA, and linked off the arc_mfu_ghost state.
 454          */
 455         kstat_named_t arcstat_mfu_ghost_evictable_metadata;
 456         kstat_named_t arcstat_l2_hits;
 457         kstat_named_t arcstat_l2_misses;
 458         kstat_named_t arcstat_l2_feeds;
 459         kstat_named_t arcstat_l2_rw_clash;
 460         kstat_named_t arcstat_l2_read_bytes;
 461         kstat_named_t arcstat_l2_write_bytes;
 462         kstat_named_t arcstat_l2_writes_sent;
 463         kstat_named_t arcstat_l2_writes_done;
 464         kstat_named_t arcstat_l2_writes_error;
 465         kstat_named_t arcstat_l2_writes_lock_retry;
 466         kstat_named_t arcstat_l2_writes_skip_toobig;
 467         kstat_named_t arcstat_l2_evict_lock_retry;
 468         kstat_named_t arcstat_l2_evict_reading;
 469         kstat_named_t arcstat_l2_evict_l1cached;
 470         kstat_named_t arcstat_l2_free_on_write;
 471         kstat_named_t arcstat_l2_cdata_free_on_write;
 472         kstat_named_t arcstat_l2_abort_lowmem;
 473         kstat_named_t arcstat_l2_cksum_bad;
 474         kstat_named_t arcstat_l2_io_error;
 475         kstat_named_t arcstat_l2_size;
 476         kstat_named_t arcstat_l2_asize;
 477         kstat_named_t arcstat_l2_hdr_size;
 478         kstat_named_t arcstat_l2_compress_successes;
 479         kstat_named_t arcstat_l2_compress_zeros;
 480         kstat_named_t arcstat_l2_compress_failures;
 481         kstat_named_t arcstat_memory_throttle_count;
 482         kstat_named_t arcstat_duplicate_buffers;
 483         kstat_named_t arcstat_duplicate_buffers_size;
 484         kstat_named_t arcstat_duplicate_reads;
 485         kstat_named_t arcstat_memory_direct_count;
 486         kstat_named_t arcstat_memory_indirect_count;
 487         kstat_named_t arcstat_no_grow;
 488         kstat_named_t arcstat_tempreserve;
 489         kstat_named_t arcstat_loaned_bytes;
 490         kstat_named_t arcstat_prune;
 491         kstat_named_t arcstat_meta_used;
 492         kstat_named_t arcstat_meta_limit;
 493         kstat_named_t arcstat_dnode_limit;
 494         kstat_named_t arcstat_meta_max;
 495         kstat_named_t arcstat_meta_min;
 496         kstat_named_t arcstat_sync_wait_for_async;
 497         kstat_named_t arcstat_demand_hit_predictive_prefetch;
 498         kstat_named_t arcstat_need_free;
 499         kstat_named_t arcstat_sys_free;
 500 } arc_stats_t;
 501
 502 static arc_stats_t arc_stats = {
             /*
              * Positional initializers: the Nth { "name", KSTAT_DATA_UINT64 }
              * pair initializes the Nth field of arc_stats_t, so this list must
              * be kept in the same order as the struct.  Every entry is
              * declared as KSTAT_DATA_UINT64, matching the value.ui64 member
              * that the ARCSTAT macros access.  Note that the last group of
              * names carries an "arc_" prefix (e.g. "arc_no_grow") although the
              * fields are named arcstat_no_grow etc.
              */
 503         { "hits",                       KSTAT_DATA_UINT64 },
 504         { "misses",                     KSTAT_DATA_UINT64 },
 505         { "demand_data_hits",           KSTAT_DATA_UINT64 },
 506         { "demand_data_misses",         KSTAT_DATA_UINT64 },
 507         { "demand_metadata_hits",       KSTAT_DATA_UINT64 },
 508         { "demand_metadata_misses",     KSTAT_DATA_UINT64 },
 509         { "prefetch_data_hits",         KSTAT_DATA_UINT64 },
 510         { "prefetch_data_misses",       KSTAT_DATA_UINT64 },
 511         { "prefetch_metadata_hits",     KSTAT_DATA_UINT64 },
 512         { "prefetch_metadata_misses",   KSTAT_DATA_UINT64 },
 513         { "mru_hits",                   KSTAT_DATA_UINT64 },
 514         { "mru_ghost_hits",             KSTAT_DATA_UINT64 },
 515         { "mfu_hits",                   KSTAT_DATA_UINT64 },
 516         { "mfu_ghost_hits",             KSTAT_DATA_UINT64 },
 517         { "deleted",                    KSTAT_DATA_UINT64 },
 518         { "mutex_miss",                 KSTAT_DATA_UINT64 },
 519         { "evict_skip",                 KSTAT_DATA_UINT64 },
 520         { "evict_not_enough",           KSTAT_DATA_UINT64 },
 521         { "evict_l2_cached",            KSTAT_DATA_UINT64 },
 522         { "evict_l2_eligible",          KSTAT_DATA_UINT64 },
 523         { "evict_l2_ineligible",        KSTAT_DATA_UINT64 },
 524         { "evict_l2_skip",              KSTAT_DATA_UINT64 },
 525         { "hash_elements",              KSTAT_DATA_UINT64 },
 526         { "hash_elements_max",          KSTAT_DATA_UINT64 },
 527         { "hash_collisions",            KSTAT_DATA_UINT64 },
 528         { "hash_chains",                KSTAT_DATA_UINT64 },
 529         { "hash_chain_max",             KSTAT_DATA_UINT64 },
 530         { "p",                          KSTAT_DATA_UINT64 },
 531         { "c",                          KSTAT_DATA_UINT64 },
 532         { "c_min",                      KSTAT_DATA_UINT64 },
 533         { "c_max",                      KSTAT_DATA_UINT64 },
 534         { "size",                       KSTAT_DATA_UINT64 },
 535         { "hdr_size",                   KSTAT_DATA_UINT64 },
 536         { "data_size",                  KSTAT_DATA_UINT64 },
 537         { "metadata_size",              KSTAT_DATA_UINT64 },
 538         { "dbuf_size",                  KSTAT_DATA_UINT64 },
 539         { "dnode_size",                 KSTAT_DATA_UINT64 },
 540         { "bonus_size",                 KSTAT_DATA_UINT64 },
 541         { "anon_size",                  KSTAT_DATA_UINT64 },
 542         { "anon_evictable_data",        KSTAT_DATA_UINT64 },
 543         { "anon_evictable_metadata",    KSTAT_DATA_UINT64 },
 544         { "mru_size",                   KSTAT_DATA_UINT64 },
 545         { "mru_evictable_data",         KSTAT_DATA_UINT64 },
 546         { "mru_evictable_metadata",     KSTAT_DATA_UINT64 },
 547         { "mru_ghost_size",             KSTAT_DATA_UINT64 },
 548         { "mru_ghost_evictable_data",   KSTAT_DATA_UINT64 },
 549         { "mru_ghost_evictable_metadata", KSTAT_DATA_UINT64 },
 550         { "mfu_size",                   KSTAT_DATA_UINT64 },
 551         { "mfu_evictable_data",         KSTAT_DATA_UINT64 },
 552         { "mfu_evictable_metadata",     KSTAT_DATA_UINT64 },
 553         { "mfu_ghost_size",             KSTAT_DATA_UINT64 },
 554         { "mfu_ghost_evictable_data",   KSTAT_DATA_UINT64 },
 555         { "mfu_ghost_evictable_metadata", KSTAT_DATA_UINT64 },
 556         { "l2_hits",                    KSTAT_DATA_UINT64 },
 557         { "l2_misses",                  KSTAT_DATA_UINT64 },
 558         { "l2_feeds",                   KSTAT_DATA_UINT64 },
 559         { "l2_rw_clash",                KSTAT_DATA_UINT64 },
 560         { "l2_read_bytes",              KSTAT_DATA_UINT64 },
 561         { "l2_write_bytes",             KSTAT_DATA_UINT64 },
 562         { "l2_writes_sent",             KSTAT_DATA_UINT64 },
 563         { "l2_writes_done",             KSTAT_DATA_UINT64 },
 564         { "l2_writes_error",            KSTAT_DATA_UINT64 },
 565         { "l2_writes_lock_retry",       KSTAT_DATA_UINT64 },
 566         { "l2_writes_skip_toobig",      KSTAT_DATA_UINT64 },
 567         { "l2_evict_lock_retry",        KSTAT_DATA_UINT64 },
 568         { "l2_evict_reading",           KSTAT_DATA_UINT64 },
 569         { "l2_evict_l1cached",          KSTAT_DATA_UINT64 },
 570         { "l2_free_on_write",           KSTAT_DATA_UINT64 },
 571         { "l2_cdata_free_on_write",     KSTAT_DATA_UINT64 },
 572         { "l2_abort_lowmem",            KSTAT_DATA_UINT64 },
 573         { "l2_cksum_bad",               KSTAT_DATA_UINT64 },
 574         { "l2_io_error",                KSTAT_DATA_UINT64 },
 575         { "l2_size",                    KSTAT_DATA_UINT64 },
 576         { "l2_asize",                   KSTAT_DATA_UINT64 },
 577         { "l2_hdr_size",                KSTAT_DATA_UINT64 },
 578         { "l2_compress_successes",      KSTAT_DATA_UINT64 },
 579         { "l2_compress_zeros",          KSTAT_DATA_UINT64 },
 580         { "l2_compress_failures",       KSTAT_DATA_UINT64 },
 581         { "memory_throttle_count",      KSTAT_DATA_UINT64 },
 582         { "duplicate_buffers",          KSTAT_DATA_UINT64 },
 583         { "duplicate_buffers_size",     KSTAT_DATA_UINT64 },
 584         { "duplicate_reads",            KSTAT_DATA_UINT64 },
 585         { "memory_direct_count",        KSTAT_DATA_UINT64 },
 586         { "memory_indirect_count",      KSTAT_DATA_UINT64 },
 587         { "arc_no_grow",                KSTAT_DATA_UINT64 },
 588         { "arc_tempreserve",            KSTAT_DATA_UINT64 },
 589         { "arc_loaned_bytes",           KSTAT_DATA_UINT64 },
 590         { "arc_prune",                  KSTAT_DATA_UINT64 },
 591         { "arc_meta_used",              KSTAT_DATA_UINT64 },
 592         { "arc_meta_limit",             KSTAT_DATA_UINT64 },
 593         { "arc_dnode_limit",            KSTAT_DATA_UINT64 },
 594         { "arc_meta_max",               KSTAT_DATA_UINT64 },
 595         { "arc_meta_min",               KSTAT_DATA_UINT64 },
 596         { "sync_wait_for_async",        KSTAT_DATA_UINT64 },
 597         { "demand_hit_predictive_prefetch", KSTAT_DATA_UINT64 },
 598         { "arc_need_free",              KSTAT_DATA_UINT64 },
 599         { "arc_sys_free",               KSTAT_DATA_UINT64 }
 600 };
 601
 602 #define ARCSTAT(stat)   (arc_stats.stat.value.ui64) /* the counter itself, usable as an lvalue */
 603
     /* Add val to the named counter via atomic_add_64(). */
 604 #define ARCSTAT_INCR(stat, val) \
 605         atomic_add_64(&arc_stats.stat.value.ui64, (val))
 606
     /* Add 1 / add -1 to the named counter (both go through ARCSTAT_INCR). */
 607 #define ARCSTAT_BUMP(stat)      ARCSTAT_INCR(stat, 1)
 608 #define ARCSTAT_BUMPDOWN(stat)  ARCSTAT_INCR(stat, -1)
 609
     /*
      * Raise the named counter to val if val is larger.  The loop re-reads the
      * counter into m and retries atomic_cas_64(counter, m, val) for as long as
      * val is still greater than m and the compare-and-swap does not return m.
      *
      * val is evaluated on every iteration (and again in the swap), so it must
      * be free of side effects.  The expansion is a bare { } block rather than
      * do { } while (0): "ARCSTAT_MAX(...);" therefore leaves an empty
      * statement behind, so do not use it as the unbraced body of an if that
      * has an else.
      */
 610 #define ARCSTAT_MAX(stat, val) {                                        \
 611         uint64_t m;                                                     \
 612         while ((val) > (m = arc_stats.stat.value.ui64) &&               \
 613             (m != atomic_cas_64(&arc_stats.stat.value.ui64, m, (val)))) \
 614                 continue;                                               \
 615 }
 616
     /* Update the companion <stat>_max counter from the current value of <stat>. */
 617 #define ARCSTAT_MAXSTAT(stat) \
 618         ARCSTAT_MAX(stat##_max, arc_stats.stat.value.ui64)
 619
 620 /*
 621  * We define a macro to allow ARC hits/misses to be easily broken down by
 622  * two separate conditions, giving a total of four different subtypes for
 623  * each of hits and misses (so eight statistics total).
      *
      * Exactly one counter is bumped per use: arcstat_<a>_<b>_<stat>, where <a>
      * is stat1 if cond1 holds and notstat1 otherwise, and <b> is stat2 if
      * cond2 holds and notstat2 otherwise.  The names are pasted together with
      * ##, so all four combinations must exist as fields of arc_stats_t.
      *
      * The expansion is a bare if/else statement (it is not wrapped in
      * do { } while (0)), so do not use it as the unbraced body of another
      * if/else.
 624  */
 625 #define ARCSTAT_CONDSTAT(cond1, stat1, notstat1, cond2, stat2, notstat2, stat) \
 626         if (cond1) {                                                    \
 627                 if (cond2) {                                            \
 628                         ARCSTAT_BUMP(arcstat_##stat1##_##stat2##_##stat); \
 629                 } else {                                                \
 630                         ARCSTAT_BUMP(arcstat_##stat1##_##notstat2##_##stat); \
 631                 }                                                       \
 632         } else {                                                        \
 633                 if (cond2) {                                            \
 634                         ARCSTAT_BUMP(arcstat_##notstat1##_##stat2##_##stat); \
 635                 } else {                                                \
 636                         ARCSTAT_BUMP(arcstat_##notstat1##_##notstat2##_##stat);\
 637                 }                                                       \
 638         }
 639
 640 kstat_t                 *arc_ksp; /* NOTE(review): presumably the kstat that publishes arc_stats -- confirm */
     /*
      * Pointers to the six ARC states.  GHOST_STATE() in this file compares a
      * state pointer against arc_mru_ghost, arc_mfu_ghost and arc_l2c_only.
      * NOTE(review): presumably set to the addresses of the ARC_* objects by
      * init code outside this view -- confirm.
      */
 641 static arc_state_t      *arc_anon;
 642 static arc_state_t      *arc_mru;
 643 static arc_state_t      *arc_mru_ghost;
 644 static arc_state_t      *arc_mfu;
 645 static arc_state_t      *arc_mfu_ghost;
 646 static arc_state_t      *arc_l2c_only;
 647
 648 /*
 649  * There are several ARC variables that are critical to export as kstats --
 650  * but we don't want to have to grovel around in the kstat whenever we wish to
 651  * manipulate them.  For these variables, we therefore define them to be in
 652  * terms of the statistic variable.  This assures that we are not introducing
 653  * the possibility of inconsistency by having shadow copies of the variables,
 654  * while still allowing the code to be readable.
      *
      * Each alias expands, through ARCSTAT(), to the value.ui64 member of the
      * corresponding arc_stats field, so it can be read and assigned like an
      * ordinary variable.  Such uses are plain accesses; they do not go through
      * atomic_add_64() the way ARCSTAT_INCR() does.
 655  */
 656 #define arc_size        ARCSTAT(arcstat_size)   /* actual total arc size */
 657 #define arc_p           ARCSTAT(arcstat_p)      /* target size of MRU */
 658 #define arc_c           ARCSTAT(arcstat_c)      /* target size of cache */
 659 #define arc_c_min       ARCSTAT(arcstat_c_min)  /* min target cache size */
 660 #define arc_c_max       ARCSTAT(arcstat_c_max)  /* max target cache size */
 661 #define arc_no_grow     ARCSTAT(arcstat_no_grow)
 662 #define arc_tempreserve ARCSTAT(arcstat_tempreserve)
 663 #define arc_loaned_bytes        ARCSTAT(arcstat_loaned_bytes)
 664 #define arc_meta_limit  ARCSTAT(arcstat_meta_limit) /* max size for metadata */
 665 #define arc_dnode_limit ARCSTAT(arcstat_dnode_limit) /* max size for dnodes */
 666 #define arc_meta_min    ARCSTAT(arcstat_meta_min) /* min size for metadata */
 667 #define arc_meta_used   ARCSTAT(arcstat_meta_used) /* size of metadata */
 668 #define arc_meta_max    ARCSTAT(arcstat_meta_max) /* max size of metadata */
 669 #define arc_dbuf_size   ARCSTAT(arcstat_dbuf_size) /* dbuf metadata */
 670 #define arc_dnode_size  ARCSTAT(arcstat_dnode_size) /* dnode metadata */
 671 #define arc_bonus_size  ARCSTAT(arcstat_bonus_size) /* bonus buffer metadata */
 672 #define arc_need_free   ARCSTAT(arcstat_need_free) /* bytes to be freed */
 673 #define arc_sys_free    ARCSTAT(arcstat_sys_free) /* target system free bytes */
 674
 675 #define L2ARC_IS_VALID_COMPRESS(_c_) \
 676         ((_c_) == ZIO_COMPRESS_LZ4 || (_c_) == ZIO_COMPRESS_EMPTY)
 677
 678 static list_t arc_prune_list;
 679 static kmutex_t arc_prune_mtx;
 680 static taskq_t *arc_prune_taskq;
 681 static arc_buf_t *arc_eviction_list;
 682 static arc_buf_hdr_t arc_eviction_hdr;
 683
 684 #define GHOST_STATE(state)      \
 685         ((state) == arc_mru_ghost || (state) == arc_mfu_ghost ||        \
 686         (state) == arc_l2c_only)
 687
 688 #define HDR_IN_HASH_TABLE(hdr)  ((hdr)->b_flags & ARC_FLAG_IN_HASH_TABLE)
 689 #define HDR_IO_IN_PROGRESS(hdr) ((hdr)->b_flags & ARC_FLAG_IO_IN_PROGRESS)
 690 #define HDR_IO_ERROR(hdr)       ((hdr)->b_flags & ARC_FLAG_IO_ERROR)
 691 #define HDR_PREFETCH(hdr)       ((hdr)->b_flags & ARC_FLAG_PREFETCH)
 692 #define HDR_FREED_IN_READ(hdr)  ((hdr)->b_flags & ARC_FLAG_FREED_IN_READ)
 693 #define HDR_BUF_AVAILABLE(hdr)  ((hdr)->b_flags & ARC_FLAG_BUF_AVAILABLE)
 694
 695 #define HDR_L2CACHE(hdr)        ((hdr)->b_flags & ARC_FLAG_L2CACHE)
 696 #define HDR_L2COMPRESS(hdr)     ((hdr)->b_flags & ARC_FLAG_L2COMPRESS)
 697 #define HDR_L2_READING(hdr)     \
 698             (((hdr)->b_flags & ARC_FLAG_IO_IN_PROGRESS) &&      \
 699             ((hdr)->b_flags & ARC_FLAG_HAS_L2HDR))
 700 #define HDR_L2_WRITING(hdr)     ((hdr)->b_flags & ARC_FLAG_L2_WRITING)
 701 #define HDR_L2_EVICTED(hdr)     ((hdr)->b_flags & ARC_FLAG_L2_EVICTED)
 702 #define HDR_L2_WRITE_HEAD(hdr)  ((hdr)->b_flags & ARC_FLAG_L2_WRITE_HEAD)
 703
 704 #define HDR_ISTYPE_METADATA(hdr)        \
 705             ((hdr)->b_flags & ARC_FLAG_BUFC_METADATA)
 706 #define HDR_ISTYPE_DATA(hdr)    (!HDR_ISTYPE_METADATA(hdr))
 707
 708 #define HDR_HAS_L1HDR(hdr)      ((hdr)->b_flags & ARC_FLAG_HAS_L1HDR)
 709 #define HDR_HAS_L2HDR(hdr)      ((hdr)->b_flags & ARC_FLAG_HAS_L2HDR)
 710
 711 /*
 712  * Other sizes
 713  */
 714
 715 #define HDR_FULL_SIZE ((int64_t)sizeof (arc_buf_hdr_t))
 716 #define HDR_L2ONLY_SIZE ((int64_t)offsetof(arc_buf_hdr_t, b_l1hdr))
 717
 718 /*
 719  * Hash table routines
 720  */
 721
 722 #define HT_LOCK_ALIGN   64
 723 #define HT_LOCK_PAD     (P2NPHASE(sizeof (kmutex_t), (HT_LOCK_ALIGN)))
 724
 725 struct ht_lock {
 726         kmutex_t        ht_lock;
 727 #ifdef _KERNEL
 728         unsigned char   pad[HT_LOCK_PAD];
 729 #endif
 730 };
 731
 732 #define BUF_LOCKS 8192
 733 typedef struct buf_hash_table {
 734         uint64_t ht_mask;
 735         arc_buf_hdr_t **ht_table;
 736         struct ht_lock ht_locks[BUF_LOCKS];
 737 } buf_hash_table_t;
 738
 739 static buf_hash_table_t buf_hash_table;
 740
 741 #define BUF_HASH_INDEX(spa, dva, birth) \
 742         (buf_hash(spa, dva, birth) & buf_hash_table.ht_mask)
 743 #define BUF_HASH_LOCK_NTRY(idx) (buf_hash_table.ht_locks[idx & (BUF_LOCKS-1)])
 744 #define BUF_HASH_LOCK(idx)      (&(BUF_HASH_LOCK_NTRY(idx).ht_lock))
 745 #define HDR_LOCK(hdr) \
 746         (BUF_HASH_LOCK(BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth)))
 747
 748 uint64_t zfs_crc64_table[256];
 749
 750 /*
 751  * Level 2 ARC
 752  */
 753
 754 #define L2ARC_WRITE_SIZE        (8 * 1024 * 1024)       /* initial write max */
 755 #define L2ARC_HEADROOM          2                       /* num of writes */
 756 #define L2ARC_MAX_BLOCK_SIZE    (16 * 1024 * 1024)      /* max compress size */
 757
 758 /*
 759  * If we discover during ARC scan any buffers to be compressed, we boost
 760  * our headroom for the next scanning cycle by this percentage multiple.
 761  */
 762 #define L2ARC_HEADROOM_BOOST    200
 763 #define L2ARC_FEED_SECS         1               /* caching interval secs */
 764 #define L2ARC_FEED_MIN_MS       200             /* min caching interval ms */
 765
 766
 767 /*
 768  * Used to distinguish headers that are being process by
 769  * l2arc_write_buffers(), but have yet to be assigned to a l2arc disk
 770  * address. This can happen when the header is added to the l2arc's list
 771  * of buffers to write in the first stage of l2arc_write_buffers(), but
 772  * has not yet been written out which happens in the second stage of
 773  * l2arc_write_buffers().
 774  */
 775 #define L2ARC_ADDR_UNSET        ((uint64_t)(-1))
 776
 777 #define l2arc_writes_sent       ARCSTAT(arcstat_l2_writes_sent)
 778 #define l2arc_writes_done       ARCSTAT(arcstat_l2_writes_done)
 779
 780 /* L2ARC Performance Tunables */
 781 unsigned long l2arc_write_max = L2ARC_WRITE_SIZE;       /* def max write size */
 782 unsigned long l2arc_write_boost = L2ARC_WRITE_SIZE;     /* extra warmup write */
 783 unsigned long l2arc_headroom = L2ARC_HEADROOM;          /* # of dev writes */
 784 unsigned long l2arc_headroom_boost = L2ARC_HEADROOM_BOOST;
 785 unsigned long l2arc_max_block_size = L2ARC_MAX_BLOCK_SIZE;
 786 unsigned long l2arc_feed_secs = L2ARC_FEED_SECS;        /* interval seconds */
 787 unsigned long l2arc_feed_min_ms = L2ARC_FEED_MIN_MS;    /* min interval msecs */
 788 int l2arc_noprefetch = B_TRUE;                  /* don't cache prefetch bufs */
 789 int l2arc_nocompress = B_FALSE;                 /* don't compress bufs */
 790 int l2arc_feed_again = B_TRUE;                  /* turbo warmup */
 791 int l2arc_norw = B_FALSE;                       /* no reads during writes */
 792
 793 /*
 794  * L2ARC Internals
 795  */
 796 static list_t L2ARC_dev_list;                   /* device list */
 797 static list_t *l2arc_dev_list;                  /* device list pointer */
 798 static kmutex_t l2arc_dev_mtx;                  /* device list mutex */
 799 static l2arc_dev_t *l2arc_dev_last;             /* last device used */
 800 static list_t L2ARC_free_on_write;              /* free after write buf list */
 801 static list_t *l2arc_free_on_write;             /* free after write list ptr */
 802 static kmutex_t l2arc_free_on_write_mtx;        /* mutex for list */
 803 static uint64_t l2arc_ndev;                     /* number of devices */
 804
 805 typedef struct l2arc_read_callback {
 806         arc_buf_t               *l2rcb_buf;             /* read buffer */
 807         spa_t                   *l2rcb_spa;             /* spa */
 808         blkptr_t                l2rcb_bp;               /* original blkptr */
 809         zbookmark_phys_t        l2rcb_zb;               /* original bookmark */
 810         int                     l2rcb_flags;            /* original flags */
 811         enum zio_compress       l2rcb_compress;         /* applied compress */
 812 } l2arc_read_callback_t;
 813
 814 typedef struct l2arc_data_free {
 815         /* protected by l2arc_free_on_write_mtx */
 816         void            *l2df_data;
 817         size_t          l2df_size;
 818         void            (*l2df_func)(void *, size_t);
 819         list_node_t     l2df_list_node;
 820 } l2arc_data_free_t;
 821
 822 static kmutex_t l2arc_feed_thr_lock;
 823 static kcondvar_t l2arc_feed_thr_cv;
 824 static uint8_t l2arc_thread_exit;
 825
 826 static void arc_get_data_buf(arc_buf_t *);
 827 static void arc_access(arc_buf_hdr_t *, kmutex_t *);
 828 static boolean_t arc_is_overflowing(void);
 829 static void arc_buf_watch(arc_buf_t *);
 830 static void arc_tuning_update(void);
 831 static void arc_prune_async(int64_t);
 832
 833 static arc_buf_contents_t arc_buf_type(arc_buf_hdr_t *);
 834 static uint32_t arc_bufc_to_flags(arc_buf_contents_t);
 835
 836 static boolean_t l2arc_write_eligible(uint64_t, arc_buf_hdr_t *);
 837 static void l2arc_read_done(zio_t *);
 838
 839 static boolean_t l2arc_compress_buf(arc_buf_hdr_t *);
 840 static void l2arc_decompress_zio(zio_t *, arc_buf_hdr_t *, enum zio_compress);
 841 static void l2arc_release_cdata_buf(arc_buf_hdr_t *);
 842
 843 static uint64_t
 844 buf_hash(uint64_t spa, const dva_t *dva, uint64_t birth)
 845 {
 846         uint8_t *vdva = (uint8_t *)dva;
 847         uint64_t crc = -1ULL;
 848         int i;
 849
 850         ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
 851
 852         for (i = 0; i < sizeof (dva_t); i++)
 853                 crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ vdva[i]) & 0xFF];
 854
 855         crc ^= (spa>>8) ^ birth;
 856
 857         return (crc);
 858 }
 859
 860 #define BUF_EMPTY(buf)                                          \
 861         ((buf)->b_dva.dva_word[0] == 0 &&                       \
 862         (buf)->b_dva.dva_word[1] == 0)
 863
 864 #define BUF_EQUAL(spa, dva, birth, buf)                         \
 865         ((buf)->b_dva.dva_word[0] == (dva)->dva_word[0]) &&     \
 866         ((buf)->b_dva.dva_word[1] == (dva)->dva_word[1]) &&     \
 867         ((buf)->b_birth == birth) && ((buf)->b_spa == spa)
 868
 869 static void
 870 buf_discard_identity(arc_buf_hdr_t *hdr)
 871 {
 872         hdr->b_dva.dva_word[0] = 0;
 873         hdr->b_dva.dva_word[1] = 0;
 874         hdr->b_birth = 0;
 875 }
 876
 877 static arc_buf_hdr_t *
 878 buf_hash_find(uint64_t spa, const blkptr_t *bp, kmutex_t **lockp)
 879 {
 880         const dva_t *dva = BP_IDENTITY(bp);
 881         uint64_t birth = BP_PHYSICAL_BIRTH(bp);
 882         uint64_t idx = BUF_HASH_INDEX(spa, dva, birth);
 883         kmutex_t *hash_lock = BUF_HASH_LOCK(idx);
 884         arc_buf_hdr_t *hdr;
 885
 886         mutex_enter(hash_lock);
 887         for (hdr = buf_hash_table.ht_table[idx]; hdr != NULL;
 888             hdr = hdr->b_hash_next) {
 889                 if (BUF_EQUAL(spa, dva, birth, hdr)) {
 890                         *lockp = hash_lock;
 891                         return (hdr);
 892                 }
 893         }
 894         mutex_exit(hash_lock);
 895         *lockp = NULL;
 896         return (NULL);
 897 }
 898
 899 /*
 900  * Insert an entry into the hash table.  If there is already an element
 901  * equal to elem in the hash table, then the already existing element
 902  * will be returned and the new element will not be inserted.
 903  * Otherwise returns NULL.
 904  * If lockp == NULL, the caller is assumed to already hold the hash lock.
 905  */
 906 static arc_buf_hdr_t *
 907 buf_hash_insert(arc_buf_hdr_t *hdr, kmutex_t **lockp)
 908 {
 909         uint64_t idx = BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth);
 910         kmutex_t *hash_lock = BUF_HASH_LOCK(idx);
 911         arc_buf_hdr_t *fhdr;
 912         uint32_t i;
 913
 914         ASSERT(!DVA_IS_EMPTY(&hdr->b_dva));
 915         ASSERT(hdr->b_birth != 0);
 916         ASSERT(!HDR_IN_HASH_TABLE(hdr));
 917
 918         if (lockp != NULL) {
 919                 *lockp = hash_lock;
 920                 mutex_enter(hash_lock);
 921         } else {
 922                 ASSERT(MUTEX_HELD(hash_lock));
 923         }
 924
 925         for (fhdr = buf_hash_table.ht_table[idx], i = 0; fhdr != NULL;
 926             fhdr = fhdr->b_hash_next, i++) {
 927                 if (BUF_EQUAL(hdr->b_spa, &hdr->b_dva, hdr->b_birth, fhdr))
 928                         return (fhdr);
 929         }
 930
 931         hdr->b_hash_next = buf_hash_table.ht_table[idx];
 932         buf_hash_table.ht_table[idx] = hdr;
 933         hdr->b_flags |= ARC_FLAG_IN_HASH_TABLE;
 934
 935         /* collect some hash table performance data */
 936         if (i > 0) {
 937                 ARCSTAT_BUMP(arcstat_hash_collisions);
 938                 if (i == 1)
 939                         ARCSTAT_BUMP(arcstat_hash_chains);
 940
 941                 ARCSTAT_MAX(arcstat_hash_chain_max, i);
 942         }
 943
 944         ARCSTAT_BUMP(arcstat_hash_elements);
 945         ARCSTAT_MAXSTAT(arcstat_hash_elements);
 946
 947         return (NULL);
 948 }
 949
 950 static void
 951 buf_hash_remove(arc_buf_hdr_t *hdr)
 952 {
 953         arc_buf_hdr_t *fhdr, **hdrp;
 954         uint64_t idx = BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth);
 955
 956         ASSERT(MUTEX_HELD(BUF_HASH_LOCK(idx)));
 957         ASSERT(HDR_IN_HASH_TABLE(hdr));
 958
 959         hdrp = &buf_hash_table.ht_table[idx];
 960         while ((fhdr = *hdrp) != hdr) {
 961                 ASSERT(fhdr != NULL);
 962                 hdrp = &fhdr->b_hash_next;
 963         }
 964         *hdrp = hdr->b_hash_next;
 965         hdr->b_hash_next = NULL;
 966         hdr->b_flags &= ~ARC_FLAG_IN_HASH_TABLE;
 967
 968         /* collect some hash table performance data */
 969         ARCSTAT_BUMPDOWN(arcstat_hash_elements);
 970
 971         if (buf_hash_table.ht_table[idx] &&
 972             buf_hash_table.ht_table[idx]->b_hash_next == NULL)
 973                 ARCSTAT_BUMPDOWN(arcstat_hash_chains);
 974 }
 975
 976 /*
 977  * Global data structures and functions for the buf kmem cache.
 978  */
 979 static kmem_cache_t *hdr_full_cache;
 980 static kmem_cache_t *hdr_l2only_cache;
 981 static kmem_cache_t *buf_cache;
 982
 983 static void
 984 buf_fini(void)
 985 {
 986         int i;
 987
 988 #if defined(_KERNEL) && defined(HAVE_SPL)
 989         /*
 990          * Large allocations which do not require contiguous pages
 991          * should be using vmem_free() in the linux kernel\
 992          */
 993         vmem_free(buf_hash_table.ht_table,
 994             (buf_hash_table.ht_mask + 1) * sizeof (void *));
 995 #else
 996         kmem_free(buf_hash_table.ht_table,
 997             (buf_hash_table.ht_mask + 1) * sizeof (void *));
 998 #endif
 999         for (i = 0; i < BUF_LOCKS; i++)
1000                 mutex_destroy(&buf_hash_table.ht_locks[i].ht_lock);
1001         kmem_cache_destroy(hdr_full_cache);
1002         kmem_cache_destroy(hdr_l2only_cache);
1003         kmem_cache_destroy(buf_cache);
1004 }
1005
1006 /*
1007  * Constructor callback - called when the cache is empty
1008  * and a new buf is requested.
1009  */
1010 /* ARGSUSED */
1011 static int
1012 hdr_full_cons(void *vbuf, void *unused, int kmflag)
1013 {
1014         arc_buf_hdr_t *hdr = vbuf;
1015
1016         bzero(hdr, HDR_FULL_SIZE);
1017         cv_init(&hdr->b_l1hdr.b_cv, NULL, CV_DEFAULT, NULL);
1018         refcount_create(&hdr->b_l1hdr.b_refcnt);
1019         mutex_init(&hdr->b_l1hdr.b_freeze_lock, NULL, MUTEX_DEFAULT, NULL);
1020         list_link_init(&hdr->b_l1hdr.b_arc_node);
1021         list_link_init(&hdr->b_l2hdr.b_l2node);
1022         multilist_link_init(&hdr->b_l1hdr.b_arc_node);
1023         arc_space_consume(HDR_FULL_SIZE, ARC_SPACE_HDRS);
1024
1025         return (0);
1026 }
1027
1028 /* ARGSUSED */
1029 static int
1030 hdr_l2only_cons(void *vbuf, void *unused, int kmflag)
1031 {
1032         arc_buf_hdr_t *hdr = vbuf;
1033
1034         bzero(hdr, HDR_L2ONLY_SIZE);
1035         arc_space_consume(HDR_L2ONLY_SIZE, ARC_SPACE_L2HDRS);
1036
1037         return (0);
1038 }
1039
1040 /* ARGSUSED */
1041 static int
1042 buf_cons(void *vbuf, void *unused, int kmflag)
1043 {
1044         arc_buf_t *buf = vbuf;
1045
1046         bzero(buf, sizeof (arc_buf_t));
1047         mutex_init(&buf->b_evict_lock, NULL, MUTEX_DEFAULT, NULL);
1048         arc_space_consume(sizeof (arc_buf_t), ARC_SPACE_HDRS);
1049
1050         return (0);
1051 }
1052
1053 /*
1054  * Destructor callback - called when a cached buf is
1055  * no longer required.
1056  */
1057 /* ARGSUSED */
1058 static void
1059 hdr_full_dest(void *vbuf, void *unused)
1060 {
1061         arc_buf_hdr_t *hdr = vbuf;
1062
1063         ASSERT(BUF_EMPTY(hdr));
1064         cv_destroy(&hdr->b_l1hdr.b_cv);
1065         refcount_destroy(&hdr->b_l1hdr.b_refcnt);
1066         mutex_destroy(&hdr->b_l1hdr.b_freeze_lock);
1067         ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node));
1068         arc_space_return(HDR_FULL_SIZE, ARC_SPACE_HDRS);
1069 }
1070
1071 /* ARGSUSED */
1072 static void
1073 hdr_l2only_dest(void *vbuf, void *unused)
1074 {
1075         ASSERTV(arc_buf_hdr_t *hdr = vbuf);
1076
1077         ASSERT(BUF_EMPTY(hdr));
1078         arc_space_return(HDR_L2ONLY_SIZE, ARC_SPACE_L2HDRS);
1079 }
1080
1081 /* ARGSUSED */
1082 static void
1083 buf_dest(void *vbuf, void *unused)
1084 {
1085         arc_buf_t *buf = vbuf;
1086
1087         mutex_destroy(&buf->b_evict_lock);
1088         arc_space_return(sizeof (arc_buf_t), ARC_SPACE_HDRS);
1089 }
1090
1091 /*
1092  * Reclaim callback -- invoked when memory is low.
1093  */
1094 /* ARGSUSED */
1095 static void
1096 hdr_recl(void *unused)
1097 {
1098         dprintf("hdr_recl called\n");
1099         /*
1100          * umem calls the reclaim func when we destroy the buf cache,
1101          * which is after we do arc_fini().
1102          */
1103         if (!arc_dead)
1104                 cv_signal(&arc_reclaim_thread_cv);
1105 }
1106
1107 static void
1108 buf_init(void)
1109 {
1110         uint64_t *ct;
1111         uint64_t hsize = 1ULL << 12;
1112         int i, j;
1113
1114         /*
1115          * The hash table is big enough to fill all of physical memory
1116          * with an average block size of zfs_arc_average_blocksize (default 8K).
1117          * By default, the table will take up
1118          * totalmem * sizeof(void*) / 8K (1MB per GB with 8-byte pointers).
1119          */
1120         while (hsize * zfs_arc_average_blocksize < physmem * PAGESIZE)
1121                 hsize <<= 1;
1122 retry:
1123         buf_hash_table.ht_mask = hsize - 1;
1124 #if defined(_KERNEL) && defined(HAVE_SPL)
1125         /*
1126          * Large allocations which do not require contiguous pages
1127          * should be using vmem_alloc() in the linux kernel
1128          */
1129         buf_hash_table.ht_table =
1130             vmem_zalloc(hsize * sizeof (void*), KM_SLEEP);
1131 #else
1132         buf_hash_table.ht_table =
1133             kmem_zalloc(hsize * sizeof (void*), KM_NOSLEEP);
1134 #endif
1135         if (buf_hash_table.ht_table == NULL) {
1136                 ASSERT(hsize > (1ULL << 8));
1137                 hsize >>= 1;
1138                 goto retry;
1139         }
1140
1141         hdr_full_cache = kmem_cache_create("arc_buf_hdr_t_full", HDR_FULL_SIZE,
1142             0, hdr_full_cons, hdr_full_dest, hdr_recl, NULL, NULL, 0);
1143         hdr_l2only_cache = kmem_cache_create("arc_buf_hdr_t_l2only",
1144             HDR_L2ONLY_SIZE, 0, hdr_l2only_cons, hdr_l2only_dest, hdr_recl,
1145             NULL, NULL, 0);
1146         buf_cache = kmem_cache_create("arc_buf_t", sizeof (arc_buf_t),
1147             0, buf_cons, buf_dest, NULL, NULL, NULL, 0);
1148
1149         for (i = 0; i < 256; i++)
1150                 for (ct = zfs_crc64_table + i, *ct = i, j = 8; j > 0; j--)
1151                         *ct = (*ct >> 1) ^ (-(*ct & 1) & ZFS_CRC64_POLY);
1152
1153         for (i = 0; i < BUF_LOCKS; i++) {
1154                 mutex_init(&buf_hash_table.ht_locks[i].ht_lock,
1155                     NULL, MUTEX_DEFAULT, NULL);
1156         }
1157 }
1158
1159 /*
1160  * Transition between the two allocation states for the arc_buf_hdr struct.
1161  * The arc_buf_hdr struct can be allocated with (hdr_full_cache) or without
1162  * (hdr_l2only_cache) the fields necessary for the L1 cache - the smaller
1163  * version is used when a cache buffer is only in the L2ARC in order to reduce
1164  * memory usage.
1165  */
1166 static arc_buf_hdr_t *
1167 arc_hdr_realloc(arc_buf_hdr_t *hdr, kmem_cache_t *old, kmem_cache_t *new)
1168 {
1169         arc_buf_hdr_t *nhdr;
1170         l2arc_dev_t *dev;
1171
1172         ASSERT(HDR_HAS_L2HDR(hdr));
1173         ASSERT((old == hdr_full_cache && new == hdr_l2only_cache) ||
1174             (old == hdr_l2only_cache && new == hdr_full_cache));
1175
1176         dev = hdr->b_l2hdr.b_dev;
1177         nhdr = kmem_cache_alloc(new, KM_PUSHPAGE);
1178
1179         ASSERT(MUTEX_HELD(HDR_LOCK(hdr)));
1180         buf_hash_remove(hdr);
1181
1182         bcopy(hdr, nhdr, HDR_L2ONLY_SIZE);
1183
1184         if (new == hdr_full_cache) {
1185                 nhdr->b_flags |= ARC_FLAG_HAS_L1HDR;
1186                 /*
1187                  * arc_access and arc_change_state need to be aware that a
1188                  * header has just come out of L2ARC, so we set its state to
1189                  * l2c_only even though it's about to change.
1190                  */
1191                 nhdr->b_l1hdr.b_state = arc_l2c_only;
1192
1193                 /* Verify previous threads set to NULL before freeing */
1194                 ASSERT3P(nhdr->b_l1hdr.b_tmp_cdata, ==, NULL);
1195         } else {
1196                 ASSERT(hdr->b_l1hdr.b_buf == NULL);
1197                 ASSERT0(hdr->b_l1hdr.b_datacnt);
1198
1199                 /*
1200                  * If we've reached here, We must have been called from
1201                  * arc_evict_hdr(), as such we should have already been
1202                  * removed from any ghost list we were previously on
1203                  * (which protects us from racing with arc_evict_state),
1204                  * thus no locking is needed during this check.
1205                  */
1206                 ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node));
1207
1208                 /*
1209                  * A buffer must not be moved into the arc_l2c_only
1210                  * state if it's not finished being written out to the
1211                  * l2arc device. Otherwise, the b_l1hdr.b_tmp_cdata field
1212                  * might try to be accessed, even though it was removed.
1213                  */
1214                 VERIFY(!HDR_L2_WRITING(hdr));
1215                 VERIFY3P(hdr->b_l1hdr.b_tmp_cdata, ==, NULL);
1216
1217                 nhdr->b_flags &= ~ARC_FLAG_HAS_L1HDR;
1218         }
1219         /*
1220          * The header has been reallocated so we need to re-insert it into any
1221          * lists it was on.
1222          */
1223         (void) buf_hash_insert(nhdr, NULL);
1224
1225         ASSERT(list_link_active(&hdr->b_l2hdr.b_l2node));
1226
1227         mutex_enter(&dev->l2ad_mtx);
1228
1229         /*
1230          * We must place the realloc'ed header back into the list at
1231          * the same spot. Otherwise, if it's placed earlier in the list,
1232          * l2arc_write_buffers() could find it during the function's
1233          * write phase, and try to write it out to the l2arc.
1234          */
1235         list_insert_after(&dev->l2ad_buflist, hdr, nhdr);
1236         list_remove(&dev->l2ad_buflist, hdr);
1237
1238         mutex_exit(&dev->l2ad_mtx);
1239
1240         /*
1241          * Since we're using the pointer address as the tag when
1242          * incrementing and decrementing the l2ad_alloc refcount, we
1243          * must remove the old pointer (that we're about to destroy) and
1244          * add the new pointer to the refcount. Otherwise we'd remove
1245          * the wrong pointer address when calling arc_hdr_destroy() later.
1246          */
1247
1248         (void) refcount_remove_many(&dev->l2ad_alloc,
1249             hdr->b_l2hdr.b_asize, hdr);
1250
1251         (void) refcount_add_many(&dev->l2ad_alloc,
1252             nhdr->b_l2hdr.b_asize, nhdr);
1253
1254         buf_discard_identity(hdr);
1255         hdr->b_freeze_cksum = NULL;
1256         kmem_cache_free(old, hdr);
1257
1258         return (nhdr);
1259 }
1260
1261
/* hz divided by 16; noted as 62 ms by the original author */
#define	ARC_MINTIME	(hz >> 4)
1263
1264 static void
1265 arc_cksum_verify(arc_buf_t *buf)
1266 {
1267         zio_cksum_t zc;
1268
1269         if (!(zfs_flags & ZFS_DEBUG_MODIFY))
1270                 return;
1271
1272         mutex_enter(&buf->b_hdr->b_l1hdr.b_freeze_lock);
1273         if (buf->b_hdr->b_freeze_cksum == NULL || HDR_IO_ERROR(buf->b_hdr)) {
1274                 mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock);
1275                 return;
1276         }
1277         fletcher_2_native(buf->b_data, buf->b_hdr->b_size, &zc);
1278         if (!ZIO_CHECKSUM_EQUAL(*buf->b_hdr->b_freeze_cksum, zc))
1279                 panic("buffer modified while frozen!");
1280         mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock);
1281 }
1282
1283 static int
1284 arc_cksum_equal(arc_buf_t *buf)
1285 {
1286         zio_cksum_t zc;
1287         int equal;
1288
1289         mutex_enter(&buf->b_hdr->b_l1hdr.b_freeze_lock);
1290         fletcher_2_native(buf->b_data, buf->b_hdr->b_size, &zc);
1291         equal = ZIO_CHECKSUM_EQUAL(*buf->b_hdr->b_freeze_cksum, zc);
1292         mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock);
1293
1294         return (equal);
1295 }
1296
1297 static void
1298 arc_cksum_compute(arc_buf_t *buf, boolean_t force)
1299 {
1300         if (!force && !(zfs_flags & ZFS_DEBUG_MODIFY))
1301                 return;
1302
1303         mutex_enter(&buf->b_hdr->b_l1hdr.b_freeze_lock);
1304         if (buf->b_hdr->b_freeze_cksum != NULL) {
1305                 mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock);
1306                 return;
1307         }
1308         buf->b_hdr->b_freeze_cksum = kmem_alloc(sizeof (zio_cksum_t), KM_SLEEP);
1309         fletcher_2_native(buf->b_data, buf->b_hdr->b_size,
1310             buf->b_hdr->b_freeze_cksum);
1311         mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock);
1312         arc_buf_watch(buf);
1313 }
1314
1315 #ifndef _KERNEL
1316 void
1317 arc_buf_sigsegv(int sig, siginfo_t *si, void *unused)
1318 {
1319         panic("Got SIGSEGV at address: 0x%lx\n", (long) si->si_addr);
1320 }
1321 #endif
1322
1323 /* ARGSUSED */
1324 static void
1325 arc_buf_unwatch(arc_buf_t *buf)
1326 {
1327 #ifndef _KERNEL
1328         if (arc_watch) {
1329                 ASSERT0(mprotect(buf->b_data, buf->b_hdr->b_size,
1330                     PROT_READ | PROT_WRITE));
1331         }
1332 #endif
1333 }
1334
1335 /* ARGSUSED */
1336 static void
1337 arc_buf_watch(arc_buf_t *buf)
1338 {
1339 #ifndef _KERNEL
1340         if (arc_watch)
1341                 ASSERT0(mprotect(buf->b_data, buf->b_hdr->b_size, PROT_READ));
1342 #endif
1343 }
1344
1345 static arc_buf_contents_t
1346 arc_buf_type(arc_buf_hdr_t *hdr)
1347 {
1348         if (HDR_ISTYPE_METADATA(hdr)) {
1349                 return (ARC_BUFC_METADATA);
1350         } else {
1351                 return (ARC_BUFC_DATA);
1352         }
1353 }
1354
1355 static uint32_t
1356 arc_bufc_to_flags(arc_buf_contents_t type)
1357 {
1358         switch (type) {
1359         case ARC_BUFC_DATA:
1360                 /* metadata field is 0 if buffer contains normal data */
1361                 return (0);
1362         case ARC_BUFC_METADATA:
1363                 return (ARC_FLAG_BUFC_METADATA);
1364         default:
1365                 break;
1366         }
1367         panic("undefined ARC buffer type!");
1368         return ((uint32_t)-1);
1369 }
1370
1371 void
1372 arc_buf_thaw(arc_buf_t *buf)
1373 {
1374         if (zfs_flags & ZFS_DEBUG_MODIFY) {
1375                 if (buf->b_hdr->b_l1hdr.b_state != arc_anon)
1376                         panic("modifying non-anon buffer!");
1377                 if (HDR_IO_IN_PROGRESS(buf->b_hdr))
1378                         panic("modifying buffer while i/o in progress!");
1379                 arc_cksum_verify(buf);
1380         }
1381
1382         mutex_enter(&buf->b_hdr->b_l1hdr.b_freeze_lock);
1383         if (buf->b_hdr->b_freeze_cksum != NULL) {
1384                 kmem_free(buf->b_hdr->b_freeze_cksum, sizeof (zio_cksum_t));
1385                 buf->b_hdr->b_freeze_cksum = NULL;
1386         }
1387
1388         mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock);
1389
1390         arc_buf_unwatch(buf);
1391 }
1392
1393 void
1394 arc_buf_freeze(arc_buf_t *buf)
1395 {
1396         kmutex_t *hash_lock;
1397
1398         if (!(zfs_flags & ZFS_DEBUG_MODIFY))
1399                 return;
1400
1401         hash_lock = HDR_LOCK(buf->b_hdr);
1402         mutex_enter(hash_lock);
1403
1404         ASSERT(buf->b_hdr->b_freeze_cksum != NULL ||
1405             buf->b_hdr->b_l1hdr.b_state == arc_anon);
1406         arc_cksum_compute(buf, B_FALSE);
1407         mutex_exit(hash_lock);
1408
1409 }
1410
1411 static void
1412 add_reference(arc_buf_hdr_t *hdr, kmutex_t *hash_lock, void *tag)
1413 {
1414         arc_state_t *state;
1415
1416         ASSERT(HDR_HAS_L1HDR(hdr));
1417         ASSERT(MUTEX_HELD(hash_lock));
1418
1419         state = hdr->b_l1hdr.b_state;
1420
1421         if ((refcount_add(&hdr->b_l1hdr.b_refcnt, tag) == 1) &&
1422             (state != arc_anon)) {
1423                 /* We don't use the L2-only state list. */
1424                 if (state != arc_l2c_only) {
1425                         arc_buf_contents_t type = arc_buf_type(hdr);
1426                         uint64_t delta = hdr->b_size * hdr->b_l1hdr.b_datacnt;
1427                         multilist_t *list = &state->arcs_list[type];
1428                         uint64_t *size = &state->arcs_lsize[type];
1429
1430                         multilist_remove(list, hdr);
1431
1432                         if (GHOST_STATE(state)) {
1433                                 ASSERT0(hdr->b_l1hdr.b_datacnt);
1434                                 ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);
1435                                 delta = hdr->b_size;
1436                         }
1437                         ASSERT(delta > 0);
1438                         ASSERT3U(*size, >=, delta);
1439                         atomic_add_64(size, -delta);
1440                 }
1441                 /* remove the prefetch flag if we get a reference */
1442                 hdr->b_flags &= ~ARC_FLAG_PREFETCH;
1443         }
1444 }
1445
1446 static int
1447 remove_reference(arc_buf_hdr_t *hdr, kmutex_t *hash_lock, void *tag)
1448 {
1449         int cnt;
1450         arc_state_t *state = hdr->b_l1hdr.b_state;
1451
1452         ASSERT(HDR_HAS_L1HDR(hdr));
1453         ASSERT(state == arc_anon || MUTEX_HELD(hash_lock));
1454         ASSERT(!GHOST_STATE(state));
1455
1456         /*
1457          * arc_l2c_only counts as a ghost state so we don't need to explicitly
1458          * check to prevent usage of the arc_l2c_only list.
1459          */
1460         if (((cnt = refcount_remove(&hdr->b_l1hdr.b_refcnt, tag)) == 0) &&
1461             (state != arc_anon)) {
1462                 arc_buf_contents_t type = arc_buf_type(hdr);
1463                 multilist_t *list = &state->arcs_list[type];
1464                 uint64_t *size = &state->arcs_lsize[type];
1465
1466                 multilist_insert(list, hdr);
1467
1468                 ASSERT(hdr->b_l1hdr.b_datacnt > 0);
1469                 atomic_add_64(size, hdr->b_size *
1470                     hdr->b_l1hdr.b_datacnt);
1471         }
1472         return (cnt);
1473 }
1474
1475 /*
1476  * Returns detailed information about a specific arc buffer.  When the
1477  * state_index argument is set the function will calculate the arc header
1478  * list position for its arc state.  Since this requires a linear traversal
1479  * callers are strongly encourage not to do this.  However, it can be helpful
1480  * for targeted analysis so the functionality is provided.
1481  */
1482 void
1483 arc_buf_info(arc_buf_t *ab, arc_buf_info_t *abi, int state_index)
1484 {
1485         arc_buf_hdr_t *hdr = ab->b_hdr;
1486         l1arc_buf_hdr_t *l1hdr = NULL;
1487         l2arc_buf_hdr_t *l2hdr = NULL;
1488         arc_state_t *state = NULL;
1489
1490         memset(abi, 0, sizeof (arc_buf_info_t));
1491
1492         if (hdr == NULL)
1493                 return;
1494
1495         abi->abi_flags = hdr->b_flags;
1496
1497         if (HDR_HAS_L1HDR(hdr)) {
1498                 l1hdr = &hdr->b_l1hdr;
1499                 state = l1hdr->b_state;
1500         }
1501         if (HDR_HAS_L2HDR(hdr))
1502                 l2hdr = &hdr->b_l2hdr;
1503
1504         if (l1hdr) {
1505                 abi->abi_datacnt = l1hdr->b_datacnt;
1506                 abi->abi_access = l1hdr->b_arc_access;
1507                 abi->abi_mru_hits = l1hdr->b_mru_hits;
1508                 abi->abi_mru_ghost_hits = l1hdr->b_mru_ghost_hits;
1509                 abi->abi_mfu_hits = l1hdr->b_mfu_hits;
1510                 abi->abi_mfu_ghost_hits = l1hdr->b_mfu_ghost_hits;
1511                 abi->abi_holds = refcount_count(&l1hdr->b_refcnt);
1512         }
1513
1514         if (l2hdr) {
1515                 abi->abi_l2arc_dattr = l2hdr->b_daddr;
1516                 abi->abi_l2arc_asize = l2hdr->b_asize;
1517                 abi->abi_l2arc_compress = l2hdr->b_compress;
1518                 abi->abi_l2arc_hits = l2hdr->b_hits;
1519         }
1520
1521         abi->abi_state_type = state ? state->arcs_state : ARC_STATE_ANON;
1522         abi->abi_state_contents = arc_buf_type(hdr);
1523         abi->abi_size = hdr->b_size;
1524 }
1525
1526 /*
1527  * Move the supplied buffer to the indicated state. The hash lock
1528  * for the buffer must be held by the caller.
1529  */
1530 static void
1531 arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr,
1532     kmutex_t *hash_lock)
1533 {
1534         arc_state_t *old_state;
1535         int64_t refcnt;
1536         uint32_t datacnt;
1537         uint64_t from_delta, to_delta;
1538         arc_buf_contents_t buftype = arc_buf_type(hdr);
1539
1540         /*
1541          * We almost always have an L1 hdr here, since we call arc_hdr_realloc()
1542          * in arc_read() when bringing a buffer out of the L2ARC.  However, the
1543          * L1 hdr doesn't always exist when we change state to arc_anon before
1544          * destroying a header, in which case reallocating to add the L1 hdr is
1545          * pointless.
1546          */
1547         if (HDR_HAS_L1HDR(hdr)) {
1548                 old_state = hdr->b_l1hdr.b_state;
1549                 refcnt = refcount_count(&hdr->b_l1hdr.b_refcnt);
1550                 datacnt = hdr->b_l1hdr.b_datacnt;
1551         } else {
1552                 old_state = arc_l2c_only;
1553                 refcnt = 0;
1554                 datacnt = 0;
1555         }
1556
1557         ASSERT(MUTEX_HELD(hash_lock));
1558         ASSERT3P(new_state, !=, old_state);
1559         ASSERT(refcnt == 0 || datacnt > 0);
1560         ASSERT(!GHOST_STATE(new_state) || datacnt == 0);
1561         ASSERT(old_state != arc_anon || datacnt <= 1);
1562
1563         from_delta = to_delta = datacnt * hdr->b_size;
1564
1565         /*
1566          * If this buffer is evictable, transfer it from the
1567          * old state list to the new state list.
1568          */
1569         if (refcnt == 0) {
1570                 if (old_state != arc_anon && old_state != arc_l2c_only) {
1571                         uint64_t *size = &old_state->arcs_lsize[buftype];
1572
1573                         ASSERT(HDR_HAS_L1HDR(hdr));
1574                         multilist_remove(&old_state->arcs_list[buftype], hdr);
1575
1576                         /*
1577                          * If prefetching out of the ghost cache,
1578                          * we will have a non-zero datacnt.
1579                          */
1580                         if (GHOST_STATE(old_state) && datacnt == 0) {
1581                                 /* ghost elements have a ghost size */
1582                                 ASSERT(hdr->b_l1hdr.b_buf == NULL);
1583                                 from_delta = hdr->b_size;
1584                         }
1585                         ASSERT3U(*size, >=, from_delta);
1586                         atomic_add_64(size, -from_delta);
1587                 }
1588                 if (new_state != arc_anon && new_state != arc_l2c_only) {
1589                         uint64_t *size = &new_state->arcs_lsize[buftype];
1590
1591                         /*
1592                          * An L1 header always exists here, since if we're
1593                          * moving to some L1-cached state (i.e. not l2c_only or
1594                          * anonymous), we realloc the header to add an L1hdr
1595                          * beforehand.
1596                          */
1597                         ASSERT(HDR_HAS_L1HDR(hdr));
1598                         multilist_insert(&new_state->arcs_list[buftype], hdr);
1599
1600                         /* ghost elements have a ghost size */
1601                         if (GHOST_STATE(new_state)) {
1602                                 ASSERT0(datacnt);
1603                                 ASSERT(hdr->b_l1hdr.b_buf == NULL);
1604                                 to_delta = hdr->b_size;
1605                         }
1606                         atomic_add_64(size, to_delta);
1607                 }
1608         }
1609
1610         ASSERT(!BUF_EMPTY(hdr));
1611         if (new_state == arc_anon && HDR_IN_HASH_TABLE(hdr))
1612                 buf_hash_remove(hdr);
1613
1614         /* adjust state sizes (ignore arc_l2c_only) */
1615
1616         if (to_delta && new_state != arc_l2c_only) {
1617                 ASSERT(HDR_HAS_L1HDR(hdr));
1618                 if (GHOST_STATE(new_state)) {
1619                         ASSERT0(datacnt);
1620
1621                         /*
1622                          * We moving a header to a ghost state, we first
1623                          * remove all arc buffers. Thus, we'll have a
1624                          * datacnt of zero, and no arc buffer to use for
1625                          * the reference. As a result, we use the arc
1626                          * header pointer for the reference.
1627                          */
1628                         (void) refcount_add_many(&new_state->arcs_size,
1629                             hdr->b_size, hdr);
1630                 } else {
1631                         arc_buf_t *buf;
1632                         ASSERT3U(datacnt, !=, 0);
1633
1634                         /*
1635                          * Each individual buffer holds a unique reference,
1636                          * thus we must remove each of these references one
1637                          * at a time.
1638                          */
1639                         for (buf = hdr->b_l1hdr.b_buf; buf != NULL;
1640                             buf = buf->b_next) {
1641                                 (void) refcount_add_many(&new_state->arcs_size,
1642                                     hdr->b_size, buf);
1643                         }
1644                 }
1645         }
1646
1647         if (from_delta && old_state != arc_l2c_only) {
1648                 ASSERT(HDR_HAS_L1HDR(hdr));
1649                 if (GHOST_STATE(old_state)) {
1650                         /*
1651                          * When moving a header off of a ghost state,
1652                          * there's the possibility for datacnt to be
1653                          * non-zero. This is because we first add the
1654                          * arc buffer to the header prior to changing
1655                          * the header's state. Since we used the header
1656                          * for the reference when putting the header on
1657                          * the ghost state, we must balance that and use
1658                          * the header when removing off the ghost state
1659                          * (even though datacnt is non zero).
1660                          */
1661
1662                         IMPLY(datacnt == 0, new_state == arc_anon ||
1663                             new_state == arc_l2c_only);
1664
1665                         (void) refcount_remove_many(&old_state->arcs_size,
1666                             hdr->b_size, hdr);
1667                 } else {
1668                         arc_buf_t *buf;
1669                         ASSERT3U(datacnt, !=, 0);
1670
1671                         /*
1672                          * Each individual buffer holds a unique reference,
1673                          * thus we must remove each of these references one
1674                          * at a time.
1675                          */
1676                         for (buf = hdr->b_l1hdr.b_buf; buf != NULL;
1677                             buf = buf->b_next) {
1678                                 (void) refcount_remove_many(
1679                                     &old_state->arcs_size, hdr->b_size, buf);
1680                         }
1681                 }
1682         }
1683
1684         if (HDR_HAS_L1HDR(hdr))
1685                 hdr->b_l1hdr.b_state = new_state;
1686
1687         /*
1688          * L2 headers should never be on the L2 state list since they don't
1689          * have L1 headers allocated.
1690          */
1691         ASSERT(multilist_is_empty(&arc_l2c_only->arcs_list[ARC_BUFC_DATA]) &&
1692             multilist_is_empty(&arc_l2c_only->arcs_list[ARC_BUFC_METADATA]));
1693 }
1694
1695 void
1696 arc_space_consume(uint64_t space, arc_space_type_t type)
1697 {
1698         ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES);
1699
1700         switch (type) {
1701         default:
1702                 break;
1703         case ARC_SPACE_DATA:
1704                 ARCSTAT_INCR(arcstat_data_size, space);
1705                 break;
1706         case ARC_SPACE_META:
1707                 ARCSTAT_INCR(arcstat_metadata_size, space);
1708                 break;
1709         case ARC_SPACE_BONUS:
1710                 ARCSTAT_INCR(arcstat_bonus_size, space);
1711                 break;
1712         case ARC_SPACE_DNODE:
1713                 ARCSTAT_INCR(arcstat_dnode_size, space);
1714                 break;
1715         case ARC_SPACE_DBUF:
1716                 ARCSTAT_INCR(arcstat_dbuf_size, space);
1717                 break;
1718         case ARC_SPACE_HDRS:
1719                 ARCSTAT_INCR(arcstat_hdr_size, space);
1720                 break;
1721         case ARC_SPACE_L2HDRS:
1722                 ARCSTAT_INCR(arcstat_l2_hdr_size, space);
1723                 break;
1724         }
1725
1726         if (type != ARC_SPACE_DATA)
1727                 ARCSTAT_INCR(arcstat_meta_used, space);
1728
1729         atomic_add_64(&arc_size, space);
1730 }
1731
1732 void
1733 arc_space_return(uint64_t space, arc_space_type_t type)
1734 {
1735         ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES);
1736
1737         switch (type) {
1738         default:
1739                 break;
1740         case ARC_SPACE_DATA:
1741                 ARCSTAT_INCR(arcstat_data_size, -space);
1742                 break;
1743         case ARC_SPACE_META:
1744                 ARCSTAT_INCR(arcstat_metadata_size, -space);
1745                 break;
1746         case ARC_SPACE_BONUS:
1747                 ARCSTAT_INCR(arcstat_bonus_size, -space);
1748                 break;
1749         case ARC_SPACE_DNODE:
1750                 ARCSTAT_INCR(arcstat_dnode_size, -space);
1751                 break;
1752         case ARC_SPACE_DBUF:
1753                 ARCSTAT_INCR(arcstat_dbuf_size, -space);
1754                 break;
1755         case ARC_SPACE_HDRS:
1756                 ARCSTAT_INCR(arcstat_hdr_size, -space);
1757                 break;
1758         case ARC_SPACE_L2HDRS:
1759                 ARCSTAT_INCR(arcstat_l2_hdr_size, -space);
1760                 break;
1761         }
1762
1763         if (type != ARC_SPACE_DATA) {
1764                 ASSERT(arc_meta_used >= space);
1765                 if (arc_meta_max < arc_meta_used)
1766                         arc_meta_max = arc_meta_used;
1767                 ARCSTAT_INCR(arcstat_meta_used, -space);
1768         }
1769
1770         ASSERT(arc_size >= space);
1771         atomic_add_64(&arc_size, -space);
1772 }
1773
1774 arc_buf_t *
1775 arc_buf_alloc(spa_t *spa, uint64_t size, void *tag, arc_buf_contents_t type)
1776 {
1777         arc_buf_hdr_t *hdr;
1778         arc_buf_t *buf;
1779
1780         VERIFY3U(size, <=, spa_maxblocksize(spa));
1781         hdr = kmem_cache_alloc(hdr_full_cache, KM_PUSHPAGE);
1782         ASSERT(BUF_EMPTY(hdr));
1783         ASSERT3P(hdr->b_freeze_cksum, ==, NULL);
1784         hdr->b_size = size;
1785         hdr->b_spa = spa_load_guid(spa);
1786         hdr->b_l1hdr.b_mru_hits = 0;
1787         hdr->b_l1hdr.b_mru_ghost_hits = 0;
1788         hdr->b_l1hdr.b_mfu_hits = 0;
1789         hdr->b_l1hdr.b_mfu_ghost_hits = 0;
1790         hdr->b_l1hdr.b_l2_hits = 0;
1791
1792         buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
1793         buf->b_hdr = hdr;
1794         buf->b_data = NULL;
1795         buf->b_efunc = NULL;
1796         buf->b_private = NULL;
1797         buf->b_next = NULL;
1798
1799         hdr->b_flags = arc_bufc_to_flags(type);
1800         hdr->b_flags |= ARC_FLAG_HAS_L1HDR;
1801
1802         hdr->b_l1hdr.b_buf = buf;
1803         hdr->b_l1hdr.b_state = arc_anon;
1804         hdr->b_l1hdr.b_arc_access = 0;
1805         hdr->b_l1hdr.b_datacnt = 1;
1806         hdr->b_l1hdr.b_tmp_cdata = NULL;
1807
1808         arc_get_data_buf(buf);
1809         ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
1810         (void) refcount_add(&hdr->b_l1hdr.b_refcnt, tag);
1811
1812         return (buf);
1813 }
1814
1815 static char *arc_onloan_tag = "onloan";
1816
1817 /*
1818  * Loan out an anonymous arc buffer. Loaned buffers are not counted as in
1819  * flight data by arc_tempreserve_space() until they are "returned". Loaned
1820  * buffers must be returned to the arc before they can be used by the DMU or
1821  * freed.
1822  */
1823 arc_buf_t *
1824 arc_loan_buf(spa_t *spa, uint64_t size)
1825 {
1826         arc_buf_t *buf;
1827
1828         buf = arc_buf_alloc(spa, size, arc_onloan_tag, ARC_BUFC_DATA);
1829
1830         atomic_add_64(&arc_loaned_bytes, size);
1831         return (buf);
1832 }
1833
1834 /*
1835  * Return a loaned arc buffer to the arc.
1836  */
1837 void
1838 arc_return_buf(arc_buf_t *buf, void *tag)
1839 {
1840         arc_buf_hdr_t *hdr = buf->b_hdr;
1841
1842         ASSERT(buf->b_data != NULL);
1843         ASSERT(HDR_HAS_L1HDR(hdr));
1844         (void) refcount_add(&hdr->b_l1hdr.b_refcnt, tag);
1845         (void) refcount_remove(&hdr->b_l1hdr.b_refcnt, arc_onloan_tag);
1846
1847         atomic_add_64(&arc_loaned_bytes, -hdr->b_size);
1848 }
1849
1850 /* Detach an arc_buf from a dbuf (tag) */
1851 void
1852 arc_loan_inuse_buf(arc_buf_t *buf, void *tag)
1853 {
1854         arc_buf_hdr_t *hdr = buf->b_hdr;
1855
1856         ASSERT(buf->b_data != NULL);
1857         ASSERT(HDR_HAS_L1HDR(hdr));
1858         (void) refcount_add(&hdr->b_l1hdr.b_refcnt, arc_onloan_tag);
1859         (void) refcount_remove(&hdr->b_l1hdr.b_refcnt, tag);
1860         buf->b_efunc = NULL;
1861         buf->b_private = NULL;
1862
1863         atomic_add_64(&arc_loaned_bytes, hdr->b_size);
1864 }
1865
1866 static arc_buf_t *
1867 arc_buf_clone(arc_buf_t *from)
1868 {
1869         arc_buf_t *buf;
1870         arc_buf_hdr_t *hdr = from->b_hdr;
1871         uint64_t size = hdr->b_size;
1872
1873         ASSERT(HDR_HAS_L1HDR(hdr));
1874         ASSERT(hdr->b_l1hdr.b_state != arc_anon);
1875
1876         buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
1877         buf->b_hdr = hdr;
1878         buf->b_data = NULL;
1879         buf->b_efunc = NULL;
1880         buf->b_private = NULL;
1881         buf->b_next = hdr->b_l1hdr.b_buf;
1882         hdr->b_l1hdr.b_buf = buf;
1883         arc_get_data_buf(buf);
1884         bcopy(from->b_data, buf->b_data, size);
1885
1886         /*
1887          * This buffer already exists in the arc so create a duplicate
1888          * copy for the caller.  If the buffer is associated with user data
1889          * then track the size and number of duplicates.  These stats will be
1890          * updated as duplicate buffers are created and destroyed.
1891          */
1892         if (HDR_ISTYPE_DATA(hdr)) {
1893                 ARCSTAT_BUMP(arcstat_duplicate_buffers);
1894                 ARCSTAT_INCR(arcstat_duplicate_buffers_size, size);
1895         }
1896         hdr->b_l1hdr.b_datacnt += 1;
1897         return (buf);
1898 }
1899
1900 void
1901 arc_buf_add_ref(arc_buf_t *buf, void* tag)
1902 {
1903         arc_buf_hdr_t *hdr;
1904         kmutex_t *hash_lock;
1905
1906         /*
1907          * Check to see if this buffer is evicted.  Callers
1908          * must verify b_data != NULL to know if the add_ref
1909          * was successful.
1910          */
1911         mutex_enter(&buf->b_evict_lock);
1912         if (buf->b_data == NULL) {
1913                 mutex_exit(&buf->b_evict_lock);
1914                 return;
1915         }
1916         hash_lock = HDR_LOCK(buf->b_hdr);
1917         mutex_enter(hash_lock);
1918         hdr = buf->b_hdr;
1919         ASSERT(HDR_HAS_L1HDR(hdr));
1920         ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
1921         mutex_exit(&buf->b_evict_lock);
1922
1923         ASSERT(hdr->b_l1hdr.b_state == arc_mru ||
1924             hdr->b_l1hdr.b_state == arc_mfu);
1925
1926         add_reference(hdr, hash_lock, tag);
1927         DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr);
1928         arc_access(hdr, hash_lock);
1929         mutex_exit(hash_lock);
1930         ARCSTAT_BUMP(arcstat_hits);
1931         ARCSTAT_CONDSTAT(!HDR_PREFETCH(hdr),
1932             demand, prefetch, !HDR_ISTYPE_METADATA(hdr),
1933             data, metadata, hits);
1934 }
1935
1936 static void
1937 arc_buf_free_on_write(void *data, size_t size,
1938     void (*free_func)(void *, size_t))
1939 {
1940         l2arc_data_free_t *df;
1941
1942         df = kmem_alloc(sizeof (*df), KM_SLEEP);
1943         df->l2df_data = data;
1944         df->l2df_size = size;
1945         df->l2df_func = free_func;
1946         mutex_enter(&l2arc_free_on_write_mtx);
1947         list_insert_head(l2arc_free_on_write, df);
1948         mutex_exit(&l2arc_free_on_write_mtx);
1949 }
1950
1951 /*
1952  * Free the arc data buffer.  If it is an l2arc write in progress,
1953  * the buffer is placed on l2arc_free_on_write to be freed later.
1954  */
1955 static void
1956 arc_buf_data_free(arc_buf_t *buf, void (*free_func)(void *, size_t))
1957 {
1958         arc_buf_hdr_t *hdr = buf->b_hdr;
1959
1960         if (HDR_L2_WRITING(hdr)) {
1961                 arc_buf_free_on_write(buf->b_data, hdr->b_size, free_func);
1962                 ARCSTAT_BUMP(arcstat_l2_free_on_write);
1963         } else {
1964                 free_func(buf->b_data, hdr->b_size);
1965         }
1966 }
1967
1968 static void
1969 arc_buf_l2_cdata_free(arc_buf_hdr_t *hdr)
1970 {
1971         ASSERT(HDR_HAS_L2HDR(hdr));
1972         ASSERT(MUTEX_HELD(&hdr->b_l2hdr.b_dev->l2ad_mtx));
1973
1974         /*
1975          * The b_tmp_cdata field is linked off of the b_l1hdr, so if
1976          * that doesn't exist, the header is in the arc_l2c_only state,
1977          * and there isn't anything to free (it's already been freed).
1978          */
1979         if (!HDR_HAS_L1HDR(hdr))
1980                 return;
1981
1982         /*
1983          * The header isn't being written to the l2arc device, thus it
1984          * shouldn't have a b_tmp_cdata to free.
1985          */
1986         if (!HDR_L2_WRITING(hdr)) {
1987                 ASSERT3P(hdr->b_l1hdr.b_tmp_cdata, ==, NULL);
1988                 return;
1989         }
1990
1991         /*
1992          * The header does not have compression enabled. This can be due
1993          * to the buffer not being compressible, or because we're
1994          * freeing the buffer before the second phase of
1995          * l2arc_write_buffer() has started (which does the compression
1996          * step). In either case, b_tmp_cdata does not point to a
1997          * separately compressed buffer, so there's nothing to free (it
1998          * points to the same buffer as the arc_buf_t's b_data field).
1999          */
2000         if (hdr->b_l2hdr.b_compress == ZIO_COMPRESS_OFF) {
2001                 hdr->b_l1hdr.b_tmp_cdata = NULL;
2002                 return;
2003         }
2004
2005         /*
2006          * There's nothing to free since the buffer was all zero's and
2007          * compressed to a zero length buffer.
2008          */
2009         if (hdr->b_l2hdr.b_compress == ZIO_COMPRESS_EMPTY) {
2010                 ASSERT3P(hdr->b_l1hdr.b_tmp_cdata, ==, NULL);
2011                 return;
2012         }
2013
2014         ASSERT(L2ARC_IS_VALID_COMPRESS(hdr->b_l2hdr.b_compress));
2015
2016         arc_buf_free_on_write(hdr->b_l1hdr.b_tmp_cdata,
2017             hdr->b_size, zio_data_buf_free);
2018
2019         ARCSTAT_BUMP(arcstat_l2_cdata_free_on_write);
2020         hdr->b_l1hdr.b_tmp_cdata = NULL;
2021 }
2022
2023 /*
2024  * Free up buf->b_data and if 'remove' is set, then pull the
2025  * arc_buf_t off of the the arc_buf_hdr_t's list and free it.
2026  */
2027 static void
2028 arc_buf_destroy(arc_buf_t *buf, boolean_t remove)
2029 {
2030         arc_buf_t **bufp;
2031
2032         /* free up data associated with the buf */
2033         if (buf->b_data != NULL) {
2034                 arc_state_t *state = buf->b_hdr->b_l1hdr.b_state;
2035                 uint64_t size = buf->b_hdr->b_size;
2036                 arc_buf_contents_t type = arc_buf_type(buf->b_hdr);
2037
2038                 arc_cksum_verify(buf);
2039                 arc_buf_unwatch(buf);
2040
2041                 if (type == ARC_BUFC_METADATA) {
2042                         arc_buf_data_free(buf, zio_buf_free);
2043                         arc_space_return(size, ARC_SPACE_META);
2044                 } else {
2045                         ASSERT(type == ARC_BUFC_DATA);
2046                         arc_buf_data_free(buf, zio_data_buf_free);
2047                         arc_space_return(size, ARC_SPACE_DATA);
2048                 }
2049
2050                 /* protected by hash lock, if in the hash table */
2051                 if (multilist_link_active(&buf->b_hdr->b_l1hdr.b_arc_node)) {
2052                         uint64_t *cnt = &state->arcs_lsize[type];
2053
2054                         ASSERT(refcount_is_zero(
2055                             &buf->b_hdr->b_l1hdr.b_refcnt));
2056                         ASSERT(state != arc_anon && state != arc_l2c_only);
2057
2058                         ASSERT3U(*cnt, >=, size);
2059                         atomic_add_64(cnt, -size);
2060                 }
2061
2062                 (void) refcount_remove_many(&state->arcs_size, size, buf);
2063                 buf->b_data = NULL;
2064
2065                 /*
2066                  * If we're destroying a duplicate buffer make sure
2067                  * that the appropriate statistics are updated.
2068                  */
2069                 if (buf->b_hdr->b_l1hdr.b_datacnt > 1 &&
2070                     HDR_ISTYPE_DATA(buf->b_hdr)) {
2071                         ARCSTAT_BUMPDOWN(arcstat_duplicate_buffers);
2072                         ARCSTAT_INCR(arcstat_duplicate_buffers_size, -size);
2073                 }
2074                 ASSERT(buf->b_hdr->b_l1hdr.b_datacnt > 0);
2075                 buf->b_hdr->b_l1hdr.b_datacnt -= 1;
2076         }
2077
2078         /* only remove the buf if requested */
2079         if (!remove)
2080                 return;
2081
2082         /* remove the buf from the hdr list */
2083         for (bufp = &buf->b_hdr->b_l1hdr.b_buf; *bufp != buf;
2084             bufp = &(*bufp)->b_next)
2085                 continue;
2086         *bufp = buf->b_next;
2087         buf->b_next = NULL;
2088
2089         ASSERT(buf->b_efunc == NULL);
2090
2091         /* clean up the buf */
2092         buf->b_hdr = NULL;
2093         kmem_cache_free(buf_cache, buf);
2094 }
2095
2096 static void
2097 arc_hdr_l2hdr_destroy(arc_buf_hdr_t *hdr)
2098 {
2099         l2arc_buf_hdr_t *l2hdr = &hdr->b_l2hdr;
2100         l2arc_dev_t *dev = l2hdr->b_dev;
2101
2102         ASSERT(MUTEX_HELD(&dev->l2ad_mtx));
2103         ASSERT(HDR_HAS_L2HDR(hdr));
2104
2105         list_remove(&dev->l2ad_buflist, hdr);
2106
2107         /*
2108          * We don't want to leak the b_tmp_cdata buffer that was
2109          * allocated in l2arc_write_buffers()
2110          */
2111         arc_buf_l2_cdata_free(hdr);
2112
2113         /*
2114          * If the l2hdr's b_daddr is equal to L2ARC_ADDR_UNSET, then
2115          * this header is being processed by l2arc_write_buffers() (i.e.
2116          * it's in the first stage of l2arc_write_buffers()).
2117          * Re-affirming that truth here, just to serve as a reminder. If
2118          * b_daddr does not equal L2ARC_ADDR_UNSET, then the header may or
2119          * may not have its HDR_L2_WRITING flag set. (the write may have
2120          * completed, in which case HDR_L2_WRITING will be false and the
2121          * b_daddr field will point to the address of the buffer on disk).
2122          */
2123         IMPLY(l2hdr->b_daddr == L2ARC_ADDR_UNSET, HDR_L2_WRITING(hdr));
2124
2125         /*
2126          * If b_daddr is equal to L2ARC_ADDR_UNSET, we're racing with
2127          * l2arc_write_buffers(). Since we've just removed this header
2128          * from the l2arc buffer list, this header will never reach the
2129          * second stage of l2arc_write_buffers(), which increments the
2130          * accounting stats for this header. Thus, we must be careful
2131          * not to decrement them for this header either.
2132          */
2133         if (l2hdr->b_daddr != L2ARC_ADDR_UNSET) {
2134                 ARCSTAT_INCR(arcstat_l2_asize, -l2hdr->b_asize);
2135                 ARCSTAT_INCR(arcstat_l2_size, -hdr->b_size);
2136
2137                 vdev_space_update(dev->l2ad_vdev,
2138                     -l2hdr->b_asize, 0, 0);
2139
2140                 (void) refcount_remove_many(&dev->l2ad_alloc,
2141                     l2hdr->b_asize, hdr);
2142         }
2143
2144         hdr->b_flags &= ~ARC_FLAG_HAS_L2HDR;
2145 }
2146
2147 static void
2148 arc_hdr_destroy(arc_buf_hdr_t *hdr)
2149 {
2150         if (HDR_HAS_L1HDR(hdr)) {
2151                 ASSERT(hdr->b_l1hdr.b_buf == NULL ||
2152                     hdr->b_l1hdr.b_datacnt > 0);
2153                 ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
2154                 ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon);
2155         }
2156         ASSERT(!HDR_IO_IN_PROGRESS(hdr));
2157         ASSERT(!HDR_IN_HASH_TABLE(hdr));
2158
2159         if (HDR_HAS_L2HDR(hdr)) {
2160                 l2arc_dev_t *dev = hdr->b_l2hdr.b_dev;
2161                 boolean_t buflist_held = MUTEX_HELD(&dev->l2ad_mtx);
2162
2163                 if (!buflist_held)
2164                         mutex_enter(&dev->l2ad_mtx);
2165
2166                 /*
2167                  * Even though we checked this conditional above, we
2168                  * need to check this again now that we have the
2169                  * l2ad_mtx. This is because we could be racing with
2170                  * another thread calling l2arc_evict() which might have
2171                  * destroyed this header's L2 portion as we were waiting
2172                  * to acquire the l2ad_mtx. If that happens, we don't
2173                  * want to re-destroy the header's L2 portion.
2174                  */
2175                 if (HDR_HAS_L2HDR(hdr))
2176                         arc_hdr_l2hdr_destroy(hdr);
2177
2178                 if (!buflist_held)
2179                         mutex_exit(&dev->l2ad_mtx);
2180         }
2181
2182         if (!BUF_EMPTY(hdr))
2183                 buf_discard_identity(hdr);
2184
2185         if (hdr->b_freeze_cksum != NULL) {
2186                 kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t));
2187                 hdr->b_freeze_cksum = NULL;
2188         }
2189
2190         if (HDR_HAS_L1HDR(hdr)) {
2191                 while (hdr->b_l1hdr.b_buf) {
2192                         arc_buf_t *buf = hdr->b_l1hdr.b_buf;
2193
2194                         if (buf->b_efunc != NULL) {
2195                                 mutex_enter(&arc_user_evicts_lock);
2196                                 mutex_enter(&buf->b_evict_lock);
2197                                 ASSERT(buf->b_hdr != NULL);
2198                                 arc_buf_destroy(hdr->b_l1hdr.b_buf, FALSE);
2199                                 hdr->b_l1hdr.b_buf = buf->b_next;
2200                                 buf->b_hdr = &arc_eviction_hdr;
2201                                 buf->b_next = arc_eviction_list;
2202                                 arc_eviction_list = buf;
2203                                 mutex_exit(&buf->b_evict_lock);
2204                                 cv_signal(&arc_user_evicts_cv);
2205                                 mutex_exit(&arc_user_evicts_lock);
2206                         } else {
2207                                 arc_buf_destroy(hdr->b_l1hdr.b_buf, TRUE);
2208                         }
2209                 }
2210         }
2211
2212         ASSERT3P(hdr->b_hash_next, ==, NULL);
2213         if (HDR_HAS_L1HDR(hdr)) {
2214                 ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node));
2215                 ASSERT3P(hdr->b_l1hdr.b_acb, ==, NULL);
2216                 kmem_cache_free(hdr_full_cache, hdr);
2217         } else {
2218                 kmem_cache_free(hdr_l2only_cache, hdr);
2219         }
2220 }
2221
2222 void
2223 arc_buf_free(arc_buf_t *buf, void *tag)
2224 {
2225         arc_buf_hdr_t *hdr = buf->b_hdr;
2226         int hashed = hdr->b_l1hdr.b_state != arc_anon;
2227
2228         ASSERT(buf->b_efunc == NULL);
2229         ASSERT(buf->b_data != NULL);
2230
2231         if (hashed) {
2232                 kmutex_t *hash_lock = HDR_LOCK(hdr);
2233
2234                 mutex_enter(hash_lock);
2235                 hdr = buf->b_hdr;
2236                 ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
2237
2238                 (void) remove_reference(hdr, hash_lock, tag);
2239                 if (hdr->b_l1hdr.b_datacnt > 1) {
2240                         arc_buf_destroy(buf, TRUE);
2241                 } else {
2242                         ASSERT(buf == hdr->b_l1hdr.b_buf);
2243                         ASSERT(buf->b_efunc == NULL);
2244                         hdr->b_flags |= ARC_FLAG_BUF_AVAILABLE;
2245                 }
2246                 mutex_exit(hash_lock);
2247         } else if (HDR_IO_IN_PROGRESS(hdr)) {
2248                 int destroy_hdr;
2249                 /*
2250                  * We are in the middle of an async write.  Don't destroy
2251                  * this buffer unless the write completes before we finish
2252                  * decrementing the reference count.
2253                  */
2254                 mutex_enter(&arc_user_evicts_lock);
2255                 (void) remove_reference(hdr, NULL, tag);
2256                 ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
2257                 destroy_hdr = !HDR_IO_IN_PROGRESS(hdr);
2258                 mutex_exit(&arc_user_evicts_lock);
2259                 if (destroy_hdr)
2260                         arc_hdr_destroy(hdr);
2261         } else {
2262                 if (remove_reference(hdr, NULL, tag) > 0)
2263                         arc_buf_destroy(buf, TRUE);
2264                 else
2265                         arc_hdr_destroy(hdr);
2266         }
2267 }
2268
2269 boolean_t
2270 arc_buf_remove_ref(arc_buf_t *buf, void* tag)
2271 {
2272         arc_buf_hdr_t *hdr = buf->b_hdr;
2273         kmutex_t *hash_lock = HDR_LOCK(hdr);
2274         boolean_t no_callback = (buf->b_efunc == NULL);
2275
2276         if (hdr->b_l1hdr.b_state == arc_anon) {
2277                 ASSERT(hdr->b_l1hdr.b_datacnt == 1);
2278                 arc_buf_free(buf, tag);
2279                 return (no_callback);
2280         }
2281
2282         mutex_enter(hash_lock);
2283         hdr = buf->b_hdr;
2284         ASSERT(hdr->b_l1hdr.b_datacnt > 0);
2285         ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
2286         ASSERT(hdr->b_l1hdr.b_state != arc_anon);
2287         ASSERT(buf->b_data != NULL);
2288
2289         (void) remove_reference(hdr, hash_lock, tag);
2290         if (hdr->b_l1hdr.b_datacnt > 1) {
2291                 if (no_callback)
2292                         arc_buf_destroy(buf, TRUE);
2293         } else if (no_callback) {
2294                 ASSERT(hdr->b_l1hdr.b_buf == buf && buf->b_next == NULL);
2295                 ASSERT(buf->b_efunc == NULL);
2296                 hdr->b_flags |= ARC_FLAG_BUF_AVAILABLE;
2297         }
2298         ASSERT(no_callback || hdr->b_l1hdr.b_datacnt > 1 ||
2299             refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
2300         mutex_exit(hash_lock);
2301         return (no_callback);
2302 }
2303
2304 uint64_t
2305 arc_buf_size(arc_buf_t *buf)
2306 {
2307         return (buf->b_hdr->b_size);
2308 }
2309
2310 /*
2311  * Called from the DMU to determine if the current buffer should be
2312  * evicted. In order to ensure proper locking, the eviction must be initiated
2313  * from the DMU. Return true if the buffer is associated with user data and
2314  * duplicate buffers still exist.
2315  */
2316 boolean_t
2317 arc_buf_eviction_needed(arc_buf_t *buf)
2318 {
2319         arc_buf_hdr_t *hdr;
2320         boolean_t evict_needed = B_FALSE;
2321
2322         if (zfs_disable_dup_eviction)
2323                 return (B_FALSE);
2324
2325         mutex_enter(&buf->b_evict_lock);
2326         hdr = buf->b_hdr;
2327         if (hdr == NULL) {
2328                 /*
2329                  * We are in arc_do_user_evicts(); let that function
2330                  * perform the eviction.
2331                  */
2332                 ASSERT(buf->b_data == NULL);
2333                 mutex_exit(&buf->b_evict_lock);
2334                 return (B_FALSE);
2335         } else if (buf->b_data == NULL) {
2336                 /*
2337                  * We have already been added to the arc eviction list;
2338                  * recommend eviction.
2339                  */
2340                 ASSERT3P(hdr, ==, &arc_eviction_hdr);
2341                 mutex_exit(&buf->b_evict_lock);
2342                 return (B_TRUE);
2343         }
2344
2345         if (hdr->b_l1hdr.b_datacnt > 1 && HDR_ISTYPE_DATA(hdr))
2346                 evict_needed = B_TRUE;
2347
2348         mutex_exit(&buf->b_evict_lock);
2349         return (evict_needed);
2350 }
2351
2352 /*
2353  * Evict the arc_buf_hdr that is provided as a parameter. The resultant
2354  * state of the header is dependent on its state prior to entering this
2355  * function. The following transitions are possible:
2356  *
2357  *    - arc_mru -> arc_mru_ghost
2358  *    - arc_mfu -> arc_mfu_ghost
2359  *    - arc_mru_ghost -> arc_l2c_only
2360  *    - arc_mru_ghost -> deleted
2361  *    - arc_mfu_ghost -> arc_l2c_only
2362  *    - arc_mfu_ghost -> deleted
2363  */
2364 static int64_t
2365 arc_evict_hdr(arc_buf_hdr_t *hdr, kmutex_t *hash_lock)
2366 {
2367         arc_state_t *evicted_state, *state;
2368         int64_t bytes_evicted = 0;
2369
2370         ASSERT(MUTEX_HELD(hash_lock));
2371         ASSERT(HDR_HAS_L1HDR(hdr));
2372
2373         state = hdr->b_l1hdr.b_state;
2374         if (GHOST_STATE(state)) {
2375                 ASSERT(!HDR_IO_IN_PROGRESS(hdr));
2376                 ASSERT(hdr->b_l1hdr.b_buf == NULL);
2377
2378                 /*
2379                  * l2arc_write_buffers() relies on a header's L1 portion
2380                  * (i.e. its b_tmp_cdata field) during its write phase.
2381                  * Thus, we cannot push a header onto the arc_l2c_only
2382                  * state (removing its L1 piece) until the header is
2383                  * done being written to the l2arc.
2384                  */
2385                 if (HDR_HAS_L2HDR(hdr) && HDR_L2_WRITING(hdr)) {
2386                         ARCSTAT_BUMP(arcstat_evict_l2_skip);
2387                         return (bytes_evicted);
2388                 }
2389
2390                 ARCSTAT_BUMP(arcstat_deleted);
2391                 bytes_evicted += hdr->b_size;
2392
2393                 DTRACE_PROBE1(arc__delete, arc_buf_hdr_t *, hdr);
2394
2395                 if (HDR_HAS_L2HDR(hdr)) {
2396                         /*
2397                          * This buffer is cached on the 2nd Level ARC;
2398                          * don't destroy the header.
2399                          */
2400                         arc_change_state(arc_l2c_only, hdr, hash_lock);
2401                         /*
2402                          * dropping from L1+L2 cached to L2-only,
2403                          * realloc to remove the L1 header.
2404                          */
2405                         hdr = arc_hdr_realloc(hdr, hdr_full_cache,
2406                             hdr_l2only_cache);
2407                 } else {
2408                         arc_change_state(arc_anon, hdr, hash_lock);
2409                         arc_hdr_destroy(hdr);
2410                 }
2411                 return (bytes_evicted);
2412         }
2413
2414         ASSERT(state == arc_mru || state == arc_mfu);
2415         evicted_state = (state == arc_mru) ? arc_mru_ghost : arc_mfu_ghost;
2416
2417         /* prefetch buffers have a minimum lifespan */
2418         if (HDR_IO_IN_PROGRESS(hdr) ||
2419             ((hdr->b_flags & (ARC_FLAG_PREFETCH | ARC_FLAG_INDIRECT)) &&
2420             ddi_get_lbolt() - hdr->b_l1hdr.b_arc_access <
2421             arc_min_prefetch_lifespan)) {
2422                 ARCSTAT_BUMP(arcstat_evict_skip);
2423                 return (bytes_evicted);
2424         }
2425
2426         ASSERT0(refcount_count(&hdr->b_l1hdr.b_refcnt));
2427         ASSERT3U(hdr->b_l1hdr.b_datacnt, >, 0);
2428         while (hdr->b_l1hdr.b_buf) {
2429                 arc_buf_t *buf = hdr->b_l1hdr.b_buf;
2430                 if (!mutex_tryenter(&buf->b_evict_lock)) {
2431                         ARCSTAT_BUMP(arcstat_mutex_miss);
2432                         break;
2433                 }
2434                 if (buf->b_data != NULL)
2435                         bytes_evicted += hdr->b_size;
2436                 if (buf->b_efunc != NULL) {
2437                         mutex_enter(&arc_user_evicts_lock);
2438                         arc_buf_destroy(buf, FALSE);
2439                         hdr->b_l1hdr.b_buf = buf->b_next;
2440                         buf->b_hdr = &arc_eviction_hdr;
2441                         buf->b_next = arc_eviction_list;
2442                         arc_eviction_list = buf;
2443                         cv_signal(&arc_user_evicts_cv);
2444                         mutex_exit(&arc_user_evicts_lock);
2445                         mutex_exit(&buf->b_evict_lock);
2446                 } else {
2447                         mutex_exit(&buf->b_evict_lock);
2448                         arc_buf_destroy(buf, TRUE);
2449                 }
2450         }
2451
2452         if (HDR_HAS_L2HDR(hdr)) {
2453                 ARCSTAT_INCR(arcstat_evict_l2_cached, hdr->b_size);
2454         } else {
2455                 if (l2arc_write_eligible(hdr->b_spa, hdr))
2456                         ARCSTAT_INCR(arcstat_evict_l2_eligible, hdr->b_size);
2457                 else
2458                         ARCSTAT_INCR(arcstat_evict_l2_ineligible, hdr->b_size);
2459         }
2460
2461         if (hdr->b_l1hdr.b_datacnt == 0) {
2462                 arc_change_state(evicted_state, hdr, hash_lock);
2463                 ASSERT(HDR_IN_HASH_TABLE(hdr));
2464                 hdr->b_flags |= ARC_FLAG_IN_HASH_TABLE;
2465                 hdr->b_flags &= ~ARC_FLAG_BUF_AVAILABLE;
2466                 DTRACE_PROBE1(arc__evict, arc_buf_hdr_t *, hdr);
2467         }
2468
2469         return (bytes_evicted);
2470 }
2471
2472 static uint64_t
2473 arc_evict_state_impl(multilist_t *ml, int idx, arc_buf_hdr_t *marker,
2474     uint64_t spa, int64_t bytes)
2475 {
2476         multilist_sublist_t *mls;
2477         uint64_t bytes_evicted = 0;
2478         arc_buf_hdr_t *hdr;
2479         kmutex_t *hash_lock;
2480         int evict_count = 0;
2481
2482         ASSERT3P(marker, !=, NULL);
2483         IMPLY(bytes < 0, bytes == ARC_EVICT_ALL);
2484
2485         mls = multilist_sublist_lock(ml, idx);
2486
2487         for (hdr = multilist_sublist_prev(mls, marker); hdr != NULL;
2488             hdr = multilist_sublist_prev(mls, marker)) {
2489                 if ((bytes != ARC_EVICT_ALL && bytes_evicted >= bytes) ||
2490                     (evict_count >= zfs_arc_evict_batch_limit))
2491                         break;
2492
2493                 /*
2494                  * To keep our iteration location, move the marker
2495                  * forward. Since we're not holding hdr's hash lock, we
2496                  * must be very careful and not remove 'hdr' from the
2497                  * sublist. Otherwise, other consumers might mistake the
2498                  * 'hdr' as not being on a sublist when they call the
2499                  * multilist_link_active() function (they all rely on
2500                  * the hash lock protecting concurrent insertions and
2501                  * removals). multilist_sublist_move_forward() was
2502                  * specifically implemented to ensure this is the case
2503                  * (only 'marker' will be removed and re-inserted).
2504                  */
2505                 multilist_sublist_move_forward(mls, marker);
2506
2507                 /*
2508                  * The only case where the b_spa field should ever be
2509                  * zero, is the marker headers inserted by
2510                  * arc_evict_state(). It's possible for multiple threads
2511                  * to be calling arc_evict_state() concurrently (e.g.
2512                  * dsl_pool_close() and zio_inject_fault()), so we must
2513                  * skip any markers we see from these other threads.
2514                  */
2515                 if (hdr->b_spa == 0)
2516                         continue;
2517
2518                 /* we're only interested in evicting buffers of a certain spa */
2519                 if (spa != 0 && hdr->b_spa != spa) {
2520                         ARCSTAT_BUMP(arcstat_evict_skip);
2521                         continue;
2522                 }
2523
2524                 hash_lock = HDR_LOCK(hdr);
2525
2526                 /*
2527                  * We aren't calling this function from any code path
2528                  * that would already be holding a hash lock, so we're
2529                  * asserting on this assumption to be defensive in case
2530                  * this ever changes. Without this check, it would be
2531                  * possible to incorrectly increment arcstat_mutex_miss
2532                  * below (e.g. if the code changed such that we called
2533                  * this function with a hash lock held).
2534                  */
2535                 ASSERT(!MUTEX_HELD(hash_lock));
2536
2537                 if (mutex_tryenter(hash_lock)) {
2538                         uint64_t evicted = arc_evict_hdr(hdr, hash_lock);
2539                         mutex_exit(hash_lock);
2540
2541                         bytes_evicted += evicted;
2542
2543                         /*
2544                          * If evicted is zero, arc_evict_hdr() must have
2545                          * decided to skip this header, don't increment
2546                          * evict_count in this case.
2547                          */
2548                         if (evicted != 0)
2549                                 evict_count++;
2550
2551                         /*
2552                          * If arc_size isn't overflowing, signal any
2553                          * threads that might happen to be waiting.
2554                          *
2555                          * For each header evicted, we wake up a single
2556                          * thread. If we used cv_broadcast, we could
2557                          * wake up "too many" threads causing arc_size
2558                          * to significantly overflow arc_c; since
2559                          * arc_get_data_buf() doesn't check for overflow
2560                          * when it's woken up (it doesn't because it's
2561                          * possible for the ARC to be overflowing while
2562                          * full of un-evictable buffers, and the
2563                          * function should proceed in this case).
2564                          *
2565                          * If threads are left sleeping, due to not
2566                          * using cv_broadcast, they will be woken up
2567                          * just before arc_reclaim_thread() sleeps.
2568                          */
2569                         mutex_enter(&arc_reclaim_lock);
2570                         if (!arc_is_overflowing())
2571                                 cv_signal(&arc_reclaim_waiters_cv);
2572                         mutex_exit(&arc_reclaim_lock);
2573                 } else {
2574                         ARCSTAT_BUMP(arcstat_mutex_miss);
2575                 }
2576         }
2577
2578         multilist_sublist_unlock(mls);
2579
2580         return (bytes_evicted);
2581 }
2582
2583 /*
2584  * Evict buffers from the given arc state, until we've removed the
2585  * specified number of bytes. Move the removed buffers to the
2586  * appropriate evict state.
2587  *
2588  * This function makes a "best effort". It skips over any buffers
2589  * it can't get a hash_lock on, and so, may not catch all candidates.
2590  * It may also return without evicting as much space as requested.
2591  *
2592  * If bytes is specified using the special value ARC_EVICT_ALL, this
2593  * will evict all available (i.e. unlocked and evictable) buffers from
2594  * the given arc state; which is used by arc_flush().
2595  */
2596 static uint64_t
2597 arc_evict_state(arc_state_t *state, uint64_t spa, int64_t bytes,
2598     arc_buf_contents_t type)
2599 {
2600         uint64_t total_evicted = 0;
2601         multilist_t *ml = &state->arcs_list[type];
2602         int num_sublists;
2603         arc_buf_hdr_t **markers;
2604         int i;
2605
2606         IMPLY(bytes < 0, bytes == ARC_EVICT_ALL);
2607
2608         num_sublists = multilist_get_num_sublists(ml);
2609
2610         /*
2611          * If we've tried to evict from each sublist, made some
2612          * progress, but still have not hit the target number of bytes
2613          * to evict, we want to keep trying. The markers allow us to
2614          * pick up where we left off for each individual sublist, rather
2615          * than starting from the tail each time.
2616          */
2617         markers = kmem_zalloc(sizeof (*markers) * num_sublists, KM_SLEEP);
2618         for (i = 0; i < num_sublists; i++) {
2619                 multilist_sublist_t *mls;
2620
2621                 markers[i] = kmem_cache_alloc(hdr_full_cache, KM_SLEEP);
2622
2623                 /*
2624                  * A b_spa of 0 is used to indicate that this header is
2625                  * a marker. This fact is used in arc_adjust_type() and
2626                  * arc_evict_state_impl().
2627                  */
2628                 markers[i]->b_spa = 0;
2629
2630                 mls = multilist_sublist_lock(ml, i);
2631                 multilist_sublist_insert_tail(mls, markers[i]);
2632                 multilist_sublist_unlock(mls);
2633         }
2634
2635         /*
2636          * While we haven't hit our target number of bytes to evict, or
2637          * we're evicting all available buffers.
2638          */
2639         while (total_evicted < bytes || bytes == ARC_EVICT_ALL) {
2640                 int sublist_idx = multilist_get_random_index(ml);
2641                 uint64_t scan_evicted = 0;
2642
2643                 /*
2644                  * Try to reduce pinned dnodes with a floor of arc_dnode_limit.
2645                  * Request that 10% of the LRUs be scanned by the superblock
2646                  * shrinker.
2647                  */
2648                 if (type == ARC_BUFC_DATA && arc_dnode_size > arc_dnode_limit)
2649                         arc_prune_async((arc_dnode_size - arc_dnode_limit) /
2650                             sizeof (dnode_t) / zfs_arc_dnode_reduce_percent);
2651
2652                 /*
2653                  * Start eviction using a randomly selected sublist,
2654                  * this is to try and evenly balance eviction across all
2655                  * sublists. Always starting at the same sublist
2656                  * (e.g. index 0) would cause evictions to favor certain
2657                  * sublists over others.
2658                  */
2659                 for (i = 0; i < num_sublists; i++) {
2660                         uint64_t bytes_remaining;
2661                         uint64_t bytes_evicted;
2662
2663                         if (bytes == ARC_EVICT_ALL)
2664                                 bytes_remaining = ARC_EVICT_ALL;
2665                         else if (total_evicted < bytes)
2666                                 bytes_remaining = bytes - total_evicted;
2667                         else
2668                                 break;
2669
2670                         bytes_evicted = arc_evict_state_impl(ml, sublist_idx,
2671                             markers[sublist_idx], spa, bytes_remaining);
2672
2673                         scan_evicted += bytes_evicted;
2674                         total_evicted += bytes_evicted;
2675
2676                         /* we've reached the end, wrap to the beginning */
2677                         if (++sublist_idx >= num_sublists)
2678                                 sublist_idx = 0;
2679                 }
2680
2681                 /*
2682                  * If we didn't evict anything during this scan, we have
2683                  * no reason to believe we'll evict more during another
2684                  * scan, so break the loop.
2685                  */
2686                 if (scan_evicted == 0) {
2687                         /* This isn't possible, let's make that obvious */
2688                         ASSERT3S(bytes, !=, 0);
2689
2690                         /*
2691                          * When bytes is ARC_EVICT_ALL, the only way to
2692                          * break the loop is when scan_evicted is zero.
2693                          * In that case, we actually have evicted enough,
2694                          * so we don't want to increment the kstat.
2695                          */
2696                         if (bytes != ARC_EVICT_ALL) {
2697                                 ASSERT3S(total_evicted, <, bytes);
2698                                 ARCSTAT_BUMP(arcstat_evict_not_enough);
2699                         }
2700
2701                         break;
2702                 }
2703         }
2704
2705         for (i = 0; i < num_sublists; i++) {
2706                 multilist_sublist_t *mls = multilist_sublist_lock(ml, i);
2707                 multilist_sublist_remove(mls, markers[i]);
2708                 multilist_sublist_unlock(mls);
2709
2710                 kmem_cache_free(hdr_full_cache, markers[i]);
2711         }
2712         kmem_free(markers, sizeof (*markers) * num_sublists);
2713
2714         return (total_evicted);
2715 }
2716
2717 /*
2718  * Flush all "evictable" data of the given type from the arc state
2719  * specified. This will not evict any "active" buffers (i.e. referenced).
2720  *
2721  * When 'retry' is set to FALSE, the function will make a single pass
2722  * over the state and evict any buffers that it can. Since it doesn't
2723  * continually retry the eviction, it might end up leaving some buffers
2724  * in the ARC due to lock misses.
2725  *
2726  * When 'retry' is set to TRUE, the function will continually retry the
2727  * eviction until *all* evictable buffers have been removed from the
2728  * state. As a result, if concurrent insertions into the state are
2729  * allowed (e.g. if the ARC isn't shutting down), this function might
2730  * wind up in an infinite loop, continually trying to evict buffers.
2731  */
2732 static uint64_t
2733 arc_flush_state(arc_state_t *state, uint64_t spa, arc_buf_contents_t type,
2734     boolean_t retry)
2735 {
2736         uint64_t evicted = 0;
2737
2738         while (state->arcs_lsize[type] != 0) {
2739                 evicted += arc_evict_state(state, spa, ARC_EVICT_ALL, type);
2740
2741                 if (!retry)
2742                         break;
2743         }
2744
2745         return (evicted);
2746 }
2747
2748 /*
2749  * Helper function for arc_prune_async() it is responsible for safely
2750  * handling the execution of a registered arc_prune_func_t.
2751  */
2752 static void
2753 arc_prune_task(void *ptr)
2754 {
2755         arc_prune_t *ap = (arc_prune_t *)ptr;
2756         arc_prune_func_t *func = ap->p_pfunc;
2757
2758         if (func != NULL)
2759                 func(ap->p_adjust, ap->p_private);
2760
2761         refcount_remove(&ap->p_refcnt, func);
2762 }
2763
2764 /*
2765  * Notify registered consumers they must drop holds on a portion of the ARC
2766  * buffered they reference.  This provides a mechanism to ensure the ARC can
2767  * honor the arc_meta_limit and reclaim otherwise pinned ARC buffers.  This
2768  * is analogous to dnlc_reduce_cache() but more generic.
2769  *
2770  * This operation is performed asynchronously so it may be safely called
2771  * in the context of the arc_reclaim_thread().  A reference is taken here
2772  * for each registered arc_prune_t and the arc_prune_task() is responsible
2773  * for releasing it once the registered arc_prune_func_t has completed.
2774  */
2775 static void
2776 arc_prune_async(int64_t adjust)
2777 {
2778         arc_prune_t *ap;
2779
2780         mutex_enter(&arc_prune_mtx);
2781         for (ap = list_head(&arc_prune_list); ap != NULL;
2782             ap = list_next(&arc_prune_list, ap)) {
2783
2784                 if (refcount_count(&ap->p_refcnt) >= 2)
2785                         continue;
2786
2787                 refcount_add(&ap->p_refcnt, ap->p_pfunc);
2788                 ap->p_adjust = adjust;
2789                 taskq_dispatch(arc_prune_taskq, arc_prune_task, ap, TQ_SLEEP);
2790                 ARCSTAT_BUMP(arcstat_prune);
2791         }
2792         mutex_exit(&arc_prune_mtx);
2793 }
2794
2795 /*
2796  * Evict the specified number of bytes from the state specified,
2797  * restricting eviction to the spa and type given. This function
2798  * prevents us from trying to evict more from a state's list than
2799  * is "evictable", and to skip evicting altogether when passed a
2800  * negative value for "bytes". In contrast, arc_evict_state() will
2801  * evict everything it can, when passed a negative value for "bytes".
2802  */
2803 static uint64_t
2804 arc_adjust_impl(arc_state_t *state, uint64_t spa, int64_t bytes,
2805     arc_buf_contents_t type)
2806 {
2807         int64_t delta;
2808
2809         if (bytes > 0 && state->arcs_lsize[type] > 0) {
2810                 delta = MIN(state->arcs_lsize[type], bytes);
2811                 return (arc_evict_state(state, spa, delta, type));
2812         }
2813
2814         return (0);
2815 }
2816
2817 /*
2818  * The goal of this function is to evict enough meta data buffers from the
2819  * ARC in order to enforce the arc_meta_limit.  Achieving this is slightly
2820  * more complicated than it appears because it is common for data buffers
2821  * to have holds on meta data buffers.  In addition, dnode meta data buffers
2822  * will be held by the dnodes in the block preventing them from being freed.
2823  * This means we can't simply traverse the ARC and expect to always find
2824  * enough unheld meta data buffer to release.
2825  *
2826  * Therefore, this function has been updated to make alternating passes
2827  * over the ARC releasing data buffers and then newly unheld meta data
2828  * buffers.  This ensures forward progress is maintained and arc_meta_used
2829  * will decrease.  Normally this is sufficient, but if required the ARC
2830  * will call the registered prune callbacks causing dentry and inodes to
2831  * be dropped from the VFS cache.  This will make dnode meta data buffers
2832  * available for reclaim.
2833  */
2834 static uint64_t
2835 arc_adjust_meta_balanced(void)
2836 {
2837         int64_t adjustmnt, delta, prune = 0;
2838         uint64_t total_evicted = 0;
2839         arc_buf_contents_t type = ARC_BUFC_DATA;
2840         int restarts = MAX(zfs_arc_meta_adjust_restarts, 0);
2841
2842 restart:
2843         /*
2844          * This slightly differs than the way we evict from the mru in
2845          * arc_adjust because we don't have a "target" value (i.e. no
2846          * "meta" arc_p). As a result, I think we can completely
2847          * cannibalize the metadata in the MRU before we evict the
2848          * metadata from the MFU. I think we probably need to implement a
2849          * "metadata arc_p" value to do this properly.
2850          */
2851         adjustmnt = arc_meta_used - arc_meta_limit;
2852
2853         if (adjustmnt > 0 && arc_mru->arcs_lsize[type] > 0) {
2854                 delta = MIN(arc_mru->arcs_lsize[type], adjustmnt);
2855                 total_evicted += arc_adjust_impl(arc_mru, 0, delta, type);
2856                 adjustmnt -= delta;
2857         }
2858
2859         /*
2860          * We can't afford to recalculate adjustmnt here. If we do,
2861          * new metadata buffers can sneak into the MRU or ANON lists,
2862          * thus penalize the MFU metadata. Although the fudge factor is
2863          * small, it has been empirically shown to be significant for
2864          * certain workloads (e.g. creating many empty directories). As
2865          * such, we use the original calculation for adjustmnt, and
2866          * simply decrement the amount of data evicted from the MRU.
2867          */
2868
2869         if (adjustmnt > 0 && arc_mfu->arcs_lsize[type] > 0) {
2870                 delta = MIN(arc_mfu->arcs_lsize[type], adjustmnt);
2871                 total_evicted += arc_adjust_impl(arc_mfu, 0, delta, type);
2872         }
2873
2874         adjustmnt = arc_meta_used - arc_meta_limit;
2875
2876         if (adjustmnt > 0 && arc_mru_ghost->arcs_lsize[type] > 0) {
2877                 delta = MIN(adjustmnt,
2878                     arc_mru_ghost->arcs_lsize[type]);
2879                 total_evicted += arc_adjust_impl(arc_mru_ghost, 0, delta, type);
2880                 adjustmnt -= delta;
2881         }
2882
2883         if (adjustmnt > 0 && arc_mfu_ghost->arcs_lsize[type] > 0) {
2884                 delta = MIN(adjustmnt,
2885                     arc_mfu_ghost->arcs_lsize[type]);
2886                 total_evicted += arc_adjust_impl(arc_mfu_ghost, 0, delta, type);
2887         }
2888
2889         /*
2890          * If after attempting to make the requested adjustment to the ARC
2891          * the meta limit is still being exceeded then request that the
2892          * higher layers drop some cached objects which have holds on ARC
2893          * meta buffers.  Requests to the upper layers will be made with
2894          * increasingly large scan sizes until the ARC is below the limit.
2895          */
2896         if (arc_meta_used > arc_meta_limit) {
2897                 if (type == ARC_BUFC_DATA) {
2898                         type = ARC_BUFC_METADATA;
2899                 } else {
2900                         type = ARC_BUFC_DATA;
2901
2902                         if (zfs_arc_meta_prune) {
2903                                 prune += zfs_arc_meta_prune;
2904                                 arc_prune_async(prune);
2905                         }
2906                 }
2907
2908                 if (restarts > 0) {
2909                         restarts--;
2910                         goto restart;
2911                 }
2912         }
2913         return (total_evicted);
2914 }
2915
2916 /*
2917  * Evict metadata buffers from the cache, such that arc_meta_used is
2918  * capped by the arc_meta_limit tunable.
2919  */
2920 static uint64_t
2921 arc_adjust_meta_only(void)
2922 {
2923         uint64_t total_evicted = 0;
2924         int64_t target;
2925
2926         /*
2927          * If we're over the meta limit, we want to evict enough
2928          * metadata to get back under the meta limit. We don't want to
2929          * evict so much that we drop the MRU below arc_p, though. If
2930          * we're over the meta limit more than we're over arc_p, we
2931          * evict some from the MRU here, and some from the MFU below.
2932          */
2933         target = MIN((int64_t)(arc_meta_used - arc_meta_limit),
2934             (int64_t)(refcount_count(&arc_anon->arcs_size) +
2935             refcount_count(&arc_mru->arcs_size) - arc_p));
2936
2937         total_evicted += arc_adjust_impl(arc_mru, 0, target, ARC_BUFC_METADATA);
2938
2939         /*
2940          * Similar to the above, we want to evict enough bytes to get us
2941          * below the meta limit, but not so much as to drop us below the
2942          * space alloted to the MFU (which is defined as arc_c - arc_p).
2943          */
2944         target = MIN((int64_t)(arc_meta_used - arc_meta_limit),
2945             (int64_t)(refcount_count(&arc_mfu->arcs_size) - (arc_c - arc_p)));
2946
2947         total_evicted += arc_adjust_impl(arc_mfu, 0, target, ARC_BUFC_METADATA);
2948
2949         return (total_evicted);
2950 }
2951
2952 static uint64_t
2953 arc_adjust_meta(void)
2954 {
2955         if (zfs_arc_meta_strategy == ARC_STRATEGY_META_ONLY)
2956                 return (arc_adjust_meta_only());
2957         else
2958                 return (arc_adjust_meta_balanced());
2959 }
2960
2961 /*
2962  * Return the type of the oldest buffer in the given arc state
2963  *
2964  * This function will select a random sublist of type ARC_BUFC_DATA and
2965  * a random sublist of type ARC_BUFC_METADATA. The tail of each sublist
2966  * is compared, and the type which contains the "older" buffer will be
2967  * returned.
2968  */
2969 static arc_buf_contents_t
2970 arc_adjust_type(arc_state_t *state)
2971 {
2972         multilist_t *data_ml = &state->arcs_list[ARC_BUFC_DATA];
2973         multilist_t *meta_ml = &state->arcs_list[ARC_BUFC_METADATA];
2974         int data_idx = multilist_get_random_index(data_ml);
2975         int meta_idx = multilist_get_random_index(meta_ml);
2976         multilist_sublist_t *data_mls;
2977         multilist_sublist_t *meta_mls;
2978         arc_buf_contents_t type;
2979         arc_buf_hdr_t *data_hdr;
2980         arc_buf_hdr_t *meta_hdr;
2981
2982         /*
2983          * We keep the sublist lock until we're finished, to prevent
2984          * the headers from being destroyed via arc_evict_state().
2985          */
2986         data_mls = multilist_sublist_lock(data_ml, data_idx);
2987         meta_mls = multilist_sublist_lock(meta_ml, meta_idx);
2988
2989         /*
2990          * These two loops are to ensure we skip any markers that
2991          * might be at the tail of the lists due to arc_evict_state().
2992          */
2993
2994         for (data_hdr = multilist_sublist_tail(data_mls); data_hdr != NULL;
2995             data_hdr = multilist_sublist_prev(data_mls, data_hdr)) {
2996                 if (data_hdr->b_spa != 0)
2997                         break;
2998         }
2999
3000         for (meta_hdr = multilist_sublist_tail(meta_mls); meta_hdr != NULL;
3001             meta_hdr = multilist_sublist_prev(meta_mls, meta_hdr)) {
3002                 if (meta_hdr->b_spa != 0)
3003                         break;
3004         }
3005
3006         if (data_hdr == NULL && meta_hdr == NULL) {
3007                 type = ARC_BUFC_DATA;
3008         } else if (data_hdr == NULL) {
3009                 ASSERT3P(meta_hdr, !=, NULL);
3010                 type = ARC_BUFC_METADATA;
3011         } else if (meta_hdr == NULL) {
3012                 ASSERT3P(data_hdr, !=, NULL);
3013                 type = ARC_BUFC_DATA;
3014         } else {
3015                 ASSERT3P(data_hdr, !=, NULL);
3016                 ASSERT3P(meta_hdr, !=, NULL);
3017
3018                 /* The headers can't be on the sublist without an L1 header */
3019                 ASSERT(HDR_HAS_L1HDR(data_hdr));
3020                 ASSERT(HDR_HAS_L1HDR(meta_hdr));
3021
3022                 if (data_hdr->b_l1hdr.b_arc_access <
3023                     meta_hdr->b_l1hdr.b_arc_access) {
3024                         type = ARC_BUFC_DATA;
3025                 } else {
3026                         type = ARC_BUFC_METADATA;
3027                 }
3028         }
3029
3030         multilist_sublist_unlock(meta_mls);
3031         multilist_sublist_unlock(data_mls);
3032
3033         return (type);
3034 }
3035
3036 /*
3037  * Evict buffers from the cache, such that arc_size is capped by arc_c.
3038  */
3039 static uint64_t
3040 arc_adjust(void)
3041 {
3042         uint64_t total_evicted = 0;
3043         uint64_t bytes;
3044         int64_t target;
3045
3046         /*
3047          * If we're over arc_meta_limit, we want to correct that before
3048          * potentially evicting data buffers below.
3049          */
3050         total_evicted += arc_adjust_meta();
3051
3052         /*
3053          * Adjust MRU size
3054          *
3055          * If we're over the target cache size, we want to evict enough
3056          * from the list to get back to our target size. We don't want
3057          * to evict too much from the MRU, such that it drops below
3058          * arc_p. So, if we're over our target cache size more than
3059          * the MRU is over arc_p, we'll evict enough to get back to
3060          * arc_p here, and then evict more from the MFU below.
3061          */
3062         target = MIN((int64_t)(arc_size - arc_c),
3063             (int64_t)(refcount_count(&arc_anon->arcs_size) +
3064             refcount_count(&arc_mru->arcs_size) + arc_meta_used - arc_p));
3065
3066         /*
3067          * If we're below arc_meta_min, always prefer to evict data.
3068          * Otherwise, try to satisfy the requested number of bytes to
3069          * evict from the type which contains older buffers; in an
3070          * effort to keep newer buffers in the cache regardless of their
3071          * type. If we cannot satisfy the number of bytes from this
3072          * type, spill over into the next type.
3073          */
3074         if (arc_adjust_type(arc_mru) == ARC_BUFC_METADATA &&
3075             arc_meta_used > arc_meta_min) {
3076                 bytes = arc_adjust_impl(arc_mru, 0, target, ARC_BUFC_METADATA);
3077                 total_evicted += bytes;
3078
3079                 /*
3080                  * If we couldn't evict our target number of bytes from
3081                  * metadata, we try to get the rest from data.
3082                  */
3083                 target -= bytes;
3084
3085                 total_evicted +=
3086                     arc_adjust_impl(arc_mru, 0, target, ARC_BUFC_DATA);
3087         } else {
3088                 bytes = arc_adjust_impl(arc_mru, 0, target, ARC_BUFC_DATA);
3089                 total_evicted += bytes;
3090
3091                 /*
3092                  * If we couldn't evict our target number of bytes from
3093                  * data, we try to get the rest from metadata.
3094                  */
3095                 target -= bytes;
3096
3097                 total_evicted +=
3098                     arc_adjust_impl(arc_mru, 0, target, ARC_BUFC_METADATA);
3099         }
3100
3101         /*
3102          * Adjust MFU size
3103          *
3104          * Now that we've tried to evict enough from the MRU to get its
3105          * size back to arc_p, if we're still above the target cache
3106          * size, we evict the rest from the MFU.
3107          */
3108         target = arc_size - arc_c;
3109
3110         if (arc_adjust_type(arc_mfu) == ARC_BUFC_METADATA &&
3111             arc_meta_used > arc_meta_min) {
3112                 bytes = arc_adjust_impl(arc_mfu, 0, target, ARC_BUFC_METADATA);
3113                 total_evicted += bytes;
3114
3115                 /*
3116                  * If we couldn't evict our target number of bytes from
3117                  * metadata, we try to get the rest from data.
3118                  */
3119                 target -= bytes;
3120
3121                 total_evicted +=
3122                     arc_adjust_impl(arc_mfu, 0, target, ARC_BUFC_DATA);
3123         } else {
3124                 bytes = arc_adjust_impl(arc_mfu, 0, target, ARC_BUFC_DATA);
3125                 total_evicted += bytes;
3126
3127                 /*
3128                  * If we couldn't evict our target number of bytes from
3129                  * data, we try to get the rest from data.
3130                  */
3131                 target -= bytes;
3132
3133                 total_evicted +=
3134                     arc_adjust_impl(arc_mfu, 0, target, ARC_BUFC_METADATA);
3135         }
3136
3137         /*
3138          * Adjust ghost lists
3139          *
3140          * In addition to the above, the ARC also defines target values
3141          * for the ghost lists. The sum of the mru list and mru ghost
3142          * list should never exceed the target size of the cache, and
3143          * the sum of the mru list, mfu list, mru ghost list, and mfu
3144          * ghost list should never exceed twice the target size of the
3145          * cache. The following logic enforces these limits on the ghost
3146          * caches, and evicts from them as needed.
3147          */
3148         target = refcount_count(&arc_mru->arcs_size) +
3149             refcount_count(&arc_mru_ghost->arcs_size) - arc_c;
3150
3151         bytes = arc_adjust_impl(arc_mru_ghost, 0, target, ARC_BUFC_DATA);
3152         total_evicted += bytes;
3153
3154         target -= bytes;
3155
3156         total_evicted +=
3157             arc_adjust_impl(arc_mru_ghost, 0, target, ARC_BUFC_METADATA);
3158
3159         /*
3160          * We assume the sum of the mru list and mfu list is less than
3161          * or equal to arc_c (we enforced this above), which means we
3162          * can use the simpler of the two equations below:
3163          *
3164          *      mru + mfu + mru ghost + mfu ghost <= 2 * arc_c
3165          *                  mru ghost + mfu ghost <= arc_c
3166          */
3167         target = refcount_count(&arc_mru_ghost->arcs_size) +
3168             refcount_count(&arc_mfu_ghost->arcs_size) - arc_c;
3169
3170         bytes = arc_adjust_impl(arc_mfu_ghost, 0, target, ARC_BUFC_DATA);
3171         total_evicted += bytes;
3172
3173         target -= bytes;
3174
3175         total_evicted +=
3176             arc_adjust_impl(arc_mfu_ghost, 0, target, ARC_BUFC_METADATA);
3177
3178         return (total_evicted);
3179 }
3180
3181 static void
3182 arc_do_user_evicts(void)
3183 {
3184         mutex_enter(&arc_user_evicts_lock);
3185         while (arc_eviction_list != NULL) {
3186                 arc_buf_t *buf = arc_eviction_list;
3187                 arc_eviction_list = buf->b_next;
3188                 mutex_enter(&buf->b_evict_lock);
3189                 buf->b_hdr = NULL;
3190                 mutex_exit(&buf->b_evict_lock);
3191                 mutex_exit(&arc_user_evicts_lock);
3192
3193                 if (buf->b_efunc != NULL)
3194                         VERIFY0(buf->b_efunc(buf->b_private));
3195
3196                 buf->b_efunc = NULL;
3197                 buf->b_private = NULL;
3198                 kmem_cache_free(buf_cache, buf);
3199                 mutex_enter(&arc_user_evicts_lock);
3200         }
3201         mutex_exit(&arc_user_evicts_lock);
3202 }
3203
3204 void
3205 arc_flush(spa_t *spa, boolean_t retry)
3206 {
3207         uint64_t guid = 0;
3208
3209         /*
3210          * If retry is TRUE, a spa must not be specified since we have
3211          * no good way to determine if all of a spa's buffers have been
3212          * evicted from an arc state.
3213          */
3214         ASSERT(!retry || spa == 0);
3215
3216         if (spa != NULL)
3217                 guid = spa_load_guid(spa);
3218
3219         (void) arc_flush_state(arc_mru, guid, ARC_BUFC_DATA, retry);
3220         (void) arc_flush_state(arc_mru, guid, ARC_BUFC_METADATA, retry);
3221
3222         (void) arc_flush_state(arc_mfu, guid, ARC_BUFC_DATA, retry);
3223         (void) arc_flush_state(arc_mfu, guid, ARC_BUFC_METADATA, retry);
3224
3225         (void) arc_flush_state(arc_mru_ghost, guid, ARC_BUFC_DATA, retry);
3226         (void) arc_flush_state(arc_mru_ghost, guid, ARC_BUFC_METADATA, retry);
3227
3228         (void) arc_flush_state(arc_mfu_ghost, guid, ARC_BUFC_DATA, retry);
3229         (void) arc_flush_state(arc_mfu_ghost, guid, ARC_BUFC_METADATA, retry);
3230
3231         arc_do_user_evicts();
3232         ASSERT(spa || arc_eviction_list == NULL);
3233 }
3234
3235 void
3236 arc_shrink(int64_t to_free)
3237 {
3238         uint64_t c = arc_c;
3239
3240         if (c > to_free && c - to_free > arc_c_min) {
3241                 arc_c = c - to_free;
3242                 atomic_add_64(&arc_p, -(arc_p >> arc_shrink_shift));
3243                 if (arc_c > arc_size)
3244                         arc_c = MAX(arc_size, arc_c_min);
3245                 if (arc_p > arc_c)
3246                         arc_p = (arc_c >> 1);
3247                 ASSERT(arc_c >= arc_c_min);
3248                 ASSERT((int64_t)arc_p >= 0);
3249         } else {
3250                 arc_c = arc_c_min;
3251         }
3252
3253         if (arc_size > arc_c)
3254                 (void) arc_adjust();
3255 }
3256
/*
 * Identifies which check in arc_available_memory() produced the lowest
 * (i.e. deciding) estimate of available memory.
 */
typedef enum free_memory_reason_t {
        FMR_UNKNOWN = 0,                /* no check lowered the estimate */
        FMR_NEEDFREE = 1,               /* needfree pages outstanding */
        FMR_LOTSFREE = 2,               /* freemem vs. lotsfree check */
        FMR_SWAPFS_MINFREE = 3,         /* availrmem vs. swapfs check */
        FMR_PAGES_PP_MAXIMUM = 4,       /* availrmem vs. pages_pp_maximum */
        FMR_HEAP_ARENA = 5,             /* heap_arena free space check */
        FMR_ZIO_ARENA = 6,              /* zio_arena free space check */
} free_memory_reason_t;

/* Result and deciding reason of the most recent arc_available_memory(). */
int64_t last_free_memory;
free_memory_reason_t last_free_reason;

#ifdef _KERNEL
/* Additional reserve of pages for pp_reserve. */
int64_t arc_pages_pp_reserve = 64;

/* Additional reserve of pages for swapfs. */
int64_t arc_swapfs_reserve = 64;
#endif /* _KERNEL */
3281
3282 /*
3283  * Return the amount of memory that can be consumed before reclaim will be
3284  * needed.  Positive if there is sufficient free memory, negative indicates
3285  * the amount of memory that needs to be freed up.
3286  */
3287 static int64_t
3288 arc_available_memory(void)
3289 {
3290         int64_t lowest = INT64_MAX;
3291         free_memory_reason_t r = FMR_UNKNOWN;
3292 #ifdef _KERNEL
3293         int64_t n;
3294 #ifdef __linux__
3295         pgcnt_t needfree = btop(arc_need_free);
3296         pgcnt_t lotsfree = btop(arc_sys_free);
3297         pgcnt_t desfree = 0;
3298 #endif
3299
3300         if (needfree > 0) {
3301                 n = PAGESIZE * (-needfree);
3302                 if (n < lowest) {
3303                         lowest = n;
3304                         r = FMR_NEEDFREE;
3305                 }
3306         }
3307
3308         /*
3309          * check that we're out of range of the pageout scanner.  It starts to
3310          * schedule paging if freemem is less than lotsfree and needfree.
3311          * lotsfree is the high-water mark for pageout, and needfree is the
3312          * number of needed free pages.  We add extra pages here to make sure
3313          * the scanner doesn't start up while we're freeing memory.
3314          */
3315         n = PAGESIZE * (freemem - lotsfree - needfree - desfree);
3316         if (n < lowest) {
3317                 lowest = n;
3318                 r = FMR_LOTSFREE;
3319         }
3320
3321 #ifndef __linux__
3322         /*
3323          * check to make sure that swapfs has enough space so that anon
3324          * reservations can still succeed. anon_resvmem() checks that the
3325          * availrmem is greater than swapfs_minfree, and the number of reserved
3326          * swap pages.  We also add a bit of extra here just to prevent
3327          * circumstances from getting really dire.
3328          */
3329         n = PAGESIZE * (availrmem - swapfs_minfree - swapfs_reserve -
3330             desfree - arc_swapfs_reserve);
3331         if (n < lowest) {
3332                 lowest = n;
3333                 r = FMR_SWAPFS_MINFREE;
3334         }
3335
3336
3337         /*
3338          * Check that we have enough availrmem that memory locking (e.g., via
3339          * mlock(3C) or memcntl(2)) can still succeed.  (pages_pp_maximum
3340          * stores the number of pages that cannot be locked; when availrmem
3341          * drops below pages_pp_maximum, page locking mechanisms such as
3342          * page_pp_lock() will fail.)
3343          */
3344         n = PAGESIZE * (availrmem - pages_pp_maximum -
3345             arc_pages_pp_reserve);
3346         if (n < lowest) {
3347                 lowest = n;
3348                 r = FMR_PAGES_PP_MAXIMUM;
3349         }
3350 #endif
3351
3352 #if defined(__i386)
3353         /*
3354          * If we're on an i386 platform, it's possible that we'll exhaust the
3355          * kernel heap space before we ever run out of available physical
3356          * memory.  Most checks of the size of the heap_area compare against
3357          * tune.t_minarmem, which is the minimum available real memory that we
3358          * can have in the system.  However, this is generally fixed at 25 pages
3359          * which is so low that it's useless.  In this comparison, we seek to
3360          * calculate the total heap-size, and reclaim if more than 3/4ths of the
3361          * heap is allocated.  (Or, in the calculation, if less than 1/4th is
3362          * free)
3363          */
3364         n = vmem_size(heap_arena, VMEM_FREE) -
3365             (vmem_size(heap_arena, VMEM_FREE | VMEM_ALLOC) >> 2);
3366         if (n < lowest) {
3367                 lowest = n;
3368                 r = FMR_HEAP_ARENA;
3369         }
3370 #endif
3371
3372         /*
3373          * If zio data pages are being allocated out of a separate heap segment,
3374          * then enforce that the size of available vmem for this arena remains
3375          * above about 1/16th free.
3376          *
3377          * Note: The 1/16th arena free requirement was put in place
3378          * to aggressively evict memory from the arc in order to avoid
3379          * memory fragmentation issues.
3380          */
3381         if (zio_arena != NULL) {
3382                 n = vmem_size(zio_arena, VMEM_FREE) -
3383                     (vmem_size(zio_arena, VMEM_ALLOC) >> 4);
3384                 if (n < lowest) {
3385                         lowest = n;
3386                         r = FMR_ZIO_ARENA;
3387                 }
3388         }
3389 #else /* _KERNEL */
3390         /* Every 100 calls, free a small amount */
3391         if (spa_get_random(100) == 0)
3392                 lowest = -1024;
3393 #endif /* _KERNEL */
3394
3395         last_free_memory = lowest;
3396         last_free_reason = r;
3397
3398         return (lowest);
3399 }
3400
3401 /*
3402  * Determine if the system is under memory pressure and is asking
3403  * to reclaim memory. A return value of TRUE indicates that the system
3404  * is under memory pressure and that the arc should adjust accordingly.
3405  */
3406 static boolean_t
3407 arc_reclaim_needed(void)
3408 {
3409         return (arc_available_memory() < 0);
3410 }
3411
3412 static void
3413 arc_kmem_reap_now(void)
3414 {
3415         size_t                  i;
3416         kmem_cache_t            *prev_cache = NULL;
3417         kmem_cache_t            *prev_data_cache = NULL;
3418         extern kmem_cache_t     *zio_buf_cache[];
3419         extern kmem_cache_t     *zio_data_buf_cache[];
3420         extern kmem_cache_t     *range_seg_cache;
3421
3422         if ((arc_meta_used >= arc_meta_limit) && zfs_arc_meta_prune) {
3423                 /*
3424                  * We are exceeding our meta-data cache limit.
3425                  * Prune some entries to release holds on meta-data.
3426                  */
3427                 arc_prune_async(zfs_arc_meta_prune);
3428         }
3429
3430         for (i = 0; i < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; i++) {
3431 #ifdef _ILP32
3432                 /* reach upper limit of cache size on 32-bit */
3433                 if (zio_buf_cache[i] == NULL)
3434                         break;
3435 #endif
3436                 if (zio_buf_cache[i] != prev_cache) {
3437                         prev_cache = zio_buf_cache[i];
3438                         kmem_cache_reap_now(zio_buf_cache[i]);
3439                 }
3440                 if (zio_data_buf_cache[i] != prev_data_cache) {
3441                         prev_data_cache = zio_data_buf_cache[i];
3442                         kmem_cache_reap_now(zio_data_buf_cache[i]);
3443                 }
3444         }
3445         kmem_cache_reap_now(buf_cache);
3446         kmem_cache_reap_now(hdr_full_cache);
3447         kmem_cache_reap_now(hdr_l2only_cache);
3448         kmem_cache_reap_now(range_seg_cache);
3449
3450         if (zio_arena != NULL) {
3451                 /*
3452                  * Ask the vmem arena to reclaim unused memory from its
3453                  * quantum caches.
3454                  */
3455                 vmem_qcache_reap(zio_arena);
3456         }
3457 }
3458
3459 /*
3460  * Threads can block in arc_get_data_buf() waiting for this thread to evict
3461  * enough data and signal them to proceed. When this happens, the threads in
3462  * arc_get_data_buf() are sleeping while holding the hash lock for their
3463  * particular arc header. Thus, we must be careful to never sleep on a
3464  * hash lock in this thread. This is to prevent the following deadlock:
3465  *
3466  *  - Thread A sleeps on CV in arc_get_data_buf() holding hash lock "L",
3467  *    waiting for the reclaim thread to signal it.
3468  *
3469  *  - arc_reclaim_thread() tries to acquire hash lock "L" using mutex_enter,
3470  *    fails, and goes to sleep forever.
3471  *
3472  * This possible deadlock is avoided by always acquiring a hash lock
3473  * using mutex_tryenter() from arc_reclaim_thread().
3474  */
3475 static void
3476 arc_reclaim_thread(void)
3477 {
3478         fstrans_cookie_t        cookie = spl_fstrans_mark();
3479         hrtime_t                growtime = 0;
3480         callb_cpr_t             cpr;
3481
3482         CALLB_CPR_INIT(&cpr, &arc_reclaim_lock, callb_generic_cpr, FTAG);
3483
3484         mutex_enter(&arc_reclaim_lock);
3485         while (!arc_reclaim_thread_exit) {
3486                 int64_t to_free;
3487                 int64_t free_memory = arc_available_memory();
3488                 uint64_t evicted = 0;
3489
3490                 arc_tuning_update();
3491
3492                 mutex_exit(&arc_reclaim_lock);
3493
3494                 if (free_memory < 0) {
3495
3496                         arc_no_grow = B_TRUE;
3497                         arc_warm = B_TRUE;
3498
3499                         /*
3500                          * Wait at least zfs_grow_retry (default 5) seconds
3501                          * before considering growing.
3502                          */
3503                         growtime = gethrtime() + SEC2NSEC(arc_grow_retry);
3504
3505                         arc_kmem_reap_now();
3506
3507                         /*
3508                          * If we are still low on memory, shrink the ARC
3509                          * so that we have arc_shrink_min free space.
3510                          */
3511                         free_memory = arc_available_memory();
3512
3513                         to_free = (arc_c >> arc_shrink_shift) - free_memory;
3514                         if (to_free > 0) {
3515 #ifdef _KERNEL
3516                                 to_free = MAX(to_free, arc_need_free);
3517 #endif
3518                                 arc_shrink(to_free);
3519                         }
3520                 } else if (free_memory < arc_c >> arc_no_grow_shift) {
3521                         arc_no_grow = B_TRUE;
3522                 } else if (gethrtime() >= growtime) {
3523                         arc_no_grow = B_FALSE;
3524                 }
3525
3526                 evicted = arc_adjust();
3527
3528                 mutex_enter(&arc_reclaim_lock);
3529
3530                 /*
3531                  * If evicted is zero, we couldn't evict anything via
3532                  * arc_adjust(). This could be due to hash lock
3533                  * collisions, but more likely due to the majority of
3534                  * arc buffers being unevictable. Therefore, even if
3535                  * arc_size is above arc_c, another pass is unlikely to
3536                  * be helpful and could potentially cause us to enter an
3537                  * infinite loop.
3538                  */
3539                 if (arc_size <= arc_c || evicted == 0) {
3540                         /*
3541                          * We're either no longer overflowing, or we
3542                          * can't evict anything more, so we should wake
3543                          * up any threads before we go to sleep and clear
3544                          * arc_need_free since nothing more can be done.
3545                          */
3546                         cv_broadcast(&arc_reclaim_waiters_cv);
3547                         arc_need_free = 0;
3548
3549                         /*
3550                          * Block until signaled, or after one second (we
3551                          * might need to perform arc_kmem_reap_now()
3552                          * even if we aren't being signalled)
3553                          */
3554                         CALLB_CPR_SAFE_BEGIN(&cpr);
3555                         (void) cv_timedwait_sig_hires(&arc_reclaim_thread_cv,
3556                             &arc_reclaim_lock, SEC2NSEC(1), MSEC2NSEC(1), 0);
3557                         CALLB_CPR_SAFE_END(&cpr, &arc_reclaim_lock);
3558                 }
3559         }
3560
3561         arc_reclaim_thread_exit = FALSE;
3562         cv_broadcast(&arc_reclaim_thread_cv);
3563         CALLB_CPR_EXIT(&cpr);           /* drops arc_reclaim_lock */
3564         spl_fstrans_unmark(cookie);
3565         thread_exit();
3566 }
3567
3568 static void
3569 arc_user_evicts_thread(void)
3570 {
3571         fstrans_cookie_t        cookie = spl_fstrans_mark();
3572         callb_cpr_t cpr;
3573
3574         CALLB_CPR_INIT(&cpr, &arc_user_evicts_lock, callb_generic_cpr, FTAG);
3575
3576         mutex_enter(&arc_user_evicts_lock);
3577         while (!arc_user_evicts_thread_exit) {
3578                 mutex_exit(&arc_user_evicts_lock);
3579
3580                 arc_do_user_evicts();
3581
3582                 /*
3583                  * This is necessary in order for the mdb ::arc dcmd to
3584                  * show up to date information. Since the ::arc command
3585                  * does not call the kstat's update function, without
3586                  * this call, the command may show stale stats for the
3587                  * anon, mru, mru_ghost, mfu, and mfu_ghost lists. Even
3588                  * with this change, the data might be up to 1 second
3589                  * out of date; but that should suffice. The arc_state_t
3590                  * structures can be queried directly if more accurate
3591                  * information is needed.
3592                  */
3593                 if (arc_ksp != NULL)
3594                         arc_ksp->ks_update(arc_ksp, KSTAT_READ);
3595
3596                 mutex_enter(&arc_user_evicts_lock);
3597
3598                 /*
3599                  * Block until signaled, or after one second (we need to
3600                  * call the arc's kstat update function regularly).
3601                  */
3602                 CALLB_CPR_SAFE_BEGIN(&cpr);
3603                 (void) cv_timedwait_sig(&arc_user_evicts_cv,
3604                     &arc_user_evicts_lock, ddi_get_lbolt() + hz);
3605                 CALLB_CPR_SAFE_END(&cpr, &arc_user_evicts_lock);
3606         }
3607
3608         arc_user_evicts_thread_exit = FALSE;
3609         cv_broadcast(&arc_user_evicts_cv);
3610         CALLB_CPR_EXIT(&cpr);           /* drops arc_user_evicts_lock */
3611         spl_fstrans_unmark(cookie);
3612         thread_exit();
3613 }
3614
3615 #ifdef _KERNEL
3616 /*
3617  * Determine the amount of memory eligible for eviction contained in the
3618  * ARC. All clean data reported by the ghost lists can always be safely
3619  * evicted. Due to arc_c_min, the same does not hold for all clean data
3620  * contained by the regular mru and mfu lists.
3621  *
3622  * In the case of the regular mru and mfu lists, we need to report as
3623  * much clean data as possible, such that evicting that same reported
3624  * data will not bring arc_size below arc_c_min. Thus, in certain
3625  * circumstances, the total amount of clean data in the mru and mfu
3626  * lists might not actually be evictable.
3627  *
3628  * The following two distinct cases are accounted for:
3629  *
3630  * 1. The sum of the amount of dirty data contained by both the mru and
3631  *    mfu lists, plus the ARC's other accounting (e.g. the anon list),
3632  *    is greater than or equal to arc_c_min.
3633  *    (i.e. amount of dirty data >= arc_c_min)
3634  *
3635  *    This is the easy case; all clean data contained by the mru and mfu
3636  *    lists is evictable. Evicting all clean data can only drop arc_size
3637  *    to the amount of dirty data, which is greater than arc_c_min.
3638  *
3639  * 2. The sum of the amount of dirty data contained by both the mru and
3640  *    mfu lists, plus the ARC's other accounting (e.g. the anon list),
3641  *    is less than arc_c_min.
3642  *    (i.e. arc_c_min > amount of dirty data)
3643  *
3644  *    2.1. arc_size is greater than or equal arc_c_min.
3645  *         (i.e. arc_size >= arc_c_min > amount of dirty data)
3646  *
3647  *         In this case, not all clean data from the regular mru and mfu
3648  *         lists is actually evictable; we must leave enough clean data
3649  *         to keep arc_size above arc_c_min. Thus, the maximum amount of
3650  *         evictable data from the two lists combined, is exactly the
3651  *         difference between arc_size and arc_c_min.
3652  *
3653  *    2.2. arc_size is less than arc_c_min
3654  *         (i.e. arc_c_min > arc_size > amount of dirty data)
3655  *
3656  *         In this case, none of the data contained in the mru and mfu
3657  *         lists is evictable, even if it's clean. Since arc_size is
3658  *         already below arc_c_min, evicting any more would only
3659  *         increase this negative difference.
3660  */
3661 static uint64_t
3662 arc_evictable_memory(void) {
3663         uint64_t arc_clean =
3664             arc_mru->arcs_lsize[ARC_BUFC_DATA] +
3665             arc_mru->arcs_lsize[ARC_BUFC_METADATA] +
3666             arc_mfu->arcs_lsize[ARC_BUFC_DATA] +
3667             arc_mfu->arcs_lsize[ARC_BUFC_METADATA];
3668         uint64_t ghost_clean =
3669             arc_mru_ghost->arcs_lsize[ARC_BUFC_DATA] +
3670             arc_mru_ghost->arcs_lsize[ARC_BUFC_METADATA] +
3671             arc_mfu_ghost->arcs_lsize[ARC_BUFC_DATA] +
3672             arc_mfu_ghost->arcs_lsize[ARC_BUFC_METADATA];
3673         uint64_t arc_dirty = MAX((int64_t)arc_size - (int64_t)arc_clean, 0);
3674
3675         if (arc_dirty >= arc_c_min)
3676                 return (ghost_clean + arc_clean);
3677
3678         return (ghost_clean + MAX((int64_t)arc_size - (int64_t)arc_c_min, 0));
3679 }
3680
3681 /*
3682  * If sc->nr_to_scan is zero, the caller is requesting a query of the
3683  * number of objects which can potentially be freed.  If it is nonzero,
3684  * the request is to free that many objects.
3685  *
3686  * Linux kernels >= 3.12 have the count_objects and scan_objects callbacks
3687  * in struct shrinker and also require the shrinker to return the number
3688  * of objects freed.
3689  *
3690  * Older kernels require the shrinker to return the number of freeable
3691  * objects following the freeing of nr_to_free.
3692  */
3693 static spl_shrinker_t
3694 __arc_shrinker_func(struct shrinker *shrink, struct shrink_control *sc)
3695 {
3696         int64_t pages;
3697
3698         /* The arc is considered warm once reclaim has occurred */
3699         if (unlikely(arc_warm == B_FALSE))
3700                 arc_warm = B_TRUE;
3701
3702         /* Return the potential number of reclaimable pages */
3703         pages = btop((int64_t)arc_evictable_memory());
3704         if (sc->nr_to_scan == 0)
3705                 return (pages);
3706
3707         /* Not allowed to perform filesystem reclaim */
3708         if (!(sc->gfp_mask & __GFP_FS))
3709                 return (SHRINK_STOP);
3710
3711         /* Reclaim in progress */
3712         if (mutex_tryenter(&arc_reclaim_lock) == 0)
3713                 return (SHRINK_STOP);
3714
3715         mutex_exit(&arc_reclaim_lock);
3716
3717         /*
3718          * Evict the requested number of pages by shrinking arc_c the
3719          * requested amount.  If there is nothing left to evict just
3720          * reap whatever we can from the various arc slabs.
3721          */
3722         if (pages > 0) {
3723                 arc_shrink(ptob(sc->nr_to_scan));
3724                 arc_kmem_reap_now();
3725 #ifdef HAVE_SPLIT_SHRINKER_CALLBACK
3726                 pages = MAX(pages - btop(arc_evictable_memory()), 0);
3727 #else
3728                 pages = btop(arc_evictable_memory());
3729 #endif
3730         } else {
3731                 arc_kmem_reap_now();
3732                 pages = SHRINK_STOP;
3733         }
3734
3735         /*
3736          * We've reaped what we can, wake up threads.
3737          */
3738         cv_broadcast(&arc_reclaim_waiters_cv);
3739
3740         /*
3741          * When direct reclaim is observed it usually indicates a rapid
3742          * increase in memory pressure.  This occurs because the kswapd
3743          * threads were unable to asynchronously keep enough free memory
3744          * available.  In this case set arc_no_grow to briefly pause arc
3745          * growth to avoid compounding the memory pressure.
3746          */
3747         if (current_is_kswapd()) {
3748                 ARCSTAT_BUMP(arcstat_memory_indirect_count);
3749         } else {
3750                 arc_no_grow = B_TRUE;
3751                 arc_need_free = ptob(sc->nr_to_scan);
3752                 ARCSTAT_BUMP(arcstat_memory_direct_count);
3753         }
3754
3755         return (pages);
3756 }
3757 SPL_SHRINKER_CALLBACK_WRAPPER(arc_shrinker_func);
3758
3759 SPL_SHRINKER_DECLARE(arc_shrinker, arc_shrinker_func, DEFAULT_SEEKS);
3760 #endif /* _KERNEL */
3761
/*
 * Adapt arc info given the number of bytes we are trying to add and
 * the state that we are coming from.  This function is only called
 * when we are adding new content to the cache.
 */
3767 static void
3768 arc_adapt(int bytes, arc_state_t *state)
3769 {
3770         int mult;
3771         uint64_t arc_p_min = (arc_c >> arc_p_min_shift);
3772         int64_t mrug_size = refcount_count(&arc_mru_ghost->arcs_size);
3773         int64_t mfug_size = refcount_count(&arc_mfu_ghost->arcs_size);
3774
3775         if (state == arc_l2c_only)
3776                 return;
3777
3778         ASSERT(bytes > 0);
3779         /*
3780          * Adapt the target size of the MRU list:
3781          *      - if we just hit in the MRU ghost list, then increase
3782          *        the target size of the MRU list.
3783          *      - if we just hit in the MFU ghost list, then increase
3784          *        the target size of the MFU list by decreasing the
3785          *        target size of the MRU list.
3786          */
3787         if (state == arc_mru_ghost) {
3788                 mult = (mrug_size >= mfug_size) ? 1 : (mfug_size / mrug_size);
3789                 if (!zfs_arc_p_dampener_disable)
3790                         mult = MIN(mult, 10); /* avoid wild arc_p adjustment */
3791
3792                 arc_p = MIN(arc_c - arc_p_min, arc_p + bytes * mult);
3793         } else if (state == arc_mfu_ghost) {
3794                 uint64_t delta;
3795
3796                 mult = (mfug_size >= mrug_size) ? 1 : (mrug_size / mfug_size);
3797                 if (!zfs_arc_p_dampener_disable)
3798                         mult = MIN(mult, 10);
3799
3800                 delta = MIN(bytes * mult, arc_p);
3801                 arc_p = MAX(arc_p_min, arc_p - delta);
3802         }
3803         ASSERT((int64_t)arc_p >= 0);
3804
3805         if (arc_reclaim_needed()) {
3806                 cv_signal(&arc_reclaim_thread_cv);
3807                 return;
3808         }
3809
3810         if (arc_no_grow)
3811                 return;
3812
3813         if (arc_c >= arc_c_max)
3814                 return;
3815
3816         /*
3817          * If we're within (2 * maxblocksize) bytes of the target
3818          * cache size, increment the target cache size
3819          */
3820         ASSERT3U(arc_c, >=, 2ULL << SPA_MAXBLOCKSHIFT);
3821         if (arc_size >= arc_c - (2ULL << SPA_MAXBLOCKSHIFT)) {
3822                 atomic_add_64(&arc_c, (int64_t)bytes);
3823                 if (arc_c > arc_c_max)
3824                         arc_c = arc_c_max;
3825                 else if (state == arc_anon)
3826                         atomic_add_64(&arc_p, (int64_t)bytes);
3827                 if (arc_p > arc_c)
3828                         arc_p = arc_c;
3829         }
3830         ASSERT((int64_t)arc_p >= 0);
3831 }
3832
3833 /*
3834  * Check if arc_size has grown past our upper threshold, determined by
3835  * zfs_arc_overflow_shift.
3836  */
3837 static boolean_t
3838 arc_is_overflowing(void)
3839 {
3840         /* Always allow at least one block of overflow */
3841         uint64_t overflow = MAX(SPA_MAXBLOCKSIZE,
3842             arc_c >> zfs_arc_overflow_shift);
3843
3844         return (arc_size >= arc_c + overflow);
3845 }
3846
3847 /*
3848  * The buffer, supplied as the first argument, needs a data block. If we
3849  * are hitting the hard limit for the cache size, we must sleep, waiting
3850  * for the eviction thread to catch up. If we're past the target size
3851  * but below the hard limit, we'll only signal the reclaim thread and
3852  * continue on.
3853  */
3854 static void
3855 arc_get_data_buf(arc_buf_t *buf)
3856 {
3857         arc_state_t             *state = buf->b_hdr->b_l1hdr.b_state;
3858         uint64_t                size = buf->b_hdr->b_size;
3859         arc_buf_contents_t      type = arc_buf_type(buf->b_hdr);
3860
3861         arc_adapt(size, state);
3862
3863         /*
3864          * If arc_size is currently overflowing, and has grown past our
3865          * upper limit, we must be adding data faster than the evict
3866          * thread can evict. Thus, to ensure we don't compound the
3867          * problem by adding more data and forcing arc_size to grow even
3868          * further past it's target size, we halt and wait for the
3869          * eviction thread to catch up.
3870          *
3871          * It's also possible that the reclaim thread is unable to evict
3872          * enough buffers to get arc_size below the overflow limit (e.g.
3873          * due to buffers being un-evictable, or hash lock collisions).
3874          * In this case, we want to proceed regardless if we're
3875          * overflowing; thus we don't use a while loop here.
3876          */
3877         if (arc_is_overflowing()) {
3878                 mutex_enter(&arc_reclaim_lock);
3879
3880                 /*
3881                  * Now that we've acquired the lock, we may no longer be
3882                  * over the overflow limit, lets check.
3883                  *
3884                  * We're ignoring the case of spurious wake ups. If that
3885                  * were to happen, it'd let this thread consume an ARC
3886                  * buffer before it should have (i.e. before we're under
3887                  * the overflow limit and were signalled by the reclaim
3888                  * thread). As long as that is a rare occurrence, it
3889                  * shouldn't cause any harm.
3890                  */
3891                 if (arc_is_overflowing()) {
3892                         cv_signal(&arc_reclaim_thread_cv);
3893                         cv_wait(&arc_reclaim_waiters_cv, &arc_reclaim_lock);
3894                 }
3895
3896                 mutex_exit(&arc_reclaim_lock);
3897         }
3898
3899         if (type == ARC_BUFC_METADATA) {
3900                 buf->b_data = zio_buf_alloc(size);
3901                 arc_space_consume(size, ARC_SPACE_META);
3902         } else {
3903                 ASSERT(type == ARC_BUFC_DATA);
3904                 buf->b_data = zio_data_buf_alloc(size);
3905                 arc_space_consume(size, ARC_SPACE_DATA);
3906         }
3907
3908         /*
3909          * Update the state size.  Note that ghost states have a
3910          * "ghost size" and so don't need to be updated.
3911          */
3912         if (!GHOST_STATE(buf->b_hdr->b_l1hdr.b_state)) {
3913                 arc_buf_hdr_t *hdr = buf->b_hdr;
3914                 arc_state_t *state = hdr->b_l1hdr.b_state;
3915
3916                 (void) refcount_add_many(&state->arcs_size, size, buf);
3917
3918                 /*
3919                  * If this is reached via arc_read, the link is
3920                  * protected by the hash lock. If reached via
3921                  * arc_buf_alloc, the header should not be accessed by
3922                  * any other thread. And, if reached via arc_read_done,
3923                  * the hash lock will protect it if it's found in the
3924                  * hash table; otherwise no other thread should be
3925                  * trying to [add|remove]_reference it.
3926                  */
3927                 if (multilist_link_active(&hdr->b_l1hdr.b_arc_node)) {
3928                         ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
3929                         atomic_add_64(&hdr->b_l1hdr.b_state->arcs_lsize[type],
3930                             size);
3931                 }
3932                 /*
3933                  * If we are growing the cache, and we are adding anonymous
3934                  * data, and we have outgrown arc_p, update arc_p
3935                  */
3936                 if (arc_size < arc_c && hdr->b_l1hdr.b_state == arc_anon &&
3937                     (refcount_count(&arc_anon->arcs_size) +
3938                     refcount_count(&arc_mru->arcs_size) > arc_p))
3939                         arc_p = MIN(arc_c, arc_p + size);
3940         }
3941 }
3942
3943 /*
3944  * This routine is called whenever a buffer is accessed.
3945  * NOTE: the hash lock is dropped in this function.
3946  */
3947 static void
3948 arc_access(arc_buf_hdr_t *hdr, kmutex_t *hash_lock)
3949 {
3950         clock_t now;
3951
3952         ASSERT(MUTEX_HELD(hash_lock));
3953         ASSERT(HDR_HAS_L1HDR(hdr));
3954
3955         if (hdr->b_l1hdr.b_state == arc_anon) {
3956                 /*
3957                  * This buffer is not in the cache, and does not
3958                  * appear in our "ghost" list.  Add the new buffer
3959                  * to the MRU state.
3960                  */
3961
3962                 ASSERT0(hdr->b_l1hdr.b_arc_access);
3963                 hdr->b_l1hdr.b_arc_access = ddi_get_lbolt();
3964                 DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, hdr);
3965                 arc_change_state(arc_mru, hdr, hash_lock);
3966
3967         } else if (hdr->b_l1hdr.b_state == arc_mru) {
3968                 now = ddi_get_lbolt();
3969
3970                 /*
3971                  * If this buffer is here because of a prefetch, then either:
3972                  * - clear the flag if this is a "referencing" read
3973                  *   (any subsequent access will bump this into the MFU state).
3974                  * or
3975                  * - move the buffer to the head of the list if this is
3976                  *   another prefetch (to make it less likely to be evicted).
3977                  */
3978                 if (HDR_PREFETCH(hdr)) {
3979                         if (refcount_count(&hdr->b_l1hdr.b_refcnt) == 0) {
3980                                 /* link protected by hash lock */
3981                                 ASSERT(multilist_link_active(
3982                                     &hdr->b_l1hdr.b_arc_node));
3983                         } else {
3984                                 hdr->b_flags &= ~ARC_FLAG_PREFETCH;
3985                                 atomic_inc_32(&hdr->b_l1hdr.b_mru_hits);
3986                                 ARCSTAT_BUMP(arcstat_mru_hits);
3987                         }
3988                         hdr->b_l1hdr.b_arc_access = now;
3989                         return;
3990                 }
3991
3992                 /*
3993                  * This buffer has been "accessed" only once so far,
3994                  * but it is still in the cache. Move it to the MFU
3995                  * state.
3996                  */
3997                 if (ddi_time_after(now, hdr->b_l1hdr.b_arc_access +
3998                     ARC_MINTIME)) {
3999                         /*
4000                          * More than 125ms have passed since we
4001                          * instantiated this buffer.  Move it to the
4002                          * most frequently used state.
4003                          */
4004                         hdr->b_l1hdr.b_arc_access = now;
4005                         DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr);
4006                         arc_change_state(arc_mfu, hdr, hash_lock);
4007                 }
4008                 atomic_inc_32(&hdr->b_l1hdr.b_mru_hits);
4009                 ARCSTAT_BUMP(arcstat_mru_hits);
4010         } else if (hdr->b_l1hdr.b_state == arc_mru_ghost) {
4011                 arc_state_t     *new_state;
4012                 /*
4013                  * This buffer has been "accessed" recently, but
4014                  * was evicted from the cache.  Move it to the
4015                  * MFU state.
4016                  */
4017
4018                 if (HDR_PREFETCH(hdr)) {
4019                         new_state = arc_mru;
4020                         if (refcount_count(&hdr->b_l1hdr.b_refcnt) > 0)
4021                                 hdr->b_flags &= ~ARC_FLAG_PREFETCH;
4022                         DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, hdr);
4023                 } else {
4024                         new_state = arc_mfu;
4025                         DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr);
4026                 }
4027
4028                 hdr->b_l1hdr.b_arc_access = ddi_get_lbolt();
4029                 arc_change_state(new_state, hdr, hash_lock);
4030
4031                 atomic_inc_32(&hdr->b_l1hdr.b_mru_ghost_hits);
4032                 ARCSTAT_BUMP(arcstat_mru_ghost_hits);
4033         } else if (hdr->b_l1hdr.b_state == arc_mfu) {
4034                 /*
4035                  * This buffer has been accessed more than once and is
4036                  * still in the cache.  Keep it in the MFU state.
4037                  *
4038                  * NOTE: an add_reference() that occurred when we did
4039                  * the arc_read() will have kicked this off the list.
4040                  * If it was a prefetch, we will explicitly move it to
4041                  * the head of the list now.
4042                  */
4043                 if ((HDR_PREFETCH(hdr)) != 0) {
4044                         ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
4045                         /* link protected by hash_lock */
4046                         ASSERT(multilist_link_active(&hdr->b_l1hdr.b_arc_node));
4047                 }
4048                 atomic_inc_32(&hdr->b_l1hdr.b_mfu_hits);
4049                 ARCSTAT_BUMP(arcstat_mfu_hits);
4050                 hdr->b_l1hdr.b_arc_access = ddi_get_lbolt();
4051         } else if (hdr->b_l1hdr.b_state == arc_mfu_ghost) {
4052                 arc_state_t     *new_state = arc_mfu;
4053                 /*
4054                  * This buffer has been accessed more than once but has
4055                  * been evicted from the cache.  Move it back to the
4056                  * MFU state.
4057                  */
4058
4059                 if (HDR_PREFETCH(hdr)) {
4060                         /*
4061                          * This is a prefetch access...
4062                          * move this block back to the MRU state.
4063                          */
4064                         ASSERT0(refcount_count(&hdr->b_l1hdr.b_refcnt));
4065                         new_state = arc_mru;
4066                 }
4067
4068                 hdr->b_l1hdr.b_arc_access = ddi_get_lbolt();
4069                 DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr);
4070                 arc_change_state(new_state, hdr, hash_lock);
4071
4072                 atomic_inc_32(&hdr->b_l1hdr.b_mfu_ghost_hits);
4073                 ARCSTAT_BUMP(arcstat_mfu_ghost_hits);
4074         } else if (hdr->b_l1hdr.b_state == arc_l2c_only) {
4075                 /*
4076                  * This buffer is on the 2nd Level ARC.
4077                  */
4078
4079                 hdr->b_l1hdr.b_arc_access = ddi_get_lbolt();
4080                 DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr);
4081                 arc_change_state(arc_mfu, hdr, hash_lock);
4082         } else {
4083                 cmn_err(CE_PANIC, "invalid arc state 0x%p",
4084                     hdr->b_l1hdr.b_state);
4085         }
4086 }
4087
4088 /* a generic arc_done_func_t which you can use */
4089 /* ARGSUSED */
4090 void
4091 arc_bcopy_func(zio_t *zio, arc_buf_t *buf, void *arg)
4092 {
4093         if (zio == NULL || zio->io_error == 0)
4094                 bcopy(buf->b_data, arg, buf->b_hdr->b_size);
4095         VERIFY(arc_buf_remove_ref(buf, arg));
4096 }
4097
4098 /* a generic arc_done_func_t */
4099 void
4100 arc_getbuf_func(zio_t *zio, arc_buf_t *buf, void *arg)
4101 {
4102         arc_buf_t **bufp = arg;
4103         if (zio && zio->io_error) {
4104                 VERIFY(arc_buf_remove_ref(buf, arg));
4105                 *bufp = NULL;
4106         } else {
4107                 *bufp = buf;
4108                 ASSERT(buf->b_data);
4109         }
4110 }
4111
4112 static void
4113 arc_read_done(zio_t *zio)
4114 {
4115         arc_buf_hdr_t   *hdr;
4116         arc_buf_t       *buf;
4117         arc_buf_t       *abuf;  /* buffer we're assigning to callback */
4118         kmutex_t        *hash_lock = NULL;
4119         arc_callback_t  *callback_list, *acb;
4120         int             freeable = FALSE;
4121
4122         buf = zio->io_private;
4123         hdr = buf->b_hdr;
4124
4125         /*
4126          * The hdr was inserted into hash-table and removed from lists
4127          * prior to starting I/O.  We should find this header, since
4128          * it's in the hash table, and it should be legit since it's
4129          * not possible to evict it during the I/O.  The only possible
4130          * reason for it not to be found is if we were freed during the
4131          * read.
4132          */
4133         if (HDR_IN_HASH_TABLE(hdr)) {
4134                 arc_buf_hdr_t *found;
4135
4136                 ASSERT3U(hdr->b_birth, ==, BP_PHYSICAL_BIRTH(zio->io_bp));
4137                 ASSERT3U(hdr->b_dva.dva_word[0], ==,
4138                     BP_IDENTITY(zio->io_bp)->dva_word[0]);
4139                 ASSERT3U(hdr->b_dva.dva_word[1], ==,
4140                     BP_IDENTITY(zio->io_bp)->dva_word[1]);
4141
4142                 found = buf_hash_find(hdr->b_spa, zio->io_bp,
4143                     &hash_lock);
4144
4145                 ASSERT((found == NULL && HDR_FREED_IN_READ(hdr) &&
4146                     hash_lock == NULL) ||
4147                     (found == hdr &&
4148                     DVA_EQUAL(&hdr->b_dva, BP_IDENTITY(zio->io_bp))) ||
4149                     (found == hdr && HDR_L2_READING(hdr)));
4150         }
4151
4152         hdr->b_flags &= ~ARC_FLAG_L2_EVICTED;
4153         if (l2arc_noprefetch && HDR_PREFETCH(hdr))
4154                 hdr->b_flags &= ~ARC_FLAG_L2CACHE;
4155
4156         /* byteswap if necessary */
4157         callback_list = hdr->b_l1hdr.b_acb;
4158         ASSERT(callback_list != NULL);
4159         if (BP_SHOULD_BYTESWAP(zio->io_bp) && zio->io_error == 0) {
4160                 dmu_object_byteswap_t bswap =
4161                     DMU_OT_BYTESWAP(BP_GET_TYPE(zio->io_bp));
4162                 if (BP_GET_LEVEL(zio->io_bp) > 0)
4163                     byteswap_uint64_array(buf->b_data, hdr->b_size);
4164                 else
4165                     dmu_ot_byteswap[bswap].ob_func(buf->b_data, hdr->b_size);
4166         }
4167
4168         arc_cksum_compute(buf, B_FALSE);
4169         arc_buf_watch(buf);
4170
4171         if (hash_lock && zio->io_error == 0 &&
4172             hdr->b_l1hdr.b_state == arc_anon) {
4173                 /*
4174                  * Only call arc_access on anonymous buffers.  This is because
4175                  * if we've issued an I/O for an evicted buffer, we've already
4176                  * called arc_access (to prevent any simultaneous readers from
4177                  * getting confused).
4178                  */
4179                 arc_access(hdr, hash_lock);
4180         }
4181
4182         /* create copies of the data buffer for the callers */
4183         abuf = buf;
4184         for (acb = callback_list; acb; acb = acb->acb_next) {
4185                 if (acb->acb_done) {
4186                         if (abuf == NULL) {
4187                                 ARCSTAT_BUMP(arcstat_duplicate_reads);
4188                                 abuf = arc_buf_clone(buf);
4189                         }
4190                         acb->acb_buf = abuf;
4191                         abuf = NULL;
4192                 }
4193         }
4194         hdr->b_l1hdr.b_acb = NULL;
4195         hdr->b_flags &= ~ARC_FLAG_IO_IN_PROGRESS;
4196         ASSERT(!HDR_BUF_AVAILABLE(hdr));
4197         if (abuf == buf) {
4198                 ASSERT(buf->b_efunc == NULL);
4199                 ASSERT(hdr->b_l1hdr.b_datacnt == 1);
4200                 hdr->b_flags |= ARC_FLAG_BUF_AVAILABLE;
4201         }
4202
4203         ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt) ||
4204             callback_list != NULL);
4205
4206         if (zio->io_error != 0) {
4207                 hdr->b_flags |= ARC_FLAG_IO_ERROR;
4208                 if (hdr->b_l1hdr.b_state != arc_anon)
4209                         arc_change_state(arc_anon, hdr, hash_lock);
4210                 if (HDR_IN_HASH_TABLE(hdr))
4211                         buf_hash_remove(hdr);
4212                 freeable = refcount_is_zero(&hdr->b_l1hdr.b_refcnt);
4213         }
4214
4215         /*
4216          * Broadcast before we drop the hash_lock to avoid the possibility
4217          * that the hdr (and hence the cv) might be freed before we get to
4218          * the cv_broadcast().
4219          */
4220         cv_broadcast(&hdr->b_l1hdr.b_cv);
4221
4222         if (hash_lock != NULL) {
4223                 mutex_exit(hash_lock);
4224         } else {
4225                 /*
4226                  * This block was freed while we waited for the read to
4227                  * complete.  It has been removed from the hash table and
4228                  * moved to the anonymous state (so that it won't show up
4229                  * in the cache).
4230                  */
4231                 ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon);
4232                 freeable = refcount_is_zero(&hdr->b_l1hdr.b_refcnt);
4233         }
4234
4235         /* execute each callback and free its structure */
4236         while ((acb = callback_list) != NULL) {
4237                 if (acb->acb_done)
4238                         acb->acb_done(zio, acb->acb_buf, acb->acb_private);
4239
4240                 if (acb->acb_zio_dummy != NULL) {
4241                         acb->acb_zio_dummy->io_error = zio->io_error;
4242                         zio_nowait(acb->acb_zio_dummy);
4243                 }
4244
4245                 callback_list = acb->acb_next;
4246                 kmem_free(acb, sizeof (arc_callback_t));
4247         }
4248
4249         if (freeable)
4250                 arc_hdr_destroy(hdr);
4251 }
4252
4253 /*
4254  * "Read" the block at the specified DVA (in bp) via the
4255  * cache.  If the block is found in the cache, invoke the provided
4256  * callback immediately and return.  Note that the `zio' parameter
4257  * in the callback will be NULL in this case, since no IO was
4258  * required.  If the block is not in the cache pass the read request
4259  * on to the spa with a substitute callback function, so that the
4260  * requested block will be added to the cache.
4261  *
4262  * If a read request arrives for a block that has a read in-progress,
4263  * either wait for the in-progress read to complete (and return the
4264  * results); or, if this is a read with a "done" func, add a record
4265  * to the read to invoke the "done" func when the read completes,
4266  * and return; or just return.
4267  *
4268  * arc_read_done() will invoke all the requested "done" functions
4269  * for readers of this block.
4270  */
4271 int
4272 arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done,
4273     void *private, zio_priority_t priority, int zio_flags,
4274     arc_flags_t *arc_flags, const zbookmark_phys_t *zb)
4275 {
4276         arc_buf_hdr_t *hdr = NULL;
4277         arc_buf_t *buf = NULL;
4278         kmutex_t *hash_lock = NULL;
4279         zio_t *rzio;
4280         uint64_t guid = spa_load_guid(spa);
4281         int rc = 0;
4282
4283         ASSERT(!BP_IS_EMBEDDED(bp) ||
4284             BPE_GET_ETYPE(bp) == BP_EMBEDDED_TYPE_DATA);
4285
4286 top:
4287         if (!BP_IS_EMBEDDED(bp)) {
4288                 /*
4289                  * Embedded BP's have no DVA and require no I/O to "read".
4290                  * Create an anonymous arc buf to back it.
4291                  */
4292                 hdr = buf_hash_find(guid, bp, &hash_lock);
4293         }
4294
4295         if (hdr != NULL && HDR_HAS_L1HDR(hdr) && hdr->b_l1hdr.b_datacnt > 0) {
4296
4297                 *arc_flags |= ARC_FLAG_CACHED;
4298
4299                 if (HDR_IO_IN_PROGRESS(hdr)) {
4300
4301                         if ((hdr->b_flags & ARC_FLAG_PRIO_ASYNC_READ) &&
4302                             priority == ZIO_PRIORITY_SYNC_READ) {
4303                                 /*
4304                                  * This sync read must wait for an
4305                                  * in-progress async read (e.g. a predictive
4306                                  * prefetch).  Async reads are queued
4307                                  * separately at the vdev_queue layer, so
4308                                  * this is a form of priority inversion.
4309                                  * Ideally, we would "inherit" the demand
4310                                  * i/o's priority by moving the i/o from
4311                                  * the async queue to the synchronous queue,
4312                                  * but there is currently no mechanism to do
4313                                  * so.  Track this so that we can evaluate
4314                                  * the magnitude of this potential performance
4315                                  * problem.
4316                                  *
4317                                  * Note that if the prefetch i/o is already
4318                                  * active (has been issued to the device),
4319                                  * the prefetch improved performance, because
4320                                  * we issued it sooner than we would have
4321                                  * without the prefetch.
4322                                  */
4323                                 DTRACE_PROBE1(arc__sync__wait__for__async,
4324                                     arc_buf_hdr_t *, hdr);
4325                                 ARCSTAT_BUMP(arcstat_sync_wait_for_async);
4326                         }
4327                         if (hdr->b_flags & ARC_FLAG_PREDICTIVE_PREFETCH) {
4328                                 hdr->b_flags &= ~ARC_FLAG_PREDICTIVE_PREFETCH;
4329                         }
4330
4331                         if (*arc_flags & ARC_FLAG_WAIT) {
4332                                 cv_wait(&hdr->b_l1hdr.b_cv, hash_lock);
4333                                 mutex_exit(hash_lock);
4334                                 goto top;
4335                         }
4336                         ASSERT(*arc_flags & ARC_FLAG_NOWAIT);
4337
4338                         if (done) {
4339                                 arc_callback_t *acb = NULL;
4340
4341                                 acb = kmem_zalloc(sizeof (arc_callback_t),
4342                                     KM_SLEEP);
4343                                 acb->acb_done = done;
4344                                 acb->acb_private = private;
4345                                 if (pio != NULL)
4346                                         acb->acb_zio_dummy = zio_null(pio,
4347                                             spa, NULL, NULL, NULL, zio_flags);
4348
4349                                 ASSERT(acb->acb_done != NULL);
4350                                 acb->acb_next = hdr->b_l1hdr.b_acb;
4351                                 hdr->b_l1hdr.b_acb = acb;
4352                                 add_reference(hdr, hash_lock, private);
4353                                 mutex_exit(hash_lock);
4354                                 goto out;
4355                         }
4356                         mutex_exit(hash_lock);
4357                         goto out;
4358                 }
4359
4360                 ASSERT(hdr->b_l1hdr.b_state == arc_mru ||
4361                     hdr->b_l1hdr.b_state == arc_mfu);
4362
4363                 if (done) {
4364                         if (hdr->b_flags & ARC_FLAG_PREDICTIVE_PREFETCH) {
4365                                 /*
4366                                  * This is a demand read which does not have to
4367                                  * wait for i/o because we did a predictive
4368                                  * prefetch i/o for it, which has completed.
4369                                  */
4370                                 DTRACE_PROBE1(
4371                                     arc__demand__hit__predictive__prefetch,
4372                                     arc_buf_hdr_t *, hdr);
4373                                 ARCSTAT_BUMP(
4374                                     arcstat_demand_hit_predictive_prefetch);
4375                                 hdr->b_flags &= ~ARC_FLAG_PREDICTIVE_PREFETCH;
4376                         }
4377                         add_reference(hdr, hash_lock, private);
4378                         /*
4379                          * If this block is already in use, create a new
4380                          * copy of the data so that we will be guaranteed
4381                          * that arc_release() will always succeed.
4382                          */
4383                         buf = hdr->b_l1hdr.b_buf;
4384                         ASSERT(buf);
4385                         ASSERT(buf->b_data);
4386                         if (HDR_BUF_AVAILABLE(hdr)) {
4387                                 ASSERT(buf->b_efunc == NULL);
4388                                 hdr->b_flags &= ~ARC_FLAG_BUF_AVAILABLE;
4389                         } else {
4390                                 buf = arc_buf_clone(buf);
4391                         }
4392
4393                 } else if (*arc_flags & ARC_FLAG_PREFETCH &&
4394                     refcount_count(&hdr->b_l1hdr.b_refcnt) == 0) {
4395                         hdr->b_flags |= ARC_FLAG_PREFETCH;
4396                 }
4397                 DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr);
4398                 arc_access(hdr, hash_lock);
4399                 if (*arc_flags & ARC_FLAG_L2CACHE)
4400                         hdr->b_flags |= ARC_FLAG_L2CACHE;
4401                 if (*arc_flags & ARC_FLAG_L2COMPRESS)
4402                         hdr->b_flags |= ARC_FLAG_L2COMPRESS;
4403                 mutex_exit(hash_lock);
4404                 ARCSTAT_BUMP(arcstat_hits);
4405                 ARCSTAT_CONDSTAT(!HDR_PREFETCH(hdr),
4406                     demand, prefetch, !HDR_ISTYPE_METADATA(hdr),
4407                     data, metadata, hits);
4408
4409                 if (done)
4410                         done(NULL, buf, private);
4411         } else {
4412                 uint64_t size = BP_GET_LSIZE(bp);
4413                 arc_callback_t *acb;
4414                 vdev_t *vd = NULL;
4415                 uint64_t addr = 0;
4416                 boolean_t devw = B_FALSE;
4417                 enum zio_compress b_compress = ZIO_COMPRESS_OFF;
4418                 int32_t b_asize = 0;
4419
4420                 /*
4421                  * Gracefully handle a damaged logical block size as a
4422                  * checksum error.
4423                  */
4424                 if (size > spa_maxblocksize(spa)) {
4425                         ASSERT3P(buf, ==, NULL);
4426                         rc = SET_ERROR(ECKSUM);
4427                         goto out;
4428                 }
4429
4430                 if (hdr == NULL) {
4431                         /* this block is not in the cache */
4432                         arc_buf_hdr_t *exists = NULL;
4433                         arc_buf_contents_t type = BP_GET_BUFC_TYPE(bp);
4434                         buf = arc_buf_alloc(spa, size, private, type);
4435                         hdr = buf->b_hdr;
4436                         if (!BP_IS_EMBEDDED(bp)) {
4437                                 hdr->b_dva = *BP_IDENTITY(bp);
4438                                 hdr->b_birth = BP_PHYSICAL_BIRTH(bp);
4439                                 exists = buf_hash_insert(hdr, &hash_lock);
4440                         }
4441                         if (exists != NULL) {
4442                                 /* somebody beat us to the hash insert */
4443                                 mutex_exit(hash_lock);
4444                                 buf_discard_identity(hdr);
4445                                 (void) arc_buf_remove_ref(buf, private);
4446                                 goto top; /* restart the IO request */
4447                         }
4448
4449                         /*
4450                          * If there is a callback, we pass our reference to
4451                          * it; otherwise we remove our reference.
4452                          */
4453                         if (done == NULL) {
4454                                 (void) remove_reference(hdr, hash_lock,
4455                                     private);
4456                         }
4457                         if (*arc_flags & ARC_FLAG_PREFETCH)
4458                                 hdr->b_flags |= ARC_FLAG_PREFETCH;
4459                         if (*arc_flags & ARC_FLAG_L2CACHE)
4460                                 hdr->b_flags |= ARC_FLAG_L2CACHE;
4461                         if (*arc_flags & ARC_FLAG_L2COMPRESS)
4462                                 hdr->b_flags |= ARC_FLAG_L2COMPRESS;
4463                         if (BP_GET_LEVEL(bp) > 0)
4464                                 hdr->b_flags |= ARC_FLAG_INDIRECT;
4465                 } else {
4466                         /*
4467                          * This block is in the ghost cache. If it was L2-only
4468                          * (and thus didn't have an L1 hdr), we realloc the
4469                          * header to add an L1 hdr.
4470                          */
4471                         if (!HDR_HAS_L1HDR(hdr)) {
4472                                 hdr = arc_hdr_realloc(hdr, hdr_l2only_cache,
4473                                     hdr_full_cache);
4474                         }
4475
4476                         ASSERT(GHOST_STATE(hdr->b_l1hdr.b_state));
4477                         ASSERT(!HDR_IO_IN_PROGRESS(hdr));
4478                         ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
4479                         ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);
4480
4481                         /*
4482                          * If there is a callback, we pass a reference to it.
4483                          */
4484                         if (done != NULL)
4485                                 add_reference(hdr, hash_lock, private);
4486                         if (*arc_flags & ARC_FLAG_PREFETCH)
4487                                 hdr->b_flags |= ARC_FLAG_PREFETCH;
4488                         if (*arc_flags & ARC_FLAG_L2CACHE)
4489                                 hdr->b_flags |= ARC_FLAG_L2CACHE;
4490                         if (*arc_flags & ARC_FLAG_L2COMPRESS)
4491                                 hdr->b_flags |= ARC_FLAG_L2COMPRESS;
4492                         buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
4493                         buf->b_hdr = hdr;
4494                         buf->b_data = NULL;
4495                         buf->b_efunc = NULL;
4496                         buf->b_private = NULL;
4497                         buf->b_next = NULL;
4498                         hdr->b_l1hdr.b_buf = buf;
4499                         ASSERT0(hdr->b_l1hdr.b_datacnt);
4500                         hdr->b_l1hdr.b_datacnt = 1;
4501                         arc_get_data_buf(buf);
4502                         arc_access(hdr, hash_lock);
4503                 }
4504
4505                 if (*arc_flags & ARC_FLAG_PREDICTIVE_PREFETCH)
4506                         hdr->b_flags |= ARC_FLAG_PREDICTIVE_PREFETCH;
4507                 ASSERT(!GHOST_STATE(hdr->b_l1hdr.b_state));
4508
4509                 acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP);
4510                 acb->acb_done = done;
4511                 acb->acb_private = private;
4512
4513                 ASSERT(hdr->b_l1hdr.b_acb == NULL);
4514                 hdr->b_l1hdr.b_acb = acb;
4515                 hdr->b_flags |= ARC_FLAG_IO_IN_PROGRESS;
4516
4517                 if (HDR_HAS_L2HDR(hdr) &&
4518                     (vd = hdr->b_l2hdr.b_dev->l2ad_vdev) != NULL) {
4519                         devw = hdr->b_l2hdr.b_dev->l2ad_writing;
4520                         addr = hdr->b_l2hdr.b_daddr;
4521                         b_compress = hdr->b_l2hdr.b_compress;
4522                         b_asize = hdr->b_l2hdr.b_asize;
4523                         /*
4524                          * Lock out device removal.
4525                          */
4526                         if (vdev_is_dead(vd) ||
4527                             !spa_config_tryenter(spa, SCL_L2ARC, vd, RW_READER))
4528                                 vd = NULL;
4529                 }
4530
4531                 if (hash_lock != NULL)
4532                         mutex_exit(hash_lock);
4533
4534                 /*
4535                  * At this point, we have a level 1 cache miss.  Try again in
4536                  * L2ARC if possible.
4537                  */
4538                 ASSERT3U(hdr->b_size, ==, size);
4539                 DTRACE_PROBE4(arc__miss, arc_buf_hdr_t *, hdr, blkptr_t *, bp,
4540                     uint64_t, size, zbookmark_phys_t *, zb);
4541                 ARCSTAT_BUMP(arcstat_misses);
4542                 ARCSTAT_CONDSTAT(!HDR_PREFETCH(hdr),
4543                     demand, prefetch, !HDR_ISTYPE_METADATA(hdr),
4544                     data, metadata, misses);
4545
4546                 if (priority == ZIO_PRIORITY_ASYNC_READ)
4547                         hdr->b_flags |= ARC_FLAG_PRIO_ASYNC_READ;
4548                 else
4549                         hdr->b_flags &= ~ARC_FLAG_PRIO_ASYNC_READ;
4550
4551                 if (vd != NULL && l2arc_ndev != 0 && !(l2arc_norw && devw)) {
4552                         /*
4553                          * Read from the L2ARC if the following are true:
4554                          * 1. The L2ARC vdev was previously cached.
4555                          * 2. This buffer still has L2ARC metadata.
4556                          * 3. This buffer isn't currently writing to the L2ARC.
4557                          * 4. The L2ARC entry wasn't evicted, which may
4558                          *    also have invalidated the vdev.
4559                          * 5. This isn't prefetch and l2arc_noprefetch is set.
4560                          */
4561                         if (HDR_HAS_L2HDR(hdr) &&
4562                             !HDR_L2_WRITING(hdr) && !HDR_L2_EVICTED(hdr) &&
4563                             !(l2arc_noprefetch && HDR_PREFETCH(hdr))) {
4564                                 l2arc_read_callback_t *cb;
4565
4566                                 DTRACE_PROBE1(l2arc__hit, arc_buf_hdr_t *, hdr);
4567                                 ARCSTAT_BUMP(arcstat_l2_hits);
4568                                 atomic_inc_32(&hdr->b_l2hdr.b_hits);
4569
4570                                 cb = kmem_zalloc(sizeof (l2arc_read_callback_t),
4571                                     KM_SLEEP);
4572                                 cb->l2rcb_buf = buf;
4573                                 cb->l2rcb_spa = spa;
4574                                 cb->l2rcb_bp = *bp;
4575                                 cb->l2rcb_zb = *zb;
4576                                 cb->l2rcb_flags = zio_flags;
4577                                 cb->l2rcb_compress = b_compress;
4578
4579                                 ASSERT(addr >= VDEV_LABEL_START_SIZE &&
4580                                     addr + size < vd->vdev_psize -
4581                                     VDEV_LABEL_END_SIZE);
4582
4583                                 /*
4584                                  * l2arc read.  The SCL_L2ARC lock will be
4585                                  * released by l2arc_read_done().
4586                                  * Issue a null zio if the underlying buffer
4587                                  * was squashed to zero size by compression.
4588                                  */
4589                                 if (b_compress == ZIO_COMPRESS_EMPTY) {
4590                                         rzio = zio_null(pio, spa, vd,
4591                                             l2arc_read_done, cb,
4592                                             zio_flags | ZIO_FLAG_DONT_CACHE |
4593                                             ZIO_FLAG_CANFAIL |
4594                                             ZIO_FLAG_DONT_PROPAGATE |
4595                                             ZIO_FLAG_DONT_RETRY);
4596                                 } else {
4597                                         rzio = zio_read_phys(pio, vd, addr,
4598                                             b_asize, buf->b_data,
4599                                             ZIO_CHECKSUM_OFF,
4600                                             l2arc_read_done, cb, priority,
4601                                             zio_flags | ZIO_FLAG_DONT_CACHE |
4602                                             ZIO_FLAG_CANFAIL |
4603                                             ZIO_FLAG_DONT_PROPAGATE |
4604                                             ZIO_FLAG_DONT_RETRY, B_FALSE);
4605                                 }
4606                                 DTRACE_PROBE2(l2arc__read, vdev_t *, vd,
4607                                     zio_t *, rzio);
4608                                 ARCSTAT_INCR(arcstat_l2_read_bytes, b_asize);
4609
4610                                 if (*arc_flags & ARC_FLAG_NOWAIT) {
4611                                         zio_nowait(rzio);
4612                                         goto out;
4613                                 }
4614
4615                                 ASSERT(*arc_flags & ARC_FLAG_WAIT);
4616                                 if (zio_wait(rzio) == 0)
4617                                         goto out;
4618
4619                                 /* l2arc read error; goto zio_read() */
4620                         } else {
4621                                 DTRACE_PROBE1(l2arc__miss,
4622                                     arc_buf_hdr_t *, hdr);
4623                                 ARCSTAT_BUMP(arcstat_l2_misses);
4624                                 if (HDR_L2_WRITING(hdr))
4625                                         ARCSTAT_BUMP(arcstat_l2_rw_clash);
4626                                 spa_config_exit(spa, SCL_L2ARC, vd);
4627                         }
4628                 } else {
4629                         if (vd != NULL)
4630                                 spa_config_exit(spa, SCL_L2ARC, vd);
4631                         if (l2arc_ndev != 0) {
4632                                 DTRACE_PROBE1(l2arc__miss,
4633                                     arc_buf_hdr_t *, hdr);
4634                                 ARCSTAT_BUMP(arcstat_l2_misses);
4635                         }
4636                 }
4637
4638                 rzio = zio_read(pio, spa, bp, buf->b_data, size,
4639                     arc_read_done, buf, priority, zio_flags, zb);
4640
4641                 if (*arc_flags & ARC_FLAG_WAIT) {
4642                         rc = zio_wait(rzio);
4643                         goto out;
4644                 }
4645
4646                 ASSERT(*arc_flags & ARC_FLAG_NOWAIT);
4647                 zio_nowait(rzio);
4648         }
4649
4650 out:
4651         spa_read_history_add(spa, zb, *arc_flags);
4652         return (rc);
4653 }
4654
4655 arc_prune_t *
4656 arc_add_prune_callback(arc_prune_func_t *func, void *private)
4657 {
4658         arc_prune_t *p;
4659
4660         p = kmem_alloc(sizeof (*p), KM_SLEEP);
4661         p->p_pfunc = func;
4662         p->p_private = private;
4663         list_link_init(&p->p_node);
4664         refcount_create(&p->p_refcnt);
4665
4666         mutex_enter(&arc_prune_mtx);
4667         refcount_add(&p->p_refcnt, &arc_prune_list);
4668         list_insert_head(&arc_prune_list, p);
4669         mutex_exit(&arc_prune_mtx);
4670
4671         return (p);
4672 }
4673
4674 void
4675 arc_remove_prune_callback(arc_prune_t *p)
4676 {
4677         boolean_t wait = B_FALSE;
4678         mutex_enter(&arc_prune_mtx);
4679         list_remove(&arc_prune_list, p);
4680         if (refcount_remove(&p->p_refcnt, &arc_prune_list) > 0)
4681                 wait = B_TRUE;
4682         mutex_exit(&arc_prune_mtx);
4683
4684         /* wait for arc_prune_task to finish */
4685         if (wait)
4686                 taskq_wait_outstanding(arc_prune_taskq, 0);
4687         ASSERT0(refcount_count(&p->p_refcnt));
4688         refcount_destroy(&p->p_refcnt);
4689         kmem_free(p, sizeof (*p));
4690 }
4691
4692 void
4693 arc_set_callback(arc_buf_t *buf, arc_evict_func_t *func, void *private)
4694 {
4695         ASSERT(buf->b_hdr != NULL);
4696         ASSERT(buf->b_hdr->b_l1hdr.b_state != arc_anon);
4697         ASSERT(!refcount_is_zero(&buf->b_hdr->b_l1hdr.b_refcnt) ||
4698             func == NULL);
4699         ASSERT(buf->b_efunc == NULL);
4700         ASSERT(!HDR_BUF_AVAILABLE(buf->b_hdr));
4701
4702         buf->b_efunc = func;
4703         buf->b_private = private;
4704 }
4705
4706 /*
4707  * Notify the arc that a block was freed, and thus will never be used again.
4708  */
4709 void
4710 arc_freed(spa_t *spa, const blkptr_t *bp)
4711 {
4712         arc_buf_hdr_t *hdr;
4713         kmutex_t *hash_lock;
4714         uint64_t guid = spa_load_guid(spa);
4715
4716         ASSERT(!BP_IS_EMBEDDED(bp));
4717
4718         hdr = buf_hash_find(guid, bp, &hash_lock);
4719         if (hdr == NULL)
4720                 return;
4721         if (HDR_BUF_AVAILABLE(hdr)) {
4722                 arc_buf_t *buf = hdr->b_l1hdr.b_buf;
4723                 add_reference(hdr, hash_lock, FTAG);
4724                 hdr->b_flags &= ~ARC_FLAG_BUF_AVAILABLE;
4725                 mutex_exit(hash_lock);
4726
4727                 arc_release(buf, FTAG);
4728                 (void) arc_buf_remove_ref(buf, FTAG);
4729         } else {
4730                 mutex_exit(hash_lock);
4731         }
4732
4733 }
4734
4735 /*
4736  * Clear the user eviction callback set by arc_set_callback(), first calling
4737  * it if it exists.  Because the presence of a callback keeps an arc_buf cached
4738  * clearing the callback may result in the arc_buf being destroyed.  However,
4739  * it will not result in the *last* arc_buf being destroyed, hence the data
4740  * will remain cached in the ARC. We make a copy of the arc buffer here so
4741  * that we can process the callback without holding any locks.
4742  *
4743  * It's possible that the callback is already in the process of being cleared
4744  * by another thread.  In this case we can not clear the callback.
4745  *
4746  * Returns B_TRUE if the callback was successfully called and cleared.
4747  */
4748 boolean_t
4749 arc_clear_callback(arc_buf_t *buf)
4750 {
4751         arc_buf_hdr_t *hdr;
4752         kmutex_t *hash_lock;
4753         arc_evict_func_t *efunc = buf->b_efunc;
4754         void *private = buf->b_private;
4755
4756         mutex_enter(&buf->b_evict_lock);
4757         hdr = buf->b_hdr;
4758         if (hdr == NULL) {
4759                 /*
4760                  * We are in arc_do_user_evicts().
4761                  */
4762                 ASSERT(buf->b_data == NULL);
4763                 mutex_exit(&buf->b_evict_lock);
4764                 return (B_FALSE);
4765         } else if (buf->b_data == NULL) {
4766                 /*
4767                  * We are on the eviction list; process this buffer now
4768                  * but let arc_do_user_evicts() do the reaping.
4769                  */
4770                 buf->b_efunc = NULL;
4771                 mutex_exit(&buf->b_evict_lock);
4772                 VERIFY0(efunc(private));
4773                 return (B_TRUE);
4774         }
4775         hash_lock = HDR_LOCK(hdr);
4776         mutex_enter(hash_lock);
4777         hdr = buf->b_hdr;
4778         ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
4779
4780         ASSERT3U(refcount_count(&hdr->b_l1hdr.b_refcnt), <,
4781             hdr->b_l1hdr.b_datacnt);
4782         ASSERT(hdr->b_l1hdr.b_state == arc_mru ||
4783             hdr->b_l1hdr.b_state == arc_mfu);
4784
4785         buf->b_efunc = NULL;
4786         buf->b_private = NULL;
4787
4788         if (hdr->b_l1hdr.b_datacnt > 1) {
4789                 mutex_exit(&buf->b_evict_lock);
4790                 arc_buf_destroy(buf, TRUE);
4791         } else {
4792                 ASSERT(buf == hdr->b_l1hdr.b_buf);
4793                 hdr->b_flags |= ARC_FLAG_BUF_AVAILABLE;
4794                 mutex_exit(&buf->b_evict_lock);
4795         }
4796
4797         mutex_exit(hash_lock);
4798         VERIFY0(efunc(private));
4799         return (B_TRUE);
4800 }
4801
4802 /*
4803  * Release this buffer from the cache, making it an anonymous buffer.  This
4804  * must be done after a read and prior to modifying the buffer contents.
4805  * If the buffer has more than one reference, we must make
4806  * a new hdr for the buffer.
4807  */
4808 void
4809 arc_release(arc_buf_t *buf, void *tag)
4810 {
4811         kmutex_t *hash_lock;
4812         arc_state_t *state;
4813         arc_buf_hdr_t *hdr = buf->b_hdr;
4814
4815         /*
4816          * It would be nice to assert that if its DMU metadata (level >
4817          * 0 || it's the dnode file), then it must be syncing context.
4818          * But we don't know that information at this level.
4819          */
4820
4821         mutex_enter(&buf->b_evict_lock);
4822
4823         ASSERT(HDR_HAS_L1HDR(hdr));
4824
4825         /*
4826          * We don't grab the hash lock prior to this check, because if
4827          * the buffer's header is in the arc_anon state, it won't be
4828          * linked into the hash table.
4829          */
4830         if (hdr->b_l1hdr.b_state == arc_anon) {
4831                 mutex_exit(&buf->b_evict_lock);
4832                 ASSERT(!HDR_IO_IN_PROGRESS(hdr));
4833                 ASSERT(!HDR_IN_HASH_TABLE(hdr));
4834                 ASSERT(!HDR_HAS_L2HDR(hdr));
4835                 ASSERT(BUF_EMPTY(hdr));
4836
4837                 ASSERT3U(hdr->b_l1hdr.b_datacnt, ==, 1);
4838                 ASSERT3S(refcount_count(&hdr->b_l1hdr.b_refcnt), ==, 1);
4839                 ASSERT(!list_link_active(&hdr->b_l1hdr.b_arc_node));
4840
4841                 ASSERT3P(buf->b_efunc, ==, NULL);
4842                 ASSERT3P(buf->b_private, ==, NULL);
4843
4844                 hdr->b_l1hdr.b_arc_access = 0;
4845                 arc_buf_thaw(buf);
4846
4847                 return;
4848         }
4849
4850         hash_lock = HDR_LOCK(hdr);
4851         mutex_enter(hash_lock);
4852
4853         /*
4854          * This assignment is only valid as long as the hash_lock is
4855          * held, we must be careful not to reference state or the
4856          * b_state field after dropping the lock.
4857          */
4858         state = hdr->b_l1hdr.b_state;
4859         ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
4860         ASSERT3P(state, !=, arc_anon);
4861
4862         /* this buffer is not on any list */
4863         ASSERT(refcount_count(&hdr->b_l1hdr.b_refcnt) > 0);
4864
4865         if (HDR_HAS_L2HDR(hdr)) {
4866                 mutex_enter(&hdr->b_l2hdr.b_dev->l2ad_mtx);
4867
4868                 /*
4869                  * We have to recheck this conditional again now that
4870                  * we're holding the l2ad_mtx to prevent a race with
4871                  * another thread which might be concurrently calling
4872                  * l2arc_evict(). In that case, l2arc_evict() might have
4873                  * destroyed the header's L2 portion as we were waiting
4874                  * to acquire the l2ad_mtx.
4875                  */
4876                 if (HDR_HAS_L2HDR(hdr))
4877                         arc_hdr_l2hdr_destroy(hdr);
4878
4879                 mutex_exit(&hdr->b_l2hdr.b_dev->l2ad_mtx);
4880         }
4881
4882         /*
4883          * Do we have more than one buf?
4884          */
4885         if (hdr->b_l1hdr.b_datacnt > 1) {
4886                 arc_buf_hdr_t *nhdr;
4887                 arc_buf_t **bufp;
4888                 uint64_t blksz = hdr->b_size;
4889                 uint64_t spa = hdr->b_spa;
4890                 arc_buf_contents_t type = arc_buf_type(hdr);
4891                 uint32_t flags = hdr->b_flags;
4892
4893                 ASSERT(hdr->b_l1hdr.b_buf != buf || buf->b_next != NULL);
4894                 /*
4895                  * Pull the data off of this hdr and attach it to
4896                  * a new anonymous hdr.
4897                  */
4898                 (void) remove_reference(hdr, hash_lock, tag);
4899                 bufp = &hdr->b_l1hdr.b_buf;
4900                 while (*bufp != buf)
4901                         bufp = &(*bufp)->b_next;
4902                 *bufp = buf->b_next;
4903                 buf->b_next = NULL;
4904
4905                 ASSERT3P(state, !=, arc_l2c_only);
4906
4907                 (void) refcount_remove_many(
4908                     &state->arcs_size, hdr->b_size, buf);
4909
4910                 if (refcount_is_zero(&hdr->b_l1hdr.b_refcnt)) {
4911                         uint64_t *size;
4912
4913                         ASSERT3P(state, !=, arc_l2c_only);
4914                         size = &state->arcs_lsize[type];
4915                         ASSERT3U(*size, >=, hdr->b_size);
4916                         atomic_add_64(size, -hdr->b_size);
4917                 }
4918
4919                 /*
4920                  * We're releasing a duplicate user data buffer, update
4921                  * our statistics accordingly.
4922                  */
4923                 if (HDR_ISTYPE_DATA(hdr)) {
4924                         ARCSTAT_BUMPDOWN(arcstat_duplicate_buffers);
4925                         ARCSTAT_INCR(arcstat_duplicate_buffers_size,
4926                             -hdr->b_size);
4927                 }
4928                 hdr->b_l1hdr.b_datacnt -= 1;
4929                 arc_cksum_verify(buf);
4930                 arc_buf_unwatch(buf);
4931
4932                 mutex_exit(hash_lock);
4933
4934                 nhdr = kmem_cache_alloc(hdr_full_cache, KM_PUSHPAGE);
4935                 nhdr->b_size = blksz;
4936                 nhdr->b_spa = spa;
4937
4938                 nhdr->b_l1hdr.b_mru_hits = 0;
4939                 nhdr->b_l1hdr.b_mru_ghost_hits = 0;
4940                 nhdr->b_l1hdr.b_mfu_hits = 0;
4941                 nhdr->b_l1hdr.b_mfu_ghost_hits = 0;
4942                 nhdr->b_l1hdr.b_l2_hits = 0;
4943                 nhdr->b_flags = flags & ARC_FLAG_L2_WRITING;
4944                 nhdr->b_flags |= arc_bufc_to_flags(type);
4945                 nhdr->b_flags |= ARC_FLAG_HAS_L1HDR;
4946
4947                 nhdr->b_l1hdr.b_buf = buf;
4948                 nhdr->b_l1hdr.b_datacnt = 1;
4949                 nhdr->b_l1hdr.b_state = arc_anon;
4950                 nhdr->b_l1hdr.b_arc_access = 0;
4951                 nhdr->b_l1hdr.b_tmp_cdata = NULL;
4952                 nhdr->b_freeze_cksum = NULL;
4953
4954                 (void) refcount_add(&nhdr->b_l1hdr.b_refcnt, tag);
4955                 buf->b_hdr = nhdr;
4956                 mutex_exit(&buf->b_evict_lock);
4957                 (void) refcount_add_many(&arc_anon->arcs_size, blksz, buf);
4958         } else {
4959                 mutex_exit(&buf->b_evict_lock);
4960                 ASSERT(refcount_count(&hdr->b_l1hdr.b_refcnt) == 1);
4961                 /* protected by hash lock, or hdr is on arc_anon */
4962                 ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node));
4963                 ASSERT(!HDR_IO_IN_PROGRESS(hdr));
4964                 hdr->b_l1hdr.b_mru_hits = 0;
4965                 hdr->b_l1hdr.b_mru_ghost_hits = 0;
4966                 hdr->b_l1hdr.b_mfu_hits = 0;
4967                 hdr->b_l1hdr.b_mfu_ghost_hits = 0;
4968                 hdr->b_l1hdr.b_l2_hits = 0;
4969                 arc_change_state(arc_anon, hdr, hash_lock);
4970                 hdr->b_l1hdr.b_arc_access = 0;
4971                 mutex_exit(hash_lock);
4972
4973                 buf_discard_identity(hdr);
4974                 arc_buf_thaw(buf);
4975         }
4976         buf->b_efunc = NULL;
4977         buf->b_private = NULL;
4978 }
4979
4980 int
4981 arc_released(arc_buf_t *buf)
4982 {
4983         int released;
4984
4985         mutex_enter(&buf->b_evict_lock);
4986         released = (buf->b_data != NULL &&
4987             buf->b_hdr->b_l1hdr.b_state == arc_anon);
4988         mutex_exit(&buf->b_evict_lock);
4989         return (released);
4990 }
4991
#ifdef ZFS_DEBUG
/*
 * Debug-only helper: return the reference count of buf's header, read
 * while holding the buf's b_evict_lock.  A non-zero result means the
 * header is referenced.
 */
int
arc_referenced(arc_buf_t *buf)
{
	int holds;

	mutex_enter(&buf->b_evict_lock);
	holds = (refcount_count(&buf->b_hdr->b_l1hdr.b_refcnt));
	mutex_exit(&buf->b_evict_lock);

	return (holds);
}
#endif /* ZFS_DEBUG */
5004
5005 static void
5006 arc_write_ready(zio_t *zio)
5007 {
5008         arc_write_callback_t *callback = zio->io_private;
5009         arc_buf_t *buf = callback->awcb_buf;
5010         arc_buf_hdr_t *hdr = buf->b_hdr;
5011
5012         ASSERT(HDR_HAS_L1HDR(hdr));
5013         ASSERT(!refcount_is_zero(&buf->b_hdr->b_l1hdr.b_refcnt));
5014         ASSERT(hdr->b_l1hdr.b_datacnt > 0);
5015         callback->awcb_ready(zio, buf, callback->awcb_private);
5016
5017         /*
5018          * If the IO is already in progress, then this is a re-write
5019          * attempt, so we need to thaw and re-compute the cksum.
5020          * It is the responsibility of the callback to handle the
5021          * accounting for any re-write attempt.
5022          */
5023         if (HDR_IO_IN_PROGRESS(hdr)) {
5024                 mutex_enter(&hdr->b_l1hdr.b_freeze_lock);
5025                 if (hdr->b_freeze_cksum != NULL) {
5026                         kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t));
5027                         hdr->b_freeze_cksum = NULL;
5028                 }
5029                 mutex_exit(&hdr->b_l1hdr.b_freeze_lock);
5030         }
5031         arc_cksum_compute(buf, B_FALSE);
5032         hdr->b_flags |= ARC_FLAG_IO_IN_PROGRESS;
5033 }
5034
5035 static void
5036 arc_write_children_ready(zio_t *zio)
5037 {
5038         arc_write_callback_t *callback = zio->io_private;
5039         arc_buf_t *buf = callback->awcb_buf;
5040
5041         callback->awcb_children_ready(zio, buf, callback->awcb_private);
5042 }
5043
5044 /*
5045  * The SPA calls this callback for each physical write that happens on behalf
5046  * of a logical write.  See the comment in dbuf_write_physdone() for details.
5047  */
5048 static void
5049 arc_write_physdone(zio_t *zio)
5050 {
5051         arc_write_callback_t *cb = zio->io_private;
5052         if (cb->awcb_physdone != NULL)
5053                 cb->awcb_physdone(zio, cb->awcb_buf, cb->awcb_private);
5054 }
5055
5056 static void
5057 arc_write_done(zio_t *zio)
5058 {
5059         arc_write_callback_t *callback = zio->io_private;
5060         arc_buf_t *buf = callback->awcb_buf;
5061         arc_buf_hdr_t *hdr = buf->b_hdr;
5062
5063         ASSERT(hdr->b_l1hdr.b_acb == NULL);
5064
5065         if (zio->io_error == 0) {
5066                 if (BP_IS_HOLE(zio->io_bp) || BP_IS_EMBEDDED(zio->io_bp)) {
5067                         buf_discard_identity(hdr);
5068                 } else {
5069                         hdr->b_dva = *BP_IDENTITY(zio->io_bp);
5070                         hdr->b_birth = BP_PHYSICAL_BIRTH(zio->io_bp);
5071                 }
5072         } else {
5073                 ASSERT(BUF_EMPTY(hdr));
5074         }
5075
5076         /*
5077          * If the block to be written was all-zero or compressed enough to be
5078          * embedded in the BP, no write was performed so there will be no
5079          * dva/birth/checksum.  The buffer must therefore remain anonymous
5080          * (and uncached).
5081          */
5082         if (!BUF_EMPTY(hdr)) {
5083                 arc_buf_hdr_t *exists;
5084                 kmutex_t *hash_lock;
5085
5086                 ASSERT(zio->io_error == 0);
5087
5088                 arc_cksum_verify(buf);
5089
5090                 exists = buf_hash_insert(hdr, &hash_lock);
5091                 if (exists != NULL) {
5092                         /*
5093                          * This can only happen if we overwrite for
5094                          * sync-to-convergence, because we remove
5095                          * buffers from the hash table when we arc_free().
5096                          */
5097                         if (zio->io_flags & ZIO_FLAG_IO_REWRITE) {
5098                                 if (!BP_EQUAL(&zio->io_bp_orig, zio->io_bp))
5099                                         panic("bad overwrite, hdr=%p exists=%p",
5100                                             (void *)hdr, (void *)exists);
5101                                 ASSERT(refcount_is_zero(
5102                                     &exists->b_l1hdr.b_refcnt));
5103                                 arc_change_state(arc_anon, exists, hash_lock);
5104                                 mutex_exit(hash_lock);
5105                                 arc_hdr_destroy(exists);
5106                                 exists = buf_hash_insert(hdr, &hash_lock);
5107                                 ASSERT3P(exists, ==, NULL);
5108                         } else if (zio->io_flags & ZIO_FLAG_NOPWRITE) {
5109                                 /* nopwrite */
5110                                 ASSERT(zio->io_prop.zp_nopwrite);
5111                                 if (!BP_EQUAL(&zio->io_bp_orig, zio->io_bp))
5112                                         panic("bad nopwrite, hdr=%p exists=%p",
5113                                             (void *)hdr, (void *)exists);
5114                         } else {
5115                                 /* Dedup */
5116                                 ASSERT(hdr->b_l1hdr.b_datacnt == 1);
5117                                 ASSERT(hdr->b_l1hdr.b_state == arc_anon);
5118                                 ASSERT(BP_GET_DEDUP(zio->io_bp));
5119                                 ASSERT(BP_GET_LEVEL(zio->io_bp) == 0);
5120                         }
5121                 }
5122                 hdr->b_flags &= ~ARC_FLAG_IO_IN_PROGRESS;
5123                 /* if it's not anon, we are doing a scrub */
5124                 if (exists == NULL && hdr->b_l1hdr.b_state == arc_anon)
5125                         arc_access(hdr, hash_lock);
5126                 mutex_exit(hash_lock);
5127         } else {
5128                 hdr->b_flags &= ~ARC_FLAG_IO_IN_PROGRESS;
5129         }
5130
5131         ASSERT(!refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
5132         callback->awcb_done(zio, buf, callback->awcb_private);
5133
5134         kmem_free(callback, sizeof (arc_write_callback_t));
5135 }
5136
5137 zio_t *
5138 arc_write(zio_t *pio, spa_t *spa, uint64_t txg,
5139     blkptr_t *bp, arc_buf_t *buf, boolean_t l2arc, boolean_t l2arc_compress,
5140     const zio_prop_t *zp, arc_done_func_t *ready,
5141     arc_done_func_t *children_ready, arc_done_func_t *physdone,
5142     arc_done_func_t *done, void *private, zio_priority_t priority,
5143     int zio_flags, const zbookmark_phys_t *zb)
5144 {
5145         arc_buf_hdr_t *hdr = buf->b_hdr;
5146         arc_write_callback_t *callback;
5147         zio_t *zio;
5148
5149         ASSERT(ready != NULL);
5150         ASSERT(done != NULL);
5151         ASSERT(!HDR_IO_ERROR(hdr));
5152         ASSERT(!HDR_IO_IN_PROGRESS(hdr));
5153         ASSERT(hdr->b_l1hdr.b_acb == NULL);
5154         ASSERT(hdr->b_l1hdr.b_datacnt > 0);
5155         if (l2arc)
5156                 hdr->b_flags |= ARC_FLAG_L2CACHE;
5157         if (l2arc_compress)
5158                 hdr->b_flags |= ARC_FLAG_L2COMPRESS;
5159         callback = kmem_zalloc(sizeof (arc_write_callback_t), KM_SLEEP);
5160         callback->awcb_ready = ready;
5161         callback->awcb_children_ready = children_ready;
5162         callback->awcb_physdone = physdone;
5163         callback->awcb_done = done;
5164         callback->awcb_private = private;
5165         callback->awcb_buf = buf;
5166
5167         zio = zio_write(pio, spa, txg, bp, buf->b_data, hdr->b_size, zp,
5168             arc_write_ready,
5169             (children_ready != NULL) ? arc_write_children_ready : NULL,
5170             arc_write_physdone, arc_write_done, callback,
5171             priority, zio_flags, zb);
5172
5173         return (zio);
5174 }
5175
5176 static int
5177 arc_memory_throttle(uint64_t reserve, uint64_t txg)
5178 {
5179 #ifdef _KERNEL
5180         uint64_t available_memory = ptob(freemem);
5181         static uint64_t page_load = 0;
5182         static uint64_t last_txg = 0;
5183 #ifdef __linux__
5184         pgcnt_t minfree = btop(arc_sys_free / 4);
5185 #endif
5186
5187         if (freemem > physmem * arc_lotsfree_percent / 100)
5188                 return (0);
5189
5190         if (txg > last_txg) {
5191                 last_txg = txg;
5192                 page_load = 0;
5193         }
5194
5195         /*
5196          * If we are in pageout, we know that memory is already tight,
5197          * the arc is already going to be evicting, so we just want to
5198          * continue to let page writes occur as quickly as possible.
5199          */
5200         if (current_is_kswapd()) {
5201                 if (page_load > MAX(ptob(minfree), available_memory) / 4) {
5202                         DMU_TX_STAT_BUMP(dmu_tx_memory_reclaim);
5203                         return (SET_ERROR(ERESTART));
5204                 }
5205                 /* Note: reserve is inflated, so we deflate */
5206                 page_load += reserve / 8;
5207                 return (0);
5208         } else if (page_load > 0 && arc_reclaim_needed()) {
5209                 /* memory is low, delay before restarting */
5210                 ARCSTAT_INCR(arcstat_memory_throttle_count, 1);
5211                 DMU_TX_STAT_BUMP(dmu_tx_memory_reclaim);
5212                 return (SET_ERROR(EAGAIN));
5213         }
5214         page_load = 0;
5215 #endif
5216         return (0);
5217 }
5218
5219 void
5220 arc_tempreserve_clear(uint64_t reserve)
5221 {
5222         atomic_add_64(&arc_tempreserve, -reserve);
5223         ASSERT((int64_t)arc_tempreserve >= 0);
5224 }
5225
5226 int
5227 arc_tempreserve_space(uint64_t reserve, uint64_t txg)
5228 {
5229         int error;
5230         uint64_t anon_size;
5231
5232         if (!arc_no_grow &&
5233             reserve > arc_c/4 &&
5234             reserve * 4 > (2ULL << SPA_MAXBLOCKSHIFT))
5235                 arc_c = MIN(arc_c_max, reserve * 4);
5236
5237         /*
5238          * Throttle when the calculated memory footprint for the TXG
5239          * exceeds the target ARC size.
5240          */
5241         if (reserve > arc_c) {
5242                 DMU_TX_STAT_BUMP(dmu_tx_memory_reserve);
5243                 return (SET_ERROR(ERESTART));
5244         }
5245
5246         /*
5247          * Don't count loaned bufs as in flight dirty data to prevent long
5248          * network delays from blocking transactions that are ready to be
5249          * assigned to a txg.
5250          */
5251         anon_size = MAX((int64_t)(refcount_count(&arc_anon->arcs_size) -
5252             arc_loaned_bytes), 0);
5253
5254         /*
5255          * Writes will, almost always, require additional memory allocations
5256          * in order to compress/encrypt/etc the data.  We therefore need to
5257          * make sure that there is sufficient available memory for this.
5258          */
5259         error = arc_memory_throttle(reserve, txg);
5260         if (error != 0)
5261                 return (error);
5262
5263         /*
5264          * Throttle writes when the amount of dirty data in the cache
5265          * gets too large.  We try to keep the cache less than half full
5266          * of dirty blocks so that our sync times don't grow too large.
5267          * Note: if two requests come in concurrently, we might let them
5268          * both succeed, when one of them should fail.  Not a huge deal.
5269          */
5270
5271         if (reserve + arc_tempreserve + anon_size > arc_c / 2 &&
5272             anon_size > arc_c / 4) {
5273                 dprintf("failing, arc_tempreserve=%lluK anon_meta=%lluK "
5274                     "anon_data=%lluK tempreserve=%lluK arc_c=%lluK\n",
5275                     arc_tempreserve>>10,
5276                     arc_anon->arcs_lsize[ARC_BUFC_METADATA]>>10,
5277                     arc_anon->arcs_lsize[ARC_BUFC_DATA]>>10,
5278                     reserve>>10, arc_c>>10);
5279                 DMU_TX_STAT_BUMP(dmu_tx_dirty_throttle);
5280                 return (SET_ERROR(ERESTART));
5281         }
5282         atomic_add_64(&arc_tempreserve, reserve);
5283         return (0);
5284 }
5285
5286 static void
5287 arc_kstat_update_state(arc_state_t *state, kstat_named_t *size,
5288     kstat_named_t *evict_data, kstat_named_t *evict_metadata)
5289 {
5290         size->value.ui64 = refcount_count(&state->arcs_size);
5291         evict_data->value.ui64 = state->arcs_lsize[ARC_BUFC_DATA];
5292         evict_metadata->value.ui64 = state->arcs_lsize[ARC_BUFC_METADATA];
5293 }
5294
5295 static int
5296 arc_kstat_update(kstat_t *ksp, int rw)
5297 {
5298         arc_stats_t *as = ksp->ks_data;
5299
5300         if (rw == KSTAT_WRITE) {
5301                 return (EACCES);
5302         } else {
5303                 arc_kstat_update_state(arc_anon,
5304                     &as->arcstat_anon_size,
5305                     &as->arcstat_anon_evictable_data,
5306                     &as->arcstat_anon_evictable_metadata);
5307                 arc_kstat_update_state(arc_mru,
5308                     &as->arcstat_mru_size,
5309                     &as->arcstat_mru_evictable_data,
5310                     &as->arcstat_mru_evictable_metadata);
5311                 arc_kstat_update_state(arc_mru_ghost,
5312                     &as->arcstat_mru_ghost_size,
5313                     &as->arcstat_mru_ghost_evictable_data,
5314                     &as->arcstat_mru_ghost_evictable_metadata);
5315                 arc_kstat_update_state(arc_mfu,
5316                     &as->arcstat_mfu_size,
5317                     &as->arcstat_mfu_evictable_data,
5318                     &as->arcstat_mfu_evictable_metadata);
5319                 arc_kstat_update_state(arc_mfu_ghost,
5320                     &as->arcstat_mfu_ghost_size,
5321                     &as->arcstat_mfu_ghost_evictable_data,
5322                     &as->arcstat_mfu_ghost_evictable_metadata);
5323         }
5324
5325         return (0);
5326 }
5327
5328 /*
5329  * This function *must* return indices evenly distributed between all
5330  * sublists of the multilist. This is needed due to how the ARC eviction
5331  * code is laid out; arc_evict_state() assumes ARC buffers are evenly
5332  * distributed between all sublists and uses this assumption when
5333  * deciding which sublist to evict from and how much to evict from it.
5334  */
5335 unsigned int
5336 arc_state_multilist_index_func(multilist_t *ml, void *obj)
5337 {
5338         arc_buf_hdr_t *hdr = obj;
5339
5340         /*
5341          * We rely on b_dva to generate evenly distributed index
5342          * numbers using buf_hash below. So, as an added precaution,
5343          * let's make sure we never add empty buffers to the arc lists.
5344          */
5345         ASSERT(!BUF_EMPTY(hdr));
5346
5347         /*
5348          * The assumption here, is the hash value for a given
5349          * arc_buf_hdr_t will remain constant throughout its lifetime
5350          * (i.e. its b_spa, b_dva, and b_birth fields don't change).
5351          * Thus, we don't need to store the header's sublist index
5352          * on insertion, as this index can be recalculated on removal.
5353          *
5354          * Also, the low order bits of the hash value are thought to be
5355          * distributed evenly. Otherwise, in the case that the multilist
5356          * has a power of two number of sublists, each sublists' usage
5357          * would not be evenly distributed.
5358          */
5359         return (buf_hash(hdr->b_spa, &hdr->b_dva, hdr->b_birth) %
5360             multilist_get_num_sublists(ml));
5361 }
5362
5363 /*
5364  * Called during module initialization and periodically thereafter to
5365  * apply reasonable changes to the exposed performance tunings.  Non-zero
5366  * zfs_* values which differ from the currently set values will be applied.
5367  */
5368 static void
5369 arc_tuning_update(void)
5370 {
5371         uint64_t percent;
5372         /* Valid range: 64M - <all physical memory> */
5373         if ((zfs_arc_max) && (zfs_arc_max != arc_c_max) &&
5374             (zfs_arc_max > 64 << 20) && (zfs_arc_max < ptob(physmem)) &&
5375             (zfs_arc_max > arc_c_min)) {
5376                 arc_c_max = zfs_arc_max;
5377                 arc_c = arc_c_max;
5378                 arc_p = (arc_c >> 1);
5379                 /* Valid range of arc_meta_limit: arc_meta_min - arc_c_max */
5380                 percent = MIN(zfs_arc_meta_limit_percent, 100);
5381                 arc_meta_limit = MAX(arc_meta_min, (percent * arc_c_max) / 100);
5382                 percent = MIN(zfs_arc_dnode_limit_percent, 100);
5383                 arc_dnode_limit = (percent * arc_meta_limit) / 100;
5384         }
5385
5386         /* Valid range: 32M - <arc_c_max> */
5387         if ((zfs_arc_min) && (zfs_arc_min != arc_c_min) &&
5388             (zfs_arc_min >= 2ULL << SPA_MAXBLOCKSHIFT) &&
5389             (zfs_arc_min <= arc_c_max)) {
5390                 arc_c_min = zfs_arc_min;
5391                 arc_c = MAX(arc_c, arc_c_min);
5392         }
5393
5394         /* Valid range: 16M - <arc_c_max> */
5395         if ((zfs_arc_meta_min) && (zfs_arc_meta_min != arc_meta_min) &&
5396             (zfs_arc_meta_min >= 1ULL << SPA_MAXBLOCKSHIFT) &&
5397             (zfs_arc_meta_min <= arc_c_max)) {
5398                 arc_meta_min = zfs_arc_meta_min;
5399                 arc_meta_limit = MAX(arc_meta_limit, arc_meta_min);
5400                 arc_dnode_limit = arc_meta_limit / 10;
5401         }
5402
5403         /* Valid range: <arc_meta_min> - <arc_c_max> */
5404         if ((zfs_arc_meta_limit) && (zfs_arc_meta_limit != arc_meta_limit) &&
5405             (zfs_arc_meta_limit >= zfs_arc_meta_min) &&
5406             (zfs_arc_meta_limit <= arc_c_max))
5407                 arc_meta_limit = zfs_arc_meta_limit;
5408
5409         /* Valid range: <arc_meta_min> - <arc_c_max> */
5410         if ((zfs_arc_dnode_limit) && (zfs_arc_dnode_limit != arc_dnode_limit) &&
5411             (zfs_arc_dnode_limit >= zfs_arc_meta_min) &&
5412             (zfs_arc_dnode_limit <= arc_c_max))
5413                 arc_dnode_limit = zfs_arc_dnode_limit;
5414
5415         /* Valid range: 1 - N */
5416         if (zfs_arc_grow_retry)
5417                 arc_grow_retry = zfs_arc_grow_retry;
5418
5419         /* Valid range: 1 - N */
5420         if (zfs_arc_shrink_shift) {
5421                 arc_shrink_shift = zfs_arc_shrink_shift;
5422                 arc_no_grow_shift = MIN(arc_no_grow_shift, arc_shrink_shift -1);
5423         }
5424
5425         /* Valid range: 1 - N */
5426         if (zfs_arc_p_min_shift)
5427                 arc_p_min_shift = zfs_arc_p_min_shift;
5428
5429         /* Valid range: 1 - N ticks */
5430         if (zfs_arc_min_prefetch_lifespan)
5431                 arc_min_prefetch_lifespan = zfs_arc_min_prefetch_lifespan;
5432
5433         /* Valid range: 0 - 100 */
5434         if ((zfs_arc_lotsfree_percent >= 0) &&
5435             (zfs_arc_lotsfree_percent <= 100))
5436                 arc_lotsfree_percent = zfs_arc_lotsfree_percent;
5437
5438         /* Valid range: 0 - <all physical memory> */
5439         if ((zfs_arc_sys_free) && (zfs_arc_sys_free != arc_sys_free))
5440                 arc_sys_free = MIN(MAX(zfs_arc_sys_free, 0), ptob(physmem));
5441
5442 }
5443
/*
 * Initialize the ARC.
 *
 * Sets up the reclaim and user-evicts locks and condition variables,
 * computes the initial cache size targets and metadata limits, applies
 * the user tunings, creates the per-state multilists and size
 * refcounts, calls buf_init(), creates the prune taskq, installs the
 * "arcstats" kstat, starts the reclaim and user-evicts threads, and
 * finally derives the default dirty data limits when they were not set
 * explicitly.
 */
void
arc_init(void)
{
        /*
         * allmem is "all memory that we could possibly use".
         */
#ifdef _KERNEL
        uint64_t allmem = ptob(physmem);
#else
        uint64_t allmem = (physmem * PAGESIZE) / 2;
#endif
        uint64_t percent;

        mutex_init(&arc_reclaim_lock, NULL, MUTEX_DEFAULT, NULL);
        cv_init(&arc_reclaim_thread_cv, NULL, CV_DEFAULT, NULL);
        cv_init(&arc_reclaim_waiters_cv, NULL, CV_DEFAULT, NULL);

        mutex_init(&arc_user_evicts_lock, NULL, MUTEX_DEFAULT, NULL);
        cv_init(&arc_user_evicts_cv, NULL, CV_DEFAULT, NULL);

        /* Convert seconds to clock ticks */
        arc_min_prefetch_lifespan = 1 * hz;

        /*
         * Start out with 1/8 of all memory.  Note that arc_c is
         * unconditionally reset to arc_c_max further down.
         */
        arc_c = allmem / 8;

#ifdef _KERNEL
        /*
         * On architectures where the physical memory can be larger
         * than the addressable space (intel in 32-bit mode), we may
         * need to limit the cache to 1/8 of VM size.
         */
        arc_c = MIN(arc_c, vmem_size(heap_arena, VMEM_ALLOC | VMEM_FREE) / 8);

        /*
         * Register a shrinker to support synchronous (direct) memory
         * reclaim from the arc.  This is done to prevent kswapd from
         * swapping out pages when it is preferable to shrink the arc.
         */
        spl_register_shrinker(&arc_shrinker);

        /* Set to 1/64 of all memory or a minimum of 512K */
        arc_sys_free = MAX(ptob(physmem / 64), (512 * 1024));
        arc_need_free = 0;
#endif

        /* Set max to 1/2 of all memory */
        arc_c_max = allmem / 2;

        /*
         * In userland, there's only the memory pressure that we artificially
         * create (see arc_available_memory()).  Don't let arc_c get too
         * small, because it can cause transactions to be larger than
         * arc_c, causing arc_tempreserve_space() to fail.
         */
#ifndef _KERNEL
        arc_c_min = MAX(arc_c_max / 2, 2ULL << SPA_MAXBLOCKSHIFT);
#else
        arc_c_min = 2ULL << SPA_MAXBLOCKSHIFT;
#endif

        /* The target size starts at the maximum, split evenly by arc_p. */
        arc_c = arc_c_max;
        arc_p = (arc_c >> 1);

        /*
         * Set the metadata minimum to 1ULL << SPA_MAXBLOCKSHIFT, which is
         * half of the kernel arc_c_min set above.
         */
        arc_meta_min = 1ULL << SPA_MAXBLOCKSHIFT;
        /* Initialize maximum observed usage to zero */
        arc_meta_max = 0;
        /*
         * Set arc_meta_limit to a percent of arc_c_max with a floor of
         * arc_meta_min, and a ceiling of arc_c_max.  arc_dnode_limit is
         * in turn a percent of arc_meta_limit.
         */
        percent = MIN(zfs_arc_meta_limit_percent, 100);
        arc_meta_limit = MAX(arc_meta_min, (percent * arc_c_max) / 100);
        percent = MIN(zfs_arc_dnode_limit_percent, 100);
        arc_dnode_limit = (percent * arc_meta_limit) / 100;

        /* Apply user specified tunings */
        arc_tuning_update();

        /* Default the number of sublists per state to the CPU count. */
        if (zfs_arc_num_sublists_per_state < 1)
                zfs_arc_num_sublists_per_state = MAX(boot_ncpus, 1);

        /* if kmem_flags are set, lets try to use less memory */
        if (kmem_debugging())
                arc_c = arc_c / 2;
        if (arc_c < arc_c_min)
                arc_c = arc_c_min;

        /* Point the state handles at their backing state objects. */
        arc_anon = &ARC_anon;
        arc_mru = &ARC_mru;
        arc_mru_ghost = &ARC_mru_ghost;
        arc_mfu = &ARC_mfu;
        arc_mfu_ghost = &ARC_mfu_ghost;
        arc_l2c_only = &ARC_l2c_only;
        arc_size = 0;

        /*
         * Create a metadata and a data multilist for every state except
         * arc_anon.  All of them link headers through b_l1hdr.b_arc_node
         * and share the same sublist count and index function.
         */
        multilist_create(&arc_mru->arcs_list[ARC_BUFC_METADATA],
            sizeof (arc_buf_hdr_t),
            offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
            zfs_arc_num_sublists_per_state, arc_state_multilist_index_func);
        multilist_create(&arc_mru->arcs_list[ARC_BUFC_DATA],
            sizeof (arc_buf_hdr_t),
            offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
            zfs_arc_num_sublists_per_state, arc_state_multilist_index_func);
        multilist_create(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA],
            sizeof (arc_buf_hdr_t),
            offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
            zfs_arc_num_sublists_per_state, arc_state_multilist_index_func);
        multilist_create(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA],
            sizeof (arc_buf_hdr_t),
            offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
            zfs_arc_num_sublists_per_state, arc_state_multilist_index_func);
        multilist_create(&arc_mfu->arcs_list[ARC_BUFC_METADATA],
            sizeof (arc_buf_hdr_t),
            offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
            zfs_arc_num_sublists_per_state, arc_state_multilist_index_func);
        multilist_create(&arc_mfu->arcs_list[ARC_BUFC_DATA],
            sizeof (arc_buf_hdr_t),
            offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
            zfs_arc_num_sublists_per_state, arc_state_multilist_index_func);
        multilist_create(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA],
            sizeof (arc_buf_hdr_t),
            offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
            zfs_arc_num_sublists_per_state, arc_state_multilist_index_func);
        multilist_create(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA],
            sizeof (arc_buf_hdr_t),
            offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
            zfs_arc_num_sublists_per_state, arc_state_multilist_index_func);
        multilist_create(&arc_l2c_only->arcs_list[ARC_BUFC_METADATA],
            sizeof (arc_buf_hdr_t),
            offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
            zfs_arc_num_sublists_per_state, arc_state_multilist_index_func);
        multilist_create(&arc_l2c_only->arcs_list[ARC_BUFC_DATA],
            sizeof (arc_buf_hdr_t),
            offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
            zfs_arc_num_sublists_per_state, arc_state_multilist_index_func);

        /* Tag every state with its type. */
        arc_anon->arcs_state = ARC_STATE_ANON;
        arc_mru->arcs_state = ARC_STATE_MRU;
        arc_mru_ghost->arcs_state = ARC_STATE_MRU_GHOST;
        arc_mfu->arcs_state = ARC_STATE_MFU;
        arc_mfu_ghost->arcs_state = ARC_STATE_MFU_GHOST;
        arc_l2c_only->arcs_state = ARC_STATE_L2C_ONLY;

        /* Per-state size accounting. */
        refcount_create(&arc_anon->arcs_size);
        refcount_create(&arc_mru->arcs_size);
        refcount_create(&arc_mru_ghost->arcs_size);
        refcount_create(&arc_mfu->arcs_size);
        refcount_create(&arc_mfu_ghost->arcs_size);
        refcount_create(&arc_l2c_only->arcs_size);

        buf_init();

        /* Prune and eviction bookkeeping. */
        arc_reclaim_thread_exit = FALSE;
        arc_user_evicts_thread_exit = FALSE;
        list_create(&arc_prune_list, sizeof (arc_prune_t),
            offsetof(arc_prune_t, p_node));
        arc_eviction_list = NULL;
        mutex_init(&arc_prune_mtx, NULL, MUTEX_DEFAULT, NULL);
        bzero(&arc_eviction_hdr, sizeof (arc_buf_hdr_t));

        arc_prune_taskq = taskq_create("arc_prune", max_ncpus, defclsyspri,
            max_ncpus, INT_MAX, TASKQ_PREPOPULATE | TASKQ_DYNAMIC);

        /* Export arc_stats as the "arcstats" kstat, if it can be created. */
        arc_ksp = kstat_create("zfs", 0, "arcstats", "misc", KSTAT_TYPE_NAMED,
            sizeof (arc_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL);

        if (arc_ksp != NULL) {
                arc_ksp->ks_data = &arc_stats;
                arc_ksp->ks_update = arc_kstat_update;
                kstat_install(arc_ksp);
        }

        /* Start the background threads; arc_fini() asks them to exit. */
        (void) thread_create(NULL, 0, arc_reclaim_thread, NULL, 0, &p0,
            TS_RUN, defclsyspri);

        (void) thread_create(NULL, 0, arc_user_evicts_thread, NULL, 0, &p0,
            TS_RUN, defclsyspri);

        arc_dead = FALSE;
        arc_warm = B_FALSE;

        /*
         * Calculate maximum amount of dirty data per pool.
         *
         * If it has been set by a module parameter, take that.
         * Otherwise, use a percentage of physical memory defined by
         * zfs_dirty_data_max_percent (default 10%) with a cap at
         * zfs_dirty_data_max_max (default 25% of physical memory).
         */
        if (zfs_dirty_data_max_max == 0)
                zfs_dirty_data_max_max = (uint64_t)physmem * PAGESIZE *
                    zfs_dirty_data_max_max_percent / 100;

        if (zfs_dirty_data_max == 0) {
                zfs_dirty_data_max = (uint64_t)physmem * PAGESIZE *
                    zfs_dirty_data_max_percent / 100;
                zfs_dirty_data_max = MIN(zfs_dirty_data_max,
                    zfs_dirty_data_max_max);
        }
}
5646
/*
 * Tear down the ARC.
 *
 * Unregisters the shrinker (kernel only), asks the reclaim and
 * user-evicts threads to exit and waits for each of them, flushes all
 * buffers, removes the kstat, drains and destroys the prune taskq,
 * frees any entries left on the prune list, and then destroys the
 * locks, condition variables, size refcounts and multilists before
 * calling buf_fini().
 */
void
arc_fini(void)
{
        arc_prune_t *p;

#ifdef _KERNEL
        spl_unregister_shrinker(&arc_shrinker);
#endif /* _KERNEL */

        mutex_enter(&arc_reclaim_lock);
        arc_reclaim_thread_exit = TRUE;
        /*
         * The reclaim thread will set arc_reclaim_thread_exit back to
         * FALSE when it is finished exiting; we're waiting for that.
         */
        while (arc_reclaim_thread_exit) {
                cv_signal(&arc_reclaim_thread_cv);
                cv_wait(&arc_reclaim_thread_cv, &arc_reclaim_lock);
        }
        mutex_exit(&arc_reclaim_lock);

        mutex_enter(&arc_user_evicts_lock);
        arc_user_evicts_thread_exit = TRUE;
        /*
         * The user evicts thread will set arc_user_evicts_thread_exit
         * to FALSE when it is finished exiting; we're waiting for that.
         */
        while (arc_user_evicts_thread_exit) {
                cv_signal(&arc_user_evicts_cv);
                cv_wait(&arc_user_evicts_cv, &arc_user_evicts_lock);
        }
        mutex_exit(&arc_user_evicts_lock);

        /* Use TRUE to ensure *all* buffers are evicted */
        arc_flush(NULL, TRUE);

        arc_dead = TRUE;

        /* arc_ksp is NULL when kstat_create() failed in arc_init(). */
        if (arc_ksp != NULL) {
                kstat_delete(arc_ksp);
                arc_ksp = NULL;
        }

        /* Let outstanding prune tasks finish before the taskq goes away. */
        taskq_wait(arc_prune_taskq);
        taskq_destroy(arc_prune_taskq);

        /*
         * Free every callback still on the prune list, dropping the
         * reference held on behalf of the list first.
         */
        mutex_enter(&arc_prune_mtx);
        while ((p = list_head(&arc_prune_list)) != NULL) {
                list_remove(&arc_prune_list, p);
                refcount_remove(&p->p_refcnt, &arc_prune_list);
                refcount_destroy(&p->p_refcnt);
                kmem_free(p, sizeof (*p));
        }
        mutex_exit(&arc_prune_mtx);

        list_destroy(&arc_prune_list);
        mutex_destroy(&arc_prune_mtx);
        mutex_destroy(&arc_reclaim_lock);
        cv_destroy(&arc_reclaim_thread_cv);
        cv_destroy(&arc_reclaim_waiters_cv);

        mutex_destroy(&arc_user_evicts_lock);
        cv_destroy(&arc_user_evicts_cv);

        refcount_destroy(&arc_anon->arcs_size);
        refcount_destroy(&arc_mru->arcs_size);
        refcount_destroy(&arc_mru_ghost->arcs_size);
        refcount_destroy(&arc_mfu->arcs_size);
        refcount_destroy(&arc_mfu_ghost->arcs_size);
        refcount_destroy(&arc_l2c_only->arcs_size);

        /* No multilists are destroyed for arc_anon here. */
        multilist_destroy(&arc_mru->arcs_list[ARC_BUFC_METADATA]);
        multilist_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA]);
        multilist_destroy(&arc_mfu->arcs_list[ARC_BUFC_METADATA]);
        multilist_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA]);
        multilist_destroy(&arc_mru->arcs_list[ARC_BUFC_DATA]);
        multilist_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA]);
        multilist_destroy(&arc_mfu->arcs_list[ARC_BUFC_DATA]);
        multilist_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA]);
        multilist_destroy(&arc_l2c_only->arcs_list[ARC_BUFC_METADATA]);
        multilist_destroy(&arc_l2c_only->arcs_list[ARC_BUFC_DATA]);

        buf_fini();

        /* Every loaned buffer must have been returned by now. */
        ASSERT0(arc_loaned_bytes);
}
5733
5734 /*
5735  * Level 2 ARC
5736  *
5737  * The level 2 ARC (L2ARC) is a cache layer in-between main memory and disk.
5738  * It uses dedicated storage devices to hold cached data, which are populated
5739  * using large infrequent writes.  The main role of this cache is to boost
5740  * the performance of random read workloads.  The intended L2ARC devices
5741  * include short-stroked disks, solid state disks, and other media with
5742  * substantially faster read latency than disk.
5743  *
5744  *                 +-----------------------+
5745  *                 |         ARC           |
5746  *                 +-----------------------+
5747  *                    |         ^     ^
5748  *                    |         |     |
5749  *      l2arc_feed_thread()    arc_read()
5750  *                    |         |     |
5751  *                    |  l2arc read   |
5752  *                    V         |     |
5753  *               +---------------+    |
5754  *               |     L2ARC     |    |
5755  *               +---------------+    |
5756  *                   |    ^           |
5757  *          l2arc_write() |           |
5758  *                   |    |           |
5759  *                   V    |           |
5760  *                 +-------+      +-------+
5761  *                 | vdev  |      | vdev  |
5762  *                 | cache |      | cache |
5763  *                 +-------+      +-------+
5764  *                 +=========+     .-----.
5765  *                 :  L2ARC  :    |-_____-|
5766  *                 : devices :    | Disks |
5767  *                 +=========+    `-_____-'
5768  *
5769  * Read requests are satisfied from the following sources, in order:
5770  *
5771  *      1) ARC
5772  *      2) vdev cache of L2ARC devices
5773  *      3) L2ARC devices
5774  *      4) vdev cache of disks
5775  *      5) disks
5776  *
 * Some L2ARC device types exhibit extremely slow write performance.
 * To accommodate this, there are some significant differences between
 * the L2ARC and a traditional cache design:
5780  *
5781  * 1. There is no eviction path from the ARC to the L2ARC.  Evictions from
5782  * the ARC behave as usual, freeing buffers and placing headers on ghost
5783  * lists.  The ARC does not send buffers to the L2ARC during eviction as
5784  * this would add inflated write latencies for all ARC memory pressure.
5785  *
5786  * 2. The L2ARC attempts to cache data from the ARC before it is evicted.
5787  * It does this by periodically scanning buffers from the eviction-end of
5788  * the MFU and MRU ARC lists, copying them to the L2ARC devices if they are
5789  * not already there. It scans until a headroom of buffers is satisfied,
5790  * which itself is a buffer for ARC eviction. If a compressible buffer is
5791  * found during scanning and selected for writing to an L2ARC device, we
5792  * temporarily boost scanning headroom during the next scan cycle to make
5793  * sure we adapt to compression effects (which might significantly reduce
5794  * the data volume we write to L2ARC). The thread that does this is
5795  * l2arc_feed_thread(), illustrated below; example sizes are included to
5796  * provide a better sense of ratio than this diagram:
5797  *
5798  *             head -->                        tail
5799  *              +---------------------+----------+
5800  *      ARC_mfu |:::::#:::::::::::::::|o#o###o###|-->.   # already on L2ARC
5801  *              +---------------------+----------+   |   o L2ARC eligible
5802  *      ARC_mru |:#:::::::::::::::::::|#o#ooo####|-->|   : ARC buffer
5803  *              +---------------------+----------+   |
5804  *                   15.9 Gbytes      ^ 32 Mbytes    |
5805  *                                 headroom          |
5806  *                                            l2arc_feed_thread()
5807  *                                                   |
5808  *                       l2arc write hand <--[oooo]--'
5809  *                               |           8 Mbyte
5810  *                               |          write max
5811  *                               V
5812  *                +==============================+
5813  *      L2ARC dev |####|#|###|###|    |####| ... |
5814  *                +==============================+
5815  *                           32 Gbytes
5816  *
5817  * 3. If an ARC buffer is copied to the L2ARC but then hit instead of
5818  * evicted, then the L2ARC has cached a buffer much sooner than it probably
5819  * needed to, potentially wasting L2ARC device bandwidth and storage.  It is
5820  * safe to say that this is an uncommon case, since buffers at the end of
5821  * the ARC lists have moved there due to inactivity.
5822  *
5823  * 4. If the ARC evicts faster than the L2ARC can maintain a headroom,
5824  * then the L2ARC simply misses copying some buffers.  This serves as a
5825  * pressure valve to prevent heavy read workloads from both stalling the ARC
5826  * with waits and clogging the L2ARC with writes.  This also helps prevent
5827  * the potential for the L2ARC to churn if it attempts to cache content too
5828  * quickly, such as during backups of the entire pool.
5829  *
5830  * 5. After system boot and before the ARC has filled main memory, there are
5831  * no evictions from the ARC and so the tails of the ARC_mfu and ARC_mru
 * lists can remain mostly static.  Instead of searching from the tail of
 * these lists as pictured, l2arc_feed_thread() will search from the list
 * heads for eligible buffers, greatly increasing its chance of finding them.
5835  *
5836  * The L2ARC device write speed is also boosted during this time so that
5837  * the L2ARC warms up faster.  Since there have been no ARC evictions yet,
5838  * there are no L2ARC reads, and no fear of degrading read performance
5839  * through increased writes.
5840  *
5841  * 6. Writes to the L2ARC devices are grouped and sent in-sequence, so that
5842  * the vdev queue can aggregate them into larger and fewer writes.  Each
5843  * device is written to in a rotor fashion, sweeping writes through
5844  * available space then repeating.
5845  *
5846  * 7. The L2ARC does not store dirty content.  It never needs to flush
5847  * write buffers back to disk based storage.
5848  *
5849  * 8. If an ARC buffer is written (and dirtied) which also exists in the
5850  * L2ARC, the now stale L2ARC buffer is immediately dropped.
5851  *
5852  * The performance of the L2ARC can be tweaked by a number of tunables, which
5853  * may be necessary for different workloads:
5854  *
5855  *      l2arc_write_max         max write bytes per interval
5856  *      l2arc_write_boost       extra write bytes during device warmup
5857  *      l2arc_noprefetch        skip caching prefetched buffers
5858  *      l2arc_nocompress        skip compressing buffers
5859  *      l2arc_headroom          number of max device writes to precache
5860  *      l2arc_headroom_boost    when we find compressed buffers during ARC
5861  *                              scanning, we multiply headroom by this
5862  *                              percentage factor for the next scan cycle,
5863  *                              since more compressed buffers are likely to
5864  *                              be present
5865  *      l2arc_feed_secs         seconds between L2ARC writing
5866  *
5867  * Tunables may be removed or added as future performance improvements are
5868  * integrated, and also may become zpool properties.
5869  *
5870  * There are three key functions that control how the L2ARC warms up:
5871  *
5872  *      l2arc_write_eligible()  check if a buffer is eligible to cache
5873  *      l2arc_write_size()      calculate how much to write
5874  *      l2arc_write_interval()  calculate sleep delay between writes
5875  *
5876  * These three functions determine what to write, how much, and how quickly
5877  * to send writes.
5878  */
5879
5880 static boolean_t
5881 l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *hdr)
5882 {
5883         /*
5884          * A buffer is *not* eligible for the L2ARC if it:
5885          * 1. belongs to a different spa.
5886          * 2. is already cached on the L2ARC.
5887          * 3. has an I/O in progress (it may be an incomplete read).
5888          * 4. is flagged not eligible (zfs property).
5889          */
5890         if (hdr->b_spa != spa_guid || HDR_HAS_L2HDR(hdr) ||
5891             HDR_IO_IN_PROGRESS(hdr) || !HDR_L2CACHE(hdr))
5892                 return (B_FALSE);
5893
5894         return (B_TRUE);
5895 }
5896
5897 static uint64_t
5898 l2arc_write_size(void)
5899 {
5900         uint64_t size;
5901
5902         /*
5903          * Make sure our globals have meaningful values in case the user
5904          * altered them.
5905          */
5906         size = l2arc_write_max;
5907         if (size == 0) {
5908                 cmn_err(CE_NOTE, "Bad value for l2arc_write_max, value must "
5909                     "be greater than zero, resetting it to the default (%d)",
5910                     L2ARC_WRITE_SIZE);
5911                 size = l2arc_write_max = L2ARC_WRITE_SIZE;
5912         }
5913
5914         if (arc_warm == B_FALSE)
5915                 size += l2arc_write_boost;
5916
5917         return (size);
5918
5919 }
5920
5921 static clock_t
5922 l2arc_write_interval(clock_t began, uint64_t wanted, uint64_t wrote)
5923 {
5924         clock_t interval, next, now;
5925
5926         /*
5927          * If the ARC lists are busy, increase our write rate; if the
5928          * lists are stale, idle back.  This is achieved by checking
5929          * how much we previously wrote - if it was more than half of
5930          * what we wanted, schedule the next write much sooner.
5931          */
5932         if (l2arc_feed_again && wrote > (wanted / 2))
5933                 interval = (hz * l2arc_feed_min_ms) / 1000;
5934         else
5935                 interval = hz * l2arc_feed_secs;
5936
5937         now = ddi_get_lbolt();
5938         next = MAX(now, MIN(now + interval, began + interval));
5939
5940         return (next);
5941 }
5942
5943 /*
5944  * Cycle through L2ARC devices.  This is how L2ARC load balances.
5945  * If a device is returned, this also returns holding the spa config lock.
5946  */
5947 static l2arc_dev_t *
5948 l2arc_dev_get_next(void)
5949 {
5950         l2arc_dev_t *first, *next = NULL;
5951
5952         /*
5953          * Lock out the removal of spas (spa_namespace_lock), then removal
5954          * of cache devices (l2arc_dev_mtx).  Once a device has been selected,
5955          * both locks will be dropped and a spa config lock held instead.
5956          */
5957         mutex_enter(&spa_namespace_lock);
5958         mutex_enter(&l2arc_dev_mtx);
5959
5960         /* if there are no vdevs, there is nothing to do */
5961         if (l2arc_ndev == 0)
5962                 goto out;
5963
5964         first = NULL;
5965         next = l2arc_dev_last;
5966         do {
5967                 /* loop around the list looking for a non-faulted vdev */
5968                 if (next == NULL) {
5969                         next = list_head(l2arc_dev_list);
5970                 } else {
5971                         next = list_next(l2arc_dev_list, next);
5972                         if (next == NULL)
5973                                 next = list_head(l2arc_dev_list);
5974                 }
5975
5976                 /* if we have come back to the start, bail out */
5977                 if (first == NULL)
5978                         first = next;
5979                 else if (next == first)
5980                         break;
5981
5982         } while (vdev_is_dead(next->l2ad_vdev));
5983
5984         /* if we were unable to find any usable vdevs, return NULL */
5985         if (vdev_is_dead(next->l2ad_vdev))
5986                 next = NULL;
5987
5988         l2arc_dev_last = next;
5989
5990 out:
5991         mutex_exit(&l2arc_dev_mtx);
5992
5993         /*
5994          * Grab the config lock to prevent the 'next' device from being
5995          * removed while we are writing to it.
5996          */
5997         if (next != NULL)
5998                 spa_config_enter(next->l2ad_spa, SCL_L2ARC, next, RW_READER);
5999         mutex_exit(&spa_namespace_lock);
6000
6001         return (next);
6002 }
6003
6004 /*
6005  * Free buffers that were tagged for destruction.
6006  */
6007 static void
6008 l2arc_do_free_on_write(void)
6009 {
6010         list_t *buflist;
6011         l2arc_data_free_t *df, *df_prev;
6012
6013         mutex_enter(&l2arc_free_on_write_mtx);
6014         buflist = l2arc_free_on_write;
6015
6016         for (df = list_tail(buflist); df; df = df_prev) {
6017                 df_prev = list_prev(buflist, df);
6018                 ASSERT(df->l2df_data != NULL);
6019                 ASSERT(df->l2df_func != NULL);
6020                 df->l2df_func(df->l2df_data, df->l2df_size);
6021                 list_remove(buflist, df);
6022                 kmem_free(df, sizeof (l2arc_data_free_t));
6023         }
6024
6025         mutex_exit(&l2arc_free_on_write_mtx);
6026 }
6027
6028 /*
6029  * A write to a cache device has completed.  Update all headers to allow
6030  * reads from these buffers to begin.
6031  */
6032 static void
6033 l2arc_write_done(zio_t *zio)
6034 {
6035         l2arc_write_callback_t *cb;
6036         l2arc_dev_t *dev;
6037         list_t *buflist;
6038         arc_buf_hdr_t *head, *hdr, *hdr_prev;
6039         kmutex_t *hash_lock;
6040         int64_t bytes_dropped = 0;
6041
6042         cb = zio->io_private;
6043         ASSERT(cb != NULL);
6044         dev = cb->l2wcb_dev;
6045         ASSERT(dev != NULL);
6046         head = cb->l2wcb_head;
6047         ASSERT(head != NULL);
6048         buflist = &dev->l2ad_buflist;
6049         ASSERT(buflist != NULL);
6050         DTRACE_PROBE2(l2arc__iodone, zio_t *, zio,
6051             l2arc_write_callback_t *, cb);
6052
6053         if (zio->io_error != 0)
6054                 ARCSTAT_BUMP(arcstat_l2_writes_error);
6055
6056         /*
6057          * All writes completed, or an error was hit.
6058          */
6059 top:
6060         mutex_enter(&dev->l2ad_mtx);
6061         for (hdr = list_prev(buflist, head); hdr; hdr = hdr_prev) {
6062                 hdr_prev = list_prev(buflist, hdr);
6063
6064                 hash_lock = HDR_LOCK(hdr);
6065
6066                 /*
6067                  * We cannot use mutex_enter or else we can deadlock
6068                  * with l2arc_write_buffers (due to swapping the order
6069                  * the hash lock and l2ad_mtx are taken).
6070                  */
6071                 if (!mutex_tryenter(hash_lock)) {
6072                         /*
6073                          * Missed the hash lock. We must retry so we
6074                          * don't leave the ARC_FLAG_L2_WRITING bit set.
6075                          */
6076                         ARCSTAT_BUMP(arcstat_l2_writes_lock_retry);
6077
6078                         /*
6079                          * We don't want to rescan the headers we've
6080                          * already marked as having been written out, so
6081                          * we reinsert the head node so we can pick up
6082                          * where we left off.
6083                          */
6084                         list_remove(buflist, head);
6085                         list_insert_after(buflist, hdr, head);
6086
6087                         mutex_exit(&dev->l2ad_mtx);
6088
6089                         /*
6090                          * We wait for the hash lock to become available
6091                          * to try and prevent busy waiting, and increase
6092                          * the chance we'll be able to acquire the lock
6093                          * the next time around.
6094                          */
6095                         mutex_enter(hash_lock);
6096                         mutex_exit(hash_lock);
6097                         goto top;
6098                 }
6099
6100                 /*
6101                  * We could not have been moved into the arc_l2c_only
6102                  * state while in-flight due to our ARC_FLAG_L2_WRITING
6103                  * bit being set. Let's just ensure that's being enforced.
6104                  */
6105                 ASSERT(HDR_HAS_L1HDR(hdr));
6106
6107                 /*
6108                  * We may have allocated a buffer for L2ARC compression,
6109                  * we must release it to avoid leaking this data.
6110                  */
6111                 l2arc_release_cdata_buf(hdr);
6112
6113                 /*
6114                  * Skipped - drop L2ARC entry and mark the header as no
6115                  * longer L2 eligibile.
6116                  */
6117                 if (hdr->b_l2hdr.b_daddr == L2ARC_ADDR_UNSET) {
6118                         list_remove(buflist, hdr);
6119                         hdr->b_flags &= ~ARC_FLAG_HAS_L2HDR;
6120                         hdr->b_flags &= ~ARC_FLAG_L2CACHE;
6121
6122                         ARCSTAT_BUMP(arcstat_l2_writes_skip_toobig);
6123
6124                         (void) refcount_remove_many(&dev->l2ad_alloc,
6125                             hdr->b_l2hdr.b_asize, hdr);
6126                 } else if (zio->io_error != 0) {
6127                         /*
6128                          * Error - drop L2ARC entry.
6129                          */
6130                         list_remove(buflist, hdr);
6131                         hdr->b_flags &= ~ARC_FLAG_HAS_L2HDR;
6132
6133                         ARCSTAT_INCR(arcstat_l2_asize, -hdr->b_l2hdr.b_asize);
6134                         ARCSTAT_INCR(arcstat_l2_size, -hdr->b_size);
6135
6136                         bytes_dropped += hdr->b_l2hdr.b_asize;
6137                         (void) refcount_remove_many(&dev->l2ad_alloc,
6138                             hdr->b_l2hdr.b_asize, hdr);
6139                 }
6140
6141                 /*
6142                  * Allow ARC to begin reads and ghost list evictions to
6143                  * this L2ARC entry.
6144                  */
6145                 hdr->b_flags &= ~ARC_FLAG_L2_WRITING;
6146
6147                 mutex_exit(hash_lock);
6148         }
6149
6150         atomic_inc_64(&l2arc_writes_done);
6151         list_remove(buflist, head);
6152         ASSERT(!HDR_HAS_L1HDR(head));
6153         kmem_cache_free(hdr_l2only_cache, head);
6154         mutex_exit(&dev->l2ad_mtx);
6155
6156         vdev_space_update(dev->l2ad_vdev, -bytes_dropped, 0, 0);
6157
6158         l2arc_do_free_on_write();
6159
6160         kmem_free(cb, sizeof (l2arc_write_callback_t));
6161 }
6162
6163 /*
6164  * A read to a cache device completed.  Validate buffer contents before
6165  * handing over to the regular ARC routines.
6166  */
6167 static void
6168 l2arc_read_done(zio_t *zio)
6169 {
6170         l2arc_read_callback_t *cb;
6171         arc_buf_hdr_t *hdr;
6172         arc_buf_t *buf;
6173         kmutex_t *hash_lock;
6174         int equal;
6175
6176         ASSERT(zio->io_vd != NULL);
6177         ASSERT(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE);
6178
6179         spa_config_exit(zio->io_spa, SCL_L2ARC, zio->io_vd);
6180
6181         cb = zio->io_private;
6182         ASSERT(cb != NULL);
6183         buf = cb->l2rcb_buf;
6184         ASSERT(buf != NULL);
6185
6186         hash_lock = HDR_LOCK(buf->b_hdr);
6187         mutex_enter(hash_lock);
6188         hdr = buf->b_hdr;
6189         ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
6190
6191         /*
6192          * If the buffer was compressed, decompress it first.
6193          */
6194         if (cb->l2rcb_compress != ZIO_COMPRESS_OFF)
6195                 l2arc_decompress_zio(zio, hdr, cb->l2rcb_compress);
6196         ASSERT(zio->io_data != NULL);
6197         ASSERT3U(zio->io_size, ==, hdr->b_size);
6198         ASSERT3U(BP_GET_LSIZE(&cb->l2rcb_bp), ==, hdr->b_size);
6199
6200         /*
6201          * Check this survived the L2ARC journey.
6202          */
6203         equal = arc_cksum_equal(buf);
6204         if (equal && zio->io_error == 0 && !HDR_L2_EVICTED(hdr)) {
6205                 mutex_exit(hash_lock);
6206                 zio->io_private = buf;
6207                 zio->io_bp_copy = cb->l2rcb_bp; /* XXX fix in L2ARC 2.0 */
6208                 zio->io_bp = &zio->io_bp_copy;  /* XXX fix in L2ARC 2.0 */
6209                 arc_read_done(zio);
6210         } else {
6211                 mutex_exit(hash_lock);
6212                 /*
6213                  * Buffer didn't survive caching.  Increment stats and
6214                  * reissue to the original storage device.
6215                  */
6216                 if (zio->io_error != 0) {
6217                         ARCSTAT_BUMP(arcstat_l2_io_error);
6218                 } else {
6219                         zio->io_error = SET_ERROR(EIO);
6220                 }
6221                 if (!equal)
6222                         ARCSTAT_BUMP(arcstat_l2_cksum_bad);
6223
6224                 /*
6225                  * If there's no waiter, issue an async i/o to the primary
6226                  * storage now.  If there *is* a waiter, the caller must
6227                  * issue the i/o in a context where it's OK to block.
6228                  */
6229                 if (zio->io_waiter == NULL) {
6230                         zio_t *pio = zio_unique_parent(zio);
6231
6232                         ASSERT(!pio || pio->io_child_type == ZIO_CHILD_LOGICAL);
6233
6234                         zio_nowait(zio_read(pio, cb->l2rcb_spa, &cb->l2rcb_bp,
6235                             buf->b_data, hdr->b_size, arc_read_done, buf,
6236                             zio->io_priority, cb->l2rcb_flags, &cb->l2rcb_zb));
6237                 }
6238         }
6239
6240         kmem_free(cb, sizeof (l2arc_read_callback_t));
6241 }
6242
6243 /*
6244  * This is the list priority from which the L2ARC will search for pages to
6245  * cache.  This is used within loops (0..3) to cycle through lists in the
6246  * desired order.  This order can have a significant effect on cache
6247  * performance.
6248  *
6249  * Currently the metadata lists are hit first, MFU then MRU, followed by
6250  * the data lists.  This function returns a locked list, and also returns
6251  * the lock pointer.
6252  */
6253 static multilist_sublist_t *
6254 l2arc_sublist_lock(int list_num)
6255 {
6256         multilist_t *ml = NULL;
6257         unsigned int idx;
6258
6259         ASSERT(list_num >= 0 && list_num <= 3);
6260
6261         switch (list_num) {
6262         case 0:
6263                 ml = &arc_mfu->arcs_list[ARC_BUFC_METADATA];
6264                 break;
6265         case 1:
6266                 ml = &arc_mru->arcs_list[ARC_BUFC_METADATA];
6267                 break;
6268         case 2:
6269                 ml = &arc_mfu->arcs_list[ARC_BUFC_DATA];
6270                 break;
6271         case 3:
6272                 ml = &arc_mru->arcs_list[ARC_BUFC_DATA];
6273                 break;
6274         }
6275
6276         /*
6277          * Return a randomly-selected sublist. This is acceptable
6278          * because the caller feeds only a little bit of data for each
6279          * call (8MB). Subsequent calls will result in different
6280          * sublists being selected.
6281          */
6282         idx = multilist_get_random_index(ml);
6283         return (multilist_sublist_lock(ml, idx));
6284 }
6285
6286 /*
6287  * Evict buffers from the device write hand to the distance specified in
6288  * bytes.  This distance may span populated buffers, it may span nothing.
6289  * This is clearing a region on the L2ARC device ready for writing.
6290  * If the 'all' boolean is set, every buffer is evicted.
6291  */
6292 static void
6293 l2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all)
6294 {
6295         list_t *buflist;
6296         arc_buf_hdr_t *hdr, *hdr_prev;
6297         kmutex_t *hash_lock;
6298         uint64_t taddr;
6299
6300         buflist = &dev->l2ad_buflist;
6301
6302         if (!all && dev->l2ad_first) {
6303                 /*
6304                  * This is the first sweep through the device.  There is
6305                  * nothing to evict.
6306                  */
6307                 return;
6308         }
6309
6310         if (dev->l2ad_hand >= (dev->l2ad_end - (2 * distance))) {
6311                 /*
6312                  * When nearing the end of the device, evict to the end
6313                  * before the device write hand jumps to the start.
6314                  */
6315                 taddr = dev->l2ad_end;
6316         } else {
6317                 taddr = dev->l2ad_hand + distance;
6318         }
6319         DTRACE_PROBE4(l2arc__evict, l2arc_dev_t *, dev, list_t *, buflist,
6320             uint64_t, taddr, boolean_t, all);
6321
6322 top:
6323         mutex_enter(&dev->l2ad_mtx);
6324         for (hdr = list_tail(buflist); hdr; hdr = hdr_prev) {
6325                 hdr_prev = list_prev(buflist, hdr);
6326
6327                 hash_lock = HDR_LOCK(hdr);
6328
6329                 /*
6330                  * We cannot use mutex_enter or else we can deadlock
6331                  * with l2arc_write_buffers (due to swapping the order
6332                  * the hash lock and l2ad_mtx are taken).
6333                  */
6334                 if (!mutex_tryenter(hash_lock)) {
6335                         /*
6336                          * Missed the hash lock.  Retry.
6337                          */
6338                         ARCSTAT_BUMP(arcstat_l2_evict_lock_retry);
6339                         mutex_exit(&dev->l2ad_mtx);
6340                         mutex_enter(hash_lock);
6341                         mutex_exit(hash_lock);
6342                         goto top;
6343                 }
6344
6345                 if (HDR_L2_WRITE_HEAD(hdr)) {
6346                         /*
6347                          * We hit a write head node.  Leave it for
6348                          * l2arc_write_done().
6349                          */
6350                         list_remove(buflist, hdr);
6351                         mutex_exit(hash_lock);
6352                         continue;
6353                 }
6354
6355                 if (!all && HDR_HAS_L2HDR(hdr) &&
6356                     (hdr->b_l2hdr.b_daddr > taddr ||
6357                     hdr->b_l2hdr.b_daddr < dev->l2ad_hand)) {
6358                         /*
6359                          * We've evicted to the target address,
6360                          * or the end of the device.
6361                          */
6362                         mutex_exit(hash_lock);
6363                         break;
6364                 }
6365
6366                 ASSERT(HDR_HAS_L2HDR(hdr));
6367                 if (!HDR_HAS_L1HDR(hdr)) {
6368                         ASSERT(!HDR_L2_READING(hdr));
6369                         /*
6370                          * This doesn't exist in the ARC.  Destroy.
6371                          * arc_hdr_destroy() will call list_remove()
6372                          * and decrement arcstat_l2_size.
6373                          */
6374                         arc_change_state(arc_anon, hdr, hash_lock);
6375                         arc_hdr_destroy(hdr);
6376                 } else {
6377                         ASSERT(hdr->b_l1hdr.b_state != arc_l2c_only);
6378                         ARCSTAT_BUMP(arcstat_l2_evict_l1cached);
6379                         /*
6380                          * Invalidate issued or about to be issued
6381                          * reads, since we may be about to write
6382                          * over this location.
6383                          */
6384                         if (HDR_L2_READING(hdr)) {
6385                                 ARCSTAT_BUMP(arcstat_l2_evict_reading);
6386                                 hdr->b_flags |= ARC_FLAG_L2_EVICTED;
6387                         }
6388
6389                         /* Ensure this header has finished being written */
6390                         ASSERT(!HDR_L2_WRITING(hdr));
6391                         ASSERT3P(hdr->b_l1hdr.b_tmp_cdata, ==, NULL);
6392
6393                         arc_hdr_l2hdr_destroy(hdr);
6394                 }
6395                 mutex_exit(hash_lock);
6396         }
6397         mutex_exit(&dev->l2ad_mtx);
6398 }
6399
6400 /*
6401  * Find and write ARC buffers to the L2ARC device.
6402  *
6403  * An ARC_FLAG_L2_WRITING flag is set so that the L2ARC buffers are not valid
6404  * for reading until they have completed writing.
6405  * The headroom_boost is an in-out parameter used to maintain headroom boost
6406  * state between calls to this function.
6407  *
6408  * Returns the number of bytes actually written (which may be smaller than
6409  * the delta by which the device hand has changed due to alignment).
6410  */
6411 static uint64_t
6412 l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz,
6413     boolean_t *headroom_boost)
6414 {
6415         arc_buf_hdr_t *hdr, *hdr_prev, *head;
6416         uint64_t write_asize, write_sz, headroom, buf_compress_minsz,
6417             stats_size;
6418         void *buf_data;
6419         boolean_t full;
6420         l2arc_write_callback_t *cb;
6421         zio_t *pio, *wzio;
6422         uint64_t guid = spa_load_guid(spa);
6423         int try;
6424         const boolean_t do_headroom_boost = *headroom_boost;
6425
6426         ASSERT(dev->l2ad_vdev != NULL);
6427
6428         /* Lower the flag now, we might want to raise it again later. */
6429         *headroom_boost = B_FALSE;
6430
6431         pio = NULL;
6432         write_sz = write_asize = 0;
6433         full = B_FALSE;
6434         head = kmem_cache_alloc(hdr_l2only_cache, KM_PUSHPAGE);
6435         head->b_flags |= ARC_FLAG_L2_WRITE_HEAD;
6436         head->b_flags |= ARC_FLAG_HAS_L2HDR;
6437
6438         /*
6439          * We will want to try to compress buffers that are at least 2x the
6440          * device sector size.
6441          */
6442         buf_compress_minsz = 2 << dev->l2ad_vdev->vdev_ashift;
6443
6444         /*
6445          * Copy buffers for L2ARC writing.
6446          */
6447         for (try = 0; try <= 3; try++) {
6448                 multilist_sublist_t *mls = l2arc_sublist_lock(try);
6449                 uint64_t passed_sz = 0;
6450
6451                 /*
6452                  * L2ARC fast warmup.
6453                  *
6454                  * Until the ARC is warm and starts to evict, read from the
6455                  * head of the ARC lists rather than the tail.
6456                  */
6457                 if (arc_warm == B_FALSE)
6458                         hdr = multilist_sublist_head(mls);
6459                 else
6460                         hdr = multilist_sublist_tail(mls);
6461
6462                 headroom = target_sz * l2arc_headroom;
6463                 if (do_headroom_boost)
6464                         headroom = (headroom * l2arc_headroom_boost) / 100;
6465
6466                 for (; hdr; hdr = hdr_prev) {
6467                         kmutex_t *hash_lock;
6468                         uint64_t buf_sz;
6469                         uint64_t buf_a_sz;
6470
6471                         if (arc_warm == B_FALSE)
6472                                 hdr_prev = multilist_sublist_next(mls, hdr);
6473                         else
6474                                 hdr_prev = multilist_sublist_prev(mls, hdr);
6475
6476                         hash_lock = HDR_LOCK(hdr);
6477                         if (!mutex_tryenter(hash_lock)) {
6478                                 /*
6479                                  * Skip this buffer rather than waiting.
6480                                  */
6481                                 continue;
6482                         }
6483
6484                         passed_sz += hdr->b_size;
6485                         if (passed_sz > headroom) {
6486                                 /*
6487                                  * Searched too far.
6488                                  */
6489                                 mutex_exit(hash_lock);
6490                                 break;
6491                         }
6492
6493                         if (!l2arc_write_eligible(guid, hdr)) {
6494                                 mutex_exit(hash_lock);
6495                                 continue;
6496                         }
6497
6498                         /*
6499                          * Assume that the buffer is not going to be compressed
6500                          * and could take more space on disk because of a larger
6501                          * disk block size.
6502                          */
6503                         buf_sz = hdr->b_size;
6504                         buf_a_sz = vdev_psize_to_asize(dev->l2ad_vdev, buf_sz);
6505
6506                         if ((write_asize + buf_a_sz) > target_sz) {
6507                                 full = B_TRUE;
6508                                 mutex_exit(hash_lock);
6509                                 break;
6510                         }
6511
6512                         if (pio == NULL) {
6513                                 /*
6514                                  * Insert a dummy header on the buflist so
6515                                  * l2arc_write_done() can find where the
6516                                  * write buffers begin without searching.
6517                                  */
6518                                 mutex_enter(&dev->l2ad_mtx);
6519                                 list_insert_head(&dev->l2ad_buflist, head);
6520                                 mutex_exit(&dev->l2ad_mtx);
6521
6522                                 cb = kmem_alloc(
6523                                     sizeof (l2arc_write_callback_t), KM_SLEEP);
6524                                 cb->l2wcb_dev = dev;
6525                                 cb->l2wcb_head = head;
6526                                 pio = zio_root(spa, l2arc_write_done, cb,
6527                                     ZIO_FLAG_CANFAIL);
6528                         }
6529
6530                         /*
6531                          * Create and add a new L2ARC header.
6532                          */
6533                         hdr->b_l2hdr.b_dev = dev;
6534                         hdr->b_flags |= ARC_FLAG_L2_WRITING;
6535                         /*
6536                          * Temporarily stash the data buffer in b_tmp_cdata.
6537                          * The subsequent write step will pick it up from
6538                          * there. This is because can't access b_l1hdr.b_buf
6539                          * without holding the hash_lock, which we in turn
6540                          * can't access without holding the ARC list locks
6541                          * (which we want to avoid during compression/writing)
6542                          */
6543                         hdr->b_l2hdr.b_compress = ZIO_COMPRESS_OFF;
6544                         hdr->b_l2hdr.b_asize = hdr->b_size;
6545                         hdr->b_l2hdr.b_hits = 0;
6546                         hdr->b_l1hdr.b_tmp_cdata = hdr->b_l1hdr.b_buf->b_data;
6547
6548                         /*
6549                          * Explicitly set the b_daddr field to a known
6550                          * value which means "invalid address". This
6551                          * enables us to differentiate which stage of
6552                          * l2arc_write_buffers() the particular header
6553                          * is in (e.g. this loop, or the one below).
6554                          * ARC_FLAG_L2_WRITING is not enough to make
6555                          * this distinction, and we need to know in
6556                          * order to do proper l2arc vdev accounting in
6557                          * arc_release() and arc_hdr_destroy().
6558                          *
6559                          * Note, we can't use a new flag to distinguish
6560                          * the two stages because we don't hold the
6561                          * header's hash_lock below, in the second stage
6562                          * of this function. Thus, we can't simply
6563                          * change the b_flags field to denote that the
6564                          * IO has been sent. We can change the b_daddr
6565                          * field of the L2 portion, though, since we'll
6566                          * be holding the l2ad_mtx; which is why we're
6567                          * using it to denote the header's state change.
6568                          */
6569                         hdr->b_l2hdr.b_daddr = L2ARC_ADDR_UNSET;
6570                         hdr->b_flags |= ARC_FLAG_HAS_L2HDR;
6571
6572                         mutex_enter(&dev->l2ad_mtx);
6573                         list_insert_head(&dev->l2ad_buflist, hdr);
6574                         mutex_exit(&dev->l2ad_mtx);
6575
6576                         /*
6577                          * Compute and store the buffer cksum before
6578                          * writing.  On debug the cksum is verified first.
6579                          */
6580                         arc_cksum_verify(hdr->b_l1hdr.b_buf);
6581                         arc_cksum_compute(hdr->b_l1hdr.b_buf, B_TRUE);
6582
6583                         mutex_exit(hash_lock);
6584
6585                         write_sz += buf_sz;
6586                         write_asize += buf_a_sz;
6587                 }
6588
6589                 multilist_sublist_unlock(mls);
6590
6591                 if (full == B_TRUE)
6592                         break;
6593         }
6594
6595         /* No buffers selected for writing? */
6596         if (pio == NULL) {
6597                 ASSERT0(write_sz);
6598                 ASSERT(!HDR_HAS_L1HDR(head));
6599                 kmem_cache_free(hdr_l2only_cache, head);
6600                 return (0);
6601         }
6602
6603         mutex_enter(&dev->l2ad_mtx);
6604
6605         /*
6606          * Note that elsewhere in this file arcstat_l2_asize
6607          * and the used space on l2ad_vdev are updated using b_asize,
6608          * which is not necessarily rounded up to the device block size.
6609          * Too keep accounting consistent we do the same here as well:
6610          * stats_size accumulates the sum of b_asize of the written buffers,
6611          * while write_asize accumulates the sum of b_asize rounded up
6612          * to the device block size.
6613          * The latter sum is used only to validate the corectness of the code.
6614          */
6615         stats_size = 0;
6616         write_asize = 0;
6617
6618         /*
6619          * Now start writing the buffers. We're starting at the write head
6620          * and work backwards, retracing the course of the buffer selector
6621          * loop above.
6622          */
6623         for (hdr = list_prev(&dev->l2ad_buflist, head); hdr;
6624             hdr = list_prev(&dev->l2ad_buflist, hdr)) {
6625                 uint64_t buf_sz;
6626
6627                 /*
6628                  * We rely on the L1 portion of the header below, so
6629                  * it's invalid for this header to have been evicted out
6630                  * of the ghost cache, prior to being written out. The
6631                  * ARC_FLAG_L2_WRITING bit ensures this won't happen.
6632                  */
6633                 ASSERT(HDR_HAS_L1HDR(hdr));
6634
6635                 /*
6636                  * We shouldn't need to lock the buffer here, since we flagged
6637                  * it as ARC_FLAG_L2_WRITING in the previous step, but we must
6638                  * take care to only access its L2 cache parameters. In
6639                  * particular, hdr->l1hdr.b_buf may be invalid by now due to
6640                  * ARC eviction.
6641                  */
6642                 hdr->b_l2hdr.b_daddr = dev->l2ad_hand;
6643
6644                 if ((!l2arc_nocompress && HDR_L2COMPRESS(hdr)) &&
6645                     hdr->b_l2hdr.b_asize >= buf_compress_minsz) {
6646                         if (l2arc_compress_buf(hdr)) {
6647                                 /*
6648                                  * If compression succeeded, enable headroom
6649                                  * boost on the next scan cycle.
6650                                  */
6651                                 *headroom_boost = B_TRUE;
6652                         }
6653                 }
6654
6655                 /*
6656                  * Pick up the buffer data we had previously stashed away
6657                  * (and now potentially also compressed).
6658                  */
6659                 buf_data = hdr->b_l1hdr.b_tmp_cdata;
6660                 buf_sz = hdr->b_l2hdr.b_asize;
6661
6662                 /*
6663                  * We need to do this regardless if buf_sz is zero or
6664                  * not, otherwise, when this l2hdr is evicted we'll
6665                  * remove a reference that was never added.
6666                  */
6667                 (void) refcount_add_many(&dev->l2ad_alloc, buf_sz, hdr);
6668
6669                 /* Compression may have squashed the buffer to zero length. */
6670                 if (buf_sz != 0) {
6671                         uint64_t buf_a_sz;
6672
6673                         /*
6674                          * Buffers which are larger than l2arc_max_block_size
6675                          * after compression are skipped and removed from L2
6676                          * eligibility.
6677                          */
6678                         if (buf_sz > l2arc_max_block_size) {
6679                                 hdr->b_l2hdr.b_daddr = L2ARC_ADDR_UNSET;
6680                                 continue;
6681                         }
6682
6683                         wzio = zio_write_phys(pio, dev->l2ad_vdev,
6684                             dev->l2ad_hand, buf_sz, buf_data, ZIO_CHECKSUM_OFF,
6685                             NULL, NULL, ZIO_PRIORITY_ASYNC_WRITE,
6686                             ZIO_FLAG_CANFAIL, B_FALSE);
6687
6688                         DTRACE_PROBE2(l2arc__write, vdev_t *, dev->l2ad_vdev,
6689                             zio_t *, wzio);
6690                         (void) zio_nowait(wzio);
6691
6692                         stats_size += buf_sz;
6693
6694                         /*
6695                          * Keep the clock hand suitably device-aligned.
6696                          */
6697                         buf_a_sz = vdev_psize_to_asize(dev->l2ad_vdev, buf_sz);
6698                         write_asize += buf_a_sz;
6699                         dev->l2ad_hand += buf_a_sz;
6700                 }
6701         }
6702
6703         mutex_exit(&dev->l2ad_mtx);
6704
6705         ASSERT3U(write_asize, <=, target_sz);
6706         ARCSTAT_BUMP(arcstat_l2_writes_sent);
6707         ARCSTAT_INCR(arcstat_l2_write_bytes, write_asize);
6708         ARCSTAT_INCR(arcstat_l2_size, write_sz);
6709         ARCSTAT_INCR(arcstat_l2_asize, stats_size);
6710         vdev_space_update(dev->l2ad_vdev, stats_size, 0, 0);
6711
6712         /*
6713          * Bump device hand to the device start if it is approaching the end.
6714          * l2arc_evict() will already have evicted ahead for this case.
6715          */
6716         if (dev->l2ad_hand >= (dev->l2ad_end - target_sz)) {
6717                 dev->l2ad_hand = dev->l2ad_start;
6718                 dev->l2ad_first = B_FALSE;
6719         }
6720
6721         dev->l2ad_writing = B_TRUE;
6722         (void) zio_wait(pio);
6723         dev->l2ad_writing = B_FALSE;
6724
6725         return (write_asize);
6726 }
6727
6728 /*
6729  * Compresses an L2ARC buffer.
6730  * The data to be compressed must be prefilled in l1hdr.b_tmp_cdata and its
6731  * size in l2hdr->b_asize. This routine tries to compress the data and
6732  * depending on the compression result there are three possible outcomes:
6733  * *) The buffer was incompressible. The original l2hdr contents were left
6734  *    untouched and are ready for writing to an L2 device.
6735  * *) The buffer was all-zeros, so there is no need to write it to an L2
6736  *    device. To indicate this situation b_tmp_cdata is NULL'ed, b_asize is
6737  *    set to zero and b_compress is set to ZIO_COMPRESS_EMPTY.
6738  * *) Compression succeeded and b_tmp_cdata was replaced with a temporary
6739  *    data buffer which holds the compressed data to be written, and b_asize
6740  *    tells us how much data there is. b_compress is set to the appropriate
6741  *    compression algorithm. Once writing is done, invoke
6742  *    l2arc_release_cdata_buf on this l2hdr to free this temporary buffer.
6743  *
6744  * Returns B_TRUE if compression succeeded, or B_FALSE if it didn't (the
6745  * buffer was incompressible).
6746  */
6747 static boolean_t
6748 l2arc_compress_buf(arc_buf_hdr_t *hdr)
6749 {
6750         void *cdata;
6751         size_t csize, len, rounded;
6752         l2arc_buf_hdr_t *l2hdr;
6753
6754         ASSERT(HDR_HAS_L2HDR(hdr));
6755
6756         l2hdr = &hdr->b_l2hdr;
6757
6758         ASSERT(HDR_HAS_L1HDR(hdr));
6759         ASSERT3U(l2hdr->b_compress, ==, ZIO_COMPRESS_OFF);
6760         ASSERT(hdr->b_l1hdr.b_tmp_cdata != NULL);
6761
6762         len = l2hdr->b_asize;
6763         cdata = zio_data_buf_alloc(len);
6764         ASSERT3P(cdata, !=, NULL);
6765         csize = zio_compress_data(ZIO_COMPRESS_LZ4, hdr->b_l1hdr.b_tmp_cdata,
6766             cdata, l2hdr->b_asize);
6767
6768         rounded = P2ROUNDUP(csize, (size_t)SPA_MINBLOCKSIZE);
6769         if (rounded > csize) {
6770                 bzero((char *)cdata + csize, rounded - csize);
6771                 csize = rounded;
6772         }
6773
6774         if (csize == 0) {
6775                 /* zero block, indicate that there's nothing to write */
6776                 zio_data_buf_free(cdata, len);
6777                 l2hdr->b_compress = ZIO_COMPRESS_EMPTY;
6778                 l2hdr->b_asize = 0;
6779                 hdr->b_l1hdr.b_tmp_cdata = NULL;
6780                 ARCSTAT_BUMP(arcstat_l2_compress_zeros);
6781                 return (B_TRUE);
6782         } else if (csize > 0 && csize < len) {
6783                 /*
6784                  * Compression succeeded, we'll keep the cdata around for
6785                  * writing and release it afterwards.
6786                  */
6787                 l2hdr->b_compress = ZIO_COMPRESS_LZ4;
6788                 l2hdr->b_asize = csize;
6789                 hdr->b_l1hdr.b_tmp_cdata = cdata;
6790                 ARCSTAT_BUMP(arcstat_l2_compress_successes);
6791                 return (B_TRUE);
6792         } else {
6793                 /*
6794                  * Compression failed, release the compressed buffer.
6795                  * l2hdr will be left unmodified.
6796                  */
6797                 zio_data_buf_free(cdata, len);
6798                 ARCSTAT_BUMP(arcstat_l2_compress_failures);
6799                 return (B_FALSE);
6800         }
6801 }
6802
6803 /*
6804  * Decompresses a zio read back from an l2arc device. On success, the
6805  * underlying zio's io_data buffer is overwritten by the uncompressed
6806  * version. On decompression error (corrupt compressed stream), the
6807  * zio->io_error value is set to signal an I/O error.
6808  *
6809  * Please note that the compressed data stream is not checksummed, so
6810  * if the underlying device is experiencing data corruption, we may feed
6811  * corrupt data to the decompressor, so the decompressor needs to be
6812  * able to handle this situation (LZ4 does).
6813  */
6814 static void
6815 l2arc_decompress_zio(zio_t *zio, arc_buf_hdr_t *hdr, enum zio_compress c)
6816 {
6817         uint64_t csize;
6818         void *cdata;
6819
6820         ASSERT(L2ARC_IS_VALID_COMPRESS(c));
6821
6822         if (zio->io_error != 0) {
6823                 /*
6824                  * An io error has occured, just restore the original io
6825                  * size in preparation for a main pool read.
6826                  */
6827                 zio->io_orig_size = zio->io_size = hdr->b_size;
6828                 return;
6829         }
6830
6831         if (c == ZIO_COMPRESS_EMPTY) {
6832                 /*
6833                  * An empty buffer results in a null zio, which means we
6834                  * need to fill its io_data after we're done restoring the
6835                  * buffer's contents.
6836                  */
6837                 ASSERT(hdr->b_l1hdr.b_buf != NULL);
6838                 bzero(hdr->b_l1hdr.b_buf->b_data, hdr->b_size);
6839                 zio->io_data = zio->io_orig_data = hdr->b_l1hdr.b_buf->b_data;
6840         } else {
6841                 ASSERT(zio->io_data != NULL);
6842                 /*
6843                  * We copy the compressed data from the start of the arc buffer
6844                  * (the zio_read will have pulled in only what we need, the
6845                  * rest is garbage which we will overwrite at decompression)
6846                  * and then decompress back to the ARC data buffer. This way we
6847                  * can minimize copying by simply decompressing back over the
6848                  * original compressed data (rather than decompressing to an
6849                  * aux buffer and then copying back the uncompressed buffer,
6850                  * which is likely to be much larger).
6851                  */
6852                 csize = zio->io_size;
6853                 cdata = zio_data_buf_alloc(csize);
6854                 bcopy(zio->io_data, cdata, csize);
6855                 if (zio_decompress_data(c, cdata, zio->io_data, csize,
6856                     hdr->b_size) != 0)
6857                         zio->io_error = EIO;
6858                 zio_data_buf_free(cdata, csize);
6859         }
6860
6861         /* Restore the expected uncompressed IO size. */
6862         zio->io_orig_size = zio->io_size = hdr->b_size;
6863 }
6864
6865 /*
6866  * Releases the temporary b_tmp_cdata buffer in an l2arc header structure.
6867  * This buffer serves as a temporary holder of compressed data while
6868  * the buffer entry is being written to an l2arc device. Once that is
6869  * done, we can dispose of it.
6870  */
6871 static void
6872 l2arc_release_cdata_buf(arc_buf_hdr_t *hdr)
6873 {
6874         enum zio_compress comp;
6875
6876         ASSERT(HDR_HAS_L1HDR(hdr));
6877         ASSERT(HDR_HAS_L2HDR(hdr));
6878         comp = hdr->b_l2hdr.b_compress;
6879         ASSERT(comp == ZIO_COMPRESS_OFF || L2ARC_IS_VALID_COMPRESS(comp));
6880
6881         if (comp == ZIO_COMPRESS_OFF) {
6882                 /*
6883                  * In this case, b_tmp_cdata points to the same buffer
6884                  * as the arc_buf_t's b_data field. We don't want to
6885                  * free it, since the arc_buf_t will handle that.
6886                  */
6887                 hdr->b_l1hdr.b_tmp_cdata = NULL;
6888         } else if (comp == ZIO_COMPRESS_EMPTY) {
6889                 /*
6890                  * In this case, b_tmp_cdata was compressed to an empty
6891                  * buffer, thus there's nothing to free and b_tmp_cdata
6892                  * should have been set to NULL in l2arc_write_buffers().
6893                  */
6894                 ASSERT3P(hdr->b_l1hdr.b_tmp_cdata, ==, NULL);
6895         } else {
6896                 /*
6897                  * If the data was compressed, then we've allocated a
6898                  * temporary buffer for it, so now we need to release it.
6899                  */
6900                 ASSERT(hdr->b_l1hdr.b_tmp_cdata != NULL);
6901                 zio_data_buf_free(hdr->b_l1hdr.b_tmp_cdata,
6902                     hdr->b_size);
6903                 hdr->b_l1hdr.b_tmp_cdata = NULL;
6904         }
6905
6906 }
6907
6908 /*
6909  * This thread feeds the L2ARC at regular intervals.  This is the beating
6910  * heart of the L2ARC.
6911  */
6912 static void
6913 l2arc_feed_thread(void)
6914 {
6915         callb_cpr_t cpr;
6916         l2arc_dev_t *dev;
6917         spa_t *spa;
6918         uint64_t size, wrote;
6919         clock_t begin, next = ddi_get_lbolt();
6920         boolean_t headroom_boost = B_FALSE;
6921         fstrans_cookie_t cookie;
6922
6923         CALLB_CPR_INIT(&cpr, &l2arc_feed_thr_lock, callb_generic_cpr, FTAG);
6924
6925         mutex_enter(&l2arc_feed_thr_lock);
6926
6927         cookie = spl_fstrans_mark();
6928         while (l2arc_thread_exit == 0) {
6929                 CALLB_CPR_SAFE_BEGIN(&cpr);
6930                 (void) cv_timedwait_sig(&l2arc_feed_thr_cv,
6931                     &l2arc_feed_thr_lock, next);
6932                 CALLB_CPR_SAFE_END(&cpr, &l2arc_feed_thr_lock);
6933                 next = ddi_get_lbolt() + hz;
6934
6935                 /*
6936                  * Quick check for L2ARC devices.
6937                  */
6938                 mutex_enter(&l2arc_dev_mtx);
6939                 if (l2arc_ndev == 0) {
6940                         mutex_exit(&l2arc_dev_mtx);
6941                         continue;
6942                 }
6943                 mutex_exit(&l2arc_dev_mtx);
6944                 begin = ddi_get_lbolt();
6945
6946                 /*
6947                  * This selects the next l2arc device to write to, and in
6948                  * doing so the next spa to feed from: dev->l2ad_spa.   This
6949                  * will return NULL if there are now no l2arc devices or if
6950                  * they are all faulted.
6951                  *
6952                  * If a device is returned, its spa's config lock is also
6953                  * held to prevent device removal.  l2arc_dev_get_next()
6954                  * will grab and release l2arc_dev_mtx.
6955                  */
6956                 if ((dev = l2arc_dev_get_next()) == NULL)
6957                         continue;
6958
6959                 spa = dev->l2ad_spa;
6960                 ASSERT(spa != NULL);
6961
6962                 /*
6963                  * If the pool is read-only then force the feed thread to
6964                  * sleep a little longer.
6965                  */
6966                 if (!spa_writeable(spa)) {
6967                         next = ddi_get_lbolt() + 5 * l2arc_feed_secs * hz;
6968                         spa_config_exit(spa, SCL_L2ARC, dev);
6969                         continue;
6970                 }
6971
6972                 /*
6973                  * Avoid contributing to memory pressure.
6974                  */
6975                 if (arc_reclaim_needed()) {
6976                         ARCSTAT_BUMP(arcstat_l2_abort_lowmem);
6977                         spa_config_exit(spa, SCL_L2ARC, dev);
6978                         continue;
6979                 }
6980
6981                 ARCSTAT_BUMP(arcstat_l2_feeds);
6982
6983                 size = l2arc_write_size();
6984
6985                 /*
6986                  * Evict L2ARC buffers that will be overwritten.
6987                  */
6988                 l2arc_evict(dev, size, B_FALSE);
6989
6990                 /*
6991                  * Write ARC buffers.
6992                  */
6993                 wrote = l2arc_write_buffers(spa, dev, size, &headroom_boost);
6994
6995                 /*
6996                  * Calculate interval between writes.
6997                  */
6998                 next = l2arc_write_interval(begin, size, wrote);
6999                 spa_config_exit(spa, SCL_L2ARC, dev);
7000         }
7001         spl_fstrans_unmark(cookie);
7002
7003         l2arc_thread_exit = 0;
7004         cv_broadcast(&l2arc_feed_thr_cv);
7005         CALLB_CPR_EXIT(&cpr);           /* drops l2arc_feed_thr_lock */
7006         thread_exit();
7007 }
7008
7009 boolean_t
7010 l2arc_vdev_present(vdev_t *vd)
7011 {
7012         l2arc_dev_t *dev;
7013
7014         mutex_enter(&l2arc_dev_mtx);
7015         for (dev = list_head(l2arc_dev_list); dev != NULL;
7016             dev = list_next(l2arc_dev_list, dev)) {
7017                 if (dev->l2ad_vdev == vd)
7018                         break;
7019         }
7020         mutex_exit(&l2arc_dev_mtx);
7021
7022         return (dev != NULL);
7023 }
7024
7025 /*
7026  * Add a vdev for use by the L2ARC.  By this point the spa has already
7027  * validated the vdev and opened it.
7028  */
7029 void
7030 l2arc_add_vdev(spa_t *spa, vdev_t *vd)
7031 {
7032         l2arc_dev_t *adddev;
7033
7034         ASSERT(!l2arc_vdev_present(vd));
7035
7036         /*
7037          * Create a new l2arc device entry.
7038          */
7039         adddev = kmem_zalloc(sizeof (l2arc_dev_t), KM_SLEEP);
7040         adddev->l2ad_spa = spa;
7041         adddev->l2ad_vdev = vd;
7042         adddev->l2ad_start = VDEV_LABEL_START_SIZE;
7043         adddev->l2ad_end = VDEV_LABEL_START_SIZE + vdev_get_min_asize(vd);
7044         adddev->l2ad_hand = adddev->l2ad_start;
7045         adddev->l2ad_first = B_TRUE;
7046         adddev->l2ad_writing = B_FALSE;
7047         list_link_init(&adddev->l2ad_node);
7048
7049         mutex_init(&adddev->l2ad_mtx, NULL, MUTEX_DEFAULT, NULL);
7050         /*
7051          * This is a list of all ARC buffers that are still valid on the
7052          * device.
7053          */
7054         list_create(&adddev->l2ad_buflist, sizeof (arc_buf_hdr_t),
7055             offsetof(arc_buf_hdr_t, b_l2hdr.b_l2node));
7056
7057         vdev_space_update(vd, 0, 0, adddev->l2ad_end - adddev->l2ad_hand);
7058         refcount_create(&adddev->l2ad_alloc);
7059
7060         /*
7061          * Add device to global list
7062          */
7063         mutex_enter(&l2arc_dev_mtx);
7064         list_insert_head(l2arc_dev_list, adddev);
7065         atomic_inc_64(&l2arc_ndev);
7066         mutex_exit(&l2arc_dev_mtx);
7067 }
7068
7069 /*
7070  * Remove a vdev from the L2ARC.
7071  */
7072 void
7073 l2arc_remove_vdev(vdev_t *vd)
7074 {
7075         l2arc_dev_t *dev, *nextdev, *remdev = NULL;
7076
7077         /*
7078          * Find the device by vdev
7079          */
7080         mutex_enter(&l2arc_dev_mtx);
7081         for (dev = list_head(l2arc_dev_list); dev; dev = nextdev) {
7082                 nextdev = list_next(l2arc_dev_list, dev);
7083                 if (vd == dev->l2ad_vdev) {
7084                         remdev = dev;
7085                         break;
7086                 }
7087         }
7088         ASSERT(remdev != NULL);
7089
7090         /*
7091          * Remove device from global list
7092          */
7093         list_remove(l2arc_dev_list, remdev);
7094         l2arc_dev_last = NULL;          /* may have been invalidated */
7095         atomic_dec_64(&l2arc_ndev);
7096         mutex_exit(&l2arc_dev_mtx);
7097
7098         /*
7099          * Clear all buflists and ARC references.  L2ARC device flush.
7100          */
7101         l2arc_evict(remdev, 0, B_TRUE);
7102         list_destroy(&remdev->l2ad_buflist);
7103         mutex_destroy(&remdev->l2ad_mtx);
7104         refcount_destroy(&remdev->l2ad_alloc);
7105         kmem_free(remdev, sizeof (l2arc_dev_t));
7106 }
7107
7108 void
7109 l2arc_init(void)
7110 {
7111         l2arc_thread_exit = 0;
7112         l2arc_ndev = 0;
7113         l2arc_writes_sent = 0;
7114         l2arc_writes_done = 0;
7115
7116         mutex_init(&l2arc_feed_thr_lock, NULL, MUTEX_DEFAULT, NULL);
7117         cv_init(&l2arc_feed_thr_cv, NULL, CV_DEFAULT, NULL);
7118         mutex_init(&l2arc_dev_mtx, NULL, MUTEX_DEFAULT, NULL);
7119         mutex_init(&l2arc_free_on_write_mtx, NULL, MUTEX_DEFAULT, NULL);
7120
7121         l2arc_dev_list = &L2ARC_dev_list;
7122         l2arc_free_on_write = &L2ARC_free_on_write;
7123         list_create(l2arc_dev_list, sizeof (l2arc_dev_t),
7124             offsetof(l2arc_dev_t, l2ad_node));
7125         list_create(l2arc_free_on_write, sizeof (l2arc_data_free_t),
7126             offsetof(l2arc_data_free_t, l2df_list_node));
7127 }
7128
7129 void
7130 l2arc_fini(void)
7131 {
7132         /*
7133          * This is called from dmu_fini(), which is called from spa_fini();
7134          * Because of this, we can assume that all l2arc devices have
7135          * already been removed when the pools themselves were removed.
7136          */
7137
7138         l2arc_do_free_on_write();
7139
7140         mutex_destroy(&l2arc_feed_thr_lock);
7141         cv_destroy(&l2arc_feed_thr_cv);
7142         mutex_destroy(&l2arc_dev_mtx);
7143         mutex_destroy(&l2arc_free_on_write_mtx);
7144
7145         list_destroy(l2arc_dev_list);
7146         list_destroy(l2arc_free_on_write);
7147 }
7148
7149 void
7150 l2arc_start(void)
7151 {
7152         if (!(spa_mode_global & FWRITE))
7153                 return;
7154
7155         (void) thread_create(NULL, 0, l2arc_feed_thread, NULL, 0, &p0,
7156             TS_RUN, defclsyspri);
7157 }
7158
7159 void
7160 l2arc_stop(void)
7161 {
7162         if (!(spa_mode_global & FWRITE))
7163                 return;
7164
7165         mutex_enter(&l2arc_feed_thr_lock);
7166         cv_signal(&l2arc_feed_thr_cv);  /* kick thread out of startup */
7167         l2arc_thread_exit = 1;
7168         while (l2arc_thread_exit != 0)
7169                 cv_wait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock);
7170         mutex_exit(&l2arc_feed_thr_lock);
7171 }
7172
#if defined(_KERNEL) && defined(HAVE_SPL)
/* ARC entry points exported for use by other kernel modules. */
EXPORT_SYMBOL(arc_buf_size);
EXPORT_SYMBOL(arc_write);
EXPORT_SYMBOL(arc_read);
EXPORT_SYMBOL(arc_buf_remove_ref);
EXPORT_SYMBOL(arc_buf_info);
EXPORT_SYMBOL(arc_getbuf_func);
EXPORT_SYMBOL(arc_add_prune_callback);
EXPORT_SYMBOL(arc_remove_prune_callback);

/*
 * ARC tunables.  The last argument of module_param() is the sysfs
 * permission mask of the parameter (0644: writable by root at run time,
 * 0444: read-only).
 */
module_param(zfs_arc_min, ulong, 0644);
MODULE_PARM_DESC(zfs_arc_min, "Min arc size");

module_param(zfs_arc_max, ulong, 0644);
MODULE_PARM_DESC(zfs_arc_max, "Max arc size");

module_param(zfs_arc_meta_limit, ulong, 0644);
MODULE_PARM_DESC(zfs_arc_meta_limit, "Meta limit for arc size");

module_param(zfs_arc_meta_limit_percent, ulong, 0644);
MODULE_PARM_DESC(zfs_arc_meta_limit_percent,
	"Percent of arc size for arc meta limit");

module_param(zfs_arc_meta_min, ulong, 0644);
MODULE_PARM_DESC(zfs_arc_meta_min, "Min arc metadata");

module_param(zfs_arc_meta_prune, int, 0644);
MODULE_PARM_DESC(zfs_arc_meta_prune, "Meta objects to scan for prune");

module_param(zfs_arc_meta_adjust_restarts, int, 0644);
MODULE_PARM_DESC(zfs_arc_meta_adjust_restarts,
	"Limit number of restarts in arc_adjust_meta");

module_param(zfs_arc_meta_strategy, int, 0644);
MODULE_PARM_DESC(zfs_arc_meta_strategy, "Meta reclaim strategy");

module_param(zfs_arc_grow_retry, int, 0644);
MODULE_PARM_DESC(zfs_arc_grow_retry, "Seconds before growing arc size");

module_param(zfs_arc_p_aggressive_disable, int, 0644);
MODULE_PARM_DESC(zfs_arc_p_aggressive_disable, "disable aggressive arc_p grow");

module_param(zfs_arc_p_dampener_disable, int, 0644);
MODULE_PARM_DESC(zfs_arc_p_dampener_disable, "disable arc_p adapt dampener");

module_param(zfs_arc_shrink_shift, int, 0644);
MODULE_PARM_DESC(zfs_arc_shrink_shift, "log2(fraction of arc to reclaim)");

module_param(zfs_arc_p_min_shift, int, 0644);
MODULE_PARM_DESC(zfs_arc_p_min_shift, "arc_c shift to calc min/max arc_p");

module_param(zfs_disable_dup_eviction, int, 0644);
MODULE_PARM_DESC(zfs_disable_dup_eviction, "disable duplicate buffer eviction");

module_param(zfs_arc_average_blocksize, int, 0444);
MODULE_PARM_DESC(zfs_arc_average_blocksize, "Target average block size");

module_param(zfs_arc_min_prefetch_lifespan, int, 0644);
MODULE_PARM_DESC(zfs_arc_min_prefetch_lifespan, "Min life of prefetch block");

module_param(zfs_arc_num_sublists_per_state, int, 0644);
MODULE_PARM_DESC(zfs_arc_num_sublists_per_state,
	"Number of sublists used in each of the ARC state lists");

/* L2ARC tunables. */
module_param(l2arc_write_max, ulong, 0644);
MODULE_PARM_DESC(l2arc_write_max, "Max write bytes per interval");

module_param(l2arc_write_boost, ulong, 0644);
MODULE_PARM_DESC(l2arc_write_boost, "Extra write bytes during device warmup");

module_param(l2arc_headroom, ulong, 0644);
MODULE_PARM_DESC(l2arc_headroom, "Number of max device writes to precache");

module_param(l2arc_headroom_boost, ulong, 0644);
MODULE_PARM_DESC(l2arc_headroom_boost, "Compressed l2arc_headroom multiplier");

module_param(l2arc_max_block_size, ulong, 0644);
MODULE_PARM_DESC(l2arc_max_block_size, "Skip L2ARC buffers larger than N");

module_param(l2arc_feed_secs, ulong, 0644);
MODULE_PARM_DESC(l2arc_feed_secs, "Seconds between L2ARC writing");

module_param(l2arc_feed_min_ms, ulong, 0644);
MODULE_PARM_DESC(l2arc_feed_min_ms, "Min feed interval in milliseconds");

module_param(l2arc_noprefetch, int, 0644);
MODULE_PARM_DESC(l2arc_noprefetch, "Skip caching prefetched buffers");

module_param(l2arc_nocompress, int, 0644);
MODULE_PARM_DESC(l2arc_nocompress, "Skip compressing L2ARC buffers");

module_param(l2arc_feed_again, int, 0644);
MODULE_PARM_DESC(l2arc_feed_again, "Turbo L2ARC warmup");

module_param(l2arc_norw, int, 0644);
MODULE_PARM_DESC(l2arc_norw, "No reads during writes");

/* Free-memory and dnode related ARC tunables. */

/* This tunable is a percentage, so the description must not say bytes. */
module_param(zfs_arc_lotsfree_percent, int, 0644);
MODULE_PARM_DESC(zfs_arc_lotsfree_percent,
	"System free memory I/O throttle in percent");

module_param(zfs_arc_sys_free, ulong, 0644);
MODULE_PARM_DESC(zfs_arc_sys_free, "System free memory target size in bytes");

/* This tunable is a limit on dnode bytes in the ARC, not a minimum. */
module_param(zfs_arc_dnode_limit, ulong, 0644);
MODULE_PARM_DESC(zfs_arc_dnode_limit, "Limit on bytes of dnodes in arc");

module_param(zfs_arc_dnode_limit_percent, ulong, 0644);
MODULE_PARM_DESC(zfs_arc_dnode_limit_percent,
	"Percent of ARC meta buffers for dnodes");

module_param(zfs_arc_dnode_reduce_percent, ulong, 0644);
MODULE_PARM_DESC(zfs_arc_dnode_reduce_percent,
	"Percentage of excess dnodes to try to unpin");

#endif