--- /dev/null
+/*
+ * CDDL HEADER START
+ *
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2017 by Delphix. All rights reserved.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/aggsum.h>
+
+/*
+ * Aggregate-sum counters are a form of fanned-out counter, used when atomic
+ * instructions on a single field cause enough CPU cache line contention to
+ * slow system performance. Due to their increased overhead and the expense
+ * involved with precisely reading from them, they should only be used in cases
+ * where the write rate (increment/decrement) is much higher than the read rate
+ * (get value).
+ *
+ * Aggregate sum counters are comprised of two basic parts, the core and the
+ * buckets. The core counter contains a lock for the entire counter, as well
+ * as the current upper and lower bounds on the value of the counter. The
+ * aggsum_bucket structure contains a per-bucket lock to protect the contents of
+ * the bucket, the current amount that this bucket has changed from the global
+ * counter (called the delta), and the amount of increment and decrement we have
+ * "borrowed" from the core counter.
+ *
+ * The basic operation of an aggsum is simple. Threads that wish to modify the
+ * counter will modify one bucket's counter (determined by their current CPU, to
+ * help minimize lock and cache contention). If the bucket already has
+ * sufficient capacity borrowed from the core structure to handle their request,
+ * they simply modify the delta and return. If the bucket does not, we clear
+ * the bucket's current state (to prevent the borrowed amounts from getting too
+ * large), and borrow more from the core counter. Borrowing is done by adding to
+ * the upper bound (or subtracting from the lower bound) of the core counter,
+ * and setting the borrow value for the bucket to the amount added (or
+ * subtracted). Clearing the bucket is the opposite; we add the current delta
+ * to both the lower and upper bounds of the core counter, subtract the borrowed
+ * increment from the upper bound, and add the borrowed decrement to the
+ * lower bound. Note that only borrowing and clearing require access to the
+ * core counter; since all other operations access CPU-local resources,
+ * performance can be much higher than a traditional counter.
+ *
+ * Threads that wish to read from the counter have a slightly more challenging
+ * task. It is fast to determine the upper and lower bounds of the aggsum; this
+ * does not require grabbing any locks. This suffices for cases where an
+ * approximation of the aggsum's value is acceptable. However, if one needs to
+ * know whether some specific value is above or below the current value in the
+ * aggsum, they invoke aggsum_compare(). This function operates by repeatedly
+ * comparing the target value to the upper and lower bounds of the aggsum, and
+ * then clearing a bucket. This proceeds until the target is outside of the
+ * upper and lower bounds and we return a response, or the last bucket has been
+ * cleared and we know that the target is equal to the aggsum's value. Finally,
+ * the most expensive operation is determining the precise value of the aggsum.
+ * To do this, we clear every bucket and then return the upper bound (which must
+ * be equal to the lower bound). What makes aggsum_compare() and aggsum_value()
+ * expensive is clearing buckets. This involves grabbing the global lock
+ * (serializing against themselves and borrow operations), grabbing a bucket's
+ * lock (preventing threads on those CPUs from modifying their delta), and
+ * zeroing out the borrowed value (forcing that thread to borrow on its next
+ * request, which will also be expensive). This is what makes aggsums well
+ * suited for write-many read-rarely operations.
+ */
+
+/*
+ * We will borrow aggsum_borrow_multiplier times the current request, so we will
+ * have to get the as_lock approximately every aggsum_borrow_multiplier calls to
+ * aggsum_add().
+ */
+static uint_t aggsum_borrow_multiplier = 10;
+
+/*
+ * Prepare an aggsum for use with a starting value of `value`. The structure
+ * is zeroed, both bounds are set to the starting value, and one bucket per
+ * boot CPU is allocated, each with its own lock.
+ */
+void
+aggsum_init(aggsum_t *as, uint64_t value)
+{
+	struct aggsum_bucket *bucket, *end;
+
+	bzero(as, sizeof (*as));
+	as->as_upper_bound = value;
+	as->as_lower_bound = as->as_upper_bound;
+	mutex_init(&as->as_lock, NULL, MUTEX_DEFAULT, NULL);
+
+	as->as_numbuckets = boot_ncpus;
+	as->as_buckets = kmem_zalloc(boot_ncpus * sizeof (aggsum_bucket_t),
+	    KM_SLEEP);
+
+	/* Every bucket carries its own lock; initialize them all. */
+	end = as->as_buckets + as->as_numbuckets;
+	for (bucket = as->as_buckets; bucket != end; bucket++)
+		mutex_init(&bucket->asc_lock, NULL, MUTEX_DEFAULT, NULL);
+}
+
+/*
+ * Tear down an aggsum set up by aggsum_init(): destroy each bucket lock,
+ * release the bucket array, then destroy the core lock.
+ */
+void
+aggsum_fini(aggsum_t *as)
+{
+	struct aggsum_bucket *end = as->as_buckets + as->as_numbuckets;
+
+	for (struct aggsum_bucket *bucket = as->as_buckets; bucket != end;
+	    bucket++) {
+		mutex_destroy(&bucket->asc_lock);
+	}
+
+	kmem_free(as->as_buckets, as->as_numbuckets * sizeof (aggsum_bucket_t));
+	mutex_destroy(&as->as_lock);
+}
+
+/*
+ * Return the current lower bound of the aggsum. No lock is taken, so the
+ * result is only a snapshot.
+ */
+int64_t
+aggsum_lower_bound(aggsum_t *as)
+{
+	int64_t bound = as->as_lower_bound;
+
+	return (bound);
+}
+
+/*
+ * Return the current upper bound of the aggsum. No lock is taken, so the
+ * result is only a snapshot.
+ */
+int64_t
+aggsum_upper_bound(aggsum_t *as)
+{
+	int64_t bound = as->as_upper_bound;
+
+	return (bound);
+}
+
+/*
+ * Fold one bucket back into the core counter and reset the bucket.
+ *
+ * The bucket's delta is applied to both bounds, and the amount the bucket had
+ * borrowed is handed back (removed from the upper bound, restored to the lower
+ * bound). On return the bucket's delta and borrowed amount are both zero.
+ *
+ * The caller must hold both the core lock (as_lock) and the bucket's lock.
+ */
+static void
+aggsum_flush_bucket(aggsum_t *as, struct aggsum_bucket *asb)
+{
+ ASSERT(MUTEX_HELD(&as->as_lock));
+ ASSERT(MUTEX_HELD(&asb->asc_lock));
+
+ /*
+ * We use atomic instructions for this because we read the upper and
+ * lower bounds without the lock, so we need stores to be atomic.
+ */
+ /* Apply the bucket's accumulated change to both bounds. */
+ atomic_add_64((volatile uint64_t *)&as->as_lower_bound, asb->asc_delta);
+ atomic_add_64((volatile uint64_t *)&as->as_upper_bound, asb->asc_delta);
+ asb->asc_delta = 0;
+ /* Return the borrowed slack: shrink the upper bound ... */
+ atomic_add_64((volatile uint64_t *)&as->as_upper_bound,
+ -asb->asc_borrowed);
+ /* ... and raise the lower bound by the same amount. */
+ atomic_add_64((volatile uint64_t *)&as->as_lower_bound,
+ asb->asc_borrowed);
+ asb->asc_borrowed = 0;
+}
+
+/*
+ * Return the precise value of the aggsum.
+ *
+ * With as_lock held, if the two bounds already agree there is nothing to
+ * flush and the bound is the answer. Otherwise every bucket is flushed (each
+ * under its own lock), after which the bounds must be equal.
+ */
+uint64_t
+aggsum_value(aggsum_t *as)
+{
+	int64_t total;
+
+	mutex_enter(&as->as_lock);
+	if (as->as_lower_bound != as->as_upper_bound) {
+		/* Some bucket holds a delta or a borrow; collapse them all. */
+		for (int b = 0; b < as->as_numbuckets; b++) {
+			struct aggsum_bucket *bucket = &as->as_buckets[b];
+
+			mutex_enter(&bucket->asc_lock);
+			aggsum_flush_bucket(as, bucket);
+			mutex_exit(&bucket->asc_lock);
+		}
+		VERIFY3U(as->as_lower_bound, ==, as->as_upper_bound);
+	} else {
+		/* Bounds agree, so every bucket must already be empty. */
+		for (int b = 0; b < as->as_numbuckets; b++) {
+			ASSERT0(as->as_buckets[b].asc_delta);
+			ASSERT0(as->as_buckets[b].asc_borrowed);
+		}
+	}
+	total = as->as_lower_bound;
+	mutex_exit(&as->as_lock);
+
+	return (total);
+}
+
+/*
+ * Flush the bucket and then borrow |delta| of headroom for it from the core
+ * counter: the upper bound grows by |delta|, the lower bound shrinks by
+ * |delta|, and the bucket records |delta| as its borrowed amount.
+ *
+ * Takes as_lock and then the bucket lock; both are dropped before returning.
+ */
+static void
+aggsum_borrow(aggsum_t *as, int64_t delta, struct aggsum_bucket *asb)
+{
+	int64_t magnitude = delta;
+
+	if (magnitude < 0)
+		magnitude = -magnitude;
+
+	mutex_enter(&as->as_lock);
+	mutex_enter(&asb->asc_lock);
+
+	/* Start from a clean bucket so borrowed amounts never pile up. */
+	aggsum_flush_bucket(as, asb);
+
+	/* Widen the core bounds by the amount this bucket may now absorb. */
+	atomic_add_64((volatile uint64_t *)&as->as_upper_bound, magnitude);
+	atomic_add_64((volatile uint64_t *)&as->as_lower_bound, -magnitude);
+	asb->asc_borrowed = magnitude;
+
+	mutex_exit(&asb->asc_lock);
+	mutex_exit(&as->as_lock);
+}
+
+/*
+ * Add delta (which may be negative) to the aggsum.
+ *
+ * The bucket is chosen from the current CPU's sequence id. If the bucket's
+ * delta would stay within the amount it has borrowed, only the bucket is
+ * touched. Otherwise we borrow aggsum_borrow_multiplier times the request
+ * from the core counter and try again.
+ */
+void
+aggsum_add(aggsum_t *as, int64_t delta)
+{
+	struct aggsum_bucket *bucket =
+	    &as->as_buckets[CPU_SEQID % as->as_numbuckets];
+
+	for (;;) {
+		int64_t proposed, limit;
+
+		mutex_enter(&bucket->asc_lock);
+		proposed = bucket->asc_delta + delta;
+		limit = (int64_t)bucket->asc_borrowed;
+		if (proposed > limit || proposed < -limit) {
+			/* Not enough borrowed headroom; get more and retry. */
+			mutex_exit(&bucket->asc_lock);
+			aggsum_borrow(as, delta * aggsum_borrow_multiplier,
+			    bucket);
+			continue;
+		}
+		bucket->asc_delta = proposed;
+		mutex_exit(&bucket->asc_lock);
+		return;
+	}
+}
+
+/*
+ * Compare the aggsum value to target efficiently. Returns -1 if the value
+ * represented by the aggsum is less than target, 1 if it's greater, and 0 if
+ * they are equal.
+ *
+ * The bounds are checked first without taking any lock. Only if target lies
+ * between them do we take as_lock and flush buckets one at a time, checking
+ * the bounds again after each flush.
+ *
+ * The lower bound is treated as a signed quantity: aggsum_borrow() subtracts
+ * the borrowed amount from it, so it can drop below zero while the real
+ * value is small. Comparing it directly against the unsigned target would
+ * convert a negative bound into a huge unsigned number and wrongly report
+ * "greater", so a negative lower bound is never considered above target.
+ */
+int
+aggsum_compare(aggsum_t *as, uint64_t target)
+{
+	int64_t lb;
+
+	if (as->as_upper_bound < target)
+		return (-1);
+	lb = as->as_lower_bound;
+	if (lb >= 0 && (uint64_t)lb > target)
+		return (1);
+	mutex_enter(&as->as_lock);
+	for (int i = 0; i < as->as_numbuckets; i++) {
+		struct aggsum_bucket *asb = &as->as_buckets[i];
+		mutex_enter(&asb->asc_lock);
+		aggsum_flush_bucket(as, asb);
+		mutex_exit(&asb->asc_lock);
+		if (as->as_upper_bound < target) {
+			mutex_exit(&as->as_lock);
+			return (-1);
+		}
+		lb = as->as_lower_bound;
+		if (lb >= 0 && (uint64_t)lb > target) {
+			mutex_exit(&as->as_lock);
+			return (1);
+		}
+	}
+	/* Every bucket is flushed, so the bounds have converged on target. */
+	VERIFY3U(as->as_lower_bound, ==, as->as_upper_bound);
+	ASSERT3U(as->as_lower_bound, ==, target);
+	mutex_exit(&as->as_lock);
+	return (0);
+}
#include <zfs_fletcher.h>
#include <sys/arc_impl.h>
#include <sys/trace_arc.h>
+#include <sys/aggsum.h>
+#include <sys/cityhash.h>
#ifndef _KERNEL
/* set with ZFS_DEBUG=watch, to enable watchpoints on frozen buffers */
kstat_named_t arcstat_c;
kstat_named_t arcstat_c_min;
kstat_named_t arcstat_c_max;
+ /* Not updated directly; only synced in arc_kstat_update. */
kstat_named_t arcstat_size;
/*
* Number of compressed bytes stored in the arc_buf_hdr_t's b_pabd.
* (allocated via arc_buf_hdr_t_full and arc_buf_hdr_t_l2only
* caches), and arc_buf_t structures (allocated via arc_buf_t
* cache).
+ * Not updated directly; only synced in arc_kstat_update.
*/
kstat_named_t arcstat_hdr_size;
/*
* Number of bytes consumed by ARC buffers of type equal to
* ARC_BUFC_DATA. This is generally consumed by buffers backing
* on disk user data (e.g. plain file contents).
+ * Not updated directly; only synced in arc_kstat_update.
*/
kstat_named_t arcstat_data_size;
/*
* ARC_BUFC_METADATA. This is generally consumed by buffers
* backing on disk data that is used for internal ZFS
* structures (e.g. ZAP, dnode, indirect blocks, etc).
+ * Not updated directly; only synced in arc_kstat_update.
*/
kstat_named_t arcstat_metadata_size;
/*
* Number of bytes consumed by dmu_buf_impl_t objects.
+ * Not updated directly; only synced in arc_kstat_update.
*/
kstat_named_t arcstat_dbuf_size;
/*
* Number of bytes consumed by dnode_t objects.
+ * Not updated directly; only synced in arc_kstat_update.
*/
kstat_named_t arcstat_dnode_size;
/*
* Number of bytes consumed by bonus buffers.
+ * Not updated directly; only synced in arc_kstat_update.
*/
kstat_named_t arcstat_bonus_size;
/*
* arc_anon state. This includes *all* buffers in the arc_anon
* state; e.g. data, metadata, evictable, and unevictable buffers
* are all included in this value.
+ * Not updated directly; only synced in arc_kstat_update.
*/
kstat_named_t arcstat_anon_size;
/*
* following criteria: backing buffers of type ARC_BUFC_DATA,
* residing in the arc_anon state, and are eligible for eviction
* (e.g. have no outstanding holds on the buffer).
+ * Not updated directly; only synced in arc_kstat_update.
*/
kstat_named_t arcstat_anon_evictable_data;
/*
* following criteria: backing buffers of type ARC_BUFC_METADATA,
* residing in the arc_anon state, and are eligible for eviction
* (e.g. have no outstanding holds on the buffer).
+ * Not updated directly; only synced in arc_kstat_update.
*/
kstat_named_t arcstat_anon_evictable_metadata;
/*
* arc_mru state. This includes *all* buffers in the arc_mru
* state; e.g. data, metadata, evictable, and unevictable buffers
* are all included in this value.
+ * Not updated directly; only synced in arc_kstat_update.
*/
kstat_named_t arcstat_mru_size;
/*
* following criteria: backing buffers of type ARC_BUFC_DATA,
* residing in the arc_mru state, and are eligible for eviction
* (e.g. have no outstanding holds on the buffer).
+ * Not updated directly; only synced in arc_kstat_update.
*/
kstat_named_t arcstat_mru_evictable_data;
/*
* following criteria: backing buffers of type ARC_BUFC_METADATA,
* residing in the arc_mru state, and are eligible for eviction
* (e.g. have no outstanding holds on the buffer).
+ * Not updated directly; only synced in arc_kstat_update.
*/
kstat_named_t arcstat_mru_evictable_metadata;
/*
* don't actually have ARC buffers linked off of these headers.
* Thus, *if* the headers had associated ARC buffers, these
* buffers *would have* consumed this number of bytes.
+ * Not updated directly; only synced in arc_kstat_update.
*/
kstat_named_t arcstat_mru_ghost_size;
/*
* Number of bytes that *would have been* consumed by ARC
* buffers that are eligible for eviction, of type
* ARC_BUFC_DATA, and linked off the arc_mru_ghost state.
+ * Not updated directly; only synced in arc_kstat_update.
*/
kstat_named_t arcstat_mru_ghost_evictable_data;
/*
* Number of bytes that *would have been* consumed by ARC
* buffers that are eligible for eviction, of type
* ARC_BUFC_METADATA, and linked off the arc_mru_ghost state.
+ * Not updated directly; only synced in arc_kstat_update.
*/
kstat_named_t arcstat_mru_ghost_evictable_metadata;
/*
* arc_mfu state. This includes *all* buffers in the arc_mfu
* state; e.g. data, metadata, evictable, and unevictable buffers
* are all included in this value.
+ * Not updated directly; only synced in arc_kstat_update.
*/
kstat_named_t arcstat_mfu_size;
/*
* Number of bytes consumed by ARC buffers that are eligible for
* eviction, of type ARC_BUFC_DATA, and reside in the arc_mfu
* state.
+ * Not updated directly; only synced in arc_kstat_update.
*/
kstat_named_t arcstat_mfu_evictable_data;
/*
* Number of bytes consumed by ARC buffers that are eligible for
* eviction, of type ARC_BUFC_METADATA, and reside in the
* arc_mfu state.
+ * Not updated directly; only synced in arc_kstat_update.
*/
kstat_named_t arcstat_mfu_evictable_metadata;
/*
* Total number of bytes that *would have been* consumed by ARC
* buffers in the arc_mfu_ghost state. See the comment above
* arcstat_mru_ghost_size for more details.
+ * Not updated directly; only synced in arc_kstat_update.
*/
kstat_named_t arcstat_mfu_ghost_size;
/*
* Number of bytes that *would have been* consumed by ARC
* buffers that are eligible for eviction, of type
* ARC_BUFC_DATA, and linked off the arc_mfu_ghost state.
+ * Not updated directly; only synced in arc_kstat_update.
*/
kstat_named_t arcstat_mfu_ghost_evictable_data;
/*
* Number of bytes that *would have been* consumed by ARC
* buffers that are eligible for eviction, of type
* ARC_BUFC_METADATA, and linked off the arc_mru_ghost state.
+ * Not updated directly; only synced in arc_kstat_update.
*/
kstat_named_t arcstat_mfu_ghost_evictable_metadata;
kstat_named_t arcstat_l2_hits;
kstat_named_t arcstat_l2_io_error;
kstat_named_t arcstat_l2_lsize;
kstat_named_t arcstat_l2_psize;
+ /* Not updated directly; only synced in arc_kstat_update. */
kstat_named_t arcstat_l2_hdr_size;
kstat_named_t arcstat_memory_throttle_count;
kstat_named_t arcstat_memory_direct_count;
kstat_named_t arcstat_tempreserve;
kstat_named_t arcstat_loaned_bytes;
kstat_named_t arcstat_prune;
+ /* Not updated directly; only synced in arc_kstat_update. */
kstat_named_t arcstat_meta_used;
kstat_named_t arcstat_meta_limit;
kstat_named_t arcstat_dnode_limit;
* the possibility of inconsistency by having shadow copies of the variables,
* while still allowing the code to be readable.
*/
-#define arc_size ARCSTAT(arcstat_size) /* actual total arc size */
#define arc_p ARCSTAT(arcstat_p) /* target size of MRU */
#define arc_c ARCSTAT(arcstat_c) /* target size of cache */
#define arc_c_min ARCSTAT(arcstat_c_min) /* min target cache size */
#define arc_meta_limit ARCSTAT(arcstat_meta_limit) /* max size for metadata */
#define arc_dnode_limit ARCSTAT(arcstat_dnode_limit) /* max size for dnodes */
#define arc_meta_min ARCSTAT(arcstat_meta_min) /* min size for metadata */
-#define arc_meta_used ARCSTAT(arcstat_meta_used) /* size of metadata */
#define arc_meta_max ARCSTAT(arcstat_meta_max) /* max size of metadata */
-#define arc_dbuf_size ARCSTAT(arcstat_dbuf_size) /* dbuf metadata */
-#define arc_dnode_size ARCSTAT(arcstat_dnode_size) /* dnode metadata */
-#define arc_bonus_size ARCSTAT(arcstat_bonus_size) /* bonus buffer metadata */
#define arc_need_free ARCSTAT(arcstat_need_free) /* bytes to be freed */
#define arc_sys_free ARCSTAT(arcstat_sys_free) /* target system free bytes */
/* number of bytes in the arc from arc_buf_t's */
#define arc_overhead_size ARCSTAT(arcstat_overhead_size)
+/*
+ * There are also some ARC variables that we want to export, but that are
+ * updated so often that having the canonical representation be the statistic
+ * variable causes a performance bottleneck. We want to use aggsum_t's for these
+ * instead, but still be able to export the kstat in the same way as before.
+ * The solution is to always use the aggsum version, except in the kstat update
+ * callback.
+ */
+aggsum_t arc_size;
+aggsum_t arc_meta_used;
+aggsum_t astat_data_size;
+aggsum_t astat_metadata_size;
+aggsum_t astat_dbuf_size;
+aggsum_t astat_dnode_size;
+aggsum_t astat_bonus_size;
+aggsum_t astat_hdr_size;
+aggsum_t astat_l2_hdr_size;
+
static list_t arc_prune_list;
static kmutex_t arc_prune_mtx;
static taskq_t *arc_prune_taskq;
static boolean_t l2arc_write_eligible(uint64_t, arc_buf_hdr_t *);
static void l2arc_read_done(zio_t *);
+
+/*
+ * We use Cityhash for this. It's fast, and has good hash properties without
+ * requiring any large static buffers.
+ */
static uint64_t
buf_hash(uint64_t spa, const dva_t *dva, uint64_t birth)
{
- uint8_t *vdva = (uint8_t *)dva;
- uint64_t crc = -1ULL;
- int i;
-
- ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
-
- for (i = 0; i < sizeof (dva_t); i++)
- crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ vdva[i]) & 0xFF];
-
- crc ^= (spa>>8) ^ birth;
-
- return (crc);
+ return (cityhash4(spa, dva->dva_word[0], dva->dva_word[1], birth));
}
#define HDR_EMPTY(hdr) \
default:
break;
case ARC_SPACE_DATA:
- ARCSTAT_INCR(arcstat_data_size, space);
+ aggsum_add(&astat_data_size, space);
break;
case ARC_SPACE_META:
- ARCSTAT_INCR(arcstat_metadata_size, space);
+ aggsum_add(&astat_metadata_size, space);
break;
case ARC_SPACE_BONUS:
- ARCSTAT_INCR(arcstat_bonus_size, space);
+ aggsum_add(&astat_bonus_size, space);
break;
case ARC_SPACE_DNODE:
- ARCSTAT_INCR(arcstat_dnode_size, space);
+ aggsum_add(&astat_dnode_size, space);
break;
case ARC_SPACE_DBUF:
- ARCSTAT_INCR(arcstat_dbuf_size, space);
+ aggsum_add(&astat_dbuf_size, space);
break;
case ARC_SPACE_HDRS:
- ARCSTAT_INCR(arcstat_hdr_size, space);
+ aggsum_add(&astat_hdr_size, space);
break;
case ARC_SPACE_L2HDRS:
- ARCSTAT_INCR(arcstat_l2_hdr_size, space);
+ aggsum_add(&astat_l2_hdr_size, space);
break;
}
if (type != ARC_SPACE_DATA)
- ARCSTAT_INCR(arcstat_meta_used, space);
+ aggsum_add(&arc_meta_used, space);
- atomic_add_64(&arc_size, space);
+ aggsum_add(&arc_size, space);
}
void
default:
break;
case ARC_SPACE_DATA:
- ARCSTAT_INCR(arcstat_data_size, -space);
+ aggsum_add(&astat_data_size, -space);
break;
case ARC_SPACE_META:
- ARCSTAT_INCR(arcstat_metadata_size, -space);
+ aggsum_add(&astat_metadata_size, -space);
break;
case ARC_SPACE_BONUS:
- ARCSTAT_INCR(arcstat_bonus_size, -space);
+ aggsum_add(&astat_bonus_size, -space);
break;
case ARC_SPACE_DNODE:
- ARCSTAT_INCR(arcstat_dnode_size, -space);
+ aggsum_add(&astat_dnode_size, -space);
break;
case ARC_SPACE_DBUF:
- ARCSTAT_INCR(arcstat_dbuf_size, -space);
+ aggsum_add(&astat_dbuf_size, -space);
break;
case ARC_SPACE_HDRS:
- ARCSTAT_INCR(arcstat_hdr_size, -space);
+ aggsum_add(&astat_hdr_size, -space);
break;
case ARC_SPACE_L2HDRS:
- ARCSTAT_INCR(arcstat_l2_hdr_size, -space);
+ aggsum_add(&astat_l2_hdr_size, -space);
break;
}
if (type != ARC_SPACE_DATA) {
- ASSERT(arc_meta_used >= space);
- if (arc_meta_max < arc_meta_used)
- arc_meta_max = arc_meta_used;
- ARCSTAT_INCR(arcstat_meta_used, -space);
+ ASSERT(aggsum_compare(&arc_meta_used, space) >= 0);
+ /*
+ * We use the upper bound here rather than the precise value
+ * because the arc_meta_max value doesn't need to be
+ * precise. It's only consumed by humans via arcstats.
+ */
+ if (arc_meta_max < aggsum_upper_bound(&arc_meta_used))
+ arc_meta_max = aggsum_upper_bound(&arc_meta_used);
+ aggsum_add(&arc_meta_used, -space);
}
- ASSERT(arc_size >= space);
- atomic_add_64(&arc_size, -space);
+ ASSERT(aggsum_compare(&arc_size, space) >= 0);
+ aggsum_add(&arc_size, -space);
}
/*
* Request that 10% of the LRUs be scanned by the superblock
* shrinker.
*/
- if (type == ARC_BUFC_DATA && arc_dnode_size > arc_dnode_limit)
- arc_prune_async((arc_dnode_size - arc_dnode_limit) /
- sizeof (dnode_t) / zfs_arc_dnode_reduce_percent);
+ if (type == ARC_BUFC_DATA && aggsum_compare(&astat_dnode_size,
+ arc_dnode_limit) > 0) {
+ arc_prune_async((aggsum_upper_bound(&astat_dnode_size) -
+ arc_dnode_limit) / sizeof (dnode_t) /
+ zfs_arc_dnode_reduce_percent);
+ }
/*
* Start eviction using a randomly selected sublist,
*
* Therefore, this function has been updated to make alternating passes
* over the ARC releasing data buffers and then newly unheld meta data
- * buffers. This ensures forward progress is maintained and arc_meta_used
+ * buffers. This ensures forward progress is maintained and meta_used
* will decrease. Normally this is sufficient, but if required the ARC
* will call the registered prune callbacks causing dentry and inodes to
* be dropped from the VFS cache. This will make dnode meta data buffers
* available for reclaim.
*/
static uint64_t
-arc_adjust_meta_balanced(void)
+arc_adjust_meta_balanced(uint64_t meta_used)
{
int64_t delta, prune = 0, adjustmnt;
uint64_t total_evicted = 0;
* metadata from the MFU. I think we probably need to implement a
* "metadata arc_p" value to do this properly.
*/
- adjustmnt = arc_meta_used - arc_meta_limit;
+ adjustmnt = meta_used - arc_meta_limit;
if (adjustmnt > 0 && refcount_count(&arc_mru->arcs_esize[type]) > 0) {
delta = MIN(refcount_count(&arc_mru->arcs_esize[type]),
total_evicted += arc_adjust_impl(arc_mfu, 0, delta, type);
}
- adjustmnt = arc_meta_used - arc_meta_limit;
+ adjustmnt = meta_used - arc_meta_limit;
if (adjustmnt > 0 &&
refcount_count(&arc_mru_ghost->arcs_esize[type]) > 0) {
* meta buffers. Requests to the upper layers will be made with
* increasingly large scan sizes until the ARC is below the limit.
*/
- if (arc_meta_used > arc_meta_limit) {
+ if (meta_used > arc_meta_limit) {
if (type == ARC_BUFC_DATA) {
type = ARC_BUFC_METADATA;
} else {
* capped by the arc_meta_limit tunable.
*/
static uint64_t
-arc_adjust_meta_only(void)
+arc_adjust_meta_only(uint64_t meta_used)
{
uint64_t total_evicted = 0;
int64_t target;
* we're over the meta limit more than we're over arc_p, we
* evict some from the MRU here, and some from the MFU below.
*/
- target = MIN((int64_t)(arc_meta_used - arc_meta_limit),
+ target = MIN((int64_t)(meta_used - arc_meta_limit),
(int64_t)(refcount_count(&arc_anon->arcs_size) +
refcount_count(&arc_mru->arcs_size) - arc_p));
* below the meta limit, but not so much as to drop us below the
* space allotted to the MFU (which is defined as arc_c - arc_p).
*/
- target = MIN((int64_t)(arc_meta_used - arc_meta_limit),
- (int64_t)(refcount_count(&arc_mfu->arcs_size) - (arc_c - arc_p)));
+ target = MIN((int64_t)(meta_used - arc_meta_limit),
+ (int64_t)(refcount_count(&arc_mfu->arcs_size) -
+ (arc_c - arc_p)));
total_evicted += arc_adjust_impl(arc_mfu, 0, target, ARC_BUFC_METADATA);
}
static uint64_t
-arc_adjust_meta(void)
+arc_adjust_meta(uint64_t meta_used)
{
if (zfs_arc_meta_strategy == ARC_STRATEGY_META_ONLY)
- return (arc_adjust_meta_only());
+ return (arc_adjust_meta_only(meta_used));
else
- return (arc_adjust_meta_balanced());
+ return (arc_adjust_meta_balanced(meta_used));
}
/*
uint64_t total_evicted = 0;
uint64_t bytes;
int64_t target;
+ uint64_t asize = aggsum_value(&arc_size);
+ uint64_t ameta = aggsum_value(&arc_meta_used);
/*
* If we're over arc_meta_limit, we want to correct that before
* potentially evicting data buffers below.
*/
- total_evicted += arc_adjust_meta();
+ total_evicted += arc_adjust_meta(ameta);
/*
* Adjust MRU size
* the MRU is over arc_p, we'll evict enough to get back to
* arc_p here, and then evict more from the MFU below.
*/
- target = MIN((int64_t)(arc_size - arc_c),
+ target = MIN((int64_t)(asize - arc_c),
(int64_t)(refcount_count(&arc_anon->arcs_size) +
- refcount_count(&arc_mru->arcs_size) + arc_meta_used - arc_p));
+ refcount_count(&arc_mru->arcs_size) + ameta - arc_p));
/*
* If we're below arc_meta_min, always prefer to evict data.
* type, spill over into the next type.
*/
if (arc_adjust_type(arc_mru) == ARC_BUFC_METADATA &&
- arc_meta_used > arc_meta_min) {
+ ameta > arc_meta_min) {
bytes = arc_adjust_impl(arc_mru, 0, target, ARC_BUFC_METADATA);
total_evicted += bytes;
* size back to arc_p, if we're still above the target cache
* size, we evict the rest from the MFU.
*/
- target = arc_size - arc_c;
+ target = asize - arc_c;
if (arc_adjust_type(arc_mfu) == ARC_BUFC_METADATA &&
- arc_meta_used > arc_meta_min) {
+ ameta > arc_meta_min) {
bytes = arc_adjust_impl(arc_mfu, 0, target, ARC_BUFC_METADATA);
total_evicted += bytes;
void
arc_shrink(int64_t to_free)
{
+ uint64_t asize = aggsum_value(&arc_size);
uint64_t c = arc_c;
if (c > to_free && c - to_free > arc_c_min) {
arc_c = c - to_free;
atomic_add_64(&arc_p, -(arc_p >> arc_shrink_shift));
- if (arc_c > arc_size)
- arc_c = MAX(arc_size, arc_c_min);
+ if (asize < arc_c)
+ arc_c = MAX(asize, arc_c_min);
if (arc_p > arc_c)
arc_p = (arc_c >> 1);
ASSERT(arc_c >= arc_c_min);
arc_c = arc_c_min;
}
- if (arc_size > arc_c)
+ if (asize > arc_c)
(void) arc_adjust();
}
extern kmem_cache_t *range_seg_cache;
#ifdef _KERNEL
- if ((arc_meta_used >= arc_meta_limit) && zfs_arc_meta_prune) {
+ if ((aggsum_compare(&arc_meta_used, arc_meta_limit) >= 0) &&
+ zfs_arc_meta_prune) {
/*
* We are exceeding our meta-data cache limit.
* Prune some entries to release holds on meta-data.
* be helpful and could potentially cause us to enter an
* infinite loop.
*/
- if (arc_size <= arc_c || evicted == 0) {
+ if (aggsum_compare(&arc_size, arc_c) <= 0|| evicted == 0) {
/*
* We're either no longer overflowing, or we
* can't evict anything more, so we should wake
static uint64_t
arc_evictable_memory(void)
{
+ int64_t asize = aggsum_value(&arc_size);
uint64_t arc_clean =
refcount_count(&arc_mru->arcs_esize[ARC_BUFC_DATA]) +
refcount_count(&arc_mru->arcs_esize[ARC_BUFC_METADATA]) +
refcount_count(&arc_mfu->arcs_esize[ARC_BUFC_DATA]) +
refcount_count(&arc_mfu->arcs_esize[ARC_BUFC_METADATA]);
- uint64_t arc_dirty = MAX((int64_t)arc_size - (int64_t)arc_clean, 0);
+ uint64_t arc_dirty = MAX((int64_t)asize - (int64_t)arc_clean, 0);
/*
* Scale reported evictable memory in proportion to page cache, cap
if (arc_dirty >= min)
return (arc_clean);
- return (MAX((int64_t)arc_size - (int64_t)min, 0));
+ return (MAX((int64_t)asize - (int64_t)min, 0));
}
/*
* cache size, increment the target cache size
*/
ASSERT3U(arc_c, >=, 2ULL << SPA_MAXBLOCKSHIFT);
- if (arc_size >= arc_c - (2ULL << SPA_MAXBLOCKSHIFT)) {
+ if (aggsum_compare(&arc_size, arc_c - (2ULL << SPA_MAXBLOCKSHIFT)) >=
+ 0) {
atomic_add_64(&arc_c, (int64_t)bytes);
if (arc_c > arc_c_max)
arc_c = arc_c_max;
uint64_t overflow = MAX(SPA_MAXBLOCKSIZE,
arc_c >> zfs_arc_overflow_shift);
- return (arc_size >= arc_c + overflow);
+ /*
+ * We just compare the lower bound here for performance reasons. Our
+ * primary goals are to make sure that the arc never grows without
+ * bound, and that it can reach its maximum size. This check
+ * accomplishes both goals. The maximum amount we could run over by is
+ * 2 * aggsum_borrow_multiplier * NUM_CPUS * the average size of a block
+ * in the ARC. In practice, that's in the tens of MB, which is low
+ * enough to be safe.
+ */
+ return (aggsum_lower_bound(&arc_size) >= arc_c + overflow);
}
static abd_t *
* If we are growing the cache, and we are adding anonymous
* data, and we have outgrown arc_p, update arc_p
*/
- if (arc_size < arc_c && hdr->b_l1hdr.b_state == arc_anon &&
+ if (aggsum_compare(&arc_size, arc_c) < 0 &&
+ hdr->b_l1hdr.b_state == arc_anon &&
(refcount_count(&arc_anon->arcs_size) +
refcount_count(&arc_mru->arcs_size) > arc_p))
arc_p = MIN(arc_c, arc_p + size);
&as->arcstat_mfu_ghost_evictable_data,
&as->arcstat_mfu_ghost_evictable_metadata);
+ ARCSTAT(arcstat_size) = aggsum_value(&arc_size);
+ ARCSTAT(arcstat_meta_used) = aggsum_value(&arc_meta_used);
+ ARCSTAT(arcstat_data_size) = aggsum_value(&astat_data_size);
+ ARCSTAT(arcstat_metadata_size) =
+ aggsum_value(&astat_metadata_size);
+ ARCSTAT(arcstat_hdr_size) = aggsum_value(&astat_hdr_size);
+ ARCSTAT(arcstat_l2_hdr_size) = aggsum_value(&astat_l2_hdr_size);
+ ARCSTAT(arcstat_dbuf_size) = aggsum_value(&astat_dbuf_size);
+ ARCSTAT(arcstat_dnode_size) = aggsum_value(&astat_dnode_size);
+ ARCSTAT(arcstat_bonus_size) = aggsum_value(&astat_bonus_size);
+
as->arcstat_memory_all_bytes.value.ui64 =
arc_all_memory();
as->arcstat_memory_free_bytes.value.ui64 =
refcount_create(&arc_mfu_ghost->arcs_size);
refcount_create(&arc_l2c_only->arcs_size);
+ aggsum_init(&arc_meta_used, 0);
+ aggsum_init(&arc_size, 0);
+ aggsum_init(&astat_data_size, 0);
+ aggsum_init(&astat_metadata_size, 0);
+ aggsum_init(&astat_hdr_size, 0);
+ aggsum_init(&astat_l2_hdr_size, 0);
+ aggsum_init(&astat_bonus_size, 0);
+ aggsum_init(&astat_dnode_size, 0);
+ aggsum_init(&astat_dbuf_size, 0);
+
arc_anon->arcs_state = ARC_STATE_ANON;
arc_mru->arcs_state = ARC_STATE_MRU;
arc_mru_ghost->arcs_state = ARC_STATE_MRU_GHOST;
multilist_destroy(arc_mfu_ghost->arcs_list[ARC_BUFC_DATA]);
multilist_destroy(arc_l2c_only->arcs_list[ARC_BUFC_METADATA]);
multilist_destroy(arc_l2c_only->arcs_list[ARC_BUFC_DATA]);
+
+ aggsum_fini(&arc_meta_used);
+ aggsum_fini(&arc_size);
+ aggsum_fini(&astat_data_size);
+ aggsum_fini(&astat_metadata_size);
+ aggsum_fini(&astat_hdr_size);
+ aggsum_fini(&astat_l2_hdr_size);
+ aggsum_fini(&astat_bonus_size);
+ aggsum_fini(&astat_dnode_size);
+ aggsum_fini(&astat_dbuf_size);
}
uint64_t
arc_c = arc_c_max;
arc_p = (arc_c >> 1);
- arc_size = 0;
/* Set min to 1/2 of arc_c_min */
arc_meta_min = 1ULL << SPA_MAXBLOCKSHIFT;