*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012, Joyent, Inc. All rights reserved.
+ * Copyright (c) 2018, Joyent, Inc.
* Copyright (c) 2011, 2018 by Delphix. All rights reserved.
* Copyright (c) 2014 by Saso Kiselkov. All rights reserved.
- * Copyright 2015 Nexenta Systems, Inc. All rights reserved.
+ * Copyright 2017 Nexenta Systems, Inc. All rights reserved.
*/
/*
#endif
#include <sys/callb.h>
#include <sys/kstat.h>
-#include <sys/dmu_tx.h>
+#include <sys/zthr.h>
#include <zfs_fletcher.h>
#include <sys/arc_impl.h>
#include <sys/trace_arc.h>
boolean_t arc_watch = B_FALSE;
#endif
-static kmutex_t arc_reclaim_lock;
-static kcondvar_t arc_reclaim_thread_cv;
-static boolean_t arc_reclaim_thread_exit;
-static kcondvar_t arc_reclaim_waiters_cv;
+/*
+ * This thread's job is to keep enough free memory in the system, by
+ * calling arc_kmem_reap_soon() plus arc_reduce_target_size(), which improves
+ * arc_available_memory().
+ */
+static zthr_t *arc_reap_zthr;
+
+/*
+ * This thread's job is to keep arc_size under arc_c, by calling
+ * arc_adjust(), which improves arc_is_overflowing().
+ */
+static zthr_t *arc_adjust_zthr;
+
+static kmutex_t arc_adjust_lock;
+static kcondvar_t arc_adjust_waiters_cv;
+static boolean_t arc_adjust_needed = B_FALSE;
/*
* The number of headers to evict in arc_evict_state_impl() before
int zfs_arc_evict_batch_limit = 10;
/* number of seconds before growing cache again */
-static int arc_grow_retry = 5;
+static int arc_grow_retry = 5;
+
+/*
+ * Minimum time between calls to arc_kmem_reap_soon().
+ */
+int arc_kmem_cache_reap_retry_ms = 1000;
/* shift of arc_c for calculating overflow limit in arc_get_data_impl */
-int zfs_arc_overflow_shift = 8;
+int zfs_arc_overflow_shift = 8;
/* shift of arc_c for calculating both min and max arc_p */
-static int arc_p_min_shift = 4;
+int arc_p_min_shift = 4;
/* log2(fraction of arc to reclaim) */
-static int arc_shrink_shift = 7;
+static int arc_shrink_shift = 7;
/* percent of pagecache to reclaim arc to */
#ifdef _KERNEL
-static uint_t zfs_arc_pc_percent = 0;
+static uint_t zfs_arc_pc_percent = 0;
#endif
/*
*/
int arc_lotsfree_percent = 10;
-static int arc_dead;
+/*
+ * hdr_recl() uses this to determine if the arc is up and running.
+ */
+static boolean_t arc_initialized;
/*
* The arc has filled available memory and has now warmed up.
aggsum_t astat_hdr_size;
aggsum_t astat_l2_hdr_size;
+static hrtime_t arc_growtime;
static list_t arc_prune_list;
static kmutex_t arc_prune_mtx;
static taskq_t *arc_prune_taskq;
* umem calls the reclaim func when we destroy the buf cache,
* which is after we do arc_fini().
*/
- if (!arc_dead)
- cv_signal(&arc_reclaim_thread_cv);
+ if (arc_initialized)
+ zthr_wakeup(arc_reap_zthr);
}
static void
* function should proceed in this case).
*
* If threads are left sleeping, due to not
- * using cv_broadcast, they will be woken up
- * just before arc_reclaim_thread() sleeps.
+ * using cv_broadcast here, they will be woken
+ * up via cv_broadcast in arc_adjust_cb() just
+ * before arc_adjust_zthr sleeps.
*/
- mutex_enter(&arc_reclaim_lock);
+ mutex_enter(&arc_adjust_lock);
if (!arc_is_overflowing())
- cv_signal(&arc_reclaim_waiters_cv);
- mutex_exit(&arc_reclaim_lock);
+ cv_signal(&arc_adjust_waiters_cv);
+ mutex_exit(&arc_adjust_lock);
} else {
ARCSTAT_BUMP(arcstat_mutex_miss);
}
(void) arc_flush_state(arc_mfu_ghost, guid, ARC_BUFC_METADATA, retry);
}
-void
-arc_shrink(int64_t to_free)
+static void
+arc_reduce_target_size(int64_t to_free)
{
uint64_t asize = aggsum_value(&arc_size);
uint64_t c = arc_c;
arc_c = arc_c_min;
}
- if (asize > arc_c)
- (void) arc_adjust();
+ if (asize > arc_c) {
+ /* See comment in arc_adjust_cb_check() on why lock+flag */
+ mutex_enter(&arc_adjust_lock);
+ arc_adjust_needed = B_TRUE;
+ mutex_exit(&arc_adjust_lock);
+ zthr_wakeup(arc_adjust_zthr);
+ }
}
-
/*
* Return maximum amount of memory that we could possibly use. Reduced
* to half of all memory in user space which is primarily used for testing.
}
static void
-arc_kmem_reap_now(void)
+arc_kmem_reap_soon(void)
{
size_t i;
kmem_cache_t *prev_cache = NULL;
}
}
+/* ARGSUSED */
+static boolean_t
+arc_adjust_cb_check(void *arg, zthr_t *zthr)
+{
+ /*
+ * This is necessary in order to keep the kstat information
+ * up to date for tools that display kstat data such as the
+ * mdb ::arc dcmd and the Linux crash utility. These tools
+ * typically do not call kstat's update function, but simply
+ * dump out stats from the most recent update. Without
+ * this call, these commands may show stale stats for the
+ * anon, mru, mru_ghost, mfu, and mfu_ghost lists. Even
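+ * with this change, the data might be up to 1 second
+ * out of date (the arc_adjust_zthr has a maximum sleep
+ * time of 1 second); but that should suffice. The
+ * arc_state_t structures can be queried directly if more
+ * accurate information is needed.
+ *
+ * NOTE(review): the 1 second bound presumes arc_adjust_zthr
+ * is a timer zthr; arc_init() creates it with zthr_create()
+ * and only arc_reap_zthr with zthr_create_timer() -- confirm.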
+ */
+ if (arc_ksp != NULL)
+ arc_ksp->ks_update(arc_ksp, KSTAT_READ);
+
+ /*
+ * We have to rely on arc_get_data_impl() to tell us when to adjust,
+ * rather than checking if we are overflowing here, so that we are
+ * sure to not leave arc_get_data_impl() waiting on
+ * arc_adjust_waiters_cv. If we have become "not overflowing" since
+ * arc_get_data_impl() checked, we need to wake it up. We could
+ * broadcast the CV here, but arc_get_data_impl() may have not yet
+ * gone to sleep. We would need to use a mutex to ensure that this
+ * function doesn't broadcast until arc_get_data_impl() has gone to
+ * sleep (e.g. the arc_adjust_lock). However, the lock ordering of
+ * such a lock would necessarily be incorrect with respect to the
+ * zthr_lock, which is held before this function is called, and is
+ * held by arc_get_data_impl() when it calls zthr_wakeup().
+ */
+ return (arc_adjust_needed);
+}
+
/*
- * Threads can block in arc_get_data_impl() waiting for this thread to evict
- * enough data and signal them to proceed. When this happens, the threads in
- * arc_get_data_impl() are sleeping while holding the hash lock for their
- * particular arc header. Thus, we must be careful to never sleep on a
- * hash lock in this thread. This is to prevent the following deadlock:
- *
- * - Thread A sleeps on CV in arc_get_data_impl() holding hash lock "L",
- * waiting for the reclaim thread to signal it.
- *
- * - arc_reclaim_thread() tries to acquire hash lock "L" using mutex_enter,
- * fails, and goes to sleep forever.
- *
- * This possible deadlock is avoided by always acquiring a hash lock
- * using mutex_tryenter() from arc_reclaim_thread().
+ * Keep arc_size under arc_c by running arc_adjust which evicts data
+ * from the ARC.
*/
/* ARGSUSED */
-static void
-arc_reclaim_thread(void *unused)
+static int
+arc_adjust_cb(void *arg, zthr_t *zthr)
{
- fstrans_cookie_t cookie = spl_fstrans_mark();
- hrtime_t growtime = 0;
- callb_cpr_t cpr;
-
- CALLB_CPR_INIT(&cpr, &arc_reclaim_lock, callb_generic_cpr, FTAG);
+ uint64_t evicted = 0;
+ fstrans_cookie_t cookie = spl_fstrans_mark();
- mutex_enter(&arc_reclaim_lock);
- while (!arc_reclaim_thread_exit) {
- uint64_t evicted = 0;
- uint64_t need_free = arc_need_free;
- arc_tuning_update();
+ /* Evict from cache */
+ evicted = arc_adjust();
+ /*
+ * If evicted is zero, we couldn't evict anything
+ * via arc_adjust(). This could be due to hash lock
+ * collisions, but more likely due to the majority of
+ * arc buffers being unevictable. Therefore, even if
+ * arc_size is above arc_c, another pass is unlikely to
+ * be helpful and could potentially cause us to enter an
+ * infinite loop. Additionally, zthr_iscancelled() is
+ * checked here so that if the arc is shutting down, the
+ * broadcast will wake any remaining arc adjust waiters.
+ */
+ mutex_enter(&arc_adjust_lock);
+ arc_adjust_needed = !zthr_iscancelled(arc_adjust_zthr) &&
+ evicted > 0 && aggsum_compare(&arc_size, arc_c) > 0;
+ if (!arc_adjust_needed) {
/*
- * This is necessary in order for the mdb ::arc dcmd to
- * show up to date information. Since the ::arc command
- * does not call the kstat's update function, without
- * this call, the command may show stale stats for the
- * anon, mru, mru_ghost, mfu, and mfu_ghost lists. Even
- * with this change, the data might be up to 1 second
- * out of date; but that should suffice. The arc_state_t
- * structures can be queried directly if more accurate
- * information is needed.
+ * We're either no longer overflowing, or we
+ * can't evict anything more, so wake up any
+ * threads waiting in arc_get_data_impl() and
+ * clear arc_need_free, since nothing more will
+ * be done for it here.
*/
-#ifndef __linux__
- if (arc_ksp != NULL)
- arc_ksp->ks_update(arc_ksp, KSTAT_READ);
-#endif
- mutex_exit(&arc_reclaim_lock);
+ cv_broadcast(&arc_adjust_waiters_cv);
+ arc_need_free = 0;
+ }
+ mutex_exit(&arc_adjust_lock);
+ spl_fstrans_unmark(cookie);
+
+ return (0);
+}
+/* ARGSUSED */
+static boolean_t
+arc_reap_cb_check(void *arg, zthr_t *zthr)
+{
+ int64_t free_memory = arc_available_memory();
+
+ /*
+ * If a kmem reap is already active, don't schedule more. We must
+ * check for this because kmem_cache_reap_soon() won't actually
+ * block on the cache being reaped (this is to prevent callers from
+ * becoming implicitly blocked by a system-wide kmem reap -- which,
+ * on a system with many, many full magazines, can take minutes).
+ */
+ if (!kmem_cache_reap_active() && free_memory < 0) {
+
+ arc_no_grow = B_TRUE;
+ arc_warm = B_TRUE;
/*
- * We call arc_adjust() before (possibly) calling
- * arc_kmem_reap_now(), so that we can wake up
- * arc_get_data_buf() sooner.
+ * Wait at least arc_grow_retry (default 5) seconds
+ * before considering growing.
*/
- evicted = arc_adjust();
+ arc_growtime = gethrtime() + SEC2NSEC(arc_grow_retry);
+ return (B_TRUE);
+ } else if (free_memory < arc_c >> arc_no_grow_shift) {
+ arc_no_grow = B_TRUE;
+ } else if (gethrtime() >= arc_growtime) {
+ arc_no_grow = B_FALSE;
+ }
- int64_t free_memory = arc_available_memory();
- if (free_memory < 0) {
+ return (B_FALSE);
+}
- arc_no_grow = B_TRUE;
- arc_warm = B_TRUE;
+/*
+ * Keep enough free memory in the system by reaping the ARC's kmem
+ * caches. To cause more slabs to be reapable, we may reduce the
+ * target size of the cache (arc_c), causing the arc_adjust_cb()
+ * to free more buffers.
+ */
+/* ARGSUSED */
+static int
+arc_reap_cb(void *arg, zthr_t *zthr)
+{
+ int64_t free_memory;
+ fstrans_cookie_t cookie = spl_fstrans_mark();
- /*
- * Wait at least zfs_grow_retry (default 5) seconds
- * before considering growing.
- */
- growtime = gethrtime() + SEC2NSEC(arc_grow_retry);
+ /*
+ * Kick off asynchronous kmem_reap()'s of all our caches.
+ */
+ arc_kmem_reap_soon();
- arc_kmem_reap_now();
+ /*
+ * Wait at least arc_kmem_cache_reap_retry_ms between
+ * arc_kmem_reap_soon() calls. Without this delay it is possible to
+ * end up in a situation where we spend lots of time reaping
+ * caches, while we're near arc_c_min. Waiting here also gives the
+ * subsequent free memory check a chance of finding that the
+ * asynchronous reap has already freed enough memory, and we don't
+ * need to call arc_reduce_target_size().
+ */
+ delay((hz * arc_kmem_cache_reap_retry_ms + 999) / 1000);
- /*
- * If we are still low on memory, shrink the ARC
- * so that we have arc_shrink_min free space.
- */
- free_memory = arc_available_memory();
+ /*
+ * Reduce the target size as needed to maintain the amount of free
+ * memory in the system at a fraction of arc_c (1/128th by
+ * default, per arc_shrink_shift). If oversubscribed (free_memory < 0)
+ * then reduce the target size by the deficit amount plus the
+ * fractional amount. If free memory is positive but less than the
+ * fractional amount, reduce by what is needed to hit the fractional
+ * amount.
+ free_memory = arc_available_memory();
- int64_t to_free =
- (arc_c >> arc_shrink_shift) - free_memory;
- if (to_free > 0) {
+ int64_t to_free =
+ (arc_c >> arc_shrink_shift) - free_memory;
+ if (to_free > 0) {
#ifdef _KERNEL
- to_free = MAX(to_free, need_free);
+ to_free = MAX(to_free, arc_need_free);
#endif
- arc_shrink(to_free);
- }
- } else if (free_memory < arc_c >> arc_no_grow_shift) {
- arc_no_grow = B_TRUE;
- } else if (gethrtime() >= growtime) {
- arc_no_grow = B_FALSE;
- }
-
- mutex_enter(&arc_reclaim_lock);
-
- /*
- * If evicted is zero, we couldn't evict anything via
- * arc_adjust(). This could be due to hash lock
- * collisions, but more likely due to the majority of
- * arc buffers being unevictable. Therefore, even if
- * arc_size is above arc_c, another pass is unlikely to
- * be helpful and could potentially cause us to enter an
- * infinite loop.
- */
- if (aggsum_compare(&arc_size, arc_c) <= 0|| evicted == 0) {
- /*
- * We're either no longer overflowing, or we
- * can't evict anything more, so we should wake
- * up any threads before we go to sleep and remove
- * the bytes we were working on from arc_need_free
- * since nothing more will be done here.
- */
- cv_broadcast(&arc_reclaim_waiters_cv);
- ARCSTAT_INCR(arcstat_need_free, -need_free);
-
- /*
- * Block until signaled, or after one second (we
- * might need to perform arc_kmem_reap_now()
- * even if we aren't being signalled)
- */
- CALLB_CPR_SAFE_BEGIN(&cpr);
- (void) cv_timedwait_sig_hires(&arc_reclaim_thread_cv,
- &arc_reclaim_lock, SEC2NSEC(1), MSEC2NSEC(1), 0);
- CALLB_CPR_SAFE_END(&cpr, &arc_reclaim_lock);
- }
+ arc_reduce_target_size(to_free);
}
-
- arc_reclaim_thread_exit = B_FALSE;
- cv_broadcast(&arc_reclaim_thread_cv);
- CALLB_CPR_EXIT(&cpr); /* drops arc_reclaim_lock */
spl_fstrans_unmark(cookie);
- thread_exit();
+
+ return (0);
}
#ifdef _KERNEL
return (SHRINK_STOP);
/* Reclaim in progress */
- if (mutex_tryenter(&arc_reclaim_lock) == 0) {
+ if (mutex_tryenter(&arc_adjust_lock) == 0) {
ARCSTAT_INCR(arcstat_need_free, ptob(sc->nr_to_scan));
return (0);
}
- mutex_exit(&arc_reclaim_lock);
+ mutex_exit(&arc_adjust_lock);
/*
* Evict the requested number of pages by shrinking arc_c the
* requested amount.
*/
if (pages > 0) {
- arc_shrink(ptob(sc->nr_to_scan));
+ arc_reduce_target_size(ptob(sc->nr_to_scan));
if (current_is_kswapd())
- arc_kmem_reap_now();
+ arc_kmem_reap_soon();
#ifdef HAVE_SPLIT_SHRINKER_CALLBACK
pages = MAX((int64_t)pages -
(int64_t)btop(arc_evictable_memory()), 0);
/*
* We've shrunk what we can, wake up threads.
*/
- cv_broadcast(&arc_reclaim_waiters_cv);
+ cv_broadcast(&arc_adjust_waiters_cv);
} else
pages = SHRINK_STOP;
ARCSTAT_BUMP(arcstat_memory_indirect_count);
} else {
arc_no_grow = B_TRUE;
- arc_kmem_reap_now();
+ arc_kmem_reap_soon();
ARCSTAT_BUMP(arcstat_memory_direct_count);
}
}
ASSERT((int64_t)arc_p >= 0);
+ /*
+ * Wake reap thread if we do not have any available memory
+ */
if (arc_reclaim_needed()) {
- cv_signal(&arc_reclaim_thread_cv);
+ zthr_wakeup(arc_reap_zthr);
return;
}
* overflowing; thus we don't use a while loop here.
*/
if (arc_is_overflowing()) {
- mutex_enter(&arc_reclaim_lock);
+ mutex_enter(&arc_adjust_lock);
/*
* Now that we've acquired the lock, we may no longer be
* shouldn't cause any harm.
*/
if (arc_is_overflowing()) {
- cv_signal(&arc_reclaim_thread_cv);
- cv_wait(&arc_reclaim_waiters_cv, &arc_reclaim_lock);
+ arc_adjust_needed = B_TRUE;
+ zthr_wakeup(arc_adjust_zthr);
+ (void) cv_wait(&arc_adjust_waiters_cv,
+ &arc_adjust_lock);
}
-
- mutex_exit(&arc_reclaim_lock);
+ mutex_exit(&arc_adjust_lock);
}
VERIFY3U(hdr->b_type, ==, type);
arc_init(void)
{
uint64_t percent, allmem = arc_all_memory();
-
- mutex_init(&arc_reclaim_lock, NULL, MUTEX_DEFAULT, NULL);
- cv_init(&arc_reclaim_thread_cv, NULL, CV_DEFAULT, NULL);
- cv_init(&arc_reclaim_waiters_cv, NULL, CV_DEFAULT, NULL);
+ mutex_init(&arc_adjust_lock, NULL, MUTEX_DEFAULT, NULL);
+ cv_init(&arc_adjust_waiters_cv, NULL, CV_DEFAULT, NULL);
arc_min_prefetch_ms = 1000;
arc_min_prescient_prefetch_ms = 6000;
arc_c = arc_c_min;
arc_state_init();
+
+ /*
+ * The arc must be "uninitialized", so that hdr_recl() (which is
+ * registered by buf_init()) will not access arc_reap_zthr before
+ * it is created.
+ */
+ ASSERT(!arc_initialized);
buf_init();
list_create(&arc_prune_list, sizeof (arc_prune_t),
arc_prune_taskq = taskq_create("arc_prune", max_ncpus, defclsyspri,
max_ncpus, INT_MAX, TASKQ_PREPOPULATE | TASKQ_DYNAMIC);
- arc_reclaim_thread_exit = B_FALSE;
-
arc_ksp = kstat_create("zfs", 0, "arcstats", "misc", KSTAT_TYPE_NAMED,
sizeof (arc_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL);
kstat_install(arc_ksp);
}
- (void) thread_create(NULL, 0, arc_reclaim_thread, NULL, 0, &p0,
- TS_RUN, defclsyspri);
+ arc_adjust_zthr = zthr_create(arc_adjust_cb_check,
+ arc_adjust_cb, NULL);
+ arc_reap_zthr = zthr_create_timer(arc_reap_cb_check,
+ arc_reap_cb, NULL, SEC2NSEC(1));
- arc_dead = B_FALSE;
+ arc_initialized = B_TRUE;
arc_warm = B_FALSE;
/*
spl_unregister_shrinker(&arc_shrinker);
#endif /* _KERNEL */
- mutex_enter(&arc_reclaim_lock);
- arc_reclaim_thread_exit = B_TRUE;
- /*
- * The reclaim thread will set arc_reclaim_thread_exit back to
- * B_FALSE when it is finished exiting; we're waiting for that.
- */
- while (arc_reclaim_thread_exit) {
- cv_signal(&arc_reclaim_thread_cv);
- cv_wait(&arc_reclaim_thread_cv, &arc_reclaim_lock);
- }
- mutex_exit(&arc_reclaim_lock);
-
/* Use B_TRUE to ensure *all* buffers are evicted */
arc_flush(NULL, B_TRUE);
- arc_dead = B_TRUE;
+ arc_initialized = B_FALSE;
if (arc_ksp != NULL) {
kstat_delete(arc_ksp);
list_destroy(&arc_prune_list);
mutex_destroy(&arc_prune_mtx);
- mutex_destroy(&arc_reclaim_lock);
- cv_destroy(&arc_reclaim_thread_cv);
- cv_destroy(&arc_reclaim_waiters_cv);
+ (void) zthr_cancel(arc_adjust_zthr);
+ zthr_destroy(arc_adjust_zthr);
+
+ (void) zthr_cancel(arc_reap_zthr);
+ zthr_destroy(arc_reap_zthr);
+
+ mutex_destroy(&arc_adjust_lock);
+ cv_destroy(&arc_adjust_waiters_cv);
/*
* buf_fini() must proceed arc_state_fini() because buf_fin() may