From: Matthew Ahrens Date: Fri, 26 Jun 2015 18:28:18 +0000 (-0700) Subject: Illumos 5376 - arc_kmem_reap_now() should not result in clearing arc_no_grow X-Git-Tag: zfs-0.6.5~55 X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=ca67b33abadd2893755735130c11faa5df756989;p=zfs Illumos 5376 - arc_kmem_reap_now() should not result in clearing arc_no_grow 5376 arc_kmem_reap_now() should not result in clearing arc_no_grow Reviewed by: Christopher Siden Reviewed by: George Wilson Reviewed by: Steven Hartland Reviewed by: Richard Elling Approved by: Dan McDonald References: https://www.illumos.org/issues/5376 https://github.com/illumos/illumos-gate/commit/2ec99e3 Porting Notes: The good news is that many of the recent changes made upstream to the ARC tackled issues previously observed by ZoL with similar solutions. The bad news is those solution weren't identical to the ones we applied. This patch is designed to split the difference and apply as much of the upstream work as possible. * The arc_available_memory() function was removed previous in ZoL but due to the upstream changes it makes sense to add it back. This function has been customized for Linux so that it can be used to determine a low memory. This provides the same basic functionality as the illumos version allowing us to minimize changes through the rest of the code base. The exact mechanism used to detect a low memory state remains unchanged so this change isn't a significant as it might first appear. * This patch includes the long standing fix for arc_shrink() which was originally proposed in #2167. Since there were related changes to this function it made sense to include that work. * The arc_init() function has been re-factored. As before it sets sane default values for the ARC but then calls arc_tuning_update() to apply user specific tuning made via module options. The arc_tuning_update() function is then called periodically by the arc_reclaim_thread() to apply changes to the tunings made during normal operation. Ported-by: Brian Behlendorf Closes #3616 Closes #2167 --- diff --git a/include/sys/zfs_context.h b/include/sys/zfs_context.h index acd370b23..7652a9cae 100644 --- a/include/sys/zfs_context.h +++ b/include/sys/zfs_context.h @@ -410,9 +410,12 @@ extern void kstat_set_raw_ops(kstat_t *ksp, #define kmem_debugging() 0 #define kmem_cache_reap_now(_c) umem_cache_reap_now(_c); #define kmem_cache_set_move(_c, _cb) /* nothing */ +#define vmem_qcache_reap(_v) /* nothing */ #define POINTER_INVALIDATE(_pp) /* nothing */ #define POINTER_IS_VALID(_p) 0 +extern vmem_t *zio_arena; + typedef umem_cache_t kmem_cache_t; typedef enum kmem_cbrc { @@ -610,7 +613,7 @@ extern void delay(clock_t ticks); } while (0); #define max_ncpus 64 -#define num_online_cpus() (sysconf(_SC_NPROCESSORS_ONLN)) +#define boot_ncpus (sysconf(_SC_NPROCESSORS_ONLN)) #define minclsyspri 60 #define maxclsyspri 99 diff --git a/lib/libzpool/kernel.c b/lib/libzpool/kernel.c index 85fe24afd..80da41151 100644 --- a/lib/libzpool/kernel.c +++ b/lib/libzpool/kernel.c @@ -48,6 +48,7 @@ uint64_t physmem; vnode_t *rootdir = (vnode_t *)0xabcd1234; char hw_serial[HW_HOSTID_LEN]; struct utsname hw_utsname; +vmem_t *zio_arena = NULL; /* this only exists to have its address taken */ struct proc p0; diff --git a/module/zfs/arc.c b/module/zfs/arc.c index df7845b2f..d734a170f 100644 --- a/module/zfs/arc.c +++ b/module/zfs/arc.c @@ -164,17 +164,6 @@ static kmutex_t arc_user_evicts_lock; static kcondvar_t arc_user_evicts_cv; static boolean_t arc_user_evicts_thread_exit; -/* number of objects to prune from caches when arc_meta_limit is reached */ -int zfs_arc_meta_prune = 10000; - -/* The preferred strategy to employ when arc_meta_limit is reached */ -int zfs_arc_meta_strategy = ARC_STRATEGY_META_BALANCED; - -typedef enum arc_reclaim_strategy { - ARC_RECLAIM_AGGR, /* Aggressive reclaim strategy */ - ARC_RECLAIM_CONS /* Conservative reclaim strategy */ -} arc_reclaim_strategy_t; - /* * The number of headers to evict in arc_evict_state_impl() before * dropping the sublist lock and evicting from another sublist. A lower @@ -192,40 +181,31 @@ int zfs_arc_evict_batch_limit = 10; int zfs_arc_num_sublists_per_state = 0; /* number of seconds before growing cache again */ -int zfs_arc_grow_retry = 5; +static int arc_grow_retry = 5; /* shift of arc_c for calculating overflow limit in arc_get_data_buf */ -int zfs_arc_overflow_shift = 8; - -/* disable anon data aggressively growing arc_p */ -int zfs_arc_p_aggressive_disable = 1; - -/* disable arc_p adapt dampener in arc_adapt */ -int zfs_arc_p_dampener_disable = 1; +int zfs_arc_overflow_shift = 8; /* log2(fraction of arc to reclaim) */ -int zfs_arc_shrink_shift = 5; +static int arc_shrink_shift = 7; /* - * minimum lifespan of a prefetch block in clock ticks - * (initialized in arc_init()) + * log2(fraction of ARC which must be free to allow growing). + * I.e. If there is less than arc_c >> arc_no_grow_shift free memory, + * when reading a new block into the ARC, we will evict an equal-sized block + * from the ARC. + * + * This must be less than arc_shrink_shift, so that when we shrink the ARC, + * we will still not allow it to grow. */ -int zfs_arc_min_prefetch_lifespan = HZ; +int arc_no_grow_shift = 5; -/* disable arc proactive arc throttle due to low memory */ -int zfs_arc_memory_throttle_disable = 1; - -/* disable duplicate buffer eviction */ -int zfs_disable_dup_eviction = 0; - -/* average block used to size buf_hash_table */ -int zfs_arc_average_blocksize = 8 * 1024; /* 8KB */ /* * minimum lifespan of a prefetch block in clock ticks * (initialized in arc_init()) */ -static int arc_min_prefetch_lifespan; +static int arc_min_prefetch_lifespan; /* * If this percent of memory is free, don't throttle. @@ -234,9 +214,6 @@ int arc_lotsfree_percent = 10; static int arc_dead; -/* expiration time for arc_no_grow */ -static clock_t arc_grow_time = 0; - /* * The arc has filled available memory and has now warmed up. */ @@ -249,11 +226,21 @@ unsigned long zfs_arc_max = 0; unsigned long zfs_arc_min = 0; unsigned long zfs_arc_meta_limit = 0; unsigned long zfs_arc_meta_min = 0; +int zfs_arc_grow_retry = 0; +int zfs_arc_shrink_shift = 0; +int zfs_disable_dup_eviction = 0; +int zfs_arc_average_blocksize = 8 * 1024; /* 8KB */ /* - * Limit the number of restarts in arc_adjust_meta() + * These tunables are Linux specific */ -unsigned long zfs_arc_meta_adjust_restarts = 4096; +int zfs_arc_memory_throttle_disable = 1; +int zfs_arc_min_prefetch_lifespan = 0; +int zfs_arc_p_aggressive_disable = 1; +int zfs_arc_p_dampener_disable = 1; +int zfs_arc_meta_prune = 10000; +int zfs_arc_meta_strategy = ARC_STRATEGY_META_BALANCED; +int zfs_arc_meta_adjust_restarts = 4096; /* The 6 states: */ static arc_state_t ARC_anon; @@ -689,6 +676,7 @@ static void arc_get_data_buf(arc_buf_t *); static void arc_access(arc_buf_hdr_t *, kmutex_t *); static boolean_t arc_is_overflowing(void); static void arc_buf_watch(arc_buf_t *); +static void arc_tuning_update(void); static arc_buf_contents_t arc_buf_type(arc_buf_hdr_t *); static uint32_t arc_bufc_to_flags(arc_buf_contents_t); @@ -2535,7 +2523,7 @@ arc_prune_task(void *ptr) * is analogous to dnlc_reduce_cache() but more generic. * * This operation is performed asyncronously so it may be safely called - * in the context of the arc_adapt_thread(). A reference is taken here + * in the context of the arc_reclaim_thread(). A reference is taken here * for each registered arc_prune_t and the arc_prune_task() is responsible * for releasing it once the registered arc_prune_func_t has completed. */ @@ -2611,7 +2599,7 @@ arc_adjust_meta_balanced(void) int64_t adjustmnt, delta, prune = 0; uint64_t total_evicted = 0; arc_buf_contents_t type = ARC_BUFC_DATA; - unsigned long restarts = zfs_arc_meta_adjust_restarts; + int restarts = MAX(zfs_arc_meta_adjust_restarts, 0); restart: /* @@ -3004,25 +2992,16 @@ arc_flush(spa_t *spa, boolean_t retry) } void -arc_shrink(uint64_t bytes) +arc_shrink(int64_t to_free) { if (arc_c > arc_c_min) { - uint64_t to_free; - - to_free = bytes ? bytes : arc_c >> zfs_arc_shrink_shift; if (arc_c > arc_c_min + to_free) atomic_add_64(&arc_c, -to_free); else arc_c = arc_c_min; - to_free = bytes ? bytes : arc_p >> zfs_arc_shrink_shift; - - if (arc_p > to_free) - atomic_add_64(&arc_p, -to_free); - else - arc_p = 0; - + atomic_add_64(&arc_p, -(arc_p >> arc_shrink_shift)); if (arc_c > arc_size) arc_c = MAX(arc_size, arc_c_min); if (arc_p > arc_c) @@ -3035,8 +3014,181 @@ arc_shrink(uint64_t bytes) (void) arc_adjust(); } +typedef enum free_memory_reason_t { + FMR_UNKNOWN, + FMR_NEEDFREE, + FMR_LOTSFREE, + FMR_SWAPFS_MINFREE, + FMR_PAGES_PP_MAXIMUM, + FMR_HEAP_ARENA, + FMR_ZIO_ARENA, +} free_memory_reason_t; + +int64_t last_free_memory; +free_memory_reason_t last_free_reason; + +#ifdef _KERNEL +#ifdef __linux__ +/* + * expiration time for arc_no_grow set by direct memory reclaim. + */ +static clock_t arc_grow_time = 0; +#else +/* + * Additional reserve of pages for pp_reserve. + */ +int64_t arc_pages_pp_reserve = 64; + +/* + * Additional reserve of pages for swapfs. + */ +int64_t arc_swapfs_reserve = 64; +#endif +#endif /* _KERNEL */ + +/* + * Return the amount of memory that can be consumed before reclaim will be + * needed. Positive if there is sufficient free memory, negative indicates + * the amount of memory that needs to be freed up. + */ +static int64_t +arc_available_memory(void) +{ + int64_t lowest = INT64_MAX; + free_memory_reason_t r = FMR_UNKNOWN; + +#ifdef _KERNEL +#ifdef __linux__ + /* + * Under Linux we are not allowed to directly interrogate the global + * memory state. Instead rely on observing that direct reclaim has + * recently occurred therefore the system must be low on memory. The + * exact values returned are not critical but should be small. + */ + if (ddi_time_after_eq(ddi_get_lbolt(), arc_grow_time)) + lowest = PAGE_SIZE; + else + lowest = -PAGE_SIZE; +#else + int64_t n; + + /* + * Platforms like illumos have greater visibility in to the memory + * subsystem and can return a more detailed analysis of memory. + */ + if (needfree > 0) { + n = PAGESIZE * (-needfree); + if (n < lowest) { + lowest = n; + r = FMR_NEEDFREE; + } + } + + /* + * check that we're out of range of the pageout scanner. It starts to + * schedule paging if freemem is less than lotsfree and needfree. + * lotsfree is the high-water mark for pageout, and needfree is the + * number of needed free pages. We add extra pages here to make sure + * the scanner doesn't start up while we're freeing memory. + */ + n = PAGESIZE * (freemem - lotsfree - needfree - desfree); + if (n < lowest) { + lowest = n; + r = FMR_LOTSFREE; + } + + /* + * check to make sure that swapfs has enough space so that anon + * reservations can still succeed. anon_resvmem() checks that the + * availrmem is greater than swapfs_minfree, and the number of reserved + * swap pages. We also add a bit of extra here just to prevent + * circumstances from getting really dire. + */ + n = PAGESIZE * (availrmem - swapfs_minfree - swapfs_reserve - + desfree - arc_swapfs_reserve); + if (n < lowest) { + lowest = n; + r = FMR_SWAPFS_MINFREE; + } + + + /* + * Check that we have enough availrmem that memory locking (e.g., via + * mlock(3C) or memcntl(2)) can still succeed. (pages_pp_maximum + * stores the number of pages that cannot be locked; when availrmem + * drops below pages_pp_maximum, page locking mechanisms such as + * page_pp_lock() will fail.) + */ + n = PAGESIZE * (availrmem - pages_pp_maximum - + arc_pages_pp_reserve); + if (n < lowest) { + lowest = n; + r = FMR_PAGES_PP_MAXIMUM; + } + +#if defined(__i386) + /* + * If we're on an i386 platform, it's possible that we'll exhaust the + * kernel heap space before we ever run out of available physical + * memory. Most checks of the size of the heap_area compare against + * tune.t_minarmem, which is the minimum available real memory that we + * can have in the system. However, this is generally fixed at 25 pages + * which is so low that it's useless. In this comparison, we seek to + * calculate the total heap-size, and reclaim if more than 3/4ths of the + * heap is allocated. (Or, in the calculation, if less than 1/4th is + * free) + */ + n = vmem_size(heap_arena, VMEM_FREE) - + (vmem_size(heap_arena, VMEM_FREE | VMEM_ALLOC) >> 2); + if (n < lowest) { + lowest = n; + r = FMR_HEAP_ARENA; + } +#endif + + /* + * If zio data pages are being allocated out of a separate heap segment, + * then enforce that the size of available vmem for this arena remains + * above about 1/16th free. + * + * Note: The 1/16th arena free requirement was put in place + * to aggressively evict memory from the arc in order to avoid + * memory fragmentation issues. + */ + if (zio_arena != NULL) { + n = vmem_size(zio_arena, VMEM_FREE) - + (vmem_size(zio_arena, VMEM_ALLOC) >> 4); + if (n < lowest) { + lowest = n; + r = FMR_ZIO_ARENA; + } + } +#endif /* __linux__ */ +#else + /* Every 100 calls, free a small amount */ + if (spa_get_random(100) == 0) + lowest = -1024; +#endif + + last_free_memory = lowest; + last_free_reason = r; + + return (lowest); +} + +/* + * Determine if the system is under memory pressure and is asking + * to reclaim memory. A return value of TRUE indicates that the system + * is under memory pressure and that the arc should adjust accordingly. + */ +static boolean_t +arc_reclaim_needed(void) +{ + return (arc_available_memory() < 0); +} + static void -arc_kmem_reap_now(arc_reclaim_strategy_t strat, uint64_t bytes) +arc_kmem_reap_now(void) { size_t i; kmem_cache_t *prev_cache = NULL; @@ -3053,13 +3205,6 @@ arc_kmem_reap_now(arc_reclaim_strategy_t strat, uint64_t bytes) arc_prune(zfs_arc_meta_prune); } - /* - * An aggressive reclamation will shrink the cache size as well as - * reap free buffers from the arc kmem caches. - */ - if (strat == ARC_RECLAIM_AGGR) - arc_shrink(bytes); - for (i = 0; i < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; i++) { if (zio_buf_cache[i] != prev_cache) { prev_cache = zio_buf_cache[i]; @@ -3070,11 +3215,18 @@ arc_kmem_reap_now(arc_reclaim_strategy_t strat, uint64_t bytes) kmem_cache_reap_now(zio_data_buf_cache[i]); } } - kmem_cache_reap_now(buf_cache); kmem_cache_reap_now(hdr_full_cache); kmem_cache_reap_now(hdr_l2only_cache); kmem_cache_reap_now(range_seg_cache); + + if (zio_arena != NULL) { + /* + * Ask the vmem arena to reclaim unused memory from its + * quantum caches. + */ + vmem_qcache_reap(zio_arena); + } } /* @@ -3094,88 +3246,90 @@ arc_kmem_reap_now(arc_reclaim_strategy_t strat, uint64_t bytes) * using mutex_tryenter() from arc_reclaim_thread(). */ static void -arc_adapt_thread(void) +arc_reclaim_thread(void) { + fstrans_cookie_t cookie = spl_fstrans_mark(); + clock_t growtime = 0; callb_cpr_t cpr; - fstrans_cookie_t cookie; - uint64_t arc_evicted; CALLB_CPR_INIT(&cpr, &arc_reclaim_lock, callb_generic_cpr, FTAG); - cookie = spl_fstrans_mark(); mutex_enter(&arc_reclaim_lock); - while (arc_reclaim_thread_exit == 0) { -#ifndef _KERNEL - arc_reclaim_strategy_t last_reclaim = ARC_RECLAIM_CONS; + while (!arc_reclaim_thread_exit) { + int64_t to_free; + int64_t free_memory = arc_available_memory(); + uint64_t evicted = 0; - mutex_exit(&arc_reclaim_lock); - if (spa_get_random(100) == 0) { + arc_tuning_update(); - if (arc_no_grow) { - if (last_reclaim == ARC_RECLAIM_CONS) { - last_reclaim = ARC_RECLAIM_AGGR; - } else { - last_reclaim = ARC_RECLAIM_CONS; - } - } else { - arc_no_grow = TRUE; - last_reclaim = ARC_RECLAIM_AGGR; - membar_producer(); - } + mutex_exit(&arc_reclaim_lock); - /* reset the growth delay for every reclaim */ - arc_grow_time = ddi_get_lbolt() + - (zfs_arc_grow_retry * hz); + if (free_memory < 0) { - arc_kmem_reap_now(last_reclaim, 0); + arc_no_grow = B_TRUE; arc_warm = B_TRUE; - } -#else /* _KERNEL */ - mutex_exit(&arc_reclaim_lock); -#endif /* !_KERNEL */ - - /* No recent memory pressure allow the ARC to grow. */ - if (arc_no_grow && - ddi_time_after_eq(ddi_get_lbolt(), arc_grow_time)) - arc_no_grow = FALSE; - arc_evicted = arc_adjust(); + /* + * Wait at least zfs_grow_retry (default 5) seconds + * before considering growing. + */ + growtime = ddi_get_lbolt() + (arc_grow_retry * hz); - /* - * We're either no longer overflowing, or we - * can't evict anything more, so we should wake - * up any threads before we go to sleep. - */ - if (arc_size <= arc_c || arc_evicted == 0) - cv_broadcast(&arc_reclaim_waiters_cv); + arc_kmem_reap_now(); - mutex_enter(&arc_reclaim_lock); + /* + * If we are still low on memory, shrink the ARC + * so that we have arc_shrink_min free space. + */ + free_memory = arc_available_memory(); - /* block until needed, or one second, whichever is shorter */ - CALLB_CPR_SAFE_BEGIN(&cpr); - (void) cv_timedwait_sig(&arc_reclaim_thread_cv, - &arc_reclaim_lock, (ddi_get_lbolt() + hz)); - CALLB_CPR_SAFE_END(&cpr, &arc_reclaim_lock); + to_free = (arc_c >> arc_shrink_shift) - free_memory; + if (to_free > 0) { +#ifdef _KERNEL + to_free = MAX(to_free, ptob(needfree)); +#endif + arc_shrink(to_free); + } + } else if (free_memory < arc_c >> arc_no_grow_shift) { + arc_no_grow = B_TRUE; + } else if (ddi_get_lbolt() >= growtime) { + arc_no_grow = B_FALSE; + } + evicted = arc_adjust(); - /* Allow the module options to be changed */ - if (zfs_arc_max > 64 << 20 && - zfs_arc_max < physmem * PAGESIZE && - zfs_arc_max != arc_c_max) - arc_c_max = zfs_arc_max; + mutex_enter(&arc_reclaim_lock); - if (zfs_arc_min >= 2ULL << SPA_MAXBLOCKSHIFT && - zfs_arc_min <= arc_c_max && - zfs_arc_min != arc_c_min) - arc_c_min = zfs_arc_min; + /* + * If evicted is zero, we couldn't evict anything via + * arc_adjust(). This could be due to hash lock + * collisions, but more likely due to the majority of + * arc buffers being unevictable. Therefore, even if + * arc_size is above arc_c, another pass is unlikely to + * be helpful and could potentially cause us to enter an + * infinite loop. + */ + if (arc_size <= arc_c || evicted == 0) { + /* + * We're either no longer overflowing, or we + * can't evict anything more, so we should wake + * up any threads before we go to sleep. + */ + cv_broadcast(&arc_reclaim_waiters_cv); - if (zfs_arc_meta_limit > 0 && - zfs_arc_meta_limit <= arc_c_max && - zfs_arc_meta_limit != arc_meta_limit) - arc_meta_limit = zfs_arc_meta_limit; + /* + * Block until signaled, or after one second (we + * might need to perform arc_kmem_reap_now() + * even if we aren't being signalled) + */ + CALLB_CPR_SAFE_BEGIN(&cpr); + (void) cv_timedwait_sig(&arc_reclaim_thread_cv, + &arc_reclaim_lock, ddi_get_lbolt() + hz); + CALLB_CPR_SAFE_END(&cpr, &arc_reclaim_lock); + } } - arc_reclaim_thread_exit = 0; + arc_reclaim_thread_exit = FALSE; cv_broadcast(&arc_reclaim_thread_cv); CALLB_CPR_EXIT(&cpr); /* drops arc_reclaim_lock */ spl_fstrans_unmark(cookie); @@ -3185,12 +3339,11 @@ arc_adapt_thread(void) static void arc_user_evicts_thread(void) { + fstrans_cookie_t cookie = spl_fstrans_mark(); callb_cpr_t cpr; - fstrans_cookie_t cookie; CALLB_CPR_INIT(&cpr, &arc_user_evicts_lock, callb_generic_cpr, FTAG); - cookie = spl_fstrans_mark(); mutex_enter(&arc_user_evicts_lock); while (!arc_user_evicts_thread_exit) { mutex_exit(&arc_user_evicts_lock); @@ -3338,15 +3491,15 @@ __arc_shrinker_func(struct shrinker *shrink, struct shrink_control *sc) * reap whatever we can from the various arc slabs. */ if (pages > 0) { - arc_kmem_reap_now(ARC_RECLAIM_AGGR, ptob(sc->nr_to_scan)); - + arc_shrink(ptob(sc->nr_to_scan)); + arc_kmem_reap_now(); #ifdef HAVE_SPLIT_SHRINKER_CALLBACK pages = MAX(pages - btop(arc_evictable_memory()), 0); #else pages = btop(arc_evictable_memory()); #endif } else { - arc_kmem_reap_now(ARC_RECLAIM_CONS, ptob(sc->nr_to_scan)); + arc_kmem_reap_now(); pages = SHRINK_STOP; } @@ -3421,6 +3574,11 @@ arc_adapt(int bytes, arc_state_t *state) } ASSERT((int64_t)arc_p >= 0); + if (arc_reclaim_needed()) { + cv_signal(&arc_reclaim_thread_cv); + return; + } + if (arc_no_grow) return; @@ -4721,7 +4879,11 @@ arc_memory_throttle(uint64_t reserve, uint64_t txg) if (zfs_arc_memory_throttle_disable) return (0); - if (freemem <= physmem * arc_lotsfree_percent / 100) { + if (freemem > physmem * arc_lotsfree_percent / 100) + return (0); + + if (arc_reclaim_needed()) { + /* memory is low, delay before restarting */ ARCSTAT_INCR(arcstat_memory_throttle_count, 1); DMU_TX_STAT_BUMP(dmu_tx_memory_reclaim); return (SET_ERROR(EAGAIN)); @@ -4871,9 +5033,73 @@ arc_state_multilist_index_func(multilist_t *ml, void *obj) multilist_get_num_sublists(ml)); } +/* + * Called during module initialization and periodically thereafter to + * apply reasonable changes to the exposed performance tunings. Non-zero + * zfs_* values which differ from the currently set values will be applied. + */ +static void +arc_tuning_update(void) +{ + /* Valid range: 64M - */ + if ((zfs_arc_max) && (zfs_arc_max != arc_c_max) && + (zfs_arc_max > 64 << 20) && (zfs_arc_max < ptob(physmem)) && + (zfs_arc_max > arc_c_min)) { + arc_c_max = zfs_arc_max; + arc_c = arc_c_max; + arc_p = (arc_c >> 1); + arc_meta_limit = MIN(arc_meta_limit, arc_c_max); + } + + /* Valid range: 32M - */ + if ((zfs_arc_min) && (zfs_arc_min != arc_c_min) && + (zfs_arc_min >= 2ULL << SPA_MAXBLOCKSHIFT) && + (zfs_arc_min <= arc_c_max)) { + arc_c_min = zfs_arc_min; + arc_c = MAX(arc_c, arc_c_min); + } + + /* Valid range: 16M - */ + if ((zfs_arc_meta_min) && (zfs_arc_meta_min != arc_meta_min) && + (zfs_arc_meta_min >= 1ULL << SPA_MAXBLOCKSHIFT) && + (zfs_arc_meta_min <= arc_c_max)) { + arc_meta_min = zfs_arc_meta_min; + arc_meta_limit = MAX(arc_meta_limit, arc_meta_min); + } + + /* Valid range: - */ + if ((zfs_arc_meta_limit) && (zfs_arc_meta_limit != arc_meta_limit) && + (zfs_arc_meta_limit >= zfs_arc_meta_min) && + (zfs_arc_meta_limit <= arc_c_max)) + arc_meta_limit = zfs_arc_meta_limit; + + /* Valid range: 1 - N */ + if (zfs_arc_grow_retry) + arc_grow_retry = zfs_arc_grow_retry; + + /* Valid range: 1 - N */ + if (zfs_arc_shrink_shift) { + arc_shrink_shift = zfs_arc_shrink_shift; + arc_no_grow_shift = MIN(arc_no_grow_shift, arc_shrink_shift -1); + } + + /* Valid range: 1 - N ticks */ + if (zfs_arc_min_prefetch_lifespan) + arc_min_prefetch_lifespan = zfs_arc_min_prefetch_lifespan; +} + void arc_init(void) { + /* + * allmem is "all memory that we could possibly use". + */ +#ifdef _KERNEL + uint64_t allmem = ptob(physmem); +#else + uint64_t allmem = (physmem * PAGESIZE) / 2; +#endif + mutex_init(&arc_reclaim_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&arc_reclaim_thread_cv, NULL, CV_DEFAULT, NULL); cv_init(&arc_reclaim_waiters_cv, NULL, CV_DEFAULT, NULL); @@ -4882,10 +5108,10 @@ arc_init(void) cv_init(&arc_user_evicts_cv, NULL, CV_DEFAULT, NULL); /* Convert seconds to clock ticks */ - zfs_arc_min_prefetch_lifespan = 1 * hz; + arc_min_prefetch_lifespan = 1 * hz; /* Start out with 1/8 of all memory */ - arc_c = physmem * PAGESIZE / 8; + arc_c = allmem / 8; #ifdef _KERNEL /* @@ -4894,6 +5120,7 @@ arc_init(void) * need to limit the cache to 1/8 of VM size. */ arc_c = MIN(arc_c, vmem_size(heap_arena, VMEM_ALLOC | VMEM_FREE) / 8); + /* * Register a shrinker to support synchronous (direct) memory * reclaim from the arc. This is done to prevent kswapd from @@ -4902,40 +5129,26 @@ arc_init(void) spl_register_shrinker(&arc_shrinker); #endif - /* set min cache to allow safe operation of arc_adapt() */ + /* Set min cache to allow safe operation of arc_adapt() */ arc_c_min = 2ULL << SPA_MAXBLOCKSHIFT; - /* set max to 1/2 of all memory */ - arc_c_max = arc_c * 4; - - /* - * Allow the tunables to override our calculations if they are - * reasonable (ie. over 64MB) - */ - if (zfs_arc_max > 64<<20 && zfs_arc_max < physmem * PAGESIZE) - arc_c_max = zfs_arc_max; - if (zfs_arc_min >= 2ULL << SPA_MAXBLOCKSHIFT && - zfs_arc_min <= arc_c_max) - arc_c_min = zfs_arc_min; + /* Set max to 1/2 of all memory */ + arc_c_max = allmem / 2; arc_c = arc_c_max; arc_p = (arc_c >> 1); - /* limit meta-data to 3/4 of the arc capacity */ - arc_meta_limit = (3 * arc_c_max) / 4; + /* Set min to 1/2 of arc_c_min */ + arc_meta_min = 1ULL << SPA_MAXBLOCKSHIFT; + /* Initialize maximum observed usage to zero */ arc_meta_max = 0; + /* Set limit to 3/4 of arc_c_max with a floor of arc_meta_min */ + arc_meta_limit = MAX((3 * arc_c_max) / 4, arc_meta_min); - /* Allow the tunable to override if it is reasonable */ - if (zfs_arc_meta_limit > 0 && zfs_arc_meta_limit <= arc_c_max) - arc_meta_limit = zfs_arc_meta_limit; - - if (zfs_arc_meta_min > 0) { - arc_meta_min = zfs_arc_meta_min; - } else { - arc_meta_min = arc_c_min / 2; - } + /* Apply user specified tunings */ + arc_tuning_update(); if (zfs_arc_num_sublists_per_state < 1) - zfs_arc_num_sublists_per_state = num_online_cpus(); + zfs_arc_num_sublists_per_state = MAX(boot_ncpus, 1); /* if kmem_flags are set, lets try to use less memory */ if (kmem_debugging()) @@ -5021,7 +5234,7 @@ arc_init(void) kstat_install(arc_ksp); } - (void) thread_create(NULL, 0, arc_adapt_thread, NULL, 0, &p0, + (void) thread_create(NULL, 0, arc_reclaim_thread, NULL, 0, &p0, TS_RUN, minclsyspri); (void) thread_create(NULL, 0, arc_user_evicts_thread, NULL, 0, &p0, @@ -6345,7 +6558,7 @@ l2arc_feed_thread(void) /* * Avoid contributing to memory pressure. */ - if (arc_no_grow) { + if (arc_reclaim_needed()) { ARCSTAT_BUMP(arcstat_l2_abort_lowmem); spa_config_exit(spa, SCL_L2ARC, dev); continue; @@ -6568,7 +6781,7 @@ MODULE_PARM_DESC(zfs_arc_meta_min, "Min arc metadata"); module_param(zfs_arc_meta_prune, int, 0644); MODULE_PARM_DESC(zfs_arc_meta_prune, "Meta objects to scan for prune"); -module_param(zfs_arc_meta_adjust_restarts, ulong, 0644); +module_param(zfs_arc_meta_adjust_restarts, int, 0644); MODULE_PARM_DESC(zfs_arc_meta_adjust_restarts, "Limit number of restarts in arc_adjust_meta");