Make arc_prune() asynchronous

author Brian Behlendorf <behlendorf1@llnl.gov>

Sat, 30 May 2015 14:57:53 +0000 (09:57 -0500)

committer Brian Behlendorf <behlendorf1@llnl.gov>

Thu, 11 Jun 2015 17:27:25 +0000 (10:27 -0700)
author Brian Behlendorf <behlendorf1@llnl.gov>
Sat, 30 May 2015 14:57:53 +0000 (09:57 -0500)
committer Brian Behlendorf <behlendorf1@llnl.gov>
Thu, 11 Jun 2015 17:27:25 +0000 (10:27 -0700)
diff --git a/include/sys/arc.h b/include/sys/arc.h

index 38f9f27fea610363fdaaa956302857fce21dbd40..0961d4b4d2cb14db1c5bcffebf78484acb9489c0 100644 (file)
--- a/include/sys/arc.h
+++ b/include/sys/arc.h
@@ -59,10 +59,16 @@ arc_done_func_t arc_getbuf_func;
  struct arc_prune {
         arc_prune_func_t        *p_pfunc;
         void                    *p_private;
+       uint64_t                p_adjust;
         list_node_t             p_node;
         refcount_t              p_refcnt;
  };
  
+typedef enum arc_strategy {
+       ARC_STRATEGY_META_ONLY          = 0, /* Evict only meta data buffers */
+       ARC_STRATEGY_META_BALANCED      = 1, /* Evict data buffers if needed */
+} arc_strategy_t;
+
  typedef enum arc_flags
  {
         /*
diff --git a/module/zfs/arc.c b/module/zfs/arc.c

index 67ef87daf137248ad11f2c1c93f5df6e1ddd8696..561c2312455ae9994409025c6b63e2544b3b68e8 100644 (file)
--- a/module/zfs/arc.c
+++ b/module/zfs/arc.c
@@ -167,6 +167,9 @@ static boolean_t    arc_user_evicts_thread_exit;
  /* number of objects to prune from caches when arc_meta_limit is reached */
  int zfs_arc_meta_prune = 10000;
  
+/* The preferred strategy to employ when arc_meta_limit is reached */
+int zfs_arc_meta_strategy = ARC_STRATEGY_META_BALANCED;
+
  typedef enum arc_reclaim_strategy {
         ARC_RECLAIM_AGGR,               /* Aggressive reclaim strategy */
         ARC_RECLAIM_CONS                /* Conservative reclaim strategy */
@@ -531,6 +534,7 @@ static arc_state_t  *arc_l2c_only;
  
  static list_t arc_prune_list;
  static kmutex_t arc_prune_mtx;
+static taskq_t *arc_prune_taskq;
  static arc_buf_t *arc_eviction_list;
  static arc_buf_hdr_t arc_eviction_hdr;
  
@@ -2430,47 +2434,64 @@ arc_flush_state(arc_state_t *state, uint64_t spa, arc_buf_contents_t type,
  }
  
  /*
- * Request that arc user drop references so that N bytes can be released
- * from the cache.  This provides a mechanism to ensure the arc can honor
- * the arc_meta_limit and reclaim buffers which are pinned in the cache
- * by higher layers.  (i.e. the zpl)
+ * Helper function for arc_prune(); it is responsible for safely handling
+ * the execution of a registered arc_prune_func_t.
   */
  static void
-arc_do_user_prune(int64_t adjustment)
+arc_prune_task(void *ptr)
  {
-       arc_prune_func_t *func;
-       void *private;
-       arc_prune_t *cp, *np;
+       arc_prune_t *ap = (arc_prune_t *)ptr;
+       arc_prune_func_t *func = ap->p_pfunc;
  
-       mutex_enter(&arc_prune_mtx);
+       if (func != NULL)
+               func(ap->p_adjust, ap->p_private);
  
-       cp = list_head(&arc_prune_list);
-       while (cp != NULL) {
-               func = cp->p_pfunc;
-               private = cp->p_private;
-               np = list_next(&arc_prune_list, cp);
-               refcount_add(&cp->p_refcnt, func);
-               mutex_exit(&arc_prune_mtx);
+       /* Callback unregistered concurrently with execution */
+       if (refcount_remove(&ap->p_refcnt, func) == 0) {
+               ASSERT(!list_link_active(&ap->p_node));
+               refcount_destroy(&ap->p_refcnt);
+               kmem_free(ap, sizeof (*ap));
+       }
+}
  
-               if (func != NULL)
-                       func(adjustment, private);
+/*
+ * Notify registered consumers they must drop holds on a portion of the ARC
+ * buffers they reference.  This provides a mechanism to ensure the ARC can
+ * honor the arc_meta_limit and reclaim otherwise pinned ARC buffers.  This
+ * is analogous to dnlc_reduce_cache() but more generic.
+ *
+ * This operation is performed asynchronously so it may be safely called
+ * in the context of the arc_adapt_thread().  A reference is taken here
+ * for each registered arc_prune_t and the arc_prune_task() is responsible
+ * for releasing it once the registered arc_prune_func_t has completed.
+ */
+static void
+arc_prune_async(int64_t adjust)
+{
+       arc_prune_t *ap;
  
-               mutex_enter(&arc_prune_mtx);
+       mutex_enter(&arc_prune_mtx);
+       for (ap = list_head(&arc_prune_list); ap != NULL;
+           ap = list_next(&arc_prune_list, ap)) {
  
-               /* User removed prune callback concurrently with execution */
-               if (refcount_remove(&cp->p_refcnt, func) == 0) {
-                       ASSERT(!list_link_active(&cp->p_node));
-                       refcount_destroy(&cp->p_refcnt);
-                       kmem_free(cp, sizeof (*cp));
-               }
+               if (refcount_count(&ap->p_refcnt) >= 2)
+                       continue;
  
-               cp = np;
+               refcount_add(&ap->p_refcnt, ap->p_pfunc);
+               ap->p_adjust = adjust;
+               taskq_dispatch(arc_prune_taskq, arc_prune_task, ap, TQ_SLEEP);
+               ARCSTAT_BUMP(arcstat_prune);
         }
-
-       ARCSTAT_BUMP(arcstat_prune);
         mutex_exit(&arc_prune_mtx);
  }
  
+static void
+arc_prune(int64_t adjust)
+{
+       arc_prune_async(adjust);
+       taskq_wait_outstanding(arc_prune_taskq, 0);
+}
+
  /*
   * Evict the specified number of bytes from the state specified,
   * restricting eviction to the spa and type given. This function
@@ -2511,7 +2532,7 @@ arc_adjust_impl(arc_state_t *state, uint64_t spa, int64_t bytes,
   * available for reclaim.
   */
  static uint64_t
-arc_adjust_meta(void)
+arc_adjust_meta_balanced(void)
  {
         int64_t adjustmnt, delta, prune = 0;
         uint64_t total_evicted = 0;
@@ -2580,7 +2601,7 @@ restart:
  
                         if (zfs_arc_meta_prune) {
                                 prune += zfs_arc_meta_prune;
-                               arc_do_user_prune(prune);
+                               arc_prune_async(prune);
                         }
                 }
  
@@ -2592,6 +2613,50 @@ restart:
         return (total_evicted);
  }
  
+/*
+ * Evict metadata buffers from the cache, such that arc_meta_used is
+ * capped by the arc_meta_limit tunable.
+ */
+static uint64_t
+arc_adjust_meta_only(void)
+{
+       uint64_t total_evicted = 0;
+       int64_t target;
+
+       /*
+        * If we're over the meta limit, we want to evict enough
+        * metadata to get back under the meta limit. We don't want to
+        * evict so much that we drop the MRU below arc_p, though. If
+        * we're over the meta limit more than we're over arc_p, we
+        * evict some from the MRU here, and some from the MFU below.
+        */
+       target = MIN((int64_t)(arc_meta_used - arc_meta_limit),
+           (int64_t)(arc_anon->arcs_size + arc_mru->arcs_size - arc_p));
+
+       total_evicted += arc_adjust_impl(arc_mru, 0, target, ARC_BUFC_METADATA);
+
+       /*
+        * Similar to the above, we want to evict enough bytes to get us
+        * below the meta limit, but not so much as to drop us below the
+        * space allotted to the MFU (which is defined as arc_c - arc_p).
+        */
+       target = MIN((int64_t)(arc_meta_used - arc_meta_limit),
+           (int64_t)(arc_mfu->arcs_size - (arc_c - arc_p)));
+
+       total_evicted += arc_adjust_impl(arc_mfu, 0, target, ARC_BUFC_METADATA);
+
+       return (total_evicted);
+}
+
+static uint64_t
+arc_adjust_meta(void)
+{
+       if (zfs_arc_meta_strategy == ARC_STRATEGY_META_ONLY)
+               return (arc_adjust_meta_only());
+       else
+               return (arc_adjust_meta_balanced());
+}
+
  /*
   * Return the type of the oldest buffer in the given arc state
   *
@@ -2905,6 +2970,14 @@ arc_kmem_reap_now(arc_reclaim_strategy_t strat, uint64_t bytes)
         extern kmem_cache_t     *zio_buf_cache[];
         extern kmem_cache_t     *zio_data_buf_cache[];
  
+       if ((arc_meta_used >= arc_meta_limit) && zfs_arc_meta_prune) {
+               /*
+                * We are exceeding our meta-data cache limit.
+                * Prune some entries to release holds on meta-data.
+                */
+               arc_prune(zfs_arc_meta_prune);
+       }
+
         /*
          * An aggressive reclamation will shrink the cache size as well as
          * reap free buffers from the arc kmem caches.
@@ -2929,15 +3002,6 @@ arc_kmem_reap_now(arc_reclaim_strategy_t strat, uint64_t bytes)
  }
  
  /*
- * Unlike other ZFS implementations this thread is only responsible for
- * adapting the target ARC size on Linux.  The responsibility for memory
- * reclamation has been entirely delegated to the arc_shrinker_func()
- * which is registered with the VM.  To reflect this change in behavior
- * the arc_reclaim thread has been renamed to arc_adapt.
- *
- * The following comment from arc_reclaim_thread() in illumos is still
- * applicable:
- *
   * Threads can block in arc_get_data_buf() waiting for this thread to evict
   * enough data and signal them to proceed. When this happens, the threads in
   * arc_get_data_buf() are sleeping while holding the hash lock for their
@@ -4862,6 +4926,9 @@ arc_init(void)
         mutex_init(&arc_prune_mtx, NULL, MUTEX_DEFAULT, NULL);
         bzero(&arc_eviction_hdr, sizeof (arc_buf_hdr_t));
  
+       arc_prune_taskq = taskq_create("arc_prune", max_ncpus, minclsyspri,
+           max_ncpus, INT_MAX, TASKQ_PREPOPULATE);
+
         arc_ksp = kstat_create("zfs", 0, "arcstats", "misc", KSTAT_TYPE_NAMED,
             sizeof (arc_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL);
  
@@ -4943,6 +5010,9 @@ arc_fini(void)
                 arc_ksp = NULL;
         }
  
+       taskq_wait(arc_prune_taskq);
+       taskq_destroy(arc_prune_taskq);
+
         mutex_enter(&arc_prune_mtx);
         while ((p = list_head(&arc_prune_list)) != NULL) {
                 list_remove(&arc_prune_list, p);
@@ -6374,6 +6444,9 @@ module_param(zfs_arc_meta_adjust_restarts, ulong, 0644);
  MODULE_PARM_DESC(zfs_arc_meta_adjust_restarts,
         "Limit number of restarts in arc_adjust_meta");
  
+module_param(zfs_arc_meta_strategy, int, 0644);
+MODULE_PARM_DESC(zfs_arc_meta_strategy, "Meta reclaim strategy");
+
  module_param(zfs_arc_grow_retry, int, 0644);
  MODULE_PARM_DESC(zfs_arc_grow_retry, "Seconds before growing arc size");
author	Brian Behlendorf <behlendorf1@llnl.gov>
	Sat, 30 May 2015 14:57:53 +0000 (09:57 -0500)
committer	Brian Behlendorf <behlendorf1@llnl.gov>
	Thu, 11 Jun 2015 17:27:25 +0000 (10:27 -0700)
include/sys/arc.h		patch \| blob \| history
module/zfs/arc.c		patch \| blob \| history