OpenZFS 9465 - ARC check for 'anon_size > arc_c/2' can stall the system

author Don Brady <don.brady@delphix.com>

Wed, 27 Sep 2017 01:45:19 +0000 (19:45 -0600)

committer Brian Behlendorf <behlendorf1@llnl.gov>

Mon, 30 Jul 2018 18:30:41 +0000 (11:30 -0700)
author Don Brady <don.brady@delphix.com>
Wed, 27 Sep 2017 01:45:19 +0000 (19:45 -0600)
committer Brian Behlendorf <behlendorf1@llnl.gov>
Mon, 30 Jul 2018 18:30:41 +0000 (11:30 -0700)
diff --git a/include/sys/arc.h b/include/sys/arc.h

index 9d6bab505a2f1c5d5ca57dcb37f54f81729f802a..a5bdefb56f4b2788a424cb878efa4381993835ed 100644 (file)
--- a/include/sys/arc.h
+++ b/include/sys/arc.h
@@ -289,7 +289,7 @@ void arc_freed(spa_t *spa, const blkptr_t *bp);
  
  void arc_flush(spa_t *spa, boolean_t retry);
  void arc_tempreserve_clear(uint64_t reserve);
-int arc_tempreserve_space(uint64_t reserve, uint64_t txg);
+int arc_tempreserve_space(spa_t *spa, uint64_t reserve, uint64_t txg);
  
  uint64_t arc_target_bytes(void);
  void arc_init(void);
diff --git a/include/sys/spa.h b/include/sys/spa.h

index 4a3fc71f74057f3f2a3bd95bd16a034368b6aa2c..82fe2c18ca8dbc01ef59f77c1e55c669dc23160f 100644 (file)
--- a/include/sys/spa.h
+++ b/include/sys/spa.h
@@ -990,6 +990,7 @@ extern uint64_t spa_delegation(spa_t *spa);
  extern objset_t *spa_meta_objset(spa_t *spa);
  extern uint64_t spa_deadman_synctime(spa_t *spa);
  extern uint64_t spa_deadman_ziotime(spa_t *spa);
+extern uint64_t spa_dirty_data(spa_t *spa);
  
  /* Miscellaneous support routines */
  extern void spa_load_failed(spa_t *spa, const char *fmt, ...);
diff --git a/include/sys/spa_impl.h b/include/sys/spa_impl.h

index 8d2a31961d66581d11e67511fe4f688e6346fcc0..8d2a20dbb93cef5b4e269b0ce6d2f7d60544aa8b 100644 (file)
--- a/include/sys/spa_impl.h
+++ b/include/sys/spa_impl.h
@@ -363,6 +363,11 @@ struct spa {
         uint64_t        spa_errata;             /* errata issues detected */
         spa_stats_t     spa_stats;              /* assorted spa statistics */
         spa_keystore_t  spa_keystore;           /* loaded crypto keys */
+
+       /* arc_memory_throttle() parameters during low memory condition */
+       uint64_t        spa_lowmem_page_load;   /* memory load during txg */
+       uint64_t        spa_lowmem_last_txg;    /* txg window start */
+
         hrtime_t        spa_ccw_fail_time;      /* Conf cache write fail time */
         taskq_t         *spa_zvol_taskq;        /* Taskq for minor management */
         taskq_t         *spa_prefetch_taskq;    /* Taskq for prefetch threads */
diff --git a/module/zfs/arc.c b/module/zfs/arc.c

index e49f8a547e5527d0b98164b590f0a9145b46c6b1..7f2929c17e1a026e42849c1b114c97163a95fc75 100644 (file)
--- a/module/zfs/arc.c
+++ b/module/zfs/arc.c
@@ -392,6 +392,16 @@ int zfs_arc_shrink_shift = 0;
  int zfs_arc_p_min_shift = 0;
  int zfs_arc_average_blocksize = 8 * 1024; /* 8KB */
  
+/*
+ * ARC dirty data constraints for arc_tempreserve_space() throttle.
+ */
+unsigned long zfs_arc_dirty_limit_percent = 50;        /* total dirty data limit */
+unsigned long zfs_arc_anon_limit_percent = 25; /* anon block dirty limit */
+unsigned long zfs_arc_pool_dirty_percent = 20; /* each pool's anon allowance */
+
+/*
+ * Enable or disable compressed arc buffers.
+ */
  int zfs_compressed_arc_enabled = B_TRUE;
  
  /*
@@ -7182,12 +7192,10 @@ arc_write(zio_t *pio, spa_t *spa, uint64_t txg,
  }
  
  static int
-arc_memory_throttle(uint64_t reserve, uint64_t txg)
+arc_memory_throttle(spa_t *spa, uint64_t reserve, uint64_t txg)
  {
  #ifdef _KERNEL
         uint64_t available_memory = arc_free_memory();
-       static uint64_t page_load = 0;
-       static uint64_t last_txg = 0;
  
  #if defined(_ILP32)
         available_memory =
@@ -7197,9 +7205,9 @@ arc_memory_throttle(uint64_t reserve, uint64_t txg)
         if (available_memory > arc_all_memory() * arc_lotsfree_percent / 100)
                 return (0);
  
-       if (txg > last_txg) {
-               last_txg = txg;
-               page_load = 0;
+       if (txg > spa->spa_lowmem_last_txg) {
+               spa->spa_lowmem_last_txg = txg;
+               spa->spa_lowmem_page_load = 0;
         }
         /*
          * If we are in pageout, we know that memory is already tight,
@@ -7207,21 +7215,22 @@ arc_memory_throttle(uint64_t reserve, uint64_t txg)
          * continue to let page writes occur as quickly as possible.
          */
         if (current_is_kswapd()) {
-               if (page_load > MAX(arc_sys_free / 4, available_memory) / 4) {
+               if (spa->spa_lowmem_page_load >
+                   MAX(arc_sys_free / 4, available_memory) / 4) {
                         DMU_TX_STAT_BUMP(dmu_tx_memory_reclaim);
                         return (SET_ERROR(ERESTART));
                 }
                 /* Note: reserve is inflated, so we deflate */
-               page_load += reserve / 8;
+               atomic_add_64(&spa->spa_lowmem_page_load, reserve / 8);
                 return (0);
-       } else if (page_load > 0 && arc_reclaim_needed()) {
+       } else if (spa->spa_lowmem_page_load > 0 && arc_reclaim_needed()) {
                 /* memory is low, delay before restarting */
                 ARCSTAT_INCR(arcstat_memory_throttle_count, 1);
                 DMU_TX_STAT_BUMP(dmu_tx_memory_reclaim);
                 return (SET_ERROR(EAGAIN));
         }
-       page_load = 0;
-#endif
+       spa->spa_lowmem_page_load = 0;
+#endif /* _KERNEL */
         return (0);
  }
  
@@ -7233,7 +7242,7 @@ arc_tempreserve_clear(uint64_t reserve)
  }
  
  int
-arc_tempreserve_space(uint64_t reserve, uint64_t txg)
+arc_tempreserve_space(spa_t *spa, uint64_t reserve, uint64_t txg)
  {
         int error;
         uint64_t anon_size;
@@ -7269,7 +7278,7 @@ arc_tempreserve_space(uint64_t reserve, uint64_t txg)
          * in order to compress/encrypt/etc the data.  We therefore need to
          * make sure that there is sufficient available memory for this.
          */
-       error = arc_memory_throttle(reserve, txg);
+       error = arc_memory_throttle(spa, reserve, txg);
         if (error != 0)
                 return (error);
  
@@ -7277,12 +7286,24 @@ arc_tempreserve_space(uint64_t reserve, uint64_t txg)
          * Throttle writes when the amount of dirty data in the cache
          * gets too large.  We try to keep the cache less than half full
          * of dirty blocks so that our sync times don't grow too large.
+        *
+        * In the case of one pool being built on another pool, we want
+        * to make sure we don't end up throttling the lower (backing)
+        * pool when the upper pool is the majority contributor to dirty
+        * data. To ensure we make forward progress during throttling, we
+        * also check the current pool's net dirty data and only throttle
+        * if it exceeds zfs_arc_pool_dirty_percent of the anonymous dirty
+        * data in the cache.
+        *
          * Note: if two requests come in concurrently, we might let them
          * both succeed, when one of them should fail.  Not a huge deal.
          */
+       uint64_t total_dirty = reserve + arc_tempreserve + anon_size;
+       uint64_t spa_dirty_anon = spa_dirty_data(spa);
  
-       if (reserve + arc_tempreserve + anon_size > arc_c / 2 &&
-           anon_size > arc_c / 4) {
+       if (total_dirty > arc_c * zfs_arc_dirty_limit_percent / 100 &&
+           anon_size > arc_c * zfs_arc_anon_limit_percent / 100 &&
+           spa_dirty_anon > anon_size * zfs_arc_pool_dirty_percent / 100) {
  #ifdef ZFS_DEBUG
                 uint64_t meta_esize =
                     refcount_count(&arc_anon->arcs_esize[ARC_BUFC_METADATA]);
diff --git a/module/zfs/dsl_dir.c b/module/zfs/dsl_dir.c

index 75c40c68c9c47714f8dc38ee13ed77a38d25f719..9a43691e5f1e7499ffa2fdc8c89cf69c1e29ecf0 100644 (file)
--- a/module/zfs/dsl_dir.c
+++ b/module/zfs/dsl_dir.c
@@ -1416,7 +1416,7 @@ dsl_dir_tempreserve_space(dsl_dir_t *dd, uint64_t lsize, uint64_t asize,
             offsetof(struct tempreserve, tr_node));
         ASSERT3S(asize, >, 0);
  
-       err = arc_tempreserve_space(lsize, tx->tx_txg);
+       err = arc_tempreserve_space(dd->dd_pool->dp_spa, lsize, tx->tx_txg);
         if (err == 0) {
                 struct tempreserve *tr;
  
diff --git a/module/zfs/spa_misc.c b/module/zfs/spa_misc.c

index 9410fab0763cdb32b43c5428c0a844561b0f5efc..f43a38ef1c2d48e8667bb50b50231666d192b6b4 100644 (file)
--- a/module/zfs/spa_misc.c
+++ b/module/zfs/spa_misc.c
@@ -1917,6 +1917,12 @@ bp_get_dsize(spa_t *spa, const blkptr_t *bp)
         return (dsize);
  }
  
+uint64_t
+spa_dirty_data(spa_t *spa)
+{
+       return (spa->spa_dsl_pool->dp_dirty_total);
+}
+
  /*
   * ==========================================================================
   * Initialization and Termination
author	Don Brady <don.brady@delphix.com>
	Wed, 27 Sep 2017 01:45:19 +0000 (19:45 -0600)
committer	Brian Behlendorf <behlendorf1@llnl.gov>
	Mon, 30 Jul 2018 18:30:41 +0000 (11:30 -0700)
include/sys/arc.h		patch \| blob \| history
include/sys/spa.h		patch \| blob \| history
include/sys/spa_impl.h		patch \| blob \| history
module/zfs/arc.c		patch \| blob \| history
module/zfs/dsl_dir.c		patch \| blob \| history
module/zfs/spa_misc.c		patch \| blob \| history