OpenZFS 8558, 8602 - lwp_create() returns EAGAIN
author    Brian Behlendorf <behlendorf1@llnl.gov>
          Thu, 26 Oct 2017 19:57:53 +0000 (12:57 -0700)
committer GitHub <noreply@github.com>
          Thu, 26 Oct 2017 19:57:53 +0000 (12:57 -0700)
8558 lwp_create() returns EAGAIN on system with more than 80K ZFS filesystems

On a system with more than 80K ZFS filesystems, we've seen cases
where lwp_create() will start to fail by returning EAGAIN. The
problem is that a dedicated taskq is created for each dataset as
part of its ZIL, so those 80K filesystems bring 80K taskqs and the
kernel threads that back them.
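
In outline, the fix replaces the per-dataset ZIL taskq with a single
taskq shared by the whole pool; both snippets below are taken verbatim
from the diffs that follow:

    /* Before: one single-threaded taskq per dataset, in zil_open() */
    zilog->zl_clean_taskq = taskq_create("zil_clean", 1, defclsyspri,
        2, 2, TASKQ_PREPOPULATE);

    /* After: one shared taskq per pool, in dsl_pool_open_impl() */
    dp->dp_zil_clean_taskq = taskq_create("dp_zil_clean_taskq",
        zfs_zil_clean_taskq_nthr_pct, minclsyspri,
        zfs_zil_clean_taskq_minalloc,
        zfs_zil_clean_taskq_maxalloc,
        TASKQ_PREPOPULATE | TASKQ_THREADS_CPU_PCT);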

Porting Notes:
- The new nomem taskq kstat was dropped.
- Added module options and documentation for the new tunables
  zfs_zil_clean_taskq_nthr_pct, zfs_zil_clean_taskq_minalloc,
  zfs_zil_clean_taskq_maxalloc, and zfs_sync_taskq_batch_pct
  (see the example below).
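
  As an illustration (the values shown are this commit's defaults, not
  tuning advice), the new options can be set at module load time via
  modprobe configuration:

      # /etc/modprobe.d/zfs.conf (hypothetical file name)
      options zfs zfs_sync_taskq_batch_pct=75
      options zfs zfs_zil_clean_taskq_nthr_pct=100
      options zfs zfs_zil_clean_taskq_minalloc=1024
      options zfs zfs_zil_clean_taskq_maxalloc=1048576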

Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Sebastien Roy <sebastien.roy@delphix.com>
Approved by: Robert Mustacchi <rm@joyent.com>
Authored by: Prakash Surya <prakash.surya@delphix.com>
Reviewed-by: George Melikov <mail@gmelikov.ru>
Reviewed-by: Chris Dunlop <chris@onthe.net.au>
Ported-by: Brian Behlendorf <behlendorf1@llnl.gov>
OpenZFS-issue: https://www.illumos.org/issues/8558
OpenZFS-commit: https://github.com/openzfs/openzfs/commit/216d772

8602 remove unused "dp_early_sync_tasks" field from "dsl_pool" structure

Reviewed by: Serapheim Dimitropoulos <serapheim@delphix.com>
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Approved by: Robert Mustacchi <rm@joyent.com>
Authored by: Prakash Surya <prakash.surya@delphix.com>
Reviewed-by: George Melikov <mail@gmelikov.ru>
Reviewed-by: Chris Dunlop <chris@onthe.net.au>
Ported-by: Brian Behlendorf <behlendorf1@llnl.gov>
OpenZFS-issue: https://www.illumos.org/issues/8602
OpenZFS-commit: https://github.com/openzfs/openzfs/commit/2bcb545
Closes #6779

include/sys/dsl_pool.h
include/sys/zil_impl.h
man/man5/zfs-module-parameters.5
module/zfs/dsl_pool.c
module/zfs/zil.c

diff --git a/include/sys/dsl_pool.h b/include/sys/dsl_pool.h
index 8eed90a8fdd5702b79d605b3d00ba0fd0defcc35..044ef95441c94b9f4a3c9f88630f4af90c8fdeb5 100644
--- a/include/sys/dsl_pool.h
+++ b/include/sys/dsl_pool.h
@@ -127,6 +127,7 @@ typedef struct dsl_pool {
        txg_list_t dp_dirty_dirs;
        txg_list_t dp_sync_tasks;
        taskq_t *dp_sync_taskq;
+       taskq_t *dp_zil_clean_taskq;
 
        /*
         * Protects administrative changes (properties, namespace)
diff --git a/include/sys/zil_impl.h b/include/sys/zil_impl.h
index 13ecca3c8b0436818de603fab0dd2c780f3c2eed..dd5304b79a91be698af69413c1b2296e9133256e 100644
--- a/include/sys/zil_impl.h
+++ b/include/sys/zil_impl.h
@@ -124,7 +124,6 @@ struct zilog {
        list_t          zl_lwb_list;    /* in-flight log write list */
        kmutex_t        zl_vdev_lock;   /* protects zl_vdev_tree */
        avl_tree_t      zl_vdev_tree;   /* vdevs to flush in zil_commit() */
-       taskq_t         *zl_clean_taskq; /* runs lwb and itx clean tasks */
        avl_tree_t      zl_bp_tree;     /* track bps during log parse */
        clock_t         zl_replay_time; /* lbolt of when replay started */
        uint64_t        zl_replay_blks; /* number of log blocks replayed */
diff --git a/man/man5/zfs-module-parameters.5 b/man/man5/zfs-module-parameters.5
index 19ecde4f462e2ec0a9b6760f7c0c11b7a0897325..d94d6b395df8a16cd318eeeaff743f266fe5c04d 100644
--- a/man/man5/zfs-module-parameters.5
+++ b/man/man5/zfs-module-parameters.5
@@ -1745,6 +1745,18 @@ Rewrite new block pointers starting in this pass
 Default value: \fB2\fR.
 .RE
 
+.sp
+.ne 2
+.na
+\fBzfs_sync_taskq_batch_pct\fR (int)
+.ad
+.RS 12n
+This controls the number of threads used by the dp_sync_taskq.  The default
+value of 75% creates a thread count equal to 75% of the online CPUs.
+.sp
+Default value: \fB75\fR.
+.RE
+
 .sp
 .ne 2
 .na
@@ -1999,6 +2011,42 @@ in the queue can be viewed with the \fBzpool events\fR command.
 Default value: \fB0\fR.
 .RE
 
+.sp
+.ne 2
+.na
+\fBzfs_zil_clean_taskq_maxalloc\fR (int)
+.ad
+.RS 12n
+The maximum number of taskq entries that are allowed to be cached.  When this
+limit is exceeded, itxs will be cleaned synchronously.
+.sp
+Default value: \fB1048576\fR.
+.RE
+
+.sp
+.ne 2
+.na
+\fBzfs_zil_clean_taskq_minalloc\fR (int)
+.ad
+.RS 12n
+The number of taskq entries that are pre-populated when the taskq is first
+created and are immediately available for use.
+.sp
+Default value: \fB1024\fR.
+.RE
+
+.sp
+.ne 2
+.na
+\fBzfs_zil_clean_taskq_nthr_pct\fR (int)
+.ad
+.RS 12n
+This controls the number of threads used by the dp_zil_clean_taskq.  The
+default value of 100% creates one thread per online CPU.
+.sp
+Default value: \fB100\fR.
+.RE
+
 .sp
 .ne 2
 .na
diff --git a/module/zfs/dsl_pool.c b/module/zfs/dsl_pool.c
index a28be34fb842a99bca8fc2b4e3e92d3d9efb351a..6f435dcc2c4445a3e21e7b72d33f5862be88fa9f 100644
--- a/module/zfs/dsl_pool.c
+++ b/module/zfs/dsl_pool.c
@@ -135,6 +135,36 @@ unsigned long zfs_delay_scale = 1000 * 1000 * 1000 / 2000;
  */
 int zfs_sync_taskq_batch_pct = 75;
 
+/*
+ * These tunables determine the behavior of how zil_itxg_clean() is
+ * called via zil_clean() in the context of spa_sync(). When an itxg
+ * list needs to be cleaned, TQ_NOSLEEP will be used when dispatching.
+ * If the dispatch fails, the call to zil_itxg_clean() will occur
+ * synchronously in the context of spa_sync(), which can negatively
+ * impact the performance of spa_sync() (e.g. in the case of the itxg
+ * list having a large number of itxs that need to be cleaned).
+ *
+ * Thus, these tunables can be used to manipulate the behavior of the
+ * taskq used by zil_clean(); they determine the number of taskq entries
+ * that are pre-populated when the taskq is first created (via the
+ * "zfs_zil_clean_taskq_minalloc" tunable) and the maximum number of
+ * taskq entries that are cached after an on-demand allocation (via the
+ * "zfs_zil_clean_taskq_maxalloc").
+ *
+ * The idea is that we want to try reasonably hard to ensure there will
+ * already be a taskq entry pre-allocated by the time that it is needed
+ * by zil_clean(). This way, we can avoid the possibility of an
+ * on-demand allocation of a new taskq entry from failing, which would
+ * result in zil_itxg_clean() being called synchronously from zil_clean()
+ * (which can adversely affect performance of spa_sync()).
+ *
+ * Additionally, the number of threads used by the taskq can be
+ * configured via the "zfs_zil_clean_taskq_nthr_pct" tunable.
+ */
+int zfs_zil_clean_taskq_nthr_pct = 100;
+int zfs_zil_clean_taskq_minalloc = 1024;
+int zfs_zil_clean_taskq_maxalloc = 1024 * 1024;
+
 int
 dsl_pool_open_special_dir(dsl_pool_t *dp, const char *name, dsl_dir_t **ddp)
 {
@@ -176,6 +206,12 @@ dsl_pool_open_impl(spa_t *spa, uint64_t txg)
            zfs_sync_taskq_batch_pct, minclsyspri, 1, INT_MAX,
            TASKQ_THREADS_CPU_PCT);
 
+       dp->dp_zil_clean_taskq = taskq_create("dp_zil_clean_taskq",
+           zfs_zil_clean_taskq_nthr_pct, minclsyspri,
+           zfs_zil_clean_taskq_minalloc,
+           zfs_zil_clean_taskq_maxalloc,
+           TASKQ_PREPOPULATE | TASKQ_THREADS_CPU_PCT);
+
        mutex_init(&dp->dp_lock, NULL, MUTEX_DEFAULT, NULL);
        cv_init(&dp->dp_spaceavail_cv, NULL, CV_DEFAULT, NULL);
 
@@ -334,6 +370,7 @@ dsl_pool_close(dsl_pool_t *dp)
        txg_list_destroy(&dp->dp_sync_tasks);
        txg_list_destroy(&dp->dp_dirty_dirs);
 
+       taskq_destroy(dp->dp_zil_clean_taskq);
        taskq_destroy(dp->dp_sync_taskq);
 
        /*
@@ -1155,5 +1192,18 @@ MODULE_PARM_DESC(zfs_delay_scale, "how quickly delay approaches infinity");
 module_param(zfs_sync_taskq_batch_pct, int, 0644);
 MODULE_PARM_DESC(zfs_sync_taskq_batch_pct,
        "max percent of CPUs that are used to sync dirty data");
+
+module_param(zfs_zil_clean_taskq_nthr_pct, int, 0644);
+MODULE_PARM_DESC(zfs_zil_clean_taskq_nthr_pct,
+       "max percent of CPUs that are used per dp_sync_taskq");
+
+module_param(zfs_zil_clean_taskq_minalloc, int, 0644);
+MODULE_PARM_DESC(zfs_zil_clean_taskq_minalloc,
+       "number of taskq entries that are pre-populated");
+
+module_param(zfs_zil_clean_taskq_maxalloc, int, 0644);
+MODULE_PARM_DESC(zfs_zil_clean_taskq_maxalloc,
+       "max number of taskq entries that are cached");
+
 /* END CSTYLED */
 #endif
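
The block comment above describes the entry cache that the minalloc and
maxalloc tunables control. A simplified sketch of the allocation policy
behind a TQ_NOSLEEP dispatch (task_alloc_sketch is a hypothetical
helper, not the actual SPL implementation):

    static taskq_ent_t *
    task_alloc_sketch(taskq_t *tq)
    {
            taskq_ent_t *t;

            /* Prefer a pre-populated/cached entry if one is free. */
            if ((t = list_head(&tq->tq_free_list)) != NULL) {
                    list_remove(&tq->tq_free_list, t);
                    return (t);
            }
            /* Otherwise allocate on demand, without sleeping. */
            return (kmem_alloc(sizeof (taskq_ent_t), KM_NOSLEEP));
    }

When no entry can be obtained this way, taskq_dispatch() fails and
zil_clean() falls back to running zil_itxg_clean() synchronously, as the
zil.c hunk below shows.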
diff --git a/module/zfs/zil.c b/module/zfs/zil.c
index e90e583ae06eef8735d44b7a726aa344600f2ead..c34c1d2534b72a62a180319d6824985bbf1a8def 100644
--- a/module/zfs/zil.c
+++ b/module/zfs/zil.c
@@ -1473,8 +1473,7 @@ zil_clean(zilog_t *zilog, uint64_t synced_txg)
                return;
        }
        ASSERT3U(itxg->itxg_txg, <=, synced_txg);
-       ASSERT(itxg->itxg_txg != 0);
-       ASSERT(zilog->zl_clean_taskq != NULL);
+       ASSERT3U(itxg->itxg_txg, !=, 0);
        clean_me = itxg->itxg_itxs;
        itxg->itxg_itxs = NULL;
        itxg->itxg_txg = 0;
@@ -1485,8 +1484,11 @@ zil_clean(zilog_t *zilog, uint64_t synced_txg)
         * free it in-line. This should be rare. Note, using TQ_SLEEP
         * created a bad performance problem.
         */
-       if (taskq_dispatch(zilog->zl_clean_taskq,
-           (void (*)(void *))zil_itxg_clean, clean_me, TQ_NOSLEEP) == 0)
+       ASSERT3P(zilog->zl_dmu_pool, !=, NULL);
+       ASSERT3P(zilog->zl_dmu_pool->dp_zil_clean_taskq, !=, NULL);
+       taskqid_t id = taskq_dispatch(zilog->zl_dmu_pool->dp_zil_clean_taskq,
+           (void (*)(void *))zil_itxg_clean, clean_me, TQ_NOSLEEP);
+       if (id == TASKQID_INVALID)
                zil_itxg_clean(clean_me);
 }
 
@@ -1959,13 +1961,10 @@ zil_open(objset_t *os, zil_get_data_t *get_data)
 {
        zilog_t *zilog = dmu_objset_zil(os);
 
-       ASSERT(zilog->zl_clean_taskq == NULL);
        ASSERT(zilog->zl_get_data == NULL);
        ASSERT(list_is_empty(&zilog->zl_lwb_list));
 
        zilog->zl_get_data = get_data;
-       zilog->zl_clean_taskq = taskq_create("zil_clean", 1, defclsyspri,
-           2, 2, TASKQ_PREPOPULATE);
 
        return (zilog);
 }
@@ -2000,8 +1999,6 @@ zil_close(zilog_t *zilog)
        if (txg < spa_freeze_txg(zilog->zl_spa))
                VERIFY(!zilog_is_dirty(zilog));
 
-       taskq_destroy(zilog->zl_clean_taskq);
-       zilog->zl_clean_taskq = NULL;
        zilog->zl_get_data = NULL;
 
        /*
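
Since the parameters are registered with mode 0644, they can also be
inspected and adjusted at runtime through the standard Linux sysfs
interface, for example:

    $ cat /sys/module/zfs/parameters/zfs_zil_clean_taskq_nthr_pct
    100

Note that dp_zil_clean_taskq is created in dsl_pool_open_impl(), so a
changed thread percentage only takes effect the next time the pool is
opened (e.g. on import).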