]> granicus.if.org Git - zfs/commitdiff
Use smaller default slack/delta value for schedule_hrtimeout_range()
authorTony Nguyen <tony.nguyen@delphix.com>
Wed, 28 Aug 2019 21:56:54 +0000 (15:56 -0600)
committerBrian Behlendorf <behlendorf1@llnl.gov>
Wed, 28 Aug 2019 21:56:54 +0000 (14:56 -0700)
For interrupt coalescing, cv_timedwait_hires() uses a 100us slack/delta
for calls to schedule_hrtimeout_range(). This 100us slack can be costly
for small writes.

This change improves small write performance by passing resolution `res`
parameter to schedule_hrtimeout_range() to be used as delta/slack. A new
tunable `spl_schedule_hrtimeout_slack_us` is added to preserve old
behavior when desired.

Performance observations on 8K recordsize filesystem:
- 8K random writes at 1-64 threads, up to 60% improvement for one thread
  and smaller gains as thread count increases. At >64 threads, 2-5%
  decrease in performance was observed.
- 8K sequential writes, similar 60% improvement for one thread and
  leveling out around 64 threads. At >64 threads, 5-10% decrease in
  performance was observed.
- 128K sequential write sees 1-5 for the 128K. No observed regression at
  high thread count.

Testing done on Ubuntu 18.04 with 4.15 kernel, 8vCPUs and SSD storage on
VMware ESX.

Reviewed-by: Richard Elling <Richard.Elling@RichardElling.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Matt Ahrens <mahrens@delphix.com>
Signed-off-by: Tony Nguyen <tony.nguyen@delphix.com>
Closes #9217

module/spl/spl-condvar.c
module/zfs/mmp.c

index 19c575f770b87313e26eff3d91354ee8a1847c2e..664fae1e7199a72f40ddae3709032bae0edd78e3 100644 (file)
 
 #include <sys/condvar.h>
 #include <sys/time.h>
+#include <sys/sysmacros.h>
 #include <linux/hrtimer.h>
 #include <linux/compiler_compat.h>
+#include <linux/mod_compat.h>
 
 #include <linux/sched.h>
 
 #include <linux/sched/signal.h>
 #endif
 
+#define        MAX_HRTIMEOUT_SLACK_US  1000
+unsigned int spl_schedule_hrtimeout_slack_us = 0;
+
+static int
+param_set_hrtimeout_slack(const char *buf, zfs_kernel_param_t *kp)
+{
+       unsigned long val;
+       int error;
+
+       error = kstrtoul(buf, 0, &val);
+       if (error)
+               return (error);
+
+       if (val > MAX_HRTIMEOUT_SLACK_US)
+               return (-EINVAL);
+
+       error = param_set_uint(buf, kp);
+       if (error < 0)
+               return (error);
+
+       return (0);
+}
+
+module_param_call(spl_schedule_hrtimeout_slack_us, param_set_hrtimeout_slack,
+       param_get_uint, &spl_schedule_hrtimeout_slack_us, 0644);
+MODULE_PARM_DESC(spl_schedule_hrtimeout_slack_us,
+       "schedule_hrtimeout_range() delta/slack value in us, default(0)");
+
 void
 __cv_init(kcondvar_t *cvp, char *name, kcv_type_t type, void *arg)
 {
@@ -304,12 +334,13 @@ EXPORT_SYMBOL(__cv_timedwait_sig);
  */
 static clock_t
 __cv_timedwait_hires(kcondvar_t *cvp, kmutex_t *mp, hrtime_t expire_time,
-    int state)
+    hrtime_t res, int state)
 {
        DEFINE_WAIT(wait);
        kmutex_t *m;
        hrtime_t time_left;
        ktime_t ktime_left;
+       u64 slack = 0;
 
        ASSERT(cvp);
        ASSERT(mp);
@@ -336,13 +367,11 @@ __cv_timedwait_hires(kcondvar_t *cvp, kmutex_t *mp, hrtime_t expire_time,
         * race where 'cvp->cv_waiters > 0' but the list is empty.
         */
        mutex_exit(mp);
-       /*
-        * Allow a 100 us range to give kernel an opportunity to coalesce
-        * interrupts
-        */
+
        ktime_left = ktime_set(0, time_left);
-       schedule_hrtimeout_range(&ktime_left, 100 * NSEC_PER_USEC,
-           HRTIMER_MODE_REL);
+       slack = MIN(MAX(res, spl_schedule_hrtimeout_slack_us * NSEC_PER_USEC),
+           MAX_HRTIMEOUT_SLACK_US * NSEC_PER_USEC);
+       schedule_hrtimeout_range(&ktime_left, slack, HRTIMER_MODE_REL);
 
        /* No more waiters a different mutex could be used */
        if (atomic_dec_and_test(&cvp->cv_waiters)) {
@@ -369,19 +398,10 @@ static clock_t
 cv_timedwait_hires_common(kcondvar_t *cvp, kmutex_t *mp, hrtime_t tim,
     hrtime_t res, int flag, int state)
 {
-       if (res > 1) {
-               /*
-                * Align expiration to the specified resolution.
-                */
-               if (flag & CALLOUT_FLAG_ROUNDUP)
-                       tim += res - 1;
-               tim = (tim / res) * res;
-       }
-
        if (!(flag & CALLOUT_FLAG_ABSOLUTE))
                tim += gethrtime();
 
-       return (__cv_timedwait_hires(cvp, mp, tim, state));
+       return (__cv_timedwait_hires(cvp, mp, tim, res, state));
 }
 
 clock_t
index cd5603a1a5cd2c39682b6d5412c12848a72fb61c..1ffd862da1264743264fc74a95c0955441c408df 100644 (file)
@@ -672,7 +672,7 @@ mmp_thread(void *arg)
 
                CALLB_CPR_SAFE_BEGIN(&cpr);
                (void) cv_timedwait_sig_hires(&mmp->mmp_thread_cv,
-                   &mmp->mmp_thread_lock, next_time, USEC2NSEC(1),
+                   &mmp->mmp_thread_lock, next_time, USEC2NSEC(100),
                    CALLOUT_FLAG_ABSOLUTE);
                CALLB_CPR_SAFE_END(&cpr, &mmp->mmp_thread_lock);
        }