From: Tony Nguyen Date: Wed, 28 Aug 2019 21:56:54 +0000 (-0600) Subject: Use smaller default slack/delta value for schedule_hrtimeout_range() X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=8d042842815f33d2e4ab919a695139b11b7ed0c2;p=zfs Use smaller default slack/delta value for schedule_hrtimeout_range() For interrupt coalescing, cv_timedwait_hires() uses a 100us slack/delta for calls to schedule_hrtimeout_range(). This 100us slack can be costly for small writes. This change improves small write performance by passing resolution `res` parameter to schedule_hrtimeout_range() to be used as delta/slack. A new tunable `spl_schedule_hrtimeout_slack_us` is added to preserve old behavior when desired. Performance observations on 8K recordsize filesystem: - 8K random writes at 1-64 threads, up to 60% improvement for one thread and smaller gains as thread count increases. At >64 threads, 2-5% decrease in performance was observed. - 8K sequential writes, similar 60% improvement for one thread and leveling out around 64 threads. At >64 threads, 5-10% decrease in performance was observed. - 128K sequential write sees 1-5 for the 128K. No observed regression at high thread count. Testing done on Ubuntu 18.04 with 4.15 kernel, 8vCPUs and SSD storage on VMware ESX. Reviewed-by: Richard Elling Reviewed-by: Brian Behlendorf Reviewed-by: Matt Ahrens Signed-off-by: Tony Nguyen Closes #9217 --- diff --git a/module/spl/spl-condvar.c b/module/spl/spl-condvar.c index 19c575f77..664fae1e7 100644 --- a/module/spl/spl-condvar.c +++ b/module/spl/spl-condvar.c @@ -26,8 +26,10 @@ #include #include +#include #include #include +#include #include @@ -35,6 +37,34 @@ #include #endif +#define MAX_HRTIMEOUT_SLACK_US 1000 +unsigned int spl_schedule_hrtimeout_slack_us = 0; + +static int +param_set_hrtimeout_slack(const char *buf, zfs_kernel_param_t *kp) +{ + unsigned long val; + int error; + + error = kstrtoul(buf, 0, &val); + if (error) + return (error); + + if (val > MAX_HRTIMEOUT_SLACK_US) + return (-EINVAL); + + error = param_set_uint(buf, kp); + if (error < 0) + return (error); + + return (0); +} + +module_param_call(spl_schedule_hrtimeout_slack_us, param_set_hrtimeout_slack, + param_get_uint, &spl_schedule_hrtimeout_slack_us, 0644); +MODULE_PARM_DESC(spl_schedule_hrtimeout_slack_us, + "schedule_hrtimeout_range() delta/slack value in us, default(0)"); + void __cv_init(kcondvar_t *cvp, char *name, kcv_type_t type, void *arg) { @@ -304,12 +334,13 @@ EXPORT_SYMBOL(__cv_timedwait_sig); */ static clock_t __cv_timedwait_hires(kcondvar_t *cvp, kmutex_t *mp, hrtime_t expire_time, - int state) + hrtime_t res, int state) { DEFINE_WAIT(wait); kmutex_t *m; hrtime_t time_left; ktime_t ktime_left; + u64 slack = 0; ASSERT(cvp); ASSERT(mp); @@ -336,13 +367,11 @@ __cv_timedwait_hires(kcondvar_t *cvp, kmutex_t *mp, hrtime_t expire_time, * race where 'cvp->cv_waiters > 0' but the list is empty. */ mutex_exit(mp); - /* - * Allow a 100 us range to give kernel an opportunity to coalesce - * interrupts - */ + ktime_left = ktime_set(0, time_left); - schedule_hrtimeout_range(&ktime_left, 100 * NSEC_PER_USEC, - HRTIMER_MODE_REL); + slack = MIN(MAX(res, spl_schedule_hrtimeout_slack_us * NSEC_PER_USEC), + MAX_HRTIMEOUT_SLACK_US * NSEC_PER_USEC); + schedule_hrtimeout_range(&ktime_left, slack, HRTIMER_MODE_REL); /* No more waiters a different mutex could be used */ if (atomic_dec_and_test(&cvp->cv_waiters)) { @@ -369,19 +398,10 @@ static clock_t cv_timedwait_hires_common(kcondvar_t *cvp, kmutex_t *mp, hrtime_t tim, hrtime_t res, int flag, int state) { - if (res > 1) { - /* - * Align expiration to the specified resolution. - */ - if (flag & CALLOUT_FLAG_ROUNDUP) - tim += res - 1; - tim = (tim / res) * res; - } - if (!(flag & CALLOUT_FLAG_ABSOLUTE)) tim += gethrtime(); - return (__cv_timedwait_hires(cvp, mp, tim, state)); + return (__cv_timedwait_hires(cvp, mp, tim, res, state)); } clock_t diff --git a/module/zfs/mmp.c b/module/zfs/mmp.c index cd5603a1a..1ffd862da 100644 --- a/module/zfs/mmp.c +++ b/module/zfs/mmp.c @@ -672,7 +672,7 @@ mmp_thread(void *arg) CALLB_CPR_SAFE_BEGIN(&cpr); (void) cv_timedwait_sig_hires(&mmp->mmp_thread_cv, - &mmp->mmp_thread_lock, next_time, USEC2NSEC(1), + &mmp->mmp_thread_lock, next_time, USEC2NSEC(100), CALLOUT_FLAG_ABSOLUTE); CALLB_CPR_SAFE_END(&cpr, &mmp->mmp_thread_lock); }