Illumos #3582, #3584

author Adam Leventhal <ahl@delphix.com>

Wed, 28 Aug 2013 23:05:48 +0000 (16:05 -0700)

committer Brian Behlendorf <behlendorf1@llnl.gov>

Mon, 4 Nov 2013 18:55:25 +0000 (10:55 -0800)
author Adam Leventhal <ahl@delphix.com>
Wed, 28 Aug 2013 23:05:48 +0000 (16:05 -0700)
committer Brian Behlendorf <behlendorf1@llnl.gov>
Mon, 4 Nov 2013 18:55:25 +0000 (10:55 -0800)
diff --git a/include/sys/txg.h b/include/sys/txg.h

index b9bbba8be2919125089547888fb457e5e91cc7eb..9e547819b2480c07dad1ce415c470c92ff69d9c3 100644 (file)
--- a/include/sys/txg.h
+++ b/include/sys/txg.h
@@ -74,13 +74,8 @@ extern void txg_rele_to_quiesce(txg_handle_t *txghp);
  extern void txg_rele_to_sync(txg_handle_t *txghp);
  extern void txg_register_callbacks(txg_handle_t *txghp, list_t *tx_callbacks);
  
-/*
- * Delay the caller by the specified number of ticks or until
- * the txg closes (whichever comes first).  This is intended
- * to be used to throttle writers when the system nears its
- * capacity.
- */
-extern void txg_delay(struct dsl_pool *dp, uint64_t txg, int ticks);
+extern void txg_delay(struct dsl_pool *dp, uint64_t txg, hrtime_t delta,
+    hrtime_t resolution);
  
  /*
   * Wait until the given transaction group has finished syncing.
diff --git a/include/sys/txg_impl.h b/include/sys/txg_impl.h

index 5a6d0e19f071adb64ee213b4d591ab51fb20545e..8a0977f1f4dc02525c6787229c8d0679ff1cf22e 100644 (file)
--- a/include/sys/txg_impl.h
+++ b/include/sys/txg_impl.h
@@ -70,7 +70,7 @@ struct tx_cpu {
         kmutex_t        tc_open_lock;   /* protects tx_open_txg */
         kmutex_t        tc_lock;        /* protects the rest of this struct */
         kcondvar_t      tc_cv[TXG_SIZE];
-       uint64_t        tc_count[TXG_SIZE];
+       uint64_t        tc_count[TXG_SIZE];     /* tx hold count on each txg */
         list_t          tc_callbacks[TXG_SIZE]; /* commit cb list */
         char            tc_pad[8];              /* pad to fill 3 cache lines */
  };
@@ -87,8 +87,8 @@ struct tx_cpu {
   * every cpu (see txg_quiesce()).
   */
  typedef struct tx_state {
-       tx_cpu_t        *tx_cpu;        /* protects right to enter txg  */
-       kmutex_t        tx_sync_lock;   /* protects tx_state_t */
+       tx_cpu_t        *tx_cpu;        /* protects access to tx_open_txg */
+       kmutex_t        tx_sync_lock;   /* protects the rest of this struct */
         uint64_t        tx_open_txg;    /* currently open txg id */
         uint64_t        tx_quiesced_txg; /* quiesced txg waiting for sync */
         uint64_t        tx_syncing_txg; /* currently syncing txg id */
diff --git a/include/sys/zfs_context.h b/include/sys/zfs_context.h

index dac74386d890f585c71f5fc5f074d5b0fbc66757..adb152f58b78fac45a275a58c922323352fa2450 100644 (file)
--- a/include/sys/zfs_context.h
+++ b/include/sys/zfs_context.h
@@ -338,6 +338,8 @@ extern void cv_init(kcondvar_t *cv, char *name, int type, void *arg);
  extern void cv_destroy(kcondvar_t *cv);
  extern void cv_wait(kcondvar_t *cv, kmutex_t *mp);
  extern clock_t cv_timedwait(kcondvar_t *cv, kmutex_t *mp, clock_t abstime);
+extern clock_t cv_timedwait_hires(kcondvar_t *cvp, kmutex_t *mp, hrtime_t tim,
+    hrtime_t res, int flag);
  extern void cv_signal(kcondvar_t *cv);
  extern void cv_broadcast(kcondvar_t *cv);
  #define        cv_timedwait_interruptible(cv, mp, at)  cv_timedwait(cv, mp, at)
diff --git a/lib/libspl/include/sys/time.h b/lib/libspl/include/sys/time.h

index 0cbbd928dbce71ac4611a106faccbeb182ec42c0..852b2eff941123e811d2b2e73cb8c8b6d88098f4 100644 (file)
--- a/lib/libspl/include/sys/time.h
+++ b/lib/libspl/include/sys/time.h
@@ -50,6 +50,14 @@
  #define NSEC_PER_USEC  1000L
  #endif
  
+#ifndef MSEC2NSEC
+#define MSEC2NSEC(m)    ((hrtime_t)(m) * (NANOSEC / MILLISEC))
+#endif
+
+#ifndef NSEC2MSEC
+#define NSEC2MSEC(n)    ((n) / (NANOSEC / MILLISEC))
+#endif
+
  extern hrtime_t gethrtime(void);
  extern void gethrestime(timestruc_t *);
  
diff --git a/lib/libzpool/kernel.c b/lib/libzpool/kernel.c

index 2e5eef69b0ad7a8f03a15f4668b72b0464d46406..f994f8ee3e7929982fd9204d96d639926abfff91 100644 (file)
--- a/lib/libzpool/kernel.c
+++ b/lib/libzpool/kernel.c
@@ -528,6 +528,69 @@ top:
         return (1);
  }
  
+/*
+ * Userland stand-in for the kernel cv_timedwait_hires(): wait on cv, with mp
+ * held by the caller, until signalled or until tim nanoseconds have passed.
+ *
+ * Only flag == 0 is accepted, and tim is then an interval relative to now;
+ * txg_delay() passes its delay interval here.  res is accepted for interface
+ * compatibility and ignored.
+ *
+ * Returns 1 if woken before the timeout, -1 if the timeout expired or tim
+ * was not positive.
+ */
+/*ARGSUSED*/
+clock_t
+cv_timedwait_hires(kcondvar_t *cv, kmutex_t *mp, hrtime_t tim, hrtime_t res,
+    int flag)
+{
+       int error;
+       timestruc_t ts;
+       hrtime_t deadline;
+       hrtime_t delta;
+
+       ASSERT(flag == 0);
+
+       /* Fix the deadline once so that a retry does not extend the wait. */
+       deadline = gethrtime() + tim;
+
+top:
+       delta = deadline - gethrtime();
+       if (delta <= 0)
+               return (-1);
+
+       /*
+        * pthread_cond_timedwait() takes an absolute time of day, not an
+        * interval, so add the remaining time to the current time.
+        * NOTE(review): assumes gethrestime() reports CLOCK_REALTIME, the
+        * default condition variable clock -- confirm against libspl.
+        */
+       gethrestime(&ts);
+       ts.tv_sec += delta / NANOSEC;
+       ts.tv_nsec += delta % NANOSEC;
+       if (ts.tv_nsec >= NANOSEC) {
+               ts.tv_sec++;
+               ts.tv_nsec -= NANOSEC;
+       }
+
+       ASSERT(mutex_owner(mp) == curthread);
+       /* The wait drops m_lock, so clear the owner for its duration. */
+       mp->m_owner = NULL;
+       error = pthread_cond_timedwait(&cv->cv, &mp->m_lock, &ts);
+       mp->m_owner = curthread;
+
+       /* pthread_cond_timedwait() reports expiry as ETIMEDOUT, not ETIME. */
+       if (error == ETIMEDOUT)
+               return (-1);
+
+       if (error == EINTR)
+               goto top;
+
+       ASSERT(error == 0);
+
+       return (1);
+}
+
  void
  cv_signal(kcondvar_t *cv)
  {
diff --git a/module/zfs/dsl_dir.c b/module/zfs/dsl_dir.c

index 572086b931bd96e07ac156b466753c83941d3c17..51a78f9024d8097afd76da3e9368add37f0aa8d4 100644 (file)
--- a/module/zfs/dsl_dir.c
+++ b/module/zfs/dsl_dir.c
@@ -743,7 +743,8 @@ dsl_dir_tempreserve_space(dsl_dir_t *dd, uint64_t lsize, uint64_t asize,
                 err = dsl_pool_tempreserve_space(dd->dd_pool, asize, tx);
         } else {
                 if (err == EAGAIN) {
-                       txg_delay(dd->dd_pool, tx->tx_txg, 1);
+                       txg_delay(dd->dd_pool, tx->tx_txg,
+                           MSEC2NSEC(10), MSEC2NSEC(10));
                         err = SET_ERROR(ERESTART);
                 }
                 dsl_pool_memory_pressure(dd->dd_pool);
diff --git a/module/zfs/dsl_pool.c b/module/zfs/dsl_pool.c

index c28ed3e03888094996eb581a07fd009711f90966..950738e98562234602a523cfcb533f39ba9a701f 100644 (file)
--- a/module/zfs/dsl_pool.c
+++ b/module/zfs/dsl_pool.c
@@ -58,6 +58,9 @@ kmutex_t zfs_write_limit_lock;
  
  static pgcnt_t old_physmem = 0;
  
+hrtime_t zfs_throttle_delay = MSEC2NSEC(10);
+hrtime_t zfs_throttle_resolution = MSEC2NSEC(10);
+
  int
  dsl_pool_open_special_dir(dsl_pool_t *dp, const char *name, dsl_dir_t **ddp)
  {
@@ -512,12 +515,13 @@ dsl_pool_sync(dsl_pool_t *dp, uint64_t txg)
          * Weight the throughput calculation towards the current value:
          *      thru = 3/4 old_thru + 1/4 new_thru
          *
-        * Note: write_time is in nanosecs, so write_time/MICROSEC
-        * yields millisecs
+        * Note: write_time is in nanosecs while dp_throughput is expressed in
+        * bytes per millisecond.
          */
         ASSERT(zfs_write_limit_min > 0);
-       if (data_written > zfs_write_limit_min / 8 && write_time > MICROSEC) {
-               uint64_t throughput = data_written / (write_time / MICROSEC);
+       if (data_written > zfs_write_limit_min / 8 &&
+           write_time > MSEC2NSEC(1)) {
+               uint64_t throughput = data_written / NSEC2MSEC(write_time);
  
                 if (dp->dp_throughput)
                         dp->dp_throughput = throughput / 4 +
@@ -617,8 +621,10 @@ dsl_pool_tempreserve_space(dsl_pool_t *dp, uint64_t space, dmu_tx_t *tx)
          * the caller 1 clock tick.  This will slow down the "fill"
          * rate until the sync process can catch up with us.
          */
-       if (reserved && reserved > (write_limit - (write_limit >> 3)))
-               txg_delay(dp, tx->tx_txg, 1);
+       if (reserved && reserved > (write_limit - (write_limit >> 3))) {
+               txg_delay(dp, tx->tx_txg, zfs_throttle_delay,
+                   zfs_throttle_resolution);
+       }
  
         return (0);
  }
diff --git a/module/zfs/dsl_scan.c b/module/zfs/dsl_scan.c

index de82ab1cee3518bb411df02429ff55fb48bdf386..6cc08ab47d51f292fc0283fd193170d12a3452f3 100644 (file)
--- a/module/zfs/dsl_scan.c
+++ b/module/zfs/dsl_scan.c
@@ -409,7 +409,7 @@ dsl_scan_check_pause(dsl_scan_t *scn, const zbookmark_t *zb)
             zfs_resilver_min_time_ms : zfs_scan_min_time_ms;
         elapsed_nanosecs = gethrtime() - scn->scn_sync_start_time;
         if (elapsed_nanosecs / NANOSEC > zfs_txg_timeout ||
-           (elapsed_nanosecs / MICROSEC > mintime &&
+           (NSEC2MSEC(elapsed_nanosecs) > mintime &&
             txg_sync_waiting(scn->scn_dp)) ||
             spa_shutting_down(scn->scn_dp->dp_spa)) {
                 if (zb) {
@@ -1335,7 +1335,7 @@ dsl_scan_free_should_pause(dsl_scan_t *scn)
  
         elapsed_nanosecs = gethrtime() - scn->scn_sync_start_time;
         return (elapsed_nanosecs / NANOSEC > zfs_txg_timeout ||
-           (elapsed_nanosecs / MICROSEC > zfs_free_min_time_ms &&
+           (NSEC2MSEC(elapsed_nanosecs) > zfs_free_min_time_ms &&
             txg_sync_waiting(scn->scn_dp)) ||
             spa_shutting_down(scn->scn_dp->dp_spa));
  }
@@ -1459,7 +1459,7 @@ dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx)
                             "free_bpobj/bptree txg %llu",
                             (longlong_t)scn->scn_visited_this_txg,
                             (longlong_t)
-                           (gethrtime() - scn->scn_sync_start_time) / MICROSEC,
+                           NSEC2MSEC(gethrtime() - scn->scn_sync_start_time),
                             (longlong_t)tx->tx_txg);
                         scn->scn_visited_this_txg = 0;
                         /*
@@ -1507,7 +1507,7 @@ dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx)
  
         zfs_dbgmsg("visited %llu blocks in %llums",
             (longlong_t)scn->scn_visited_this_txg,
-           (longlong_t)(gethrtime() - scn->scn_sync_start_time) / MICROSEC);
+           (longlong_t)NSEC2MSEC(gethrtime() - scn->scn_sync_start_time));
  
         if (!scn->scn_pausing) {
                 /* finished with scan. */
diff --git a/module/zfs/spa_misc.c b/module/zfs/spa_misc.c

index 5b278412593402fafab3f9290af790f41869da33..ffba5028f79e72a8605fa6a5ac34a7ec2d724b9a 100644 (file)
--- a/module/zfs/spa_misc.c
+++ b/module/zfs/spa_misc.c
@@ -490,8 +490,8 @@ spa_add(const char *name, nvlist_t *config, const char *altroot)
         spa->spa_proc = &p0;
         spa->spa_proc_state = SPA_PROC_NONE;
  
-       spa->spa_deadman_synctime = zfs_deadman_synctime *
-           zfs_txg_synctime_ms * MICROSEC;
+       spa->spa_deadman_synctime = MSEC2NSEC(zfs_deadman_synctime *
+           zfs_txg_synctime_ms);
  
         refcount_create(&spa->spa_refcount);
         spa_config_lock_init(spa);
diff --git a/module/zfs/txg.c b/module/zfs/txg.c

index c0c0b295a6be71aa12b13c165b8871b3562411a7..c8a29e14fe4efe4926f9d643cdaea6907895c899 100644 (file)
--- a/module/zfs/txg.c
+++ b/module/zfs/txg.c
@@ -236,7 +236,7 @@ txg_thread_exit(tx_state_t *tx, callb_cpr_t *cpr, kthread_t **tpp)
  }
  
  static void
-txg_thread_wait(tx_state_t *tx, callb_cpr_t *cpr, kcondvar_t *cv, uint64_t time)
+txg_thread_wait(tx_state_t *tx, callb_cpr_t *cpr, kcondvar_t *cv, clock_t time)
  {
         CALLB_CPR_SAFE_BEGIN(cpr);
  
@@ -373,6 +373,9 @@ txg_quiesce(dsl_pool_t *dp, uint64_t txg)
         spa_txg_history_set(dp->dp_spa, txg, TXG_STATE_OPEN, gethrtime());
         spa_txg_history_add(dp->dp_spa, tx->tx_open_txg);
  
+       DTRACE_PROBE2(txg__quiescing, dsl_pool_t *, dp, uint64_t, txg);
+       DTRACE_PROBE2(txg__opened, dsl_pool_t *, dp, uint64_t, tx->tx_open_txg);
+
         /*
          * Now that we've incremented tx_open_txg, we can let threads
          * enter the next transaction group.
@@ -531,6 +534,7 @@ txg_sync_thread(dsl_pool_t *dp)
                 txg = tx->tx_quiesced_txg;
                 tx->tx_quiesced_txg = 0;
                 tx->tx_syncing_txg = txg;
+               DTRACE_PROBE2(txg__syncing, dsl_pool_t *, dp, uint64_t, txg);
                 cv_broadcast(&tx->tx_quiesce_more_cv);
  
                 dprintf("txg=%llu quiesce_txg=%llu sync_txg=%llu\n",
@@ -544,6 +548,7 @@ txg_sync_thread(dsl_pool_t *dp)
                 mutex_enter(&tx->tx_sync_lock);
                 tx->tx_synced_txg = txg;
                 tx->tx_syncing_txg = 0;
+               DTRACE_PROBE2(txg__synced, dsl_pool_t *, dp, uint64_t, txg);
                 cv_broadcast(&tx->tx_sync_done_cv);
  
                 /*
@@ -602,21 +607,22 @@ txg_quiesce_thread(dsl_pool_t *dp)
                  */
                 dprintf("quiesce done, handing off txg %llu\n", txg);
                 tx->tx_quiesced_txg = txg;
+               DTRACE_PROBE2(txg__quiesced, dsl_pool_t *, dp, uint64_t, txg);
                 cv_broadcast(&tx->tx_sync_more_cv);
                 cv_broadcast(&tx->tx_quiesce_done_cv);
         }
  }
  
  /*
- * Delay this thread by 'ticks' if we are still in the open transaction
- * group and there is already a waiting txg quiesing or quiesced.  Abort
- * the delay if this txg stalls or enters the quiesing state.
+ * Delay this thread by 'delay' nanoseconds if we are still in the open
+ * transaction group and there is already a waiting txg quiescing or quiesced.
+ * Abort the delay if this txg stalls or enters the quiescing state.
   */
  void
-txg_delay(dsl_pool_t *dp, uint64_t txg, int ticks)
+txg_delay(dsl_pool_t *dp, uint64_t txg, hrtime_t delay, hrtime_t resolution)
  {
         tx_state_t *tx = &dp->dp_tx;
-       clock_t timeout = ddi_get_lbolt() + ticks;
+       hrtime_t start = gethrtime();
  
         /* don't delay if this txg could transition to quiesing immediately */
         if (tx->tx_open_txg > txg ||
@@ -629,10 +635,11 @@ txg_delay(dsl_pool_t *dp, uint64_t txg, int ticks)
                 return;
         }
  
-       while (ddi_get_lbolt() < timeout &&
-           tx->tx_syncing_txg < txg-1 && !txg_stalled(dp))
-               (void) cv_timedwait(&tx->tx_quiesce_more_cv, &tx->tx_sync_lock,
-                   timeout);
+       while (gethrtime() - start < delay &&
+           tx->tx_syncing_txg < txg-1 && !txg_stalled(dp)) {
+               (void) cv_timedwait_hires(&tx->tx_quiesce_more_cv,
+                   &tx->tx_sync_lock, delay, resolution, 0);
+       }
  
         DMU_TX_STAT_BUMP(dmu_tx_delay);
author	Adam Leventhal <ahl@delphix.com>
	Wed, 28 Aug 2013 23:05:48 +0000 (16:05 -0700)
committer	Brian Behlendorf <behlendorf1@llnl.gov>
	Mon, 4 Nov 2013 18:55:25 +0000 (10:55 -0800)
include/sys/txg.h		patch \| blob \| history
include/sys/txg_impl.h		patch \| blob \| history
include/sys/zfs_context.h		patch \| blob \| history
lib/libspl/include/sys/time.h		patch \| blob \| history
lib/libzpool/kernel.c		patch \| blob \| history
module/zfs/dsl_dir.c		patch \| blob \| history
module/zfs/dsl_pool.c		patch \| blob \| history
module/zfs/dsl_scan.c		patch \| blob \| history
module/zfs/spa_misc.c		patch \| blob \| history
module/zfs/txg.c		patch \| blob \| history