Illumos #3642, #3643

author George Wilson <george.wilson@delphix.com>

Tue, 23 Apr 2013 17:31:42 +0000 (09:31 -0800)

committer Brian Behlendorf <behlendorf1@llnl.gov>

Fri, 1 Nov 2013 15:55:12 +0000 (08:55 -0700)
author George Wilson <george.wilson@delphix.com>
Tue, 23 Apr 2013 17:31:42 +0000 (09:31 -0800)
committer Brian Behlendorf <behlendorf1@llnl.gov>
Fri, 1 Nov 2013 15:55:12 +0000 (08:55 -0700)
diff --git a/include/sys/dsl_scan.h b/include/sys/dsl_scan.h

index 5691f4d14d9369aedd8a4ee842c12ed63b33b4c4..aae8c312af57292dd4c7c8675899e2e800904fe6 100644 (file)
--- a/include/sys/dsl_scan.h
+++ b/include/sys/dsl_scan.h
@@ -20,7 +20,7 @@
   */
  /*
   * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012 by Delphix. All rights reserved.
+ * Copyright (c) 2013 by Delphix. All rights reserved.
   */
  
  #ifndef        _SYS_DSL_SCAN_H
@@ -82,6 +82,7 @@ typedef struct dsl_scan {
  
         /* for freeing blocks */
         boolean_t scn_is_bptree;
+       boolean_t scn_async_destroying;
  
         /* for debugging / information */
         uint64_t scn_visited_this_txg;
diff --git a/include/sys/txg_impl.h b/include/sys/txg_impl.h

index 7b356eac1293bdbcb0d2f967a14b1ef8b88c5678..5a6d0e19f071adb64ee213b4d591ab51fb20545e 100644 (file)
--- a/include/sys/txg_impl.h
+++ b/include/sys/txg_impl.h
@@ -23,6 +23,10 @@
   * Use is subject to license terms.
   */
  
+/*
+ * Copyright (c) 2013 by Delphix. All rights reserved.
+ */
+
  #ifndef _SYS_TXG_IMPL_H
  #define        _SYS_TXG_IMPL_H
  
@@ -33,14 +37,55 @@
  extern "C" {
  #endif
  
+/*
+ * The tx_cpu structure is a per-cpu structure that is used to track
+ * the number of active transaction holds (tc_count). As transactions
+ * are assigned into a transaction group the appropriate tc_count is
+ * incremented to indicate that there are pending changes that have yet
+ * to quiesce. Consumers evenutally call txg_rele_to_sync() to decrement
+ * the tc_count. A transaction group is not considered quiesced until all
+ * tx_cpu structures have reached a tc_count of zero.
+ *
+ * This structure is a per-cpu structure by design. Updates to this structure
+ * are frequent and concurrent. Having a single structure would result in
+ * heavy lock contention so a per-cpu design was implemented. With the fanned
+ * out mutex design, consumers only need to lock the mutex associated with
+ * thread's cpu.
+ *
+ * The tx_cpu contains two locks, the tc_lock and tc_open_lock.
+ * The tc_lock is used to protect all members of the tx_cpu structure with
+ * the exception of the tc_open_lock. This lock should only be held for a
+ * short period of time, typically when updating the value of tc_count.
+ *
+ * The tc_open_lock protects the tx_open_txg member of the tx_state structure.
+ * This lock is used to ensure that transactions are only assigned into
+ * the current open transaction group. In order to move the current open
+ * transaction group to the quiesce phase, the txg_quiesce thread must
+ * grab all tc_open_locks, increment the tx_open_txg, and drop the locks.
+ * The tc_open_lock is held until the transaction is assigned into the
+ * transaction group. Typically, this is a short operation but if throttling
+ * is occuring it may be held for longer periods of time.
+ */
  struct tx_cpu {
-       kmutex_t        tc_lock;
+       kmutex_t        tc_open_lock;   /* protects tx_open_txg */
+       kmutex_t        tc_lock;        /* protects the rest of this struct */
         kcondvar_t      tc_cv[TXG_SIZE];
         uint64_t        tc_count[TXG_SIZE];
         list_t          tc_callbacks[TXG_SIZE]; /* commit cb list */
-       char            tc_pad[16];
+       char            tc_pad[8];              /* pad to fill 3 cache lines */
  };
  
+/*
+ * The tx_state structure maintains the state information about the different
+ * stages of the pool's transcation groups. A per pool tx_state structure
+ * is used to track this information. The tx_state structure also points to
+ * an array of tx_cpu structures (described above). Although the tx_sync_lock
+ * is used to protect the members of this structure, it is not used to
+ * protect the tx_open_txg. Instead a special lock in the tx_cpu structure
+ * is used. Readers of tx_open_txg must grab the per-cpu tc_open_lock.
+ * Any thread wishing to update tx_open_txg must grab the tc_open_lock on
+ * every cpu (see txg_quiesce()).
+ */
  typedef struct tx_state {
         tx_cpu_t        *tx_cpu;        /* protects right to enter txg  */
         kmutex_t        tx_sync_lock;   /* protects tx_state_t */
diff --git a/module/zfs/dsl_destroy.c b/module/zfs/dsl_destroy.c

index 35e9244c87804c8b92db34b82e6d9048e0f195e0..eee0df106032be1deb3ac90e6fae446402abca32 100644 (file)
--- a/module/zfs/dsl_destroy.c
+++ b/module/zfs/dsl_destroy.c
@@ -761,12 +761,16 @@ dsl_destroy_head_sync_impl(dsl_dataset_t *ds, dmu_tx_t *tx)
                 zil_destroy_sync(dmu_objset_zil(os), tx);
  
                 if (!spa_feature_is_active(dp->dp_spa, async_destroy)) {
+                       dsl_scan_t *scn = dp->dp_scan;
+
                         spa_feature_incr(dp->dp_spa, async_destroy, tx);
                         dp->dp_bptree_obj = bptree_alloc(mos, tx);
                         VERIFY0(zap_add(mos,
                             DMU_POOL_DIRECTORY_OBJECT,
                             DMU_POOL_BPTREE_OBJ, sizeof (uint64_t), 1,
                             &dp->dp_bptree_obj, tx));
+                       ASSERT(!scn->scn_async_destroying);
+                       scn->scn_async_destroying = B_TRUE;
                 }
  
                 used = ds->ds_dir->dd_phys->dd_used_bytes;
diff --git a/module/zfs/dsl_scan.c b/module/zfs/dsl_scan.c

index aa861a59277a26bf5a5e702fc670b4a0d3ace8f0..de82ab1cee3518bb411df02429ff55fb48bdf386 100644 (file)
--- a/module/zfs/dsl_scan.c
+++ b/module/zfs/dsl_scan.c
@@ -91,6 +91,15 @@ dsl_scan_init(dsl_pool_t *dp, uint64_t txg)
         scn = dp->dp_scan = kmem_zalloc(sizeof (dsl_scan_t), KM_SLEEP);
         scn->scn_dp = dp;
  
+       /*
+        * It's possible that we're resuming a scan after a reboot so
+        * make sure that the scan_async_destroying flag is initialized
+        * appropriately.
+        */
+       ASSERT(!scn->scn_async_destroying);
+       scn->scn_async_destroying = spa_feature_is_active(dp->dp_spa,
+           &spa_feature_table[SPA_FEATURE_ASYNC_DESTROY]);
+
         err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
             "scrub_func", sizeof (uint64_t), 1, &f);
         if (err == 0) {
@@ -1362,13 +1371,10 @@ dsl_scan_active(dsl_scan_t *scn)
         if (spa_shutting_down(spa))
                 return (B_FALSE);
  
-       if (scn->scn_phys.scn_state == DSS_SCANNING)
+       if (scn->scn_phys.scn_state == DSS_SCANNING ||
+           scn->scn_async_destroying)
                 return (B_TRUE);
  
-       if (spa_feature_is_active(spa,
-           &spa_feature_table[SPA_FEATURE_ASYNC_DESTROY])) {
-               return (B_TRUE);
-       }
         if (spa_version(scn->scn_dp->dp_spa) >= SPA_VERSION_DEADLISTS) {
                 (void) bpobj_space(&scn->scn_dp->dp_free_bpobj,
                     &used, &comp, &uncomp);
@@ -1424,6 +1430,7 @@ dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx)
  
                 if (err == 0 && spa_feature_is_active(spa,
                     &spa_feature_table[SPA_FEATURE_ASYNC_DESTROY])) {
+                       ASSERT(scn->scn_async_destroying);
                         scn->scn_is_bptree = B_TRUE;
                         scn->scn_zio_root = zio_root(dp->dp_spa, NULL,
                             NULL, ZIO_FLAG_MUSTSUCCEED);
@@ -1444,6 +1451,7 @@ dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx)
                                 VERIFY0(bptree_free(dp->dp_meta_objset,
                                     dp->dp_bptree_obj, tx));
                                 dp->dp_bptree_obj = 0;
+                               scn->scn_async_destroying = B_FALSE;
                         }
                 }
                 if (scn->scn_visited_this_txg) {
diff --git a/module/zfs/txg.c b/module/zfs/txg.c

index 7a3da8647d17e352e5cfbbd273f4ae296d8b9b0d..c0c0b295a6be71aa12b13c165b8871b3562411a7 100644 (file)
--- a/module/zfs/txg.c
+++ b/module/zfs/txg.c
@@ -127,6 +127,8 @@ txg_init(dsl_pool_t *dp, uint64_t txg)
                 int i;
  
                 mutex_init(&tx->tx_cpu[c].tc_lock, NULL, MUTEX_DEFAULT, NULL);
+               mutex_init(&tx->tx_cpu[c].tc_open_lock, NULL, MUTEX_DEFAULT,
+                   NULL);
                 for (i = 0; i < TXG_SIZE; i++) {
                         cv_init(&tx->tx_cpu[c].tc_cv[i], NULL, CV_DEFAULT,
                             NULL);
@@ -169,6 +171,7 @@ txg_fini(dsl_pool_t *dp)
         for (c = 0; c < max_ncpus; c++) {
                 int i;
  
+               mutex_destroy(&tx->tx_cpu[c].tc_open_lock);
                 mutex_destroy(&tx->tx_cpu[c].tc_lock);
                 for (i = 0; i < TXG_SIZE; i++) {
                         cv_destroy(&tx->tx_cpu[c].tc_cv[i]);
@@ -303,10 +306,12 @@ txg_hold_open(dsl_pool_t *dp, txg_handle_t *th)
         tc = &tx->tx_cpu[CPU_SEQID];
         kpreempt_enable();
  
-       mutex_enter(&tc->tc_lock);
-
+       mutex_enter(&tc->tc_open_lock);
         txg = tx->tx_open_txg;
+
+       mutex_enter(&tc->tc_lock);
         tc->tc_count[txg & TXG_MASK]++;
+       mutex_exit(&tc->tc_lock);
  
         th->th_cpu = tc;
         th->th_txg = txg;
@@ -319,7 +324,8 @@ txg_rele_to_quiesce(txg_handle_t *th)
  {
         tx_cpu_t *tc = th->th_cpu;
  
-       mutex_exit(&tc->tc_lock);
+       ASSERT(!MUTEX_HELD(&tc->tc_lock));
+       mutex_exit(&tc->tc_open_lock);
  }
  
  void
@@ -356,10 +362,10 @@ txg_quiesce(dsl_pool_t *dp, uint64_t txg)
         int c;
  
         /*
-        * Grab all tx_cpu locks so nobody else can get into this txg.
+        * Grab all tc_open_locks so nobody else can get into this txg.
          */
         for (c = 0; c < max_ncpus; c++)
-               mutex_enter(&tx->tx_cpu[c].tc_lock);
+               mutex_enter(&tx->tx_cpu[c].tc_open_lock);
  
         ASSERT(txg == tx->tx_open_txg);
         tx->tx_open_txg++;
@@ -372,7 +378,7 @@ txg_quiesce(dsl_pool_t *dp, uint64_t txg)
          * enter the next transaction group.
          */
         for (c = 0; c < max_ncpus; c++)
-               mutex_exit(&tx->tx_cpu[c].tc_lock);
+               mutex_exit(&tx->tx_cpu[c].tc_open_lock);
  
         /*
          * Quiesce the transaction group by waiting for everyone to txg_exit().
author	George Wilson <george.wilson@delphix.com>
	Tue, 23 Apr 2013 17:31:42 +0000 (09:31 -0800)
committer	Brian Behlendorf <behlendorf1@llnl.gov>
	Fri, 1 Nov 2013 15:55:12 +0000 (08:55 -0700)
include/sys/dsl_scan.h		patch \| blob \| history
include/sys/txg_impl.h		patch \| blob \| history
module/zfs/dsl_destroy.c		patch \| blob \| history
module/zfs/dsl_scan.c		patch \| blob \| history
module/zfs/txg.c		patch \| blob \| history