dmu_tx_wait() hang likely due to cv_signal() in dsl_pool_dirty_delta()

author Serapheim Dimitropoulos <serapheim@delphix.com>

Thu, 15 Aug 2019 23:53:53 +0000 (16:53 -0700)

committer Brian Behlendorf <behlendorf1@llnl.gov>

Thu, 15 Aug 2019 23:53:53 +0000 (17:53 -0600)
author Serapheim Dimitropoulos <serapheim@delphix.com>
Thu, 15 Aug 2019 23:53:53 +0000 (16:53 -0700)
committer Brian Behlendorf <behlendorf1@llnl.gov>
Thu, 15 Aug 2019 23:53:53 +0000 (17:53 -0600)
diff --git a/module/zfs/dbuf.c b/module/zfs/dbuf.c

index f8f96c142e9f9c9d9319a32f43586708ba99e670..ace862637de1de73c82e057bf29d65f696e872ac 100644 (file)
--- a/module/zfs/dbuf.c
+++ b/module/zfs/dbuf.c
@@ -1890,9 +1890,11 @@ dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx)
         db->db.db_size = size;
  
         if (db->db_level == 0) {
-               ASSERT3U(db->db_last_dirty->dr_txg, ==, tx->tx_txg);
                 db->db_last_dirty->dt.dl.dr_data = buf;
         }
+       ASSERT3U(db->db_last_dirty->dr_txg, ==, tx->tx_txg);
+       ASSERT3U(db->db_last_dirty->dr_accounted, ==, osize);
+       db->db_last_dirty->dr_accounted = size;
         mutex_exit(&db->db_mtx);
  
         dmu_objset_willuse_space(dn->dn_objset, size - osize, tx);
@@ -2105,7 +2107,7 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
                     sizeof (dbuf_dirty_record_t),
                     offsetof(dbuf_dirty_record_t, dr_dirty_node));
         }
-       if (db->db_blkid != DMU_BONUS_BLKID && os->os_dsl_dataset != NULL)
+       if (db->db_blkid != DMU_BONUS_BLKID)
                 dr->dr_accounted = db->db.db_size;
         dr->dr_dbuf = db;
         dr->dr_txg = tx->tx_txg;
@@ -4356,8 +4358,7 @@ dbuf_write_physdone(zio_t *zio, arc_buf_t *buf, void *arg)
         /*
          * The callback will be called io_phys_children times.  Retire one
          * portion of our dirty space each time we are called.  Any rounding
-        * error will be cleaned up by dsl_pool_sync()'s call to
-        * dsl_pool_undirty_space().
+        * error will be cleaned up by dbuf_write_done().
          */
         delta = dr->dr_accounted / zio->io_phys_children;
         dsl_pool_undirty_space(dp, delta, zio->io_txg);
@@ -4440,13 +4441,36 @@ dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb)
                 mutex_destroy(&dr->dt.di.dr_mtx);
                 list_destroy(&dr->dt.di.dr_children);
         }
-       kmem_free(dr, sizeof (dbuf_dirty_record_t));
  
         cv_broadcast(&db->db_changed);
         ASSERT(db->db_dirtycnt > 0);
         db->db_dirtycnt -= 1;
         db->db_data_pending = NULL;
         dbuf_rele_and_unlock(db, (void *)(uintptr_t)tx->tx_txg, B_FALSE);
+
+       /*
+        * If we didn't do a physical write in this ZIO and we
+        * still ended up here, it means that the space of the
+        * dbuf that we just released (and undirtied) above hasn't
+        * been marked as undirtied in the pool's accounting.
+        *
+        * Thus, we undirty that space in the pool's view of the
+        * world here. For physical writes this type of update
+        * happens in dbuf_write_physdone().
+        *
+        * If we did a physical write, cleanup any rounding errors
+        * that came up due to writing multiple copies of a block
+        * on disk [see dbuf_write_physdone()].
+        */
+       if (zio->io_phys_children == 0) {
+               dsl_pool_undirty_space(dmu_objset_pool(os),
+                   dr->dr_accounted, zio->io_txg);
+       } else {
+               dsl_pool_undirty_space(dmu_objset_pool(os),
+                   dr->dr_accounted % zio->io_phys_children, zio->io_txg);
+       }
+
+       kmem_free(dr, sizeof (dbuf_dirty_record_t));
  }
  
  static void
diff --git a/module/zfs/dmu.c b/module/zfs/dmu.c

index 955588fb7b6afee0295df217e833416aaad9abac..aa3ef6458d47fa03062d4f8a58f95cff922258b9 100644 (file)
--- a/module/zfs/dmu.c
+++ b/module/zfs/dmu.c
@@ -1090,6 +1090,9 @@ dmu_write(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
         dmu_buf_rele_array(dbp, numbufs, FTAG);
  }
  
+/*
+ * Note: Lustre is an external consumer of this interface.
+ */
  void
  dmu_write_by_dnode(dnode_t *dn, uint64_t offset, uint64_t size,
      const void *buf, dmu_tx_t *tx)
diff --git a/module/zfs/dmu_objset.c b/module/zfs/dmu_objset.c

index 7a540bdfaf10f0e7a40b1ff7fc93bfad58995b58..3afafd1827acc97318a1c186e96cc93de7d5ae41 100644 (file)
--- a/module/zfs/dmu_objset.c
+++ b/module/zfs/dmu_objset.c
@@ -2908,9 +2908,17 @@ dmu_fsname(const char *snapname, char *buf)
  }
  
  /*
- * Call when we think we're going to write/free space in open context to track
- * the amount of dirty data in the open txg, which is also the amount
- * of memory that can not be evicted until this txg syncs.
+ * Call when we think we're going to write/free space in open context
+ * to track the amount of dirty data in the open txg, which is also the
+ * amount of memory that can not be evicted until this txg syncs.
+ *
+ * Note that there are two conditions where this can be called from
+ * syncing context:
+ *
+ * [1] When we just created the dataset, in which case we go on with
+ *     updating any accounting of dirty data as usual.
+ * [2] When we are dirtying MOS data, in which case we only update the
+ *     pool's accounting of dirty data.
   */
  void
  dmu_objset_willuse_space(objset_t *os, int64_t space, dmu_tx_t *tx)
@@ -2920,8 +2928,9 @@ dmu_objset_willuse_space(objset_t *os, int64_t space, dmu_tx_t *tx)
  
         if (ds != NULL) {
                 dsl_dir_willuse_space(ds->ds_dir, aspace, tx);
-               dsl_pool_dirty_space(dmu_tx_pool(tx), space, tx);
         }
+
+       dsl_pool_dirty_space(dmu_tx_pool(tx), space, tx);
  }
  
  #if defined(_KERNEL)
diff --git a/module/zfs/dsl_pool.c b/module/zfs/dsl_pool.c

index 9fb3a061d946b8479a62cc6e606987dbc050b0be..1f1fd6462720243e59bbdd6ebda97c6a460ae5d3 100644 (file)
--- a/module/zfs/dsl_pool.c
+++ b/module/zfs/dsl_pool.c
@@ -658,15 +658,6 @@ dsl_pool_sync(dsl_pool_t *dp, uint64_t txg)
         }
         VERIFY0(zio_wait(zio));
  
-       /*
-        * We have written all of the accounted dirty data, so our
-        * dp_space_towrite should now be zero.  However, some seldom-used
-        * code paths do not adhere to this (e.g. dbuf_undirty(), also
-        * rounding error in dbuf_write_physdone).
-        * Shore up the accounting of any dirtied space now.
-        */
-       dsl_pool_undirty_space(dp, dp->dp_dirty_pertxg[txg & TXG_MASK], txg);
-
         /*
          * Update the long range free counter after
          * we're done syncing user data
@@ -762,6 +753,21 @@ dsl_pool_sync(dsl_pool_t *dp, uint64_t txg)
                 dsl_pool_sync_mos(dp, tx);
         }
  
+       /*
+        * We have written all of the accounted dirty data, so our
+        * dp_space_towrite should now be zero. However, some seldom-used
+        * code paths do not adhere to this (e.g. dbuf_undirty()). Shore up
+        * the accounting of any dirtied space now.
+        *
+        * Note that, besides any dirty data from datasets, the amount of
+        * dirty data in the MOS is also accounted by the pool. Therefore,
+        * we want to do this cleanup after dsl_pool_sync_mos() so we don't
+        * attempt to update the accounting for the same dirty data twice.
+        * (i.e. at this point we only update the accounting for the space
+        * that we know that we "leaked").
+        */
+       dsl_pool_undirty_space(dp, dp->dp_dirty_pertxg[txg & TXG_MASK], txg);
+
         /*
          * If we modify a dataset in the same txg that we want to destroy it,
          * its dsl_dir's dd_dbuf will be dirty, and thus have a hold on it.
author	Serapheim Dimitropoulos <serapheim@delphix.com>
	Thu, 15 Aug 2019 23:53:53 +0000 (16:53 -0700)
committer	Brian Behlendorf <behlendorf1@llnl.gov>
	Thu, 15 Aug 2019 23:53:53 +0000 (17:53 -0600)
module/zfs/dbuf.c		patch \| blob \| history
module/zfs/dmu.c		patch \| blob \| history
module/zfs/dmu_objset.c		patch \| blob \| history
module/zfs/dsl_pool.c		patch \| blob \| history