OpenZFS 9962 - zil_commit should omit cache thrash

author Prakash Surya <prakash.surya@delphix.com>

Tue, 23 Oct 2018 21:14:27 +0000 (14:14 -0700)

committer Brian Behlendorf <behlendorf1@llnl.gov>

Fri, 7 Dec 2018 19:09:42 +0000 (11:09 -0800)
author Prakash Surya <prakash.surya@delphix.com>
Tue, 23 Oct 2018 21:14:27 +0000 (14:14 -0700)
committer Brian Behlendorf <behlendorf1@llnl.gov>
Fri, 7 Dec 2018 19:09:42 +0000 (11:09 -0800)
diff --git a/cmd/ztest/ztest.c b/cmd/ztest/ztest.c

index 0d9495a28782e800a690eaf59c01a254beed8e86..c4bcd74fc0d2e993169bdaae8f99df9678724b20 100644 (file)
--- a/cmd/ztest/ztest.c
+++ b/cmd/ztest/ztest.c
@@ -2157,6 +2157,7 @@ zil_replay_func_t *ztest_replay_vector[TX_MAX_TYPE] = {
   * ZIL get_data callbacks
   */
  
+/* ARGSUSED */
  static void
  ztest_get_done(zgd_t *zgd, int error)
  {
@@ -2169,9 +2170,6 @@ ztest_get_done(zgd_t *zgd, int error)
         ztest_range_unlock((rl_t *)zgd->zgd_lr);
         ztest_object_unlock(zd, object);
  
-       if (error == 0 && zgd->zgd_bp)
-               zil_lwb_add_block(zgd->zgd_lwb, zgd->zgd_bp);
-
         umem_free(zgd, sizeof (*zgd));
  }
  
diff --git a/include/sys/zil_impl.h b/include/sys/zil_impl.h

index 6901d51f083eb01d19aa2ef9855a8b3249a5b1f6..174fef3341282392e96137bad9705983bc0fe2ce 100644 (file)
--- a/include/sys/zil_impl.h
+++ b/include/sys/zil_impl.h
@@ -47,10 +47,11 @@ extern "C" {
   * via zil_lwb_write_issue(). Again, the zilog's "zl_issuer_lock" must
   * be held when making this transition.
   *
- * After the lwb's zio completes, and the vdev's are flushed, the lwb
- * will transition into the "done" state via zil_lwb_write_done(). When
- * transitioning from "issued" to "done", the zilog's "zl_lock" must be
- * held, *not* the "zl_issuer_lock".
+ * After the lwb's write zio completes, it transitions into the "write
+ * done" state via zil_lwb_write_done(); and then into the "flush done"
+ * state via zil_lwb_flush_vdevs_done(). When transitioning from
+ * "issued" to "write done", and then from "write done" to "flush done",
+ * the zilog's "zl_lock" must be held, *not* the "zl_issuer_lock".
   *
   * The zilog's "zl_issuer_lock" can become heavily contended in certain
   * workloads, so we specifically avoid acquiring that lock when
@@ -67,13 +68,14 @@ extern "C" {
   * "zl_issuer_lock" will prevent a concurrent thread from transitioning
   * that lwb to the "issued" state. Likewise, if an lwb is already in the
   * "issued" state, holding the "zl_lock" will prevent a concurrent
- * thread from transitioning that lwb to the "done" state.
+ * thread from transitioning that lwb to the "write done" state.
   */
  typedef enum {
      LWB_STATE_CLOSED,
      LWB_STATE_OPENED,
      LWB_STATE_ISSUED,
-    LWB_STATE_DONE,
+    LWB_STATE_WRITE_DONE,
+    LWB_STATE_FLUSH_DONE,
      LWB_NUM_STATES
  } lwb_state_t;
  
diff --git a/module/zfs/dmu.c b/module/zfs/dmu.c

index 180c1f12fc9c2d7239e51c28a656eebde112d10d..3eed512e58a4bc0fe421d24a72e1103800e898c4 100644 (file)
--- a/module/zfs/dmu.c
+++ b/module/zfs/dmu.c
@@ -1757,6 +1757,15 @@ dmu_sync_done(zio_t *zio, arc_buf_t *buf, void *varg)
         dmu_sync_arg_t *dsa = varg;
         dbuf_dirty_record_t *dr = dsa->dsa_dr;
         dmu_buf_impl_t *db = dr->dr_dbuf;
+       zgd_t *zgd = dsa->dsa_zgd;
+
+       /*
+        * Record the vdev(s) backing this blkptr so they can be flushed after
+        * the writes for the lwb have completed.
+        */
+       if (zio->io_error == 0) {
+               zil_lwb_add_block(zgd->zgd_lwb, zgd->zgd_bp);
+       }
  
         mutex_enter(&db->db_mtx);
         ASSERT(dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC);
@@ -1806,14 +1815,23 @@ dmu_sync_late_arrival_done(zio_t *zio)
  {
         blkptr_t *bp = zio->io_bp;
         dmu_sync_arg_t *dsa = zio->io_private;
-       ASSERTV(blkptr_t *bp_orig = &zio->io_bp_orig);
-
-       if (zio->io_error == 0 && !BP_IS_HOLE(bp)) {
-               ASSERT(!(zio->io_flags & ZIO_FLAG_NOPWRITE));
-               ASSERT(BP_IS_HOLE(bp_orig) || !BP_EQUAL(bp, bp_orig));
-               ASSERT(zio->io_bp->blk_birth == zio->io_txg);
-               ASSERT(zio->io_txg > spa_syncing_txg(zio->io_spa));
-               zio_free(zio->io_spa, zio->io_txg, zio->io_bp);
+       zgd_t *zgd = dsa->dsa_zgd;
+
+       if (zio->io_error == 0) {
+               /*
+                * Record the vdev(s) backing this blkptr so they can be
+                * flushed after the writes for the lwb have completed.
+                */
+               zil_lwb_add_block(zgd->zgd_lwb, zgd->zgd_bp);
+
+               if (!BP_IS_HOLE(bp)) {
+                       ASSERTV(blkptr_t *bp_orig = &zio->io_bp_orig);
+                       ASSERT(!(zio->io_flags & ZIO_FLAG_NOPWRITE));
+                       ASSERT(BP_IS_HOLE(bp_orig) || !BP_EQUAL(bp, bp_orig));
+                       ASSERT(zio->io_bp->blk_birth == zio->io_txg);
+                       ASSERT(zio->io_txg > spa_syncing_txg(zio->io_spa));
+                       zio_free(zio->io_spa, zio->io_txg, zio->io_bp);
+               }
         }
  
         dmu_tx_commit(dsa->dsa_tx);
diff --git a/module/zfs/zfs_vnops.c b/module/zfs/zfs_vnops.c

index 863adc5950346ffb2e7d85cba72c6ed82748197c..06d93bad04e9eb92e88f9a8560cab3d5c28ffa6a 100644 (file)
--- a/module/zfs/zfs_vnops.c
+++ b/module/zfs/zfs_vnops.c
@@ -976,6 +976,7 @@ zfs_iput_async(struct inode *ip)
                 iput(ip);
  }
  
+/* ARGSUSED */
  void
  zfs_get_done(zgd_t *zgd, int error)
  {
@@ -992,9 +993,6 @@ zfs_get_done(zgd_t *zgd, int error)
          */
         zfs_iput_async(ZTOI(zp));
  
-       if (error == 0 && zgd->zgd_bp)
-               zil_lwb_add_block(zgd->zgd_lwb, zgd->zgd_bp);
-
         kmem_free(zgd, sizeof (zgd_t));
  }
  
@@ -1118,11 +1116,7 @@ zfs_get_data(void *arg, lr_write_t *lr, char *buf, struct lwb *lwb, zio_t *zio)
                                  * TX_WRITE2 relies on the data previously
                                  * written by the TX_WRITE that caused
                                  * EALREADY.  We zero out the BP because
-                                * it is the old, currently-on-disk BP,
-                                * so there's no need to zio_flush() its
-                                * vdevs (flushing would needlesly hurt
-                                * performance, and doesn't work on
-                                * indirect vdevs).
+                                * it is the old, currently-on-disk BP.
                                  */
                                 zgd->zgd_bp = NULL;
                                 BP_ZERO(bp);
diff --git a/module/zfs/zil.c b/module/zfs/zil.c

index a453c26c37eec7f523f2f625a24ee0fb736d42ee..e5de8b5e4c26f621a42ea1c8f1b2be5013ec846c 100644 (file)
--- a/module/zfs/zil.c
+++ b/module/zfs/zil.c
@@ -588,7 +588,7 @@ zil_free_lwb(zilog_t *zilog, lwb_t *lwb)
         ASSERT3P(lwb->lwb_root_zio, ==, NULL);
         ASSERT3U(lwb->lwb_max_txg, <=, spa_syncing_txg(zilog->zl_spa));
         ASSERT(lwb->lwb_state == LWB_STATE_CLOSED ||
-           lwb->lwb_state == LWB_STATE_DONE);
+           lwb->lwb_state == LWB_STATE_FLUSH_DONE);
  
         /*
          * Clear the zilog's field to indicate this lwb is no longer
@@ -1011,7 +1011,8 @@ zil_commit_waiter_link_lwb(zil_commit_waiter_t *zcw, lwb_t *lwb)
         ASSERT3P(zcw->zcw_lwb, ==, NULL);
         ASSERT3P(lwb, !=, NULL);
         ASSERT(lwb->lwb_state == LWB_STATE_OPENED ||
-           lwb->lwb_state == LWB_STATE_ISSUED);
+           lwb->lwb_state == LWB_STATE_ISSUED ||
+           lwb->lwb_state == LWB_STATE_WRITE_DONE);
  
         list_insert_tail(&lwb->lwb_waiters, zcw);
         zcw->zcw_lwb = lwb;
@@ -1057,6 +1058,42 @@ zil_lwb_add_block(lwb_t *lwb, const blkptr_t *bp)
         mutex_exit(&lwb->lwb_vdev_lock);
  }
  
+static void
+zil_lwb_flush_defer(lwb_t *lwb, lwb_t *nlwb)
+{
+       avl_tree_t *src = &lwb->lwb_vdev_tree;
+       avl_tree_t *dst = &nlwb->lwb_vdev_tree;
+       void *cookie = NULL;
+       zil_vdev_node_t *zv;
+
+       ASSERT3S(lwb->lwb_state, ==, LWB_STATE_WRITE_DONE);
+       ASSERT3S(nlwb->lwb_state, !=, LWB_STATE_WRITE_DONE);
+       ASSERT3S(nlwb->lwb_state, !=, LWB_STATE_FLUSH_DONE);
+
+       /*
+        * While 'lwb' is at a point in its lifetime where lwb_vdev_tree does
+        * not need the protection of lwb_vdev_lock (it will only be modified
+        * while holding zilog->zl_lock) as its writes and those of its
+        * children have all completed.  The younger 'nlwb' may be waiting on
+        * future writes to additional vdevs.
+        */
+       mutex_enter(&nlwb->lwb_vdev_lock);
+       /*
+        * Tear down the 'lwb' vdev tree, ensuring that entries which do not
+        * exist in 'nlwb' are moved to it, freeing any would-be duplicates.
+        */
+       while ((zv = avl_destroy_nodes(src, &cookie)) != NULL) {
+               avl_index_t where;
+
+               if (avl_find(dst, zv, &where) == NULL) {
+                       avl_insert(dst, zv, where);
+               } else {
+                       kmem_free(zv, sizeof (*zv));
+               }
+       }
+       mutex_exit(&nlwb->lwb_vdev_lock);
+}
+
  void
  zil_lwb_add_txg(lwb_t *lwb, uint64_t txg)
  {
@@ -1064,9 +1101,13 @@ zil_lwb_add_txg(lwb_t *lwb, uint64_t txg)
  }
  
  /*
- * This function is a called after all VDEVs associated with a given lwb
+ * This function is a called after all vdevs associated with a given lwb
   * write have completed their DKIOCFLUSHWRITECACHE command; or as soon
- * as the lwb write completes, if "zil_nocacheflush" is set.
+ * as the lwb write completes, if "zil_nocacheflush" is set. Further,
+ * all "previous" lwb's will have completed before this function is
+ * called; i.e. this function is called for all previous lwbs before
+ * it's called for "this" lwb (enforced via zio the dependencies
+ * configured in zil_lwb_set_zio_dependency()).
   *
   * The intention is for this function to be called as soon as the
   * contents of an lwb are considered "stable" on disk, and will survive
@@ -1104,7 +1145,9 @@ zil_lwb_flush_vdevs_done(zio_t *zio)
         zilog->zl_last_lwb_latency = gethrtime() - lwb->lwb_issued_timestamp;
  
         lwb->lwb_root_zio = NULL;
-       lwb->lwb_state = LWB_STATE_DONE;
+
+       ASSERT3S(lwb->lwb_state, ==, LWB_STATE_WRITE_DONE);
+       lwb->lwb_state = LWB_STATE_FLUSH_DONE;
  
         if (zilog->zl_last_lwb_opened == lwb) {
                 /*
@@ -1150,14 +1193,17 @@ zil_lwb_flush_vdevs_done(zio_t *zio)
  }
  
  /*
- * This is called when an lwb write completes. This means, this specific
- * lwb was written to disk, and all dependent lwb have also been
- * written to disk.
- *
- * At this point, a DKIOCFLUSHWRITECACHE command hasn't been issued to
- * the VDEVs involved in writing out this specific lwb. The lwb will be
- * "done" once zil_lwb_flush_vdevs_done() is called, which occurs in the
- * zio completion callback for the lwb's root zio.
+ * This is called when an lwb's write zio completes. The callback's
+ * purpose is to issue the DKIOCFLUSHWRITECACHE commands for the vdevs
+ * in the lwb's lwb_vdev_tree. The tree will contain the vdevs involved
+ * in writing out this specific lwb's data, and in the case that cache
+ * flushes have been deferred, vdevs involved in writing the data for
+ * previous lwbs. The writes corresponding to all the vdevs in the
+ * lwb_vdev_tree will have completed by the time this is called, due to
+ * the zio dependencies configured in zil_lwb_set_zio_dependency(),
+ * which takes deferred flushes into account. The lwb will be "done"
+ * once zil_lwb_flush_vdevs_done() is called, which occurs in the zio
+ * completion callback for the lwb's root zio.
   */
  static void
  zil_lwb_write_done(zio_t *zio)
@@ -1168,6 +1214,7 @@ zil_lwb_write_done(zio_t *zio)
         avl_tree_t *t = &lwb->lwb_vdev_tree;
         void *cookie = NULL;
         zil_vdev_node_t *zv;
+       lwb_t *nlwb;
  
         ASSERT3S(spa_config_held(spa, SCL_STATE, RW_READER), !=, 0);
  
@@ -1181,11 +1228,12 @@ zil_lwb_write_done(zio_t *zio)
  
         abd_put(zio->io_abd);
  
-       ASSERT3S(lwb->lwb_state, ==, LWB_STATE_ISSUED);
-
         mutex_enter(&zilog->zl_lock);
+       ASSERT3S(lwb->lwb_state, ==, LWB_STATE_ISSUED);
+       lwb->lwb_state = LWB_STATE_WRITE_DONE;
         lwb->lwb_write_zio = NULL;
         lwb->lwb_fastwrite = FALSE;
+       nlwb = list_next(&zilog->zl_lwb_list, lwb);
         mutex_exit(&zilog->zl_lock);
  
         if (avl_numnodes(t) == 0)
@@ -1204,6 +1252,27 @@ zil_lwb_write_done(zio_t *zio)
                 return;
         }
  
+       /*
+        * If this lwb does not have any threads waiting for it to
+        * complete, we want to defer issuing the DKIOCFLUSHWRITECACHE
+        * command to the vdevs written to by "this" lwb, and instead
+        * rely on the "next" lwb to handle the DKIOCFLUSHWRITECACHE
+        * command for those vdevs. Thus, we merge the vdev tree of
+        * "this" lwb with the vdev tree of the "next" lwb in the list,
+        * and assume the "next" lwb will handle flushing the vdevs (or
+        * deferring the flush(s) again).
+        *
+        * This is a useful performance optimization, especially for
+        * workloads with lots of async write activity and few sync
+        * write and/or fsync activity, as it has the potential to
+        * coalesce multiple flush commands to a vdev into one.
+        */
+       if (list_head(&lwb->lwb_waiters) == NULL && nlwb != NULL) {
+               zil_lwb_flush_defer(lwb, nlwb);
+               ASSERT(avl_is_empty(&lwb->lwb_vdev_tree));
+               return;
+       }
+
         while ((zv = avl_destroy_nodes(t, &cookie)) != NULL) {
                 vdev_t *vd = vdev_lookup_top(spa, zv->zv_vdev);
                 if (vd != NULL)
@@ -1212,6 +1281,73 @@ zil_lwb_write_done(zio_t *zio)
         }
  }
  
+static void
+zil_lwb_set_zio_dependency(zilog_t *zilog, lwb_t *lwb)
+{
+       lwb_t *last_lwb_opened = zilog->zl_last_lwb_opened;
+
+       ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock));
+       ASSERT(MUTEX_HELD(&zilog->zl_lock));
+
+       /*
+        * The zilog's "zl_last_lwb_opened" field is used to build the
+        * lwb/zio dependency chain, which is used to preserve the
+        * ordering of lwb completions that is required by the semantics
+        * of the ZIL. Each new lwb zio becomes a parent of the
+        * "previous" lwb zio, such that the new lwb's zio cannot
+        * complete until the "previous" lwb's zio completes.
+        *
+        * This is required by the semantics of zil_commit(); the commit
+        * waiters attached to the lwbs will be woken in the lwb zio's
+        * completion callback, so this zio dependency graph ensures the
+        * waiters are woken in the correct order (the same order the
+        * lwbs were created).
+        */
+       if (last_lwb_opened != NULL &&
+           last_lwb_opened->lwb_state != LWB_STATE_FLUSH_DONE) {
+               ASSERT(last_lwb_opened->lwb_state == LWB_STATE_OPENED ||
+                   last_lwb_opened->lwb_state == LWB_STATE_ISSUED ||
+                   last_lwb_opened->lwb_state == LWB_STATE_WRITE_DONE);
+
+               ASSERT3P(last_lwb_opened->lwb_root_zio, !=, NULL);
+               zio_add_child(lwb->lwb_root_zio,
+                   last_lwb_opened->lwb_root_zio);
+
+               /*
+                * If the previous lwb's write hasn't already completed,
+                * we also want to order the completion of the lwb write
+                * zios (above, we only order the completion of the lwb
+                * root zios). This is required because of how we can
+                * defer the DKIOCFLUSHWRITECACHE commands for each lwb.
+                *
+                * When the DKIOCFLUSHWRITECACHE commands are defered,
+                * the previous lwb will rely on this lwb to flush the
+                * vdevs written to by that previous lwb. Thus, we need
+                * to ensure this lwb doesn't issue the flush until
+                * after the previous lwb's write completes. We ensure
+                * this ordering by setting the zio parent/child
+                * relationship here.
+                *
+                * Without this relationship on the lwb's write zio,
+                * it's possible for this lwb's write to complete prior
+                * to the previous lwb's write completing; and thus, the
+                * vdevs for the previous lwb would be flushed prior to
+                * that lwb's data being written to those vdevs (the
+                * vdevs are flushed in the lwb write zio's completion
+                * handler, zil_lwb_write_done()).
+                */
+               if (last_lwb_opened->lwb_state != LWB_STATE_WRITE_DONE) {
+                       ASSERT(last_lwb_opened->lwb_state == LWB_STATE_OPENED ||
+                           last_lwb_opened->lwb_state == LWB_STATE_ISSUED);
+
+                       ASSERT3P(last_lwb_opened->lwb_write_zio, !=, NULL);
+                       zio_add_child(lwb->lwb_write_zio,
+                           last_lwb_opened->lwb_write_zio);
+               }
+       }
+}
+
+
  /*
   * This function's purpose is to "open" an lwb such that it is ready to
   * accept new itxs being committed to it. To do this, the lwb's zio
@@ -1263,30 +1399,7 @@ zil_lwb_write_open(zilog_t *zilog, lwb_t *lwb)
  
                 lwb->lwb_state = LWB_STATE_OPENED;
  
-               /*
-                * The zilog's "zl_last_lwb_opened" field is used to
-                * build the lwb/zio dependency chain, which is used to
-                * preserve the ordering of lwb completions that is
-                * required by the semantics of the ZIL. Each new lwb
-                * zio becomes a parent of the "previous" lwb zio, such
-                * that the new lwb's zio cannot complete until the
-                * "previous" lwb's zio completes.
-                *
-                * This is required by the semantics of zil_commit();
-                * the commit waiters attached to the lwbs will be woken
-                * in the lwb zio's completion callback, so this zio
-                * dependency graph ensures the waiters are woken in the
-                * correct order (the same order the lwbs were created).
-                */
-               lwb_t *last_lwb_opened = zilog->zl_last_lwb_opened;
-               if (last_lwb_opened != NULL &&
-                   last_lwb_opened->lwb_state != LWB_STATE_DONE) {
-                       ASSERT(last_lwb_opened->lwb_state == LWB_STATE_OPENED ||
-                           last_lwb_opened->lwb_state == LWB_STATE_ISSUED);
-                       ASSERT3P(last_lwb_opened->lwb_root_zio, !=, NULL);
-                       zio_add_child(lwb->lwb_root_zio,
-                           last_lwb_opened->lwb_root_zio);
-               }
+               zil_lwb_set_zio_dependency(zilog, lwb);
                 zilog->zl_last_lwb_opened = lwb;
         }
         mutex_exit(&zilog->zl_lock);
@@ -2012,7 +2125,8 @@ zil_prune_commit_list(zilog_t *zilog)
                 mutex_enter(&zilog->zl_lock);
  
                 lwb_t *last_lwb = zilog->zl_last_lwb_opened;
-               if (last_lwb == NULL || last_lwb->lwb_state == LWB_STATE_DONE) {
+               if (last_lwb == NULL ||
+                   last_lwb->lwb_state == LWB_STATE_FLUSH_DONE) {
                         /*
                          * All of the itxs this waiter was waiting on
                          * must have already completed (or there were
@@ -2095,7 +2209,8 @@ zil_process_commit_list(zilog_t *zilog)
                 lwb = zil_create(zilog);
         } else {
                 ASSERT3S(lwb->lwb_state, !=, LWB_STATE_ISSUED);
-               ASSERT3S(lwb->lwb_state, !=, LWB_STATE_DONE);
+               ASSERT3S(lwb->lwb_state, !=, LWB_STATE_WRITE_DONE);
+               ASSERT3S(lwb->lwb_state, !=, LWB_STATE_FLUSH_DONE);
         }
  
         while ((itx = list_head(&zilog->zl_itx_commit_list)) != NULL) {
@@ -2217,7 +2332,8 @@ zil_process_commit_list(zilog_t *zilog)
                 ASSERT(list_is_empty(&nolwb_waiters));
                 ASSERT3P(lwb, !=, NULL);
                 ASSERT3S(lwb->lwb_state, !=, LWB_STATE_ISSUED);
-               ASSERT3S(lwb->lwb_state, !=, LWB_STATE_DONE);
+               ASSERT3S(lwb->lwb_state, !=, LWB_STATE_WRITE_DONE);
+               ASSERT3S(lwb->lwb_state, !=, LWB_STATE_FLUSH_DONE);
  
                 /*
                  * At this point, the ZIL block pointed at by the "lwb"
@@ -2340,7 +2456,8 @@ zil_commit_waiter_timeout(zilog_t *zilog, zil_commit_waiter_t *zcw)
          * acquiring it when it's not necessary to do so.
          */
         if (lwb->lwb_state == LWB_STATE_ISSUED ||
-           lwb->lwb_state == LWB_STATE_DONE)
+           lwb->lwb_state == LWB_STATE_WRITE_DONE ||
+           lwb->lwb_state == LWB_STATE_FLUSH_DONE)
                 return;
  
         /*
@@ -2388,7 +2505,8 @@ zil_commit_waiter_timeout(zilog_t *zilog, zil_commit_waiter_t *zcw)
          * more details on the lwb states, and locking requirements.
          */
         if (lwb->lwb_state == LWB_STATE_ISSUED ||
-           lwb->lwb_state == LWB_STATE_DONE)
+           lwb->lwb_state == LWB_STATE_WRITE_DONE ||
+           lwb->lwb_state == LWB_STATE_FLUSH_DONE)
                 goto out;
  
         ASSERT3S(lwb->lwb_state, ==, LWB_STATE_OPENED);
@@ -2561,7 +2679,8 @@ zil_commit_waiter(zilog_t *zilog, zil_commit_waiter_t *zcw)
  
                         IMPLY(lwb != NULL,
                             lwb->lwb_state == LWB_STATE_ISSUED ||
-                           lwb->lwb_state == LWB_STATE_DONE);
+                           lwb->lwb_state == LWB_STATE_WRITE_DONE ||
+                           lwb->lwb_state == LWB_STATE_FLUSH_DONE);
                         cv_wait(&zcw->zcw_cv, &zcw->zcw_lock);
                 }
         }
@@ -3256,15 +3375,14 @@ zil_suspend(const char *osname, void **cookiep)
          * to disk before proceeding. If we used zil_commit instead, it
          * would just call txg_wait_synced(), because zl_suspend is set.
          * txg_wait_synced() doesn't wait for these lwb's to be
-        * LWB_STATE_DONE before returning.
+        * LWB_STATE_FLUSH_DONE before returning.
          */
         zil_commit_impl(zilog, 0);
  
         /*
-        * Now that we've ensured all lwb's are LWB_STATE_DONE,
-        * txg_wait_synced() will be called from within zil_destroy(),
-        * which will ensure the data from the zilog has migrated to the
-        * main pool before it returns.
+        * Now that we've ensured all lwb's are LWB_STATE_FLUSH_DONE, we
+        * use txg_wait_synced() to ensure the data from the zilog has
+        * migrated to the main pool before calling zil_destroy().
          */
         txg_wait_synced(zilog->zl_dmu_pool, 0);
  
diff --git a/module/zfs/zvol.c b/module/zfs/zvol.c

index e6f8451b259bbb4764a03a63b62d2ba4cac18e9b..7395dcb8ddcb67a9e63fa6f3554162eecf432c7a 100644 (file)
--- a/module/zfs/zvol.c
+++ b/module/zfs/zvol.c
@@ -1030,6 +1030,7 @@ out:
  #endif
  }
  
+/* ARGSUSED */
  static void
  zvol_get_done(zgd_t *zgd, int error)
  {
@@ -1038,9 +1039,6 @@ zvol_get_done(zgd_t *zgd, int error)
  
         rangelock_exit(zgd->zgd_lr);
  
-       if (error == 0 && zgd->zgd_bp)
-               zil_lwb_add_block(zgd->zgd_lwb, zgd->zgd_bp);
-
         kmem_free(zgd, sizeof (zgd_t));
  }
author	Prakash Surya <prakash.surya@delphix.com>
	Tue, 23 Oct 2018 21:14:27 +0000 (14:14 -0700)
committer	Brian Behlendorf <behlendorf1@llnl.gov>
	Fri, 7 Dec 2018 19:09:42 +0000 (11:09 -0800)
cmd/ztest/ztest.c		patch \| blob \| history
include/sys/zil_impl.h		patch \| blob \| history
module/zfs/dmu.c		patch \| blob \| history
module/zfs/zfs_vnops.c		patch \| blob \| history
module/zfs/zil.c		patch \| blob \| history
module/zfs/zvol.c		patch \| blob \| history