Only commit the ZIL once in zpl_writepages() (msync() case).

author Etienne Dechamps <etienne@edechamps.fr>

Sun, 10 Nov 2013 15:00:11 +0000 (15:00 +0000)

committer Brian Behlendorf <behlendorf1@llnl.gov>

Sat, 23 Nov 2013 23:08:29 +0000 (15:08 -0800)
author Etienne Dechamps <etienne@edechamps.fr>
Sun, 10 Nov 2013 15:00:11 +0000 (15:00 +0000)
committer Brian Behlendorf <behlendorf1@llnl.gov>
Sat, 23 Nov 2013 23:08:29 +0000 (15:08 -0800)
diff --git a/include/sys/zfs_znode.h b/include/sys/zfs_znode.h

index b5ab7dbaec6380f7534b8c0b7d1b732a99516b30..620244556ea18dae3b033897f91c504dafa3385a 100644 (file)
--- a/include/sys/zfs_znode.h
+++ b/include/sys/zfs_znode.h
@@ -353,7 +353,8 @@ extern void zfs_log_symlink(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype,
  extern void zfs_log_rename(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype,
      znode_t *sdzp, char *sname, znode_t *tdzp, char *dname, znode_t *szp);
  extern void zfs_log_write(zilog_t *zilog, dmu_tx_t *tx, int txtype,
-    znode_t *zp, offset_t off, ssize_t len, int ioflag);
+    znode_t *zp, offset_t off, ssize_t len, int ioflag,
+    zil_callback_t callback, void *callback_data);
  extern void zfs_log_truncate(zilog_t *zilog, dmu_tx_t *tx, int txtype,
      znode_t *zp, uint64_t off, uint64_t len);
  extern void zfs_log_setattr(zilog_t *zilog, dmu_tx_t *tx, int txtype,
diff --git a/include/sys/zil.h b/include/sys/zil.h

index d3e4b8ec6cfc92c83ef10ac76bde2684730c3331..b6718b93c3c6471dba6186e6a8fb8aa363767cb2 100644 (file)
--- a/include/sys/zil.h
+++ b/include/sys/zil.h
@@ -361,11 +361,15 @@ typedef enum {
         WR_NUM_STATES   /* number of states */
  } itx_wr_state_t;
  
+typedef void (*zil_callback_t)(void *data);
+
  typedef struct itx {
         list_node_t     itx_node;       /* linkage on zl_itx_list */
         void            *itx_private;   /* type-specific opaque data */
         itx_wr_state_t  itx_wr_state;   /* write state */
         uint8_t         itx_sync;       /* synchronous transaction */
+       zil_callback_t  itx_callback;   /* Called when the itx is persistent */
+       void            *itx_callback_data; /* User data for the callback */
         uint64_t        itx_sod;        /* record size on disk */
         uint64_t        itx_oid;        /* object id */
         lr_t            itx_lr;         /* common part of log record */
diff --git a/module/zfs/zfs_log.c b/module/zfs/zfs_log.c

index 0bb44234d72db449e3b166f9bf67f518b8c151d7..cfce83138df2422933d86fe307b2d131b8df8efa 100644 (file)
--- a/module/zfs/zfs_log.c
+++ b/module/zfs/zfs_log.c
@@ -445,21 +445,27 @@ zfs_log_rename(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype,
  }
  
  /*
- * Handles TX_WRITE transactions.
+ * zfs_log_write() handles TX_WRITE transactions. The specified callback is
+ * called as soon as the write is on stable storage (be it via a DMU sync or a
+ * ZIL commit).
   */
  long zfs_immediate_write_sz = 32768;
  
  void
  zfs_log_write(zilog_t *zilog, dmu_tx_t *tx, int txtype,
-       znode_t *zp, offset_t off, ssize_t resid, int ioflag)
+       znode_t *zp, offset_t off, ssize_t resid, int ioflag,
+       zil_callback_t callback, void *callback_data)
  {
         itx_wr_state_t write_state;
         boolean_t slogging;
         uintptr_t fsync_cnt;
         ssize_t immediate_write_sz;
  
-       if (zil_replaying(zilog, tx) || zp->z_unlinked)
+       if (zil_replaying(zilog, tx) || zp->z_unlinked) {
+               if (callback != NULL)
+                       callback(callback_data);
                 return;
+       }
  
         immediate_write_sz = (zilog->zl_logbias == ZFS_LOGBIAS_THROUGHPUT)
             ? 0 : (ssize_t)zfs_immediate_write_sz;
@@ -516,6 +522,8 @@ zfs_log_write(zilog_t *zilog, dmu_tx_t *tx, int txtype,
                     (fsync_cnt == 0))
                         itx->itx_sync = B_FALSE;
  
+               itx->itx_callback = callback;
+               itx->itx_callback_data = callback_data;
                 zil_itx_assign(zilog, itx, tx);
  
                 off += len;
diff --git a/module/zfs/zfs_vnops.c b/module/zfs/zfs_vnops.c

index 84b4fe81f781cbc0c6f92555ef5b4f78ff5450f5..abf3747db29ec70b0bfd10134231872dd3cd1d26 100644 (file)
--- a/module/zfs/zfs_vnops.c
+++ b/module/zfs/zfs_vnops.c
@@ -892,7 +892,8 @@ again:
  
                 error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
  
-               zfs_log_write(zilog, tx, TX_WRITE, zp, woff, tx_bytes, ioflag);
+               zfs_log_write(zilog, tx, TX_WRITE, zp, woff, tx_bytes, ioflag,
+                   NULL, NULL);
                 dmu_tx_commit(tx);
  
                 if (error != 0)
@@ -3822,19 +3823,11 @@ top:
  EXPORT_SYMBOL(zfs_link);
  
  static void
-zfs_putpage_commit_cb(void *arg, int error)
+zfs_putpage_commit_cb(void *arg)
  {
         struct page *pp = arg;
  
-       if (error) {
-               __set_page_dirty_nobuffers(pp);
-
-               if (error != ECANCELED)
-                       SetPageError(pp);
-       } else {
-               ClearPageError(pp);
-       }
-
+       ClearPageError(pp);
         end_page_writeback(pp);
  }
  
@@ -3868,7 +3861,6 @@ zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc)
         uint64_t        mtime[2], ctime[2];
         sa_bulk_attr_t  bulk[3];
         int             cnt = 0;
-       int             sync;
  
         ZFS_ENTER(zsb);
         ZFS_VERIFY_ZP(zp);
@@ -3909,11 +3901,6 @@ zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc)
         rl = zfs_range_lock(zp, pgoff, pglen, RL_WRITER);
         tx = dmu_tx_create(zsb->z_os);
  
-       sync = ((zsb->z_os->os_sync == ZFS_SYNC_ALWAYS) ||
-               (wbc->sync_mode == WB_SYNC_ALL));
-       if (!sync)
-               dmu_tx_callback_register(tx, zfs_putpage_commit_cb, pp);
-
         dmu_tx_hold_write(tx, zp->z_id, pgoff, pglen);
  
         dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
@@ -3923,16 +3910,10 @@ zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc)
                 if (err == ERESTART)
                         dmu_tx_wait(tx);
  
-               /* Will call all registered commit callbacks */
                 dmu_tx_abort(tx);
-
-               /*
-                * For the synchronous case the commit callback must be
-                * explicitly called because there is no registered callback.
-                */
-               if (sync)
-                       zfs_putpage_commit_cb(pp, ECANCELED);
-
+               __set_page_dirty_nobuffers(pp);
+               ClearPageError(pp);
+               end_page_writeback(pp);
                 zfs_range_unlock(rl);
                 ZFS_EXIT(zsb);
                 return (err);
@@ -3955,14 +3936,19 @@ zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc)
  
         err = sa_bulk_update(zp->z_sa_hdl, bulk, cnt, tx);
  
-       zfs_log_write(zsb->z_log, tx, TX_WRITE, zp, pgoff, pglen, 0);
+       zfs_log_write(zsb->z_log, tx, TX_WRITE, zp, pgoff, pglen, 0,
+           zfs_putpage_commit_cb, pp);
         dmu_tx_commit(tx);
  
         zfs_range_unlock(rl);
  
-       if (sync) {
+       if (wbc->sync_mode != WB_SYNC_NONE) {
+               /*
+                * Note that this is rarely called under writepages(), because
+                * writepages() normally handles the entire commit for
+                * performance reasons.
+                */
                 zil_commit(zsb->z_log, zp->z_id);
-               zfs_putpage_commit_cb(pp, err);
         }
  
         ZFS_EXIT(zsb);
diff --git a/module/zfs/zil.c b/module/zfs/zil.c

index 3688c8a1615af3874062c4fe9dcafde0307e604d..839afa956c17163034efd8aab514e2b1cf3720b4 100644 (file)
--- a/module/zfs/zil.c
+++ b/module/zfs/zil.c
@@ -1184,6 +1184,8 @@ zil_itx_create(uint64_t txtype, size_t lrsize)
         itx->itx_sod = lrsize; /* if write & WR_NEED_COPY will be increased */
         itx->itx_lr.lrc_seq = 0;        /* defensive */
         itx->itx_sync = B_TRUE;         /* default is synchronous */
+       itx->itx_callback = NULL;
+       itx->itx_callback_data = NULL;
  
         return (itx);
  }
@@ -1209,6 +1211,8 @@ zil_itxg_clean(itxs_t *itxs)
  
         list = &itxs->i_sync_list;
         while ((itx = list_head(list)) != NULL) {
+               if (itx->itx_callback != NULL)
+                       itx->itx_callback(itx->itx_callback_data);
                 list_remove(list, itx);
                 kmem_free(itx, offsetof(itx_t, itx_lr) +
                     itx->itx_lr.lrc_reclen);
@@ -1219,6 +1223,8 @@ zil_itxg_clean(itxs_t *itxs)
         while ((ian = avl_destroy_nodes(t, &cookie)) != NULL) {
                 list = &ian->ia_list;
                 while ((itx = list_head(list)) != NULL) {
+                       if (itx->itx_callback != NULL)
+                               itx->itx_callback(itx->itx_callback_data);
                         list_remove(list, itx);
                         kmem_free(itx, offsetof(itx_t, itx_lr) +
                             itx->itx_lr.lrc_reclen);
@@ -1285,6 +1291,8 @@ zil_remove_async(zilog_t *zilog, uint64_t oid)
                 mutex_exit(&itxg->itxg_lock);
         }
         while ((itx = list_head(&clean_list)) != NULL) {
+               if (itx->itx_callback != NULL)
+                       itx->itx_callback(itx->itx_callback_data);
                 list_remove(&clean_list, itx);
                 kmem_free(itx, offsetof(itx_t, itx_lr) +
                     itx->itx_lr.lrc_reclen);
@@ -1530,15 +1538,13 @@ zil_commit_writer(zilog_t *zilog)
         }
  
         DTRACE_PROBE1(zil__cw1, zilog_t *, zilog);
-       while ((itx = list_head(&zilog->zl_itx_commit_list))) {
+       for (itx = list_head(&zilog->zl_itx_commit_list); itx != NULL;
+            itx = list_next(&zilog->zl_itx_commit_list, itx)) {
                 txg = itx->itx_lr.lrc_txg;
                 ASSERT(txg);
  
                 if (txg > spa_last_synced_txg(spa) || txg > spa_freeze_txg(spa))
                         lwb = zil_lwb_commit(zilog, itx, lwb);
-               list_remove(&zilog->zl_itx_commit_list, itx);
-               kmem_free(itx, offsetof(itx_t, itx_lr)
-                   + itx->itx_lr.lrc_reclen);
         }
         DTRACE_PROBE1(zil__cw2, zilog_t *, zilog);
  
@@ -1560,6 +1566,17 @@ zil_commit_writer(zilog_t *zilog)
         if (error || lwb == NULL)
                 txg_wait_synced(zilog->zl_dmu_pool, 0);
  
+       while ((itx = list_head(&zilog->zl_itx_commit_list))) {
+               txg = itx->itx_lr.lrc_txg;
+               ASSERT(txg);
+
+               if (itx->itx_callback != NULL)
+                       itx->itx_callback(itx->itx_callback_data);
+               list_remove(&zilog->zl_itx_commit_list, itx);
+               kmem_free(itx, offsetof(itx_t, itx_lr)
+                   + itx->itx_lr.lrc_reclen);
+       }
+
         mutex_enter(&zilog->zl_lock);
  
         /*
diff --git a/module/zfs/zpl_file.c b/module/zfs/zpl_file.c

index 8054645c1e4d2db7f7a457e4ab0cf436f61924d7..0d46eee0268962477ef6b3a2fd559ff7ceaea8e3 100644 (file)
--- a/module/zfs/zpl_file.c
+++ b/module/zfs/zpl_file.c
@@ -23,6 +23,7 @@
   */
  
  
+#include <sys/dmu_objset.h>
  #include <sys/zfs_vfsops.h>
  #include <sys/zfs_vnops.h>
  #include <sys/zfs_znode.h>
@@ -420,7 +421,43 @@ zpl_putpage(struct page *pp, struct writeback_control *wbc, void *data)
  static int
  zpl_writepages(struct address_space *mapping, struct writeback_control *wbc)
  {
-       return write_cache_pages(mapping, wbc, zpl_putpage, mapping);
+       znode_t         *zp = ITOZ(mapping->host);
+       zfs_sb_t        *zsb = ITOZSB(mapping->host);
+       enum writeback_sync_modes sync_mode;
+       int result;
+
+       ZFS_ENTER(zsb);
+       if (zsb->z_os->os_sync == ZFS_SYNC_ALWAYS)
+               wbc->sync_mode = WB_SYNC_ALL;
+       ZFS_EXIT(zsb);
+       sync_mode = wbc->sync_mode;
+
+       /*
+        * We don't want to run write_cache_pages() in SYNC mode here, because
+        * that would make putpage() wait for a single page to be committed to
+        * disk every single time, resulting in atrocious performance. Instead
+        * we run it once in non-SYNC mode so that the ZIL gets all the data,
+        * and then we commit it all in one go.
+        */
+       wbc->sync_mode = WB_SYNC_NONE;
+       result = write_cache_pages(mapping, wbc, zpl_putpage, mapping);
+       if (sync_mode != wbc->sync_mode) {
+               ZFS_ENTER(zsb);
+               ZFS_VERIFY_ZP(zp);
+               zil_commit(zsb->z_log, zp->z_id);
+               ZFS_EXIT(zsb);
+
+               /*
+                * We need to call write_cache_pages() again (we can't just
+                * return after the commit) because the previous call in
+                * non-SYNC mode does not guarantee that we got all the dirty
+                * pages (see the implementation of write_cache_pages() for
+                * details). That being said, this is a no-op in most cases.
+                */
+               wbc->sync_mode = sync_mode;
+               result = write_cache_pages(mapping, wbc, zpl_putpage, mapping);
+       }
+       return (result);
  }
  
  /*
@@ -432,7 +469,10 @@ zpl_writepages(struct address_space *mapping, struct writeback_control *wbc)
  static int
  zpl_writepage(struct page *pp, struct writeback_control *wbc)
  {
-       return zpl_putpage(pp, wbc, pp->mapping);
+       if (ITOZSB(pp->mapping->host)->z_os->os_sync == ZFS_SYNC_ALWAYS)
+               wbc->sync_mode = WB_SYNC_ALL;
+
+       return (zpl_putpage(pp, wbc, pp->mapping));
  }
  
  /*
author	Etienne Dechamps <etienne@edechamps.fr>
	Sun, 10 Nov 2013 15:00:11 +0000 (15:00 +0000)
committer	Brian Behlendorf <behlendorf1@llnl.gov>
	Sat, 23 Nov 2013 23:08:29 +0000 (15:08 -0800)
include/sys/zfs_znode.h		patch | blob | history
include/sys/zil.h		patch | blob | history
module/zfs/zfs_log.c		patch | blob | history
module/zfs/zfs_vnops.c		patch | blob | history
module/zfs/zil.c		patch | blob | history
module/zfs/zpl_file.c		patch | blob | history