extern void zfs_log_rename(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype,
znode_t *sdzp, char *sname, znode_t *tdzp, char *dname, znode_t *szp);
extern void zfs_log_write(zilog_t *zilog, dmu_tx_t *tx, int txtype,
- znode_t *zp, offset_t off, ssize_t len, int ioflag);
+ znode_t *zp, offset_t off, ssize_t len, int ioflag,
+ zil_callback_t callback, void *callback_data);
extern void zfs_log_truncate(zilog_t *zilog, dmu_tx_t *tx, int txtype,
znode_t *zp, uint64_t off, uint64_t len);
extern void zfs_log_setattr(zilog_t *zilog, dmu_tx_t *tx, int txtype,
WR_NUM_STATES /* number of states */
} itx_wr_state_t;
+typedef void (*zil_callback_t)(void *data);
+
typedef struct itx {
list_node_t itx_node; /* linkage on zl_itx_list */
void *itx_private; /* type-specific opaque data */
itx_wr_state_t itx_wr_state; /* write state */
uint8_t itx_sync; /* synchronous transaction */
+ zil_callback_t itx_callback; /* called when the itx is persistent */
+ void *itx_callback_data; /* user data for the callback */
uint64_t itx_sod; /* record size on disk */
uint64_t itx_oid; /* object id */
lr_t itx_lr; /* common part of log record */
} itx_t;
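
For context, a minimal sketch of how a consumer might use the new callback arguments, assuming a hypothetical caller that frees a private buffer once its write record is persistent (my_cb_state_t and my_write_done() are invented names for illustration; zfs_putpage() below is the patch's real first user):

	typedef struct my_cb_state {
		void *buf;
		size_t len;
	} my_cb_state_t;

	/* Matches zil_callback_t; runs once the record is persistent. */
	static void
	my_write_done(void *data)
	{
		my_cb_state_t *s = data;

		kmem_free(s->buf, s->len);
		kmem_free(s, sizeof (my_cb_state_t));
	}

	/* At the call site, after dirtying the data in the tx: */
	zfs_log_write(zilog, tx, TX_WRITE, zp, off, len, ioflag,
	    my_write_done, s);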
/*
- * Handles TX_WRITE transactions.
+ * zfs_log_write() handles TX_WRITE transactions. The specified callback is
+ * called as soon as the write is on stable storage (be it via a DMU sync or
+ * a ZIL commit), and is guaranteed to fire exactly once, even on paths that
+ * never create an itx (e.g. during log replay).
*/
long zfs_immediate_write_sz = 32768;
void
zfs_log_write(zilog_t *zilog, dmu_tx_t *tx, int txtype,
- znode_t *zp, offset_t off, ssize_t resid, int ioflag)
+ znode_t *zp, offset_t off, ssize_t resid, int ioflag,
+ zil_callback_t callback, void *callback_data)
{
itx_wr_state_t write_state;
boolean_t slogging;
uintptr_t fsync_cnt;
ssize_t immediate_write_sz;
- if (zil_replaying(zilog, tx) || zp->z_unlinked)
+ if (zil_replaying(zilog, tx) || zp->z_unlinked) {
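+ /*
+ * No itx will be created on this path, so invoke the callback
+ * now to preserve its exactly-once guarantee.
+ */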
+ if (callback != NULL)
+ callback(callback_data);
return;
+ }
immediate_write_sz = (zilog->zl_logbias == ZFS_LOGBIAS_THROUGHPUT)
? 0 : (ssize_t)zfs_immediate_write_sz;
(fsync_cnt == 0))
itx->itx_sync = B_FALSE;
+ itx->itx_callback = callback;
+ itx->itx_callback_data = callback_data;
zil_itx_assign(zilog, itx, tx);
off += len;
error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
- zfs_log_write(zilog, tx, TX_WRITE, zp, woff, tx_bytes, ioflag);
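+ /*
+ * No callback is needed here; synchronous writers are already
+ * covered by the zil_commit() at the end of zfs_write().
+ */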
+ zfs_log_write(zilog, tx, TX_WRITE, zp, woff, tx_bytes, ioflag,
+ NULL, NULL);
dmu_tx_commit(tx);
if (error != 0)
EXPORT_SYMBOL(zfs_link);
static void
-zfs_putpage_commit_cb(void *arg, int error)
+zfs_putpage_commit_cb(void *arg)
{
struct page *pp = arg;
- if (error) {
- __set_page_dirty_nobuffers(pp);
-
- if (error != ECANCELED)
- SetPageError(pp);
- } else {
- ClearPageError(pp);
- }
-
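+ /* Called via zfs_log_write() once the page's data is persistent. */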
+ ClearPageError(pp);
end_page_writeback(pp);
}
uint64_t mtime[2], ctime[2];
sa_bulk_attr_t bulk[3];
int cnt = 0;
- int sync;
ZFS_ENTER(zsb);
ZFS_VERIFY_ZP(zp);
rl = zfs_range_lock(zp, pgoff, pglen, RL_WRITER);
tx = dmu_tx_create(zsb->z_os);
- sync = ((zsb->z_os->os_sync == ZFS_SYNC_ALWAYS) ||
- (wbc->sync_mode == WB_SYNC_ALL));
- if (!sync)
- dmu_tx_callback_register(tx, zfs_putpage_commit_cb, pp);
-
dmu_tx_hold_write(tx, zp->z_id, pgoff, pglen);
dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
if (err == ERESTART)
dmu_tx_wait(tx);
- /* Will call all registered commit callbacks */
dmu_tx_abort(tx);
-
- /*
- * For the synchronous case the commit callback must be
- * explicitly called because there is no registered callback.
- */
- if (sync)
- zfs_putpage_commit_cb(pp, ECANCELED);
-
+ __set_page_dirty_nobuffers(pp);
+ ClearPageError(pp);
+ end_page_writeback(pp);
zfs_range_unlock(rl);
ZFS_EXIT(zsb);
return (err);
err = sa_bulk_update(zp->z_sa_hdl, bulk, cnt, tx);
- zfs_log_write(zsb->z_log, tx, TX_WRITE, zp, pgoff, pglen, 0);
+ zfs_log_write(zsb->z_log, tx, TX_WRITE, zp, pgoff, pglen, 0,
+ zfs_putpage_commit_cb, pp);
dmu_tx_commit(tx);
zfs_range_unlock(rl);
- if (sync) {
+ if (wbc->sync_mode != WB_SYNC_NONE) {
+ /*
+ * Note that this is rarely called under writepages(), because
+ * writepages() normally handles the entire commit for
+ * performance reasons.
+ */
zil_commit(zsb->z_log, zp->z_id);
- zfs_putpage_commit_cb(pp, err);
}
ZFS_EXIT(zsb);
itx->itx_sod = lrsize; /* if write & WR_NEED_COPY will be increased */
itx->itx_lr.lrc_seq = 0; /* defensive */
itx->itx_sync = B_TRUE; /* default is synchronous */
+ itx->itx_callback = NULL;
+ itx->itx_callback_data = NULL;
return (itx);
}
list = &itxs->i_sync_list;
while ((itx = list_head(list)) != NULL) {
+ if (itx->itx_callback != NULL)
+ itx->itx_callback(itx->itx_callback_data);
list_remove(list, itx);
kmem_free(itx, offsetof(itx_t, itx_lr) +
itx->itx_lr.lrc_reclen);
while ((ian = avl_destroy_nodes(t, &cookie)) != NULL) {
list = &ian->ia_list;
while ((itx = list_head(list)) != NULL) {
+ if (itx->itx_callback != NULL)
+ itx->itx_callback(itx->itx_callback_data);
list_remove(list, itx);
kmem_free(itx, offsetof(itx_t, itx_lr) +
itx->itx_lr.lrc_reclen);
mutex_exit(&itxg->itxg_lock);
}
while ((itx = list_head(&clean_list)) != NULL) {
+ if (itx->itx_callback != NULL)
+ itx->itx_callback(itx->itx_callback_data);
list_remove(&clean_list, itx);
kmem_free(itx, offsetof(itx_t, itx_lr) +
itx->itx_lr.lrc_reclen);
}
DTRACE_PROBE1(zil__cw1, zilog_t *, zilog);
- while ((itx = list_head(&zilog->zl_itx_commit_list))) {
+ for (itx = list_head(&zilog->zl_itx_commit_list); itx != NULL;
+ itx = list_next(&zilog->zl_itx_commit_list, itx)) {
txg = itx->itx_lr.lrc_txg;
ASSERT(txg);
if (txg > spa_last_synced_txg(spa) || txg > spa_freeze_txg(spa))
lwb = zil_lwb_commit(zilog, itx, lwb);
- list_remove(&zilog->zl_itx_commit_list, itx);
- kmem_free(itx, offsetof(itx_t, itx_lr)
- + itx->itx_lr.lrc_reclen);
}
DTRACE_PROBE1(zil__cw2, zilog_t *, zilog);
if (error || lwb == NULL)
txg_wait_synced(zilog->zl_dmu_pool, 0);
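+ /*
+ * At this point the records are on stable storage: either their
+ * lwb zios completed or txg_wait_synced() pushed the data out
+ * through the DMU. It is now safe to fire the callbacks and free
+ * the itxs.
+ */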
+ while ((itx = list_head(&zilog->zl_itx_commit_list))) {
+ txg = itx->itx_lr.lrc_txg;
+ ASSERT(txg);
+
+ if (itx->itx_callback != NULL)
+ itx->itx_callback(itx->itx_callback_data);
+ list_remove(&zilog->zl_itx_commit_list, itx);
+ kmem_free(itx, offsetof(itx_t, itx_lr)
+ + itx->itx_lr.lrc_reclen);
+ }
+
mutex_enter(&zilog->zl_lock);
/*
*/
+#include <sys/dmu_objset.h>
#include <sys/zfs_vfsops.h>
#include <sys/zfs_vnops.h>
#include <sys/zfs_znode.h>
static int
zpl_writepages(struct address_space *mapping, struct writeback_control *wbc)
{
- return write_cache_pages(mapping, wbc, zpl_putpage, mapping);
+ znode_t *zp = ITOZ(mapping->host);
+ zfs_sb_t *zsb = ITOZSB(mapping->host);
+ enum writeback_sync_modes sync_mode;
+ int result;
+
+ ZFS_ENTER(zsb);
+ if (zsb->z_os->os_sync == ZFS_SYNC_ALWAYS)
+ wbc->sync_mode = WB_SYNC_ALL;
+ ZFS_EXIT(zsb);
+ sync_mode = wbc->sync_mode;
+
+ /*
+ * We don't want to run write_cache_pages() in SYNC mode here, because
+ * that would make putpage() wait for a single page to be committed to
+ * disk every single time, resulting in atrocious performance. Instead
+ * we run it once in non-SYNC mode so that the ZIL gets all the data,
+ * and then we commit it all in one go.
+ */
+ wbc->sync_mode = WB_SYNC_NONE;
+ result = write_cache_pages(mapping, wbc, zpl_putpage, mapping);
+ if (sync_mode != wbc->sync_mode) {
+ ZFS_ENTER(zsb);
+ ZFS_VERIFY_ZP(zp);
+ zil_commit(zsb->z_log, zp->z_id);
+ ZFS_EXIT(zsb);
+
+ /*
+ * We need to call write_cache_pages() again (we can't just
+ * return after the commit) because the previous call in
+ * non-SYNC mode does not guarantee that we got all the dirty
+ * pages (see the implementation of write_cache_pages() for
+ * details). That being said, this is a no-op in most cases.
+ */
+ wbc->sync_mode = sync_mode;
+ result = write_cache_pages(mapping, wbc, zpl_putpage, mapping);
+ }
+ return (result);
}
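
To illustrate the case this change targets, a userspace sketch (illustrative only; error checking omitted, and the file is assumed to live on a ZFS dataset): msync(MS_SYNC) reaches zpl_writepages() with WB_SYNC_ALL, so the whole dirty range is now covered by a single zil_commit() instead of one commit per dirty page.

	#include <fcntl.h>
	#include <string.h>
	#include <sys/mman.h>
	#include <unistd.h>

	int
	main(void)
	{
		size_t len = 1 << 20;
		int fd = open("file", O_RDWR);	/* on a ZFS dataset */
		char *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
		    MAP_SHARED, fd, 0);

		memset(p, 0xab, len);	/* dirty 256 4K pages */
		msync(p, len, MS_SYNC);	/* one ZIL commit, not 256 */
		munmap(p, len);
		return (close(fd));
	}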
/*
static int
zpl_writepage(struct page *pp, struct writeback_control *wbc)
{
- return zpl_putpage(pp, wbc, pp->mapping);
+ if (ITOZSB(pp->mapping->host)->z_os->os_sync == ZFS_SYNC_ALWAYS)
+ wbc->sync_mode = WB_SYNC_ALL;
+
+ return (zpl_putpage(pp, wbc, pp->mapping));
}
/*