Illumos #4045 write throttle & i/o scheduler performance work

author Matthew Ahrens <mahrens@delphix.com>

Thu, 29 Aug 2013 03:01:20 +0000 (20:01 -0700)

committer Brian Behlendorf <behlendorf1@llnl.gov>

Fri, 6 Dec 2013 17:32:43 +0000 (09:32 -0800)
author Matthew Ahrens <mahrens@delphix.com>
Thu, 29 Aug 2013 03:01:20 +0000 (20:01 -0700)
committer Brian Behlendorf <behlendorf1@llnl.gov>
Fri, 6 Dec 2013 17:32:43 +0000 (09:32 -0800)
diff --git a/include/sys/Makefile.am b/include/sys/Makefile.am

index 34c715101054185647b1a1d91c39678c71583f31..9d77566277bde21503aa6c2584a8000af645a608 100644 (file)
--- a/include/sys/Makefile.am
+++ b/include/sys/Makefile.am
@@ -62,6 +62,7 @@ COMMON_H = \
         $(top_srcdir)/include/sys/zfs_context.h \
         $(top_srcdir)/include/sys/zfs_ctldir.h \
         $(top_srcdir)/include/sys/zfs_debug.h \
+       $(top_srcdir)/include/sys/zfs_delay.h \
         $(top_srcdir)/include/sys/zfs_dir.h \
         $(top_srcdir)/include/sys/zfs_fuid.h \
         $(top_srcdir)/include/sys/zfs_rlock.h \
diff --git a/include/sys/arc.h b/include/sys/arc.h

index 221946da30a6330509c09366cf69137b2ddcd5d5..9d68d3b43a730ef6883b87a9035eeed66c74423c 100644 (file)
--- a/include/sys/arc.h
+++ b/include/sys/arc.h
@@ -145,12 +145,13 @@ int arc_referenced(arc_buf_t *buf);
  #endif
  
  int arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp,
-    arc_done_func_t *done, void *private, int priority, int flags,
+    arc_done_func_t *done, void *private, zio_priority_t priority, int flags,
      uint32_t *arc_flags, const zbookmark_t *zb);
  zio_t *arc_write(zio_t *pio, spa_t *spa, uint64_t txg,
      blkptr_t *bp, arc_buf_t *buf, boolean_t l2arc, boolean_t l2arc_compress,
-    const zio_prop_t *zp, arc_done_func_t *ready, arc_done_func_t *done,
-    void *private, int priority, int zio_flags, const zbookmark_t *zb);
+    const zio_prop_t *zp, arc_done_func_t *ready, arc_done_func_t *physdone,
+    arc_done_func_t *done, void *private, zio_priority_t priority,
+    int zio_flags, const zbookmark_t *zb);
  
  arc_prune_t *arc_add_prune_callback(arc_prune_func_t *func, void *private);
  void arc_remove_prune_callback(arc_prune_t *p);
@@ -179,11 +180,6 @@ void l2arc_fini(void);
  void l2arc_start(void);
  void l2arc_stop(void);
  
-/* Global tunings */
-extern int zfs_write_limit_shift;
-extern unsigned long zfs_write_limit_max;
-extern kmutex_t zfs_write_limit_lock;
-
  #ifndef _KERNEL
  extern boolean_t arc_watch;
  #endif
diff --git a/include/sys/dbuf.h b/include/sys/dbuf.h

index 3140665ab62187c06c4f098e6e6f984526bb7579..23b919bf758cf4a1eb75f0365403f68b9256dff8 100644 (file)
--- a/include/sys/dbuf.h
+++ b/include/sys/dbuf.h
@@ -112,6 +112,9 @@ typedef struct dbuf_dirty_record {
         /* pointer to parent dirty record */
         struct dbuf_dirty_record *dr_parent;
  
+       /* How much space was charged to dsl_pool_dirty_space() for this? */
+       unsigned int dr_accounted;
+
         union dirty_types {
                 struct dirty_indirect {
  
@@ -252,7 +255,7 @@ dmu_buf_impl_t *dbuf_hold_level(struct dnode *dn, int level, uint64_t blkid,
  int dbuf_hold_impl(struct dnode *dn, uint8_t level, uint64_t blkid, int create,
      void *tag, dmu_buf_impl_t **dbp);
  
-void dbuf_prefetch(struct dnode *dn, uint64_t blkid);
+void dbuf_prefetch(struct dnode *dn, uint64_t blkid, zio_priority_t prio);
  
  void dbuf_add_ref(dmu_buf_impl_t *db, void *tag);
  uint64_t dbuf_refcount(dmu_buf_impl_t *db);
diff --git a/include/sys/dmu.h b/include/sys/dmu.h

index 5485131dfc63e40754d45ed2b0cc1fe9c5529b52..1314c1eed42a03e2bee27b1853d4ede30cc79c85 100644 (file)
--- a/include/sys/dmu.h
+++ b/include/sys/dmu.h
@@ -218,6 +218,7 @@ typedef enum dmu_object_type {
  typedef enum txg_how {
         TXG_WAIT = 1,
         TXG_NOWAIT,
+       TXG_WAITED,
  } txg_how_t;
  
  void byteswap_uint64_array(void *buf, size_t size);
diff --git a/include/sys/dmu_tx.h b/include/sys/dmu_tx.h

index 48a507e347a5172e68507d1502e648f7305673a9..f6a62af4b85b7d9c4e1912f8f557c36ed79d2229 100644 (file)
--- a/include/sys/dmu_tx.h
+++ b/include/sys/dmu_tx.h
@@ -23,7 +23,7 @@
   * Use is subject to license terms.
   */
  /*
- * Copyright (c) 2012 by Delphix. All rights reserved.
+ * Copyright (c) 2013 by Delphix. All rights reserved.
   */
  
  #ifndef        _SYS_DMU_TX_H
@@ -60,8 +60,22 @@ struct dmu_tx {
         txg_handle_t tx_txgh;
         void *tx_tempreserve_cookie;
         struct dmu_tx_hold *tx_needassign_txh;
-       list_t tx_callbacks; /* list of dmu_tx_callback_t on this dmu_tx */
-       uint8_t tx_anyobj;
+
+       /* list of dmu_tx_callback_t on this dmu_tx */
+       list_t tx_callbacks;
+
+       /* placeholder for syncing context, doesn't need specific holds */
+       boolean_t tx_anyobj;
+
+       /* has this transaction already been delayed? */
+       boolean_t tx_waited;
+
+       /* time this transaction was created */
+       hrtime_t tx_start;
+
+       /* need to wait for sufficient dirty space */
+       boolean_t tx_wait_dirty;
+
         int tx_err;
  #ifdef DEBUG_DMU_TX
         uint64_t tx_space_towrite;
@@ -121,7 +135,8 @@ typedef struct dmu_tx_stats {
         kstat_named_t dmu_tx_memory_reclaim;
         kstat_named_t dmu_tx_memory_inflight;
         kstat_named_t dmu_tx_dirty_throttle;
-       kstat_named_t dmu_tx_write_limit;
+       kstat_named_t dmu_tx_dirty_delay;
+       kstat_named_t dmu_tx_dirty_over_max;
         kstat_named_t dmu_tx_quota;
  } dmu_tx_stats_t;
  
diff --git a/include/sys/dsl_dir.h b/include/sys/dsl_dir.h

index 2477e89af64ac037a4404f13bab1676be0cc4761..d69d47696f230ae2f396b8b31ea61f881967b86a 100644 (file)
--- a/include/sys/dsl_dir.h
+++ b/include/sys/dsl_dir.h
@@ -20,7 +20,7 @@
   */
  /*
   * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012 by Delphix. All rights reserved.
+ * Copyright (c) 2013 by Delphix. All rights reserved.
   */
  
  #ifndef        _SYS_DSL_DIR_H
diff --git a/include/sys/dsl_pool.h b/include/sys/dsl_pool.h

index 0f9471486ef3952154a9b97019a3d9137fcd1825..d5bad8dc1919d3898ccdeec16a8de202a338a922 100644 (file)
--- a/include/sys/dsl_pool.h
+++ b/include/sys/dsl_pool.h
@@ -20,7 +20,7 @@
   */
  /*
   * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012 by Delphix. All rights reserved.
+ * Copyright (c) 2013 by Delphix. All rights reserved.
   */
  
  #ifndef        _SYS_DSL_POOL_H
@@ -51,6 +51,14 @@ struct dsl_pool;
  struct dmu_tx;
  struct dsl_scan;
  
+extern unsigned long zfs_dirty_data_max;
+extern unsigned long zfs_dirty_data_max_max;
+extern unsigned long zfs_dirty_data_sync;
+extern int zfs_dirty_data_max_percent;
+extern int zfs_dirty_data_max_max_percent;
+extern int zfs_delay_min_dirty_percent;
+extern unsigned long zfs_delay_scale;
+
  /* These macros are for indexing into the zfs_all_blkstats_t. */
  #define        DMU_OT_DEFERRED DMU_OT_NONE
  #define        DMU_OT_OTHER    DMU_OT_NUMTYPES /* place holder for DMU_OT() types */
@@ -85,9 +93,6 @@ typedef struct dsl_pool {
  
         /* No lock needed - sync context only */
         blkptr_t dp_meta_rootbp;
-       hrtime_t dp_read_overhead;
-       uint64_t dp_throughput; /* bytes per millisec */
-       uint64_t dp_write_limit;
         uint64_t dp_tmp_userrefs_obj;
         bpobj_t dp_free_bpobj;
         uint64_t dp_bptree_obj;
@@ -97,12 +102,19 @@ typedef struct dsl_pool {
  
         /* Uses dp_lock */
         kmutex_t dp_lock;
-       uint64_t dp_space_towrite[TXG_SIZE];
-       uint64_t dp_tempreserved[TXG_SIZE];
+       kcondvar_t dp_spaceavail_cv;
+       uint64_t dp_dirty_pertxg[TXG_SIZE];
+       uint64_t dp_dirty_total;
         uint64_t dp_mos_used_delta;
         uint64_t dp_mos_compressed_delta;
         uint64_t dp_mos_uncompressed_delta;
  
+       /*
+        * Time of most recently scheduled (furthest in the future)
+        * wakeup for delayed transactions.
+        */
+       hrtime_t dp_last_wakeup;
+
         /* Has its own locking */
         tx_state_t dp_tx;
         txg_list_t dp_dirty_datasets;
@@ -131,10 +143,8 @@ void dsl_pool_sync_done(dsl_pool_t *dp, uint64_t txg);
  int dsl_pool_sync_context(dsl_pool_t *dp);
  uint64_t dsl_pool_adjustedsize(dsl_pool_t *dp, boolean_t netfree);
  uint64_t dsl_pool_adjustedfree(dsl_pool_t *dp, boolean_t netfree);
-int dsl_pool_tempreserve_space(dsl_pool_t *dp, uint64_t space, dmu_tx_t *tx);
-void dsl_pool_tempreserve_clear(dsl_pool_t *dp, int64_t space, dmu_tx_t *tx);
-void dsl_pool_memory_pressure(dsl_pool_t *dp);
-void dsl_pool_willuse_space(dsl_pool_t *dp, int64_t space, dmu_tx_t *tx);
+void dsl_pool_dirty_space(dsl_pool_t *dp, int64_t space, dmu_tx_t *tx);
+void dsl_pool_undirty_space(dsl_pool_t *dp, int64_t space, uint64_t txg);
  void dsl_free(dsl_pool_t *dp, uint64_t txg, const blkptr_t *bpp);
  void dsl_free_sync(zio_t *pio, dsl_pool_t *dp, uint64_t txg,
      const blkptr_t *bpp);
@@ -143,6 +153,7 @@ void dsl_pool_upgrade_clones(dsl_pool_t *dp, dmu_tx_t *tx);
  void dsl_pool_upgrade_dir_clones(dsl_pool_t *dp, dmu_tx_t *tx);
  void dsl_pool_mos_diduse_space(dsl_pool_t *dp,
      int64_t used, int64_t comp, int64_t uncomp);
+boolean_t dsl_pool_need_dirty_delay(dsl_pool_t *dp);
  void dsl_pool_config_enter(dsl_pool_t *dp, void *tag);
  void dsl_pool_config_exit(dsl_pool_t *dp, void *tag);
  boolean_t dsl_pool_config_held(dsl_pool_t *dp);
diff --git a/include/sys/sa_impl.h b/include/sys/sa_impl.h

index 582bd76f0180cb65f7f74bcb565066c9b25ad5f2..fcbd8eb34e9171061aef49ae7abbdcfdd1751908 100644 (file)
--- a/include/sys/sa_impl.h
+++ b/include/sys/sa_impl.h
@@ -20,7 +20,7 @@
   */
  /*
   * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012 by Delphix. All rights reserved.
+ * Copyright (c) 2013 by Delphix. All rights reserved.
   */
  
  #ifndef        _SYS_SA_IMPL_H
@@ -153,7 +153,7 @@ struct sa_os {
   *
   * The header has a fixed portion with a variable number
   * of "lengths" depending on the number of variable sized
- * attribues which are determined by the "layout number"
+ * attributes which are determined by the "layout number"
   */
  
  #define        SA_MAGIC        0x2F505A  /* ZFS SA */
diff --git a/include/sys/spa_impl.h b/include/sys/spa_impl.h

index 2e65ce8454846182044d2e74d99d4c198ae87f7e..55515c1fc36953aa85ccb027c8245cee944df87b 100644 (file)
--- a/include/sys/spa_impl.h
+++ b/include/sys/spa_impl.h
@@ -20,7 +20,7 @@
   */
  /*
   * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012 by Delphix. All rights reserved.
+ * Copyright (c) 2013 by Delphix. All rights reserved.
   * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
   */
  
@@ -234,7 +234,7 @@ struct spa {
         uint64_t        spa_feat_desc_obj;      /* Feature descriptions */
         taskqid_t       spa_deadman_tqid;       /* Task id */
         uint64_t        spa_deadman_calls;      /* number of deadman calls */
-       uint64_t        spa_sync_starttime;     /* starting time fo spa_sync */
+       hrtime_t        spa_sync_starttime;     /* starting time of spa_sync */
         uint64_t        spa_deadman_synctime;   /* deadman expiration timer */
         spa_stats_t     spa_stats;              /* assorted spa statistics */
  
diff --git a/include/sys/txg.h b/include/sys/txg.h

index 9e547819b2480c07dad1ce415c470c92ff69d9c3..1bb6bac917e43e34fcaf0b8160b0b1b04aa29047 100644 (file)
--- a/include/sys/txg.h
+++ b/include/sys/txg.h
@@ -23,7 +23,7 @@
   * Use is subject to license terms.
   */
  /*
- * Copyright (c) 2012 by Delphix. All rights reserved.
+ * Copyright (c) 2013 by Delphix. All rights reserved.
   */
  
  #ifndef _SYS_TXG_H
@@ -76,6 +76,7 @@ extern void txg_register_callbacks(txg_handle_t *txghp, list_t *tx_callbacks);
  
  extern void txg_delay(struct dsl_pool *dp, uint64_t txg, hrtime_t delta,
      hrtime_t resolution);
+extern void txg_kick(struct dsl_pool *dp);
  
  /*
   * Wait until the given transaction group has finished syncing.
diff --git a/include/sys/txg_impl.h b/include/sys/txg_impl.h

index 8a0977f1f4dc02525c6787229c8d0679ff1cf22e..e583d61eac2f345010245944d87ae588a051c19a 100644 (file)
--- a/include/sys/txg_impl.h
+++ b/include/sys/txg_impl.h
@@ -18,6 +18,7 @@
   *
   * CDDL HEADER END
   */
+
  /*
   * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
   * Use is subject to license terms.
@@ -89,11 +90,14 @@ struct tx_cpu {
  typedef struct tx_state {
         tx_cpu_t        *tx_cpu;        /* protects access to tx_open_txg */
         kmutex_t        tx_sync_lock;   /* protects the rest of this struct */
+
         uint64_t        tx_open_txg;    /* currently open txg id */
         uint64_t        tx_quiesced_txg; /* quiesced txg waiting for sync */
         uint64_t        tx_syncing_txg; /* currently syncing txg id */
         uint64_t        tx_synced_txg;  /* last synced txg id */
  
+       hrtime_t        tx_open_time;   /* start time of tx_open_txg */
+
         uint64_t        tx_sync_txg_waiting; /* txg we're waiting to sync */
         uint64_t        tx_quiesce_txg_waiting; /* txg we're waiting to open */
  
diff --git a/include/sys/vdev_impl.h b/include/sys/vdev_impl.h

index 45a2b5e48471b4c277a24ca040f53ae8af255af7..4b465d2958c16c63fc3104c91df20ef1963b7e3b 100644 (file)
--- a/include/sys/vdev_impl.h
+++ b/include/sys/vdev_impl.h
@@ -100,12 +100,22 @@ struct vdev_cache {
         kmutex_t        vc_lock;
  };
  
+typedef struct vdev_queue_class {
+       uint32_t        vqc_active;
+
+       /*
+        * Sorted by offset or timestamp, depending on if the queue is
+        * LBA-ordered vs FIFO.
+        */
+       avl_tree_t      vqc_queued_tree;
+} vdev_queue_class_t;
+
  struct vdev_queue {
-       avl_tree_t      vq_deadline_tree;
-       avl_tree_t      vq_read_tree;
-       avl_tree_t      vq_write_tree;
-       avl_tree_t      vq_pending_tree;
-       hrtime_t        vq_io_complete_ts;
+       vdev_t          *vq_vdev;
+       vdev_queue_class_t vq_class[ZIO_PRIORITY_NUM_QUEUEABLE];
+       avl_tree_t      vq_active_tree;
+       uint64_t        vq_last_offset;
+       hrtime_t        vq_io_complete_ts; /* time last i/o completed */
         hrtime_t        vq_io_delta_ts;
         list_t          vq_io_list;
         kmutex_t        vq_lock;
diff --git a/include/sys/zfs_context.h b/include/sys/zfs_context.h

index 28a1306aa41938a0925b6a01ab603f0dad4af6c0..3fd9e1be01913945cc5e714c25e38de71988d998 100644 (file)
--- a/include/sys/zfs_context.h
+++ b/include/sys/zfs_context.h
@@ -25,7 +25,7 @@
  /*
   * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
   * Copyright (c) 2012, Joyent, Inc. All rights reserved.
- * Copyright (c) 2012 by Delphix. All rights reserved.
+ * Copyright (c) 2013 by Delphix. All rights reserved.
   */
  
  #ifndef _SYS_ZFS_CONTEXT_H
@@ -61,6 +61,7 @@
  #include <sys/zone.h>
  #include <sys/sdt.h>
  #include <sys/zfs_debug.h>
+#include <sys/zfs_delay.h>
  #include <sys/fm/fs/zfs.h>
  #include <sys/sunddi.h>
  #include <sys/ctype.h>
@@ -224,6 +225,8 @@ typedef void (*thread_func_t)(void *);
  typedef void (*thread_func_arg_t)(void *);
  typedef pthread_t kt_did_t;
  
+#define kpreempt(x)    ((void)0)
+
  typedef struct kthread {
         kt_did_t        t_tid;
         thread_func_t   t_func;
@@ -708,6 +711,15 @@ void ksiddomain_rele(ksiddomain_t *);
  #define        ddi_log_sysevent(_a, _b, _c, _d, _e, _f, _g) \
         sysevent_post_event(_c, _d, _b, "libzpool", _e, _f)
  
+#define zfs_sleep_until(wakeup)                                                \
+       do {                                                            \
+               hrtime_t delta = wakeup - gethrtime();                  \
+               struct timespec ts;                                     \
+               ts.tv_sec = delta / NANOSEC;                            \
+               ts.tv_nsec = delta % NANOSEC;                           \
+               (void) nanosleep(&ts, NULL);                            \
+       } while (0)
+
  #endif /* _KERNEL */
  
  #endif /* _SYS_ZFS_CONTEXT_H */
diff --git a/include/sys/zfs_delay.h b/include/sys/zfs_delay.h

new file mode 100644 (file)

index 0000000..4c76631
--- /dev/null
+++ b/include/sys/zfs_delay.h
@@ -0,0 +1,41 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+#ifndef        _SYS_FS_ZFS_DELAY_H
+#define        _SYS_FS_ZFS_DELAY_H
+
+#include <linux/delay_compat.h>
+
+/*
+ * Generic wrapper to sleep until a given time.
+ */
+#define zfs_sleep_until(wakeup)                                                \
+       do {                                                            \
+               hrtime_t delta = wakeup - gethrtime();                  \
+                                                                       \
+               if (delta > 0) {                                        \
+                       unsigned long delta_us;                         \
+                       delta_us = delta / (NANOSEC / MICROSEC);        \
+                       usleep_range(delta_us, delta_us + 100);         \
+               }                                                       \
+       } while (0)
+
+#endif /* _SYS_FS_ZFS_DELAY_H */
diff --git a/include/sys/zio.h b/include/sys/zio.h

index b505ca1e63e862b5e0c14fa582f5dc4bf57b32e0..cfb256f0f7ba85a046b4a75df12da70fe258c0be 100644 (file)
--- a/include/sys/zio.h
+++ b/include/sys/zio.h
@@ -22,7 +22,7 @@
  /*
   * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
   * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
- * Copyright (c) 2012 by Delphix. All rights reserved.
+ * Copyright (c) 2013 by Delphix. All rights reserved.
   * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
   */
  
@@ -130,19 +130,16 @@ enum zio_compress {
  #define        ZIO_FAILURE_MODE_CONTINUE       1
  #define        ZIO_FAILURE_MODE_PANIC          2
  
-#define        ZIO_PRIORITY_NOW                (zio_priority_table[0])
-#define        ZIO_PRIORITY_SYNC_READ          (zio_priority_table[1])
-#define        ZIO_PRIORITY_SYNC_WRITE         (zio_priority_table[2])
-#define        ZIO_PRIORITY_LOG_WRITE          (zio_priority_table[3])
-#define        ZIO_PRIORITY_CACHE_FILL         (zio_priority_table[4])
-#define        ZIO_PRIORITY_AGG                (zio_priority_table[5])
-#define        ZIO_PRIORITY_FREE               (zio_priority_table[6])
-#define        ZIO_PRIORITY_ASYNC_WRITE        (zio_priority_table[7])
-#define        ZIO_PRIORITY_ASYNC_READ         (zio_priority_table[8])
-#define        ZIO_PRIORITY_RESILVER           (zio_priority_table[9])
-#define        ZIO_PRIORITY_SCRUB              (zio_priority_table[10])
-#define        ZIO_PRIORITY_DDT_PREFETCH       (zio_priority_table[11])
-#define        ZIO_PRIORITY_TABLE_SIZE         12
+typedef enum zio_priority {
+       ZIO_PRIORITY_SYNC_READ,
+       ZIO_PRIORITY_SYNC_WRITE,        /* ZIL */
+       ZIO_PRIORITY_ASYNC_READ,        /* prefetch */
+       ZIO_PRIORITY_ASYNC_WRITE,       /* spa_sync() */
+       ZIO_PRIORITY_SCRUB,             /* asynchronous scrub/resilver reads */
+       ZIO_PRIORITY_NUM_QUEUEABLE,
+
+       ZIO_PRIORITY_NOW                /* non-queued i/os (e.g. free) */
+} zio_priority_t;
  
  #define        ZIO_PIPELINE_CONTINUE           0x100
  #define        ZIO_PIPELINE_STOP               0x101
@@ -198,7 +195,8 @@ enum zio_flag {
         ZIO_FLAG_GODFATHER      = 1 << 24,
         ZIO_FLAG_NOPWRITE       = 1 << 25,
         ZIO_FLAG_REEXECUTED     = 1 << 26,
-       ZIO_FLAG_FASTWRITE      = 1 << 27
+       ZIO_FLAG_DELEGATED      = 1 << 27,
+       ZIO_FLAG_FASTWRITE      = 1 << 28
  };
  
  #define        ZIO_FLAG_MUSTSUCCEED            0
@@ -238,8 +236,7 @@ enum zio_wait_type {
  
  typedef void zio_done_func_t(zio_t *zio);
  
-extern uint8_t zio_priority_table[ZIO_PRIORITY_TABLE_SIZE];
-extern char *zio_type_name[ZIO_TYPES];
+extern const char *zio_type_name[ZIO_TYPES];
  
  /*
   * A bookmark is a four-tuple <objset, object, level, blkid> that uniquely
@@ -381,7 +378,7 @@ struct zio {
         zio_type_t      io_type;
         enum zio_child  io_child_type;
         int             io_cmd;
-       uint8_t         io_priority;
+       zio_priority_t  io_priority;
         uint8_t         io_reexecute;
         uint8_t         io_state[ZIO_WAIT_TYPES];
         uint64_t        io_txg;
@@ -396,7 +393,8 @@ struct zio {
         zio_transform_t *io_transform_stack;
  
         /* Callback info */
-       zio_done_func_t *io_ready;
+       zio_done_func_t *io_ready;
+       zio_done_func_t *io_physdone;
         zio_done_func_t *io_done;
         void            *io_private;
         int64_t         io_prev_space_delta;    /* DMU private */
@@ -414,13 +412,10 @@ struct zio {
         const zio_vsd_ops_t *io_vsd_ops;
  
         uint64_t        io_offset;
-       uint64_t        io_deadline;    /* expires at timestamp + deadline */
         hrtime_t        io_timestamp;   /* submitted at */
         hrtime_t        io_delta;       /* vdev queue service delta */
         uint64_t        io_delay;       /* vdev disk service delta (ticks) */
-       avl_node_t      io_offset_node;
-       avl_node_t      io_deadline_node;
-       avl_tree_t      *io_vdev_tree;
+       avl_node_t      io_queue_node;
  
         /* Internal pipeline state */
         enum zio_flag   io_flags;
@@ -433,6 +428,7 @@ struct zio {
         int             io_child_error[ZIO_CHILD_TYPES];
         uint64_t        io_children[ZIO_CHILD_TYPES][ZIO_WAIT_TYPES];
         uint64_t        io_child_count;
+       uint64_t        io_phys_children;
         uint64_t        io_parent_count;
         uint64_t        *io_stall;
         zio_t           *io_gang_leader;
@@ -458,16 +454,17 @@ extern zio_t *zio_root(spa_t *spa,
  
  extern zio_t *zio_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, void *data,
      uint64_t size, zio_done_func_t *done, void *private,
-    int priority, enum zio_flag flags, const zbookmark_t *zb);
+    zio_priority_t priority, enum zio_flag flags, const zbookmark_t *zb);
  
  extern zio_t *zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
      void *data, uint64_t size, const zio_prop_t *zp,
-    zio_done_func_t *ready, zio_done_func_t *done, void *private,
-    int priority, enum zio_flag flags, const zbookmark_t *zb);
+    zio_done_func_t *ready, zio_done_func_t *physdone, zio_done_func_t *done,
+    void *private,
+    zio_priority_t priority, enum zio_flag flags, const zbookmark_t *zb);
  
  extern zio_t *zio_rewrite(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
      void *data, uint64_t size, zio_done_func_t *done, void *private,
-    int priority, enum zio_flag flags, zbookmark_t *zb);
+    zio_priority_t priority, enum zio_flag flags, zbookmark_t *zb);
  
  extern void zio_write_override(zio_t *zio, blkptr_t *bp, int copies,
      boolean_t nopwrite);
@@ -479,17 +476,17 @@ extern zio_t *zio_claim(zio_t *pio, spa_t *spa, uint64_t txg,
      zio_done_func_t *done, void *private, enum zio_flag flags);
  
  extern zio_t *zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd,
-    zio_done_func_t *done, void *private, int priority, enum zio_flag flags);
+    zio_done_func_t *done, void *private, enum zio_flag flags);
  
  extern zio_t *zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset,
      uint64_t size, void *data, int checksum,
-    zio_done_func_t *done, void *private, int priority, enum zio_flag flags,
-    boolean_t labels);
+    zio_done_func_t *done, void *private, zio_priority_t priority,
+    enum zio_flag flags, boolean_t labels);
  
  extern zio_t *zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset,
      uint64_t size, void *data, int checksum,
-    zio_done_func_t *done, void *private, int priority, enum zio_flag flags,
-    boolean_t labels);
+    zio_done_func_t *done, void *private, zio_priority_t priority,
+    enum zio_flag flags, boolean_t labels);
  
  extern zio_t *zio_free_sync(zio_t *pio, spa_t *spa, uint64_t txg,
      const blkptr_t *bp, enum zio_flag flags);
@@ -520,11 +517,12 @@ extern void zio_vdev_free(void *buf);
  extern void zio_resubmit_stage_async(void *);
  
  extern zio_t *zio_vdev_child_io(zio_t *zio, blkptr_t *bp, vdev_t *vd,
-    uint64_t offset, void *data, uint64_t size, int type, int priority,
-    enum zio_flag flags, zio_done_func_t *done, void *private);
+    uint64_t offset, void *data, uint64_t size, int type,
+    zio_priority_t priority, enum zio_flag flags,
+    zio_done_func_t *done, void *private);
  
  extern zio_t *zio_vdev_delegated_io(vdev_t *vd, uint64_t offset,
-    void *data, uint64_t size, int type, int priority,
+    void *data, uint64_t size, int type, zio_priority_t priority,
      enum zio_flag flags, zio_done_func_t *done, void *private);
  
  extern void zio_vdev_io_bypass(zio_t *zio);
diff --git a/man/man5/zfs-module-parameters.5 b/man/man5/zfs-module-parameters.5

index a54e31f8bc22a5f379200d9e5a042f1881db9acc..00d12b0871757cedd5d9c89ed9b2243c7d928c10 100644 (file)
--- a/man/man5/zfs-module-parameters.5
+++ b/man/man5/zfs-module-parameters.5
@@ -156,6 +156,22 @@ SPA config file
  Default value: \fB/etc/zfs/zpool.cache\fR.
  .RE
  
+.sp
+.ne 2
+.na
+\fBspa_asize_inflation\fR (int)
+.ad
+.RS 12n
+Multiplication factor used to estimate actual disk consumption from the
+size of data being written. The default value is a worst case estimate,
+but lower values may be valid for a given pool depending on its
+configuration.  Pool administrators who understand the factors involved
+may wish to specify a more realistic inflation factor, particularly if
+they operate close to quota or capacity limits.
+.sp
+Default value: 24
+.RE
+
  .sp
  .ne 2
  .na
@@ -335,12 +351,17 @@ Use \fB1\fR for yes (default) and \fB0\fR to disable.
  .sp
  .ne 2
  .na
-\fBzfs_deadman_synctime\fR (ulong)
+\fBzfs_deadman_synctime_ms\fR (ulong)
  .ad
  .RS 12n
-Expire in units of zfs_txg_synctime_ms
+Expiration time in milliseconds. This value has two meanings. First it is
+used to determine when the spa_deadman() logic should fire. By default the
+spa_deadman() will fire if spa_sync() has not completed in 1000 seconds.
+Secondly, the value determines if an I/O is considered "hung". Any I/O that
+has not completed in zfs_deadman_synctime_ms is considered "hung" resulting
+in a zevent being logged.
  .sp
-Default value: \fB1,000\fR.
+Default value: \fB1,000,000\fR.
  .RE
  
  .sp
@@ -354,6 +375,272 @@ Enable prefetching dedup-ed blks
  Use \fB1\fR for yes (default) and \fB0\fR to disable.
  .RE
  
+.sp
+.ne 2
+.na
+\fBzfs_delay_min_dirty_percent\fR (int)
+.ad
+.RS 12n
+Start to delay each transaction once there is this amount of dirty data,
+expressed as a percentage of \fBzfs_dirty_data_max\fR.
+This value should be >= zfs_vdev_async_write_active_max_dirty_percent.
+See the section "ZFS TRANSACTION DELAY".
+.sp
+Default value: \fB60\fR.
+.RE
+
+.sp
+.ne 2
+.na
+\fBzfs_delay_scale\fR (ulong)
+.ad
+.RS 12n
+This controls how quickly the transaction delay approaches infinity.
+Larger values cause longer delays for a given amount of dirty data.
+.sp
+For the smoothest delay, this value should be about 1 billion divided
+by the maximum number of operations per second.  This will smoothly
+handle between 10x and 1/10th this number.
+.sp
+See the section "ZFS TRANSACTION DELAY".
+.sp
+Note: \fBzfs_delay_scale\fR * \fBzfs_dirty_data_max\fR must be < 2^64.
+.sp
+Default value: \fB500,000\fR.
+.RE
+
+.sp
+.ne 2
+.na
+\fBzfs_dirty_data_max\fR (ulong)
+.ad
+.RS 12n
+Determines the dirty space limit in bytes.  Once this limit is exceeded, new
+writes are halted until space frees up. This parameter takes precedence
+over \fBzfs_dirty_data_max_percent\fR.
+See the section "ZFS TRANSACTION DELAY".
+.sp
+Default value: 10 percent of all memory, capped at \fBzfs_dirty_data_max_max\fR.
+.RE
+
+.sp
+.ne 2
+.na
+\fBzfs_dirty_data_max_max\fR (ulong)
+.ad
+.RS 12n
+Maximum allowable value of \fBzfs_dirty_data_max\fR, expressed in bytes.
+This limit is only enforced at module load time, and will be ignored if
+\fBzfs_dirty_data_max\fR is later changed.  This parameter takes
+precedence over \fBzfs_dirty_data_max_max_percent\fR. See the section
+"ZFS TRANSACTION DELAY".
+.sp
+Default value: 25% of physical RAM.
+.RE
+
+.sp
+.ne 2
+.na
+\fBzfs_dirty_data_max_max_percent\fR (int)
+.ad
+.RS 12n
+Maximum allowable value of \fBzfs_dirty_data_max\fR, expressed as a
+percentage of physical RAM.  This limit is only enforced at module load
+time, and will be ignored if \fBzfs_dirty_data_max\fR is later changed.
+The parameter \fBzfs_dirty_data_max_max\fR takes precedence over this
+one. See the section "ZFS TRANSACTION DELAY".
+.sp
+Default value: 25
+.RE
+
+.sp
+.ne 2
+.na
+\fBzfs_dirty_data_max_percent\fR (int)
+.ad
+.RS 12n
+Determines the dirty space limit, expressed as a percentage of all
+memory.  Once this limit is exceeded, new writes are halted until space frees
+up.  The parameter \fBzfs_dirty_data_max\fR takes precedence over this
+one.  See the section "ZFS TRANSACTION DELAY".
+.sp
+Default value: 10%, subject to \fBzfs_dirty_data_max_max\fR.
+.RE
+
+.sp
+.ne 2
+.na
+\fBzfs_dirty_data_sync\fR (ulong)
+.ad
+.RS 12n
+Start syncing out a transaction group if there is at least this much dirty data.
+.sp
+Default value: \fB67,108,864\fR.
+.RE
+
+.sp
+.ne 2
+.na
+\fBzfs_vdev_async_read_max_active\fR (int)
+.ad
+.RS 12n
+Maximum asynchronous read I/Os active to each device.
+See the section "ZFS I/O SCHEDULER".
+.sp
+Default value: \fB3\fR.
+.RE
+
+.sp
+.ne 2
+.na
+\fBzfs_vdev_async_read_min_active\fR (int)
+.ad
+.RS 12n
+Minimum asynchronous read I/Os active to each device.
+See the section "ZFS I/O SCHEDULER".
+.sp
+Default value: \fB1\fR.
+.RE
+
+.sp
+.ne 2
+.na
+\fBzfs_vdev_async_write_active_max_dirty_percent\fR (int)
+.ad
+.RS 12n
+When the pool has more than
+\fBzfs_vdev_async_write_active_max_dirty_percent\fR dirty data, use
+\fBzfs_vdev_async_write_max_active\fR to limit active async writes.  If
+the dirty data is between min and max, the active I/O limit is linearly
+interpolated. See the section "ZFS I/O SCHEDULER".
+.sp
+Default value: \fB60\fR.
+.RE
+
+.sp
+.ne 2
+.na
+\fBzfs_vdev_async_write_active_min_dirty_percent\fR (int)
+.ad
+.RS 12n
+When the pool has less than
+\fBzfs_vdev_async_write_active_min_dirty_percent\fR dirty data, use
+\fBzfs_vdev_async_write_min_active\fR to limit active async writes.  If
+the dirty data is between min and max, the active I/O limit is linearly
+interpolated. See the section "ZFS I/O SCHEDULER".
+.sp
+Default value: \fB30\fR.
+.RE
+
+.sp
+.ne 2
+.na
+\fBzfs_vdev_async_write_max_active\fR (int)
+.ad
+.RS 12n
+Maximum asynchronous write I/Os active to each device.
+See the section "ZFS I/O SCHEDULER".
+.sp
+Default value: \fB10\fR.
+.RE
+
+.sp
+.ne 2
+.na
+\fBzfs_vdev_async_write_min_active\fR (int)
+.ad
+.RS 12n
+Minimum asynchronous write I/Os active to each device.
+See the section "ZFS I/O SCHEDULER".
+.sp
+Default value: \fB1\fR.
+.RE
+
+.sp
+.ne 2
+.na
+\fBzfs_vdev_max_active\fR (int)
+.ad
+.RS 12n
+The maximum number of I/Os active to each device.  Ideally, this will be >=
+the sum of each queue's max_active.  It must be at least the sum of each
+queue's min_active.  See the section "ZFS I/O SCHEDULER".
+.sp
+Default value: \fB1,000\fR.
+.RE
+
+.sp
+.ne 2
+.na
+\fBzfs_vdev_scrub_max_active\fR (int)
+.ad
+.RS 12n
+Maximum scrub I/Os active to each device.
+See the section "ZFS I/O SCHEDULER".
+.sp
+Default value: \fB2\fR.
+.RE
+
+.sp
+.ne 2
+.na
+\fBzfs_vdev_scrub_min_active\fR (int)
+.ad
+.RS 12n
+Minimum scrub I/Os active to each device.
+See the section "ZFS I/O SCHEDULER".
+.sp
+Default value: \fB1\fR.
+.RE
+
+.sp
+.ne 2
+.na
+\fBzfs_vdev_sync_read_max_active\fR (int)
+.ad
+.RS 12n
+Maximum synchronous read I/Os active to each device.
+See the section "ZFS I/O SCHEDULER".
+.sp
+Default value: \fB10\fR.
+.RE
+
+.sp
+.ne 2
+.na
+\fBzfs_vdev_sync_read_min_active\fR (int)
+.ad
+.RS 12n
+Minimum synchronous read I/Os active to each device.
+See the section "ZFS I/O SCHEDULER".
+.sp
+Default value: \fB10\fR.
+.RE
+
+.sp
+.ne 2
+.na
+\fBzfs_vdev_sync_write_max_active\fR (int)
+.ad
+.RS 12n
+Maximum synchronous write I/Os active to each device.
+See the section "ZFS I/O SCHEDULER".
+.sp
+Default value: \fB10\fR.
+.RE
+
+.sp
+.ne 2
+.na
+\fBzfs_vdev_sync_write_min_active\fR (int)
+.ad
+.RS 12n
+Minimum synchronous write I/Os active to each device.
+See the section "ZFS I/O SCHEDULER".
+.sp
+Default value: \fB10\fR.
+.RE
+
  .sp
  .ne 2
  .na
@@ -442,17 +729,6 @@ Set for no scrub prefetching
  Use \fB1\fR for yes and \fB0\fR for no (default).
  .RE
  
-.sp
-.ne 2
-.na
-\fBzfs_no_write_throttle\fR (int)
-.ad
-.RS 12n
-Disable write throttling
-.sp
-Use \fB1\fR for yes and \fB0\fR for no (default).
-.RE
-
  .sp
  .ne 2
  .na
@@ -652,17 +928,6 @@ Historic statistics for the last N txgs
  Default value: \fB0\fR.
  .RE
  
-.sp
-.ne 2
-.na
-\fBzfs_txg_synctime_ms\fR (int)
-.ad
-.RS 12n
-Target milliseconds between txg sync
-.sp
-Default value: \fB1,000\fR.
-.RE
-
  .sp
  .ne 2
  .na
@@ -716,28 +981,6 @@ Total size of the per-disk cache
  Default value: \fB0\fR.
  .RE
  
-.sp
-.ne 2
-.na
-\fBzfs_vdev_max_pending\fR (int)
-.ad
-.RS 12n
-Max pending per-vdev I/Os
-.sp
-Default value: \fB10\fR.
-.RE
-
-.sp
-.ne 2
-.na
-\fBzfs_vdev_min_pending\fR (int)
-.ad
-.RS 12n
-Min pending per-vdev I/Os
-.sp
-Default value: \fB4\fR.
-.RE
-
  .sp
  .ne 2
  .na
@@ -749,17 +992,6 @@ Switch mirrors every N usecs
  Default value: \fB10,000\fR.
  .RE
  
-.sp
-.ne 2
-.na
-\fBzfs_vdev_ramp_rate\fR (int)
-.ad
-.RS 12n
-Exponential I/O issue ramp-up rate
-.sp
-Default value: \fB2\fR.
-.RE
-
  .sp
  .ne 2
  .na
@@ -782,17 +1014,6 @@ I/O scheduler
  Default value: \fBnoop\fR.
  .RE
  
-.sp
-.ne 2
-.na
-\fBzfs_vdev_time_shift\fR (int)
-.ad
-.RS 12n
-Deadline time shift for vdev I/O
-.sp
-Default value: \fB29\fR (each bucket is 0.537 seconds).
-.RE
-
  .sp
  .ne 2
  .na
@@ -804,61 +1025,6 @@ Aggregate write I/O over gap
  Default value: \fB4,096\fR.
  .RE
  
-.sp
-.ne 2
-.na
-\fBzfs_write_limit_inflated\fR (ulong)
-.ad
-.RS 12n
-Inflated txg write limit
-.sp
-Default value: \fB0\fR.
-.RE
-
-.sp
-.ne 2
-.na
-\fBzfs_write_limit_max\fR (ulong)
-.ad
-.RS 12n
-Max txg write limit
-.sp
-Default value: \fB0\fR.
-.RE
-
-.sp
-.ne 2
-.na
-\fBzfs_write_limit_min\fR (ulong)
-.ad
-.RS 12n
-Min txg write limit
-.sp
-Default value: \fB33,554,432\fR.
-.RE
-
-.sp
-.ne 2
-.na
-\fBzfs_write_limit_override\fR (ulong)
-.ad
-.RS 12n
-Override txg write limit
-.sp
-Default value: \fB0\fR.
-.RE
-
-.sp
-.ne 2
-.na
-\fBzfs_write_limit_shift\fR (int)
-.ad
-.RS 12n
-log2(fraction of memory) per txg
-.sp
-Default value: \fB3\fR.
-.RE
-
  .sp
  .ne 2
  .na
@@ -1002,3 +1168,186 @@ Number of threads for zvol device
  Default value: \fB32\fR.
  .RE
  
+.SH ZFS I/O SCHEDULER
+ZFS issues I/O operations to leaf vdevs to satisfy and complete I/Os.
+The I/O scheduler determines when and in what order those operations are
+issued.  The I/O scheduler divides operations into five I/O classes
+prioritized in the following order: sync read, sync write, async read,
+async write, and scrub/resilver.  Each queue defines the minimum and
+maximum number of concurrent operations that may be issued to the
+device.  In addition, the device has an aggregate maximum,
+\fBzfs_vdev_max_active\fR. Note that the sum of the per-queue minimums
+must not exceed the aggregate maximum.  If the sum of the per-queue
+maximums exceeds the aggregate maximum, then the number of active I/Os
+may reach \fBzfs_vdev_max_active\fR, in which case no further I/Os will
+be issued regardless of whether all per-queue minimums have been met.
+.sp
+For many physical devices, throughput increases with the number of
+concurrent operations, but latency typically suffers. Further, physical
+devices typically have a limit at which more concurrent operations have no
+effect on throughput or can actually cause it to decrease.
+.sp
+The scheduler selects the next operation to issue by first looking for an
+I/O class whose minimum has not been satisfied. Once all are satisfied and
+the aggregate maximum has not been hit, the scheduler looks for classes
+whose maximum has not been satisfied. Iteration through the I/O classes is
+done in the order specified above. No further operations are issued if the
+aggregate maximum number of concurrent operations has been hit or if there
+are no operations queued for an I/O class that has not hit its maximum.
+Every time an I/O is queued or an operation completes, the I/O scheduler
+looks for new operations to issue.
+.sp
+In general, smaller max_active's will lead to lower latency of synchronous
+operations.  Larger max_active's may lead to higher overall throughput,
+depending on underlying storage.
+.sp
+The ratio of the queues' max_actives determines the balance of performance
+between reads, writes, and scrubs.  E.g., increasing
+\fBzfs_vdev_scrub_max_active\fR will cause the scrub or resilver to complete
+more quickly, but reads and writes to have higher latency and lower throughput.
+.sp
+All I/O classes have a fixed maximum number of outstanding operations
+except for the async write class. Asynchronous writes represent the data
+that is committed to stable storage during the syncing stage for
+transaction groups. Transaction groups enter the syncing state
+periodically so the number of queued async writes will quickly burst up
+and then bleed down to zero. Rather than servicing them as quickly as
+possible, the I/O scheduler changes the maximum number of active async
+write I/Os according to the amount of dirty data in the pool.  Since
+both throughput and latency typically increase with the number of
+concurrent operations issued to physical devices, reducing the
+burstiness in the number of concurrent operations also stabilizes the
+response time of operations from other -- and in particular synchronous
+-- queues. In broad strokes, the I/O scheduler will issue more
+concurrent operations from the async write queue as there's more dirty
+data in the pool.
+.sp
+Async Writes
+.sp
+The number of concurrent operations issued for the async write I/O class
+follows a piece-wise linear function defined by a few adjustable points.
+.nf
+
+       |              o---------| <-- zfs_vdev_async_write_max_active
+  ^    |             /^         |
+  |    |            / |         |
+active |           /  |         |
+ I/O   |          /   |         |
+count  |         /    |         |
+       |        /     |         |
+       |-------o      |         | <-- zfs_vdev_async_write_min_active
+      0|_______^______|_________|
+       0%      |      |       100% of zfs_dirty_data_max
+               |      |
+               |      `-- zfs_vdev_async_write_active_max_dirty_percent
+               `--------- zfs_vdev_async_write_active_min_dirty_percent
+
+.fi
+Until the amount of dirty data exceeds a minimum percentage of the dirty
+data allowed in the pool, the I/O scheduler will limit the number of
+concurrent operations to the minimum. As that threshold is crossed, the
+number of concurrent operations issued increases linearly to the maximum at
+the specified maximum percentage of the dirty data allowed in the pool.
+.sp
+Ideally, the amount of dirty data on a busy pool will stay in the sloped
+part of the function between \fBzfs_vdev_async_write_active_min_dirty_percent\fR
+and \fBzfs_vdev_async_write_active_max_dirty_percent\fR. If it exceeds the
+maximum percentage, this indicates that the rate of incoming data is
+greater than the rate that the backend storage can handle. In this case, we
+must further throttle incoming writes, as described in the next section.
+
+.SH ZFS TRANSACTION DELAY
+We delay transactions when we've determined that the backend storage
+isn't able to accommodate the rate of incoming writes.
+.sp
+If there is already a transaction waiting, we delay relative to when
+that transaction will finish waiting.  This way the calculated delay time
+is independent of the number of threads concurrently executing
+transactions.
+.sp
+If we are the only waiter, wait relative to when the transaction
+started, rather than the current time.  This credits the transaction for
+"time already served", e.g. reading indirect blocks.
+.sp
+The minimum time for a transaction to take is calculated as:
+.nf
+    min_time = zfs_delay_scale * (dirty - min) / (max - dirty)
+    min_time is then capped at 100 milliseconds.
+.fi
+.sp
+The delay has two degrees of freedom that can be adjusted via tunables.  The
+percentage of dirty data at which we start to delay is defined by
+\fBzfs_delay_min_dirty_percent\fR. This should typically be at or above
+\fBzfs_vdev_async_write_active_max_dirty_percent\fR so that we only start to
+delay after writing at full speed has failed to keep up with the incoming write
+rate. The scale of the curve is defined by \fBzfs_delay_scale\fR. Roughly speaking,
+this variable determines the amount of delay at the midpoint of the curve.
+.sp
+.nf
+delay
+ 10ms +-------------------------------------------------------------*+
+      |                                                             *|
+  9ms +                                                             *+
+      |                                                             *|
+  8ms +                                                             *+
+      |                                                            * |
+  7ms +                                                            * +
+      |                                                            * |
+  6ms +                                                            * +
+      |                                                            * |
+  5ms +                                                           *  +
+      |                                                           *  |
+  4ms +                                                           *  +
+      |                                                           *  |
+  3ms +                                                          *   +
+      |                                                          *   |
+  2ms +                                              (midpoint) *    +
+      |                                                  |    **     |
+  1ms +                                                  v ***       +
+      |             zfs_delay_scale ---------->     ********         |
+    0 +-------------------------------------*********----------------+
+      0%                    <- zfs_dirty_data_max ->               100%
+.fi
+.sp
+Note that since the delay is added to the outstanding time remaining on the
+most recent transaction, the delay is effectively the inverse of IOPS.
+Here the midpoint of 500us translates to 2000 IOPS. The shape of the curve
+was chosen such that small changes in the amount of accumulated dirty data
+in the first 3/4 of the curve yield relatively small differences in the
+amount of delay.
+.sp
+The effects can be easier to understand when the amount of delay is
+represented on a log scale:
+.sp
+.nf
+delay
+100ms +-------------------------------------------------------------++
+      +                                                              +
+      |                                                              |
+      +                                                             *+
+ 10ms +                                                             *+
+      +                                                           ** +
+      |                                              (midpoint)  **  |
+      +                                                  |     **    +
+  1ms +                                                  v ****      +
+      +             zfs_delay_scale ---------->        *****         +
+      |                                             ****             |
+      +                                          ****                +
+100us +                                        **                    +
+      +                                       *                      +
+      |                                      *                       |
+      +                                     *                        +
+ 10us +                                     *                        +
+      +                                                              +
+      |                                                              |
+      +                                                              +
+      +--------------------------------------------------------------+
+      0%                    <- zfs_dirty_data_max ->               100%
+.fi
+.sp
+Note here that only as the amount of dirty data approaches its limit does
+the delay start to increase rapidly. The goal of a properly tuned system
+should be to keep the amount of dirty data out of that range by first
+ensuring that the appropriate limits are set for the I/O scheduler to reach
+optimal throughput on the backend storage, and then by changing the value
+of \fBzfs_delay_scale\fR to increase the steepness of the curve.
diff --git a/module/zfs/arc.c b/module/zfs/arc.c

index 6aa7d37c7176335e64157894acbe2c08d0575625..9cdb52011a6d688f1198a195455cfda627e9ff78 100644 (file)
--- a/module/zfs/arc.c
+++ b/module/zfs/arc.c
@@ -134,6 +134,7 @@
  #include <sys/arc.h>
  #include <sys/vdev.h>
  #include <sys/vdev_impl.h>
+#include <sys/dsl_pool.h>
  #ifdef _KERNEL
  #include <sys/vmsystm.h>
  #include <vm/anon.h>
@@ -162,6 +163,12 @@ typedef enum arc_reclaim_strategy {
         ARC_RECLAIM_CONS                /* Conservative reclaim strategy */
  } arc_reclaim_strategy_t;
  
+/*
+ * The number of iterations through arc_evict_*() before we
+ * drop & reacquire the lock.
+ */
+int arc_evict_iterations = 100;
+
  /* number of seconds before growing cache again */
  int zfs_arc_grow_retry = 5;
  
@@ -183,6 +190,11 @@ int zfs_arc_memory_throttle_disable = 1;
  /* disable duplicate buffer eviction */
  int zfs_disable_dup_eviction = 0;
  
+/*
+ * If this percent of memory is free, don't throttle.
+ */
+int arc_lotsfree_percent = 10;
+
  static int arc_dead;
  
  /* expiration time for arc_no_grow */
@@ -519,6 +531,7 @@ typedef struct arc_write_callback arc_write_callback_t;
  struct arc_write_callback {
         void            *awcb_private;
         arc_done_func_t *awcb_ready;
+       arc_done_func_t *awcb_physdone;
         arc_done_func_t *awcb_done;
         arc_buf_t       *awcb_buf;
  };
@@ -1253,7 +1266,7 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *ab, kmutex_t *hash_lock)
         uint64_t from_delta, to_delta;
  
         ASSERT(MUTEX_HELD(hash_lock));
-       ASSERT(new_state != old_state);
+       ASSERT3P(new_state, !=, old_state);
         ASSERT(refcnt == 0 || ab->b_datacnt > 0);
         ASSERT(ab->b_datacnt == 0 || !GHOST_STATE(new_state));
         ASSERT(ab->b_datacnt <= 1 || old_state != arc_anon);
@@ -1859,6 +1872,8 @@ arc_evict(arc_state_t *state, uint64_t spa, int64_t bytes, boolean_t recycle,
         kmutex_t *hash_lock;
         boolean_t have_lock;
         void *stolen = NULL;
+       arc_buf_hdr_t marker = {{{ 0 }}};
+       int count = 0;
  
         ASSERT(state == arc_mru || state == arc_mfu);
  
@@ -1882,6 +1897,33 @@ arc_evict(arc_state_t *state, uint64_t spa, int64_t bytes, boolean_t recycle,
                 if (recycle && ab->b_size != bytes &&
                     ab_prev && ab_prev->b_size == bytes)
                         continue;
+
+               /* ignore markers */
+               if (ab->b_spa == 0)
+                       continue;
+
+               /*
+                * It may take a long time to evict all the bufs requested.
+                * To avoid blocking all arc activity, periodically drop
+                * the arcs_mtx and give other threads a chance to run
+                * before reacquiring the lock.
+                *
+                * If we are looking for a buffer to recycle, we are in
+                * the hot code path, so don't sleep.
+                */
+               if (!recycle && count++ > arc_evict_iterations) {
+                       list_insert_after(list, ab, &marker);
+                       mutex_exit(&evicted_state->arcs_mtx);
+                       mutex_exit(&state->arcs_mtx);
+                       kpreempt(KPREEMPT_SYNC);
+                       mutex_enter(&state->arcs_mtx);
+                       mutex_enter(&evicted_state->arcs_mtx);
+                       ab_prev = list_prev(list, &marker);
+                       list_remove(list, &marker);
+                       count = 0;
+                       continue;
+               }
+
                 hash_lock = HDR_LOCK(ab);
                 have_lock = MUTEX_HELD(hash_lock);
                 if (have_lock || mutex_tryenter(hash_lock)) {
@@ -1963,27 +2005,11 @@ arc_evict(arc_state_t *state, uint64_t spa, int64_t bytes, boolean_t recycle,
                 ARCSTAT_INCR(arcstat_mutex_miss, missed);
  
         /*
-        * We have just evicted some data into the ghost state, make
-        * sure we also adjust the ghost state size if necessary.
+        * Note: we have just evicted some data into the ghost state,
+        * potentially putting the ghost size over the desired size.  Rather
+        * than evicting from the ghost list in this hot code path, leave
+        * this chore to the arc_reclaim_thread().
          */
-       if (arc_no_grow &&
-           arc_mru_ghost->arcs_size + arc_mfu_ghost->arcs_size > arc_c) {
-               int64_t mru_over = arc_anon->arcs_size + arc_mru->arcs_size +
-                   arc_mru_ghost->arcs_size - arc_c;
-
-               if (mru_over > 0 && arc_mru_ghost->arcs_lsize[type] > 0) {
-                       int64_t todelete =
-                           MIN(arc_mru_ghost->arcs_lsize[type], mru_over);
-                       arc_evict_ghost(arc_mru_ghost, 0, todelete,
-                           ARC_BUFC_DATA);
-               } else if (arc_mfu_ghost->arcs_lsize[type] > 0) {
-                       int64_t todelete = MIN(arc_mfu_ghost->arcs_lsize[type],
-                           arc_mru_ghost->arcs_size +
-                           arc_mfu_ghost->arcs_size - arc_c);
-                       arc_evict_ghost(arc_mfu_ghost, 0, todelete,
-                           ARC_BUFC_DATA);
-               }
-       }
  
         return (stolen);
  }
@@ -2002,6 +2028,7 @@ arc_evict_ghost(arc_state_t *state, uint64_t spa, int64_t bytes,
         kmutex_t *hash_lock;
         uint64_t bytes_deleted = 0;
         uint64_t bufs_skipped = 0;
+       int count = 0;
  
         ASSERT(GHOST_STATE(state));
         bzero(&marker, sizeof(marker));
@@ -2009,6 +2036,8 @@ top:
         mutex_enter(&state->arcs_mtx);
         for (ab = list_tail(list); ab; ab = ab_prev) {
                 ab_prev = list_prev(list, ab);
+               if (ab->b_type > ARC_BUFC_NUMTYPES)
+                       panic("invalid ab=%p", (void *)ab);
                 if (spa && ab->b_spa != spa)
                         continue;
  
@@ -2020,6 +2049,23 @@ top:
                 /* caller may be trying to modify this buffer, skip it */
                 if (MUTEX_HELD(hash_lock))
                         continue;
+
+               /*
+                * It may take a long time to evict all the bufs requested.
+                * To avoid blocking all arc activity, periodically drop
+                * the arcs_mtx and give other threads a chance to run
+                * before reacquiring the lock.
+                */
+               if (count++ > arc_evict_iterations) {
+                       list_insert_after(list, ab, &marker);
+                       mutex_exit(&state->arcs_mtx);
+                       kpreempt(KPREEMPT_SYNC);
+                       mutex_enter(&state->arcs_mtx);
+                       ab_prev = list_prev(list, &marker);
+                       list_remove(list, &marker);
+                       count = 0;
+                       continue;
+               }
                 if (mutex_tryenter(hash_lock)) {
                         ASSERT(!HDR_IO_IN_PROGRESS(ab));
                         ASSERT(ab->b_buf == NULL);
@@ -2055,8 +2101,9 @@ top:
                         mutex_enter(&state->arcs_mtx);
                         ab_prev = list_prev(list, &marker);
                         list_remove(list, &marker);
-               } else
+               } else {
                         bufs_skipped += 1;
+               }
         }
         mutex_exit(&state->arcs_mtx);
  
@@ -3050,7 +3097,7 @@ arc_read_done(zio_t *zio)
   */
  int
  arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done,
-    void *private, int priority, int zio_flags, uint32_t *arc_flags,
+    void *private, zio_priority_t priority, int zio_flags, uint32_t *arc_flags,
      const zbookmark_t *zb)
  {
         arc_buf_hdr_t *hdr;
@@ -3702,6 +3749,18 @@ arc_write_ready(zio_t *zio)
         hdr->b_flags |= ARC_IO_IN_PROGRESS;
  }
  
+/*
+ * The SPA calls this callback for each physical write that happens on behalf
+ * of a logical write.  See the comment in dbuf_write_physdone() for details.
+ */
+static void
+arc_write_physdone(zio_t *zio)
+{
+       arc_write_callback_t *cb = zio->io_private;
+       if (cb->awcb_physdone != NULL)
+               cb->awcb_physdone(zio, cb->awcb_buf, cb->awcb_private);
+}
+
  static void
  arc_write_done(zio_t *zio)
  {
@@ -3782,8 +3841,9 @@ arc_write_done(zio_t *zio)
  zio_t *
  arc_write(zio_t *pio, spa_t *spa, uint64_t txg,
      blkptr_t *bp, arc_buf_t *buf, boolean_t l2arc, boolean_t l2arc_compress,
-    const zio_prop_t *zp, arc_done_func_t *ready, arc_done_func_t *done,
-    void *private, int priority, int zio_flags, const zbookmark_t *zb)
+    const zio_prop_t *zp, arc_done_func_t *ready, arc_done_func_t *physdone,
+    arc_done_func_t *done, void *private, zio_priority_t priority,
+    int zio_flags, const zbookmark_t *zb)
  {
         arc_buf_hdr_t *hdr = buf->b_hdr;
         arc_write_callback_t *callback;
@@ -3800,39 +3860,30 @@ arc_write(zio_t *pio, spa_t *spa, uint64_t txg,
                 hdr->b_flags |= ARC_L2COMPRESS;
         callback = kmem_zalloc(sizeof (arc_write_callback_t), KM_PUSHPAGE);
         callback->awcb_ready = ready;
+       callback->awcb_physdone = physdone;
         callback->awcb_done = done;
         callback->awcb_private = private;
         callback->awcb_buf = buf;
  
         zio = zio_write(pio, spa, txg, bp, buf->b_data, hdr->b_size, zp,
-           arc_write_ready, arc_write_done, callback, priority, zio_flags, zb);
+           arc_write_ready, arc_write_physdone, arc_write_done, callback,
+           priority, zio_flags, zb);
  
         return (zio);
  }
  
  static int
-arc_memory_throttle(uint64_t reserve, uint64_t inflight_data, uint64_t txg)
+arc_memory_throttle(uint64_t reserve, uint64_t txg)
  {
  #ifdef _KERNEL
-       uint64_t available_memory;
-
         if (zfs_arc_memory_throttle_disable)
                 return (0);
  
-       /* Easily reclaimable memory (free + inactive + arc-evictable) */
-       available_memory = ptob(spl_kmem_availrmem()) + arc_evictable_memory();
-
-       if (available_memory <= zfs_write_limit_max) {
+       if (freemem <= physmem * arc_lotsfree_percent / 100) {
                 ARCSTAT_INCR(arcstat_memory_throttle_count, 1);
                 DMU_TX_STAT_BUMP(dmu_tx_memory_reclaim);
                 return (SET_ERROR(EAGAIN));
         }
-
-       if (inflight_data > available_memory / 4) {
-               ARCSTAT_INCR(arcstat_memory_throttle_count, 1);
-               DMU_TX_STAT_BUMP(dmu_tx_memory_inflight);
-               return (ERESTART);
-       }
  #endif
         return (0);
  }
@@ -3850,15 +3901,6 @@ arc_tempreserve_space(uint64_t reserve, uint64_t txg)
         int error;
         uint64_t anon_size;
  
-#ifdef ZFS_DEBUG
-       /*
-        * Once in a while, fail for no reason.  Everything should cope.
-        */
-       if (spa_get_random(10000) == 0) {
-               dprintf("forcing random failure\n");
-               return (ERESTART);
-       }
-#endif
         if (reserve > arc_c/4 && !arc_no_grow)
                 arc_c = MIN(arc_c_max, reserve * 4);
         if (reserve > arc_c) {
@@ -3878,7 +3920,8 @@ arc_tempreserve_space(uint64_t reserve, uint64_t txg)
          * in order to compress/encrypt/etc the data.  We therefore need to
          * make sure that there is sufficient available memory for this.
          */
-       if ((error = arc_memory_throttle(reserve, anon_size, txg)))
+       error = arc_memory_throttle(reserve, txg);
+       if (error != 0)
                 return (error);
  
         /*
@@ -4075,11 +4118,24 @@ arc_init(void)
         arc_dead = FALSE;
         arc_warm = B_FALSE;
  
-       if (zfs_write_limit_max == 0)
-               zfs_write_limit_max = ptob(physmem) >> zfs_write_limit_shift;
-       else
-               zfs_write_limit_shift = 0;
-       mutex_init(&zfs_write_limit_lock, NULL, MUTEX_DEFAULT, NULL);
+       /*
+        * Calculate maximum amount of dirty data per pool.
+        *
+        * If it has been set by a module parameter, take that.
+        * Otherwise, use a percentage of physical memory defined by
+        * zfs_dirty_data_max_percent (default 10%) with a cap at
+        * zfs_dirty_data_max_max (default 25% of physical memory).
+        */
+       if (zfs_dirty_data_max_max == 0)
+               zfs_dirty_data_max_max = physmem * PAGESIZE *
+                   zfs_dirty_data_max_max_percent / 100;
+
+       if (zfs_dirty_data_max == 0) {
+               zfs_dirty_data_max = physmem * PAGESIZE *
+                   zfs_dirty_data_max_percent / 100;
+               zfs_dirty_data_max = MIN(zfs_dirty_data_max,
+                   zfs_dirty_data_max_max);
+       }
  }
  
  void
@@ -4137,8 +4193,6 @@ arc_fini(void)
         mutex_destroy(&arc_mfu_ghost->arcs_mtx);
         mutex_destroy(&arc_l2c_only->arcs_mtx);
  
-       mutex_destroy(&zfs_write_limit_lock);
-
         buf_fini();
  
         ASSERT(arc_loaned_bytes == 0);
diff --git a/module/zfs/dbuf.c b/module/zfs/dbuf.c

index 9ad3d1d30ab1350e54c04c54716dcb84202e3007..01352a91cb4f0b9f606404519a156e9c3f265b04 100644 (file)
--- a/module/zfs/dbuf.c
+++ b/module/zfs/dbuf.c
@@ -891,7 +891,7 @@ dbuf_free_range(dnode_t *dn, uint64_t start, uint64_t end, dmu_tx_t *tx)
                 atomic_inc_64(&zfs_free_range_recv_miss);
         }
  
-       for (db = list_head(&dn->dn_dbufs); db; db = db_next) {
+       for (db = list_head(&dn->dn_dbufs); db != NULL; db = db_next) {
                 db_next = list_next(&dn->dn_dbufs, db);
                 ASSERT(db->db_blkid != DMU_BONUS_BLKID);
  
@@ -1238,6 +1238,8 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
                     sizeof (dbuf_dirty_record_t),
                     offsetof(dbuf_dirty_record_t, dr_dirty_node));
         }
+       if (db->db_blkid != DMU_BONUS_BLKID && os->os_dsl_dataset != NULL)
+               dr->dr_accounted = db->db.db_size;
         dr->dr_dbuf = db;
         dr->dr_txg = tx->tx_txg;
         dr->dr_next = *drp;
@@ -1321,7 +1323,10 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
                         dbuf_rele(parent, FTAG);
  
                 mutex_enter(&db->db_mtx);
-               /*  possible race with dbuf_undirty() */
+               /*
+                * Since we've dropped the mutex, it's possible that
+                * dbuf_undirty() might have changed this out from under us.
+                */
                 if (db->db_last_dirty == dr ||
                     dn->dn_object == DMU_META_DNODE_OBJECT) {
                         mutex_enter(&di->dt.di.dr_mtx);
@@ -1391,7 +1396,11 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
  
         ASSERT(db->db.db_size != 0);
  
-       /* XXX would be nice to fix up dn_towrite_space[] */
+       /*
+        * Any space we accounted for in dp_dirty_* will be cleaned up by
+        * dsl_pool_sync().  This is relatively rare so the discrepancy
+        * is not a big deal.
+        */
  
         *drp = dr->dr_next;
  
@@ -1571,7 +1580,7 @@ dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx)
  
  /*
   * "Clear" the contents of this dbuf.  This will mark the dbuf
- * EVICTING and clear *most* of its references.  Unfortunetely,
+ * EVICTING and clear *most* of its references.  Unfortunately,
   * when we are not holding the dn_dbufs_mtx, we can't clear the
   * entry in the dn_dbufs list.  We have to wait until dbuf_destroy()
   * in this case.  For callers from the DMU we will usually see:
@@ -1768,7 +1777,7 @@ dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid,
                 db->db.db_offset = 0;
         } else {
                 int blocksize =
-                   db->db_level ? 1<<dn->dn_indblkshift :  dn->dn_datablksz;
+                   db->db_level ? 1 << dn->dn_indblkshift : dn->dn_datablksz;
                 db->db.db_size = blocksize;
                 db->db.db_offset = db->db_blkid * blocksize;
         }
@@ -1877,7 +1886,7 @@ dbuf_destroy(dmu_buf_impl_t *db)
  }
  
  void
-dbuf_prefetch(dnode_t *dn, uint64_t blkid)
+dbuf_prefetch(dnode_t *dn, uint64_t blkid, zio_priority_t prio)
  {
         dmu_buf_impl_t *db = NULL;
         blkptr_t *bp = NULL;
@@ -1901,8 +1910,6 @@ dbuf_prefetch(dnode_t *dn, uint64_t blkid)
  
         if (dbuf_findbp(dn, 0, blkid, TRUE, &db, &bp, NULL) == 0) {
                 if (bp && !BP_IS_HOLE(bp)) {
-                       int priority = dn->dn_type == DMU_OT_DDT_ZAP ?
-                           ZIO_PRIORITY_DDT_PREFETCH : ZIO_PRIORITY_ASYNC_READ;
                         dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset;
                         uint32_t aflags = ARC_NOWAIT | ARC_PREFETCH;
                         zbookmark_t zb;
@@ -1911,7 +1918,7 @@ dbuf_prefetch(dnode_t *dn, uint64_t blkid)
                             dn->dn_object, 0, blkid);
  
                         (void) arc_read(NULL, dn->dn_objset->os_spa,
-                           bp, NULL, NULL, priority,
+                           bp, NULL, NULL, prio,
                             ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
                             &aflags, &zb);
                 }
@@ -2647,6 +2654,38 @@ dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb)
         mutex_exit(&db->db_mtx);
  }
  
+/*
+ * The SPA will call this callback several times for each zio - once
+ * for every physical child i/o (zio->io_phys_children times).  This
+ * allows the DMU to monitor the progress of each logical i/o.  For example,
+ * there may be 2 copies of an indirect block, or many fragments of a RAID-Z
+ * block.  There may be a long delay before all copies/fragments are completed,
+ * so this callback allows us to retire dirty space gradually, as the physical
+ * i/os complete.
+ */
+/* ARGSUSED */
+static void
+dbuf_write_physdone(zio_t *zio, arc_buf_t *buf, void *arg)
+{
+       dmu_buf_impl_t *db = arg;
+       objset_t *os = db->db_objset;
+       dsl_pool_t *dp = dmu_objset_pool(os);
+       dbuf_dirty_record_t *dr;
+       int delta = 0;
+
+       dr = db->db_data_pending;
+       ASSERT3U(dr->dr_txg, ==, zio->io_txg);
+
+       /*
+        * The callback will be called io_phys_children times.  Retire one
+        * portion of our dirty space each time we are called.  Any rounding
+        * error will be cleaned up by dsl_pool_sync()'s call to
+        * dsl_pool_undirty_space().
+        */
+       delta = dr->dr_accounted / zio->io_phys_children;
+       dsl_pool_undirty_space(dp, delta, zio->io_txg);
+}
+
  /* ARGSUSED */
  static void
  dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb)
@@ -2741,6 +2780,7 @@ dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb)
         ASSERT(db->db_dirtycnt > 0);
         db->db_dirtycnt -= 1;
         db->db_data_pending = NULL;
+
         dbuf_rele_and_unlock(db, (void *)(uintptr_t)txg);
  }
  
@@ -2859,8 +2899,8 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx)
                 ASSERT(db->db_state != DB_NOFILL);
                 dr->dr_zio = zio_write(zio, os->os_spa, txg,
                     db->db_blkptr, data->b_data, arc_buf_size(data), &zp,
-                   dbuf_write_override_ready, dbuf_write_override_done, dr,
-                   ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
+                   dbuf_write_override_ready, NULL, dbuf_write_override_done,
+                   dr, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
                 mutex_enter(&db->db_mtx);
                 dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
                 zio_write_override(dr->dr_zio, &dr->dt.dl.dr_overridden_by,
@@ -2870,7 +2910,7 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx)
                 ASSERT(zp.zp_checksum == ZIO_CHECKSUM_OFF);
                 dr->dr_zio = zio_write(zio, os->os_spa, txg,
                     db->db_blkptr, NULL, db->db.db_size, &zp,
-                   dbuf_write_nofill_ready, dbuf_write_nofill_done, db,
+                   dbuf_write_nofill_ready, NULL, dbuf_write_nofill_done, db,
                     ZIO_PRIORITY_ASYNC_WRITE,
                     ZIO_FLAG_MUSTSUCCEED | ZIO_FLAG_NODATA, &zb);
         } else {
@@ -2878,8 +2918,8 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx)
                 dr->dr_zio = arc_write(zio, os->os_spa, txg,
                     db->db_blkptr, data, DBUF_IS_L2CACHEABLE(db),
                     DBUF_IS_L2COMPRESSIBLE(db), &zp, dbuf_write_ready,
-                   dbuf_write_done, db, ZIO_PRIORITY_ASYNC_WRITE,
-                   ZIO_FLAG_MUSTSUCCEED, &zb);
+                   dbuf_write_physdone, dbuf_write_done, db,
+                   ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
         }
  }
  
diff --git a/module/zfs/dmu.c b/module/zfs/dmu.c

index 72fce35c5634f05c2f240a3e999d6fd2a28a77b9..ade13b9f08d5f30b6c3d4e792fd21910ecf91e2d 100644 (file)
--- a/module/zfs/dmu.c
+++ b/module/zfs/dmu.c
@@ -370,13 +370,11 @@ static int
  dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length,
      int read, void *tag, int *numbufsp, dmu_buf_t ***dbpp, uint32_t flags)
  {
-       dsl_pool_t *dp = NULL;
         dmu_buf_t **dbp;
         uint64_t blkid, nblks, i;
         uint32_t dbuf_flags;
         int err;
         zio_t *zio;
-       hrtime_t start = 0;
  
         ASSERT(length <= DMU_MAX_ACCESS);
  
@@ -404,9 +402,6 @@ dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length,
         }
         dbp = kmem_zalloc(sizeof (dmu_buf_t *) * nblks, KM_PUSHPAGE | KM_NODEBUG);
  
-       if (dn->dn_objset->os_dsl_dataset)
-               dp = dn->dn_objset->os_dsl_dataset->ds_dir->dd_pool;
-       start = gethrtime();
         zio = zio_root(dn->dn_objset->os_spa, NULL, NULL, ZIO_FLAG_CANFAIL);
         blkid = dbuf_whichblock(dn, offset);
         for (i = 0; i < nblks; i++) {
@@ -427,9 +422,6 @@ dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length,
  
         /* wait for async i/o */
         err = zio_wait(zio);
-       /* track read overhead when we are in sync context */
-       if (dp && dsl_pool_sync_context(dp))
-               dp->dp_read_overhead += gethrtime() - start;
         if (err) {
                 dmu_buf_rele_array(dbp, nblks, tag);
                 return (err);
@@ -511,12 +503,22 @@ dmu_buf_rele_array(dmu_buf_t **dbp_fake, int numbufs, void *tag)
         kmem_free(dbp, sizeof (dmu_buf_t *) * numbufs);
  }
  
+/*
+ * Issue prefetch i/os for the given blocks.
+ *
+ * Note: The assumption is that we *know* these blocks will be needed
+ * almost immediately.  Therefore, the prefetch i/os will be issued at
+ * ZIO_PRIORITY_SYNC_READ.
+ *
+ * Note: indirect blocks and other metadata will be read synchronously,
+ * causing this function to block if they are not already cached.
+ */
  void
  dmu_prefetch(objset_t *os, uint64_t object, uint64_t offset, uint64_t len)
  {
         dnode_t *dn;
         uint64_t blkid;
-       int nblks, i, err;
+       int nblks, err;
  
         if (zfs_prefetch_disable)
                 return;
@@ -529,7 +531,7 @@ dmu_prefetch(objset_t *os, uint64_t object, uint64_t offset, uint64_t len)
  
                 rw_enter(&dn->dn_struct_rwlock, RW_READER);
                 blkid = dbuf_whichblock(dn, object * sizeof (dnode_phys_t));
-               dbuf_prefetch(dn, blkid);
+               dbuf_prefetch(dn, blkid, ZIO_PRIORITY_SYNC_READ);
                 rw_exit(&dn->dn_struct_rwlock);
                 return;
         }
@@ -546,16 +548,18 @@ dmu_prefetch(objset_t *os, uint64_t object, uint64_t offset, uint64_t len)
         rw_enter(&dn->dn_struct_rwlock, RW_READER);
         if (dn->dn_datablkshift) {
                 int blkshift = dn->dn_datablkshift;
-               nblks = (P2ROUNDUP(offset+len, 1<<blkshift) -
-                   P2ALIGN(offset, 1<<blkshift)) >> blkshift;
+               nblks = (P2ROUNDUP(offset + len, 1 << blkshift) -
+                   P2ALIGN(offset, 1 << blkshift)) >> blkshift;
         } else {
                 nblks = (offset < dn->dn_datablksz);
         }
  
         if (nblks != 0) {
+               int i;
+
                 blkid = dbuf_whichblock(dn, offset);
                 for (i = 0; i < nblks; i++)
-                       dbuf_prefetch(dn, blkid+i);
+                       dbuf_prefetch(dn, blkid + i, ZIO_PRIORITY_SYNC_READ);
         }
  
         rw_exit(&dn->dn_struct_rwlock);
@@ -1559,7 +1563,7 @@ dmu_sync_late_arrival(zio_t *pio, objset_t *os, dmu_sync_cb_t *done, zgd_t *zgd,
  
         zio_nowait(zio_write(pio, os->os_spa, dmu_tx_get_txg(tx), zgd->zgd_bp,
             zgd->zgd_db->db_data, zgd->zgd_db->db_size, zp,
-           dmu_sync_late_arrival_ready, dmu_sync_late_arrival_done, dsa,
+           dmu_sync_late_arrival_ready, NULL, dmu_sync_late_arrival_done, dsa,
             ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL | ZIO_FLAG_FASTWRITE, zb));
  
         return (0);
@@ -1699,8 +1703,9 @@ dmu_sync(zio_t *pio, uint64_t txg, dmu_sync_cb_t *done, zgd_t *zgd)
  
         zio_nowait(arc_write(pio, os->os_spa, txg,
             bp, dr->dt.dl.dr_data, DBUF_IS_L2CACHEABLE(db),
-           DBUF_IS_L2COMPRESSIBLE(db), &zp, dmu_sync_ready, dmu_sync_done,
-           dsa, ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL | ZIO_FLAG_FASTWRITE, &zb));
+           DBUF_IS_L2COMPRESSIBLE(db), &zp, dmu_sync_ready,
+           NULL, dmu_sync_done, dsa, ZIO_PRIORITY_SYNC_WRITE,
+           ZIO_FLAG_CANFAIL, &zb));
  
         return (0);
  }
diff --git a/module/zfs/dmu_objset.c b/module/zfs/dmu_objset.c

index 6a3b5f05e6bf150579e93b1a7a7522d288b15846..07e00c307f4a989e6578ef6f8bde09f3c1602cd6 100644 (file)
--- a/module/zfs/dmu_objset.c
+++ b/module/zfs/dmu_objset.c
@@ -1032,7 +1032,7 @@ dmu_objset_sync(objset_t *os, zio_t *pio, dmu_tx_t *tx)
         zio = arc_write(pio, os->os_spa, tx->tx_txg,
             os->os_rootbp, os->os_phys_buf, DMU_OS_IS_L2CACHEABLE(os),
             DMU_OS_IS_L2COMPRESSIBLE(os), &zp, dmu_objset_write_ready,
-           dmu_objset_write_done, os, ZIO_PRIORITY_ASYNC_WRITE,
+           NULL, dmu_objset_write_done, os, ZIO_PRIORITY_ASYNC_WRITE,
             ZIO_FLAG_MUSTSUCCEED, &zb);
  
         /*
diff --git a/module/zfs/dmu_tx.c b/module/zfs/dmu_tx.c

index ece6b14b38004be11d79f1994e97a7ca8d9e14f7..47cb86b0894be92d247ac3b1db3efac7ddad28b0 100644 (file)
--- a/module/zfs/dmu_tx.c
+++ b/module/zfs/dmu_tx.c
@@ -53,7 +53,8 @@ dmu_tx_stats_t dmu_tx_stats = {
         { "dmu_tx_memory_reclaim",      KSTAT_DATA_UINT64 },
         { "dmu_tx_memory_inflight",     KSTAT_DATA_UINT64 },
         { "dmu_tx_dirty_throttle",      KSTAT_DATA_UINT64 },
-       { "dmu_tx_write_limit",         KSTAT_DATA_UINT64 },
+       { "dmu_tx_dirty_delay",         KSTAT_DATA_UINT64 },
+       { "dmu_tx_dirty_over_max",      KSTAT_DATA_UINT64 },
         { "dmu_tx_quota",               KSTAT_DATA_UINT64 },
  };
  
@@ -70,6 +71,7 @@ dmu_tx_create_dd(dsl_dir_t *dd)
             offsetof(dmu_tx_hold_t, txh_node));
         list_create(&tx->tx_callbacks, sizeof (dmu_tx_callback_t),
             offsetof(dmu_tx_callback_t, dcb_node));
+       tx->tx_start = gethrtime();
  #ifdef DEBUG_DMU_TX
         refcount_create(&tx->tx_space_written);
         refcount_create(&tx->tx_space_freed);
@@ -614,6 +616,7 @@ dmu_tx_hold_free(dmu_tx_t *tx, uint64_t object, uint64_t off, uint64_t len)
         if (txh == NULL)
                 return;
         dn = txh->txh_dnode;
+       dmu_tx_count_dnode(txh);
  
         if (off >= (dn->dn_maxblkid+1) * dn->dn_datablksz)
                 return;
@@ -931,6 +934,142 @@ dmu_tx_dirty_buf(dmu_tx_t *tx, dmu_buf_impl_t *db)
  }
  #endif
  
+/*
+ * If we can't do 10 iops, something is wrong.  Let us go ahead
+ * and hit zfs_dirty_data_max.
+ */
+hrtime_t zfs_delay_max_ns = 100 * MICROSEC; /* 100 milliseconds */
+int zfs_delay_resolution_ns = 100 * 1000; /* 100 microseconds */
+
+/*
+ * We delay transactions when we've determined that the backend storage
+ * isn't able to accommodate the rate of incoming writes.
+ *
+ * If there is already a transaction waiting, we delay relative to when
+ * that transaction finishes waiting.  This way the calculated min_time
+ * is independent of the number of threads concurrently executing
+ * transactions.
+ *
+ * If we are the only waiter, wait relative to when the transaction
+ * started, rather than the current time.  This credits the transaction for
+ * "time already served", e.g. reading indirect blocks.
+ *
+ * The minimum time for a transaction to take is calculated as:
+ *     min_time = scale * (dirty - min) / (max - dirty)
+ *     min_time is then capped at zfs_delay_max_ns.
+ *
+ * The delay has two degrees of freedom that can be adjusted via tunables.
+ * The percentage of dirty data at which we start to delay is defined by
+ * zfs_delay_min_dirty_percent. This should typically be at or above
+ * zfs_vdev_async_write_active_max_dirty_percent so that we only start to
+ * delay after writing at full speed has failed to keep up with the incoming
+ * write rate. The scale of the curve is defined by zfs_delay_scale. Roughly
+ * speaking, this variable determines the amount of delay at the midpoint of
+ * the curve.
+ *
+ * delay
+ *  10ms +-------------------------------------------------------------*+
+ *       |                                                             *|
+ *   9ms +                                                             *+
+ *       |                                                             *|
+ *   8ms +                                                             *+
+ *       |                                                            * |
+ *   7ms +                                                            * +
+ *       |                                                            * |
+ *   6ms +                                                            * +
+ *       |                                                            * |
+ *   5ms +                                                           *  +
+ *       |                                                           *  |
+ *   4ms +                                                           *  +
+ *       |                                                           *  |
+ *   3ms +                                                          *   +
+ *       |                                                          *   |
+ *   2ms +                                              (midpoint) *    +
+ *       |                                                  |    **     |
+ *   1ms +                                                  v ***       +
+ *       |             zfs_delay_scale ---------->     ********         |
+ *     0 +-------------------------------------*********----------------+
+ *       0%                    <- zfs_dirty_data_max ->               100%
+ *
+ * Note that since the delay is added to the outstanding time remaining on the
+ * most recent transaction, the delay is effectively the inverse of IOPS.
+ * Here the midpoint of 500us translates to 2000 IOPS. The shape of the curve
+ * was chosen such that small changes in the amount of accumulated dirty data
+ * in the first 3/4 of the curve yield relatively small differences in the
+ * amount of delay.
+ *
+ * The effects can be easier to understand when the amount of delay is
+ * represented on a log scale:
+ *
+ * delay
+ * 100ms +-------------------------------------------------------------++
+ *       +                                                              +
+ *       |                                                              |
+ *       +                                                             *+
+ *  10ms +                                                             *+
+ *       +                                                           ** +
+ *       |                                              (midpoint)  **  |
+ *       +                                                  |     **    +
+ *   1ms +                                                  v ****      +
+ *       +             zfs_delay_scale ---------->        *****         +
+ *       |                                             ****             |
+ *       +                                          ****                +
+ * 100us +                                        **                    +
+ *       +                                       *                      +
+ *       |                                      *                       |
+ *       +                                     *                        +
+ *  10us +                                     *                        +
+ *       +                                                              +
+ *       |                                                              |
+ *       +                                                              +
+ *       +--------------------------------------------------------------+
+ *       0%                    <- zfs_dirty_data_max ->               100%
+ *
+ * Note here that only as the amount of dirty data approaches its limit does
+ * the delay start to increase rapidly. The goal of a properly tuned system
+ * should be to keep the amount of dirty data out of that range by first
+ * ensuring that the appropriate limits are set for the I/O scheduler to reach
+ * optimal throughput on the backend storage, and then by changing the value
+ * of zfs_delay_scale to increase the steepness of the curve.
+ */
+static void
+dmu_tx_delay(dmu_tx_t *tx, uint64_t dirty)
+{
+       dsl_pool_t *dp = tx->tx_pool;
+       uint64_t delay_min_bytes =
+           zfs_dirty_data_max * zfs_delay_min_dirty_percent / 100;
+       hrtime_t wakeup, min_tx_time, now;
+
+       if (dirty <= delay_min_bytes)
+               return;
+
+       /*
+        * The caller has already waited until we are under the max.
+        * We make them pass us the amount of dirty data so we don't
+        * have to handle the case of it being >= the max, which could
+        * cause a divide-by-zero if it's == the max.
+        */
+       ASSERT3U(dirty, <, zfs_dirty_data_max);
+
+       now = gethrtime();
+       min_tx_time = zfs_delay_scale *
+           (dirty - delay_min_bytes) / (zfs_dirty_data_max - dirty);
+       min_tx_time = MIN(min_tx_time, zfs_delay_max_ns);
+       if (now > tx->tx_start + min_tx_time)
+               return;
+
+       DTRACE_PROBE3(delay__mintime, dmu_tx_t *, tx, uint64_t, dirty,
+           uint64_t, min_tx_time);
+
+       mutex_enter(&dp->dp_lock);
+       wakeup = MAX(tx->tx_start + min_tx_time,
+           dp->dp_last_wakeup + min_tx_time);
+       dp->dp_last_wakeup = wakeup;
+       mutex_exit(&dp->dp_lock);
+
+       zfs_sleep_until(wakeup);
+}
+
  static int
  dmu_tx_try_assign(dmu_tx_t *tx, txg_how_t txg_how)
  {
@@ -965,6 +1104,13 @@ dmu_tx_try_assign(dmu_tx_t *tx, txg_how_t txg_how)
                 return (SET_ERROR(ERESTART));
         }
  
+       if (!tx->tx_waited &&
+           dsl_pool_need_dirty_delay(tx->tx_pool)) {
+               tx->tx_wait_dirty = B_TRUE;
+               DMU_TX_STAT_BUMP(dmu_tx_dirty_delay);
+               return (ERESTART);
+       }
+
         tx->tx_txg = txg_hold_open(tx->tx_pool, &tx->tx_txgh);
         tx->tx_needassign_txh = NULL;
  
@@ -1092,6 +1238,10 @@ dmu_tx_unassign(dmu_tx_t *tx)
   *     blocking, returns immediately with ERESTART.  This should be used
   *     whenever you're holding locks.  On an ERESTART error, the caller
   *     should drop locks, do a dmu_tx_wait(tx), and try again.
+ *
+ * (3) TXG_WAITED.  Like TXG_NOWAIT, but indicates that dmu_tx_wait()
+ *     has already been called on behalf of this operation (though
+ *     most likely on a different tx).
   */
  int
  dmu_tx_assign(dmu_tx_t *tx, txg_how_t txg_how)
@@ -1100,11 +1250,15 @@ dmu_tx_assign(dmu_tx_t *tx, txg_how_t txg_how)
         int err;
  
         ASSERT(tx->tx_txg == 0);
-       ASSERT(txg_how == TXG_WAIT || txg_how == TXG_NOWAIT);
+       ASSERT(txg_how == TXG_WAIT || txg_how == TXG_NOWAIT ||
+           txg_how == TXG_WAITED);
         ASSERT(!dsl_pool_sync_context(tx->tx_pool));
  
         before = gethrtime();
  
+       if (txg_how == TXG_WAITED)
+               tx->tx_waited = B_TRUE;
+
         /* If we might wait, we must not hold the config lock. */
         ASSERT(txg_how != TXG_WAIT || !dsl_pool_config_held(tx->tx_pool));
  
@@ -1128,17 +1282,47 @@ void
  dmu_tx_wait(dmu_tx_t *tx)
  {
         spa_t *spa = tx->tx_pool->dp_spa;
+       dsl_pool_t *dp = tx->tx_pool;
  
         ASSERT(tx->tx_txg == 0);
         ASSERT(!dsl_pool_config_held(tx->tx_pool));
  
-       /*
-        * It's possible that the pool has become active after this thread
-        * has tried to obtain a tx. If that's the case then his
-        * tx_lasttried_txg would not have been assigned.
-        */
-       if (spa_suspended(spa) || tx->tx_lasttried_txg == 0) {
-               txg_wait_synced(tx->tx_pool, spa_last_synced_txg(spa) + 1);
+       if (tx->tx_wait_dirty) {
+               uint64_t dirty;
+
+               /*
+                * dmu_tx_try_assign() has determined that we need to wait
+                * because we've consumed much or all of the dirty buffer
+                * space.
+                */
+               mutex_enter(&dp->dp_lock);
+               if (dp->dp_dirty_total >= zfs_dirty_data_max)
+                       DMU_TX_STAT_BUMP(dmu_tx_dirty_over_max);
+               while (dp->dp_dirty_total >= zfs_dirty_data_max)
+                       cv_wait(&dp->dp_spaceavail_cv, &dp->dp_lock);
+               dirty = dp->dp_dirty_total;
+               mutex_exit(&dp->dp_lock);
+
+               dmu_tx_delay(tx, dirty);
+
+               tx->tx_wait_dirty = B_FALSE;
+
+               /*
+                * Note: setting tx_waited only has effect if the caller
+                * used TXG_WAIT.  Otherwise they are going to destroy
+                * this tx and try again.  The common case, zfs_write(),
+                * uses TXG_WAIT.
+                */
+               tx->tx_waited = B_TRUE;
+       } else if (spa_suspended(spa) || tx->tx_lasttried_txg == 0) {
+               /*
+                * If the pool is suspended we need to wait until it
+                * is resumed.  Note that it's possible that the pool
+                * has become active after this thread has tried to
+                * obtain a tx.  If that's the case then tx_lasttried_txg
+                * would not have been set.
+                */
+               txg_wait_synced(dp, spa_last_synced_txg(spa) + 1);
         } else if (tx->tx_needassign_txh) {
                 dnode_t *dn = tx->tx_needassign_txh->txh_dnode;
  
@@ -1148,6 +1332,10 @@ dmu_tx_wait(dmu_tx_t *tx)
                 mutex_exit(&dn->dn_mtx);
                 tx->tx_needassign_txh = NULL;
         } else {
+               /*
+                * A dnode is assigned to the quiescing txg.  Wait for its
+                * transaction to complete.
+                */
                 txg_wait_open(tx->tx_pool, tx->tx_lasttried_txg + 1);
         }
  }
@@ -1268,7 +1456,6 @@ dmu_tx_pool(dmu_tx_t *tx)
         return (tx->tx_pool);
  }
  
-
  void
  dmu_tx_callback_register(dmu_tx_t *tx, dmu_tx_callback_func_t *func, void *data)
  {
diff --git a/module/zfs/dmu_zfetch.c b/module/zfs/dmu_zfetch.c

index 705478c82ef5c6be346f2cfdae9cd338101f587d..feb76394774f65e9ebad436a5da24bd7658209ff 100644 (file)
--- a/module/zfs/dmu_zfetch.c
+++ b/module/zfs/dmu_zfetch.c
@@ -23,6 +23,10 @@
   * Use is subject to license terms.
   */
  
+/*
+ * Copyright (c) 2013 by Delphix. All rights reserved.
+ */
+
  #include <sys/zfs_context.h>
  #include <sys/dnode.h>
  #include <sys/dmu_objset.h>
@@ -287,7 +291,7 @@ dmu_zfetch_fetch(dnode_t *dn, uint64_t blkid, uint64_t nblks)
         fetchsz = dmu_zfetch_fetchsz(dn, blkid, nblks);
  
         for (i = 0; i < fetchsz; i++) {
-               dbuf_prefetch(dn, blkid + i);
+               dbuf_prefetch(dn, blkid + i, ZIO_PRIORITY_ASYNC_READ);
         }
  
         return (fetchsz);
diff --git a/module/zfs/dnode.c b/module/zfs/dnode.c

index 7ac37957861a50282767d88ecdfb0424e29c1837..25c77753b2c5fce5e48d9b6f456734eb729558be 100644 (file)
--- a/module/zfs/dnode.c
+++ b/module/zfs/dnode.c
@@ -1789,23 +1789,22 @@ dnode_diduse_space(dnode_t *dn, int64_t delta)
  }
  
  /*
- * Call when we think we're going to write/free space in open context.
- * Be conservative (ie. OK to write less than this or free more than
- * this, but don't write more or free less).
+ * Call when we think we're going to write/free space in open context to track
+ * the amount of memory in use by the currently open txg.
   */
  void
  dnode_willuse_space(dnode_t *dn, int64_t space, dmu_tx_t *tx)
  {
         objset_t *os = dn->dn_objset;
         dsl_dataset_t *ds = os->os_dsl_dataset;
+       int64_t aspace = spa_get_asize(os->os_spa, space);
  
-       if (space > 0)
-               space = spa_get_asize(os->os_spa, space);
-
-       if (ds)
-               dsl_dir_willuse_space(ds->ds_dir, space, tx);
+       if (ds != NULL) {
+               dsl_dir_willuse_space(ds->ds_dir, aspace, tx);
+               dsl_pool_dirty_space(dmu_tx_pool(tx), space, tx);
+       }
  
-       dmu_tx_willuse_space(tx, space);
+       dmu_tx_willuse_space(tx, aspace);
  }
  
  /*
diff --git a/module/zfs/dsl_dir.c b/module/zfs/dsl_dir.c

index 44795f344de43ade59aa68690f93b9b27278a1c6..54a7dffb1d6779997c947d68ac2d5f05aeeb1618 100644 (file)
--- a/module/zfs/dsl_dir.c
+++ b/module/zfs/dsl_dir.c
@@ -589,7 +589,6 @@ dsl_dir_space_available(dsl_dir_t *dd,
  
  struct tempreserve {
         list_node_t tr_node;
-       dsl_pool_t *tr_dp;
         dsl_dir_t *tr_ds;
         uint64_t tr_size;
  };
@@ -740,25 +739,24 @@ dsl_dir_tempreserve_space(dsl_dir_t *dd, uint64_t lsize, uint64_t asize,
                 tr = kmem_zalloc(sizeof (struct tempreserve), KM_PUSHPAGE);
                 tr->tr_size = lsize;
                 list_insert_tail(tr_list, tr);
-
-               err = dsl_pool_tempreserve_space(dd->dd_pool, asize, tx);
         } else {
                 if (err == EAGAIN) {
+                       /*
+                        * If arc_memory_throttle() detected that pageout
+                        * is running and we are low on memory, we delay new
+                        * non-pageout transactions to give pageout an
+                        * advantage.
+                        *
+                        * It is unfortunate to be delaying while the caller's
+                        * locks are held.
+                        */
                         txg_delay(dd->dd_pool, tx->tx_txg,
                             MSEC2NSEC(10), MSEC2NSEC(10));
                         err = SET_ERROR(ERESTART);
                 }
-               dsl_pool_memory_pressure(dd->dd_pool);
         }
  
         if (err == 0) {
-               struct tempreserve *tr;
-
-               tr = kmem_zalloc(sizeof (struct tempreserve), KM_PUSHPAGE);
-               tr->tr_dp = dd->dd_pool;
-               tr->tr_size = asize;
-               list_insert_tail(tr_list, tr);
-
                 err = dsl_dir_tempreserve_impl(dd, asize, fsize >= asize,
                     FALSE, asize > usize, tr_list, tx, TRUE);
         }
@@ -787,10 +785,8 @@ dsl_dir_tempreserve_clear(void *tr_cookie, dmu_tx_t *tx)
         if (tr_cookie == NULL)
                 return;
  
-       while ((tr = list_head(tr_list))) {
-               if (tr->tr_dp) {
-                       dsl_pool_tempreserve_clear(tr->tr_dp, tr->tr_size, tx);
-               } else if (tr->tr_ds) {
+       while ((tr = list_head(tr_list)) != NULL) {
+               if (tr->tr_ds) {
                         mutex_enter(&tr->tr_ds->dd_lock);
                         ASSERT3U(tr->tr_ds->dd_tempreserved[txgidx], >=,
                             tr->tr_size);
@@ -806,8 +802,14 @@ dsl_dir_tempreserve_clear(void *tr_cookie, dmu_tx_t *tx)
         kmem_free(tr_list, sizeof (list_t));
  }
  
-static void
-dsl_dir_willuse_space_impl(dsl_dir_t *dd, int64_t space, dmu_tx_t *tx)
+/*
+ * This should be called from open context when we think we're going to write
+ * or free space, for example when dirtying data. Be conservative; it's okay
+ * to write less space or free more, but we don't want to write more or free
+ * less than the amount specified.
+ */
+void
+dsl_dir_willuse_space(dsl_dir_t *dd, int64_t space, dmu_tx_t *tx)
  {
         int64_t parent_space;
         uint64_t est_used;
@@ -825,19 +827,7 @@ dsl_dir_willuse_space_impl(dsl_dir_t *dd, int64_t space, dmu_tx_t *tx)
  
         /* XXX this is potentially expensive and unnecessary... */
         if (parent_space && dd->dd_parent)
-               dsl_dir_willuse_space_impl(dd->dd_parent, parent_space, tx);
-}
-
-/*
- * Call in open context when we think we're going to write/free space,
- * eg. when dirtying data.  Be conservative (ie. OK to write less than
- * this or free more than this, but don't write more or free less).
- */
-void
-dsl_dir_willuse_space(dsl_dir_t *dd, int64_t space, dmu_tx_t *tx)
-{
-       dsl_pool_willuse_space(dd->dd_pool, space, tx);
-       dsl_dir_willuse_space_impl(dd, space, tx);
+               dsl_dir_willuse_space(dd->dd_parent, parent_space, tx);
  }
  
  /* call from syncing context when we actually write/free space for this dd */
diff --git a/module/zfs/dsl_pool.c b/module/zfs/dsl_pool.c

index e7127c535cdcb7badbc347ddf8b0c6787bab6091..eed4bd49742821cc790228826d3617a8c7f5ab6f 100644 (file)
--- a/module/zfs/dsl_pool.c
+++ b/module/zfs/dsl_pool.c
@@ -46,18 +46,85 @@
  #include <sys/zil_impl.h>
  #include <sys/dsl_userhold.h>
  
-int zfs_no_write_throttle = 0;
-int zfs_write_limit_shift = 3;                 /* 1/8th of physical memory */
-int zfs_txg_synctime_ms = 1000;                /* target millisecs to sync a txg */
+/*
+ * ZFS Write Throttle
+ * ------------------
+ *
+ * ZFS must limit the rate of incoming writes to the rate at which it is able
+ * to sync data modifications to the backend storage. Throttling by too much
+ * creates an artificial limit; throttling by too little can only be sustained
+ * for short periods and would lead to highly lumpy performance. On a per-pool
+ * basis, ZFS tracks the amount of modified (dirty) data. As operations change
+ * data, the amount of dirty data increases; as ZFS syncs out data, the amount
+ * of dirty data decreases. When the amount of dirty data exceeds a
+ * predetermined threshold further modifications are blocked until the amount
+ * of dirty data decreases (as data is synced out).
+ *
+ * The limit on dirty data is tunable, and should be adjusted according to
+ * both the IO capacity and available memory of the system. The larger the
+ * window, the more ZFS is able to aggregate and amortize metadata (and data)
+ * changes. However, memory is a limited resource, and allowing for more dirty
+ * data comes at the cost of keeping other useful data in memory (for example
+ * ZFS data cached by the ARC).
+ *
+ * Implementation
+ *
+ * As buffers are modified dsl_pool_dirty_space() increments both the per-
+ * txg (dp_dirty_pertxg[]) and poolwide (dp_dirty_total) accounting of
+ * dirty space used; dsl_pool_undirty_space() decrements those values as data
+ * is synced out from dsl_pool_sync(). While only the poolwide value is
+ * relevant, the per-txg value is useful for debugging. The tunable
+ * zfs_dirty_data_max determines the dirty space limit. Once that value is
+ * exceeded, new writes are halted until space frees up.
+ *
+ * The zfs_dirty_data_sync tunable dictates the threshold at which we
+ * ensure that there is a txg syncing (see the comment in txg.c for a full
+ * description of transaction group stages).
+ *
+ * The IO scheduler uses both the dirty space limit and current amount of
+ * dirty data as inputs. Those values affect the number of concurrent IOs ZFS
+ * issues. See the comment in vdev_queue.c for details of the IO scheduler.
+ *
+ * The delay is also calculated based on the amount of dirty data.  See the
+ * comment above dmu_tx_delay() for details.
+ */
+
+/*
+ * zfs_dirty_data_max will be set to zfs_dirty_data_max_percent% of all memory,
+ * capped at zfs_dirty_data_max_max.  It can also be overridden with a module
+ * parameter.
+ */
+unsigned long zfs_dirty_data_max = 0;
+unsigned long zfs_dirty_data_max_max = 0;
+int zfs_dirty_data_max_percent = 10;
+int zfs_dirty_data_max_max_percent = 25;
  
-unsigned long zfs_write_limit_min = 32 << 20;  /* min write limit is 32MB */
-unsigned long zfs_write_limit_max = 0;         /* max data payload per txg */
-unsigned long zfs_write_limit_inflated = 0;
-unsigned long zfs_write_limit_override = 0;
+/*
+ * If there is at least this much dirty data, push out a txg.
+ */
+unsigned long zfs_dirty_data_sync = 64 * 1024 * 1024;
  
-kmutex_t zfs_write_limit_lock;
+/*
+ * Once there is this amount of dirty data, the dmu_tx_delay() will kick in
+ * and delay each transaction.
+ * This value should be >= zfs_vdev_async_write_active_max_dirty_percent.
+ */
+int zfs_delay_min_dirty_percent = 60;
  
-static pgcnt_t old_physmem = 0;
+/*
+ * This controls how quickly the delay approaches infinity.
+ * Larger values cause it to delay more for a given amount of dirty data.
+ * Therefore larger values will cause there to be less dirty data for a
+ * given throughput.
+ *
+ * For the smoothest delay, this value should be about 1 billion divided
+ * by the maximum number of operations per second.  This will smoothly
+ * handle between 10x and 1/10th this number.
+ *
+ * Note: zfs_delay_scale * zfs_dirty_data_max must be < 2^64, due to the
+ * multiply in dmu_tx_delay().
+ */
+unsigned long zfs_delay_scale = 1000 * 1000 * 1000 / 2000;
  
  hrtime_t zfs_throttle_delay = MSEC2NSEC(10);
  hrtime_t zfs_throttle_resolution = MSEC2NSEC(10);
@@ -87,7 +154,6 @@ dsl_pool_open_impl(spa_t *spa, uint64_t txg)
         dp->dp_spa = spa;
         dp->dp_meta_rootbp = *bp;
         rrw_init(&dp->dp_config_rwlock, B_TRUE);
-       dp->dp_write_limit = zfs_write_limit_min;
         txg_init(dp, txg);
  
         txg_list_create(&dp->dp_dirty_datasets,
@@ -100,6 +166,7 @@ dsl_pool_open_impl(spa_t *spa, uint64_t txg)
             offsetof(dsl_sync_task_t, dst_node));
  
         mutex_init(&dp->dp_lock, NULL, MUTEX_DEFAULT, NULL);
+       cv_init(&dp->dp_spaceavail_cv, NULL, CV_DEFAULT, NULL);
  
         dp->dp_iput_taskq = taskq_create("zfs_iput_taskq", 1, minclsyspri,
             1, 4, 0);
@@ -214,9 +281,9 @@ out:
  void
  dsl_pool_close(dsl_pool_t *dp)
  {
-       /* drop our references from dsl_pool_open() */
-
         /*
+        * Drop our references from dsl_pool_open().
+        *
          * Since we held the origin_snap from "syncing" context (which
          * includes pool-opening context), it actually only got a "ref"
          * and not a hold, so just drop that here.
@@ -346,6 +413,34 @@ deadlist_enqueue_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
         return (0);
  }
  
+static void
+dsl_pool_sync_mos(dsl_pool_t *dp, dmu_tx_t *tx)
+{
+       zio_t *zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
+       dmu_objset_sync(dp->dp_meta_objset, zio, tx);
+       VERIFY0(zio_wait(zio));
+       dprintf_bp(&dp->dp_meta_rootbp, "meta objset rootbp is %s", "");
+       spa_set_rootblkptr(dp->dp_spa, &dp->dp_meta_rootbp);
+}
+
+static void
+dsl_pool_dirty_delta(dsl_pool_t *dp, int64_t delta)
+{
+       ASSERT(MUTEX_HELD(&dp->dp_lock));
+
+       if (delta < 0)
+               ASSERT3U(-delta, <=, dp->dp_dirty_total);
+
+       dp->dp_dirty_total += delta;
+
+       /*
+        * Note: we signal even when increasing dp_dirty_total.
+        * This ensures forward progress -- each thread wakes the next waiter.
+        */
+       if (dp->dp_dirty_total <= zfs_dirty_data_max)
+               cv_signal(&dp->dp_spaceavail_cv);
+}
+
  void
  dsl_pool_sync(dsl_pool_t *dp, uint64_t txg)
  {
@@ -354,29 +449,18 @@ dsl_pool_sync(dsl_pool_t *dp, uint64_t txg)
         dsl_dir_t *dd;
         dsl_dataset_t *ds;
         objset_t *mos = dp->dp_meta_objset;
-       hrtime_t start, write_time;
-       uint64_t data_written;
-       int err;
         list_t synced_datasets;
  
         list_create(&synced_datasets, sizeof (dsl_dataset_t),
             offsetof(dsl_dataset_t, ds_synced_link));
  
-       /*
-        * We need to copy dp_space_towrite() before doing
-        * dsl_sync_task_sync(), because
-        * dsl_dataset_snapshot_reserve_space() will increase
-        * dp_space_towrite but not actually write anything.
-        */
-       data_written = dp->dp_space_towrite[txg & TXG_MASK];
-
         tx = dmu_tx_create_assigned(dp, txg);
  
-       dp->dp_read_overhead = 0;
-       start = gethrtime();
-
+       /*
+        * Write out all dirty blocks of dirty datasets.
+        */
         zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
-       while ((ds = txg_list_remove(&dp->dp_dirty_datasets, txg))) {
+       while ((ds = txg_list_remove(&dp->dp_dirty_datasets, txg)) != NULL) {
                 /*
                  * We must not sync any non-MOS datasets twice, because
                  * we may have taken a snapshot of them.  However, we
@@ -386,20 +470,25 @@ dsl_pool_sync(dsl_pool_t *dp, uint64_t txg)
                 list_insert_tail(&synced_datasets, ds);
                 dsl_dataset_sync(ds, zio, tx);
         }
-       DTRACE_PROBE(pool_sync__1setup);
-       err = zio_wait(zio);
+       VERIFY0(zio_wait(zio));
  
-       write_time = gethrtime() - start;
-       ASSERT(err == 0);
-       DTRACE_PROBE(pool_sync__2rootzio);
+       /*
+        * We have written all of the accounted dirty data, so our
+        * dp_dirty_pertxg[] should now be zero.  However, some seldom-used
+        * code paths do not adhere to this (e.g. dbuf_undirty(), also
+        * rounding error in dbuf_write_physdone).
+        * Shore up the accounting of any dirtied space now.
+        */
+       dsl_pool_undirty_space(dp, dp->dp_dirty_pertxg[txg & TXG_MASK], txg);
  
         /*
          * After the data blocks have been written (ensured by the zio_wait()
          * above), update the user/group space accounting.
          */
-       for (ds = list_head(&synced_datasets); ds;
-           ds = list_next(&synced_datasets, ds))
+       for (ds = list_head(&synced_datasets); ds != NULL;
+           ds = list_next(&synced_datasets, ds)) {
                 dmu_objset_do_userquota_updates(ds->ds_objset, tx);
+       }
  
         /*
          * Sync the datasets again to push out the changes due to
@@ -409,12 +498,12 @@ dsl_pool_sync(dsl_pool_t *dp, uint64_t txg)
          * about which blocks are part of the snapshot).
          */
         zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
-       while ((ds = txg_list_remove(&dp->dp_dirty_datasets, txg))) {
+       while ((ds = txg_list_remove(&dp->dp_dirty_datasets, txg)) != NULL) {
                 ASSERT(list_link_active(&ds->ds_synced_link));
                 dmu_buf_rele(ds->ds_dbuf, ds);
                 dsl_dataset_sync(ds, zio, tx);
         }
-       err = zio_wait(zio);
+       VERIFY0(zio_wait(zio));
  
         /*
          * Now that the datasets have been completely synced, we can
@@ -423,7 +512,7 @@ dsl_pool_sync(dsl_pool_t *dp, uint64_t txg)
          *  - move dead blocks from the pending deadlist to the on-disk deadlist
          *  - release hold from dsl_dataset_dirty()
          */
-       while ((ds = list_remove_head(&synced_datasets))) {
+       while ((ds = list_remove_head(&synced_datasets)) != NULL) {
                 ASSERTV(objset_t *os = ds->ds_objset);
                 bplist_iterate(&ds->ds_pending_deadlist,
                     deadlist_enqueue_cb, &ds->ds_deadlist, tx);
@@ -431,10 +520,9 @@ dsl_pool_sync(dsl_pool_t *dp, uint64_t txg)
                 dmu_buf_rele(ds->ds_dbuf, ds);
         }
  
-       start = gethrtime();
-       while ((dd = txg_list_remove(&dp->dp_dirty_dirs, txg)))
+       while ((dd = txg_list_remove(&dp->dp_dirty_dirs, txg)) != NULL) {
                 dsl_dir_sync(dd, tx);
-       write_time += gethrtime() - start;
+       }
  
         /*
          * The MOS's space is accounted for in the pool/$MOS
@@ -452,20 +540,10 @@ dsl_pool_sync(dsl_pool_t *dp, uint64_t txg)
                 dp->dp_mos_uncompressed_delta = 0;
         }
  
-       start = gethrtime();
         if (list_head(&mos->os_dirty_dnodes[txg & TXG_MASK]) != NULL ||
             list_head(&mos->os_free_dnodes[txg & TXG_MASK]) != NULL) {
-               zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
-               dmu_objset_sync(mos, zio, tx);
-               err = zio_wait(zio);
-               ASSERT(err == 0);
-               dprintf_bp(&dp->dp_meta_rootbp, "meta objset rootbp is %s", "");
-               spa_set_rootblkptr(dp->dp_spa, &dp->dp_meta_rootbp);
+               dsl_pool_sync_mos(dp, tx);
         }
-       write_time += gethrtime() - start;
-       DTRACE_PROBE2(pool_sync__4io, hrtime_t, write_time,
-           hrtime_t, dp->dp_read_overhead);
-       write_time -= dp->dp_read_overhead;
  
         /*
          * If we modify a dataset in the same txg that we want to destroy it,
@@ -476,72 +554,29 @@ dsl_pool_sync(dsl_pool_t *dp, uint64_t txg)
          * The MOS data dirtied by the sync_tasks will be synced on the next
          * pass.
          */
-       DTRACE_PROBE(pool_sync__3task);
         if (!txg_list_empty(&dp->dp_sync_tasks, txg)) {
                 dsl_sync_task_t *dst;
                 /*
                  * No more sync tasks should have been added while we
                  * were syncing.
                  */
-               ASSERT(spa_sync_pass(dp->dp_spa) == 1);
-               while ((dst = txg_list_remove(&dp->dp_sync_tasks, txg)))
+               ASSERT3U(spa_sync_pass(dp->dp_spa), ==, 1);
+               while ((dst = txg_list_remove(&dp->dp_sync_tasks, txg)) != NULL)
                         dsl_sync_task_sync(dst, tx);
         }
  
         dmu_tx_commit(tx);
  
-       dp->dp_space_towrite[txg & TXG_MASK] = 0;
-       ASSERT(dp->dp_tempreserved[txg & TXG_MASK] == 0);
-
-       /*
-        * If the write limit max has not been explicitly set, set it
-        * to a fraction of available physical memory (default 1/8th).
-        * Note that we must inflate the limit because the spa
-        * inflates write sizes to account for data replication.
-        * Check this each sync phase to catch changing memory size.
-        */
-       if (physmem != old_physmem && zfs_write_limit_shift) {
-               mutex_enter(&zfs_write_limit_lock);
-               old_physmem = physmem;
-               zfs_write_limit_max = ptob(physmem) >> zfs_write_limit_shift;
-               zfs_write_limit_inflated = MAX(zfs_write_limit_min,
-                   spa_get_asize(dp->dp_spa, zfs_write_limit_max));
-               mutex_exit(&zfs_write_limit_lock);
-       }
-
-       /*
-        * Attempt to keep the sync time consistent by adjusting the
-        * amount of write traffic allowed into each transaction group.
-        * Weight the throughput calculation towards the current value:
-        *      thru = 3/4 old_thru + 1/4 new_thru
-        *
-        * Note: write_time is in nanosecs while dp_throughput is expressed in
-        * bytes per millisecond.
-        */
-       ASSERT(zfs_write_limit_min > 0);
-       if (data_written > zfs_write_limit_min / 8 &&
-           write_time > MSEC2NSEC(1)) {
-               uint64_t throughput = data_written / NSEC2MSEC(write_time);
-
-               if (dp->dp_throughput)
-                       dp->dp_throughput = throughput / 4 +
-                           3 * dp->dp_throughput / 4;
-               else
-                       dp->dp_throughput = throughput;
-               dp->dp_write_limit = MIN(zfs_write_limit_inflated,
-                   MAX(zfs_write_limit_min,
-                   dp->dp_throughput * zfs_txg_synctime_ms));
-       }
+       DTRACE_PROBE2(dsl_pool_sync__done, dsl_pool_t *dp, dp, uint64_t, txg);
  }
  
  void
  dsl_pool_sync_done(dsl_pool_t *dp, uint64_t txg)
  {
         zilog_t *zilog;
-       dsl_dataset_t *ds;
  
         while ((zilog = txg_list_remove(&dp->dp_dirty_zilogs, txg))) {
-               ds = dmu_objset_ds(zilog->zl_os);
+               dsl_dataset_t *ds = dmu_objset_ds(zilog->zl_os);
                 zil_clean(zilog, txg);
                 ASSERT(!dmu_objset_is_dirty(zilog->zl_os, txg));
                 dmu_buf_rele(ds->ds_dbuf, zilog);
@@ -583,84 +618,49 @@ dsl_pool_adjustedsize(dsl_pool_t *dp, boolean_t netfree)
         return (space - resv);
  }
  
-int
-dsl_pool_tempreserve_space(dsl_pool_t *dp, uint64_t space, dmu_tx_t *tx)
+boolean_t
+dsl_pool_need_dirty_delay(dsl_pool_t *dp)
  {
-       uint64_t reserved = 0;
-       uint64_t write_limit = (zfs_write_limit_override ?
-           zfs_write_limit_override : dp->dp_write_limit);
-
-       if (zfs_no_write_throttle) {
-               atomic_add_64(&dp->dp_tempreserved[tx->tx_txg & TXG_MASK],
-                   space);
-               return (0);
-       }
-
-       /*
-        * Check to see if we have exceeded the maximum allowed IO for
-        * this transaction group.  We can do this without locks since
-        * a little slop here is ok.  Note that we do the reserved check
-        * with only half the requested reserve: this is because the
-        * reserve requests are worst-case, and we really don't want to
-        * throttle based off of worst-case estimates.
-        */
-       if (write_limit > 0) {
-               reserved = dp->dp_space_towrite[tx->tx_txg & TXG_MASK]
-                   + dp->dp_tempreserved[tx->tx_txg & TXG_MASK] / 2;
+       uint64_t delay_min_bytes =
+           zfs_dirty_data_max * zfs_delay_min_dirty_percent / 100;
+       boolean_t rv;
  
-               if (reserved && reserved > write_limit) {
-                       DMU_TX_STAT_BUMP(dmu_tx_write_limit);
-                       return (SET_ERROR(ERESTART));
-               }
-       }
-
-       atomic_add_64(&dp->dp_tempreserved[tx->tx_txg & TXG_MASK], space);
-
-       /*
-        * If this transaction group is over 7/8ths capacity, delay
-        * the caller 1 clock tick.  This will slow down the "fill"
-        * rate until the sync process can catch up with us.
-        */
-       if (reserved && reserved > (write_limit - (write_limit >> 3))) {
-               txg_delay(dp, tx->tx_txg, zfs_throttle_delay,
-                   zfs_throttle_resolution);
-       }
-
-       return (0);
+       mutex_enter(&dp->dp_lock);
+       if (dp->dp_dirty_total > zfs_dirty_data_sync)
+               txg_kick(dp);
+       rv = (dp->dp_dirty_total > delay_min_bytes);
+       mutex_exit(&dp->dp_lock);
+       return (rv);
  }
  
  void
-dsl_pool_tempreserve_clear(dsl_pool_t *dp, int64_t space, dmu_tx_t *tx)
+dsl_pool_dirty_space(dsl_pool_t *dp, int64_t space, dmu_tx_t *tx)
  {
-       ASSERT(dp->dp_tempreserved[tx->tx_txg & TXG_MASK] >= space);
-       atomic_add_64(&dp->dp_tempreserved[tx->tx_txg & TXG_MASK], -space);
+       if (space > 0) {
+               mutex_enter(&dp->dp_lock);
+               dp->dp_dirty_pertxg[tx->tx_txg & TXG_MASK] += space;
+               dsl_pool_dirty_delta(dp, space);
+               mutex_exit(&dp->dp_lock);
+       }
  }
  
  void
-dsl_pool_memory_pressure(dsl_pool_t *dp)
+dsl_pool_undirty_space(dsl_pool_t *dp, int64_t space, uint64_t txg)
  {
-       uint64_t space_inuse = 0;
-       int i;
-
-       if (dp->dp_write_limit == zfs_write_limit_min)
+       ASSERT3S(space, >=, 0);
+       if (space == 0)
                 return;
  
-       for (i = 0; i < TXG_SIZE; i++) {
-               space_inuse += dp->dp_space_towrite[i];
-               space_inuse += dp->dp_tempreserved[i];
-       }
-       dp->dp_write_limit = MAX(zfs_write_limit_min,
-           MIN(dp->dp_write_limit, space_inuse / 4));
-}
-
-void
-dsl_pool_willuse_space(dsl_pool_t *dp, int64_t space, dmu_tx_t *tx)
-{
-       if (space > 0) {
-               mutex_enter(&dp->dp_lock);
-               dp->dp_space_towrite[tx->tx_txg & TXG_MASK] += space;
-               mutex_exit(&dp->dp_lock);
+       mutex_enter(&dp->dp_lock);
+       if (dp->dp_dirty_pertxg[txg & TXG_MASK] < space) {
+               /* XXX writing something we didn't dirty? */
+               space = dp->dp_dirty_pertxg[txg & TXG_MASK];
         }
+       ASSERT3U(dp->dp_dirty_pertxg[txg & TXG_MASK], >=, space);
+       dp->dp_dirty_pertxg[txg & TXG_MASK] -= space;
+       ASSERT3U(dp->dp_dirty_total, >=, space);
+       dsl_pool_dirty_delta(dp, -space);
+       mutex_exit(&dp->dp_lock);
  }
  
  /* ARGSUSED */
@@ -1049,24 +1049,30 @@ dsl_pool_config_held(dsl_pool_t *dp)
  EXPORT_SYMBOL(dsl_pool_config_enter);
  EXPORT_SYMBOL(dsl_pool_config_exit);
  
-module_param(zfs_no_write_throttle, int, 0644);
-MODULE_PARM_DESC(zfs_no_write_throttle, "Disable write throttling");
+/* zfs_dirty_data_max_percent only applied at module load time in arc_init(). */
+module_param(zfs_dirty_data_max_percent, int, 0444);
+MODULE_PARM_DESC(zfs_dirty_data_max_percent, "percent of ram can be dirty");
  
-module_param(zfs_write_limit_shift, int, 0444);
-MODULE_PARM_DESC(zfs_write_limit_shift, "log2(fraction of memory) per txg");
+/* zfs_dirty_data_max_max_percent only applied at module load time in
+ * arc_init(). */
+module_param(zfs_dirty_data_max_max_percent, int, 0444);
+MODULE_PARM_DESC(zfs_dirty_data_max_max_percent,
+    "zfs_dirty_data_max upper bound as % of RAM");
  
-module_param(zfs_txg_synctime_ms, int, 0644);
-MODULE_PARM_DESC(zfs_txg_synctime_ms, "Target milliseconds between txg sync");
+module_param(zfs_delay_min_dirty_percent, int, 0644);
+MODULE_PARM_DESC(zfs_delay_min_dirty_percent, "transaction delay threshold");
  
-module_param(zfs_write_limit_min, ulong, 0444);
-MODULE_PARM_DESC(zfs_write_limit_min, "Min txg write limit");
+module_param(zfs_dirty_data_max, ulong, 0644);
+MODULE_PARM_DESC(zfs_dirty_data_max, "determines the dirty space limit");
  
-module_param(zfs_write_limit_max, ulong, 0444);
-MODULE_PARM_DESC(zfs_write_limit_max, "Max txg write limit");
+/* zfs_dirty_data_max_max only applied at module load time in arc_init(). */
+module_param(zfs_dirty_data_max_max, ulong, 0444);
+MODULE_PARM_DESC(zfs_dirty_data_max_max,
+    "zfs_dirty_data_max upper bound in bytes");
  
-module_param(zfs_write_limit_inflated, ulong, 0444);
-MODULE_PARM_DESC(zfs_write_limit_inflated, "Inflated txg write limit");
+module_param(zfs_dirty_data_sync, ulong, 0644);
+MODULE_PARM_DESC(zfs_dirty_data_sync, "sync txg when this much dirty data");
  
-module_param(zfs_write_limit_override, ulong, 0444);
-MODULE_PARM_DESC(zfs_write_limit_override, "Override txg write limit");
+module_param(zfs_delay_scale, ulong, 0644);
+MODULE_PARM_DESC(zfs_delay_scale, "how quickly delay approaches infinity");
  #endif
diff --git a/module/zfs/dsl_scan.c b/module/zfs/dsl_scan.c

index 354d5b1df89c80af2b08a62ff0c131b19f03de9c..3780aee798c71db9f7b3dc5d7fe5312a013aaa3e 100644 (file)
--- a/module/zfs/dsl_scan.c
+++ b/module/zfs/dsl_scan.c
@@ -1650,7 +1650,6 @@ dsl_scan_scrub_cb(dsl_pool_t *dp,
         uint64_t phys_birth = BP_PHYSICAL_BIRTH(bp);
         boolean_t needs_io = B_FALSE;
         int zio_flags = ZIO_FLAG_SCAN_THREAD | ZIO_FLAG_RAW | ZIO_FLAG_CANFAIL;
-       int zio_priority = 0;
         int scan_delay = 0;
         int d;
  
@@ -1663,13 +1662,11 @@ dsl_scan_scrub_cb(dsl_pool_t *dp,
         ASSERT(DSL_SCAN_IS_SCRUB_RESILVER(scn));
         if (scn->scn_phys.scn_func == POOL_SCAN_SCRUB) {
                 zio_flags |= ZIO_FLAG_SCRUB;
-               zio_priority = ZIO_PRIORITY_SCRUB;
                 needs_io = B_TRUE;
                 scan_delay = zfs_scrub_delay;
         } else {
                 ASSERT3U(scn->scn_phys.scn_func, ==, POOL_SCAN_RESILVER);
                 zio_flags |= ZIO_FLAG_RESILVER;
-               zio_priority = ZIO_PRIORITY_RESILVER;
                 needs_io = B_FALSE;
                 scan_delay = zfs_resilver_delay;
         }
@@ -1727,7 +1724,7 @@ dsl_scan_scrub_cb(dsl_pool_t *dp,
                         delay(scan_delay);
  
                 zio_nowait(zio_read(NULL, spa, bp, data, size,
-                   dsl_scan_scrub_done, NULL, zio_priority,
+                   dsl_scan_scrub_done, NULL, ZIO_PRIORITY_SCRUB,
                     zio_flags, zb));
         }
  
diff --git a/module/zfs/spa.c b/module/zfs/spa.c

index c7ea5953630d215fa5cf6e0ca5eb5e0a63d912a1..3daf5805d08c3da8528418049d60d578661ecd42 100644 (file)
--- a/module/zfs/spa.c
+++ b/module/zfs/spa.c
@@ -83,7 +83,6 @@
  
  typedef enum zti_modes {
         ZTI_MODE_FIXED,                 /* value is # of threads (min 1) */
-       ZTI_MODE_ONLINE_PERCENT,        /* value is % of online CPUs */
         ZTI_MODE_BATCH,                 /* cpu-intensive; value is ignored */
         ZTI_MODE_NULL,                  /* don't create a taskq */
         ZTI_NMODES
@@ -142,7 +141,7 @@ static inline int spa_load_impl(spa_t *spa, uint64_t, nvlist_t *config,
      char **ereport);
  static void spa_vdev_resilver_done(spa_t *spa);
  
-uint_t         zio_taskq_batch_pct = 100;      /* 1 thread per cpu in pset */
+uint_t         zio_taskq_batch_pct = 75;       /* threads as % of cpus in pset */
  id_t           zio_taskq_psrset_bind = PS_NONE;
  boolean_t      zio_taskq_sysdc = B_TRUE;       /* use SDC scheduling class */
  uint_t         zio_taskq_basedc = 80;          /* base duty cycle */
@@ -837,31 +836,27 @@ spa_taskqs_init(spa_t *spa, zio_type_t t, zio_taskq_type_t q)
         tqs->stqs_count = count;
         tqs->stqs_taskq = kmem_alloc(count * sizeof (taskq_t *), KM_SLEEP);
  
-       for (i = 0; i < count; i++) {
-               taskq_t *tq;
-
-               switch (mode) {
-               case ZTI_MODE_FIXED:
-                       ASSERT3U(value, >=, 1);
-                       value = MAX(value, 1);
-                       break;
+       switch (mode) {
+       case ZTI_MODE_FIXED:
+               ASSERT3U(value, >=, 1);
+               value = MAX(value, 1);
+               break;
  
-               case ZTI_MODE_BATCH:
-                       batch = B_TRUE;
-                       flags |= TASKQ_THREADS_CPU_PCT;
-                       value = zio_taskq_batch_pct;
-                       break;
+       case ZTI_MODE_BATCH:
+               batch = B_TRUE;
+               flags |= TASKQ_THREADS_CPU_PCT;
+               value = zio_taskq_batch_pct;
+               break;
  
-               case ZTI_MODE_ONLINE_PERCENT:
-                       flags |= TASKQ_THREADS_CPU_PCT;
-                       break;
+       default:
+               panic("unrecognized mode for %s_%s taskq (%u:%u) in "
+                   "spa_activate()",
+                   zio_type_name[t], zio_taskq_types[q], mode, value);
+               break;
+       }
  
-               default:
-                       panic("unrecognized mode for %s_%s taskq (%u:%u) in "
-                           "spa_activate()",
-                           zio_type_name[t], zio_taskq_types[q], mode, value);
-                       break;
-               }
+       for (i = 0; i < count; i++) {
+               taskq_t *tq;
  
                 if (count > 1) {
                         (void) snprintf(name, sizeof (name), "%s_%s_%u",
@@ -878,7 +873,16 @@ spa_taskqs_init(spa_t *spa, zio_type_t t, zio_taskq_type_t q)
                         tq = taskq_create_sysdc(name, value, 50, INT_MAX,
                             spa->spa_proc, zio_taskq_basedc, flags);
                 } else {
-                       tq = taskq_create_proc(name, value, maxclsyspri, 50,
+                       pri_t pri = maxclsyspri;
+                       /*
+                        * The write issue taskq can be extremely CPU
+                        * intensive.  Run it at slightly lower priority
+                        * than the other taskqs.
+                        */
+                       if (t == ZIO_TYPE_WRITE && q == ZIO_TASKQ_ISSUE)
+                               pri--;
+
+                       tq = taskq_create_proc(name, value, pri, 50,
                             INT_MAX, spa->spa_proc, flags);
                 }
  
@@ -5775,6 +5779,31 @@ spa_free_sync_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
         return (0);
  }
  
+/*
+ * Note: this simple function is not inlined to make it easier to dtrace the
+ * amount of time spent syncing frees.
+ */
+static void
+spa_sync_frees(spa_t *spa, bplist_t *bpl, dmu_tx_t *tx)
+{
+       zio_t *zio = zio_root(spa, NULL, NULL, 0);
+       bplist_iterate(bpl, spa_free_sync_cb, zio, tx);
+       VERIFY(zio_wait(zio) == 0);
+}
+
+/*
+ * Note: this simple function is not inlined to make it easier to dtrace the
+ * amount of time spent syncing deferred frees.
+ */
+static void
+spa_sync_deferred_frees(spa_t *spa, dmu_tx_t *tx)
+{
+       zio_t *zio = zio_root(spa, NULL, NULL, 0);
+       VERIFY3U(bpobj_iterate(&spa->spa_deferred_bpobj,
+           spa_free_sync_cb, zio, tx), ==, 0);
+       VERIFY0(zio_wait(zio));
+}
+
  static void
  spa_sync_nvlist(spa_t *spa, uint64_t obj, nvlist_t *nv, dmu_tx_t *tx)
  {
@@ -6102,7 +6131,6 @@ spa_sync(spa_t *spa, uint64_t txg)
  {
         dsl_pool_t *dp = spa->spa_dsl_pool;
         objset_t *mos = spa->spa_meta_objset;
-       bpobj_t *defer_bpo = &spa->spa_deferred_bpobj;
         bplist_t *free_bpl = &spa->spa_free_bplist[txg & TXG_MASK];
         vdev_t *rvd = spa->spa_root_vdev;
         vdev_t *vd;
@@ -6185,10 +6213,7 @@ spa_sync(spa_t *spa, uint64_t txg)
             !txg_list_empty(&dp->dp_sync_tasks, txg) ||
             ((dsl_scan_active(dp->dp_scan) ||
             txg_sync_waiting(dp)) && !spa_shutting_down(spa))) {
-               zio_t *zio = zio_root(spa, NULL, NULL, 0);
-               VERIFY3U(bpobj_iterate(defer_bpo,
-                   spa_free_sync_cb, zio, tx), ==, 0);
-               VERIFY0(zio_wait(zio));
+               spa_sync_deferred_frees(spa, tx);
         }
  
         /*
@@ -6206,13 +6231,10 @@ spa_sync(spa_t *spa, uint64_t txg)
                 dsl_pool_sync(dp, txg);
  
                 if (pass < zfs_sync_pass_deferred_free) {
-                       zio_t *zio = zio_root(spa, NULL, NULL, 0);
-                       bplist_iterate(free_bpl, spa_free_sync_cb,
-                           zio, tx);
-                       VERIFY(zio_wait(zio) == 0);
+                       spa_sync_frees(spa, free_bpl, tx);
                 } else {
                         bplist_iterate(free_bpl, bpobj_enqueue_cb,
-                           defer_bpo, tx);
+                           &spa->spa_deferred_bpobj, tx);
                 }
  
                 ddt_sync(spa, txg);
diff --git a/module/zfs/spa_misc.c b/module/zfs/spa_misc.c

index 91e7fdf3580b7a57a4a5d8336fddeed4b6183abe..d12e233b180c669426a0dc77a0adba4910ba49a8 100644 (file)
--- a/module/zfs/spa_misc.c
+++ b/module/zfs/spa_misc.c
@@ -238,21 +238,31 @@ kmem_cache_t *spa_buffer_pool;
  int spa_mode_global;
  
  /*
- * Expiration time in units of zfs_txg_synctime_ms. This value has two
- * meanings. First it is used to determine when the spa_deadman logic
- * should fire. By default the spa_deadman will fire if spa_sync has
- * not completed in 1000 * zfs_txg_synctime_ms (i.e. 1000 seconds).
- * Secondly, the value determines if an I/O is considered "hung".
- * Any I/O that has not completed in zfs_deadman_synctime is considered
- * "hung" resulting in a zevent being posted.
+ * Expiration time in milliseconds. This value has two meanings. First it is
+ * used to determine when the spa_deadman() logic should fire. By default the
+ * spa_deadman() will fire if spa_sync() has not completed in 1000 seconds.
+ * Secondly, the value determines if an I/O is considered "hung". Any I/O that
+ * has not completed in zfs_deadman_synctime_ms is considered "hung" resulting
+ * in a zevent being posted.
   */
-unsigned long zfs_deadman_synctime = 1000ULL;
+unsigned long zfs_deadman_synctime_ms = 1000000ULL;
  
  /*
   * By default the deadman is enabled.
   */
  int zfs_deadman_enabled = 1;
  
+/*
+ * The worst case is single-sector max-parity RAID-Z blocks, in which
+ * case the space requirement is exactly (VDEV_RAIDZ_MAXPARITY + 1)
+ * times the size; so just assume that.  Add to this the fact that
+ * we can have up to 3 DVAs per bp, and one more factor of 2 because
+ * the block may be dittoed with up to 3 DVAs by ddt_sync().  All together,
+ * the worst case is:
+ *     (VDEV_RAIDZ_MAXPARITY + 1) * SPA_DVAS_PER_BP * 2 == 24
+ */
+int spa_asize_inflation = 24;
+
  /*
   * ==========================================================================
   * SPA config locking
@@ -489,8 +499,7 @@ spa_add(const char *name, nvlist_t *config, const char *altroot)
         spa->spa_proc = &p0;
         spa->spa_proc_state = SPA_PROC_NONE;
  
-       spa->spa_deadman_synctime = MSEC2NSEC(zfs_deadman_synctime *
-           zfs_txg_synctime_ms);
+       spa->spa_deadman_synctime = MSEC2NSEC(zfs_deadman_synctime_ms);
  
         refcount_create(&spa->spa_refcount);
         spa_config_lock_init(spa);
@@ -1452,14 +1461,7 @@ spa_freeze_txg(spa_t *spa)
  uint64_t
  spa_get_asize(spa_t *spa, uint64_t lsize)
  {
-       /*
-        * The worst case is single-sector max-parity RAID-Z blocks, in which
-        * case the space requirement is exactly (VDEV_RAIDZ_MAXPARITY + 1)
-        * times the size; so just assume that.  Add to this the fact that
-        * we can have up to 3 DVAs per bp, and one more factor of 2 because
-        * the block may be dittoed with up to 3 DVAs by ddt_sync().
-        */
-       return (lsize * (VDEV_RAIDZ_MAXPARITY + 1) * SPA_DVAS_PER_BP * 2);
+       return (lsize * spa_asize_inflation);
  }
  
  uint64_t
@@ -1880,9 +1882,13 @@ EXPORT_SYMBOL(spa_mode);
  
  EXPORT_SYMBOL(spa_namespace_lock);
  
-module_param(zfs_deadman_synctime, ulong, 0644);
-MODULE_PARM_DESC(zfs_deadman_synctime,"Expire in units of zfs_txg_synctime_ms");
+module_param(zfs_deadman_synctime_ms, ulong, 0644);
+MODULE_PARM_DESC(zfs_deadman_synctime_ms,"Expiration time in milliseconds");
  
  module_param(zfs_deadman_enabled, int, 0644);
  MODULE_PARM_DESC(zfs_deadman_enabled, "Enable deadman timer");
+
+module_param(spa_asize_inflation, int, 0644);
+MODULE_PARM_DESC(spa_asize_inflation,
+    "SPA size estimate multiplication factor");
  #endif
diff --git a/module/zfs/txg.c b/module/zfs/txg.c

index 8d410f7a5565ad2a4f2a7916e356994e4490fb63..9a594b95475d6a0d1b3897740ba9b6968727b13f 100644 (file)
--- a/module/zfs/txg.c
+++ b/module/zfs/txg.c
@@ -46,7 +46,7 @@
   * either be processing, or blocked waiting to enter the next state. There may
   * be up to three active txgs, and there is always a txg in the open state
   * (though it may be blocked waiting to enter the quiescing state). In broad
- * strokes, transactions — operations that change in-memory structures — are
+ * strokes, transactions -- operations that change in-memory structures -- are
   * accepted into the txg in the open state, and are completed while the txg is
   * in the open or quiescing states. The accumulated changes are written to
   * disk in the syncing state.
@@ -54,7 +54,7 @@
   * Open
   *
   * When a new txg becomes active, it first enters the open state. New
- * transactions — updates to in-memory structures — are assigned to the
+ * transactions -- updates to in-memory structures -- are assigned to the
   * currently open txg. There is always a txg in the open state so that ZFS can
   * accept new changes (though the txg may refuse new changes if it has hit
   * some limit). ZFS advances the open txg to the next state for a variety of
@@ -375,6 +375,7 @@ txg_quiesce(dsl_pool_t *dp, uint64_t txg)
  
         ASSERT(txg == tx->tx_open_txg);
         tx->tx_open_txg++;
+       tx->tx_open_time = gethrtime();
  
         spa_txg_history_set(dp->dp_spa, txg, TXG_STATE_OPEN, gethrtime());
         spa_txg_history_add(dp->dp_spa, tx->tx_open_txg);
@@ -511,7 +512,8 @@ txg_sync_thread(dsl_pool_t *dp)
                 while (!dsl_scan_active(dp->dp_scan) &&
                     !tx->tx_exiting && timer > 0 &&
                     tx->tx_synced_txg >= tx->tx_sync_txg_waiting &&
-                   tx->tx_quiesced_txg == 0) {
+                   tx->tx_quiesced_txg == 0 &&
+                   dp->dp_dirty_total < zfs_dirty_data_sync) {
                         dprintf("waiting; tx_synced=%llu waiting=%llu dp=%p\n",
                             tx->tx_synced_txg, tx->tx_sync_txg_waiting, dp);
                         txg_thread_wait(tx, &cpr, &tx->tx_sync_more_cv, timer);
@@ -574,8 +576,7 @@ txg_sync_thread(dsl_pool_t *dp)
                     vs2->vs_bytes[ZIO_TYPE_WRITE]-vs1->vs_bytes[ZIO_TYPE_WRITE],
                     vs2->vs_ops[ZIO_TYPE_READ]-vs1->vs_ops[ZIO_TYPE_READ],
                     vs2->vs_ops[ZIO_TYPE_WRITE]-vs1->vs_ops[ZIO_TYPE_WRITE],
-                   dp->dp_space_towrite[txg & TXG_MASK] +
-                   dp->dp_tempreserved[txg & TXG_MASK] / 2);
+                   dp->dp_dirty_pertxg[txg & TXG_MASK]);
                 spa_txg_history_set(spa, txg, TXG_STATE_SYNCED, gethrtime());
         }
  }
@@ -705,6 +706,28 @@ txg_wait_open(dsl_pool_t *dp, uint64_t txg)
         mutex_exit(&tx->tx_sync_lock);
  }
  
+/*
+ * If there isn't a txg syncing or in the pipeline, push another txg through
+ * the pipeline by quiescing the open txg.
+ */
+void
+txg_kick(dsl_pool_t *dp)
+{
+       tx_state_t *tx = &dp->dp_tx;
+
+       ASSERT(!dsl_pool_config_held(dp));
+
+       mutex_enter(&tx->tx_sync_lock);
+       if (tx->tx_syncing_txg == 0 &&
+           tx->tx_quiesce_txg_waiting <= tx->tx_open_txg &&
+           tx->tx_sync_txg_waiting <= tx->tx_synced_txg &&
+           tx->tx_quiesced_txg <= tx->tx_synced_txg) {
+               tx->tx_quiesce_txg_waiting = tx->tx_open_txg + 1;
+               cv_broadcast(&tx->tx_quiesce_more_cv);
+       }
+       mutex_exit(&tx->tx_sync_lock);
+}
+
  boolean_t
  txg_stalled(dsl_pool_t *dp)
  {
diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c

index ff239d72c172f7b48155f10a0e69c025637c9ee2..7751683d1396003713eca509a992290275657864 100644 (file)
--- a/module/zfs/vdev.c
+++ b/module/zfs/vdev.c
@@ -3296,7 +3296,7 @@ vdev_deadman(vdev_t *vd)
                 vdev_queue_t *vq = &vd->vdev_queue;
  
                 mutex_enter(&vq->vq_lock);
-               if (avl_numnodes(&vq->vq_pending_tree) > 0) {
+               if (avl_numnodes(&vq->vq_active_tree) > 0) {
                         spa_t *spa = vd->vdev_spa;
                         zio_t *fio;
                         uint64_t delta;
@@ -3306,7 +3306,7 @@ vdev_deadman(vdev_t *vd)
                          * if any I/O has been outstanding for longer than
                          * the spa_deadman_synctime we log a zevent.
                          */
-                       fio = avl_first(&vq->vq_pending_tree);
+                       fio = avl_first(&vq->vq_active_tree);
                         delta = gethrtime() - fio->io_timestamp;
                         if (delta > spa_deadman_synctime(spa)) {
                                 zfs_dbgmsg("SLOW IO: zio timestamp %lluns, "
diff --git a/module/zfs/vdev_cache.c b/module/zfs/vdev_cache.c

index 6d0cb967d072c06de6e8c6d7ab35dd9ca1a98067..ec215ffa513ea542894a95d996ccf149fbe9cd28 100644 (file)
--- a/module/zfs/vdev_cache.c
+++ b/module/zfs/vdev_cache.c
@@ -312,7 +312,7 @@ vdev_cache_read(zio_t *zio)
         }
  
         fio = zio_vdev_delegated_io(zio->io_vd, cache_offset,
-           ve->ve_data, VCBS, ZIO_TYPE_READ, ZIO_PRIORITY_CACHE_FILL,
+           ve->ve_data, VCBS, ZIO_TYPE_READ, ZIO_PRIORITY_NOW,
             ZIO_FLAG_DONT_CACHE, vdev_cache_fill, ve);
  
         ve->ve_fill_io = fio;
diff --git a/module/zfs/vdev_mirror.c b/module/zfs/vdev_mirror.c

index 42f7c97134d6da747c2f6ba1494163c14c3969ee..99b35f08521e60cbafc6007129fcd9581aeebe82 100644 (file)
--- a/module/zfs/vdev_mirror.c
+++ b/module/zfs/vdev_mirror.c
@@ -89,7 +89,7 @@ static const zio_vsd_ops_t vdev_mirror_vsd_ops = {
  static int
  vdev_mirror_pending(vdev_t *vd)
  {
-       return (avl_numnodes(&vd->vdev_queue.vq_pending_tree));
+       return (avl_numnodes(&vd->vdev_queue.vq_active_tree));
  }
  
  /*
@@ -499,7 +499,7 @@ vdev_mirror_io_done(zio_t *zio)
                         zio_nowait(zio_vdev_child_io(zio, zio->io_bp,
                             mc->mc_vd, mc->mc_offset,
                             zio->io_data, zio->io_size,
-                           ZIO_TYPE_WRITE, zio->io_priority,
+                           ZIO_TYPE_WRITE, ZIO_PRIORITY_ASYNC_WRITE,
                             ZIO_FLAG_IO_REPAIR | (unexpected_errors ?
                             ZIO_FLAG_SELF_HEAL : 0), NULL, NULL));
                 }
diff --git a/module/zfs/vdev_queue.c b/module/zfs/vdev_queue.c

index 06a641087bf5ffa620434920daec3cb4bac2d7ff..2e1f098a179a9b7c71a61929eae526748f943197 100644 (file)
--- a/module/zfs/vdev_queue.c
+++ b/module/zfs/vdev_queue.c
@@ -24,7 +24,7 @@
   */
  
  /*
- * Copyright (c) 2012 by Delphix. All rights reserved.
+ * Copyright (c) 2013 by Delphix. All rights reserved.
   */
  
  #include <sys/zfs_context.h>
@@ -32,29 +32,134 @@
  #include <sys/spa_impl.h>
  #include <sys/zio.h>
  #include <sys/avl.h>
+#include <sys/dsl_pool.h>
+#include <sys/spa.h>
+#include <sys/spa_impl.h>
  #include <sys/kstat.h>
  
  /*
- * These tunables are for performance analysis.
+ * ZFS I/O Scheduler
+ * ---------------
+ *
+ * ZFS issues I/O operations to leaf vdevs to satisfy and complete zios.  The
+ * I/O scheduler determines when and in what order those operations are
+ * issued.  The I/O scheduler divides operations into five I/O classes
+ * prioritized in the following order: sync read, sync write, async read,
+ * async write, and scrub/resilver.  Each queue defines the minimum and
+ * maximum number of concurrent operations that may be issued to the device.
+ * In addition, the device has an aggregate maximum. Note that the sum of the
+ * per-queue minimums must not exceed the aggregate maximum. If the
+ * sum of the per-queue maximums exceeds the aggregate maximum, then the
+ * number of active i/os may reach zfs_vdev_max_active, in which case no
+ * further i/os will be issued regardless of whether all per-queue
+ * minimums have been met.
+ *
+ * For many physical devices, throughput increases with the number of
+ * concurrent operations, but latency typically suffers. Further, physical
+ * devices typically have a limit at which more concurrent operations have no
+ * effect on throughput or can actually cause it to decrease.
+ *
+ * The scheduler selects the next operation to issue by first looking for an
+ * I/O class whose minimum has not been satisfied. Once all are satisfied and
+ * the aggregate maximum has not been hit, the scheduler looks for classes
+ * whose maximum has not been satisfied. Iteration through the I/O classes is
+ * done in the order specified above. No further operations are issued if the
+ * aggregate maximum number of concurrent operations has been hit or if there
+ * are no operations queued for an I/O class that has not hit its maximum.
+ * Every time an i/o is queued or an operation completes, the I/O scheduler
+ * looks for new operations to issue.
+ *
+ * All I/O classes have a fixed maximum number of outstanding operations
+ * except for the async write class. Asynchronous writes represent the data
+ * that is committed to stable storage during the syncing stage for
+ * transaction groups (see txg.c). Transaction groups enter the syncing state
+ * periodically so the number of queued async writes will quickly burst up and
+ * then bleed down to zero. Rather than servicing them as quickly as possible,
+ * the I/O scheduler changes the maximum number of active async write i/os
+ * according to the amount of dirty data in the pool (see dsl_pool.c). Since
+ * both throughput and latency typically increase with the number of
+ * concurrent operations issued to physical devices, reducing the burstiness
+ * in the number of concurrent operations also stabilizes the response time of
+ * operations from other -- and in particular synchronous -- queues. In broad
+ * strokes, the I/O scheduler will issue more concurrent operations from the
+ * async write queue as there's more dirty data in the pool.
+ *
+ * Async Writes
+ *
+ * The number of concurrent operations issued for the async write I/O class
+ * follows a piece-wise linear function defined by a few adjustable points.
+ *
+ *        |                   o---------| <-- zfs_vdev_async_write_max_active
+ *   ^    |                  /^         |
+ *   |    |                 / |         |
+ * active |                /  |         |
+ *  I/O   |               /   |         |
+ * count  |              /    |         |
+ *        |             /     |         |
+ *        |------------o      |         | <-- zfs_vdev_async_write_min_active
+ *       0|____________^______|_________|
+ *        0%           |      |       100% of zfs_dirty_data_max
+ *                     |      |
+ *                     |      `-- zfs_vdev_async_write_active_max_dirty_percent
+ *                     `--------- zfs_vdev_async_write_active_min_dirty_percent
+ *
+ * Until the amount of dirty data exceeds a minimum percentage of the dirty
+ * data allowed in the pool, the I/O scheduler will limit the number of
+ * concurrent operations to the minimum. As that threshold is crossed, the
+ * number of concurrent operations issued increases linearly to the maximum at
+ * the specified maximum percentage of the dirty data allowed in the pool.
+ *
+ * Ideally, the amount of dirty data on a busy pool will stay in the sloped
+ * part of the function between zfs_vdev_async_write_active_min_dirty_percent
+ * and zfs_vdev_async_write_active_max_dirty_percent. If it exceeds the
+ * maximum percentage, this indicates that the rate of incoming data is
+ * greater than the rate that the backend storage can handle. In this case, we
+ * must further throttle incoming writes (see dmu_tx_delay() for details).
   */
  
-/* The maximum number of I/Os concurrently pending to each device. */
-int zfs_vdev_max_pending = 10;
-
  /*
- * The initial number of I/Os pending to each device, before it starts ramping
- * up to zfs_vdev_max_pending.
+ * The maximum number of i/os active to each device.  Ideally, this will be >=
+ * the sum of each queue's max_active.  It must be at least the sum of each
+ * queue's min_active.
   */
-int zfs_vdev_min_pending = 4;
+uint32_t zfs_vdev_max_active = 1000;
  
  /*
- * The deadlines are grouped into buckets based on zfs_vdev_time_shift:
- * deadline = pri + gethrtime() >> time_shift)
+ * Per-queue limits on the number of i/os active to each device.  If the
+ * number of active i/os is < zfs_vdev_max_active, then the min_active comes
+ * into play. We will send min_active from each queue, and then select from
+ * queues in the order defined by zio_priority_t.
+ *
+ * In general, smaller max_active's will lead to lower latency of synchronous
+ * operations.  Larger max_active's may lead to higher overall throughput,
+ * depending on underlying storage.
+ *
+ * The ratio of the queues' max_actives determines the balance of performance
+ * between reads, writes, and scrubs.  E.g., increasing
+ * zfs_vdev_scrub_max_active will cause the scrub or resilver to complete
+ * more quickly, but reads and writes to have higher latency and lower
+ * throughput.
   */
-int zfs_vdev_time_shift = 29; /* each bucket is 0.537 seconds */
+uint32_t zfs_vdev_sync_read_min_active = 10;
+uint32_t zfs_vdev_sync_read_max_active = 10;
+uint32_t zfs_vdev_sync_write_min_active = 10;
+uint32_t zfs_vdev_sync_write_max_active = 10;
+uint32_t zfs_vdev_async_read_min_active = 1;
+uint32_t zfs_vdev_async_read_max_active = 3;
+uint32_t zfs_vdev_async_write_min_active = 1;
+uint32_t zfs_vdev_async_write_max_active = 10;
+uint32_t zfs_vdev_scrub_min_active = 1;
+uint32_t zfs_vdev_scrub_max_active = 2;
  
-/* exponential I/O issue ramp-up rate */
-int zfs_vdev_ramp_rate = 2;
+/*
+ * When the pool has less than zfs_vdev_async_write_active_min_dirty_percent
+ * dirty data, use zfs_vdev_async_write_min_active.  When it has more than
+ * zfs_vdev_async_write_active_max_dirty_percent, use
+ * zfs_vdev_async_write_max_active. The value is linearly interpolated
+ * between min and max.
+ */
+int zfs_vdev_async_write_active_min_dirty_percent = 30;
+int zfs_vdev_async_write_active_max_dirty_percent = 60;
  
  /*
   * To reduce IOPs, we aggregate small adjacent I/Os into one large I/O.
@@ -66,20 +171,12 @@ int zfs_vdev_aggregation_limit = SPA_MAXBLOCKSIZE;
  int zfs_vdev_read_gap_limit = 32 << 10;
  int zfs_vdev_write_gap_limit = 4 << 10;
  
-/*
- * Virtual device vector for disk I/O scheduling.
- */
  int
-vdev_queue_deadline_compare(const void *x1, const void *x2)
+vdev_queue_offset_compare(const void *x1, const void *x2)
  {
         const zio_t *z1 = x1;
         const zio_t *z2 = x2;
  
-       if (z1->io_deadline < z2->io_deadline)
-               return (-1);
-       if (z1->io_deadline > z2->io_deadline)
-               return (1);
-
         if (z1->io_offset < z2->io_offset)
                 return (-1);
         if (z1->io_offset > z2->io_offset)
@@ -94,14 +191,14 @@ vdev_queue_deadline_compare(const void *x1, const void *x2)
  }
  
  int
-vdev_queue_offset_compare(const void *x1, const void *x2)
+vdev_queue_timestamp_compare(const void *x1, const void *x2)
  {
         const zio_t *z1 = x1;
         const zio_t *z2 = x2;
  
-       if (z1->io_offset < z2->io_offset)
+       if (z1->io_timestamp < z2->io_timestamp)
                 return (-1);
-       if (z1->io_offset > z2->io_offset)
+       if (z1->io_timestamp > z2->io_timestamp)
                 return (1);
  
         if (z1 < z2)
@@ -112,25 +209,141 @@ vdev_queue_offset_compare(const void *x1, const void *x2)
         return (0);
  }
  
+static int
+vdev_queue_class_min_active(zio_priority_t p)
+{
+       switch (p) {
+       case ZIO_PRIORITY_SYNC_READ:
+               return (zfs_vdev_sync_read_min_active);
+       case ZIO_PRIORITY_SYNC_WRITE:
+               return (zfs_vdev_sync_write_min_active);
+       case ZIO_PRIORITY_ASYNC_READ:
+               return (zfs_vdev_async_read_min_active);
+       case ZIO_PRIORITY_ASYNC_WRITE:
+               return (zfs_vdev_async_write_min_active);
+       case ZIO_PRIORITY_SCRUB:
+               return (zfs_vdev_scrub_min_active);
+       default:
+               panic("invalid priority %u", p);
+               return (0);
+       }
+}
+
+static int
+vdev_queue_max_async_writes(uint64_t dirty)
+{
+       int writes;
+       uint64_t min_bytes = zfs_dirty_data_max *
+           zfs_vdev_async_write_active_min_dirty_percent / 100;
+       uint64_t max_bytes = zfs_dirty_data_max *
+           zfs_vdev_async_write_active_max_dirty_percent / 100;
+
+       if (dirty < min_bytes)
+               return (zfs_vdev_async_write_min_active);
+       if (dirty > max_bytes)
+               return (zfs_vdev_async_write_max_active);
+
+       /*
+        * linear interpolation:
+        * slope = (max_writes - min_writes) / (max_bytes - min_bytes)
+        * move right by min_bytes
+        * move up by min_writes
+        */
+       writes = (dirty - min_bytes) *
+           (zfs_vdev_async_write_max_active -
+           zfs_vdev_async_write_min_active) /
+           (max_bytes - min_bytes) +
+           zfs_vdev_async_write_min_active;
+       ASSERT3U(writes, >=, zfs_vdev_async_write_min_active);
+       ASSERT3U(writes, <=, zfs_vdev_async_write_max_active);
+       return (writes);
+}
+
+static int
+vdev_queue_class_max_active(spa_t *spa, zio_priority_t p)
+{
+       switch (p) {
+       case ZIO_PRIORITY_SYNC_READ:
+               return (zfs_vdev_sync_read_max_active);
+       case ZIO_PRIORITY_SYNC_WRITE:
+               return (zfs_vdev_sync_write_max_active);
+       case ZIO_PRIORITY_ASYNC_READ:
+               return (zfs_vdev_async_read_max_active);
+       case ZIO_PRIORITY_ASYNC_WRITE:
+               return (vdev_queue_max_async_writes(
+                   spa->spa_dsl_pool->dp_dirty_total));
+       case ZIO_PRIORITY_SCRUB:
+               return (zfs_vdev_scrub_max_active);
+       default:
+               panic("invalid priority %u", p);
+               return (0);
+       }
+}
+
+/*
+ * Return the i/o class to issue from, or ZIO_PRIORITY_NUM_QUEUEABLE if
+ * there is no eligible class.
+ */
+static zio_priority_t
+vdev_queue_class_to_issue(vdev_queue_t *vq)
+{
+       spa_t *spa = vq->vq_vdev->vdev_spa;
+       zio_priority_t p;
+
+       if (avl_numnodes(&vq->vq_active_tree) >= zfs_vdev_max_active)
+               return (ZIO_PRIORITY_NUM_QUEUEABLE);
+
+       /* find a queue that has not reached its minimum # outstanding i/os */
+       for (p = 0; p < ZIO_PRIORITY_NUM_QUEUEABLE; p++) {
+               if (avl_numnodes(&vq->vq_class[p].vqc_queued_tree) > 0 &&
+                   vq->vq_class[p].vqc_active <
+                   vdev_queue_class_min_active(p))
+                       return (p);
+       }
+
+       /*
+        * If we haven't found a queue, look for one that hasn't reached its
+        * maximum # outstanding i/os.
+        */
+       for (p = 0; p < ZIO_PRIORITY_NUM_QUEUEABLE; p++) {
+               if (avl_numnodes(&vq->vq_class[p].vqc_queued_tree) > 0 &&
+                   vq->vq_class[p].vqc_active <
+                   vdev_queue_class_max_active(spa, p))
+                       return (p);
+       }
+
+       /* No eligible queued i/os */
+       return (ZIO_PRIORITY_NUM_QUEUEABLE);
+}
+
  void
  vdev_queue_init(vdev_t *vd)
  {
         vdev_queue_t *vq = &vd->vdev_queue;
+       int max_active_sum;
+       zio_priority_t p;
         int i;
  
         mutex_init(&vq->vq_lock, NULL, MUTEX_DEFAULT, NULL);
+       vq->vq_vdev = vd;
  
-       avl_create(&vq->vq_deadline_tree, vdev_queue_deadline_compare,
-           sizeof (zio_t), offsetof(struct zio, io_deadline_node));
-
-       avl_create(&vq->vq_read_tree, vdev_queue_offset_compare,
-           sizeof (zio_t), offsetof(struct zio, io_offset_node));
-
-       avl_create(&vq->vq_write_tree, vdev_queue_offset_compare,
-           sizeof (zio_t), offsetof(struct zio, io_offset_node));
+       avl_create(&vq->vq_active_tree, vdev_queue_offset_compare,
+           sizeof (zio_t), offsetof(struct zio, io_queue_node));
  
-       avl_create(&vq->vq_pending_tree, vdev_queue_offset_compare,
-           sizeof (zio_t), offsetof(struct zio, io_offset_node));
+       for (p = 0; p < ZIO_PRIORITY_NUM_QUEUEABLE; p++) {
+               /*
+                * The synchronous i/o queues are FIFO rather than LBA ordered.
+                * This provides more consistent latency for these i/os, and
+                * they tend to not be tightly clustered anyway so there is
+                * little to no throughput loss.
+                */
+               boolean_t fifo = (p == ZIO_PRIORITY_SYNC_READ ||
+                   p == ZIO_PRIORITY_SYNC_WRITE);
+               avl_create(&vq->vq_class[p].vqc_queued_tree,
+                   fifo ? vdev_queue_timestamp_compare :
+                   vdev_queue_offset_compare,
+                   sizeof (zio_t), offsetof(struct zio, io_queue_node));
+       }
  
         /*
          * A list of buffers which can be used for aggregate I/O, this
@@ -139,7 +352,10 @@ vdev_queue_init(vdev_t *vd)
         list_create(&vq->vq_io_list, sizeof (vdev_io_t),
             offsetof(vdev_io_t, vi_node));
  
-       for (i = 0; i < zfs_vdev_max_pending; i++)
+       max_active_sum = zfs_vdev_sync_read_max_active +
+           zfs_vdev_sync_write_max_active + zfs_vdev_async_read_max_active +
+           zfs_vdev_async_write_max_active + zfs_vdev_scrub_max_active;
+       for (i = 0; i < max_active_sum; i++)
                 list_insert_tail(&vq->vq_io_list, zio_vdev_alloc());
  }
  
@@ -148,11 +364,11 @@ vdev_queue_fini(vdev_t *vd)
  {
         vdev_queue_t *vq = &vd->vdev_queue;
         vdev_io_t *vi;
+       zio_priority_t p;
  
-       avl_destroy(&vq->vq_deadline_tree);
-       avl_destroy(&vq->vq_read_tree);
-       avl_destroy(&vq->vq_write_tree);
-       avl_destroy(&vq->vq_pending_tree);
+       for (p = 0; p < ZIO_PRIORITY_NUM_QUEUEABLE; p++)
+               avl_destroy(&vq->vq_class[p].vqc_queued_tree);
+       avl_destroy(&vq->vq_active_tree);
  
         while ((vi = list_head(&vq->vq_io_list)) != NULL) {
                 list_remove(&vq->vq_io_list, vi);
@@ -170,8 +386,8 @@ vdev_queue_io_add(vdev_queue_t *vq, zio_t *zio)
         spa_t *spa = zio->io_spa;
         spa_stats_history_t *ssh = &spa->spa_stats.io_history;
  
-       avl_add(&vq->vq_deadline_tree, zio);
-       avl_add(zio->io_vdev_tree, zio);
+       ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE);
+       avl_add(&vq->vq_class[zio->io_priority].vqc_queued_tree, zio);
  
         if (ssh->kstat != NULL) {
                 mutex_enter(&ssh->lock);
@@ -186,8 +402,8 @@ vdev_queue_io_remove(vdev_queue_t *vq, zio_t *zio)
         spa_t *spa = zio->io_spa;
         spa_stats_history_t *ssh = &spa->spa_stats.io_history;
  
-       avl_remove(&vq->vq_deadline_tree, zio);
-       avl_remove(zio->io_vdev_tree, zio);
+       ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE);
+       avl_remove(&vq->vq_class[zio->io_priority].vqc_queued_tree, zio);
  
         if (ssh->kstat != NULL) {
                 mutex_enter(&ssh->lock);
@@ -202,7 +418,10 @@ vdev_queue_pending_add(vdev_queue_t *vq, zio_t *zio)
         spa_t *spa = zio->io_spa;
         spa_stats_history_t *ssh = &spa->spa_stats.io_history;
  
-       avl_add(&vq->vq_pending_tree, zio);
+       ASSERT(MUTEX_HELD(&vq->vq_lock));
+       ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE);
+       vq->vq_class[zio->io_priority].vqc_active++;
+       avl_add(&vq->vq_active_tree, zio);
  
         if (ssh->kstat != NULL) {
                 mutex_enter(&ssh->lock);
@@ -217,7 +436,10 @@ vdev_queue_pending_remove(vdev_queue_t *vq, zio_t *zio)
         spa_t *spa = zio->io_spa;
         spa_stats_history_t *ssh = &spa->spa_stats.io_history;
  
-       avl_remove(&vq->vq_pending_tree, zio);
+       ASSERT(MUTEX_HELD(&vq->vq_lock));
+       ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE);
+       vq->vq_class[zio->io_priority].vqc_active--;
+       avl_remove(&vq->vq_active_tree, zio);
  
         if (ssh->kstat != NULL) {
                 kstat_io_t *ksio = ssh->kstat->ks_data;
@@ -240,12 +462,14 @@ vdev_queue_agg_io_done(zio_t *aio)
  {
         vdev_queue_t *vq = &aio->io_vd->vdev_queue;
         vdev_io_t *vi = aio->io_data;
-       zio_t *pio;
  
-       while ((pio = zio_walk_parents(aio)) != NULL)
-               if (aio->io_type == ZIO_TYPE_READ)
+       if (aio->io_type == ZIO_TYPE_READ) {
+               zio_t *pio;
+               while ((pio = zio_walk_parents(aio)) != NULL) {
                         bcopy((char *)aio->io_data + (pio->io_offset -
                             aio->io_offset), pio->io_data, pio->io_size);
+               }
+       }
  
         mutex_enter(&vq->vq_lock);
         list_insert_tail(&vq->vq_io_list, vi);
@@ -262,28 +486,38 @@ vdev_queue_agg_io_done(zio_t *aio)
  #define        IO_GAP(fio, lio) (-IO_SPAN(lio, fio))
  
  static zio_t *
-vdev_queue_io_to_issue(vdev_queue_t *vq, uint64_t pending_limit)
+vdev_queue_aggregate(vdev_queue_t *vq, zio_t *zio)
  {
-       zio_t *fio, *lio, *aio, *dio, *nio, *mio;
-       avl_tree_t *t;
         vdev_io_t *vi;
-       int flags;
-       uint64_t maxspan = MIN(zfs_vdev_aggregation_limit, SPA_MAXBLOCKSIZE);
-       uint64_t maxgap;
-       int stretch;
+       zio_t *first, *last, *aio, *dio, *mandatory, *nio;
+       uint64_t maxgap = 0;
+       uint64_t size;
+       boolean_t stretch = B_FALSE;
+       vdev_queue_class_t *vqc = &vq->vq_class[zio->io_priority];
+       avl_tree_t *t = &vqc->vqc_queued_tree;
+       enum zio_flag flags = zio->io_flags & ZIO_FLAG_AGG_INHERIT;
+
+       if (zio->io_flags & ZIO_FLAG_DONT_AGGREGATE)
+               return (NULL);
  
-again:
-       ASSERT(MUTEX_HELD(&vq->vq_lock));
+       /* Prevent users from setting the zfs_vdev_aggregation_limit
+        * tuning larger than SPA_MAXBLOCKSIZE. */
+       zfs_vdev_aggregation_limit =
+           MIN(zfs_vdev_aggregation_limit, SPA_MAXBLOCKSIZE);
  
-       if (avl_numnodes(&vq->vq_pending_tree) >= pending_limit ||
-           avl_numnodes(&vq->vq_deadline_tree) == 0)
+       /*
+        * The synchronous i/o queues are not sorted by LBA, so we can't
+        * find adjacent i/os.  These i/os tend to be either not tightly
+        * clustered or too large to aggregate, so little performance is lost.
+        */
+       if (zio->io_priority == ZIO_PRIORITY_SYNC_READ ||
+           zio->io_priority == ZIO_PRIORITY_SYNC_WRITE)
                 return (NULL);
  
-       fio = lio = avl_first(&vq->vq_deadline_tree);
+       first = last = zio;
  
-       t = fio->io_vdev_tree;
-       flags = fio->io_flags & ZIO_FLAG_AGG_INHERIT;
-       maxgap = (t == &vq->vq_read_tree) ? zfs_vdev_read_gap_limit : 0;
+       if (zio->io_type == ZIO_TYPE_READ)
+               maxgap = zfs_vdev_read_gap_limit;
  
         vi = list_head(&vq->vq_io_list);
         if (vi == NULL) {
@@ -291,134 +525,172 @@ again:
                 list_insert_head(&vq->vq_io_list, vi);
         }
  
-       if (!(flags & ZIO_FLAG_DONT_AGGREGATE)) {
-               /*
-                * We can aggregate I/Os that are sufficiently adjacent and of
-                * the same flavor, as expressed by the AGG_INHERIT flags.
-                * The latter requirement is necessary so that certain
-                * attributes of the I/O, such as whether it's a normal I/O
-                * or a scrub/resilver, can be preserved in the aggregate.
-                * We can include optional I/Os, but don't allow them
-                * to begin a range as they add no benefit in that situation.
-                */
+       /*
+        * We can aggregate I/Os that are sufficiently adjacent and of
+        * the same flavor, as expressed by the AGG_INHERIT flags.
+        * The latter requirement is necessary so that certain
+        * attributes of the I/O, such as whether it's a normal I/O
+        * or a scrub/resilver, can be preserved in the aggregate.
+        * We can include optional I/Os, but don't allow them
+        * to begin a range as they add no benefit in that situation.
+        */
  
-               /*
-                * We keep track of the last non-optional I/O.
-                */
-               mio = (fio->io_flags & ZIO_FLAG_OPTIONAL) ? NULL : fio;
+       /*
+        * We keep track of the last non-optional I/O.
+        */
+       mandatory = (first->io_flags & ZIO_FLAG_OPTIONAL) ? NULL : first;
  
-               /*
-                * Walk backwards through sufficiently contiguous I/Os
-                * recording the last non-option I/O.
-                */
-               while ((dio = AVL_PREV(t, fio)) != NULL &&
-                   (dio->io_flags & ZIO_FLAG_AGG_INHERIT) == flags &&
-                   IO_SPAN(dio, lio) <= maxspan &&
-                   IO_GAP(dio, fio) <= maxgap) {
-                       fio = dio;
-                       if (mio == NULL && !(fio->io_flags & ZIO_FLAG_OPTIONAL))
-                               mio = fio;
-               }
+       /*
+        * Walk backwards through sufficiently contiguous I/Os
+        * recording the last non-optional I/O.
+        */
+       while ((dio = AVL_PREV(t, first)) != NULL &&
+           (dio->io_flags & ZIO_FLAG_AGG_INHERIT) == flags &&
+           IO_SPAN(dio, last) <= zfs_vdev_aggregation_limit &&
+           IO_GAP(dio, first) <= maxgap) {
+               first = dio;
+               if (mandatory == NULL && !(first->io_flags & ZIO_FLAG_OPTIONAL))
+                       mandatory = first;
+       }
  
-               /*
-                * Skip any initial optional I/Os.
-                */
-               while ((fio->io_flags & ZIO_FLAG_OPTIONAL) && fio != lio) {
-                       fio = AVL_NEXT(t, fio);
-                       ASSERT(fio != NULL);
-               }
+       /*
+        * Skip any initial optional I/Os.
+        */
+       while ((first->io_flags & ZIO_FLAG_OPTIONAL) && first != last) {
+               first = AVL_NEXT(t, first);
+               ASSERT(first != NULL);
+       }
  
-               /*
-                * Walk forward through sufficiently contiguous I/Os.
-                */
-               while ((dio = AVL_NEXT(t, lio)) != NULL &&
-                   (dio->io_flags & ZIO_FLAG_AGG_INHERIT) == flags &&
-                   IO_SPAN(fio, dio) <= maxspan &&
-                   IO_GAP(lio, dio) <= maxgap) {
-                       lio = dio;
-                       if (!(lio->io_flags & ZIO_FLAG_OPTIONAL))
-                               mio = lio;
-               }
  
-               /*
-                * Now that we've established the range of the I/O aggregation
-                * we must decide what to do with trailing optional I/Os.
-                * For reads, there's nothing to do. While we are unable to
-                * aggregate further, it's possible that a trailing optional
-                * I/O would allow the underlying device to aggregate with
-                * subsequent I/Os. We must therefore determine if the next
-                * non-optional I/O is close enough to make aggregation
-                * worthwhile.
-                */
-               stretch = B_FALSE;
-               if (t != &vq->vq_read_tree && mio != NULL) {
-                       nio = lio;
-                       while ((dio = AVL_NEXT(t, nio)) != NULL &&
-                           IO_GAP(nio, dio) == 0 &&
-                           IO_GAP(mio, dio) <= zfs_vdev_write_gap_limit) {
-                               nio = dio;
-                               if (!(nio->io_flags & ZIO_FLAG_OPTIONAL)) {
-                                       stretch = B_TRUE;
-                                       break;
-                               }
+       /*
+        * Walk forward through sufficiently contiguous I/Os.
+        */
+       while ((dio = AVL_NEXT(t, last)) != NULL &&
+           (dio->io_flags & ZIO_FLAG_AGG_INHERIT) == flags &&
+           IO_SPAN(first, dio) <= zfs_vdev_aggregation_limit &&
+           IO_GAP(last, dio) <= maxgap) {
+               last = dio;
+               if (!(last->io_flags & ZIO_FLAG_OPTIONAL))
+                       mandatory = last;
+       }
+
+       /*
+        * Now that we've established the range of the I/O aggregation
+        * we must decide what to do with trailing optional I/Os.
+        * For reads, there's nothing to do. While we are unable to
+        * aggregate further, it's possible that a trailing optional
+        * I/O would allow the underlying device to aggregate with
+        * subsequent I/Os. We must therefore determine if the next
+        * non-optional I/O is close enough to make aggregation
+        * worthwhile.
+        */
+       if (zio->io_type == ZIO_TYPE_WRITE && mandatory != NULL) {
+               zio_t *nio = last;
+               while ((dio = AVL_NEXT(t, nio)) != NULL &&
+                   IO_GAP(nio, dio) == 0 &&
+                   IO_GAP(mandatory, dio) <= zfs_vdev_write_gap_limit) {
+                       nio = dio;
+                       if (!(nio->io_flags & ZIO_FLAG_OPTIONAL)) {
+                               stretch = B_TRUE;
+                               break;
                         }
                 }
+       }
  
-               if (stretch) {
-                       /* This may be a no-op. */
-                       VERIFY((dio = AVL_NEXT(t, lio)) != NULL);
-                       dio->io_flags &= ~ZIO_FLAG_OPTIONAL;
-               } else {
-                       while (lio != mio && lio != fio) {
-                               ASSERT(lio->io_flags & ZIO_FLAG_OPTIONAL);
-                               lio = AVL_PREV(t, lio);
-                               ASSERT(lio != NULL);
-                       }
+       if (stretch) {
+               /* This may be a no-op. */
+               dio = AVL_NEXT(t, last);
+               dio->io_flags &= ~ZIO_FLAG_OPTIONAL;
+       } else {
+               while (last != mandatory && last != first) {
+                       ASSERT(last->io_flags & ZIO_FLAG_OPTIONAL);
+                       last = AVL_PREV(t, last);
+                       ASSERT(last != NULL);
                 }
         }
  
-       if (fio != lio) {
-               uint64_t size = IO_SPAN(fio, lio);
-               ASSERT(size <= maxspan);
-               ASSERT(vi != NULL);
-
-               aio = zio_vdev_delegated_io(fio->io_vd, fio->io_offset,
-                   vi, size, fio->io_type, ZIO_PRIORITY_AGG,
-                   flags | ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE,
-                   vdev_queue_agg_io_done, NULL);
-               aio->io_timestamp = fio->io_timestamp;
-
-               nio = fio;
-               do {
-                       dio = nio;
-                       nio = AVL_NEXT(t, dio);
-                       ASSERT(dio->io_type == aio->io_type);
-                       ASSERT(dio->io_vdev_tree == t);
-
-                       if (dio->io_flags & ZIO_FLAG_NODATA) {
-                               ASSERT(dio->io_type == ZIO_TYPE_WRITE);
-                               bzero((char *)aio->io_data + (dio->io_offset -
-                                   aio->io_offset), dio->io_size);
-                       } else if (dio->io_type == ZIO_TYPE_WRITE) {
-                               bcopy(dio->io_data, (char *)aio->io_data +
-                                   (dio->io_offset - aio->io_offset),
-                                   dio->io_size);
-                       }
+       if (first == last)
+               return (NULL);
+
+       ASSERT(vi != NULL);
+
+       size = IO_SPAN(first, last);
+       ASSERT3U(size, <=, zfs_vdev_aggregation_limit);
+
+       aio = zio_vdev_delegated_io(first->io_vd, first->io_offset,
+           vi, size, first->io_type, zio->io_priority,
+           flags | ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE,
+           vdev_queue_agg_io_done, NULL);
+       aio->io_timestamp = first->io_timestamp;
+
+       nio = first;
+       do {
+               dio = nio;
+               nio = AVL_NEXT(t, dio);
+               ASSERT3U(dio->io_type, ==, aio->io_type);
+
+               if (dio->io_flags & ZIO_FLAG_NODATA) {
+                       ASSERT3U(dio->io_type, ==, ZIO_TYPE_WRITE);
+                       bzero((char *)aio->io_data + (dio->io_offset -
+                           aio->io_offset), dio->io_size);
+               } else if (dio->io_type == ZIO_TYPE_WRITE) {
+                       bcopy(dio->io_data, (char *)aio->io_data +
+                           (dio->io_offset - aio->io_offset),
+                           dio->io_size);
+               }
  
-                       zio_add_child(dio, aio);
-                       vdev_queue_io_remove(vq, dio);
-                       zio_vdev_io_bypass(dio);
-                       zio_execute(dio);
-               } while (dio != lio);
+               zio_add_child(dio, aio);
+               vdev_queue_io_remove(vq, dio);
+               zio_vdev_io_bypass(dio);
+               zio_execute(dio);
+       } while (dio != last);
  
-               vdev_queue_pending_add(vq, aio);
-               list_remove(&vq->vq_io_list, vi);
+       list_remove(&vq->vq_io_list, vi);
+
+       return (aio);
+}
+
+static zio_t *
+vdev_queue_io_to_issue(vdev_queue_t *vq)
+{
+       zio_t *zio, *aio;
+       zio_priority_t p;
+       avl_index_t idx;
+       vdev_queue_class_t *vqc;
+       zio_t *search;
+
+again:
+       ASSERT(MUTEX_HELD(&vq->vq_lock));
+
+       p = vdev_queue_class_to_issue(vq);
  
-               return (aio);
+       if (p == ZIO_PRIORITY_NUM_QUEUEABLE) {
+               /* No eligible queued i/os */
+               return (NULL);
         }
  
-       ASSERT(fio->io_vdev_tree == t);
-       vdev_queue_io_remove(vq, fio);
+       /*
+        * For LBA-ordered queues (async / scrub), issue the i/o which follows
+        * the most recently issued i/o in LBA (offset) order.
+        *
+        * For FIFO queues (sync), issue the i/o with the lowest timestamp.
+        */
+       vqc = &vq->vq_class[p];
+       search = zio_buf_alloc(sizeof(*search));
+       search->io_timestamp = 0;
+       search->io_offset = vq->vq_last_offset + 1;
+       VERIFY3P(avl_find(&vqc->vqc_queued_tree, search, &idx), ==, NULL);
+       zio_buf_free(search, sizeof(*search));
+       zio = avl_nearest(&vqc->vqc_queued_tree, idx, AVL_AFTER);
+       if (zio == NULL)
+               zio = avl_first(&vqc->vqc_queued_tree);
+       ASSERT3U(zio->io_priority, ==, p);
+
+       aio = vdev_queue_aggregate(vq, zio);
+       if (aio != NULL)
+               zio = aio;
+       else
+               vdev_queue_io_remove(vq, zio);
  
         /*
          * If the I/O is or was optional and therefore has no data, we need to
@@ -426,17 +698,18 @@ again:
          * deadlock that we could encounter since this I/O will complete
          * immediately.
          */
-       if (fio->io_flags & ZIO_FLAG_NODATA) {
+       if (zio->io_flags & ZIO_FLAG_NODATA) {
                 mutex_exit(&vq->vq_lock);
-               zio_vdev_io_bypass(fio);
-               zio_execute(fio);
+               zio_vdev_io_bypass(zio);
+               zio_execute(zio);
                 mutex_enter(&vq->vq_lock);
                 goto again;
         }
  
-       vdev_queue_pending_add(vq, fio);
+       vdev_queue_pending_add(vq, zio);
+       vq->vq_last_offset = zio->io_offset;
  
-       return (fio);
+       return (zio);
  }
  
  zio_t *
@@ -445,28 +718,31 @@ vdev_queue_io(zio_t *zio)
         vdev_queue_t *vq = &zio->io_vd->vdev_queue;
         zio_t *nio;
  
-       ASSERT(zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE);
-
         if (zio->io_flags & ZIO_FLAG_DONT_QUEUE)
                 return (zio);
  
-       zio->io_flags |= ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE;
+       /*
+        * Children i/os inherit their parent's priority, which might
+        * not match the child's i/o type.  Fix it up here.
+        */
+       if (zio->io_type == ZIO_TYPE_READ) {
+               if (zio->io_priority != ZIO_PRIORITY_SYNC_READ &&
+                   zio->io_priority != ZIO_PRIORITY_ASYNC_READ &&
+                   zio->io_priority != ZIO_PRIORITY_SCRUB)
+                       zio->io_priority = ZIO_PRIORITY_ASYNC_READ;
+       } else {
+               ASSERT(zio->io_type == ZIO_TYPE_WRITE);
+               if (zio->io_priority != ZIO_PRIORITY_SYNC_WRITE &&
+                   zio->io_priority != ZIO_PRIORITY_ASYNC_WRITE)
+                       zio->io_priority = ZIO_PRIORITY_ASYNC_WRITE;
+       }
  
-       if (zio->io_type == ZIO_TYPE_READ)
-               zio->io_vdev_tree = &vq->vq_read_tree;
-       else
-               zio->io_vdev_tree = &vq->vq_write_tree;
+       zio->io_flags |= ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE;
  
         mutex_enter(&vq->vq_lock);
-
         zio->io_timestamp = gethrtime();
-       zio->io_deadline = (zio->io_timestamp >> zfs_vdev_time_shift) +
-           zio->io_priority;
-
         vdev_queue_io_add(vq, zio);
-
-       nio = vdev_queue_io_to_issue(vq, zfs_vdev_min_pending);
-
+       nio = vdev_queue_io_to_issue(vq);
         mutex_exit(&vq->vq_lock);
  
         if (nio == NULL)
@@ -484,7 +760,7 @@ void
  vdev_queue_io_done(zio_t *zio)
  {
         vdev_queue_t *vq = &zio->io_vd->vdev_queue;
-       int i;
+       zio_t *nio;
  
         if (zio_injection_enabled)
                 delay(SEC_TO_TICK(zio_handle_io_delay(zio)));
@@ -497,10 +773,7 @@ vdev_queue_io_done(zio_t *zio)
         vq->vq_io_complete_ts = gethrtime();
         vq->vq_io_delta_ts = vq->vq_io_complete_ts - zio->io_timestamp;
  
-       for (i = 0; i < zfs_vdev_ramp_rate; i++) {
-               zio_t *nio = vdev_queue_io_to_issue(vq, zfs_vdev_max_pending);
-               if (nio == NULL)
-                       break;
+       while ((nio = vdev_queue_io_to_issue(vq)) != NULL) {
                 mutex_exit(&vq->vq_lock);
                 if (nio->io_done == vdev_queue_agg_io_done) {
                         zio_nowait(nio);
@@ -515,24 +788,61 @@ vdev_queue_io_done(zio_t *zio)
  }
  
  #if defined(_KERNEL) && defined(HAVE_SPL)
-module_param(zfs_vdev_max_pending, int, 0644);
-MODULE_PARM_DESC(zfs_vdev_max_pending, "Max pending per-vdev I/Os");
-
-module_param(zfs_vdev_min_pending, int, 0644);
-MODULE_PARM_DESC(zfs_vdev_min_pending, "Min pending per-vdev I/Os");
-
  module_param(zfs_vdev_aggregation_limit, int, 0644);
  MODULE_PARM_DESC(zfs_vdev_aggregation_limit, "Max vdev I/O aggregation size");
  
-module_param(zfs_vdev_time_shift, int, 0644);
-MODULE_PARM_DESC(zfs_vdev_time_shift, "Deadline time shift for vdev I/O");
-
-module_param(zfs_vdev_ramp_rate, int, 0644);
-MODULE_PARM_DESC(zfs_vdev_ramp_rate, "Exponential I/O issue ramp-up rate");
-
  module_param(zfs_vdev_read_gap_limit, int, 0644);
  MODULE_PARM_DESC(zfs_vdev_read_gap_limit, "Aggregate read I/O over gap");
  
  module_param(zfs_vdev_write_gap_limit, int, 0644);
  MODULE_PARM_DESC(zfs_vdev_write_gap_limit, "Aggregate write I/O over gap");
+
+module_param(zfs_vdev_max_active, int, 0644);
+MODULE_PARM_DESC(zfs_vdev_max_active, "Maximum number of active I/Os per vdev");
+
+module_param(zfs_vdev_async_write_active_max_dirty_percent, int, 0644);
+MODULE_PARM_DESC(zfs_vdev_async_write_active_max_dirty_percent,
+    "Async write concurrency max threshold");
+
+module_param(zfs_vdev_async_write_active_min_dirty_percent, int, 0644);
+MODULE_PARM_DESC(zfs_vdev_async_write_active_min_dirty_percent,
+    "Async write concurrency min threshold");
+
+module_param(zfs_vdev_async_read_max_active, int, 0644);
+MODULE_PARM_DESC(zfs_vdev_async_read_max_active,
+    "Max active async read I/Os per vdev");
+
+module_param(zfs_vdev_async_read_min_active, int, 0644);
+MODULE_PARM_DESC(zfs_vdev_async_read_min_active,
+    "Min active async read I/Os per vdev");
+
+module_param(zfs_vdev_async_write_max_active, int, 0644);
+MODULE_PARM_DESC(zfs_vdev_async_write_max_active,
+    "Max active async write I/Os per vdev");
+
+module_param(zfs_vdev_async_write_min_active, int, 0644);
+MODULE_PARM_DESC(zfs_vdev_async_write_min_active,
+    "Min active async write I/Os per vdev");
+
+module_param(zfs_vdev_scrub_max_active, int, 0644);
+MODULE_PARM_DESC(zfs_vdev_scrub_max_active, "Max active scrub I/Os per vdev");
+
+module_param(zfs_vdev_scrub_min_active, int, 0644);
+MODULE_PARM_DESC(zfs_vdev_scrub_min_active, "Min active scrub I/Os per vdev");
+
+module_param(zfs_vdev_sync_read_max_active, int, 0644);
+MODULE_PARM_DESC(zfs_vdev_sync_read_max_active,
+    "Max active sync read I/Os per vdev");
+
+module_param(zfs_vdev_sync_read_min_active, int, 0644);
+MODULE_PARM_DESC(zfs_vdev_sync_read_min_active,
+    "Min active sync read I/Os per vdev");
+
+module_param(zfs_vdev_sync_write_max_active, int, 0644);
+MODULE_PARM_DESC(zfs_vdev_sync_write_max_active,
+    "Max active sync write I/Os per vdev");
+
+module_param(zfs_vdev_sync_write_min_active, int, 0644);
+MODULE_PARM_DESC(zfs_vdev_sync_write_min_active,
+    "Min active sync write I/Os per vdev");
  #endif
diff --git a/module/zfs/vdev_raidz.c b/module/zfs/vdev_raidz.c

index 9632e9af75d04c39a60e3152b501691d87f5ca97..4cd21df8938d8ff14c43c76f173ffa59b71b8242 100644 (file)
--- a/module/zfs/vdev_raidz.c
+++ b/module/zfs/vdev_raidz.c
@@ -2188,7 +2188,7 @@ done:
  
                         zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
                             rc->rc_offset, rc->rc_data, rc->rc_size,
-                           ZIO_TYPE_WRITE, zio->io_priority,
+                           ZIO_TYPE_WRITE, ZIO_PRIORITY_ASYNC_WRITE,
                             ZIO_FLAG_IO_REPAIR | (unexpected_errors ?
                             ZIO_FLAG_SELF_HEAL : 0), NULL, NULL));
                 }
diff --git a/module/zfs/zfs_fm.c b/module/zfs/zfs_fm.c

index af2030ae7f3670a57c8289c924ed21a974880867..df47d99cfafa20be862c15f8d2de0547f31ef16d 100644 (file)
--- a/module/zfs/zfs_fm.c
+++ b/module/zfs/zfs_fm.c
@@ -316,8 +316,6 @@ zfs_ereport_start(nvlist_t **ereport_out, nvlist_t **detector_out,
                     DATA_TYPE_UINT64, zio->io_delay, NULL);
                 fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_TIMESTAMP,
                     DATA_TYPE_UINT64, zio->io_timestamp, NULL);
-               fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_DEADLINE,
-                   DATA_TYPE_UINT64, zio->io_deadline, NULL);
                 fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_DELTA,
                     DATA_TYPE_UINT64, zio->io_delta, NULL);
  
diff --git a/module/zfs/zfs_vnops.c b/module/zfs/zfs_vnops.c

index 1552b61e0e3e3e7380438dfa7dd601b42ee10502..6f25a6fff1b80f1c9d2928ddaf17631aca552baf 100644 (file)
--- a/module/zfs/zfs_vnops.c
+++ b/module/zfs/zfs_vnops.c
@@ -125,7 +125,11 @@
   *     forever, because the previous txg can't quiesce until B's tx commits.
   *
   *     If dmu_tx_assign() returns ERESTART and zsb->z_assign is TXG_NOWAIT,
- *     then drop all locks, call dmu_tx_wait(), and try again.
+ *     then drop all locks, call dmu_tx_wait(), and try again.  On subsequent
+ *     calls to dmu_tx_assign(), pass TXG_WAITED rather than TXG_NOWAIT,
+ *     to indicate that this operation has already called dmu_tx_wait().
+ *     This will ensure that we don't retry forever, waiting a short bit
+ *     each time.
   *
   *  (5)        If the operation succeeded, generate the intent log entry for it
   *     before dropping locks.  This ensures that the ordering of events
@@ -147,12 +151,13 @@
   *     rw_enter(...);                  // grab any other locks you need
   *     tx = dmu_tx_create(...);        // get DMU tx
   *     dmu_tx_hold_*();                // hold each object you might modify
- *     error = dmu_tx_assign(tx, TXG_NOWAIT);  // try to assign
+ *     error = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT);
   *     if (error) {
   *             rw_exit(...);           // drop locks
   *             zfs_dirent_unlock(dl);  // unlock directory entry
   *             iput(...);              // release held vnodes
   *             if (error == ERESTART) {
+ *                     waited = B_TRUE;
   *                     dmu_tx_wait(tx);
   *                     dmu_tx_abort(tx);
   *                     goto top;
@@ -1279,6 +1284,7 @@ zfs_create(struct inode *dip, char *name, vattr_t *vap, int excl,
         zfs_acl_ids_t   acl_ids;
         boolean_t       fuid_dirtied;
         boolean_t       have_acl = B_FALSE;
+       boolean_t       waited = B_FALSE;
  
         /*
          * If we have an ephemeral id, ACL, or XVATTR then
@@ -1391,10 +1397,11 @@ top:
                         dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
                             0, acl_ids.z_aclp->z_acl_bytes);
                 }
-               error = dmu_tx_assign(tx, TXG_NOWAIT);
+               error = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT);
                 if (error) {
                         zfs_dirent_unlock(dl);
                         if (error == ERESTART) {
+                               waited = B_TRUE;
                                 dmu_tx_wait(tx);
                                 dmu_tx_abort(tx);
                                 goto top;
@@ -1524,6 +1531,7 @@ zfs_remove(struct inode *dip, char *name, cred_t *cr)
  #endif /* HAVE_PN_UTILS */
         int             error;
         int             zflg = ZEXISTS;
+       boolean_t       waited = B_FALSE;
  
         ZFS_ENTER(zsb);
         ZFS_VERIFY_ZP(dzp);
@@ -1599,13 +1607,14 @@ top:
         /* charge as an update -- would be nice not to charge at all */
         dmu_tx_hold_zap(tx, zsb->z_unlinkedobj, FALSE, NULL);
  
-       error = dmu_tx_assign(tx, TXG_NOWAIT);
+       error = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT);
         if (error) {
                 zfs_dirent_unlock(dl);
                 iput(ip);
                 if (xzp)
                         iput(ZTOI(xzp));
                 if (error == ERESTART) {
+                       waited = B_TRUE;
                         dmu_tx_wait(tx);
                         dmu_tx_abort(tx);
                         goto top;
@@ -1710,6 +1719,7 @@ zfs_mkdir(struct inode *dip, char *dirname, vattr_t *vap, struct inode **ipp,
         gid_t           gid = crgetgid(cr);
         zfs_acl_ids_t   acl_ids;
         boolean_t       fuid_dirtied;
+       boolean_t       waited = B_FALSE;
  
         ASSERT(S_ISDIR(vap->va_mode));
  
@@ -1801,10 +1811,11 @@ top:
         dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
             ZFS_SA_BASE_ATTR_SIZE);
  
-       error = dmu_tx_assign(tx, TXG_NOWAIT);
+       error = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT);
         if (error) {
                 zfs_dirent_unlock(dl);
                 if (error == ERESTART) {
+                       waited = B_TRUE;
                         dmu_tx_wait(tx);
                         dmu_tx_abort(tx);
                         goto top;
@@ -1882,6 +1893,7 @@ zfs_rmdir(struct inode *dip, char *name, struct inode *cwd, cred_t *cr,
         dmu_tx_t        *tx;
         int             error;
         int             zflg = ZEXISTS;
+       boolean_t       waited = B_FALSE;
  
         ZFS_ENTER(zsb);
         ZFS_VERIFY_ZP(dzp);
@@ -1935,13 +1947,14 @@ top:
         dmu_tx_hold_zap(tx, zsb->z_unlinkedobj, FALSE, NULL);
         zfs_sa_upgrade_txholds(tx, zp);
         zfs_sa_upgrade_txholds(tx, dzp);
-       error = dmu_tx_assign(tx, TXG_NOWAIT);
+       error = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT);
         if (error) {
                 rw_exit(&zp->z_parent_lock);
                 rw_exit(&zp->z_name_lock);
                 zfs_dirent_unlock(dl);
                 iput(ip);
                 if (error == ERESTART) {
+                       waited = B_TRUE;
                         dmu_tx_wait(tx);
                         dmu_tx_abort(tx);
                         goto top;
@@ -3169,6 +3182,7 @@ zfs_rename(struct inode *sdip, char *snm, struct inode *tdip, char *tnm,
         int             cmp, serr, terr;
         int             error = 0;
         int             zflg = 0;
+       boolean_t       waited = B_FALSE;
  
         ZFS_ENTER(zsb);
         ZFS_VERIFY_ZP(sdzp);
@@ -3383,7 +3397,7 @@ top:
  
         zfs_sa_upgrade_txholds(tx, szp);
         dmu_tx_hold_zap(tx, zsb->z_unlinkedobj, FALSE, NULL);
-       error = dmu_tx_assign(tx, TXG_NOWAIT);
+       error = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT);
         if (error) {
                 if (zl != NULL)
                         zfs_rename_unlock(&zl);
@@ -3397,6 +3411,7 @@ top:
                 if (tzp)
                         iput(ZTOI(tzp));
                 if (error == ERESTART) {
+                       waited = B_TRUE;
                         dmu_tx_wait(tx);
                         dmu_tx_abort(tx);
                         goto top;
@@ -3504,6 +3519,7 @@ zfs_symlink(struct inode *dip, char *name, vattr_t *vap, char *link,
         zfs_acl_ids_t   acl_ids;
         boolean_t       fuid_dirtied;
         uint64_t        txtype = TX_SYMLINK;
+       boolean_t       waited = B_FALSE;
  
         ASSERT(S_ISLNK(vap->va_mode));
  
@@ -3568,10 +3584,11 @@ top:
         }
         if (fuid_dirtied)
                 zfs_fuid_txhold(zsb, tx);
-       error = dmu_tx_assign(tx, TXG_NOWAIT);
+       error = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT);
         if (error) {
                 zfs_dirent_unlock(dl);
                 if (error == ERESTART) {
+                       waited = B_TRUE;
                         dmu_tx_wait(tx);
                         dmu_tx_abort(tx);
                         goto top;
@@ -3699,6 +3716,7 @@ zfs_link(struct inode *tdip, struct inode *sip, char *name, cred_t *cr)
         int             zf = ZNEW;
         uint64_t        parent;
         uid_t           owner;
+       boolean_t       waited = B_FALSE;
  
         ASSERT(S_ISDIR(tdip->i_mode));
  
@@ -3782,10 +3800,11 @@ top:
         dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
         zfs_sa_upgrade_txholds(tx, szp);
         zfs_sa_upgrade_txholds(tx, dzp);
-       error = dmu_tx_assign(tx, TXG_NOWAIT);
+       error = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT);
         if (error) {
                 zfs_dirent_unlock(dl);
                 if (error == ERESTART) {
+                       waited = B_TRUE;
                         dmu_tx_wait(tx);
                         dmu_tx_abort(tx);
                         goto top;
diff --git a/module/zfs/zil.c b/module/zfs/zil.c

index 839afa956c17163034efd8aab514e2b1cf3720b4..30035faa0dbfc56650b3825803a364b4adec8bcf 100644 (file)
--- a/module/zfs/zil.c
+++ b/module/zfs/zil.c
@@ -913,7 +913,7 @@ zil_lwb_write_init(zilog_t *zilog, lwb_t *lwb)
                 }
                 lwb->lwb_zio = zio_rewrite(zilog->zl_root_zio, zilog->zl_spa,
                     0, &lwb->lwb_blk, lwb->lwb_buf, BP_GET_LSIZE(&lwb->lwb_blk),
-                   zil_lwb_write_done, lwb, ZIO_PRIORITY_LOG_WRITE,
+                   zil_lwb_write_done, lwb, ZIO_PRIORITY_SYNC_WRITE,
                     ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE |
                     ZIO_FLAG_FASTWRITE, &zb);
         }
diff --git a/module/zfs/zio.c b/module/zfs/zio.c

index bc9d5b92061bd09009e8c51bcce868f3aff600b1..7cc3d4c9a4c28621ef0f11704a3e8ae1b48e3e3e 100644 (file)
--- a/module/zfs/zio.c
+++ b/module/zfs/zio.c
@@ -37,32 +37,12 @@
  #include <sys/arc.h>
  #include <sys/ddt.h>
  
-/*
- * ==========================================================================
- * I/O priority table
- * ==========================================================================
- */
-uint8_t zio_priority_table[ZIO_PRIORITY_TABLE_SIZE] = {
-       0,      /* ZIO_PRIORITY_NOW             */
-       0,      /* ZIO_PRIORITY_SYNC_READ       */
-       0,      /* ZIO_PRIORITY_SYNC_WRITE      */
-       0,      /* ZIO_PRIORITY_LOG_WRITE       */
-       1,      /* ZIO_PRIORITY_CACHE_FILL      */
-       1,      /* ZIO_PRIORITY_AGG             */
-       4,      /* ZIO_PRIORITY_FREE            */
-       4,      /* ZIO_PRIORITY_ASYNC_WRITE     */
-       6,      /* ZIO_PRIORITY_ASYNC_READ      */
-       10,     /* ZIO_PRIORITY_RESILVER        */
-       20,     /* ZIO_PRIORITY_SCRUB           */
-       2,      /* ZIO_PRIORITY_DDT_PREFETCH    */
-};
-
  /*
   * ==========================================================================
   * I/O type descriptions
   * ==========================================================================
   */
-char *zio_type_name[ZIO_TYPES] = {
+const char *zio_type_name[ZIO_TYPES] = {
         "z_null", "z_rd", "z_wr", "z_fr", "z_cl", "z_ioctl"
  };
  
@@ -549,7 +529,10 @@ zio_notify_parent(zio_t *pio, zio_t *zio, enum zio_wait_type wait)
                 *errorp = zio_worst_error(*errorp, zio->io_error);
         pio->io_reexecute |= zio->io_reexecute;
         ASSERT3U(*countp, >, 0);
-       if (--*countp == 0 && pio->io_stall == countp) {
+
+       (*countp)--;
+
+       if (*countp == 0 && pio->io_stall == countp) {
                 pio->io_stall = NULL;
                 mutex_exit(&pio->io_lock);
                 __zio_execute(pio);
@@ -573,7 +556,7 @@ zio_inherit_child_errors(zio_t *zio, enum zio_child c)
  static zio_t *
  zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
      void *data, uint64_t size, zio_done_func_t *done, void *private,
-    zio_type_t type, int priority, enum zio_flag flags,
+    zio_type_t type, zio_priority_t priority, enum zio_flag flags,
      vdev_t *vd, uint64_t offset, const zbookmark_t *zb,
      enum zio_stage stage, enum zio_stage pipeline)
  {
@@ -620,6 +603,7 @@ zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
         zio->io_spa = spa;
         zio->io_txg = txg;
         zio->io_ready = NULL;
+       zio->io_physdone = NULL;
         zio->io_done = done;
         zio->io_private = private;
         zio->io_prev_space_delta = 0;
@@ -629,7 +613,6 @@ zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
         zio->io_vsd = NULL;
         zio->io_vsd_ops = NULL;
         zio->io_offset = offset;
-       zio->io_deadline = 0;
         zio->io_timestamp = 0;
         zio->io_delta = 0;
         zio->io_delay = 0;
@@ -646,6 +629,7 @@ zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
         zio->io_transform_stack = NULL;
         zio->io_error = 0;
         zio->io_child_count = 0;
+       zio->io_phys_children = 0;
         zio->io_parent_count = 0;
         zio->io_stall = NULL;
         zio->io_gang_leader = NULL;
@@ -706,7 +690,7 @@ zio_root(spa_t *spa, zio_done_func_t *done, void *private, enum zio_flag flags)
  zio_t *
  zio_read(zio_t *pio, spa_t *spa, const blkptr_t *bp,
      void *data, uint64_t size, zio_done_func_t *done, void *private,
-    int priority, enum zio_flag flags, const zbookmark_t *zb)
+    zio_priority_t priority, enum zio_flag flags, const zbookmark_t *zb)
  {
         zio_t *zio;
  
@@ -722,8 +706,9 @@ zio_read(zio_t *pio, spa_t *spa, const blkptr_t *bp,
  zio_t *
  zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
      void *data, uint64_t size, const zio_prop_t *zp,
-    zio_done_func_t *ready, zio_done_func_t *done, void *private,
-    int priority, enum zio_flag flags, const zbookmark_t *zb)
+    zio_done_func_t *ready, zio_done_func_t *physdone, zio_done_func_t *done,
+    void *private,
+    zio_priority_t priority, enum zio_flag flags, const zbookmark_t *zb)
  {
         zio_t *zio;
  
@@ -742,6 +727,7 @@ zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
             ZIO_DDT_CHILD_WRITE_PIPELINE : ZIO_WRITE_PIPELINE);
  
         zio->io_ready = ready;
+       zio->io_physdone = physdone;
         zio->io_prop = *zp;
  
         return (zio);
@@ -749,8 +735,8 @@ zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
  
  zio_t *
  zio_rewrite(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, void *data,
-    uint64_t size, zio_done_func_t *done, void *private, int priority,
-    enum zio_flag flags, zbookmark_t *zb)
+    uint64_t size, zio_done_func_t *done, void *private,
+    zio_priority_t priority, enum zio_flag flags, zbookmark_t *zb)
  {
         zio_t *zio;
  
@@ -829,7 +815,6 @@ zio_free_sync(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
             NULL, NULL, ZIO_TYPE_FREE, ZIO_PRIORITY_NOW, flags,
             NULL, 0, NULL, ZIO_STAGE_OPEN, stage);
  
-
         return (zio);
  }
  
@@ -864,14 +849,14 @@ zio_claim(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
  
  zio_t *
  zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd,
-    zio_done_func_t *done, void *private, int priority, enum zio_flag flags)
+    zio_done_func_t *done, void *private, enum zio_flag flags)
  {
         zio_t *zio;
         int c;
  
         if (vd->vdev_children == 0) {
                 zio = zio_create(pio, spa, 0, NULL, NULL, 0, done, private,
-                   ZIO_TYPE_IOCTL, priority, flags, vd, 0, NULL,
+                   ZIO_TYPE_IOCTL, ZIO_PRIORITY_NOW, flags, vd, 0, NULL,
                     ZIO_STAGE_OPEN, ZIO_IOCTL_PIPELINE);
  
                 zio->io_cmd = cmd;
@@ -880,7 +865,7 @@ zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd,
  
                 for (c = 0; c < vd->vdev_children; c++)
                         zio_nowait(zio_ioctl(zio, spa, vd->vdev_child[c], cmd,
-                           done, private, priority, flags));
+                           done, private, flags));
         }
  
         return (zio);
@@ -889,7 +874,7 @@ zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd,
  zio_t *
  zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
      void *data, int checksum, zio_done_func_t *done, void *private,
-    int priority, enum zio_flag flags, boolean_t labels)
+    zio_priority_t priority, enum zio_flag flags, boolean_t labels)
  {
         zio_t *zio;
  
@@ -910,7 +895,7 @@ zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
  zio_t *
  zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
      void *data, int checksum, zio_done_func_t *done, void *private,
-    int priority, enum zio_flag flags, boolean_t labels)
+    zio_priority_t priority, enum zio_flag flags, boolean_t labels)
  {
         zio_t *zio;
  
@@ -945,8 +930,8 @@ zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
   */
  zio_t *
  zio_vdev_child_io(zio_t *pio, blkptr_t *bp, vdev_t *vd, uint64_t offset,
-       void *data, uint64_t size, int type, int priority, enum zio_flag flags,
-       zio_done_func_t *done, void *private)
+       void *data, uint64_t size, int type, zio_priority_t priority,
+       enum zio_flag flags, zio_done_func_t *done, void *private)
  {
         enum zio_stage pipeline = ZIO_VDEV_CHILD_PIPELINE;
         zio_t *zio;
@@ -981,12 +966,16 @@ zio_vdev_child_io(zio_t *pio, blkptr_t *bp, vdev_t *vd, uint64_t offset,
             done, private, type, priority, flags, vd, offset, &pio->io_bookmark,
             ZIO_STAGE_VDEV_IO_START >> 1, pipeline);
  
+       zio->io_physdone = pio->io_physdone;
+       if (vd->vdev_ops->vdev_op_leaf && zio->io_logical != NULL)
+               zio->io_logical->io_phys_children++;
+
         return (zio);
  }
  
  zio_t *
  zio_vdev_delegated_io(vdev_t *vd, uint64_t offset, void *data, uint64_t size,
-       int type, int priority, enum zio_flag flags,
+       int type, zio_priority_t priority, enum zio_flag flags,
         zio_done_func_t *done, void *private)
  {
         zio_t *zio;
@@ -995,7 +984,7 @@ zio_vdev_delegated_io(vdev_t *vd, uint64_t offset, void *data, uint64_t size,
  
         zio = zio_create(NULL, vd->vdev_spa, 0, NULL,
             data, size, done, private, type, priority,
-           flags | ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_RETRY,
+           flags | ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_RETRY | ZIO_FLAG_DELEGATED,
             vd, offset, NULL,
             ZIO_STAGE_VDEV_IO_START >> 1, ZIO_VDEV_CHILD_PIPELINE);
  
@@ -1006,7 +995,7 @@ void
  zio_flush(zio_t *zio, vdev_t *vd)
  {
         zio_nowait(zio_ioctl(zio, zio->io_spa, vd, DKIOCFLUSHWRITECACHE,
-           NULL, NULL, ZIO_PRIORITY_NOW,
+           NULL, NULL,
             ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY));
  }
  
@@ -1951,7 +1940,7 @@ zio_write_gang_block(zio_t *pio)
  
                 zio_nowait(zio_write(zio, spa, txg, &gbh->zg_blkptr[g],
                     (char *)pio->io_data + (pio->io_size - resid), lsize, &zp,
-                   zio_write_gang_member_ready, NULL, &gn->gn_child[g],
+                   zio_write_gang_member_ready, NULL, NULL, &gn->gn_child[g],
                     pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio),
                     &pio->io_bookmark));
         }
@@ -2335,7 +2324,7 @@ zio_ddt_write(zio_t *zio)
                 }
  
                 dio = zio_write(zio, spa, txg, bp, zio->io_orig_data,
-                   zio->io_orig_size, &czp, NULL,
+                   zio->io_orig_size, &czp, NULL, NULL,
                     zio_ddt_ditto_write_done, dde, zio->io_priority,
                     ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark);
  
@@ -2357,7 +2346,7 @@ zio_ddt_write(zio_t *zio)
                 ddt_phys_addref(ddp);
         } else {
                 cio = zio_write(zio, spa, txg, bp, zio->io_orig_data,
-                   zio->io_orig_size, zp, zio_ddt_child_write_ready,
+                   zio->io_orig_size, zp, zio_ddt_child_write_ready, NULL,
                     zio_ddt_child_write_done, dde, zio->io_priority,
                     ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark);
  
@@ -2780,6 +2769,13 @@ zio_vdev_io_assess(zio_t *zio)
         if (zio->io_error)
                 zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
  
+       if (vd != NULL && vd->vdev_ops->vdev_op_leaf &&
+           zio->io_physdone != NULL) {
+               ASSERT(!(zio->io_flags & ZIO_FLAG_DELEGATED));
+               ASSERT(zio->io_child_type == ZIO_CHILD_VDEV);
+               zio->io_physdone(zio->io_logical);
+       }
+
         return (ZIO_PIPELINE_CONTINUE);
  }
  
@@ -3346,7 +3342,6 @@ EXPORT_SYMBOL(zio_clear_fault);
  EXPORT_SYMBOL(zio_handle_fault_injection);
  EXPORT_SYMBOL(zio_handle_device_injection);
  EXPORT_SYMBOL(zio_handle_label_injection);
-EXPORT_SYMBOL(zio_priority_table);
  EXPORT_SYMBOL(zio_type_name);
  
  module_param(zio_bulk_flags, int, 0644);
author	Matthew Ahrens <mahrens@delphix.com>
	Thu, 29 Aug 2013 03:01:20 +0000 (20:01 -0700)
committer	Brian Behlendorf <behlendorf1@llnl.gov>
	Fri, 6 Dec 2013 17:32:43 +0000 (09:32 -0800)
include/sys/Makefile.am		patch \| blob \| history
include/sys/arc.h		patch \| blob \| history
include/sys/dbuf.h		patch \| blob \| history
include/sys/dmu.h		patch \| blob \| history
include/sys/dmu_tx.h		patch \| blob \| history
include/sys/dsl_dir.h		patch \| blob \| history
include/sys/dsl_pool.h		patch \| blob \| history
include/sys/sa_impl.h		patch \| blob \| history
include/sys/spa_impl.h		patch \| blob \| history
include/sys/txg.h		patch \| blob \| history
include/sys/txg_impl.h		patch \| blob \| history
include/sys/vdev_impl.h		patch \| blob \| history
include/sys/zfs_context.h		patch \| blob \| history
include/sys/zfs_delay.h	[new file with mode: 0644]	patch \| blob
include/sys/zio.h		patch \| blob \| history
man/man5/zfs-module-parameters.5		patch \| blob \| history
module/zfs/arc.c		patch \| blob \| history
module/zfs/dbuf.c		patch \| blob \| history
module/zfs/dmu.c		patch \| blob \| history
module/zfs/dmu_objset.c		patch \| blob \| history
module/zfs/dmu_tx.c		patch \| blob \| history
module/zfs/dmu_zfetch.c		patch \| blob \| history
module/zfs/dnode.c		patch \| blob \| history
module/zfs/dsl_dir.c		patch \| blob \| history
module/zfs/dsl_pool.c		patch \| blob \| history
module/zfs/dsl_scan.c		patch \| blob \| history
module/zfs/spa.c		patch \| blob \| history
module/zfs/spa_misc.c		patch \| blob \| history
module/zfs/txg.c		patch \| blob \| history
module/zfs/vdev.c		patch \| blob \| history
module/zfs/vdev_cache.c		patch \| blob \| history
module/zfs/vdev_mirror.c		patch \| blob \| history
module/zfs/vdev_queue.c		patch \| blob \| history
module/zfs/vdev_raidz.c		patch \| blob \| history
module/zfs/zfs_fm.c		patch \| blob \| history
module/zfs/zfs_vnops.c		patch \| blob \| history
module/zfs/zil.c		patch \| blob \| history
module/zfs/zio.c		patch \| blob \| history