granicus.if.org Git - zfs/commitdiff
5313 Allow I/Os to be aggregated across ZIO priority classes
author: Justin T. Gibbs <justing@spectralogic.com>
Sat, 11 Apr 2015 18:51:06 +0000 (14:51 -0400)
committer: Brian Behlendorf <behlendorf1@llnl.gov>
Fri, 24 Apr 2015 22:16:56 +0000 (15:16 -0700)
Reviewed by: Andriy Gapon <avg@FreeBSD.org>
Reviewed by: Will Andrews <willa@SpectraLogic.com>
Reviewed by: Matt Ahrens <mahrens@delphix.com>
Reviewed by: George Wilson <george@delphix.com>
Approved by: Robert Mustacchi <rm@joyent.com>

References:
  https://www.illumos.org/issues/5313
  https://github.com/illumos/illumos-gate/commit/fe319232

Ported-by: DHE <git@dehacked.net>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #3280

include/sys/vdev_impl.h
include/sys/zio.h
module/zfs/vdev_queue.c

index a8dc9510e3e93955c724ad63c750015b048b826e..1048dec5ed89be3e11fba1a4f2793cdc08f5c489 100644 (file)
@@ -113,6 +113,8 @@ struct vdev_queue {
        vdev_t          *vq_vdev;
        vdev_queue_class_t vq_class[ZIO_PRIORITY_NUM_QUEUEABLE];
        avl_tree_t      vq_active_tree;
+       avl_tree_t      vq_read_offset_tree;
+       avl_tree_t      vq_write_offset_tree;
        uint64_t        vq_last_offset;
        hrtime_t        vq_io_complete_ts; /* time last i/o completed */
        hrtime_t        vq_io_delta_ts;
index 18e7a40a308085f60ebb70d2c62c38bce1949a2e..0368d9c5925cd864d149e949d9399db883fcc057 100644 (file)
@@ -427,6 +427,7 @@ struct zio {
        hrtime_t        io_delta;       /* vdev queue service delta */
        uint64_t        io_delay;       /* vdev disk service delta (ticks) */
        avl_node_t      io_queue_node;
+       avl_node_t      io_offset_node;
 
        /* Internal pipeline state */
        enum zio_flag   io_flags;
index 3fa4219f260e7099369773d690c2a64a01be276b..cf03016490642a9032b5578033a86a30114835fe 100644 (file)
@@ -190,6 +190,22 @@ vdev_queue_offset_compare(const void *x1, const void *x2)
        return (0);
 }
 
+static inline avl_tree_t *
+vdev_queue_class_tree(vdev_queue_t *vq, zio_priority_t p)
+{
+       return (&vq->vq_class[p].vqc_queued_tree);
+}
+
+static inline avl_tree_t *
+vdev_queue_type_tree(vdev_queue_t *vq, zio_type_t t)
+{
+       ASSERT(t == ZIO_TYPE_READ || t == ZIO_TYPE_WRITE);
+       if (t == ZIO_TYPE_READ)
+               return (&vq->vq_read_offset_tree);
+       else
+               return (&vq->vq_write_offset_tree);
+}
+
 int
 vdev_queue_timestamp_compare(const void *x1, const void *x2)
 {
@@ -303,7 +319,7 @@ vdev_queue_class_to_issue(vdev_queue_t *vq)
 
        /* find a queue that has not reached its minimum # outstanding i/os */
        for (p = 0; p < ZIO_PRIORITY_NUM_QUEUEABLE; p++) {
-               if (avl_numnodes(&vq->vq_class[p].vqc_queued_tree) > 0 &&
+               if (avl_numnodes(vdev_queue_class_tree(vq, p)) > 0 &&
                    vq->vq_class[p].vqc_active <
                    vdev_queue_class_min_active(p))
                        return (p);
@@ -314,7 +330,7 @@ vdev_queue_class_to_issue(vdev_queue_t *vq)
         * maximum # outstanding i/os.
         */
        for (p = 0; p < ZIO_PRIORITY_NUM_QUEUEABLE; p++) {
-               if (avl_numnodes(&vq->vq_class[p].vqc_queued_tree) > 0 &&
+               if (avl_numnodes(vdev_queue_class_tree(vq, p)) > 0 &&
                    vq->vq_class[p].vqc_active <
                    vdev_queue_class_max_active(spa, p))
                        return (p);
@@ -335,20 +351,27 @@ vdev_queue_init(vdev_t *vd)
 
        avl_create(&vq->vq_active_tree, vdev_queue_offset_compare,
            sizeof (zio_t), offsetof(struct zio, io_queue_node));
+       avl_create(vdev_queue_type_tree(vq, ZIO_TYPE_READ),
+               vdev_queue_offset_compare, sizeof (zio_t),
+               offsetof(struct zio, io_offset_node));
+       avl_create(vdev_queue_type_tree(vq, ZIO_TYPE_WRITE),
+               vdev_queue_offset_compare, sizeof (zio_t),
+               offsetof(struct zio, io_offset_node));
 
        for (p = 0; p < ZIO_PRIORITY_NUM_QUEUEABLE; p++) {
+               int (*compfn) (const void *, const void *);
+
                /*
-                * The synchronous i/o queues are FIFO rather than LBA ordered.
-                * This provides more consistent latency for these i/os, and
-                * they tend to not be tightly clustered anyway so there is
-                * little to no throughput loss.
+                * The synchronous i/o queues are dispatched in FIFO rather
+                * than LBA order. This provides more consistent latency for
+                * these i/os.
                 */
-               boolean_t fifo = (p == ZIO_PRIORITY_SYNC_READ ||
-                   p == ZIO_PRIORITY_SYNC_WRITE);
-               avl_create(&vq->vq_class[p].vqc_queued_tree,
-                   fifo ? vdev_queue_timestamp_compare :
-                   vdev_queue_offset_compare,
-                   sizeof (zio_t), offsetof(struct zio, io_queue_node));
+               if (p == ZIO_PRIORITY_SYNC_READ || p == ZIO_PRIORITY_SYNC_WRITE)
+                       compfn = vdev_queue_timestamp_compare;
+               else
+                       compfn = vdev_queue_offset_compare;
+               avl_create(vdev_queue_class_tree(vq, p), compfn,
+                       sizeof (zio_t), offsetof(struct zio, io_queue_node));
        }
 }
 
@@ -359,8 +382,10 @@ vdev_queue_fini(vdev_t *vd)
        zio_priority_t p;
 
        for (p = 0; p < ZIO_PRIORITY_NUM_QUEUEABLE; p++)
-               avl_destroy(&vq->vq_class[p].vqc_queued_tree);
+               avl_destroy(vdev_queue_class_tree(vq, p));
        avl_destroy(&vq->vq_active_tree);
+       avl_destroy(vdev_queue_type_tree(vq, ZIO_TYPE_READ));
+       avl_destroy(vdev_queue_type_tree(vq, ZIO_TYPE_WRITE));
 
        mutex_destroy(&vq->vq_lock);
 }
@@ -372,7 +397,8 @@ vdev_queue_io_add(vdev_queue_t *vq, zio_t *zio)
        spa_stats_history_t *ssh = &spa->spa_stats.io_history;
 
        ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE);
-       avl_add(&vq->vq_class[zio->io_priority].vqc_queued_tree, zio);
+       avl_add(vdev_queue_class_tree(vq, zio->io_priority), zio);
+       avl_add(vdev_queue_type_tree(vq, zio->io_type), zio);
 
        if (ssh->kstat != NULL) {
                mutex_enter(&ssh->lock);
@@ -388,7 +414,8 @@ vdev_queue_io_remove(vdev_queue_t *vq, zio_t *zio)
        spa_stats_history_t *ssh = &spa->spa_stats.io_history;
 
        ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE);
-       avl_remove(&vq->vq_class[zio->io_priority].vqc_queued_tree, zio);
+       avl_remove(vdev_queue_class_tree(vq, zio->io_priority), zio);
+       avl_remove(vdev_queue_type_tree(vq, zio->io_type), zio);
 
        if (ssh->kstat != NULL) {
                mutex_enter(&ssh->lock);
@@ -472,8 +499,7 @@ vdev_queue_aggregate(vdev_queue_t *vq, zio_t *zio)
        uint64_t maxgap = 0;
        uint64_t size;
        boolean_t stretch = B_FALSE;
-       vdev_queue_class_t *vqc = &vq->vq_class[zio->io_priority];
-       avl_tree_t *t = &vqc->vqc_queued_tree;
+       avl_tree_t *t = vdev_queue_type_tree(vq, zio->io_type);
        enum zio_flag flags = zio->io_flags & ZIO_FLAG_AGG_INHERIT;
 
        if (zio->io_flags & ZIO_FLAG_DONT_AGGREGATE)
@@ -486,15 +512,6 @@ vdev_queue_aggregate(vdev_queue_t *vq, zio_t *zio)
        zfs_vdev_aggregation_limit =
            MIN(zfs_vdev_aggregation_limit, SPA_MAXBLOCKSIZE);
 
-       /*
-        * The synchronous i/o queues are not sorted by LBA, so we can't
-        * find adjacent i/os.  These i/os tend to not be tightly clustered,
-        * or too large to aggregate, so this has little impact on performance.
-        */
-       if (zio->io_priority == ZIO_PRIORITY_SYNC_READ ||
-           zio->io_priority == ZIO_PRIORITY_SYNC_WRITE)
-               return (NULL);
-
        first = last = zio;
 
        if (zio->io_type == ZIO_TYPE_READ)
@@ -627,7 +644,7 @@ vdev_queue_io_to_issue(vdev_queue_t *vq)
        zio_t *zio, *aio;
        zio_priority_t p;
        avl_index_t idx;
-       vdev_queue_class_t *vqc;
+       avl_tree_t *tree;
 
 again:
        ASSERT(MUTEX_HELD(&vq->vq_lock));
@@ -645,14 +662,14 @@ again:
         *
         * For FIFO queues (sync), issue the i/o with the lowest timestamp.
         */
-       vqc = &vq->vq_class[p];
+       tree = vdev_queue_class_tree(vq, p);
        vq->vq_io_search.io_timestamp = 0;
        vq->vq_io_search.io_offset = vq->vq_last_offset + 1;
-       VERIFY3P(avl_find(&vqc->vqc_queued_tree, &vq->vq_io_search,
+       VERIFY3P(avl_find(tree, &vq->vq_io_search,
            &idx), ==, NULL);
-       zio = avl_nearest(&vqc->vqc_queued_tree, idx, AVL_AFTER);
+       zio = avl_nearest(tree, idx, AVL_AFTER);
        if (zio == NULL)
-               zio = avl_first(&vqc->vqc_queued_tree);
+               zio = avl_first(tree);
        ASSERT3U(zio->io_priority, ==, p);
 
        aio = vdev_queue_aggregate(vq, zio);