Support re-prioritizing asynchronous prefetches
author    Tom Caputi <tcaputi@datto.com>
          Thu, 21 Dec 2017 17:13:06 +0000 (12:13 -0500)
committer Brian Behlendorf <behlendorf1@llnl.gov>
          Thu, 21 Dec 2017 17:13:06 +0000 (09:13 -0800)
When sequential scrubs were merged, all calls to arc_read()
(including prefetch I/Os) were given ZIO_PRIORITY_ASYNC_READ.
Unfortunately, this interacts badly with an existing issue:
prefetch I/Os cannot be re-prioritized once they have been
issued. The result is that synchronous reads end up in the same
vdev_queue as the scrub I/Os and can see multiple seconds of
latency under some workloads.

This patch incorporates two changes. The first ensures that all
scrub I/Os are given ZIO_PRIORITY_SCRUB, allowing the vdev_queue
code to differentiate between these I/Os and user prefetches.
The second introduces zio_change_priority() to provide the
missing capability to upgrade a zio's priority.
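
In effect, zio_change_priority() walks the zio tree and, at each
leaf vdev, re-queues a pending I/O under its new priority class.
For intuition, here is a minimal stand-alone C sketch of that
remove-and-reinsert step. It is an illustrative analog only:
plain linked lists and a hypothetical prio_t stand in for ZFS's
AVL class trees and zio_priority_t. It covers the same three
cases as the new vdev_queue_change_io_priority(): re-queue a
pending I/O, relabel an unqueued one, and leave an
already-active one untouched.

#include <stdio.h>

/* Hypothetical stand-ins for the ZFS read priority classes. */
typedef enum { PRIO_SYNC_READ, PRIO_ASYNC_READ, PRIO_SCRUB, PRIO_NUM } prio_t;

typedef struct io {
	int		id;
	prio_t		priority;
	int		active;		/* already issued to the device? */
	struct io	*next;
} io_t;

/* One pending list per priority class, mimicking the vq class trees. */
static io_t *pending[PRIO_NUM];

static void
enqueue(io_t *io)
{
	io->next = pending[io->priority];
	pending[io->priority] = io;
}

/* Unlink io from its current class list; returns 1 if it was pending. */
static int
dequeue(io_t *io)
{
	io_t **p;
	for (p = &pending[io->priority]; *p != NULL; p = &(*p)->next) {
		if (*p == io) {
			*p = io->next;
			return (1);
		}
	}
	return (0);
}

/* Analog of vdev_queue_change_io_priority(). */
static void
change_priority(io_t *io, prio_t priority)
{
	if (dequeue(io)) {
		/* Still pending: re-insert under the new class. */
		io->priority = priority;
		enqueue(io);
	} else if (!io->active) {
		/* Not queued and not yet active: just relabel it. */
		io->priority = priority;
	}
	/* Otherwise the I/O is on the device; too late to change it. */
}

int
main(void)
{
	io_t prefetch = { .id = 1, .priority = PRIO_SCRUB };
	enqueue(&prefetch);
	change_priority(&prefetch, PRIO_SYNC_READ); /* demand read arrives */
	printf("io %d now in class %d\n", prefetch.id, prefetch.priority);
	return (0);
}

In the actual patch, arc_read() triggers this upgrade by calling
zio_change_priority() on the in-flight read zio (tracked via the
new acb_zio_head field) when a synchronous read arrives for a
block whose prefetch is still in progress; see the arc.c,
vdev_queue.c, and zio.c hunks below.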

Reviewed-by: George Wilson <george.wilson@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Tom Caputi <tcaputi@datto.com>
Closes #6921
Closes #6926

include/sys/arc_impl.h
include/sys/trace_arc.h
include/sys/vdev.h
include/sys/zio.h
io_spa- [new file with mode: 0644]
module/zfs/arc.c
module/zfs/dsl_scan.c
module/zfs/vdev_queue.c
module/zfs/zio.c
spa_stats.io_history [new file with mode: 0644]
tests/zfs-tests/tests/perf/scripts/prefetch_io.sh

index e39cf6a8ff490b065c57b2230372ae79cb1bef77..a923449d96fd5e62f96494f3dadb1fcbe58c786d 100644 (file)
@@ -98,6 +98,7 @@ struct arc_callback {
        boolean_t               acb_noauth;
        uint64_t                acb_dsobj;
        zio_t                   *acb_zio_dummy;
+       zio_t                   *acb_zio_head;
        arc_callback_t          *acb_next;
 };
 
index 74a76520dffdaf48eb3cd056fb091ca001729c66..c40b58e32d9112d75ba67e5417838f499668abf5 100644 (file)
@@ -108,7 +108,7 @@ DEFINE_ARC_BUF_HDR_EVENT(zfs_arc__evict);
 DEFINE_ARC_BUF_HDR_EVENT(zfs_arc__delete);
 DEFINE_ARC_BUF_HDR_EVENT(zfs_new_state__mru);
 DEFINE_ARC_BUF_HDR_EVENT(zfs_new_state__mfu);
-DEFINE_ARC_BUF_HDR_EVENT(zfs_arc__sync__wait__for__async);
+DEFINE_ARC_BUF_HDR_EVENT(zfs_arc__async__upgrade__sync);
 DEFINE_ARC_BUF_HDR_EVENT(zfs_arc__demand__hit__predictive__prefetch);
 DEFINE_ARC_BUF_HDR_EVENT(zfs_l2arc__hit);
 DEFINE_ARC_BUF_HDR_EVENT(zfs_l2arc__miss);
index 473d2691c947fcb7bb20444f735727597b4a726d..bc2f4f0ead887a822d01d9c40a4a95dccab9fdc5 100644 (file)
@@ -123,6 +123,7 @@ extern void vdev_queue_init(vdev_t *vd);
 extern void vdev_queue_fini(vdev_t *vd);
 extern zio_t *vdev_queue_io(zio_t *zio);
 extern void vdev_queue_io_done(zio_t *zio);
+extern void vdev_queue_change_io_priority(zio_t *zio, zio_priority_t priority);
 
 extern int vdev_queue_length(vdev_t *vd);
 extern uint64_t vdev_queue_last_offset(vdev_t *vd);
index 1f9f48268463913595b3f88be205afa7e9b3a9f6..8beea7adf1e674466a60a75c3ab8877324f4d924 100644 (file)
@@ -586,6 +586,8 @@ extern void zio_vdev_io_bypass(zio_t *zio);
 extern void zio_vdev_io_reissue(zio_t *zio);
 extern void zio_vdev_io_redone(zio_t *zio);
 
+extern void zio_change_priority(zio_t *pio, zio_priority_t priority);
+
 extern void zio_checksum_verified(zio_t *zio);
 extern int zio_worst_error(int e1, int e2);
 
diff --git a/io_spa- b/io_spa-
new file mode 100644 (file)
index 0000000..e69de29
index 10b1c60d538660b78c530451556acfcbfdf03cb4..476351eb42844130c860e6c6120d260ebcfa613f 100644 (file)
@@ -663,7 +663,7 @@ typedef struct arc_stats {
        kstat_named_t arcstat_dnode_limit;
        kstat_named_t arcstat_meta_max;
        kstat_named_t arcstat_meta_min;
-       kstat_named_t arcstat_sync_wait_for_async;
+       kstat_named_t arcstat_async_upgrade_sync;
        kstat_named_t arcstat_demand_hit_predictive_prefetch;
        kstat_named_t arcstat_demand_hit_prescient_prefetch;
        kstat_named_t arcstat_need_free;
@@ -763,7 +763,7 @@ static arc_stats_t arc_stats = {
        { "arc_dnode_limit",            KSTAT_DATA_UINT64 },
        { "arc_meta_max",               KSTAT_DATA_UINT64 },
        { "arc_meta_min",               KSTAT_DATA_UINT64 },
-       { "sync_wait_for_async",        KSTAT_DATA_UINT64 },
+       { "async_upgrade_sync",         KSTAT_DATA_UINT64 },
        { "demand_hit_predictive_prefetch", KSTAT_DATA_UINT64 },
        { "demand_hit_prescient_prefetch", KSTAT_DATA_UINT64 },
        { "arc_need_free",              KSTAT_DATA_UINT64 },
@@ -5911,32 +5911,20 @@ top:
                *arc_flags |= ARC_FLAG_CACHED;
 
                if (HDR_IO_IN_PROGRESS(hdr)) {
+                       zio_t *head_zio = hdr->b_l1hdr.b_acb->acb_zio_head;
 
+                       ASSERT3P(head_zio, !=, NULL);
                        if ((hdr->b_flags & ARC_FLAG_PRIO_ASYNC_READ) &&
                            priority == ZIO_PRIORITY_SYNC_READ) {
                                /*
-                                * This sync read must wait for an
-                                * in-progress async read (e.g. a predictive
-                                * prefetch).  Async reads are queued
-                                * separately at the vdev_queue layer, so
-                                * this is a form of priority inversion.
-                                * Ideally, we would "inherit" the demand
-                                * i/o's priority by moving the i/o from
-                                * the async queue to the synchronous queue,
-                                * but there is currently no mechanism to do
-                                * so.  Track this so that we can evaluate
-                                * the magnitude of this potential performance
-                                * problem.
-                                *
-                                * Note that if the prefetch i/o is already
-                                * active (has been issued to the device),
-                                * the prefetch improved performance, because
-                                * we issued it sooner than we would have
-                                * without the prefetch.
+                                * This is a sync read that needs to wait for
+                                * an in-flight async read. Request that the
+                                * zio have its priority upgraded.
                                 */
-                               DTRACE_PROBE1(arc__sync__wait__for__async,
+                               zio_change_priority(head_zio, priority);
+                               DTRACE_PROBE1(arc__async__upgrade__sync,
                                    arc_buf_hdr_t *, hdr);
-                               ARCSTAT_BUMP(arcstat_sync_wait_for_async);
+                               ARCSTAT_BUMP(arcstat_async_upgrade_sync);
                        }
                        if (hdr->b_flags & ARC_FLAG_PREDICTIVE_PREFETCH) {
                                arc_hdr_clear_flags(hdr,
@@ -5966,6 +5954,7 @@ top:
                                            spa, NULL, NULL, NULL, zio_flags);
 
                                ASSERT3P(acb->acb_done, !=, NULL);
+                               acb->acb_zio_head = head_zio;
                                acb->acb_next = hdr->b_l1hdr.b_acb;
                                hdr->b_l1hdr.b_acb = acb;
                                mutex_exit(hash_lock);
@@ -6182,14 +6171,17 @@ top:
                                vd = NULL;
                }
 
-               if (priority == ZIO_PRIORITY_ASYNC_READ)
+               /*
+                * We count both async reads and scrub IOs as asynchronous so
+                * that both can be upgraded in the event of a cache hit while
+                * the read IO is still in-flight.
+                */
+               if (priority == ZIO_PRIORITY_ASYNC_READ ||
+                   priority == ZIO_PRIORITY_SCRUB)
                        arc_hdr_set_flags(hdr, ARC_FLAG_PRIO_ASYNC_READ);
                else
                        arc_hdr_clear_flags(hdr, ARC_FLAG_PRIO_ASYNC_READ);
 
-               if (hash_lock != NULL)
-                       mutex_exit(hash_lock);
-
                /*
                 * At this point, we have a level 1 cache miss.  Try again in
                 * L2ARC if possible.
@@ -6260,6 +6252,10 @@ top:
                                    ZIO_FLAG_CANFAIL |
                                    ZIO_FLAG_DONT_PROPAGATE |
                                    ZIO_FLAG_DONT_RETRY, B_FALSE);
+                               acb->acb_zio_head = rzio;
+
+                               if (hash_lock != NULL)
+                                       mutex_exit(hash_lock);
 
                                DTRACE_PROBE2(l2arc__read, vdev_t *, vd,
                                    zio_t *, rzio);
@@ -6276,6 +6272,8 @@ top:
                                        goto out;
 
                                /* l2arc read error; goto zio_read() */
+                               if (hash_lock != NULL)
+                                       mutex_enter(hash_lock);
                        } else {
                                DTRACE_PROBE1(l2arc__miss,
                                    arc_buf_hdr_t *, hdr);
@@ -6296,6 +6294,10 @@ top:
 
                rzio = zio_read(pio, spa, bp, hdr_abd, size,
                    arc_read_done, hdr, priority, zio_flags, zb);
+               acb->acb_zio_head = rzio;
+
+               if (hash_lock != NULL)
+                       mutex_exit(hash_lock);
 
                if (*arc_flags & ARC_FLAG_WAIT) {
                        rc = zio_wait(rzio);
index 52c700f11891d1f00c6c7f4c45c34d20345c1000..de579d1c7ec7d7fb9f1aa1da6f3be22ff013c3db 100644 (file)
@@ -1529,7 +1529,7 @@ dsl_scan_prefetch_thread(void *arg)
                /* issue the prefetch asynchronously */
                (void) arc_read(scn->scn_zio_root, scn->scn_dp->dp_spa,
                    &spic->spic_bp, dsl_scan_prefetch_cb, spic->spic_spc,
-                   ZIO_PRIORITY_ASYNC_READ, zio_flags, &flags, &spic->spic_zb);
+                   ZIO_PRIORITY_SCRUB, zio_flags, &flags, &spic->spic_zb);
 
                kmem_free(spic, sizeof (scan_prefetch_issue_ctx_t));
        }
@@ -1611,7 +1611,7 @@ dsl_scan_recurse(dsl_scan_t *scn, dsl_dataset_t *ds, dmu_objset_type_t ostype,
                arc_buf_t *buf;
 
                err = arc_read(NULL, dp->dp_spa, bp, arc_getbuf_func, &buf,
-                   ZIO_PRIORITY_ASYNC_READ, zio_flags, &flags, zb);
+                   ZIO_PRIORITY_SCRUB, zio_flags, &flags, zb);
                if (err) {
                        scn->scn_phys.scn_errors++;
                        return (err);
@@ -1639,7 +1639,7 @@ dsl_scan_recurse(dsl_scan_t *scn, dsl_dataset_t *ds, dmu_objset_type_t ostype,
                }
 
                err = arc_read(NULL, dp->dp_spa, bp, arc_getbuf_func, &buf,
-                   ZIO_PRIORITY_ASYNC_READ, zio_flags, &flags, zb);
+                   ZIO_PRIORITY_SCRUB, zio_flags, &flags, zb);
                if (err) {
                        scn->scn_phys.scn_errors++;
                        return (err);
@@ -1658,7 +1658,7 @@ dsl_scan_recurse(dsl_scan_t *scn, dsl_dataset_t *ds, dmu_objset_type_t ostype,
                arc_buf_t *buf;
 
                err = arc_read(NULL, dp->dp_spa, bp, arc_getbuf_func, &buf,
-                   ZIO_PRIORITY_ASYNC_READ, zio_flags, &flags, zb);
+                   ZIO_PRIORITY_SCRUB, zio_flags, &flags, zb);
                if (err) {
                        scn->scn_phys.scn_errors++;
                        return (err);
index 792642952e22a81f562558a524a5090c8a389afb..f121eddc7a304314243b301c50541cbfd8359cb8 100644 (file)
@@ -803,6 +803,48 @@ vdev_queue_io_done(zio_t *zio)
        mutex_exit(&vq->vq_lock);
 }
 
+void
+vdev_queue_change_io_priority(zio_t *zio, zio_priority_t priority)
+{
+       vdev_queue_t *vq = &zio->io_vd->vdev_queue;
+       avl_tree_t *tree;
+
+       ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE);
+       ASSERT3U(priority, <, ZIO_PRIORITY_NUM_QUEUEABLE);
+
+       if (zio->io_type == ZIO_TYPE_READ) {
+               if (priority != ZIO_PRIORITY_SYNC_READ &&
+                   priority != ZIO_PRIORITY_ASYNC_READ &&
+                   priority != ZIO_PRIORITY_SCRUB)
+                       priority = ZIO_PRIORITY_ASYNC_READ;
+       } else {
+               ASSERT(zio->io_type == ZIO_TYPE_WRITE);
+               if (priority != ZIO_PRIORITY_SYNC_WRITE &&
+                   priority != ZIO_PRIORITY_ASYNC_WRITE)
+                       priority = ZIO_PRIORITY_ASYNC_WRITE;
+       }
+
+       mutex_enter(&vq->vq_lock);
+
+       /*
+        * If the zio is in none of the queues we can simply change
+        * the priority. If the zio is waiting to be submitted we must
+        * remove it from the queue and re-insert it with the new priority.
+        * Otherwise, the zio is currently active and we cannot change its
+        * priority.
+        */
+       tree = vdev_queue_class_tree(vq, zio->io_priority);
+       if (avl_find(tree, zio, NULL) == zio) {
+               avl_remove(vdev_queue_class_tree(vq, zio->io_priority), zio);
+               zio->io_priority = priority;
+               avl_add(vdev_queue_class_tree(vq, zio->io_priority), zio);
+       } else if (avl_find(&vq->vq_active_tree, zio, NULL) != zio) {
+               zio->io_priority = priority;
+       }
+
+       mutex_exit(&vq->vq_lock);
+}
+
 /*
  * As these two methods are only used for load calculations we're not
  * concerned if we get an incorrect value on 32bit platforms due to lack of
index 92e5a8dd8303ddbc9b170c925f7d513aa748dd84..c3b571e9a7e883d7ad8d5ff6375a47973814c9c5 100644 (file)
@@ -539,6 +539,8 @@ zio_walk_children(zio_t *pio, zio_link_t **zl)
 {
        list_t *cl = &pio->io_child_list;
 
+       ASSERT(MUTEX_HELD(&pio->io_lock));
+
        *zl = (*zl == NULL) ? list_head(cl) : list_next(cl, *zl);
        if (*zl == NULL)
                return (NULL);
@@ -573,8 +575,8 @@ zio_add_child(zio_t *pio, zio_t *cio)
        zl->zl_parent = pio;
        zl->zl_child = cio;
 
-       mutex_enter(&cio->io_lock);
        mutex_enter(&pio->io_lock);
+       mutex_enter(&cio->io_lock);
 
        ASSERT(pio->io_state[ZIO_WAIT_DONE] == 0);
 
@@ -587,8 +589,8 @@ zio_add_child(zio_t *pio, zio_t *cio)
        pio->io_child_count++;
        cio->io_parent_count++;
 
-       mutex_exit(&pio->io_lock);
        mutex_exit(&cio->io_lock);
+       mutex_exit(&pio->io_lock);
 }
 
 static void
@@ -597,8 +599,8 @@ zio_remove_child(zio_t *pio, zio_t *cio, zio_link_t *zl)
        ASSERT(zl->zl_parent == pio);
        ASSERT(zl->zl_child == cio);
 
-       mutex_enter(&cio->io_lock);
        mutex_enter(&pio->io_lock);
+       mutex_enter(&cio->io_lock);
 
        list_remove(&pio->io_child_list, zl);
        list_remove(&cio->io_parent_list, zl);
@@ -606,8 +608,8 @@ zio_remove_child(zio_t *pio, zio_t *cio, zio_link_t *zl)
        pio->io_child_count--;
        cio->io_parent_count--;
 
-       mutex_exit(&pio->io_lock);
        mutex_exit(&cio->io_lock);
+       mutex_exit(&pio->io_lock);
        kmem_cache_free(zio_link_cache, zl);
 }
 
@@ -1963,14 +1965,16 @@ zio_reexecute(zio_t *pio)
         * cannot be affected by any side effects of reexecuting 'cio'.
         */
        zio_link_t *zl = NULL;
+       mutex_enter(&pio->io_lock);
        for (cio = zio_walk_children(pio, &zl); cio != NULL; cio = cio_next) {
                cio_next = zio_walk_children(pio, &zl);
-               mutex_enter(&pio->io_lock);
                for (int w = 0; w < ZIO_WAIT_TYPES; w++)
                        pio->io_children[cio->io_child_type][w]++;
                mutex_exit(&pio->io_lock);
                zio_reexecute(cio);
+               mutex_enter(&pio->io_lock);
        }
+       mutex_exit(&pio->io_lock);
 
        /*
         * Now that all children have been reexecuted, execute the parent.
@@ -3474,6 +3478,35 @@ zio_vdev_io_done(zio_t *zio)
        return (ZIO_PIPELINE_CONTINUE);
 }
 
+/*
+ * This function is used to change the priority of an existing zio that is
+ * currently in-flight. This is used by the arc to upgrade priority in the
+ * event that a demand read is made for a block that is currently queued
+ * as a scrub or async read IO. Otherwise, the high priority read request
+ * would end up having to wait for the lower priority IO.
+ */
+void
+zio_change_priority(zio_t *pio, zio_priority_t priority)
+{
+       zio_t *cio, *cio_next;
+       zio_link_t *zl = NULL;
+
+       ASSERT3U(priority, <, ZIO_PRIORITY_NUM_QUEUEABLE);
+
+       if (pio->io_vd != NULL && pio->io_vd->vdev_ops->vdev_op_leaf) {
+               vdev_queue_change_io_priority(pio, priority);
+       } else {
+               pio->io_priority = priority;
+       }
+
+       mutex_enter(&pio->io_lock);
+       for (cio = zio_walk_children(pio, &zl); cio != NULL; cio = cio_next) {
+               cio_next = zio_walk_children(pio, &zl);
+               zio_change_priority(cio, priority);
+       }
+       mutex_exit(&pio->io_lock);
+}
+
 /*
  * For non-raidz ZIOs, we can just copy aside the bad data read from the
  * disk, and use that to finish the checksum ereport later.
diff --git a/spa_stats.io_history b/spa_stats.io_history
new file mode 100644 (file)
index 0000000..e69de29
index 85e7d940a98b82b6d573ca114b9926ed32b180a0..75bf08f4b6799d3fcc828839ee7b3a95155dc2fe 100755 (executable)
@@ -41,9 +41,9 @@ function get_prefetched_demand_reads
        echo $demand_reads
 }
 
-function get_sync_wait_for_async
+function get_async_upgrade_sync
 {
-       typeset -l sync_wait=`awk '$1 == "sync_wait_for_async" \
+       typeset -l sync_wait=`awk '$1 == "async_upgrade_sync" \
            { print $3 }' $zfs_kstats/arcstats`
 
        echo $sync_wait
@@ -59,7 +59,7 @@ poolname=$1
 interval=$2
 prefetch_ios=$(get_prefetch_ios)
 prefetched_demand_reads=$(get_prefetched_demand_reads)
-sync_wait_for_async=$(get_sync_wait_for_async)
+async_upgrade_sync=$(get_async_upgrade_sync)
 
 while true
 do
@@ -73,10 +73,10 @@ do
            $(( $new_prefetched_demand_reads - $prefetched_demand_reads ))
        prefetched_demand_reads=$new_prefetched_demand_reads
 
-       new_sync_wait_for_async=$(get_sync_wait_for_async)
-       printf "%-24s\t%u\n" "sync_wait_for_async" \
-           $(( $new_sync_wait_for_async - $sync_wait_for_async ))
-       sync_wait_for_async=$new_sync_wait_for_async
+       new_async_upgrade_sync=$(get_async_upgrade_sync)
+       printf "%-24s\t%u\n" "async_upgrade_sync" \
+           $(( $new_async_upgrade_sync - $async_upgrade_sync ))
+       async_upgrade_sync=$new_async_upgrade_sync
 
        sleep $interval
 done