boolean_t acb_noauth;
uint64_t acb_dsobj;
zio_t *acb_zio_dummy;
+ zio_t *acb_zio_head;
arc_callback_t *acb_next;
};
DEFINE_ARC_BUF_HDR_EVENT(zfs_arc__delete);
DEFINE_ARC_BUF_HDR_EVENT(zfs_new_state__mru);
DEFINE_ARC_BUF_HDR_EVENT(zfs_new_state__mfu);
-DEFINE_ARC_BUF_HDR_EVENT(zfs_arc__sync__wait__for__async);
+DEFINE_ARC_BUF_HDR_EVENT(zfs_arc__async__upgrade__sync);
DEFINE_ARC_BUF_HDR_EVENT(zfs_arc__demand__hit__predictive__prefetch);
DEFINE_ARC_BUF_HDR_EVENT(zfs_l2arc__hit);
DEFINE_ARC_BUF_HDR_EVENT(zfs_l2arc__miss);
extern void vdev_queue_fini(vdev_t *vd);
extern zio_t *vdev_queue_io(zio_t *zio);
extern void vdev_queue_io_done(zio_t *zio);
+extern void vdev_queue_change_io_priority(zio_t *zio, zio_priority_t priority);
extern int vdev_queue_length(vdev_t *vd);
extern uint64_t vdev_queue_last_offset(vdev_t *vd);
extern void zio_vdev_io_reissue(zio_t *zio);
extern void zio_vdev_io_redone(zio_t *zio);
+extern void zio_change_priority(zio_t *pio, zio_priority_t priority);
+
extern void zio_checksum_verified(zio_t *zio);
extern int zio_worst_error(int e1, int e2);
kstat_named_t arcstat_dnode_limit;
kstat_named_t arcstat_meta_max;
kstat_named_t arcstat_meta_min;
- kstat_named_t arcstat_sync_wait_for_async;
+ kstat_named_t arcstat_async_upgrade_sync;
kstat_named_t arcstat_demand_hit_predictive_prefetch;
kstat_named_t arcstat_demand_hit_prescient_prefetch;
kstat_named_t arcstat_need_free;
{ "arc_dnode_limit", KSTAT_DATA_UINT64 },
{ "arc_meta_max", KSTAT_DATA_UINT64 },
{ "arc_meta_min", KSTAT_DATA_UINT64 },
- { "sync_wait_for_async", KSTAT_DATA_UINT64 },
+ { "async_upgrade_sync", KSTAT_DATA_UINT64 },
{ "demand_hit_predictive_prefetch", KSTAT_DATA_UINT64 },
{ "demand_hit_prescient_prefetch", KSTAT_DATA_UINT64 },
{ "arc_need_free", KSTAT_DATA_UINT64 },
*arc_flags |= ARC_FLAG_CACHED;
if (HDR_IO_IN_PROGRESS(hdr)) {
+ zio_t *head_zio = hdr->b_l1hdr.b_acb->acb_zio_head;
+ ASSERT3P(head_zio, !=, NULL);
if ((hdr->b_flags & ARC_FLAG_PRIO_ASYNC_READ) &&
priority == ZIO_PRIORITY_SYNC_READ) {
/*
- * This sync read must wait for an
- * in-progress async read (e.g. a predictive
- * prefetch). Async reads are queued
- * separately at the vdev_queue layer, so
- * this is a form of priority inversion.
- * Ideally, we would "inherit" the demand
- * i/o's priority by moving the i/o from
- * the async queue to the synchronous queue,
- * but there is currently no mechanism to do
- * so. Track this so that we can evaluate
- * the magnitude of this potential performance
- * problem.
- *
- * Note that if the prefetch i/o is already
- * active (has been issued to the device),
- * the prefetch improved performance, because
- * we issued it sooner than we would have
- * without the prefetch.
+ * This is a sync read that needs to wait for
+ * an in-flight async read. Request that the
+ * zio have its priority upgraded.
*/
- DTRACE_PROBE1(arc__sync__wait__for__async,
+ zio_change_priority(head_zio, priority);
+ DTRACE_PROBE1(arc__async__upgrade__sync,
arc_buf_hdr_t *, hdr);
- ARCSTAT_BUMP(arcstat_sync_wait_for_async);
+ ARCSTAT_BUMP(arcstat_async_upgrade_sync);
}
if (hdr->b_flags & ARC_FLAG_PREDICTIVE_PREFETCH) {
arc_hdr_clear_flags(hdr,
spa, NULL, NULL, NULL, zio_flags);
ASSERT3P(acb->acb_done, !=, NULL);
+ acb->acb_zio_head = head_zio;
acb->acb_next = hdr->b_l1hdr.b_acb;
hdr->b_l1hdr.b_acb = acb;
mutex_exit(hash_lock);
vd = NULL;
}
- if (priority == ZIO_PRIORITY_ASYNC_READ)
+ /*
+ * We count both async reads and scrub IOs as asynchronous so
+ * that both can be upgraded in the event of a cache hit while
+ * the read IO is still in-flight.
+ */
+ if (priority == ZIO_PRIORITY_ASYNC_READ ||
+ priority == ZIO_PRIORITY_SCRUB)
arc_hdr_set_flags(hdr, ARC_FLAG_PRIO_ASYNC_READ);
else
arc_hdr_clear_flags(hdr, ARC_FLAG_PRIO_ASYNC_READ);
- if (hash_lock != NULL)
- mutex_exit(hash_lock);
-
/*
* At this point, we have a level 1 cache miss. Try again in
* L2ARC if possible.
ZIO_FLAG_CANFAIL |
ZIO_FLAG_DONT_PROPAGATE |
ZIO_FLAG_DONT_RETRY, B_FALSE);
+ acb->acb_zio_head = rzio;
+
+ if (hash_lock != NULL)
+ mutex_exit(hash_lock);
DTRACE_PROBE2(l2arc__read, vdev_t *, vd,
zio_t *, rzio);
goto out;
/* l2arc read error; goto zio_read() */
+ if (hash_lock != NULL)
+ mutex_enter(hash_lock);
} else {
DTRACE_PROBE1(l2arc__miss,
arc_buf_hdr_t *, hdr);
rzio = zio_read(pio, spa, bp, hdr_abd, size,
arc_read_done, hdr, priority, zio_flags, zb);
+ acb->acb_zio_head = rzio;
+
+ if (hash_lock != NULL)
+ mutex_exit(hash_lock);
if (*arc_flags & ARC_FLAG_WAIT) {
rc = zio_wait(rzio);
/* issue the prefetch asynchronously */
(void) arc_read(scn->scn_zio_root, scn->scn_dp->dp_spa,
&spic->spic_bp, dsl_scan_prefetch_cb, spic->spic_spc,
- ZIO_PRIORITY_ASYNC_READ, zio_flags, &flags, &spic->spic_zb);
+ ZIO_PRIORITY_SCRUB, zio_flags, &flags, &spic->spic_zb);
kmem_free(spic, sizeof (scan_prefetch_issue_ctx_t));
}
arc_buf_t *buf;
err = arc_read(NULL, dp->dp_spa, bp, arc_getbuf_func, &buf,
- ZIO_PRIORITY_ASYNC_READ, zio_flags, &flags, zb);
+ ZIO_PRIORITY_SCRUB, zio_flags, &flags, zb);
if (err) {
scn->scn_phys.scn_errors++;
return (err);
}
err = arc_read(NULL, dp->dp_spa, bp, arc_getbuf_func, &buf,
- ZIO_PRIORITY_ASYNC_READ, zio_flags, &flags, zb);
+ ZIO_PRIORITY_SCRUB, zio_flags, &flags, zb);
if (err) {
scn->scn_phys.scn_errors++;
return (err);
arc_buf_t *buf;
err = arc_read(NULL, dp->dp_spa, bp, arc_getbuf_func, &buf,
- ZIO_PRIORITY_ASYNC_READ, zio_flags, &flags, zb);
+ ZIO_PRIORITY_SCRUB, zio_flags, &flags, zb);
if (err) {
scn->scn_phys.scn_errors++;
return (err);
mutex_exit(&vq->vq_lock);
}
+void
+vdev_queue_change_io_priority(zio_t *zio, zio_priority_t priority)
+{
+ vdev_queue_t *vq = &zio->io_vd->vdev_queue;
+ avl_tree_t *tree;
+ /*
+ * Change the priority of 'zio' within the queue of the vdev it is
+ * addressed to. zio->io_vd is dereferenced above without a NULL
+ * check, so the caller must only pass a zio that has a vdev.
+ *
+ * Both the zio's current priority and the requested one must be
+ * queueable priorities.
+ * NOTE(review): zio_change_priority() calls this for every leaf-vdev
+ * zio it reaches while walking children; confirm that none of those
+ * zios can carry a priority outside the queueable range, otherwise
+ * the first assertion below fires.
+ */
+ ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE);
+ ASSERT3U(priority, <, ZIO_PRIORITY_NUM_QUEUEABLE);
+ /*
+ * A requested priority that does not belong to the zio's I/O type is
+ * replaced by the async priority of that type: reads accept sync
+ * read, async read and scrub; anything that is not a read is asserted
+ * to be a write and accepts sync write and async write.
+ */
+ if (zio->io_type == ZIO_TYPE_READ) {
+ if (priority != ZIO_PRIORITY_SYNC_READ &&
+ priority != ZIO_PRIORITY_ASYNC_READ &&
+ priority != ZIO_PRIORITY_SCRUB)
+ priority = ZIO_PRIORITY_ASYNC_READ;
+ } else {
+ ASSERT(zio->io_type == ZIO_TYPE_WRITE);
+ if (priority != ZIO_PRIORITY_SYNC_WRITE &&
+ priority != ZIO_PRIORITY_ASYNC_WRITE)
+ priority = ZIO_PRIORITY_ASYNC_WRITE;
+ }
+
+ mutex_enter(&vq->vq_lock);
+
+ /*
+ * If the zio is in none of the queues we can simply change
+ * the priority. If the zio is waiting to be submitted we must
+ * remove it from the queue and re-insert it with the new priority.
+ * Otherwise, the zio is currently active and we cannot change its
+ * priority.
+ *
+ * "Waiting" is tested by looking the zio up in the tree that
+ * vdev_queue_class_tree() returns for its current priority; "active"
+ * is tested against vq_active_tree. The tree handed to avl_remove()
+ * is the same one as 'tree', because io_priority has not been
+ * modified yet at that point. The whole check-and-update sequence
+ * runs with vq_lock held.
+ */
+ tree = vdev_queue_class_tree(vq, zio->io_priority);
+ if (avl_find(tree, zio, NULL) == zio) {
+ avl_remove(vdev_queue_class_tree(vq, zio->io_priority), zio);
+ zio->io_priority = priority;
+ avl_add(vdev_queue_class_tree(vq, zio->io_priority), zio);
+ } else if (avl_find(&vq->vq_active_tree, zio, NULL) != zio) {
+ zio->io_priority = priority;
+ }
+
+ mutex_exit(&vq->vq_lock);
+}
+
/*
* As these two methods are only used for load calculations we're not
* concerned if we get an incorrect value on 32bit platforms due to lack of
{
list_t *cl = &pio->io_child_list;
+ ASSERT(MUTEX_HELD(&pio->io_lock));
+
*zl = (*zl == NULL) ? list_head(cl) : list_next(cl, *zl);
if (*zl == NULL)
return (NULL);
zl->zl_parent = pio;
zl->zl_child = cio;
- mutex_enter(&cio->io_lock);
mutex_enter(&pio->io_lock);
+ mutex_enter(&cio->io_lock);
ASSERT(pio->io_state[ZIO_WAIT_DONE] == 0);
pio->io_child_count++;
cio->io_parent_count++;
- mutex_exit(&pio->io_lock);
mutex_exit(&cio->io_lock);
+ mutex_exit(&pio->io_lock);
}
static void
ASSERT(zl->zl_parent == pio);
ASSERT(zl->zl_child == cio);
- mutex_enter(&cio->io_lock);
mutex_enter(&pio->io_lock);
+ mutex_enter(&cio->io_lock);
list_remove(&pio->io_child_list, zl);
list_remove(&cio->io_parent_list, zl);
pio->io_child_count--;
cio->io_parent_count--;
- mutex_exit(&pio->io_lock);
mutex_exit(&cio->io_lock);
+ mutex_exit(&pio->io_lock);
kmem_cache_free(zio_link_cache, zl);
}
* cannot be affected by any side effects of reexecuting 'cio'.
*/
zio_link_t *zl = NULL;
+ mutex_enter(&pio->io_lock);
for (cio = zio_walk_children(pio, &zl); cio != NULL; cio = cio_next) {
cio_next = zio_walk_children(pio, &zl);
- mutex_enter(&pio->io_lock);
for (int w = 0; w < ZIO_WAIT_TYPES; w++)
pio->io_children[cio->io_child_type][w]++;
mutex_exit(&pio->io_lock);
zio_reexecute(cio);
+ mutex_enter(&pio->io_lock);
}
+ mutex_exit(&pio->io_lock);
/*
* Now that all children have been reexecuted, execute the parent.
return (ZIO_PIPELINE_CONTINUE);
}
+/*
+ * This function is used to change the priority of an existing zio that is
+ * currently in-flight. This is used by the arc to upgrade priority in the
+ * event that a demand read is made for a block that is currently queued
+ * as a scrub or async read IO. Otherwise, the high priority read request
+ * would end up having to wait for the lower priority IO.
+ *
+ * 'priority' must be a queueable priority (asserted below). The new
+ * priority is applied to 'pio' itself and then, recursively, to every zio
+ * on pio's child list. A zio addressed to a leaf vdev is handed to
+ * vdev_queue_change_io_priority(); any other zio simply has its
+ * io_priority field overwritten.
+ */
+void
+zio_change_priority(zio_t *pio, zio_priority_t priority)
+{
+ zio_t *cio, *cio_next;
+ zio_link_t *zl = NULL;
+
+ ASSERT3U(priority, <, ZIO_PRIORITY_NUM_QUEUEABLE);
+ /*
+ * NOTE(review): in the non-leaf case io_priority is stored before
+ * pio->io_lock is acquired further down; confirm that no reader
+ * relies on io_lock to observe a stable io_priority.
+ */
+ if (pio->io_vd != NULL && pio->io_vd->vdev_ops->vdev_op_leaf) {
+ vdev_queue_change_io_priority(pio, priority);
+ } else {
+ pio->io_priority = priority;
+ }
+ /*
+ * Walk the children with pio->io_lock held for the whole loop,
+ * including the recursive calls, which in turn take each child's
+ * io_lock (parent before child). The next child is fetched before
+ * recursing into the current one.
+ */
+ mutex_enter(&pio->io_lock);
+ for (cio = zio_walk_children(pio, &zl); cio != NULL; cio = cio_next) {
+ cio_next = zio_walk_children(pio, &zl);
+ zio_change_priority(cio, priority);
+ }
+ mutex_exit(&pio->io_lock);
+}
+
/*
* For non-raidz ZIOs, we can just copy aside the bad data read from the
* disk, and use that to finish the checksum ereport later.
echo $demand_reads
}
-function get_sync_wait_for_async
+function get_async_upgrade_sync
{
- typeset -l sync_wait=`awk '$1 == "sync_wait_for_async" \
+ typeset -l sync_wait=`awk '$1 == "async_upgrade_sync" \
{ print $3 }' $zfs_kstats/arcstats`
echo $sync_wait
interval=$2
prefetch_ios=$(get_prefetch_ios)
prefetched_demand_reads=$(get_prefetched_demand_reads)
-sync_wait_for_async=$(get_sync_wait_for_async)
+async_upgrade_sync=$(get_async_upgrade_sync)
while true
do
$(( $new_prefetched_demand_reads - $prefetched_demand_reads ))
prefetched_demand_reads=$new_prefetched_demand_reads
- new_sync_wait_for_async=$(get_sync_wait_for_async)
- printf "%-24s\t%u\n" "sync_wait_for_async" \
- $(( $new_sync_wait_for_async - $sync_wait_for_async ))
- sync_wait_for_async=$new_sync_wait_for_async
+ new_async_upgrade_sync=$(get_async_upgrade_sync)
+ printf "%-24s\t%u\n" "async_upgrade_sync" \
+ $(( $new_async_upgrade_sync - $async_upgrade_sync ))
+ async_upgrade_sync=$new_async_upgrade_sync
sleep $interval
done