SPA_LOAD_IMPORT, /* import in progress */
SPA_LOAD_TRYIMPORT, /* tryimport in progress */
SPA_LOAD_RECOVER, /* recovery requested */
- SPA_LOAD_ERROR /* load failed */
+ SPA_LOAD_ERROR, /* load failed */
+ SPA_LOAD_CREATE /* creation in progress */
} spa_load_state_t;
/*
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2011, 2014 by Delphix. All rights reserved.
+ * Copyright (c) 2011, 2015 by Delphix. All rights reserved.
*/
#ifndef _SYS_METASLAB_H
void metaslab_sync_reassess(metaslab_group_t *);
uint64_t metaslab_block_maxsize(metaslab_t *);
-#define METASLAB_HINTBP_FAVOR 0x0
-#define METASLAB_HINTBP_AVOID 0x1
-#define METASLAB_GANG_HEADER 0x2
-#define METASLAB_GANG_CHILD 0x4
-#define METASLAB_GANG_AVOID 0x8
-#define METASLAB_FASTWRITE 0x10
+#define METASLAB_HINTBP_FAVOR 0x0
+#define METASLAB_HINTBP_AVOID 0x1
+#define METASLAB_GANG_HEADER 0x2
+#define METASLAB_GANG_CHILD 0x4
+#define METASLAB_ASYNC_ALLOC 0x8
+#define METASLAB_DONT_THROTTLE 0x10
+#define METASLAB_FASTWRITE 0x20
int metaslab_alloc(spa_t *, metaslab_class_t *, uint64_t,
- blkptr_t *, int, uint64_t, blkptr_t *, int);
+ blkptr_t *, int, uint64_t, blkptr_t *, int, zio_t *);
void metaslab_free(spa_t *, const blkptr_t *, uint64_t, boolean_t);
int metaslab_claim(spa_t *, const blkptr_t *, uint64_t);
void metaslab_check_free(spa_t *, const blkptr_t *);
void metaslab_class_histogram_verify(metaslab_class_t *);
uint64_t metaslab_class_fragmentation(metaslab_class_t *);
uint64_t metaslab_class_expandable_space(metaslab_class_t *);
+boolean_t metaslab_class_throttle_reserve(metaslab_class_t *, int,
+ zio_t *, int);
+void metaslab_class_throttle_unreserve(metaslab_class_t *, int, zio_t *);
void metaslab_class_space_update(metaslab_class_t *, int64_t, int64_t,
int64_t, int64_t);
void metaslab_group_destroy(metaslab_group_t *);
void metaslab_group_activate(metaslab_group_t *);
void metaslab_group_passivate(metaslab_group_t *);
+boolean_t metaslab_group_initialized(metaslab_group_t *);
uint64_t metaslab_group_get_space(metaslab_group_t *);
void metaslab_group_histogram_verify(metaslab_group_t *);
uint64_t metaslab_group_fragmentation(metaslab_group_t *);
void metaslab_group_histogram_remove(metaslab_group_t *, metaslab_t *);
+void metaslab_group_alloc_decrement(spa_t *, uint64_t, void *, int);
+void metaslab_group_alloc_verify(spa_t *, const blkptr_t *, void *);
#ifdef __cplusplus
}
*/
/*
- * Copyright (c) 2011, 2014 by Delphix. All rights reserved.
+ * Copyright (c) 2011, 2015 by Delphix. All rights reserved.
*/
#ifndef _SYS_METASLAB_IMPL_H
* to use a block allocator that best suits that class.
*/
struct metaslab_class {
+ kmutex_t mc_lock;
spa_t *mc_spa;
metaslab_group_t *mc_rotor;
metaslab_ops_t *mc_ops;
uint64_t mc_aliquot;
+
+ /*
+ * Track the number of metaslab groups that have been initialized
+ * and can accept allocations. An initialized metaslab group is
+ * one that has been completely added to the config (i.e. we have
+ * updated the MOS config and the space has been added to the pool).
+ */
+ uint64_t mc_groups;
+
+ /*
+ * Toggle to enable/disable the allocation throttle.
+ */
+ boolean_t mc_alloc_throttle_enabled;
+
+ /*
+ * The allocation throttle works on a reservation system. Whenever
+ * an asynchronous zio wants to perform an allocation it must
+ * first reserve the number of blocks that it wants to allocate.
+ * If there aren't sufficient slots available for the pending zio
+ * then that I/O is throttled until more slots free up. The current
+ * number of reserved allocations is maintained by the mc_alloc_slots
+ * refcount. The mc_alloc_max_slots value determines the maximum
+ * number of allocations that the system allows. Gang blocks are
+ * allowed to reserve slots even if we've reached the maximum
+ * number of allocations allowed.
+ */
+ uint64_t mc_alloc_max_slots;
+ refcount_t mc_alloc_slots;
+
uint64_t mc_alloc_groups; /* # of allocatable groups */
+
uint64_t mc_alloc; /* total allocated space */
uint64_t mc_deferred; /* total deferred frees */
uint64_t mc_space; /* total space (alloc + free) */
avl_tree_t mg_metaslab_tree;
uint64_t mg_aliquot;
boolean_t mg_allocatable; /* can we allocate? */
+
+ /*
+ * A metaslab group is considered to be initialized only after
+ * we have updated the MOS config and added the space to the pool.
+ * We only allow allocation attempts to a metaslab group if it
+ * has been initialized.
+ */
+ boolean_t mg_initialized;
+
uint64_t mg_free_capacity; /* percentage free */
int64_t mg_bias;
int64_t mg_activation_count;
taskq_t *mg_taskq;
metaslab_group_t *mg_prev;
metaslab_group_t *mg_next;
+
+ /*
+ * Each metaslab group can handle mg_max_alloc_queue_depth allocations
+ * which are tracked by mg_alloc_queue_depth. It's possible for a
+ * metaslab group to handle more allocations than its max. This
+ * can occur when gang blocks are required or when other groups
+ * are unable to handle their share of allocations.
+ */
+ uint64_t mg_max_alloc_queue_depth;
+ refcount_t mg_alloc_queue_depth;
+
+ /*
+ * A metaslab group that can no longer allocate the minimum block
+ * size will set mg_no_free_space. Once a metaslab group is out
+ * of space then its share of work must be distributed to other
+ * groups.
+ */
+ boolean_t mg_no_free_space;
+
+ uint64_t mg_allocations;
+ uint64_t mg_failed_allocations;
uint64_t mg_fragmentation;
uint64_t mg_histogram[RANGE_TREE_HISTOGRAM_SIZE];
};
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
*/
#ifndef _SYS_REFCOUNT_H
void refcount_create(refcount_t *rc);
void refcount_create_untracked(refcount_t *rc);
+void refcount_create_tracked(refcount_t *rc);
void refcount_destroy(refcount_t *rc);
void refcount_destroy_many(refcount_t *rc, uint64_t number);
int refcount_is_zero(refcount_t *rc);
int64_t refcount_remove_many(refcount_t *rc, uint64_t number, void *holder_tag);
void refcount_transfer(refcount_t *dst, refcount_t *src);
void refcount_transfer_ownership(refcount_t *, void *, void *);
+boolean_t refcount_held(refcount_t *, void *);
+boolean_t refcount_not_held(refcount_t *, void *);
void refcount_init(void);
void refcount_fini(void);
#define refcount_create(rc) ((rc)->rc_count = 0)
#define refcount_create_untracked(rc) ((rc)->rc_count = 0)
+#define refcount_create_tracked(rc) ((rc)->rc_count = 0)
#define refcount_destroy(rc) ((rc)->rc_count = 0)
#define refcount_destroy_many(rc, number) ((rc)->rc_count = 0)
#define refcount_is_zero(rc) ((rc)->rc_count == 0)
atomic_add_64(&(dst)->rc_count, __tmp); \
}
#define refcount_transfer_ownership(rc, current_holder, new_holder) (void)0
+#define refcount_held(rc, holder) ((rc)->rc_count > 0)
+#define refcount_not_held(rc, holder) (B_TRUE)
#define refcount_init()
#define refcount_fini()
uint64_t spa_last_synced_guid; /* last synced guid */
list_t spa_config_dirty_list; /* vdevs with dirty config */
list_t spa_state_dirty_list; /* vdevs with dirty state */
+ kmutex_t spa_alloc_lock;
+ avl_tree_t spa_alloc_tree;
spa_aux_vdev_t spa_spares; /* hot spares */
spa_aux_vdev_t spa_l2cache; /* L2ARC cache devices */
nvlist_t *spa_label_features; /* Features for reading MOS */
typedef struct vdev_cache vdev_cache_t;
typedef struct vdev_cache_entry vdev_cache_entry_t;
+extern int zfs_vdev_queue_depth_pct;
+extern uint32_t zfs_vdev_async_write_max_active;
+
/*
* Virtual device operations
*/
uint64_t vdev_deflate_ratio; /* deflation ratio (x512) */
uint64_t vdev_islog; /* is an intent log device */
uint64_t vdev_removing; /* device is being removed? */
- boolean_t vdev_ishole; /* is a hole in the namespace */
+ boolean_t vdev_ishole; /* is a hole in the namespace */
+ kmutex_t vdev_queue_lock; /* protects vdev_queue_depth */
uint64_t vdev_top_zap;
+ /*
+ * The queue depth parameters determine how many async writes are
+ * still pending (i.e. allocated but not yet issued to disk) per
+ * top-level (vdev_async_write_queue_depth) and the maximum allowed
+ * (vdev_max_async_write_queue_depth). These values only apply to
+ * top-level vdevs.
+ */
+ uint64_t vdev_async_write_queue_depth;
+ uint64_t vdev_max_async_write_queue_depth;
+
/*
* Leaf vdev state.
*/
ZIO_FLAG_DONT_CACHE = 1 << 11,
ZIO_FLAG_NODATA = 1 << 12,
ZIO_FLAG_INDUCE_DAMAGE = 1 << 13,
+ ZIO_FLAG_IO_ALLOCATING = 1 << 14,
#define ZIO_FLAG_DDT_INHERIT (ZIO_FLAG_IO_RETRY - 1)
#define ZIO_FLAG_GANG_INHERIT (ZIO_FLAG_IO_RETRY - 1)
/*
* Flags inherited by vdev children.
*/
- ZIO_FLAG_IO_RETRY = 1 << 14, /* must be first for INHERIT */
- ZIO_FLAG_PROBE = 1 << 15,
- ZIO_FLAG_TRYHARD = 1 << 16,
- ZIO_FLAG_OPTIONAL = 1 << 17,
+ ZIO_FLAG_IO_RETRY = 1 << 15, /* must be first for INHERIT */
+ ZIO_FLAG_PROBE = 1 << 16,
+ ZIO_FLAG_TRYHARD = 1 << 17,
+ ZIO_FLAG_OPTIONAL = 1 << 18,
#define ZIO_FLAG_VDEV_INHERIT (ZIO_FLAG_DONT_QUEUE - 1)
/*
* Flags not inherited by any children.
*/
- ZIO_FLAG_DONT_QUEUE = 1 << 18, /* must be first for INHERIT */
- ZIO_FLAG_DONT_PROPAGATE = 1 << 19,
- ZIO_FLAG_IO_BYPASS = 1 << 20,
- ZIO_FLAG_IO_REWRITE = 1 << 21,
- ZIO_FLAG_RAW = 1 << 22,
- ZIO_FLAG_GANG_CHILD = 1 << 23,
- ZIO_FLAG_DDT_CHILD = 1 << 24,
- ZIO_FLAG_GODFATHER = 1 << 25,
- ZIO_FLAG_NOPWRITE = 1 << 26,
- ZIO_FLAG_REEXECUTED = 1 << 27,
- ZIO_FLAG_DELEGATED = 1 << 28,
- ZIO_FLAG_FASTWRITE = 1 << 29,
+ ZIO_FLAG_DONT_QUEUE = 1 << 19, /* must be first for INHERIT */
+ ZIO_FLAG_DONT_PROPAGATE = 1 << 20,
+ ZIO_FLAG_IO_BYPASS = 1 << 21,
+ ZIO_FLAG_IO_REWRITE = 1 << 22,
+ ZIO_FLAG_RAW = 1 << 23,
+ ZIO_FLAG_GANG_CHILD = 1 << 24,
+ ZIO_FLAG_DDT_CHILD = 1 << 25,
+ ZIO_FLAG_GODFATHER = 1 << 26,
+ ZIO_FLAG_NOPWRITE = 1 << 27,
+ ZIO_FLAG_REEXECUTED = 1 << 28,
+ ZIO_FLAG_DELEGATED = 1 << 29,
+ ZIO_FLAG_FASTWRITE = 1 << 30
};
#define ZIO_FLAG_MUSTSUCCEED 0
typedef void zio_done_func_t(zio_t *zio);
+extern int zio_dva_throttle_enabled;
extern const char *zio_type_name[ZIO_TYPES];
/*
blkptr_t io_bp_copy;
list_t io_parent_list;
list_t io_child_list;
- zio_link_t *io_walk_link;
zio_t *io_logical;
zio_transform_t *io_transform_stack;
uint64_t io_offset;
hrtime_t io_timestamp; /* submitted at */
+ hrtime_t io_queued_timestamp;
hrtime_t io_target_timestamp;
hrtime_t io_delta; /* vdev queue service delta */
hrtime_t io_delay; /* Device access time (disk or */
/* file). */
avl_node_t io_queue_node;
avl_node_t io_offset_node;
+ avl_node_t io_alloc_node;
/* Internal pipeline state */
enum zio_flag io_flags;
enum zio_flag io_orig_flags;
enum zio_stage io_orig_stage;
enum zio_stage io_orig_pipeline;
+ enum zio_stage io_pipeline_trace;
int io_error;
int io_child_error[ZIO_CHILD_TYPES];
uint64_t io_children[ZIO_CHILD_TYPES][ZIO_WAIT_TYPES];
taskq_ent_t io_tqent;
};
+extern int zio_timestamp_compare(const void *, const void *);
+
extern zio_t *zio_null(zio_t *pio, spa_t *spa, vdev_t *vd,
zio_done_func_t *done, void *private, enum zio_flag flags);
extern void zio_delay_init(zio_t *zio);
extern void zio_delay_interrupt(zio_t *zio);
-extern zio_t *zio_walk_parents(zio_t *cio);
-extern zio_t *zio_walk_children(zio_t *pio);
+extern zio_t *zio_walk_parents(zio_t *cio, zio_link_t **);
+extern zio_t *zio_walk_children(zio_t *pio, zio_link_t **);
extern zio_t *zio_unique_parent(zio_t *cio);
extern void zio_add_child(zio_t *pio, zio_t *cio);
*/
/*
- * Copyright (c) 2013 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
*/
#ifndef _ZIO_IMPL_H
ZIO_STAGE_OPEN = 1 << 0, /* RWFCI */
ZIO_STAGE_READ_BP_INIT = 1 << 1, /* R---- */
- ZIO_STAGE_FREE_BP_INIT = 1 << 2, /* --F-- */
- ZIO_STAGE_ISSUE_ASYNC = 1 << 3, /* RWF-- */
- ZIO_STAGE_WRITE_BP_INIT = 1 << 4, /* -W--- */
+ ZIO_STAGE_WRITE_BP_INIT = 1 << 2, /* -W--- */
+ ZIO_STAGE_FREE_BP_INIT = 1 << 3, /* --F-- */
+ ZIO_STAGE_ISSUE_ASYNC = 1 << 4, /* RWF-- */
+ ZIO_STAGE_WRITE_COMPRESS = 1 << 5, /* -W--- */
- ZIO_STAGE_CHECKSUM_GENERATE = 1 << 5, /* -W--- */
+ ZIO_STAGE_CHECKSUM_GENERATE = 1 << 6, /* -W--- */
- ZIO_STAGE_NOP_WRITE = 1 << 6, /* -W--- */
+ ZIO_STAGE_NOP_WRITE = 1 << 7, /* -W--- */
- ZIO_STAGE_DDT_READ_START = 1 << 7, /* R---- */
- ZIO_STAGE_DDT_READ_DONE = 1 << 8, /* R---- */
- ZIO_STAGE_DDT_WRITE = 1 << 9, /* -W--- */
- ZIO_STAGE_DDT_FREE = 1 << 10, /* --F-- */
+ ZIO_STAGE_DDT_READ_START = 1 << 8, /* R---- */
+ ZIO_STAGE_DDT_READ_DONE = 1 << 9, /* R---- */
+ ZIO_STAGE_DDT_WRITE = 1 << 10, /* -W--- */
+ ZIO_STAGE_DDT_FREE = 1 << 11, /* --F-- */
- ZIO_STAGE_GANG_ASSEMBLE = 1 << 11, /* RWFC- */
- ZIO_STAGE_GANG_ISSUE = 1 << 12, /* RWFC- */
+ ZIO_STAGE_GANG_ASSEMBLE = 1 << 12, /* RWFC- */
+ ZIO_STAGE_GANG_ISSUE = 1 << 13, /* RWFC- */
- ZIO_STAGE_DVA_ALLOCATE = 1 << 13, /* -W--- */
- ZIO_STAGE_DVA_FREE = 1 << 14, /* --F-- */
- ZIO_STAGE_DVA_CLAIM = 1 << 15, /* ---C- */
+ ZIO_STAGE_DVA_THROTTLE = 1 << 14, /* -W--- */
+ ZIO_STAGE_DVA_ALLOCATE = 1 << 15, /* -W--- */
+ ZIO_STAGE_DVA_FREE = 1 << 16, /* --F-- */
+ ZIO_STAGE_DVA_CLAIM = 1 << 17, /* ---C- */
- ZIO_STAGE_READY = 1 << 16, /* RWFCI */
+ ZIO_STAGE_READY = 1 << 18, /* RWFCI */
- ZIO_STAGE_VDEV_IO_START = 1 << 17, /* RW--I */
- ZIO_STAGE_VDEV_IO_DONE = 1 << 18, /* RW--I */
- ZIO_STAGE_VDEV_IO_ASSESS = 1 << 19, /* RW--I */
+ ZIO_STAGE_VDEV_IO_START = 1 << 19, /* RW--I */
+ ZIO_STAGE_VDEV_IO_DONE = 1 << 20, /* RW--I */
+ ZIO_STAGE_VDEV_IO_ASSESS = 1 << 21, /* RW--I */
- ZIO_STAGE_CHECKSUM_VERIFY = 1 << 20, /* R---- */
+ ZIO_STAGE_CHECKSUM_VERIFY = 1 << 22, /* R---- */
- ZIO_STAGE_DONE = 1 << 21 /* RWFCI */
+ ZIO_STAGE_DONE = 1 << 23 /* RWFCI */
};
#define ZIO_INTERLOCK_STAGES \
#define ZIO_REWRITE_PIPELINE \
(ZIO_WRITE_COMMON_STAGES | \
+ ZIO_STAGE_WRITE_COMPRESS | \
ZIO_STAGE_WRITE_BP_INIT)
#define ZIO_WRITE_PIPELINE \
(ZIO_WRITE_COMMON_STAGES | \
ZIO_STAGE_WRITE_BP_INIT | \
+ ZIO_STAGE_WRITE_COMPRESS | \
+ ZIO_STAGE_DVA_THROTTLE | \
ZIO_STAGE_DVA_ALLOCATE)
#define ZIO_DDT_CHILD_WRITE_PIPELINE \
(ZIO_INTERLOCK_STAGES | \
ZIO_VDEV_IO_STAGES | \
+ ZIO_STAGE_DVA_THROTTLE | \
ZIO_STAGE_DVA_ALLOCATE)
#define ZIO_DDT_WRITE_PIPELINE \
(ZIO_INTERLOCK_STAGES | \
- ZIO_STAGE_ISSUE_ASYNC | \
ZIO_STAGE_WRITE_BP_INIT | \
+ ZIO_STAGE_ISSUE_ASYNC | \
+ ZIO_STAGE_WRITE_COMPRESS | \
ZIO_STAGE_CHECKSUM_GENERATE | \
ZIO_STAGE_DDT_WRITE)
Default value: \fB10\fR.
.RE
+.sp
+.ne 2
+.na
+\fBzfs_vdev_queue_depth_pct\fR (int)
+.ad
+.RS 12n
+The queue depth percentage for each top-level virtual device.
+Used in conjunction with zfs_vdev_async_write_max_active.
+.sp
+Default value: \fB1000\fR.
+.RE
+
.sp
.ne 2
.na
Default value: \fB30,000\fR.
.RE
+.sp
+.ne 2
+.na
+\fBzio_dva_throttle_enabled\fR (int)
+.ad
+.RS 12n
+Throttle block allocations in the ZIO pipeline. This allows for
+dynamic allocation distribution when devices are imbalanced.
+.sp
+Default value: \fB1\fR.
+.RE
+
.sp
.ne 2
.na
#define WITH_DF_BLOCK_ALLOCATOR
-/*
- * Allow allocations to switch to gang blocks quickly. We do this to
- * avoid having to load lots of space_maps in a given txg. There are,
- * however, some cases where we want to avoid "fast" ganging and instead
- * we want to do an exhaustive search of all metaslabs on this device.
- * Currently we don't allow any gang, slog, or dump device related allocations
- * to "fast" gang.
- */
-#define CAN_FASTGANG(flags) \
- (!((flags) & (METASLAB_GANG_CHILD | METASLAB_GANG_HEADER | \
- METASLAB_GANG_AVOID)))
+#define GANG_ALLOCATION(flags) \
+ ((flags) & (METASLAB_GANG_CHILD | METASLAB_GANG_HEADER))
#define METASLAB_WEIGHT_PRIMARY (1ULL << 63)
#define METASLAB_WEIGHT_SECONDARY (1ULL << 62)
mc->mc_spa = spa;
mc->mc_rotor = NULL;
mc->mc_ops = ops;
+ mutex_init(&mc->mc_lock, NULL, MUTEX_DEFAULT, NULL);
+ refcount_create_tracked(&mc->mc_alloc_slots);
return (mc);
}
ASSERT(mc->mc_space == 0);
ASSERT(mc->mc_dspace == 0);
+ refcount_destroy(&mc->mc_alloc_slots);
+ mutex_destroy(&mc->mc_lock);
kmem_free(mc, sizeof (metaslab_class_t));
}
/*
* Update the allocatable flag and the metaslab group's capacity.
* The allocatable flag is set to true if the capacity is below
- * the zfs_mg_noalloc_threshold. If a metaslab group transitions
- * from allocatable to non-allocatable or vice versa then the metaslab
- * group's class is updated to reflect the transition.
+ * the zfs_mg_noalloc_threshold or has a fragmentation value that is
+ * greater than zfs_mg_fragmentation_threshold. If a metaslab group
+ * transitions from allocatable to non-allocatable or vice versa then the
+ * metaslab group's class is updated to reflect the transition.
*/
static void
metaslab_group_alloc_update(metaslab_group_t *mg)
metaslab_class_t *mc = mg->mg_class;
vdev_stat_t *vs = &vd->vdev_stat;
boolean_t was_allocatable;
+ boolean_t was_initialized;
ASSERT(vd == vd->vdev_top);
mutex_enter(&mg->mg_lock);
was_allocatable = mg->mg_allocatable;
+ was_initialized = mg->mg_initialized;
mg->mg_free_capacity = ((vs->vs_space - vs->vs_alloc) * 100) /
(vs->vs_space + 1);
+ mutex_enter(&mc->mc_lock);
+
+ /*
+ * If the metaslab group was just added then it won't
+ * have any space until we finish syncing out this txg.
+ * At that point we will consider it initialized and available
+ * for allocations. We also don't consider non-activated
+ * metaslab groups (e.g. vdevs that are in the middle of being removed)
+ * to be initialized, because they can't be used for allocation.
+ */
+ mg->mg_initialized = metaslab_group_initialized(mg);
+ if (!was_initialized && mg->mg_initialized) {
+ mc->mc_groups++;
+ } else if (was_initialized && !mg->mg_initialized) {
+ ASSERT3U(mc->mc_groups, >, 0);
+ mc->mc_groups--;
+ }
+ if (mg->mg_initialized)
+ mg->mg_no_free_space = B_FALSE;
+
/*
* A metaslab group is considered allocatable if it has plenty
* of free space or is not heavily fragmented. We only take
* fragmentation into account if the metaslab group has a valid
* fragmentation metric (i.e. a value between 0 and 100).
*/
- mg->mg_allocatable = (mg->mg_free_capacity > zfs_mg_noalloc_threshold &&
+ mg->mg_allocatable = (mg->mg_activation_count > 0 &&
+ mg->mg_free_capacity > zfs_mg_noalloc_threshold &&
(mg->mg_fragmentation == ZFS_FRAG_INVALID ||
mg->mg_fragmentation <= zfs_mg_fragmentation_threshold));
mc->mc_alloc_groups--;
else if (!was_allocatable && mg->mg_allocatable)
mc->mc_alloc_groups++;
+ mutex_exit(&mc->mc_lock);
mutex_exit(&mg->mg_lock);
}
mg->mg_vd = vd;
mg->mg_class = mc;
mg->mg_activation_count = 0;
+ mg->mg_initialized = B_FALSE;
+ mg->mg_no_free_space = B_TRUE;
+ refcount_create_tracked(&mg->mg_alloc_queue_depth);
mg->mg_taskq = taskq_create("metaslab_group_taskq", metaslab_load_pct,
maxclsyspri, 10, INT_MAX, TASKQ_THREADS_CPU_PCT | TASKQ_DYNAMIC);
taskq_destroy(mg->mg_taskq);
avl_destroy(&mg->mg_metaslab_tree);
mutex_destroy(&mg->mg_lock);
+ refcount_destroy(&mg->mg_alloc_queue_depth);
kmem_free(mg, sizeof (metaslab_group_t));
}
mg->mg_next = NULL;
}
+/*
+ * Report whether a metaslab group counts as initialized: the group must
+ * have a positive activation count and its vdev must report a non-zero
+ * vs_space.
+ */
+boolean_t
+metaslab_group_initialized(metaslab_group_t *mg)
+{
+	/* A group that has not been activated is never initialized. */
+	if (mg->mg_activation_count <= 0)
+		return (B_FALSE);
+
+	return (mg->mg_vd->vdev_stat.vs_space != 0);
+}
+
uint64_t
metaslab_group_get_space(metaslab_group_t *mg)
{
* group should avoid allocations if its free capacity is less than the
* zfs_mg_noalloc_threshold or its fragmentation metric is greater than
* zfs_mg_fragmentation_threshold and there is at least one metaslab group
- * that can still handle allocations.
+ * that can still handle allocations. If the allocation throttle is enabled
+ * then we skip allocations to devices that have reached their maximum
+ * allocation queue depth unless the selected metaslab group is the only
+ * eligible group remaining.
*/
static boolean_t
-metaslab_group_allocatable(metaslab_group_t *mg)
+metaslab_group_allocatable(metaslab_group_t *mg, metaslab_group_t *rotor,
+ uint64_t psize)
{
- vdev_t *vd = mg->mg_vd;
- spa_t *spa = vd->vdev_spa;
+ spa_t *spa = mg->mg_vd->vdev_spa;
metaslab_class_t *mc = mg->mg_class;
/*
- * We use two key metrics to determine if a metaslab group is
- * considered allocatable -- free space and fragmentation. If
- * the free space is greater than the free space threshold and
- * the fragmentation is less than the fragmentation threshold then
- * consider the group allocatable. There are two case when we will
- * not consider these key metrics. The first is if the group is
- * associated with a slog device and the second is if all groups
- * in this metaslab class have already been consider ineligible
+ * We can only consider skipping this metaslab group if it's
+ * in the normal metaslab class and there are other metaslab
+ * groups to select from. Otherwise, we always consider it eligible
* for allocations.
*/
- return ((mg->mg_free_capacity > zfs_mg_noalloc_threshold &&
- (mg->mg_fragmentation == ZFS_FRAG_INVALID ||
- mg->mg_fragmentation <= zfs_mg_fragmentation_threshold)) ||
- mc != spa_normal_class(spa) || mc->mc_alloc_groups == 0);
+ if (mc != spa_normal_class(spa) || mc->mc_groups <= 1)
+ return (B_TRUE);
+
+ /*
+ * If the metaslab group's mg_allocatable flag is set (see comments
+ * in metaslab_group_alloc_update() for more information) and
+ * the allocation throttle is disabled then allow allocations to this
+ * device. However, if the allocation throttle is enabled then
+ * check if we have reached our allocation limit (mg_alloc_queue_depth)
+ * to determine if we should allow allocations to this metaslab group.
+ * If all metaslab groups are no longer considered allocatable
+ * (mc_alloc_groups == 0) or we're trying to allocate the smallest
+ * gang block size then we allow allocations on this metaslab group
+ * regardless of the mg_allocatable or throttle settings.
+ */
+ if (mg->mg_allocatable) {
+ metaslab_group_t *mgp;
+ int64_t qdepth;
+ uint64_t qmax = mg->mg_max_alloc_queue_depth;
+
+ if (!mc->mc_alloc_throttle_enabled)
+ return (B_TRUE);
+
+ /*
+ * If this metaslab group does not have any free space, then
+ * there is no point in looking further.
+ */
+ if (mg->mg_no_free_space)
+ return (B_FALSE);
+
+ qdepth = refcount_count(&mg->mg_alloc_queue_depth);
+
+ /*
+ * If this metaslab group is below its qmax or it's
+ * the only allocatable metaslab group, then attempt
+ * to allocate from it.
+ */
+ if (qdepth < qmax || mc->mc_alloc_groups == 1)
+ return (B_TRUE);
+ ASSERT3U(mc->mc_alloc_groups, >, 1);
+
+ /*
+ * Since this metaslab group is at or over its qmax, we
+ * need to determine if there are metaslab groups after this
+ * one that might be able to handle this allocation. This is
+ * racy since we can't hold the locks for all metaslab
+ * groups at the same time when we make this check.
+ */
+ for (mgp = mg->mg_next; mgp != rotor; mgp = mgp->mg_next) {
+ qmax = mgp->mg_max_alloc_queue_depth;
+
+ qdepth = refcount_count(&mgp->mg_alloc_queue_depth);
+
+ /*
+ * If there is another metaslab group that
+ * might be able to handle the allocation, then
+ * we return false so that we skip this group.
+ */
+ if (qdepth < qmax && !mgp->mg_no_free_space)
+ return (B_FALSE);
+ }
+
+ /*
+ * We didn't find another group to handle the allocation
+ * so we can't skip this metaslab group even though
+ * we are at or over our qmax.
+ */
+ return (B_TRUE);
+
+ } else if (mc->mc_alloc_groups == 0 || psize == SPA_MINBLOCKSIZE) {
+ return (B_TRUE);
+ }
+ return (B_FALSE);
}
/*
return (0);
}
+/*
+ * ==========================================================================
+ * Metaslab block operations
+ * ==========================================================================
+ */
+
+/*
+ * Add a reference, tagged with "tag", to the allocation queue depth of the
+ * metaslab group belonging to top-level vdev "vdev". The reference is only
+ * taken when the request carries METASLAB_ASYNC_ALLOC, does not carry
+ * METASLAB_DONT_THROTTLE, and the group's class has its allocation
+ * throttle enabled; otherwise this is a no-op.
+ */
+static void
+metaslab_group_alloc_increment(spa_t *spa, uint64_t vdev, void *tag, int flags)
+{
+	boolean_t throttled = (flags & METASLAB_ASYNC_ALLOC) != 0 &&
+	    (flags & METASLAB_DONT_THROTTLE) == 0;
+
+	if (throttled) {
+		metaslab_group_t *group = vdev_lookup_top(spa, vdev)->vdev_mg;
+
+		if (group->mg_class->mc_alloc_throttle_enabled)
+			(void) refcount_add(&group->mg_alloc_queue_depth, tag);
+	}
+}
+
+/*
+ * Remove the reference tagged with "tag" from the allocation queue depth
+ * of the metaslab group belonging to top-level vdev "vdev". The same
+ * filter is applied as when the reference is taken: the request must carry
+ * METASLAB_ASYNC_ALLOC, must not carry METASLAB_DONT_THROTTLE, and the
+ * group's class must have its allocation throttle enabled.
+ */
+void
+metaslab_group_alloc_decrement(spa_t *spa, uint64_t vdev, void *tag, int flags)
+{
+	boolean_t throttled = (flags & METASLAB_ASYNC_ALLOC) != 0 &&
+	    (flags & METASLAB_DONT_THROTTLE) == 0;
+
+	if (throttled) {
+		metaslab_group_t *group = vdev_lookup_top(spa, vdev)->vdev_mg;
+
+		if (group->mg_class->mc_alloc_throttle_enabled) {
+			(void) refcount_remove(&group->mg_alloc_queue_depth,
+			    tag);
+		}
+	}
+}
+
+/*
+ * Debug-only consistency check: for every DVA in "bp", verify that "tag"
+ * holds no reference on the allocation queue depth of the metaslab group
+ * that owns the DVA's top-level vdev. Compiles to an empty function when
+ * ZFS_DEBUG is not defined.
+ */
+void
+metaslab_group_alloc_verify(spa_t *spa, const blkptr_t *bp, void *tag)
+{
+#ifdef ZFS_DEBUG
+	const dva_t *cur = bp->blk_dva;
+	const dva_t *end = cur + BP_GET_NDVAS(bp);
+
+	for (; cur < end; cur++) {
+		vdev_t *tvd = vdev_lookup_top(spa, DVA_GET_VDEV(cur));
+
+		VERIFY(refcount_not_held(&tvd->vdev_mg->mg_alloc_queue_depth,
+		    tag));
+	}
+#endif
+}
+
static uint64_t
-metaslab_group_alloc(metaslab_group_t *mg, uint64_t psize, uint64_t asize,
+metaslab_group_alloc(metaslab_group_t *mg, uint64_t asize,
uint64_t txg, uint64_t min_distance, dva_t *dva, int d)
{
spa_t *spa = mg->mg_vd->vdev_spa;
if (msp->ms_weight < asize) {
spa_dbgmsg(spa, "%s: failed to meet weight "
"requirement: vdev %llu, txg %llu, mg %p, "
- "msp %p, psize %llu, asize %llu, "
+ "msp %p, asize %llu, "
"weight %llu", spa_name(spa),
mg->mg_vd->vdev_id, txg,
- mg, msp, psize, asize, msp->ms_weight);
+ mg, msp, asize, msp->ms_weight);
mutex_exit(&mg->mg_lock);
return (-1ULL);
}
msp->ms_access_txg = txg + metaslab_unload_delay;
mutex_exit(&msp->ms_lock);
-
return (offset);
}
int all_zero;
int zio_lock = B_FALSE;
boolean_t allocatable;
- uint64_t offset = -1ULL;
uint64_t asize;
uint64_t distance;
top:
all_zero = B_TRUE;
do {
- ASSERT(mg->mg_activation_count == 1);
+ uint64_t offset;
+ ASSERT(mg->mg_activation_count == 1);
vd = mg->mg_vd;
/*
/*
* Determine if the selected metaslab group is eligible
- * for allocations. If we're ganging or have requested
- * an allocation for the smallest gang block size
- * then we don't want to avoid allocating to the this
- * metaslab group. If we're in this condition we should
- * try to allocate from any device possible so that we
- * don't inadvertently return ENOSPC and suspend the pool
+ * for allocations. If we're ganging then don't allow
+ * this metaslab group to skip allocations since that would
+ * inadvertently return ENOSPC and suspend the pool
* even though space is still available.
*/
- if (allocatable && CAN_FASTGANG(flags) &&
- psize > SPA_GANGBLOCKSIZE)
- allocatable = metaslab_group_allocatable(mg);
+ if (allocatable && !GANG_ALLOCATION(flags) && !zio_lock) {
+ allocatable = metaslab_group_allocatable(mg, rotor,
+ psize);
+ }
if (!allocatable)
goto next;
+ ASSERT(mg->mg_initialized);
+
/*
- * Avoid writing single-copy data to a failing vdev
- * unless the user instructs us that it is okay.
+ * Avoid writing single-copy data to a failing vdev.
*/
if ((vd->vdev_stat.vs_write_errors > 0 ||
vd->vdev_state < VDEV_STATE_HEALTHY) &&
asize = vdev_psize_to_asize(vd, psize);
ASSERT(P2PHASE(asize, 1ULL << vd->vdev_ashift) == 0);
- offset = metaslab_group_alloc(mg, psize, asize, txg, distance,
- dva, d);
+ offset = metaslab_group_alloc(mg, asize, txg, distance, dva, d);
+
+ mutex_enter(&mg->mg_lock);
+ if (offset == -1ULL) {
+ mg->mg_failed_allocations++;
+ if (asize == SPA_GANGBLOCKSIZE) {
+ /*
+ * This metaslab group was unable to allocate
+ * the minimum gang block size so it must be
+ * out of space. We must notify the allocation
+ * throttle to start skipping allocation
+ * attempts to this metaslab group until more
+ * space becomes available.
+ *
+ * Note: this failure cannot be caused by the
+ * allocation throttle since the allocation
+ * throttle is only responsible for skipping
+ * devices and not failing block allocations.
+ */
+ mg->mg_no_free_space = B_TRUE;
+ }
+ }
+ mg->mg_allocations++;
+ mutex_exit(&mg->mg_lock);
+
if (offset != -1ULL) {
/*
* If we've just selected this metaslab group,
return (0);
}
+/*
+ * Reserve "slots" allocation slots in class "mc" on behalf of "zio". The
+ * reservation system must be called before we call into the allocator.
+ *
+ * The request is granted when it fits within the slots that are still
+ * available (mc_alloc_max_slots minus the current mc_alloc_slots count),
+ * or unconditionally when "flags" marks a gang allocation. On success
+ * one reference per slot is added to mc_alloc_slots, tagged with "zio",
+ * and ZIO_FLAG_IO_ALLOCATING is set on the zio.
+ *
+ * Returns B_TRUE if the reservation was placed. Returns B_FALSE, leaving
+ * both the class and the zio untouched, if there were not enough slots;
+ * the I/O is then expected to be throttled until slots are freed up.
+ */
+boolean_t
+metaslab_class_throttle_reserve(metaslab_class_t *mc, int slots, zio_t *zio,
+    int flags)
+{
+	uint64_t available_slots = 0;
+	uint64_t reserved_slots;
+	boolean_t slot_reserved = B_FALSE;
+
+	ASSERT(mc->mc_alloc_throttle_enabled);
+	mutex_enter(&mc->mc_lock);
+
+	/* Clamp at zero: the count may exceed the max (gang reservations). */
+	reserved_slots = refcount_count(&mc->mc_alloc_slots);
+	if (reserved_slots < mc->mc_alloc_max_slots)
+		available_slots = mc->mc_alloc_max_slots - reserved_slots;
+
+	/*
+	 * The comparison is performed in uint64_t, exactly as the implicit
+	 * conversion did before; a negative "slots" therefore never fits.
+	 */
+	if ((uint64_t)slots <= available_slots || GANG_ALLOCATION(flags)) {
+		int d;
+
+		/*
+		 * We reserve the slots individually so that we can unreserve
+		 * them individually when an I/O completes.
+		 */
+		for (d = 0; d < slots; d++)
+			(void) refcount_add(&mc->mc_alloc_slots, zio);
+
+		zio->io_flags |= ZIO_FLAG_IO_ALLOCATING;
+		slot_reserved = B_TRUE;
+	}
+
+	mutex_exit(&mc->mc_lock);
+	return (slot_reserved);
+}
+
+/*
+ * Give back "slots" allocation slots held by "zio" in class "mc" by
+ * removing one mc_alloc_slots reference per slot while holding mc_lock.
+ */
+void
+metaslab_class_throttle_unreserve(metaslab_class_t *mc, int slots, zio_t *zio)
+{
+	int remaining;
+
+	ASSERT(mc->mc_alloc_throttle_enabled);
+
+	mutex_enter(&mc->mc_lock);
+	for (remaining = slots; remaining > 0; remaining--)
+		(void) refcount_remove(&mc->mc_alloc_slots, zio);
+	mutex_exit(&mc->mc_lock);
+}
+
int
metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, blkptr_t *bp,
- int ndvas, uint64_t txg, blkptr_t *hintbp, int flags)
+ int ndvas, uint64_t txg, blkptr_t *hintbp, int flags, zio_t *zio)
{
dva_t *dva = bp->blk_dva;
dva_t *hintdva = hintbp->blk_dva;
if (error != 0) {
for (d--; d >= 0; d--) {
metaslab_free_dva(spa, &dva[d], txg, B_TRUE);
+ metaslab_group_alloc_decrement(spa,
+ DVA_GET_VDEV(&dva[d]), zio, flags);
bzero(&dva[d], sizeof (dva_t));
}
spa_config_exit(spa, SCL_ALLOC, FTAG);
return (error);
+ } else {
+ /*
+ * Update the metaslab group's queue depth
+ * based on the newly allocated dva.
+ */
+ metaslab_group_alloc_increment(spa,
+ DVA_GET_VDEV(&dva[d]), zio, flags);
}
+
}
ASSERT(error == 0);
ASSERT(BP_GET_NDVAS(bp) == ndvas);
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
*/
#include <sys/zfs_context.h>
rc->rc_tracked = reference_tracking_enable;
}
+void
+refcount_create_tracked(refcount_t *rc)
+{
+ refcount_create(rc); /* common initialization first */
+ rc->rc_tracked = B_TRUE; /* then unconditionally turn holder tracking on */
+}
+
void
refcount_create_untracked(refcount_t *rc)
{
ASSERT(found);
mutex_exit(&rc->rc_mtx);
}
+
+/*
+ * If tracking is enabled, return true if a reference exists that matches
+ * the "holder" tag. If tracking is disabled, then return true if a reference
+ * might be held (i.e. the count is non-zero).
+ *
+ * Both the holder list and the count are examined while rc_mtx is held, so
+ * the answer is a consistent snapshot taken under the lock.
+ */
+boolean_t
+refcount_held(refcount_t *rc, void *holder)
+{
+	reference_t *ref;
+	boolean_t held = B_FALSE;
+
+	mutex_enter(&rc->rc_mtx);
+
+	if (!rc->rc_tracked) {
+		/* Sample the count before the mutex is dropped. */
+		held = (rc->rc_count > 0);
+	} else {
+		for (ref = list_head(&rc->rc_list); ref != NULL;
+		    ref = list_next(&rc->rc_list, ref)) {
+			if (ref->ref_holder == holder) {
+				held = B_TRUE;
+				break;
+			}
+		}
+	}
+
+	mutex_exit(&rc->rc_mtx);
+	return (held);
+}
+
+/*
+ * Counterpart of refcount_held(). With tracking enabled, return true only
+ * when no reference in the holder list carries the "holder" tag. With
+ * tracking disabled there is no list to consult, so always return true
+ * since the reference might not be held.
+ */
+boolean_t
+refcount_not_held(refcount_t *rc, void *holder)
+{
+	reference_t *ref;
+	boolean_t not_held = B_TRUE;
+
+	mutex_enter(&rc->rc_mtx);
+
+	if (rc->rc_tracked) {
+		/* Walk the list until it ends or a matching holder is found. */
+		ref = list_head(&rc->rc_list);
+		while (ref != NULL && ref->ref_holder != holder)
+			ref = list_next(&rc->rc_list, ref);
+
+		not_held = (ref == NULL);
+	}
+
+	mutex_exit(&rc->rc_mtx);
+	return (not_held);
+}
#endif /* ZFS_DEBUG */
ddt_unload(spa);
-
/*
* Drop and purge level 2 cache
*/
spa->spa_uberblock.ub_txg = txg - 1;
spa->spa_uberblock.ub_version = version;
spa->spa_ubsync = spa->spa_uberblock;
+ spa->spa_load_state = SPA_LOAD_CREATE;
/*
* Create "The Godfather" zio to hold all async IOs
*/
spa_evicting_os_wait(spa);
spa->spa_minref = refcount_count(&spa->spa_refcount);
+ spa->spa_load_state = SPA_LOAD_NONE;
mutex_exit(&spa_namespace_lock);
static void
spa_vdev_remove_aux(nvlist_t *config, char *name, nvlist_t **dev, int count,
- nvlist_t *dev_to_remove)
+ nvlist_t *dev_to_remove)
{
nvlist_t **newdev = NULL;
int i, j;
dsl_pool_t *dp = spa->spa_dsl_pool;
objset_t *mos = spa->spa_meta_objset;
bplist_t *free_bpl = &spa->spa_free_bplist[txg & TXG_MASK];
+ metaslab_class_t *mc;
vdev_t *rvd = spa->spa_root_vdev;
vdev_t *vd;
dmu_tx_t *tx;
int error;
+ uint32_t max_queue_depth = zfs_vdev_async_write_max_active *
+ zfs_vdev_queue_depth_pct / 100;
+ uint64_t queue_depth_total;
int c;
VERIFY(spa_writeable(spa));
spa->spa_syncing_txg = txg;
spa->spa_sync_pass = 0;
+ mutex_enter(&spa->spa_alloc_lock);
+ VERIFY0(avl_numnodes(&spa->spa_alloc_tree));
+ mutex_exit(&spa->spa_alloc_lock);
+
/*
* If there are any pending vdev state changes, convert them
* into config changes that go out with this transaction group.
}
}
+ /*
+ * Set the top-level vdev's max queue depth. Evaluate each
+ * top-level's async write queue depth in case it changed.
+ * The max queue depth will not change in the middle of syncing
+ * out this txg.
+ */
+ queue_depth_total = 0;
+ for (c = 0; c < rvd->vdev_children; c++) {
+ vdev_t *tvd = rvd->vdev_child[c];
+ metaslab_group_t *mg = tvd->vdev_mg;
+
+ if (mg == NULL || mg->mg_class != spa_normal_class(spa) ||
+ !metaslab_group_initialized(mg))
+ continue;
+
+ /*
+ * It is safe to do a lock-free check here because only async
+ * allocations look at mg_max_alloc_queue_depth, and async
+ * allocations all happen from spa_sync().
+ */
+ ASSERT0(refcount_count(&mg->mg_alloc_queue_depth));
+ mg->mg_max_alloc_queue_depth = max_queue_depth;
+ queue_depth_total += mg->mg_max_alloc_queue_depth;
+ }
+ mc = spa_normal_class(spa);
+ ASSERT0(refcount_count(&mc->mc_alloc_slots));
+ mc->mc_alloc_max_slots = queue_depth_total;
+ mc->mc_alloc_throttle_enabled = zio_dva_throttle_enabled;
+
+ ASSERT3U(mc->mc_alloc_max_slots, <=,
+ max_queue_depth * rvd->vdev_children);
+
/*
* Iterate to convergence.
*/
dsl_pool_sync_done(dp, txg);
+ mutex_enter(&spa->spa_alloc_lock);
+ VERIFY0(avl_numnodes(&spa->spa_alloc_tree));
+ mutex_exit(&spa->spa_alloc_lock);
+
/*
* Update usable space statistics.
*/
mutex_init(&spa->spa_suspend_lock, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&spa->spa_vdev_top_lock, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&spa->spa_feat_stats_lock, NULL, MUTEX_DEFAULT, NULL);
+ mutex_init(&spa->spa_alloc_lock, NULL, MUTEX_DEFAULT, NULL);
cv_init(&spa->spa_async_cv, NULL, CV_DEFAULT, NULL);
cv_init(&spa->spa_evicting_os_cv, NULL, CV_DEFAULT, NULL);
if (altroot)
spa->spa_root = spa_strdup(altroot);
+ avl_create(&spa->spa_alloc_tree, zio_timestamp_compare,
+ sizeof (zio_t), offsetof(zio_t, io_alloc_node));
+
/*
* Every pool starts with the default cachefile
*/
kmem_free(dp, sizeof (spa_config_dirent_t));
}
+ avl_destroy(&spa->spa_alloc_tree);
list_destroy(&spa->spa_config_list);
nvlist_free(spa->spa_label_features);
cv_destroy(&spa->spa_scrub_io_cv);
cv_destroy(&spa->spa_suspend_cv);
+ mutex_destroy(&spa->spa_alloc_lock);
mutex_destroy(&spa->spa_async_lock);
mutex_destroy(&spa->spa_errlist_lock);
mutex_destroy(&spa->spa_errlog_lock);
mutex_init(&vd->vdev_dtl_lock, NULL, MUTEX_NOLOCKDEP, NULL);
mutex_init(&vd->vdev_stat_lock, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&vd->vdev_probe_lock, NULL, MUTEX_DEFAULT, NULL);
+ mutex_init(&vd->vdev_queue_lock, NULL, MUTEX_DEFAULT, NULL);
for (t = 0; t < DTL_TYPES; t++) {
vd->vdev_dtl[t] = range_tree_create(NULL, NULL,
&vd->vdev_dtl_lock);
}
mutex_exit(&vd->vdev_dtl_lock);
+ mutex_destroy(&vd->vdev_queue_lock);
mutex_destroy(&vd->vdev_dtl_lock);
mutex_destroy(&vd->vdev_stat_lock);
mutex_destroy(&vd->vdev_probe_lock);
zio_buf_free(zio->io_data, zio->io_size);
} else if (zio->io_type == ZIO_TYPE_NULL) {
zio_t *pio;
+ zio_link_t *zl;
vd->vdev_cant_read |= !vps->vps_readable;
vd->vdev_cant_write |= !vps->vps_writeable;
vd->vdev_probe_zio = NULL;
mutex_exit(&vd->vdev_probe_lock);
- while ((pio = zio_walk_parents(zio)) != NULL)
+ zl = NULL;
+ while ((pio = zio_walk_parents(zio, &zl)) != NULL)
if (!vdev_accessible(vd, pio))
pio->io_error = SET_ERROR(ENXIO);
* we're asking two separate questions about it.
*/
return (!(state < VDEV_STATE_DEGRADED && state != VDEV_STATE_CLOSED) &&
- !vd->vdev_cant_write && !vd->vdev_ishole);
+ !vd->vdev_cant_write && !vd->vdev_ishole &&
+ vd->vdev_mg->mg_initialized);
}
boolean_t
* Use is subject to license terms.
*/
/*
- * Copyright (c) 2013 by Delphix. All rights reserved.
+ * Copyright (c) 2013, 2015 by Delphix. All rights reserved.
*/
#include <sys/zfs_context.h>
vdev_cache_t *vc = &vd->vdev_cache;
vdev_cache_entry_t *ve = fio->io_private;
zio_t *pio;
+ zio_link_t *zl;
ASSERT(fio->io_size == VCBS);
* any reads that were queued up before the missed update are still
* valid, so we can satisfy them from this line before we evict it.
*/
- while ((pio = zio_walk_parents(fio)) != NULL)
+ zl = NULL;
+ while ((pio = zio_walk_parents(fio, &zl)) != NULL)
vdev_cache_hit(vc, ve, pio);
if (fio->io_error || ve->ve_missed_update)
*/
/*
- * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
*/
#include <sys/zfs_context.h>
if (zio->io_error == 0) {
zio_t *pio;
+ zio_link_t *zl = NULL;
mutex_enter(&zio->io_lock);
- while ((pio = zio_walk_parents(zio)) != NULL) {
+ while ((pio = zio_walk_parents(zio, &zl)) != NULL) {
mutex_enter(&pio->io_lock);
ASSERT3U(zio->io_size, >=, pio->io_size);
bcopy(zio->io_data, pio->io_data, pio->io_size);
#include <sys/zio.h>
#include <sys/avl.h>
#include <sys/dsl_pool.h>
+#include <sys/metaslab_impl.h>
#include <sys/spa.h>
#include <sys/spa_impl.h>
#include <sys/kstat.h>
int zfs_vdev_read_gap_limit = 32 << 10;
int zfs_vdev_write_gap_limit = 4 << 10;
+/*
+ * Define the queue depth percentage for each top-level. This percentage is
+ * used in conjunction with zfs_vdev_async_write_max_active to determine how
+ * many allocations a specific top-level vdev should handle. Once the queue
+ * depth reaches zfs_vdev_queue_depth_pct * zfs_vdev_async_write_max_active /
+ * 100 then the allocator will stop allocating blocks on that top-level device.
+ * The default kernel setting is 1000% which will yield 100 allocations per
+ * device. For userland testing, the default setting is 300% which equates
+ * to 30 allocations per device.
+ */
+#ifdef _KERNEL
+int zfs_vdev_queue_depth_pct = 1000;
+#else
+int zfs_vdev_queue_depth_pct = 300;
+#endif
+
+
int
vdev_queue_offset_compare(const void *x1, const void *x2)
{
{
if (aio->io_type == ZIO_TYPE_READ) {
zio_t *pio;
- while ((pio = zio_walk_parents(aio)) != NULL) {
+ zio_link_t *zl = NULL;
+ while ((pio = zio_walk_parents(aio, &zl)) != NULL) {
bcopy((char *)aio->io_data + (pio->io_offset -
aio->io_offset), pio->io_data, pio->io_size);
}
module_param(zfs_vdev_sync_write_min_active, int, 0644);
MODULE_PARM_DESC(zfs_vdev_sync_write_min_active,
"Min active sync write I/Os per vdev");
+
+module_param(zfs_vdev_queue_depth_pct, int, 0644);
+MODULE_PARM_DESC(zfs_vdev_queue_depth_pct,
+ "Queue depth percentage for each top-level vdev");
#endif
#include <sys/ddt.h>
#include <sys/blkptr.h>
#include <sys/zfeature.h>
+#include <sys/metaslab_impl.h>
#include <sys/time.h>
#include <sys/trace_zio.h>
* ==========================================================================
*/
const char *zio_type_name[ZIO_TYPES] = {
+ /*
+ * Note: Linux kernel thread name length is limited
+ * so these names will differ from upstream open zfs.
+ */
"z_null", "z_rd", "z_wr", "z_fr", "z_cl", "z_ioctl"
};
+int zio_dva_throttle_enabled = B_TRUE;
+
/*
* ==========================================================================
* I/O kmem caches
static inline void __zio_execute(zio_t *zio);
+static void zio_taskq_dispatch(zio_t *, zio_taskq_type_t, boolean_t);
+
void
zio_init(void)
{
* I/O parent/child relationships and pipeline interlocks
* ==========================================================================
*/
-/*
- * NOTE - Callers to zio_walk_parents() and zio_walk_children must
- * continue calling these functions until they return NULL.
- * Otherwise, the next caller will pick up the list walk in
- * some indeterminate state. (Otherwise every caller would
- * have to pass in a cookie to keep the state represented by
- * io_walk_link, which gets annoying.)
- */
+/*
+ * Step through cio's parent list.  *zl is a caller-owned cursor: pass it
+ * in as NULL to start at the head of the list; each call advances it to
+ * the next link and returns that link's parent.  Returns NULL (and leaves
+ * *zl NULL) once the list is exhausted.
+ */
zio_t *
-zio_walk_parents(zio_t *cio)
+zio_walk_parents(zio_t *cio, zio_link_t **zl)
{
- zio_link_t *zl = cio->io_walk_link;
list_t *pl = &cio->io_parent_list;
- zl = (zl == NULL) ? list_head(pl) : list_next(pl, zl);
- cio->io_walk_link = zl;
-
- if (zl == NULL)
+ *zl = (*zl == NULL) ? list_head(pl) : list_next(pl, *zl);
+ if (*zl == NULL)
return (NULL);
- ASSERT(zl->zl_child == cio);
- return (zl->zl_parent);
+ ASSERT((*zl)->zl_child == cio);
+ return ((*zl)->zl_parent);
}
+/*
+ * Step through pio's child list.  *zl is a caller-owned cursor: pass it
+ * in as NULL to start at the head of the list; each call advances it to
+ * the next link and returns that link's child.  Returns NULL (and leaves
+ * *zl NULL) once the list is exhausted.
+ */
zio_t *
-zio_walk_children(zio_t *pio)
+zio_walk_children(zio_t *pio, zio_link_t **zl)
{
- zio_link_t *zl = pio->io_walk_link;
list_t *cl = &pio->io_child_list;
- zl = (zl == NULL) ? list_head(cl) : list_next(cl, zl);
- pio->io_walk_link = zl;
-
- if (zl == NULL)
+ *zl = (*zl == NULL) ? list_head(cl) : list_next(cl, *zl);
+ if (*zl == NULL)
return (NULL);
- ASSERT(zl->zl_parent == pio);
- return (zl->zl_child);
+ ASSERT((*zl)->zl_parent == pio);
+ return ((*zl)->zl_child);
}
+/*
+ * Return the first parent of cio (NULL if it has none) and VERIFY that
+ * continuing the same walk yields no further parent.
+ */
zio_t *
zio_unique_parent(zio_t *cio)
{
- zio_t *pio = zio_walk_parents(cio);
+ zio_link_t *zl = NULL;
+ zio_t *pio = zio_walk_parents(cio, &zl);
- VERIFY(zio_walk_parents(cio) == NULL);
+ VERIFY3P(zio_walk_parents(cio, &zl), ==, NULL);
return (pio);
}
mutex_exit(&pio->io_lock);
mutex_exit(&cio->io_lock);
-
kmem_cache_free(zio_link_cache, zl);
}
ASSERT(zio->io_stall == NULL);
if (*countp != 0) {
zio->io_stage >>= 1;
+ ASSERT3U(zio->io_stage, !=, ZIO_STAGE_OPEN);
zio->io_stall = countp;
waiting = B_TRUE;
}
(*countp)--;
if (*countp == 0 && pio->io_stall == countp) {
+ zio_taskq_type_t type =
+ pio->io_stage < ZIO_STAGE_VDEV_IO_START ? ZIO_TASKQ_ISSUE :
+ ZIO_TASKQ_INTERRUPT;
pio->io_stall = NULL;
mutex_exit(&pio->io_lock);
- __zio_execute(pio);
+ /*
+ * Dispatch the parent zio in its own taskq so that
+ * the child can continue to make progress. This also
+ * prevents overflowing the stack when we have deeply nested
+ * parent-child relationships.
+ */
+ zio_taskq_dispatch(pio, type, B_FALSE);
} else {
mutex_exit(&pio->io_lock);
}
zio->io_error = zio->io_child_error[c];
}
+/*
+ * AVL comparator for zios: order by io_queued_timestamp first, then by
+ * io_offset, and finally by the zio addresses themselves so that two
+ * distinct zios never compare equal.
+ */
+int
+zio_timestamp_compare(const void *x1, const void *x2)
+{
+	const zio_t *lhs = x1;
+	const zio_t *rhs = x2;
+	int order;
+
+	order = AVL_CMP(lhs->io_queued_timestamp, rhs->io_queued_timestamp);
+	if (likely(order))
+		return (order);
+
+	order = AVL_CMP(lhs->io_offset, rhs->io_offset);
+	if (likely(order))
+		return (order);
+
+	/* Timestamp and offset both tie; fall back to pointer identity. */
+	return (AVL_PCMP(lhs, rhs));
+}
+
/*
* ==========================================================================
* Create the various types of I/O (read, write, free, etc)
zio->io_orig_flags = zio->io_flags = flags;
zio->io_orig_stage = zio->io_stage = stage;
zio->io_orig_pipeline = zio->io_pipeline = pipeline;
+ zio->io_pipeline_trace = ZIO_STAGE_OPEN;
zio->io_state[ZIO_WAIT_READY] = (stage >= ZIO_STAGE_READY);
zio->io_state[ZIO_WAIT_DONE] = (stage >= ZIO_STAGE_DONE);
zio_t *zio;
zio = zio_create(pio, spa, txg, bp, data, size, size, done, private,
- ZIO_TYPE_WRITE, priority, flags, NULL, 0, zb,
+ ZIO_TYPE_WRITE, priority, flags | ZIO_FLAG_IO_REWRITE, NULL, 0, zb,
ZIO_STAGE_OPEN, ZIO_REWRITE_PIPELINE);
return (zio);
zio = zio_create(pio, spa, txg, bp, NULL, BP_GET_PSIZE(bp),
BP_GET_PSIZE(bp), done, private, ZIO_TYPE_CLAIM, ZIO_PRIORITY_NOW,
flags, NULL, 0, NULL, ZIO_STAGE_OPEN, ZIO_CLAIM_PIPELINE);
+ ASSERT0(zio->io_queued_timestamp);
return (zio);
}
if (flags & ZIO_FLAG_IO_REPAIR)
flags &= ~ZIO_FLAG_SPECULATIVE;
+ /*
+ * If we're creating a child I/O that is not associated with a
+ * top-level vdev, then the child zio is not an allocating I/O.
+ * If this is a retried I/O then we ignore it since we will
+ * have already processed the original allocating I/O.
+ */
+ if (flags & ZIO_FLAG_IO_ALLOCATING &&
+ (vd != vd->vdev_top || (flags & ZIO_FLAG_IO_RETRY))) {
+ metaslab_class_t *mc = spa_normal_class(pio->io_spa);
+
+ ASSERT(mc->mc_alloc_throttle_enabled);
+ ASSERT(type == ZIO_TYPE_WRITE);
+ ASSERT(priority == ZIO_PRIORITY_ASYNC_WRITE);
+ ASSERT(!(flags & ZIO_FLAG_IO_REPAIR));
+ ASSERT(!(pio->io_flags & ZIO_FLAG_IO_REWRITE) ||
+ pio->io_child_type == ZIO_CHILD_GANG);
+
+ flags &= ~ZIO_FLAG_IO_ALLOCATING;
+ }
+
+
zio = zio_create(pio, pio->io_spa, pio->io_txg, bp, data, size, size,
done, private, type, priority, flags, vd, offset, &pio->io_bookmark,
ZIO_STAGE_VDEV_IO_START >> 1, pipeline);
+ ASSERT3U(zio->io_child_type, ==, ZIO_CHILD_VDEV);
zio->io_physdone = pio->io_physdone;
if (vd->vdev_ops->vdev_op_leaf && zio->io_logical != NULL)
static int
zio_write_bp_init(zio_t *zio)
{
- spa_t *spa = zio->io_spa;
- zio_prop_t *zp = &zio->io_prop;
- enum zio_compress compress = zp->zp_compress;
- blkptr_t *bp = zio->io_bp;
- uint64_t lsize = zio->io_lsize;
- uint64_t psize = zio->io_size;
- int pass = 1;
-
- EQUIV(lsize != psize, (zio->io_flags & ZIO_FLAG_RAW) != 0);
-
- /*
- * If our children haven't all reached the ready stage,
- * wait for them and then repeat this pipeline stage.
- */
- if (zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_READY) ||
- zio_wait_for_children(zio, ZIO_CHILD_LOGICAL, ZIO_WAIT_READY))
- return (ZIO_PIPELINE_STOP);
if (!IO_IS_ALLOCATING(zio))
return (ZIO_PIPELINE_CONTINUE);
- if (zio->io_children_ready != NULL) {
- /*
- * Now that all our children are ready, run the callback
- * associated with this zio in case it wants to modify the
- * data to be written.
- */
- ASSERT3U(zp->zp_level, >, 0);
- zio->io_children_ready(zio);
- }
-
ASSERT(zio->io_child_type != ZIO_CHILD_DDT);
if (zio->io_bp_override) {
+ blkptr_t *bp = zio->io_bp;
+ zio_prop_t *zp = &zio->io_prop;
+
ASSERT(bp->blk_birth != zio->io_txg);
ASSERT(BP_GET_DEDUP(zio->io_bp_override) == 0);
*/
if (!BP_IS_HOLE(bp) && zp->zp_nopwrite) {
ASSERT(!zp->zp_dedup);
+ ASSERT3U(BP_GET_CHECKSUM(bp), ==, zp->zp_checksum);
zio->io_flags |= ZIO_FLAG_NOPWRITE;
return (ZIO_PIPELINE_CONTINUE);
}
zio->io_pipeline |= ZIO_STAGE_DDT_WRITE;
return (ZIO_PIPELINE_CONTINUE);
}
+
+ /*
+ * We were unable to handle this as an override bp, treat
+ * it as a regular write I/O.
+ */
zio->io_bp_override = NULL;
- BP_ZERO(bp);
+ *bp = zio->io_bp_orig;
+ zio->io_pipeline = zio->io_orig_pipeline;
+ }
+
+ return (ZIO_PIPELINE_CONTINUE);
+}
+
+static int
+zio_write_compress(zio_t *zio)
+{
+ spa_t *spa = zio->io_spa;
+ zio_prop_t *zp = &zio->io_prop;
+ enum zio_compress compress = zp->zp_compress;
+ blkptr_t *bp = zio->io_bp;
+ uint64_t lsize = zio->io_lsize;
+ uint64_t psize = zio->io_size;
+ int pass = 1;
+
+ EQUIV(lsize != psize, (zio->io_flags & ZIO_FLAG_RAW) != 0);
+
+ /*
+ * If our children haven't all reached the ready stage,
+ * wait for them and then repeat this pipeline stage.
+ */
+ if (zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_READY) ||
+ zio_wait_for_children(zio, ZIO_CHILD_LOGICAL, ZIO_WAIT_READY))
+ return (ZIO_PIPELINE_STOP);
+
+ if (!IO_IS_ALLOCATING(zio))
+ return (ZIO_PIPELINE_CONTINUE);
+
+ if (zio->io_children_ready != NULL) {
+ /*
+ * Now that all our children are ready, run the callback
+ * associated with this zio in case it wants to modify the
+ * data to be written.
+ */
+ ASSERT3U(zp->zp_level, >, 0);
+ zio->io_children_ready(zio);
}
+ ASSERT(zio->io_child_type != ZIO_CHILD_DDT);
+ ASSERT(zio->io_bp_override == NULL);
+
if (!BP_IS_HOLE(bp) && bp->blk_birth == zio->io_txg) {
/*
* We're rewriting an existing block, which means we're
psize, lsize, NULL);
}
}
+
+ /*
+ * We were unable to handle this as an override bp, treat
+ * it as a regular write I/O.
+ */
+ zio->io_bp_override = NULL;
+ *bp = zio->io_bp_orig;
+ zio->io_pipeline = zio->io_orig_pipeline;
+
} else {
ASSERT3U(psize, !=, 0);
zio->io_pipeline |= ZIO_STAGE_NOP_WRITE;
}
}
-
return (ZIO_PIPELINE_CONTINUE);
}
{
zio->io_executor = curthread;
+ ASSERT3U(zio->io_queued_timestamp, >, 0);
+
while (zio->io_stage < ZIO_STAGE_DONE) {
enum zio_stage pipeline = zio->io_pipeline;
enum zio_stage stage = zio->io_stage;
}
zio->io_stage = stage;
+ zio->io_pipeline_trace |= zio->io_stage;
rv = zio_pipeline[highbit64(stage) - 1](zio);
if (rv == ZIO_PIPELINE_STOP)
ASSERT(zio->io_executor == NULL);
zio->io_waiter = curthread;
+ ASSERT0(zio->io_queued_timestamp);
+ zio->io_queued_timestamp = gethrtime();
__zio_execute(zio);
zio_add_child(pio, zio);
}
+ ASSERT0(zio->io_queued_timestamp);
+ zio->io_queued_timestamp = gethrtime();
__zio_execute(zio);
}
{
zio_t *cio, *cio_next;
int c, w;
+ zio_link_t *zl = NULL;
ASSERT(pio->io_child_type == ZIO_CHILD_LOGICAL);
ASSERT(pio->io_orig_stage == ZIO_STAGE_OPEN);
pio->io_pipeline = pio->io_orig_pipeline;
pio->io_reexecute = 0;
pio->io_flags |= ZIO_FLAG_REEXECUTED;
+ pio->io_pipeline_trace = 0;
pio->io_error = 0;
for (w = 0; w < ZIO_WAIT_TYPES; w++)
pio->io_state[w] = 0;
* the remainder of pio's io_child_list, from 'cio_next' onward,
* cannot be affected by any side effects of reexecuting 'cio'.
*/
- for (cio = zio_walk_children(pio); cio != NULL; cio = cio_next) {
- cio_next = zio_walk_children(pio);
+ for (cio = zio_walk_children(pio, &zl); cio != NULL; cio = cio_next) {
+ cio_next = zio_walk_children(pio, &zl);
mutex_enter(&pio->io_lock);
for (w = 0; w < ZIO_WAIT_TYPES; w++)
pio->io_children[cio->io_child_type][w]++;
* We don't reexecute "The Godfather" I/O here as it's the
* responsibility of the caller to wait on him.
*/
- if (!(pio->io_flags & ZIO_FLAG_GODFATHER))
+ if (!(pio->io_flags & ZIO_FLAG_GODFATHER)) {
+ pio->io_queued_timestamp = gethrtime();
__zio_execute(pio);
+ }
}
void
zio_write_gang_block(zio_t *pio)
{
spa_t *spa = pio->io_spa;
+ metaslab_class_t *mc = spa_normal_class(spa);
blkptr_t *bp = pio->io_bp;
zio_t *gio = pio->io_gang_leader;
zio_t *zio;
zio_prop_t zp;
int g, error;
- error = metaslab_alloc(spa, spa_normal_class(spa), SPA_GANGBLOCKSIZE,
- bp, gbh_copies, txg, pio == gio ? NULL : gio->io_bp,
- METASLAB_HINTBP_FAVOR | METASLAB_GANG_HEADER);
+ int flags = METASLAB_HINTBP_FAVOR | METASLAB_GANG_HEADER;
+ if (pio->io_flags & ZIO_FLAG_IO_ALLOCATING) {
+ ASSERT(pio->io_priority == ZIO_PRIORITY_ASYNC_WRITE);
+ ASSERT(!(pio->io_flags & ZIO_FLAG_NODATA));
+
+ flags |= METASLAB_ASYNC_ALLOC;
+ VERIFY(refcount_held(&mc->mc_alloc_slots, pio));
+
+ /*
+ * The logical zio has already placed a reservation for
+ * 'copies' allocation slots but gang blocks may require
+ * additional copies. These additional copies
+ * (i.e. gbh_copies - copies) are guaranteed to succeed
+ * since metaslab_class_throttle_reserve() always allows
+ * additional reservations for gang blocks.
+ */
+ VERIFY(metaslab_class_throttle_reserve(mc, gbh_copies - copies,
+ pio, flags));
+ }
+
+ error = metaslab_alloc(spa, mc, SPA_GANGBLOCKSIZE,
+ bp, gbh_copies, txg, pio == gio ? NULL : gio->io_bp, flags, pio);
if (error) {
+ if (pio->io_flags & ZIO_FLAG_IO_ALLOCATING) {
+ ASSERT(pio->io_priority == ZIO_PRIORITY_ASYNC_WRITE);
+ ASSERT(!(pio->io_flags & ZIO_FLAG_NODATA));
+
+ /*
+ * If we failed to allocate the gang block header then
+ * we remove any additional allocation reservations that
+ * we placed here. The original reservation will
+ * be removed when the logical I/O goes to the ready
+ * stage.
+ */
+ metaslab_class_throttle_unreserve(mc,
+ gbh_copies - copies, pio);
+ }
+
pio->io_error = error;
return (ZIO_PIPELINE_CONTINUE);
}
* Create and nowait the gang children.
*/
for (g = 0; resid != 0; resid -= lsize, g++) {
+ zio_t *cio;
+
lsize = P2ROUNDUP(resid / (SPA_GBH_NBLKPTRS - g),
SPA_MINBLOCKSIZE);
ASSERT(lsize >= SPA_MINBLOCKSIZE && lsize <= resid);
zp.zp_dedup_verify = B_FALSE;
zp.zp_nopwrite = B_FALSE;
- zio_nowait(zio_write(zio, spa, txg, &gbh->zg_blkptr[g],
+ cio = zio_write(zio, spa, txg, &gbh->zg_blkptr[g],
(char *)pio->io_data + (pio->io_size - resid), lsize,
lsize, &zp, zio_write_gang_member_ready, NULL, NULL, NULL,
&gn->gn_child[g], pio->io_priority,
- ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark));
+ ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark);
+
+ if (pio->io_flags & ZIO_FLAG_IO_ALLOCATING) {
+ ASSERT(pio->io_priority == ZIO_PRIORITY_ASYNC_WRITE);
+ ASSERT(!(pio->io_flags & ZIO_FLAG_NODATA));
+
+ /*
+ * Gang children won't throttle but we should
+ * account for their work, so reserve an allocation
+ * slot for them here.
+ */
+ VERIFY(metaslab_class_throttle_reserve(mc,
+ zp.zp_copies, cio, flags));
+ }
+ zio_nowait(cio);
+
}
/*
ddt_entry_t *dde = zio->io_private;
ddt_phys_t *ddp = &dde->dde_phys[p];
zio_t *pio;
+ zio_link_t *zl;
if (zio->io_error)
return;
ddt_phys_fill(ddp, zio->io_bp);
- while ((pio = zio_walk_parents(zio)) != NULL)
+ zl = NULL;
+ while ((pio = zio_walk_parents(zio, &zl)) != NULL)
ddt_bp_fill(ddp, pio->io_bp, zio->io_txg);
ddt_exit(ddt);
dde->dde_lead_zio[p] = NULL;
if (zio->io_error == 0) {
- while (zio_walk_parents(zio) != NULL)
+ zio_link_t *zl = NULL;
+ while (zio_walk_parents(zio, &zl) != NULL)
ddt_phys_addref(ddp);
} else {
ddt_phys_clear(ddp);
* Allocate and free blocks
* ==========================================================================
*/
+
+/*
+ * Pick the first zio in spa_alloc_tree and try to reserve zp_copies
+ * allocation slots for it in the normal class.  On success the zio is
+ * removed from the tree and returned; if the tree is empty or the
+ * reservation is refused, NULL is returned and the tree is left alone.
+ * The caller must hold spa_alloc_lock.
+ */
+static zio_t *
+zio_io_to_allocate(spa_t *spa)
+{
+	zio_t *candidate;
+
+	ASSERT(MUTEX_HELD(&spa->spa_alloc_lock));
+
+	candidate = avl_first(&spa->spa_alloc_tree);
+	if (candidate != NULL) {
+		ASSERT(IO_IS_ALLOCATING(candidate));
+
+		/*
+		 * Try to place a reservation for this zio.  If we're
+		 * unable to reserve then we throttle.
+		 */
+		if (metaslab_class_throttle_reserve(spa_normal_class(spa),
+		    candidate->io_prop.zp_copies, candidate, 0)) {
+			avl_remove(&spa->spa_alloc_tree, candidate);
+			ASSERT3U(candidate->io_stage, <,
+			    ZIO_STAGE_DVA_ALLOCATE);
+			return (candidate);
+		}
+	}
+
+	return (NULL);
+}
+
+/*
+ * Pipeline stage that gates allocating writes.  Sync writes, gang
+ * children, NODATA zios, and all zios when the normal class has the
+ * throttle disabled pass straight through.  Everything else is queued on
+ * spa_alloc_tree; the pipeline continues only if this zio is the one that
+ * zio_io_to_allocate() hands back, otherwise it stops here.
+ */
+static int
+zio_dva_throttle(zio_t *zio)
+{
+	spa_t *spa = zio->io_spa;
+	zio_t *next;
+
+	if (zio->io_priority == ZIO_PRIORITY_SYNC_WRITE ||
+	    !spa_normal_class(spa)->mc_alloc_throttle_enabled ||
+	    zio->io_child_type == ZIO_CHILD_GANG ||
+	    (zio->io_flags & ZIO_FLAG_NODATA) != 0)
+		return (ZIO_PIPELINE_CONTINUE);
+
+	ASSERT(zio->io_child_type > ZIO_CHILD_GANG);
+
+	ASSERT3U(zio->io_queued_timestamp, >, 0);
+	ASSERT(zio->io_stage == ZIO_STAGE_DVA_THROTTLE);
+
+	mutex_enter(&spa->spa_alloc_lock);
+
+	ASSERT(zio->io_type == ZIO_TYPE_WRITE);
+	avl_add(&spa->spa_alloc_tree, zio);
+
+	next = zio_io_to_allocate(spa);
+	mutex_exit(&spa->spa_alloc_lock);
+
+	/* We were picked ourselves: carry on in this thread. */
+	if (next == zio)
+		return (ZIO_PIPELINE_CONTINUE);
+
+	/* Nothing could reserve; this zio stays queued in the tree. */
+	if (next == NULL)
+		return (ZIO_PIPELINE_STOP);
+
+	ASSERT3U(next->io_queued_timestamp, <=, zio->io_queued_timestamp);
+	ASSERT(next->io_stage == ZIO_STAGE_DVA_THROTTLE);
+
+	/*
+	 * We are passing control to a new zio so make sure that
+	 * it is processed by a different thread. We do this to
+	 * avoid stack overflows that can occur when parents are
+	 * throttled and children are making progress. We allow
+	 * it to go to the head of the taskq since it's already
+	 * been waiting.
+	 */
+	zio_taskq_dispatch(next, ZIO_TASKQ_ISSUE, B_TRUE);
+
+	return (ZIO_PIPELINE_STOP);
+}
+
+/*
+ * Ask zio_io_to_allocate() for the next queued zio that can proceed and,
+ * if there is one, dispatch it to the issue taskq with the cutinline
+ * argument set.  Does nothing when no zio is returned.
+ */
+void
+zio_allocate_dispatch(spa_t *spa)
+{
+	zio_t *next;
+
+	mutex_enter(&spa->spa_alloc_lock);
+	next = zio_io_to_allocate(spa);
+	mutex_exit(&spa->spa_alloc_lock);
+
+	if (next != NULL) {
+		ASSERT3U(next->io_stage, ==, ZIO_STAGE_DVA_THROTTLE);
+		ASSERT0(next->io_error);
+		zio_taskq_dispatch(next, ZIO_TASKQ_ISSUE, B_TRUE);
+	}
+}
+
static int
zio_dva_allocate(zio_t *zio)
{
ASSERT3U(zio->io_prop.zp_copies, <=, spa_max_replication(spa));
ASSERT3U(zio->io_size, ==, BP_GET_PSIZE(bp));
- /*
- * The dump device does not support gang blocks so allocation on
- * behalf of the dump device (i.e. ZIO_FLAG_NODATA) must avoid
- * the "fast" gang feature.
- */
- flags |= (zio->io_flags & ZIO_FLAG_NODATA) ? METASLAB_GANG_AVOID : 0;
- flags |= (zio->io_flags & ZIO_FLAG_GANG_CHILD) ?
- METASLAB_GANG_CHILD : 0;
flags |= (zio->io_flags & ZIO_FLAG_FASTWRITE) ? METASLAB_FASTWRITE : 0;
+ if (zio->io_flags & ZIO_FLAG_NODATA)
+ flags |= METASLAB_DONT_THROTTLE;
+ if (zio->io_flags & ZIO_FLAG_GANG_CHILD)
+ flags |= METASLAB_GANG_CHILD;
+ if (zio->io_priority == ZIO_PRIORITY_ASYNC_WRITE)
+ flags |= METASLAB_ASYNC_ALLOC;
+
error = metaslab_alloc(spa, mc, zio->io_size, bp,
- zio->io_prop.zp_copies, zio->io_txg, NULL, flags);
+ zio->io_prop.zp_copies, zio->io_txg, NULL, flags, zio);
- if (error) {
+ if (error != 0) {
spa_dbgmsg(spa, "%s: metaslab allocation failure: zio %p, "
"size %llu, error %d", spa_name(spa), zio, zio->io_size,
error);
ASSERT(txg > spa_syncing_txg(spa));
- /*
- * ZIL blocks are always contiguous (i.e. not gang blocks) so we
- * set the METASLAB_GANG_AVOID flag so that they don't "fast gang"
- * when allocating them.
- */
if (use_slog) {
error = metaslab_alloc(spa, spa_log_class(spa), size,
- new_bp, 1, txg, NULL,
- METASLAB_FASTWRITE | METASLAB_GANG_AVOID);
+ new_bp, 1, txg, NULL, METASLAB_FASTWRITE, NULL);
}
if (error) {
error = metaslab_alloc(spa, spa_normal_class(spa), size,
- new_bp, 1, txg, NULL,
- METASLAB_FASTWRITE);
+ new_bp, 1, txg, NULL, METASLAB_FASTWRITE, NULL);
}
if (error == 0) {
return (ZIO_PIPELINE_STOP);
}
+ ASSERT3P(zio->io_logical, !=, zio);
+
/*
* We keep track of time-sensitive I/Os so that the scan thread
* can quickly react to certain workloads. In particular, we care
{
blkptr_t *bp = zio->io_bp;
zio_t *pio, *pio_next;
+ zio_link_t *zl = NULL;
if (zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_READY) ||
zio_wait_for_children(zio, ZIO_CHILD_DDT, ZIO_WAIT_READY))
if (bp != NULL && bp != &zio->io_bp_copy)
zio->io_bp_copy = *bp;
- if (zio->io_error)
+ if (zio->io_error != 0) {
zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
+ if (zio->io_flags & ZIO_FLAG_IO_ALLOCATING) {
+ ASSERT(IO_IS_ALLOCATING(zio));
+ ASSERT(zio->io_priority == ZIO_PRIORITY_ASYNC_WRITE);
+ /*
+ * We were unable to allocate anything, unreserve and
+ * issue the next I/O to allocate.
+ */
+ metaslab_class_throttle_unreserve(
+ spa_normal_class(zio->io_spa),
+ zio->io_prop.zp_copies, zio);
+ zio_allocate_dispatch(zio->io_spa);
+ }
+ }
+
mutex_enter(&zio->io_lock);
zio->io_state[ZIO_WAIT_READY] = 1;
- pio = zio_walk_parents(zio);
+ pio = zio_walk_parents(zio, &zl);
mutex_exit(&zio->io_lock);
/*
* all parents must wait for us to be done before they can be done.
*/
for (; pio != NULL; pio = pio_next) {
- pio_next = zio_walk_parents(zio);
+ pio_next = zio_walk_parents(zio, &zl);
zio_notify_parent(pio, zio, ZIO_WAIT_READY);
}
return (ZIO_PIPELINE_CONTINUE);
}
+/*
+ * Update the allocation throttle accounting for a completed vdev child
+ * write.  Finds the parent zio that performed the allocation, calls
+ * metaslab_group_alloc_decrement() for this zio's top-level vdev on that
+ * parent's behalf, releases one class reservation slot held by the parent,
+ * and then calls zio_allocate_dispatch() so another queued zio can run.
+ */
+static void
+zio_dva_throttle_done(zio_t *zio)
+{
+ zio_t *lio = zio->io_logical;
+ zio_t *pio = zio_unique_parent(zio);
+ vdev_t *vd = zio->io_vd;
+ int flags = METASLAB_ASYNC_ALLOC;
+
+ ASSERT3P(zio->io_bp, !=, NULL);
+ ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE);
+ ASSERT3U(zio->io_priority, ==, ZIO_PRIORITY_ASYNC_WRITE);
+ ASSERT3U(zio->io_child_type, ==, ZIO_CHILD_VDEV);
+ ASSERT(vd != NULL);
+ ASSERT3P(vd, ==, vd->vdev_top);
+ ASSERT(!(zio->io_flags & (ZIO_FLAG_IO_REPAIR | ZIO_FLAG_IO_RETRY)));
+ ASSERT(zio->io_flags & ZIO_FLAG_IO_ALLOCATING);
+ ASSERT(!(lio->io_flags & ZIO_FLAG_IO_REWRITE));
+ ASSERT(!(lio->io_orig_flags & ZIO_FLAG_NODATA));
+
+ /*
+ * Parents of gang children can have two flavors -- ones that
+ * allocated the gang header (will have ZIO_FLAG_IO_REWRITE set)
+ * and ones that allocated the constituent blocks. The allocation
+ * throttle needs to know the allocating parent zio so we must find
+ * it here.
+ */
+ if (pio->io_child_type == ZIO_CHILD_GANG) {
+ /*
+ * If our parent is a rewrite gang child then our grandparent
+ * would have been the one that performed the allocation.
+ */
+ if (pio->io_flags & ZIO_FLAG_IO_REWRITE)
+ pio = zio_unique_parent(pio);
+ flags |= METASLAB_GANG_CHILD;
+ }
+
+ ASSERT(IO_IS_ALLOCATING(pio));
+ ASSERT3P(zio, !=, zio->io_logical);
+ ASSERT(zio->io_logical != NULL);
+ ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REPAIR));
+ ASSERT0(zio->io_flags & ZIO_FLAG_NOPWRITE);
+
+ /* The group decrement is made with the allocating parent's io_lock held. */
+ mutex_enter(&pio->io_lock);
+ metaslab_group_alloc_decrement(zio->io_spa, vd->vdev_id, pio, flags);
+ mutex_exit(&pio->io_lock);
+
+ /* Give back one class slot (count argument is 1) held by pio. */
+ metaslab_class_throttle_unreserve(spa_normal_class(zio->io_spa),
+ 1, pio);
+
+ /*
+ * Call into the pipeline to see if there is more work that
+ * needs to be done. If there is work to be done it will be
+ * dispatched to another taskq thread.
+ */
+ zio_allocate_dispatch(zio->io_spa);
+}
+
static int
zio_done(zio_t *zio)
{
+ /*
+ * Always attempt to keep stack usage minimal here since
+ * we can be called recursively up to 19 levels deep.
+ */
zio_t *pio, *pio_next;
int c, w;
+ zio_link_t *zl = NULL;
/*
* If our children haven't all completed,
zio_wait_for_children(zio, ZIO_CHILD_LOGICAL, ZIO_WAIT_DONE))
return (ZIO_PIPELINE_STOP);
+ /*
+ * If the allocation throttle is enabled, then update the accounting.
+ * We only track child I/Os that are part of an allocating async
+ * write. We must do this since the allocation is performed
+ * by the logical I/O but the actual write is done by child I/Os.
+ */
+ if (zio->io_flags & ZIO_FLAG_IO_ALLOCATING &&
+ zio->io_child_type == ZIO_CHILD_VDEV) {
+ ASSERT(spa_normal_class(
+ zio->io_spa)->mc_alloc_throttle_enabled);
+ zio_dva_throttle_done(zio);
+ }
+
+ /*
+ * If the allocation throttle is enabled, verify that
+ * we have decremented the refcounts for every I/O that was throttled.
+ */
+ if (zio->io_flags & ZIO_FLAG_IO_ALLOCATING) {
+ ASSERT(zio->io_type == ZIO_TYPE_WRITE);
+ ASSERT(zio->io_priority == ZIO_PRIORITY_ASYNC_WRITE);
+ ASSERT(zio->io_bp != NULL);
+ metaslab_group_alloc_verify(zio->io_spa, zio->io_bp, zio);
+ VERIFY(refcount_not_held(
+ &(spa_normal_class(zio->io_spa)->mc_alloc_slots), zio));
+ }
+
+
for (c = 0; c < ZIO_CHILD_TYPES; c++)
for (w = 0; w < ZIO_WAIT_TYPES; w++)
ASSERT(zio->io_children[c][w] == 0);
* trouble (e.g. suspended). This allows "The Godfather"
* I/O to return status without blocking.
*/
- for (pio = zio_walk_parents(zio); pio != NULL; pio = pio_next) {
- zio_link_t *zl = zio->io_walk_link;
- pio_next = zio_walk_parents(zio);
+ zl = NULL;
+ for (pio = zio_walk_parents(zio, &zl); pio != NULL;
+ pio = pio_next) {
+ zio_link_t *remove_zl = zl;
+ pio_next = zio_walk_parents(zio, &zl);
if ((pio->io_flags & ZIO_FLAG_GODFATHER) &&
(zio->io_reexecute & ZIO_REEXECUTE_SUSPEND)) {
- zio_remove_child(pio, zio, zl);
+ zio_remove_child(pio, zio, remove_zl);
zio_notify_parent(pio, zio, ZIO_WAIT_DONE);
}
}
zio->io_state[ZIO_WAIT_DONE] = 1;
mutex_exit(&zio->io_lock);
- for (pio = zio_walk_parents(zio); pio != NULL; pio = pio_next) {
- zio_link_t *zl = zio->io_walk_link;
- pio_next = zio_walk_parents(zio);
- zio_remove_child(pio, zio, zl);
+ zl = NULL;
+ for (pio = zio_walk_parents(zio, &zl); pio != NULL; pio = pio_next) {
+ zio_link_t *remove_zl = zl;
+ pio_next = zio_walk_parents(zio, &zl);
+ zio_remove_child(pio, zio, remove_zl);
zio_notify_parent(pio, zio, ZIO_WAIT_DONE);
}
static zio_pipe_stage_t *zio_pipeline[] = {
NULL,
zio_read_bp_init,
+ zio_write_bp_init,
zio_free_bp_init,
zio_issue_async,
- zio_write_bp_init,
+ zio_write_compress,
zio_checksum_generate,
zio_nop_write,
zio_ddt_read_start,
zio_ddt_free,
zio_gang_assemble,
zio_gang_issue,
+ zio_dva_throttle,
zio_dva_allocate,
zio_dva_free,
zio_dva_claim,
module_param(zfs_sync_pass_rewrite, int, 0644);
MODULE_PARM_DESC(zfs_sync_pass_rewrite,
"Rewrite new bps starting in this pass");
+
+module_param(zio_dva_throttle_enabled, int, 0644);
+MODULE_PARM_DESC(zio_dva_throttle_enabled,
+ "Throttle block allocations in the ZIO pipeline");
#endif