#include <sys/zio.h>
#include <sys/zil.h>
#include <sys/zil_impl.h>
-#include <sys/zfs_rlock.h>
#include <sys/vdev_impl.h>
#include <sys/vdev_file.h>
#include <sys/spa_impl.h>
uint64_t bw_data;
} bufwad_t;
+/*
+ * It would be better to use a rangelock_t per object. Unfortunately
+ * the rangelock_t is not a drop-in replacement for rl_t, because we
+ * still need to map from object ID to rangelock_t.
+ */
+typedef enum {
+ RL_READER, /* presumably shared; confirm in ztest_rll_lock() */
+ RL_WRITER, /* presumably exclusive; confirm in ztest_rll_lock() */
+ RL_APPEND /* NOTE(review): not requested anywhere in this view */
+} rl_type_t;
+
typedef struct rll {
void *rll_writer;
int rll_readers;
kcondvar_t rll_cv;
} rll_t;
-typedef struct zll {
- list_t z_list;
- kmutex_t z_lock;
-} zll_t;
+typedef struct rl {
+ uint64_t rl_object; /* object ID passed to ztest_range_lock() */
+ uint64_t rl_offset; /* offset passed to ztest_range_lock() */
+ uint64_t rl_size; /* size passed to ztest_range_lock() */
+ rll_t *rl_lock; /* hashed zd_range_lock[] slot being held */
+} rl_t;
#define ZTEST_RANGE_LOCKS 64
#define ZTEST_OBJECT_LOCKS 64
char zd_name[ZFS_MAX_DATASET_NAME_LEN];
kmutex_t zd_dirobj_lock;
rll_t zd_object_lock[ZTEST_OBJECT_LOCKS];
- zll_t zd_range_lock[ZTEST_RANGE_LOCKS];
+ rll_t zd_range_lock[ZTEST_RANGE_LOCKS];
} ztest_ds_t;
/*
return (err);
}
-
-/*
- * Object and range lock mechanics
- */
-typedef struct {
- list_node_t z_lnode;
- zfs_refcount_t z_refcnt;
- uint64_t z_object;
- zfs_rlock_t z_range_lock;
-} ztest_znode_t;
-
-typedef struct {
- rl_t *z_rl;
- ztest_znode_t *z_ztznode;
-} ztest_zrl_t;
-
-static ztest_znode_t *
-ztest_znode_init(uint64_t object)
-{
- ztest_znode_t *zp = umem_alloc(sizeof (*zp), UMEM_NOFAIL);
-
- list_link_init(&zp->z_lnode);
- zfs_refcount_create(&zp->z_refcnt);
- zp->z_object = object;
- zfs_rlock_init(&zp->z_range_lock);
-
- return (zp);
-}
-
-static void
-ztest_znode_fini(ztest_znode_t *zp)
-{
- ASSERT(zfs_refcount_is_zero(&zp->z_refcnt));
- zfs_rlock_destroy(&zp->z_range_lock);
- zp->z_object = 0;
- zfs_refcount_destroy(&zp->z_refcnt);
- list_link_init(&zp->z_lnode);
- umem_free(zp, sizeof (*zp));
-}
-
-static void
-ztest_zll_init(zll_t *zll)
-{
- mutex_init(&zll->z_lock, NULL, MUTEX_DEFAULT, NULL);
- list_create(&zll->z_list, sizeof (ztest_znode_t),
- offsetof(ztest_znode_t, z_lnode));
-}
-
-static void
-ztest_zll_destroy(zll_t *zll)
-{
- list_destroy(&zll->z_list);
- mutex_destroy(&zll->z_lock);
-}
-
-#define RL_TAG "range_lock"
-static ztest_znode_t *
-ztest_znode_get(ztest_ds_t *zd, uint64_t object)
-{
- zll_t *zll = &zd->zd_range_lock[object & (ZTEST_OBJECT_LOCKS - 1)];
- ztest_znode_t *zp = NULL;
- mutex_enter(&zll->z_lock);
- for (zp = list_head(&zll->z_list); (zp);
- zp = list_next(&zll->z_list, zp)) {
- if (zp->z_object == object) {
- zfs_refcount_add(&zp->z_refcnt, RL_TAG);
- break;
- }
- }
- if (zp == NULL) {
- zp = ztest_znode_init(object);
- zfs_refcount_add(&zp->z_refcnt, RL_TAG);
- list_insert_head(&zll->z_list, zp);
- }
- mutex_exit(&zll->z_lock);
- return (zp);
-}
-
-static void
-ztest_znode_put(ztest_ds_t *zd, ztest_znode_t *zp)
-{
- zll_t *zll = NULL;
- ASSERT3U(zp->z_object, !=, 0);
- zll = &zd->zd_range_lock[zp->z_object & (ZTEST_OBJECT_LOCKS - 1)];
- mutex_enter(&zll->z_lock);
- zfs_refcount_remove(&zp->z_refcnt, RL_TAG);
- if (zfs_refcount_is_zero(&zp->z_refcnt)) {
- list_remove(&zll->z_list, zp);
- ztest_znode_fini(zp);
- }
- mutex_exit(&zll->z_lock);
-}
-
-
static void
ztest_rll_init(rll_t *rll)
{
ztest_rll_unlock(rll);
}
-static ztest_zrl_t *
-ztest_zrl_init(rl_t *rl, ztest_znode_t *zp)
-{
- ztest_zrl_t *zrl = umem_alloc(sizeof (*zrl), UMEM_NOFAIL);
- zrl->z_rl = rl;
- zrl->z_ztznode = zp;
- return (zrl);
-}
-
-static void
-ztest_zrl_fini(ztest_zrl_t *zrl)
-{
- umem_free(zrl, sizeof (*zrl));
-}
-
-static ztest_zrl_t *
+static rl_t *
ztest_range_lock(ztest_ds_t *zd, uint64_t object, uint64_t offset,
uint64_t size, rl_type_t type)
{
- ztest_znode_t *zp = ztest_znode_get(zd, object);
- rl_t *rl = zfs_range_lock(&zp->z_range_lock, offset,
- size, type);
- return (ztest_zrl_init(rl, zp));
+	/*
+	 * Hash the object and offset onto one of the ZTEST_RANGE_LOCKS
+	 * slots. The returned handle records which slot was taken so that
+	 * ztest_range_unlock() can release it again.
+	 */
+	uint64_t slot = (object ^ (offset % (ZTEST_RANGE_LOCKS + 1))) &
+	    (ZTEST_RANGE_LOCKS - 1);
+	rll_t *bucket = &zd->zd_range_lock[slot];
+	rl_t *handle = umem_alloc(sizeof (rl_t), UMEM_NOFAIL);
+
+	handle->rl_lock = bucket;
+	handle->rl_size = size;
+	handle->rl_offset = offset;
+	handle->rl_object = object;
+
+	ztest_rll_lock(bucket, type);
+
+	return (handle);
}
static void
-ztest_range_unlock(ztest_ds_t *zd, ztest_zrl_t *zrl)
+ztest_range_unlock(rl_t *rl)
{
- zfs_range_unlock(zrl->z_rl);
- ztest_znode_put(zd, zrl->z_ztznode);
- ztest_zrl_fini(zrl);
+	/* Release the slot recorded at lock time before freeing the handle. */
+	ztest_rll_unlock(rl->rl_lock);
+	umem_free(rl, sizeof (rl_t));
}
static void
ztest_rll_init(&zd->zd_object_lock[l]);
for (l = 0; l < ZTEST_RANGE_LOCKS; l++)
- ztest_zll_init(&zd->zd_range_lock[l]);
+ ztest_rll_init(&zd->zd_range_lock[l]);
}
static void
ztest_rll_destroy(&zd->zd_object_lock[l]);
for (l = 0; l < ZTEST_RANGE_LOCKS; l++)
- ztest_zll_destroy(&zd->zd_range_lock[l]);
+ ztest_rll_destroy(&zd->zd_range_lock[l]);
}
#define TXG_MIGHTWAIT (ztest_random(10) == 0 ? TXG_NOWAIT : TXG_WAIT)
dmu_tx_t *tx;
dmu_buf_t *db;
arc_buf_t *abuf = NULL;
- ztest_zrl_t *rl;
+ rl_t *rl;
if (byteswap)
byteswap_uint64_array(lr, sizeof (*lr));
if (abuf != NULL)
dmu_return_arcbuf(abuf);
dmu_buf_rele(db, FTAG);
- ztest_range_unlock(zd, rl);
+ ztest_range_unlock(rl);
ztest_object_unlock(zd, lr->lr_foid);
return (ENOSPC);
}
dmu_tx_commit(tx);
- ztest_range_unlock(zd, rl);
+ ztest_range_unlock(rl);
ztest_object_unlock(zd, lr->lr_foid);
return (0);
objset_t *os = zd->zd_os;
dmu_tx_t *tx;
uint64_t txg;
- ztest_zrl_t *rl;
+ rl_t *rl;
if (byteswap)
byteswap_uint64_array(lr, sizeof (*lr));
txg = ztest_tx_assign(tx, TXG_WAIT, FTAG);
if (txg == 0) {
- ztest_range_unlock(zd, rl);
+ ztest_range_unlock(rl);
ztest_object_unlock(zd, lr->lr_foid);
return (ENOSPC);
}
dmu_tx_commit(tx);
- ztest_range_unlock(zd, rl);
+ ztest_range_unlock(rl);
ztest_object_unlock(zd, lr->lr_foid);
return (0);
/*
* ZIL get_data callbacks
*/
-typedef struct ztest_zgd_private {
- ztest_ds_t *z_zd;
- ztest_zrl_t *z_rl;
- uint64_t z_object;
-} ztest_zgd_private_t;
static void
ztest_get_done(zgd_t *zgd, int error)
{
- ztest_zgd_private_t *zzp = zgd->zgd_private;
- ztest_ds_t *zd = zzp->z_zd;
- uint64_t object = zzp->z_object;
+	ztest_ds_t *zd = zgd->zgd_private;
+	rl_t *rl = (rl_t *)zgd->zgd_lr;
+	/* Save the object ID now; the handle is freed by the unlock below. */
+	uint64_t object = rl->rl_object;
if (zgd->zgd_db)
dmu_buf_rele(zgd->zgd_db, zgd);
- ztest_range_unlock(zd, zzp->z_rl);
+	ztest_range_unlock(rl);
ztest_object_unlock(zd, object);
if (error == 0 && zgd->zgd_bp)
zil_lwb_add_block(zgd->zgd_lwb, zgd->zgd_bp);
umem_free(zgd, sizeof (*zgd));
- umem_free(zzp, sizeof (*zzp));
}
static int
dmu_buf_t *db;
zgd_t *zgd;
int error;
- ztest_zgd_private_t *zgd_private;
ASSERT3P(lwb, !=, NULL);
ASSERT3P(zio, !=, NULL);
zgd = umem_zalloc(sizeof (*zgd), UMEM_NOFAIL);
zgd->zgd_lwb = lwb;
- zgd_private = umem_zalloc(sizeof (ztest_zgd_private_t), UMEM_NOFAIL);
- zgd_private->z_zd = zd;
- zgd_private->z_object = object;
- zgd->zgd_private = zgd_private;
+ zgd->zgd_private = zd;
if (buf != NULL) { /* immediate write */
- zgd_private->z_rl = ztest_range_lock(zd, object, offset, size,
- RL_READER);
- zgd->zgd_rl = zgd_private->z_rl->z_rl;
+ zgd->zgd_lr = (struct locked_range *)ztest_range_lock(zd,
+ object, offset, size, RL_READER);
error = dmu_read(os, object, offset, size, buf,
DMU_READ_NO_PREFETCH);
offset = 0;
}
- zgd_private->z_rl = ztest_range_lock(zd, object, offset, size,
- RL_READER);
- zgd->zgd_rl = zgd_private->z_rl->z_rl;
+ zgd->zgd_lr = (struct locked_range *)ztest_range_lock(zd,
+ object, offset, size, RL_READER);
error = dmu_buf_hold(os, object, offset, zgd, &db,
DMU_READ_NO_PREFETCH);
objset_t *os = zd->zd_os;
dmu_tx_t *tx;
uint64_t txg;
- ztest_zrl_t *rl;
+ rl_t *rl;
txg_wait_synced(dmu_objset_pool(os), 0);
(void) dmu_free_long_range(os, object, offset, size);
}
- ztest_range_unlock(zd, rl);
+ ztest_range_unlock(rl);
ztest_object_unlock(zd, object);
}
* Use is subject to license terms.
*/
/*
- * Copyright (c) 2012 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
*/
/*
* Interface
* ---------
* Defined in zfs_rlock.h but essentially:
- * rl = zfs_range_lock(zp, off, len, lock_type);
- * zfs_range_unlock(rl);
- * zfs_range_reduce(rl, off, len);
+ * lr = rangelock_enter(rl, off, len, lock_type);
+ * rangelock_reduce(lr, off, len); // optional
+ * rangelock_exit(lr);
*
* AVL tree
* --------
*
* Common case
* -----------
- * The (hopefully) usual case is of no overlaps or contention for
- * locks. On entry to zfs_lock_range() a rl_t is allocated; the tree
- * searched that finds no overlap, and *this* rl_t is placed in the tree.
+ * The (hopefully) usual case is of no overlaps or contention for locks. On
+ * entry to rangelock_enter(), a locked_range_t is allocated; the tree is
+ * searched and, when no overlap is found, *this* locked_range_t is placed
+ * in the tree.
*
* Overlaps/Reference counting/Proxy locks
* ---------------------------------------
*
* Grow block handling
* -------------------
- * ZFS supports multiple block sizes currently up to 128K. The smallest
+ * ZFS supports multiple block sizes, up to 16MB. The smallest
* block size is used for the file which is grown as needed. During this
* growth all other writers and readers must be excluded.
* So if the block size needs to be grown then the whole file is
* exclusively locked, then later the caller will reduce the lock
- * range to just the range to be written using zfs_reduce_range.
+ * range to just the range to be written using rangelock_reduce().
*/
+#include <sys/zfs_context.h>
#include <sys/zfs_rlock.h>
-#include <sys/sysmacros.h>
+
+/*
+ * AVL comparator for the tree of locked ranges: entries are keyed solely
+ * by the starting offset (lr_offset) of each range.
+ */
+static int
+rangelock_compare(const void *arg1, const void *arg2)
+{
+	const locked_range_t *lhs = arg1;
+	const locked_range_t *rhs = arg2;
+	uint64_t lhs_off = lhs->lr_offset;
+	uint64_t rhs_off = rhs->lr_offset;
+
+	return (AVL_CMP(lhs_off, rhs_off));
+}
+
+/*
+ * Initialize a rangelock: remember the optional callback and its argument,
+ * then set up the mutex and the offset-ordered AVL tree of locked ranges.
+ *
+ * The callback is invoked when acquiring a RL_WRITER or RL_APPEND lock.
+ * It must convert RL_APPEND to RL_WRITER (starting at the end of the file),
+ * and may increase the range that's locked for RL_WRITER.
+ */
+void
+rangelock_init(rangelock_t *rl, rangelock_cb_t *cb, void *arg)
+{
+	rl->rl_arg = arg;
+	rl->rl_cb = cb;
+
+	mutex_init(&rl->rl_lock, NULL, MUTEX_DEFAULT, NULL);
+	avl_create(&rl->rl_tree, rangelock_compare, sizeof (locked_range_t),
+	    offsetof(locked_range_t, lr_node));
+}
+
+/*
+ * Release the resources held by a rangelock: its AVL tree of locked
+ * ranges and its mutex.
+ */
+void
+rangelock_fini(rangelock_t *rl)
+{
+	avl_destroy(&rl->rl_tree);
+	mutex_destroy(&rl->rl_lock);
+}
/*
* Check if a write lock can be grabbed, or wait and recheck until available.
*/
static void
-zfs_range_lock_writer(zfs_rlock_t *zrl, rl_t *new)
+rangelock_enter_writer(rangelock_t *rl, locked_range_t *new)
{
- avl_tree_t *tree = &zrl->zr_avl;
- rl_t *rl;
+ avl_tree_t *tree = &rl->rl_tree;
+ locked_range_t *lr;
avl_index_t where;
- uint64_t end_size;
- uint64_t off = new->r_off;
- uint64_t len = new->r_len;
+ uint64_t orig_off = new->lr_offset;
+ uint64_t orig_len = new->lr_length;
+ rangelock_type_t orig_type = new->lr_type;
for (;;) {
/*
- * Range locking is also used by zvol. However, for zvol, we
- * don't need to append or grow blocksize, so skip that
- * processing.
- *
- * Yes, this is ugly, and would be solved by not handling
- * grow or append in range lock code. If that was done then
- * we could make the range locking code generically available
- * to other non-zfs consumers.
+ * Invoke the callback, which may modify new->lr_offset,
+ * new->lr_length and new->lr_type. Note, the callback is used
+ * by the ZPL to handle appending and changing blocksizes. It
+ * isn't needed for zvols.
*/
- if (zrl->zr_size) { /* caller is ZPL */
- /*
- * If in append mode pick up the current end of file.
- * This is done under z_range_lock to avoid races.
- */
- if (new->r_type == RL_APPEND)
- new->r_off = *zrl->zr_size;
-
- /*
- * If we need to grow the block size then grab the whole
- * file range. This is also done under z_range_lock to
- * avoid races.
- */
- end_size = MAX(*zrl->zr_size, new->r_off + len);
- if (end_size > *zrl->zr_blksz &&
- (!ISP2(*zrl->zr_blksz) ||
- *zrl->zr_blksz < *zrl->zr_max_blksz)) {
- new->r_off = 0;
- new->r_len = UINT64_MAX;
- }
+ if (rl->rl_cb != NULL) {
+ rl->rl_cb(new, rl->rl_arg);
}
+ /*
+ * If the type was APPEND, the callback must convert it to
+ * WRITER. Without a callback the type is left untouched, so
+ * RL_APPEND is only usable when a callback is registered.
+ */
+ ASSERT3U(new->lr_type, ==, RL_WRITER);
+
/*
* First check for the usual case of no locks
*/
if (avl_numnodes(tree) == 0) {
- new->r_type = RL_WRITER; /* convert to writer */
avl_add(tree, new);
return;
}
/*
* Look for any locks in the range.
*/
- rl = avl_find(tree, new, &where);
- if (rl)
+ lr = avl_find(tree, new, &where);
+ if (lr != NULL)
goto wait; /* already locked at same offset */
- rl = (rl_t *)avl_nearest(tree, where, AVL_AFTER);
- if (rl && (rl->r_off < new->r_off + new->r_len))
+ lr = (locked_range_t *)avl_nearest(tree, where, AVL_AFTER);
+ if (lr != NULL &&
+ lr->lr_offset < new->lr_offset + new->lr_length)
goto wait;
- rl = (rl_t *)avl_nearest(tree, where, AVL_BEFORE);
- if (rl && rl->r_off + rl->r_len > new->r_off)
+ lr = (locked_range_t *)avl_nearest(tree, where, AVL_BEFORE);
+ if (lr != NULL &&
+ lr->lr_offset + lr->lr_length > new->lr_offset)
goto wait;
- new->r_type = RL_WRITER; /* convert possible RL_APPEND */
avl_insert(tree, new, where);
return;
wait:
- if (!rl->r_write_wanted) {
- cv_init(&rl->r_wr_cv, NULL, CV_DEFAULT, NULL);
- rl->r_write_wanted = B_TRUE;
+ if (!lr->lr_write_wanted) {
+ cv_init(&lr->lr_write_cv, NULL, CV_DEFAULT, NULL);
+ lr->lr_write_wanted = B_TRUE;
}
- cv_wait(&rl->r_wr_cv, &zrl->zr_mutex);
+ cv_wait(&lr->lr_write_cv, &rl->rl_lock);
/* reset to original */
- new->r_off = off;
- new->r_len = len;
+ new->lr_offset = orig_off;
+ new->lr_length = orig_len;
+ new->lr_type = orig_type;
}
}
* If this is an original (non-proxy) lock then replace it by
* a proxy and return the proxy.
*/
-static rl_t *
-zfs_range_proxify(avl_tree_t *tree, rl_t *rl)
+static locked_range_t *
+rangelock_proxify(avl_tree_t *tree, locked_range_t *lr)
{
- rl_t *proxy;
+ locked_range_t *proxy;
- if (rl->r_proxy)
- return (rl); /* already a proxy */
+ if (lr->lr_proxy)
+ return (lr); /* already a proxy */
- ASSERT3U(rl->r_cnt, ==, 1);
- ASSERT(rl->r_write_wanted == B_FALSE);
- ASSERT(rl->r_read_wanted == B_FALSE);
- avl_remove(tree, rl);
- rl->r_cnt = 0;
+ ASSERT3U(lr->lr_count, ==, 1);
+ ASSERT(lr->lr_write_wanted == B_FALSE);
+ ASSERT(lr->lr_read_wanted == B_FALSE);
+ avl_remove(tree, lr);
+ lr->lr_count = 0;
/* create a proxy range lock */
- proxy = kmem_alloc(sizeof (rl_t), KM_SLEEP);
- proxy->r_off = rl->r_off;
- proxy->r_len = rl->r_len;
- proxy->r_cnt = 1;
- proxy->r_type = RL_READER;
- proxy->r_proxy = B_TRUE;
- proxy->r_write_wanted = B_FALSE;
- proxy->r_read_wanted = B_FALSE;
+ proxy = kmem_alloc(sizeof (locked_range_t), KM_SLEEP);
+ proxy->lr_offset = lr->lr_offset;
+ proxy->lr_length = lr->lr_length;
+ proxy->lr_count = 1;
+ proxy->lr_type = RL_READER;
+ proxy->lr_proxy = B_TRUE;
+ proxy->lr_write_wanted = B_FALSE;
+ proxy->lr_read_wanted = B_FALSE;
avl_add(tree, proxy);
return (proxy);
* Split the range lock at the supplied offset
* returning the *front* proxy.
*/
-static rl_t *
-zfs_range_split(avl_tree_t *tree, rl_t *rl, uint64_t off)
+static locked_range_t *
+rangelock_split(avl_tree_t *tree, locked_range_t *lr, uint64_t off)
{
- rl_t *front, *rear;
-
- ASSERT3U(rl->r_len, >, 1);
- ASSERT3U(off, >, rl->r_off);
- ASSERT3U(off, <, rl->r_off + rl->r_len);
- ASSERT(rl->r_write_wanted == B_FALSE);
- ASSERT(rl->r_read_wanted == B_FALSE);
+ ASSERT3U(lr->lr_length, >, 1);
+ ASSERT3U(off, >, lr->lr_offset);
+ ASSERT3U(off, <, lr->lr_offset + lr->lr_length);
+ ASSERT(lr->lr_write_wanted == B_FALSE);
+ ASSERT(lr->lr_read_wanted == B_FALSE);
/* create the rear proxy range lock */
- rear = kmem_alloc(sizeof (rl_t), KM_SLEEP);
- rear->r_off = off;
- rear->r_len = rl->r_off + rl->r_len - off;
- rear->r_cnt = rl->r_cnt;
- rear->r_type = RL_READER;
- rear->r_proxy = B_TRUE;
- rear->r_write_wanted = B_FALSE;
- rear->r_read_wanted = B_FALSE;
-
- front = zfs_range_proxify(tree, rl);
- front->r_len = off - rl->r_off;
+ locked_range_t *rear = kmem_alloc(sizeof (locked_range_t), KM_SLEEP);
+ rear->lr_offset = off;
+ rear->lr_length = lr->lr_offset + lr->lr_length - off;
+ rear->lr_count = lr->lr_count;
+ rear->lr_type = RL_READER;
+ rear->lr_proxy = B_TRUE;
+ rear->lr_write_wanted = B_FALSE;
+ rear->lr_read_wanted = B_FALSE;
+
+ locked_range_t *front = rangelock_proxify(tree, lr);
+ front->lr_length = off - lr->lr_offset;
avl_insert_here(tree, rear, front, AVL_AFTER);
return (front);
* Create and add a new proxy range lock for the supplied range.
*/
static void
-zfs_range_new_proxy(avl_tree_t *tree, uint64_t off, uint64_t len)
+rangelock_new_proxy(avl_tree_t *tree, uint64_t off, uint64_t len)
{
- rl_t *rl;
-
- ASSERT(len);
- rl = kmem_alloc(sizeof (rl_t), KM_SLEEP);
- rl->r_off = off;
- rl->r_len = len;
- rl->r_cnt = 1;
- rl->r_type = RL_READER;
- rl->r_proxy = B_TRUE;
- rl->r_write_wanted = B_FALSE;
- rl->r_read_wanted = B_FALSE;
- avl_add(tree, rl);
+	locked_range_t *proxy;
+
+	ASSERT(len != 0);
+
+	/* A fresh proxy is a singly-referenced reader that nobody waits on. */
+	proxy = kmem_alloc(sizeof (*proxy), KM_SLEEP);
+	proxy->lr_proxy = B_TRUE;
+	proxy->lr_type = RL_READER;
+	proxy->lr_count = 1;
+	proxy->lr_offset = off;
+	proxy->lr_length = len;
+	proxy->lr_read_wanted = B_FALSE;
+	proxy->lr_write_wanted = B_FALSE;
+
+	avl_add(tree, proxy);
}
static void
-zfs_range_add_reader(avl_tree_t *tree, rl_t *new, rl_t *prev, avl_index_t where)
+rangelock_add_reader(avl_tree_t *tree, locked_range_t *new,
+ locked_range_t *prev, avl_index_t where)
{
- rl_t *next;
- uint64_t off = new->r_off;
- uint64_t len = new->r_len;
+ locked_range_t *next;
+ uint64_t off = new->lr_offset;
+ uint64_t len = new->lr_length;
/*
* prev arrives either:
* range may overlap with the new range
* - null, if there were no ranges starting before the new one
*/
- if (prev) {
- if (prev->r_off + prev->r_len <= off) {
+ if (prev != NULL) {
+ if (prev->lr_offset + prev->lr_length <= off) {
prev = NULL;
- } else if (prev->r_off != off) {
+ } else if (prev->lr_offset != off) {
/*
* convert to proxy if needed then
* split this entry and bump ref count
*/
- prev = zfs_range_split(tree, prev, off);
+ prev = rangelock_split(tree, prev, off);
prev = AVL_NEXT(tree, prev); /* move to rear range */
}
}
- ASSERT((prev == NULL) || (prev->r_off == off));
+ ASSERT((prev == NULL) || (prev->lr_offset == off));
- if (prev)
+ if (prev != NULL)
next = prev;
else
- next = (rl_t *)avl_nearest(tree, where, AVL_AFTER);
+ next = avl_nearest(tree, where, AVL_AFTER);
- if (next == NULL || off + len <= next->r_off) {
+ if (next == NULL || off + len <= next->lr_offset) {
/* no overlaps, use the original new rl_t in the tree */
avl_insert(tree, new, where);
return;
}
- if (off < next->r_off) {
+ if (off < next->lr_offset) {
/* Add a proxy for initial range before the overlap */
- zfs_range_new_proxy(tree, off, next->r_off - off);
+ rangelock_new_proxy(tree, off, next->lr_offset - off);
}
- new->r_cnt = 0; /* will use proxies in tree */
+ new->lr_count = 0; /* will use proxies in tree */
/*
* We now search forward through the ranges, until we go past the end
* of the new range. For each entry we make it a proxy if it
* gaps between the ranges then we create a new proxy range.
*/
for (prev = NULL; next; prev = next, next = AVL_NEXT(tree, next)) {
- if (off + len <= next->r_off)
+ if (off + len <= next->lr_offset)
break;
- if (prev && prev->r_off + prev->r_len < next->r_off) {
+ if (prev != NULL && prev->lr_offset + prev->lr_length <
+ next->lr_offset) {
/* there's a gap */
- ASSERT3U(next->r_off, >, prev->r_off + prev->r_len);
- zfs_range_new_proxy(tree, prev->r_off + prev->r_len,
- next->r_off - (prev->r_off + prev->r_len));
+ ASSERT3U(next->lr_offset, >,
+ prev->lr_offset + prev->lr_length);
+ rangelock_new_proxy(tree,
+ prev->lr_offset + prev->lr_length,
+ next->lr_offset -
+ (prev->lr_offset + prev->lr_length));
}
- if (off + len == next->r_off + next->r_len) {
+ if (off + len == next->lr_offset + next->lr_length) {
/* exact overlap with end */
- next = zfs_range_proxify(tree, next);
- next->r_cnt++;
+ next = rangelock_proxify(tree, next);
+ next->lr_count++;
return;
}
- if (off + len < next->r_off + next->r_len) {
+ if (off + len < next->lr_offset + next->lr_length) {
/* new range ends in the middle of this block */
- next = zfs_range_split(tree, next, off + len);
- next->r_cnt++;
+ next = rangelock_split(tree, next, off + len);
+ next->lr_count++;
return;
}
- ASSERT3U(off + len, >, next->r_off + next->r_len);
- next = zfs_range_proxify(tree, next);
- next->r_cnt++;
+ ASSERT3U(off + len, >, next->lr_offset + next->lr_length);
+ next = rangelock_proxify(tree, next);
+ next->lr_count++;
}
/* Add the remaining end range. */
- zfs_range_new_proxy(tree, prev->r_off + prev->r_len,
- (off + len) - (prev->r_off + prev->r_len));
+ rangelock_new_proxy(tree, prev->lr_offset + prev->lr_length,
+ (off + len) - (prev->lr_offset + prev->lr_length));
}
/*
* Check if a reader lock can be grabbed, or wait and recheck until available.
*/
static void
-zfs_range_lock_reader(zfs_rlock_t *zrl, rl_t *new)
+rangelock_enter_reader(rangelock_t *rl, locked_range_t *new)
{
- avl_tree_t *tree = &zrl->zr_avl;
- rl_t *prev, *next;
+ avl_tree_t *tree = &rl->rl_tree;
+ locked_range_t *prev, *next;
avl_index_t where;
- uint64_t off = new->r_off;
- uint64_t len = new->r_len;
+ uint64_t off = new->lr_offset;
+ uint64_t len = new->lr_length;
/*
* Look for any writer locks in the range.
retry:
prev = avl_find(tree, new, &where);
if (prev == NULL)
- prev = (rl_t *)avl_nearest(tree, where, AVL_BEFORE);
+ prev = (locked_range_t *)avl_nearest(tree, where, AVL_BEFORE);
/*
* Check the previous range for a writer lock overlap.
*/
- if (prev && (off < prev->r_off + prev->r_len)) {
- if ((prev->r_type == RL_WRITER) || (prev->r_write_wanted)) {
- if (!prev->r_read_wanted) {
- cv_init(&prev->r_rd_cv, NULL, CV_DEFAULT, NULL);
- prev->r_read_wanted = B_TRUE;
+ if (prev && (off < prev->lr_offset + prev->lr_length)) {
+ if ((prev->lr_type == RL_WRITER) || (prev->lr_write_wanted)) {
+ if (!prev->lr_read_wanted) {
+ cv_init(&prev->lr_read_cv,
+ NULL, CV_DEFAULT, NULL);
+ prev->lr_read_wanted = B_TRUE;
}
- cv_wait(&prev->r_rd_cv, &zrl->zr_mutex);
+ cv_wait(&prev->lr_read_cv, &rl->rl_lock);
goto retry;
}
- if (off + len < prev->r_off + prev->r_len)
+ if (off + len < prev->lr_offset + prev->lr_length)
goto got_lock;
}
* Search through the following ranges to see if there's
* write lock any overlap.
*/
- if (prev)
+ if (prev != NULL)
next = AVL_NEXT(tree, prev);
else
- next = (rl_t *)avl_nearest(tree, where, AVL_AFTER);
- for (; next; next = AVL_NEXT(tree, next)) {
- if (off + len <= next->r_off)
+ next = (locked_range_t *)avl_nearest(tree, where, AVL_AFTER);
+ for (; next != NULL; next = AVL_NEXT(tree, next)) {
+ if (off + len <= next->lr_offset)
goto got_lock;
- if ((next->r_type == RL_WRITER) || (next->r_write_wanted)) {
- if (!next->r_read_wanted) {
- cv_init(&next->r_rd_cv, NULL, CV_DEFAULT, NULL);
- next->r_read_wanted = B_TRUE;
+ if ((next->lr_type == RL_WRITER) || (next->lr_write_wanted)) {
+ if (!next->lr_read_wanted) {
+ cv_init(&next->lr_read_cv,
+ NULL, CV_DEFAULT, NULL);
+ next->lr_read_wanted = B_TRUE;
}
- cv_wait(&next->r_rd_cv, &zrl->zr_mutex);
+ cv_wait(&next->lr_read_cv, &rl->rl_lock);
goto retry;
}
- if (off + len <= next->r_off + next->r_len)
+ if (off + len <= next->lr_offset + next->lr_length)
goto got_lock;
}
got_lock:
/*
* Add the read lock, which may involve splitting existing
- * locks and bumping ref counts (r_cnt).
+ * locks and bumping ref counts (lr_count).
*/
- zfs_range_add_reader(tree, new, prev, where);
+ rangelock_add_reader(tree, new, prev, where);
}
/*
- * Lock a range (offset, length) as either shared (RL_READER)
- * or exclusive (RL_WRITER). Returns the range lock structure
- * for later unlocking or reduce range (if entire file
- * previously locked as RL_WRITER).
+ * Lock a range (offset, length) as either shared (RL_READER) or exclusive
+ * (RL_WRITER or RL_APPEND). If RL_APPEND is specified, rl_cb() will convert
+ * it to a RL_WRITER lock (with the offset at the end of the file). Returns
+ * the range lock structure for later unlocking (or reduce range if the
+ * entire file is locked as RL_WRITER).
+ *
+ * If off + len would wrap around, len is clamped to UINT64_MAX - off. The
+ * returned locked_range_t is allocated here with KM_SLEEP and is handed
+ * back to rangelock_exit() to release the lock.
*/
-rl_t *
-zfs_range_lock(zfs_rlock_t *zrl, uint64_t off, uint64_t len, rl_type_t type)
+locked_range_t *
+rangelock_enter(rangelock_t *rl, uint64_t off, uint64_t len,
+ rangelock_type_t type)
{
- rl_t *new;
-
ASSERT(type == RL_READER || type == RL_WRITER || type == RL_APPEND);
- new = kmem_alloc(sizeof (rl_t), KM_SLEEP);
- new->r_zrl = zrl;
- new->r_off = off;
+ locked_range_t *new = kmem_alloc(sizeof (locked_range_t), KM_SLEEP);
+ new->lr_rangelock = rl;
+ new->lr_offset = off;
if (len + off < off) /* overflow */
len = UINT64_MAX - off;
- new->r_len = len;
- new->r_cnt = 1; /* assume it's going to be in the tree */
- new->r_type = type;
- new->r_proxy = B_FALSE;
- new->r_write_wanted = B_FALSE;
- new->r_read_wanted = B_FALSE;
-
- mutex_enter(&zrl->zr_mutex);
+ new->lr_length = len;
+ new->lr_count = 1; /* assume it's going to be in the tree */
+ new->lr_type = type;
+ new->lr_proxy = B_FALSE;
+ new->lr_write_wanted = B_FALSE;
+ new->lr_read_wanted = B_FALSE;
+
+ mutex_enter(&rl->rl_lock);
if (type == RL_READER) {
/*
* First check for the usual case of no locks
*/
- if (avl_numnodes(&zrl->zr_avl) == 0)
- avl_add(&zrl->zr_avl, new);
+ if (avl_numnodes(&rl->rl_tree) == 0)
+ avl_add(&rl->rl_tree, new);
else
- zfs_range_lock_reader(zrl, new);
- } else /* RL_WRITER or RL_APPEND */
- zfs_range_lock_writer(zrl, new);
- mutex_exit(&zrl->zr_mutex);
+ rangelock_enter_reader(rl, new);
+ } else
+ rangelock_enter_writer(rl, new); /* RL_WRITER or RL_APPEND */
+ mutex_exit(&rl->rl_lock);
return (new);
}
+/*
+ * Safely free the locked_range_t. The condition variables are only
+ * initialized when a waiter sets the matching lr_write_wanted or
+ * lr_read_wanted flag, so each one is destroyed only if its flag is set.
+ */
static void
-zfs_range_free(void *arg)
+rangelock_free(locked_range_t *lr)
{
- rl_t *rl = arg;
+ if (lr->lr_write_wanted)
+ cv_destroy(&lr->lr_write_cv);
- if (rl->r_write_wanted)
- cv_destroy(&rl->r_wr_cv);
+ if (lr->lr_read_wanted)
+ cv_destroy(&lr->lr_read_cv);
- if (rl->r_read_wanted)
- cv_destroy(&rl->r_rd_cv);
-
- kmem_free(rl, sizeof (rl_t));
+ kmem_free(lr, sizeof (locked_range_t));
}
/*
* Unlock a reader lock
*/
static void
-zfs_range_unlock_reader(zfs_rlock_t *zrl, rl_t *remove, list_t *free_list)
+rangelock_exit_reader(rangelock_t *rl, locked_range_t *remove,
+ list_t *free_list)
{
- avl_tree_t *tree = &zrl->zr_avl;
- rl_t *rl, *next = NULL;
+ avl_tree_t *tree = &rl->rl_tree;
uint64_t len;
/*
* removed from the tree and replaced by proxies (one or
* more ranges mapping to the entire range).
*/
- if (remove->r_cnt == 1) {
+ if (remove->lr_count == 1) {
avl_remove(tree, remove);
-
- if (remove->r_write_wanted)
- cv_broadcast(&remove->r_wr_cv);
-
- if (remove->r_read_wanted)
- cv_broadcast(&remove->r_rd_cv);
-
+ if (remove->lr_write_wanted)
+ cv_broadcast(&remove->lr_write_cv);
+ if (remove->lr_read_wanted)
+ cv_broadcast(&remove->lr_read_cv);
list_insert_tail(free_list, remove);
} else {
- ASSERT0(remove->r_cnt);
- ASSERT0(remove->r_write_wanted);
- ASSERT0(remove->r_read_wanted);
+ ASSERT0(remove->lr_count);
+ ASSERT0(remove->lr_write_wanted);
+ ASSERT0(remove->lr_read_wanted);
/*
* Find start proxy representing this reader lock,
* then decrement ref count on all proxies
* that make up this range, freeing them as needed.
*/
- rl = avl_find(tree, remove, NULL);
- ASSERT(rl);
- ASSERT(rl->r_cnt);
- ASSERT(rl->r_type == RL_READER);
- for (len = remove->r_len; len != 0; rl = next) {
- len -= rl->r_len;
- if (len) {
- next = AVL_NEXT(tree, rl);
- ASSERT(next);
- ASSERT(rl->r_off + rl->r_len == next->r_off);
- ASSERT(next->r_cnt);
- ASSERT(next->r_type == RL_READER);
+ locked_range_t *lr = avl_find(tree, remove, NULL);
+ ASSERT3P(lr, !=, NULL);
+ ASSERT3U(lr->lr_count, !=, 0);
+ ASSERT3U(lr->lr_type, ==, RL_READER);
+ locked_range_t *next = NULL;
+ for (len = remove->lr_length; len != 0; lr = next) {
+ len -= lr->lr_length;
+ if (len != 0) {
+ next = AVL_NEXT(tree, lr);
+ ASSERT3P(next, !=, NULL);
+ ASSERT3U(lr->lr_offset + lr->lr_length, ==,
+ next->lr_offset);
+ ASSERT3U(next->lr_count, !=, 0);
+ ASSERT3U(next->lr_type, ==, RL_READER);
}
- rl->r_cnt--;
- if (rl->r_cnt == 0) {
- avl_remove(tree, rl);
-
- if (rl->r_write_wanted)
- cv_broadcast(&rl->r_wr_cv);
-
- if (rl->r_read_wanted)
- cv_broadcast(&rl->r_rd_cv);
-
- list_insert_tail(free_list, rl);
+ lr->lr_count--;
+ if (lr->lr_count == 0) {
+ avl_remove(tree, lr);
+ if (lr->lr_write_wanted)
+ cv_broadcast(&lr->lr_write_cv);
+ if (lr->lr_read_wanted)
+ cv_broadcast(&lr->lr_read_cv);
+ list_insert_tail(free_list, lr);
}
}
-
- kmem_free(remove, sizeof (rl_t));
+ kmem_free(remove, sizeof (locked_range_t));
}
}
* Unlock range and destroy range lock structure.
*/
void
-zfs_range_unlock(rl_t *rl)
+rangelock_exit(locked_range_t *lr)
{
- zfs_rlock_t *zrl = rl->r_zrl;
+ rangelock_t *rl = lr->lr_rangelock;
list_t free_list;
- rl_t *free_rl;
-
- ASSERT(rl->r_type == RL_WRITER || rl->r_type == RL_READER);
- ASSERT(rl->r_cnt == 1 || rl->r_cnt == 0);
- ASSERT(!rl->r_proxy);
- list_create(&free_list, sizeof (rl_t), offsetof(rl_t, rl_node));
+ locked_range_t *free_lr;
- mutex_enter(&zrl->zr_mutex);
- if (rl->r_type == RL_WRITER) {
- /* writer locks can't be shared or split */
- avl_remove(&zrl->zr_avl, rl);
- if (rl->r_write_wanted)
- cv_broadcast(&rl->r_wr_cv);
+ ASSERT(lr->lr_type == RL_WRITER || lr->lr_type == RL_READER);
+ ASSERT(lr->lr_count == 1 || lr->lr_count == 0);
+ ASSERT(!lr->lr_proxy);
- if (rl->r_read_wanted)
- cv_broadcast(&rl->r_rd_cv);
+ /*
+ * The free list is used to defer the cv_destroy() and
+ * subsequent kmem_free until after the mutex is dropped.
+ * Entries are linked through the same lr_node field that the
+ * AVL tree uses, so a range is only put on the list after it
+ * has been removed from the tree.
+ */
+ list_create(&free_list, sizeof (locked_range_t),
+ offsetof(locked_range_t, lr_node));
- list_insert_tail(&free_list, rl);
+ mutex_enter(&rl->rl_lock);
+ if (lr->lr_type == RL_WRITER) {
+ /* writer locks can't be shared or split */
+ avl_remove(&rl->rl_tree, lr);
+ if (lr->lr_write_wanted)
+ cv_broadcast(&lr->lr_write_cv);
+ if (lr->lr_read_wanted)
+ cv_broadcast(&lr->lr_read_cv);
+ list_insert_tail(&free_list, lr);
} else {
/*
- * lock may be shared, let zfs_range_unlock_reader()
- * release the zp->z_range_lock lock and free the rl_t
+ * lock may be shared, let rangelock_exit_reader()
+ * release the lock and free the locked_range_t.
*/
- zfs_range_unlock_reader(zrl, rl, &free_list);
+ rangelock_exit_reader(rl, lr, &free_list);
}
- mutex_exit(&zrl->zr_mutex);
+ mutex_exit(&rl->rl_lock);
- while ((free_rl = list_head(&free_list)) != NULL) {
- list_remove(&free_list, free_rl);
- zfs_range_free(free_rl);
- }
+ while ((free_lr = list_remove_head(&free_list)) != NULL)
+ rangelock_free(free_lr);
list_destroy(&free_list);
}
/*
* Reduce range locked as RL_WRITER from whole file to specified range.
- * Asserts the whole file is exclusivly locked and so there's only one
+ * Asserts the whole file is exclusively locked and so there's only one
* entry in the tree.
+ *
+ * lr is the whole-file writer lock returned by rangelock_enter(); off and
+ * len describe the range that remains locked afterwards. Any threads
+ * waiting on lr are woken so that they can re-check for overlap against
+ * the reduced range.
*/
void
-zfs_range_reduce(rl_t *rl, uint64_t off, uint64_t len)
+rangelock_reduce(locked_range_t *lr, uint64_t off, uint64_t len)
{
- zfs_rlock_t *zrl = rl->r_zrl;
+	rangelock_t *rl = lr->lr_rangelock;
/* Ensure there are no other locks */
- ASSERT(avl_numnodes(&zrl->zr_avl) == 1);
- ASSERT(rl->r_off == 0);
- ASSERT(rl->r_type == RL_WRITER);
- ASSERT(!rl->r_proxy);
- ASSERT3U(rl->r_len, ==, UINT64_MAX);
- ASSERT3U(rl->r_cnt, ==, 1);
-
- mutex_enter(&zrl->zr_mutex);
- rl->r_off = off;
- rl->r_len = len;
-
- if (rl->r_write_wanted)
- cv_broadcast(&rl->r_wr_cv);
- if (rl->r_read_wanted)
- cv_broadcast(&rl->r_rd_cv);
-
- mutex_exit(&zrl->zr_mutex);
-}
-
-/*
- * AVL comparison function used to order range locks
- * Locks are ordered on the start offset of the range.
- */
-int
-zfs_range_compare(const void *arg1, const void *arg2)
-{
- const rl_t *rl1 = (const rl_t *)arg1;
- const rl_t *rl2 = (const rl_t *)arg2;
-
- return (AVL_CMP(rl1->r_off, rl2->r_off));
+	ASSERT3U(avl_numnodes(&rl->rl_tree), ==, 1);
+	ASSERT3U(lr->lr_offset, ==, 0);
+	ASSERT3U(lr->lr_type, ==, RL_WRITER);
+	ASSERT(!lr->lr_proxy);
+	ASSERT3U(lr->lr_length, ==, UINT64_MAX);
+	ASSERT3U(lr->lr_count, ==, 1);
+
+	mutex_enter(&rl->rl_lock);
+	lr->lr_offset = off;
+	lr->lr_length = len;
+
+	/*
+	 * Waiters set lr_write_wanted/lr_read_wanted and initialize the
+	 * matching condition variable while holding rl_lock, so test the
+	 * flags and issue the wakeups before dropping the lock rather than
+	 * racing with a waiter that is still setting them up.
+	 */
+	if (lr->lr_write_wanted)
+		cv_broadcast(&lr->lr_write_cv);
+	if (lr->lr_read_wanted)
+		cv_broadcast(&lr->lr_read_cv);
+	mutex_exit(&rl->rl_lock);
}
-#ifdef _KERNEL
-EXPORT_SYMBOL(zfs_range_lock);
-EXPORT_SYMBOL(zfs_range_unlock);
-EXPORT_SYMBOL(zfs_range_reduce);
-EXPORT_SYMBOL(zfs_range_compare);
+#if defined(_KERNEL)
+EXPORT_SYMBOL(rangelock_init);
+EXPORT_SYMBOL(rangelock_fini);
+EXPORT_SYMBOL(rangelock_enter);
+EXPORT_SYMBOL(rangelock_exit);
+EXPORT_SYMBOL(rangelock_reduce);
#endif