OpenZFS 9689 - zfs range lock code should not be zpl-specific

author Matt Ahrens <mahrens@delphix.com>

Mon, 1 Oct 2018 22:13:12 +0000 (15:13 -0700)

committer Brian Behlendorf <behlendorf1@llnl.gov>

Thu, 11 Oct 2018 17:19:33 +0000 (10:19 -0700)
author Matt Ahrens <mahrens@delphix.com>
Mon, 1 Oct 2018 22:13:12 +0000 (15:13 -0700)
committer Brian Behlendorf <behlendorf1@llnl.gov>
Thu, 11 Oct 2018 17:19:33 +0000 (10:19 -0700)
diff --git a/cmd/ztest/ztest.c b/cmd/ztest/ztest.c

index 40b107175edca47f2b5ba044697e32fe9c646930..cfd7bc12f43d693c554f93bf20ebf948017d6639 100644 (file)
--- a/cmd/ztest/ztest.c
+++ b/cmd/ztest/ztest.c
@@ -104,7 +104,6 @@
  #include <sys/zio.h>
  #include <sys/zil.h>
  #include <sys/zil_impl.h>
-#include <sys/zfs_rlock.h>
  #include <sys/vdev_impl.h>
  #include <sys/vdev_file.h>
  #include <sys/spa_impl.h>
@@ -258,6 +257,17 @@ typedef struct bufwad {
         uint64_t        bw_data;
  } bufwad_t;
  
+/*
+ * It would be better to use a rangelock_t per object.  Unfortunately
+ * the rangelock_t is not a drop-in replacement for rl_t, because we
+ * still need to map from object ID to rangelock_t.
+ */
+typedef enum {
+       RL_READER,
+       RL_WRITER,
+       RL_APPEND
+} rl_type_t;
+
  typedef struct rll {
         void            *rll_writer;
         int             rll_readers;
@@ -265,10 +275,12 @@ typedef struct rll {
         kcondvar_t      rll_cv;
  } rll_t;
  
-typedef struct zll {
-       list_t z_list;
-       kmutex_t z_lock;
-} zll_t;
+typedef struct rl {
+       uint64_t        rl_object;
+       uint64_t        rl_offset;
+       uint64_t        rl_size;
+       rll_t           *rl_lock;
+} rl_t;
  
  #define        ZTEST_RANGE_LOCKS       64
  #define        ZTEST_OBJECT_LOCKS      64
@@ -301,7 +313,7 @@ typedef struct ztest_ds {
         char            zd_name[ZFS_MAX_DATASET_NAME_LEN];
         kmutex_t        zd_dirobj_lock;
         rll_t           zd_object_lock[ZTEST_OBJECT_LOCKS];
-       zll_t           zd_range_lock[ZTEST_RANGE_LOCKS];
+       rll_t           zd_range_lock[ZTEST_RANGE_LOCKS];
  } ztest_ds_t;
  
  /*
@@ -1318,100 +1330,6 @@ ztest_dmu_objset_own(const char *name, dmu_objset_type_t type,
         return (err);
  }
  
-
-/*
- * Object and range lock mechanics
- */
-typedef struct {
-       list_node_t z_lnode;
-       zfs_refcount_t z_refcnt;
-       uint64_t z_object;
-       zfs_rlock_t z_range_lock;
-} ztest_znode_t;
-
-typedef struct {
-       rl_t *z_rl;
-       ztest_znode_t *z_ztznode;
-} ztest_zrl_t;
-
-static ztest_znode_t *
-ztest_znode_init(uint64_t object)
-{
-       ztest_znode_t *zp = umem_alloc(sizeof (*zp), UMEM_NOFAIL);
-
-       list_link_init(&zp->z_lnode);
-       zfs_refcount_create(&zp->z_refcnt);
-       zp->z_object = object;
-       zfs_rlock_init(&zp->z_range_lock);
-
-       return (zp);
-}
-
-static void
-ztest_znode_fini(ztest_znode_t *zp)
-{
-       ASSERT(zfs_refcount_is_zero(&zp->z_refcnt));
-       zfs_rlock_destroy(&zp->z_range_lock);
-       zp->z_object = 0;
-       zfs_refcount_destroy(&zp->z_refcnt);
-       list_link_init(&zp->z_lnode);
-       umem_free(zp, sizeof (*zp));
-}
-
-static void
-ztest_zll_init(zll_t *zll)
-{
-       mutex_init(&zll->z_lock, NULL, MUTEX_DEFAULT, NULL);
-       list_create(&zll->z_list, sizeof (ztest_znode_t),
-           offsetof(ztest_znode_t, z_lnode));
-}
-
-static void
-ztest_zll_destroy(zll_t *zll)
-{
-       list_destroy(&zll->z_list);
-       mutex_destroy(&zll->z_lock);
-}
-
-#define        RL_TAG "range_lock"
-static ztest_znode_t *
-ztest_znode_get(ztest_ds_t *zd, uint64_t object)
-{
-       zll_t *zll = &zd->zd_range_lock[object & (ZTEST_OBJECT_LOCKS - 1)];
-       ztest_znode_t *zp = NULL;
-       mutex_enter(&zll->z_lock);
-       for (zp = list_head(&zll->z_list); (zp);
-           zp = list_next(&zll->z_list, zp)) {
-               if (zp->z_object == object) {
-                       zfs_refcount_add(&zp->z_refcnt, RL_TAG);
-                       break;
-               }
-       }
-       if (zp == NULL) {
-               zp = ztest_znode_init(object);
-               zfs_refcount_add(&zp->z_refcnt, RL_TAG);
-               list_insert_head(&zll->z_list, zp);
-       }
-       mutex_exit(&zll->z_lock);
-       return (zp);
-}
-
-static void
-ztest_znode_put(ztest_ds_t *zd, ztest_znode_t *zp)
-{
-       zll_t *zll = NULL;
-       ASSERT3U(zp->z_object, !=, 0);
-       zll = &zd->zd_range_lock[zp->z_object & (ZTEST_OBJECT_LOCKS - 1)];
-       mutex_enter(&zll->z_lock);
-       zfs_refcount_remove(&zp->z_refcnt, RL_TAG);
-       if (zfs_refcount_is_zero(&zp->z_refcnt)) {
-               list_remove(&zll->z_list, zp);
-               ztest_znode_fini(zp);
-       }
-       mutex_exit(&zll->z_lock);
-}
-
-
  static void
  ztest_rll_init(rll_t *rll)
  {
@@ -1484,37 +1402,33 @@ ztest_object_unlock(ztest_ds_t *zd, uint64_t object)
         ztest_rll_unlock(rll);
  }
  
-static ztest_zrl_t *
-ztest_zrl_init(rl_t *rl, ztest_znode_t *zp)
-{
-       ztest_zrl_t *zrl = umem_alloc(sizeof (*zrl), UMEM_NOFAIL);
-       zrl->z_rl = rl;
-       zrl->z_ztznode = zp;
-       return (zrl);
-}
-
-static void
-ztest_zrl_fini(ztest_zrl_t *zrl)
-{
-       umem_free(zrl, sizeof (*zrl));
-}
-
-static ztest_zrl_t *
+static rl_t *
  ztest_range_lock(ztest_ds_t *zd, uint64_t object, uint64_t offset,
      uint64_t size, rl_type_t type)
  {
-       ztest_znode_t *zp = ztest_znode_get(zd, object);
-       rl_t *rl = zfs_range_lock(&zp->z_range_lock, offset,
-           size, type);
-       return (ztest_zrl_init(rl, zp));
+       uint64_t hash = object ^ (offset % (ZTEST_RANGE_LOCKS + 1));
+       rll_t *rll = &zd->zd_range_lock[hash & (ZTEST_RANGE_LOCKS - 1)];
+       rl_t *rl;
+
+       rl = umem_alloc(sizeof (*rl), UMEM_NOFAIL);
+       rl->rl_object = object;
+       rl->rl_offset = offset;
+       rl->rl_size = size;
+       rl->rl_lock = rll;
+
+       ztest_rll_lock(rll, type);
+
+       return (rl);
  }
  
  static void
-ztest_range_unlock(ztest_ds_t *zd, ztest_zrl_t *zrl)
+ztest_range_unlock(rl_t *rl)
  {
-       zfs_range_unlock(zrl->z_rl);
-       ztest_znode_put(zd, zrl->z_ztznode);
-       ztest_zrl_fini(zrl);
+       rll_t *rll = rl->rl_lock;
+
+       ztest_rll_unlock(rll);
+
+       umem_free(rl, sizeof (*rl));
  }
  
  static void
@@ -1536,7 +1450,7 @@ ztest_zd_init(ztest_ds_t *zd, ztest_shared_ds_t *szd, objset_t *os)
                 ztest_rll_init(&zd->zd_object_lock[l]);
  
         for (l = 0; l < ZTEST_RANGE_LOCKS; l++)
-               ztest_zll_init(&zd->zd_range_lock[l]);
+               ztest_rll_init(&zd->zd_range_lock[l]);
  }
  
  static void
@@ -1551,7 +1465,7 @@ ztest_zd_fini(ztest_ds_t *zd)
                 ztest_rll_destroy(&zd->zd_object_lock[l]);
  
         for (l = 0; l < ZTEST_RANGE_LOCKS; l++)
-               ztest_zll_destroy(&zd->zd_range_lock[l]);
+               ztest_rll_destroy(&zd->zd_range_lock[l]);
  }
  
  #define        TXG_MIGHTWAIT   (ztest_random(10) == 0 ? TXG_NOWAIT : TXG_WAIT)
@@ -1967,7 +1881,7 @@ ztest_replay_write(void *arg1, void *arg2, boolean_t byteswap)
         dmu_tx_t *tx;
         dmu_buf_t *db;
         arc_buf_t *abuf = NULL;
-       ztest_zrl_t *rl;
+       rl_t *rl;
  
         if (byteswap)
                 byteswap_uint64_array(lr, sizeof (*lr));
@@ -2016,7 +1930,7 @@ ztest_replay_write(void *arg1, void *arg2, boolean_t byteswap)
                 if (abuf != NULL)
                         dmu_return_arcbuf(abuf);
                 dmu_buf_rele(db, FTAG);
-               ztest_range_unlock(zd, rl);
+               ztest_range_unlock(rl);
                 ztest_object_unlock(zd, lr->lr_foid);
                 return (ENOSPC);
         }
@@ -2074,7 +1988,7 @@ ztest_replay_write(void *arg1, void *arg2, boolean_t byteswap)
  
         dmu_tx_commit(tx);
  
-       ztest_range_unlock(zd, rl);
+       ztest_range_unlock(rl);
         ztest_object_unlock(zd, lr->lr_foid);
  
         return (0);
@@ -2088,7 +2002,7 @@ ztest_replay_truncate(void *arg1, void *arg2, boolean_t byteswap)
         objset_t *os = zd->zd_os;
         dmu_tx_t *tx;
         uint64_t txg;
-       ztest_zrl_t *rl;
+       rl_t *rl;
  
         if (byteswap)
                 byteswap_uint64_array(lr, sizeof (*lr));
@@ -2103,7 +2017,7 @@ ztest_replay_truncate(void *arg1, void *arg2, boolean_t byteswap)
  
         txg = ztest_tx_assign(tx, TXG_WAIT, FTAG);
         if (txg == 0) {
-               ztest_range_unlock(zd, rl);
+               ztest_range_unlock(rl);
                 ztest_object_unlock(zd, lr->lr_foid);
                 return (ENOSPC);
         }
@@ -2115,7 +2029,7 @@ ztest_replay_truncate(void *arg1, void *arg2, boolean_t byteswap)
  
         dmu_tx_commit(tx);
  
-       ztest_range_unlock(zd, rl);
+       ztest_range_unlock(rl);
         ztest_object_unlock(zd, lr->lr_foid);
  
         return (0);
@@ -2222,30 +2136,23 @@ zil_replay_func_t *ztest_replay_vector[TX_MAX_TYPE] = {
  /*
   * ZIL get_data callbacks
   */
-typedef struct ztest_zgd_private {
-       ztest_ds_t *z_zd;
-       ztest_zrl_t *z_rl;
-       uint64_t z_object;
-} ztest_zgd_private_t;
  
  static void
  ztest_get_done(zgd_t *zgd, int error)
  {
-       ztest_zgd_private_t *zzp = zgd->zgd_private;
-       ztest_ds_t *zd = zzp->z_zd;
-       uint64_t object = zzp->z_object;
+       ztest_ds_t *zd = zgd->zgd_private;
+       uint64_t object = ((rl_t *)zgd->zgd_lr)->rl_object;
  
         if (zgd->zgd_db)
                 dmu_buf_rele(zgd->zgd_db, zgd);
  
-       ztest_range_unlock(zd, zzp->z_rl);
+       ztest_range_unlock((rl_t *)zgd->zgd_lr);
         ztest_object_unlock(zd, object);
  
         if (error == 0 && zgd->zgd_bp)
                 zil_lwb_add_block(zgd->zgd_lwb, zgd->zgd_bp);
  
         umem_free(zgd, sizeof (*zgd));
-       umem_free(zzp, sizeof (*zzp));
  }
  
  static int
@@ -2263,7 +2170,6 @@ ztest_get_data(void *arg, lr_write_t *lr, char *buf, struct lwb *lwb,
         dmu_buf_t *db;
         zgd_t *zgd;
         int error;
-       ztest_zgd_private_t *zgd_private;
  
         ASSERT3P(lwb, !=, NULL);
         ASSERT3P(zio, !=, NULL);
@@ -2290,15 +2196,11 @@ ztest_get_data(void *arg, lr_write_t *lr, char *buf, struct lwb *lwb,
  
         zgd = umem_zalloc(sizeof (*zgd), UMEM_NOFAIL);
         zgd->zgd_lwb = lwb;
-       zgd_private = umem_zalloc(sizeof (ztest_zgd_private_t), UMEM_NOFAIL);
-       zgd_private->z_zd = zd;
-       zgd_private->z_object = object;
-       zgd->zgd_private = zgd_private;
+       zgd->zgd_private = zd;
  
         if (buf != NULL) {      /* immediate write */
-               zgd_private->z_rl = ztest_range_lock(zd, object, offset, size,
-                   RL_READER);
-               zgd->zgd_rl = zgd_private->z_rl->z_rl;
+               zgd->zgd_lr = (struct locked_range *)ztest_range_lock(zd,
+                   object, offset, size, RL_READER);
  
                 error = dmu_read(os, object, offset, size, buf,
                     DMU_READ_NO_PREFETCH);
@@ -2312,9 +2214,8 @@ ztest_get_data(void *arg, lr_write_t *lr, char *buf, struct lwb *lwb,
                         offset = 0;
                 }
  
-               zgd_private->z_rl = ztest_range_lock(zd, object, offset, size,
-                   RL_READER);
-               zgd->zgd_rl = zgd_private->z_rl->z_rl;
+               zgd->zgd_lr = (struct locked_range *)ztest_range_lock(zd,
+                   object, offset, size, RL_READER);
  
                 error = dmu_buf_hold(os, object, offset, zgd, &db,
                     DMU_READ_NO_PREFETCH);
@@ -2560,7 +2461,7 @@ ztest_prealloc(ztest_ds_t *zd, uint64_t object, uint64_t offset, uint64_t size)
         objset_t *os = zd->zd_os;
         dmu_tx_t *tx;
         uint64_t txg;
-       ztest_zrl_t *rl;
+       rl_t *rl;
  
         txg_wait_synced(dmu_objset_pool(os), 0);
  
@@ -2581,7 +2482,7 @@ ztest_prealloc(ztest_ds_t *zd, uint64_t object, uint64_t offset, uint64_t size)
                 (void) dmu_free_long_range(os, object, offset, size);
         }
  
-       ztest_range_unlock(zd, rl);
+       ztest_range_unlock(rl);
         ztest_object_unlock(zd, object);
  }
  
diff --git a/include/sys/dmu.h b/include/sys/dmu.h

index bc7046fdced8d56499ae826e65fd0eee8e0e319f..f8b5f096a1db278c7aa7dafa48ded0ca39419bd1 100644 (file)
--- a/include/sys/dmu.h
+++ b/include/sys/dmu.h
@@ -73,6 +73,7 @@ struct arc_buf;
  struct zio_prop;
  struct sa_handle;
  struct dsl_crypto_params;
+struct locked_range;
  
  typedef struct objset objset_t;
  typedef struct dmu_tx dmu_tx_t;
@@ -1034,7 +1035,7 @@ typedef struct zgd {
         struct lwb      *zgd_lwb;
         struct blkptr   *zgd_bp;
         dmu_buf_t       *zgd_db;
-       struct rl       *zgd_rl;
+       struct locked_range *zgd_lr;
         void            *zgd_private;
  } zgd_t;
  
diff --git a/include/sys/zfs_rlock.h b/include/sys/zfs_rlock.h

index 8483b4e8bf03c9052604a7fb7d83c4379b930052..05b080843d72a4a89d14cf89e450e36978d96dff 100644 (file)
--- a/include/sys/zfs_rlock.h
+++ b/include/sys/zfs_rlock.h
@@ -22,6 +22,9 @@
   * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
   * Use is subject to license terms.
   */
+/*
+ * Copyright (c) 2018 by Delphix. All rights reserved.
+ */
  
  #ifndef        _SYS_FS_ZFS_RLOCK_H
  #define        _SYS_FS_ZFS_RLOCK_H
@@ -30,85 +33,46 @@
  extern "C" {
  #endif
  
-#include <sys/list.h>
  #include <sys/avl.h>
  
-#ifdef _KERNEL
-#include <sys/condvar.h>
-#else
-#include <sys/zfs_context.h>
-#endif
-
  typedef enum {
         RL_READER,
         RL_WRITER,
         RL_APPEND
-} rl_type_t;
+} rangelock_type_t;
  
-typedef struct zfs_rlock {
-       kmutex_t zr_mutex;      /* protects changes to zr_avl */
-       avl_tree_t zr_avl;      /* avl tree of range locks */
-       uint64_t *zr_size;      /* points to znode->z_size */
-       uint_t *zr_blksz;       /* points to znode->z_blksz */
-       uint64_t *zr_max_blksz; /* points to zfsvfs->z_max_blksz */
-} zfs_rlock_t;
-
-typedef struct rl {
-       zfs_rlock_t *r_zrl;
-       avl_node_t r_node;      /* avl node link */
-       uint64_t r_off;         /* file range offset */
-       uint64_t r_len;         /* file range length */
-       uint_t r_cnt;           /* range reference count in tree */
-       rl_type_t r_type;       /* range type */
-       kcondvar_t r_wr_cv;     /* cv for waiting writers */
-       kcondvar_t r_rd_cv;     /* cv for waiting readers */
-       uint8_t r_proxy;        /* acting for original range */
-       uint8_t r_write_wanted; /* writer wants to lock this range */
-       uint8_t r_read_wanted;  /* reader wants to lock this range */
-       list_node_t rl_node;    /* used for deferred release */
-} rl_t;
-
-/*
- * Lock a range (offset, length) as either shared (RL_READER)
- * or exclusive (RL_WRITER or RL_APPEND).  RL_APPEND is a special type that
- * is converted to RL_WRITER that specified to lock from the start of the
- * end of file.  Returns the range lock structure.
- */
-rl_t *zfs_range_lock(zfs_rlock_t *zrl, uint64_t off, uint64_t len,
-    rl_type_t type);
+struct locked_range;
  
-/* Unlock range and destroy range lock structure. */
-void zfs_range_unlock(rl_t *rl);
+typedef void (rangelock_cb_t)(struct locked_range *, void *);
  
-/*
- * Reduce range locked as RW_WRITER from whole file to specified range.
- * Asserts the whole file was previously locked.
- */
-void zfs_range_reduce(rl_t *rl, uint64_t off, uint64_t len);
+typedef struct rangelock {
+       avl_tree_t rl_tree; /* contains locked_range_t */
+       kmutex_t rl_lock;
+       rangelock_cb_t *rl_cb;
+       void *rl_arg;
+} rangelock_t;
  
-/*
- * AVL comparison function used to order range locks
- * Locks are ordered on the start offset of the range.
- */
-int zfs_range_compare(const void *arg1, const void *arg2);
+typedef struct locked_range {
+       rangelock_t *lr_rangelock; /* rangelock that this lock applies to */
+       avl_node_t lr_node;     /* avl node link */
+       uint64_t lr_offset;     /* file range offset */
+       uint64_t lr_length;     /* file range length */
+       uint_t lr_count;        /* range reference count in tree */
+       rangelock_type_t lr_type; /* range type */
+       kcondvar_t lr_write_cv; /* cv for waiting writers */
+       kcondvar_t lr_read_cv;  /* cv for waiting readers */
+       uint8_t lr_proxy;       /* acting for original range */
+       uint8_t lr_write_wanted; /* writer wants to lock this range */
+       uint8_t lr_read_wanted; /* reader wants to lock this range */
+} locked_range_t;
  
-static inline void
-zfs_rlock_init(zfs_rlock_t *zrl)
-{
-       mutex_init(&zrl->zr_mutex, NULL, MUTEX_DEFAULT, NULL);
-       avl_create(&zrl->zr_avl, zfs_range_compare,
-           sizeof (rl_t), offsetof(rl_t, r_node));
-       zrl->zr_size = NULL;
-       zrl->zr_blksz = NULL;
-       zrl->zr_max_blksz = NULL;
-}
+void rangelock_init(rangelock_t *, rangelock_cb_t *, void *);
+void rangelock_fini(rangelock_t *);
  
-static inline void
-zfs_rlock_destroy(zfs_rlock_t *zrl)
-{
-       avl_destroy(&zrl->zr_avl);
-       mutex_destroy(&zrl->zr_mutex);
-}
+locked_range_t *rangelock_enter(rangelock_t *,
+    uint64_t, uint64_t, rangelock_type_t);
+void rangelock_exit(locked_range_t *);
+void rangelock_reduce(locked_range_t *, uint64_t, uint64_t);
  
  #ifdef __cplusplus
  }
diff --git a/include/sys/zfs_znode.h b/include/sys/zfs_znode.h

index 01f4328f040537114cf47e3609fbf1b4473b1223..5fa0986eacc3ca7ca841c7e6ef3cd2b778afbb6f 100644 (file)
--- a/include/sys/zfs_znode.h
+++ b/include/sys/zfs_znode.h
@@ -20,7 +20,7 @@
   */
  /*
   * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
   * Copyright 2016 Nexenta Systems, Inc. All rights reserved.
   */
  
@@ -191,7 +191,7 @@ typedef struct znode {
         krwlock_t       z_parent_lock;  /* parent lock for directories */
         krwlock_t       z_name_lock;    /* "master" lock for dirent locks */
         zfs_dirlock_t   *z_dirlocks;    /* directory entry lock list */
-       zfs_rlock_t     z_range_lock;   /* file range lock */
+       rangelock_t     z_rangelock;    /* file range locks */
         uint8_t         z_unlinked;     /* file has been unlinked */
         uint8_t         z_atime_dirty;  /* atime needs to be synced */
         uint8_t         z_zn_prefetch;  /* Prefetch znodes? */
diff --git a/module/zfs/dmu.c b/module/zfs/dmu.c

index 2ff484b634756c7e2e851f952be074cc9638d4e8..180c1f12fc9c2d7239e51c28a656eebde112d10d 100644 (file)
--- a/module/zfs/dmu.c
+++ b/module/zfs/dmu.c
@@ -1924,11 +1924,6 @@ dmu_sync(zio_t *pio, uint64_t txg, dmu_sync_cb_t *done, zgd_t *zgd)
         ASSERT(pio != NULL);
         ASSERT(txg != 0);
  
-       /* dbuf is within the locked range */
-       ASSERT3U(db->db.db_offset, >=, zgd->zgd_rl->r_off);
-       ASSERT3U(db->db.db_offset + db->db.db_size, <=,
-           zgd->zgd_rl->r_off + zgd->zgd_rl->r_len);
-
         SET_BOOKMARK(&zb, ds->ds_object,
             db->db.db_object, db->db_level, db->db_blkid);
  
diff --git a/module/zfs/zfs_rlock.c b/module/zfs/zfs_rlock.c

index 7ecc353d262cf3e658f054053df2a2e3c87017f5..d514a4fc775361bb8a6e3c31b0295f22f851064e 100644 (file)
--- a/module/zfs/zfs_rlock.c
+++ b/module/zfs/zfs_rlock.c
@@ -23,7 +23,7 @@
   * Use is subject to license terms.
   */
  /*
- * Copyright (c) 2012 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
   */
  
  /*
@@ -34,9 +34,9 @@
   * Interface
   * ---------
   * Defined in zfs_rlock.h but essentially:
- *     rl = zfs_range_lock(zp, off, len, lock_type);
- *     zfs_range_unlock(rl);
- *     zfs_range_reduce(rl, off, len);
+ *     lr = rangelock_enter(rl, off, len, lock_type);
+ *     rangelock_reduce(lr, off, len); // optional
+ *     rangelock_exit(lr);
   *
   * AVL tree
   * --------
@@ -46,9 +46,10 @@
   *
   * Common case
   * -----------
- * The (hopefully) usual case is of no overlaps or contention for
- * locks. On entry to zfs_lock_range() a rl_t is allocated; the tree
- * searched that finds no overlap, and *this* rl_t is placed in the tree.
+ * The (hopefully) usual case is of no overlaps or contention for locks. On
+ * entry to rangelock_enter(), a locked_range_t is allocated; the tree is
+ * searched and, finding no overlap, *this* locked_range_t is placed in the
+ * tree.
   *
   * Overlaps/Reference counting/Proxy locks
   * ---------------------------------------
@@ -87,68 +88,85 @@
   *
   * Grow block handling
   * -------------------
- * ZFS supports multiple block sizes currently up to 128K. The smallest
+ * ZFS supports multiple block sizes, up to 16MB. The smallest
   * block size is used for the file which is grown as needed. During this
   * growth all other writers and readers must be excluded.
   * So if the block size needs to be grown then the whole file is
   * exclusively locked, then later the caller will reduce the lock
- * range to just the range to be written using zfs_reduce_range.
+ * range to just the range to be written using rangelock_reduce().
   */
  
+#include <sys/zfs_context.h>
  #include <sys/zfs_rlock.h>
-#include <sys/sysmacros.h>
+
+/*
+ * AVL comparison function used to order range locks
+ * Locks are ordered on the start offset of the range.
+ */
+static int
+rangelock_compare(const void *arg1, const void *arg2)
+{
+       const locked_range_t *rl1 = (const locked_range_t *)arg1;
+       const locked_range_t *rl2 = (const locked_range_t *)arg2;
+
+       return (AVL_CMP(rl1->lr_offset, rl2->lr_offset));
+}
+
+/*
+ * The callback is invoked when acquiring a RL_WRITER or RL_APPEND lock.
+ * It must convert RL_APPEND to RL_WRITER (starting at the end of the file),
+ * and may increase the range that's locked for RL_WRITER.
+ */
+void
+rangelock_init(rangelock_t *rl, rangelock_cb_t *cb, void *arg)
+{
+       mutex_init(&rl->rl_lock, NULL, MUTEX_DEFAULT, NULL);
+       avl_create(&rl->rl_tree, rangelock_compare,
+           sizeof (locked_range_t), offsetof(locked_range_t, lr_node));
+       rl->rl_cb = cb;
+       rl->rl_arg = arg;
+}
+
+void
+rangelock_fini(rangelock_t *rl)
+{
+       mutex_destroy(&rl->rl_lock);
+       avl_destroy(&rl->rl_tree);
+}
  
  /*
   * Check if a write lock can be grabbed, or wait and recheck until available.
   */
  static void
-zfs_range_lock_writer(zfs_rlock_t *zrl, rl_t *new)
+rangelock_enter_writer(rangelock_t *rl, locked_range_t *new)
  {
-       avl_tree_t *tree = &zrl->zr_avl;
-       rl_t *rl;
+       avl_tree_t *tree = &rl->rl_tree;
+       locked_range_t *lr;
         avl_index_t where;
-       uint64_t end_size;
-       uint64_t off = new->r_off;
-       uint64_t len = new->r_len;
+       uint64_t orig_off = new->lr_offset;
+       uint64_t orig_len = new->lr_length;
+       rangelock_type_t orig_type = new->lr_type;
  
         for (;;) {
                 /*
-                * Range locking is also used by zvol. However, for zvol, we
-                * don't need to append or grow blocksize, so skip that
-                * processing.
-                *
-                * Yes, this is ugly, and would be solved by not handling
-                * grow or append in range lock code. If that was done then
-                * we could make the range locking code generically available
-                * to other non-zfs consumers.
+                * Call callback, which may modify new->lr_{offset,length,type}.
+                * Note, the callback is used by the ZPL to handle appending
+                * and changing blocksizes.  It isn't needed for zvols.
                  */
-               if (zrl->zr_size) { /* caller is ZPL */
-                       /*
-                        * If in append mode pick up the current end of file.
-                        * This is done under z_range_lock to avoid races.
-                        */
-                       if (new->r_type == RL_APPEND)
-                               new->r_off = *zrl->zr_size;
-
-                       /*
-                        * If we need to grow the block size then grab the whole
-                        * file range. This is also done under z_range_lock to
-                        * avoid races.
-                        */
-                       end_size = MAX(*zrl->zr_size, new->r_off + len);
-                       if (end_size > *zrl->zr_blksz &&
-                           (!ISP2(*zrl->zr_blksz) ||
-                           *zrl->zr_blksz < *zrl->zr_max_blksz)) {
-                               new->r_off = 0;
-                               new->r_len = UINT64_MAX;
-                       }
+               if (rl->rl_cb != NULL) {
+                       rl->rl_cb(new, rl->rl_arg);
                 }
  
+               /*
+                * If the type was APPEND, the callback must convert it to
+                * WRITER.
+                */
+               ASSERT3U(new->lr_type, ==, RL_WRITER);
+
                 /*
                  * First check for the usual case of no locks
                  */
                 if (avl_numnodes(tree) == 0) {
-                       new->r_type = RL_WRITER; /* convert to writer */
                         avl_add(tree, new);
                         return;
                 }
@@ -156,31 +174,33 @@ zfs_range_lock_writer(zfs_rlock_t *zrl, rl_t *new)
                 /*
                  * Look for any locks in the range.
                  */
-               rl = avl_find(tree, new, &where);
-               if (rl)
+               lr = avl_find(tree, new, &where);
+               if (lr != NULL)
                         goto wait; /* already locked at same offset */
  
-               rl = (rl_t *)avl_nearest(tree, where, AVL_AFTER);
-               if (rl && (rl->r_off < new->r_off + new->r_len))
+               lr = (locked_range_t *)avl_nearest(tree, where, AVL_AFTER);
+               if (lr != NULL &&
+                   lr->lr_offset < new->lr_offset + new->lr_length)
                         goto wait;
  
-               rl = (rl_t *)avl_nearest(tree, where, AVL_BEFORE);
-               if (rl && rl->r_off + rl->r_len > new->r_off)
+               lr = (locked_range_t *)avl_nearest(tree, where, AVL_BEFORE);
+               if (lr != NULL &&
+                   lr->lr_offset + lr->lr_length > new->lr_offset)
                         goto wait;
  
-               new->r_type = RL_WRITER; /* convert possible RL_APPEND */
                 avl_insert(tree, new, where);
                 return;
  wait:
-               if (!rl->r_write_wanted) {
-                       cv_init(&rl->r_wr_cv, NULL, CV_DEFAULT, NULL);
-                       rl->r_write_wanted = B_TRUE;
+               if (!lr->lr_write_wanted) {
+                       cv_init(&lr->lr_write_cv, NULL, CV_DEFAULT, NULL);
+                       lr->lr_write_wanted = B_TRUE;
                 }
-               cv_wait(&rl->r_wr_cv, &zrl->zr_mutex);
+               cv_wait(&lr->lr_write_cv, &rl->rl_lock);
  
                 /* reset to original */
-               new->r_off = off;
-               new->r_len = len;
+               new->lr_offset = orig_off;
+               new->lr_length = orig_len;
+               new->lr_type = orig_type;
         }
  }
  
@@ -188,29 +208,29 @@ wait:
   * If this is an original (non-proxy) lock then replace it by
   * a proxy and return the proxy.
   */
-static rl_t *
-zfs_range_proxify(avl_tree_t *tree, rl_t *rl)
+static locked_range_t *
+rangelock_proxify(avl_tree_t *tree, locked_range_t *lr)
  {
-       rl_t *proxy;
+       locked_range_t *proxy;
  
-       if (rl->r_proxy)
-               return (rl); /* already a proxy */
+       if (lr->lr_proxy)
+               return (lr); /* already a proxy */
  
-       ASSERT3U(rl->r_cnt, ==, 1);
-       ASSERT(rl->r_write_wanted == B_FALSE);
-       ASSERT(rl->r_read_wanted == B_FALSE);
-       avl_remove(tree, rl);
-       rl->r_cnt = 0;
+       ASSERT3U(lr->lr_count, ==, 1);
+       ASSERT(lr->lr_write_wanted == B_FALSE);
+       ASSERT(lr->lr_read_wanted == B_FALSE);
+       avl_remove(tree, lr);
+       lr->lr_count = 0;
  
         /* create a proxy range lock */
-       proxy = kmem_alloc(sizeof (rl_t), KM_SLEEP);
-       proxy->r_off = rl->r_off;
-       proxy->r_len = rl->r_len;
-       proxy->r_cnt = 1;
-       proxy->r_type = RL_READER;
-       proxy->r_proxy = B_TRUE;
-       proxy->r_write_wanted = B_FALSE;
-       proxy->r_read_wanted = B_FALSE;
+       proxy = kmem_alloc(sizeof (locked_range_t), KM_SLEEP);
+       proxy->lr_offset = lr->lr_offset;
+       proxy->lr_length = lr->lr_length;
+       proxy->lr_count = 1;
+       proxy->lr_type = RL_READER;
+       proxy->lr_proxy = B_TRUE;
+       proxy->lr_write_wanted = B_FALSE;
+       proxy->lr_read_wanted = B_FALSE;
         avl_add(tree, proxy);
  
         return (proxy);
@@ -220,29 +240,27 @@ zfs_range_proxify(avl_tree_t *tree, rl_t *rl)
   * Split the range lock at the supplied offset
   * returning the *front* proxy.
   */
-static rl_t *
-zfs_range_split(avl_tree_t *tree, rl_t *rl, uint64_t off)
+static locked_range_t *
+rangelock_split(avl_tree_t *tree, locked_range_t *lr, uint64_t off)
  {
-       rl_t *front, *rear;
-
-       ASSERT3U(rl->r_len, >, 1);
-       ASSERT3U(off, >, rl->r_off);
-       ASSERT3U(off, <, rl->r_off + rl->r_len);
-       ASSERT(rl->r_write_wanted == B_FALSE);
-       ASSERT(rl->r_read_wanted == B_FALSE);
+       ASSERT3U(lr->lr_length, >, 1);
+       ASSERT3U(off, >, lr->lr_offset);
+       ASSERT3U(off, <, lr->lr_offset + lr->lr_length);
+       ASSERT(lr->lr_write_wanted == B_FALSE);
+       ASSERT(lr->lr_read_wanted == B_FALSE);
  
         /* create the rear proxy range lock */
-       rear = kmem_alloc(sizeof (rl_t), KM_SLEEP);
-       rear->r_off = off;
-       rear->r_len = rl->r_off + rl->r_len - off;
-       rear->r_cnt = rl->r_cnt;
-       rear->r_type = RL_READER;
-       rear->r_proxy = B_TRUE;
-       rear->r_write_wanted = B_FALSE;
-       rear->r_read_wanted = B_FALSE;
-
-       front = zfs_range_proxify(tree, rl);
-       front->r_len = off - rl->r_off;
+       locked_range_t *rear = kmem_alloc(sizeof (locked_range_t), KM_SLEEP);
+       rear->lr_offset = off;
+       rear->lr_length = lr->lr_offset + lr->lr_length - off;
+       rear->lr_count = lr->lr_count;
+       rear->lr_type = RL_READER;
+       rear->lr_proxy = B_TRUE;
+       rear->lr_write_wanted = B_FALSE;
+       rear->lr_read_wanted = B_FALSE;
+
+       locked_range_t *front = rangelock_proxify(tree, lr);
+       front->lr_length = off - lr->lr_offset;
  
         avl_insert_here(tree, rear, front, AVL_AFTER);
         return (front);
@@ -252,28 +270,27 @@ zfs_range_split(avl_tree_t *tree, rl_t *rl, uint64_t off)
   * Create and add a new proxy range lock for the supplied range.
   */
  static void
-zfs_range_new_proxy(avl_tree_t *tree, uint64_t off, uint64_t len)
+rangelock_new_proxy(avl_tree_t *tree, uint64_t off, uint64_t len)
  {
-       rl_t *rl;
-
-       ASSERT(len);
-       rl = kmem_alloc(sizeof (rl_t), KM_SLEEP);
-       rl->r_off = off;
-       rl->r_len = len;
-       rl->r_cnt = 1;
-       rl->r_type = RL_READER;
-       rl->r_proxy = B_TRUE;
-       rl->r_write_wanted = B_FALSE;
-       rl->r_read_wanted = B_FALSE;
-       avl_add(tree, rl);
+       ASSERT(len != 0);
+       locked_range_t *lr = kmem_alloc(sizeof (locked_range_t), KM_SLEEP);
+       lr->lr_offset = off;
+       lr->lr_length = len;
+       lr->lr_count = 1;
+       lr->lr_type = RL_READER;
+       lr->lr_proxy = B_TRUE;
+       lr->lr_write_wanted = B_FALSE;
+       lr->lr_read_wanted = B_FALSE;
+       avl_add(tree, lr);
  }
  
  static void
-zfs_range_add_reader(avl_tree_t *tree, rl_t *new, rl_t *prev, avl_index_t where)
+rangelock_add_reader(avl_tree_t *tree, locked_range_t *new,
+    locked_range_t *prev, avl_index_t where)
  {
-       rl_t *next;
-       uint64_t off = new->r_off;
-       uint64_t len = new->r_len;
+       locked_range_t *next;
+       uint64_t off = new->lr_offset;
+       uint64_t len = new->lr_length;
  
         /*
          * prev arrives either:
@@ -282,37 +299,37 @@ zfs_range_add_reader(avl_tree_t *tree, rl_t *new, rl_t *prev, avl_index_t where)
          *   range may overlap with the new range
          * - null, if there were no ranges starting before the new one
          */
-       if (prev) {
-               if (prev->r_off + prev->r_len <= off) {
+       if (prev != NULL) {
+               if (prev->lr_offset + prev->lr_length <= off) {
                         prev = NULL;
-               } else if (prev->r_off != off) {
+               } else if (prev->lr_offset != off) {
                         /*
                          * convert to proxy if needed then
                          * split this entry and bump ref count
                          */
-                       prev = zfs_range_split(tree, prev, off);
+                       prev = rangelock_split(tree, prev, off);
                         prev = AVL_NEXT(tree, prev); /* move to rear range */
                 }
         }
-       ASSERT((prev == NULL) || (prev->r_off == off));
+       ASSERT((prev == NULL) || (prev->lr_offset == off));
  
-       if (prev)
+       if (prev != NULL)
                 next = prev;
         else
-               next = (rl_t *)avl_nearest(tree, where, AVL_AFTER);
+               next = avl_nearest(tree, where, AVL_AFTER);
  
-       if (next == NULL || off + len <= next->r_off) {
+       if (next == NULL || off + len <= next->lr_offset) {
                 /* no overlaps, use the original new rl_t in the tree */
                 avl_insert(tree, new, where);
                 return;
         }
  
-       if (off < next->r_off) {
+       if (off < next->lr_offset) {
                 /* Add a proxy for initial range before the overlap */
-               zfs_range_new_proxy(tree, off, next->r_off - off);
+               rangelock_new_proxy(tree, off, next->lr_offset - off);
         }
  
-       new->r_cnt = 0; /* will use proxies in tree */
+       new->lr_count = 0; /* will use proxies in tree */
         /*
          * We now search forward through the ranges, until we go past the end
          * of the new range. For each entry we make it a proxy if it
@@ -320,47 +337,51 @@ zfs_range_add_reader(avl_tree_t *tree, rl_t *new, rl_t *prev, avl_index_t where)
          * gaps between the ranges then we create a new proxy range.
          */
         for (prev = NULL; next; prev = next, next = AVL_NEXT(tree, next)) {
-               if (off + len <= next->r_off)
+               if (off + len <= next->lr_offset)
                         break;
-               if (prev && prev->r_off + prev->r_len < next->r_off) {
+               if (prev != NULL && prev->lr_offset + prev->lr_length <
+                   next->lr_offset) {
                         /* there's a gap */
-                       ASSERT3U(next->r_off, >, prev->r_off + prev->r_len);
-                       zfs_range_new_proxy(tree, prev->r_off + prev->r_len,
-                           next->r_off - (prev->r_off + prev->r_len));
+                       ASSERT3U(next->lr_offset, >,
+                           prev->lr_offset + prev->lr_length);
+                       rangelock_new_proxy(tree,
+                           prev->lr_offset + prev->lr_length,
+                           next->lr_offset -
+                           (prev->lr_offset + prev->lr_length));
                 }
-               if (off + len == next->r_off + next->r_len) {
+               if (off + len == next->lr_offset + next->lr_length) {
                         /* exact overlap with end */
-                       next = zfs_range_proxify(tree, next);
-                       next->r_cnt++;
+                       next = rangelock_proxify(tree, next);
+                       next->lr_count++;
                         return;
                 }
-               if (off + len < next->r_off + next->r_len) {
+               if (off + len < next->lr_offset + next->lr_length) {
                         /* new range ends in the middle of this block */
-                       next = zfs_range_split(tree, next, off + len);
-                       next->r_cnt++;
+                       next = rangelock_split(tree, next, off + len);
+                       next->lr_count++;
                         return;
                 }
-               ASSERT3U(off + len, >, next->r_off + next->r_len);
-               next = zfs_range_proxify(tree, next);
-               next->r_cnt++;
+               ASSERT3U(off + len, >, next->lr_offset + next->lr_length);
+               next = rangelock_proxify(tree, next);
+               next->lr_count++;
         }
  
         /* Add the remaining end range. */
-       zfs_range_new_proxy(tree, prev->r_off + prev->r_len,
-           (off + len) - (prev->r_off + prev->r_len));
+       rangelock_new_proxy(tree, prev->lr_offset + prev->lr_length,
+           (off + len) - (prev->lr_offset + prev->lr_length));
  }
  
  /*
   * Check if a reader lock can be grabbed, or wait and recheck until available.
   */
  static void
-zfs_range_lock_reader(zfs_rlock_t *zrl, rl_t *new)
+rangelock_enter_reader(rangelock_t *rl, locked_range_t *new)
  {
-       avl_tree_t *tree = &zrl->zr_avl;
-       rl_t *prev, *next;
+       avl_tree_t *tree = &rl->rl_tree;
+       locked_range_t *prev, *next;
         avl_index_t where;
-       uint64_t off = new->r_off;
-       uint64_t len = new->r_len;
+       uint64_t off = new->lr_offset;
+       uint64_t len = new->lr_length;
  
         /*
          * Look for any writer locks in the range.
@@ -368,21 +389,22 @@ zfs_range_lock_reader(zfs_rlock_t *zrl, rl_t *new)
  retry:
         prev = avl_find(tree, new, &where);
         if (prev == NULL)
-               prev = (rl_t *)avl_nearest(tree, where, AVL_BEFORE);
+               prev = (locked_range_t *)avl_nearest(tree, where, AVL_BEFORE);
  
         /*
          * Check the previous range for a writer lock overlap.
          */
-       if (prev && (off < prev->r_off + prev->r_len)) {
-               if ((prev->r_type == RL_WRITER) || (prev->r_write_wanted)) {
-                       if (!prev->r_read_wanted) {
-                               cv_init(&prev->r_rd_cv, NULL, CV_DEFAULT, NULL);
-                               prev->r_read_wanted = B_TRUE;
+       if (prev && (off < prev->lr_offset + prev->lr_length)) {
+               if ((prev->lr_type == RL_WRITER) || (prev->lr_write_wanted)) {
+                       if (!prev->lr_read_wanted) {
+                               cv_init(&prev->lr_read_cv,
+                                   NULL, CV_DEFAULT, NULL);
+                               prev->lr_read_wanted = B_TRUE;
                         }
-                       cv_wait(&prev->r_rd_cv, &zrl->zr_mutex);
+                       cv_wait(&prev->lr_read_cv, &rl->rl_lock);
                         goto retry;
                 }
-               if (off + len < prev->r_off + prev->r_len)
+               if (off + len < prev->lr_offset + prev->lr_length)
                         goto got_lock;
         }
  
@@ -390,95 +412,97 @@ retry:
          * Search through the following ranges to see if there's
          * write lock any overlap.
          */
-       if (prev)
+       if (prev != NULL)
                 next = AVL_NEXT(tree, prev);
         else
-               next = (rl_t *)avl_nearest(tree, where, AVL_AFTER);
-       for (; next; next = AVL_NEXT(tree, next)) {
-               if (off + len <= next->r_off)
+               next = (locked_range_t *)avl_nearest(tree, where, AVL_AFTER);
+       for (; next != NULL; next = AVL_NEXT(tree, next)) {
+               if (off + len <= next->lr_offset)
                         goto got_lock;
-               if ((next->r_type == RL_WRITER) || (next->r_write_wanted)) {
-                       if (!next->r_read_wanted) {
-                               cv_init(&next->r_rd_cv, NULL, CV_DEFAULT, NULL);
-                               next->r_read_wanted = B_TRUE;
+               if ((next->lr_type == RL_WRITER) || (next->lr_write_wanted)) {
+                       if (!next->lr_read_wanted) {
+                               cv_init(&next->lr_read_cv,
+                                   NULL, CV_DEFAULT, NULL);
+                               next->lr_read_wanted = B_TRUE;
                         }
-                       cv_wait(&next->r_rd_cv, &zrl->zr_mutex);
+                       cv_wait(&next->lr_read_cv, &rl->rl_lock);
                         goto retry;
                 }
-               if (off + len <= next->r_off + next->r_len)
+               if (off + len <= next->lr_offset + next->lr_length)
                         goto got_lock;
         }
  
  got_lock:
         /*
          * Add the read lock, which may involve splitting existing
-        * locks and bumping ref counts (r_cnt).
+        * locks and bumping ref counts (lr_count).
          */
-       zfs_range_add_reader(tree, new, prev, where);
+       rangelock_add_reader(tree, new, prev, where);
  }
  
  /*
- * Lock a range (offset, length) as either shared (RL_READER)
- * or exclusive (RL_WRITER). Returns the range lock structure
- * for later unlocking or reduce range (if entire file
- * previously locked as RL_WRITER).
+ * Lock a range (offset, length) as either shared (RL_READER) or exclusive
+ * (RL_WRITER or RL_APPEND).  If RL_APPEND is specified, rl_cb() will convert
+ * it to a RL_WRITER lock (with the offset at the end of the file).  Returns
+ * the range lock structure for later unlocking (or reduce range if the
+ * entire file is locked as RL_WRITER).
   */
-rl_t *
-zfs_range_lock(zfs_rlock_t *zrl, uint64_t off, uint64_t len, rl_type_t type)
+locked_range_t *
+rangelock_enter(rangelock_t *rl, uint64_t off, uint64_t len,
+    rangelock_type_t type)
  {
-       rl_t *new;
-
         ASSERT(type == RL_READER || type == RL_WRITER || type == RL_APPEND);
  
-       new = kmem_alloc(sizeof (rl_t), KM_SLEEP);
-       new->r_zrl = zrl;
-       new->r_off = off;
+       locked_range_t *new = kmem_alloc(sizeof (locked_range_t), KM_SLEEP);
+       new->lr_rangelock = rl;
+       new->lr_offset = off;
         if (len + off < off)    /* overflow */
                 len = UINT64_MAX - off;
-       new->r_len = len;
-       new->r_cnt = 1; /* assume it's going to be in the tree */
-       new->r_type = type;
-       new->r_proxy = B_FALSE;
-       new->r_write_wanted = B_FALSE;
-       new->r_read_wanted = B_FALSE;
-
-       mutex_enter(&zrl->zr_mutex);
+       new->lr_length = len;
+       new->lr_count = 1; /* assume it's going to be in the tree */
+       new->lr_type = type;
+       new->lr_proxy = B_FALSE;
+       new->lr_write_wanted = B_FALSE;
+       new->lr_read_wanted = B_FALSE;
+
+       mutex_enter(&rl->rl_lock);
         if (type == RL_READER) {
                 /*
                  * First check for the usual case of no locks
                  */
-               if (avl_numnodes(&zrl->zr_avl) == 0)
-                       avl_add(&zrl->zr_avl, new);
+               if (avl_numnodes(&rl->rl_tree) == 0)
+                       avl_add(&rl->rl_tree, new);
                 else
-                       zfs_range_lock_reader(zrl, new);
-       } else /* RL_WRITER or RL_APPEND */
-               zfs_range_lock_writer(zrl, new);
-       mutex_exit(&zrl->zr_mutex);
+                       rangelock_enter_reader(rl, new);
+       } else
+               rangelock_enter_writer(rl, new); /* RL_WRITER or RL_APPEND */
+       mutex_exit(&rl->rl_lock);
         return (new);
  }
  
+/*
+ * Safely free the locked_range_t.
+ */
  static void
-zfs_range_free(void *arg)
+rangelock_free(locked_range_t *lr)
  {
-       rl_t *rl = arg;
+       if (lr->lr_write_wanted)
+               cv_destroy(&lr->lr_write_cv);
  
-       if (rl->r_write_wanted)
-               cv_destroy(&rl->r_wr_cv);
+       if (lr->lr_read_wanted)
+               cv_destroy(&lr->lr_read_cv);
  
-       if (rl->r_read_wanted)
-               cv_destroy(&rl->r_rd_cv);
-
-       kmem_free(rl, sizeof (rl_t));
+       kmem_free(lr, sizeof (locked_range_t));
  }
  
  /*
   * Unlock a reader lock
   */
  static void
-zfs_range_unlock_reader(zfs_rlock_t *zrl, rl_t *remove, list_t *free_list)
+rangelock_exit_reader(rangelock_t *rl, locked_range_t *remove,
+    list_t *free_list)
  {
-       avl_tree_t *tree = &zrl->zr_avl;
-       rl_t *rl, *next = NULL;
+       avl_tree_t *tree = &rl->rl_tree;
         uint64_t len;
  
         /*
@@ -488,53 +512,48 @@ zfs_range_unlock_reader(zfs_rlock_t *zrl, rl_t *remove, list_t *free_list)
          * removed from the tree and replaced by proxies (one or
          * more ranges mapping to the entire range).
          */
-       if (remove->r_cnt == 1) {
+       if (remove->lr_count == 1) {
                 avl_remove(tree, remove);
-
-               if (remove->r_write_wanted)
-                       cv_broadcast(&remove->r_wr_cv);
-
-               if (remove->r_read_wanted)
-                       cv_broadcast(&remove->r_rd_cv);
-
+               if (remove->lr_write_wanted)
+                       cv_broadcast(&remove->lr_write_cv);
+               if (remove->lr_read_wanted)
+                       cv_broadcast(&remove->lr_read_cv);
                 list_insert_tail(free_list, remove);
         } else {
-               ASSERT0(remove->r_cnt);
-               ASSERT0(remove->r_write_wanted);
-               ASSERT0(remove->r_read_wanted);
+               ASSERT0(remove->lr_count);
+               ASSERT0(remove->lr_write_wanted);
+               ASSERT0(remove->lr_read_wanted);
                 /*
                  * Find start proxy representing this reader lock,
                  * then decrement ref count on all proxies
                  * that make up this range, freeing them as needed.
                  */
-               rl = avl_find(tree, remove, NULL);
-               ASSERT(rl);
-               ASSERT(rl->r_cnt);
-               ASSERT(rl->r_type == RL_READER);
-               for (len = remove->r_len; len != 0; rl = next) {
-                       len -= rl->r_len;
-                       if (len) {
-                               next = AVL_NEXT(tree, rl);
-                               ASSERT(next);
-                               ASSERT(rl->r_off + rl->r_len == next->r_off);
-                               ASSERT(next->r_cnt);
-                               ASSERT(next->r_type == RL_READER);
+               locked_range_t *lr = avl_find(tree, remove, NULL);
+               ASSERT3P(lr, !=, NULL);
+               ASSERT3U(lr->lr_count, !=, 0);
+               ASSERT3U(lr->lr_type, ==, RL_READER);
+               locked_range_t *next = NULL;
+               for (len = remove->lr_length; len != 0; lr = next) {
+                       len -= lr->lr_length;
+                       if (len != 0) {
+                               next = AVL_NEXT(tree, lr);
+                               ASSERT3P(next, !=, NULL);
+                               ASSERT3U(lr->lr_offset + lr->lr_length, ==,
+                                   next->lr_offset);
+                               ASSERT3U(next->lr_count, !=, 0);
+                               ASSERT3U(next->lr_type, ==, RL_READER);
                         }
-                       rl->r_cnt--;
-                       if (rl->r_cnt == 0) {
-                               avl_remove(tree, rl);
-
-                               if (rl->r_write_wanted)
-                                       cv_broadcast(&rl->r_wr_cv);
-
-                               if (rl->r_read_wanted)
-                                       cv_broadcast(&rl->r_rd_cv);
-
-                               list_insert_tail(free_list, rl);
+                       lr->lr_count--;
+                       if (lr->lr_count == 0) {
+                               avl_remove(tree, lr);
+                               if (lr->lr_write_wanted)
+                                       cv_broadcast(&lr->lr_write_cv);
+                               if (lr->lr_read_wanted)
+                                       cv_broadcast(&lr->lr_read_cv);
+                               list_insert_tail(free_list, lr);
                         }
                 }
-
-               kmem_free(remove, sizeof (rl_t));
+               kmem_free(remove, sizeof (locked_range_t));
         }
  }
  
@@ -542,91 +561,79 @@ zfs_range_unlock_reader(zfs_rlock_t *zrl, rl_t *remove, list_t *free_list)
   * Unlock range and destroy range lock structure.
   */
  void
-zfs_range_unlock(rl_t *rl)
+rangelock_exit(locked_range_t *lr)
  {
-       zfs_rlock_t *zrl = rl->r_zrl;
+       rangelock_t *rl = lr->lr_rangelock;
         list_t free_list;
-       rl_t *free_rl;
-
-       ASSERT(rl->r_type == RL_WRITER || rl->r_type == RL_READER);
-       ASSERT(rl->r_cnt == 1 || rl->r_cnt == 0);
-       ASSERT(!rl->r_proxy);
-       list_create(&free_list, sizeof (rl_t), offsetof(rl_t, rl_node));
+       locked_range_t *free_lr;
  
-       mutex_enter(&zrl->zr_mutex);
-       if (rl->r_type == RL_WRITER) {
-               /* writer locks can't be shared or split */
-               avl_remove(&zrl->zr_avl, rl);
-               if (rl->r_write_wanted)
-                       cv_broadcast(&rl->r_wr_cv);
+       ASSERT(lr->lr_type == RL_WRITER || lr->lr_type == RL_READER);
+       ASSERT(lr->lr_count == 1 || lr->lr_count == 0);
+       ASSERT(!lr->lr_proxy);
  
-               if (rl->r_read_wanted)
-                       cv_broadcast(&rl->r_rd_cv);
+       /*
+        * The free list is used to defer the cv_destroy() and
+        * subsequent kmem_free until after the mutex is dropped.
+        */
+       list_create(&free_list, sizeof (locked_range_t),
+           offsetof(locked_range_t, lr_node));
  
-               list_insert_tail(&free_list, rl);
+       mutex_enter(&rl->rl_lock);
+       if (lr->lr_type == RL_WRITER) {
+               /* writer locks can't be shared or split */
+               avl_remove(&rl->rl_tree, lr);
+               if (lr->lr_write_wanted)
+                       cv_broadcast(&lr->lr_write_cv);
+               if (lr->lr_read_wanted)
+                       cv_broadcast(&lr->lr_read_cv);
+               list_insert_tail(&free_list, lr);
         } else {
                 /*
-                * lock may be shared, let zfs_range_unlock_reader()
-                * release the zp->z_range_lock lock and free the rl_t
+                * lock may be shared, let rangelock_exit_reader()
+                * release the lock and free the locked_range_t.
                  */
-               zfs_range_unlock_reader(zrl, rl, &free_list);
+               rangelock_exit_reader(rl, lr, &free_list);
         }
-       mutex_exit(&zrl->zr_mutex);
+       mutex_exit(&rl->rl_lock);
  
-       while ((free_rl = list_head(&free_list)) != NULL) {
-               list_remove(&free_list, free_rl);
-               zfs_range_free(free_rl);
-       }
+       while ((free_lr = list_remove_head(&free_list)) != NULL)
+               rangelock_free(free_lr);
  
         list_destroy(&free_list);
  }
  
  /*
   * Reduce range locked as RL_WRITER from whole file to specified range.
- * Asserts the whole file is exclusivly locked and so there's only one
+ * Asserts the whole file is exclusively locked and so there's only one
   * entry in the tree.
   */
  void
-zfs_range_reduce(rl_t *rl, uint64_t off, uint64_t len)
+rangelock_reduce(locked_range_t *lr, uint64_t off, uint64_t len)
  {
-       zfs_rlock_t *zrl = rl->r_zrl;
+       rangelock_t *rl = lr->lr_rangelock;
  
         /* Ensure there are no other locks */
-       ASSERT(avl_numnodes(&zrl->zr_avl) == 1);
-       ASSERT(rl->r_off == 0);
-       ASSERT(rl->r_type == RL_WRITER);
-       ASSERT(!rl->r_proxy);
-       ASSERT3U(rl->r_len, ==, UINT64_MAX);
-       ASSERT3U(rl->r_cnt, ==, 1);
-
-       mutex_enter(&zrl->zr_mutex);
-       rl->r_off = off;
-       rl->r_len = len;
-
-       if (rl->r_write_wanted)
-               cv_broadcast(&rl->r_wr_cv);
-       if (rl->r_read_wanted)
-               cv_broadcast(&rl->r_rd_cv);
-
-       mutex_exit(&zrl->zr_mutex);
-}
-
-/*
- * AVL comparison function used to order range locks
- * Locks are ordered on the start offset of the range.
- */
-int
-zfs_range_compare(const void *arg1, const void *arg2)
-{
-       const rl_t *rl1 = (const rl_t *)arg1;
-       const rl_t *rl2 = (const rl_t *)arg2;
-
-       return (AVL_CMP(rl1->r_off, rl2->r_off));
+       ASSERT3U(avl_numnodes(&rl->rl_tree), ==, 1);
+       ASSERT3U(lr->lr_offset, ==, 0);
+       ASSERT3U(lr->lr_type, ==, RL_WRITER);
+       ASSERT(!lr->lr_proxy);
+       ASSERT3U(lr->lr_length, ==, UINT64_MAX);
+       ASSERT3U(lr->lr_count, ==, 1);
+
+       mutex_enter(&rl->rl_lock);
+       lr->lr_offset = off;
+       lr->lr_length = len;
+       mutex_exit(&rl->rl_lock);
+       if (lr->lr_write_wanted)
+               cv_broadcast(&lr->lr_write_cv);
+       if (lr->lr_read_wanted)
+               cv_broadcast(&lr->lr_read_cv);
  }
  
-#ifdef _KERNEL
-EXPORT_SYMBOL(zfs_range_lock);
-EXPORT_SYMBOL(zfs_range_unlock);
-EXPORT_SYMBOL(zfs_range_reduce);
-EXPORT_SYMBOL(zfs_range_compare);
+#if defined(_KERNEL)
+EXPORT_SYMBOL(rangelock_init);
+EXPORT_SYMBOL(rangelock_fini);
+EXPORT_SYMBOL(rangelock_enter);
+EXPORT_SYMBOL(rangelock_exit);
+EXPORT_SYMBOL(rangelock_reduce);
  #endif
diff --git a/module/zfs/zfs_vnops.c b/module/zfs/zfs_vnops.c

index 4e163e2e3fe8864197987cbcdd5ea6a50f1ad8e6..36f47e77a01cbb28e26a93ccfa8a444fa56a2226 100644 (file)
--- a/module/zfs/zfs_vnops.c
+++ b/module/zfs/zfs_vnops.c
@@ -477,7 +477,7 @@ zfs_read(struct inode *ip, uio_t *uio, int ioflag, cred_t *cr)
         /*
          * Lock the range against changes.
          */
-       rl_t *rl = zfs_range_lock(&zp->z_range_lock,
+       locked_range_t *lr = rangelock_enter(&zp->z_rangelock,
             uio->uio_loffset, uio->uio_resid, RL_READER);
  
         /*
@@ -550,7 +550,7 @@ zfs_read(struct inode *ip, uio_t *uio, int ioflag, cred_t *cr)
         dataset_kstats_update_read_kstats(&zfsvfs->z_kstat, nread);
         task_io_account_read(nread);
  out:
-       zfs_range_unlock(rl);
+       rangelock_exit(lr);
  
         ZFS_EXIT(zfsvfs);
         return (error);
@@ -652,19 +652,18 @@ zfs_write(struct inode *ip, uio_t *uio, int ioflag, cred_t *cr)
  #endif
                 uio_prefaultpages(MIN(n, max_blksz), uio);
  
-       rl_t     *rl;
-
         /*
          * If in append mode, set the io offset pointer to eof.
          */
+       locked_range_t *lr;
         if (ioflag & FAPPEND) {
                 /*
                  * Obtain an appending range lock to guarantee file append
                  * semantics.  We reset the write offset once we have the lock.
                  */
-               rl = zfs_range_lock(&zp->z_range_lock, 0, n, RL_APPEND);
-               woff = rl->r_off;
-               if (rl->r_len == UINT64_MAX) {
+               lr = rangelock_enter(&zp->z_rangelock, 0, n, RL_APPEND);
+               woff = lr->lr_offset;
+               if (lr->lr_length == UINT64_MAX) {
                         /*
                          * We overlocked the file because this write will cause
                          * the file block size to increase.
@@ -679,11 +678,11 @@ zfs_write(struct inode *ip, uio_t *uio, int ioflag, cred_t *cr)
                  * this write, then this range lock will lock the entire file
                  * so that we can re-write the block safely.
                  */
-               rl = zfs_range_lock(&zp->z_range_lock, woff, n, RL_WRITER);
+               lr = rangelock_enter(&zp->z_rangelock, woff, n, RL_WRITER);
         }
  
         if (woff >= limit) {
-               zfs_range_unlock(rl);
+               rangelock_exit(lr);
                 ZFS_EXIT(zfsvfs);
                 return (SET_ERROR(EFBIG));
         }
@@ -776,12 +775,12 @@ zfs_write(struct inode *ip, uio_t *uio, int ioflag, cred_t *cr)
                 }
  
                 /*
-                * If zfs_range_lock() over-locked we grow the blocksize
+                * If rangelock_enter() over-locked we grow the blocksize
                  * and then reduce the lock range.  This will only happen
-                * on the first iteration since zfs_range_reduce() will
-                * shrink down r_len to the appropriate size.
+                * on the first iteration since rangelock_reduce() will
+                * shrink down lr_length to the appropriate size.
                  */
-               if (rl->r_len == UINT64_MAX) {
+               if (lr->lr_length == UINT64_MAX) {
                         uint64_t new_blksz;
  
                         if (zp->z_blksz > max_blksz) {
@@ -797,7 +796,7 @@ zfs_write(struct inode *ip, uio_t *uio, int ioflag, cred_t *cr)
                                 new_blksz = MIN(end_size, max_blksz);
                         }
                         zfs_grow_blocksize(zp, new_blksz, tx);
-                       zfs_range_reduce(rl, woff, n);
+                       rangelock_reduce(lr, woff, n);
                 }
  
                 /*
@@ -915,7 +914,7 @@ zfs_write(struct inode *ip, uio_t *uio, int ioflag, cred_t *cr)
         }
  
         zfs_inode_update(zp);
-       zfs_range_unlock(rl);
+       rangelock_exit(lr);
  
         /*
          * If we're in replay mode, or we made no progress, return error.
@@ -967,7 +966,7 @@ zfs_get_done(zgd_t *zgd, int error)
         if (zgd->zgd_db)
                 dmu_buf_rele(zgd->zgd_db, zgd);
  
-       zfs_range_unlock(zgd->zgd_rl);
+       rangelock_exit(zgd->zgd_lr);
  
         /*
          * Release the vnode asynchronously as we currently have the
@@ -1031,8 +1030,8 @@ zfs_get_data(void *arg, lr_write_t *lr, char *buf, struct lwb *lwb, zio_t *zio)
          * we don't have to write the data twice.
          */
         if (buf != NULL) { /* immediate write */
-               zgd->zgd_rl = zfs_range_lock(&zp->z_range_lock, offset, size,
-                   RL_READER);
+               zgd->zgd_lr = rangelock_enter(&zp->z_rangelock,
+                   offset, size, RL_READER);
                 /* test for truncation needs to be done while range locked */
                 if (offset >= zp->z_size) {
                         error = SET_ERROR(ENOENT);
@@ -1053,12 +1052,12 @@ zfs_get_data(void *arg, lr_write_t *lr, char *buf, struct lwb *lwb, zio_t *zio)
                         size = zp->z_blksz;
                         blkoff = ISP2(size) ? P2PHASE(offset, size) : offset;
                         offset -= blkoff;
-                       zgd->zgd_rl = zfs_range_lock(&zp->z_range_lock, offset,
-                           size, RL_READER);
+                       zgd->zgd_lr = rangelock_enter(&zp->z_rangelock,
+                           offset, size, RL_READER);
                         if (zp->z_blksz == size)
                                 break;
                         offset += blkoff;
-                       zfs_range_unlock(zgd->zgd_rl);
+                       rangelock_exit(zgd->zgd_lr);
                 }
                 /* test for truncation needs to be done while range locked */
                 if (lr->lr_offset >= zp->z_size)
@@ -4432,7 +4431,6 @@ zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc)
         loff_t          offset;
         loff_t          pgoff;
         unsigned int    pglen;
-       rl_t            *rl;
         dmu_tx_t        *tx;
         caddr_t         va;
         int             err = 0;
@@ -4506,13 +4504,14 @@ zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc)
         redirty_page_for_writepage(wbc, pp);
         unlock_page(pp);
  
-       rl = zfs_range_lock(&zp->z_range_lock, pgoff, pglen, RL_WRITER);
+       locked_range_t *lr = rangelock_enter(&zp->z_rangelock,
+           pgoff, pglen, RL_WRITER);
         lock_page(pp);
  
         /* Page mapping changed or it was no longer dirty, we're done */
         if (unlikely((mapping != pp->mapping) || !PageDirty(pp))) {
                 unlock_page(pp);
-               zfs_range_unlock(rl);
+               rangelock_exit(lr);
                 ZFS_EXIT(zfsvfs);
                 return (0);
         }
@@ -4520,7 +4519,7 @@ zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc)
         /* Another process started write block if required */
         if (PageWriteback(pp)) {
                 unlock_page(pp);
-               zfs_range_unlock(rl);
+               rangelock_exit(lr);
  
                 if (wbc->sync_mode != WB_SYNC_NONE)
                         wait_on_page_writeback(pp);
@@ -4532,7 +4531,7 @@ zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc)
         /* Clear the dirty flag the required locks are held */
         if (!clear_page_dirty_for_io(pp)) {
                 unlock_page(pp);
-               zfs_range_unlock(rl);
+               rangelock_exit(lr);
                 ZFS_EXIT(zfsvfs);
                 return (0);
         }
@@ -4559,7 +4558,7 @@ zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc)
                 __set_page_dirty_nobuffers(pp);
                 ClearPageError(pp);
                 end_page_writeback(pp);
-               zfs_range_unlock(rl);
+               rangelock_exit(lr);
                 ZFS_EXIT(zfsvfs);
                 return (err);
         }
@@ -4586,7 +4585,7 @@ zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc)
             zfs_putpage_commit_cb, pp);
         dmu_tx_commit(tx);
  
-       zfs_range_unlock(rl);
+       rangelock_exit(lr);
  
         if (wbc->sync_mode != WB_SYNC_NONE) {
                 /*
diff --git a/module/zfs/zfs_znode.c b/module/zfs/zfs_znode.c

index 67732043579d84945218c13eb1d1fea4948981ad..8925b67004b6694c3e6fba29102c6dad198f6a2b 100644 (file)
--- a/module/zfs/zfs_znode.c
+++ b/module/zfs/zfs_znode.c
@@ -20,7 +20,7 @@
   */
  /*
   * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
   */
  
  /* Portions Copyright 2007 Jeremy Teo */
@@ -91,6 +91,37 @@ static kmem_cache_t *znode_cache = NULL;
  static kmem_cache_t *znode_hold_cache = NULL;
  unsigned int zfs_object_mutex_size = ZFS_OBJ_MTX_SZ;
  
+/*
+ * This callback is invoked when acquiring a RL_WRITER or RL_APPEND lock on
+ * z_rangelock. It will modify the offset and length of the lock to reflect
+ * znode-specific information, and convert RL_APPEND to RL_WRITER.  This is
+ * called with the rangelock_t's rl_lock held, which avoids races.
+ */
+static void
+zfs_rangelock_cb(locked_range_t *new, void *arg)
+{
+       znode_t *zp = arg;
+
+       /*
+        * If in append mode, convert to writer and lock starting at the
+        * current end of file.
+        */
+       if (new->lr_type == RL_APPEND) {
+               new->lr_offset = zp->z_size;
+               new->lr_type = RL_WRITER;
+       }
+
+       /*
+        * If we need to grow the block size then lock the whole file range.
+        */
+       uint64_t end_size = MAX(zp->z_size, new->lr_offset + new->lr_length);
+       if (end_size > zp->z_blksz && (!ISP2(zp->z_blksz) ||
+           zp->z_blksz < ZTOZSB(zp)->z_max_blksz)) {
+               new->lr_offset = 0;
+               new->lr_length = UINT64_MAX;
+       }
+}
+
  /*ARGSUSED*/
  static int
  zfs_znode_cache_constructor(void *buf, void *arg, int kmflags)
@@ -106,7 +137,7 @@ zfs_znode_cache_constructor(void *buf, void *arg, int kmflags)
         mutex_init(&zp->z_acl_lock, NULL, MUTEX_DEFAULT, NULL);
         rw_init(&zp->z_xattr_lock, NULL, RW_DEFAULT, NULL);
  
-       zfs_rlock_init(&zp->z_range_lock);
+       rangelock_init(&zp->z_rangelock, zfs_rangelock_cb, zp);
  
         zp->z_dirlocks = NULL;
         zp->z_acl_cached = NULL;
@@ -128,7 +159,7 @@ zfs_znode_cache_destructor(void *buf, void *arg)
         rw_destroy(&zp->z_name_lock);
         mutex_destroy(&zp->z_acl_lock);
         rw_destroy(&zp->z_xattr_lock);
-       zfs_rlock_destroy(&zp->z_range_lock);
+       rangelock_fini(&zp->z_rangelock);
  
         ASSERT(zp->z_dirlocks == NULL);
         ASSERT(zp->z_acl_cached == NULL);
@@ -577,9 +608,6 @@ zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, int blksz,
         zp->z_is_mapped = B_FALSE;
         zp->z_is_ctldir = B_FALSE;
         zp->z_is_stale = B_FALSE;
-       zp->z_range_lock.zr_size = &zp->z_size;
-       zp->z_range_lock.zr_blksz = &zp->z_blksz;
-       zp->z_range_lock.zr_max_blksz = &ZTOZSB(zp)->z_max_blksz;
  
         zfs_znode_sa_init(zfsvfs, zp, db, obj_type, hdl);
  
@@ -1475,20 +1503,20 @@ zfs_extend(znode_t *zp, uint64_t end)
  {
         zfsvfs_t *zfsvfs = ZTOZSB(zp);
         dmu_tx_t *tx;
-       rl_t *rl;
+       locked_range_t *lr;
         uint64_t newblksz;
         int error;
  
         /*
          * We will change zp_size, lock the whole file.
          */
-       rl = zfs_range_lock(&zp->z_range_lock, 0, UINT64_MAX, RL_WRITER);
+       lr = rangelock_enter(&zp->z_rangelock, 0, UINT64_MAX, RL_WRITER);
  
         /*
          * Nothing to do if file already at desired length.
          */
         if (end <= zp->z_size) {
-               zfs_range_unlock(rl);
+               rangelock_exit(lr);
                 return (0);
         }
         tx = dmu_tx_create(zfsvfs->z_os);
@@ -1518,7 +1546,7 @@ zfs_extend(znode_t *zp, uint64_t end)
         error = dmu_tx_assign(tx, TXG_WAIT);
         if (error) {
                 dmu_tx_abort(tx);
-               zfs_range_unlock(rl);
+               rangelock_exit(lr);
                 return (error);
         }
  
@@ -1530,7 +1558,7 @@ zfs_extend(znode_t *zp, uint64_t end)
         VERIFY(0 == sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(ZTOZSB(zp)),
             &zp->z_size, sizeof (zp->z_size), tx));
  
-       zfs_range_unlock(rl);
+       rangelock_exit(lr);
  
         dmu_tx_commit(tx);
  
@@ -1593,19 +1621,19 @@ static int
  zfs_free_range(znode_t *zp, uint64_t off, uint64_t len)
  {
         zfsvfs_t *zfsvfs = ZTOZSB(zp);
-       rl_t *rl;
+       locked_range_t *lr;
         int error;
  
         /*
          * Lock the range being freed.
          */
-       rl = zfs_range_lock(&zp->z_range_lock, off, len, RL_WRITER);
+       lr = rangelock_enter(&zp->z_rangelock, off, len, RL_WRITER);
  
         /*
          * Nothing to do if file already at desired length.
          */
         if (off >= zp->z_size) {
-               zfs_range_unlock(rl);
+               rangelock_exit(lr);
                 return (0);
         }
  
@@ -1655,7 +1683,7 @@ zfs_free_range(znode_t *zp, uint64_t off, uint64_t len)
                                     page_len);
                 }
         }
-       zfs_range_unlock(rl);
+       rangelock_exit(lr);
  
         return (error);
  }
@@ -1673,7 +1701,7 @@ zfs_trunc(znode_t *zp, uint64_t end)
  {
         zfsvfs_t *zfsvfs = ZTOZSB(zp);
         dmu_tx_t *tx;
-       rl_t *rl;
+       locked_range_t *lr;
         int error;
         sa_bulk_attr_t bulk[2];
         int count = 0;
@@ -1681,20 +1709,20 @@ zfs_trunc(znode_t *zp, uint64_t end)
         /*
          * We will change zp_size, lock the whole file.
          */
-       rl = zfs_range_lock(&zp->z_range_lock, 0, UINT64_MAX, RL_WRITER);
+       lr = rangelock_enter(&zp->z_rangelock, 0, UINT64_MAX, RL_WRITER);
  
         /*
          * Nothing to do if file already at desired length.
          */
         if (end >= zp->z_size) {
-               zfs_range_unlock(rl);
+               rangelock_exit(lr);
                 return (0);
         }
  
         error = dmu_free_long_range(zfsvfs->z_os, zp->z_id, end,
             DMU_OBJECT_END);
         if (error) {
-               zfs_range_unlock(rl);
+               rangelock_exit(lr);
                 return (error);
         }
         tx = dmu_tx_create(zfsvfs->z_os);
@@ -1704,7 +1732,7 @@ zfs_trunc(znode_t *zp, uint64_t end)
         error = dmu_tx_assign(tx, TXG_WAIT);
         if (error) {
                 dmu_tx_abort(tx);
-               zfs_range_unlock(rl);
+               rangelock_exit(lr);
                 return (error);
         }
  
@@ -1720,8 +1748,7 @@ zfs_trunc(znode_t *zp, uint64_t end)
         VERIFY(sa_bulk_update(zp->z_sa_hdl, bulk, count, tx) == 0);
  
         dmu_tx_commit(tx);
-
-       zfs_range_unlock(rl);
+       rangelock_exit(lr);
  
         return (0);
  }
diff --git a/module/zfs/zvol.c b/module/zfs/zvol.c

index f7706f14312f62ab3e15c07ef6611f9c6ac098fc..e6f8451b259bbb4764a03a63b62d2ba4cac18e9b 100644 (file)
--- a/module/zfs/zvol.c
+++ b/module/zfs/zvol.c
@@ -86,7 +86,6 @@
  #include <sys/dmu_tx.h>
  #include <sys/zio.h>
  #include <sys/zfs_rlock.h>
-#include <sys/zfs_znode.h>
  #include <sys/spa_impl.h>
  #include <sys/zvol.h>
  
@@ -123,7 +122,7 @@ struct zvol_state {
         uint32_t                zv_open_count;  /* open counts */
         uint32_t                zv_changed;     /* disk changed */
         zilog_t                 *zv_zilog;      /* ZIL handle */
-       zfs_rlock_t             zv_range_lock;  /* range lock */
+       rangelock_t             zv_rangelock;   /* for range locking */
         dnode_t                 *zv_dn;         /* dnode hold */
         dev_t                   zv_dev;         /* device id */
         struct gendisk          *zv_disk;       /* generic disk */
@@ -716,7 +715,7 @@ zvol_log_write(zvol_state_t *zv, dmu_tx_t *tx, uint64_t offset,
  typedef struct zv_request {
         zvol_state_t    *zv;
         struct bio      *bio;
-       rl_t            *rl;
+       locked_range_t  *lr;
  } zv_request_t;
  
  static void
@@ -778,7 +777,7 @@ zvol_write(void *arg)
                 if (error)
                         break;
         }
-       zfs_range_unlock(zvr->rl);
+       rangelock_exit(zvr->lr);
  
         int64_t nwritten = start_resid - uio.uio_resid;
         dataset_kstats_update_write_kstats(&zv->zv_kstat, nwritten);
@@ -872,7 +871,8 @@ zvol_discard(void *arg)
                     ZVOL_OBJ, start, size);
         }
  unlock:
-       zfs_range_unlock(zvr->rl);
+       rangelock_exit(zvr->lr);
+
         if (error == 0 && sync)
                 zil_commit(zv->zv_zilog, ZVOL_OBJ);
  
@@ -917,7 +917,7 @@ zvol_read(void *arg)
                         break;
                 }
         }
-       zfs_range_unlock(zvr->rl);
+       rangelock_exit(zvr->lr);
  
         int64_t nread = start_resid - uio.uio_resid;
         dataset_kstats_update_read_kstats(&zv->zv_kstat, nread);
@@ -985,7 +985,7 @@ zvol_request(struct request_queue *q, struct bio *bio)
                  * are asynchronous, we take it here synchronously to make
                  * sure overlapped I/Os are properly ordered.
                  */
-               zvr->rl = zfs_range_lock(&zv->zv_range_lock, offset, size,
+               zvr->lr = rangelock_enter(&zv->zv_rangelock, offset, size,
                     RL_WRITER);
                 /*
                  * Sync writes and discards execute zil_commit() which may need
@@ -1014,7 +1014,7 @@ zvol_request(struct request_queue *q, struct bio *bio)
  
                 rw_enter(&zv->zv_suspend_lock, RW_READER);
  
-               zvr->rl = zfs_range_lock(&zv->zv_range_lock, offset, size,
+               zvr->lr = rangelock_enter(&zv->zv_rangelock, offset, size,
                     RL_READER);
                 if (zvol_request_sync || taskq_dispatch(zvol_taskq,
                     zvol_read, zvr, TQ_SLEEP) == TASKQID_INVALID)
@@ -1036,7 +1036,7 @@ zvol_get_done(zgd_t *zgd, int error)
         if (zgd->zgd_db)
                 dmu_buf_rele(zgd->zgd_db, zgd);
  
-       zfs_range_unlock(zgd->zgd_rl);
+       rangelock_exit(zgd->zgd_lr);
  
         if (error == 0 && zgd->zgd_bp)
                 zil_lwb_add_block(zgd->zgd_lwb, zgd->zgd_bp);
@@ -1072,7 +1072,7 @@ zvol_get_data(void *arg, lr_write_t *lr, char *buf, struct lwb *lwb, zio_t *zio)
          * we don't have to write the data twice.
          */
         if (buf != NULL) { /* immediate write */
-               zgd->zgd_rl = zfs_range_lock(&zv->zv_range_lock, offset, size,
+               zgd->zgd_lr = rangelock_enter(&zv->zv_rangelock, offset, size,
                     RL_READER);
                 error = dmu_read_by_dnode(zv->zv_dn, offset, size, buf,
                     DMU_READ_NO_PREFETCH);
@@ -1085,7 +1085,7 @@ zvol_get_data(void *arg, lr_write_t *lr, char *buf, struct lwb *lwb, zio_t *zio)
                  */
                 size = zv->zv_volblocksize;
                 offset = P2ALIGN_TYPED(offset, size, uint64_t);
-               zgd->zgd_rl = zfs_range_lock(&zv->zv_range_lock, offset, size,
+               zgd->zgd_lr = rangelock_enter(&zv->zv_rangelock, offset, size,
                     RL_READER);
                 error = dmu_buf_hold_by_dnode(zv->zv_dn, offset, zgd, &db,
                     DMU_READ_NO_PREFETCH);
@@ -1687,7 +1687,7 @@ zvol_alloc(dev_t dev, const char *name)
         zv->zv_open_count = 0;
         strlcpy(zv->zv_name, name, MAXNAMELEN);
  
-       zfs_rlock_init(&zv->zv_range_lock);
+       rangelock_init(&zv->zv_rangelock, NULL, NULL);
         rw_init(&zv->zv_suspend_lock, NULL, RW_DEFAULT, NULL);
  
         zv->zv_disk->major = zvol_major;
@@ -1745,7 +1745,7 @@ zvol_free(void *arg)
         ASSERT(zv->zv_disk->private_data == NULL);
  
         rw_destroy(&zv->zv_suspend_lock);
-       zfs_rlock_destroy(&zv->zv_range_lock);
+       rangelock_fini(&zv->zv_rangelock);
  
         del_gendisk(zv->zv_disk);
         blk_cleanup_queue(zv->zv_queue);
diff --git a/scripts/commitcheck.sh b/scripts/commitcheck.sh

index 4d37b3a3c721eb80ac520e2fe4fa6d93000ed279..f377bb91284adaa953ea1bfa51df3d02eec3988e 100755 (executable)
--- a/scripts/commitcheck.sh
+++ b/scripts/commitcheck.sh
@@ -121,11 +121,6 @@ function openzfs_port_commit()
          error=1
      fi
  
-    # need a approved by line
-    if ! check_tagged_line "Approved by" ; then
-        error=1
-    fi
-
      # need ported by line
      if ! check_tagged_line "Ported-by" ; then
          error=1
author	Matt Ahrens <mahrens@delphix.com>
	Mon, 1 Oct 2018 22:13:12 +0000 (15:13 -0700)
committer	Brian Behlendorf <behlendorf1@llnl.gov>
	Thu, 11 Oct 2018 17:19:33 +0000 (10:19 -0700)
cmd/ztest/ztest.c		patch \| blob \| history
include/sys/dmu.h		patch \| blob \| history
include/sys/zfs_rlock.h		patch \| blob \| history
include/sys/zfs_znode.h		patch \| blob \| history
module/zfs/dmu.c		patch \| blob \| history
module/zfs/zfs_rlock.c		patch \| blob \| history
module/zfs/zfs_vnops.c		patch \| blob \| history
module/zfs/zfs_znode.c		patch \| blob \| history
module/zfs/zvol.c		patch \| blob \| history
scripts/commitcheck.sh		patch \| blob \| history