granicus.if.org Git - zfs/commitdiff
Illumos 5008 - lock contention (rrw_exit) while running a read only load
authorAlexander Motin <mav@freebsd.org>
Fri, 18 Jul 2014 16:53:38 +0000 (08:53 -0800)
committerBrian Behlendorf <behlendorf1@llnl.gov>
Mon, 6 Jul 2015 16:34:13 +0000 (09:34 -0700)
5008 lock contention (rrw_exit) while running a read only load
Reviewed by: Matthew Ahrens <matthew.ahrens@delphix.com>
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Alex Reece <alex.reece@delphix.com>
Reviewed by: Christopher Siden <christopher.siden@delphix.com>
Reviewed by: Richard Yao <ryao@gentoo.org>
Reviewed by: Saso Kiselkov <skiselkov.ml@gmail.com>
Approved by: Garrett D'Amore <garrett@damore.org>

Porting notes:

This patch ported perfectly cleanly to ZoL.  During testing 100% cached
small-block reads, extreme contention was noticed on rrl->rr_lock from
rrw_exit() due to the frequent entering and leaving ZPL.  Illumos picked
up this patch from FreeBSD and it also helps under Linux.

On a 1-minute 4K cached read test with 10 fio processes pinned to a single
socket on a 4-socket (10 thread per socket) NUMA system, contentions on
rrl->rr_lock were reduced from 508799 to 43085.

Ported-by: Tim Chase <tim@chase2k.com>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #3555

include/sys/rrwlock.h
include/sys/zfs_vfsops.h
include/sys/zfs_znode.h
module/zfs/rrwlock.c
module/zfs/zfs_ioctl.c
module/zfs/zfs_vfsops.c

index d2bdff495cbf6692b4070eff46816318cb4ebf59..7a328fd680305bd0689977a066aac700da91b024 100644 (file)
@@ -83,6 +83,31 @@ void rrw_tsd_destroy(void *arg);
 #define        RRW_LOCK_HELD(x) \
        (rrw_held(x, RW_WRITER) || rrw_held(x, RW_READER))
 
+/*
+ * A reader-mostly lock implementation, tuning above reader-writer locks
+ * for highly parallel read acquisitions, pessimizing write acquisitions.
+ *
+ * This should be a prime number.  See comment in rrwlock.c near
+ * RRM_TD_LOCK() for details.
+ */
+#define        RRM_NUM_LOCKS           17
+typedef struct rrmlock {
+       rrwlock_t       locks[RRM_NUM_LOCKS];
+} rrmlock_t;
+
+void rrm_init(rrmlock_t *rrl, boolean_t track_all);
+void rrm_destroy(rrmlock_t *rrl);
+void rrm_enter(rrmlock_t *rrl, krw_t rw, void *tag);
+void rrm_enter_read(rrmlock_t *rrl, void *tag);
+void rrm_enter_write(rrmlock_t *rrl);
+void rrm_exit(rrmlock_t *rrl, void *tag);
+boolean_t rrm_held(rrmlock_t *rrl, krw_t rw);
+
+#define        RRM_READ_HELD(x)        rrm_held(x, RW_READER)
+#define        RRM_WRITE_HELD(x)       rrm_held(x, RW_WRITER)
+#define        RRM_LOCK_HELD(x) \
+       (rrm_held(x, RW_WRITER) || rrm_held(x, RW_READER))
+
 #ifdef __cplusplus
 }
 #endif
index c4db2a911d3e7fb0ea3b342be1b1b8ab78e16b01..4073d82f8fe0b38593f99abd21821ce2f16d0c0b 100644 (file)
@@ -67,7 +67,7 @@ typedef struct zfs_sb {
        boolean_t       z_atime;        /* enable atimes mount option */
        boolean_t       z_relatime;     /* enable relatime mount option */
        boolean_t       z_unmounted;    /* unmounted */
-       rrwlock_t       z_teardown_lock;
+       rrmlock_t       z_teardown_lock;
        krwlock_t       z_teardown_inactive_lock;
        list_t          z_all_znodes;   /* all znodes in the fs */
        uint64_t        z_nr_znodes;    /* number of znodes in the fs */
index 79ca4f7e9dae67844d453084dfc76be46e3510a7..7e73cf99182e50ea936c9f340324e270ea350233 100644 (file)
@@ -250,7 +250,7 @@ typedef struct znode {
 /* Called on entry to each ZFS vnode and vfs operation  */
 #define        ZFS_ENTER(zsb) \
        { \
-               rrw_enter_read(&(zsb)->z_teardown_lock, FTAG); \
+               rrm_enter_read(&(zsb)->z_teardown_lock, FTAG); \
                if ((zsb)->z_unmounted) { \
                        ZFS_EXIT(zsb); \
                        return (EIO); \
@@ -260,7 +260,7 @@ typedef struct znode {
 /* Must be called before exiting the vop */
 #define        ZFS_EXIT(zsb) \
        { \
-               rrw_exit(&(zsb)->z_teardown_lock, FTAG); \
+               rrm_exit(&(zsb)->z_teardown_lock, FTAG); \
        }
 
 /* Verifies the znode is valid */
index 29a22534e6005bee658d720ebf2a1a5c14ae5493..51394c01c4310cfccb68ba9f931299b83f37ef6e 100644 (file)
@@ -305,3 +305,91 @@ rrw_tsd_destroy(void *arg)
                    (void *)curthread, (void *)rn->rn_rrl);
        }
 }
+
+/*
+ * A reader-mostly lock implementation, tuning above reader-writer locks
+ * for highly parallel read acquisitions, while pessimizing writes.
+ *
+ * The idea is to split single busy lock into array of locks, so that
+ * each reader can lock only one of them for read, depending on result
+ * of simple hash function.  That proportionally reduces lock congestion.
+ * Writer at the same time has to sequentially acquire write on all the locks.
+ * That makes write acquisition proportionally slower, but in places where
+ * it is used (filesystem unmount) performance is not critical.
+ *
+ * All the functions below are direct wrappers around functions above.
+ */
+void
+rrm_init(rrmlock_t *rrl, boolean_t track_all)
+{
+       int i;
+
+       for (i = 0; i < RRM_NUM_LOCKS; i++)
+               rrw_init(&rrl->locks[i], track_all);
+}
+
+void
+rrm_destroy(rrmlock_t *rrl)
+{
+       int i;
+
+       for (i = 0; i < RRM_NUM_LOCKS; i++)
+               rrw_destroy(&rrl->locks[i]);
+}
+
+void
+rrm_enter(rrmlock_t *rrl, krw_t rw, void *tag)
+{
+       if (rw == RW_READER)
+               rrm_enter_read(rrl, tag);
+       else
+               rrm_enter_write(rrl);
+}
+
+/*
+ * This maps the current thread to a specific lock.  Note that the lock
+ * must be released by the same thread that acquired it.  We do this
+ * mapping by taking the thread pointer mod a prime number.  We examine
+ * only the low 32 bits of the thread pointer, because 32-bit division
+ * is faster than 64-bit division, and the high 32 bits have little
+ * entropy anyway.
+ */
+#define        RRM_TD_LOCK()   (((uint32_t)(uintptr_t)(curthread)) % RRM_NUM_LOCKS)
+
+void
+rrm_enter_read(rrmlock_t *rrl, void *tag)
+{
+       rrw_enter_read(&rrl->locks[RRM_TD_LOCK()], tag);
+}
+
+void
+rrm_enter_write(rrmlock_t *rrl)
+{
+       int i;
+
+       for (i = 0; i < RRM_NUM_LOCKS; i++)
+               rrw_enter_write(&rrl->locks[i]);
+}
+
+void
+rrm_exit(rrmlock_t *rrl, void *tag)
+{
+       int i;
+
+       if (rrl->locks[0].rr_writer == curthread) {
+               for (i = 0; i < RRM_NUM_LOCKS; i++)
+                       rrw_exit(&rrl->locks[i], tag);
+       } else {
+               rrw_exit(&rrl->locks[RRM_TD_LOCK()], tag);
+       }
+}
+
+boolean_t
+rrm_held(rrmlock_t *rrl, krw_t rw)
+{
+       if (rw == RW_WRITER) {
+               return (rrw_held(&rrl->locks[0], rw));
+       } else {
+               return (rrw_held(&rrl->locks[RRM_TD_LOCK()], rw));
+       }
+}
index c44927036aa3fa2ee26a65acb1aebb112dae27b5..d997616ae4c2040ea72ccb558033ec13a67c7202 100644 (file)
@@ -1451,7 +1451,7 @@ zfs_sb_hold(const char *name, void *tag, zfs_sb_t **zsbp, boolean_t writer)
        if (get_zfs_sb(name, zsbp) != 0)
                error = zfs_sb_create(name, zsbp);
        if (error == 0) {
-               rrw_enter(&(*zsbp)->z_teardown_lock, (writer) ? RW_WRITER :
+               rrm_enter(&(*zsbp)->z_teardown_lock, (writer) ? RW_WRITER :
                    RW_READER, tag);
                if ((*zsbp)->z_unmounted) {
                        /*
@@ -1459,7 +1459,7 @@ zfs_sb_hold(const char *name, void *tag, zfs_sb_t **zsbp, boolean_t writer)
                         * thread should be just about to disassociate the
                         * objset from the zsb.
                         */
-                       rrw_exit(&(*zsbp)->z_teardown_lock, tag);
+                       rrm_exit(&(*zsbp)->z_teardown_lock, tag);
                        return (SET_ERROR(EBUSY));
                }
        }
@@ -1469,7 +1469,7 @@ zfs_sb_hold(const char *name, void *tag, zfs_sb_t **zsbp, boolean_t writer)
 static void
 zfs_sb_rele(zfs_sb_t *zsb, void *tag)
 {
-       rrw_exit(&zsb->z_teardown_lock, tag);
+       rrm_exit(&zsb->z_teardown_lock, tag);
 
        if (zsb->z_sb) {
                deactivate_super(zsb->z_sb);
index ae1bc324b9f3baa22669bc4f46f7e2343d68ff9a..a7005a2a12d5b4128489800d1e84608b7575dff2 100644 (file)
@@ -771,7 +771,7 @@ zfs_sb_create(const char *osname, zfs_sb_t **zsbp)
        mutex_init(&zsb->z_lock, NULL, MUTEX_DEFAULT, NULL);
        list_create(&zsb->z_all_znodes, sizeof (znode_t),
            offsetof(znode_t, z_link_node));
-       rrw_init(&zsb->z_teardown_lock, B_FALSE);
+       rrm_init(&zsb->z_teardown_lock, B_FALSE);
        rw_init(&zsb->z_teardown_inactive_lock, NULL, RW_DEFAULT, NULL);
        rw_init(&zsb->z_fuid_lock, NULL, RW_DEFAULT, NULL);
 
@@ -890,7 +890,7 @@ zfs_sb_free(zfs_sb_t *zsb)
        mutex_destroy(&zsb->z_znodes_lock);
        mutex_destroy(&zsb->z_lock);
        list_destroy(&zsb->z_all_znodes);
-       rrw_destroy(&zsb->z_teardown_lock);
+       rrm_destroy(&zsb->z_teardown_lock);
        rw_destroy(&zsb->z_teardown_inactive_lock);
        rw_destroy(&zsb->z_fuid_lock);
        for (i = 0; i != ZFS_OBJ_MTX_SZ; i++)
@@ -1221,7 +1221,7 @@ zfs_sb_teardown(zfs_sb_t *zsb, boolean_t unmounting)
                }
        }
 
-       rrw_enter(&zsb->z_teardown_lock, RW_WRITER, FTAG);
+       rrm_enter(&zsb->z_teardown_lock, RW_WRITER, FTAG);
 
        if (!unmounting) {
                /*
@@ -1252,7 +1252,7 @@ zfs_sb_teardown(zfs_sb_t *zsb, boolean_t unmounting)
         */
        if (!unmounting && (zsb->z_unmounted || zsb->z_os == NULL)) {
                rw_exit(&zsb->z_teardown_inactive_lock);
-               rrw_exit(&zsb->z_teardown_lock, FTAG);
+               rrm_exit(&zsb->z_teardown_lock, FTAG);
                return (SET_ERROR(EIO));
        }
 
@@ -1280,7 +1280,7 @@ zfs_sb_teardown(zfs_sb_t *zsb, boolean_t unmounting)
         */
        if (unmounting) {
                zsb->z_unmounted = B_TRUE;
-               rrw_exit(&zsb->z_teardown_lock, FTAG);
+               rrm_exit(&zsb->z_teardown_lock, FTAG);
                rw_exit(&zsb->z_teardown_inactive_lock);
        }
 
@@ -1599,7 +1599,7 @@ zfs_resume_fs(zfs_sb_t *zsb, const char *osname)
        znode_t *zp;
        uint64_t sa_obj = 0;
 
-       ASSERT(RRW_WRITE_HELD(&zsb->z_teardown_lock));
+       ASSERT(RRM_WRITE_HELD(&zsb->z_teardown_lock));
        ASSERT(RW_WRITE_HELD(&zsb->z_teardown_inactive_lock));
 
        /*
@@ -1663,7 +1663,7 @@ zfs_resume_fs(zfs_sb_t *zsb, const char *osname)
 bail:
        /* release the VFS ops */
        rw_exit(&zsb->z_teardown_inactive_lock);
-       rrw_exit(&zsb->z_teardown_lock, FTAG);
+       rrm_exit(&zsb->z_teardown_lock, FTAG);
 
        if (err) {
                /*