]> granicus.if.org Git - zfs/commitdiff
Update rwlocks to track owner to ensure correct semantics
authorBrian Behlendorf <behlendorf1@llnl.gov>
Fri, 25 Sep 2009 21:14:35 +0000 (14:14 -0700)
committerBrian Behlendorf <behlendorf1@llnl.gov>
Fri, 25 Sep 2009 21:14:35 +0000 (14:14 -0700)
The behavior of RW_*_HELD was updated because it was not quite right.
It is not sufficient to return non-zero when the lock is help, we must
only do this when the current task in the holder.

This means we need to track the lock owner which is not something
tracked in a Linux semaphore.  After some experimentation the
solution I settled on was to embed the Linux semaphore at the start
of a larger krwlock_t structure which includes the owner field.
This maintains good performance and allows us to cleanly intergrate
with the kernel lock analysis tools.  My reasons:

1) By placing the Linux semaphore at the start of krwlock_t we can
then simply cast krwlock_t to a rw_semaphore and pass that on to
the linux kernel.  This allows us to use '#defines so the preprocessor
can do direct replacement of the Solaris primative with the linux
equivilant.  This is important because it then maintains the location
information for each rw_* call point.

2) Additionally, by adding the owner to krwlock_t we can keep this
needed extra information adjacent to the lock itself.  This removes
the need for a fancy lookup to get the owner which is optimal for
performance.  We can also leverage the existing spin lock in the
semaphore to ensure owner is updated correctly.

3) All helper functions which do not need to strictly be implemented
as a define to preserve location information can be done as a static
inline function.

4) Adding the owner to krwlock_t allows us to remove all memory
allocations done during lock initialization.  This is good for all
the obvious reasons, we do give up the ability to specific the lock
name.  The Linux profiling tools will stringify the lock name used
in the code via the preprocessor and use that.

Update rwlocks validated on:
- SLES10   (ppc64)
- SLES11   (x86_64)
- CHAOS4.2 (x86_64)
- RHEL5.3  (x86_64)
- RHEL6    (x86_64)
- FC11     (x86_64)

include/sys/rwlock.h
module/spl/spl-generic.c
module/spl/spl-rwlock.c

index c6d66ceaa11078e9cc00d59db51e4c3125d61b6e..a68d072433b4700693f81a974cd59df33e7f2a55 100644 (file)
 #ifndef _SPL_RWLOCK_H
 #define _SPL_RWLOCK_H
 
-#include <linux/module.h>
-#include <linux/slab.h>
-#include <linux/rwsem.h>
 #include <sys/types.h>
+#include <linux/rwsem.h>
 
 typedef enum {
         RW_DRIVER  = 2,
@@ -38,81 +36,199 @@ typedef enum {
 } krw_type_t;
 
 typedef enum {
-       RW_NONE   = 0,
+        RW_NONE   = 0,
         RW_WRITER = 1,
         RW_READER = 2
 } krw_t;
 
-typedef struct rw_semaphore krwlock_t;
+typedef struct {
+        struct rw_semaphore rw_rwlock;
+        kthread_t *rw_owner;
+} krwlock_t;
 
-#define rw_init(rwlp, name, type, arg)  init_rwsem(rwlp)
-#define rw_destroy(rwlp)                ((void)0)
-#define rw_downgrade(rwlp)              downgrade_write(rwlp)
-#define RW_LOCK_HELD(rwlp)              rwsem_is_locked(rwlp)
 /*
- * the rw-semaphore definition
+ * For the generic and x86 implementations of rw-semaphores the following
+ * is true.  If your semaphore implementation internally represents the
+ * semaphore state differently special case handling will be required.
  * - if activity/count is 0 then there are no active readers or writers
  * - if activity/count is +ve then that is the number of active readers
  * - if activity/count is -1 then there is one active writer
  */
+#define SEM(rwp)                        ((struct rw_semaphore *)(rwp))
+
 #if defined(CONFIG_RWSEM_GENERIC_SPINLOCK)
-# define RW_COUNT(rwlp)            ((rwlp)->activity)
-# define RW_READ_HELD(rwlp)        ((RW_COUNT(rwlp) > 0) ? RW_COUNT(rwlp) : 0)
-# define RW_WRITE_HELD(rwlp)       ((RW_COUNT(rwlp) < 0))
-# define rw_exit_locked(rwlp)      __up_read_locked(rwlp)
-# define rw_tryenter_locked(rwlp)  __down_write_trylock_locked(rwlp)
-void __up_read_locked(struct rw_semaphore *);
-int __down_write_trylock_locked(struct rw_semaphore *);
+# define RW_COUNT(rwp)                  (SEM(rwp)->activity)
+# define rw_exit_locked(rwp)            __up_read_locked(rwp)
+# define rw_tryenter_locked(rwp)        __down_write_trylock_locked(rwp)
+extern void __up_read_locked(struct rw_semaphore *);
+extern int __down_write_trylock_locked(struct rw_semaphore *);
 #else
-# define RW_COUNT(rwlp)            ((rwlp)->count & RWSEM_ACTIVE_MASK)
-# define RW_READ_HELD(rwlp)        ((RW_COUNT(rwlp) > 0) ? RW_COUNT(rwlp) : 0)
-# define RW_WRITE_HELD(rwlp)       ((RW_COUNT(rwlp) < 0))
-# define rw_exit_locked(rwlp)      up_read(rwlp)
-# define rw_tryenter_locked(rwlp)  down_write_trylock(rwlp)
+# define RW_COUNT(rwp)                  (SEM(rwp)->count & RWSEM_ACTIVE_MASK)
+# define rw_exit_locked(rwp)            up_read(rwp)
+# define rw_tryenter_locked(rwp)        down_write_trylock(rwp)
 #endif
 
-#define rw_tryenter(rwlp, rw)                                                 \
-({                                                                            \
-        int _rc_ = 0;                                                         \
-        switch (rw) {                                                         \
-                case RW_READER: _rc_ = down_read_trylock(rwlp);  break;       \
-                case RW_WRITER: _rc_ = down_write_trylock(rwlp); break;       \
-                default:        SBUG();                                       \
-        }                                                                     \
-        _rc_;                                                                 \
+static inline kthread_t *
+spl_rw_get_owner(krwlock_t *rwp)
+{
+        return rwp->rw_owner;
+}
+
+static inline void
+spl_rw_set_owner(krwlock_t *rwp)
+{
+        unsigned long flags;
+
+        spin_lock_irqsave(&SEM(rwp)->wait_lock, flags);
+        rwp->rw_owner = current;
+        spin_unlock_irqrestore(&SEM(rwp)->wait_lock, flags);
+}
+
+static inline void
+spl_rw_clear_owner(krwlock_t *rwp)
+{
+        unsigned long flags;
+
+        spin_lock_irqsave(&SEM(rwp)->wait_lock, flags);
+        rwp->rw_owner = NULL;
+        spin_unlock_irqrestore(&SEM(rwp)->wait_lock, flags);
+}
+
+static inline kthread_t *
+rw_owner(krwlock_t *rwp)
+{
+        unsigned long flags;
+        kthread_t *owner;
+
+        spin_lock_irqsave(&SEM(rwp)->wait_lock, flags);
+        owner = spl_rw_get_owner(rwp);
+        spin_unlock_irqrestore(&SEM(rwp)->wait_lock, flags);
+
+        return owner;
+}
+
+static inline int
+RW_READ_HELD(krwlock_t *rwp)
+{
+        unsigned long flags;
+        int rc;
+
+        spin_lock_irqsave(&SEM(rwp)->wait_lock, flags);
+        rc = ((RW_COUNT(rwp) > 0) && (spl_rw_get_owner(rwp) == NULL));
+        spin_unlock_irqrestore(&SEM(rwp)->wait_lock, flags);
+
+        return rc;
+}
+
+static inline int
+RW_WRITE_HELD(krwlock_t *rwp)
+{
+        unsigned long flags;
+        int rc;
+
+        spin_lock_irqsave(&SEM(rwp)->wait_lock, flags);
+        rc = ((RW_COUNT(rwp) < 0) && (spl_rw_get_owner(rwp) == current));
+        spin_unlock_irqrestore(&SEM(rwp)->wait_lock, flags);
+
+        return rc;
+}
+
+static inline int
+RW_LOCK_HELD(krwlock_t *rwp)
+{
+        unsigned long flags;
+        int rc;
+
+        spin_lock_irqsave(&SEM(rwp)->wait_lock, flags);
+        rc = (RW_COUNT(rwp) != 0);
+        spin_unlock_irqrestore(&SEM(rwp)->wait_lock, flags);
+
+        return rc;
+}
+
+/*
+ * The following functions must be a #define and not static inline.
+ * This ensures that the native linux semaphore functions (down/up)
+ * will be correctly located in the users code which is important
+ * for the built in kernel lock analysis tools
+ */
+#define rw_init(rwp, name, type, arg)                                   \
+({                                                                      \
+        init_rwsem(SEM(rwp));                                           \
+        spl_rw_clear_owner(rwp);                                        \
 })
 
-#define rw_enter(rwlp, rw)                                                    \
-({                                                                            \
-        switch (rw) {                                                         \
-                case RW_READER: down_read(rwlp);  break;                      \
-                case RW_WRITER: down_write(rwlp); break;                      \
-                default:        SBUG();                                       \
-        }                                                                     \
+#define rw_destroy(rwp)                                                 \
+({                                                                      \
+        VERIFY(!RW_LOCK_HELD(rwp));                                     \
 })
 
-#define rw_exit(rwlp)                                                         \
-({                                                                            \
-        if (RW_READ_HELD(rwlp))                                               \
-              up_read(rwlp);                                                  \
-        else if (RW_WRITE_HELD(rwlp))                                         \
-              up_write(rwlp);                                                 \
-        else                                                                  \
-              SBUG();                                                         \
+#define rw_tryenter(rwp, rw)                                            \
+({                                                                      \
+        int _rc_ = 0;                                                   \
+                                                                        \
+        switch (rw) {                                                   \
+        case RW_READER:                                                 \
+                _rc_ = down_read_trylock(SEM(rwp));                     \
+                break;                                                  \
+        case RW_WRITER:                                                 \
+                if ((_rc_ = down_write_trylock(SEM(rwp))))              \
+                        spl_rw_set_owner(rwp);                          \
+                break;                                                  \
+        default:                                                        \
+                SBUG();                                                 \
+        }                                                               \
+        _rc_;                                                           \
 })
 
-#define rw_tryupgrade(rwlp)                                                   \
-({                                                                            \
-        unsigned long flags;                                                  \
-        int _rc_ = 0;                                                         \
-        spin_lock_irqsave(&(rwlp)->wait_lock, flags);                         \
-        if (list_empty(&(rwlp)->wait_list) && (RW_READ_HELD(rwlp) == 1)) {    \
-                rw_exit_locked(rwlp);                                         \
-                _rc_ = rw_tryenter_locked(rwlp);                              \
-                ASSERT(_rc_);                                                 \
-        }                                                                     \
-        spin_unlock_irqrestore(&(rwlp)->wait_lock, flags);                    \
-        _rc_;                                                                 \
+#define rw_enter(rwp, rw)                                               \
+({                                                                      \
+        switch (rw) {                                                   \
+        case RW_READER:                                                 \
+                down_read(SEM(rwp));                                    \
+                break;                                                  \
+        case RW_WRITER:                                                 \
+                down_write(SEM(rwp));                                   \
+                spl_rw_set_owner(rwp);                                  \
+                break;                                                  \
+        default:                                                        \
+                SBUG();                                                 \
+        }                                                               \
 })
 
+#define rw_exit(rwp)                                                    \
+({                                                                      \
+        if (RW_WRITE_HELD(rwp)) {                                       \
+                spl_rw_clear_owner(rwp);                                \
+                up_write(SEM(rwp));                                     \
+        } else {                                                        \
+                ASSERT(RW_READ_HELD(rwp));                              \
+                up_read(SEM(rwp));                                      \
+        }                                                               \
+})
+
+#define rw_downgrade(rwp)                                               \
+({                                                                      \
+        spl_rw_clear_owner(rwp);                                        \
+        downgrade_write(SEM(rwp));                                      \
+})
+
+#define rw_tryupgrade(rwp)                                              \
+({                                                                      \
+        unsigned long _flags_;                                          \
+        int _rc_ = 0;                                                   \
+                                                                        \
+        spin_lock_irqsave(&SEM(rwp)->wait_lock, _flags_);               \
+        if (list_empty(&SEM(rwp)->wait_list) && (RW_COUNT(rwp) == 1)) { \
+                rw_exit_locked(SEM(rwp));                               \
+                VERIFY(_rc_ = rw_tryenter_locked(SEM(rwp)));            \
+                (rwp)->rw_owner = current;                              \
+        }                                                               \
+        spin_unlock_irqrestore(&SEM(rwp)->wait_lock, _flags_);          \
+        _rc_;                                                           \
+})
+
+int spl_rw_init(void);
+void spl_rw_fini(void);
+
 #endif /* _SPL_RWLOCK_H */
index 8e0ef926310b5abdbec45fc1411de31cf02fbad6..8bf97e0cf14de32079fed2a7997089ea93f326c6 100644 (file)
@@ -30,6 +30,7 @@
 #include <sys/vnode.h>
 #include <sys/kmem.h>
 #include <sys/mutex.h>
+#include <sys/rwlock.h>
 #include <sys/taskq.h>
 #include <sys/debug.h>
 #include <sys/proc.h>
@@ -365,49 +366,54 @@ static int __init spl_init(void)
                return rc;
 
        if ((rc = spl_kmem_init()))
-               GOTO(out , rc);
+               GOTO(out1, rc);
 
        if ((rc = spl_mutex_init()))
-               GOTO(out2 , rc);
+               GOTO(out2, rc);
 
-       if ((rc = spl_taskq_init()))
+       if ((rc = spl_rw_init()))
                GOTO(out3, rc);
 
-       if ((rc = vn_init()))
+       if ((rc = spl_taskq_init()))
                GOTO(out4, rc);
 
-       if ((rc = proc_init()))
+       if ((rc = vn_init()))
                GOTO(out5, rc);
 
-       if ((rc = kstat_init()))
+       if ((rc = proc_init()))
                GOTO(out6, rc);
 
+       if ((rc = kstat_init()))
+               GOTO(out7, rc);
+
        if ((rc = set_hostid()))
-               GOTO(out7, rc = -EADDRNOTAVAIL);
+               GOTO(out8, rc = -EADDRNOTAVAIL);
 
 #ifndef HAVE_KALLSYMS_LOOKUP_NAME
        if ((rc = set_kallsyms_lookup_name()))
-               GOTO(out7, rc = -EADDRNOTAVAIL);
+               GOTO(out8, rc = -EADDRNOTAVAIL);
 #endif /* HAVE_KALLSYMS_LOOKUP_NAME */
 
        if ((rc = spl_kmem_init_kallsyms_lookup()))
-               GOTO(out7, rc);
+               GOTO(out8, rc);
 
        printk("SPL: Loaded Solaris Porting Layer v%s\n", SPL_META_VERSION);
        RETURN(rc);
-out7:
+out8:
        kstat_fini();
-out6:
+out7:
        proc_fini();
-out5:
+out6:
        vn_fini();
-out4:
+out5:
        spl_taskq_fini();
+out4:
+       spl_rw_fini();
 out3:
        spl_mutex_fini();
 out2:
        spl_kmem_fini();
-out:
+out1:
        debug_fini();
 
        printk("SPL: Failed to Load Solaris Porting Layer v%s, "
@@ -424,6 +430,7 @@ static void spl_fini(void)
        proc_fini();
        vn_fini();
        spl_taskq_fini();
+       spl_rw_fini();
        spl_mutex_fini();
        spl_kmem_fini();
        debug_fini();
index 8ff66bf9f7823d625f50abf4acee89816321b54c..5120b2c8100648b2c91ca989f5063921ff1f25d7 100644 (file)
@@ -91,3 +91,6 @@ __down_write_trylock_locked(struct rw_semaphore *sem)
 EXPORT_SYMBOL(__down_write_trylock_locked);
 
 #endif
+
+int spl_rw_init(void) { return 0; }
+void spl_rw_fini(void) { }