]> granicus.if.org Git - zfs/commitdiff
OpenZFS 9577 - remove zfs_dbuf_evict_key tsd
authorMatthew Ahrens <mahrens@delphix.com>
Thu, 31 May 2018 17:29:12 +0000 (10:29 -0700)
committerTony Hutter <hutter2@llnl.gov>
Fri, 22 Feb 2019 17:47:34 +0000 (09:47 -0800)
The zfs_dbuf_evict_key TSD (thread-specific data) is not necessary -
we can instead pass a flag down in a few places to prevent recursive
dbuf eviction. Making this change has 3 benefits:

1. The code semantics are easier to understand.
2. On Linux, performance is improved, because creating/removing
   TSD values (by setting to NULL vs non-NULL) is expensive, and
   we do it very often.
3. According to Nexenta, the current semantics can cause a
   deadlock when concurrently calling dmu_objset_evict_dbufs()
   (which is rare today, but they are working on a "parallel
   unmount" change that triggers this more easily):

Porting Notes:
* Minor conflict with OpenZFS 9337 which has not yet been ported.

Authored by: Matthew Ahrens <mahrens@delphix.com>
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Serapheim Dimitropoulos <serapheim.dimitro@delphix.com>
Reviewed by: Brad Lewis <brad.lewis@delphix.com>
Reviewed-by: George Melikov <mail@gmelikov.ru>
Ported-by: Brian Behlendorf <behlendorf1@llnl.gov>
OpenZFS-issue: https://illumos.org/issues/9577
OpenZFS-commit: https://github.com/openzfs/openzfs/pull/645
External-issue: DLPX-58547
Closes #7602

include/sys/dbuf.h
include/sys/dnode.h
module/zfs/dbuf.c
module/zfs/dnode.c
module/zfs/dnode_sync.c

index 127acad33c71ad632d56a646a7e04e8f9c71596f..e007e97644e37a36525ffba9c48f4621abf6a14f 100644 (file)
@@ -20,7 +20,7 @@
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
  * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
  * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
  */
@@ -294,7 +294,7 @@ boolean_t dbuf_try_add_ref(dmu_buf_t *db, objset_t *os, uint64_t obj,
 uint64_t dbuf_refcount(dmu_buf_impl_t *db);
 
 void dbuf_rele(dmu_buf_impl_t *db, void *tag);
-void dbuf_rele_and_unlock(dmu_buf_impl_t *db, void *tag);
+void dbuf_rele_and_unlock(dmu_buf_impl_t *db, void *tag, boolean_t evicting);
 
 dmu_buf_impl_t *dbuf_find(struct objset *os, uint64_t object, uint8_t level,
     uint64_t blkid);
index 1e77e0a32ec4d3886223accc644df8585a79105e..4e006df553598042f8689bf251fc2270fe5d5caf 100644 (file)
@@ -20,7 +20,7 @@
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012, 2017 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
  * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
  */
 
@@ -339,7 +339,7 @@ int dnode_hold_impl(struct objset *dd, uint64_t object, int flag, int dn_slots,
     void *ref, dnode_t **dnp);
 boolean_t dnode_add_ref(dnode_t *dn, void *ref);
 void dnode_rele(dnode_t *dn, void *ref);
-void dnode_rele_and_unlock(dnode_t *dn, void *tag);
+void dnode_rele_and_unlock(dnode_t *dn, void *tag, boolean_t evicting);
 void dnode_setdirty(dnode_t *dn, dmu_tx_t *tx);
 void dnode_sync(dnode_t *dn, dmu_tx_t *tx);
 void dnode_allocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, int ibs,
index 62b77bb0a1d1458dca8b5f1391ac89dec4531856..9694ce78b19365636f909564f4c1ee792af2f2d9 100644 (file)
@@ -72,8 +72,6 @@ static void __dbuf_hold_impl_init(struct dbuf_hold_impl_data *dh,
        void *tag, dmu_buf_impl_t **dbp, int depth);
 static int __dbuf_hold_impl(struct dbuf_hold_impl_data *dh);
 
-uint_t zfs_dbuf_evict_key;
-
 static boolean_t dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx);
 static void dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx);
 
@@ -505,14 +503,6 @@ dbuf_evict_one(void)
        dmu_buf_impl_t *db;
        ASSERT(!MUTEX_HELD(&dbuf_evict_lock));
 
-       /*
-        * Set the thread's tsd to indicate that it's processing evictions.
-        * Once a thread stops evicting from the dbuf cache it will
-        * reset its tsd to NULL.
-        */
-       ASSERT3P(tsd_get(zfs_dbuf_evict_key), ==, NULL);
-       (void) tsd_set(zfs_dbuf_evict_key, (void *)B_TRUE);
-
        db = multilist_sublist_tail(mls);
        while (db != NULL && mutex_tryenter(&db->db_mtx) == 0) {
                db = multilist_sublist_prev(mls, db);
@@ -530,7 +520,6 @@ dbuf_evict_one(void)
        } else {
                multilist_sublist_unlock(mls);
        }
-       (void) tsd_set(zfs_dbuf_evict_key, NULL);
 }
 
 /*
@@ -583,29 +572,6 @@ dbuf_evict_thread(void)
 static void
 dbuf_evict_notify(void)
 {
-
-       /*
-        * We use thread specific data to track when a thread has
-        * started processing evictions. This allows us to avoid deeply
-        * nested stacks that would have a call flow similar to this:
-        *
-        * dbuf_rele()-->dbuf_rele_and_unlock()-->dbuf_evict_notify()
-        *      ^                                               |
-        *      |                                               |
-        *      +-----dbuf_destroy()<--dbuf_evict_one()<--------+
-        *
-        * The dbuf_eviction_thread will always have its tsd set until
-        * that thread exits. All other threads will only set their tsd
-        * if they are participating in the eviction process. This only
-        * happens if the eviction thread is unable to process evictions
-        * fast enough. To keep the dbuf cache size in check, other threads
-        * can evict from the dbuf cache directly. Those threads will set
-        * their tsd values so that we ensure that they only evict one dbuf
-        * from the dbuf cache.
-        */
-       if (tsd_get(zfs_dbuf_evict_key) != NULL)
-               return;
-
        /*
         * We check if we should evict without holding the dbuf_evict_lock,
         * because it's OK to occasionally make the wrong decision here,
@@ -681,7 +647,6 @@ retry:
            dbuf_cache_multilist_index_func);
        zfs_refcount_create(&dbuf_cache_size);
 
-       tsd_create(&zfs_dbuf_evict_key, NULL);
        dbuf_evict_thread_exit = B_FALSE;
        mutex_init(&dbuf_evict_lock, NULL, MUTEX_DEFAULT, NULL);
        cv_init(&dbuf_evict_cv, NULL, CV_DEFAULT, NULL);
@@ -718,7 +683,6 @@ dbuf_fini(void)
                cv_wait(&dbuf_evict_cv, &dbuf_evict_lock);
        }
        mutex_exit(&dbuf_evict_lock);
-       tsd_destroy(&zfs_dbuf_evict_key);
 
        mutex_destroy(&dbuf_evict_lock);
        cv_destroy(&dbuf_evict_cv);
@@ -1004,7 +968,7 @@ dbuf_read_done(zio_t *zio, arc_buf_t *buf, void *vdb)
                db->db_state = DB_UNCACHED;
        }
        cv_broadcast(&db->db_changed);
-       dbuf_rele_and_unlock(db, NULL);
+       dbuf_rele_and_unlock(db, NULL, B_FALSE);
 }
 
 static int
@@ -2178,7 +2142,8 @@ dbuf_destroy(dmu_buf_impl_t *db)
                 * value in dnode_move(), since DB_DNODE_EXIT doesn't actually
                 * release any lock.
                 */
-               dnode_rele(dn, db);
+               mutex_enter(&dn->dn_mtx);
+               dnode_rele_and_unlock(dn, db, B_TRUE);
                db->db_dnode_handle = NULL;
 
                dbuf_hash_remove(db);
@@ -2204,8 +2169,10 @@ dbuf_destroy(dmu_buf_impl_t *db)
         * If this dbuf is referenced from an indirect dbuf,
         * decrement the ref count on the indirect dbuf.
         */
-       if (parent && parent != dndb)
-               dbuf_rele(parent, db);
+       if (parent && parent != dndb) {
+               mutex_enter(&parent->db_mtx);
+               dbuf_rele_and_unlock(parent, db, B_TRUE);
+       }
 }
 
 /*
@@ -2912,7 +2879,7 @@ void
 dbuf_rele(dmu_buf_impl_t *db, void *tag)
 {
        mutex_enter(&db->db_mtx);
-       dbuf_rele_and_unlock(db, tag);
+       dbuf_rele_and_unlock(db, tag, B_FALSE);
 }
 
 void
@@ -2923,10 +2890,19 @@ dmu_buf_rele(dmu_buf_t *db, void *tag)
 
 /*
  * dbuf_rele() for an already-locked dbuf.  This is necessary to allow
- * db_dirtycnt and db_holds to be updated atomically.
+ * db_dirtycnt and db_holds to be updated atomically.  The 'evicting'
+ * argument should be set if we are already in the dbuf-evicting code
+ * path, in which case we don't want to recursively evict.  This allows us to
+ * avoid deeply nested stacks that would have a call flow similar to this:
+ *
+ * dbuf_rele()-->dbuf_rele_and_unlock()-->dbuf_evict_notify()
+ *     ^                                               |
+ *     |                                               |
+ *     +-----dbuf_destroy()<--dbuf_evict_one()<--------+
+ *
  */
 void
-dbuf_rele_and_unlock(dmu_buf_impl_t *db, void *tag)
+dbuf_rele_and_unlock(dmu_buf_impl_t *db, void *tag, boolean_t evicting)
 {
        int64_t holds;
 
@@ -3021,7 +2997,8 @@ dbuf_rele_and_unlock(dmu_buf_impl_t *db, void *tag)
                                    db->db.db_size, db);
                                mutex_exit(&db->db_mtx);
 
-                               dbuf_evict_notify();
+                               if (!evicting)
+                                       dbuf_evict_notify();
                        }
 
                        if (do_arc_evict)
@@ -3314,7 +3291,7 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
                kmem_free(dr, sizeof (dbuf_dirty_record_t));
                ASSERT(db->db_dirtycnt > 0);
                db->db_dirtycnt -= 1;
-               dbuf_rele_and_unlock(db, (void *)(uintptr_t)txg);
+               dbuf_rele_and_unlock(db, (void *)(uintptr_t)txg, B_FALSE);
                return;
        }
 
@@ -3670,7 +3647,7 @@ dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb)
        ASSERT(db->db_dirtycnt > 0);
        db->db_dirtycnt -= 1;
        db->db_data_pending = NULL;
-       dbuf_rele_and_unlock(db, (void *)(uintptr_t)tx->tx_txg);
+       dbuf_rele_and_unlock(db, (void *)(uintptr_t)tx->tx_txg, B_FALSE);
 }
 
 static void
index 989a8ec7f69174bc4c9873fcacefcde080ce038b..b80aeb8b1f89ef1fefea11dfbe14c655a310bf8c 100644 (file)
@@ -1533,11 +1533,11 @@ void
 dnode_rele(dnode_t *dn, void *tag)
 {
        mutex_enter(&dn->dn_mtx);
-       dnode_rele_and_unlock(dn, tag);
+       dnode_rele_and_unlock(dn, tag, B_FALSE);
 }
 
 void
-dnode_rele_and_unlock(dnode_t *dn, void *tag)
+dnode_rele_and_unlock(dnode_t *dn, void *tag, boolean_t evicting)
 {
        uint64_t refs;
        /* Get while the hold prevents the dnode from moving. */
@@ -1568,7 +1568,8 @@ dnode_rele_and_unlock(dnode_t *dn, void *tag)
                 * that the handle has zero references, but that will be
                 * asserted anyway when the handle gets destroyed.
                 */
-               dbuf_rele(db, dnh);
+               mutex_enter(&db->db_mtx);
+               dbuf_rele_and_unlock(db, dnh, evicting);
        }
 }
 
index 2febb52063052b6d90734ea1a8b3c18f0c434fab..ee86c13b1b25e9b6577ad3e10d640f87db2f363d 100644 (file)
@@ -21,7 +21,7 @@
 
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012, 2017 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
  * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
  */
 
@@ -429,6 +429,19 @@ dnode_evict_dbufs(dnode_t *dn)
                        avl_insert_here(&dn->dn_dbufs, db_marker, db,
                            AVL_BEFORE);
 
+                       /*
+                        * We need to use the "marker" dbuf rather than
+                        * simply getting the next dbuf, because
+                        * dbuf_destroy() may actually remove multiple dbufs.
+                        * It can call itself recursively on the parent dbuf,
+                        * which may also be removed from dn_dbufs.  The code
+                        * flow would look like:
+                        *
+                        * dbuf_destroy():
+                        *   dnode_rele_and_unlock(parent_dbuf, evicting=TRUE):
+                        *      if (!cacheable || pending_evict)
+                        *        dbuf_destroy()
+                        */
                        dbuf_destroy(db);
 
                        db_next = AVL_NEXT(&dn->dn_dbufs, db_marker);
@@ -489,7 +502,7 @@ dnode_undirty_dbufs(list_t *list)
                        list_destroy(&dr->dt.di.dr_children);
                }
                kmem_free(dr, sizeof (dbuf_dirty_record_t));
-               dbuf_rele_and_unlock(db, (void *)(uintptr_t)txg);
+               dbuf_rele_and_unlock(db, (void *)(uintptr_t)txg, B_FALSE);
        }
 }