]> granicus.if.org Git - zfs/commitdiff
Fix race in dnode_check_slots_free()
authorTom Caputi <tcaputi@datto.com>
Tue, 10 Apr 2018 18:15:05 +0000 (14:15 -0400)
committerBrian Behlendorf <behlendorf1@llnl.gov>
Tue, 10 Apr 2018 18:15:05 +0000 (11:15 -0700)
Currently, dnode_check_slots_free() works by checking dn->dn_type
in the dnode to determine if the dnode is reclaimable. However,
there is a small window of time between dnode_free_sync() in the
first call to dsl_dataset_sync() and when the useraccounting code
is run when the type is set DMU_OT_NONE, but the dnode is not yet
evictable, leading to crashes. This patch adds the ability for
dnodes to track which txg they were last dirtied in and adds a
check for this before performing the reclaim.

This patch also corrects several instances when dn_dirty_link was
treated as a list_node_t when it is technically a multilist_node_t.

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Tom Caputi <tcaputi@datto.com>
Closes #7147
Closes #7388

include/sys/dmu_impl.h
include/sys/dnode.h
module/zfs/dbuf.c
module/zfs/dmu.c
module/zfs/dmu_objset.c
module/zfs/dnode.c

index 65e417e3f665de8579973b7ff2742929bdecbb05..03a63077f10139f3f50c4bf36ee29060e60657cf 100644 (file)
@@ -161,6 +161,7 @@ extern "C" {
  *     dn_allocated_txg
  *     dn_free_txg
  *     dn_assigned_txg
+ *     dn_dirty_txg
  *     dd_assigned_tx
  *     dn_notxholds
  *     dn_dirtyctx
index 1cb7cae0935ae5a79a1c785c8c0b4669bb6ec035..2f70d54461cb81fe97631be696bbaa55299da5b0 100644 (file)
@@ -329,6 +329,7 @@ struct dnode {
        uint64_t dn_allocated_txg;
        uint64_t dn_free_txg;
        uint64_t dn_assigned_txg;
+       uint64_t dn_dirty_txg;                  /* txg dnode was last dirtied */
        kcondvar_t dn_notxholds;
        enum dnode_dirtycontext dn_dirtyctx;
        uint8_t *dn_dirtyctx_firstset;          /* dbg: contents meaningless */
@@ -432,6 +433,9 @@ void dnode_evict_dbufs(dnode_t *dn);
 void dnode_evict_bonus(dnode_t *dn);
 void dnode_free_interior_slots(dnode_t *dn);
 
+#define        DNODE_IS_DIRTY(_dn)                                             \
+       ((_dn)->dn_dirty_txg >= spa_syncing_txg((_dn)->dn_objset->os_spa))
+
 #define        DNODE_IS_CACHEABLE(_dn)                                         \
        ((_dn)->dn_objset->os_primary_cache == ZFS_CACHE_ALL ||         \
        (DMU_OT_IS_METADATA((_dn)->dn_type) &&                          \
index 18edd783431d490ae7e678e73af1345f564014f2..1a6e560d21d6631d1f78033e217ba67eba2bf9c7 100644 (file)
@@ -1818,6 +1818,9 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
                            FTAG);
                }
        }
+
+       if (tx->tx_txg > dn->dn_dirty_txg)
+               dn->dn_dirty_txg = tx->tx_txg;
        mutex_exit(&dn->dn_mtx);
 
        if (db->db_blkid == DMU_SPILL_BLKID)
index 2b727372c749bcc58dfdcf5f794d22d6eb386153..64c898198dea0c497154d7ebe404dcea2a5718ae 100644 (file)
@@ -2284,7 +2284,7 @@ dmu_offset_next(objset_t *os, uint64_t object, boolean_t hole, uint64_t *off)
         * Check if dnode is dirty
         */
        for (i = 0; i < TXG_SIZE; i++) {
-               if (list_link_active(&dn->dn_dirty_link[i])) {
+               if (multilist_link_active(&dn->dn_dirty_link[i])) {
                        clean = B_FALSE;
                        break;
                }
index 8efc3154691e062ec8ef56b61203b14b6cbc0077..a0982f6ec176203bd72ac8545fcbc4f4c476e568 100644 (file)
@@ -1416,10 +1416,23 @@ dmu_objset_sync_dnodes(multilist_sublist_t *list, dmu_tx_t *tx)
                ASSERT3U(dn->dn_nlevels, <=, DN_MAX_LEVELS);
                multilist_sublist_remove(list, dn);
 
+               /*
+                * If we are not doing useraccounting (os_synced_dnodes == NULL)
+                * we are done with this dnode for this txg. Unset dn_dirty_txg
+                * if later txgs aren't dirtying it so that future holders do
+                * not get a stale value. Otherwise, we will do this in
+                * userquota_updates_task() when processing has completely
+                * finished for this txg.
+                */
                multilist_t *newlist = dn->dn_objset->os_synced_dnodes;
                if (newlist != NULL) {
                        (void) dnode_add_ref(dn, newlist);
                        multilist_insert(newlist, dn);
+               } else {
+                       mutex_enter(&dn->dn_mtx);
+                       if (dn->dn_dirty_txg == tx->tx_txg)
+                               dn->dn_dirty_txg = 0;
+                       mutex_exit(&dn->dn_mtx);
                }
 
                dnode_sync(dn, tx);
@@ -1889,6 +1902,8 @@ userquota_updates_task(void *arg)
                                dn->dn_id_flags |= DN_ID_CHKED_BONUS;
                }
                dn->dn_id_flags &= ~(DN_ID_NEW_EXIST);
+               if (dn->dn_dirty_txg == spa_syncing_txg(os->os_spa))
+                       dn->dn_dirty_txg = 0;
                mutex_exit(&dn->dn_mtx);
 
                multilist_sublist_remove(list, dn);
index 596983b47acdf50e5e02dc0f193df5af3ada0281..a379527a0662c7eada06e9ba995fcc2c9b994934 100644 (file)
@@ -139,7 +139,7 @@ dnode_cons(void *arg, void *unused, int kmflag)
        bzero(&dn->dn_next_maxblkid[0], sizeof (dn->dn_next_maxblkid));
 
        for (i = 0; i < TXG_SIZE; i++) {
-               list_link_init(&dn->dn_dirty_link[i]);
+               multilist_link_init(&dn->dn_dirty_link[i]);
                dn->dn_free_ranges[i] = NULL;
                list_create(&dn->dn_dirty_records[i],
                    sizeof (dbuf_dirty_record_t),
@@ -149,6 +149,7 @@ dnode_cons(void *arg, void *unused, int kmflag)
        dn->dn_allocated_txg = 0;
        dn->dn_free_txg = 0;
        dn->dn_assigned_txg = 0;
+       dn->dn_dirty_txg = 0;
        dn->dn_dirtyctx = 0;
        dn->dn_dirtyctx_firstset = NULL;
        dn->dn_bonus = NULL;
@@ -188,7 +189,7 @@ dnode_dest(void *arg, void *unused)
        ASSERT(!list_link_active(&dn->dn_link));
 
        for (i = 0; i < TXG_SIZE; i++) {
-               ASSERT(!list_link_active(&dn->dn_dirty_link[i]));
+               ASSERT(!multilist_link_active(&dn->dn_dirty_link[i]));
                ASSERT3P(dn->dn_free_ranges[i], ==, NULL);
                list_destroy(&dn->dn_dirty_records[i]);
                ASSERT0(dn->dn_next_nblkptr[i]);
@@ -204,6 +205,7 @@ dnode_dest(void *arg, void *unused)
        ASSERT0(dn->dn_allocated_txg);
        ASSERT0(dn->dn_free_txg);
        ASSERT0(dn->dn_assigned_txg);
+       ASSERT0(dn->dn_dirty_txg);
        ASSERT0(dn->dn_dirtyctx);
        ASSERT3P(dn->dn_dirtyctx_firstset, ==, NULL);
        ASSERT3P(dn->dn_bonus, ==, NULL);
@@ -530,6 +532,7 @@ dnode_destroy(dnode_t *dn)
        dn->dn_allocated_txg = 0;
        dn->dn_free_txg = 0;
        dn->dn_assigned_txg = 0;
+       dn->dn_dirty_txg = 0;
 
        dn->dn_dirtyctx = 0;
        if (dn->dn_dirtyctx_firstset != NULL) {
@@ -601,6 +604,7 @@ dnode_allocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, int ibs,
        ASSERT0(dn->dn_maxblkid);
        ASSERT0(dn->dn_allocated_txg);
        ASSERT0(dn->dn_assigned_txg);
+       ASSERT0(dn->dn_dirty_txg);
        ASSERT(refcount_is_zero(&dn->dn_tx_holds));
        ASSERT3U(refcount_count(&dn->dn_holds), <=, 1);
        ASSERT(avl_is_empty(&dn->dn_dbufs));
@@ -614,7 +618,7 @@ dnode_allocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, int ibs,
                ASSERT0(dn->dn_rm_spillblk[i]);
                ASSERT0(dn->dn_next_blksz[i]);
                ASSERT0(dn->dn_next_maxblkid[i]);
-               ASSERT(!list_link_active(&dn->dn_dirty_link[i]));
+               ASSERT(!multilist_link_active(&dn->dn_dirty_link[i]));
                ASSERT3P(list_head(&dn->dn_dirty_records[i]), ==, NULL);
                ASSERT3P(dn->dn_free_ranges[i], ==, NULL);
        }
@@ -792,6 +796,7 @@ dnode_move_impl(dnode_t *odn, dnode_t *ndn)
        ndn->dn_allocated_txg = odn->dn_allocated_txg;
        ndn->dn_free_txg = odn->dn_free_txg;
        ndn->dn_assigned_txg = odn->dn_assigned_txg;
+       ndn->dn_dirty_txg = odn->dn_dirty_txg;
        ndn->dn_dirtyctx = odn->dn_dirtyctx;
        ndn->dn_dirtyctx_firstset = odn->dn_dirtyctx_firstset;
        ASSERT(refcount_count(&odn->dn_tx_holds) == 0);
@@ -860,6 +865,7 @@ dnode_move_impl(dnode_t *odn, dnode_t *ndn)
        odn->dn_allocated_txg = 0;
        odn->dn_free_txg = 0;
        odn->dn_assigned_txg = 0;
+       odn->dn_dirty_txg = 0;
        odn->dn_dirtyctx = 0;
        odn->dn_dirtyctx_firstset = NULL;
        odn->dn_have_spill = B_FALSE;
@@ -1086,6 +1092,10 @@ dnode_check_slots_free(dnode_children_t *children, int idx, int slots)
 {
        ASSERT3S(idx + slots, <=, DNODES_PER_BLOCK);
 
+       /*
+        * If all dnode slots are either already free or
+        * evictable return B_TRUE.
+        */
        for (int i = idx; i < idx + slots; i++) {
                dnode_handle_t *dnh = &children->dnc_children[i];
                dnode_t *dn = dnh->dnh_dnode;
@@ -1094,18 +1104,17 @@ dnode_check_slots_free(dnode_children_t *children, int idx, int slots)
                        continue;
                } else if (DN_SLOT_IS_PTR(dn)) {
                        mutex_enter(&dn->dn_mtx);
-                       dmu_object_type_t type = dn->dn_type;
+                       boolean_t can_free = (dn->dn_type == DMU_OT_NONE &&
+                           !DNODE_IS_DIRTY(dn));
                        mutex_exit(&dn->dn_mtx);
 
-                       if (type != DMU_OT_NONE)
+                       if (!can_free)
                                return (B_FALSE);
-
-                       continue;
+                       else
+                               continue;
                } else {
                        return (B_FALSE);
                }
-
-               return (B_FALSE);
        }
 
        return (B_TRUE);
@@ -1633,7 +1642,7 @@ dnode_setdirty(dnode_t *dn, dmu_tx_t *tx)
        /*
         * If we are already marked dirty, we're done.
         */
-       if (list_link_active(&dn->dn_dirty_link[txg & TXG_MASK])) {
+       if (multilist_link_active(&dn->dn_dirty_link[txg & TXG_MASK])) {
                multilist_sublist_unlock(mls);
                return;
        }