]> granicus.if.org Git - zfs/commitdiff
Fix dnode allocation race
authorBrian Behlendorf <behlendorf1@llnl.gov>
Tue, 8 Aug 2017 15:38:53 +0000 (08:38 -0700)
committerGitHub <noreply@github.com>
Tue, 8 Aug 2017 15:38:53 +0000 (08:38 -0700)
When performing concurrent object allocations using the new
multi-threaded allocator and large dnodes it's possible to
allocate overlapping large dnodes.

This case should have been handled by detecting an error
returned by dnode_hold_impl().  But that logic only checked
the returned dnp was not-NULL, and the dnp variable was not
reset to NULL when retrying.  Resolve this issue by properly
checking the return value of dnode_hold_impl().

Additionally, it was possible that dnode_hold_impl() would
misreport a dnode as free when it was in fact in use.  This
could occurs for two reasons:

* The per-slot zrl_lock must be held over the entire critical
  section which includes the alloc/free until the new dnode
  is assigned to children_dnodes.  Additionally, all of the
  zrl_lock's in the range must be held to protect moving
  dnodes.

* The dn->dn_ot_type cannot be solely relied upon to check
  the type.  When allocating a new dnode its type will be
  DMU_OT_NONE after dnode_create().  Only latter when
  dnode_allocate() is called will it transition to the new
  type.  This means there's a window when allocating where
  it can mistaken for a free dnode.

Reviewed-by: Giuseppe Di Natale <dinatale2@llnl.gov>
Reviewed-by: Ned Bass <bass6@llnl.gov>
Reviewed-by: Tony Hutter <hutter2@llnl.gov>
Reviewed-by: Olaf Faaland <faaland1@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #6414
Closes #6439

module/zfs/dmu_object.c
module/zfs/dmu_tx.c
module/zfs/dnode.c
module/zfs/zap_micro.c
module/zfs/zfs_znode.c
tests/runfiles/linux.run
tests/zfs-tests/tests/functional/features/large_dnode/Makefile.am
tests/zfs-tests/tests/functional/features/large_dnode/large_dnode_008_pos.ksh [new file with mode: 0755]

index 1c7b26d4becc17f46c253b1143683c4b627daffb..14264ec30cfc3e0ae76296cf62cbbddcf978624d 100644 (file)
@@ -61,6 +61,7 @@ dmu_object_alloc_dnsize(objset_t *os, dmu_object_type_t ot, int blocksize,
        boolean_t restarted = B_FALSE;
        uint64_t *cpuobj = NULL;
        int dnodes_per_chunk = 1 << dmu_object_alloc_chunk_shift;
+       int error;
 
        kpreempt_disable();
        cpuobj = &os->os_obj_next_percpu[CPU_SEQID %
@@ -129,7 +130,6 @@ dmu_object_alloc_dnsize(objset_t *os, dmu_object_type_t ot, int blocksize,
                                uint64_t offset;
                                uint64_t blkfill;
                                int minlvl;
-                               int error;
                                if (os->os_rescan_dnodes) {
                                        offset = 0;
                                        os->os_rescan_dnodes = B_FALSE;
@@ -163,9 +163,9 @@ dmu_object_alloc_dnsize(objset_t *os, dmu_object_type_t ot, int blocksize,
                 * dmu_tx_assign(), but there is currently no mechanism
                 * to do so.
                 */
-               (void) dnode_hold_impl(os, object, DNODE_MUST_BE_FREE,
+               error = dnode_hold_impl(os, object, DNODE_MUST_BE_FREE,
                    dn_slots, FTAG, &dn);
-               if (dn != NULL) {
+               if (error == 0) {
                        rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
                        /*
                         * Another thread could have allocated it; check
index 3b96c6bdd062a79ee09dd669438c028a73cdf395..7937615f2e750116e1fe89b534d9d27e1439bcde 100644 (file)
@@ -1242,11 +1242,13 @@ dmu_tx_sa_registration_hold(sa_os_t *sa, dmu_tx_t *tx)
 void
 dmu_tx_hold_spill(dmu_tx_t *tx, uint64_t object)
 {
-       dmu_tx_hold_t *txh = dmu_tx_hold_object_impl(tx,
-           tx->tx_objset, object, THT_SPILL, 0, 0);
+       dmu_tx_hold_t *txh;
 
-       (void) refcount_add_many(&txh->txh_space_towrite,
-           SPA_OLD_MAXBLOCKSIZE, FTAG);
+       txh = dmu_tx_hold_object_impl(tx, tx->tx_objset, object,
+           THT_SPILL, 0, 0);
+       if (txh != NULL)
+               (void) refcount_add_many(&txh->txh_space_towrite,
+                   SPA_OLD_MAXBLOCKSIZE, FTAG);
 }
 
 void
index 627cc8df1050d616e69a4b2fcf8870532dd52818..41180bedf406eaba0adc867c92dd3e9d979b5006 100644 (file)
@@ -391,7 +391,7 @@ dnode_setdblksz(dnode_t *dn, int size)
 }
 
 static dnode_t *
-dnode_create(objset_t *os, dnode_phys_t *dnp, dmu_buf_impl_t *db,
+dnode_create(objset_t *os, dnode_phys_t *dnp, int slots, dmu_buf_impl_t *db,
     uint64_t object, dnode_handle_t *dnh)
 {
        dnode_t *dn;
@@ -424,11 +424,15 @@ dnode_create(objset_t *os, dnode_phys_t *dnp, dmu_buf_impl_t *db,
        dn->dn_compress = dnp->dn_compress;
        dn->dn_bonustype = dnp->dn_bonustype;
        dn->dn_bonuslen = dnp->dn_bonuslen;
-       dn->dn_num_slots = dnp->dn_extra_slots + 1;
        dn->dn_maxblkid = dnp->dn_maxblkid;
        dn->dn_have_spill = ((dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) != 0);
        dn->dn_id_flags = 0;
 
+       if (slots && dn->dn_type == DMU_OT_NONE)
+               dn->dn_num_slots = slots;
+       else
+               dn->dn_num_slots = dnp->dn_extra_slots + 1;
+
        dmu_zfetch_init(&dn->dn_zfetch, dn);
 
        ASSERT(DMU_OT_IS_VALID(dn->dn_phys->dn_type));
@@ -1011,7 +1015,7 @@ dnode_special_open(objset_t *os, dnode_phys_t *dnp, uint64_t object,
 {
        dnode_t *dn;
 
-       dn = dnode_create(os, dnp, NULL, object, dnh);
+       dn = dnode_create(os, dnp, 0, NULL, object, dnh);
        zrl_init(&dnh->dnh_zrlock);
        DNODE_VERIFY(dn);
 }
@@ -1062,36 +1066,26 @@ dnode_buf_evict_async(void *dbu)
  *
  * The dnode_phys_t buffer may not be in sync with the in-core dnode
  * structure, so we try to check the dnode structure first and fall back
- * to the dnode_phys_t buffer it doesn't exist.
+ * to the dnode_phys_t buffer it doesn't exist.  When an in-code dnode
+ * exists we can always trust dn->dn_num_slots to be accurate, even for
+ * a held dnode which has not yet been fully allocated.
  */
 static boolean_t
-dnode_is_consumed(dmu_buf_impl_t *db, int idx)
+dnode_is_consumed(dnode_children_t *children, dnode_phys_t *dn_block, int idx)
 {
-       dnode_handle_t *dnh;
-       dmu_object_type_t ot;
-       dnode_children_t *children_dnodes;
-       dnode_phys_t *dn_block;
-       int skip;
-       int i;
-
-       children_dnodes = dmu_buf_get_user(&db->db);
-       dn_block = (dnode_phys_t *)db->db.db_data;
+       int skip, i;
 
        for (i = 0; i < idx; i += skip) {
-               dnh = &children_dnodes->dnc_children[i];
+               dnode_handle_t *dnh = &children->dnc_children[i];
 
-               zrl_add(&dnh->dnh_zrlock);
                if (dnh->dnh_dnode != NULL) {
-                       ot = dnh->dnh_dnode->dn_type;
                        skip = dnh->dnh_dnode->dn_num_slots;
                } else {
-                       ot = dn_block[i].dn_type;
-                       skip = dn_block[i].dn_extra_slots + 1;
+                       if (dn_block[i].dn_type != DMU_OT_NONE)
+                               skip = dn_block[i].dn_extra_slots + 1;
+                       else
+                               skip = 1;
                }
-               zrl_remove(&dnh->dnh_zrlock);
-
-               if (ot == DMU_OT_NONE)
-                       skip = 1;
        }
 
        return (i > idx);
@@ -1107,27 +1101,19 @@ dnode_is_consumed(dmu_buf_impl_t *db, int idx)
  * to the dnode_phys_t buffer it doesn't exist.
  */
 static boolean_t
-dnode_is_allocated(dmu_buf_impl_t *db, int idx)
+dnode_is_allocated(dnode_children_t *children, dnode_phys_t *dn_block, int idx)
 {
        dnode_handle_t *dnh;
        dmu_object_type_t ot;
-       dnode_children_t *children_dnodes;
-       dnode_phys_t *dn_block;
 
-       if (dnode_is_consumed(db, idx))
+       if (dnode_is_consumed(children, dn_block, idx))
                return (B_FALSE);
 
-       children_dnodes = dmu_buf_get_user(&db->db);
-       dn_block = (dnode_phys_t *)db->db.db_data;
-
-       dnh = &children_dnodes->dnc_children[idx];
-
-       zrl_add(&dnh->dnh_zrlock);
+       dnh = &children->dnc_children[idx];
        if (dnh->dnh_dnode != NULL)
                ot = dnh->dnh_dnode->dn_type;
        else
                ot = dn_block[idx].dn_type;
-       zrl_remove(&dnh->dnh_zrlock);
 
        return (ot != DMU_OT_NONE);
 }
@@ -1142,32 +1128,27 @@ dnode_is_allocated(dmu_buf_impl_t *db, int idx)
  * to the dnode_phys_t buffer it doesn't exist.
  */
 static boolean_t
-dnode_is_free(dmu_buf_impl_t *db, int idx, int slots)
+dnode_is_free(dnode_children_t *children, dnode_phys_t *dn_block, int idx,
+    int slots)
 {
-       dnode_handle_t *dnh;
-       dmu_object_type_t ot;
-       dnode_children_t *children_dnodes;
-       dnode_phys_t *dn_block;
-       int i;
-
        if (idx + slots > DNODES_PER_BLOCK)
                return (B_FALSE);
 
-       children_dnodes = dmu_buf_get_user(&db->db);
-       dn_block = (dnode_phys_t *)db->db.db_data;
-
-       if (dnode_is_consumed(db, idx))
+       if (dnode_is_consumed(children, dn_block, idx))
                return (B_FALSE);
 
-       for (i = idx; i < idx + slots; i++) {
-               dnh = &children_dnodes->dnc_children[i];
+       for (int i = idx; i < idx + slots; i++) {
+               dnode_handle_t *dnh = &children->dnc_children[i];
+               dmu_object_type_t ot;
+
+               if (dnh->dnh_dnode != NULL) {
+                       if (dnh->dnh_dnode->dn_num_slots > 1)
+                               return (B_FALSE);
 
-               zrl_add(&dnh->dnh_zrlock);
-               if (dnh->dnh_dnode != NULL)
                        ot = dnh->dnh_dnode->dn_type;
-               else
+               } else {
                        ot = dn_block[i].dn_type;
-               zrl_remove(&dnh->dnh_zrlock);
+               }
 
                if (ot != DMU_OT_NONE)
                        return (B_FALSE);
@@ -1176,6 +1157,24 @@ dnode_is_free(dmu_buf_impl_t *db, int idx, int slots)
        return (B_TRUE);
 }
 
+static void
+dnode_hold_slots(dnode_children_t *children, int idx, int slots)
+{
+       for (int i = idx; i < MIN(idx + slots, DNODES_PER_BLOCK); i++) {
+               dnode_handle_t *dnh = &children->dnc_children[i];
+               zrl_add(&dnh->dnh_zrlock);
+       }
+}
+
+static void
+dnode_rele_slots(dnode_children_t *children, int idx, int slots)
+{
+       for (int i = idx; i < MIN(idx + slots, DNODES_PER_BLOCK); i++) {
+               dnode_handle_t *dnh = &children->dnc_children[i];
+               zrl_remove(&dnh->dnh_zrlock);
+       }
+}
+
 /*
  * errors:
  * EINVAL - invalid object number.
@@ -1286,36 +1285,42 @@ dnode_hold_impl(objset_t *os, uint64_t object, int flag, int slots,
        idx = object & (epb - 1);
        dn_block_begin = (dnode_phys_t *)db->db.db_data;
 
-       if ((flag & DNODE_MUST_BE_FREE) && !dnode_is_free(db, idx, slots)) {
+       dnode_hold_slots(children_dnodes, idx, slots);
+
+       if ((flag & DNODE_MUST_BE_FREE) &&
+           !dnode_is_free(children_dnodes, dn_block_begin, idx, slots)) {
+               dnode_rele_slots(children_dnodes, idx, slots);
                dbuf_rele(db, FTAG);
                return (SET_ERROR(ENOSPC));
        } else if ((flag & DNODE_MUST_BE_ALLOCATED) &&
-           !dnode_is_allocated(db, idx)) {
+           !dnode_is_allocated(children_dnodes, dn_block_begin, idx)) {
+               dnode_rele_slots(children_dnodes, idx, slots);
                dbuf_rele(db, FTAG);
                return (SET_ERROR(ENOENT));
        }
 
        dnh = &children_dnodes->dnc_children[idx];
-       zrl_add(&dnh->dnh_zrlock);
        dn = dnh->dnh_dnode;
        if (dn == NULL)
-               dn = dnode_create(os, dn_block_begin + idx, db, object, dnh);
+               dn = dnode_create(os, dn_block_begin + idx, slots, db,
+                   object, dnh);
 
        mutex_enter(&dn->dn_mtx);
        type = dn->dn_type;
        if (dn->dn_free_txg ||
            ((flag & DNODE_MUST_BE_FREE) && !refcount_is_zero(&dn->dn_holds))) {
                mutex_exit(&dn->dn_mtx);
-               zrl_remove(&dnh->dnh_zrlock);
+               dnode_rele_slots(children_dnodes, idx, slots);
                dbuf_rele(db, FTAG);
                return (SET_ERROR(type == DMU_OT_NONE ? ENOENT : EEXIST));
        }
        if (refcount_add(&dn->dn_holds, tag) == 1)
                dbuf_add_ref(db, dnh);
+
        mutex_exit(&dn->dn_mtx);
 
        /* Now we can rely on the hold to prevent the dnode from moving. */
-       zrl_remove(&dnh->dnh_zrlock);
+       dnode_rele_slots(children_dnodes, idx, slots);
 
        DNODE_VERIFY(dn);
        ASSERT3P(dn->dn_dbuf, ==, db);
index 2cb9f42ae08b84dce8045d2b3047d6aa5e3f380c..3ebf995c6780cea7fc260a1c3dde9c04e024166b 100644 (file)
@@ -675,7 +675,7 @@ mzap_create_impl(objset_t *os, uint64_t obj, int normflags, zap_flags_t flags,
        dmu_buf_t *db;
        mzap_phys_t *zp;
 
-       VERIFY(0 == dmu_buf_hold(os, obj, 0, FTAG, &db, DMU_READ_NO_PREFETCH));
+       VERIFY0(dmu_buf_hold(os, obj, 0, FTAG, &db, DMU_READ_NO_PREFETCH));
 
 #ifdef ZFS_DEBUG
        {
index a3a0285835f5a37095a93191b1ee28323df8bbb5..e4115fef089aab4515785bcef3319ffb72a08409 100644 (file)
@@ -763,7 +763,7 @@ zfs_mknode(znode_t *dzp, vattr_t *vap, dmu_tx_t *tx, cred_t *cr,
        }
 
        zh = zfs_znode_hold_enter(zfsvfs, obj);
-       VERIFY(0 == sa_buf_hold(zfsvfs->z_os, obj, NULL, &db));
+       VERIFY0(sa_buf_hold(zfsvfs->z_os, obj, NULL, &db));
 
        /*
         * If this is the root, fix up the half-initialized parent pointer
index 2017affa1b4d1964f1c67a4b1b3bd9e7f3e8cb03..708cb43548602fc5548debd1c518e46a754ede90 100644 (file)
@@ -364,7 +364,7 @@ tests = ['async_destroy_001_pos']
 [tests/functional/features/large_dnode]
 tests = ['large_dnode_001_pos', 'large_dnode_002_pos', 'large_dnode_003_pos',
          'large_dnode_004_neg', 'large_dnode_005_pos', 'large_dnode_006_pos',
-         'large_dnode_007_neg']
+         'large_dnode_007_neg', 'large_dnode_008_pos']
 
 [tests/functional/grow_pool]
 tests = ['grow_pool_001_pos']
index 6afda53629ae9ad9847eaf874d67ad186eaafceb..69ec5e18a0ac68b03ee0232f6e159cfa3ed26c9c 100644 (file)
@@ -8,4 +8,5 @@ dist_pkgdata_SCRIPTS = \
        large_dnode_004_neg.ksh \
        large_dnode_005_pos.ksh \
        large_dnode_006_pos.ksh \
-       large_dnode_007_neg.ksh
+       large_dnode_007_neg.ksh \
+       large_dnode_008_pos.ksh
diff --git a/tests/zfs-tests/tests/functional/features/large_dnode/large_dnode_008_pos.ksh b/tests/zfs-tests/tests/functional/features/large_dnode/large_dnode_008_pos.ksh
new file mode 100755 (executable)
index 0000000..1f900b5
--- /dev/null
@@ -0,0 +1,60 @@
+#!/bin/ksh -p
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright (c) 2017 by Lawrence Livermore National Security, LLC.
+# Use is subject to license terms.
+#
+
+. $STF_SUITE/include/libtest.shlib
+
+#
+# DESCRIPTION:
+# Run many xattrtests on a dataset with large dnodes and xattr=sa to
+# stress concurrent allocation of large dnodes.
+#
+
+TEST_FS=$TESTPOOL/large_dnode
+
+verify_runnable "both"
+
+function cleanup
+{
+       datasetexists $TEST_FS && log_must zfs destroy $TEST_FS
+}
+
+log_onexit cleanup
+log_assert "xattrtest runs concurrently on dataset with large dnodes"
+
+log_must zfs create $TEST_FS
+log_must zfs set dnsize=auto $TEST_FS
+log_must zfs set xattr=sa $TEST_FS
+
+for ((i=0; i < 100; i++)); do
+       dir="/$TEST_FS/dir.$i"
+       log_must mkdir "$dir"
+       log_must eval "xattrtest -R -r -y -x 1 -f 1024 -k -p $dir &"
+done
+
+log_must wait
+
+log_pass