From 9631681b75336ec6265d8fa5cecb353687c1f373 Mon Sep 17 00:00:00 2001 From: Brian Behlendorf Date: Tue, 8 Aug 2017 08:38:53 -0700 Subject: [PATCH] Fix dnode allocation race When performing concurrent object allocations using the new multi-threaded allocator and large dnodes it's possible to allocate overlapping large dnodes. This case should have been handled by detecting an error returned by dnode_hold_impl(). But that logic only checked the returned dnp was not-NULL, and the dnp variable was not reset to NULL when retrying. Resolve this issue by properly checking the return value of dnode_hold_impl(). Additionally, it was possible that dnode_hold_impl() would misreport a dnode as free when it was in fact in use. This could occurs for two reasons: * The per-slot zrl_lock must be held over the entire critical section which includes the alloc/free until the new dnode is assigned to children_dnodes. Additionally, all of the zrl_lock's in the range must be held to protect moving dnodes. * The dn->dn_ot_type cannot be solely relied upon to check the type. When allocating a new dnode its type will be DMU_OT_NONE after dnode_create(). Only latter when dnode_allocate() is called will it transition to the new type. This means there's a window when allocating where it can mistaken for a free dnode. Reviewed-by: Giuseppe Di Natale Reviewed-by: Ned Bass Reviewed-by: Tony Hutter Reviewed-by: Olaf Faaland Signed-off-by: Brian Behlendorf Closes #6414 Closes #6439 --- module/zfs/dmu_object.c | 6 +- module/zfs/dmu_tx.c | 10 +- module/zfs/dnode.c | 119 +++++++++--------- module/zfs/zap_micro.c | 2 +- module/zfs/zfs_znode.c | 2 +- tests/runfiles/linux.run | 2 +- .../features/large_dnode/Makefile.am | 3 +- .../large_dnode/large_dnode_008_pos.ksh | 60 +++++++++ 8 files changed, 136 insertions(+), 68 deletions(-) create mode 100755 tests/zfs-tests/tests/functional/features/large_dnode/large_dnode_008_pos.ksh diff --git a/module/zfs/dmu_object.c b/module/zfs/dmu_object.c index 1c7b26d4b..14264ec30 100644 --- a/module/zfs/dmu_object.c +++ b/module/zfs/dmu_object.c @@ -61,6 +61,7 @@ dmu_object_alloc_dnsize(objset_t *os, dmu_object_type_t ot, int blocksize, boolean_t restarted = B_FALSE; uint64_t *cpuobj = NULL; int dnodes_per_chunk = 1 << dmu_object_alloc_chunk_shift; + int error; kpreempt_disable(); cpuobj = &os->os_obj_next_percpu[CPU_SEQID % @@ -129,7 +130,6 @@ dmu_object_alloc_dnsize(objset_t *os, dmu_object_type_t ot, int blocksize, uint64_t offset; uint64_t blkfill; int minlvl; - int error; if (os->os_rescan_dnodes) { offset = 0; os->os_rescan_dnodes = B_FALSE; @@ -163,9 +163,9 @@ dmu_object_alloc_dnsize(objset_t *os, dmu_object_type_t ot, int blocksize, * dmu_tx_assign(), but there is currently no mechanism * to do so. */ - (void) dnode_hold_impl(os, object, DNODE_MUST_BE_FREE, + error = dnode_hold_impl(os, object, DNODE_MUST_BE_FREE, dn_slots, FTAG, &dn); - if (dn != NULL) { + if (error == 0) { rw_enter(&dn->dn_struct_rwlock, RW_WRITER); /* * Another thread could have allocated it; check diff --git a/module/zfs/dmu_tx.c b/module/zfs/dmu_tx.c index 3b96c6bdd..7937615f2 100644 --- a/module/zfs/dmu_tx.c +++ b/module/zfs/dmu_tx.c @@ -1242,11 +1242,13 @@ dmu_tx_sa_registration_hold(sa_os_t *sa, dmu_tx_t *tx) void dmu_tx_hold_spill(dmu_tx_t *tx, uint64_t object) { - dmu_tx_hold_t *txh = dmu_tx_hold_object_impl(tx, - tx->tx_objset, object, THT_SPILL, 0, 0); + dmu_tx_hold_t *txh; - (void) refcount_add_many(&txh->txh_space_towrite, - SPA_OLD_MAXBLOCKSIZE, FTAG); + txh = dmu_tx_hold_object_impl(tx, tx->tx_objset, object, + THT_SPILL, 0, 0); + if (txh != NULL) + (void) refcount_add_many(&txh->txh_space_towrite, + SPA_OLD_MAXBLOCKSIZE, FTAG); } void diff --git a/module/zfs/dnode.c b/module/zfs/dnode.c index 627cc8df1..41180bedf 100644 --- a/module/zfs/dnode.c +++ b/module/zfs/dnode.c @@ -391,7 +391,7 @@ dnode_setdblksz(dnode_t *dn, int size) } static dnode_t * -dnode_create(objset_t *os, dnode_phys_t *dnp, dmu_buf_impl_t *db, +dnode_create(objset_t *os, dnode_phys_t *dnp, int slots, dmu_buf_impl_t *db, uint64_t object, dnode_handle_t *dnh) { dnode_t *dn; @@ -424,11 +424,15 @@ dnode_create(objset_t *os, dnode_phys_t *dnp, dmu_buf_impl_t *db, dn->dn_compress = dnp->dn_compress; dn->dn_bonustype = dnp->dn_bonustype; dn->dn_bonuslen = dnp->dn_bonuslen; - dn->dn_num_slots = dnp->dn_extra_slots + 1; dn->dn_maxblkid = dnp->dn_maxblkid; dn->dn_have_spill = ((dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) != 0); dn->dn_id_flags = 0; + if (slots && dn->dn_type == DMU_OT_NONE) + dn->dn_num_slots = slots; + else + dn->dn_num_slots = dnp->dn_extra_slots + 1; + dmu_zfetch_init(&dn->dn_zfetch, dn); ASSERT(DMU_OT_IS_VALID(dn->dn_phys->dn_type)); @@ -1011,7 +1015,7 @@ dnode_special_open(objset_t *os, dnode_phys_t *dnp, uint64_t object, { dnode_t *dn; - dn = dnode_create(os, dnp, NULL, object, dnh); + dn = dnode_create(os, dnp, 0, NULL, object, dnh); zrl_init(&dnh->dnh_zrlock); DNODE_VERIFY(dn); } @@ -1062,36 +1066,26 @@ dnode_buf_evict_async(void *dbu) * * The dnode_phys_t buffer may not be in sync with the in-core dnode * structure, so we try to check the dnode structure first and fall back - * to the dnode_phys_t buffer it doesn't exist. + * to the dnode_phys_t buffer it doesn't exist. When an in-code dnode + * exists we can always trust dn->dn_num_slots to be accurate, even for + * a held dnode which has not yet been fully allocated. */ static boolean_t -dnode_is_consumed(dmu_buf_impl_t *db, int idx) +dnode_is_consumed(dnode_children_t *children, dnode_phys_t *dn_block, int idx) { - dnode_handle_t *dnh; - dmu_object_type_t ot; - dnode_children_t *children_dnodes; - dnode_phys_t *dn_block; - int skip; - int i; - - children_dnodes = dmu_buf_get_user(&db->db); - dn_block = (dnode_phys_t *)db->db.db_data; + int skip, i; for (i = 0; i < idx; i += skip) { - dnh = &children_dnodes->dnc_children[i]; + dnode_handle_t *dnh = &children->dnc_children[i]; - zrl_add(&dnh->dnh_zrlock); if (dnh->dnh_dnode != NULL) { - ot = dnh->dnh_dnode->dn_type; skip = dnh->dnh_dnode->dn_num_slots; } else { - ot = dn_block[i].dn_type; - skip = dn_block[i].dn_extra_slots + 1; + if (dn_block[i].dn_type != DMU_OT_NONE) + skip = dn_block[i].dn_extra_slots + 1; + else + skip = 1; } - zrl_remove(&dnh->dnh_zrlock); - - if (ot == DMU_OT_NONE) - skip = 1; } return (i > idx); @@ -1107,27 +1101,19 @@ dnode_is_consumed(dmu_buf_impl_t *db, int idx) * to the dnode_phys_t buffer it doesn't exist. */ static boolean_t -dnode_is_allocated(dmu_buf_impl_t *db, int idx) +dnode_is_allocated(dnode_children_t *children, dnode_phys_t *dn_block, int idx) { dnode_handle_t *dnh; dmu_object_type_t ot; - dnode_children_t *children_dnodes; - dnode_phys_t *dn_block; - if (dnode_is_consumed(db, idx)) + if (dnode_is_consumed(children, dn_block, idx)) return (B_FALSE); - children_dnodes = dmu_buf_get_user(&db->db); - dn_block = (dnode_phys_t *)db->db.db_data; - - dnh = &children_dnodes->dnc_children[idx]; - - zrl_add(&dnh->dnh_zrlock); + dnh = &children->dnc_children[idx]; if (dnh->dnh_dnode != NULL) ot = dnh->dnh_dnode->dn_type; else ot = dn_block[idx].dn_type; - zrl_remove(&dnh->dnh_zrlock); return (ot != DMU_OT_NONE); } @@ -1142,32 +1128,27 @@ dnode_is_allocated(dmu_buf_impl_t *db, int idx) * to the dnode_phys_t buffer it doesn't exist. */ static boolean_t -dnode_is_free(dmu_buf_impl_t *db, int idx, int slots) +dnode_is_free(dnode_children_t *children, dnode_phys_t *dn_block, int idx, + int slots) { - dnode_handle_t *dnh; - dmu_object_type_t ot; - dnode_children_t *children_dnodes; - dnode_phys_t *dn_block; - int i; - if (idx + slots > DNODES_PER_BLOCK) return (B_FALSE); - children_dnodes = dmu_buf_get_user(&db->db); - dn_block = (dnode_phys_t *)db->db.db_data; - - if (dnode_is_consumed(db, idx)) + if (dnode_is_consumed(children, dn_block, idx)) return (B_FALSE); - for (i = idx; i < idx + slots; i++) { - dnh = &children_dnodes->dnc_children[i]; + for (int i = idx; i < idx + slots; i++) { + dnode_handle_t *dnh = &children->dnc_children[i]; + dmu_object_type_t ot; + + if (dnh->dnh_dnode != NULL) { + if (dnh->dnh_dnode->dn_num_slots > 1) + return (B_FALSE); - zrl_add(&dnh->dnh_zrlock); - if (dnh->dnh_dnode != NULL) ot = dnh->dnh_dnode->dn_type; - else + } else { ot = dn_block[i].dn_type; - zrl_remove(&dnh->dnh_zrlock); + } if (ot != DMU_OT_NONE) return (B_FALSE); @@ -1176,6 +1157,24 @@ dnode_is_free(dmu_buf_impl_t *db, int idx, int slots) return (B_TRUE); } +static void +dnode_hold_slots(dnode_children_t *children, int idx, int slots) +{ + for (int i = idx; i < MIN(idx + slots, DNODES_PER_BLOCK); i++) { + dnode_handle_t *dnh = &children->dnc_children[i]; + zrl_add(&dnh->dnh_zrlock); + } +} + +static void +dnode_rele_slots(dnode_children_t *children, int idx, int slots) +{ + for (int i = idx; i < MIN(idx + slots, DNODES_PER_BLOCK); i++) { + dnode_handle_t *dnh = &children->dnc_children[i]; + zrl_remove(&dnh->dnh_zrlock); + } +} + /* * errors: * EINVAL - invalid object number. @@ -1286,36 +1285,42 @@ dnode_hold_impl(objset_t *os, uint64_t object, int flag, int slots, idx = object & (epb - 1); dn_block_begin = (dnode_phys_t *)db->db.db_data; - if ((flag & DNODE_MUST_BE_FREE) && !dnode_is_free(db, idx, slots)) { + dnode_hold_slots(children_dnodes, idx, slots); + + if ((flag & DNODE_MUST_BE_FREE) && + !dnode_is_free(children_dnodes, dn_block_begin, idx, slots)) { + dnode_rele_slots(children_dnodes, idx, slots); dbuf_rele(db, FTAG); return (SET_ERROR(ENOSPC)); } else if ((flag & DNODE_MUST_BE_ALLOCATED) && - !dnode_is_allocated(db, idx)) { + !dnode_is_allocated(children_dnodes, dn_block_begin, idx)) { + dnode_rele_slots(children_dnodes, idx, slots); dbuf_rele(db, FTAG); return (SET_ERROR(ENOENT)); } dnh = &children_dnodes->dnc_children[idx]; - zrl_add(&dnh->dnh_zrlock); dn = dnh->dnh_dnode; if (dn == NULL) - dn = dnode_create(os, dn_block_begin + idx, db, object, dnh); + dn = dnode_create(os, dn_block_begin + idx, slots, db, + object, dnh); mutex_enter(&dn->dn_mtx); type = dn->dn_type; if (dn->dn_free_txg || ((flag & DNODE_MUST_BE_FREE) && !refcount_is_zero(&dn->dn_holds))) { mutex_exit(&dn->dn_mtx); - zrl_remove(&dnh->dnh_zrlock); + dnode_rele_slots(children_dnodes, idx, slots); dbuf_rele(db, FTAG); return (SET_ERROR(type == DMU_OT_NONE ? ENOENT : EEXIST)); } if (refcount_add(&dn->dn_holds, tag) == 1) dbuf_add_ref(db, dnh); + mutex_exit(&dn->dn_mtx); /* Now we can rely on the hold to prevent the dnode from moving. */ - zrl_remove(&dnh->dnh_zrlock); + dnode_rele_slots(children_dnodes, idx, slots); DNODE_VERIFY(dn); ASSERT3P(dn->dn_dbuf, ==, db); diff --git a/module/zfs/zap_micro.c b/module/zfs/zap_micro.c index 2cb9f42ae..3ebf995c6 100644 --- a/module/zfs/zap_micro.c +++ b/module/zfs/zap_micro.c @@ -675,7 +675,7 @@ mzap_create_impl(objset_t *os, uint64_t obj, int normflags, zap_flags_t flags, dmu_buf_t *db; mzap_phys_t *zp; - VERIFY(0 == dmu_buf_hold(os, obj, 0, FTAG, &db, DMU_READ_NO_PREFETCH)); + VERIFY0(dmu_buf_hold(os, obj, 0, FTAG, &db, DMU_READ_NO_PREFETCH)); #ifdef ZFS_DEBUG { diff --git a/module/zfs/zfs_znode.c b/module/zfs/zfs_znode.c index a3a028583..e4115fef0 100644 --- a/module/zfs/zfs_znode.c +++ b/module/zfs/zfs_znode.c @@ -763,7 +763,7 @@ zfs_mknode(znode_t *dzp, vattr_t *vap, dmu_tx_t *tx, cred_t *cr, } zh = zfs_znode_hold_enter(zfsvfs, obj); - VERIFY(0 == sa_buf_hold(zfsvfs->z_os, obj, NULL, &db)); + VERIFY0(sa_buf_hold(zfsvfs->z_os, obj, NULL, &db)); /* * If this is the root, fix up the half-initialized parent pointer diff --git a/tests/runfiles/linux.run b/tests/runfiles/linux.run index 2017affa1..708cb4354 100644 --- a/tests/runfiles/linux.run +++ b/tests/runfiles/linux.run @@ -364,7 +364,7 @@ tests = ['async_destroy_001_pos'] [tests/functional/features/large_dnode] tests = ['large_dnode_001_pos', 'large_dnode_002_pos', 'large_dnode_003_pos', 'large_dnode_004_neg', 'large_dnode_005_pos', 'large_dnode_006_pos', - 'large_dnode_007_neg'] + 'large_dnode_007_neg', 'large_dnode_008_pos'] [tests/functional/grow_pool] tests = ['grow_pool_001_pos'] diff --git a/tests/zfs-tests/tests/functional/features/large_dnode/Makefile.am b/tests/zfs-tests/tests/functional/features/large_dnode/Makefile.am index 6afda5362..69ec5e18a 100644 --- a/tests/zfs-tests/tests/functional/features/large_dnode/Makefile.am +++ b/tests/zfs-tests/tests/functional/features/large_dnode/Makefile.am @@ -8,4 +8,5 @@ dist_pkgdata_SCRIPTS = \ large_dnode_004_neg.ksh \ large_dnode_005_pos.ksh \ large_dnode_006_pos.ksh \ - large_dnode_007_neg.ksh + large_dnode_007_neg.ksh \ + large_dnode_008_pos.ksh diff --git a/tests/zfs-tests/tests/functional/features/large_dnode/large_dnode_008_pos.ksh b/tests/zfs-tests/tests/functional/features/large_dnode/large_dnode_008_pos.ksh new file mode 100755 index 000000000..1f900b5ef --- /dev/null +++ b/tests/zfs-tests/tests/functional/features/large_dnode/large_dnode_008_pos.ksh @@ -0,0 +1,60 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2017 by Lawrence Livermore National Security, LLC. +# Use is subject to license terms. +# + +. $STF_SUITE/include/libtest.shlib + +# +# DESCRIPTION: +# Run many xattrtests on a dataset with large dnodes and xattr=sa to +# stress concurrent allocation of large dnodes. +# + +TEST_FS=$TESTPOOL/large_dnode + +verify_runnable "both" + +function cleanup +{ + datasetexists $TEST_FS && log_must zfs destroy $TEST_FS +} + +log_onexit cleanup +log_assert "xattrtest runs concurrently on dataset with large dnodes" + +log_must zfs create $TEST_FS +log_must zfs set dnsize=auto $TEST_FS +log_must zfs set xattr=sa $TEST_FS + +for ((i=0; i < 100; i++)); do + dir="/$TEST_FS/dir.$i" + log_must mkdir "$dir" + log_must eval "xattrtest -R -r -y -x 1 -f 1024 -k -p $dir &" +done + +log_must wait + +log_pass -- 2.40.0