extern unsigned long zfs_deadman_synctime_ms;
extern int metaslab_preload_limit;
extern boolean_t zfs_compressed_arc_enabled;
-extern int zfs_abd_scatter_enabled;
+extern int zfs_abd_scatter_enabled;
+extern int dmu_object_alloc_chunk_shift;
static ztest_shared_opts_t *ztest_shared_opts;
static ztest_shared_opts_t ztest_opts;
ztest_func_t ztest_dmu_read_write;
ztest_func_t ztest_dmu_write_parallel;
ztest_func_t ztest_dmu_object_alloc_free;
+ztest_func_t ztest_dmu_object_next_chunk;
ztest_func_t ztest_dmu_commit_callbacks;
ztest_func_t ztest_zap;
ztest_func_t ztest_zap_parallel;
ZTI_INIT(ztest_dmu_read_write, 1, &zopt_always),
ZTI_INIT(ztest_dmu_write_parallel, 10, &zopt_always),
ZTI_INIT(ztest_dmu_object_alloc_free, 1, &zopt_always),
+ ZTI_INIT(ztest_dmu_object_next_chunk, 1, &zopt_sometimes),
ZTI_INIT(ztest_dmu_commit_callbacks, 1, &zopt_always),
ZTI_INIT(ztest_zap, 30, &zopt_always),
ZTI_INIT(ztest_zap_parallel, 100, &zopt_always),
umem_free(od, size);
}
+/*
+ * Rewind the global allocator to verify object allocation backfilling.
+ */
+void
+ztest_dmu_object_next_chunk(ztest_ds_t *zd, uint64_t id)
+{
+ objset_t *os = zd->zd_os;
+ int dnodes_per_chunk = 1 << dmu_object_alloc_chunk_shift;
+ uint64_t object;
+
+ /*
+ * Rewind the global allocator randomly back to a lower object number
+ * to force backfilling and reclamation of recently freed dnodes.
+ */
+ mutex_enter(&os->os_obj_lock);
+ object = ztest_random(os->os_obj_next_chunk);
+ os->os_obj_next_chunk = P2ALIGN(object, dnodes_per_chunk);
+ mutex_exit(&os->os_obj_lock);
+}
+
#undef OD_ARRAY_SIZE
#define OD_ARRAY_SIZE 2
int minlvl, uint64_t blkfill, uint64_t txg);
void dnode_evict_dbufs(dnode_t *dn);
void dnode_evict_bonus(dnode_t *dn);
+void dnode_free_interior_slots(dnode_t *dn);
#define DNODE_IS_CACHEABLE(_dn) \
((_dn)->dn_objset->os_primary_cache == ZFS_CACHE_ALL || \
* which had already been unlinked in an earlier txg.
*/
kstat_named_t dnode_hold_free_txg;
+ /*
+ * Number of times dnode_free_interior_slots() needed to retry
+ * acquiring a slot zrl lock due to contention.
+ */
+ kstat_named_t dnode_free_interior_lock_retry;
/*
* Number of new dnodes allocated by dnode_allocate().
*/
return (err);
}
-
int
dmu_object_free(objset_t *os, uint64_t object, dmu_tx_t *tx)
{
}
err = dmu_object_info(rwa->os, drro->drr_object, &doi);
-
- if (err != 0 && err != ENOENT)
+ if (err != 0 && err != ENOENT && err != EEXIST)
return (SET_ERROR(EINVAL));
- object = err == 0 ? drro->drr_object : DMU_NEW_OBJECT;
if (drro->drr_object > rwa->max_object)
rwa->max_object = drro->drr_object;
int nblkptr = deduce_nblkptr(drro->drr_bonustype,
drro->drr_bonuslen);
+ object = drro->drr_object;
+
/* nblkptr will be bounded by the bonus size and type */
if (rwa->raw && nblkptr != drro->drr_nblkptr)
return (SET_ERROR(EINVAL));
(drro->drr_blksz != doi.doi_data_block_size ||
nblkptr < doi.doi_nblkptr ||
indblksz != doi.doi_metadata_block_size ||
- drro->drr_nlevels < doi.doi_indirection)) {
+ drro->drr_nlevels < doi.doi_indirection ||
+ drro->drr_dn_slots != doi.doi_dnodesize >> DNODE_SHIFT)) {
err = dmu_free_long_range_raw(rwa->os,
drro->drr_object, 0, DMU_OBJECT_END);
if (err != 0)
return (SET_ERROR(EINVAL));
} else if (drro->drr_blksz != doi.doi_data_block_size ||
- nblkptr < doi.doi_nblkptr) {
+ nblkptr < doi.doi_nblkptr ||
+ drro->drr_dn_slots != doi.doi_dnodesize >> DNODE_SHIFT) {
err = dmu_free_long_range(rwa->os, drro->drr_object,
0, DMU_OBJECT_END);
if (err != 0)
return (SET_ERROR(EINVAL));
}
+
+ /*
+ * The dmu does not currently support decreasing nlevels
+ * on an object. For non-raw sends, this does not matter
+ * and the new object can just use the previous one's nlevels.
+ * For raw sends, however, the structure of the received dnode
+ * (including nlevels) must match that of the send side.
+ * Therefore, instead of using dmu_object_reclaim(), we must
+ * free the object completely and call dmu_object_claim_dnsize()
+ * instead.
+ */
+ if ((rwa->raw && drro->drr_nlevels < doi.doi_indirection) ||
+ drro->drr_dn_slots != doi.doi_dnodesize >> DNODE_SHIFT) {
+ if (rwa->raw) {
+ err = dmu_free_long_object_raw(rwa->os,
+ drro->drr_object);
+ } else {
+ err = dmu_free_long_object(rwa->os,
+ drro->drr_object);
+ }
+ if (err != 0)
+ return (SET_ERROR(EINVAL));
+
+ txg_wait_synced(dmu_objset_pool(rwa->os), 0);
+ object = DMU_NEW_OBJECT;
+ }
+ } else if (err == EEXIST) {
+ /*
+ * The object requested is currently an interior slot of a
+ * multi-slot dnode. This will be resolved when the next txg
+ * is synced out, since the send stream will have told us
+ * to free this slot when we freed the associated dnode
+ * earlier in the stream.
+ */
+ txg_wait_synced(dmu_objset_pool(rwa->os), 0);
+ object = drro->drr_object;
+ } else {
+ /* object is free and we are about to allocate a new one */
+ object = DMU_NEW_OBJECT;
+ }
+
+ /*
+ * If this is a multi-slot dnode there is a chance that this
+ * object will expand into a slot that is already used by
+ * another object from the previous snapshot. We must free
+ * these objects before we attempt to allocate the new dnode.
+ */
+ if (drro->drr_dn_slots > 1) {
+ for (uint64_t slot = drro->drr_object + 1;
+ slot < drro->drr_object + drro->drr_dn_slots;
+ slot++) {
+ dmu_object_info_t slot_doi;
+
+ err = dmu_object_info(rwa->os, slot, &slot_doi);
+ if (err == ENOENT || err == EEXIST)
+ continue;
+ else if (err != 0)
+ return (err);
+
+ if (rwa->raw)
+ err = dmu_free_long_object_raw(rwa->os, slot);
+ else
+ err = dmu_free_long_object(rwa->os, slot);
+
+ if (err != 0)
+ return (err);
+ }
+
+ txg_wait_synced(dmu_objset_pool(rwa->os), 0);
}
tx = dmu_tx_create(rwa->os);
dmu_tx_abort(tx);
return (err);
}
+
if (rwa->raw) {
VERIFY0(dmu_object_dirty_raw(rwa->os, drrs->drr_object, tx));
dmu_buf_will_change_crypt_params(db_spill, tx);
* See receive_read_prefetch for an explanation why we're
* storing this object in the ignore_obj_list.
*/
- if (err == ENOENT ||
+ if (err == ENOENT || err == EEXIST ||
(err == 0 && doi.doi_data_block_size != drro->drr_blksz)) {
objlist_insert(&ra->ignore_objlist, drro->drr_object);
err = 0;
{ "dnode_hold_free_overflow", KSTAT_DATA_UINT64 },
{ "dnode_hold_free_refcount", KSTAT_DATA_UINT64 },
{ "dnode_hold_free_txg", KSTAT_DATA_UINT64 },
+ { "dnode_free_interior_lock_retry", KSTAT_DATA_UINT64 },
{ "dnode_allocate", KSTAT_DATA_UINT64 },
{ "dnode_reallocate", KSTAT_DATA_UINT64 },
{ "dnode_buf_evict", KSTAT_DATA_UINT64 },
mutex_exit(&os->os_lock);
/* the dnode can no longer move, so we can release the handle */
- zrl_remove(&dn->dn_handle->dnh_zrlock);
+ if (!zrl_is_locked(&dn->dn_handle->dnh_zrlock))
+ zrl_remove(&dn->dn_handle->dnh_zrlock);
dn->dn_allocated_txg = 0;
dn->dn_free_txg = 0;
DN_BONUS_SIZE(spa_maxdnodesize(dmu_objset_spa(dn->dn_objset))));
dn_slots = dn_slots > 0 ? dn_slots : DNODE_MIN_SLOTS;
+
+ dnode_free_interior_slots(dn);
DNODE_STAT_BUMP(dnode_reallocate);
/* clean up any unreferenced dbufs */
}
static boolean_t
-dnode_check_slots(dnode_children_t *children, int idx, int slots, void *ptr)
+dnode_check_slots_free(dnode_children_t *children, int idx, int slots)
{
ASSERT3S(idx + slots, <=, DNODES_PER_BLOCK);
for (int i = idx; i < idx + slots; i++) {
dnode_handle_t *dnh = &children->dnc_children[i];
- if (dnh->dnh_dnode != ptr)
+ dnode_t *dn = dnh->dnh_dnode;
+
+ if (dn == DN_SLOT_FREE) {
+ continue;
+ } else if (DN_SLOT_IS_PTR(dn)) {
+ mutex_enter(&dn->dn_mtx);
+ dmu_object_type_t type = dn->dn_type;
+ mutex_exit(&dn->dn_mtx);
+
+ if (type != DMU_OT_NONE)
+ return (B_FALSE);
+
+ continue;
+ } else {
return (B_FALSE);
+ }
+
+ return (B_FALSE);
}
return (B_TRUE);
}
+static void
+dnode_reclaim_slots(dnode_children_t *children, int idx, int slots)
+{
+ ASSERT3S(idx + slots, <=, DNODES_PER_BLOCK);
+
+ for (int i = idx; i < idx + slots; i++) {
+ dnode_handle_t *dnh = &children->dnc_children[i];
+
+ ASSERT(zrl_is_locked(&dnh->dnh_zrlock));
+
+ if (DN_SLOT_IS_PTR(dnh->dnh_dnode)) {
+ ASSERT3S(dnh->dnh_dnode->dn_type, ==, DMU_OT_NONE);
+ dnode_destroy(dnh->dnh_dnode);
+ dnh->dnh_dnode = DN_SLOT_FREE;
+ }
+ }
+}
+
+void
+dnode_free_interior_slots(dnode_t *dn)
+{
+ dnode_children_t *children = dmu_buf_get_user(&dn->dn_dbuf->db);
+ int epb = dn->dn_dbuf->db.db_size >> DNODE_SHIFT;
+ int idx = (dn->dn_object & (epb - 1)) + 1;
+ int slots = dn->dn_num_slots - 1;
+
+ if (slots == 0)
+ return;
+
+ ASSERT3S(idx + slots, <=, DNODES_PER_BLOCK);
+
+ while (!dnode_slots_tryenter(children, idx, slots))
+ DNODE_STAT_BUMP(dnode_free_interior_lock_retry);
+
+ dnode_set_slots(children, idx, slots, DN_SLOT_FREE);
+ dnode_slots_rele(children, idx, slots);
+}
+
void
dnode_special_close(dnode_handle_t *dnh)
{
while (dn == DN_SLOT_UNINIT) {
dnode_slots_hold(dnc, idx, slots);
- if (!dnode_check_slots(dnc, idx, slots, DN_SLOT_FREE)) {
+ if (!dnode_check_slots_free(dnc, idx, slots)) {
DNODE_STAT_BUMP(dnode_hold_free_misses);
dnode_slots_rele(dnc, idx, slots);
dbuf_rele(db, FTAG);
continue;
}
- if (!dnode_check_slots(dnc, idx, slots, DN_SLOT_FREE)) {
+ if (!dnode_check_slots_free(dnc, idx, slots)) {
DNODE_STAT_BUMP(dnode_hold_free_lock_misses);
dnode_slots_rele(dnc, idx, slots);
dbuf_rele(db, FTAG);
return (SET_ERROR(ENOSPC));
}
+ /*
+ * Allocated but otherwise free dnodes which would
+ * be in the interior of a multi-slot dnodes need
+ * to be freed. Single slot dnodes can be safely
+ * re-purposed as a performance optimization.
+ */
+ if (slots > 1)
+ dnode_reclaim_slots(dnc, idx + 1, slots - 1);
+
dnh = &dnc->dnc_children[idx];
- dn = dnode_create(os, dn_block + idx, db, object, dnh);
+ if (DN_SLOT_IS_PTR(dnh->dnh_dnode)) {
+ dn = dnh->dnh_dnode;
+ } else {
+ dn = dnode_create(os, dn_block + idx, db,
+ object, dnh);
+ }
}
mutex_enter(&dn->dn_mtx);
if (dn->dn_allocated_txg != dn->dn_free_txg)
dmu_buf_will_dirty(&dn->dn_dbuf->db, tx);
bzero(dn->dn_phys, sizeof (dnode_phys_t) * dn->dn_num_slots);
+ dnode_free_interior_slots(dn);
mutex_enter(&dn->dn_mtx);
dn->dn_type = DMU_OT_NONE;
dn->dn_allocated_txg = 0;
dn->dn_free_txg = 0;
dn->dn_have_spill = B_FALSE;
+ dn->dn_num_slots = 1;
mutex_exit(&dn->dn_mtx);
ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT);
'send-c_mixed_compression', 'send-c_stream_size_estimate', 'send-cD',
'send-c_embedded_blocks', 'send-c_resume', 'send-cpL_varied_recsize',
'send-c_recv_dedup', 'send_encrypted_files', 'send_encrypted_heirarchy',
- 'send_freeobjects']
+ 'send_freeobjects', 'send_realloc_dnode_size']
tags = ['functional', 'rsend']
[tests/functional/scrub_mirror]
send-c_volume.ksh \
send-c_zstreamdump.ksh \
send-cpL_varied_recsize.ksh \
- send_freeobjects.ksh
+ send_freeobjects.ksh \
+ send_realloc_dnode_size.ksh
--- /dev/null
+#!/bin/ksh
+
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright (c) 2017 by Lawrence Livermore National Security, LLC.
+#
+
+. $STF_SUITE/include/libtest.shlib
+. $STF_SUITE/tests/functional/rsend/rsend.kshlib
+
+#
+# Description:
+# Verify incremental receive properly handles objects with changed
+# dnode slot count.
+#
+# Strategy:
+# 1. Populate a dataset with 1k byte dnodes and snapshot
+# 2. Remove objects, set dnodesize=legacy, and remount dataset so new objects
+# get recycled numbers and formerly "interior" dnode slots get assigned
+# to new objects
+# 3. Remove objects, set dnodesize=2k, and remount dataset so new objects
+# overlap with recently recycled and formerly "normal" dnode slots get
+# assigned to new objects
+# 4. Generate initial and incremental streams
+# 5. Verify initial and incremental streams can be received
+#
+
+verify_runnable "both"
+
+log_assert "Verify incremental receive handles objects with changed dnode size"
+
+function cleanup
+{
+ rm -f $BACKDIR/fs-dn-legacy
+ rm -f $BACKDIR/fs-dn-1k
+ rm -f $BACKDIR/fs-dn-2k
+
+ if datasetexists $POOL/fs ; then
+ log_must zfs destroy -rR $POOL/fs
+ fi
+
+ if datasetexists $POOL/newfs ; then
+ log_must zfs destroy -rR $POOL/newfs
+ fi
+}
+
+log_onexit cleanup
+
+# 1. Populate a dataset with 1k byte dnodes and snapshot
+log_must zfs create -o dnodesize=1k $POOL/fs
+log_must mk_files 200 262144 0 $POOL/fs
+log_must zfs snapshot $POOL/fs@a
+
+# 2. Remove objects, set dnodesize=legacy, and remount dataset so new objects
+# get recycled numbers and formerly "interior" dnode slots get assigned
+# to new objects
+rm /$POOL/fs/*
+
+log_must zfs unmount $POOL/fs
+log_must zfs set dnodesize=legacy $POOL/fs
+log_must zfs mount $POOL/fs
+
+log_must mk_files 200 262144 0 $POOL/fs
+log_must zfs snapshot $POOL/fs@b
+
+# 3. Remove objects, set dnodesize=2k, and remount dataset so new objects
+# overlap with recently recycled and formerly "normal" dnode slots get
+# assigned to new objects
+rm /$POOL/fs/*
+
+log_must zfs unmount $POOL/fs
+log_must zfs set dnodesize=2k $POOL/fs
+log_must zfs mount $POOL/fs
+
+mk_files 200 262144 0 $POOL/fs
+log_must zfs snapshot $POOL/fs@c
+
+# 4. Generate initial and incremental streams
+log_must eval "zfs send $POOL/fs@a > $BACKDIR/fs-dn-1k"
+log_must eval "zfs send -i $POOL/fs@a $POOL/fs@b > $BACKDIR/fs-dn-legacy"
+log_must eval "zfs send -i $POOL/fs@b $POOL/fs@c > $BACKDIR/fs-dn-2k"
+
+# 5. Verify initial and incremental streams can be received
+log_must eval "zfs recv $POOL/newfs < $BACKDIR/fs-dn-1k"
+log_must eval "zfs recv $POOL/newfs < $BACKDIR/fs-dn-legacy"
+log_must eval "zfs recv $POOL/newfs < $BACKDIR/fs-dn-2k"
+
+log_pass "Verify incremental receive handles objects with changed dnode size"