From 1e0457e7f5384b0328ea499083120dd191d80c90 Mon Sep 17 00:00:00 2001 From: Matthew Ahrens Date: Tue, 29 Aug 2017 09:00:28 -0700 Subject: [PATCH] Enhance comments for large dnode project Fix a few nits in the comments from large dnodes. Also import some of the commit message as a comment in the code, making it more accessible. Reviewed-by: @rottegift Reviewed-by: George Melikov Reviewed-by: Giuseppe Di Natale Reviewed-by: Brian Behlendorf Reviewed-by: George Wilson Signed-off-by: Matt Ahrens Closes #6551 --- include/sys/dnode.h | 51 +++++++++++++++++++++++++++++++++++++++++ module/zfs/dmu_object.c | 2 +- module/zfs/dnode.c | 12 ++++++++++ module/zfs/zfs_replay.c | 6 ++--- 4 files changed, 67 insertions(+), 4 deletions(-) diff --git a/include/sys/dnode.h b/include/sys/dnode.h index 7a5a2aa26..5d589a95c 100644 --- a/include/sys/dnode.h +++ b/include/sys/dnode.h @@ -145,6 +145,57 @@ enum dnode_dirtycontext { #define DNODE_CRYPT_PORTABLE_FLAGS_MASK (DNODE_FLAG_SPILL_BLKPTR) +/* + * VARIABLE-LENGTH (LARGE) DNODES + * + * The motivation for variable-length dnodes is to eliminate the overhead + * associated with using spill blocks. Spill blocks are used to store + * system attribute data (i.e. file metadata) that does not fit in the + * dnode's bonus buffer. By allowing a larger bonus buffer area the use of + * a spill block can be avoided. Spill blocks potentially incur an + * additional read I/O for every dnode in a dnode block. As a worst case + * example, reading 32 dnodes from a 16k dnode block and all of the spill + * blocks could issue 33 separate reads. Now suppose those dnodes have size + * 1024 and therefore don't need spill blocks. Then the worst case number + * of blocks read is reduced from 33 to two--one per dnode block. + * + * ZFS-on-Linux systems that make heavy use of extended attributes benefit + * from this feature. 
In particular, ZFS-on-Linux supports the xattr=sa + * dataset property which allows file extended attribute data to be stored + * in the dnode bonus buffer as an alternative to the traditional + * directory-based format. Workloads such as SELinux and the Lustre + * distributed filesystem often store enough xattr data to force spill + * blocks when xattr=sa is in effect. Large dnodes may therefore provide a + * performance benefit to such systems. Other use cases that benefit from + * this feature include files with large ACLs and symbolic links with long + * target names. + * + * The size of a dnode may be a multiple of 512 bytes up to the size of a + * dnode block (currently 16384 bytes). The dn_extra_slots field of the + * on-disk dnode_phys_t structure describes the size of the physical dnode + * on disk. The field represents how many "extra" dnode_phys_t slots a + * dnode consumes in its dnode block. This convention results in a value of + * 0 for 512 byte dnodes which preserves on-disk format compatibility with + * older software which doesn't support large dnodes. + * + * Similarly, the in-memory dnode_t structure has a dn_num_slots field + * to represent the total number of dnode_phys_t slots consumed on disk. + * Thus dn->dn_num_slots is 1 greater than the corresponding + * dnp->dn_extra_slots. This difference in convention was adopted + * because, unlike on-disk structures, backward compatibility is not a + * concern for in-memory objects, so we used a more natural way to + * represent size for a dnode_t. + * + * The default size for newly created dnodes is determined by the value of + * the "dnodesize" dataset property. By default the property is set to + * "legacy" which is compatible with older software. Setting the property + * to "auto" will allow the filesystem to choose the most suitable dnode + * size. 
Currently this just sets the default dnode size to 1k, but future + * code improvements could dynamically choose a size based on observed + * workload patterns. Dnodes of varying sizes can coexist within the same + * dataset and even within the same dnode block. + */ + typedef struct dnode_phys { uint8_t dn_type; /* dmu_object_type_t */ uint8_t dn_indblkshift; /* ln2(indirect block size) */ diff --git a/module/zfs/dmu_object.c b/module/zfs/dmu_object.c index 14264ec30..38ce6746e 100644 --- a/module/zfs/dmu_object.c +++ b/module/zfs/dmu_object.c @@ -318,7 +318,7 @@ dmu_object_next(objset_t *os, uint64_t *objectp, boolean_t hole, uint64_t txg) dmu_object_info_t doi; error = dmu_object_info(os, i, &doi); - if (error) + if (error != 0) skip = 1; else skip = doi.doi_dnodesize >> DNODE_SHIFT; diff --git a/module/zfs/dnode.c b/module/zfs/dnode.c index 9942d6427..8b3ec3aab 100644 --- a/module/zfs/dnode.c +++ b/module/zfs/dnode.c @@ -1176,6 +1176,18 @@ dnode_rele_slots(dnode_children_t *children, int idx, int slots) } /* + * When the DNODE_MUST_BE_FREE flag is set, the "slots" parameter is used + * to ensure the hole at the specified object offset is large enough to + * hold the dnode being created. The slots parameter is also used to ensure + * a dnode does not span multiple dnode blocks. In both of these cases, if + * a failure occurs, ENOSPC is returned. Keep in mind, these failure cases + * are only possible when using DNODE_MUST_BE_FREE. + * + * If the DNODE_MUST_BE_ALLOCATED flag is set, "slots" must be 0. + * dnode_hold_impl() will check if the requested dnode is already consumed + * as an extra dnode slot by a large dnode, in which case it returns + * ENOENT. + * * errors: * EINVAL - invalid object number. 
* ENOSPC - hole too small to fulfill "slots" request diff --git a/module/zfs/zfs_replay.c b/module/zfs/zfs_replay.c index 30efb4b57..c2a9a8fde 100644 --- a/module/zfs/zfs_replay.c +++ b/module/zfs/zfs_replay.c @@ -21,7 +21,7 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2012 Cyril Plisko. All rights reserved. - * Copyright (c) 2013, 2015 by Delphix. All rights reserved. + * Copyright (c) 2013, 2017 by Delphix. All rights reserved. */ #include @@ -453,8 +453,8 @@ zfs_replay_create(zfsvfs_t *zfsvfs, lr_create_t *lr, boolean_t byteswap) * eventually end up in zfs_mknode(), which assigns the object's * creation time, generation number, and dnode slot count. The * generic zfs_create() has no concept of these attributes, so - * we smuggle the values inside * the vattr's otherwise unused - * va_ctime, va_nblocks, and va_nlink fields. + * we smuggle the values inside the vattr's otherwise unused + * va_ctime, va_nblocks, and va_fsid fields. */ ZFS_TIME_DECODE(&xva.xva_vattr.va_ctime, lr->lr_crtime); xva.xva_vattr.va_nblocks = lr->lr_gen; -- 2.40.0