granicus.if.org Git - zfs/commitdiff
Update to onnv_147
author Brian Behlendorf <behlendorf1@llnl.gov>
Thu, 26 Aug 2010 21:24:34 +0000 (14:24 -0700)
committer Brian Behlendorf <behlendorf1@llnl.gov>
Thu, 26 Aug 2010 21:24:34 +0000 (14:24 -0700)
This is the last official OpenSolaris tag before the public
development tree was closed.

101 files changed:
ZFS.RELEASE
cmd/zdb/zdb.c
cmd/zfs/zfs_main.c
cmd/zinject/translate.c
cmd/zinject/zinject.c
cmd/zpool/zpool_main.c
cmd/ztest/ztest.c
lib/libefi/include/sys/efi_partition.h
lib/libefi/rdwr_efi.c
lib/libnvpair/include/libnvpair.h
lib/libnvpair/libnvpair.c
lib/libuutil/include/libuutil.h
lib/libuutil/uu_alloc.c
lib/libuutil/uu_misc.c
lib/libuutil/uu_string.c [new file with mode: 0644]
lib/libzfs/include/libzfs.h
lib/libzfs/include/libzfs_impl.h
lib/libzfs/libzfs_dataset.c
lib/libzfs/libzfs_diff.c [new file with mode: 0644]
lib/libzfs/libzfs_import.c
lib/libzfs/libzfs_mount.c
lib/libzfs/libzfs_pool.c
lib/libzfs/libzfs_sendrecv.c
lib/libzfs/libzfs_util.c
lib/libzpool/include/sys/zfs_context.h
lib/libzpool/kernel.c
module/nvpair/include/sys/nvpair.h
module/nvpair/nvpair.c
module/zcommon/include/sys/fs/zfs.h
module/zcommon/include/zfs_deleg.h
module/zcommon/zfs_deleg.c
module/zcommon/zpool_prop.c
module/zfs/arc.c
module/zfs/bpobj.c
module/zfs/dbuf.c
module/zfs/ddt.c
module/zfs/dmu.c
module/zfs/dmu_diff.c [new file with mode: 0644]
module/zfs/dmu_object.c
module/zfs/dmu_objset.c
module/zfs/dmu_send.c
module/zfs/dmu_traverse.c
module/zfs/dmu_tx.c
module/zfs/dnode.c
module/zfs/dnode_sync.c
module/zfs/dsl_dataset.c
module/zfs/dsl_deleg.c
module/zfs/dsl_pool.c
module/zfs/dsl_scan.c
module/zfs/dsl_synctask.c
module/zfs/fm.c
module/zfs/include/sys/dbuf.h
module/zfs/include/sys/dmu.h
module/zfs/include/sys/dmu_objset.h
module/zfs/include/sys/dmu_traverse.h
module/zfs/include/sys/dnode.h
module/zfs/include/sys/dsl_dataset.h
module/zfs/include/sys/dsl_deleg.h
module/zfs/include/sys/fm/protocol.h
module/zfs/include/sys/fm/util.h
module/zfs/include/sys/refcount.h
module/zfs/include/sys/sa.h
module/zfs/include/sys/sa_impl.h
module/zfs/include/sys/spa.h
module/zfs/include/sys/spa_impl.h
module/zfs/include/sys/vdev_impl.h
module/zfs/include/sys/zfs_acl.h
module/zfs/include/sys/zfs_ioctl.h
module/zfs/include/sys/zfs_onexit.h [new file with mode: 0644]
module/zfs/include/sys/zfs_stat.h [new file with mode: 0644]
module/zfs/include/sys/zfs_vfsops.h
module/zfs/include/sys/zfs_znode.h
module/zfs/include/sys/zil.h
module/zfs/include/sys/zil_impl.h
module/zfs/include/sys/zio.h
module/zfs/include/sys/zrlock.h [new file with mode: 0644]
module/zfs/lzjb.c
module/zfs/refcount.c
module/zfs/sa.c
module/zfs/spa.c
module/zfs/spa_config.c
module/zfs/spa_misc.c
module/zfs/txg.c
module/zfs/vdev.c
module/zfs/vdev_label.c
module/zfs/zfs_acl.c
module/zfs/zfs_ctldir.c
module/zfs/zfs_dir.c
module/zfs/zfs_fuid.c
module/zfs/zfs_ioctl.c
module/zfs/zfs_log.c
module/zfs/zfs_onexit.c [new file with mode: 0644]
module/zfs/zfs_replay.c
module/zfs/zfs_sa.c
module/zfs/zfs_vfsops.c
module/zfs/zfs_vnops.c
module/zfs/zfs_znode.c
module/zfs/zil.c
module/zfs/zio.c
module/zfs/zio_inject.c
module/zfs/zrlock.c [new file with mode: 0644]

index dd19a8e8d458ee6a0e053772f461600add102556..8ad36b98359796d9700bd035460b48a56919630c 100644 (file)
@@ -1 +1 @@
-ssh://anon@hg.opensolaris.org/hg/onnv/onnv-gate/onnv_141
+ssh://anon@hg.opensolaris.org/hg/onnv/onnv-gate/onnv_147
index ff73072f8a64f5622c23cb5b2b3c625af3b9e9b4..c6e219df9e1d7bc7df39e9d3f23185fe84b39552 100644 (file)
@@ -695,12 +695,12 @@ dump_ddt(ddt_t *ddt, enum ddt_type type, enum ddt_class class)
                return;
        ASSERT(error == 0);
 
-       count = ddt_object_count(ddt, type, class);
+       if ((count = ddt_object_count(ddt, type, class)) == 0)
+               return;
+
        dspace = doi.doi_physical_blocks_512 << 9;
        mspace = doi.doi_fill_count * doi.doi_data_block_size;
 
-       ASSERT(count != 0);     /* we should have destroyed it */
-
        ddt_object_name(ddt, type, class, name);
 
        (void) printf("%s: %llu entries, size %llu on disk, %llu in core\n",
@@ -1290,8 +1290,12 @@ dump_znode(objset_t *os, uint64_t object, void *data, size_t size)
                        VERIFY(zap_lookup(os, MASTER_NODE_OBJ, ZFS_SA_ATTRS,
                            8, 1, &sa_attrs) == 0);
                }
-               sa_attr_table = sa_setup(os, sa_attrs,
-                   zfs_attr_table, ZPL_END);
+               if ((error = sa_setup(os, sa_attrs, zfs_attr_table,
+                   ZPL_END, &sa_attr_table)) != 0) {
+                       (void) printf("sa_setup failed errno %d, can't "
+                           "display znode contents\n", error);
+                       return;
+               }
                sa_loaded = B_TRUE;
        }
 
@@ -1455,7 +1459,7 @@ dump_object(objset_t *os, uint64_t object, int verbosity, int *print_header)
        }
 
        if (object == 0) {
-               dn = os->os_meta_dnode;
+               dn = DMU_META_DNODE(os);
        } else {
                error = dmu_bonus_hold(os, object, FTAG, &db);
                if (error)
@@ -1463,7 +1467,7 @@ dump_object(objset_t *os, uint64_t object, int verbosity, int *print_header)
                            object, error);
                bonus = db->db_data;
                bsize = db->db_size;
-               dn = ((dmu_buf_impl_t *)db)->db_dnode;
+               dn = DB_DNODE((dmu_buf_impl_t *)db);
        }
        dmu_object_info_from_dnode(dn, &doi);
 
@@ -1627,8 +1631,8 @@ dump_dir(objset_t *os)
 
        dump_object(os, 0, verbosity, &print_header);
        object_count = 0;
-       if (os->os_userused_dnode &&
-           os->os_userused_dnode->dn_type != 0) {
+       if (DMU_USERUSED_DNODE(os) != NULL &&
+           DMU_USERUSED_DNODE(os)->dn_type != 0) {
                dump_object(os, DMU_USERUSED_OBJECT, verbosity, &print_header);
                dump_object(os, DMU_GROUPUSED_OBJECT, verbosity, &print_header);
        }
@@ -3072,8 +3076,11 @@ main(int argc, char **argv)
                                fatal("can't open '%s': %s",
                                    target, strerror(ENOMEM));
                        }
-                       if ((error = spa_import(name, cfg, NULL)) != 0)
-                               error = spa_import_verbatim(name, cfg, NULL);
+                       if ((error = spa_import(name, cfg, NULL,
+                           ZFS_IMPORT_MISSING_LOG)) != 0) {
+                               error = spa_import(name, cfg, NULL,
+                                   ZFS_IMPORT_VERBATIM);
+                       }
                }
        }
 
index 353fd4fa627db988f138357e8418f6db25fa8199..9516697390efd5d212cd2834133e74775116a530 100644 (file)
@@ -40,6 +40,7 @@
 #include <zone.h>
 #include <grp.h>
 #include <pwd.h>
+#include <signal.h>
 #include <sys/mkdev.h>
 #include <sys/mntent.h>
 #include <sys/mnttab.h>
@@ -84,6 +85,7 @@ static int zfs_do_userspace(int argc, char **argv);
 static int zfs_do_python(int argc, char **argv);
 static int zfs_do_hold(int argc, char **argv);
 static int zfs_do_release(int argc, char **argv);
+static int zfs_do_diff(int argc, char **argv);
 
 /*
  * Enable a reasonable set of defaults for libumem debugging on DEBUG builds.
@@ -128,7 +130,8 @@ typedef enum {
        HELP_GROUPSPACE,
        HELP_HOLD,
        HELP_HOLDS,
-       HELP_RELEASE
+       HELP_RELEASE,
+       HELP_DIFF
 } zfs_help_t;
 
 typedef struct zfs_command {
@@ -180,6 +183,7 @@ static zfs_command_t command_table[] = {
        { "hold",       zfs_do_hold,            HELP_HOLD               },
        { "holds",      zfs_do_python,          HELP_HOLDS              },
        { "release",    zfs_do_release,         HELP_RELEASE            },
+       { "diff",       zfs_do_diff,            HELP_DIFF               },
 };
 
 #define        NCOMMAND        (sizeof (command_table) / sizeof (command_table[0]))
@@ -283,6 +287,9 @@ get_usage(zfs_help_t idx)
                return (gettext("\tholds [-r] <snapshot> ...\n"));
        case HELP_RELEASE:
                return (gettext("\trelease [-r] <tag> <snapshot> ...\n"));
+       case HELP_DIFF:
+               return (gettext("\tdiff [-FHt] <snapshot> "
+                   "[snapshot|filesystem]\n"));
        }
 
        abort();
@@ -624,8 +631,9 @@ zfs_do_clone(int argc, char **argv)
 
                clone = zfs_open(g_zfs, argv[1], ZFS_TYPE_DATASET);
                if (clone != NULL) {
-                       if ((ret = zfs_mount(clone, NULL, 0)) == 0)
-                               ret = zfs_share(clone);
+                       if (zfs_get_type(clone) != ZFS_TYPE_VOLUME)
+                               if ((ret = zfs_mount(clone, NULL, 0)) == 0)
+                                       ret = zfs_share(clone);
                        zfs_close(clone);
                }
        }
@@ -671,7 +679,7 @@ zfs_do_create(int argc, char **argv)
        int ret = 1;
        nvlist_t *props;
        uint64_t intval;
-       int canmount;
+       int canmount = ZFS_CANMOUNT_OFF;
 
        if (nvlist_alloc(&props, NV_UNIQUE_NAME, 0) != 0)
                nomem();
@@ -802,19 +810,20 @@ zfs_do_create(int argc, char **argv)
 
        if ((zhp = zfs_open(g_zfs, argv[0], ZFS_TYPE_DATASET)) == NULL)
                goto error;
+
+       ret = 0;
        /*
         * if the user doesn't want the dataset automatically mounted,
         * then skip the mount/share step
         */
-
-       canmount = zfs_prop_get_int(zhp, ZFS_PROP_CANMOUNT);
+       if (zfs_prop_valid_for_type(ZFS_PROP_CANMOUNT, type))
+               canmount = zfs_prop_get_int(zhp, ZFS_PROP_CANMOUNT);
 
        /*
         * Mount and/or share the new filesystem as appropriate.  We provide a
         * verbose error message to let the user know that their filesystem was
         * in fact created, even if we failed to mount or share it.
         */
-       ret = 0;
        if (canmount == ZFS_CANMOUNT_ON) {
                if (zfs_mount(zhp, NULL, 0) != 0) {
                        (void) fprintf(stderr, gettext("filesystem "
@@ -2888,7 +2897,7 @@ zfs_do_hold_rele_impl(int argc, char **argv, boolean_t holding)
                }
                if (holding) {
                        if (zfs_hold(zhp, delim+1, tag, recursive,
-                           temphold, B_FALSE) != 0)
+                           temphold, B_FALSE, -1, 0, 0) != 0)
                                ++errors;
                } else {
                        if (zfs_release(zhp, delim+1, tag, recursive) != 0)
@@ -2927,14 +2936,6 @@ zfs_do_release(int argc, char **argv)
        return (zfs_do_hold_rele_impl(argc, argv, B_FALSE));
 }
 
-typedef struct get_all_cbdata {
-       zfs_handle_t    **cb_handles;
-       size_t          cb_alloc;
-       size_t          cb_used;
-       uint_t          cb_types;
-       boolean_t       cb_verbose;
-} get_all_cbdata_t;
-
 #define        CHECK_SPINNER 30
 #define        SPINNER_TIME 3          /* seconds */
 #define        MOUNT_TIME 5            /* seconds */
@@ -2946,7 +2947,7 @@ get_one_dataset(zfs_handle_t *zhp, void *data)
        static int spinval = 0;
        static int spincheck = 0;
        static time_t last_spin_time = (time_t)0;
-       get_all_cbdata_t *cbp = data;
+       get_all_cb_t *cbp = data;
        zfs_type_t type = zfs_get_type(zhp);
 
        if (cbp->cb_verbose) {
@@ -2963,8 +2964,7 @@ get_one_dataset(zfs_handle_t *zhp, void *data)
        /*
         * Interate over any nested datasets.
         */
-       if (type == ZFS_TYPE_FILESYSTEM &&
-           zfs_iter_filesystems(zhp, get_one_dataset, data) != 0) {
+       if (zfs_iter_filesystems(zhp, get_one_dataset, data) != 0) {
                zfs_close(zhp);
                return (1);
        }
@@ -2972,46 +2972,25 @@ get_one_dataset(zfs_handle_t *zhp, void *data)
        /*
         * Skip any datasets whose type does not match.
         */
-       if ((type & cbp->cb_types) == 0) {
+       if ((type & ZFS_TYPE_FILESYSTEM) == 0) {
                zfs_close(zhp);
                return (0);
        }
-
-       if (cbp->cb_alloc == cbp->cb_used) {
-               zfs_handle_t **handles;
-
-               if (cbp->cb_alloc == 0)
-                       cbp->cb_alloc = 64;
-               else
-                       cbp->cb_alloc *= 2;
-
-               handles = safe_malloc(cbp->cb_alloc * sizeof (void *));
-
-               if (cbp->cb_handles) {
-                       bcopy(cbp->cb_handles, handles,
-                           cbp->cb_used * sizeof (void *));
-                       free(cbp->cb_handles);
-               }
-
-               cbp->cb_handles = handles;
-       }
-
-       cbp->cb_handles[cbp->cb_used++] = zhp;
+       libzfs_add_handle(cbp, zhp);
+       assert(cbp->cb_used <= cbp->cb_alloc);
 
        return (0);
 }
 
 static void
-get_all_datasets(uint_t types, zfs_handle_t ***dslist, size_t *count,
-    boolean_t verbose)
+get_all_datasets(zfs_handle_t ***dslist, size_t *count, boolean_t verbose)
 {
-       get_all_cbdata_t cb = { 0 };
-       cb.cb_types = types;
+       get_all_cb_t cb = { 0 };
        cb.cb_verbose = verbose;
+       cb.cb_getone = get_one_dataset;
 
        if (verbose)
                set_progress_header(gettext("Reading ZFS config"));
-
        (void) zfs_iter_root(g_zfs, get_one_dataset, &cb);
 
        *dslist = cb.cb_handles;
@@ -3021,33 +3000,6 @@ get_all_datasets(uint_t types, zfs_handle_t ***dslist, size_t *count,
                finish_progress(gettext("done."));
 }
 
-static int
-dataset_cmp(const void *a, const void *b)
-{
-       zfs_handle_t **za = (zfs_handle_t **)a;
-       zfs_handle_t **zb = (zfs_handle_t **)b;
-       char mounta[MAXPATHLEN];
-       char mountb[MAXPATHLEN];
-       boolean_t gota, gotb;
-
-       if ((gota = (zfs_get_type(*za) == ZFS_TYPE_FILESYSTEM)) != 0)
-               verify(zfs_prop_get(*za, ZFS_PROP_MOUNTPOINT, mounta,
-                   sizeof (mounta), NULL, NULL, 0, B_FALSE) == 0);
-       if ((gotb = (zfs_get_type(*zb) == ZFS_TYPE_FILESYSTEM)) != 0)
-               verify(zfs_prop_get(*zb, ZFS_PROP_MOUNTPOINT, mountb,
-                   sizeof (mountb), NULL, NULL, 0, B_FALSE) == 0);
-
-       if (gota && gotb)
-               return (strcmp(mounta, mountb));
-
-       if (gota)
-               return (-1);
-       if (gotb)
-               return (1);
-
-       return (strcmp(zfs_get_name(a), zfs_get_name(b)));
-}
-
 /*
  * Generic callback for sharing or mounting filesystems.  Because the code is so
  * similar, we have a common function with an extra parameter to determine which
@@ -3069,184 +3021,180 @@ share_mount_one(zfs_handle_t *zhp, int op, int flags, char *protocol,
        const char *cmdname = op == OP_SHARE ? "share" : "mount";
        struct mnttab mnt;
        uint64_t zoned, canmount;
-       zfs_type_t type = zfs_get_type(zhp);
        boolean_t shared_nfs, shared_smb;
 
-       assert(type & (ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME));
+       assert(zfs_get_type(zhp) & ZFS_TYPE_FILESYSTEM);
 
-       if (type == ZFS_TYPE_FILESYSTEM) {
-               /*
-                * Check to make sure we can mount/share this dataset.  If we
-                * are in the global zone and the filesystem is exported to a
-                * local zone, or if we are in a local zone and the
-                * filesystem is not exported, then it is an error.
-                */
-               zoned = zfs_prop_get_int(zhp, ZFS_PROP_ZONED);
+       /*
+        * Check to make sure we can mount/share this dataset.  If we
+        * are in the global zone and the filesystem is exported to a
+        * local zone, or if we are in a local zone and the
+        * filesystem is not exported, then it is an error.
+        */
+       zoned = zfs_prop_get_int(zhp, ZFS_PROP_ZONED);
 
-               if (zoned && getzoneid() == GLOBAL_ZONEID) {
-                       if (!explicit)
-                               return (0);
+       if (zoned && getzoneid() == GLOBAL_ZONEID) {
+               if (!explicit)
+                       return (0);
 
-                       (void) fprintf(stderr, gettext("cannot %s '%s': "
-                           "dataset is exported to a local zone\n"), cmdname,
-                           zfs_get_name(zhp));
-                       return (1);
+               (void) fprintf(stderr, gettext("cannot %s '%s': "
+                   "dataset is exported to a local zone\n"), cmdname,
+                   zfs_get_name(zhp));
+               return (1);
 
-               } else if (!zoned && getzoneid() != GLOBAL_ZONEID) {
-                       if (!explicit)
-                               return (0);
+       } else if (!zoned && getzoneid() != GLOBAL_ZONEID) {
+               if (!explicit)
+                       return (0);
 
-                       (void) fprintf(stderr, gettext("cannot %s '%s': "
-                           "permission denied\n"), cmdname,
-                           zfs_get_name(zhp));
-                       return (1);
-               }
+               (void) fprintf(stderr, gettext("cannot %s '%s': "
+                   "permission denied\n"), cmdname,
+                   zfs_get_name(zhp));
+               return (1);
+       }
 
-               /*
-                * Ignore any filesystems which don't apply to us. This
-                * includes those with a legacy mountpoint, or those with
-                * legacy share options.
-                */
-               verify(zfs_prop_get(zhp, ZFS_PROP_MOUNTPOINT, mountpoint,
-                   sizeof (mountpoint), NULL, NULL, 0, B_FALSE) == 0);
-               verify(zfs_prop_get(zhp, ZFS_PROP_SHARENFS, shareopts,
-                   sizeof (shareopts), NULL, NULL, 0, B_FALSE) == 0);
-               verify(zfs_prop_get(zhp, ZFS_PROP_SHARESMB, smbshareopts,
-                   sizeof (smbshareopts), NULL, NULL, 0, B_FALSE) == 0);
-
-               if (op == OP_SHARE && strcmp(shareopts, "off") == 0 &&
-                   strcmp(smbshareopts, "off") == 0) {
-                       if (!explicit)
-                               return (0);
+       /*
+        * Ignore any filesystems which don't apply to us. This
+        * includes those with a legacy mountpoint, or those with
+        * legacy share options.
+        */
+       verify(zfs_prop_get(zhp, ZFS_PROP_MOUNTPOINT, mountpoint,
+           sizeof (mountpoint), NULL, NULL, 0, B_FALSE) == 0);
+       verify(zfs_prop_get(zhp, ZFS_PROP_SHARENFS, shareopts,
+           sizeof (shareopts), NULL, NULL, 0, B_FALSE) == 0);
+       verify(zfs_prop_get(zhp, ZFS_PROP_SHARESMB, smbshareopts,
+           sizeof (smbshareopts), NULL, NULL, 0, B_FALSE) == 0);
+
+       if (op == OP_SHARE && strcmp(shareopts, "off") == 0 &&
+           strcmp(smbshareopts, "off") == 0) {
+               if (!explicit)
+                       return (0);
 
-                       (void) fprintf(stderr, gettext("cannot share '%s': "
-                           "legacy share\n"), zfs_get_name(zhp));
-                       (void) fprintf(stderr, gettext("use share(1M) to "
-                           "share this filesystem, or set "
-                           "sharenfs property on\n"));
-                       return (1);
-               }
+               (void) fprintf(stderr, gettext("cannot share '%s': "
+                   "legacy share\n"), zfs_get_name(zhp));
+               (void) fprintf(stderr, gettext("use share(1M) to "
+                   "share this filesystem, or set "
+                   "sharenfs property on\n"));
+               return (1);
+       }
 
-               /*
-                * We cannot share or mount legacy filesystems. If the
-                * shareopts is non-legacy but the mountpoint is legacy, we
-                * treat it as a legacy share.
-                */
-               if (strcmp(mountpoint, "legacy") == 0) {
-                       if (!explicit)
-                               return (0);
+       /*
+        * We cannot share or mount legacy filesystems. If the
+        * shareopts is non-legacy but the mountpoint is legacy, we
+        * treat it as a legacy share.
+        */
+       if (strcmp(mountpoint, "legacy") == 0) {
+               if (!explicit)
+                       return (0);
 
-                       (void) fprintf(stderr, gettext("cannot %s '%s': "
-                           "legacy mountpoint\n"), cmdname, zfs_get_name(zhp));
-                       (void) fprintf(stderr, gettext("use %s(1M) to "
-                           "%s this filesystem\n"), cmdname, cmdname);
-                       return (1);
-               }
+               (void) fprintf(stderr, gettext("cannot %s '%s': "
+                   "legacy mountpoint\n"), cmdname, zfs_get_name(zhp));
+               (void) fprintf(stderr, gettext("use %s(1M) to "
+                   "%s this filesystem\n"), cmdname, cmdname);
+               return (1);
+       }
 
-               if (strcmp(mountpoint, "none") == 0) {
-                       if (!explicit)
-                               return (0);
+       if (strcmp(mountpoint, "none") == 0) {
+               if (!explicit)
+                       return (0);
 
-                       (void) fprintf(stderr, gettext("cannot %s '%s': no "
-                           "mountpoint set\n"), cmdname, zfs_get_name(zhp));
-                       return (1);
-               }
+               (void) fprintf(stderr, gettext("cannot %s '%s': no "
+                   "mountpoint set\n"), cmdname, zfs_get_name(zhp));
+               return (1);
+       }
 
-               /*
-                * canmount     explicit        outcome
-                * on           no              pass through
-                * on           yes             pass through
-                * off          no              return 0
-                * off          yes             display error, return 1
-                * noauto       no              return 0
-                * noauto       yes             pass through
-                */
-               canmount = zfs_prop_get_int(zhp, ZFS_PROP_CANMOUNT);
-               if (canmount == ZFS_CANMOUNT_OFF) {
+       /*
+        * canmount     explicit        outcome
+        * on           no              pass through
+        * on           yes             pass through
+        * off          no              return 0
+        * off          yes             display error, return 1
+        * noauto       no              return 0
+        * noauto       yes             pass through
+        */
+       canmount = zfs_prop_get_int(zhp, ZFS_PROP_CANMOUNT);
+       if (canmount == ZFS_CANMOUNT_OFF) {
+               if (!explicit)
+                       return (0);
+
+               (void) fprintf(stderr, gettext("cannot %s '%s': "
+                   "'canmount' property is set to 'off'\n"), cmdname,
+                   zfs_get_name(zhp));
+               return (1);
+       } else if (canmount == ZFS_CANMOUNT_NOAUTO && !explicit) {
+               return (0);
+       }
+
+       /*
+        * At this point, we have verified that the mountpoint and/or
+        * shareopts are appropriate for auto management. If the
+        * filesystem is already mounted or shared, return (failing
+        * for explicit requests); otherwise mount or share the
+        * filesystem.
+        */
+       switch (op) {
+       case OP_SHARE:
+
+               shared_nfs = zfs_is_shared_nfs(zhp, NULL);
+               shared_smb = zfs_is_shared_smb(zhp, NULL);
+
+               if (shared_nfs && shared_smb ||
+                   (shared_nfs && strcmp(shareopts, "on") == 0 &&
+                   strcmp(smbshareopts, "off") == 0) ||
+                   (shared_smb && strcmp(smbshareopts, "on") == 0 &&
+                   strcmp(shareopts, "off") == 0)) {
                        if (!explicit)
                                return (0);
 
-                       (void) fprintf(stderr, gettext("cannot %s '%s': "
-                           "'canmount' property is set to 'off'\n"), cmdname,
+                       (void) fprintf(stderr, gettext("cannot share "
+                           "'%s': filesystem already shared\n"),
                            zfs_get_name(zhp));
                        return (1);
-               } else if (canmount == ZFS_CANMOUNT_NOAUTO && !explicit) {
-                       return (0);
                }
 
-               /*
-                * At this point, we have verified that the mountpoint and/or
-                * shareopts are appropriate for auto management. If the
-                * filesystem is already mounted or shared, return (failing
-                * for explicit requests); otherwise mount or share the
-                * filesystem.
-                */
-               switch (op) {
-               case OP_SHARE:
-
-                       shared_nfs = zfs_is_shared_nfs(zhp, NULL);
-                       shared_smb = zfs_is_shared_smb(zhp, NULL);
-
-                       if (shared_nfs && shared_smb ||
-                           (shared_nfs && strcmp(shareopts, "on") == 0 &&
-                           strcmp(smbshareopts, "off") == 0) ||
-                           (shared_smb && strcmp(smbshareopts, "on") == 0 &&
-                           strcmp(shareopts, "off") == 0)) {
-                               if (!explicit)
-                                       return (0);
+               if (!zfs_is_mounted(zhp, NULL) &&
+                   zfs_mount(zhp, NULL, 0) != 0)
+                       return (1);
 
-                               (void) fprintf(stderr, gettext("cannot share "
-                                   "'%s': filesystem already shared\n"),
-                                   zfs_get_name(zhp));
+               if (protocol == NULL) {
+                       if (zfs_shareall(zhp) != 0)
                                return (1);
-                       }
-
-                       if (!zfs_is_mounted(zhp, NULL) &&
-                           zfs_mount(zhp, NULL, 0) != 0)
+               } else if (strcmp(protocol, "nfs") == 0) {
+                       if (zfs_share_nfs(zhp))
                                return (1);
-
-                       if (protocol == NULL) {
-                               if (zfs_shareall(zhp) != 0)
-                                       return (1);
-                       } else if (strcmp(protocol, "nfs") == 0) {
-                               if (zfs_share_nfs(zhp))
-                                       return (1);
-                       } else if (strcmp(protocol, "smb") == 0) {
-                               if (zfs_share_smb(zhp))
-                                       return (1);
-                       } else {
-                               (void) fprintf(stderr, gettext("cannot share "
-                                   "'%s': invalid share type '%s' "
-                                   "specified\n"),
-                                   zfs_get_name(zhp), protocol);
+               } else if (strcmp(protocol, "smb") == 0) {
+                       if (zfs_share_smb(zhp))
                                return (1);
-                       }
-
-                       break;
+               } else {
+                       (void) fprintf(stderr, gettext("cannot share "
+                           "'%s': invalid share type '%s' "
+                           "specified\n"),
+                           zfs_get_name(zhp), protocol);
+                       return (1);
+               }
 
-               case OP_MOUNT:
-                       if (options == NULL)
-                               mnt.mnt_mntopts = "";
-                       else
-                               mnt.mnt_mntopts = (char *)options;
+               break;
 
-                       if (!hasmntopt(&mnt, MNTOPT_REMOUNT) &&
-                           zfs_is_mounted(zhp, NULL)) {
-                               if (!explicit)
-                                       return (0);
+       case OP_MOUNT:
+               if (options == NULL)
+                       mnt.mnt_mntopts = "";
+               else
+                       mnt.mnt_mntopts = (char *)options;
 
-                               (void) fprintf(stderr, gettext("cannot mount "
-                                   "'%s': filesystem already mounted\n"),
-                                   zfs_get_name(zhp));
-                               return (1);
-                       }
+               if (!hasmntopt(&mnt, MNTOPT_REMOUNT) &&
+                   zfs_is_mounted(zhp, NULL)) {
+                       if (!explicit)
+                               return (0);
 
-                       if (zfs_mount(zhp, options, flags) != 0)
-                               return (1);
-                       break;
+                       (void) fprintf(stderr, gettext("cannot mount "
+                           "'%s': filesystem already mounted\n"),
+                           zfs_get_name(zhp));
+                       return (1);
                }
-       } else
-               assert(op == OP_SHARE);
+
+               if (zfs_mount(zhp, options, flags) != 0)
+                       return (1);
+               break;
+       }
 
        return (0);
 }
@@ -3308,7 +3256,7 @@ share_mount(int op, int argc, char **argv)
        boolean_t verbose = B_FALSE;
        int c, ret = 0;
        char *options = NULL;
-       int types, flags = 0;
+       int flags = 0;
 
        /* check options */
        while ((c = getopt(argc, argv, op == OP_MOUNT ? ":avo:O" : "a"))
@@ -3358,13 +3306,9 @@ share_mount(int op, int argc, char **argv)
                size_t i, count = 0;
                char *protocol = NULL;
 
-               if (op == OP_MOUNT) {
-                       types = ZFS_TYPE_FILESYSTEM;
-               } else if (argc > 0) {
-                       if (strcmp(argv[0], "nfs") == 0 ||
-                           strcmp(argv[0], "smb") == 0) {
-                               types = ZFS_TYPE_FILESYSTEM;
-                       } else {
+               if (op == OP_SHARE && argc > 0) {
+                       if (strcmp(argv[0], "nfs") != 0 &&
+                           strcmp(argv[0], "smb") != 0) {
                                (void) fprintf(stderr, gettext("share type "
                                    "must be 'nfs' or 'smb'\n"));
                                usage(B_FALSE);
@@ -3372,8 +3316,6 @@ share_mount(int op, int argc, char **argv)
                        protocol = argv[0];
                        argc--;
                        argv++;
-               } else {
-                       types = ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME;
                }
 
                if (argc != 0) {
@@ -3382,12 +3324,12 @@ share_mount(int op, int argc, char **argv)
                }
 
                start_progress_timer();
-               get_all_datasets(types, &dslist, &count, verbose);
+               get_all_datasets(&dslist, &count, verbose);
 
                if (count == 0)
                        return (0);
 
-               qsort(dslist, count, sizeof (void *), dataset_cmp);
+               qsort(dslist, count, sizeof (void *), libzfs_dataset_cmp);
 
                for (i = 0; i < count; i++) {
                        if (verbose)
@@ -3427,17 +3369,14 @@ share_mount(int op, int argc, char **argv)
        } else {
                zfs_handle_t *zhp;
 
-               types = ZFS_TYPE_FILESYSTEM;
-               if (op == OP_SHARE)
-                       types |= ZFS_TYPE_VOLUME;
-
                if (argc > 1) {
                        (void) fprintf(stderr,
                            gettext("too many arguments\n"));
                        usage(B_FALSE);
                }
 
-               if ((zhp = zfs_open(g_zfs, argv[0], types)) == NULL) {
+               if ((zhp = zfs_open(g_zfs, argv[0],
+                   ZFS_TYPE_FILESYSTEM)) == NULL) {
                        ret = 1;
                } else {
                        ret = share_mount_one(zhp, op, flags, NULL, B_TRUE,
@@ -3616,7 +3555,7 @@ unshare_unmount(int op, int argc, char **argv)
        int do_all = 0;
        int flags = 0;
        int ret = 0;
-       int types, c;
+       int c;
        zfs_handle_t *zhp;
        char nfs_mnt_prop[ZFS_MAXPROPLEN];
        char sharesmb[ZFS_MAXPROPLEN];
@@ -3792,68 +3731,63 @@ unshare_unmount(int op, int argc, char **argv)
                        return (unshare_unmount_path(op, argv[0],
                            flags, B_FALSE));
 
-               types = ZFS_TYPE_FILESYSTEM;
-               if (op == OP_SHARE)
-                       types |= ZFS_TYPE_VOLUME;
-
-               if ((zhp = zfs_open(g_zfs, argv[0], types)) == NULL)
+               if ((zhp = zfs_open(g_zfs, argv[0],
+                   ZFS_TYPE_FILESYSTEM)) == NULL)
                        return (1);
 
-               if (zfs_get_type(zhp) == ZFS_TYPE_FILESYSTEM) {
-                       verify(zfs_prop_get(zhp, op == OP_SHARE ?
-                           ZFS_PROP_SHARENFS : ZFS_PROP_MOUNTPOINT,
-                           nfs_mnt_prop, sizeof (nfs_mnt_prop), NULL,
-                           NULL, 0, B_FALSE) == 0);
+               verify(zfs_prop_get(zhp, op == OP_SHARE ?
+                   ZFS_PROP_SHARENFS : ZFS_PROP_MOUNTPOINT,
+                   nfs_mnt_prop, sizeof (nfs_mnt_prop), NULL,
+                   NULL, 0, B_FALSE) == 0);
 
-                       switch (op) {
-                       case OP_SHARE:
-                               verify(zfs_prop_get(zhp, ZFS_PROP_SHARENFS,
-                                   nfs_mnt_prop,
-                                   sizeof (nfs_mnt_prop),
-                                   NULL, NULL, 0, B_FALSE) == 0);
-                               verify(zfs_prop_get(zhp, ZFS_PROP_SHARESMB,
-                                   sharesmb, sizeof (sharesmb), NULL, NULL,
-                                   0, B_FALSE) == 0);
-
-                               if (strcmp(nfs_mnt_prop, "off") == 0 &&
-                                   strcmp(sharesmb, "off") == 0) {
-                                       (void) fprintf(stderr, gettext("cannot "
-                                           "unshare '%s': legacy share\n"),
-                                           zfs_get_name(zhp));
-                                       (void) fprintf(stderr, gettext("use "
-                                           "unshare(1M) to unshare this "
-                                           "filesystem\n"));
-                                       ret = 1;
-                               } else if (!zfs_is_shared(zhp)) {
-                                       (void) fprintf(stderr, gettext("cannot "
-                                           "unshare '%s': not currently "
-                                           "shared\n"), zfs_get_name(zhp));
-                                       ret = 1;
-                               } else if (zfs_unshareall(zhp) != 0) {
-                                       ret = 1;
-                               }
-                               break;
+               switch (op) {
+               case OP_SHARE:
+                       verify(zfs_prop_get(zhp, ZFS_PROP_SHARENFS,
+                           nfs_mnt_prop,
+                           sizeof (nfs_mnt_prop),
+                           NULL, NULL, 0, B_FALSE) == 0);
+                       verify(zfs_prop_get(zhp, ZFS_PROP_SHARESMB,
+                           sharesmb, sizeof (sharesmb), NULL, NULL,
+                           0, B_FALSE) == 0);
+
+                       if (strcmp(nfs_mnt_prop, "off") == 0 &&
+                           strcmp(sharesmb, "off") == 0) {
+                               (void) fprintf(stderr, gettext("cannot "
+                                   "unshare '%s': legacy share\n"),
+                                   zfs_get_name(zhp));
+                               (void) fprintf(stderr, gettext("use "
+                                   "unshare(1M) to unshare this "
+                                   "filesystem\n"));
+                               ret = 1;
+                       } else if (!zfs_is_shared(zhp)) {
+                               (void) fprintf(stderr, gettext("cannot "
+                                   "unshare '%s': not currently "
+                                   "shared\n"), zfs_get_name(zhp));
+                               ret = 1;
+                       } else if (zfs_unshareall(zhp) != 0) {
+                               ret = 1;
+                       }
+                       break;
 
-                       case OP_MOUNT:
-                               if (strcmp(nfs_mnt_prop, "legacy") == 0) {
-                                       (void) fprintf(stderr, gettext("cannot "
-                                           "unmount '%s': legacy "
-                                           "mountpoint\n"), zfs_get_name(zhp));
-                                       (void) fprintf(stderr, gettext("use "
-                                           "umount(1M) to unmount this "
-                                           "filesystem\n"));
-                                       ret = 1;
-                               } else if (!zfs_is_mounted(zhp, NULL)) {
-                                       (void) fprintf(stderr, gettext("cannot "
-                                           "unmount '%s': not currently "
-                                           "mounted\n"),
-                                           zfs_get_name(zhp));
-                                       ret = 1;
-                               } else if (zfs_unmountall(zhp, flags) != 0) {
-                                       ret = 1;
-                               }
-                               break;
+               case OP_MOUNT:
+                       if (strcmp(nfs_mnt_prop, "legacy") == 0) {
+                               (void) fprintf(stderr, gettext("cannot "
+                                   "unmount '%s': legacy "
+                                   "mountpoint\n"), zfs_get_name(zhp));
+                               (void) fprintf(stderr, gettext("use "
+                                   "umount(1M) to unmount this "
+                                   "filesystem\n"));
+                               ret = 1;
+                       } else if (!zfs_is_mounted(zhp, NULL)) {
+                               (void) fprintf(stderr, gettext("cannot "
+                                   "unmount '%s': not currently "
+                                   "mounted\n"),
+                                   zfs_get_name(zhp));
+                               ret = 1;
+                       } else if (zfs_unmountall(zhp, flags) != 0) {
+                               ret = 1;
                        }
+                       break;
                }
 
                zfs_close(zhp);
@@ -4047,6 +3981,99 @@ find_command_idx(char *command, int *idx)
        return (1);
 }
 
+/*
+ * Parse the command-line arguments for a diff request and run it.
+ *
+ * Accepts the -F, -H and -t flags followed by one or two names, opens the
+ * filesystem named by the text before the '@' of the first name (or of the
+ * second name when the first begins with '@'), and hands both names to
+ * zfs_show_diffs() with output directed at STDOUT_FILENO.
+ *
+ * Returns 1 if the filesystem cannot be opened or zfs_show_diffs() reports
+ * a failure, and 0 otherwise.  Argument errors are reported through
+ * usage(B_FALSE); NOTE(review): the code following each usage() call
+ * appears to assume that usage() does not return -- confirm.
+ */
+static int
+zfs_do_diff(int argc, char **argv)
+{
+       zfs_handle_t *zhp;
+       int flags = 0;
+       char *tosnap = NULL;
+       char *fromsnap = NULL;
+       char *atp, *copy;
+       int err;
+       int c;
+
+       while ((c = getopt(argc, argv, "FHt")) != -1) {
+               switch (c) {
+               case 'F':
+                       flags |= ZFS_DIFF_CLASSIFY;
+                       break;
+               case 'H':
+                       flags |= ZFS_DIFF_PARSEABLE;
+                       break;
+               case 't':
+                       flags |= ZFS_DIFF_TIMESTAMP;
+                       break;
+               default:
+                       (void) fprintf(stderr,
+                           gettext("invalid option '%c'\n"), optopt);
+                       usage(B_FALSE);
+               }
+       }
+
+       argc -= optind;
+       argv += optind;
+
+       if (argc < 1) {
+               (void) fprintf(stderr,
+                   gettext("must provide at least one snapshot name\n"));
+               usage(B_FALSE);
+       }
+
+       if (argc > 2) {
+               (void) fprintf(stderr, gettext("too many arguments\n"));
+               usage(B_FALSE);
+       }
+
+       fromsnap = argv[0];
+       tosnap = (argc == 2) ? argv[1] : NULL;
+
+       /*
+        * Work on a private copy of whichever name carries the filesystem
+        * part, so that truncating it at '@' leaves argv untouched.
+        */
+       copy = NULL;
+       if (*fromsnap != '@')
+               copy = strdup(fromsnap);
+       else if (tosnap)
+               copy = strdup(tosnap);
+       if (copy == NULL)
+               usage(B_FALSE);
+
+       if ((atp = strchr(copy, '@')) != NULL)
+               *atp = '\0';
+
+       /* The copy is only needed for the open; release it on every path. */
+       zhp = zfs_open(g_zfs, copy, ZFS_TYPE_FILESYSTEM);
+       free(copy);
+       if (zhp == NULL)
+               return (1);
+
+       /*
+        * Ignore SIGPIPE so that the library can give us
+        * information on any failure
+        */
+       (void) sigignore(SIGPIPE);
+
+       err = zfs_show_diffs(zhp, STDOUT_FILENO, fromsnap, tosnap, flags);
+
+       zfs_close(zhp);
+
+       return (err != 0);
+}
+
 int
 main(int argc, char **argv)
 {
index cd967a84516d7867460bb279a9b50a0ae6c56567..87751e315e89e932a088e3d4bde3713e4d4421c9 100644 (file)
@@ -267,7 +267,7 @@ calculate_range(const char *dataset, err_type_t type, int level, char *range,
        }
 
        if (record->zi_object == 0) {
-               dn = os->os_meta_dnode;
+               dn = DMU_META_DNODE(os);
        } else {
                err = dnode_hold(os, record->zi_object, FTAG, &dn);
                if (err != 0) {
@@ -318,7 +318,7 @@ calculate_range(const char *dataset, err_type_t type, int level, char *range,
        ret = 0;
 out:
        if (dn) {
-               if (dn != os->os_meta_dnode)
+               if (dn != DMU_META_DNODE(os))
                        dnode_rele(dn, FTAG);
        }
        if (os)
index ab04e422a9b2b8a460abadd377300957f6ed8838..60c53ceb3fce2d52259185ad7a88ff3f3a5c903a 100644 (file)
@@ -233,7 +233,7 @@ usage(void)
            "\t\tInject a fault into a particular device or the device's\n"
            "\t\tlabel.  Label injection can either be 'nvlist', 'uber',\n "
            "\t\t'pad1', or 'pad2'.\n"
-           "\t\t'errno' can either be 'nxio' (the default) or 'io'.\n"
+           "\t\t'errno' can be 'nxio' (the default), 'io', or 'dtl'.\n"
            "\n"
            "\tzinject -d device -A <degrade|fault> pool\n"
            "\t\tPerform a specific action on a particular device\n"
@@ -395,17 +395,25 @@ print_panic_handler(int id, const char *pool, zinject_record_t *record,
 static int
 print_all_handlers(void)
 {
-       int count = 0;
+       int count = 0, total = 0;
 
        (void) iter_handlers(print_device_handler, &count);
-       (void) printf("\n");
-       count = 0;
+       if (count > 0) {
+               total += count;
+               (void) printf("\n");
+               count = 0;
+       }
+
        (void) iter_handlers(print_data_handler, &count);
-       (void) printf("\n");
-       count = 0;
+       if (count > 0) {
+               total += count;
+               (void) printf("\n");
+               count = 0;
+       }
+
        (void) iter_handlers(print_panic_handler, &count);
 
-       return (count);
+       return (count + total);
 }
 
 /* ARGSUSED */
@@ -627,6 +635,8 @@ main(int argc, char **argv)
                                error = ECKSUM;
                        } else if (strcasecmp(optarg, "nxio") == 0) {
                                error = ENXIO;
+                       } else if (strcasecmp(optarg, "dtl") == 0) {
+                               error = ECHILD;
                        } else {
                                (void) fprintf(stderr, "invalid error type "
                                    "'%s': must be 'io', 'checksum' or "
index 62c4be832b8bdf2725471a1c2a8cbbc61c8c2277..8aa985b1a5526da9d32e8c6ab4ae7e4da0dd9453 100644 (file)
@@ -202,12 +202,14 @@ get_usage(zpool_help_t idx) {
                return (gettext("\thistory [-il] [<pool>] ...\n"));
        case HELP_IMPORT:
                return (gettext("\timport [-d dir] [-D]\n"
-                   "\timport [-d dir | -c cachefile] [-n] -F <pool | id>\n"
+                   "\timport [-d dir | -c cachefile] [-F [-n]] <pool | id>\n"
                    "\timport [-o mntopts] [-o property=value] ... \n"
-                   "\t    [-d dir | -c cachefile] [-D] [-f] [-R root] -a\n"
+                   "\t    [-d dir | -c cachefile] [-D] [-f] [-m] [-N] "
+                   "[-R root] [-F [-n]] -a\n"
                    "\timport [-o mntopts] [-o property=value] ... \n"
-                   "\t    [-d dir | -c cachefile] [-D] [-f] [-R root] "
-                   "<pool | id> [newpool]\n"));
+                   "\t    [-d dir | -c cachefile] [-D] [-f] [-m] [-N] "
+                   "[-R root] [-F [-n]]\n"
+                   "\t    <pool | id> [newpool]\n"));
        case HELP_IOSTAT:
                return (gettext("\tiostat [-v] [-T d|u] [pool] ... [interval "
                    "[count]]\n"));
@@ -1499,7 +1501,7 @@ show_import(nvlist_t *config)
  */
 static int
 do_import(nvlist_t *config, const char *newname, const char *mntopts,
-    int force, nvlist_t *props, boolean_t do_verbatim)
+    nvlist_t *props, int flags)
 {
        zpool_handle_t *zhp;
        char *name;
@@ -1517,7 +1519,8 @@ do_import(nvlist_t *config, const char *newname, const char *mntopts,
                (void) fprintf(stderr, gettext("cannot import '%s': pool "
                    "is formatted using a newer ZFS version\n"), name);
                return (1);
-       } else if (state != POOL_STATE_EXPORTED && !force) {
+       } else if (state != POOL_STATE_EXPORTED &&
+           !(flags & ZFS_IMPORT_ANY_HOST)) {
                uint64_t hostid;
 
                if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_HOSTID,
@@ -1551,7 +1554,7 @@ do_import(nvlist_t *config, const char *newname, const char *mntopts,
                }
        }
 
-       if (zpool_import_props(g_zfs, config, newname, props, do_verbatim) != 0)
+       if (zpool_import_props(g_zfs, config, newname, props, flags) != 0)
                return (1);
 
        if (newname != NULL)
@@ -1561,6 +1564,7 @@ do_import(nvlist_t *config, const char *newname, const char *mntopts,
                return (1);
 
        if (zpool_get_state(zhp) != POOL_STATE_UNAVAIL &&
+           !(flags & ZFS_IMPORT_ONLY) &&
            zpool_enable_datasets(zhp, mntopts, 0) != 0) {
                zpool_close(zhp);
                return (1);
@@ -1602,6 +1606,11 @@ do_import(nvlist_t *config, const char *newname, const char *mntopts,
  *
  *       -n     See if rewind would work, but don't actually rewind.
  *
+ *       -N     Import the pool but don't mount datasets.
+ *
+ *       -T     Specify a starting txg to use for import. This option is
+ *             intentionally undocumented and exists for testing purposes.
+ *
  *       -a    Import all pools found.
  *
  *       -o    Set property=value and/or temporary mount options (without '=').
@@ -1620,7 +1629,6 @@ zpool_do_import(int argc, char **argv)
        boolean_t do_all = B_FALSE;
        boolean_t do_destroyed = B_FALSE;
        char *mntopts = NULL;
-       boolean_t do_force = B_FALSE;
        nvpair_t *elem;
        nvlist_t *config;
        uint64_t searchguid = 0;
@@ -1630,17 +1638,18 @@ zpool_do_import(int argc, char **argv)
        nvlist_t *policy = NULL;
        nvlist_t *props = NULL;
        boolean_t first;
-       boolean_t do_verbatim = B_FALSE;
+       int flags = ZFS_IMPORT_NORMAL;
        uint32_t rewind_policy = ZPOOL_NO_REWIND;
        boolean_t dryrun = B_FALSE;
        boolean_t do_rewind = B_FALSE;
        boolean_t xtreme_rewind = B_FALSE;
-       uint64_t pool_state;
+       uint64_t pool_state, txg = -1ULL;
        char *cachefile = NULL;
        importargs_t idata = { 0 };
+       char *endptr;
 
        /* check options */
-       while ((c = getopt(argc, argv, ":aCc:d:DEfFno:rR:VX")) != -1) {
+       while ((c = getopt(argc, argv, ":aCc:d:DEfFmnNo:rR:T:VX")) != -1) {
                switch (c) {
                case 'a':
                        do_all = B_TRUE;
@@ -1665,14 +1674,20 @@ zpool_do_import(int argc, char **argv)
                        do_destroyed = B_TRUE;
                        break;
                case 'f':
-                       do_force = B_TRUE;
+                       flags |= ZFS_IMPORT_ANY_HOST;
                        break;
                case 'F':
                        do_rewind = B_TRUE;
                        break;
+               case 'm':
+                       flags |= ZFS_IMPORT_MISSING_LOG;
+                       break;
                case 'n':
                        dryrun = B_TRUE;
                        break;
+               case 'N':
+                       flags |= ZFS_IMPORT_ONLY;
+                       break;
                case 'o':
                        if ((propval = strchr(optarg, '=')) != NULL) {
                                *propval = '\0';
@@ -1696,8 +1711,18 @@ zpool_do_import(int argc, char **argv)
                            ZPOOL_PROP_CACHEFILE), "none", &props, B_TRUE))
                                goto error;
                        break;
+               case 'T':
+                       errno = 0;
+                       txg = strtoull(optarg, &endptr, 10);
+                       if (errno != 0 || *endptr != '\0') {
+                               (void) fprintf(stderr,
+                                   gettext("invalid txg value\n"));
+                               usage(B_FALSE);
+                       }
+                       rewind_policy = ZPOOL_DO_REWIND | ZPOOL_EXTREME_REWIND;
+                       break;
                case 'V':
-                       do_verbatim = B_TRUE;
+                       flags |= ZFS_IMPORT_VERBATIM;
                        break;
                case 'X':
                        xtreme_rewind = B_TRUE;
@@ -1736,6 +1761,7 @@ zpool_do_import(int argc, char **argv)
 
        /* In the future, we can capture further policy and include it here */
        if (nvlist_alloc(&policy, NV_UNIQUE_NAME, 0) != 0 ||
+           nvlist_add_uint64(policy, ZPOOL_REWIND_REQUEST_TXG, txg) != 0 ||
            nvlist_add_uint32(policy, ZPOOL_REWIND_REQUEST, rewind_policy) != 0)
                goto error;
 
@@ -1869,7 +1895,7 @@ zpool_do_import(int argc, char **argv)
 
                        if (do_all) {
                                err |= do_import(config, NULL, mntopts,
-                                   do_force, props, do_verbatim);
+                                   props, flags);
                        } else {
                                show_import(config);
                        }
@@ -1918,7 +1944,7 @@ zpool_do_import(int argc, char **argv)
                        err = B_TRUE;
                } else {
                        err |= do_import(found_config, argc == 1 ? NULL :
-                           argv[1], mntopts, do_force, props, do_verbatim);
+                           argv[1], mntopts, props, flags);
                }
        }
 
@@ -3217,7 +3243,7 @@ void
 print_scan_status(pool_scan_stat_t *ps)
 {
        time_t start, end;
-       uint64_t elapsed, mins_left;
+       uint64_t elapsed, mins_left, hours_left;
        uint64_t pass_exam, examined, total;
        uint_t rate;
        double fraction_done;
@@ -3294,15 +3320,24 @@ print_scan_status(pool_scan_stat_t *ps)
        rate = pass_exam / elapsed;
        rate = rate ? rate : 1;
        mins_left = ((total - examined) / rate) / 60;
+       hours_left = mins_left / 60;
 
        zfs_nicenum(examined, examined_buf, sizeof (examined_buf));
        zfs_nicenum(total, total_buf, sizeof (total_buf));
        zfs_nicenum(rate, rate_buf, sizeof (rate_buf));
 
-       (void) printf(gettext("    %s scanned out of %s at "
-           "%s/s, %lluh%um to go\n"), examined_buf, total_buf, rate_buf,
-           (u_longlong_t)(mins_left / 60),
-           (uint_t)(mins_left % 60));
+       /*
+        * Do not print an estimated time when it would be 30 days or more.
+        */
+       (void) printf(gettext("    %s scanned out of %s at %s/s"),
+           examined_buf, total_buf, rate_buf);
+       if (hours_left < (30 * 24)) {
+               (void) printf(gettext(", %lluh%um to go\n"),
+                   (u_longlong_t)hours_left, (uint_t)(mins_left % 60));
+       } else {
+               (void) printf(gettext(
+                   ", (scan is slow, no estimated time)\n"));
+       }
 
        if (ps->pss_func == POOL_SCAN_RESILVER) {
                (void) printf(gettext("    %s resilvered, %.2f%% done\n"),
@@ -4009,6 +4044,9 @@ zpool_do_upgrade(int argc, char **argv)
                (void) printf(gettext(" 25  Improved scrub stats\n"));
                (void) printf(gettext(" 26  Improved snapshot deletion "
                    "performance\n"));
+               (void) printf(gettext(" 27  Improved snapshot creation "
+                   "performance\n"));
+               (void) printf(gettext(" 28  Multiple vdev replacements\n"));
                (void) printf(gettext("\nFor more information on a particular "
                    "version, including supported releases,\n"));
                (void) printf(gettext("see the ZFS Administration Guide.\n\n"));
index eed92ec72ebb5546bf75d1f3d2a21c036067b68b..b2d81b5588822572a97e020c884e29acb13c2532 100644 (file)
@@ -1102,7 +1102,7 @@ ztest_bt_bonus(dmu_buf_t *db)
 #define        lrz_bonustype   lr_rdev
 #define        lrz_bonuslen    lr_crtime[1]
 
-static uint64_t
+static void
 ztest_log_create(ztest_ds_t *zd, dmu_tx_t *tx, lr_create_t *lr)
 {
        char *name = (void *)(lr + 1);          /* name follows lr */
@@ -1110,40 +1110,41 @@ ztest_log_create(ztest_ds_t *zd, dmu_tx_t *tx, lr_create_t *lr)
        itx_t *itx;
 
        if (zil_replaying(zd->zd_zilog, tx))
-               return (0);
+               return;
 
        itx = zil_itx_create(TX_CREATE, sizeof (*lr) + namesize);
        bcopy(&lr->lr_common + 1, &itx->itx_lr + 1,
            sizeof (*lr) + namesize - sizeof (lr_t));
 
-       return (zil_itx_assign(zd->zd_zilog, itx, tx));
+       zil_itx_assign(zd->zd_zilog, itx, tx);
 }
 
-static uint64_t
-ztest_log_remove(ztest_ds_t *zd, dmu_tx_t *tx, lr_remove_t *lr)
+static void
+ztest_log_remove(ztest_ds_t *zd, dmu_tx_t *tx, lr_remove_t *lr, uint64_t object)
 {
        char *name = (void *)(lr + 1);          /* name follows lr */
        size_t namesize = strlen(name) + 1;
        itx_t *itx;
 
        if (zil_replaying(zd->zd_zilog, tx))
-               return (0);
+               return;
 
        itx = zil_itx_create(TX_REMOVE, sizeof (*lr) + namesize);
        bcopy(&lr->lr_common + 1, &itx->itx_lr + 1,
            sizeof (*lr) + namesize - sizeof (lr_t));
 
-       return (zil_itx_assign(zd->zd_zilog, itx, tx));
+       itx->itx_oid = object;
+       zil_itx_assign(zd->zd_zilog, itx, tx);
 }
 
-static uint64_t
+static void
 ztest_log_write(ztest_ds_t *zd, dmu_tx_t *tx, lr_write_t *lr)
 {
        itx_t *itx;
        itx_wr_state_t write_state = ztest_random(WR_NUM_STATES);
 
        if (zil_replaying(zd->zd_zilog, tx))
-               return (0);
+               return;
 
        if (lr->lr_length > ZIL_MAX_LOG_DATA)
                write_state = WR_INDIRECT;
@@ -1166,37 +1167,39 @@ ztest_log_write(ztest_ds_t *zd, dmu_tx_t *tx, lr_write_t *lr)
        bcopy(&lr->lr_common + 1, &itx->itx_lr + 1,
            sizeof (*lr) - sizeof (lr_t));
 
-       return (zil_itx_assign(zd->zd_zilog, itx, tx));
+       zil_itx_assign(zd->zd_zilog, itx, tx);
 }
 
-static uint64_t
+static void
 ztest_log_truncate(ztest_ds_t *zd, dmu_tx_t *tx, lr_truncate_t *lr)
 {
        itx_t *itx;
 
        if (zil_replaying(zd->zd_zilog, tx))
-               return (0);
+               return;
 
        itx = zil_itx_create(TX_TRUNCATE, sizeof (*lr));
        bcopy(&lr->lr_common + 1, &itx->itx_lr + 1,
            sizeof (*lr) - sizeof (lr_t));
 
-       return (zil_itx_assign(zd->zd_zilog, itx, tx));
+       itx->itx_sync = B_FALSE;
+       zil_itx_assign(zd->zd_zilog, itx, tx);
 }
 
-static uint64_t
+static void
 ztest_log_setattr(ztest_ds_t *zd, dmu_tx_t *tx, lr_setattr_t *lr)
 {
        itx_t *itx;
 
        if (zil_replaying(zd->zd_zilog, tx))
-               return (0);
+               return;
 
        itx = zil_itx_create(TX_SETATTR, sizeof (*lr));
        bcopy(&lr->lr_common + 1, &itx->itx_lr + 1,
            sizeof (*lr) - sizeof (lr_t));
 
-       return (zil_itx_assign(zd->zd_zilog, itx, tx));
+       itx->itx_sync = B_FALSE;
+       zil_itx_assign(zd->zd_zilog, itx, tx);
 }
 
 /*
@@ -1328,7 +1331,7 @@ ztest_replay_remove(ztest_ds_t *zd, lr_remove_t *lr, boolean_t byteswap)
 
        VERIFY3U(0, ==, zap_remove(os, lr->lr_doid, name, tx));
 
-       (void) ztest_log_remove(zd, tx, lr);
+       (void) ztest_log_remove(zd, tx, lr, object);
 
        dmu_tx_commit(tx);
 
@@ -2045,7 +2048,7 @@ ztest_zil_commit(ztest_ds_t *zd, uint64_t id)
 {
        zilog_t *zilog = zd->zd_zilog;
 
-       zil_commit(zilog, UINT64_MAX, ztest_random(ZTEST_OBJECTS));
+       zil_commit(zilog, ztest_random(ZTEST_OBJECTS));
 
        /*
         * Remember the committed values in zd, which is in parent/child
@@ -2875,7 +2878,7 @@ ztest_snapshot_create(char *osname, uint64_t id)
            (u_longlong_t)id);
 
        error = dmu_objset_snapshot(osname, strchr(snapname, '@') + 1,
-           NULL, B_FALSE);
+           NULL, NULL, B_FALSE, B_FALSE, -1);
        if (error == ENOSPC) {
                ztest_record_enospc(FTAG);
                return (B_FALSE);
@@ -3080,7 +3083,7 @@ ztest_dsl_dataset_promote_busy(ztest_ds_t *zd, uint64_t id)
        (void) snprintf(snap3name, MAXNAMELEN, "%s@s3_%llu", clone1name, id);
 
        error = dmu_objset_snapshot(osname, strchr(snap1name, '@')+1,
-           NULL, B_FALSE);
+           NULL, NULL, B_FALSE, B_FALSE, -1);
        if (error && error != EEXIST) {
                if (error == ENOSPC) {
                        ztest_record_enospc(FTAG);
@@ -3104,7 +3107,7 @@ ztest_dsl_dataset_promote_busy(ztest_ds_t *zd, uint64_t id)
        }
 
        error = dmu_objset_snapshot(clone1name, strchr(snap2name, '@')+1,
-           NULL, B_FALSE);
+           NULL, NULL, B_FALSE, B_FALSE, -1);
        if (error && error != EEXIST) {
                if (error == ENOSPC) {
                        ztest_record_enospc(FTAG);
@@ -3114,7 +3117,7 @@ ztest_dsl_dataset_promote_busy(ztest_ds_t *zd, uint64_t id)
        }
 
        error = dmu_objset_snapshot(clone1name, strchr(snap3name, '@')+1,
-           NULL, B_FALSE);
+           NULL, NULL, B_FALSE, B_FALSE, -1);
        if (error && error != EEXIST) {
                if (error == ENOSPC) {
                        ztest_record_enospc(FTAG);
@@ -4304,7 +4307,8 @@ ztest_dmu_snapshot_hold(ztest_ds_t *zd, uint64_t id)
         * Create snapshot, clone it, mark snap for deferred destroy,
         * destroy clone, verify snap was also destroyed.
         */
-       error = dmu_objset_snapshot(osname, snapname, NULL, FALSE);
+       error = dmu_objset_snapshot(osname, snapname, NULL, NULL, FALSE,
+           FALSE, -1);
        if (error) {
                if (error == ENOSPC) {
                        ztest_record_enospc("dmu_objset_snapshot");
@@ -4346,7 +4350,8 @@ ztest_dmu_snapshot_hold(ztest_ds_t *zd, uint64_t id)
         * destroy a held snapshot, mark for deferred destroy,
         * release hold, verify snapshot was destroyed.
         */
-       error = dmu_objset_snapshot(osname, snapname, NULL, FALSE);
+       error = dmu_objset_snapshot(osname, snapname, NULL, NULL, FALSE,
+           FALSE, -1);
        if (error) {
                if (error == ENOSPC) {
                        ztest_record_enospc("dmu_objset_snapshot");
@@ -4355,7 +4360,8 @@ ztest_dmu_snapshot_hold(ztest_ds_t *zd, uint64_t id)
                fatal(0, "dmu_objset_snapshot(%s) = %d", fullname, error);
        }
 
-       error = dsl_dataset_user_hold(osname, snapname, tag, B_FALSE, B_TRUE);
+       error = dsl_dataset_user_hold(osname, snapname, tag, B_FALSE,
+           B_TRUE, -1);
        if (error)
                fatal(0, "dsl_dataset_user_hold(%s)", fullname, tag);
 
@@ -4843,19 +4849,19 @@ ztest_spa_import_export(char *oldname, char *newname)
        /*
         * Import it under the new name.
         */
-       VERIFY3U(0, ==, spa_import(newname, config, NULL));
+       VERIFY3U(0, ==, spa_import(newname, config, NULL, 0));
 
        ztest_walk_pool_directory("pools after import");
 
        /*
         * Try to import it again -- should fail with EEXIST.
         */
-       VERIFY3U(EEXIST, ==, spa_import(newname, config, NULL));
+       VERIFY3U(EEXIST, ==, spa_import(newname, config, NULL, 0));
 
        /*
         * Try to import it under a different name -- should fail with EEXIST.
         */
-       VERIFY3U(EEXIST, ==, spa_import(oldname, config, NULL));
+       VERIFY3U(EEXIST, ==, spa_import(oldname, config, NULL, 0));
 
        /*
         * Verify that the pool is no longer visible under the old name.
@@ -5242,6 +5248,13 @@ ztest_run(ztest_shared_t *zs)
        }
 
        kernel_fini();
+
+       list_destroy(&zcl.zcl_callbacks);
+
+       (void) _mutex_destroy(&zcl.zcl_callbacks_lock);
+
+       (void) rwlock_destroy(&zs->zs_name_lock);
+       (void) _mutex_destroy(&zs->zs_vdev_lock);
 }
 
 static void
@@ -5265,7 +5278,7 @@ ztest_freeze(ztest_shared_t *zs)
         */
        while (BP_IS_HOLE(&zd->zd_zilog->zl_header->zh_log)) {
                ztest_dmu_object_alloc_free(zd, 0);
-               zil_commit(zd->zd_zilog, UINT64_MAX, 0);
+               zil_commit(zd->zd_zilog, 0);
        }
 
        txg_wait_synced(spa_get_dsl(spa), 0);
@@ -5292,7 +5305,7 @@ ztest_freeze(ztest_shared_t *zs)
        /*
         * Commit all of the changes we just generated.
         */
-       zil_commit(zd->zd_zilog, UINT64_MAX, 0);
+       zil_commit(zd->zd_zilog, 0);
        txg_wait_synced(spa_get_dsl(spa), 0);
 
        /*
@@ -5311,13 +5324,6 @@ ztest_freeze(ztest_shared_t *zs)
        ztest_dataset_close(zs, 0);
        spa_close(spa, FTAG);
        kernel_fini();
-
-       list_destroy(&zcl.zcl_callbacks);
-
-       (void) _mutex_destroy(&zcl.zcl_callbacks_lock);
-
-       (void) rwlock_destroy(&zs->zs_name_lock);
-       (void) _mutex_destroy(&zs->zs_vdev_lock);
 }
 
 void
@@ -5401,6 +5407,9 @@ ztest_init(ztest_shared_t *zs)
        ztest_freeze(zs);
 
        ztest_run_zdb(zs->zs_pool);
+
+       (void) rwlock_destroy(&zs->zs_name_lock);
+       (void) _mutex_destroy(&zs->zs_vdev_lock);
 }
 
 int
index 46fb424c0575bccaa7ab051d4737b0e41743a38a..4ffa422118d121a3584c26c0ae85f29857a290bd 100644 (file)
  * CDDL HEADER END
  */
 /*
- * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 
 #ifndef        _SYS_EFI_PARTITION_H
 #define        _SYS_EFI_PARTITION_H
 
-#pragma ident  "%Z%%M% %I%     %E% SMI"
-
 #include <sys/uuid.h>
 
 #ifdef __cplusplus
@@ -116,9 +113,9 @@ typedef struct efi_gpe_Attrs {
                            { 0x00, 0xA0, 0xC9, 0x3E, 0xC9, 0x3B } }
 #define        EFI_LEGACY_MBR  { 0x024DEE41, 0x33E7, 0x11d3, 0x9D, 0x69, \
                            { 0x00, 0x08, 0xC7, 0x81, 0xF3, 0x9F } }
-#define        EFI_RESV3       { 0x6a9630d1, 0x1dd2, 0x11b2, 0x99, 0xa6, \
+#define        EFI_SYMC_PUB    { 0x6a9630d1, 0x1dd2, 0x11b2, 0x99, 0xa6, \
                            { 0x08, 0x00, 0x20, 0x73, 0x66, 0x31 } }
-#define        EFI_RESV4       { 0x6a980767, 0x1dd2, 0x11b2, 0x99, 0xa6, \
+#define        EFI_SYMC_CDS    { 0x6a980767, 0x1dd2, 0x11b2, 0x99, 0xa6, \
                            { 0x08, 0x00, 0x20, 0x73, 0x66, 0x31 } }
 #define        EFI_MSFT_RESV   { 0xE3C9E316, 0x0B5C, 0x4DB8, 0x81, 0x7D, \
                            { 0xF9, 0x2D, 0xF0, 0x02, 0x15, 0xAE } }
index 31eb3d3f614b48ee644b25934aae9da26989062f..e682b840a708bfb7467e05cf3fb538e208c68405 100644 (file)
@@ -20,8 +20,7 @@
  */
 
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 
 #include <stdio.h>
@@ -58,8 +57,8 @@ static struct uuid_to_ptag {
        { EFI_RESERVED },
        { EFI_SYSTEM },
        { EFI_LEGACY_MBR },
-       { EFI_RESV3 },
-       { EFI_RESV4 },
+       { EFI_SYMC_PUB },
+       { EFI_SYMC_CDS },
        { EFI_MSFT_RESV },
        { EFI_DELL_BASIC },
        { EFI_DELL_RAID },
index 15c1c781679f879c84b98bb83388e0dab4b79e36..4c2615d924a5e60856024f7b01fa6bc5d28e1cec 100644 (file)
@@ -19,8 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2000, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 
 #ifndef        _LIBNVPAIR_H
 extern "C" {
 #endif
 
-void nvlist_print(FILE *, nvlist_t *);
-int nvpair_value_match(nvpair_t *, int, char *, char **);
-int nvpair_value_match_regex(nvpair_t *, int, char *, regex_t *, char **);
-void dump_nvlist(nvlist_t *, int);
+/*
+ * All interfaces described in this file are private to Solaris, and
+ * are subject to change at any time and without notice.  The public
+ * nvlist/nvpair interfaces, as documented in manpage sections 3NVPAIR,
+ * are all imported from <sys/nvpair.h> included above.
+ */
+
+extern int nvpair_value_match(nvpair_t *, int, char *, char **);
+extern int nvpair_value_match_regex(nvpair_t *, int, char *, regex_t *,
+    char **);
+
+extern void nvlist_print(FILE *, nvlist_t *);
+extern void dump_nvlist(nvlist_t *, int);
+
+/*
+ * Private nvlist printing interface that allows the caller some control
+ * over output rendering (as opposed to nvlist_print and dump_nvlist).
+ *
+ * Obtain an opaque nvlist_prtctl_t cookie using nvlist_prtctl_alloc
+ * (NULL on failure);  on return the cookie is set up for default formatting
+ * and rendering.  Quote the cookie in subsequent customisation functions and
+ * then pass the cookie to nvlist_prt to render the nvlist.  Finally,
+ * use nvlist_prtctl_free to release the cookie.
+ *
+ * For all nvlist_lookup_xxx and nvlist_lookup_xxx_array functions
+ * we have a corresponding brace of functions that appoint replacement
+ * rendering functions:
+ *
+ *     extern void nvlist_prtctlop_xxx(nvlist_prtctl_t,
+ *         int (*)(nvlist_prtctl_t ctl, void *private, nvlist_t *nvl,
+ *         const char *name, xxxtype value), void *private)
+ *
+ *     and
+ *
+ *     extern void nvlist_prtctlop_xxx_array(nvlist_prtctl_t,
+ *         int (*)(nvlist_prtctl_t ctl, void *private, nvlist_t *nvl,
+ *         const char *name, xxxtype *value, uint_t count), void *private)
+ *
+ * where xxxtype is the C datatype corresponding to xxx, eg int8_t for "int8"
+ * and char * for "string".  The function that is appointed to render the
+ * specified datatype receives as arguments the cookie, the private argument
+ * registered with it, the full nvlist being printed, the nvlist
+ * member name, the value of that member (or a pointer for array function),
+ * and (for array rendering functions) a count of the number of elements.
+ * It returns non-zero if it rendered the member, or 0 to request the
+ * default rendering.
+ */
+
+typedef struct nvlist_prtctl *nvlist_prtctl_t; /* opaque */
+
+enum nvlist_indent_mode {
+       NVLIST_INDENT_ABS,      /* Absolute indentation */
+       NVLIST_INDENT_TABBED    /* Indent with tabstops */
+};
+
+extern nvlist_prtctl_t nvlist_prtctl_alloc(void);
+extern void nvlist_prtctl_free(nvlist_prtctl_t);
+extern void nvlist_prt(nvlist_t *, nvlist_prtctl_t);
+
+/* Output stream */
+extern void nvlist_prtctl_setdest(nvlist_prtctl_t, FILE *);
+extern FILE *nvlist_prtctl_getdest(nvlist_prtctl_t);
+
+/* Indentation mode, start indent, indent increment; default tabbed/0/1 */
+extern void nvlist_prtctl_setindent(nvlist_prtctl_t, enum nvlist_indent_mode,
+    int, int);
+extern void nvlist_prtctl_doindent(nvlist_prtctl_t, int);
+
+enum nvlist_prtctl_fmt {
+       NVLIST_FMT_MEMBER_NAME,         /* name fmt; default "%s = " */
+       NVLIST_FMT_MEMBER_POSTAMBLE,    /* after nvlist member; default "\n" */
+       NVLIST_FMT_BTWN_ARRAY           /* between array members; default " " */
+};
+
+extern void nvlist_prtctl_setfmt(nvlist_prtctl_t, enum nvlist_prtctl_fmt,
+    const char *);
+extern void nvlist_prtctl_dofmt(nvlist_prtctl_t, enum nvlist_prtctl_fmt, ...);
+
+/*
+ * Function prototypes for interfaces that appoint a new rendering function
+ * for single-valued nvlist members.
+ *
+ * A replacement function receives arguments as follows:
+ *
+ *     nvlist_prtctl_t Print control structure; do not change preferences
+ *                     for this object from a print callback function.
+ *
+ *     void *          The function-private cookie argument registered
+ *                     when the replacement function was appointed.
+ *
+ *     nvlist_t *      The full nvlist that is being processed.  The
+ *                     rendering function is called to render a single
+ *                     member (name and value passed as below) but it may
+ *                     want to reference or incorporate other aspects of
+ *                     the full nvlist.
+ *
+ *     const char *    Member name to render
+ *
+ *     valtype         Value of the member to render
+ *
+ * The function must return non-zero if it has rendered output for this
+ * member, or 0 if it wants to default to standard rendering for this
+ * one member.
+ */
+
+#define        NVLIST_PRINTCTL_SVDECL(funcname, valtype) \
+    extern void funcname(nvlist_prtctl_t, \
+    int (*)(nvlist_prtctl_t, void *, nvlist_t *, const char *, valtype), \
+    void *)
+
+NVLIST_PRINTCTL_SVDECL(nvlist_prtctlop_boolean, int);
+NVLIST_PRINTCTL_SVDECL(nvlist_prtctlop_boolean_value, boolean_t);
+NVLIST_PRINTCTL_SVDECL(nvlist_prtctlop_byte, uchar_t);
+NVLIST_PRINTCTL_SVDECL(nvlist_prtctlop_int8, int8_t);
+NVLIST_PRINTCTL_SVDECL(nvlist_prtctlop_uint8, uint8_t);
+NVLIST_PRINTCTL_SVDECL(nvlist_prtctlop_int16, int16_t);
+NVLIST_PRINTCTL_SVDECL(nvlist_prtctlop_uint16, uint16_t);
+NVLIST_PRINTCTL_SVDECL(nvlist_prtctlop_int32, int32_t);
+NVLIST_PRINTCTL_SVDECL(nvlist_prtctlop_uint32, uint32_t);
+NVLIST_PRINTCTL_SVDECL(nvlist_prtctlop_int64, int64_t);
+NVLIST_PRINTCTL_SVDECL(nvlist_prtctlop_uint64, uint64_t);
+NVLIST_PRINTCTL_SVDECL(nvlist_prtctlop_double, double);
+NVLIST_PRINTCTL_SVDECL(nvlist_prtctlop_string, char *);
+NVLIST_PRINTCTL_SVDECL(nvlist_prtctlop_hrtime, hrtime_t);
+NVLIST_PRINTCTL_SVDECL(nvlist_prtctlop_nvlist, nvlist_t *);
+
+#undef NVLIST_PRINTCTL_SVDECL  /* was just for "clarity" above */
+
+/*
+ * Function prototypes for interfaces that appoint a new rendering function
+ * for array-valued nvlist members.
+ *
+ * One additional argument is taken: uint_t for the number of array elements
+ *
+ * Return values as above.
+ */
+#define        NVLIST_PRINTCTL_AVDECL(funcname, vtype) \
+    extern void funcname(nvlist_prtctl_t, \
+    int (*)(nvlist_prtctl_t, void *, nvlist_t *, const char *, vtype, uint_t), \
+    void *)
+
+NVLIST_PRINTCTL_AVDECL(nvlist_prtctlop_boolean_array, boolean_t *);
+NVLIST_PRINTCTL_AVDECL(nvlist_prtctlop_byte_array, uchar_t *);
+NVLIST_PRINTCTL_AVDECL(nvlist_prtctlop_int8_array, int8_t *);
+NVLIST_PRINTCTL_AVDECL(nvlist_prtctlop_uint8_array, uint8_t *);
+NVLIST_PRINTCTL_AVDECL(nvlist_prtctlop_int16_array, int16_t *);
+NVLIST_PRINTCTL_AVDECL(nvlist_prtctlop_uint16_array, uint16_t *);
+NVLIST_PRINTCTL_AVDECL(nvlist_prtctlop_int32_array, int32_t *);
+NVLIST_PRINTCTL_AVDECL(nvlist_prtctlop_uint32_array, uint32_t *);
+NVLIST_PRINTCTL_AVDECL(nvlist_prtctlop_int64_array, int64_t *);
+NVLIST_PRINTCTL_AVDECL(nvlist_prtctlop_uint64_array, uint64_t *);
+NVLIST_PRINTCTL_AVDECL(nvlist_prtctlop_string_array, char **);
+NVLIST_PRINTCTL_AVDECL(nvlist_prtctlop_nvlist_array, nvlist_t **);
+
+#undef NVLIST_PRINTCTL_AVDECL  /* was just for "clarity" above */
 
 #ifdef __cplusplus
 }
index 57915cd7373e2b9312b70941c5d98ce38591836b..16bce483bee587064ed1d2d79f3fefd5f293393e 100644 (file)
@@ -19,8 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2000, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 
 #include <unistd.h>
@@ -28,6 +27,8 @@
 #include <libintl.h>
 #include <sys/types.h>
 #include <sys/inttypes.h>
+#include <stdarg.h>
+#include <note.h>
 #include "libnvpair.h"
 
 /*
  *     between kernel and userland, and possibly saving onto disk files.
  */
 
+/*
+ * Print control structure.
+ */
+
+/*
+ * DEFINEOP declares one member of struct nvlist_printops for a
+ * single-valued type: the rendering callback (op) together with the
+ * private argument (arg) that is handed back to it on every call.
+ */
+#define        DEFINEOP(opname, vtype) \
+       struct { \
+               int (*op)(struct nvlist_prtctl *, void *, nvlist_t *, \
+                   const char *, vtype); \
+               void *arg; \
+       } opname
+
+/*
+ * DEFINEARROP is the array-valued counterpart of DEFINEOP; its callback
+ * takes an additional uint_t element count.
+ */
+#define        DEFINEARROP(opname, vtype) \
+       struct { \
+               int (*op)(struct nvlist_prtctl *, void *, nvlist_t *, \
+                   const char *, vtype, uint_t); \
+               void *arg; \
+       } opname
+
+/*
+ * Table of rendering callbacks, one slot per supported member type.
+ * The member order is significant: defprtops is initialized positionally,
+ * so a member added or moved here must be mirrored in that initializer.
+ */
+struct nvlist_printops {
+       DEFINEOP(print_boolean, int);
+       DEFINEOP(print_boolean_value, boolean_t);
+       DEFINEOP(print_byte, uchar_t);
+       DEFINEOP(print_int8, int8_t);
+       DEFINEOP(print_uint8, uint8_t);
+       DEFINEOP(print_int16, int16_t);
+       DEFINEOP(print_uint16, uint16_t);
+       DEFINEOP(print_int32, int32_t);
+       DEFINEOP(print_uint32, uint32_t);
+       DEFINEOP(print_int64, int64_t);
+       DEFINEOP(print_uint64, uint64_t);
+       DEFINEOP(print_double, double);
+       DEFINEOP(print_string, char *);
+       DEFINEOP(print_hrtime, hrtime_t);
+       DEFINEOP(print_nvlist, nvlist_t *);
+       DEFINEARROP(print_boolean_array, boolean_t *);
+       DEFINEARROP(print_byte_array, uchar_t *);
+       DEFINEARROP(print_int8_array, int8_t *);
+       DEFINEARROP(print_uint8_array, uint8_t *);
+       DEFINEARROP(print_int16_array, int16_t *);
+       DEFINEARROP(print_uint16_array, uint16_t *);
+       DEFINEARROP(print_int32_array, int32_t *);
+       DEFINEARROP(print_uint32_array, uint32_t *);
+       DEFINEARROP(print_int64_array, int64_t *);
+       DEFINEARROP(print_uint64_array, uint64_t *);
+       DEFINEARROP(print_string_array, char **);
+       DEFINEARROP(print_nvlist_array, nvlist_t **);
+};
+
+/*
+ * State behind the opaque nvlist_prtctl_t cookie: output stream,
+ * indentation settings, format strings and the two callback tables.
+ */
+struct nvlist_prtctl {
+       FILE *nvprt_fp;                 /* output destination */
+       enum nvlist_indent_mode nvprt_indent_mode; /* see libnvpair.h */
+       int nvprt_indent;               /* absolute indent, or tab depth */
+       int nvprt_indentinc;            /* indent or tab increment */
+       const char *nvprt_nmfmt;        /* member name format, max one %s */
+       const char *nvprt_eomfmt;       /* after member format, e.g. "\n" */
+       const char *nvprt_btwnarrfmt;   /* between array members */
+       int nvprt_btwnarrfmt_nl;        /* nvprt_btwnarrfmt includes newline? */
+       struct nvlist_printops *nvprt_dfltops;  /* built-in renderers */
+       struct nvlist_printops *nvprt_custops;  /* caller-appointed renderers */
+};
+
+/*
+ * Accessors for the callback (op) and private argument (arg) of the
+ * print_<type> slot in the default and in the custom callback table.
+ * They dereference the table pointer without checking it for NULL.
+ */
+#define        DFLTPRTOP(pctl, type) \
+       ((pctl)->nvprt_dfltops->print_##type.op)
+
+#define        DFLTPRTOPARG(pctl, type) \
+       ((pctl)->nvprt_dfltops->print_##type.arg)
+
+#define        CUSTPRTOP(pctl, type) \
+       ((pctl)->nvprt_custops->print_##type.op)
+
+#define        CUSTPRTOPARG(pctl, type) \
+       ((pctl)->nvprt_custops->print_##type.arg)
+
+/*
+ * RENDER - render one single-valued member.
+ *
+ * If a custom callback has been appointed for the type it is tried first;
+ * when there is none, or it returns 0, the default callback is used.  The
+ * member postamble (nvprt_eomfmt) is printed afterwards in either case.
+ *
+ * The expansion is a brace-enclosed block (it declares a local "done"),
+ * so use it only as a complete statement.  Every use of pctl is
+ * parenthesized so that an expression argument expands safely.
+ */
+#define        RENDER(pctl, type, nvl, name, val) \
+       { \
+               int done = 0; \
+               if ((pctl)->nvprt_custops && CUSTPRTOP(pctl, type)) { \
+                       done = CUSTPRTOP(pctl, type)(pctl, \
+                           CUSTPRTOPARG(pctl, type), nvl, name, val); \
+               } \
+               if (!done) { \
+                       (void) DFLTPRTOP(pctl, type)(pctl, \
+                           DFLTPRTOPARG(pctl, type), nvl, name, val); \
+               } \
+               (void) fprintf((pctl)->nvprt_fp, (pctl)->nvprt_eomfmt); \
+       }
+
+/*
+ * ARENDER - array-valued counterpart of RENDER; additionally passes the
+ * element count to the callback.
+ */
+#define        ARENDER(pctl, type, nvl, name, arrp, count) \
+       { \
+               int done = 0; \
+               if ((pctl)->nvprt_custops && CUSTPRTOP(pctl, type)) { \
+                       done = CUSTPRTOP(pctl, type)(pctl, \
+                           CUSTPRTOPARG(pctl, type), nvl, name, arrp, count); \
+               } \
+               if (!done) { \
+                       (void) DFLTPRTOP(pctl, type)(pctl, \
+                           DFLTPRTOPARG(pctl, type), nvl, name, arrp, count); \
+               } \
+               (void) fprintf((pctl)->nvprt_fp, (pctl)->nvprt_eomfmt); \
+       }
+
+static void nvlist_print_with_indent(nvlist_t *, nvlist_prtctl_t);
+
+/*
+ * ======================================================================
+ * |                                                                   |
+ * | Indentation                                                       |
+ * |                                                                   |
+ * ======================================================================
+ */
+
 static void
-indent(FILE *fp, int depth)
+indent(nvlist_prtctl_t pctl, int onemore)
 {
-       while (depth-- > 0)
-               (void) fprintf(fp, "\t");
+       /*
+        * Emit the indentation for the current level plus "onemore" extra
+        * levels.  In absolute mode an extra level is nvprt_indentinc
+        * columns of padding; in tabbed mode it is one tab character.
+        */
+       int depth;
+
+       switch (pctl->nvprt_indent_mode) {
+       case NVLIST_INDENT_ABS:
+               /* "%*s" with an empty string prints only the padding. */
+               (void) fprintf(pctl->nvprt_fp, "%*s",
+                   pctl->nvprt_indent + onemore * pctl->nvprt_indentinc, "");
+               break;
+
+       case NVLIST_INDENT_TABBED:
+               depth = pctl->nvprt_indent + onemore;
+               while (depth-- > 0)
+                       (void) fprintf(pctl->nvprt_fp, "\t");
+       }
 }
 
 /*
- * nvlist_print - Prints elements in an event buffer
+ * ======================================================================
+ * |                                                                   |
+ * | Default nvlist member rendering functions.                                |
+ * |                                                                   |
+ * ======================================================================
+ */
+
+/*
+ * Generate functions to print single-valued nvlist members.
+ *
+ * type_and_variant - suffix to form function name
+ * vtype - C type for the member value
+ * ptype - C type to cast value to for printing
+ * vfmt - format string for pair value, e.g "%d" or "0x%llx"
+ */
+
+/*
+ * Each generated nvprint_<type>() writes the indentation, the member name
+ * (via the nvprt_nmfmt format) and the value, and returns 1 to report
+ * that it rendered the member.  The private and nvl arguments are unused.
+ */
+#define        NVLIST_PRTFUNC(type_and_variant, vtype, ptype, vfmt) \
+static int \
+nvprint_##type_and_variant(nvlist_prtctl_t pctl, void *private, \
+    nvlist_t *nvl, const char *name, vtype value) \
+{ \
+       FILE *fp = pctl->nvprt_fp; \
+       NOTE(ARGUNUSED(private)) \
+       NOTE(ARGUNUSED(nvl)) \
+       indent(pctl, 1); \
+       (void) fprintf(fp, pctl->nvprt_nmfmt, name); \
+       (void) fprintf(fp, vfmt, (ptype)value); \
+       return (1); \
+}
+
+NVLIST_PRTFUNC(boolean, int, int, "%d")
+NVLIST_PRTFUNC(boolean_value, boolean_t, int, "%d")
+NVLIST_PRTFUNC(byte, uchar_t, uchar_t, "0x%2.2x")
+NVLIST_PRTFUNC(int8, int8_t, int, "%d")
+NVLIST_PRTFUNC(uint8, uint8_t, uint8_t, "0x%x")
+NVLIST_PRTFUNC(int16, int16_t, int16_t, "%d")
+NVLIST_PRTFUNC(uint16, uint16_t, uint16_t, "0x%x")
+NVLIST_PRTFUNC(int32, int32_t, int32_t, "%d")
+NVLIST_PRTFUNC(uint32, uint32_t, uint32_t, "0x%x")
+NVLIST_PRTFUNC(int64, int64_t, longlong_t, "%lld")
+NVLIST_PRTFUNC(uint64, uint64_t, u_longlong_t, "0x%llx")
+/*
+ * A double is printed with plain "%f": the "ll" length modifier is not
+ * defined for the "f" conversion.  The literal "0x" prefix is kept so
+ * the output layout does not change.
+ */
+NVLIST_PRTFUNC(double, double, double, "0x%f")
+NVLIST_PRTFUNC(string, char *, char *, "%s")
+NVLIST_PRTFUNC(hrtime, hrtime_t, hrtime_t, "0x%llx")
+
+/*
+ * Generate functions to print array-valued nvlist members.
+ */
+
+/*
+ * Each generated nvaprint_<type>() writes all "count" elements of the
+ * array and returns 1.  When the between-elements format contains a
+ * newline (nvprt_btwnarrfmt_nl), every element gets its own indentation,
+ * member name and "[index]: " prefix; otherwise the name is written once
+ * and the elements are separated by nvprt_btwnarrfmt.
+ *
+ * The index is a uint_t and is therefore printed with "%u".
+ *
+ * NOTE(review): in newline mode the between-elements format is written
+ * after the name/index prefix of the next element rather than before it;
+ * this ordering is preserved as found - confirm whether it is intended.
+ */
+#define        NVLIST_ARRPRTFUNC(type_and_variant, vtype, ptype, vfmt) \
+static int \
+nvaprint_##type_and_variant(nvlist_prtctl_t pctl, void *private, \
+    nvlist_t *nvl, const char *name, vtype *valuep, uint_t count) \
+{ \
+       FILE *fp = pctl->nvprt_fp; \
+       uint_t i; \
+       NOTE(ARGUNUSED(private)) \
+       NOTE(ARGUNUSED(nvl)) \
+       for (i = 0; i < count; i++) { \
+               if (i == 0 || pctl->nvprt_btwnarrfmt_nl) { \
+                       indent(pctl, 1); \
+                       (void) fprintf(fp, pctl->nvprt_nmfmt, name); \
+                       if (pctl->nvprt_btwnarrfmt_nl) \
+                               (void) fprintf(fp, "[%u]: ", i); \
+               } \
+               if (i != 0) \
+                       (void) fprintf(fp, pctl->nvprt_btwnarrfmt); \
+               (void) fprintf(fp, vfmt, (ptype)valuep[i]); \
+       } \
+       return (1); \
+}
+
+NVLIST_ARRPRTFUNC(boolean_array, boolean_t, boolean_t, "%d")
+NVLIST_ARRPRTFUNC(byte_array, uchar_t, uchar_t, "0x%2.2x")
+NVLIST_ARRPRTFUNC(int8_array, int8_t, int8_t, "%d")
+NVLIST_ARRPRTFUNC(uint8_array, uint8_t, uint8_t, "0x%x")
+NVLIST_ARRPRTFUNC(int16_array, int16_t, int16_t, "%d")
+NVLIST_ARRPRTFUNC(uint16_array, uint16_t, uint16_t, "0x%x")
+NVLIST_ARRPRTFUNC(int32_array, int32_t, int32_t, "%d")
+NVLIST_ARRPRTFUNC(uint32_array, uint32_t, uint32_t, "0x%x")
+NVLIST_ARRPRTFUNC(int64_array, int64_t, longlong_t, "%lld")
+NVLIST_ARRPRTFUNC(uint64_array, uint64_t, u_longlong_t, "0x%llx")
+NVLIST_ARRPRTFUNC(string_array, char *, char *, "%s")
+
+/*
+ * Default renderer for an embedded nvlist member.
+ *
+ * Prints a header line, prints the nested list one indent increment
+ * deeper, then prints a matching "(end ...)" line.  The header uses a
+ * fixed "%s = " layout rather than nvprt_nmfmt.  The private and nvl
+ * arguments are unused.  Always returns 1.
+ */
+/*ARGSUSED*/
+static int
+nvprint_nvlist(nvlist_prtctl_t pctl, void *private,
+    nvlist_t *nvl, const char *name, nvlist_t *value)
+{
+       FILE *fp = pctl->nvprt_fp;
+
+       indent(pctl, 1);
+       (void) fprintf(fp, "%s = (embedded nvlist)\n", name);
+
+       /* Deepen the indent only for the duration of the nested print. */
+       pctl->nvprt_indent += pctl->nvprt_indentinc;
+       nvlist_print_with_indent(value, pctl);
+       pctl->nvprt_indent -= pctl->nvprt_indentinc;
+
+       indent(pctl, 1);
+       (void) fprintf(fp, "(end %s)\n", name);
+
+       return (1);
+}
+
+/*
+ * Default renderer for an array of embedded nvlists.
+ *
+ * Prints a header line and then, for each of the "count" elements, a
+ * "(start name[i])" line, the nested list one indent increment deeper,
+ * and an "(end name[i])" line.  The index is a uint_t and is printed
+ * with "%u".  The private and nvl arguments are unused.  Always
+ * returns 1.
+ */
+/*ARGSUSED*/
+static int
+nvaprint_nvlist_array(nvlist_prtctl_t pctl, void *private,
+    nvlist_t *nvl, const char *name, nvlist_t **valuep, uint_t count)
+{
+       FILE *fp = pctl->nvprt_fp;
+       uint_t i;
+
+       indent(pctl, 1);
+       (void) fprintf(fp, "%s = (array of embedded nvlists)\n", name);
+
+       for (i = 0; i < count; i++) {
+               indent(pctl, 1);
+               (void) fprintf(fp, "(start %s[%u])\n", name, i);
+
+               /* Deepen the indent only while printing this element. */
+               pctl->nvprt_indent += pctl->nvprt_indentinc;
+               nvlist_print_with_indent(valuep[i], pctl);
+               pctl->nvprt_indent -= pctl->nvprt_indentinc;
+
+               indent(pctl, 1);
+               (void) fprintf(fp, "(end %s[%u])\n", name, i);
+       }
+
+       return (1);
+}
+
+/*
+ * ======================================================================
+ * |                                                                   |
+ * | Interfaces that allow control over formatting.                    |
+ * |                                                                   |
+ * ======================================================================
  */
-static
+
 void
-nvlist_print_with_indent(FILE *fp, nvlist_t *nvl, int depth)
+nvlist_prtctl_setdest(nvlist_prtctl_t pctl, FILE *fp)
 {
-       int i;
+       /* Record fp as the stream the rendering routines write to. */
+       pctl->nvprt_fp = fp;
+}
+
+/*
+ * Return the output stream currently recorded in the print control.
+ */
+FILE *
+nvlist_prtctl_getdest(nvlist_prtctl_t pctl)
+{
+       return (pctl->nvprt_fp);
+}
+
+
+/*
+ * Set the indentation mode, the starting indent and the per-level
+ * increment.  Out-of-range arguments are replaced rather than rejected:
+ * an unknown mode becomes tabbed, a negative start becomes 0 and a
+ * negative increment becomes 1.
+ */
+void
+nvlist_prtctl_setindent(nvlist_prtctl_t pctl, enum nvlist_indent_mode mode,
+    int start, int inc)
+{
+       /* Anything other than absolute mode is treated as tabbed. */
+       pctl->nvprt_indent_mode = (mode == NVLIST_INDENT_ABS) ?
+           NVLIST_INDENT_ABS : NVLIST_INDENT_TABBED;
+
+       pctl->nvprt_indent = (start < 0) ? 0 : start;
+       pctl->nvprt_indentinc = (inc < 0) ? 1 : inc;
+}
+
+/*
+ * Public wrapper around indent(): write the indentation for the current
+ * level plus "onemore" extra levels to the control's output stream.
+ */
+void
+nvlist_prtctl_doindent(nvlist_prtctl_t pctl, int onemore)
+{
+       indent(pctl, onemore);
+}
+
+
+/*
+ * Replace one of the format strings held in the print control.  A NULL
+ * fmt restores the built-in default for the selected format; an
+ * unrecognised selector is ignored.  The string is stored by reference,
+ * not copied.
+ */
+void
+nvlist_prtctl_setfmt(nvlist_prtctl_t pctl, enum nvlist_prtctl_fmt which,
+    const char *fmt)
+{
+       switch (which) {
+       case NVLIST_FMT_MEMBER_NAME:
+               pctl->nvprt_nmfmt = (fmt != NULL) ? fmt : "%s = ";
+               break;
+
+       case NVLIST_FMT_MEMBER_POSTAMBLE:
+               pctl->nvprt_eomfmt = (fmt != NULL) ? fmt : "\n";
+               break;
+
+       case NVLIST_FMT_BTWN_ARRAY:
+               if (fmt == NULL)
+                       fmt = " ";
+               pctl->nvprt_btwnarrfmt = fmt;
+               /* Remember whether array elements end up on separate lines. */
+               pctl->nvprt_btwnarrfmt_nl = (strstr(fmt, "\n") != NULL);
+               break;
+
+       default:
+               break;
+       }
+}
+
+
+/*
+ * Emit one of the format strings held in the print control to its output
+ * stream.  NVLIST_FMT_MEMBER_NAME consumes one variadic argument, the
+ * member name (char *); the other formats take no further arguments.
+ * An unrecognised selector prints nothing.
+ */
+void
+nvlist_prtctl_dofmt(nvlist_prtctl_t pctl, enum nvlist_prtctl_fmt which, ...)
+{
+       FILE *fp = pctl->nvprt_fp;
+       va_list ap;
+       char *name;
+
+       va_start(ap, which);
+
+       switch (which) {
+       case NVLIST_FMT_MEMBER_NAME:
+               name = va_arg(ap, char *);
+               (void) fprintf(fp, pctl->nvprt_nmfmt, name);
+               break;
+
+       case NVLIST_FMT_MEMBER_POSTAMBLE:
+               (void) fprintf(fp, pctl->nvprt_eomfmt);
+               break;
+
+       case NVLIST_FMT_BTWN_ARRAY:
+               (void) fprintf(fp, pctl->nvprt_btwnarrfmt);
+               break;
+
+       default:
+               break;
+       }
+
+       va_end(ap);
+}
+
+/*
+ * ======================================================================
+ * |                                                                   |
+ * | Interfaces to allow appointment of replacement rendering functions.|
+ * |                                                                   |
+ * ======================================================================
+ */
+
+/*
+ * Generate the public nvlist_prtctlop_<type>() function for a
+ * single-valued type.  It stores the callback and its private argument
+ * in the custom callback table of the print control.  Storing a NULL
+ * callback makes RENDER fall back to the default renderer for that type.
+ * The custom table pointer is dereferenced without a NULL check.
+ */
+#define        NVLIST_PRINTCTL_REPLACE(type, vtype) \
+void \
+nvlist_prtctlop_##type(nvlist_prtctl_t pctl, \
+    int (*func)(nvlist_prtctl_t, void *, nvlist_t *, const char *, vtype), \
+    void *private) \
+{ \
+       CUSTPRTOP(pctl, type) = func; \
+       CUSTPRTOPARG(pctl, type) = private; \
+}
+
+NVLIST_PRINTCTL_REPLACE(boolean, int)
+NVLIST_PRINTCTL_REPLACE(boolean_value, boolean_t)
+NVLIST_PRINTCTL_REPLACE(byte, uchar_t)
+NVLIST_PRINTCTL_REPLACE(int8, int8_t)
+NVLIST_PRINTCTL_REPLACE(uint8, uint8_t)
+NVLIST_PRINTCTL_REPLACE(int16, int16_t)
+NVLIST_PRINTCTL_REPLACE(uint16, uint16_t)
+NVLIST_PRINTCTL_REPLACE(int32, int32_t)
+NVLIST_PRINTCTL_REPLACE(uint32, uint32_t)
+NVLIST_PRINTCTL_REPLACE(int64, int64_t)
+NVLIST_PRINTCTL_REPLACE(uint64, uint64_t)
+NVLIST_PRINTCTL_REPLACE(double, double)
+NVLIST_PRINTCTL_REPLACE(string, char *)
+NVLIST_PRINTCTL_REPLACE(hrtime, hrtime_t)
+NVLIST_PRINTCTL_REPLACE(nvlist, nvlist_t *)
+
+/*
+ * Generate the public nvlist_prtctlop_<type>() function for an
+ * array-valued type.  Identical to the single-valued generator except
+ * that the callback also receives the uint_t element count.
+ */
+#define        NVLIST_PRINTCTL_AREPLACE(type, vtype) \
+void \
+nvlist_prtctlop_##type(nvlist_prtctl_t pctl, \
+    int (*func)(nvlist_prtctl_t, void *, nvlist_t *, const char *, vtype, \
+    uint_t), void *private) \
+{ \
+       CUSTPRTOP(pctl, type) = func; \
+       CUSTPRTOPARG(pctl, type) = private; \
+}
+
+NVLIST_PRINTCTL_AREPLACE(boolean_array, boolean_t *)
+NVLIST_PRINTCTL_AREPLACE(byte_array, uchar_t *)
+NVLIST_PRINTCTL_AREPLACE(int8_array, int8_t *)
+NVLIST_PRINTCTL_AREPLACE(uint8_array, uint8_t *)
+NVLIST_PRINTCTL_AREPLACE(int16_array, int16_t *)
+NVLIST_PRINTCTL_AREPLACE(uint16_array, uint16_t *)
+NVLIST_PRINTCTL_AREPLACE(int32_array, int32_t *)
+NVLIST_PRINTCTL_AREPLACE(uint32_array, uint32_t *)
+NVLIST_PRINTCTL_AREPLACE(int64_array, int64_t *)
+NVLIST_PRINTCTL_AREPLACE(uint64_array, uint64_t *)
+NVLIST_PRINTCTL_AREPLACE(string_array, char **)
+NVLIST_PRINTCTL_AREPLACE(nvlist_array, nvlist_t **)
+
+/*
+ * ======================================================================
+ * |                                                                   |
+ * | Interfaces to manage nvlist_prtctl_t cookies.                     |
+ * |                                                                   |
+ * ======================================================================
+ */
+
+
+/*
+ * Built-in rendering callbacks, each with a NULL private argument.
+ * The initializer is positional, so the entries must stay in exactly the
+ * order in which the members of struct nvlist_printops are declared.
+ */
+static const struct nvlist_printops defprtops = {
+       { nvprint_boolean, NULL },
+       { nvprint_boolean_value, NULL },
+       { nvprint_byte, NULL },
+       { nvprint_int8, NULL },
+       { nvprint_uint8, NULL },
+       { nvprint_int16, NULL },
+       { nvprint_uint16, NULL },
+       { nvprint_int32, NULL },
+       { nvprint_uint32, NULL },
+       { nvprint_int64, NULL },
+       { nvprint_uint64, NULL },
+       { nvprint_double, NULL },
+       { nvprint_string, NULL },
+       { nvprint_hrtime, NULL },
+       { nvprint_nvlist, NULL },
+       { nvaprint_boolean_array, NULL },
+       { nvaprint_byte_array, NULL },
+       { nvaprint_int8_array, NULL },
+       { nvaprint_uint8_array, NULL },
+       { nvaprint_int16_array, NULL },
+       { nvaprint_uint16_array, NULL },
+       { nvaprint_int32_array, NULL },
+       { nvaprint_uint32_array, NULL },
+       { nvaprint_int64_array, NULL },
+       { nvaprint_uint64_array, NULL },
+       { nvaprint_string_array, NULL },
+       { nvaprint_nvlist_array, NULL },
+};
+
+/*
+ * Fill in a print control with the default settings: output to fp,
+ * tabbed indentation starting at depth 0 with increment 1, the default
+ * format strings, the built-in callback table, and "ops" as the custom
+ * callback table.
+ */
+static void
+prtctl_defaults(FILE *fp, struct nvlist_prtctl *pctl,
+    struct nvlist_printops *ops)
+{
+       /* Callback tables: built-in renderers and the custom overrides. */
+       pctl->nvprt_custops = ops;
+       pctl->nvprt_dfltops = (struct nvlist_printops *)&defprtops;
+
+       /* Output destination. */
+       pctl->nvprt_fp = fp;
+
+       /* Indentation. */
+       pctl->nvprt_indent_mode = NVLIST_INDENT_TABBED;
+       pctl->nvprt_indentinc = 1;
+       pctl->nvprt_indent = 0;
+
+       /* Format strings. */
+       pctl->nvprt_nmfmt = "%s = ";
+       pctl->nvprt_eomfmt = "\n";
+       pctl->nvprt_btwnarrfmt_nl = 0;
+       pctl->nvprt_btwnarrfmt = " ";
+}
+
+/*
+ * Allocate a print control set up for default rendering to stdout,
+ * together with a zero-filled custom callback table.  Returns NULL if
+ * either allocation fails.  Release with nvlist_prtctl_free().
+ */
+nvlist_prtctl_t
+nvlist_prtctl_alloc(void)
+{
+       struct nvlist_prtctl *ctl = malloc(sizeof (*ctl));
+       struct nvlist_printops *custom;
+
+       if (ctl == NULL)
+               return (NULL);
+
+       /* calloc leaves every custom callback NULL, i.e. "use the default". */
+       custom = calloc(1, sizeof (*custom));
+       if (custom == NULL) {
+               free(ctl);
+               return (NULL);
+       }
+
+       prtctl_defaults(stdout, ctl, custom);
+       return (ctl);
+}
+
+/*
+ * Release a print control and its custom callback table.  A NULL
+ * argument is accepted and ignored.
+ */
+void
+nvlist_prtctl_free(nvlist_prtctl_t pctl)
+{
+       if (pctl == NULL)
+               return;
+
+       free(pctl->nvprt_custops);
+       free(pctl);
+}
+
+/*
+ * ======================================================================
+ * |                                                                   |
+ * | Top-level print request interfaces.                               |
+ * |                                                                   |
+ * ======================================================================
+ */
+
+/*
+ * nvlist_print_with_indent - Prints the members of an nvlist using the
+ * settings and rendering callbacks of the supplied print control.
+ */
+static void
+nvlist_print_with_indent(nvlist_t *nvl, nvlist_prtctl_t pctl)
+{
+       FILE *fp = pctl->nvprt_fp;
        char *name;
        uint_t nelem;
        nvpair_t *nvp;
@@ -60,7 +571,7 @@ nvlist_print_with_indent(FILE *fp, nvlist_t *nvl, int depth)
        if (nvl == NULL)
                return;
 
-       indent(fp, depth);
+       indent(pctl, 0);
        (void) fprintf(fp, "nvlist version: %d\n", NVL_VERSION(nvl));
 
        nvp = nvlist_next_nvpair(nvl, NULL);
@@ -68,199 +579,174 @@ nvlist_print_with_indent(FILE *fp, nvlist_t *nvl, int depth)
        while (nvp) {
                data_type_t type = nvpair_type(nvp);
 
-               indent(fp, depth);
                name = nvpair_name(nvp);
-               (void) fprintf(fp, "\t%s =", name);
                nelem = 0;
+
                switch (type) {
                case DATA_TYPE_BOOLEAN: {
-                       (void) fprintf(fp, " 1");
+                       RENDER(pctl, boolean, nvl, name, 1);
                        break;
                }
                case DATA_TYPE_BOOLEAN_VALUE: {
                        boolean_t val;
                        (void) nvpair_value_boolean_value(nvp, &val);
-                       (void) fprintf(fp, " %d", val);
+                       RENDER(pctl, boolean_value, nvl, name, val);
                        break;
                }
                case DATA_TYPE_BYTE: {
                        uchar_t val;
                        (void) nvpair_value_byte(nvp, &val);
-                       (void) fprintf(fp, " 0x%2.2x", val);
+                       RENDER(pctl, byte, nvl, name, val);
                        break;
                }
                case DATA_TYPE_INT8: {
                        int8_t val;
                        (void) nvpair_value_int8(nvp, &val);
-                       (void) fprintf(fp, " %d", val);
+                       RENDER(pctl, int8, nvl, name, val);
                        break;
                }
                case DATA_TYPE_UINT8: {
                        uint8_t val;
                        (void) nvpair_value_uint8(nvp, &val);
-                       (void) fprintf(fp, " 0x%x", val);
+                       RENDER(pctl, uint8, nvl, name, val);
                        break;
                }
                case DATA_TYPE_INT16: {
                        int16_t val;
                        (void) nvpair_value_int16(nvp, &val);
-                       (void) fprintf(fp, " %d", val);
+                       RENDER(pctl, int16, nvl, name, val);
                        break;
                }
                case DATA_TYPE_UINT16: {
                        uint16_t val;
                        (void) nvpair_value_uint16(nvp, &val);
-                       (void) fprintf(fp, " 0x%x", val);
+                       RENDER(pctl, uint16, nvl, name, val);
                        break;
                }
                case DATA_TYPE_INT32: {
                        int32_t val;
                        (void) nvpair_value_int32(nvp, &val);
-                       (void) fprintf(fp, " %d", val);
+                       RENDER(pctl, int32, nvl, name, val);
                        break;
                }
                case DATA_TYPE_UINT32: {
                        uint32_t val;
                        (void) nvpair_value_uint32(nvp, &val);
-                       (void) fprintf(fp, " 0x%x", val);
+                       RENDER(pctl, uint32, nvl, name, val);
                        break;
                }
                case DATA_TYPE_INT64: {
                        int64_t val;
                        (void) nvpair_value_int64(nvp, &val);
-                       (void) fprintf(fp, " %lld", (longlong_t)val);
+                       RENDER(pctl, int64, nvl, name, val);
                        break;
                }
                case DATA_TYPE_UINT64: {
                        uint64_t val;
                        (void) nvpair_value_uint64(nvp, &val);
-                       (void) fprintf(fp, " 0x%llx", (u_longlong_t)val);
+                       RENDER(pctl, uint64, nvl, name, val);
                        break;
                }
                case DATA_TYPE_DOUBLE: {
                        double val;
                        (void) nvpair_value_double(nvp, &val);
-                       (void) fprintf(fp, " 0x%llf", val);
+                       RENDER(pctl, double, nvl, name, val);
                        break;
                }
                case DATA_TYPE_STRING: {
                        char *val;
                        (void) nvpair_value_string(nvp, &val);
-                       (void) fprintf(fp, " %s", val);
+                       RENDER(pctl, string, nvl, name, val);
                        break;
                }
                case DATA_TYPE_BOOLEAN_ARRAY: {
                        boolean_t *val;
                        (void) nvpair_value_boolean_array(nvp, &val, &nelem);
-                       for (i = 0; i < nelem; i++)
-                               (void) fprintf(fp, " %d", val[i]);
+                       ARENDER(pctl, boolean_array, nvl, name, val, nelem);
                        break;
                }
                case DATA_TYPE_BYTE_ARRAY: {
                        uchar_t *val;
                        (void) nvpair_value_byte_array(nvp, &val, &nelem);
-                       for (i = 0; i < nelem; i++)
-                               (void) fprintf(fp, " 0x%2.2x", val[i]);
+                       ARENDER(pctl, byte_array, nvl, name, val, nelem);
                        break;
                }
                case DATA_TYPE_INT8_ARRAY: {
                        int8_t *val;
                        (void) nvpair_value_int8_array(nvp, &val, &nelem);
-                       for (i = 0; i < nelem; i++)
-                               (void) fprintf(fp, " %d", val[i]);
+                       ARENDER(pctl, int8_array, nvl, name, val, nelem);
                        break;
                }
                case DATA_TYPE_UINT8_ARRAY: {
                        uint8_t *val;
                        (void) nvpair_value_uint8_array(nvp, &val, &nelem);
-                       for (i = 0; i < nelem; i++)
-                               (void) fprintf(fp, " 0x%x", val[i]);
+                       ARENDER(pctl, uint8_array, nvl, name, val, nelem);
                        break;
                }
                case DATA_TYPE_INT16_ARRAY: {
                        int16_t *val;
                        (void) nvpair_value_int16_array(nvp, &val, &nelem);
-                       for (i = 0; i < nelem; i++)
-                               (void) fprintf(fp, " %d", val[i]);
+                       ARENDER(pctl, int16_array, nvl, name, val, nelem);
                        break;
                }
                case DATA_TYPE_UINT16_ARRAY: {
                        uint16_t *val;
                        (void) nvpair_value_uint16_array(nvp, &val, &nelem);
-                       for (i = 0; i < nelem; i++)
-                               (void) fprintf(fp, " 0x%x", val[i]);
+                       ARENDER(pctl, uint16_array, nvl, name, val, nelem);
                        break;
                }
                case DATA_TYPE_INT32_ARRAY: {
                        int32_t *val;
                        (void) nvpair_value_int32_array(nvp, &val, &nelem);
-                       for (i = 0; i < nelem; i++)
-                               (void) fprintf(fp, " %d", val[i]);
+                       ARENDER(pctl, int32_array, nvl, name, val, nelem);
                        break;
                }
                case DATA_TYPE_UINT32_ARRAY: {
                        uint32_t *val;
                        (void) nvpair_value_uint32_array(nvp, &val, &nelem);
-                       for (i = 0; i < nelem; i++)
-                               (void) fprintf(fp, " 0x%x", val[i]);
+                       ARENDER(pctl, uint32_array, nvl, name, val, nelem);
                        break;
                }
                case DATA_TYPE_INT64_ARRAY: {
                        int64_t *val;
                        (void) nvpair_value_int64_array(nvp, &val, &nelem);
-                       for (i = 0; i < nelem; i++)
-                               (void) fprintf(fp, " %lld", (longlong_t)val[i]);
+                       ARENDER(pctl, int64_array, nvl, name, val, nelem);
                        break;
                }
                case DATA_TYPE_UINT64_ARRAY: {
                        uint64_t *val;
                        (void) nvpair_value_uint64_array(nvp, &val, &nelem);
-                       for (i = 0; i < nelem; i++)
-                               (void) fprintf(fp, " 0x%llx",
-                                   (u_longlong_t)val[i]);
+                       ARENDER(pctl, uint64_array, nvl, name, val, nelem);
                        break;
                }
                case DATA_TYPE_STRING_ARRAY: {
                        char **val;
                        (void) nvpair_value_string_array(nvp, &val, &nelem);
-                       for (i = 0; i < nelem; i++)
-                               (void) fprintf(fp, " %s", val[i]);
+                       ARENDER(pctl, string_array, nvl, name, val, nelem);
                        break;
                }
                case DATA_TYPE_HRTIME: {
                        hrtime_t val;
                        (void) nvpair_value_hrtime(nvp, &val);
-                       (void) fprintf(fp, " 0x%llx", val);
+                       RENDER(pctl, hrtime, nvl, name, val);
                        break;
                }
                case DATA_TYPE_NVLIST: {
                        nvlist_t *val;
                        (void) nvpair_value_nvlist(nvp, &val);
-                       (void) fprintf(fp, " (embedded nvlist)\n");
-                       nvlist_print_with_indent(fp, val, depth + 1);
-                       indent(fp, depth + 1);
-                       (void) fprintf(fp, "(end %s)\n", name);
+                       RENDER(pctl, nvlist, nvl, name, val);
                        break;
                }
                case DATA_TYPE_NVLIST_ARRAY: {
                        nvlist_t **val;
                        (void) nvpair_value_nvlist_array(nvp, &val, &nelem);
-                       (void) fprintf(fp, " (array of embedded nvlists)\n");
-                       for (i = 0; i < nelem; i++) {
-                               indent(fp, depth + 1);
-                               (void) fprintf(fp,
-                                   "(start %s[%d])\n", name, i);
-                               nvlist_print_with_indent(fp, val[i], depth + 1);
-                               indent(fp, depth + 1);
-                               (void) fprintf(fp, "(end %s[%d])\n", name, i);
-                       }
+                       ARENDER(pctl, nvlist_array, nvl, name, val, nelem);
                        break;
                }
                default:
                        (void) fprintf(fp, " unknown data type (%d)", type);
                        break;
                }
-               (void) fprintf(fp, "\n");
                nvp = nvlist_next_nvpair(nvl, nvp);
        }
 }
@@ -268,9 +754,17 @@ nvlist_print_with_indent(FILE *fp, nvlist_t *nvl, int depth)
 void
 nvlist_print(FILE *fp, nvlist_t *nvl)
 {
-       nvlist_print_with_indent(fp, nvl, 0);
+       struct nvlist_prtctl pc;
+
+       /*
+        * Use a local print control block for output on fp, then print the
+        * list through it.  NOTE(review): prtctl_defaults() presumably
+        * fills 'pc' with the default renderers -- confirm against its
+        * definition.
+        */
+       prtctl_defaults(fp, &pc, NULL);
+       nvlist_print_with_indent(nvl, &pc);
 }
 
+/*
+ * Print nvlist 'nvl' using the caller-supplied print control 'pctl'.
+ * Both arguments are forwarded unchanged to nvlist_print_with_indent().
+ */
+void
+nvlist_prt(nvlist_t *nvl, nvlist_prtctl_t pctl)
+{
+       nvlist_print_with_indent(nvl, pctl);
+}
 
 #define        NVP(elem, type, vtype, ptype, format) { \
        vtype   value; \
@@ -421,6 +915,14 @@ dump_nvlist(nvlist_t *list, int indent)
        }
 }
 
+/*
+ * ======================================================================
+ * |                                                                   |
+ * | Misc private interface.                                           |
+ * |                                                                   |
+ * ======================================================================
+ */
+
 /*
  * Determine if string 'value' matches 'nvp' value.  The 'value' string is
  * converted, depending on the type of 'nvp', prior to match.  For numeric
index ccd46b97748e7930bcdfd7c123d3b7d6dbb14533..667542446672a3fe08e1193fe0319e952d2da4f1 100644 (file)
@@ -19,8 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 
 #ifndef        _LIBUUTIL_H
@@ -28,6 +27,7 @@
 
 #include <sys/types.h>
 #include <stdarg.h>
+#include <stdio.h>
 
 #ifdef __cplusplus
 extern "C" {
@@ -142,12 +142,21 @@ extern int uu_open_tmp(const char *dir, uint_t uflags);
 /*
  * Convenience functions.
  */
+#define        UU_NELEM(a)     (sizeof (a) / sizeof ((a)[0]))
+
 /*PRINTFLIKE1*/
 extern char *uu_msprintf(const char *format, ...);
 extern void *uu_zalloc(size_t);
 extern char *uu_strdup(const char *);
 extern void uu_free(void *);
 
+extern boolean_t uu_strcaseeq(const char *a, const char *b);
+extern boolean_t uu_streq(const char *a, const char *b);
+extern char *uu_strndup(const char *s, size_t n);
+extern boolean_t uu_strbw(const char *a, const char *b);
+extern void *uu_memdup(const void *buf, size_t sz);
+extern void uu_dump(FILE *out, const char *prefix, const void *buf, size_t len);
+
 /*
  * Comparison function type definition.
  *   Developers should be careful in their use of the _private argument. If you
index 05d8622871fa9199e9e4ae89e4f2fbeb1bc60c60..2bef759d525ece3a5f0a5eb7c41f99e1a3e9e982 100644 (file)
@@ -19,8 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 
 #include "libuutil_common.h"
@@ -67,6 +66,44 @@ uu_strdup(const char *str)
        return (buf);
 }
 
+/*
+ * Return a freshly allocated, NUL-terminated copy of at most the first n
+ * bytes of s (fewer if s ends sooner).  Returns NULL if uu_zalloc() fails.
+ */
+char *
+uu_strndup(const char *s, size_t n)
+{
+       size_t copylen = strnlen(s, n);
+       char *dup = uu_zalloc(copylen + 1);
+
+       if (dup != NULL) {
+               /* skip the copy entirely for an empty result */
+               if (copylen != 0)
+                       (void) memcpy(dup, s, copylen);
+               dup[copylen] = '\0';
+       }
+
+       return (dup);
+}
+
+/*
+ * Duplicate a block of memory.  Combines malloc with memcpy, much as
+ * strdup combines malloc, strlen, and strcpy.
+ *
+ * Returns a newly allocated copy of the 'sz' bytes at 'buf', or NULL if
+ * uu_zalloc() fails.
+ */
+void *
+uu_memdup(const void *buf, size_t sz)
+{
+       void *p;
+
+       p = uu_zalloc(sz);
+       if (p == NULL)
+               return (NULL);
+       /* memcpy() requires valid pointers even for a zero-length copy */
+       if (sz > 0)
+               (void) memcpy(p, buf, sz);
+       return (p);
+}
+
 char *
 uu_msprintf(const char *format, ...)
 {
index 74ec177c11b79b57d7e380960e2d4f390b6277fc..3d5b40ca8549c692be4678b6c625e3592d91f654 100644 (file)
  */
 
 /*
- * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 
-#pragma ident  "%Z%%M% %I%     %E% SMI"
-
 #include "libuutil_common.h"
 
 #include <assert.h>
@@ -39,6 +36,7 @@
 #include <sys/debug.h>
 #include <thread.h>
 #include <unistd.h>
+#include <ctype.h>
 
 #if !defined(TEXT_DOMAIN)
 #define        TEXT_DOMAIN "SYS_TEST"
@@ -253,3 +251,30 @@ uu_init(void)
 {
        (void) pthread_atfork(uu_lockup, uu_release, uu_release_child);
 }
+
+/*
+ * Dump a block of memory in hex+ascii, for debugging.
+ *
+ * Writes 'len' bytes starting at 'buf' to 'out', 16 bytes per line.  Each
+ * line is 'prefix', the bytes as two-digit hex, padding so a short final
+ * line stays aligned, then the same bytes as characters with non-printable
+ * ones shown as '.'.
+ */
+void
+uu_dump(FILE *out, const char *prefix, const void *buf, size_t len)
+{
+       const unsigned char *p = buf;
+       size_t i;
+
+       /* counters are size_t to match 'len'; int could overflow/miscompare */
+       for (i = 0; i < len; i += 16) {
+               size_t j;
+
+               (void) fprintf(out, "%s", prefix);
+               for (j = 0; j < 16 && i + j < len; j++) {
+                       (void) fprintf(out, "%2.2x ", p[i + j]);
+               }
+               for (; j < 16; j++) {
+                       (void) fprintf(out, "   ");
+               }
+               for (j = 0; j < 16 && i + j < len; j++) {
+                       (void) fprintf(out, "%c",
+                           isprint(p[i + j]) ? p[i + j] : '.');
+               }
+               (void) fprintf(out, "\n");
+       }
+}
diff --git a/lib/libuutil/uu_string.c b/lib/libuutil/uu_string.c
new file mode 100644 (file)
index 0000000..66afba0
--- /dev/null
@@ -0,0 +1,56 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+/*
+ * String helper functions
+ */
+
+#include <string.h>
+#include <sys/types.h>
+#include <stdio.h>
+#include <malloc.h>
+#include <ctype.h>
+#include "libuutil.h"
+
+/*
+ * Return true if strings a and b are equal according to strcmp(), i.e.
+ * byte-for-byte and case-sensitively.  Neither argument is checked for NULL.
+ */
+boolean_t
+uu_streq(const char *a, const char *b)
+{
+       return (strcmp(a, b) == 0);
+}
+
+/*
+ * Return true if strings a and b are equal according to strcasecmp(), i.e.
+ * ignoring case.  Neither argument is checked for NULL.
+ */
+boolean_t
+uu_strcaseeq(const char *a, const char *b)
+{
+       return (strcasecmp(a, b) == 0);
+}
+
+/*
+ * Return true if string a Begins With string b, i.e. b is a prefix of a.
+ * Only the first strlen(b) bytes of a are compared.
+ */
+boolean_t
+uu_strbw(const char *a, const char *b)
+{
+       size_t prefix_len = strlen(b);
+
+       return (strncmp(a, b, prefix_len) == 0);
+}
index 6f7fed62c4bd0a2d6cb7cd85628478fad3ea4ce3..ea34cc9efa319b274d1b371905c63cc171b61646 100644 (file)
@@ -103,7 +103,6 @@ enum {
        EZFS_BADPERM,           /* invalid permission */
        EZFS_BADPERMSET,        /* invalid permission set name */
        EZFS_NODELEGATION,      /* delegated administration is disabled */
-       EZFS_PERMRDONLY,        /* pemissions are readonly */
        EZFS_UNSHARESMBFAILED,  /* failed to unshare over smb */
        EZFS_SHARESMBFAILED,    /* failed to share over smb */
        EZFS_BADCACHE,          /* bad cache file */
@@ -120,6 +119,9 @@ enum {
        EZFS_POSTSPLIT_ONLINE,  /* onlining a disk after splitting it */
        EZFS_SCRUBBING,         /* currently scrubbing */
        EZFS_NO_SCRUB,          /* no active scrub */
+       EZFS_DIFF,              /* general failure of zfs diff */
+       EZFS_DIFFDATA,          /* bad zfs diff data */
+       EZFS_POOLREADONLY,      /* pool is in read-only mode */
        EZFS_UNKNOWN
 };
 
@@ -326,7 +328,7 @@ extern int zpool_export_force(zpool_handle_t *);
 extern int zpool_import(libzfs_handle_t *, nvlist_t *, const char *,
     char *altroot);
 extern int zpool_import_props(libzfs_handle_t *, nvlist_t *, const char *,
-    nvlist_t *, boolean_t);
+    nvlist_t *, int);
 
 /*
  * Search for pools to import
@@ -492,6 +494,17 @@ extern int zfs_iter_filesystems(zfs_handle_t *, zfs_iter_f, void *);
 extern int zfs_iter_snapshots(zfs_handle_t *, zfs_iter_f, void *);
 extern int zfs_iter_snapshots_sorted(zfs_handle_t *, zfs_iter_f, void *);
 
+/*
+ * Callback state taken by libzfs_add_handle().
+ *
+ * NOTE(review): the field roles are inferred from their names only --
+ * confirm against libzfs_add_handle().  cb_handles looks like a growable
+ * array of dataset handles, with cb_alloc slots allocated and cb_used of
+ * them in use.  cb_verbose and cb_getone are not exercised in this header.
+ */
+typedef struct get_all_cb {
+       zfs_handle_t    **cb_handles;
+       size_t          cb_alloc;
+       size_t          cb_used;
+       boolean_t       cb_verbose;
+       int             (*cb_getone)(zfs_handle_t *, void *);
+} get_all_cb_t;
+
+void libzfs_add_handle(get_all_cb_t *, zfs_handle_t *);
+int libzfs_dataset_cmp(const void *, const void *);
+
 /*
  * Functions to create and destroy datasets.
  */
@@ -533,12 +546,8 @@ extern int zfs_send(zfs_handle_t *zhp, const char *fromsnap, const char *tosnap,
 
 extern int zfs_promote(zfs_handle_t *);
 extern int zfs_hold(zfs_handle_t *, const char *, const char *, boolean_t,
-    boolean_t, boolean_t);
-extern int zfs_hold_range(zfs_handle_t *, const char *, const char *,
-    const char *, boolean_t, boolean_t, snapfilter_cb_t, void *);
+    boolean_t, boolean_t, int, uint64_t, uint64_t);
 extern int zfs_release(zfs_handle_t *, const char *, const char *, boolean_t);
-extern int zfs_release_range(zfs_handle_t *, const char *, const char *,
-    const char *, boolean_t);
 extern uint64_t zvol_volsize_to_reservation(uint64_t, nvlist_t *);
 
 typedef int (*zfs_userspace_cb_t)(void *arg, const char *domain,
@@ -579,6 +588,15 @@ typedef struct recvflags {
 extern int zfs_receive(libzfs_handle_t *, const char *, recvflags_t,
     int, avl_tree_t *);
 
+/*
+ * Option bits for zfs diff.  Each flag occupies its own bit, so values can
+ * be OR-ed together.  NOTE(review): presumably this is what the trailing
+ * int argument of zfs_show_diffs() carries -- confirm against its
+ * definition.
+ */
+typedef enum diff_flags {
+       ZFS_DIFF_PARSEABLE = 0x1,
+       ZFS_DIFF_TIMESTAMP = 0x2,
+       ZFS_DIFF_CLASSIFY = 0x4
+} diff_flags_t;
+
+extern int zfs_show_diffs(zfs_handle_t *, int, const char *, const char *,
+    int);
+
 /*
  * Miscellaneous functions.
  */
index 89c48c1c03c0627ddf4b1e8540ec8e23ab85d67a..c9b09a2050249916de0947bb81567cb81d0830e4 100644 (file)
@@ -20,8 +20,7 @@
  */
 
 /*
- * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 
 #ifndef        _LIBFS_IMPL_H
@@ -69,6 +68,7 @@ struct libzfs_handle {
        char libzfs_desc[1024];
        char *libzfs_log_str;
        int libzfs_printerr;
+       int libzfs_storeerr; /* stuff error messages into buffer */
        void *libzfs_sharehdl; /* libshare handle */
        uint_t libzfs_shareflags;
        boolean_t libzfs_mnttab_enable;
@@ -136,6 +136,7 @@ int zfs_error_fmt(libzfs_handle_t *, int, const char *, ...);
 void zfs_error_aux(libzfs_handle_t *, const char *, ...);
 void *zfs_alloc(libzfs_handle_t *, size_t);
 void *zfs_realloc(libzfs_handle_t *, void *, size_t, size_t);
+char *zfs_asprintf(libzfs_handle_t *, const char *, ...);
 char *zfs_strdup(libzfs_handle_t *, const char *);
 int no_memory(libzfs_handle_t *);
 
@@ -188,6 +189,9 @@ int zpool_open_silent(libzfs_handle_t *, const char *, zpool_handle_t **);
 
 boolean_t zpool_name_valid(libzfs_handle_t *, boolean_t, const char *);
 
+int zfs_validate_name(libzfs_handle_t *hdl, const char *path, int type,
+    boolean_t modifying);
+
 void namespace_clear(libzfs_handle_t *);
 
 /*
index a3f5a7d0fc6d702f34d32d06286d885ffefd30a3..b7c1360db4b741f16e62090c7e0de695f323faae 100644 (file)
@@ -20,8 +20,7 @@
  */
 
 /*
- * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 
 #include <ctype.h>
@@ -126,7 +125,7 @@ path_to_str(const char *path, int types)
  * provide a more meaningful error message.  We call zfs_error_aux() to
  * explain exactly why the name was not valid.
  */
-static int
+int
 zfs_validate_name(libzfs_handle_t *hdl, const char *path, int type,
     boolean_t modifying)
 {
@@ -1212,39 +1211,46 @@ badlabel:
                (void) zfs_error(hdl, EZFS_BADPROP, errbuf);
                goto error;
        }
+       return (ret);
+
+error:
+       nvlist_free(ret);
+       return (NULL);
+}
+
+/*
+ * Decide whether a pending volsize change in 'nvl' should also carry a
+ * matching reservation, and add one to 'nvl' if so.
+ *
+ * Returns:
+ *   -1 if the reservation property cannot be determined, 'nvl' has no
+ *      volsize entry, or adding the new reservation to 'nvl' fails;
+ *    0 if nothing was added, because the current reservation differs from
+ *      zvol_volsize_to_reservation() of the current volsize, or because
+ *      the reservation lookup in 'nvl' returned anything other than ENOENT;
+ *    1 if a reservation computed from the new volsize was added to 'nvl'.
+ */
+int
+zfs_add_synthetic_resv(zfs_handle_t *zhp, nvlist_t *nvl)
+{
+       uint64_t old_volsize;
+       uint64_t new_volsize;
+       uint64_t old_reservation;
+       uint64_t new_reservation;
+       zfs_prop_t resv_prop;
 
        /*
         * If this is an existing volume, and someone is setting the volsize,
         * make sure that it matches the reservation, or add it if necessary.
         */
-       if (zhp != NULL && type == ZFS_TYPE_VOLUME &&
-           nvlist_lookup_uint64(ret, zfs_prop_to_name(ZFS_PROP_VOLSIZE),
-           &intval) == 0) {
-               uint64_t old_volsize = zfs_prop_get_int(zhp,
-                   ZFS_PROP_VOLSIZE);
-               uint64_t old_reservation;
-               uint64_t new_reservation;
-               zfs_prop_t resv_prop;
-
-               if (zfs_which_resv_prop(zhp, &resv_prop) < 0)
-                       goto error;
-               old_reservation = zfs_prop_get_int(zhp, resv_prop);
-
-               if (old_volsize == old_reservation &&
-                   nvlist_lookup_uint64(ret, zfs_prop_to_name(resv_prop),
-                   &new_reservation) != 0) {
-                       if (nvlist_add_uint64(ret,
-                           zfs_prop_to_name(resv_prop), intval) != 0) {
-                               (void) no_memory(hdl);
-                               goto error;
-                       }
-               }
+       old_volsize = zfs_prop_get_int(zhp, ZFS_PROP_VOLSIZE);
+       if (zfs_which_resv_prop(zhp, &resv_prop) < 0)
+               return (-1);
+       old_reservation = zfs_prop_get_int(zhp, resv_prop);
+       /* leave 'nvl' alone unless the volume is currently fully reserved */
+       if ((zvol_volsize_to_reservation(old_volsize, zhp->zfs_props) !=
+           old_reservation) || nvlist_lookup_uint64(nvl,
+           zfs_prop_to_name(resv_prop), &new_reservation) != ENOENT) {
+               return (0);
        }
-       return (ret);
-
-error:
-       nvlist_free(ret);
-       return (NULL);
+       if (nvlist_lookup_uint64(nvl, zfs_prop_to_name(ZFS_PROP_VOLSIZE),
+           &new_volsize) != 0)
+               return (-1);
+       new_reservation = zvol_volsize_to_reservation(new_volsize,
+           zhp->zfs_props);
+       if (nvlist_add_uint64(nvl, zfs_prop_to_name(resv_prop),
+           new_reservation) != 0) {
+               (void) no_memory(zhp->zfs_hdl);
+               return (-1);
+       }
+       return (1);
 }
 
 void
@@ -1346,6 +1352,7 @@ zfs_prop_set(zfs_handle_t *zhp, const char *propname, const char *propval)
        zfs_prop_t prop;
        boolean_t do_prefix;
        uint64_t idx;
+       int added_resv = 0;     /* only assigned below for volsize; read on any failure */
 
        (void) snprintf(errbuf, sizeof (errbuf),
            dgettext(TEXT_DOMAIN, "cannot set property for '%s'"),
@@ -1366,6 +1373,11 @@ zfs_prop_set(zfs_handle_t *zhp, const char *propname, const char *propval)
 
        prop = zfs_name_to_prop(propname);
 
+       if (prop == ZFS_PROP_VOLSIZE) {
+               if ((added_resv = zfs_add_synthetic_resv(zhp, nvl)) == -1)
+                       goto error;
+       }
+
        if ((cl = changelist_gather(zhp, prop, 0, 0)) == NULL)
                goto error;
 
@@ -1400,6 +1412,22 @@ zfs_prop_set(zfs_handle_t *zhp, const char *propname, const char *propval)
 
        if (ret != 0) {
                zfs_setprop_error(hdl, prop, errno, errbuf);
+               if (added_resv && errno == ENOSPC) {
+                       /* clean up the volsize property we tried to set */
+                       uint64_t old_volsize = zfs_prop_get_int(zhp,
+                           ZFS_PROP_VOLSIZE);
+                       nvlist_free(nvl);
+                       zcmd_free_nvlists(&zc);
+                       if (nvlist_alloc(&nvl, NV_UNIQUE_NAME, 0) != 0)
+                               goto error;
+                       if (nvlist_add_uint64(nvl,
+                           zfs_prop_to_name(ZFS_PROP_VOLSIZE),
+                           old_volsize) != 0)
+                               goto error;
+                       if (zcmd_write_src_nvlist(hdl, &zc, nvl) != 0)
+                               goto error;
+                       (void) zfs_ioctl(hdl, ZFS_IOC_SET_PROP, &zc);
+               }
        } else {
                if (do_prefix)
                        ret = changelist_postfix(cl);
@@ -1474,7 +1502,7 @@ zfs_prop_inherit(zfs_handle_t *zhp, const char *propname, boolean_t received)
                return (zfs_error(hdl, EZFS_PROPTYPE, errbuf));
 
        /*
-        * Normalize the name, to get rid of shorthand abbrevations.
+        * Normalize the name, to get rid of shorthand abbreviations.
         */
        propname = zfs_prop_to_name(prop);
        (void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name));
@@ -2173,14 +2201,11 @@ static int
 idmap_id_to_numeric_domain_rid(uid_t id, boolean_t isuser,
     char **domainp, idmap_rid_t *ridp)
 {
-       idmap_handle_t *idmap_hdl = NULL;
        idmap_get_handle_t *get_hdl = NULL;
        idmap_stat status;
        int err = EINVAL;
 
-       if (idmap_init(&idmap_hdl) != IDMAP_SUCCESS)
-               goto out;
-       if (idmap_get_create(idmap_hdl, &get_hdl) != IDMAP_SUCCESS)
+       if (idmap_get_create(&get_hdl) != IDMAP_SUCCESS)
                goto out;
 
        if (isuser) {
@@ -2199,8 +2224,6 @@ idmap_id_to_numeric_domain_rid(uid_t id, boolean_t isuser,
 out:
        if (get_hdl)
                idmap_get_destroy(get_hdl);
-       if (idmap_hdl)
-               (void) idmap_fini(idmap_hdl);
        return (err);
 }
 
@@ -3898,11 +3921,14 @@ zfs_userspace(zfs_handle_t *zhp, zfs_userquota_prop_t type,
 
 int
 zfs_hold(zfs_handle_t *zhp, const char *snapname, const char *tag,
-    boolean_t recursive, boolean_t temphold, boolean_t enoent_ok)
+    boolean_t recursive, boolean_t temphold, boolean_t enoent_ok,
+    int cleanup_fd, uint64_t dsobj, uint64_t createtxg)
 {
        zfs_cmd_t zc = { 0 };
        libzfs_handle_t *hdl = zhp->zfs_hdl;
 
+       ASSERT(!recursive || dsobj == 0);
+
        (void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name));
        (void) strlcpy(zc.zc_value, snapname, sizeof (zc.zc_value));
        if (strlcpy(zc.zc_string, tag, sizeof (zc.zc_string))
@@ -3910,6 +3936,9 @@ zfs_hold(zfs_handle_t *zhp, const char *snapname, const char *tag,
                return (zfs_error(hdl, EZFS_TAGTOOLONG, tag));
        zc.zc_cookie = recursive;
        zc.zc_temphold = temphold;
+       zc.zc_cleanup_fd = cleanup_fd;
+       zc.zc_sendobj = dsobj;
+       zc.zc_createtxg = createtxg;
 
        if (zfs_ioctl(hdl, ZFS_IOC_HOLD, &zc) != 0) {
                char errbuf[ZFS_MAXNAMELEN+32];
@@ -3939,7 +3968,7 @@ zfs_hold(zfs_handle_t *zhp, const char *snapname, const char *tag,
                        return (zfs_error(hdl, EZFS_REFTAG_HOLD, errbuf));
                case ENOENT:
                        if (enoent_ok)
-                               return (0);
+                               return (ENOENT);
                        /* FALLTHROUGH */
                default:
                        return (zfs_standard_error_fmt(hdl, errno, errbuf));
@@ -3949,102 +3978,6 @@ zfs_hold(zfs_handle_t *zhp, const char *snapname, const char *tag,
        return (0);
 }
 
-struct hold_range_arg {
-       zfs_handle_t    *origin;
-       const char      *fromsnap;
-       const char      *tosnap;
-       char            lastsnapheld[ZFS_MAXNAMELEN];
-       const char      *tag;
-       boolean_t       temphold;
-       boolean_t       seento;
-       boolean_t       seenfrom;
-       boolean_t       holding;
-       boolean_t       recursive;
-       snapfilter_cb_t *filter_cb;
-       void            *filter_cb_arg;
-};
-
-static int
-zfs_hold_range_one(zfs_handle_t *zhp, void *arg)
-{
-       struct hold_range_arg *hra = arg;
-       const char *thissnap;
-       int error;
-
-       thissnap = strchr(zfs_get_name(zhp), '@') + 1;
-
-       if (hra->fromsnap && !hra->seenfrom &&
-           strcmp(hra->fromsnap, thissnap) == 0)
-               hra->seenfrom = B_TRUE;
-
-       /* snap is older or newer than the desired range, ignore it */
-       if (hra->seento || !hra->seenfrom) {
-               zfs_close(zhp);
-               return (0);
-       }
-
-       if (!hra->seento && strcmp(hra->tosnap, thissnap) == 0)
-               hra->seento = B_TRUE;
-
-       if (hra->filter_cb != NULL &&
-           hra->filter_cb(zhp, hra->filter_cb_arg) == B_FALSE) {
-               zfs_close(zhp);
-               return (0);
-       }
-
-       if (hra->holding) {
-               /* We could be racing with destroy, so ignore ENOENT. */
-               error = zfs_hold(hra->origin, thissnap, hra->tag,
-                   hra->recursive, hra->temphold, B_TRUE);
-               if (error == 0) {
-                       (void) strlcpy(hra->lastsnapheld, zfs_get_name(zhp),
-                           sizeof (hra->lastsnapheld));
-               }
-       } else {
-               error = zfs_release(hra->origin, thissnap, hra->tag,
-                   hra->recursive);
-       }
-
-       zfs_close(zhp);
-       return (error);
-}
-
-/*
- * Add a user hold on the set of snapshots starting with fromsnap up to
- * and including tosnap. If we're unable to to acquire a particular hold,
- * undo any holds up to that point.
- */
-int
-zfs_hold_range(zfs_handle_t *zhp, const char *fromsnap, const char *tosnap,
-    const char *tag, boolean_t recursive, boolean_t temphold,
-    snapfilter_cb_t filter_cb, void *cbarg)
-{
-       struct hold_range_arg arg = { 0 };
-       int error;
-
-       arg.origin = zhp;
-       arg.fromsnap = fromsnap;
-       arg.tosnap = tosnap;
-       arg.tag = tag;
-       arg.temphold = temphold;
-       arg.holding = B_TRUE;
-       arg.recursive = recursive;
-       arg.seenfrom = (fromsnap == NULL);
-       arg.filter_cb = filter_cb;
-       arg.filter_cb_arg = cbarg;
-
-       error = zfs_iter_snapshots_sorted(zhp, zfs_hold_range_one, &arg);
-
-       /*
-        * Make sure we either hold the entire range or none.
-        */
-       if (error && arg.lastsnapheld[0] != '\0') {
-               (void) zfs_release_range(zhp, fromsnap,
-                   (const char *)arg.lastsnapheld, tag, recursive);
-       }
-       return (error);
-}
-
 int
 zfs_release(zfs_handle_t *zhp, const char *snapname, const char *tag,
     boolean_t recursive)
@@ -4086,26 +4019,6 @@ zfs_release(zfs_handle_t *zhp, const char *snapname, const char *tag,
        return (0);
 }
 
-/*
- * Release a user hold from the set of snapshots starting with fromsnap
- * up to and including tosnap.
- */
-int
-zfs_release_range(zfs_handle_t *zhp, const char *fromsnap, const char *tosnap,
-    const char *tag, boolean_t recursive)
-{
-       struct hold_range_arg arg = { 0 };
-
-       arg.origin = zhp;
-       arg.fromsnap = fromsnap;
-       arg.tosnap = tosnap;
-       arg.tag = tag;
-       arg.recursive = recursive;
-       arg.seenfrom = (fromsnap == NULL);
-
-       return (zfs_iter_snapshots_sorted(zhp, zfs_hold_range_one, &arg));
-}
-
 uint64_t
 zvol_volsize_to_reservation(uint64_t volsize, nvlist_t *props)
 {
diff --git a/lib/libzfs/libzfs_diff.c b/lib/libzfs/libzfs_diff.c
new file mode 100644 (file)
index 0000000..888224f
--- /dev/null
@@ -0,0 +1,826 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+/*
+ * zfs diff support
+ */
+#include <ctype.h>
+#include <errno.h>
+#include <libintl.h>
+#include <string.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <attr.h>
+#include <stddef.h>
+#include <unistd.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <stropts.h>
+#include <pthread.h>
+#include <sys/zfs_ioctl.h>
+#include <libzfs.h>
+#include "libzfs_impl.h"
+
+#define        ZDIFF_SNAPDIR           "/.zfs/snapshot/"
+#define        ZDIFF_SHARESDIR         "/.zfs/shares/"
+#define        ZDIFF_PREFIX            "zfs-diff-%d"
+
+#define        ZDIFF_ADDED     '+'
+#define        ZDIFF_MODIFIED  'M'
+#define        ZDIFF_REMOVED   '-'
+#define        ZDIFF_RENAMED   'R'
+
+/*
+ * Compare the final path components of two object paths.
+ *
+ * fpath, tpath: NUL-terminated paths.  The text after the last '/' of each
+ * is compared; a path containing no '/' is compared as a whole, so the NULL
+ * that strrchr() returns in that case is never used in pointer arithmetic.
+ *
+ * Returns non-zero (B_TRUE) when the two final components are identical.
+ */
+static boolean_t
+do_name_cmp(const char *fpath, const char *tpath)
+{
+       const char *fslash = strrchr(fpath, '/');
+       const char *tslash = strrchr(tpath, '/');
+       const char *fname = (fslash != NULL) ? fslash + 1 : fpath;
+       const char *tname = (tslash != NULL) ? tslash + 1 : tpath;
+
+       return (strcmp(fname, tname) == 0);
+}
+
+typedef struct differ_info {   /* state shared by the zfs diff routines */
+       zfs_handle_t *zhp;      /* handle passed to zfs_show_diffs() */
+       char *fromsnap;         /* full name of the "from" snapshot */
+       char *frommnt;          /* <mountpoint>/.zfs/snapshot/<from snap> */
+       char *tosnap;           /* full name of the "to" snapshot */
+       char *tomnt;            /* <dsmnt>/.zfs/snapshot/<to snap> */
+       char *ds;               /* dataset name without any "@snap" part */
+       char *dsmnt;            /* mountpoint of ds; "" when it is "/" */
+       char *tmpsnap;          /* short name of the temporary snapshot */
+       char errbuf[1024];      /* message describing the latest failure */
+       boolean_t isclone;      /* from/to snapshots name different datasets */
+       boolean_t scripted;     /* ZFS_DIFF_PARSEABLE was requested */
+       boolean_t classify;     /* ZFS_DIFF_CLASSIFY was requested */
+       boolean_t timestamped;  /* ZFS_DIFF_TIMESTAMP was requested */
+       uint64_t shares;        /* st_ino of <dsmnt>/.zfs/shares/ */
+       int zerr;               /* errno-style error code, 0 if none */
+       int cleanupfd;          /* fd on ZFS_DEV, used as zc_cleanup_fd */
+       int outputfd;           /* fd the diff text is written to */
+       int datafd;             /* read end of the diff-record pipe */
+} differ_info_t;
+
+/*
+ * Given a {dsname, object id}, get the object path and its stats.
+ *
+ * di:     diff state; di->zerr receives the errno left by the ioctl and,
+ *         on failure, di->errbuf receives a message.
+ * dsname: name placed in zc_name for the ZFS_IOC_OBJ_TO_STATS ioctl.
+ * obj:    object number to look up.
+ * pn:     buffer of maxlen bytes that receives the path on success.
+ * sb:     receives the zfs_stat_t returned by the ioctl, even on failure.
+ *
+ * Returns 0 on success and -1 on failure.
+ */
+static int
+get_stats_for_obj(differ_info_t *di, const char *dsname, uint64_t obj,
+    char *pn, int maxlen, zfs_stat_t *sb)
+{
+       zfs_cmd_t zc = { 0 };
+       int error;
+
+       (void) strlcpy(zc.zc_name, dsname, sizeof (zc.zc_name));
+       zc.zc_obj = obj;
+
+       /* clear errno first so that success leaves di->zerr at 0 */
+       errno = 0;
+       error = ioctl(di->zhp->zfs_hdl->libzfs_fd, ZFS_IOC_OBJ_TO_STATS, &zc);
+       di->zerr = errno;
+
+       /* we can get stats even if we failed to get a path */
+       (void) memcpy(sb, &zc.zc_stat, sizeof (zfs_stat_t));
+       if (error == 0) {
+               ASSERT(di->zerr == 0);
+               (void) strlcpy(pn, zc.zc_value, maxlen);
+               return (0);
+       }
+
+       if (di->zerr == EPERM) {
+               (void) snprintf(di->errbuf, sizeof (di->errbuf),
+                   dgettext(TEXT_DOMAIN,
+                   "The sys_config privilege or diff delegated permission "
+                   "is needed\nto discover path names"));
+       } else {
+               /* obj is a uint64_t; cast it to match the %lld conversion */
+               (void) snprintf(di->errbuf, sizeof (di->errbuf),
+                   dgettext(TEXT_DOMAIN,
+                   "Unable to determine path or stats for "
+                   "object %lld in %s"), (longlong_t)obj, dsname);
+       }
+       return (-1);
+}
+
+/*
+ * stream_bytes
+ *
+ * Prints a file name out a character at a time.  If the character is
+ * not in the range of what we consider "printable" ASCII, display it
+ * as an escaped 3-digit octal value.  Space and everything below it,
+ * the backslash itself, and DELete (the last 7-bit ASCII character)
+ * and everything above it are escaped.  We choose to treat all 8-bit
+ * characters as not printable for this application.
+ *
+ * Each byte is handled as an unsigned char: a plain char may be signed,
+ * in which case a byte with the high bit set would be sign-extended when
+ * passed to fprintf() and printed as an 11-digit octal number rather
+ * than the documented 3 digits.
+ */
+static void
+stream_bytes(FILE *fp, const char *string)
+{
+       unsigned char c;
+
+       while ((c = (unsigned char)*string++) != '\0') {
+               if (c > ' ' && c != '\\' && c < 0177)
+                       (void) fprintf(fp, "%c", c);
+               else
+                       (void) fprintf(fp, "\\%03o", c);
+       }
+}
+
+/*
+ * Write a one-character classification of a file type to fp.  Only the
+ * S_IFMT bits of 'what' are examined; an unrecognized type prints '?'.
+ */
+static void
+print_what(FILE *fp, mode_t what)
+{
+       mode_t type = what & S_IFMT;
+       char symbol = '?';
+
+       if (type == S_IFBLK)
+               symbol = 'B';
+       else if (type == S_IFCHR)
+               symbol = 'C';
+       else if (type == S_IFDIR)
+               symbol = '/';
+       else if (type == S_IFDOOR)
+               symbol = '>';
+       else if (type == S_IFIFO)
+               symbol = '|';
+       else if (type == S_IFLNK)
+               symbol = '@';
+       else if (type == S_IFPORT)
+               symbol = 'P';
+       else if (type == S_IFSOCK)
+               symbol = '=';
+       else if (type == S_IFREG)
+               symbol = 'F';
+
+       (void) fprintf(fp, "%c", symbol);
+}
+
+/*
+ * Write the dataset mountpoint (di->dsmnt) followed by 'file', each passed
+ * through stream_bytes().
+ */
+static void
+print_cmn(FILE *fp, differ_info_t *di, const char *file)
+{
+       const char *pieces[2];
+       int i;
+
+       pieces[0] = di->dsmnt;
+       pieces[1] = file;
+       for (i = 0; i < 2; i++)
+               stream_bytes(fp, pieces[i]);
+}
+
+/*
+ * Write one "renamed" line: the optional zs_ctime stamp, the ZDIFF_RENAMED
+ * marker, the optional file-type symbol, then the old and new paths.  The
+ * two paths are separated by a tab when di->scripted is set and by " -> "
+ * otherwise.
+ */
+static void
+print_rename(FILE *fp, differ_info_t *di, const char *old, const char *new,
+    zfs_stat_t *isb)
+{
+       const char *sep = di->scripted ? "\t" : " -> ";
+
+       if (di->timestamped) {
+               longlong_t whole = (longlong_t)isb->zs_ctime[0];
+               longlong_t frac = (longlong_t)isb->zs_ctime[1];
+
+               (void) fprintf(fp, "%10lld.%09lld\t", whole, frac);
+       }
+
+       (void) fprintf(fp, "%c\t", ZDIFF_RENAMED);
+
+       if (di->classify) {
+               print_what(fp, isb->zs_mode);
+               (void) fprintf(fp, "\t");
+       }
+
+       print_cmn(fp, di, old);
+       (void) fprintf(fp, "%s", sep);
+       print_cmn(fp, di, new);
+       (void) fprintf(fp, "\n");
+}
+
+/*
+ * Write one "modified" line for a link-count change: the optional zs_ctime
+ * stamp, the ZDIFF_MODIFIED marker, the optional file-type symbol, the
+ * path, and the signed delta in parentheses.
+ */
+static void
+print_link_change(FILE *fp, differ_info_t *di, int delta, const char *file,
+    zfs_stat_t *isb)
+{
+       if (di->timestamped) {
+               longlong_t whole = (longlong_t)isb->zs_ctime[0];
+               longlong_t frac = (longlong_t)isb->zs_ctime[1];
+
+               (void) fprintf(fp, "%10lld.%09lld\t", whole, frac);
+       }
+
+       (void) fprintf(fp, "%c\t", ZDIFF_MODIFIED);
+
+       if (di->classify) {
+               print_what(fp, isb->zs_mode);
+               (void) fprintf(fp, "\t");
+       }
+
+       print_cmn(fp, di, file);
+       (void) fprintf(fp, "\t(%+d)", delta);
+       (void) fprintf(fp, "\n");
+}
+
+/*
+ * Write one diff line for a single path: the optional zs_ctime stamp, the
+ * change marker given in 'type', the optional file-type symbol, and the
+ * path.
+ */
+static void
+print_file(FILE *fp, differ_info_t *di, char type, const char *file,
+    zfs_stat_t *isb)
+{
+       if (di->timestamped) {
+               longlong_t whole = (longlong_t)isb->zs_ctime[0];
+               longlong_t frac = (longlong_t)isb->zs_ctime[1];
+
+               (void) fprintf(fp, "%10lld.%09lld\t", whole, frac);
+       }
+
+       (void) fprintf(fp, "%c\t", type);
+
+       if (di->classify) {
+               print_what(fp, isb->zs_mode);
+               (void) fprintf(fp, "\t");
+       }
+
+       print_cmn(fp, di, file);
+       (void) fprintf(fp, "\n");
+}
+
+static int     /* 0 when handled or skipped; -1 with di->zerr/errbuf set */
+write_inuse_diffs_one(FILE *fp, differ_info_t *di, uint64_t dobj)
+{
+       struct zfs_stat fsb, tsb;
+       boolean_t same_name;
+       mode_t fmode, tmode;
+       char fobjname[MAXPATHLEN], tobjname[MAXPATHLEN];
+       int fobjerr, tobjerr;
+       int change;     /* link-count delta to report; 0 means none */
+
+       if (dobj == di->shares) /* the .zfs/shares object is never reported */
+               return (0);
+
+       /*
+        * Look up object dobj in both snapshots and print the line(s)
+        * describing how it differs between them.
+        *
+        * Check the from and to snapshots for info on the object. If
+        * we get ENOENT, then the object just didn't exist in that
+        * snapshot.  If we get ENOTSUP, then we tried to get
+        * info on a non-ZPL object, which we don't care about anyway.
+        */
+       fobjerr = get_stats_for_obj(di, di->fromsnap, dobj, fobjname,
+           MAXPATHLEN, &fsb);
+       if (fobjerr && di->zerr != ENOENT && di->zerr != ENOTSUP)
+               return (-1);
+
+       tobjerr = get_stats_for_obj(di, di->tosnap, dobj, tobjname,
+           MAXPATHLEN, &tsb);
+       if (tobjerr && di->zerr != ENOENT && di->zerr != ENOTSUP)
+               return (-1);
+
+       /*
+        * Unallocated object sharing the same meta dnode block
+        */
+       if (fobjerr && tobjerr) {
+               ASSERT(di->zerr == ENOENT || di->zerr == ENOTSUP);
+               di->zerr = 0;
+               return (0);
+       }
+
+       di->zerr = 0; /* negate get_stats_for_obj() from side that failed */
+       fmode = fsb.zs_mode & S_IFMT;
+       tmode = tsb.zs_mode & S_IFMT;
+       /* no link delta for directories or when either side has 0 links */
+       if (fmode == S_IFDIR || tmode == S_IFDIR || fsb.zs_links == 0 ||
+           tsb.zs_links == 0)
+               change = 0;
+       else
+               change = tsb.zs_links - fsb.zs_links;
+
+       if (fobjerr) {  /* only the "to" lookup succeeded */
+               if (change) {
+                       print_link_change(fp, di, change, tobjname, &tsb);
+                       return (0);
+               }
+               print_file(fp, di, ZDIFF_ADDED, tobjname, &tsb);
+               return (0);
+       } else if (tobjerr) {   /* only the "from" lookup succeeded */
+               if (change) {
+                       print_link_change(fp, di, change, fobjname, &fsb);
+                       return (0);
+               }
+               print_file(fp, di, ZDIFF_REMOVED, fobjname, &fsb);
+               return (0);
+       }
+
+       /* both lookups succeeded: a changed file type counts as a new object */
+       if (fmode != tmode && fsb.zs_gen == tsb.zs_gen)
+               tsb.zs_gen++;   /* Force a generational difference */
+       same_name = do_name_cmp(fobjname, tobjname);
+
+       /* Simple modification or no change */
+       if (fsb.zs_gen == tsb.zs_gen) {
+               /* No apparent changes.  Could we assert !this?  */
+               if (fsb.zs_ctime[0] == tsb.zs_ctime[0] &&
+                   fsb.zs_ctime[1] == tsb.zs_ctime[1])
+                       return (0);
+               if (change) {
+                       /* links gained: show the from name; lost: the to name */
+                       print_link_change(fp, di, change,
+                           change > 0 ? fobjname : tobjname, &tsb);
+               } else if (same_name) {
+                       print_file(fp, di, ZDIFF_MODIFIED, fobjname, &tsb);
+               } else {
+                       print_rename(fp, di, fobjname, tobjname, &tsb);
+               }
+               return (0);
+       } else {
+               /* file re-created or object re-used */
+               print_file(fp, di, ZDIFF_REMOVED, fobjname, &fsb);
+               print_file(fp, di, ZDIFF_ADDED, tobjname, &tsb);
+               return (0);
+       }
+}
+
+/*
+ * Run write_inuse_diffs_one() on every object number in the inclusive
+ * range [dr->ddr_first, dr->ddr_last].  Stops at, and returns, the first
+ * non-zero result; returns 0 when the whole range was processed.
+ */
+static int
+write_inuse_diffs(FILE *fp, differ_info_t *di, dmu_diff_record_t *dr)
+{
+       uint64_t obj = dr->ddr_first;
+
+       while (obj <= dr->ddr_last) {
+               int rc = write_inuse_diffs_one(fp, di, obj);
+
+               if (rc != 0)
+                       return (rc);
+               obj++;
+       }
+       return (0);
+}
+
+/*
+ * Print a "removed" line for 'object' using its path in the from snapshot.
+ * namebuf (maxlen bytes) receives that path.
+ *
+ * Returns 0 when the line was printed or the failed lookup was forgiven,
+ * and -1 otherwise (di->zerr is left as set by get_stats_for_obj()).
+ */
+static int
+describe_free(FILE *fp, differ_info_t *di, uint64_t object, char *namebuf,
+    int maxlen)
+{
+       struct zfs_stat sb;
+       int rc;
+
+       rc = get_stats_for_obj(di, di->fromsnap, object, namebuf,
+           maxlen, &sb);
+       if (rc == 0) {
+               print_file(fp, di, ZDIFF_REMOVED, namebuf, &sb);
+               return (0);
+       }
+
+       /* Let it slide, if in the delete queue on from side */
+       if (di->zerr != ENOENT || sb.zs_links != 0)
+               return (-1);
+
+       di->zerr = 0;
+       return (0);
+}
+
+/*
+ * Handle one DDR_FREE record: walk the objects that ZFS_IOC_NEXT_OBJ
+ * reports for the from snapshot, starting just before dr->ddr_first and
+ * stopping once past dr->ddr_last, and describe each one as removed via
+ * describe_free().  The object recorded in di->shares is skipped.
+ *
+ * Returns 0 on success and -1 when di->zerr has been set, either here
+ * (with a message in di->errbuf) or by describe_free().
+ */
+static int
+write_free_diffs(FILE *fp, differ_info_t *di, dmu_diff_record_t *dr)
+{
+       zfs_cmd_t zc = { 0 };
+       libzfs_handle_t *lhdl = di->zhp->zfs_hdl;
+       char fobjname[MAXPATHLEN];
+
+       (void) strlcpy(zc.zc_name, di->fromsnap, sizeof (zc.zc_name));
+       zc.zc_obj = dr->ddr_first - 1;
+
+       ASSERT(di->zerr == 0);
+
+       while (zc.zc_obj < dr->ddr_last) {
+               int err;
+
+               err = ioctl(lhdl->libzfs_fd, ZFS_IOC_NEXT_OBJ, &zc);
+               if (err == 0) {
+                       if (zc.zc_obj == di->shares) {
+                               zc.zc_obj++;
+                               continue;
+                       }
+                       if (zc.zc_obj > dr->ddr_last) {
+                               break;
+                       }
+                       err = describe_free(fp, di, zc.zc_obj, fobjname,
+                           MAXPATHLEN);
+                       if (err)
+                               break;
+               } else if (errno == ESRCH) {
+                       /* ESRCH ends the walk without being an error */
+                       break;
+               } else {
+                       /*
+                        * Record errno before calling anything else;
+                        * dgettext() and snprintf() may overwrite it.
+                        */
+                       di->zerr = errno;
+                       (void) snprintf(di->errbuf, sizeof (di->errbuf),
+                           dgettext(TEXT_DOMAIN,
+                           "next allocated object (> %lld) find failure"),
+                           (longlong_t)zc.zc_obj);
+                       break;
+               }
+       }
+       if (di->zerr)
+               return (-1);
+       return (0);
+}
+
+/*
+ * Thread body: read dmu_diff_record_t records from di->datafd and write
+ * the corresponding diff lines to di->outputfd until end of file or an
+ * error.
+ *
+ * arg is the differ_info_t for this run.  Both di->datafd and the stream
+ * opened on di->outputfd are closed before returning.
+ *
+ * Returns (void *)0 on success and (void *)-1 on failure, in which case
+ * di->zerr and di->errbuf describe the problem.
+ */
+static void *
+differ(void *arg)
+{
+       differ_info_t *di = arg;
+       dmu_diff_record_t dr;
+       FILE *ofp;
+       int err = 0;
+
+       if ((ofp = fdopen(di->outputfd, "w")) == NULL) {
+               di->zerr = errno;
+               (void) strerror_r(errno, di->errbuf, sizeof (di->errbuf));
+               (void) close(di->datafd);
+               return ((void *)-1);
+       }
+
+       for (;;) {
+               char *cp = (char *)&dr;
+               int len = sizeof (dr);
+               int rv;
+
+               /* keep reading until one whole record has arrived */
+               do {
+                       rv = read(di->datafd, cp, len);
+                       if (rv > 0) {
+                               /* only advance on bytes actually read */
+                               cp += rv;
+                               len -= rv;
+                       }
+               } while (len > 0 && rv > 0);
+
+               if (rv < 0 || (rv == 0 && len != sizeof (dr))) {
+                       /* read error, or EOF in the middle of a record */
+                       di->zerr = EPIPE;
+                       break;
+               } else if (rv == 0) {
+                       /* end of file at a natural breaking point */
+                       break;
+               }
+
+               switch (dr.ddr_type) {
+               case DDR_FREE:
+                       err = write_free_diffs(ofp, di, &dr);
+                       break;
+               case DDR_INUSE:
+                       err = write_inuse_diffs(ofp, di, &dr);
+                       break;
+               default:
+                       di->zerr = EPIPE;
+                       break;
+               }
+
+               if (err || di->zerr)
+                       break;
+       }
+
+       (void) fclose(ofp);
+       (void) close(di->datafd);
+       if (err)
+               return ((void *)-1);
+       if (di->zerr) {
+               /*
+                * With err == 0 the only value di->zerr can hold here is
+                * the EPIPE stored by the loop above.
+                */
+               ASSERT(di->zerr == EPIPE);
+               (void) snprintf(di->errbuf, sizeof (di->errbuf),
+                   dgettext(TEXT_DOMAIN,
+                   "Internal error: bad data from diff IOCTL"));
+               return ((void *)-1);
+       }
+       return ((void *)0);
+}
+
+/*
+ * Record in di->shares the st_ino reported by stat64() for
+ * <di->dsmnt>/.zfs/shares/.
+ *
+ * Returns 0 on success; on a stat64() failure returns the result of
+ * zfs_error() after storing a message in di->errbuf.
+ */
+static int
+find_shares_object(differ_info_t *di)
+{
+       struct stat64 st = { 0 };
+       char sharespath[MAXPATHLEN];
+
+       (void) strlcpy(sharespath, di->dsmnt, MAXPATHLEN);
+       (void) strlcat(sharespath, ZDIFF_SHARESDIR, MAXPATHLEN);
+
+       if (stat64(sharespath, &st) == 0) {
+               di->shares = (uint64_t)st.st_ino;
+               return (0);
+       }
+
+       (void) snprintf(di->errbuf, sizeof (di->errbuf),
+           dgettext(TEXT_DOMAIN, "Cannot stat %s"), sharespath);
+       return (zfs_error(di->zhp->zfs_hdl, EZFS_DIFF, di->errbuf));
+}
+
+/*
+ * Ask for a temporary snapshot of di->ds, named from ZDIFF_PREFIX and the
+ * process id and tied to di->cleanupfd, via ZFS_IOC_TMP_SNAPSHOT.
+ *
+ * On success stores the short name in di->tmpsnap and the full
+ * "<ds>@<name>" in di->tosnap and returns 0.  On failure stores a message
+ * in di->errbuf and returns the zfs_error()/zfs_standard_error() result.
+ */
+static int
+make_temp_snapshot(differ_info_t *di)
+{
+       libzfs_handle_t *hdl = di->zhp->zfs_hdl;
+       zfs_cmd_t zc = { 0 };
+       int err;
+
+       (void) strlcpy(zc.zc_name, di->ds, sizeof (zc.zc_name));
+       (void) snprintf(zc.zc_value, sizeof (zc.zc_value),
+           ZDIFF_PREFIX, getpid());
+       zc.zc_cleanup_fd = di->cleanupfd;
+
+       if (ioctl(hdl->libzfs_fd, ZFS_IOC_TMP_SNAPSHOT, &zc) == 0) {
+               di->tmpsnap = zfs_strdup(hdl, zc.zc_value);
+               di->tosnap = zfs_asprintf(hdl, "%s@%s", di->ds, di->tmpsnap);
+               return (0);
+       }
+
+       err = errno;
+       if (err != EPERM) {
+               (void) snprintf(di->errbuf, sizeof (di->errbuf),
+                   dgettext(TEXT_DOMAIN, "Cannot create just-in-time "
+                   "snapshot of '%s'"), zc.zc_name);
+               return (zfs_standard_error(hdl, err, di->errbuf));
+       }
+
+       (void) snprintf(di->errbuf, sizeof (di->errbuf),
+           dgettext(TEXT_DOMAIN, "The diff delegated "
+           "permission is needed in order\nto create a "
+           "just-in-time snapshot for diffing\n"));
+       return (zfs_error(hdl, EZFS_DIFF, di->errbuf));
+}
+
+/*
+ * Release what the setup helpers stored in di: the allocated name and
+ * mountpoint strings, then the cleanup descriptor.
+ */
+static void
+teardown_differ_info(differ_info_t *di)
+{
+       /* dataset name and its mountpoint */
+       free(di->dsmnt);
+       free(di->ds);
+
+       /* "from" side */
+       free(di->frommnt);
+       free(di->fromsnap);
+
+       /* "to" side, including any temporary snapshot name */
+       free(di->tomnt);
+       free(di->tmpsnap);
+       free(di->tosnap);
+
+       (void) close(di->cleanupfd);
+}
+
+static int     /* 0, or a zfs_error()/make_temp_snapshot() result */
+get_snapshot_names(differ_info_t *di, const char *fromsnap,
+    const char *tosnap)
+{
+       libzfs_handle_t *hdl = di->zhp->zfs_hdl;
+       char *atptrf = NULL;    /* '@' within fromsnap, if any */
+       char *atptrt = NULL;    /* '@' within tosnap, if any */
+       int fdslen, fsnlen;     /* dataset / "@snap" lengths of fromsnap */
+       int tdslen, tsnlen;     /* dataset / "@snap" lengths of tosnap */
+
+       /*
+        * Work out di->ds, di->fromsnap and di->tosnap from the two
+        * user-supplied names.  When no "to" snapshot is named,
+        * make_temp_snapshot() is called to supply one.
+        *
+        * Can accept
+        *    dataset@snap1
+        *    dataset@snap1 dataset@snap2
+        *    dataset@snap1 @snap2
+        *    dataset@snap1 dataset
+        *    @snap1 dataset@snap2
+        */
+       if (tosnap == NULL) {
+               /* only a from snapshot given, must be valid */
+               (void) snprintf(di->errbuf, sizeof (di->errbuf),
+                   dgettext(TEXT_DOMAIN,
+                   "Badly formed snapshot name %s"), fromsnap);
+
+               if (!zfs_validate_name(hdl, fromsnap, ZFS_TYPE_SNAPSHOT,
+                   B_FALSE)) {
+                       return (zfs_error(hdl, EZFS_INVALIDNAME,
+                           di->errbuf));
+               }
+
+               atptrf = strchr(fromsnap, '@');
+               ASSERT(atptrf != NULL);
+               fdslen = atptrf - fromsnap;
+
+               di->fromsnap = zfs_strdup(hdl, fromsnap);
+               di->ds = zfs_strdup(hdl, fromsnap);
+               di->ds[fdslen] = '\0';  /* cut the copy at the '@' */
+
+               /* the to snap will be a just-in-time snap of the head */
+               return (make_temp_snapshot(di));
+       }
+
+       /* message used by the validation failure just below */
+       (void) snprintf(di->errbuf, sizeof (di->errbuf),
+           dgettext(TEXT_DOMAIN,
+           "Unable to determine which snapshots to compare"));
+
+       atptrf = strchr(fromsnap, '@');
+       atptrt = strchr(tosnap, '@');
+       fdslen = atptrf ? atptrf - fromsnap : strlen(fromsnap);
+       tdslen = atptrt ? atptrt - tosnap : strlen(tosnap);
+       fsnlen = strlen(fromsnap) - fdslen;     /* includes @ sign */
+       tsnlen = strlen(tosnap) - tdslen;       /* includes @ sign */
+
+       /*
+        * Reject: a from name with no snapshot part, a to name ending in a
+        * bare '@', or neither name carrying a dataset.  The last test is
+        * already implied by the first one.
+        */
+       if (fsnlen <= 1 || tsnlen == 1 || (fdslen == 0 && tdslen == 0) ||
+           (fsnlen == 0 && tsnlen == 0)) {
+               return (zfs_error(hdl, EZFS_INVALIDNAME, di->errbuf));
+       } else if ((fdslen > 0 && tdslen > 0) &&
+           ((tdslen != fdslen || strncmp(fromsnap, tosnap, fdslen) != 0))) {
+               /*
+                * not the same dataset name, might be okay if
+                * tosnap is a clone of a fromsnap descendant.
+                */
+               char origin[ZFS_MAXNAMELEN];
+               zprop_source_t src;
+               zfs_handle_t *zhp;
+
+               di->ds = zfs_alloc(di->zhp->zfs_hdl, tdslen + 1);
+               (void) strncpy(di->ds, tosnap, tdslen);
+               di->ds[tdslen] = '\0';
+
+               /*
+                * Follow the origin property until it matches fromsnap or a
+                * dataset cannot be opened.
+                * NOTE(review): the comparison length is fsnlen, the length
+                * of the "@snap" suffix rather than of fromsnap - confirm
+                * that this is the intended length.
+                */
+               zhp = zfs_open(hdl, di->ds, ZFS_TYPE_FILESYSTEM);
+               while (zhp != NULL) {
+                       (void) zfs_prop_get(zhp, ZFS_PROP_ORIGIN,
+                           origin, sizeof (origin), &src, NULL, 0, B_FALSE);
+
+                       if (strncmp(origin, fromsnap, fsnlen) == 0)
+                               break;
+
+                       (void) zfs_close(zhp);
+                       zhp = zfs_open(hdl, origin, ZFS_TYPE_FILESYSTEM);
+               }
+
+               if (zhp == NULL) {
+                       (void) snprintf(di->errbuf, sizeof (di->errbuf),
+                           dgettext(TEXT_DOMAIN,
+                           "Not an earlier snapshot from the same fs"));
+                       return (zfs_error(hdl, EZFS_INVALIDNAME, di->errbuf));
+               } else {
+                       (void) zfs_close(zhp);
+               }
+
+               di->isclone = B_TRUE;
+               di->fromsnap = zfs_strdup(hdl, fromsnap);
+               if (tsnlen) {
+                       di->tosnap = zfs_strdup(hdl, tosnap);
+               } else {
+                       return (make_temp_snapshot(di));
+               }
+       } else {
+               /* one dataset: take its name from whichever side gave it */
+               int dslen = fdslen ? fdslen : tdslen;
+
+               di->ds = zfs_alloc(hdl, dslen + 1);
+               (void) strncpy(di->ds, fdslen ? fromsnap : tosnap, dslen);
+               di->ds[dslen] = '\0';
+
+               di->fromsnap = zfs_asprintf(hdl, "%s%s", di->ds, atptrf);
+               if (tsnlen) {
+                       di->tosnap = zfs_asprintf(hdl, "%s%s", di->ds, atptrt);
+               } else {
+                       return (make_temp_snapshot(di));
+               }
+       }
+       return (0);
+}
+
+/*
+ * Look up the mountpoint of dataset dsnm through is_mounted(), which
+ * stores it in *mntpt.  A mountpoint of exactly "/" is rewritten to the
+ * empty string.
+ *
+ * Returns 0 on success; when the dataset is not mounted, stores a message
+ * in di->errbuf and returns the zfs_error() result.
+ */
+static int
+get_mountpoint(differ_info_t *di, char *dsnm, char **mntpt)
+{
+       libzfs_handle_t *hdl = di->zhp->zfs_hdl;
+       char *mp;
+
+       if (!is_mounted(hdl, dsnm, mntpt)) {
+               (void) snprintf(di->errbuf, sizeof (di->errbuf),
+                   dgettext(TEXT_DOMAIN,
+                   "Cannot diff an unmounted snapshot"));
+               return (zfs_error(hdl, EZFS_BADTYPE, di->errbuf));
+       }
+
+       /* Avoid a double slash at the beginning of root-mounted datasets */
+       mp = *mntpt;
+       if (mp[0] == '/' && mp[1] == '\0')
+               mp[0] = '\0';
+       return (0);
+}
+
+/*
+ * Fill in di->dsmnt, di->tomnt and di->frommnt.
+ *
+ * di->tomnt is built under the mountpoint of di->ds.  di->frommnt is built
+ * under that same mountpoint unless di->isclone is set, in which case the
+ * mountpoint of the dataset named in di->fromsnap is used instead.
+ *
+ * Returns 0 on success and -1 when a mountpoint lookup fails.
+ */
+static int
+get_mountpoints(differ_info_t *di)
+{
+       libzfs_handle_t *hdl = di->zhp->zfs_hdl;
+       char *toat, *fromat;
+       char *clonemnt = NULL;
+       int err;
+
+       /*
+        * first get the mountpoint for the parent dataset
+        */
+       if (get_mountpoint(di, di->ds, &di->dsmnt) != 0)
+               return (-1);
+
+       toat = strchr(di->tosnap, '@');
+       ASSERT3P(toat, !=, NULL);
+       di->tomnt = zfs_asprintf(hdl, "%s%s%s", di->dsmnt,
+           ZDIFF_SNAPDIR, toat + 1);
+
+       fromat = strchr(di->fromsnap, '@');
+       ASSERT3P(fromat, !=, NULL);
+
+       if (!di->isclone) {
+               di->frommnt = zfs_asprintf(hdl, "%s%s%s", di->dsmnt,
+                   ZDIFF_SNAPDIR, fromat + 1);
+               return (0);
+       }
+
+       /* cut fromsnap at the '@' just long enough to look up its dataset */
+       *fromat = '\0';
+       err = get_mountpoint(di, di->fromsnap, &clonemnt);
+       *fromat = '@';
+       if (err != 0)
+               return (-1);
+
+       di->frommnt = zfs_asprintf(hdl, "%s%s%s", clonemnt,
+           ZDIFF_SNAPDIR, fromat + 1);
+       free(clonemnt);
+
+       return (0);
+}
+
+/*
+ * Prepare di for a diff run: remember zhp, open ZFS_DEV as the cleanup
+ * descriptor (VERIFY'd to succeed), then resolve the snapshot names, the
+ * mountpoints and the shares object, in that order.
+ *
+ * Returns 0 on success and -1 as soon as one of those steps fails.
+ */
+static int
+setup_differ_info(zfs_handle_t *zhp, const char *fromsnap,
+    const char *tosnap, differ_info_t *di)
+{
+       di->zhp = zhp;
+       di->cleanupfd = open(ZFS_DEV, O_RDWR|O_EXCL);
+       VERIFY(di->cleanupfd >= 0);
+
+       /* || short-circuits, so a failing step skips the later ones */
+       if (get_snapshot_names(di, fromsnap, tosnap) != 0 ||
+           get_mountpoints(di) != 0 ||
+           find_shares_object(di) != 0)
+               return (-1);
+
+       return (0);
+}
+
+/*
+ * Write to outfd a description of the differences between two snapshots.
+ *
+ * zhp:      dataset handle; its libzfs handle is used for ioctls and
+ *           error reporting.
+ * outfd:    descriptor that receives the diff text.  The differ thread
+ *           wraps it in a stdio stream and closes it when done.
+ * fromsnap: earlier snapshot (see get_snapshot_names() for the forms
+ *           accepted).
+ * tosnap:   later snapshot or dataset, or NULL to compare against a
+ *           temporary snapshot.
+ * flags:    ZFS_DIFF_PARSEABLE, ZFS_DIFF_CLASSIFY and ZFS_DIFF_TIMESTAMP
+ *           select the output format.
+ *
+ * A differ() thread reads records from a pipe while this thread runs
+ * ZFS_IOC_DIFF with the write end of that pipe in zc_cookie.
+ *
+ * Returns 0 on success.  On failure returns -1 or the result of
+ * zfs_error(), and releases everything held in the differ state.
+ */
+int
+zfs_show_diffs(zfs_handle_t *zhp, int outfd, const char *fromsnap,
+    const char *tosnap, int flags)
+{
+       zfs_cmd_t zc = { 0 };
+       char errbuf[1024];
+       differ_info_t di = { 0 };
+       pthread_t tid;
+       int pipefd[2];
+       int iocerr;
+       int err;
+
+       (void) snprintf(errbuf, sizeof (errbuf),
+           dgettext(TEXT_DOMAIN, "zfs diff failed"));
+
+       if (setup_differ_info(zhp, fromsnap, tosnap, &di)) {
+               teardown_differ_info(&di);
+               return (-1);
+       }
+
+       if (pipe(pipefd)) {
+               zfs_error_aux(zhp->zfs_hdl, strerror(errno));
+               teardown_differ_info(&di);
+               return (zfs_error(zhp->zfs_hdl, EZFS_PIPEFAILED, errbuf));
+       }
+
+       di.scripted = (flags & ZFS_DIFF_PARSEABLE);
+       di.classify = (flags & ZFS_DIFF_CLASSIFY);
+       di.timestamped = (flags & ZFS_DIFF_TIMESTAMP);
+
+       di.outputfd = outfd;
+       di.datafd = pipefd[0];
+
+       /* pthread_create() returns the error number; errno is not set */
+       if ((err = pthread_create(&tid, NULL, differ, &di)) != 0) {
+               zfs_error_aux(zhp->zfs_hdl, strerror(err));
+               (void) close(pipefd[0]);
+               (void) close(pipefd[1]);
+               teardown_differ_info(&di);
+               return (zfs_error(zhp->zfs_hdl,
+                   EZFS_THREADCREATEFAILED, errbuf));
+       }
+
+       /* do the ioctl(); bound each copy by its destination buffer */
+       (void) strlcpy(zc.zc_value, di.fromsnap, sizeof (zc.zc_value));
+       (void) strlcpy(zc.zc_name, di.tosnap, sizeof (zc.zc_name));
+       zc.zc_cookie = pipefd[1];
+
+       iocerr = ioctl(zhp->zfs_hdl->libzfs_fd, ZFS_IOC_DIFF, &zc);
+       if (iocerr != 0) {
+               /* save errno before any other library call can change it */
+               err = errno;
+
+               (void) snprintf(errbuf, sizeof (errbuf),
+                   dgettext(TEXT_DOMAIN, "Unable to obtain diffs"));
+               if (err == EPERM) {
+                       zfs_error_aux(zhp->zfs_hdl, dgettext(TEXT_DOMAIN,
+                           "\n   The sys_mount privilege or diff delegated "
+                           "permission is needed\n   to execute the "
+                           "diff ioctl"));
+               } else if (err == EXDEV) {
+                       zfs_error_aux(zhp->zfs_hdl, dgettext(TEXT_DOMAIN,
+                           "\n   Not an earlier snapshot from the same fs"));
+               } else if (err != EPIPE || di.zerr == 0) {
+                       zfs_error_aux(zhp->zfs_hdl, strerror(err));
+               }
+               (void) close(pipefd[1]);
+               (void) pthread_cancel(tid);
+               (void) pthread_join(tid, NULL);
+               /* di.errbuf and di.zerr live in di itself and stay valid */
+               teardown_differ_info(&di);
+               if (di.zerr != 0 && di.zerr != EPIPE) {
+                       zfs_error_aux(zhp->zfs_hdl, strerror(di.zerr));
+                       return (zfs_error(zhp->zfs_hdl, EZFS_DIFF, di.errbuf));
+               } else {
+                       return (zfs_error(zhp->zfs_hdl, EZFS_DIFFDATA, errbuf));
+               }
+       }
+
+       /* closing the write end lets the differ thread see end of file */
+       (void) close(pipefd[1]);
+       (void) pthread_join(tid, NULL);
+
+       if (di.zerr != 0) {
+               zfs_error_aux(zhp->zfs_hdl, strerror(di.zerr));
+               /* release the differ state on this error path as well */
+               teardown_differ_info(&di);
+               return (zfs_error(zhp->zfs_hdl, EZFS_DIFF, di.errbuf));
+       }
+       teardown_differ_info(&di);
+       return (0);
+}
index fd3044b1da333b261844210abca80c6c80702a09..e1370350fd52d7026dcb1988eb3d3f7d07192523 100644 (file)
@@ -19,8 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 
 /*
@@ -1559,6 +1558,17 @@ zpool_in_use(libzfs_handle_t *hdl, int fd, pool_state_t *state, char **namestr,
 
        switch (stateval) {
        case POOL_STATE_EXPORTED:
+               /*
+                * A pool with an exported state may in fact be imported
+                * read-only, so check the in-core state to see if it's
+                * active and imported read-only.  If it is, set
+                * its state to active.
+                */
+               if (pool_active(hdl, name, guid, &isactive) == 0 && isactive &&
+                   (zhp = zpool_open_canfail(hdl, name)) != NULL &&
+                   zpool_get_prop_int(zhp, ZPOOL_PROP_READONLY, NULL))
+                       stateval = POOL_STATE_ACTIVE;
+
                ret = B_TRUE;
                break;
 
index 0675ec229790de630c9d21eae59d9361809ab16f..922220620464b692d311a32a665afedeff2185a3 100644 (file)
@@ -270,6 +270,12 @@ zfs_mount(zfs_handle_t *zhp, const char *options, int flags)
        else
                (void) strlcpy(mntopts, options, sizeof (mntopts));
 
+       /*
+        * If the pool is imported read-only then all mounts must be read-only
+        */
+       if (zpool_get_prop_int(zhp->zpool_hdl, ZPOOL_PROP_READONLY, NULL))
+               flags |= MS_RDONLY;
+
        if (!zfs_is_mountable(zhp, mountpoint, sizeof (mountpoint), NULL))
                return (0);
 
@@ -437,18 +443,14 @@ zfs_is_shared(zfs_handle_t *zhp)
 int
 zfs_share(zfs_handle_t *zhp)
 {
-       if (ZFS_IS_VOLUME(zhp))
-               return (0);
-
+       assert(!ZFS_IS_VOLUME(zhp));
        return (zfs_share_proto(zhp, share_all_proto));
 }
 
 int
 zfs_unshare(zfs_handle_t *zhp)
 {
-       if (ZFS_IS_VOLUME(zhp))
-               return (0);
-
+       assert(!ZFS_IS_VOLUME(zhp));
        return (zfs_unshareall(zhp));
 }
 
@@ -979,18 +981,29 @@ remove_mountpoint(zfs_handle_t *zhp)
        }
 }
 
-typedef struct mount_cbdata {
-       zfs_handle_t    **cb_datasets;
-       int             cb_used;
-       int             cb_alloc;
-} mount_cbdata_t;
+void   /* append zhp to cbp->cb_handles, growing the array when it is full */
+libzfs_add_handle(get_all_cb_t *cbp, zfs_handle_t *zhp)
+{
+       if (cbp->cb_used == cbp->cb_alloc) {
+               /* start at 64 slots, then double on each growth */
+               size_t grown = (cbp->cb_alloc == 0) ? 64 : cbp->cb_alloc * 2;
+               size_t oldbytes = cbp->cb_alloc * sizeof (void *);
+
+               cbp->cb_handles = zfs_realloc(zhp->zfs_hdl, cbp->cb_handles,
+                   oldbytes, grown * sizeof (void *));
+               cbp->cb_alloc = grown;
+       }
+
+       cbp->cb_handles[cbp->cb_used] = zhp;
+       cbp->cb_used++;
+}
 
 static int
 mount_cb(zfs_handle_t *zhp, void *data)
 {
-       mount_cbdata_t *cbp = data;
+       get_all_cb_t *cbp = data;
 
-       if (!(zfs_get_type(zhp) & (ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME))) {
+       if (!(zfs_get_type(zhp) & ZFS_TYPE_FILESYSTEM)) {
                zfs_close(zhp);
                return (0);
        }
@@ -1000,25 +1013,16 @@ mount_cb(zfs_handle_t *zhp, void *data)
                return (0);
        }
 
-       if (cbp->cb_alloc == cbp->cb_used) {
-               void *ptr;
-
-               if ((ptr = zfs_realloc(zhp->zfs_hdl,
-                   cbp->cb_datasets, cbp->cb_alloc * sizeof (void *),
-                   cbp->cb_alloc * 2 * sizeof (void *))) == NULL)
-                       return (-1);
-               cbp->cb_datasets = ptr;
-
-               cbp->cb_alloc *= 2;
+       libzfs_add_handle(cbp, zhp);
+       if (zfs_iter_filesystems(zhp, mount_cb, cbp) != 0) {
+               zfs_close(zhp);
+               return (-1);
        }
-
-       cbp->cb_datasets[cbp->cb_used++] = zhp;
-
-       return (zfs_iter_filesystems(zhp, mount_cb, cbp));
+       return (0);
 }
 
-static int
-dataset_cmp(const void *a, const void *b)
+int
+libzfs_dataset_cmp(const void *a, const void *b)
 {
        zfs_handle_t **za = (zfs_handle_t **)a;
        zfs_handle_t **zb = (zfs_handle_t **)b;
@@ -1056,7 +1060,7 @@ dataset_cmp(const void *a, const void *b)
 int
 zpool_enable_datasets(zpool_handle_t *zhp, const char *mntopts, int flags)
 {
-       mount_cbdata_t cb = { 0 };
+       get_all_cb_t cb = { 0 };
        libzfs_handle_t *hdl = zhp->zpool_hdl;
        zfs_handle_t *zfsp;
        int i, ret = -1;
@@ -1065,23 +1069,17 @@ zpool_enable_datasets(zpool_handle_t *zhp, const char *mntopts, int flags)
        /*
         * Gather all non-snap datasets within the pool.
         */
-       if ((cb.cb_datasets = zfs_alloc(hdl, 4 * sizeof (void *))) == NULL)
-               return (-1);
-       cb.cb_alloc = 4;
-
        if ((zfsp = zfs_open(hdl, zhp->zpool_name, ZFS_TYPE_DATASET)) == NULL)
                goto out;
 
-       cb.cb_datasets[0] = zfsp;
-       cb.cb_used = 1;
-
+       libzfs_add_handle(&cb, zfsp);
        if (zfs_iter_filesystems(zfsp, mount_cb, &cb) != 0)
                goto out;
-
        /*
         * Sort the datasets by mountpoint.
         */
-       qsort(cb.cb_datasets, cb.cb_used, sizeof (void *), dataset_cmp);
+       qsort(cb.cb_handles, cb.cb_used, sizeof (void *),
+           libzfs_dataset_cmp);
 
        /*
         * And mount all the datasets, keeping track of which ones
@@ -1093,7 +1091,7 @@ zpool_enable_datasets(zpool_handle_t *zhp, const char *mntopts, int flags)
 
        ret = 0;
        for (i = 0; i < cb.cb_used; i++) {
-               if (zfs_mount(cb.cb_datasets[i], mntopts, flags) != 0)
+               if (zfs_mount(cb.cb_handles[i], mntopts, flags) != 0)
                        ret = -1;
                else
                        good[i] = 1;
@@ -1106,7 +1104,7 @@ zpool_enable_datasets(zpool_handle_t *zhp, const char *mntopts, int flags)
         * zfs_alloc is supposed to exit if memory isn't available.
         */
        for (i = 0; i < cb.cb_used; i++) {
-               if (good[i] && zfs_share(cb.cb_datasets[i]) != 0)
+               if (good[i] && zfs_share(cb.cb_handles[i]) != 0)
                        ret = -1;
        }
 
@@ -1114,8 +1112,8 @@ zpool_enable_datasets(zpool_handle_t *zhp, const char *mntopts, int flags)
 
 out:
        for (i = 0; i < cb.cb_used; i++)
-               zfs_close(cb.cb_datasets[i]);
-       free(cb.cb_datasets);
+               zfs_close(cb.cb_handles[i]);
+       free(cb.cb_handles);
 
        return (ret);
 }
index 7836e587354c0526ab6e4ac698dc087f8086702e..7df7e910ddc505669128ada42c6eee6544eb1e3e 100644 (file)
 
 static int read_efi_label(nvlist_t *config, diskaddr_t *sb);
 
-#if defined(__i386) || defined(__amd64)
-#define        BOOTCMD "installgrub(1M)"
-#else
-#define        BOOTCMD "installboot(1M)"
-#endif
-
 #define        DISK_ROOT       "/dev/dsk"
 #define        RDISK_ROOT      "/dev/rdsk"
 #define        BACKUP_SLICE    "s2"
 
+/*
+ * Flags telling zpool_valid_proplist() in which context a property list
+ * is being validated.  A zero-initialized value means neither creation
+ * nor import.
+ *
+ * The fields are declared unsigned on purpose: a signed one-bit field can
+ * only represent 0 and -1, and whether a plain 'int' bit-field is signed
+ * is implementation-defined, so storing B_TRUE in one would not reliably
+ * read back as 1.
+ */
+typedef struct prop_flags {
+       unsigned int create:1;  /* Validate property on creation */
+       unsigned int import:1;  /* Validate property on import */
+} prop_flags_t;
+
 /*
  * ====================================================================
  *   zpool property functions
@@ -376,7 +375,7 @@ pool_is_bootable(zpool_handle_t *zhp)
  */
 static nvlist_t *
 zpool_valid_proplist(libzfs_handle_t *hdl, const char *poolname,
-    nvlist_t *props, uint64_t version, boolean_t create_or_import, char *errbuf)
+    nvlist_t *props, uint64_t version, prop_flags_t flags, char *errbuf)
 {
        nvpair_t *elem;
        nvlist_t *retprops;
@@ -433,7 +432,7 @@ zpool_valid_proplist(libzfs_handle_t *hdl, const char *poolname,
                        break;
 
                case ZPOOL_PROP_BOOTFS:
-                       if (create_or_import) {
+                       if (flags.create || flags.import) {
                                zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
                                    "property '%s' cannot be set at creation "
                                    "or import time"), propname);
@@ -486,7 +485,7 @@ zpool_valid_proplist(libzfs_handle_t *hdl, const char *poolname,
                        break;
 
                case ZPOOL_PROP_ALTROOT:
-                       if (!create_or_import) {
+                       if (!flags.create && !flags.import) {
                                zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
                                    "property '%s' can only be set during pool "
                                    "creation or import"), propname);
@@ -541,6 +540,16 @@ zpool_valid_proplist(libzfs_handle_t *hdl, const char *poolname,
 
                        *slash = '/';
                        break;
+
+               case ZPOOL_PROP_READONLY:
+                       if (!flags.import) {
+                               zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
+                                   "property '%s' can only be set at "
+                                   "import time"), propname);
+                               (void) zfs_error(hdl, EZFS_BADPROP, errbuf);
+                               goto error;
+                       }
+                       break;
                }
        }
 
@@ -562,6 +571,7 @@ zpool_set_prop(zpool_handle_t *zhp, const char *propname, const char *propval)
        nvlist_t *nvl = NULL;
        nvlist_t *realprops;
        uint64_t version;
+       prop_flags_t flags = { 0 };
 
        (void) snprintf(errbuf, sizeof (errbuf),
            dgettext(TEXT_DOMAIN, "cannot set property for '%s'"),
@@ -577,7 +587,7 @@ zpool_set_prop(zpool_handle_t *zhp, const char *propname, const char *propval)
 
        version = zpool_get_prop_int(zhp, ZPOOL_PROP_VERSION, NULL);
        if ((realprops = zpool_valid_proplist(zhp->zpool_hdl,
-           zhp->zpool_name, nvl, version, B_FALSE, errbuf)) == NULL) {
+           zhp->zpool_name, nvl, version, flags, errbuf)) == NULL) {
                nvlist_free(nvl);
                return (-1);
        }
@@ -884,8 +894,10 @@ zpool_create(libzfs_handle_t *hdl, const char *pool, nvlist_t *nvroot,
                return (-1);
 
        if (props) {
+               prop_flags_t flags = { .create = B_TRUE, .import = B_FALSE };
+
                if ((zc_props = zpool_valid_proplist(hdl, pool, props,
-                   SPA_VERSION_1, B_TRUE, msg)) == NULL) {
+                   SPA_VERSION_1, flags, msg)) == NULL) {
                        goto create_failed;
                }
        }
@@ -1003,13 +1015,12 @@ zpool_destroy(zpool_handle_t *zhp)
        char msg[1024];
 
        if (zhp->zpool_state == POOL_STATE_ACTIVE &&
-           (zfp = zfs_open(zhp->zpool_hdl, zhp->zpool_name,
-           ZFS_TYPE_FILESYSTEM)) == NULL)
+           (zfp = zfs_open(hdl, zhp->zpool_name, ZFS_TYPE_FILESYSTEM)) == NULL)
                return (-1);
 
        (void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name));
 
-       if (zfs_ioctl(zhp->zpool_hdl, ZFS_IOC_POOL_DESTROY, &zc) != 0) {
+       if (zfs_ioctl(hdl, ZFS_IOC_POOL_DESTROY, &zc) != 0) {
                (void) snprintf(msg, sizeof (msg), dgettext(TEXT_DOMAIN,
                    "cannot destroy '%s'"), zhp->zpool_name);
 
@@ -1092,7 +1103,7 @@ zpool_add(zpool_handle_t *zhp, nvlist_t *nvroot)
                return (-1);
        (void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name));
 
-       if (zfs_ioctl(zhp->zpool_hdl, ZFS_IOC_VDEV_ADD, &zc) != 0) {
+       if (zfs_ioctl(hdl, ZFS_IOC_VDEV_ADD, &zc) != 0) {
                switch (errno) {
                case EBUSY:
                        /*
@@ -1208,19 +1219,23 @@ zpool_export_force(zpool_handle_t *zhp)
 
 static void
 zpool_rewind_exclaim(libzfs_handle_t *hdl, const char *name, boolean_t dryrun,
-    nvlist_t *rbi)
+    nvlist_t *config)
 {
+       nvlist_t *nv = NULL;
        uint64_t rewindto;
        int64_t loss = -1;
        struct tm t;
        char timestr[128];
 
-       if (!hdl->libzfs_printerr || rbi == NULL)
+       if (!hdl->libzfs_printerr || config == NULL)
                return;
 
-       if (nvlist_lookup_uint64(rbi, ZPOOL_CONFIG_LOAD_TIME, &rewindto) != 0)
+       if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_LOAD_INFO, &nv) != 0)
                return;
-       (void) nvlist_lookup_int64(rbi, ZPOOL_CONFIG_REWIND_TIME, &loss);
+
+       if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_LOAD_TIME, &rewindto) != 0)
+               return;
+       (void) nvlist_lookup_int64(nv, ZPOOL_CONFIG_REWIND_TIME, &loss);
 
        if (localtime_r((time_t *)&rewindto, &t) != NULL &&
            strftime(timestr, 128, 0, &t) != 0) {
@@ -1255,6 +1270,7 @@ void
 zpool_explain_recover(libzfs_handle_t *hdl, const char *name, int reason,
     nvlist_t *config)
 {
+       nvlist_t *nv = NULL;
        int64_t loss = -1;
        uint64_t edata = UINT64_MAX;
        uint64_t rewindto;
@@ -1270,12 +1286,12 @@ zpool_explain_recover(libzfs_handle_t *hdl, const char *name, int reason,
                (void) printf(dgettext(TEXT_DOMAIN, "\t"));
 
        /* All attempted rewinds failed if ZPOOL_CONFIG_LOAD_TIME missing */
-       if (nvlist_lookup_uint64(config,
-           ZPOOL_CONFIG_LOAD_TIME, &rewindto) != 0)
+       if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_LOAD_INFO, &nv) != 0 ||
+           nvlist_lookup_uint64(nv, ZPOOL_CONFIG_LOAD_TIME, &rewindto) != 0)
                goto no_info;
 
-       (void) nvlist_lookup_int64(config, ZPOOL_CONFIG_REWIND_TIME, &loss);
-       (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_LOAD_DATA_ERRORS,
+       (void) nvlist_lookup_int64(nv, ZPOOL_CONFIG_REWIND_TIME, &loss);
+       (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_LOAD_DATA_ERRORS,
            &edata);
 
        (void) printf(dgettext(TEXT_DOMAIN,
@@ -1359,12 +1375,40 @@ zpool_import(libzfs_handle_t *hdl, nvlist_t *config, const char *newname,
                }
        }
 
-       ret = zpool_import_props(hdl, config, newname, props, B_FALSE);
+       ret = zpool_import_props(hdl, config, newname, props,
+           ZFS_IMPORT_NORMAL);
        if (props)
                nvlist_free(props);
        return (ret);
 }
 
+/*
+ * Print the vdev tree rooted at 'nv' to stdout, one vdev per line.
+ *
+ * When 'name' is non-NULL the node itself is printed: a tab, 'indent'
+ * columns of padding, the name, and a " [log]" suffix if the node's
+ * ZPOOL_CONFIG_IS_LOG value is non-zero.  Every entry of the node's
+ * ZPOOL_CONFIG_CHILDREN array is then printed recursively, indented two
+ * further columns, under the name returned by zpool_vdev_name().
+ */
+static void
+print_vdev_tree(libzfs_handle_t *hdl, const char *name, nvlist_t *nv,
+    int indent)
+{
+       nvlist_t **kids;
+       uint_t nkids, i;
+       uint64_t islog = 0;
+
+       /* The lookup result is ignored; islog was initialized to 0. */
+       (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_IS_LOG, &islog);
+
+       if (name != NULL) {
+               const char *suffix = islog ? " [log]" : "";
+
+               (void) printf("\t%*s%s%s\n", indent, "", name, suffix);
+       }
+
+       /* Nothing more to print if the children array can't be found. */
+       if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
+           &kids, &nkids) != 0)
+               return;
+
+       for (i = 0; i < nkids; i++) {
+               /* The name string is released with free() after use. */
+               char *kidname = zpool_vdev_name(hdl, NULL, kids[i], B_TRUE);
+
+               print_vdev_tree(hdl, kidname, kids[i], indent + 2);
+               free(kidname);
+       }
+}
+
 /*
  * Import the given pool using the known configuration and a list of
  * properties to be set. The configuration should have come from
@@ -1373,15 +1417,17 @@ zpool_import(libzfs_handle_t *hdl, nvlist_t *config, const char *newname,
  */
 int
 zpool_import_props(libzfs_handle_t *hdl, nvlist_t *config, const char *newname,
-    nvlist_t *props, boolean_t importfaulted)
+    nvlist_t *props, int flags)
 {
        zfs_cmd_t zc = { 0 };
        zpool_rewind_policy_t policy;
-       nvlist_t *nvi = NULL;
+       nvlist_t *nv = NULL;
+       nvlist_t *nvinfo = NULL;
+       nvlist_t *missing = NULL;
        char *thename;
        char *origname;
-       uint64_t returned_size;
        int ret;
+       int error = 0;
        char errbuf[1024];
 
        verify(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME,
@@ -1402,12 +1448,13 @@ zpool_import_props(libzfs_handle_t *hdl, nvlist_t *config, const char *newname,
 
        if (props) {
                uint64_t version;
+               prop_flags_t flags = { .create = B_FALSE, .import = B_TRUE };
 
                verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION,
                    &version) == 0);
 
                if ((props = zpool_valid_proplist(hdl, origname,
-                   props, version, B_TRUE, errbuf)) == NULL) {
+                   props, version, flags, errbuf)) == NULL) {
                        return (-1);
                } else if (zcmd_write_src_nvlist(hdl, &zc, props) != 0) {
                        nvlist_free(props);
@@ -1424,27 +1471,36 @@ zpool_import_props(libzfs_handle_t *hdl, nvlist_t *config, const char *newname,
                nvlist_free(props);
                return (-1);
        }
-       returned_size =  zc.zc_nvlist_conf_size + 512;
-       if (zcmd_alloc_dst_nvlist(hdl, &zc, returned_size) != 0) {
+       if (zcmd_alloc_dst_nvlist(hdl, &zc, zc.zc_nvlist_conf_size * 2) != 0) {
                nvlist_free(props);
                return (-1);
        }
 
-       zc.zc_cookie = (uint64_t)importfaulted;
-       ret = 0;
-       if (zfs_ioctl(hdl, ZFS_IOC_POOL_IMPORT, &zc) != 0) {
+       zc.zc_cookie = flags;
+       while ((ret = zfs_ioctl(hdl, ZFS_IOC_POOL_IMPORT, &zc)) != 0 &&
+           errno == ENOMEM) {
+               if (zcmd_expand_dst_nvlist(hdl, &zc) != 0) {
+                       zcmd_free_nvlists(&zc);
+                       return (-1);
+               }
+       }
+       if (ret != 0)
+               error = errno;
+
+       (void) zcmd_read_dst_nvlist(hdl, &zc, &nv);
+       zpool_get_rewind_policy(config, &policy);
+
+       if (error) {
                char desc[1024];
 
-               (void) zcmd_read_dst_nvlist(hdl, &zc, &nvi);
-               zpool_get_rewind_policy(config, &policy);
                /*
                 * Dry-run failed, but we print out what success
                 * looks like if we found a best txg
                 */
-               if ((policy.zrp_request & ZPOOL_TRY_REWIND) && nvi) {
+               if (policy.zrp_request & ZPOOL_TRY_REWIND) {
                        zpool_rewind_exclaim(hdl, newname ? origname : thename,
-                           B_TRUE, nvi);
-                       nvlist_free(nvi);
+                           B_TRUE, nv);
+                       nvlist_free(nv);
                        return (-1);
                }
 
@@ -1457,7 +1513,7 @@ zpool_import_props(libzfs_handle_t *hdl, nvlist_t *config, const char *newname,
                            dgettext(TEXT_DOMAIN, "cannot import '%s' as '%s'"),
                            origname, thename);
 
-               switch (errno) {
+               switch (error) {
                case ENOTSUP:
                        /*
                         * Unsupported version.
@@ -1475,15 +1531,32 @@ zpool_import_props(libzfs_handle_t *hdl, nvlist_t *config, const char *newname,
                        (void) zfs_error(hdl, EZFS_BADDEV, desc);
                        break;
 
+               case ENXIO:
+                       if (nv && nvlist_lookup_nvlist(nv,
+                           ZPOOL_CONFIG_LOAD_INFO, &nvinfo) == 0 &&
+                           nvlist_lookup_nvlist(nvinfo,
+                           ZPOOL_CONFIG_MISSING_DEVICES, &missing) == 0) {
+                               (void) printf(dgettext(TEXT_DOMAIN,
+                                   "The devices below are missing, use "
+                                   "'-m' to import the pool anyway:\n"));
+                               print_vdev_tree(hdl, NULL, missing, 2);
+                               (void) printf("\n");
+                       }
+                       (void) zpool_standard_error(hdl, error, desc);
+                       break;
+
+               case EEXIST:
+                       (void) zpool_standard_error(hdl, error, desc);
+                       break;
+
                default:
-                       (void) zcmd_read_dst_nvlist(hdl, &zc, &nvi);
-                       (void) zpool_standard_error(hdl, errno, desc);
+                       (void) zpool_standard_error(hdl, error, desc);
                        zpool_explain_recover(hdl,
-                           newname ? origname : thename, -errno, nvi);
-                       nvlist_free(nvi);
+                           newname ? origname : thename, -error, nv);
                        break;
                }
 
+               nvlist_free(nv);
                ret = -1;
        } else {
                zpool_handle_t *zhp;
@@ -1495,15 +1568,12 @@ zpool_import_props(libzfs_handle_t *hdl, nvlist_t *config, const char *newname,
                        ret = -1;
                else if (zhp != NULL)
                        zpool_close(zhp);
-               (void) zcmd_read_dst_nvlist(hdl, &zc, &nvi);
-               zpool_get_rewind_policy(config, &policy);
                if (policy.zrp_request &
                    (ZPOOL_DO_REWIND | ZPOOL_TRY_REWIND)) {
                        zpool_rewind_exclaim(hdl, newname ? origname : thename,
-                           ((policy.zrp_request & ZPOOL_TRY_REWIND) != 0),
-                           nvi);
+                           ((policy.zrp_request & ZPOOL_TRY_REWIND) != 0), nv);
                }
-               nvlist_free(nvi);
+               nvlist_free(nv);
                return (0);
        }
 
@@ -1526,7 +1596,7 @@ zpool_scan(zpool_handle_t *zhp, pool_scan_func_t func)
        (void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name));
        zc.zc_cookie = func;
 
-       if (zfs_ioctl(zhp->zpool_hdl, ZFS_IOC_POOL_SCAN, &zc) == 0 ||
+       if (zfs_ioctl(hdl, ZFS_IOC_POOL_SCAN, &zc) == 0 ||
            (errno == ENOENT && func != POOL_SCAN_NONE))
                return (0);
 
@@ -1618,26 +1688,17 @@ vdev_to_nvlist_iter(nvlist_t *nv, nvlist_t *search, boolean_t *avail_spare,
        srchkey = nvpair_name(pair);
 
        switch (nvpair_type(pair)) {
-       case DATA_TYPE_UINT64: {
-               uint64_t srchval, theguid, present;
-
-               verify(nvpair_value_uint64(pair, &srchval) == 0);
+       case DATA_TYPE_UINT64:
                if (strcmp(srchkey, ZPOOL_CONFIG_GUID) == 0) {
-                       if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NOT_PRESENT,
-                           &present) == 0) {
-                               /*
-                                * If the device has never been present since
-                                * import, the only reliable way to match the
-                                * vdev is by GUID.
-                                */
-                               verify(nvlist_lookup_uint64(nv,
-                                   ZPOOL_CONFIG_GUID, &theguid) == 0);
-                               if (theguid == srchval)
-                                       return (nv);
-                       }
+                       uint64_t srchval, theguid;
+
+                       verify(nvpair_value_uint64(pair, &srchval) == 0);
+                       verify(nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID,
+                           &theguid) == 0);
+                       if (theguid == srchval)
+                               return (nv);
                }
                break;
-       }
 
        case DATA_TYPE_STRING: {
                char *srchval, *val;
@@ -1819,6 +1880,9 @@ zpool_find_vdev_by_physpath(zpool_handle_t *zhp, const char *ppath,
            &nvroot) == 0);
 
        *avail_spare = B_FALSE;
+       *l2cache = B_FALSE;
+       if (log != NULL)
+               *log = B_FALSE;
        ret = vdev_to_nvlist_iter(nvroot, search, avail_spare, l2cache, log);
        nvlist_free(search);
 
@@ -2114,14 +2178,14 @@ zpool_vdev_online(zpool_handle_t *zhp, const char *path, int flags,
 
                if (wholedisk) {
                        pathname += strlen(DISK_ROOT) + 1;
-                       (void) zpool_relabel_disk(zhp->zpool_hdl, pathname);
+                       (void) zpool_relabel_disk(hdl, pathname);
                }
        }
 
        zc.zc_cookie = VDEV_STATE_ONLINE;
        zc.zc_obj = flags;
 
-       if (zfs_ioctl(zhp->zpool_hdl, ZFS_IOC_VDEV_SET_STATE, &zc) != 0) {
+       if (zfs_ioctl(hdl, ZFS_IOC_VDEV_SET_STATE, &zc) != 0) {
                if (errno == EINVAL) {
                        zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "was split "
                            "from this pool into a new one.  Use '%s' "
@@ -2163,7 +2227,7 @@ zpool_vdev_offline(zpool_handle_t *zhp, const char *path, boolean_t istmp)
        zc.zc_cookie = VDEV_STATE_OFFLINE;
        zc.zc_obj = istmp ? ZFS_OFFLINE_TEMPORARY : 0;
 
-       if (zfs_ioctl(zhp->zpool_hdl, ZFS_IOC_VDEV_SET_STATE, &zc) == 0)
+       if (zfs_ioctl(hdl, ZFS_IOC_VDEV_SET_STATE, &zc) == 0)
                return (0);
 
        switch (errno) {
@@ -2203,7 +2267,7 @@ zpool_vdev_fault(zpool_handle_t *zhp, uint64_t guid, vdev_aux_t aux)
        zc.zc_cookie = VDEV_STATE_FAULTED;
        zc.zc_obj = aux;
 
-       if (ioctl(zhp->zpool_hdl->libzfs_fd, ZFS_IOC_VDEV_SET_STATE, &zc) == 0)
+       if (ioctl(hdl->libzfs_fd, ZFS_IOC_VDEV_SET_STATE, &zc) == 0)
                return (0);
 
        switch (errno) {
@@ -2238,7 +2302,7 @@ zpool_vdev_degrade(zpool_handle_t *zhp, uint64_t guid, vdev_aux_t aux)
        zc.zc_cookie = VDEV_STATE_DEGRADED;
        zc.zc_obj = aux;
 
-       if (ioctl(zhp->zpool_hdl->libzfs_fd, ZFS_IOC_VDEV_SET_STATE, &zc) == 0)
+       if (ioctl(hdl->libzfs_fd, ZFS_IOC_VDEV_SET_STATE, &zc) == 0)
                return (0);
 
        return (zpool_standard_error(hdl, errno, msg));
@@ -2286,7 +2350,7 @@ zpool_vdev_attach(zpool_handle_t *zhp,
        nvlist_t *tgt;
        boolean_t avail_spare, l2cache, islog;
        uint64_t val;
-       char *path, *newname;
+       char *newname;
        nvlist_t **child;
        uint_t children;
        nvlist_t *config_root;
@@ -2352,41 +2416,17 @@ zpool_vdev_attach(zpool_handle_t *zhp,
                return (zfs_error(hdl, EZFS_BADTARGET, msg));
        }
 
-       /*
-        * If we are attempting to replace a spare, it canot be applied to an
-        * already spared device.
-        */
-       if (replacing &&
-           nvlist_lookup_string(child[0], ZPOOL_CONFIG_PATH, &path) == 0 &&
-           zpool_find_vdev(zhp, newname, &avail_spare,
-           &l2cache, NULL) != NULL && avail_spare &&
-           is_replacing_spare(config_root, tgt, 0)) {
-               zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
-                   "device has already been replaced with a spare"));
-               free(newname);
-               return (zfs_error(hdl, EZFS_BADTARGET, msg));
-       }
-
        free(newname);
 
        if (zcmd_write_conf_nvlist(hdl, &zc, nvroot) != 0)
                return (-1);
 
-       ret = zfs_ioctl(zhp->zpool_hdl, ZFS_IOC_VDEV_ATTACH, &zc);
+       ret = zfs_ioctl(hdl, ZFS_IOC_VDEV_ATTACH, &zc);
 
        zcmd_free_nvlists(&zc);
 
        if (ret == 0) {
                if (rootpool) {
-                       /*
-                        * XXX - This should be removed once we can
-                        * automatically install the bootblocks on the
-                        * newly attached disk.
-                        */
-                       (void) fprintf(stderr, dgettext(TEXT_DOMAIN, "Please "
-                           "be sure to invoke %s to make '%s' bootable.\n"),
-                           BOOTCMD, new_disk);
-
                        /*
                         * XXX need a better way to prevent user from
                         * booting up a half-baked vdev.
@@ -2404,9 +2444,16 @@ zpool_vdev_attach(zpool_handle_t *zhp,
                 * Can't attach to or replace this type of vdev.
                 */
                if (replacing) {
+                       uint64_t version = zpool_get_prop_int(zhp,
+                           ZPOOL_PROP_VERSION, NULL);
+
                        if (islog)
                                zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
                                    "cannot replace a log with a spare"));
+                       else if (version >= SPA_VERSION_MULTI_REPLACE)
+                               zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
+                                   "already in replacing/spare config; wait "
+                                   "for completion or use 'zpool detach'"));
                        else
                                zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
                                    "cannot replace a replacing device"));
@@ -2504,7 +2551,7 @@ zpool_vdev_detach(zpool_handle_t *zhp, const char *path)
                 */
                zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "only "
                    "applicable to mirror and replacing vdevs"));
-               (void) zfs_error(zhp->zpool_hdl, EZFS_BADTARGET, msg);
+               (void) zfs_error(hdl, EZFS_BADTARGET, msg);
                break;
 
        case EBUSY:
@@ -2596,8 +2643,9 @@ zpool_vdev_split(zpool_handle_t *zhp, char *newname, nvlist_t **newroot,
        verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, &vers) == 0);
 
        if (props) {
+               prop_flags_t flags = { .create = B_FALSE, .import = B_TRUE };
                if ((zc_props = zpool_valid_proplist(hdl, zhp->zpool_name,
-                   props, vers, B_TRUE, msg)) == NULL)
+                   props, vers, flags, msg)) == NULL)
                        return (-1);
        }
 
@@ -2831,6 +2879,7 @@ zpool_clear(zpool_handle_t *zhp, const char *path, nvlist_t *rewindnvl)
        boolean_t avail_spare, l2cache;
        libzfs_handle_t *hdl = zhp->zpool_hdl;
        nvlist_t *nvi = NULL;
+       int error;
 
        if (path)
                (void) snprintf(msg, sizeof (msg),
@@ -2861,14 +2910,21 @@ zpool_clear(zpool_handle_t *zhp, const char *path, nvlist_t *rewindnvl)
        zpool_get_rewind_policy(rewindnvl, &policy);
        zc.zc_cookie = policy.zrp_request;
 
-       if (zcmd_alloc_dst_nvlist(hdl, &zc, 8192) != 0)
+       if (zcmd_alloc_dst_nvlist(hdl, &zc, zhp->zpool_config_size * 2) != 0)
                return (-1);
 
-       if (zcmd_write_src_nvlist(zhp->zpool_hdl, &zc, rewindnvl) != 0)
+       if (zcmd_write_src_nvlist(hdl, &zc, rewindnvl) != 0)
                return (-1);
 
-       if (zfs_ioctl(hdl, ZFS_IOC_CLEAR, &zc) == 0 ||
-           ((policy.zrp_request & ZPOOL_TRY_REWIND) &&
+       while ((error = zfs_ioctl(hdl, ZFS_IOC_CLEAR, &zc)) != 0 &&
+           errno == ENOMEM) {
+               if (zcmd_expand_dst_nvlist(hdl, &zc) != 0) {
+                       zcmd_free_nvlists(&zc);
+                       return (-1);
+               }
+       }
+
+       if (!error || ((policy.zrp_request & ZPOOL_TRY_REWIND) &&
            errno != EPERM && errno != EACCES)) {
                if (policy.zrp_request &
                    (ZPOOL_DO_REWIND | ZPOOL_TRY_REWIND)) {
index 672e004ef5cd9086e76d8d6b64c4d5cc45eabd68..3093ab974d063a821169b57661343d2665e35710 100644 (file)
@@ -51,7 +51,7 @@
 extern void zfs_setprop_error(libzfs_handle_t *, zfs_prop_t, int, char *);
 
 static int zfs_receive_impl(libzfs_handle_t *, const char *, recvflags_t,
-    int, const char *, nvlist_t *, avl_tree_t *, char **);
+    int, const char *, nvlist_t *, avl_tree_t *, char **, int, uint64_t *);
 
 static const zio_cksum_t zero_cksum = { 0 };
 
@@ -782,14 +782,30 @@ static int
 zfs_sort_snaps(zfs_handle_t *zhp, void *data)
 {
        avl_tree_t *avl = data;
-       zfs_node_t *node = zfs_alloc(zhp->zfs_hdl, sizeof (zfs_node_t));
+       zfs_node_t *node;
+       zfs_node_t search;
+
+       search.zn_handle = zhp;
+       node = avl_find(avl, &search, NULL);
+       if (node) {
+               /*
+                * If this snapshot was renamed while we were creating the
+                * AVL tree, it's possible that we already inserted it under
+                * its old name. Remove the old handle before adding the new
+                * one.
+                */
+               zfs_close(node->zn_handle);
+               avl_remove(avl, node);
+               free(node);
+       }
 
+       node = zfs_alloc(zhp->zfs_hdl, sizeof (zfs_node_t));
        node->zn_handle = zhp;
        avl_add(avl, node);
+
        return (0);
 }
 
-/* ARGSUSED */
 static int
 zfs_snapshot_compare(const void *larg, const void *rarg)
 {
@@ -844,6 +860,7 @@ typedef struct send_dump_data {
        const char *fromsnap;
        const char *tosnap;
        char prevsnap[ZFS_MAXNAMELEN];
+       uint64_t prevsnap_obj;
        boolean_t seenfrom, seento, replicate, doall, fromorigin;
        boolean_t verbose;
        int outfd;
@@ -853,6 +870,8 @@ typedef struct send_dump_data {
        snapfilter_cb_t *filter_cb;
        void *filter_cb_arg;
        nvlist_t *debugnv;
+       char holdtag[ZFS_MAXNAMELEN];
+       int cleanup_fd;
 } send_dump_data_t;
 
 /*
@@ -860,23 +879,21 @@ typedef struct send_dump_data {
  * NULL) to the file descriptor specified by outfd.
  */
 static int
-dump_ioctl(zfs_handle_t *zhp, const char *fromsnap, boolean_t fromorigin,
-    int outfd, boolean_t enoent_ok, boolean_t *got_enoent, nvlist_t *debugnv)
+dump_ioctl(zfs_handle_t *zhp, const char *fromsnap, uint64_t fromsnap_obj,
+    boolean_t fromorigin, int outfd, nvlist_t *debugnv)
 {
        zfs_cmd_t zc = { 0 };
        libzfs_handle_t *hdl = zhp->zfs_hdl;
        nvlist_t *thisdbg;
 
        assert(zhp->zfs_type == ZFS_TYPE_SNAPSHOT);
-       assert(fromsnap == NULL || fromsnap[0] == '\0' || !fromorigin);
+       assert(fromsnap_obj == 0 || !fromorigin);
 
        (void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name));
-       if (fromsnap)
-               (void) strlcpy(zc.zc_value, fromsnap, sizeof (zc.zc_value));
        zc.zc_cookie = outfd;
        zc.zc_obj = fromorigin;
-
-       *got_enoent = B_FALSE;
+       zc.zc_sendobj = zfs_prop_get_int(zhp, ZFS_PROP_OBJSETID);
+       zc.zc_fromobj = fromsnap_obj;
 
        VERIFY(0 == nvlist_alloc(&thisdbg, NV_UNIQUE_NAME, 0));
        if (fromsnap && fromsnap[0] != '\0') {
@@ -904,10 +921,6 @@ dump_ioctl(zfs_handle_t *zhp, const char *fromsnap, boolean_t fromorigin,
                        return (zfs_error(hdl, EZFS_CROSSTARGET, errbuf));
 
                case ENOENT:
-                       if (enoent_ok) {
-                               *got_enoent = B_TRUE;
-                               return (0);
-                       }
                        if (zfs_dataset_exists(hdl, zc.zc_name,
                            ZFS_TYPE_SNAPSHOT)) {
                                zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
@@ -942,13 +955,48 @@ dump_ioctl(zfs_handle_t *zhp, const char *fromsnap, boolean_t fromorigin,
        return (0);
 }
 
+/*
+ * Place a hold, tagged with sdd->holdtag and tied to sdd->cleanup_fd, on
+ * the snapshot 'zhp' before it is sent.
+ *
+ * Returns 0 when no hold is needed (no cleanup_fd) or when the dataset
+ * owning the snapshot cannot be opened; otherwise returns whatever
+ * zfs_hold() returns.
+ */
+static int
+hold_for_send(zfs_handle_t *zhp, send_dump_data_t *sdd)
+{
+       zfs_handle_t *parent;
+       char *at, *snapname;
+       int rc;
+
+       assert(zhp->zfs_type == ZFS_TYPE_SNAPSHOT);
+
+       /*
+        * zfs_send() opens a cleanup_fd only for the sends that need
+        * one (e.g. replication and doall); without it there is nothing
+        * to do here.
+        */
+       if (sdd->cleanup_fd == -1)
+               return (0);
+
+       /*
+        * Split "dataset@snap" in place just long enough to open the
+        * dataset that owns the snapshot, then restore the separator.
+        */
+       at = strchr(zhp->zfs_name, '@');
+       snapname = at + 1;
+       *at = '\0';
+       parent = zfs_open(zhp->zfs_hdl, zhp->zfs_name, ZFS_TYPE_DATASET);
+       *at = '@';
+
+       /*
+        * A parent that no longer exists is not an error at this point;
+        * the send code will handle that failure.
+        */
+       if (parent == NULL)
+               return (0);
+
+       rc = zfs_hold(parent, snapname, sdd->holdtag,
+           B_FALSE, B_TRUE, B_TRUE, sdd->cleanup_fd,
+           zfs_prop_get_int(zhp, ZFS_PROP_OBJSETID),
+           zfs_prop_get_int(zhp, ZFS_PROP_CREATETXG));
+       zfs_close(parent);
+
+       return (rc);
+}
+
 static int
 dump_snapshot(zfs_handle_t *zhp, void *arg)
 {
        send_dump_data_t *sdd = arg;
-       const char *thissnap;
+       char *thissnap;
        int err;
-       boolean_t got_enoent;
        boolean_t isfromsnap, istosnap;
        boolean_t exclude = B_FALSE;
 
@@ -957,10 +1005,17 @@ dump_snapshot(zfs_handle_t *zhp, void *arg)
            strcmp(sdd->fromsnap, thissnap) == 0);
 
        if (!sdd->seenfrom && isfromsnap) {
-               sdd->seenfrom = B_TRUE;
-               (void) strcpy(sdd->prevsnap, thissnap);
+               err = hold_for_send(zhp, sdd);
+               if (err == 0) {
+                       sdd->seenfrom = B_TRUE;
+                       (void) strcpy(sdd->prevsnap, thissnap);
+                       sdd->prevsnap_obj = zfs_prop_get_int(zhp,
+                           ZFS_PROP_OBJSETID);
+               } else if (err == ENOENT) {
+                       err = 0;
+               }
                zfs_close(zhp);
-               return (0);
+               return (err);
        }
 
        if (sdd->seento || !sdd->seenfrom) {
@@ -1001,7 +1056,7 @@ dump_snapshot(zfs_handle_t *zhp, void *arg)
            sdd->filter_cb(zhp, sdd->filter_cb_arg) == B_FALSE)) {
                /*
                 * This snapshot is filtered out.  Don't send it, and don't
-                * set prevsnap, so it will be as if this snapshot didn't
+                * set prevsnap_obj, so it will be as if this snapshot didn't
                 * exist, and the next accepted snapshot will be sent as
                 * an incremental from the last accepted one, or as the
                 * first (and full) snapshot in the case of a replication,
@@ -1011,20 +1066,26 @@ dump_snapshot(zfs_handle_t *zhp, void *arg)
                return (0);
        }
 
+       err = hold_for_send(zhp, sdd);
+       if (err) {
+               if (err == ENOENT)
+                       err = 0;
+               zfs_close(zhp);
+               return (err);
+       }
+
        /* send it */
        if (sdd->verbose) {
                (void) fprintf(stderr, "sending from @%s to %s\n",
                    sdd->prevsnap, zhp->zfs_name);
        }
 
-       err = dump_ioctl(zhp, sdd->prevsnap,
+       err = dump_ioctl(zhp, sdd->prevsnap, sdd->prevsnap_obj,
            sdd->prevsnap[0] == '\0' && (sdd->fromorigin || sdd->replicate),
-           sdd->outfd, B_TRUE, &got_enoent, sdd->debugnv);
+           sdd->outfd, sdd->debugnv);
 
-       if (got_enoent)
-               err = 0;
-       else
-               (void) strcpy(sdd->prevsnap, thissnap);
+       (void) strcpy(sdd->prevsnap, thissnap);
+       sdd->prevsnap_obj = zfs_prop_get_int(zhp, ZFS_PROP_OBJSETID);
        zfs_close(zhp);
        return (err);
 }
@@ -1064,6 +1125,7 @@ dump_filesystem(zfs_handle_t *zhp, void *arg)
        }
 
        sdd->seenfrom = sdd->seento = sdd->prevsnap[0] = 0;
+       sdd->prevsnap_obj = 0;
        if (sdd->fromsnap == NULL || missingfrom)
                sdd->seenfrom = B_TRUE;
 
@@ -1202,7 +1264,6 @@ zfs_send(zfs_handle_t *zhp, const char *fromsnap, const char *tosnap,
        int err;
        nvlist_t *fss = NULL;
        avl_tree_t *fsavl = NULL;
-       char holdtag[128];
        static uint64_t holdseq;
        int spa_version;
        boolean_t holdsnaps = B_FALSE;
@@ -1211,14 +1272,6 @@ zfs_send(zfs_handle_t *zhp, const char *fromsnap, const char *tosnap,
        dedup_arg_t dda = { 0 };
        int featureflags = 0;
 
-       if (zhp->zfs_type == ZFS_TYPE_FILESYSTEM) {
-               uint64_t version;
-               version = zfs_prop_get_int(zhp, ZFS_PROP_VERSION);
-               if (version >= ZPL_VERSION_SA) {
-                       featureflags |= DMU_BACKUP_FEATURE_SA_SPILL;
-               }
-       }
-
        (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN,
            "cannot send '%s'"), zhp->zfs_name);
 
@@ -1228,8 +1281,17 @@ zfs_send(zfs_handle_t *zhp, const char *fromsnap, const char *tosnap,
                return (zfs_error(zhp->zfs_hdl, EZFS_NOENT, errbuf));
        }
 
+       if (zhp->zfs_type == ZFS_TYPE_FILESYSTEM) {
+               uint64_t version;
+               version = zfs_prop_get_int(zhp, ZFS_PROP_VERSION);
+               if (version >= ZPL_VERSION_SA) {
+                       featureflags |= DMU_BACKUP_FEATURE_SA_SPILL;
+               }
+       }
+
        if (zfs_spa_version(zhp, &spa_version) == 0 &&
-           spa_version >= SPA_VERSION_USERREFS)
+           spa_version >= SPA_VERSION_USERREFS &&
+           (flags.doall || flags.replicate))
                holdsnaps = B_TRUE;
 
        if (flags.dedup) {
@@ -1258,17 +1320,6 @@ zfs_send(zfs_handle_t *zhp, const char *fromsnap, const char *tosnap,
                size_t buflen = 0;
                zio_cksum_t zc = { 0 };
 
-               if (holdsnaps) {
-                       (void) snprintf(holdtag, sizeof (holdtag),
-                           ".send-%d-%llu", getpid(), (u_longlong_t)holdseq);
-                       ++holdseq;
-                       err = zfs_hold_range(zhp, fromsnap, tosnap,
-                           holdtag, flags.replicate, B_TRUE, filter_func,
-                           cb_arg);
-                       if (err)
-                               goto err_out;
-               }
-
                if (flags.replicate || flags.props) {
                        nvlist_t *hdrnv;
 
@@ -1285,13 +1336,8 @@ zfs_send(zfs_handle_t *zhp, const char *fromsnap, const char *tosnap,
 
                        err = gather_nvlist(zhp->zfs_hdl, zhp->zfs_name,
                            fromsnap, tosnap, flags.replicate, &fss, &fsavl);
-                       if (err) {
-                               if (holdsnaps) {
-                                       (void) zfs_release_range(zhp, fromsnap,
-                                           tosnap, holdtag, flags.replicate);
-                               }
+                       if (err)
                                goto err_out;
-                       }
                        VERIFY(0 == nvlist_add_nvlist(hdrnv, "fss", fss));
                        err = nvlist_pack(hdrnv, &packbuf, &buflen,
                            NV_ENCODE_XDR, 0);
@@ -1302,10 +1348,6 @@ zfs_send(zfs_handle_t *zhp, const char *fromsnap, const char *tosnap,
                        if (err) {
                                fsavl_destroy(fsavl);
                                nvlist_free(fss);
-                               if (holdsnaps) {
-                                       (void) zfs_release_range(zhp, fromsnap,
-                                           tosnap, holdtag, flags.replicate);
-                               }
                                goto stderr_out;
                        }
                }
@@ -1331,10 +1373,6 @@ zfs_send(zfs_handle_t *zhp, const char *fromsnap, const char *tosnap,
                if (err == -1) {
                        fsavl_destroy(fsavl);
                        nvlist_free(fss);
-                       if (holdsnaps) {
-                               (void) zfs_release_range(zhp, fromsnap, tosnap,
-                                   holdtag, flags.replicate);
-                       }
                        err = errno;
                        goto stderr_out;
                }
@@ -1349,10 +1387,6 @@ zfs_send(zfs_handle_t *zhp, const char *fromsnap, const char *tosnap,
                                fsavl_destroy(fsavl);
                                nvlist_free(fss);
                                err = errno;
-                               if (holdsnaps) {
-                                       (void) zfs_release_range(zhp, fromsnap,
-                                           tosnap, holdtag, flags.replicate);
-                               }
                                goto stderr_out;
                        }
                }
@@ -1375,6 +1409,18 @@ zfs_send(zfs_handle_t *zhp, const char *fromsnap, const char *tosnap,
        sdd.filter_cb_arg = cb_arg;
        if (debugnvp)
                sdd.debugnv = *debugnvp;
+       if (holdsnaps) {
+               ++holdseq;
+               (void) snprintf(sdd.holdtag, sizeof (sdd.holdtag),
+                   ".send-%d-%llu", getpid(), (u_longlong_t)holdseq);
+               sdd.cleanup_fd = open(ZFS_DEV, O_RDWR|O_EXCL);
+               if (sdd.cleanup_fd < 0) {
+                       err = errno;
+                       goto stderr_out;
+               }
+       } else {
+               sdd.cleanup_fd = -1;
+       }
        err = dump_filesystems(zhp, &sdd);
        fsavl_destroy(fsavl);
        nvlist_free(fss);
@@ -1384,6 +1430,11 @@ zfs_send(zfs_handle_t *zhp, const char *fromsnap, const char *tosnap,
                (void) pthread_join(tid, NULL);
        }
 
+       if (sdd.cleanup_fd != -1) {
+               VERIFY(0 == close(sdd.cleanup_fd));
+               sdd.cleanup_fd = -1;
+       }
+
        if (flags.replicate || flags.doall || flags.props) {
                /*
                 * write final end record.  NB: want to do this even if
@@ -1392,10 +1443,6 @@ zfs_send(zfs_handle_t *zhp, const char *fromsnap, const char *tosnap,
                 */
                dmu_replay_record_t drr = { 0 };
                drr.drr_type = DRR_END;
-               if (holdsnaps) {
-                       (void) zfs_release_range(zhp, fromsnap, tosnap,
-                           holdtag, flags.replicate);
-               }
                if (write(outfd, &drr, sizeof (drr)) == -1) {
                        return (zfs_standard_error(zhp->zfs_hdl,
                            errno, errbuf));
@@ -1407,6 +1454,8 @@ zfs_send(zfs_handle_t *zhp, const char *fromsnap, const char *tosnap,
 stderr_out:
        err = zfs_standard_error(zhp->zfs_hdl, err, errbuf);
 err_out:
+       if (sdd.cleanup_fd != -1)
+               VERIFY(0 == close(sdd.cleanup_fd));
        if (flags.dedup) {
                (void) pthread_cancel(tid);
                (void) pthread_join(tid, NULL);
@@ -1992,7 +2041,7 @@ again:
 static int
 zfs_receive_package(libzfs_handle_t *hdl, int fd, const char *destname,
     recvflags_t flags, dmu_replay_record_t *drr, zio_cksum_t *zc,
-    char **top_zfs)
+    char **top_zfs, int cleanup_fd, uint64_t *action_handlep)
 {
        nvlist_t *stream_nv = NULL;
        avl_tree_t *stream_avl = NULL;
@@ -2158,7 +2207,8 @@ zfs_receive_package(libzfs_handle_t *hdl, int fd, const char *destname,
                 * recv_skip() and return 0).
                 */
                error = zfs_receive_impl(hdl, destname, flags, fd,
-                   sendfs, stream_nv, stream_avl, top_zfs);
+                   sendfs, stream_nv, stream_avl, top_zfs, cleanup_fd,
+                   action_handlep);
                if (error == ENODATA) {
                        error = 0;
                        break;
@@ -2281,7 +2331,8 @@ static int
 zfs_receive_one(libzfs_handle_t *hdl, int infd, const char *tosnap,
     recvflags_t flags, dmu_replay_record_t *drr,
     dmu_replay_record_t *drr_noswap, const char *sendfs,
-    nvlist_t *stream_nv, avl_tree_t *stream_avl, char **top_zfs)
+    nvlist_t *stream_nv, avl_tree_t *stream_avl, char **top_zfs, int cleanup_fd,
+    uint64_t *action_handlep)
 {
        zfs_cmd_t zc = { 0 };
        time_t begin_time;
@@ -2609,6 +2660,8 @@ zfs_receive_one(libzfs_handle_t *hdl, int infd, const char *tosnap,
 
        zc.zc_nvlist_dst = (uint64_t)(uintptr_t)prop_errbuf;
        zc.zc_nvlist_dst_size = sizeof (prop_errbuf);
+       zc.zc_cleanup_fd = cleanup_fd;
+       zc.zc_action_handle = *action_handlep;
 
        err = ioctl_err = zfs_ioctl(hdl, ZFS_IOC_RECV, &zc);
        ioctl_errno = errno;
@@ -2796,6 +2849,8 @@ zfs_receive_one(libzfs_handle_t *hdl, int infd, const char *tosnap,
        if (err || ioctl_err)
                return (-1);
 
+       *action_handlep = zc.zc_action_handle;
+
        if (flags.verbose) {
                char buf1[64];
                char buf2[64];
@@ -2816,7 +2871,7 @@ zfs_receive_one(libzfs_handle_t *hdl, int infd, const char *tosnap,
 static int
 zfs_receive_impl(libzfs_handle_t *hdl, const char *tosnap, recvflags_t flags,
     int infd, const char *sendfs, nvlist_t *stream_nv, avl_tree_t *stream_avl,
-    char **top_zfs)
+    char **top_zfs, int cleanup_fd, uint64_t *action_handlep)
 {
        int err;
        dmu_replay_record_t drr, drr_noswap;
@@ -2909,12 +2964,12 @@ zfs_receive_impl(libzfs_handle_t *hdl, const char *tosnap, recvflags_t flags,
                }
                return (zfs_receive_one(hdl, infd, tosnap, flags,
                    &drr, &drr_noswap, sendfs, stream_nv, stream_avl,
-                   top_zfs));
+                   top_zfs, cleanup_fd, action_handlep));
        } else {
                assert(DMU_GET_STREAM_HDRTYPE(drrb->drr_versioninfo) ==
                    DMU_COMPOUNDSTREAM);
                return (zfs_receive_package(hdl, infd, tosnap, flags,
-                   &drr, &zcksum, top_zfs));
+                   &drr, &zcksum, top_zfs, cleanup_fd, action_handlep));
        }
 }
 
@@ -2930,9 +2985,16 @@ zfs_receive(libzfs_handle_t *hdl, const char *tosnap, recvflags_t flags,
 {
        char *top_zfs = NULL;
        int err;
+       int cleanup_fd;
+       uint64_t action_handle = 0;
+
+       cleanup_fd = open(ZFS_DEV, O_RDWR|O_EXCL);
+       VERIFY(cleanup_fd >= 0);
 
        err = zfs_receive_impl(hdl, tosnap, flags, infd, NULL, NULL,
-           stream_avl, &top_zfs);
+           stream_avl, &top_zfs, cleanup_fd, &action_handle);
+
+       VERIFY(0 == close(cleanup_fd));
 
        if (err == 0 && !flags.nomount && top_zfs) {
                zfs_handle_t *zhp;
index 2e73f76ea5086b2ee28fbafcd6bd707f91270170..01b7c8732efd85e8edb06c274e45ca04aa251f66 100644 (file)
@@ -69,7 +69,7 @@ libzfs_error_description(libzfs_handle_t *hdl)
        case EZFS_BADPROP:
                return (dgettext(TEXT_DOMAIN, "invalid property value"));
        case EZFS_PROPREADONLY:
-               return (dgettext(TEXT_DOMAIN, "read only property"));
+               return (dgettext(TEXT_DOMAIN, "read-only property"));
        case EZFS_PROPTYPE:
                return (dgettext(TEXT_DOMAIN, "property doesn't apply to "
                    "datasets of this type"));
@@ -89,7 +89,7 @@ libzfs_error_description(libzfs_handle_t *hdl)
        case EZFS_BADSTREAM:
                return (dgettext(TEXT_DOMAIN, "invalid backup stream"));
        case EZFS_DSREADONLY:
-               return (dgettext(TEXT_DOMAIN, "dataset is read only"));
+               return (dgettext(TEXT_DOMAIN, "dataset is read-only"));
        case EZFS_VOLTOOBIG:
                return (dgettext(TEXT_DOMAIN, "volume size exceeds limit for "
                    "this system"));
@@ -181,9 +181,6 @@ libzfs_error_description(libzfs_handle_t *hdl)
        case EZFS_NODELEGATION:
                return (dgettext(TEXT_DOMAIN, "delegated administration is "
                    "disabled on pool"));
-       case EZFS_PERMRDONLY:
-               return (dgettext(TEXT_DOMAIN, "snapshot permissions cannot be"
-                   " modified"));
        case EZFS_BADCACHE:
                return (dgettext(TEXT_DOMAIN, "invalid or missing cache file"));
        case EZFS_ISL2CACHE:
@@ -219,6 +216,12 @@ libzfs_error_description(libzfs_handle_t *hdl)
                    "use 'zpool scrub -s' to cancel current scrub"));
        case EZFS_NO_SCRUB:
                return (dgettext(TEXT_DOMAIN, "there is no active scrub"));
+       case EZFS_DIFF:
+               return (dgettext(TEXT_DOMAIN, "unable to generate diffs"));
+       case EZFS_DIFFDATA:
+               return (dgettext(TEXT_DOMAIN, "invalid diff data"));
+       case EZFS_POOLREADONLY:
+               return (dgettext(TEXT_DOMAIN, "pool is read-only"));
        case EZFS_UNKNOWN:
                return (dgettext(TEXT_DOMAIN, "unknown error"));
        default:
@@ -367,9 +370,7 @@ zfs_standard_error_fmt(libzfs_handle_t *hdl, int error, const char *fmt, ...)
                zfs_verror(hdl, EZFS_BUSY, fmt, ap);
                break;
        case EROFS:
-               zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
-                   "snapshot permissions cannot be modified"));
-               zfs_verror(hdl, EZFS_PERMRDONLY, fmt, ap);
+               zfs_verror(hdl, EZFS_POOLREADONLY, fmt, ap);
                break;
        case ENAMETOOLONG:
                zfs_verror(hdl, EZFS_NAMETOOLONG, fmt, ap);
@@ -455,12 +456,17 @@ zpool_standard_error_fmt(libzfs_handle_t *hdl, int error, const char *fmt, ...)
        case EDQUOT:
                zfs_verror(hdl, EZFS_NOSPC, fmt, ap);
                return (-1);
+
        case EAGAIN:
                zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
                    "pool I/O is currently suspended"));
                zfs_verror(hdl, EZFS_POOLUNAVAIL, fmt, ap);
                break;
 
+       case EROFS:
+               zfs_verror(hdl, EZFS_POOLREADONLY, fmt, ap);
+               break;
+
        default:
                zfs_error_aux(hdl, strerror(error));
                zfs_verror(hdl, EZFS_UNKNOWN, fmt, ap);
@@ -493,6 +499,29 @@ zfs_alloc(libzfs_handle_t *hdl, size_t size)
        return (data);
 }
 
+/*
+ * A safe form of asprintf(): reports allocation failure via no_memory() and
+ * returns NULL (not vasprintf()'s undefined pointer) if that call returns.
+ */
+/*PRINTFLIKE2*/
+char *
+zfs_asprintf(libzfs_handle_t *hdl, const char *fmt, ...)
+{
+       va_list ap;
+       char *ret;
+       int err;
+
+       va_start(ap, fmt);
+       err = vasprintf(&ret, fmt, ap);
+       va_end(ap);
+
+       if (err < 0) {
+               (void) no_memory(hdl);
+               ret = NULL;
+       }
+       return (ret);
+}
+
 /*
  * A safe form of realloc(), which also zeroes newly allocated space.
  */
@@ -579,7 +608,7 @@ libzfs_init(void)
 {
        libzfs_handle_t *hdl;
 
-       if ((hdl = calloc(sizeof (libzfs_handle_t), 1)) == NULL) {
+       if ((hdl = calloc(1, sizeof (libzfs_handle_t))) == NULL) {
                return (NULL);
        }
 
@@ -692,7 +721,7 @@ int
 zcmd_alloc_dst_nvlist(libzfs_handle_t *hdl, zfs_cmd_t *zc, size_t len)
 {
        if (len == 0)
-               len = 4*1024;
+               len = 16 * 1024;
        zc->zc_nvlist_dst_size = len;
        if ((zc->zc_nvlist_dst = (uint64_t)(uintptr_t)
            zfs_alloc(hdl, zc->zc_nvlist_dst_size)) == NULL)
index 9a6d712e53fe9fb11a267a11665c588eec948975..fc543559bd39e7fb34b23dc7ba86b52b1c755c16 100644 (file)
@@ -19,8 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 
 #ifndef _SYS_ZFS_CONTEXT_H
@@ -231,8 +230,10 @@ typedef struct kmutex {
 } kmutex_t;
 
 #define        MUTEX_DEFAULT   USYNC_THREAD
-#undef MUTEX_HELD
+#undef MUTEX_HELD
+#undef MUTEX_NOT_HELD
 #define        MUTEX_HELD(m) _mutex_held(&(m)->m_lock)
+#define        MUTEX_NOT_HELD(m) (!MUTEX_HELD(m))
 
 /*
  * Argh -- we have to get cheesy here because the kernel and userland
@@ -323,10 +324,21 @@ extern void kstat_delete(kstat_t *);
 #define        kmem_cache_alloc(_c, _f) umem_cache_alloc(_c, _f)
 #define        kmem_cache_free(_c, _b) umem_cache_free(_c, _b)
 #define        kmem_debugging()        0
-#define        kmem_cache_reap_now(c)
+#define        kmem_cache_reap_now(_c)         /* nothing */
+#define        kmem_cache_set_move(_c, _cb)    /* nothing */
+#define        POINTER_INVALIDATE(_pp)         /* nothing */
+#define        POINTER_IS_VALID(_p)    0
 
 typedef umem_cache_t kmem_cache_t;
 
+typedef enum kmem_cbrc {
+       KMEM_CBRC_YES,
+       KMEM_CBRC_NO,
+       KMEM_CBRC_LATER,
+       KMEM_CBRC_DONT_NEED,
+       KMEM_CBRC_DONT_KNOW
+} kmem_cbrc_t;
+
 /*
  * Task queues
  */
@@ -389,6 +401,8 @@ typedef struct xoptattr {
        uint8_t         xoa_av_modified;
        uint8_t         xoa_av_scanstamp[AV_SCANSTAMP_SZ];
        uint8_t         xoa_reparse;
+       uint8_t         xoa_offline;
+       uint8_t         xoa_sparse;
 } xoptattr_t;
 
 typedef struct vattr {
index 5284c125320fefe7dc6bae1b9219f341c1419cd0..f323bf60b099051abaa378a974a68e9bd4bd7d53 100644 (file)
@@ -19,8 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 
 #include <assert.h>
@@ -944,3 +943,39 @@ kmem_asprintf(const char *fmt, ...)
 
        return (buf);
 }
+
+/* ARGSUSED */
+int
+zfs_onexit_fd_hold(int fd, minor_t *minorp)
+{
+       *minorp = 0;
+       return (0);
+}
+
+/* ARGSUSED */
+void
+zfs_onexit_fd_rele(int fd)
+{
+}
+
+/* ARGSUSED */
+int
+zfs_onexit_add_cb(minor_t minor, void (*func)(void *), void *data,
+    uint64_t *action_handle)
+{
+       return (0);
+}
+
+/* ARGSUSED */
+int
+zfs_onexit_del_cb(minor_t minor, uint64_t action_handle, boolean_t fire)
+{
+       return (0);
+}
+
+/* ARGSUSED */
+int
+zfs_onexit_cb_data(minor_t minor, uint64_t action_handle, void **data)
+{
+       return (0);
+}
index 58037b06537e5bc7ee716b92f86c67860533f15e..30ff4e0667b3e701856cd75080e88c655ff6cf40 100644 (file)
@@ -19,8 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2000, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 
 #ifndef        _SYS_NVPAIR_H
@@ -158,6 +157,8 @@ int nvlist_unpack(char *, size_t, nvlist_t **, int);
 int nvlist_dup(nvlist_t *, nvlist_t **, int);
 int nvlist_merge(nvlist_t *, nvlist_t *, int);
 
+uint_t nvlist_nvflag(nvlist_t *);
+
 int nvlist_xalloc(nvlist_t **, uint_t, nv_alloc_t *);
 int nvlist_xpack(nvlist_t *, char **, size_t *, int, nv_alloc_t *);
 int nvlist_xunpack(char *, size_t, nvlist_t **, nv_alloc_t *);
index 8115091ab9a9ab7176e89f0ee2628dad8b8d8a5a..00d44263ccda099b8b9351af054639e418942fa8 100644 (file)
@@ -20,8 +20,7 @@
  */
 
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2000, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 
 #include <sys/stropts.h>
@@ -257,6 +256,12 @@ nvlist_init(nvlist_t *nvl, uint32_t nvflag, nvpriv_t *priv)
        nvl->nvl_pad = 0;
 }
 
+uint_t
+nvlist_nvflag(nvlist_t *nvl)
+{
+       return (nvl->nvl_nvflag);
+}
+
 /*
  * nvlist_alloc - Allocate nvlist.
  */
index 3c95c91ddf711918f0755f4b71c3e42ca6577456..da0b12bab4a99e0fb95e5d453604d7ce07431222 100644 (file)
@@ -160,6 +160,7 @@ typedef enum {
        ZPOOL_PROP_DEDUPRATIO,
        ZPOOL_PROP_FREE,
        ZPOOL_PROP_ALLOCATED,
+       ZPOOL_PROP_READONLY,
        ZPOOL_NUM_PROPS
 } zpool_prop_t;
 
@@ -335,14 +336,17 @@ typedef enum {
 #define        SPA_VERSION_24                  24ULL
 #define        SPA_VERSION_25                  25ULL
 #define        SPA_VERSION_26                  26ULL
+#define        SPA_VERSION_27                  27ULL
+#define        SPA_VERSION_28                  28ULL
+
 /*
  * When bumping up SPA_VERSION, make sure GRUB ZFS understands the on-disk
  * format change. Go to usr/src/grub/grub-0.97/stage2/{zfs-include/, fsys_zfs*},
  * and do the appropriate changes.  Also bump the version number in
  * usr/src/grub/capability.
  */
-#define        SPA_VERSION                     SPA_VERSION_26
-#define        SPA_VERSION_STRING              "26"
+#define        SPA_VERSION                     SPA_VERSION_28
+#define        SPA_VERSION_STRING              "28"
 
 /*
  * Symbolic names for the changes that caused a SPA_VERSION switch.
@@ -391,6 +395,8 @@ typedef enum {
 #define        SPA_VERSION_SCAN                SPA_VERSION_25
 #define        SPA_VERSION_DIR_CLONES          SPA_VERSION_26
 #define        SPA_VERSION_DEADLISTS           SPA_VERSION_26
+#define        SPA_VERSION_FAST_SNAP           SPA_VERSION_27
+#define        SPA_VERSION_MULTI_REPLACE       SPA_VERSION_28
 
 /*
  * ZPL version - rev'd whenever an incompatible on-disk format change
@@ -465,6 +471,7 @@ typedef struct zpool_rewind_policy {
 #define        ZPOOL_CONFIG_NPARITY            "nparity"
 #define        ZPOOL_CONFIG_HOSTID             "hostid"
 #define        ZPOOL_CONFIG_HOSTNAME           "hostname"
+#define        ZPOOL_CONFIG_LOADED_TIME        "initial_load_time"
 #define        ZPOOL_CONFIG_UNSPARE            "unspare"
 #define        ZPOOL_CONFIG_PHYS_PATH          "phys_path"
 #define        ZPOOL_CONFIG_IS_LOG             "is_log"
@@ -480,9 +487,12 @@ typedef struct zpool_rewind_policy {
 #define        ZPOOL_CONFIG_SPLIT_GUID         "split_guid"
 #define        ZPOOL_CONFIG_SPLIT_LIST         "guid_list"
 #define        ZPOOL_CONFIG_REMOVING           "removing"
+#define        ZPOOL_CONFIG_RESILVERING        "resilvering"
 #define        ZPOOL_CONFIG_SUSPENDED          "suspended"     /* not stored on disk */
 #define        ZPOOL_CONFIG_TIMESTAMP          "timestamp"     /* not stored on disk */
 #define        ZPOOL_CONFIG_BOOTFS             "bootfs"        /* not stored on disk */
+#define        ZPOOL_CONFIG_MISSING_DEVICES    "missing_vdevs" /* not stored on disk */
+#define        ZPOOL_CONFIG_LOAD_INFO          "load_info"     /* not stored on disk */
 /*
  * The persistent vdev state is stored as separate values rather than a single
  * 'vdev_state' entry.  This is because a device can be in multiple states, such
@@ -760,7 +770,11 @@ typedef enum zfs_ioc {
        ZFS_IOC_RELEASE,
        ZFS_IOC_GET_HOLDS,
        ZFS_IOC_OBJSET_RECVD_PROPS,
-       ZFS_IOC_VDEV_SPLIT
+       ZFS_IOC_VDEV_SPLIT,
+       ZFS_IOC_NEXT_OBJ,
+       ZFS_IOC_DIFF,
+       ZFS_IOC_TMP_SNAPSHOT,
+       ZFS_IOC_OBJ_TO_STATS
 } zfs_ioc_t;
 
 /*
@@ -807,6 +821,15 @@ typedef enum {
 #define        ZFS_ONLINE_EXPAND       0x8
 #define        ZFS_OFFLINE_TEMPORARY   0x1
 
+/*
+ * Flags for ZFS_IOC_POOL_IMPORT
+ */
+#define        ZFS_IMPORT_NORMAL       0x0
+#define        ZFS_IMPORT_VERBATIM     0x1
+#define        ZFS_IMPORT_ANY_HOST     0x2
+#define        ZFS_IMPORT_MISSING_LOG  0x4
+#define        ZFS_IMPORT_ONLY         0x8
+
 /*
  * Sysevent payload members.  ZFS will generate the following sysevents with the
  * given payloads:
index e90cd0d5f4ba9c4182049b75e01ef2a2164fdc3c..b4cb8e2b4e3779a4b2cf383257c0ae55900948a0 100644 (file)
@@ -19,8 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 
 #ifndef        _ZFS_DELEG_H
@@ -63,6 +62,7 @@ typedef enum {
        ZFS_DELEG_NOTE_GROUPUSED,
        ZFS_DELEG_NOTE_HOLD,
        ZFS_DELEG_NOTE_RELEASE,
+       ZFS_DELEG_NOTE_DIFF,
        ZFS_DELEG_NOTE_NONE
 } zfs_deleg_note_t;
 
index 35f81b584641a2cca19e5a38f4d5b6632aa77907..83d9edb2138923182d6de65983e88c3f659d4371 100644 (file)
@@ -19,8 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 
 #if defined(_KERNEL)
@@ -69,6 +68,7 @@ zfs_deleg_perm_tab_t zfs_deleg_perm_tab[] = {
        {ZFS_DELEG_PERM_GROUPUSED, ZFS_DELEG_NOTE_GROUPUSED },
        {ZFS_DELEG_PERM_HOLD, ZFS_DELEG_NOTE_HOLD },
        {ZFS_DELEG_PERM_RELEASE, ZFS_DELEG_NOTE_RELEASE },
+       {ZFS_DELEG_PERM_DIFF, ZFS_DELEG_NOTE_DIFF},
        {NULL, ZFS_DELEG_NOTE_NONE }
 };
 
index 0b8a9529a2988f6254695055398232b524cb9282..988d05de6e209ee3fffe0b6e97e878386d9c2928 100644 (file)
@@ -19,8 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 
 #include <sys/zio.h>
@@ -105,6 +104,8 @@ zpool_prop_init(void)
            boolean_table);
        zprop_register_index(ZPOOL_PROP_AUTOEXPAND, "autoexpand", 0,
            PROP_DEFAULT, ZFS_TYPE_POOL, "on | off", "EXPAND", boolean_table);
+       zprop_register_index(ZPOOL_PROP_READONLY, "readonly", 0,
+           PROP_DEFAULT, ZFS_TYPE_POOL, "on | off", "RDONLY", boolean_table);
 
        /* default index properties */
        zprop_register_index(ZPOOL_PROP_FAILUREMODE, "failmode",
index 8adb54dc6e195748d56578f3aa1e8eb213a42f86..a82718e8bc6e02792c86534cc9b17c5be72e1bfd 100644 (file)
@@ -952,11 +952,6 @@ arc_cksum_compute(arc_buf_t *buf, boolean_t force)
 void
 arc_buf_thaw(arc_buf_t *buf)
 {
-       kmutex_t *hash_lock;
-
-       hash_lock = HDR_LOCK(buf->b_hdr);
-       mutex_enter(hash_lock);
-
        if (zfs_flags & ZFS_DEBUG_MODIFY) {
                if (buf->b_hdr->b_state != arc_anon)
                        panic("modifying non-anon buffer!");
@@ -978,7 +973,6 @@ arc_buf_thaw(arc_buf_t *buf)
        }
 
        mutex_exit(&buf->b_hdr->b_freeze_lock);
-       mutex_exit(hash_lock);
 }
 
 void
@@ -1750,6 +1744,7 @@ static void
 arc_evict_ghost(arc_state_t *state, uint64_t spa, int64_t bytes)
 {
        arc_buf_hdr_t *ab, *ab_prev;
+       arc_buf_hdr_t marker = { 0 };
        list_t *list = &state->arcs_list[ARC_BUFC_DATA];
        kmutex_t *hash_lock;
        uint64_t bytes_deleted = 0;
@@ -1762,6 +1757,11 @@ top:
                ab_prev = list_prev(list, ab);
                if (spa && ab->b_spa != spa)
                        continue;
+
+               /* ignore markers */
+               if (ab->b_spa == 0)
+                       continue;
+
                hash_lock = HDR_LOCK(ab);
                /* caller may be trying to modify this buffer, skip it */
                if (MUTEX_HELD(hash_lock))
@@ -1788,15 +1788,21 @@ top:
                        DTRACE_PROBE1(arc__delete, arc_buf_hdr_t *, ab);
                        if (bytes >= 0 && bytes_deleted >= bytes)
                                break;
-               } else {
-                       if (bytes < 0) {
-                               mutex_exit(&state->arcs_mtx);
-                               mutex_enter(hash_lock);
-                               mutex_exit(hash_lock);
-                               goto top;
-                       }
+               } else if (bytes < 0) {
+                       /*
+                        * Insert a list marker and then wait for the
+                        * hash lock to become available. Once it's
+                        * available, restart from where we left off.
+                        */
+                       list_insert_after(list, ab, &marker);
+                       mutex_exit(&state->arcs_mtx);
+                       mutex_enter(hash_lock);
+                       mutex_exit(hash_lock);
+                       mutex_enter(&state->arcs_mtx);
+                       ab_prev = list_prev(list, &marker);
+                       list_remove(list, &marker);
+               } else
                        bufs_skipped += 1;
-               }
        }
        mutex_exit(&state->arcs_mtx);
 
@@ -1825,8 +1831,9 @@ arc_adjust(void)
         * Adjust MRU size
         */
 
-       adjustment = MIN(arc_size - arc_c,
-           arc_anon->arcs_size + arc_mru->arcs_size + arc_meta_used - arc_p);
+       adjustment = MIN((int64_t)(arc_size - arc_c),
+           (int64_t)(arc_anon->arcs_size + arc_mru->arcs_size + arc_meta_used -
+           arc_p));
 
        if (adjustment > 0 && arc_mru->arcs_lsize[ARC_BUFC_DATA] > 0) {
                delta = MIN(arc_mru->arcs_lsize[ARC_BUFC_DATA], adjustment);
@@ -2113,9 +2120,7 @@ arc_reclaim_thread(void)
                        arc_no_grow = FALSE;
                }
 
-               if (2 * arc_c < arc_size +
-                   arc_mru_ghost->arcs_size + arc_mfu_ghost->arcs_size)
-                       arc_adjust();
+               arc_adjust();
 
                if (arc_eviction_list != NULL)
                        arc_do_user_evicts();
@@ -2159,6 +2164,7 @@ arc_adapt(int bytes, arc_state_t *state)
        if (state == arc_mru_ghost) {
                mult = ((arc_mru_ghost->arcs_size >= arc_mfu_ghost->arcs_size) ?
                    1 : (arc_mfu_ghost->arcs_size/arc_mru_ghost->arcs_size));
+               mult = MIN(mult, 10); /* avoid wild arc_p adjustment */
 
                arc_p = MIN(arc_c - arc_p_min, arc_p + bytes * mult);
        } else if (state == arc_mfu_ghost) {
@@ -2166,6 +2172,7 @@ arc_adapt(int bytes, arc_state_t *state)
 
                mult = ((arc_mfu_ghost->arcs_size >= arc_mru_ghost->arcs_size) ?
                    1 : (arc_mru_ghost->arcs_size/arc_mfu_ghost->arcs_size));
+               mult = MIN(mult, 10);
 
                delta = MIN(bytes * mult, arc_p);
                arc_p = MAX(arc_p_min, arc_p - delta);
@@ -4437,6 +4444,16 @@ l2arc_feed_thread(void)
                spa = dev->l2ad_spa;
                ASSERT(spa != NULL);
 
+               /*
+                * If the pool is read-only then force the feed thread to
+                * sleep a little longer.
+                */
+               if (!spa_writeable(spa)) {
+                       next = ddi_get_lbolt() + 5 * l2arc_feed_secs * hz;
+                       spa_config_exit(spa, SCL_L2ARC, dev);
+                       continue;
+               }
+
                /*
                 * Avoid contributing to memory pressure.
                 */
index f81c48aca68b15196b929726c47a27050358ab13..72be31235607813876b264e67505d07a689c9dff 100644 (file)
@@ -113,16 +113,15 @@ bpobj_open(bpobj_t *bpo, objset_t *os, uint64_t object)
        ASSERT3U(doi.doi_type, ==, DMU_OT_BPOBJ);
        ASSERT3U(doi.doi_bonus_type, ==, DMU_OT_BPOBJ_HDR);
 
+       err = dmu_bonus_hold(os, object, bpo, &bpo->bpo_dbuf);
+       if (err)
+               return (err);
+
        bpo->bpo_os = os;
        bpo->bpo_object = object;
        bpo->bpo_epb = doi.doi_data_block_size >> SPA_BLKPTRSHIFT;
        bpo->bpo_havecomp = (doi.doi_bonus_size > BPOBJ_SIZE_V0);
        bpo->bpo_havesubobj = (doi.doi_bonus_size > BPOBJ_SIZE_V1);
-
-       err = dmu_bonus_hold(bpo->bpo_os,
-           bpo->bpo_object, bpo, &bpo->bpo_dbuf);
-       if (err)
-               return (err);
        bpo->bpo_phys = bpo->bpo_dbuf->db_data;
        return (0);
 }
@@ -140,6 +139,7 @@ bpobj_close(bpobj_t *bpo)
        bpo->bpo_dbuf = NULL;
        bpo->bpo_phys = NULL;
        bpo->bpo_cached_dbuf = NULL;
+       bpo->bpo_object = 0;
 
        mutex_destroy(&bpo->bpo_lock);
 }
@@ -210,8 +210,10 @@ bpobj_iterate_impl(bpobj_t *bpo, bpobj_itor_t func, void *arg, dmu_tx_t *tx,
 
        ASSERT(bpo->bpo_havecomp);
        err = dmu_object_info(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs, &doi);
-       if (err)
+       if (err) {
+               mutex_exit(&bpo->bpo_lock);
                return (err);
+       }
        epb = doi.doi_data_block_size / sizeof (uint64_t);
 
        for (i = bpo->bpo_phys->bpo_num_subobjs - 1; i >= 0; i--) {
@@ -252,7 +254,7 @@ bpobj_iterate_impl(bpobj_t *bpo, bpobj_itor_t func, void *arg, dmu_tx_t *tx,
                            &used_after, &comp_after, &uncomp_after));
                        bpo->bpo_phys->bpo_bytes -= used_before - used_after;
                        ASSERT3S(bpo->bpo_phys->bpo_bytes, >=, 0);
-                       bpo->bpo_phys->bpo_comp -= comp_before - used_after;
+                       bpo->bpo_phys->bpo_comp -= comp_before - comp_after;
                        bpo->bpo_phys->bpo_uncomp -=
                            uncomp_before - uncomp_after;
                }
@@ -312,17 +314,17 @@ void
 bpobj_enqueue_subobj(bpobj_t *bpo, uint64_t subobj, dmu_tx_t *tx)
 {
        bpobj_t subbpo;
-       uint64_t used, comp, uncomp;
+       uint64_t used, comp, uncomp, subsubobjs;
 
        ASSERT(bpo->bpo_havesubobj);
        ASSERT(bpo->bpo_havecomp);
 
        VERIFY3U(0, ==, bpobj_open(&subbpo, bpo->bpo_os, subobj));
        VERIFY3U(0, ==, bpobj_space(&subbpo, &used, &comp, &uncomp));
-       bpobj_close(&subbpo);
 
        if (used == 0) {
                /* No point in having an empty subobj. */
+               bpobj_close(&subbpo);
                bpobj_free(bpo->bpo_os, subobj, tx);
                return;
        }
@@ -338,10 +340,41 @@ bpobj_enqueue_subobj(bpobj_t *bpo, uint64_t subobj, dmu_tx_t *tx)
            bpo->bpo_phys->bpo_num_subobjs * sizeof (subobj),
            sizeof (subobj), &subobj, tx);
        bpo->bpo_phys->bpo_num_subobjs++;
+
+       /*
+        * If subobj has only one block of subobjs, then move subobj's
+        * subobjs to bpo's subobj list directly.  This reduces
+        * recursion in bpobj_iterate due to nested subobjs.
+        */
+       subsubobjs = subbpo.bpo_phys->bpo_subobjs;
+       if (subsubobjs != 0) {
+               dmu_object_info_t doi;
+
+               VERIFY3U(0, ==, dmu_object_info(bpo->bpo_os, subsubobjs, &doi));
+               if (doi.doi_max_offset == doi.doi_data_block_size) {
+                       dmu_buf_t *subdb;
+                       uint64_t numsubsub = subbpo.bpo_phys->bpo_num_subobjs;
+
+                       VERIFY3U(0, ==, dmu_buf_hold(bpo->bpo_os, subsubobjs,
+                           0, FTAG, &subdb, 0));
+                       dmu_write(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs,
+                           bpo->bpo_phys->bpo_num_subobjs * sizeof (subobj),
+                           numsubsub * sizeof (subobj), subdb->db_data, tx);
+                       dmu_buf_rele(subdb, FTAG);
+                       bpo->bpo_phys->bpo_num_subobjs += numsubsub;
+
+                       dmu_buf_will_dirty(subbpo.bpo_dbuf, tx);
+                       subbpo.bpo_phys->bpo_subobjs = 0;
+                       VERIFY3U(0, ==, dmu_object_free(bpo->bpo_os,
+                           subsubobjs, tx));
+               }
+       }
        bpo->bpo_phys->bpo_bytes += used;
        bpo->bpo_phys->bpo_comp += comp;
        bpo->bpo_phys->bpo_uncomp += uncomp;
        mutex_exit(&bpo->bpo_lock);
+
+       bpobj_close(&subbpo);
 }
 
 void
index 42ae439972e4ce25ebd2ef38692af468bca7e0ac..9c4e0296db2bdab35fca237db669679c66cebae7 100644 (file)
@@ -217,6 +217,22 @@ dbuf_evict_user(dmu_buf_impl_t *db)
        db->db_evict_func = NULL;
 }
 
+boolean_t       /* B_TRUE if this dbuf is classified as metadata, else the type table's flag */
+dbuf_is_metadata(dmu_buf_impl_t *db)
+{
+       if (db->db_level > 0) {  /* any dbuf above level 0 is reported as metadata */
+               return (B_TRUE);
+       } else {
+               boolean_t is_metadata;
+
+               DB_DNODE_ENTER(db);  /* bracket the DB_DNODE() dereference; presumably guards against the dnode moving -- confirm against the macro */
+               is_metadata = dmu_ot[DB_DNODE(db)->dn_type].ot_metadata;  /* level 0: look up the flag for the owning dnode's object type */
+               DB_DNODE_EXIT(db);  /* dnode pointer is not used past this point; only the copied flag is returned */
+
+               return (is_metadata);
+       }
+}
+
 void
 dbuf_evict(dmu_buf_impl_t *db)
 {
@@ -281,7 +297,7 @@ dbuf_fini(void)
 static void
 dbuf_verify(dmu_buf_impl_t *db)
 {
-       dnode_t *dn = db->db_dnode;
+       dnode_t *dn;
        dbuf_dirty_record_t *dr;
 
        ASSERT(MUTEX_HELD(&db->db_mtx));
@@ -290,6 +306,8 @@ dbuf_verify(dmu_buf_impl_t *db)
                return;
 
        ASSERT(db->db_objset != NULL);
+       DB_DNODE_ENTER(db);
+       dn = DB_DNODE(db);
        if (dn == NULL) {
                ASSERT(db->db_parent == NULL);
                ASSERT(db->db_blkptr == NULL);
@@ -297,8 +315,9 @@ dbuf_verify(dmu_buf_impl_t *db)
                ASSERT3U(db->db.db_object, ==, dn->dn_object);
                ASSERT3P(db->db_objset, ==, dn->dn_objset);
                ASSERT3U(db->db_level, <, dn->dn_nlevels);
-               ASSERT(db->db_blkid == DMU_BONUS_BLKID || db->db_blkid ==
-                   DMU_SPILL_BLKID || list_head(&dn->dn_dbufs));
+               ASSERT(db->db_blkid == DMU_BONUS_BLKID ||
+                   db->db_blkid == DMU_SPILL_BLKID ||
+                   !list_is_empty(&dn->dn_dbufs));
        }
        if (db->db_blkid == DMU_BONUS_BLKID) {
                ASSERT(dn != NULL);
@@ -355,7 +374,7 @@ dbuf_verify(dmu_buf_impl_t *db)
                         * have the struct_rwlock.  XXX indblksz no longer
                         * grows.  safe to do this now?
                         */
-                       if (RW_WRITE_HELD(&db->db_dnode->dn_struct_rwlock)) {
+                       if (RW_WRITE_HELD(&dn->dn_struct_rwlock)) {
                                ASSERT3P(db->db_blkptr, ==,
                                    ((blkptr_t *)db->db_parent->db.db_data +
                                    db->db_blkid % epb));
@@ -380,6 +399,7 @@ dbuf_verify(dmu_buf_impl_t *db)
                        }
                }
        }
+       DB_DNODE_EXIT(db);
 }
 #endif
 
@@ -424,8 +444,11 @@ dbuf_loan_arcbuf(dmu_buf_impl_t *db)
        mutex_enter(&db->db_mtx);
        if (arc_released(db->db_buf) || refcount_count(&db->db_holds) > 1) {
                int blksz = db->db.db_size;
+               spa_t *spa;
+
                mutex_exit(&db->db_mtx);
-               abuf = arc_loan_buf(db->db_dnode->dn_objset->os_spa, blksz);
+               DB_GET_SPA(&spa, db);
+               abuf = arc_loan_buf(spa, blksz);
                bcopy(db->db.db_data, abuf->b_data, blksz);
        } else {
                abuf = db->db_buf;
@@ -484,11 +507,14 @@ dbuf_read_done(zio_t *zio, arc_buf_t *buf, void *vdb)
 static void
 dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t *flags)
 {
-       dnode_t *dn = db->db_dnode;
+       dnode_t *dn;
+       spa_t *spa;
        zbookmark_t zb;
        uint32_t aflags = ARC_NOWAIT;
        arc_buf_t *pbuf;
 
+       DB_DNODE_ENTER(db);
+       dn = DB_DNODE(db);
        ASSERT(!refcount_is_zero(&db->db_holds));
        /* We need the struct_rwlock to prevent db_blkptr from changing. */
        ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
@@ -506,6 +532,7 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t *flags)
                        bzero(db->db.db_data, DN_MAX_BONUSLEN);
                if (bonuslen)
                        bcopy(DN_BONUS(dn->dn_phys), db->db.db_data, bonuslen);
+               DB_DNODE_EXIT(db);
                dbuf_update_data(db);
                db->db_state = DB_CACHED;
                mutex_exit(&db->db_mtx);
@@ -524,6 +551,7 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t *flags)
 
                dbuf_set_data(db, arc_buf_alloc(dn->dn_objset->os_spa,
                    db->db.db_size, db, type));
+               DB_DNODE_EXIT(db);
                bzero(db->db.db_data, db->db.db_size);
                db->db_state = DB_CACHED;
                *flags |= DB_RF_CACHED;
@@ -531,6 +559,9 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t *flags)
                return;
        }
 
+       spa = dn->dn_objset->os_spa;
+       DB_DNODE_EXIT(db);
+
        db->db_state = DB_READ;
        mutex_exit(&db->db_mtx);
 
@@ -549,7 +580,7 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t *flags)
        else
                pbuf = db->db_objset->os_phys_buf;
 
-       (void) dsl_read(zio, dn->dn_objset->os_spa, db->db_blkptr, pbuf,
+       (void) dsl_read(zio, spa, db->db_blkptr, pbuf,
            dbuf_read_done, db, ZIO_PRIORITY_SYNC_READ,
            (*flags & DB_RF_CANFAIL) ? ZIO_FLAG_CANFAIL : ZIO_FLAG_MUSTSUCCEED,
            &aflags, &zb);
@@ -563,6 +594,7 @@ dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
        int err = 0;
        int havepzio = (zio != NULL);
        int prefetch;
+       dnode_t *dn;
 
        /*
         * We don't have to hold the mutex to check db_state because it
@@ -573,46 +605,51 @@ dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
        if (db->db_state == DB_NOFILL)
                return (EIO);
 
+       DB_DNODE_ENTER(db);
+       dn = DB_DNODE(db);
        if ((flags & DB_RF_HAVESTRUCT) == 0)
-               rw_enter(&db->db_dnode->dn_struct_rwlock, RW_READER);
+               rw_enter(&dn->dn_struct_rwlock, RW_READER);
 
        prefetch = db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID &&
-           (flags & DB_RF_NOPREFETCH) == 0 && db->db_dnode != NULL &&
+           (flags & DB_RF_NOPREFETCH) == 0 && dn != NULL &&
            DBUF_IS_CACHEABLE(db);
 
        mutex_enter(&db->db_mtx);
        if (db->db_state == DB_CACHED) {
                mutex_exit(&db->db_mtx);
                if (prefetch)
-                       dmu_zfetch(&db->db_dnode->dn_zfetch, db->db.db_offset,
+                       dmu_zfetch(&dn->dn_zfetch, db->db.db_offset,
                            db->db.db_size, TRUE);
                if ((flags & DB_RF_HAVESTRUCT) == 0)
-                       rw_exit(&db->db_dnode->dn_struct_rwlock);
+                       rw_exit(&dn->dn_struct_rwlock);
+               DB_DNODE_EXIT(db);
        } else if (db->db_state == DB_UNCACHED) {
-               if (zio == NULL) {
-                       zio = zio_root(db->db_dnode->dn_objset->os_spa,
-                           NULL, NULL, ZIO_FLAG_CANFAIL);
-               }
+               spa_t *spa = dn->dn_objset->os_spa;
+
+               if (zio == NULL)
+                       zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
                dbuf_read_impl(db, zio, &flags);
 
                /* dbuf_read_impl has dropped db_mtx for us */
 
                if (prefetch)
-                       dmu_zfetch(&db->db_dnode->dn_zfetch, db->db.db_offset,
+                       dmu_zfetch(&dn->dn_zfetch, db->db.db_offset,
                            db->db.db_size, flags & DB_RF_CACHED);
 
                if ((flags & DB_RF_HAVESTRUCT) == 0)
-                       rw_exit(&db->db_dnode->dn_struct_rwlock);
+                       rw_exit(&dn->dn_struct_rwlock);
+               DB_DNODE_EXIT(db);
 
                if (!havepzio)
                        err = zio_wait(zio);
        } else {
                mutex_exit(&db->db_mtx);
                if (prefetch)
-                       dmu_zfetch(&db->db_dnode->dn_zfetch, db->db.db_offset,
+                       dmu_zfetch(&dn->dn_zfetch, db->db.db_offset,
                            db->db.db_size, TRUE);
                if ((flags & DB_RF_HAVESTRUCT) == 0)
-                       rw_exit(&db->db_dnode->dn_struct_rwlock);
+                       rw_exit(&dn->dn_struct_rwlock);
+               DB_DNODE_EXIT(db);
 
                mutex_enter(&db->db_mtx);
                if ((flags & DB_RF_NEVERWAIT) == 0) {
@@ -642,11 +679,12 @@ dbuf_noread(dmu_buf_impl_t *db)
                cv_wait(&db->db_changed, &db->db_mtx);
        if (db->db_state == DB_UNCACHED) {
                arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
+               spa_t *spa;
 
                ASSERT(db->db_buf == NULL);
                ASSERT(db->db.db_data == NULL);
-               dbuf_set_data(db, arc_buf_alloc(db->db_dnode->dn_objset->os_spa,
-                   db->db.db_size, db, type));
+               DB_GET_SPA(&spa, db);
+               dbuf_set_data(db, arc_buf_alloc(spa, db->db.db_size, db, type));
                db->db_state = DB_FILL;
        } else if (db->db_state == DB_NOFILL) {
                dbuf_set_data(db, NULL);
@@ -687,7 +725,7 @@ dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg)
        /*
         * If the last dirty record for this dbuf has not yet synced
         * and its referencing the dbuf data, either:
-        *      reset the reference to point to a new copy,
+        *      reset the reference to point to a new copy,
         * or (if there a no active holders)
         *      just null out the current db_data pointer.
         */
@@ -700,8 +738,10 @@ dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg)
        } else if (refcount_count(&db->db_holds) > db->db_dirtycnt) {
                int size = db->db.db_size;
                arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
-               dr->dt.dl.dr_data = arc_buf_alloc(
-                   db->db_dnode->dn_objset->os_spa, size, db, type);
+               spa_t *spa;
+
+               DB_GET_SPA(&spa, db);
+               dr->dt.dl.dr_data = arc_buf_alloc(spa, size, db, type);
                bcopy(db->db.db_data, dr->dt.dl.dr_data->b_data, size);
        } else {
                dbuf_set_data(db, NULL);
@@ -726,9 +766,12 @@ dbuf_unoverride(dbuf_dirty_record_t *dr)
        ASSERT(db->db_data_pending != dr);
 
        /* free this block */
-       if (!BP_IS_HOLE(bp))
-               zio_free(db->db_dnode->dn_objset->os_spa, txg, bp);
+       if (!BP_IS_HOLE(bp)) {
+               spa_t *spa;
 
+               DB_GET_SPA(&spa, db);
+               zio_free(spa, txg, bp);
+       }
        dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
        /*
         * Release the already-written buffer, so we leave it in
@@ -865,10 +908,15 @@ dbuf_block_freeable(dmu_buf_impl_t *db)
        else if (db->db_blkptr)
                birth_txg = db->db_blkptr->blk_birth;
 
-       /* If we don't exist or are in a snapshot, we can't be freed */
+       /*
+        * If we don't exist or are in a snapshot, we can't be freed.
+        * Don't pass the bp to dsl_dataset_block_freeable() since we
+        * are holding the db_mtx lock and might deadlock if we are
+        * prefetching a dedup-ed block.
+        */
        if (birth_txg)
                return (ds == NULL ||
-                   dsl_dataset_block_freeable(ds, db->db_blkptr, birth_txg));
+                   dsl_dataset_block_freeable(ds, NULL, birth_txg));
        else
                return (FALSE);
 }
@@ -879,11 +927,15 @@ dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx)
        arc_buf_t *buf, *obuf;
        int osize = db->db.db_size;
        arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
+       dnode_t *dn;
 
        ASSERT(db->db_blkid != DMU_BONUS_BLKID);
 
+       DB_DNODE_ENTER(db);
+       dn = DB_DNODE(db);
+
        /* XXX does *this* func really need the lock? */
-       ASSERT(RW_WRITE_HELD(&db->db_dnode->dn_struct_rwlock));
+       ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock));
 
        /*
         * This call to dbuf_will_dirty() with the dn_struct_rwlock held
@@ -898,7 +950,7 @@ dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx)
        dbuf_will_dirty(db, tx);
 
        /* create the data buffer for the new block */
-       buf = arc_buf_alloc(db->db_dnode->dn_objset->os_spa, size, db, type);
+       buf = arc_buf_alloc(dn->dn_objset->os_spa, size, db, type);
 
        /* copy old block data to the new block */
        obuf = db->db_buf;
@@ -918,15 +970,17 @@ dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx)
        }
        mutex_exit(&db->db_mtx);
 
-       dnode_willuse_space(db->db_dnode, size-osize, tx);
+       dnode_willuse_space(dn, size-osize, tx);
+       DB_DNODE_EXIT(db);
 }
 
 void
 dbuf_release_bp(dmu_buf_impl_t *db)
 {
-       objset_t *os = db->db_dnode->dn_objset;
+       objset_t *os;
        zbookmark_t zb;
 
+       DB_GET_OBJSET(&os, db);
        ASSERT(dsl_pool_sync_context(dmu_objset_pool(os)));
        ASSERT(arc_released(os->os_phys_buf) ||
            list_link_active(&os->os_dsl_dataset->ds_synced_link));
@@ -944,8 +998,8 @@ dbuf_release_bp(dmu_buf_impl_t *db)
 dbuf_dirty_record_t *
 dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
 {
-       dnode_t *dn = db->db_dnode;
-       objset_t *os = dn->dn_objset;
+       dnode_t *dn;
+       objset_t *os;
        dbuf_dirty_record_t **drp, *dr;
        int drop_struct_lock = FALSE;
        boolean_t do_free_accounting = B_FALSE;
@@ -955,6 +1009,8 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
        ASSERT(!refcount_is_zero(&db->db_holds));
        DMU_TX_DIRTY_BUF(tx, db);
 
+       DB_DNODE_ENTER(db);
+       dn = DB_DNODE(db);
        /*
         * Shouldn't dirty a regular buffer in syncing context.  Private
         * objects may be dirtied in syncing context, but only if they
@@ -1009,6 +1065,8 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
        while ((dr = *drp) != NULL && dr->dr_txg > tx->tx_txg)
                drp = &dr->dr_next;
        if (dr && dr->dr_txg == tx->tx_txg) {
+               DB_DNODE_EXIT(db);
+
                if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID) {
                        /*
                         * If this buffer has already been written out,
@@ -1044,6 +1102,7 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
         * we already dirtied it in open context.  Hence we must make
         * this assertion only if we're not already dirty.
         */
+       os = dn->dn_objset;
        ASSERT(!dmu_tx_is_syncing(tx) || DMU_OBJECT_IS_SPECIAL(dn->dn_object) ||
            os->os_dsl_dataset == NULL || BP_IS_HOLE(os->os_rootbp));
        ASSERT(db->db.db_size != 0);
@@ -1132,6 +1191,7 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
                list_insert_tail(&dn->dn_dirty_records[txgoff], dr);
                mutex_exit(&dn->dn_mtx);
                dnode_setdirty(dn, tx);
+               DB_DNODE_EXIT(db);
                return (dr);
        } else if (do_free_accounting) {
                blkptr_t *bp = db->db_blkptr;
@@ -1145,6 +1205,7 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
                 * db_blkptr, but since this is just a guess,
                 * it's OK if we get an odd answer.
                 */
+               ddt_prefetch(os->os_spa, bp);
                dnode_willuse_space(dn, -willfree, tx);
        }
 
@@ -1193,8 +1254,7 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
        } else {
                ASSERT(db->db_level+1 == dn->dn_nlevels);
                ASSERT(db->db_blkid < dn->dn_nblkptr);
-               ASSERT(db->db_parent == NULL ||
-                   db->db_parent == db->db_dnode->dn_dbuf);
+               ASSERT(db->db_parent == NULL || db->db_parent == dn->dn_dbuf);
                mutex_enter(&dn->dn_mtx);
                ASSERT(!list_link_active(&dr->dr_dirty_node));
                list_insert_tail(&dn->dn_dirty_records[txgoff], dr);
@@ -1204,13 +1264,14 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
        }
 
        dnode_setdirty(dn, tx);
+       DB_DNODE_EXIT(db);
        return (dr);
 }
 
 static int
 dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
 {
-       dnode_t *dn = db->db_dnode;
+       dnode_t *dn;
        uint64_t txg = tx->tx_txg;
        dbuf_dirty_record_t *dr, **drp;
 
@@ -1231,6 +1292,9 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
        ASSERT(dr->dr_txg == txg);
        ASSERT(dr->dr_dbuf == db);
 
+       DB_DNODE_ENTER(db);
+       dn = DB_DNODE(db);
+
        /*
         * If this buffer is currently held, we cannot undirty
         * it, since one of the current holders may be in the
@@ -1243,6 +1307,7 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
                mutex_enter(&dn->dn_mtx);
                dnode_clear_range(dn, db->db_blkid, 1, tx);
                mutex_exit(&dn->dn_mtx);
+               DB_DNODE_EXIT(db);
                return (0);
        }
 
@@ -1264,6 +1329,7 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
                list_remove(&dn->dn_dirty_records[txg & TXG_MASK], dr);
                mutex_exit(&dn->dn_mtx);
        }
+       DB_DNODE_EXIT(db);
 
        if (db->db_level == 0) {
                if (db->db_state != DB_NOFILL) {
@@ -1309,8 +1375,10 @@ dbuf_will_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
        ASSERT(tx->tx_txg != 0);
        ASSERT(!refcount_is_zero(&db->db_holds));
 
-       if (RW_WRITE_HELD(&db->db_dnode->dn_struct_rwlock))
+       DB_DNODE_ENTER(db);
+       if (RW_WRITE_HELD(&DB_DNODE(db)->dn_struct_rwlock))
                rf |= DB_RF_HAVESTRUCT;
+       DB_DNODE_EXIT(db);
        (void) dbuf_read(db, NULL, rf);
        (void) dbuf_dirty(db, tx);
 }
@@ -1372,7 +1440,6 @@ void
 dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx)
 {
        ASSERT(!refcount_is_zero(&db->db_holds));
-       ASSERT(db->db_dnode->dn_object != DMU_META_DNODE_OBJECT);
        ASSERT(db->db_blkid != DMU_BONUS_BLKID);
        ASSERT(db->db_level == 0);
        ASSERT(DBUF_GET_BUFC_TYPE(db) == ARC_BUFC_DATA);
@@ -1436,7 +1503,7 @@ dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx)
  * in this case.  For callers from the DMU we will usually see:
  *     dbuf_clear()->arc_buf_evict()->dbuf_do_evict()->dbuf_destroy()
  * For the arc callback, we will usually see:
- *     dbuf_do_evict()->dbuf_clear();dbuf_destroy()
+ *     dbuf_do_evict()->dbuf_clear();dbuf_destroy()
  * Sometimes, though, we will get a mix of these two:
  *     DMU: dbuf_clear()->arc_buf_evict()
  *     ARC: dbuf_do_evict()->dbuf_destroy()
@@ -1444,9 +1511,9 @@ dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx)
 void
 dbuf_clear(dmu_buf_impl_t *db)
 {
-       dnode_t *dn = db->db_dnode;
+       dnode_t *dn;
        dmu_buf_impl_t *parent = db->db_parent;
-       dmu_buf_impl_t *dndb = dn->dn_dbuf;
+       dmu_buf_impl_t *dndb;
        int dbuf_gone = FALSE;
 
        ASSERT(MUTEX_HELD(&db->db_mtx));
@@ -1470,10 +1537,26 @@ dbuf_clear(dmu_buf_impl_t *db)
        db->db_state = DB_EVICTING;
        db->db_blkptr = NULL;
 
+       DB_DNODE_ENTER(db);
+       dn = DB_DNODE(db);
+       dndb = dn->dn_dbuf;
        if (db->db_blkid != DMU_BONUS_BLKID && MUTEX_HELD(&dn->dn_dbufs_mtx)) {
                list_remove(&dn->dn_dbufs, db);
+               (void) atomic_dec_32_nv(&dn->dn_dbufs_count);
+               membar_producer();
+               DB_DNODE_EXIT(db);
+               /*
+                * Decrementing the dbuf count means that the hold corresponding
+                * to the removed dbuf is no longer discounted in dnode_move(),
+                * so the dnode cannot be moved until after we release the hold.
+                * The membar_producer() ensures visibility of the decremented
+                * value in dnode_move(), since DB_DNODE_EXIT doesn't actually
+                * release any lock.
+                */
                dnode_rele(dn, db);
-               db->db_dnode = NULL;
+               db->db_dnode_handle = NULL;
+       } else {
+               DB_DNODE_EXIT(db);
        }
 
        if (db->db_buf)
@@ -1483,7 +1566,7 @@ dbuf_clear(dmu_buf_impl_t *db)
                mutex_exit(&db->db_mtx);
 
        /*
-        * If this dbuf is referened from an indirect dbuf,
+        * If this dbuf is referenced from an indirect dbuf,
         * decrement the ref count on the indirect dbuf.
         */
        if (parent && parent != dndb)
@@ -1575,7 +1658,7 @@ dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid,
        db->db_blkid = blkid;
        db->db_last_dirty = NULL;
        db->db_dirtycnt = 0;
-       db->db_dnode = dn;
+       db->db_dnode_handle = dn->dn_handle;
        db->db_parent = parent;
        db->db_blkptr = blkptr;
 
@@ -1632,6 +1715,7 @@ dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid,
        ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT ||
            refcount_count(&dn->dn_holds) > 0);
        (void) refcount_add(&dn->dn_holds, db);
+       (void) atomic_inc_32_nv(&dn->dn_dbufs_count);
 
        dprintf_dbuf(db, "db=%p\n", db);
 
@@ -1671,15 +1755,24 @@ dbuf_destroy(dmu_buf_impl_t *db)
                 * If this dbuf is still on the dn_dbufs list,
                 * remove it from that list.
                 */
-               if (db->db_dnode) {
-                       dnode_t *dn = db->db_dnode;
+               if (db->db_dnode_handle != NULL) {
+                       dnode_t *dn;
 
+                       DB_DNODE_ENTER(db);
+                       dn = DB_DNODE(db);
                        mutex_enter(&dn->dn_dbufs_mtx);
                        list_remove(&dn->dn_dbufs, db);
+                       (void) atomic_dec_32_nv(&dn->dn_dbufs_count);
                        mutex_exit(&dn->dn_dbufs_mtx);
-
+                       DB_DNODE_EXIT(db);
+                       /*
+                        * Decrementing the dbuf count means that the hold
+                        * corresponding to the removed dbuf is no longer
+                        * discounted in dnode_move(), so the dnode cannot be
+                        * moved until after we release the hold.
+                        */
                        dnode_rele(dn, db);
-                       db->db_dnode = NULL;
+                       db->db_dnode_handle = NULL;
                }
                dbuf_hash_remove(db);
        }
@@ -1710,17 +1803,13 @@ dbuf_prefetch(dnode_t *dn, uint64_t blkid)
 
        /* dbuf_find() returns with db_mtx held */
        if (db = dbuf_find(dn, 0, blkid)) {
-               if (refcount_count(&db->db_holds) > 0) {
-                       /*
-                        * This dbuf is active.  We assume that it is
-                        * already CACHED, or else about to be either
-                        * read or filled.
-                        */
-                       mutex_exit(&db->db_mtx);
-                       return;
-               }
+               /*
+                * This dbuf is already in the cache.  We assume that
+                * it is already CACHED, or else about to be either
+                * read or filled.
+                */
                mutex_exit(&db->db_mtx);
-               db = NULL;
+               return;
        }
 
        if (dbuf_findbp(dn, 0, blkid, TRUE, &db, &bp) == 0) {
@@ -1818,7 +1907,7 @@ top:
                        arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
 
                        dbuf_set_data(db,
-                           arc_buf_alloc(db->db_dnode->dn_objset->os_spa,
+                           arc_buf_alloc(dn->dn_objset->os_spa,
                            db->db.db_size, db, type));
                        bcopy(dr->dt.dl.dr_data->b_data, db->db.db_data,
                            db->db.db_size);
@@ -1834,7 +1923,7 @@ top:
        if (parent)
                dbuf_rele(parent, NULL);
 
-       ASSERT3P(db->db_dnode, ==, dn);
+       ASSERT3P(DB_DNODE(db), ==, dn);
        ASSERT3U(db->db_blkid, ==, blkid);
        ASSERT3U(db->db_level, ==, level);
        *dbp = db;
@@ -1871,6 +1960,8 @@ int
 dbuf_spill_set_blksz(dmu_buf_t *db_fake, uint64_t blksz, dmu_tx_t *tx)
 {
        dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
+       dnode_t *dn;
+
        if (db->db_blkid != DMU_SPILL_BLKID)
                return (ENOTSUP);
        if (blksz == 0)
@@ -1880,9 +1971,12 @@ dbuf_spill_set_blksz(dmu_buf_t *db_fake, uint64_t blksz, dmu_tx_t *tx)
        else
                blksz = P2ROUNDUP(blksz, SPA_MINBLOCKSIZE);
 
-       rw_enter(&db->db_dnode->dn_struct_rwlock, RW_WRITER);
+       DB_DNODE_ENTER(db);
+       dn = DB_DNODE(db);
+       rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
        dbuf_new_size(db, blksz, tx);
-       rw_exit(&db->db_dnode->dn_struct_rwlock);
+       rw_exit(&dn->dn_struct_rwlock);
+       DB_DNODE_EXIT(db);
 
        return (0);
 }
@@ -1901,6 +1995,13 @@ dbuf_add_ref(dmu_buf_impl_t *db, void *tag)
        ASSERT(holds > 1);
 }
 
+/*
+ * If you call dbuf_rele() you had better not be referencing the dnode handle
+ * unless you have some other direct or indirect hold on the dnode. (An indirect
+ * hold is a hold on one of the dnode's dbufs, including the bonus buffer.)
+ * Without that, the dbuf_rele() could lead to a dnode_rele() followed by the
+ * dnode's parent dbuf evicting its dnode handles.
+ */
 #pragma weak dmu_buf_rele = dbuf_rele
 void
 dbuf_rele(dmu_buf_impl_t *db, void *tag)
@@ -1921,6 +2022,11 @@ dbuf_rele_and_unlock(dmu_buf_impl_t *db, void *tag)
        ASSERT(MUTEX_HELD(&db->db_mtx));
        DBUF_VERIFY(db);
 
+       /*
+        * Remove the reference to the dbuf before removing its hold on the
+        * dnode so we can guarantee in dnode_move() that a referenced bonus
+        * buffer has a corresponding dnode hold.
+        */
        holds = refcount_remove(&db->db_holds, tag);
        ASSERT(holds >= 0);
 
@@ -1938,7 +2044,20 @@ dbuf_rele_and_unlock(dmu_buf_impl_t *db, void *tag)
        if (holds == 0) {
                if (db->db_blkid == DMU_BONUS_BLKID) {
                        mutex_exit(&db->db_mtx);
-                       dnode_rele(db->db_dnode, db);
+
+                       /*
+                        * If the dnode moves here, we cannot cross this barrier
+                        * until the move completes.
+                        */
+                       DB_DNODE_ENTER(db);
+                       (void) atomic_dec_32_nv(&DB_DNODE(db)->dn_dbufs_count);
+                       DB_DNODE_EXIT(db);
+                       /*
+                        * The bonus buffer's dnode hold is no longer discounted
+                        * in dnode_move(). The dnode cannot move until after
+                        * the dnode_rele().
+                        */
+                       dnode_rele(DB_DNODE(db), db);
                } else if (db->db_buf == NULL) {
                        /*
                         * This is a special case: we never associated this
@@ -2089,7 +2208,7 @@ static void
 dbuf_sync_indirect(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
 {
        dmu_buf_impl_t *db = dr->dr_dbuf;
-       dnode_t *dn = db->db_dnode;
+       dnode_t *dn;
        zio_t *zio;
 
        ASSERT(dmu_tx_is_syncing(tx));
@@ -2107,10 +2226,13 @@ dbuf_sync_indirect(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
                mutex_enter(&db->db_mtx);
        }
        ASSERT3U(db->db_state, ==, DB_CACHED);
-       ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift);
        ASSERT(db->db_buf != NULL);
 
+       DB_DNODE_ENTER(db);
+       dn = DB_DNODE(db);
+       ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift);
        dbuf_check_blkptr(dn, db);
+       DB_DNODE_EXIT(db);
 
        db->db_data_pending = dr;
 
@@ -2130,8 +2252,8 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
 {
        arc_buf_t **datap = &dr->dt.dl.dr_data;
        dmu_buf_impl_t *db = dr->dr_dbuf;
-       dnode_t *dn = db->db_dnode;
-       objset_t *os = dn->dn_objset;
+       dnode_t *dn;
+       objset_t *os;
        uint64_t txg = tx->tx_txg;
 
        ASSERT(dmu_tx_is_syncing(tx));
@@ -2154,6 +2276,9 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
        }
        DBUF_VERIFY(db);
 
+       DB_DNODE_ENTER(db);
+       dn = DB_DNODE(db);
+
        if (db->db_blkid == DMU_SPILL_BLKID) {
                mutex_enter(&dn->dn_mtx);
                dn->dn_phys->dn_flags |= DNODE_FLAG_SPILL_BLKPTR;
@@ -2173,6 +2298,8 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
                ASSERT3U(db->db_level, ==, 0);
                ASSERT3U(dn->dn_phys->dn_bonuslen, <=, DN_MAX_BONUSLEN);
                bcopy(*datap, DN_BONUS(dn->dn_phys), dn->dn_phys->dn_bonuslen);
+               DB_DNODE_EXIT(db);
+
                if (*datap != db->db.db_data) {
                        zio_buf_free(*datap, DN_MAX_BONUSLEN);
                        arc_space_return(DN_MAX_BONUSLEN, ARC_SPACE_OTHER);
@@ -2191,6 +2318,8 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
                return;
        }
 
+       os = dn->dn_objset;
+
        /*
         * This function may have dropped the db_mtx lock allowing a dmu_sync
         * operation to sneak in. As a result, we need to ensure that we
@@ -2200,7 +2329,7 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
        dbuf_check_blkptr(dn, db);
 
        /*
-        * If this buffer is in the middle of an immdiate write,
+        * If this buffer is in the middle of an immediate write,
         * wait for the synchronous IO to complete.
         */
        while (dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC) {
@@ -2237,10 +2366,20 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
        dbuf_write(dr, *datap, tx);
 
        ASSERT(!list_link_active(&dr->dr_dirty_node));
-       if (dn->dn_object == DMU_META_DNODE_OBJECT)
+       if (dn->dn_object == DMU_META_DNODE_OBJECT) {
                list_insert_tail(&dn->dn_dirty_records[txg&TXG_MASK], dr);
-       else
+               DB_DNODE_EXIT(db);
+       } else {
+               /*
+                * Although zio_nowait() does not "wait for an IO", it does
+                * initiate the IO. If this is an empty write it seems plausible
+                * that the IO could actually be completed before the nowait
+                * returns. We need to DB_DNODE_EXIT() first in case
+                * zio_nowait() invalidates the dbuf.
+                */
+               DB_DNODE_EXIT(db);
                zio_nowait(dr->dr_zio);
+       }
 }
 
 void
@@ -2274,9 +2413,9 @@ static void
 dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb)
 {
        dmu_buf_impl_t *db = vdb;
+       dnode_t *dn;
        blkptr_t *bp = zio->io_bp;
        blkptr_t *bp_orig = &zio->io_bp_orig;
-       dnode_t *dn = db->db_dnode;
        spa_t *spa = zio->io_spa;
        int64_t delta;
        uint64_t fill = 0;
@@ -2284,12 +2423,15 @@ dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb)
 
        ASSERT(db->db_blkptr == bp);
 
+       DB_DNODE_ENTER(db);
+       dn = DB_DNODE(db);
        delta = bp_get_dsize_sync(spa, bp) - bp_get_dsize_sync(spa, bp_orig);
        dnode_diduse_space(dn, delta - zio->io_prev_space_delta);
        zio->io_prev_space_delta = delta;
 
        if (BP_IS_HOLE(bp)) {
                ASSERT(bp->blk_fill == 0);
+               DB_DNODE_EXIT(db);
                return;
        }
 
@@ -2303,7 +2445,6 @@ dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb)
 
 #ifdef ZFS_DEBUG
        if (db->db_blkid == DMU_SPILL_BLKID) {
-               dnode_t *dn = db->db_dnode;
                ASSERT(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR);
                ASSERT(!(BP_IS_HOLE(db->db_blkptr)) &&
                    db->db_blkptr == &dn->dn_phys->dn_spill);
@@ -2336,6 +2477,7 @@ dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb)
                        fill += ibp->blk_fill;
                }
        }
+       DB_DNODE_EXIT(db);
 
        bp->blk_fill = fill;
 
@@ -2349,8 +2491,6 @@ dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb)
        dmu_buf_impl_t *db = vdb;
        blkptr_t *bp = zio->io_bp;
        blkptr_t *bp_orig = &zio->io_bp_orig;
-       dnode_t *dn = db->db_dnode;
-       objset_t *os = dn->dn_objset;
        uint64_t txg = zio->io_txg;
        dbuf_dirty_record_t **drp, *dr;
 
@@ -2360,8 +2500,13 @@ dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb)
        if (zio->io_flags & ZIO_FLAG_IO_REWRITE) {
                ASSERT(BP_EQUAL(bp, bp_orig));
        } else {
-               dsl_dataset_t *ds = os->os_dsl_dataset;
-               dmu_tx_t *tx = os->os_synctx;
+               objset_t *os;
+               dsl_dataset_t *ds;
+               dmu_tx_t *tx;
+
+               DB_GET_OBJSET(&os, db);
+               ds = os->os_dsl_dataset;
+               tx = os->os_synctx;
 
                (void) dsl_dataset_block_kill(ds, bp_orig, tx, B_TRUE);
                dsl_dataset_block_born(ds, bp, tx);
@@ -2382,10 +2527,14 @@ dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb)
 
 #ifdef ZFS_DEBUG
        if (db->db_blkid == DMU_SPILL_BLKID) {
-               dnode_t *dn = db->db_dnode;
+               dnode_t *dn;
+
+               DB_DNODE_ENTER(db);
+               dn = DB_DNODE(db);
                ASSERT(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR);
                ASSERT(!(BP_IS_HOLE(db->db_blkptr)) &&
                    db->db_blkptr == &dn->dn_phys->dn_spill);
+               DB_DNODE_EXIT(db);
        }
 #endif
 
@@ -2400,6 +2549,10 @@ dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb)
                                arc_set_callback(db->db_buf, dbuf_do_evict, db);
                }
        } else {
+               dnode_t *dn;
+
+               DB_DNODE_ENTER(db);
+               dn = DB_DNODE(db);
                ASSERT(list_head(&dr->dt.di.dr_children) == NULL);
                ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift);
                if (!BP_IS_HOLE(db->db_blkptr)) {
@@ -2411,6 +2564,7 @@ dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb)
                            >> (db->db_level * epbs), >=, db->db_blkid);
                        arc_set_callback(db->db_buf, dbuf_do_evict, db);
                }
+               DB_DNODE_EXIT(db);
                mutex_destroy(&dr->dt.di.dr_mtx);
                list_destroy(&dr->dt.di.dr_children);
        }
@@ -2466,8 +2620,8 @@ static void
 dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx)
 {
        dmu_buf_impl_t *db = dr->dr_dbuf;
-       dnode_t *dn = db->db_dnode;
-       objset_t *os = dn->dn_objset;
+       dnode_t *dn;
+       objset_t *os;
        dmu_buf_impl_t *parent = db->db_parent;
        uint64_t txg = tx->tx_txg;
        zbookmark_t zb;
@@ -2475,6 +2629,10 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx)
        zio_t *zio;
        int wp_flag = 0;
 
+       DB_DNODE_ENTER(db);
+       dn = DB_DNODE(db);
+       os = dn->dn_objset;
+
        if (db->db_state != DB_NOFILL) {
                if (db->db_level > 0 || dn->dn_type == DMU_OT_DNODE) {
                        /*
@@ -2519,6 +2677,7 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx)
        wp_flag |= (db->db_state == DB_NOFILL) ? WP_NOFILL : 0;
 
        dmu_write_policy(os, dn, db->db_level, wp_flag, &zp);
+       DB_DNODE_EXIT(db);
 
        if (db->db_level == 0 && dr->dt.dl.dr_override_state == DR_OVERRIDDEN) {
                ASSERT(db->db_state != DB_NOFILL);
index 926b4df9a5d909d263e2bcb35c612f92e5304df1..71833149676561796130db2e2a1ec5009e3d8bb7 100644 (file)
 #include <sys/zio_compress.h>
 #include <sys/dsl_scan.h>
 
+/*
+ * Enable/disable prefetching of dedup-ed blocks which are going to be freed.
+ */
+int zfs_dedup_prefetch = 1;
+
 static const ddt_ops_t *ddt_ops[DDT_TYPES] = {
        &ddt_zap_ops,
 };
@@ -456,9 +461,6 @@ ddt_get_dedup_object_stats(spa_t *spa, ddt_object_t *ddo_total)
        if (ddo_total->ddo_count != 0) {
                ddo_total->ddo_dspace /= ddo_total->ddo_count;
                ddo_total->ddo_mspace /= ddo_total->ddo_count;
-       } else {
-               ASSERT(ddo_total->ddo_dspace == 0);
-               ASSERT(ddo_total->ddo_mspace == 0);
        }
 }
 
@@ -730,13 +732,13 @@ ddt_prefetch(spa_t *spa, const blkptr_t *bp)
        ddt_t *ddt;
        ddt_entry_t dde;
 
-       if (!BP_GET_DEDUP(bp))
+       if (!zfs_dedup_prefetch || bp == NULL || !BP_GET_DEDUP(bp))
                return;
 
        /*
-        * We remove the DDT once it's empty and only prefetch dedup blocks
-        * when there are entries in the DDT.  Thus no locking is required
-        * as the DDT can't disappear on us.
+        * We only remove the DDT once all tables are empty and only
+        * prefetch dedup blocks when there are entries in the DDT.
+        * Thus no locking is required as the DDT can't disappear on us.
         */
        ddt = ddt_select(spa, bp);
        ddt_key_fill(&dde.dde_key, bp);
@@ -1072,11 +1074,15 @@ ddt_sync_table(ddt_t *ddt, dmu_tx_t *tx, uint64_t txg)
        }
 
        for (enum ddt_type type = 0; type < DDT_TYPES; type++) {
+               uint64_t count = 0;
+               for (enum ddt_class class = 0; class < DDT_CLASSES; class++) {
+                       if (ddt_object_exists(ddt, type, class)) {
+                               ddt_object_sync(ddt, type, class, tx);
+                               count += ddt_object_count(ddt, type, class);
+                       }
+               }
                for (enum ddt_class class = 0; class < DDT_CLASSES; class++) {
-                       if (!ddt_object_exists(ddt, type, class))
-                               continue;
-                       ddt_object_sync(ddt, type, class, tx);
-                       if (ddt_object_count(ddt, type, class) == 0)
+                       if (count == 0 && ddt_object_exists(ddt, type, class))
                                ddt_object_destroy(ddt, type, class, tx);
                }
        }
index 5b87c81c639af00528e1db63967985b678d36b75..39234eba53b2e4730d07d4259b680038dcd541cd 100644 (file)
@@ -133,7 +133,7 @@ dmu_buf_hold(objset_t *os, uint64_t object, uint64_t offset,
        }
 
        dnode_rele(dn, FTAG);
-       *dbp = &db->db;
+       *dbp = &db->db; /* NULL db plus first field offset is NULL */
        return (err);
 }
 
@@ -144,31 +144,64 @@ dmu_bonus_max(void)
 }
 
 int
-dmu_set_bonus(dmu_buf_t *db, int newsize, dmu_tx_t *tx)
+dmu_set_bonus(dmu_buf_t *db_fake, int newsize, dmu_tx_t *tx)
 {
-       dnode_t *dn = ((dmu_buf_impl_t *)db)->db_dnode;
+       dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
+       dnode_t *dn;
+       int error;
 
-       if (dn->dn_bonus != (dmu_buf_impl_t *)db)
-               return (EINVAL);
-       if (newsize < 0 || newsize > db->db_size)
-               return (EINVAL);
-       dnode_setbonuslen(dn, newsize, tx);
-       return (0);
+       DB_DNODE_ENTER(db);
+       dn = DB_DNODE(db);
+
+       if (dn->dn_bonus != db) {
+               error = EINVAL;
+       } else if (newsize < 0 || newsize > db_fake->db_size) {
+               error = EINVAL;
+       } else {
+               dnode_setbonuslen(dn, newsize, tx);
+               error = 0;
+       }
+
+       DB_DNODE_EXIT(db);
+       return (error);
 }
 
+/*
+ * Set the bonus buffer type of the dnode backing db_fake.
+ *
+ * Returns EINVAL if type is not below DMU_OT_NUMTYPES or if db_fake is not
+ * the dnode's bonus buffer; otherwise calls dnode_setbonus_type() and
+ * returns 0.
+ */
 int
-dmu_set_bonustype(dmu_buf_t *db, dmu_object_type_t type, dmu_tx_t *tx)
+dmu_set_bonustype(dmu_buf_t *db_fake, dmu_object_type_t type, dmu_tx_t *tx)
 {
-       dnode_t *dn = ((dmu_buf_impl_t *)db)->db_dnode;
+       dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
+       dnode_t *dn;
+       int error;
 
-       if (type > DMU_OT_NUMTYPES)
-               return (EINVAL);
+       DB_DNODE_ENTER(db);
+       dn = DB_DNODE(db);
 
-       if (dn->dn_bonus != (dmu_buf_impl_t *)db)
-               return (EINVAL);
+       /*
+        * DMU_OT_NUMTYPES is one past the last valid type, so it must be
+        * rejected along with anything larger.
+        */
+       if (type >= DMU_OT_NUMTYPES) {
+               error = EINVAL;
+       } else if (dn->dn_bonus != db) {
+               error = EINVAL;
+       } else {
+               dnode_setbonus_type(dn, type, tx);
+               error = 0;
+       }
 
-       dnode_setbonus_type(dn, type, tx);
-       return (0);
+       DB_DNODE_EXIT(db);
+       return (error);
+}
+
+/*
+ * Report the bonus buffer type recorded in the dnode that backs db_fake.
+ * The dnode is only dereferenced between DB_DNODE_ENTER() and
+ * DB_DNODE_EXIT().
+ */
+dmu_object_type_t
+dmu_get_bonustype(dmu_buf_t *db_fake)
+{
+       dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db_fake;
+       dmu_object_type_t bonustype;
+
+       DB_DNODE_ENTER(dbi);
+       bonustype = DB_DNODE(dbi)->dn_bonustype;
+       DB_DNODE_EXIT(dbi);
+
+       return (bonustype);
 }
 
 int
@@ -208,11 +241,19 @@ dmu_bonus_hold(objset_t *os, uint64_t object, void *tag, dmu_buf_t **dbp)
                        dbuf_create_bonus(dn);
        }
        db = dn->dn_bonus;
-       rw_exit(&dn->dn_struct_rwlock);
 
        /* as long as the bonus buf is held, the dnode will be held */
-       if (refcount_add(&db->db_holds, tag) == 1)
+       if (refcount_add(&db->db_holds, tag) == 1) {
                VERIFY(dnode_add_ref(dn, db));
+               (void) atomic_inc_32_nv(&dn->dn_dbufs_count);
+       }
+
+       /*
+        * Wait to drop dn_struct_rwlock until after adding the bonus dbuf's
+        * hold and incrementing the dbuf count to ensure that dnode_move() sees
+        * a dnode hold for every dbuf.
+        */
+       rw_exit(&dn->dn_struct_rwlock);
 
        dnode_rele(dn, FTAG);
 
@@ -246,35 +287,56 @@ dmu_spill_hold_by_dnode(dnode_t *dn, uint32_t flags, void *tag, dmu_buf_t **dbp)
                rw_exit(&dn->dn_struct_rwlock);
 
        ASSERT(db != NULL);
-       err = dbuf_read(db, NULL, DB_RF_MUST_SUCCEED | flags);
-       *dbp = &db->db;
+       err = dbuf_read(db, NULL, flags);
+       if (err == 0)
+               *dbp = &db->db;
+       else
+               dbuf_rele(db, tag);
        return (err);
 }
 
 int
 dmu_spill_hold_existing(dmu_buf_t *bonus, void *tag, dmu_buf_t **dbp)
 {
-       dnode_t *dn = ((dmu_buf_impl_t *)bonus)->db_dnode;
+       dmu_buf_impl_t *db = (dmu_buf_impl_t *)bonus;
+       dnode_t *dn;
        int err;
 
-       if (spa_version(dn->dn_objset->os_spa) < SPA_VERSION_SA)
-               return (EINVAL);
-       rw_enter(&dn->dn_struct_rwlock, RW_READER);
+       DB_DNODE_ENTER(db);
+       dn = DB_DNODE(db);
+
+       if (spa_version(dn->dn_objset->os_spa) < SPA_VERSION_SA) {
+               err = EINVAL;
+       } else {
+               rw_enter(&dn->dn_struct_rwlock, RW_READER);
+
+               if (!dn->dn_have_spill) {
+                       err = ENOENT;
+               } else {
+                       err = dmu_spill_hold_by_dnode(dn,
+                           DB_RF_HAVESTRUCT | DB_RF_CANFAIL, tag, dbp);
+               }
 
-       if (!dn->dn_have_spill) {
                rw_exit(&dn->dn_struct_rwlock);
-               return (ENOENT);
        }
-       err = dmu_spill_hold_by_dnode(dn, DB_RF_HAVESTRUCT, tag, dbp);
-       rw_exit(&dn->dn_struct_rwlock);
+
+       DB_DNODE_EXIT(db);
        return (err);
 }
 
 int
 dmu_spill_hold_by_bonus(dmu_buf_t *bonus, void *tag, dmu_buf_t **dbp)
 {
-       return (dmu_spill_hold_by_dnode(((dmu_buf_impl_t *)bonus)->db_dnode,
-           0, tag, dbp));
+       dmu_buf_impl_t *db = (dmu_buf_impl_t *)bonus;
+       dnode_t *dn;
+       int err;
+
+       DB_DNODE_ENTER(db);
+       dn = DB_DNODE(db);
+       err = dmu_spill_hold_by_dnode(dn, DB_RF_CANFAIL, tag, dbp);
+       DB_DNODE_EXIT(db);
+
+       return (err);
 }
 
 /*
@@ -396,14 +458,18 @@ dmu_buf_hold_array(objset_t *os, uint64_t object, uint64_t offset,
 }
 
 int
-dmu_buf_hold_array_by_bonus(dmu_buf_t *db, uint64_t offset,
+dmu_buf_hold_array_by_bonus(dmu_buf_t *db_fake, uint64_t offset,
     uint64_t length, int read, void *tag, int *numbufsp, dmu_buf_t ***dbpp)
 {
-       dnode_t *dn = ((dmu_buf_impl_t *)db)->db_dnode;
+       dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
+       dnode_t *dn;
        int err;
 
+       DB_DNODE_ENTER(db);
+       dn = DB_DNODE(db);
        err = dmu_buf_hold_array_by_dnode(dn, offset, length, read, tag,
            numbufsp, dbpp, DMU_READ_PREFETCH);
+       DB_DNODE_EXIT(db);
 
        return (err);
 }
@@ -436,7 +502,7 @@ dmu_prefetch(objset_t *os, uint64_t object, uint64_t offset, uint64_t len)
                return;
 
        if (len == 0) {  /* they're interested in the bonus buffer */
-               dn = os->os_meta_dnode;
+               dn = DMU_META_DNODE(os);
 
                if (object == 0 || object >= DN_MAX_OBJECT)
                        return;
@@ -997,11 +1063,19 @@ int
 dmu_write_uio_dbuf(dmu_buf_t *zdb, uio_t *uio, uint64_t size,
     dmu_tx_t *tx)
 {
+       dmu_buf_impl_t *db = (dmu_buf_impl_t *)zdb;
+       dnode_t *dn;
+       int err;
+
        if (size == 0)
                return (0);
 
-       return (dmu_write_uio_dnode(((dmu_buf_impl_t *)zdb)->db_dnode,
-           uio, size, tx));
+       DB_DNODE_ENTER(db);
+       dn = DB_DNODE(db);
+       err = dmu_write_uio_dnode(dn, uio, size, tx);
+       DB_DNODE_EXIT(db);
+
+       return (err);
 }
 
 int
@@ -1087,9 +1161,11 @@ dmu_write_pages(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
 arc_buf_t *
 dmu_request_arcbuf(dmu_buf_t *handle, int size)
 {
-       dnode_t *dn = ((dmu_buf_impl_t *)handle)->db_dnode;
+       dmu_buf_impl_t *db = (dmu_buf_impl_t *)handle;
+       spa_t *spa;
 
-       return (arc_loan_buf(dn->dn_objset->os_spa, size));
+       DB_GET_SPA(&spa, db);
+       return (arc_loan_buf(spa, size));
 }
 
 /*
@@ -1111,23 +1187,35 @@ void
 dmu_assign_arcbuf(dmu_buf_t *handle, uint64_t offset, arc_buf_t *buf,
     dmu_tx_t *tx)
 {
-       dnode_t *dn = ((dmu_buf_impl_t *)handle)->db_dnode;
+       dmu_buf_impl_t *dbuf = (dmu_buf_impl_t *)handle;
+       dnode_t *dn;
        dmu_buf_impl_t *db;
        uint32_t blksz = (uint32_t)arc_buf_size(buf);
        uint64_t blkid;
 
+       DB_DNODE_ENTER(dbuf);
+       dn = DB_DNODE(dbuf);
        rw_enter(&dn->dn_struct_rwlock, RW_READER);
        blkid = dbuf_whichblock(dn, offset);
        VERIFY((db = dbuf_hold(dn, blkid, FTAG)) != NULL);
        rw_exit(&dn->dn_struct_rwlock);
+       DB_DNODE_EXIT(dbuf);
 
        if (offset == db->db.db_offset && blksz == db->db.db_size) {
                dbuf_assign_arcbuf(db, buf, tx);
                dbuf_rele(db, FTAG);
        } else {
+               objset_t *os;
+               uint64_t object;
+
+               DB_DNODE_ENTER(dbuf);
+               dn = DB_DNODE(dbuf);
+               os = dn->dn_objset;
+               object = dn->dn_object;
+               DB_DNODE_EXIT(dbuf);
+
                dbuf_rele(db, FTAG);
-               dmu_write(dn->dn_objset, dn->dn_object, offset, blksz,
-                   buf->b_data, tx);
+               dmu_write(os, object, offset, blksz, buf->b_data, tx);
                dmu_return_arcbuf(buf);
                XUIOSTAT_BUMP(xuiostat_wbuf_copied);
        }
@@ -1146,7 +1234,6 @@ dmu_sync_ready(zio_t *zio, arc_buf_t *buf, void *varg)
 {
        dmu_sync_arg_t *dsa = varg;
        dmu_buf_t *db = dsa->dsa_zgd->zgd_db;
-       dnode_t *dn = ((dmu_buf_impl_t *)db)->db_dnode;
        blkptr_t *bp = zio->io_bp;
 
        if (zio->io_error == 0) {
@@ -1157,7 +1244,6 @@ dmu_sync_ready(zio_t *zio, arc_buf_t *buf, void *varg)
                         */
                        BP_SET_LSIZE(bp, db->db_size);
                } else {
-                       ASSERT(BP_GET_TYPE(bp) == dn->dn_type);
                        ASSERT(BP_GET_LEVEL(bp) == 0);
                        bp->blk_fill = 1;
                }
@@ -1280,6 +1366,7 @@ dmu_sync(zio_t *pio, uint64_t txg, dmu_sync_cb_t *done, zgd_t *zgd)
        dmu_sync_arg_t *dsa;
        zbookmark_t zb;
        zio_prop_t zp;
+       dnode_t *dn;
 
        ASSERT(pio != NULL);
        ASSERT(BP_IS_HOLE(bp));
@@ -1288,7 +1375,10 @@ dmu_sync(zio_t *pio, uint64_t txg, dmu_sync_cb_t *done, zgd_t *zgd)
        SET_BOOKMARK(&zb, ds->ds_object,
            db->db.db_object, db->db_level, db->db_blkid);
 
-       dmu_write_policy(os, db->db_dnode, db->db_level, WP_DMU_SYNC, &zp);
+       DB_DNODE_ENTER(db);
+       dn = DB_DNODE(db);
+       dmu_write_policy(os, dn, db->db_level, WP_DMU_SYNC, &zp);
+       DB_DNODE_EXIT(db);
 
        /*
         * If we're frozen (running ziltest), we always need to generate a bp.
@@ -1413,7 +1503,8 @@ void
 dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, zio_prop_t *zp)
 {
        dmu_object_type_t type = dn ? dn->dn_type : DMU_OT_OBJSET;
-       boolean_t ismd = (level > 0 || dmu_ot[type].ot_metadata);
+       boolean_t ismd = (level > 0 || dmu_ot[type].ot_metadata ||
+           (wp & WP_SPILL));
        enum zio_checksum checksum = os->os_checksum;
        enum zio_compress compress = os->os_compress;
        enum zio_checksum dedup_checksum = os->os_dedup_checksum;
@@ -1569,9 +1660,13 @@ dmu_object_info(objset_t *os, uint64_t object, dmu_object_info_t *doi)
  * As above, but faster; can be used when you have a held dbuf in hand.
  */
 void
-dmu_object_info_from_db(dmu_buf_t *db, dmu_object_info_t *doi)
+dmu_object_info_from_db(dmu_buf_t *db_fake, dmu_object_info_t *doi)
 {
-       dmu_object_info_from_dnode(((dmu_buf_impl_t *)db)->db_dnode, doi);
+       dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
+
+       DB_DNODE_ENTER(db);
+       dmu_object_info_from_dnode(DB_DNODE(db), doi);
+       DB_DNODE_EXIT(db);
 }
 
 /*
@@ -1579,14 +1674,20 @@ dmu_object_info_from_db(dmu_buf_t *db, dmu_object_info_t *doi)
  * This is specifically optimized for zfs_getattr().
  */
 void
-dmu_object_size_from_db(dmu_buf_t *db, uint32_t *blksize, u_longlong_t *nblk512)
+dmu_object_size_from_db(dmu_buf_t *db_fake, uint32_t *blksize,
+    u_longlong_t *nblk512)
 {
-       dnode_t *dn = ((dmu_buf_impl_t *)db)->db_dnode;
+       dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
+       dnode_t *dn;
+
+       DB_DNODE_ENTER(db);
+       dn = DB_DNODE(db);
 
        *blksize = dn->dn_datablksz;
        /* add 1 for dnode space */
        *nblk512 = ((DN_USED_BYTES(dn->dn_phys) + SPA_MINBLOCKSIZE/2) >>
            SPA_MINBLOCKSHIFT) + 1;
+       DB_DNODE_EXIT(db);
 }
 
 void
@@ -1638,23 +1739,25 @@ void
 dmu_init(void)
 {
        zfs_dbgmsg_init();
-       dbuf_init();
+       sa_cache_init();
+       xuio_stat_init();
+       dmu_objset_init();
        dnode_init();
+       dbuf_init();
        zfetch_init();
        arc_init();
        l2arc_init();
-       xuio_stat_init();
-       sa_cache_init();
 }
 
 void
 dmu_fini(void)
 {
+       l2arc_fini();
        arc_fini();
        zfetch_fini();
-       dnode_fini();
        dbuf_fini();
-       l2arc_fini();
+       dnode_fini();
+       dmu_objset_fini();
        xuio_stat_fini();
        sa_cache_fini();
        zfs_dbgmsg_fini();
diff --git a/module/zfs/dmu_diff.c b/module/zfs/dmu_diff.c
new file mode 100644 (file)
index 0000000..22340eb
--- /dev/null
@@ -0,0 +1,221 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+#include <sys/dmu.h>
+#include <sys/dmu_impl.h>
+#include <sys/dmu_tx.h>
+#include <sys/dbuf.h>
+#include <sys/dnode.h>
+#include <sys/zfs_context.h>
+#include <sys/dmu_objset.h>
+#include <sys/dmu_traverse.h>
+#include <sys/dsl_dataset.h>
+#include <sys/dsl_dir.h>
+#include <sys/dsl_pool.h>
+#include <sys/dsl_synctask.h>
+#include <sys/zfs_ioctl.h>
+#include <sys/zap.h>
+#include <sys/zio_checksum.h>
+#include <sys/zfs_znode.h>
+
+struct diffarg {
+       struct vnode *da_vp;            /* file to which we are reporting */
+       offset_t *da_offp;              /* caller's offset; write_record() advances it */
+       int da_err;                     /* error that stopped diff search */
+       dmu_diff_record_t da_ddr;       /* pending record, flushed by write_record() */
+};
+
+/*
+ * Flush the pending record in da->da_ddr to da->da_vp.
+ *
+ * Nothing is written while the pending record is still DDR_NONE.  The
+ * result of the write is stored in da->da_err and also returned, and the
+ * caller's offset is advanced by one record whenever a write is issued.
+ */
+static int
+write_record(struct diffarg *da)
+{
+       dmu_diff_record_t *rec = &da->da_ddr;
+       ssize_t resid; /* have to get resid to get detailed errno */
+       int error = 0;
+
+       if (rec->ddr_type != DDR_NONE) {
+               error = vn_rdwr(UIO_WRITE, da->da_vp, (caddr_t)rec,
+                   sizeof (*rec), 0, UIO_SYSSPACE, FAPPEND,
+                   RLIM64_INFINITY, CRED(), &resid);
+               *da->da_offp += sizeof (*rec);
+       }
+
+       da->da_err = error;
+       return (error);
+}
+
+/*
+ * Record that objects first..last (inclusive) are free.
+ *
+ * A range that starts right after a pending DDR_FREE record extends that
+ * record; anything else flushes the pending record with write_record() and
+ * starts a new DDR_FREE record.  Returns 0, or the write error.
+ */
+static int
+report_free_dnode_range(struct diffarg *da, uint64_t first, uint64_t last)
+{
+       dmu_diff_record_t *rec = &da->da_ddr;
+
+       ASSERT(first <= last);
+
+       /* Contiguous with a pending free record: just grow it. */
+       if (rec->ddr_type == DDR_FREE && first == rec->ddr_last + 1) {
+               rec->ddr_last = last;
+               return (0);
+       }
+
+       /* Otherwise flush whatever is pending and start a new record. */
+       if (write_record(da) != 0)
+               return (da->da_err);
+
+       rec->ddr_type = DDR_FREE;
+       rec->ddr_first = first;
+       rec->ddr_last = last;
+       return (0);
+}
+
+/*
+ * Record the state of a single object.
+ *
+ * A dnode of type DMU_OT_NONE is reported as a one-object free range.  Any
+ * other dnode either extends a pending DDR_INUSE record that ends at
+ * object - 1, or flushes the pending record and starts a new DDR_INUSE
+ * record.  Returns 0, or the write error.
+ */
+static int
+report_dnode(struct diffarg *da, uint64_t object, dnode_phys_t *dnp)
+{
+       dmu_diff_record_t *rec = &da->da_ddr;
+
+       ASSERT(dnp != NULL);
+
+       if (dnp->dn_type == DMU_OT_NONE)
+               return (report_free_dnode_range(da, object, object));
+
+       /* Contiguous with a pending in-use record: just grow it. */
+       if (rec->ddr_type == DDR_INUSE && object == rec->ddr_last + 1) {
+               rec->ddr_last = object;
+               return (0);
+       }
+
+       /* Otherwise flush whatever is pending and start a new record. */
+       if (write_record(da) != 0)
+               return (da->da_err);
+
+       rec->ddr_type = DDR_INUSE;
+       rec->ddr_first = object;
+       rec->ddr_last = object;
+       return (0);
+}
+
+/*
+ * Span of one block pointer at the given level of dnode dnp, as used by
+ * diff_cb() to turn a block id into a range of object numbers.  Both
+ * arguments are parenthesized so that arbitrary expressions may be passed.
+ */
+#define        DBP_SPAN(dnp, level)                              \
+       (((uint64_t)(dnp)->dn_datablkszsec) << (SPA_MINBLOCKSHIFT + \
+       (level) * ((dnp)->dn_indblkshift - SPA_BLKPTRSHIFT)))
+
+/*
+ * Traversal callback handed to traverse_dataset() by dmu_diff().
+ *
+ * Blocks of any object other than the meta dnode are ignored.  When there
+ * is no block pointer, the whole range of objects spanned at this position
+ * is reported as free.  A level-0 block is read and each dnode in it is
+ * reported individually; its children are then skipped.
+ *
+ * Returns EINTR on a pending signal, EIO if the block cannot be read, an
+ * error from the report routines, TRAVERSE_VISIT_NO_CHILDREN after a
+ * level-0 block has been processed, and 0 otherwise.
+ */
+/* ARGSUSED */
+static int
+diff_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, arc_buf_t *pbuf,
+    const zbookmark_t *zb, const dnode_phys_t *dnp, void *arg)
+{
+       struct diffarg *da = arg;
+       dnode_phys_t *dnodes;
+       arc_buf_t *abuf;
+       uint32_t aflags = ARC_WAIT;
+       int blksz, ndnodes, idx;
+       int err = 0;
+
+       if (issig(JUSTLOOKING) && issig(FORREAL))
+               return (EINTR);
+
+       if (zb->zb_object != DMU_META_DNODE_OBJECT)
+               return (0);
+
+       if (bp == NULL) {
+               /* No block pointer: everything spanned here is free. */
+               uint64_t span = DBP_SPAN(dnp, zb->zb_level);
+               uint64_t dnobj = (zb->zb_blkid * span) >> DNODE_SHIFT;
+
+               return (report_free_dnode_range(da, dnobj,
+                   dnobj + (span >> DNODE_SHIFT) - 1));
+       }
+
+       /* Indirect blocks need no work of their own. */
+       if (zb->zb_level != 0)
+               return (0);
+
+       blksz = BP_GET_LSIZE(bp);
+       ndnodes = blksz >> DNODE_SHIFT;
+
+       if (dsl_read(NULL, spa, bp, pbuf,
+           arc_getbuf_func, &abuf, ZIO_PRIORITY_ASYNC_READ,
+           ZIO_FLAG_CANFAIL, &aflags, zb) != 0)
+               return (EIO);
+
+       /* Report every dnode in the block, stopping at the first error. */
+       dnodes = abuf->b_data;
+       for (idx = 0; idx < ndnodes && err == 0; idx++) {
+               uint64_t dnobj = (zb->zb_blkid <<
+                   (DNODE_BLOCK_SHIFT - DNODE_SHIFT)) + idx;
+
+               err = report_dnode(da, dnobj, dnodes + idx);
+       }
+       (void) arc_buf_remove_ref(abuf, &abuf);
+
+       /* Don't care about the data blocks */
+       return (err != 0 ? err : TRAVERSE_VISIT_NO_CHILDREN);
+}
+
+/*
+ * Write to vp the records describing which objects changed between
+ * fromsnap and tosnap, advancing *offp as records are written.
+ *
+ * Returns EINVAL unless both objsets are snapshots, and EXDEV if fromsnap
+ * was not created before tosnap or cannot be reached from tosnap by
+ * following clone origins.  Otherwise returns the traversal error, or the
+ * result of writing the final pending record.
+ */
+int
+dmu_diff(objset_t *tosnap, objset_t *fromsnap, struct vnode *vp, offset_t *offp)
+{
+       dsl_dataset_t *tods = tosnap->os_dsl_dataset;
+       dsl_dataset_t *fromds = fromsnap->os_dsl_dataset;
+       dsl_dataset_t *cursor = tods;   /* dataset currently being examined */
+       dsl_dataset_t *held = NULL;     /* origin we hold and must release */
+       struct diffarg da;
+       int err = 0;
+
+       /* make certain we are looking at snapshots */
+       if (!(dsl_dataset_is_snapshot(tods) &&
+           dsl_dataset_is_snapshot(fromds)))
+               return (EINVAL);
+
+       /* fromsnap must be earlier and from the same lineage as tosnap */
+       if (fromds->ds_phys->ds_creation_txg >= tods->ds_phys->ds_creation_txg)
+               return (EXDEV);
+
+       /*
+        * Walk up the chain of clone origins until we land in fromsnap's
+        * dsl_dir.  At most one origin is held at a time; the previous one
+        * is released as soon as the next hold attempt has been made.
+        */
+       while (fromds->ds_dir != cursor->ds_dir) {
+               dsl_pool_t *dp = tods->ds_dir->dd_pool;
+
+               if (dsl_dir_is_clone(cursor->ds_dir)) {
+                       rw_enter(&dp->dp_config_rwlock, RW_READER);
+                       err = dsl_dataset_hold_obj(dp,
+                           cursor->ds_dir->dd_phys->dd_origin_obj, FTAG,
+                           &cursor);
+                       rw_exit(&dp->dp_config_rwlock);
+               } else {
+                       /* ran out of origins without finding fromsnap */
+                       err = EXDEV;
+               }
+
+               if (held != NULL)
+                       dsl_dataset_rele(held, FTAG);
+
+               if (err != 0)
+                       return (EXDEV);
+
+               held = cursor;
+       }
+
+       if (held != NULL)
+               dsl_dataset_rele(held, FTAG);
+
+       da.da_vp = vp;
+       da.da_offp = offp;
+       da.da_err = 0;
+       da.da_ddr.ddr_type = DDR_NONE;
+       da.da_ddr.ddr_first = 0;
+       da.da_ddr.ddr_last = 0;
+
+       err = traverse_dataset(tods, fromds->ds_phys->ds_creation_txg,
+           TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA, diff_cb, &da);
+
+       if (err == 0) {
+               /* write_record() leaves its result in da.da_err */
+               (void) write_record(&da);
+       } else {
+               da.da_err = err;
+       }
+
+       return (da.da_err);
+}
index 98228d4035081df9c1542b5d856c6f6e2f143adc..8dff46048902d44b48aec40b26a061e7593e94bf 100644 (file)
@@ -33,7 +33,7 @@ dmu_object_alloc(objset_t *os, dmu_object_type_t ot, int blocksize,
 {
        uint64_t object;
        uint64_t L2_dnode_count = DNODES_PER_BLOCK <<
-           (os->os_meta_dnode->dn_indblkshift - SPA_BLKPTRSHIFT);
+           (DMU_META_DNODE(os)->dn_indblkshift - SPA_BLKPTRSHIFT);
        dnode_t *dn = NULL;
        int restarted = B_FALSE;
 
@@ -49,7 +49,7 @@ dmu_object_alloc(objset_t *os, dmu_object_type_t ot, int blocksize,
                 */
                if (P2PHASE(object, L2_dnode_count) == 0) {
                        uint64_t offset = restarted ? object << DNODE_SHIFT : 0;
-                       int error = dnode_next_offset(os->os_meta_dnode,
+                       int error = dnode_next_offset(DMU_META_DNODE(os),
                            DNODE_FIND_HOLE,
                            &offset, 2, DNODES_PER_BLOCK >> 2, 0);
                        restarted = B_TRUE;
@@ -187,7 +187,7 @@ dmu_object_next(objset_t *os, uint64_t *objectp, boolean_t hole, uint64_t txg)
        uint64_t offset = (*objectp + 1) << DNODE_SHIFT;
        int error;
 
-       error = dnode_next_offset(os->os_meta_dnode,
+       error = dnode_next_offset(DMU_META_DNODE(os),
            (hole ? DNODE_FIND_HOLE : 0), &offset, 0, DNODES_PER_BLOCK, txg);
 
        *objectp = offset >> DNODE_SHIFT;
index 690e6ecdee6ab84e5611ee49ca8ca295fb25c97a..7caebd979f02a99d150d7410b7f821e9d2d413b3 100644 (file)
 #include <sys/zil.h>
 #include <sys/dmu_impl.h>
 #include <sys/zfs_ioctl.h>
-#include <sys/sunddi.h>
 #include <sys/sa.h>
+#include <sys/zfs_onexit.h>
+
+/*
+ * Needed to close a window in dnode_move() that allows the objset to be freed
+ * before it can be safely accessed.
+ */
+krwlock_t os_lock;
+
+void
+dmu_objset_init(void)
+{
+       rw_init(&os_lock, NULL, RW_DEFAULT, NULL);
+}
+
+void
+dmu_objset_fini(void)
+{
+       rw_destroy(&os_lock);
+}
 
 spa_t *
 dmu_objset_spa(objset_t *os)
@@ -350,7 +368,8 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp,
                os->os_secondary_cache = ZFS_CACHE_ALL;
        }
 
-       os->os_zil_header = os->os_phys->os_zil_header;
+       if (ds == NULL || !dsl_dataset_is_snapshot(ds))
+               os->os_zil_header = os->os_phys->os_zil_header;
        os->os_zil = zil_alloc(os, &os->os_zil_header);
 
        for (i = 0; i < TXG_SIZE; i++) {
@@ -368,13 +387,16 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp,
        mutex_init(&os->os_obj_lock, NULL, MUTEX_DEFAULT, NULL);
        mutex_init(&os->os_user_ptr_lock, NULL, MUTEX_DEFAULT, NULL);
 
-       os->os_meta_dnode = dnode_special_open(os,
-           &os->os_phys->os_meta_dnode, DMU_META_DNODE_OBJECT);
+       DMU_META_DNODE(os) = dnode_special_open(os,
+           &os->os_phys->os_meta_dnode, DMU_META_DNODE_OBJECT,
+           &os->os_meta_dnode);
        if (arc_buf_size(os->os_phys_buf) >= sizeof (objset_phys_t)) {
-               os->os_userused_dnode = dnode_special_open(os,
-                   &os->os_phys->os_userused_dnode, DMU_USERUSED_OBJECT);
-               os->os_groupused_dnode = dnode_special_open(os,
-                   &os->os_phys->os_groupused_dnode, DMU_GROUPUSED_OBJECT);
+               DMU_USERUSED_DNODE(os) = dnode_special_open(os,
+                   &os->os_phys->os_userused_dnode, DMU_USERUSED_OBJECT,
+                   &os->os_userused_dnode);
+               DMU_GROUPUSED_DNODE(os) = dnode_special_open(os,
+                   &os->os_phys->os_groupused_dnode, DMU_GROUPUSED_OBJECT,
+                   &os->os_groupused_dnode);
        }
 
        /*
@@ -401,7 +423,7 @@ dmu_objset_from_ds(dsl_dataset_t *ds, objset_t **osp)
        *osp = ds->ds_objset;
        if (*osp == NULL) {
                err = dmu_objset_open_impl(dsl_dataset_get_spa(ds),
-                   ds, &ds->ds_phys->ds_bp, osp);
+                   ds, dsl_dataset_get_blkptr(ds), osp);
        }
        mutex_exit(&ds->ds_opening_lock);
        return (err);
@@ -470,8 +492,8 @@ dmu_objset_evict_dbufs(objset_t *os)
        mutex_enter(&os->os_lock);
 
        /* process the mdn last, since the other dnodes have holds on it */
-       list_remove(&os->os_dnodes, os->os_meta_dnode);
-       list_insert_tail(&os->os_dnodes, os->os_meta_dnode);
+       list_remove(&os->os_dnodes, DMU_META_DNODE(os));
+       list_insert_tail(&os->os_dnodes, DMU_META_DNODE(os));
 
        /*
         * Find the first dnode with holds.  We have to do this dance
@@ -497,8 +519,9 @@ dmu_objset_evict_dbufs(objset_t *os)
                mutex_enter(&os->os_lock);
                dn = next_dn;
        }
+       dn = list_head(&os->os_dnodes);
        mutex_exit(&os->os_lock);
-       return (list_head(&os->os_dnodes) != os->os_meta_dnode);
+       return (dn != DMU_META_DNODE(os));
 }
 
 void
@@ -539,16 +562,26 @@ dmu_objset_evict(objset_t *os)
         */
        (void) dmu_objset_evict_dbufs(os);
 
-       dnode_special_close(os->os_meta_dnode);
-       if (os->os_userused_dnode) {
-               dnode_special_close(os->os_userused_dnode);
-               dnode_special_close(os->os_groupused_dnode);
+       dnode_special_close(&os->os_meta_dnode);
+       if (DMU_USERUSED_DNODE(os)) {
+               dnode_special_close(&os->os_userused_dnode);
+               dnode_special_close(&os->os_groupused_dnode);
        }
        zil_free(os->os_zil);
 
        ASSERT3P(list_head(&os->os_dnodes), ==, NULL);
 
        VERIFY(arc_buf_remove_ref(os->os_phys_buf, &os->os_phys_buf) == 1);
+
+       /*
+        * This is a barrier to prevent the objset from going away in
+        * dnode_move() until we can safely ensure that the objset is still in
+        * use. We consider the objset valid before the barrier and invalid
+        * after the barrier.
+        */
+       rw_enter(&os_lock, RW_READER);
+       rw_exit(&os_lock);
+
        mutex_destroy(&os->os_lock);
        mutex_destroy(&os->os_obj_lock);
        mutex_destroy(&os->os_user_ptr_lock);
@@ -570,12 +603,12 @@ dmu_objset_create_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp,
        dnode_t *mdn;
 
        ASSERT(dmu_tx_is_syncing(tx));
-       if (ds)
-               mutex_enter(&ds->ds_opening_lock);
-       VERIFY(0 == dmu_objset_open_impl(spa, ds, bp, &os));
-       if (ds)
-               mutex_exit(&ds->ds_opening_lock);
-       mdn = os->os_meta_dnode;
+       if (ds != NULL)
+               VERIFY(0 == dmu_objset_from_ds(ds, &os));
+       else
+               VERIFY(0 == dmu_objset_open_impl(spa, NULL, bp, &os));
+
+       mdn = DMU_META_DNODE(os);
 
        dnode_allocate(mdn, DMU_OT_DNODE, 1 << DNODE_BLOCK_SHIFT,
            DN_MAX_INDBLKSHIFT, DMU_OT_NONE, 0, tx);
@@ -663,34 +696,33 @@ static void
 dmu_objset_create_sync(void *arg1, void *arg2, dmu_tx_t *tx)
 {
        dsl_dir_t *dd = arg1;
+       spa_t *spa = dd->dd_pool->dp_spa;
        struct oscarg *oa = arg2;
-       uint64_t dsobj;
+       uint64_t obj;
 
        ASSERT(dmu_tx_is_syncing(tx));
 
-       dsobj = dsl_dataset_create_sync(dd, oa->lastname,
+       obj = dsl_dataset_create_sync(dd, oa->lastname,
            oa->clone_origin, oa->flags, oa->cr, tx);
 
        if (oa->clone_origin == NULL) {
+               dsl_pool_t *dp = dd->dd_pool;
                dsl_dataset_t *ds;
                blkptr_t *bp;
                objset_t *os;
 
-               VERIFY(0 == dsl_dataset_hold_obj(dd->dd_pool, dsobj,
-                   FTAG, &ds));
+               VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, obj, FTAG, &ds));
                bp = dsl_dataset_get_blkptr(ds);
                ASSERT(BP_IS_HOLE(bp));
 
-               os = dmu_objset_create_impl(dsl_dataset_get_spa(ds),
-                   ds, bp, oa->type, tx);
+               os = dmu_objset_create_impl(spa, ds, bp, oa->type, tx);
 
                if (oa->userfunc)
                        oa->userfunc(os, oa->userarg, oa->cr, tx);
                dsl_dataset_rele(ds, FTAG);
        }
 
-       spa_history_log_internal(LOG_DS_CREATE, dd->dd_pool->dp_spa,
-           tx, "dataset = %llu", dsobj);
+       spa_history_log_internal(LOG_DS_CREATE, spa, tx, "dataset = %llu", obj);
 }
 
 int
@@ -758,18 +790,8 @@ dmu_objset_destroy(const char *name, boolean_t defer)
        dsl_dataset_t *ds;
        int error;
 
-       /*
-        * dsl_dataset_destroy() can free any claimed-but-unplayed
-        * intent log, but if there is an active log, it has blocks that
-        * are allocated, but may not yet be reflected in the on-disk
-        * structure.  Only the ZIL knows how to free them, so we have
-        * to call into it here.
-        */
        error = dsl_dataset_own(name, B_TRUE, FTAG, &ds);
        if (error == 0) {
-               objset_t *os;
-               if (dmu_objset_from_ds(ds, &os) == 0)
-                       zil_destroy(dmu_objset_zil(os), B_FALSE);
                error = dsl_dataset_destroy(ds, FTAG, defer);
                /* dsl_dataset_destroy() closes the ds. */
        }
@@ -780,9 +802,14 @@ dmu_objset_destroy(const char *name, boolean_t defer)
 struct snaparg {
        dsl_sync_task_group_t *dstg;
        char *snapname;
+       char *htag;
        char failed[MAXPATHLEN];
        boolean_t recursive;
+       boolean_t needsuspend;
+       boolean_t temporary;
        nvlist_t *props;
+       struct dsl_ds_holdarg *ha;      /* only needed in the temporary case */
+       dsl_dataset_t *newds;
 };
 
 static int
@@ -790,11 +817,41 @@ snapshot_check(void *arg1, void *arg2, dmu_tx_t *tx)
 {
        objset_t *os = arg1;
        struct snaparg *sn = arg2;
+       int error;
 
        /* The props have already been checked by zfs_check_userprops(). */
 
-       return (dsl_dataset_snapshot_check(os->os_dsl_dataset,
-           sn->snapname, tx));
+       error = dsl_dataset_snapshot_check(os->os_dsl_dataset,
+           sn->snapname, tx);
+       if (error)
+               return (error);
+
+       if (sn->temporary) {
+               /*
+                * Ideally we would just call
+                * dsl_dataset_user_hold_check() and
+                * dsl_dataset_destroy_check() here.  However the
+                * dataset we want to hold and destroy is the snapshot
+                * that we just confirmed we can create, but it won't
+                * exist until after these checks are run.  Do any
+                * checks we can here and if more checks are added to
+                * those routines in the future, similar checks may be
+                * necessary here.
+                */
+               if (spa_version(os->os_spa) < SPA_VERSION_USERREFS)
+                       return (ENOTSUP);
+               /*
+                * Not checking number of tags because the tag will be
+                * unique, as it will be the only tag.
+                */
+               if (strlen(sn->htag) + MAX_TAG_PREFIX_LEN >= MAXNAMELEN)
+                       return (E2BIG);
+
+               sn->ha = kmem_alloc(sizeof (struct dsl_ds_holdarg), KM_SLEEP);
+               sn->ha->temphold = B_TRUE;
+               sn->ha->htag = sn->htag;
+       }
+       return (error);
 }
 
 static void
@@ -812,6 +869,19 @@ snapshot_sync(void *arg1, void *arg2, dmu_tx_t *tx)
                pa.pa_source = ZPROP_SRC_LOCAL;
                dsl_props_set_sync(ds->ds_prev, &pa, tx);
        }
+
+       if (sn->temporary) {
+               struct dsl_ds_destroyarg da;
+
+               dsl_dataset_user_hold_sync(ds->ds_prev, sn->ha, tx);
+               kmem_free(sn->ha, sizeof (struct dsl_ds_holdarg));
+               sn->ha = NULL;
+               sn->newds = ds->ds_prev;
+
+               da.ds = ds->ds_prev;
+               da.defer = B_TRUE;
+               dsl_dataset_destroy_sync(&da, FTAG, tx);
+       }
 }
 
 static int
@@ -857,29 +927,27 @@ dmu_objset_snapshot_one(const char *name, void *arg)
                return (sn->recursive ? 0 : EBUSY);
        }
 
-       /*
-        * NB: we need to wait for all in-flight changes to get to disk,
-        * so that we snapshot those changes.  zil_suspend does this as
-        * a side effect.
-        */
-       err = zil_suspend(dmu_objset_zil(os));
-       if (err == 0) {
-               dsl_sync_task_create(sn->dstg, snapshot_check,
-                   snapshot_sync, os, sn, 3);
-       } else {
-               dmu_objset_rele(os, sn);
+       if (sn->needsuspend) {
+               err = zil_suspend(dmu_objset_zil(os));
+               if (err) {
+                       dmu_objset_rele(os, sn);
+                       return (err);
+               }
        }
+       dsl_sync_task_create(sn->dstg, snapshot_check, snapshot_sync,
+           os, sn, 3);
 
-       return (err);
+       return (0);
 }
 
 int
-dmu_objset_snapshot(char *fsname, char *snapname,
-    nvlist_t *props, boolean_t recursive)
+dmu_objset_snapshot(char *fsname, char *snapname, char *tag,
+    nvlist_t *props, boolean_t recursive, boolean_t temporary, int cleanup_fd)
 {
        dsl_sync_task_t *dst;
        struct snaparg sn;
        spa_t *spa;
+       minor_t minor;
        int err;
 
        (void) strcpy(sn.failed, fsname);
@@ -888,10 +956,26 @@ dmu_objset_snapshot(char *fsname, char *snapname,
        if (err)
                return (err);
 
+       if (temporary) {
+               if (cleanup_fd < 0) {
+                       spa_close(spa, FTAG);
+                       return (EINVAL);
+               }
+               if ((err = zfs_onexit_fd_hold(cleanup_fd, &minor)) != 0) {
+                       spa_close(spa, FTAG);
+                       return (err);
+               }
+       }
+
        sn.dstg = dsl_sync_task_group_create(spa_get_dsl(spa));
        sn.snapname = snapname;
+       sn.htag = tag;
        sn.props = props;
        sn.recursive = recursive;
+       sn.needsuspend = (spa_version(spa) < SPA_VERSION_FAST_SNAP);
+       sn.temporary = temporary;
+       sn.ha = NULL;
+       sn.newds = NULL;
 
        if (recursive) {
                err = dmu_objset_find(fsname,
@@ -907,14 +991,20 @@ dmu_objset_snapshot(char *fsname, char *snapname,
            dst = list_next(&sn.dstg->dstg_tasks, dst)) {
                objset_t *os = dst->dst_arg1;
                dsl_dataset_t *ds = os->os_dsl_dataset;
-               if (dst->dst_err)
+               if (dst->dst_err) {
                        dsl_dataset_name(ds, sn.failed);
-               zil_resume(dmu_objset_zil(os));
+               } else if (temporary) {
+                       dsl_register_onexit_hold_cleanup(sn.newds, tag, minor);
+               }
+               if (sn.needsuspend)
+                       zil_resume(dmu_objset_zil(os));
                dmu_objset_rele(os, &sn);
        }
 
        if (err)
                (void) strcpy(fsname, sn.failed);
+       if (temporary)
+               zfs_onexit_fd_rele(cleanup_fd);
        dsl_sync_task_group_destroy(sn.dstg);
        spa_close(spa, FTAG);
        return (err);
@@ -1035,17 +1125,17 @@ dmu_objset_sync(objset_t *os, zio_t *pio, dmu_tx_t *tx)
        /*
         * Sync special dnodes - the parent IO for the sync is the root block
         */
-       os->os_meta_dnode->dn_zio = zio;
-       dnode_sync(os->os_meta_dnode, tx);
+       DMU_META_DNODE(os)->dn_zio = zio;
+       dnode_sync(DMU_META_DNODE(os), tx);
 
        os->os_phys->os_flags = os->os_flags;
 
-       if (os->os_userused_dnode &&
-           os->os_userused_dnode->dn_type != DMU_OT_NONE) {
-               os->os_userused_dnode->dn_zio = zio;
-               dnode_sync(os->os_userused_dnode, tx);
-               os->os_groupused_dnode->dn_zio = zio;
-               dnode_sync(os->os_groupused_dnode, tx);
+       if (DMU_USERUSED_DNODE(os) &&
+           DMU_USERUSED_DNODE(os)->dn_type != DMU_OT_NONE) {
+               DMU_USERUSED_DNODE(os)->dn_zio = zio;
+               dnode_sync(DMU_USERUSED_DNODE(os), tx);
+               DMU_GROUPUSED_DNODE(os)->dn_zio = zio;
+               dnode_sync(DMU_GROUPUSED_DNODE(os), tx);
        }
 
        txgoff = tx->tx_txg & TXG_MASK;
@@ -1063,7 +1153,7 @@ dmu_objset_sync(objset_t *os, zio_t *pio, dmu_tx_t *tx)
        dmu_objset_sync_dnodes(&os->os_free_dnodes[txgoff], newlist, tx);
        dmu_objset_sync_dnodes(&os->os_dirty_dnodes[txgoff], newlist, tx);
 
-       list = &os->os_meta_dnode->dn_dirty_records[txgoff];
+       list = &DMU_META_DNODE(os)->dn_dirty_records[txgoff];
        while (dr = list_head(list)) {
                ASSERT(dr->dr_dbuf->db_level == 0);
                list_remove(list, dr);
@@ -1085,7 +1175,16 @@ dmu_objset_is_dirty(objset_t *os, uint64_t txg)
            !list_is_empty(&os->os_free_dnodes[txg & TXG_MASK]));
 }
 
-objset_used_cb_t *used_cbs[DMU_OST_NUMTYPES];
+boolean_t
+dmu_objset_is_dirty_anywhere(objset_t *os)
+{
+       for (int t = 0; t < TXG_SIZE; t++)
+               if (dmu_objset_is_dirty(os, t))
+                       return (B_TRUE);
+       return (B_FALSE);
+}
+
+static objset_used_cb_t *used_cbs[DMU_OST_NUMTYPES];
 
 void
 dmu_objset_register_type(dmu_objset_type_t ost, objset_used_cb_t *cb)
@@ -1097,8 +1196,8 @@ boolean_t
 dmu_objset_userused_enabled(objset_t *os)
 {
        return (spa_version(os->os_spa) >= SPA_VERSION_USERSPACE &&
-           used_cbs[os->os_phys->os_type] &&
-           os->os_userused_dnode);
+           used_cbs[os->os_phys->os_type] != NULL &&
+           DMU_USERUSED_DNODE(os) != NULL);
 }
 
 static void
@@ -1125,13 +1224,14 @@ dmu_objset_do_userquota_updates(objset_t *os, dmu_tx_t *tx)
        ASSERT(list_head(list) == NULL || dmu_objset_userused_enabled(os));
 
        while (dn = list_head(list)) {
+               int flags;
                ASSERT(!DMU_OBJECT_IS_SPECIAL(dn->dn_object));
                ASSERT(dn->dn_phys->dn_type == DMU_OT_NONE ||
                    dn->dn_phys->dn_flags &
                    DNODE_FLAG_USERUSED_ACCOUNTED);
 
                /* Allocate the user/groupused objects if necessary. */
-               if (os->os_userused_dnode->dn_type == DMU_OT_NONE) {
+               if (DMU_USERUSED_DNODE(os)->dn_type == DMU_OT_NONE) {
                        VERIFY(0 == zap_create_claim(os,
                            DMU_USERUSED_OBJECT,
                            DMU_OT_USERGROUP_USED, DMU_OT_NONE, 0, tx));
@@ -1148,18 +1248,19 @@ dmu_objset_do_userquota_updates(objset_t *os, dmu_tx_t *tx)
                 * a bprewrite.
                 */
 
-               mutex_enter(&dn->dn_mtx);
-               ASSERT(dn->dn_id_flags);
-               if (dn->dn_id_flags & DN_ID_OLD_EXIST)  {
+               flags = dn->dn_id_flags;
+               ASSERT(flags);
+               if (flags & DN_ID_OLD_EXIST)  {
                        do_userquota_update(os, dn->dn_oldused, dn->dn_oldflags,
                            dn->dn_olduid, dn->dn_oldgid, B_TRUE, tx);
                }
-               if (dn->dn_id_flags & DN_ID_NEW_EXIST) {
+               if (flags & DN_ID_NEW_EXIST) {
                        do_userquota_update(os, DN_USED_BYTES(dn->dn_phys),
                            dn->dn_phys->dn_flags,  dn->dn_newuid,
                            dn->dn_newgid, B_FALSE, tx);
                }
 
+               mutex_enter(&dn->dn_mtx);
                dn->dn_oldused = 0;
                dn->dn_oldflags = 0;
                if (dn->dn_id_flags & DN_ID_NEW_EXIST) {
@@ -1199,13 +1300,23 @@ dmu_objset_userquota_find_data(dmu_buf_impl_t *db, dmu_tx_t *tx)
                if (dr->dr_txg == tx->tx_txg)
                        break;
 
-       if (dr == NULL)
+       if (dr == NULL) {
                data = NULL;
-       else if (dr->dr_dbuf->db_dnode->dn_bonuslen == 0 &&
-           dr->dr_dbuf->db_blkid == DMU_SPILL_BLKID)
-               data = dr->dt.dl.dr_data->b_data;
-       else
-               data = dr->dt.dl.dr_data;
+       } else {
+               dnode_t *dn;
+
+               DB_DNODE_ENTER(dr->dr_dbuf);
+               dn = DB_DNODE(dr->dr_dbuf);
+
+               if (dn->dn_bonuslen == 0 &&
+                   dr->dr_dbuf->db_blkid == DMU_SPILL_BLKID)
+                       data = dr->dt.dl.dr_data->b_data;
+               else
+                       data = dr->dt.dl.dr_data;
+
+               DB_DNODE_EXIT(dr->dr_dbuf);
+       }
+
        return (data);
 }
 
@@ -1242,7 +1353,8 @@ dmu_objset_userquota_get_ids(dnode_t *dn, boolean_t before, dmu_tx_t *tx)
 
                        if (RW_WRITE_HELD(&dn->dn_struct_rwlock))
                                rf |= DB_RF_HAVESTRUCT;
-                       error = dmu_spill_hold_by_dnode(dn, rf,
+                       error = dmu_spill_hold_by_dnode(dn,
+                           rf | DB_RF_MUST_SUCCEED,
                            FTAG, (dmu_buf_t **)&db);
                        ASSERT(error == 0);
                        mutex_enter(&db->db_mtx);
index 6b00b73b43be124143e7582da4ba1edcefb04f1b..e47d533a44f4d0584de832c0f29736790b76604f 100644 (file)
@@ -42,6 +42,7 @@
 #include <zfs_fletcher.h>
 #include <sys/avl.h>
 #include <sys/ddt.h>
+#include <sys/zfs_onexit.h>
 
 static char *dmu_recv_tag = "dmu_recv_tag";
 
@@ -573,6 +574,14 @@ recv_existing_check(void *arg1, void *arg2, dmu_tx_t *tx)
        if (!rbsa->force && dsl_dataset_modified_since_lastsnap(ds))
                return (ETXTBSY);
 
+       /* new snapshot name must not exist */
+       err = zap_lookup(ds->ds_dir->dd_pool->dp_meta_objset,
+           ds->ds_phys->ds_snapnames_zapobj, rbsa->tosnap, 8, 1, &val);
+       if (err == 0)
+               return (EEXIST);
+       if (err != ENOENT)
+               return (err);
+
        if (rbsa->fromguid) {
                /* if incremental, most recent snapshot must match fromguid */
                if (ds->ds_prev == NULL)
@@ -620,13 +629,6 @@ recv_existing_check(void *arg1, void *arg2, dmu_tx_t *tx)
        if (err != ENOENT)
                return (err);
 
-       /* new snapshot name must not exist */
-       err = zap_lookup(ds->ds_dir->dd_pool->dp_meta_objset,
-           ds->ds_phys->ds_snapnames_zapobj, rbsa->tosnap, 8, 1, &val);
-       if (err == 0)
-               return (EEXIST);
-       if (err != ENOENT)
-               return (err);
        return (0);
 }
 
@@ -661,7 +663,6 @@ recv_existing_sync(void *arg1, void *arg2, dmu_tx_t *tx)
            dp->dp_spa, tx, "dataset = %lld", dsobj);
 }
 
-
 static boolean_t
 dmu_recv_verify_features(dsl_dataset_t *ds, struct drr_begin *drrb)
 {
@@ -786,7 +787,7 @@ dmu_recv_begin(char *tofs, char *tosnap, char *top_ds, struct drr_begin *drrb,
                        return (err);
 
                if (dmu_recv_verify_features(ds, drrb)) {
-                       dsl_dataset_rele(ds, dmu_recv_tag);
+                       dsl_dataset_rele(ds, FTAG);
                        return (ENOTSUP);
                }
 
@@ -810,7 +811,7 @@ struct restorearg {
        uint64_t voff;
        int bufsize; /* amount of memory allocated for buf */
        zio_cksum_t cksum;
-       avl_tree_t guid_to_ds_map;
+       avl_tree_t *guid_to_ds_map;
 };
 
 typedef struct guid_map_entry {
@@ -887,6 +888,21 @@ find_ds_by_guid(const char *name, void *arg)
        return (0);
 }
 
+static void
+free_guid_map_onexit(void *arg)
+{
+       avl_tree_t *ca = arg;
+       void *cookie = NULL;
+       guid_map_entry_t *gmep;
+
+       while ((gmep = avl_destroy_nodes(ca, &cookie)) != NULL) {
+               dsl_dataset_rele(gmep->gme_ds, ca);
+               kmem_free(gmep, sizeof (guid_map_entry_t));
+       }
+       avl_destroy(ca);
+       kmem_free(ca, sizeof (avl_tree_t));
+}
+
 static void *
 restore_read(struct restorearg *ra, int len)
 {
@@ -1173,7 +1189,7 @@ restore_write_byref(struct restorearg *ra, objset_t *os,
         */
        if (drrwbr->drr_toguid != drrwbr->drr_refguid) {
                gmesrch.guid = drrwbr->drr_refguid;
-               if ((gmep = avl_find(&ra->guid_to_ds_map, &gmesrch,
+               if ((gmep = avl_find(ra->guid_to_ds_map, &gmesrch,
                    &where)) == NULL) {
                        return (EINVAL);
                }
@@ -1276,13 +1292,13 @@ restore_free(struct restorearg *ra, objset_t *os,
  * NB: callers *must* call dmu_recv_end() if this succeeds.
  */
 int
-dmu_recv_stream(dmu_recv_cookie_t *drc, vnode_t *vp, offset_t *voffp)
+dmu_recv_stream(dmu_recv_cookie_t *drc, vnode_t *vp, offset_t *voffp,
+    int cleanup_fd, uint64_t *action_handlep)
 {
        struct restorearg ra = { 0 };
        dmu_replay_record_t *drr;
        objset_t *os;
        zio_cksum_t pcksum;
-       guid_map_entry_t *gmep;
        int featureflags;
 
        if (drc->drc_drrb->drr_magic == BSWAP_64(DMU_BACKUP_MAGIC))
@@ -1336,12 +1352,38 @@ dmu_recv_stream(dmu_recv_cookie_t *drc, vnode_t *vp, offset_t *voffp)
 
        /* if this stream is dedup'ed, set up the avl tree for guid mapping */
        if (featureflags & DMU_BACKUP_FEATURE_DEDUP) {
-               avl_create(&ra.guid_to_ds_map, guid_compare,
-                   sizeof (guid_map_entry_t),
-                   offsetof(guid_map_entry_t, avlnode));
-               (void) dmu_objset_find(drc->drc_top_ds, find_ds_by_guid,
-                   (void *)&ra.guid_to_ds_map,
-                   DS_FIND_CHILDREN);
+               minor_t minor;
+
+               if (cleanup_fd == -1) {
+                       ra.err = EBADF;
+                       goto out;
+               }
+               ra.err = zfs_onexit_fd_hold(cleanup_fd, &minor);
+               if (ra.err) {
+                       cleanup_fd = -1;
+                       goto out;
+               }
+
+               if (*action_handlep == 0) {
+                       ra.guid_to_ds_map =
+                           kmem_alloc(sizeof (avl_tree_t), KM_SLEEP);
+                       avl_create(ra.guid_to_ds_map, guid_compare,
+                           sizeof (guid_map_entry_t),
+                           offsetof(guid_map_entry_t, avlnode));
+                       (void) dmu_objset_find(drc->drc_top_ds, find_ds_by_guid,
+                           (void *)ra.guid_to_ds_map,
+                           DS_FIND_CHILDREN);
+                       ra.err = zfs_onexit_add_cb(minor,
+                           free_guid_map_onexit, ra.guid_to_ds_map,
+                           action_handlep);
+                       if (ra.err)
+                               goto out;
+               } else {
+                       ra.err = zfs_onexit_cb_data(minor, *action_handlep,
+                           (void **)&ra.guid_to_ds_map);
+                       if (ra.err)
+                               goto out;
+               }
        }
 
        /*
@@ -1423,6 +1465,9 @@ dmu_recv_stream(dmu_recv_cookie_t *drc, vnode_t *vp, offset_t *voffp)
        ASSERT(ra.err != 0);
 
 out:
+       if ((featureflags & DMU_BACKUP_FEATURE_DEDUP) && (cleanup_fd != -1))
+               zfs_onexit_fd_rele(cleanup_fd);
+
        if (ra.err != 0) {
                /*
                 * destroy what we created, so we don't leave it in the
@@ -1438,16 +1483,6 @@ out:
                }
        }
 
-       if (featureflags & DMU_BACKUP_FEATURE_DEDUP) {
-               void *cookie = NULL;
-
-               while (gmep = avl_destroy_nodes(&ra.guid_to_ds_map, &cookie)) {
-                       dsl_dataset_rele(gmep->gme_ds, &ra.guid_to_ds_map);
-                       kmem_free(gmep, sizeof (guid_map_entry_t));
-               }
-               avl_destroy(&ra.guid_to_ds_map);
-       }
-
        kmem_free(ra.buf, ra.bufsize);
        *voffp = ra.voff;
        return (ra.err);
index 429c76ae11f12e2a40c9f55a250aa5643dc04f60..023f90e12e342999aa8c3a767b076de8f755f047 100644 (file)
@@ -36,7 +36,9 @@
 #include <sys/sa_impl.h>
 #include <sys/callb.h>
 
-struct prefetch_data {
+int zfs_pd_blks_max = 100;
+
+typedef struct prefetch_data {
        kmutex_t pd_mtx;
        kcondvar_t pd_cv;
        int pd_blks_max;
@@ -44,27 +46,26 @@ struct prefetch_data {
        int pd_flags;
        boolean_t pd_cancel;
        boolean_t pd_exited;
-};
+} prefetch_data_t;
 
-struct traverse_data {
+typedef struct traverse_data {
        spa_t *td_spa;
        uint64_t td_objset;
        blkptr_t *td_rootbp;
        uint64_t td_min_txg;
        int td_flags;
-       struct prefetch_data *td_pfd;
+       prefetch_data_t *td_pfd;
        blkptr_cb_t *td_func;
        void *td_arg;
-};
+} traverse_data_t;
 
-static int traverse_dnode(struct traverse_data *td, const dnode_phys_t *dnp,
+static int traverse_dnode(traverse_data_t *td, const dnode_phys_t *dnp,
     arc_buf_t *buf, uint64_t objset, uint64_t object);
 
-/* ARGSUSED */
 static int
 traverse_zil_block(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg)
 {
-       struct traverse_data *td = arg;
+       traverse_data_t *td = arg;
        zbookmark_t zb;
 
        if (bp->blk_birth == 0)
@@ -81,11 +82,10 @@ traverse_zil_block(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg)
        return (0);
 }
 
-/* ARGSUSED */
 static int
 traverse_zil_record(zilog_t *zilog, lr_t *lrc, void *arg, uint64_t claim_txg)
 {
-       struct traverse_data *td = arg;
+       traverse_data_t *td = arg;
 
        if (lrc->lrc_txtype == TX_WRITE) {
                lr_write_t *lr = (lr_write_t *)lrc;
@@ -98,8 +98,8 @@ traverse_zil_record(zilog_t *zilog, lr_t *lrc, void *arg, uint64_t claim_txg)
                if (claim_txg == 0 || bp->blk_birth < claim_txg)
                        return (0);
 
-               SET_BOOKMARK(&zb, td->td_objset, lr->lr_foid, ZB_ZIL_LEVEL,
-                   lr->lr_offset / BP_GET_LSIZE(bp));
+               SET_BOOKMARK(&zb, td->td_objset, lr->lr_foid,
+                   ZB_ZIL_LEVEL, lr->lr_offset / BP_GET_LSIZE(bp));
 
                (void) td->td_func(td->td_spa, zilog, bp, NULL, &zb, NULL,
                    td->td_arg);
@@ -108,7 +108,7 @@ traverse_zil_record(zilog_t *zilog, lr_t *lrc, void *arg, uint64_t claim_txg)
 }
 
 static void
-traverse_zil(struct traverse_data *td, zil_header_t *zh)
+traverse_zil(traverse_data_t *td, zil_header_t *zh)
 {
        uint64_t claim_txg = zh->zh_claim_txg;
        zilog_t *zilog;
@@ -129,13 +129,13 @@ traverse_zil(struct traverse_data *td, zil_header_t *zh)
 }
 
 static int
-traverse_visitbp(struct traverse_data *td, const dnode_phys_t *dnp,
+traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp,
     arc_buf_t *pbuf, blkptr_t *bp, const zbookmark_t *zb)
 {
        zbookmark_t czb;
        int err = 0, lasterr = 0;
        arc_buf_t *buf = NULL;
-       struct prefetch_data *pd = td->td_pfd;
+       prefetch_data_t *pd = td->td_pfd;
        boolean_t hard = td->td_flags & TRAVERSE_HARD;
 
        if (bp->blk_birth == 0) {
@@ -162,6 +162,8 @@ traverse_visitbp(struct traverse_data *td, const dnode_phys_t *dnp,
        if (td->td_flags & TRAVERSE_PRE) {
                err = td->td_func(td->td_spa, NULL, bp, pbuf, zb, dnp,
                    td->td_arg);
+               if (err == TRAVERSE_VISIT_NO_CHILDREN)
+                       return (0);
                if (err)
                        return (err);
        }
@@ -225,8 +227,6 @@ traverse_visitbp(struct traverse_data *td, const dnode_phys_t *dnp,
                        return (err);
 
                osp = buf->b_data;
-               traverse_zil(td, &osp->os_zil_header);
-
                dnp = &osp->os_meta_dnode;
                err = traverse_dnode(td, dnp, buf, zb->zb_objset,
                    DMU_META_DNODE_OBJECT);
@@ -262,7 +262,7 @@ traverse_visitbp(struct traverse_data *td, const dnode_phys_t *dnp,
 }
 
 static int
-traverse_dnode(struct traverse_data *td, const dnode_phys_t *dnp,
+traverse_dnode(traverse_data_t *td, const dnode_phys_t *dnp,
     arc_buf_t *buf, uint64_t objset, uint64_t object)
 {
        int j, err = 0, lasterr = 0;
@@ -300,7 +300,7 @@ traverse_prefetcher(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
     arc_buf_t *pbuf, const zbookmark_t *zb, const dnode_phys_t *dnp,
     void *arg)
 {
-       struct prefetch_data *pfd = arg;
+       prefetch_data_t *pfd = arg;
        uint32_t aflags = ARC_NOWAIT | ARC_PREFETCH;
 
        ASSERT(pfd->pd_blks_fetched >= 0);
@@ -330,8 +330,8 @@ traverse_prefetcher(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
 static void
 traverse_prefetch_thread(void *arg)
 {
-       struct traverse_data *td_main = arg;
-       struct traverse_data td = *td_main;
+       traverse_data_t *td_main = arg;
+       traverse_data_t td = *td_main;
        zbookmark_t czb;
 
        td.td_func = traverse_prefetcher;
@@ -353,16 +353,16 @@ traverse_prefetch_thread(void *arg)
  * in syncing context).
  */
 static int
-traverse_impl(spa_t *spa, uint64_t objset, blkptr_t *rootbp,
+traverse_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *rootbp,
     uint64_t txg_start, int flags, blkptr_cb_t func, void *arg)
 {
-       struct traverse_data td;
-       struct prefetch_data pd = { 0 };
+       traverse_data_t td;
+       prefetch_data_t pd = { 0 };
        zbookmark_t czb;
        int err;
 
        td.td_spa = spa;
-       td.td_objset = objset;
+       td.td_objset = ds ? ds->ds_object : 0;
        td.td_rootbp = rootbp;
        td.td_min_txg = txg_start;
        td.td_func = func;
@@ -370,17 +370,28 @@ traverse_impl(spa_t *spa, uint64_t objset, blkptr_t *rootbp,
        td.td_pfd = &pd;
        td.td_flags = flags;
 
-       pd.pd_blks_max = 100;
+       pd.pd_blks_max = zfs_pd_blks_max;
        pd.pd_flags = flags;
        mutex_init(&pd.pd_mtx, NULL, MUTEX_DEFAULT, NULL);
        cv_init(&pd.pd_cv, NULL, CV_DEFAULT, NULL);
 
+       /* See comment on ZIL traversal in dsl_scan_visitds. */
+       if (ds != NULL && !dsl_dataset_is_snapshot(ds)) {
+               objset_t *os;
+
+               err = dmu_objset_from_ds(ds, &os);
+               if (err)
+                       return (err);
+
+               traverse_zil(&td, &os->os_zil_header);
+       }
+
        if (!(flags & TRAVERSE_PREFETCH) ||
            0 == taskq_dispatch(system_taskq, traverse_prefetch_thread,
            &td, TQ_NOQUEUE))
                pd.pd_exited = B_TRUE;
 
-       SET_BOOKMARK(&czb, objset,
+       SET_BOOKMARK(&czb, td.td_objset,
            ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID);
        err = traverse_visitbp(&td, NULL, NULL, rootbp, &czb);
 
@@ -405,7 +416,7 @@ int
 traverse_dataset(dsl_dataset_t *ds, uint64_t txg_start, int flags,
     blkptr_cb_t func, void *arg)
 {
-       return (traverse_impl(ds->ds_dir->dd_pool->dp_spa, ds->ds_object,
+       return (traverse_impl(ds->ds_dir->dd_pool->dp_spa, ds,
            &ds->ds_phys->ds_bp, txg_start, flags, func, arg));
 }
 
@@ -423,7 +434,7 @@ traverse_pool(spa_t *spa, uint64_t txg_start, int flags,
        boolean_t hard = (flags & TRAVERSE_HARD);
 
        /* visit the MOS */
-       err = traverse_impl(spa, 0, spa_get_rootblkptr(spa),
+       err = traverse_impl(spa, NULL, spa_get_rootblkptr(spa),
            txg_start, flags, func, arg);
        if (err)
                return (err);
index 5fc062c16b4b4867e196357e02f1a1cb9bc06e06..bd5c71a2265e3d2aa7ab38de0458a5461b1e0cd0 100644 (file)
@@ -186,7 +186,7 @@ dmu_tx_count_twig(dmu_tx_hold_t *txh, dnode_t *dn, dmu_buf_impl_t *db,
                ASSERT(level != 0);
                db = NULL;
        } else {
-               ASSERT(db->db_dnode == dn);
+               ASSERT(DB_DNODE(db) == dn);
                ASSERT(db->db_level == level);
                ASSERT(db->db.db_size == space);
                ASSERT(db->db_blkid == blkid);
@@ -384,7 +384,7 @@ static void
 dmu_tx_count_dnode(dmu_tx_hold_t *txh)
 {
        dnode_t *dn = txh->txh_dnode;
-       dnode_t *mdn = txh->txh_tx->tx_objset->os_meta_dnode;
+       dnode_t *mdn = DMU_META_DNODE(txh->txh_tx->tx_objset);
        uint64_t space = mdn->dn_datablksz +
            ((mdn->dn_nlevels-1) << mdn->dn_indblkshift);
 
@@ -787,18 +787,24 @@ dmu_tx_dirty_buf(dmu_tx_t *tx, dmu_buf_impl_t *db)
 {
        dmu_tx_hold_t *txh;
        int match_object = FALSE, match_offset = FALSE;
-       dnode_t *dn = db->db_dnode;
+       dnode_t *dn;
 
+       DB_DNODE_ENTER(db);
+       dn = DB_DNODE(db);
        ASSERT(tx->tx_txg != 0);
        ASSERT(tx->tx_objset == NULL || dn->dn_objset == tx->tx_objset);
        ASSERT3U(dn->dn_object, ==, db->db.db_object);
 
-       if (tx->tx_anyobj)
+       if (tx->tx_anyobj) {
+               DB_DNODE_EXIT(db);
                return;
+       }
 
        /* XXX No checking on the meta dnode for now */
-       if (db->db.db_object == DMU_META_DNODE_OBJECT)
+       if (db->db.db_object == DMU_META_DNODE_OBJECT) {
+               DB_DNODE_EXIT(db);
                return;
+       }
 
        for (txh = list_head(&tx->tx_holds); txh;
            txh = list_next(&tx->tx_holds, txh)) {
@@ -870,9 +876,12 @@ dmu_tx_dirty_buf(dmu_tx_t *tx, dmu_buf_impl_t *db)
                                ASSERT(!"bad txh_type");
                        }
                }
-               if (match_object && match_offset)
+               if (match_object && match_offset) {
+                       DB_DNODE_EXIT(db);
                        return;
+               }
        }
+       DB_DNODE_EXIT(db);
        panic("dirtying dbuf obj=%llx lvl=%u blkid=%llx but not tx_held\n",
            (u_longlong_t)db->db.db_object, db->db_level,
            (u_longlong_t)db->db_blkid);
@@ -1355,9 +1364,19 @@ dmu_tx_hold_sa(dmu_tx_t *tx, sa_handle_t *hdl, boolean_t may_grow)
        if (may_grow && tx->tx_objset->os_sa->sa_layout_attr_obj)
                dmu_tx_hold_zap(tx, sa->sa_layout_attr_obj, B_TRUE, NULL);
 
-       if (sa->sa_force_spill || may_grow || hdl->sa_spill ||
-           ((dmu_buf_impl_t *)hdl->sa_bonus)->db_dnode->dn_have_spill) {
+       if (sa->sa_force_spill || may_grow || hdl->sa_spill) {
                ASSERT(tx->tx_txg == 0);
                dmu_tx_hold_spill(tx, object);
+       } else {
+               dmu_buf_impl_t *db = (dmu_buf_impl_t *)hdl->sa_bonus;
+               dnode_t *dn;
+
+               DB_DNODE_ENTER(db);
+               dn = DB_DNODE(db);
+               if (dn->dn_have_spill) {
+                       ASSERT(tx->tx_txg == 0);
+                       dmu_tx_hold_spill(tx, object);
+               }
+               DB_DNODE_EXIT(db);
        }
 }
index c16902d21697d2bc8a3f516cfce7651b6ff589cc..850dd5816bf32705408617d96363382e71f120ef 100644 (file)
 static int free_range_compar(const void *node1, const void *node2);
 
 static kmem_cache_t *dnode_cache;
+/*
+ * Define DNODE_STATS to turn on statistic gathering. By default, it is only
+ * turned on when DEBUG is also defined.
+ */
+#ifdef DEBUG
+#define        DNODE_STATS
+#endif /* DEBUG */
+
+#ifdef DNODE_STATS
+#define        DNODE_STAT_ADD(stat)                    ((stat)++)
+#else
+#define        DNODE_STAT_ADD(stat)                    /* nothing */
+#endif /* DNODE_STATS */
 
 static dnode_phys_t dnode_phys_zero;
 
 int zfs_default_bs = SPA_MINBLOCKSHIFT;
 int zfs_default_ibs = DN_MAX_INDBLKSHIFT;
 
+static kmem_cbrc_t dnode_move(void *, void *, size_t, void *);
+
 /* ARGSUSED */
 static int
 dnode_cons(void *arg, void *unused, int kmflag)
 {
-       int i;
        dnode_t *dn = arg;
-       bzero(dn, sizeof (dnode_t));
+       int i;
 
        rw_init(&dn->dn_struct_rwlock, NULL, RW_DEFAULT, NULL);
        mutex_init(&dn->dn_mtx, NULL, MUTEX_DEFAULT, NULL);
@@ -59,8 +73,18 @@ dnode_cons(void *arg, void *unused, int kmflag)
 
        refcount_create(&dn->dn_holds);
        refcount_create(&dn->dn_tx_holds);
+       list_link_init(&dn->dn_link);
+
+       bzero(&dn->dn_next_nblkptr[0], sizeof (dn->dn_next_nblkptr));
+       bzero(&dn->dn_next_nlevels[0], sizeof (dn->dn_next_nlevels));
+       bzero(&dn->dn_next_indblkshift[0], sizeof (dn->dn_next_indblkshift));
+       bzero(&dn->dn_next_bonustype[0], sizeof (dn->dn_next_bonustype));
+       bzero(&dn->dn_rm_spillblk[0], sizeof (dn->dn_rm_spillblk));
+       bzero(&dn->dn_next_bonuslen[0], sizeof (dn->dn_next_bonuslen));
+       bzero(&dn->dn_next_blksz[0], sizeof (dn->dn_next_blksz));
 
        for (i = 0; i < TXG_SIZE; i++) {
+               list_link_init(&dn->dn_dirty_link[i]);
                avl_create(&dn->dn_ranges[i], free_range_compar,
                    sizeof (free_range_t),
                    offsetof(struct free_range, fr_node));
@@ -69,9 +93,27 @@ dnode_cons(void *arg, void *unused, int kmflag)
                    offsetof(dbuf_dirty_record_t, dr_dirty_node));
        }
 
+       dn->dn_allocated_txg = 0;
+       dn->dn_free_txg = 0;
+       dn->dn_assigned_txg = 0;
+       dn->dn_dirtyctx = 0;
+       dn->dn_dirtyctx_firstset = NULL;
+       dn->dn_bonus = NULL;
+       dn->dn_have_spill = B_FALSE;
+       dn->dn_zio = NULL;
+       dn->dn_oldused = 0;
+       dn->dn_oldflags = 0;
+       dn->dn_olduid = 0;
+       dn->dn_oldgid = 0;
+       dn->dn_newuid = 0;
+       dn->dn_newgid = 0;
+       dn->dn_id_flags = 0;
+
+       dn->dn_dbufs_count = 0;
        list_create(&dn->dn_dbufs, sizeof (dmu_buf_impl_t),
            offsetof(dmu_buf_impl_t, db_link));
 
+       dn->dn_moved = 0;
        return (0);
 }
 
@@ -88,27 +130,56 @@ dnode_dest(void *arg, void *unused)
        cv_destroy(&dn->dn_notxholds);
        refcount_destroy(&dn->dn_holds);
        refcount_destroy(&dn->dn_tx_holds);
+       ASSERT(!list_link_active(&dn->dn_link));
 
        for (i = 0; i < TXG_SIZE; i++) {
+               ASSERT(!list_link_active(&dn->dn_dirty_link[i]));
                avl_destroy(&dn->dn_ranges[i]);
                list_destroy(&dn->dn_dirty_records[i]);
+               ASSERT3U(dn->dn_next_nblkptr[i], ==, 0);
+               ASSERT3U(dn->dn_next_nlevels[i], ==, 0);
+               ASSERT3U(dn->dn_next_indblkshift[i], ==, 0);
+               ASSERT3U(dn->dn_next_bonustype[i], ==, 0);
+               ASSERT3U(dn->dn_rm_spillblk[i], ==, 0);
+               ASSERT3U(dn->dn_next_bonuslen[i], ==, 0);
+               ASSERT3U(dn->dn_next_blksz[i], ==, 0);
        }
 
+       ASSERT3U(dn->dn_allocated_txg, ==, 0);
+       ASSERT3U(dn->dn_free_txg, ==, 0);
+       ASSERT3U(dn->dn_assigned_txg, ==, 0);
+       ASSERT3U(dn->dn_dirtyctx, ==, 0);
+       ASSERT3P(dn->dn_dirtyctx_firstset, ==, NULL);
+       ASSERT3P(dn->dn_bonus, ==, NULL);
+       ASSERT(!dn->dn_have_spill);
+       ASSERT3P(dn->dn_zio, ==, NULL);
+       ASSERT3U(dn->dn_oldused, ==, 0);
+       ASSERT3U(dn->dn_oldflags, ==, 0);
+       ASSERT3U(dn->dn_olduid, ==, 0);
+       ASSERT3U(dn->dn_oldgid, ==, 0);
+       ASSERT3U(dn->dn_newuid, ==, 0);
+       ASSERT3U(dn->dn_newgid, ==, 0);
+       ASSERT3U(dn->dn_id_flags, ==, 0);
+
+       ASSERT3U(dn->dn_dbufs_count, ==, 0);
        list_destroy(&dn->dn_dbufs);
 }
 
 void
 dnode_init(void)
 {
+       ASSERT(dnode_cache == NULL);
        dnode_cache = kmem_cache_create("dnode_t",
            sizeof (dnode_t),
            0, dnode_cons, dnode_dest, NULL, NULL, NULL, 0);
+       kmem_cache_set_move(dnode_cache, dnode_move);
 }
 
 void
 dnode_fini(void)
 {
        kmem_cache_destroy(dnode_cache);
+       dnode_cache = NULL;
 }
 
 
@@ -120,6 +191,7 @@ dnode_verify(dnode_t *dn)
 
        ASSERT(dn->dn_phys);
        ASSERT(dn->dn_objset);
+       ASSERT(dn->dn_handle->dnh_dnode == dn);
 
        ASSERT(dn->dn_phys->dn_type < DMU_OT_NUMTYPES);
 
@@ -298,18 +370,29 @@ dnode_setdblksz(dnode_t *dn, int size)
 
 static dnode_t *
 dnode_create(objset_t *os, dnode_phys_t *dnp, dmu_buf_impl_t *db,
-    uint64_t object)
+    uint64_t object, dnode_handle_t *dnh)
 {
        dnode_t *dn = kmem_cache_alloc(dnode_cache, KM_SLEEP);
-       (void) dnode_cons(dn, NULL, 0); /* XXX */
 
-       dn->dn_objset = os;
+       ASSERT(!POINTER_IS_VALID(dn->dn_objset));
+       dn->dn_moved = 0;
+
+       /*
+        * Defer setting dn_objset until the dnode is ready to be a candidate
+        * for the dnode_move() callback.
+        */
        dn->dn_object = object;
        dn->dn_dbuf = db;
+       dn->dn_handle = dnh;
        dn->dn_phys = dnp;
 
-       if (dnp->dn_datablkszsec)
+       if (dnp->dn_datablkszsec) {
                dnode_setdblksz(dn, dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT);
+       } else {
+               dn->dn_datablksz = 0;
+               dn->dn_datablkszsec = 0;
+               dn->dn_datablkshift = 0;
+       }
        dn->dn_indblkshift = dnp->dn_indblkshift;
        dn->dn_nlevels = dnp->dn_nlevels;
        dn->dn_type = dnp->dn_type;
@@ -325,45 +408,65 @@ dnode_create(objset_t *os, dnode_phys_t *dnp, dmu_buf_impl_t *db,
        dmu_zfetch_init(&dn->dn_zfetch, dn);
 
        ASSERT(dn->dn_phys->dn_type < DMU_OT_NUMTYPES);
+
        mutex_enter(&os->os_lock);
        list_insert_head(&os->os_dnodes, dn);
+       membar_producer();
+       /*
+        * Everything else must be valid before assigning dn_objset makes the
+        * dnode eligible for dnode_move().
+        */
+       dn->dn_objset = os;
        mutex_exit(&os->os_lock);
 
        arc_space_consume(sizeof (dnode_t), ARC_SPACE_OTHER);
        return (dn);
 }
 
+/*
+ * Caller must be holding the dnode handle, which is released upon return.
+ */
 static void
 dnode_destroy(dnode_t *dn)
 {
        objset_t *os = dn->dn_objset;
 
-#ifdef ZFS_DEBUG
-       int i;
-
-       for (i = 0; i < TXG_SIZE; i++) {
-               ASSERT(!list_link_active(&dn->dn_dirty_link[i]));
-               ASSERT(NULL == list_head(&dn->dn_dirty_records[i]));
-               ASSERT(0 == avl_numnodes(&dn->dn_ranges[i]));
-       }
-       ASSERT(NULL == list_head(&dn->dn_dbufs));
-#endif
        ASSERT((dn->dn_id_flags & DN_ID_NEW_EXIST) == 0);
 
        mutex_enter(&os->os_lock);
+       POINTER_INVALIDATE(&dn->dn_objset);
        list_remove(&os->os_dnodes, dn);
        mutex_exit(&os->os_lock);
 
-       if (dn->dn_dirtyctx_firstset) {
+       /* the dnode can no longer move, so we can release the handle */
+       zrl_remove(&dn->dn_handle->dnh_zrlock);
+
+       dn->dn_allocated_txg = 0;
+       dn->dn_free_txg = 0;
+       dn->dn_assigned_txg = 0;
+
+       dn->dn_dirtyctx = 0;
+       if (dn->dn_dirtyctx_firstset != NULL) {
                kmem_free(dn->dn_dirtyctx_firstset, 1);
                dn->dn_dirtyctx_firstset = NULL;
        }
-       dmu_zfetch_rele(&dn->dn_zfetch);
-       if (dn->dn_bonus) {
+       if (dn->dn_bonus != NULL) {
                mutex_enter(&dn->dn_bonus->db_mtx);
                dbuf_evict(dn->dn_bonus);
                dn->dn_bonus = NULL;
        }
+       dn->dn_zio = NULL;
+
+       dn->dn_have_spill = B_FALSE;
+       dn->dn_oldused = 0;
+       dn->dn_oldflags = 0;
+       dn->dn_olduid = 0;
+       dn->dn_oldgid = 0;
+       dn->dn_newuid = 0;
+       dn->dn_newgid = 0;
+       dn->dn_id_flags = 0;
+
+       dmu_zfetch_rele(&dn->dn_zfetch);
        kmem_cache_free(dnode_cache, dn);
        arc_space_return(sizeof (dnode_t), ARC_SPACE_OTHER);
 }
@@ -408,6 +511,7 @@ dnode_allocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, int ibs,
        ASSERT3P(list_head(&dn->dn_dbufs), ==, NULL);
 
        for (i = 0; i < TXG_SIZE; i++) {
+               ASSERT3U(dn->dn_next_nblkptr[i], ==, 0);
                ASSERT3U(dn->dn_next_nlevels[i], ==, 0);
                ASSERT3U(dn->dn_next_indblkshift[i], ==, 0);
                ASSERT3U(dn->dn_next_bonuslen[i], ==, 0);
@@ -522,9 +626,304 @@ dnode_reallocate(dnode_t *dn, dmu_object_type_t ot, int blocksize,
        mutex_exit(&dn->dn_mtx);
 }
 
+#ifdef DNODE_STATS
+static struct {
+       uint64_t dms_dnode_invalid;
+       uint64_t dms_dnode_recheck1;
+       uint64_t dms_dnode_recheck2;
+       uint64_t dms_dnode_special;
+       uint64_t dms_dnode_handle;
+       uint64_t dms_dnode_rwlock;
+       uint64_t dms_dnode_active;
+} dnode_move_stats;
+#endif /* DNODE_STATS */
+
+static void
+dnode_move_impl(dnode_t *odn, dnode_t *ndn)
+{
+       int i;
+
+       ASSERT(!RW_LOCK_HELD(&odn->dn_struct_rwlock));
+       ASSERT(MUTEX_NOT_HELD(&odn->dn_mtx));
+       ASSERT(MUTEX_NOT_HELD(&odn->dn_dbufs_mtx));
+       ASSERT(!RW_LOCK_HELD(&odn->dn_zfetch.zf_rwlock));
+
+       /* Copy fields. */
+       ndn->dn_objset = odn->dn_objset;
+       ndn->dn_object = odn->dn_object;
+       ndn->dn_dbuf = odn->dn_dbuf;
+       ndn->dn_handle = odn->dn_handle;
+       ndn->dn_phys = odn->dn_phys;
+       ndn->dn_type = odn->dn_type;
+       ndn->dn_bonuslen = odn->dn_bonuslen;
+       ndn->dn_bonustype = odn->dn_bonustype;
+       ndn->dn_nblkptr = odn->dn_nblkptr;
+       ndn->dn_checksum = odn->dn_checksum;
+       ndn->dn_compress = odn->dn_compress;
+       ndn->dn_nlevels = odn->dn_nlevels;
+       ndn->dn_indblkshift = odn->dn_indblkshift;
+       ndn->dn_datablkshift = odn->dn_datablkshift;
+       ndn->dn_datablkszsec = odn->dn_datablkszsec;
+       ndn->dn_datablksz = odn->dn_datablksz;
+       ndn->dn_maxblkid = odn->dn_maxblkid;
+       bcopy(&odn->dn_next_nblkptr[0], &ndn->dn_next_nblkptr[0],
+           sizeof (odn->dn_next_nblkptr));
+       bcopy(&odn->dn_next_nlevels[0], &ndn->dn_next_nlevels[0],
+           sizeof (odn->dn_next_nlevels));
+       bcopy(&odn->dn_next_indblkshift[0], &ndn->dn_next_indblkshift[0],
+           sizeof (odn->dn_next_indblkshift));
+       bcopy(&odn->dn_next_bonustype[0], &ndn->dn_next_bonustype[0],
+           sizeof (odn->dn_next_bonustype));
+       bcopy(&odn->dn_rm_spillblk[0], &ndn->dn_rm_spillblk[0],
+           sizeof (odn->dn_rm_spillblk));
+       bcopy(&odn->dn_next_bonuslen[0], &ndn->dn_next_bonuslen[0],
+           sizeof (odn->dn_next_bonuslen));
+       bcopy(&odn->dn_next_blksz[0], &ndn->dn_next_blksz[0],
+           sizeof (odn->dn_next_blksz));
+       for (i = 0; i < TXG_SIZE; i++) {
+               list_move_tail(&ndn->dn_dirty_records[i],
+                   &odn->dn_dirty_records[i]);
+       }
+       bcopy(&odn->dn_ranges[0], &ndn->dn_ranges[0], sizeof (odn->dn_ranges));
+       ndn->dn_allocated_txg = odn->dn_allocated_txg;
+       ndn->dn_free_txg = odn->dn_free_txg;
+       ndn->dn_assigned_txg = odn->dn_assigned_txg;
+       ndn->dn_dirtyctx = odn->dn_dirtyctx;
+       ndn->dn_dirtyctx_firstset = odn->dn_dirtyctx_firstset;
+       ASSERT(refcount_count(&odn->dn_tx_holds) == 0);
+       refcount_transfer(&ndn->dn_holds, &odn->dn_holds);
+       ASSERT(list_is_empty(&ndn->dn_dbufs));
+       list_move_tail(&ndn->dn_dbufs, &odn->dn_dbufs);
+       ndn->dn_dbufs_count = odn->dn_dbufs_count;
+       ndn->dn_bonus = odn->dn_bonus;
+       ndn->dn_have_spill = odn->dn_have_spill;
+       ndn->dn_zio = odn->dn_zio;
+       ndn->dn_oldused = odn->dn_oldused;
+       ndn->dn_oldflags = odn->dn_oldflags;
+       ndn->dn_olduid = odn->dn_olduid;
+       ndn->dn_oldgid = odn->dn_oldgid;
+       ndn->dn_newuid = odn->dn_newuid;
+       ndn->dn_newgid = odn->dn_newgid;
+       ndn->dn_id_flags = odn->dn_id_flags;
+       dmu_zfetch_init(&ndn->dn_zfetch, NULL);
+       list_move_tail(&ndn->dn_zfetch.zf_stream, &odn->dn_zfetch.zf_stream);
+       ndn->dn_zfetch.zf_dnode = odn->dn_zfetch.zf_dnode;
+       ndn->dn_zfetch.zf_stream_cnt = odn->dn_zfetch.zf_stream_cnt;
+       ndn->dn_zfetch.zf_alloc_fail = odn->dn_zfetch.zf_alloc_fail;
+
+       /*
+        * Update back pointers. Updating the handle fixes the back pointer of
+        * every descendant dbuf as well as the bonus dbuf.
+        */
+       ASSERT(ndn->dn_handle->dnh_dnode == odn);
+       ndn->dn_handle->dnh_dnode = ndn;
+       if (ndn->dn_zfetch.zf_dnode == odn) {
+               ndn->dn_zfetch.zf_dnode = ndn;
+       }
+
+       /*
+        * Invalidate the original dnode by clearing all of its back pointers.
+        */
+       odn->dn_dbuf = NULL;
+       odn->dn_handle = NULL;
+       list_create(&odn->dn_dbufs, sizeof (dmu_buf_impl_t),
+           offsetof(dmu_buf_impl_t, db_link));
+       odn->dn_dbufs_count = 0;
+       odn->dn_bonus = NULL;
+       odn->dn_zfetch.zf_dnode = NULL;
+
+       /*
+        * Set the low bit of the objset pointer to ensure that dnode_move()
+        * recognizes the dnode as invalid in any subsequent callback.
+        */
+       POINTER_INVALIDATE(&odn->dn_objset);
+
+       /*
+        * Satisfy the destructor.
+        */
+       for (i = 0; i < TXG_SIZE; i++) {
+               list_create(&odn->dn_dirty_records[i],
+                   sizeof (dbuf_dirty_record_t),
+                   offsetof(dbuf_dirty_record_t, dr_dirty_node));
+               odn->dn_ranges[i].avl_root = NULL;
+               odn->dn_ranges[i].avl_numnodes = 0;
+               odn->dn_next_nlevels[i] = 0;
+               odn->dn_next_indblkshift[i] = 0;
+               odn->dn_next_bonustype[i] = 0;
+               odn->dn_rm_spillblk[i] = 0;
+               odn->dn_next_bonuslen[i] = 0;
+               odn->dn_next_blksz[i] = 0;
+       }
+       odn->dn_allocated_txg = 0;
+       odn->dn_free_txg = 0;
+       odn->dn_assigned_txg = 0;
+       odn->dn_dirtyctx = 0;
+       odn->dn_dirtyctx_firstset = NULL;
+       odn->dn_have_spill = B_FALSE;
+       odn->dn_zio = NULL;
+       odn->dn_oldused = 0;
+       odn->dn_oldflags = 0;
+       odn->dn_olduid = 0;
+       odn->dn_oldgid = 0;
+       odn->dn_newuid = 0;
+       odn->dn_newgid = 0;
+       odn->dn_id_flags = 0;
+
+       /*
+        * Mark the dnode.
+        */
+       ndn->dn_moved = 1;
+       odn->dn_moved = (uint8_t)-1;
+}
+
+#ifdef _KERNEL
+/*ARGSUSED*/
+static kmem_cbrc_t
+dnode_move(void *buf, void *newbuf, size_t size, void *arg)
+{
+       dnode_t *odn = buf, *ndn = newbuf;
+       objset_t *os;
+       int64_t refcount;
+       uint32_t dbufs;
+
+       /*
+        * The dnode is on the objset's list of known dnodes if the objset
+        * pointer is valid. We set the low bit of the objset pointer when
+        * freeing the dnode to invalidate it, and the memory patterns written
+        * by kmem (baddcafe and deadbeef) set at least one of the two low bits.
+        * A newly created dnode sets the objset pointer last of all to indicate
+        * that the dnode is known and in a valid state to be moved by this
+        * function.
+        */
+       os = odn->dn_objset;
+       if (!POINTER_IS_VALID(os)) {
+               DNODE_STAT_ADD(dnode_move_stats.dms_dnode_invalid);
+               return (KMEM_CBRC_DONT_KNOW);
+       }
+
+       /*
+        * Ensure that the objset does not go away during the move.
+        */
+       rw_enter(&os_lock, RW_WRITER);
+       if (os != odn->dn_objset) {
+               rw_exit(&os_lock);
+               DNODE_STAT_ADD(dnode_move_stats.dms_dnode_recheck1);
+               return (KMEM_CBRC_DONT_KNOW);
+       }
+
+       /*
+        * If the dnode is still valid, then so is the objset. We know that no
+        * valid objset can be freed while we hold os_lock, so we can safely
+        * ensure that the objset remains in use.
+        */
+       mutex_enter(&os->os_lock);
+
+       /*
+        * Recheck the objset pointer in case the dnode was removed just before
+        * acquiring the lock.
+        */
+       if (os != odn->dn_objset) {
+               mutex_exit(&os->os_lock);
+               rw_exit(&os_lock);
+               DNODE_STAT_ADD(dnode_move_stats.dms_dnode_recheck2);
+               return (KMEM_CBRC_DONT_KNOW);
+       }
+
+       /*
+        * At this point we know that as long as we hold os->os_lock, the dnode
+        * cannot be freed and fields within the dnode can be safely accessed.
+        * The objset listing this dnode cannot go away as long as this dnode is
+        * on its list.
+        */
+       rw_exit(&os_lock);
+       if (DMU_OBJECT_IS_SPECIAL(odn->dn_object)) {
+               mutex_exit(&os->os_lock);
+               DNODE_STAT_ADD(dnode_move_stats.dms_dnode_special);
+               return (KMEM_CBRC_NO);
+       }
+       ASSERT(odn->dn_dbuf != NULL); /* only "special" dnodes have no parent */
+
+       /*
+        * Lock the dnode handle to prevent the dnode from obtaining any new
+        * holds. This also prevents the descendant dbufs and the bonus dbuf
+        * from accessing the dnode, so that we can discount their holds. The
+        * handle is safe to access because we know that while the dnode cannot
+        * go away, neither can its handle. Once we hold dnh_zrlock, we can
+        * safely move any dnode referenced only by dbufs.
+        */
+       if (!zrl_tryenter(&odn->dn_handle->dnh_zrlock)) {
+               mutex_exit(&os->os_lock);
+               DNODE_STAT_ADD(dnode_move_stats.dms_dnode_handle);
+               return (KMEM_CBRC_LATER);
+       }
+
+       /*
+        * Ensure a consistent view of the dnode's holds and the dnode's dbufs.
+        * We need to guarantee that there is a hold for every dbuf in order to
+        * determine whether the dnode is actively referenced. Falsely matching
+        * a dbuf to an active hold would lead to an unsafe move. It's possible
+        * that a thread already having an active dnode hold is about to add a
+        * dbuf, and we can't compare hold and dbuf counts while the add is in
+        * progress.
+        */
+       if (!rw_tryenter(&odn->dn_struct_rwlock, RW_WRITER)) {
+               zrl_exit(&odn->dn_handle->dnh_zrlock);
+               mutex_exit(&os->os_lock);
+               DNODE_STAT_ADD(dnode_move_stats.dms_dnode_rwlock);
+               return (KMEM_CBRC_LATER);
+       }
+
+       /*
+        * A dbuf may be removed (evicted) without an active dnode hold. In that
+        * case, the dbuf count is decremented under the handle lock before the
+        * dbuf's hold is released. This order ensures that if we count the hold
+        * after the dbuf is removed but before its hold is released, we will
+        * treat the unmatched hold as active and exit safely. If we count the
+        * hold before the dbuf is removed, the hold is discounted, and the
+        * removal is blocked until the move completes.
+        */
+       refcount = refcount_count(&odn->dn_holds);
+       ASSERT(refcount >= 0);
+       dbufs = odn->dn_dbufs_count;
+
+       /* We can't have more dbufs than dnode holds. */
+       ASSERT3U(dbufs, <=, refcount);
+       DTRACE_PROBE3(dnode__move, dnode_t *, odn, int64_t, refcount,
+           uint32_t, dbufs);
+
+       if (refcount > dbufs) {
+               rw_exit(&odn->dn_struct_rwlock);
+               zrl_exit(&odn->dn_handle->dnh_zrlock);
+               mutex_exit(&os->os_lock);
+               DNODE_STAT_ADD(dnode_move_stats.dms_dnode_active);
+               return (KMEM_CBRC_LATER);
+       }
+
+       rw_exit(&odn->dn_struct_rwlock);
+
+       /*
+        * At this point we know that anyone with a hold on the dnode is not
+        * actively referencing it. The dnode is known and in a valid state to
+        * move. We're holding the locks needed to execute the critical section.
+        */
+       dnode_move_impl(odn, ndn);
+
+       list_link_replace(&odn->dn_link, &ndn->dn_link);
+       /* If the dnode was safe to move, the refcount cannot have changed. */
+       ASSERT(refcount == refcount_count(&ndn->dn_holds));
+       ASSERT(dbufs == ndn->dn_dbufs_count);
+       zrl_exit(&ndn->dn_handle->dnh_zrlock); /* handle has moved */
+       mutex_exit(&os->os_lock);
+
+       return (KMEM_CBRC_YES);
+}
+#endif /* _KERNEL */
+
 void
-dnode_special_close(dnode_t *dn)
+dnode_special_close(dnode_handle_t *dnh)
 {
+       dnode_t *dn = dnh->dnh_dnode;
+
        /*
         * Wait for final references to the dnode to clear.  This can
         * only happen if the arc is asyncronously evicting state that
@@ -533,13 +932,19 @@ dnode_special_close(dnode_t *dn)
         */
        while (refcount_count(&dn->dn_holds) > 0)
                delay(1);
-       dnode_destroy(dn);
+       zrl_add(&dnh->dnh_zrlock);
+       dnode_destroy(dn); /* implicit zrl_remove() */
+       zrl_destroy(&dnh->dnh_zrlock);
+       dnh->dnh_dnode = NULL;
 }
 
 dnode_t *
-dnode_special_open(objset_t *os, dnode_phys_t *dnp, uint64_t object)
+dnode_special_open(objset_t *os, dnode_phys_t *dnp, uint64_t object,
+    dnode_handle_t *dnh)
 {
-       dnode_t *dn = dnode_create(os, dnp, NULL, object);
+       dnode_t *dn = dnode_create(os, dnp, NULL, object, dnh);
+       dnh->dnh_dnode = dn;
+       zrl_init(&dnh->dnh_zrlock);
        DNODE_VERIFY(dn);
        return (dn);
 }
@@ -547,34 +952,43 @@ dnode_special_open(objset_t *os, dnode_phys_t *dnp, uint64_t object)
 static void
 dnode_buf_pageout(dmu_buf_t *db, void *arg)
 {
-       dnode_t **children_dnodes = arg;
+       dnode_children_t *children_dnodes = arg;
        int i;
        int epb = db->db_size >> DNODE_SHIFT;
 
+       ASSERT(epb == children_dnodes->dnc_count);
+
        for (i = 0; i < epb; i++) {
-               dnode_t *dn = children_dnodes[i];
-               int n;
+               dnode_handle_t *dnh = &children_dnodes->dnc_children[i];
+               dnode_t *dn;
 
-               if (dn == NULL)
+               /*
+                * The dnode handle lock guards against the dnode moving to
+                * another valid address, so there is no need here to guard
+                * against changes to or from NULL.
+                */
+               if (dnh->dnh_dnode == NULL) {
+                       zrl_destroy(&dnh->dnh_zrlock);
                        continue;
-#ifdef ZFS_DEBUG
+               }
+
+               zrl_add(&dnh->dnh_zrlock);
+               dn = dnh->dnh_dnode;
                /*
                 * If there are holds on this dnode, then there should
                 * be holds on the dnode's containing dbuf as well; thus
-                * it wouldn't be eligable for eviction and this function
+                * it wouldn't be eligible for eviction and this function
                 * would not have been called.
                 */
                ASSERT(refcount_is_zero(&dn->dn_holds));
-               ASSERT(list_head(&dn->dn_dbufs) == NULL);
                ASSERT(refcount_is_zero(&dn->dn_tx_holds));
 
-               for (n = 0; n < TXG_SIZE; n++)
-                       ASSERT(!list_link_active(&dn->dn_dirty_link[n]));
-#endif
-               children_dnodes[i] = NULL;
-               dnode_destroy(dn);
+               dnode_destroy(dn); /* implicit zrl_remove() */
+               zrl_destroy(&dnh->dnh_zrlock);
+               dnh->dnh_dnode = NULL;
        }
-       kmem_free(children_dnodes, epb * sizeof (dnode_t *));
+       kmem_free(children_dnodes, sizeof (dnode_children_t) +
+           (epb - 1) * sizeof (dnode_handle_t));
 }
 
 /*
@@ -593,7 +1007,8 @@ dnode_hold_impl(objset_t *os, uint64_t object, int flag,
        uint64_t blk;
        dnode_t *mdn, *dn;
        dmu_buf_impl_t *db;
-       dnode_t **children_dnodes;
+       dnode_children_t *children_dnodes;
+       dnode_handle_t *dnh;
 
        /*
         * If you are holding the spa config lock as writer, you shouldn't
@@ -603,12 +1018,11 @@ dnode_hold_impl(objset_t *os, uint64_t object, int flag,
         */
        ASSERT(spa_config_held(os->os_spa, SCL_ALL, RW_WRITER) == 0 ||
            (spa_is_root(os->os_spa) &&
-           spa_config_held(os->os_spa, SCL_STATE, RW_WRITER) &&
-           !spa_config_held(os->os_spa, SCL_ZIO, RW_WRITER)));
+           spa_config_held(os->os_spa, SCL_STATE, RW_WRITER)));
 
        if (object == DMU_USERUSED_OBJECT || object == DMU_GROUPUSED_OBJECT) {
                dn = (object == DMU_USERUSED_OBJECT) ?
-                   os->os_userused_dnode : os->os_groupused_dnode;
+                   DMU_USERUSED_DNODE(os) : DMU_GROUPUSED_DNODE(os);
                if (dn == NULL)
                        return (ENOENT);
                type = dn->dn_type;
@@ -625,7 +1039,8 @@ dnode_hold_impl(objset_t *os, uint64_t object, int flag,
        if (object == 0 || object >= DN_MAX_OBJECT)
                return (EINVAL);
 
-       mdn = os->os_meta_dnode;
+       mdn = DMU_META_DNODE(os);
+       ASSERT(mdn->dn_object == DMU_META_DNODE_OBJECT);
 
        DNODE_VERIFY(mdn);
 
@@ -652,26 +1067,39 @@ dnode_hold_impl(objset_t *os, uint64_t object, int flag,
 
        idx = object & (epb-1);
 
+       ASSERT(DB_DNODE(db)->dn_type == DMU_OT_DNODE);
        children_dnodes = dmu_buf_get_user(&db->db);
        if (children_dnodes == NULL) {
-               dnode_t **winner;
-               children_dnodes = kmem_zalloc(epb * sizeof (dnode_t *),
-                   KM_SLEEP);
+               int i;
+               dnode_children_t *winner;
+               children_dnodes = kmem_alloc(sizeof (dnode_children_t) +
+                   (epb - 1) * sizeof (dnode_handle_t), KM_SLEEP);
+               children_dnodes->dnc_count = epb;
+               dnh = &children_dnodes->dnc_children[0];
+               for (i = 0; i < epb; i++) {
+                       zrl_init(&dnh[i].dnh_zrlock);
+                       dnh[i].dnh_dnode = NULL;
+               }
                if (winner = dmu_buf_set_user(&db->db, children_dnodes, NULL,
                    dnode_buf_pageout)) {
-                       kmem_free(children_dnodes, epb * sizeof (dnode_t *));
+                       kmem_free(children_dnodes, sizeof (dnode_children_t) +
+                           (epb - 1) * sizeof (dnode_handle_t));
                        children_dnodes = winner;
                }
        }
+       ASSERT(children_dnodes->dnc_count == epb);
 
-       if ((dn = children_dnodes[idx]) == NULL) {
-               dnode_phys_t *dnp = (dnode_phys_t *)db->db.db_data+idx;
+       dnh = &children_dnodes->dnc_children[idx];
+       zrl_add(&dnh->dnh_zrlock);
+       if ((dn = dnh->dnh_dnode) == NULL) {
+               dnode_phys_t *phys = (dnode_phys_t *)db->db.db_data+idx;
                dnode_t *winner;
 
-               dn = dnode_create(os, dnp, db, object);
-               winner = atomic_cas_ptr(&children_dnodes[idx], NULL, dn);
+               dn = dnode_create(os, phys, db, object, dnh);
+               winner = atomic_cas_ptr(&dnh->dnh_dnode, NULL, dn);
                if (winner != NULL) {
-                       dnode_destroy(dn);
+                       zrl_add(&dnh->dnh_zrlock);
+                       dnode_destroy(dn); /* implicit zrl_remove() */
                        dn = winner;
                }
        }
@@ -683,13 +1111,16 @@ dnode_hold_impl(objset_t *os, uint64_t object, int flag,
            ((flag & DNODE_MUST_BE_FREE) &&
            (type != DMU_OT_NONE || !refcount_is_zero(&dn->dn_holds)))) {
                mutex_exit(&dn->dn_mtx);
+               zrl_remove(&dnh->dnh_zrlock);
                dbuf_rele(db, FTAG);
                return (type == DMU_OT_NONE ? ENOENT : EEXIST);
        }
        mutex_exit(&dn->dn_mtx);
 
        if (refcount_add(&dn->dn_holds, tag) == 1)
-               dbuf_add_ref(db, dn);
+               dbuf_add_ref(db, dnh);
+       /* Now we can rely on the hold to prevent the dnode from moving. */
+       zrl_remove(&dnh->dnh_zrlock);
 
        DNODE_VERIFY(dn);
        ASSERT3P(dn->dn_dbuf, ==, db);
@@ -731,13 +1162,37 @@ void
 dnode_rele(dnode_t *dn, void *tag)
 {
        uint64_t refs;
+       /* Fetch db and dnh while the hold prevents the dnode from moving. */
+       dmu_buf_impl_t *db = dn->dn_dbuf;
+       dnode_handle_t *dnh = dn->dn_handle;
 
        mutex_enter(&dn->dn_mtx);
        refs = refcount_remove(&dn->dn_holds, tag);
        mutex_exit(&dn->dn_mtx);
+
+       /*
+        * It's unsafe to release the last hold on a dnode by dnode_rele() or
+        * indirectly by dbuf_rele() while relying on the dnode handle to
+        * prevent the dnode from moving, since releasing the last hold could
+        * result in the dnode's parent dbuf evicting its dnode handles. For
+        * that reason anyone calling dnode_rele() or dbuf_rele() without some
+        * other direct or indirect hold on the dnode must first drop the dnode
+        * handle.
+        */
+       ASSERT(refs > 0 || dnh->dnh_zrlock.zr_owner != curthread);
+
        /* NOTE: the DNODE_DNODE does not have a dn_dbuf */
-       if (refs == 0 && dn->dn_dbuf)
-               dbuf_rele(dn->dn_dbuf, dn);
+       if (refs == 0 && db != NULL) {
+               /*
+                * Another thread could add a hold to the dnode handle in
+                * dnode_hold_impl() while holding the parent dbuf. Since the
+                * hold on the parent dbuf prevents the handle from being
+                * destroyed, the hold on the handle is OK. We can't yet assert
+                * that the handle has zero references, but that will be
+                * asserted anyway when the handle gets destroyed.
+                */
+               dbuf_rele(db, dnh);
+       }
 }
 
 void
@@ -756,7 +1211,7 @@ dnode_setdirty(dnode_t *dn, dmu_tx_t *tx)
 #ifdef ZFS_DEBUG
        mutex_enter(&dn->dn_mtx);
        ASSERT(dn->dn_phys->dn_type || dn->dn_allocated_txg);
-       /* ASSERT(dn->dn_free_txg == 0 || dn->dn_free_txg >= txg); */
+       ASSERT(dn->dn_free_txg == 0 || dn->dn_free_txg >= txg);
        mutex_exit(&dn->dn_mtx);
 #endif
 
@@ -795,7 +1250,7 @@ dnode_setdirty(dnode_t *dn, dmu_tx_t *tx)
        /*
         * The dnode maintains a hold on its containing dbuf as
         * long as there are holds on it.  Each instantiated child
-        * dbuf maintaines a hold on the dnode.  When the last child
+        * dbuf maintains a hold on the dnode.  When the last child
         * drops its hold, the dnode will drop its hold on the
         * containing dbuf. We add a "dirty hold" here so that the
         * dnode will hang around after we finish processing its
index f9ec9f6023d2abc3c4feb63c2111756cd2ce58b9..2ee990a3b32c42c9f4c53635b21ff5b684c1860d 100644 (file)
@@ -76,7 +76,11 @@ dnode_increase_indirection(dnode_t *dn, dmu_tx_t *tx)
 
                if (child == NULL)
                        continue;
-               ASSERT3P(child->db_dnode, ==, dn);
+#ifdef DEBUG
+               DB_DNODE_ENTER(child);
+               ASSERT3P(DB_DNODE(child), ==, dn);
+               DB_DNODE_EXIT(child);
+#endif /* DEBUG */
                if (child->db_parent && child->db_parent != dn->dn_dbuf) {
                        ASSERT(child->db_parent->db_level == db->db_level);
                        ASSERT(child->db_blkptr !=
@@ -135,15 +139,18 @@ free_verify(dmu_buf_impl_t *db, uint64_t start, uint64_t end, dmu_tx_t *tx)
        int off, num;
        int i, err, epbs;
        uint64_t txg = tx->tx_txg;
+       dnode_t *dn;
 
-       epbs = db->db_dnode->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
+       DB_DNODE_ENTER(db);
+       dn = DB_DNODE(db);
+       epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
        off = start - (db->db_blkid * 1<<epbs);
        num = end - start + 1;
 
        ASSERT3U(off, >=, 0);
        ASSERT3U(num, >=, 0);
        ASSERT3U(db->db_level, >, 0);
-       ASSERT3U(db->db.db_size, ==, 1<<db->db_dnode->dn_phys->dn_indblkshift);
+       ASSERT3U(db->db.db_size, ==, 1 << dn->dn_phys->dn_indblkshift);
        ASSERT3U(off+num, <=, db->db.db_size >> SPA_BLKPTRSHIFT);
        ASSERT(db->db_blkptr != NULL);
 
@@ -155,10 +162,10 @@ free_verify(dmu_buf_impl_t *db, uint64_t start, uint64_t end, dmu_tx_t *tx)
 
                ASSERT(db->db_level == 1);
 
-               rw_enter(&db->db_dnode->dn_struct_rwlock, RW_READER);
-               err = dbuf_hold_impl(db->db_dnode, db->db_level-1,
+               rw_enter(&dn->dn_struct_rwlock, RW_READER);
+               err = dbuf_hold_impl(dn, db->db_level-1,
                    (db->db_blkid << epbs) + i, TRUE, FTAG, &child);
-               rw_exit(&db->db_dnode->dn_struct_rwlock);
+               rw_exit(&dn->dn_struct_rwlock);
                if (err == ENOENT)
                        continue;
                ASSERT(err == 0);
@@ -200,6 +207,7 @@ free_verify(dmu_buf_impl_t *db, uint64_t start, uint64_t end, dmu_tx_t *tx)
 
                dbuf_rele(child, FTAG);
        }
+       DB_DNODE_EXIT(db);
 }
 #endif
 
@@ -209,7 +217,7 @@ static int
 free_children(dmu_buf_impl_t *db, uint64_t blkid, uint64_t nblks, int trunc,
     dmu_tx_t *tx)
 {
-       dnode_t *dn = db->db_dnode;
+       dnode_t *dn;
        blkptr_t *bp;
        dmu_buf_impl_t *subdb;
        uint64_t start, end, dbstart, dbend, i;
@@ -230,7 +238,9 @@ free_children(dmu_buf_impl_t *db, uint64_t blkid, uint64_t nblks, int trunc,
        dbuf_release_bp(db);
        bp = (blkptr_t *)db->db.db_data;
 
-       epbs = db->db_dnode->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
+       DB_DNODE_ENTER(db);
+       dn = DB_DNODE(db);
+       epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
        shift = (db->db_level - 1) * epbs;
        dbstart = db->db_blkid << epbs;
        start = blkid >> shift;
@@ -253,6 +263,7 @@ free_children(dmu_buf_impl_t *db, uint64_t blkid, uint64_t nblks, int trunc,
                blocks_freed = free_blocks(dn, bp, end-start+1, tx);
                arc_buf_freeze(db->db_buf);
                ASSERT(all || blocks_freed == 0 || db->db_last_dirty);
+               DB_DNODE_EXIT(db);
                return (all ? ALL : blocks_freed);
        }
 
@@ -272,6 +283,7 @@ free_children(dmu_buf_impl_t *db, uint64_t blkid, uint64_t nblks, int trunc,
                }
                dbuf_rele(subdb, FTAG);
        }
+       DB_DNODE_EXIT(db);
        arc_buf_freeze(db->db_buf);
 #ifdef ZFS_DEBUG
        bp -= (end-start)+1;
@@ -375,7 +387,11 @@ dnode_evict_dbufs(dnode_t *dn)
                for (; db != &marker; db = list_head(&dn->dn_dbufs)) {
                        list_remove(&dn->dn_dbufs, db);
                        list_insert_tail(&dn->dn_dbufs, db);
-                       ASSERT3P(db->db_dnode, ==, dn);
+#ifdef DEBUG
+                       DB_DNODE_ENTER(db);
+                       ASSERT3P(DB_DNODE(db), ==, dn);
+                       DB_DNODE_EXIT(db);
+#endif /* DEBUG */
 
                        mutex_enter(&db->db_mtx);
                        if (db->db_state == DB_EVICTING) {
index ddd83576c65e8fbc69cb6b6eba180bc89cc0f66f..59ac4a60947a6bb045e9fd1693431c2156dd48e2 100644 (file)
 #include <sys/zfs_ioctl.h>
 #include <sys/spa.h>
 #include <sys/zfs_znode.h>
+#include <sys/zfs_onexit.h>
 #include <sys/zvol.h>
 #include <sys/dsl_scan.h>
 #include <sys/dsl_deadlist.h>
 
-/*
- * Enable/disable prefetching of dedup-ed blocks which are going to be freed.
- */
-int zfs_dedup_prefetch = 1;
-
 static char *dsl_reaper = "the grim reaper";
 
 static dsl_checkfunc_t dsl_dataset_destroy_begin_check;
@@ -253,8 +249,7 @@ dsl_dataset_block_freeable(dsl_dataset_t *ds, const blkptr_t *bp,
        if (blk_birth <= dsl_dataset_prev_snap_txg(ds))
                return (B_FALSE);
 
-       if (zfs_dedup_prefetch && bp && BP_GET_DEDUP(bp))
-               ddt_prefetch(dsl_dataset_get_spa(ds), bp);
+       ddt_prefetch(dsl_dataset_get_spa(ds), bp);
 
        return (B_TRUE);
 }
@@ -372,6 +367,7 @@ dsl_dataset_get_ref(dsl_pool_t *dp, uint64_t dsobj, void *tag,
        dmu_buf_t *dbuf;
        dsl_dataset_t *ds;
        int err;
+       dmu_object_info_t doi;
 
        ASSERT(RW_LOCK_HELD(&dp->dp_config_rwlock) ||
            dsl_pool_sync_context(dp));
@@ -379,6 +375,15 @@ dsl_dataset_get_ref(dsl_pool_t *dp, uint64_t dsobj, void *tag,
        err = dmu_bonus_hold(mos, dsobj, tag, &dbuf);
        if (err)
                return (err);
+
+       /* Make sure dsobj has the correct object type. */
+       dmu_object_info_from_db(dbuf, &doi);
+       if (doi.doi_type != DMU_OT_DSL_DATASET) {
+               /* Release the hold from dmu_bonus_hold() before failing. */
+               dmu_buf_rele(dbuf, tag);
+               return (EINVAL);
+       }
+
        ds = dmu_buf_get_user(dbuf);
        if (ds == NULL) {
                dsl_dataset_t *winner;
@@ -881,6 +883,21 @@ dsl_dataset_create_sync(dsl_dir_t *pdd, const char *lastname,
 
        dsl_dir_close(dd, FTAG);
 
+       /*
+        * If we are creating a clone, make sure we zero out any stale
+        * data from the origin snapshot's ZIL header.
+        */
+       if (origin != NULL) {
+               dsl_dataset_t *ds;
+               objset_t *os;
+
+               VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds));
+               VERIFY3U(0, ==, dmu_objset_from_ds(ds, &os));
+               bzero(&os->os_zil_header, sizeof (os->os_zil_header));
+               dsl_dataset_dirty(ds, tx);
+               dsl_dataset_rele(ds, FTAG);
+       }
+
        return (dsobj);
 }
 
@@ -1081,11 +1098,16 @@ dsl_dataset_destroy(dsl_dataset_t *ds, void *tag, boolean_t defer)
                 */
                (void) dmu_free_object(os, obj);
        }
+       if (err != ESRCH)
+               goto out;
 
        /*
-        * We need to sync out all in-flight IO before we try to evict
-        * (the dataset evict func is trying to clear the cached entries
-        * for this dataset in the ARC).
+        * Only the ZIL knows how to free log blocks.
+        */
+       zil_destroy(dmu_objset_zil(os), B_FALSE);
+
+       /*
+        * Sync out all in-flight IO.
         */
        txg_wait_synced(dd->dd_pool, 0);
 
@@ -1103,9 +1125,6 @@ dsl_dataset_destroy(dsl_dataset_t *ds, void *tag, boolean_t defer)
                    count == 0);
        }
 
-       if (err != ESRCH)
-               goto out;
-
        rw_enter(&dd->dd_pool->dp_config_rwlock, RW_READER);
        err = dsl_dir_open_obj(dd->dd_pool, dd->dd_object, NULL, FTAG, &dd);
        rw_exit(&dd->dd_pool->dp_config_rwlock);
@@ -1356,6 +1375,11 @@ dsl_dataset_origin_check(struct dsl_ds_destroyarg *dsda, void *tag,
        return (0);
 }
 
+/*
+ * If you add new checks here, you may need to add
+ * additional checks to the "temporary" case in
+ * snapshot_check() in dmu_objset.c.
+ */
 /* ARGSUSED */
 int
 dsl_dataset_destroy_check(void *arg1, void *arg2, dmu_tx_t *tx)
@@ -1597,21 +1621,23 @@ dsl_dataset_destroy_sync(void *arg1, void *tag, dmu_tx_t *tx)
        dsl_pool_t *dp = ds->ds_dir->dd_pool;
        objset_t *mos = dp->dp_meta_objset;
        dsl_dataset_t *ds_prev = NULL;
+       boolean_t wont_destroy;
        uint64_t obj;
 
-       ASSERT(ds->ds_owner);
+       wont_destroy = (dsda->defer &&
+           (ds->ds_userrefs > 0 || ds->ds_phys->ds_num_children > 1));
+
+       ASSERT(ds->ds_owner || wont_destroy);
        ASSERT(dsda->defer || ds->ds_phys->ds_num_children <= 1);
        ASSERT(ds->ds_prev == NULL ||
            ds->ds_prev->ds_phys->ds_next_snap_obj != ds->ds_object);
        ASSERT3U(ds->ds_phys->ds_bp.blk_birth, <=, tx->tx_txg);
 
-       if (dsda->defer) {
+       if (wont_destroy) {
                ASSERT(spa_version(dp->dp_spa) >= SPA_VERSION_USERREFS);
-               if (ds->ds_userrefs > 0 || ds->ds_phys->ds_num_children > 1) {
-                       dmu_buf_will_dirty(ds->ds_dbuf, tx);
-                       ds->ds_phys->ds_flags |= DS_FLAG_DEFER_DESTROY;
-                       return;
-               }
+               dmu_buf_will_dirty(ds->ds_dbuf, tx);
+               ds->ds_phys->ds_flags |= DS_FLAG_DEFER_DESTROY;
+               return;
        }
 
        /* signal any waiters that this dataset is going away */
@@ -1620,11 +1646,6 @@ dsl_dataset_destroy_sync(void *arg1, void *tag, dmu_tx_t *tx)
        cv_broadcast(&ds->ds_exclusive_cv);
        mutex_exit(&ds->ds_lock);
 
-       if (ds->ds_objset) {
-               dmu_objset_evict(ds->ds_objset);
-               ds->ds_objset = NULL;
-       }
-
        /* Remove our reservation */
        if (ds->ds_reserved != 0) {
                dsl_prop_setarg_t psa;
@@ -1850,6 +1871,15 @@ dsl_dataset_destroy_sync(void *arg1, void *tag, dmu_tx_t *tx)
                }
        }
 
+       /*
+        * This must be done after the dsl_traverse(), because it will
+        * re-open the objset.
+        */
+       if (ds->ds_objset) {
+               dmu_objset_evict(ds->ds_objset);
+               ds->ds_objset = NULL;
+       }
+
        if (ds->ds_dir->dd_phys->dd_head_dataset_obj == ds->ds_object) {
                /* Erase the link in the dir */
                dmu_buf_will_dirty(ds->ds_dir->dd_dbuf, tx);
@@ -1928,7 +1958,7 @@ dsl_dataset_snapshot_reserve_space(dsl_dataset_t *ds, dmu_tx_t *tx)
         */
        ASSERT(ds->ds_reserved == 0 || DS_UNIQUE_IS_ACCURATE(ds));
        asize = MIN(ds->ds_phys->ds_unique_bytes, ds->ds_reserved);
-       if (asize > dsl_dir_space_available(ds->ds_dir, NULL, 0, FALSE))
+       if (asize > dsl_dir_space_available(ds->ds_dir, NULL, 0, TRUE))
                return (ENOSPC);
 
        /*
@@ -2224,8 +2254,21 @@ dsl_dataset_modified_since_lastsnap(dsl_dataset_t *ds)
        if (ds->ds_prev == NULL)
                return (B_FALSE);
        if (ds->ds_phys->ds_bp.blk_birth >
-           ds->ds_prev->ds_phys->ds_creation_txg)
-               return (B_TRUE);
+           ds->ds_prev->ds_phys->ds_creation_txg) {
+               objset_t *os, *os_prev;
+               /*
+                * It may be that only the ZIL differs, because it was
+                * reset in the head.  Don't count that as being
+                * modified.
+                */
+               if (dmu_objset_from_ds(ds, &os) != 0)
+                       return (B_TRUE);
+               if (dmu_objset_from_ds(ds->ds_prev, &os_prev) != 0)
+                       return (B_TRUE);
+               return (bcmp(&os->os_phys->os_meta_dnode,
+                   &os_prev->os_phys->os_meta_dnode,
+                   sizeof (os->os_phys->os_meta_dnode)) != 0);
+       }
        return (B_FALSE);
 }
 
@@ -3144,9 +3187,14 @@ dsl_dataset_clone_swap(dsl_dataset_t *clone, dsl_dataset_t *origin_head,
        ASSERT(clone->ds_owner);
        ASSERT(origin_head->ds_owner);
 retry:
-       /* Need exclusive access for the swap */
-       rw_enter(&clone->ds_rwlock, RW_WRITER);
-       if (!rw_tryenter(&origin_head->ds_rwlock, RW_WRITER)) {
+       /*
+        * Need exclusive access for the swap. If we're swapping these
+        * datasets back after an error, we already hold the locks.
+        */
+       if (!RW_WRITE_HELD(&clone->ds_rwlock))
+               rw_enter(&clone->ds_rwlock, RW_WRITER);
+       if (!RW_WRITE_HELD(&origin_head->ds_rwlock) &&
+           !rw_tryenter(&origin_head->ds_rwlock, RW_WRITER)) {
                rw_exit(&clone->ds_rwlock);
                rw_enter(&origin_head->ds_rwlock, RW_WRITER);
                if (!rw_tryenter(&clone->ds_rwlock, RW_WRITER)) {
@@ -3411,22 +3459,41 @@ dsl_dataset_set_reservation(const char *dsname, zprop_source_t source,
        return (err);
 }
 
-struct dsl_ds_holdarg {
-       dsl_sync_task_group_t *dstg;
-       char *htag;
-       char *snapname;
-       boolean_t recursive;
-       boolean_t gotone;
-       boolean_t temphold;
-       char failed[MAXPATHLEN];
-};
+typedef struct zfs_hold_cleanup_arg {
+       dsl_pool_t *dp;
+       uint64_t dsobj;
+       char htag[MAXNAMELEN];
+} zfs_hold_cleanup_arg_t;
+
+static void
+dsl_dataset_user_release_onexit(void *arg)
+{
+       zfs_hold_cleanup_arg_t *ca = arg;
+
+       (void) dsl_dataset_user_release_tmp(ca->dp, ca->dsobj, ca->htag,
+           B_TRUE);
+       kmem_free(ca, sizeof (zfs_hold_cleanup_arg_t));
+}
+
+void
+dsl_register_onexit_hold_cleanup(dsl_dataset_t *ds, const char *htag,
+    minor_t minor)
+{
+       zfs_hold_cleanup_arg_t *ca;
+
+       ca = kmem_alloc(sizeof (zfs_hold_cleanup_arg_t), KM_SLEEP);
+       ca->dp = ds->ds_dir->dd_pool;
+       ca->dsobj = ds->ds_object;
+       (void) strlcpy(ca->htag, htag, sizeof (ca->htag));
+       VERIFY3U(0, ==, zfs_onexit_add_cb(minor,
+           dsl_dataset_user_release_onexit, ca, NULL));
+}
 
 /*
- * The max length of a temporary tag prefix is the number of hex digits
- * required to express UINT64_MAX plus one for the hyphen.
+ * If you add new checks here, you may need to add
+ * additional checks to the "temporary" case in
+ * snapshot_check() in dmu_objset.c.
  */
-#define        MAX_TAG_PREFIX_LEN      17
-
 static int
 dsl_dataset_user_hold_check(void *arg1, void *arg2, dmu_tx_t *tx)
 {
@@ -3461,7 +3528,7 @@ dsl_dataset_user_hold_check(void *arg1, void *arg2, dmu_tx_t *tx)
        return (error);
 }
 
-static void
+void
 dsl_dataset_user_hold_sync(void *arg1, void *arg2, dmu_tx_t *tx)
 {
        dsl_dataset_t *ds = arg1;
@@ -3523,14 +3590,42 @@ dsl_dataset_user_hold_one(const char *dsname, void *arg)
        return (error);
 }
 
+int
+dsl_dataset_user_hold_for_send(dsl_dataset_t *ds, char *htag,
+    boolean_t temphold)
+{
+       struct dsl_ds_holdarg *ha;
+       int error;
+
+       ha = kmem_zalloc(sizeof (struct dsl_ds_holdarg), KM_SLEEP);
+       ha->htag = htag;
+       ha->temphold = temphold;
+       error = dsl_sync_task_do(ds->ds_dir->dd_pool,
+           dsl_dataset_user_hold_check, dsl_dataset_user_hold_sync,
+           ds, ha, 0);
+       kmem_free(ha, sizeof (struct dsl_ds_holdarg));
+
+       return (error);
+}
+
 int
 dsl_dataset_user_hold(char *dsname, char *snapname, char *htag,
-    boolean_t recursive, boolean_t temphold)
+    boolean_t recursive, boolean_t temphold, int cleanup_fd)
 {
        struct dsl_ds_holdarg *ha;
        dsl_sync_task_t *dst;
        spa_t *spa;
        int error;
+       minor_t minor = 0;
+
+       if (cleanup_fd != -1) {
+               /* Currently we only support cleanup-on-exit of tempholds. */
+               if (!temphold)
+                       return (EINVAL);
+               error = zfs_onexit_fd_hold(cleanup_fd, &minor);
+               if (error)
+                       return (error);
+       }
 
        ha = kmem_zalloc(sizeof (struct dsl_ds_holdarg), KM_SLEEP);
 
@@ -3539,6 +3634,8 @@ dsl_dataset_user_hold(char *dsname, char *snapname, char *htag,
        error = spa_open(dsname, &spa, FTAG);
        if (error) {
                kmem_free(ha, sizeof (struct dsl_ds_holdarg));
+               if (cleanup_fd != -1)
+                       zfs_onexit_fd_rele(cleanup_fd);
                return (error);
        }
 
@@ -3547,6 +3644,7 @@ dsl_dataset_user_hold(char *dsname, char *snapname, char *htag,
        ha->snapname = snapname;
        ha->recursive = recursive;
        ha->temphold = temphold;
+
        if (recursive) {
                error = dmu_objset_find(dsname, dsl_dataset_user_hold_one,
                    ha, DS_FIND_CHILDREN);
@@ -3563,6 +3661,12 @@ dsl_dataset_user_hold(char *dsname, char *snapname, char *htag,
                if (dst->dst_err) {
                        dsl_dataset_name(ds, ha->failed);
                        *strchr(ha->failed, '@') = '\0';
+               } else if (error == 0 && minor != 0 && temphold) {
+                       /*
+                        * If this hold is to be released upon process exit,
+                        * register that action now.
+                        */
+                       dsl_register_onexit_hold_cleanup(ds, htag, minor);
                }
                dsl_dataset_rele(ds, ha->dstg);
        }
@@ -3574,8 +3678,11 @@ dsl_dataset_user_hold(char *dsname, char *snapname, char *htag,
                (void) strlcpy(dsname, ha->failed, sizeof (ha->failed));
 
        dsl_sync_task_group_destroy(ha->dstg);
+
        kmem_free(ha, sizeof (struct dsl_ds_holdarg));
        spa_close(spa, FTAG);
+       if (cleanup_fd != -1)
+               zfs_onexit_fd_rele(cleanup_fd);
        return (error);
 }
 
@@ -3667,11 +3774,6 @@ dsl_dataset_user_release_sync(void *arg1, void *tag, dmu_tx_t *tx)
        uint64_t refs;
        int error;
 
-       if (ds->ds_objset) {
-               dmu_objset_evict(ds->ds_objset);
-               ds->ds_objset = NULL;
-       }
-
        mutex_enter(&ds->ds_lock);
        ds->ds_userrefs--;
        refs = ds->ds_userrefs;
@@ -3831,10 +3933,12 @@ top:
 }
 
 /*
- * Called at spa_load time to release a stale temporary user hold.
+ * Called at spa_load time (with retry == B_FALSE) to release a stale
+ * temporary user hold. Also called by the onexit code (with retry == B_TRUE).
  */
 int
-dsl_dataset_user_release_tmp(dsl_pool_t *dp, uint64_t dsobj, char *htag)
+dsl_dataset_user_release_tmp(dsl_pool_t *dp, uint64_t dsobj, char *htag,
+    boolean_t retry)
 {
        dsl_dataset_t *ds;
        char *snap;
@@ -3842,20 +3946,36 @@ dsl_dataset_user_release_tmp(dsl_pool_t *dp, uint64_t dsobj, char *htag)
        int namelen;
        int error;
 
-       rw_enter(&dp->dp_config_rwlock, RW_READER);
-       error = dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds);
-       rw_exit(&dp->dp_config_rwlock);
-       if (error)
-               return (error);
-       namelen = dsl_dataset_namelen(ds)+1;
-       name = kmem_alloc(namelen, KM_SLEEP);
-       dsl_dataset_name(ds, name);
-       dsl_dataset_rele(ds, FTAG);
+       do {
+               rw_enter(&dp->dp_config_rwlock, RW_READER);
+               error = dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds);
+               rw_exit(&dp->dp_config_rwlock);
+               if (error)
+                       return (error);
+               namelen = dsl_dataset_namelen(ds)+1;
+               name = kmem_alloc(namelen, KM_SLEEP);
+               dsl_dataset_name(ds, name);
+               dsl_dataset_rele(ds, FTAG);
 
-       snap = strchr(name, '@');
-       *snap = '\0';
-       ++snap;
-       return (dsl_dataset_user_release(name, snap, htag, B_FALSE));
+               snap = strchr(name, '@');
+               *snap = '\0';
+               ++snap;
+               error = dsl_dataset_user_release(name, snap, htag, B_FALSE);
+               kmem_free(name, namelen);
+
+               /*
+                * The object can't have been destroyed because we have a hold,
+                * but it might have been renamed, resulting in ENOENT.  Retry
+                * if we've been requested to do so.
+                *
+                * It would be nice if we could use the dsobj all the way
+                * through and avoid ENOENT entirely.  But we might need to
+                * unmount the snapshot, and there's currently no way to look up
+                * a vfsp using a ZFS object id.
+                */
+       } while ((error == ENOENT) && retry);
+
+       return (error);
 }
 
 int
index 85490c8d5fd9247743f9ff6ea7ff9d285984d183..529fb052fa75db007efb217169b920ef16545c3b 100644 (file)
@@ -19,7 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 
 /*
@@ -528,9 +528,8 @@ dsl_load_user_sets(objset_t *mos, uint64_t zapobj, avl_tree_t *avl,
  * Check if user has requested permission.
  */
 int
-dsl_deleg_access(const char *dsname, const char *perm, cred_t *cr)
+dsl_deleg_access_impl(dsl_dataset_t *ds, const char *perm, cred_t *cr)
 {
-       dsl_dataset_t *ds;
        dsl_dir_t *dd;
        dsl_pool_t *dp;
        void *cookie;
@@ -540,23 +539,15 @@ dsl_deleg_access(const char *dsname, const char *perm, cred_t *cr)
        avl_tree_t permsets;
        perm_set_t *setnode;
 
-       error = dsl_dataset_hold(dsname, FTAG, &ds);
-       if (error)
-               return (error);
-
        dp = ds->ds_dir->dd_pool;
        mos = dp->dp_meta_objset;
 
-       if (dsl_delegation_on(mos) == B_FALSE) {
-               dsl_dataset_rele(ds, FTAG);
+       if (dsl_delegation_on(mos) == B_FALSE)
                return (ECANCELED);
-       }
 
        if (spa_version(dmu_objset_spa(dp->dp_meta_objset)) <
-           SPA_VERSION_DELEGATED_PERMS) {
-               dsl_dataset_rele(ds, FTAG);
+           SPA_VERSION_DELEGATED_PERMS)
                return (EPERM);
-       }
 
        if (dsl_dataset_is_snapshot(ds)) {
                /*
@@ -633,7 +624,6 @@ again:
        error = EPERM;
 success:
        rw_exit(&dp->dp_config_rwlock);
-       dsl_dataset_rele(ds, FTAG);
 
        cookie = NULL;
        while ((setnode = avl_destroy_nodes(&permsets, &cookie)) != NULL)
@@ -642,6 +632,22 @@ success:
        return (error);
 }
 
+int
+dsl_deleg_access(const char *dsname, const char *perm, cred_t *cr)
+{
+       dsl_dataset_t *ds;
+       int error;
+
+       error = dsl_dataset_hold(dsname, FTAG, &ds);
+       if (error)
+               return (error);
+
+       error = dsl_deleg_access_impl(ds, perm, cr);
+       dsl_dataset_rele(ds, FTAG);
+
+       return (error);
+}
+
 /*
  * Other routines.
  */
index 2cd21a102beb10904ae8139993af786167265dcf..700cc962865da79b18a5876e86725715144502d9 100644 (file)
@@ -42,7 +42,7 @@
 
 int zfs_no_write_throttle = 0;
 int zfs_write_limit_shift = 3;                 /* 1/8th of physical memory */
-int zfs_txg_synctime_ms = 5000;                /* target millisecs to sync a txg */
+int zfs_txg_synctime_ms = 1000;                /* target millisecs to sync a txg */
 
 uint64_t zfs_write_limit_min = 32 << 20;       /* min write limit is 32MB */
 uint64_t zfs_write_limit_max = 0;              /* max data payload per txg */
@@ -451,7 +451,7 @@ dsl_pool_sync_done(dsl_pool_t *dp, uint64_t txg)
        while (ds = list_head(&dp->dp_synced_datasets)) {
                list_remove(&dp->dp_synced_datasets, ds);
                os = ds->ds_objset;
-               zil_clean(os->os_zil);
+               zil_clean(os->os_zil, txg);
                ASSERT(!dmu_objset_is_dirty(os, txg));
                dmu_buf_rele(ds->ds_dbuf, ds);
        }
@@ -768,7 +768,7 @@ dsl_pool_clean_tmp_userrefs(dsl_pool_t *dp)
                *htag = '\0';
                ++htag;
                dsobj = strtonum(za.za_name, NULL);
-               (void) dsl_dataset_user_release_tmp(dp, dsobj, htag);
+               (void) dsl_dataset_user_release_tmp(dp, dsobj, htag, B_FALSE);
        }
        zap_cursor_fini(&zc);
 }
index 23c37c7ccfd23923c2fdbaed8e466eb740c6e631..56d41083673eba5d2cf9d47baedb63f06e43364e 100644 (file)
@@ -56,6 +56,11 @@ static scan_cb_t dsl_scan_remove_cb;
 static dsl_syncfunc_t dsl_scan_cancel_sync;
 static void dsl_scan_sync_state(dsl_scan_t *, dmu_tx_t *tx);
 
+int zfs_top_maxinflight = 32;          /* maximum I/Os per top-level */
+int zfs_resilver_delay = 2;            /* number of ticks to delay resilver */
+int zfs_scrub_delay = 4;               /* number of ticks to delay scrub */
+int zfs_scan_idle = 50;                        /* idle window in clock ticks */
+
 int zfs_scan_min_time_ms = 1000; /* min millisecs to scrub per txg */
 int zfs_free_min_time_ms = 1000; /* min millisecs to free per txg */
 int zfs_resilver_min_time_ms = 3000; /* min millisecs to resilver per txg */
@@ -601,8 +606,8 @@ dsl_scan_prefetch(dsl_scan_t *scn, arc_buf_t *buf, blkptr_t *bp,
         * done before setting xlateall (similar to dsl_read())
         */
        (void) arc_read(scn->scn_zio_root, scn->scn_dp->dp_spa, bp,
-           buf, NULL, NULL, ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL,
-           &flags, &czb);
+           buf, NULL, NULL, ZIO_PRIORITY_ASYNC_READ,
+           ZIO_FLAG_CANFAIL | ZIO_FLAG_SCAN_THREAD, &flags, &czb);
 }
 
 static boolean_t
@@ -650,6 +655,7 @@ dsl_scan_recurse(dsl_scan_t *scn, dsl_dataset_t *ds, dmu_objset_type_t ostype,
     const zbookmark_t *zb, dmu_tx_t *tx, arc_buf_t **bufp)
 {
        dsl_pool_t *dp = scn->scn_dp;
+       int zio_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_SCAN_THREAD;
        int err;
 
        if (BP_GET_LEVEL(bp) > 0) {
@@ -660,7 +666,7 @@ dsl_scan_recurse(dsl_scan_t *scn, dsl_dataset_t *ds, dmu_objset_type_t ostype,
 
                err = arc_read_nolock(NULL, dp->dp_spa, bp,
                    arc_getbuf_func, bufp,
-                   ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb);
+                   ZIO_PRIORITY_ASYNC_READ, zio_flags, &flags, zb);
                if (err) {
                        scn->scn_phys.scn_errors++;
                        return (err);
@@ -683,7 +689,7 @@ dsl_scan_recurse(dsl_scan_t *scn, dsl_dataset_t *ds, dmu_objset_type_t ostype,
 
                err = arc_read_nolock(NULL, dp->dp_spa, bp,
                    arc_getbuf_func, bufp,
-                   ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb);
+                   ZIO_PRIORITY_ASYNC_READ, zio_flags, &flags, zb);
                if (err) {
                        scn->scn_phys.scn_errors++;
                        return (err);
@@ -696,7 +702,7 @@ dsl_scan_recurse(dsl_scan_t *scn, dsl_dataset_t *ds, dmu_objset_type_t ostype,
 
                err = arc_read_nolock(NULL, dp->dp_spa, bp,
                    arc_getbuf_func, bufp,
-                   ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb);
+                   ZIO_PRIORITY_ASYNC_READ, zio_flags, &flags, zb);
                if (err) {
                        scn->scn_phys.scn_errors++;
                        return (err);
@@ -719,7 +725,7 @@ dsl_scan_recurse(dsl_scan_t *scn, dsl_dataset_t *ds, dmu_objset_type_t ostype,
 
                err = arc_read_nolock(NULL, dp->dp_spa, bp,
                    arc_getbuf_func, bufp,
-                   ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb);
+                   ZIO_PRIORITY_ASYNC_READ, zio_flags, &flags, zb);
                if (err) {
                        scn->scn_phys.scn_errors++;
                        return (err);
@@ -727,9 +733,6 @@ dsl_scan_recurse(dsl_scan_t *scn, dsl_dataset_t *ds, dmu_objset_type_t ostype,
 
                osp = (*bufp)->b_data;
 
-               if (DSL_SCAN_IS_SCRUB_RESILVER(scn))
-                       dsl_scan_zil(dp, &osp->os_zil_header);
-
                dsl_scan_visitdnode(scn, ds, osp->os_type,
                    &osp->os_meta_dnode, *bufp, DMU_META_DNODE_OBJECT, tx);
 
@@ -1072,9 +1075,23 @@ dsl_scan_visitds(dsl_scan_t *scn, uint64_t dsobj, dmu_tx_t *tx)
 {
        dsl_pool_t *dp = scn->scn_dp;
        dsl_dataset_t *ds;
+       objset_t *os;
 
        VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds));
 
+       if (dmu_objset_from_ds(ds, &os))
+               goto out;
+
+       /*
+        * Only the ZIL in the head (non-snapshot) is valid.  Even though
+        * snapshots can have ZIL block pointers (which may be the same
+        * BP as in the head), they must be ignored.  So we traverse the
+        * ZIL here, rather than in scan_recurse(), because the regular
+        * snapshot block-sharing rules don't apply to it.
+        */
+       if (DSL_SCAN_IS_SCRUB_RESILVER(scn) && !dsl_dataset_is_snapshot(ds))
+               dsl_scan_zil(dp, &os->os_zil_header);
+
        /*
         * Iterate over the bps in this ds.
         */
@@ -1446,7 +1463,6 @@ dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx)
                dsl_scan_setup_sync(scn, &func, tx);
        }
 
-
        if (!dsl_scan_active(scn) ||
            spa_sync_pass(dp->dp_spa) > 1)
                return;
@@ -1489,7 +1505,6 @@ dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx)
        if (scn->scn_phys.scn_state != DSS_SCANNING)
                return;
 
-
        if (scn->scn_phys.scn_ddt_bookmark.ddb_class <=
            scn->scn_phys.scn_ddt_class_max) {
                zfs_dbgmsg("doing scan sync txg %llu; "
@@ -1644,8 +1659,9 @@ dsl_scan_scrub_cb(dsl_pool_t *dp,
        spa_t *spa = dp->dp_spa;
        uint64_t phys_birth = BP_PHYSICAL_BIRTH(bp);
        boolean_t needs_io;
-       int zio_flags = ZIO_FLAG_SCRUB_THREAD | ZIO_FLAG_RAW | ZIO_FLAG_CANFAIL;
+       int zio_flags = ZIO_FLAG_SCAN_THREAD | ZIO_FLAG_RAW | ZIO_FLAG_CANFAIL;
        int zio_priority;
+       int scan_delay = 0;
 
        if (phys_birth <= scn->scn_phys.scn_min_txg ||
            phys_birth >= scn->scn_phys.scn_max_txg)
@@ -1658,10 +1674,12 @@ dsl_scan_scrub_cb(dsl_pool_t *dp,
                zio_flags |= ZIO_FLAG_SCRUB;
                zio_priority = ZIO_PRIORITY_SCRUB;
                needs_io = B_TRUE;
+               scan_delay = zfs_scrub_delay;
        } else if (scn->scn_phys.scn_func == POOL_SCAN_RESILVER) {
                zio_flags |= ZIO_FLAG_RESILVER;
                zio_priority = ZIO_PRIORITY_RESILVER;
                needs_io = B_FALSE;
+               scan_delay = zfs_resilver_delay;
        }
 
        /* If it's an intent log block, failure is expected. */
@@ -1699,14 +1717,23 @@ dsl_scan_scrub_cb(dsl_pool_t *dp,
        }
 
        if (needs_io && !zfs_no_scrub_io) {
+               vdev_t *rvd = spa->spa_root_vdev;
+               uint64_t maxinflight = rvd->vdev_children * zfs_top_maxinflight;
                void *data = zio_data_buf_alloc(size);
 
                mutex_enter(&spa->spa_scrub_lock);
-               while (spa->spa_scrub_inflight >= spa->spa_scrub_maxinflight)
+               while (spa->spa_scrub_inflight >= maxinflight)
                        cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock);
                spa->spa_scrub_inflight++;
                mutex_exit(&spa->spa_scrub_lock);
 
+               /*
+                * If we're seeing recent (zfs_scan_idle) "important" I/Os
+                * then throttle our workload to limit the impact of a scan.
+                */
+               if (ddi_get_lbolt64() - spa->spa_last_io <= zfs_scan_idle)
+                       delay(scan_delay);
+
                zio_nowait(zio_read(NULL, spa, bp, data, size,
                    dsl_scan_scrub_done, NULL, zio_priority,
                    zio_flags, zb));
index 832685b0fcf18938c1e3073fe3328cdd720bc7c9..b0818ce274d44a3e6c1ec932b933ee48d5c50db1 100644 (file)
@@ -213,6 +213,8 @@ dsl_sync_task_do(dsl_pool_t *dp,
        dsl_sync_task_group_t *dstg;
        int err;
 
+       ASSERT(spa_writeable(dp->dp_spa));
+
        dstg = dsl_sync_task_group_create(dp);
        dsl_sync_task_create(dstg, checkfunc, syncfunc,
            arg1, arg2, blocks_modified);
@@ -228,6 +230,9 @@ dsl_sync_task_do_nowait(dsl_pool_t *dp,
 {
        dsl_sync_task_group_t *dstg;
 
+       if (!spa_writeable(dp->dp_spa))
+               return;
+
        dstg = dsl_sync_task_group_create(dp);
        dsl_sync_task_create(dstg, checkfunc, syncfunc,
            arg1, arg2, blocks_modified);
index 78943eda82957b0b7db7d303d926611fca1aaa3a..4efcff4f464adf18714c2d0e0368f4319fee7154 100644 (file)
@@ -383,6 +383,20 @@ fm_panic(const char *format, ...)
        va_end(ap);
 }
 
+/*
+ * Simply tell the caller if fm_panicstr is set, i.e. an FMA event has
+ * caused the panic. If so, something other than the default panic
+ * diagnosis method will diagnose the cause of the panic.
+ */
+int
+is_fm_panic()
+{
+       if (fm_panicstr)
+               return (1);
+       else
+               return (0);
+}
+
 /*
  * Print any appropriate FMA banner message before the panic message.  This
  * function is called by panicsys() and prints the message for fm_panic().
@@ -610,8 +624,8 @@ fm_nvlist_create(nv_alloc_t *nva)
 
        if (nvlist_xalloc(&nvl, NV_UNIQUE_NAME, nvhdl) != 0) {
                if (hdl_alloced) {
-                       kmem_free(nvhdl, sizeof (nv_alloc_t));
                        nv_alloc_fini(nvhdl);
+                       kmem_free(nvhdl, sizeof (nv_alloc_t));
                }
                return (NULL);
        }
index 4c05806e3ee284d357a17ab9c7975e12ea784d9f..cf1bbc030f452f022fa788c8f6d20ddcf8c747b8 100644 (file)
@@ -32,6 +32,7 @@
 #include <sys/arc.h>
 #include <sys/zfs_context.h>
 #include <sys/refcount.h>
+#include <sys/zrlock.h>
 
 #ifdef __cplusplus
 extern "C" {
@@ -82,9 +83,6 @@ struct dmu_tx;
  * etc.
  */
 
-#define        LIST_LINK_INACTIVE(link) \
-       ((link)->list_next == NULL && (link)->list_prev == NULL)
-
 struct dmu_buf_impl;
 
 typedef enum override_states {
@@ -149,15 +147,17 @@ typedef struct dmu_buf_impl {
        struct objset *db_objset;
 
        /*
-        * the dnode we belong to (NULL when evicted)
+        * handle to safely access the dnode we belong to (NULL when evicted)
         */
-       struct dnode *db_dnode;
+       struct dnode_handle *db_dnode_handle;
 
        /*
         * our parent buffer; if the dnode points to us directly,
-        * db_parent == db_dnode->dn_dbuf
+        * db_parent == db_dnode_handle->dnh_dnode->dn_dbuf
         * only accessed by sync thread ???
         * (NULL when evicted)
+        * May change from NULL to non-NULL under the protection of db_mtx
+        * (see dbuf_check_blkptr())
         */
        struct dmu_buf_impl *db_parent;
 
@@ -284,24 +284,46 @@ void dbuf_free_range(struct dnode *dn, uint64_t start, uint64_t end,
 
 void dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx);
 
+#define        DB_DNODE(_db)           ((_db)->db_dnode_handle->dnh_dnode)
+#define        DB_DNODE_LOCK(_db)      ((_db)->db_dnode_handle->dnh_zrlock)
+#define        DB_DNODE_ENTER(_db)     (zrl_add(&DB_DNODE_LOCK(_db)))
+#define        DB_DNODE_EXIT(_db)      (zrl_remove(&DB_DNODE_LOCK(_db)))
+#define        DB_DNODE_HELD(_db)      (!zrl_is_zero(&DB_DNODE_LOCK(_db)))
+#define        DB_GET_SPA(_spa_p, _db) {               \
+       dnode_t *__dn;                          \
+       DB_DNODE_ENTER(_db);                    \
+       __dn = DB_DNODE(_db);                   \
+       *(_spa_p) = __dn->dn_objset->os_spa;    \
+       DB_DNODE_EXIT(_db);                     \
+}
+#define        DB_GET_OBJSET(_os_p, _db) {             \
+       dnode_t *__dn;                          \
+       DB_DNODE_ENTER(_db);                    \
+       __dn = DB_DNODE(_db);                   \
+       *(_os_p) = __dn->dn_objset;             \
+       DB_DNODE_EXIT(_db);                     \
+}
+
 void dbuf_init(void);
 void dbuf_fini(void);
 
-#define        DBUF_IS_METADATA(db)    \
-       ((db)->db_level > 0 || dmu_ot[(db)->db_dnode->dn_type].ot_metadata)
+boolean_t dbuf_is_metadata(dmu_buf_impl_t *db);
+
+#define        DBUF_IS_METADATA(_db)   \
+       (dbuf_is_metadata(_db))
 
-#define        DBUF_GET_BUFC_TYPE(db)  \
-       (DBUF_IS_METADATA(db) ? ARC_BUFC_METADATA : ARC_BUFC_DATA)
+#define        DBUF_GET_BUFC_TYPE(_db) \
+       (DBUF_IS_METADATA(_db) ? ARC_BUFC_METADATA : ARC_BUFC_DATA)
 
-#define        DBUF_IS_CACHEABLE(db)                                           \
-       ((db)->db_objset->os_primary_cache == ZFS_CACHE_ALL ||          \
-       (DBUF_IS_METADATA(db) &&                                        \
-       ((db)->db_objset->os_primary_cache == ZFS_CACHE_METADATA)))
+#define        DBUF_IS_CACHEABLE(_db)                                          \
+       ((_db)->db_objset->os_primary_cache == ZFS_CACHE_ALL ||         \
+       (DBUF_IS_METADATA(_db) &&                                       \
+       ((_db)->db_objset->os_primary_cache == ZFS_CACHE_METADATA)))
 
-#define        DBUF_IS_L2CACHEABLE(db)                                         \
-       ((db)->db_objset->os_secondary_cache == ZFS_CACHE_ALL ||        \
-       (DBUF_IS_METADATA(db) &&                                        \
-       ((db)->db_objset->os_secondary_cache == ZFS_CACHE_METADATA)))
+#define        DBUF_IS_L2CACHEABLE(_db)                                        \
+       ((_db)->db_objset->os_secondary_cache == ZFS_CACHE_ALL ||       \
+       (DBUF_IS_METADATA(_db) &&                                       \
+       ((_db)->db_objset->os_secondary_cache == ZFS_CACHE_METADATA)))
 
 #ifdef ZFS_DEBUG
 
@@ -332,7 +354,7 @@ _NOTE(CONSTCOND) } while (0)
        sprintf_blkptr(__blkbuf, bp);                           \
        dprintf_dbuf(db, fmt " %s\n", __VA_ARGS__, __blkbuf);   \
        kmem_free(__blkbuf, BP_SPRINTF_LEN);                    \
-       }                                                       \
+       }                                                       \
 _NOTE(CONSTCOND) } while (0)
 
 #define        DBUF_VERIFY(db) dbuf_verify(db)
index 83932f467a6fa2081b43fd436b1232a15930820c..07f5949ebfeaa4a6c6746ebf3f17c14bf9789d66 100644 (file)
@@ -192,8 +192,8 @@ int dmu_objset_clone(const char *name, struct dsl_dataset *clone_origin,
     uint64_t flags);
 int dmu_objset_destroy(const char *name, boolean_t defer);
 int dmu_snapshots_destroy(char *fsname, char *snapname, boolean_t defer);
-int dmu_objset_snapshot(char *fsname, char *snapname, struct nvlist *props,
-    boolean_t recursive);
+int dmu_objset_snapshot(char *fsname, char *snapname, char *tag,
+    struct nvlist *props, boolean_t recursive, boolean_t temporary, int fd);
 int dmu_objset_rename(const char *name, const char *newname,
     boolean_t recursive);
 int dmu_objset_find(char *name, int func(const char *, void *), void *arg,
@@ -335,6 +335,7 @@ int dmu_bonus_hold(objset_t *os, uint64_t object, void *tag, dmu_buf_t **);
 int dmu_bonus_max(void);
 int dmu_set_bonus(dmu_buf_t *, int, dmu_tx_t *);
 int dmu_set_bonustype(dmu_buf_t *, dmu_object_type_t, dmu_tx_t *);
+dmu_object_type_t dmu_get_bonustype(dmu_buf_t *);
 int dmu_rm_spill(objset_t *, uint64_t, dmu_tx_t *);
 
 /*
@@ -721,9 +722,13 @@ typedef struct dmu_recv_cookie {
 
 int dmu_recv_begin(char *tofs, char *tosnap, char *topds, struct drr_begin *,
     boolean_t force, objset_t *origin, dmu_recv_cookie_t *);
-int dmu_recv_stream(dmu_recv_cookie_t *drc, struct vnode *vp, offset_t *voffp);
+int dmu_recv_stream(dmu_recv_cookie_t *drc, struct vnode *vp, offset_t *voffp,
+    int cleanup_fd, uint64_t *action_handlep);
 int dmu_recv_end(dmu_recv_cookie_t *drc);
 
+int dmu_diff(objset_t *tosnap, objset_t *fromsnap, struct vnode *vp,
+    offset_t *off);
+
 /* CRC64 table */
 #define        ZFS_CRC64_POLY  0xC96C5795D7870F42ULL   /* ECMA-182, reflected form */
 extern uint64_t zfs_crc64_table[256];
index 5c5119a2073d01aca7c32a70551ffb40f17a2eb6..c6d202e2e81a69d46684fe1df4b4d2136e742b5a 100644 (file)
@@ -40,6 +40,8 @@
 extern "C" {
 #endif
 
+extern krwlock_t os_lock;
+
 struct dsl_dataset;
 struct dmu_tx;
 
@@ -68,9 +70,15 @@ struct objset {
        spa_t *os_spa;
        arc_buf_t *os_phys_buf;
        objset_phys_t *os_phys;
-       dnode_t *os_meta_dnode;
-       dnode_t *os_userused_dnode;
-       dnode_t *os_groupused_dnode;
+       /*
+        * The following "special" dnodes have no parent and are exempt from
+        * dnode_move(), but they root their descendents in this objset using
+        * handles anyway, so that all access to dnodes from dbufs consistently
+        * uses handles.
+        */
+       dnode_handle_t os_meta_dnode;
+       dnode_handle_t os_userused_dnode;
+       dnode_handle_t os_groupused_dnode;
        zilog_t *os_zil;
 
        /* can change, under dsl_dir's locks: */
@@ -113,6 +121,9 @@ struct objset {
 #define        DMU_META_OBJSET         0
 #define        DMU_META_DNODE_OBJECT   0
 #define        DMU_OBJECT_IS_SPECIAL(obj) ((int64_t)(obj) <= 0)
+#define        DMU_META_DNODE(os)      ((os)->os_meta_dnode.dnh_dnode)
+#define        DMU_USERUSED_DNODE(os)  ((os)->os_userused_dnode.dnh_dnode)
+#define        DMU_GROUPUSED_DNODE(os) ((os)->os_groupused_dnode.dnh_dnode)
 
 #define        DMU_OS_IS_L2CACHEABLE(os)                               \
        ((os)->os_secondary_cache == ZFS_CACHE_ALL ||           \
@@ -131,8 +142,8 @@ int dmu_objset_create(const char *name, dmu_objset_type_t type, uint64_t flags,
 int dmu_objset_clone(const char *name, struct dsl_dataset *clone_origin,
     uint64_t flags);
 int dmu_objset_destroy(const char *name, boolean_t defer);
-int dmu_objset_snapshot(char *fsname, char *snapname, nvlist_t *props,
-    boolean_t recursive);
+int dmu_objset_snapshot(char *fsname, char *snapname, char *tag,
+    struct nvlist *props, boolean_t recursive, boolean_t temporary, int fd);
 void dmu_objset_stats(objset_t *os, nvlist_t *nv);
 void dmu_objset_fast_stat(objset_t *os, dmu_objset_stats_t *stat);
 void dmu_objset_space(objset_t *os, uint64_t *refdbytesp, uint64_t *availbytesp,
@@ -150,6 +161,7 @@ timestruc_t dmu_objset_snap_cmtime(objset_t *os);
 /* called from dsl */
 void dmu_objset_sync(objset_t *os, zio_t *zio, dmu_tx_t *tx);
 boolean_t dmu_objset_is_dirty(objset_t *os, uint64_t txg);
+boolean_t dmu_objset_is_dirty_anywhere(objset_t *os);
 objset_t *dmu_objset_create_impl(spa_t *spa, struct dsl_dataset *ds,
     blkptr_t *bp, dmu_objset_type_t type, dmu_tx_t *tx);
 int dmu_objset_open_impl(spa_t *spa, struct dsl_dataset *ds, blkptr_t *bp,
@@ -161,6 +173,9 @@ boolean_t dmu_objset_userused_enabled(objset_t *os);
 int dmu_objset_userspace_upgrade(objset_t *os);
 boolean_t dmu_objset_userspace_present(objset_t *os);
 
+void dmu_objset_init(void);
+void dmu_objset_fini(void);
+
 #ifdef __cplusplus
 }
 #endif
index 844e7f1aebe2540bcd356265e2c2ad9fc579543b..5b326cd99c09d61966cb84d725ae5249f3064da6 100644 (file)
@@ -49,6 +49,9 @@ typedef int (blkptr_cb_t)(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
 #define        TRAVERSE_PREFETCH (TRAVERSE_PREFETCH_METADATA | TRAVERSE_PREFETCH_DATA)
 #define        TRAVERSE_HARD                   (1<<4)
 
+/* Special traverse error return value to indicate skipping of children */
+#define        TRAVERSE_VISIT_NO_CHILDREN      -1
+
 int traverse_dataset(struct dsl_dataset *ds,
     uint64_t txg_start, int flags, blkptr_cb_t func, void *arg);
 int traverse_pool(spa_t *spa,
index 8bae1602e79132ec0895ea7ef550256a44278f67..9ad4be36bf85668900ff54fc1cd9df120433a186 100644 (file)
@@ -32,6 +32,7 @@
 #include <sys/zio.h>
 #include <sys/refcount.h>
 #include <sys/dmu_zfetch.h>
+#include <sys/zrlock.h>
 
 #ifdef __cplusplus
 extern "C" {
@@ -156,6 +157,7 @@ typedef struct dnode {
        struct objset *dn_objset;
        uint64_t dn_object;
        struct dmu_buf_impl *dn_dbuf;
+       struct dnode_handle *dn_handle;
        dnode_phys_t *dn_phys; /* pointer into dn->dn_dbuf->db.db_data */
 
        /*
@@ -172,6 +174,7 @@ typedef struct dnode {
        uint8_t dn_nlevels;
        uint8_t dn_indblkshift;
        uint8_t dn_datablkshift;        /* zero if blksz not power of 2! */
+       uint8_t dn_moved;               /* Has this dnode been moved? */
        uint16_t dn_datablkszsec;       /* in 512b sectors */
        uint32_t dn_datablksz;          /* in bytes */
        uint64_t dn_maxblkid;
@@ -183,6 +186,9 @@ typedef struct dnode {
        uint16_t dn_next_bonuslen[TXG_SIZE];
        uint32_t dn_next_blksz[TXG_SIZE];       /* next block size in bytes */
 
+       /* protected by dn_dbufs_mtx; declared here to fill 32-bit hole */
+       uint32_t dn_dbufs_count;        /* count of dn_dbufs */
+
        /* protected by os_lock: */
        list_node_t dn_dirty_link[TXG_SIZE];    /* next on dataset's dirty */
 
@@ -202,8 +208,11 @@ typedef struct dnode {
        refcount_t dn_holds;
 
        kmutex_t dn_dbufs_mtx;
-       list_t dn_dbufs;                /* linked list of descendent dbuf_t's */
+       list_t dn_dbufs;                /* descendent dbufs */
+
+       /* protected by dn_struct_rwlock */
        struct dmu_buf_impl *dn_bonus;  /* bonus buffer dbuf */
+
        boolean_t dn_have_spill;        /* have spill or are spilling */
 
        /* parent IO for current sync write */
@@ -220,6 +229,22 @@ typedef struct dnode {
        struct zfetch   dn_zfetch;
 } dnode_t;
 
+/*
+ * Adds a level of indirection between the dbuf and the dnode to avoid
+ * iterating descendent dbufs in dnode_move(). Handles are not allocated
+ * individually, but as an array of child dnodes in dnode_hold_impl().
+ */
+typedef struct dnode_handle {
+       /* Protects dnh_dnode from modification by dnode_move(). */
+       zrlock_t dnh_zrlock;
+       dnode_t *dnh_dnode;
+} dnode_handle_t;
+
+typedef struct dnode_children {
+       size_t dnc_count;               /* number of children */
+       dnode_handle_t dnc_children[1]; /* sized dynamically */
+} dnode_children_t;
+
 typedef struct free_range {
        avl_node_t fr_node;
        uint64_t fr_blkid;
@@ -227,8 +252,8 @@ typedef struct free_range {
 } free_range_t;
 
 dnode_t *dnode_special_open(struct objset *dd, dnode_phys_t *dnp,
-    uint64_t object);
-void dnode_special_close(dnode_t *dn);
+    uint64_t object, dnode_handle_t *dnh);
+void dnode_special_close(dnode_handle_t *dnh);
 
 void dnode_setbonuslen(dnode_t *dn, int newsize, dmu_tx_t *tx);
 void dnode_setbonus_type(dnode_t *dn, dmu_object_type_t, dmu_tx_t *tx);
index 58414e13318b8f6ce9fe254b0e65a4b113916254..22733d070e8bb96138f83cbde6d653602f053b54 100644 (file)
@@ -162,6 +162,22 @@ struct dsl_ds_destroyarg {
        boolean_t need_prep;            /* do we need to retry due to EBUSY? */
 };
 
+/*
+ * The max length of a temporary tag prefix is the number of hex digits
+ * required to express UINT64_MAX plus one for the hyphen.
+ */
+#define        MAX_TAG_PREFIX_LEN      17
+
+struct dsl_ds_holdarg {
+       dsl_sync_task_group_t *dstg;
+       char *htag;
+       char *snapname;
+       boolean_t recursive;
+       boolean_t gotone;
+       boolean_t temphold;
+       char failed[MAXPATHLEN];
+};
+
 #define        dsl_dataset_is_snapshot(ds) \
        ((ds)->ds_phys->ds_num_children != 0)
 
@@ -182,6 +198,8 @@ void dsl_dataset_drop_ref(dsl_dataset_t *ds, void *tag);
 boolean_t dsl_dataset_tryown(dsl_dataset_t *ds, boolean_t inconsistentok,
     void *tag);
 void dsl_dataset_make_exclusive(dsl_dataset_t *ds, void *tag);
+void dsl_register_onexit_hold_cleanup(dsl_dataset_t *ds, const char *htag,
+    minor_t minor);
 uint64_t dsl_dataset_create_sync(dsl_dir_t *pds, const char *lastname,
     dsl_dataset_t *origin, uint64_t flags, cred_t *, dmu_tx_t *);
 uint64_t dsl_dataset_create_sync_dd(dsl_dir_t *dd, dsl_dataset_t *origin,
@@ -192,16 +210,19 @@ dsl_checkfunc_t dsl_dataset_destroy_check;
 dsl_syncfunc_t dsl_dataset_destroy_sync;
 dsl_checkfunc_t dsl_dataset_snapshot_check;
 dsl_syncfunc_t dsl_dataset_snapshot_sync;
+dsl_syncfunc_t dsl_dataset_user_hold_sync;
 int dsl_dataset_rename(char *name, const char *newname, boolean_t recursive);
 int dsl_dataset_promote(const char *name, char *conflsnap);
 int dsl_dataset_clone_swap(dsl_dataset_t *clone, dsl_dataset_t *origin_head,
     boolean_t force);
 int dsl_dataset_user_hold(char *dsname, char *snapname, char *htag,
-    boolean_t recursive, boolean_t temphold);
+    boolean_t recursive, boolean_t temphold, int cleanup_fd);
+int dsl_dataset_user_hold_for_send(dsl_dataset_t *ds, char *htag,
+    boolean_t temphold);
 int dsl_dataset_user_release(char *dsname, char *snapname, char *htag,
     boolean_t recursive);
 int dsl_dataset_user_release_tmp(struct dsl_pool *dp, uint64_t dsobj,
-    char *htag);
+    char *htag, boolean_t retry);
 int dsl_dataset_get_holds(const char *dsname, nvlist_t **nvp);
 
 blkptr_t *dsl_dataset_get_blkptr(dsl_dataset_t *ds);
index a26a3f7058a198cb8ee715289538f89e87921182..73c43bd23879d9705562acfa1a875029d19cefc2 100644 (file)
@@ -19,8 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 
 #ifndef        _SYS_DSL_DELEG_H
@@ -55,6 +54,7 @@ extern "C" {
 #define        ZFS_DELEG_PERM_GROUPUSED        "groupused"
 #define        ZFS_DELEG_PERM_HOLD             "hold"
 #define        ZFS_DELEG_PERM_RELEASE          "release"
+#define        ZFS_DELEG_PERM_DIFF             "diff"
 
 /*
  * Note: the names of properties that are marked delegatable are also
@@ -64,6 +64,7 @@ extern "C" {
 int dsl_deleg_get(const char *ddname, nvlist_t **nvp);
 int dsl_deleg_set(const char *ddname, nvlist_t *nvp, boolean_t unset);
 int dsl_deleg_access(const char *ddname, const char *perm, cred_t *cr);
+int dsl_deleg_access_impl(struct dsl_dataset *ds, const char *perm, cred_t *cr);
 void dsl_deleg_set_create_perms(dsl_dir_t *dd, dmu_tx_t *tx, cred_t *cr);
 int dsl_deleg_can_allow(char *ddname, nvlist_t *nvp, cred_t *cr);
 int dsl_deleg_can_unallow(char *ddname, nvlist_t *nvp, cred_t *cr);
index c4103c48a47fac9c76ebe7399fbd155ef5ecff15..5eca760dadc50313580071f64e004fb4decb1108 100644 (file)
@@ -43,12 +43,13 @@ extern "C" {
 #define        FM_CLASS                        "class"
 #define        FM_VERSION                      "version"
 
-/* FM event class values */
+/* FM protocol category 1 class names */
 #define        FM_EREPORT_CLASS                "ereport"
 #define        FM_FAULT_CLASS                  "fault"
 #define        FM_DEFECT_CLASS                 "defect"
 #define        FM_RSRC_CLASS                   "resource"
 #define        FM_LIST_EVENT                   "list"
+#define        FM_IREPORT_CLASS                "ireport"
 
 /* FM list.* event class values */
 #define        FM_LIST_SUSPECT_CLASS           FM_LIST_EVENT ".suspect"
@@ -72,6 +73,12 @@ extern "C" {
 /* list.* event payload member names */
 #define        FM_LIST_EVENT_SIZE              "list-sz"
 
+/* ireport.* event payload member names */
+#define        FM_IREPORT_DETECTOR             "detector"
+#define        FM_IREPORT_UUID                 "uuid"
+#define        FM_IREPORT_PRIORITY             "pri"
+#define        FM_IREPORT_ATTRIBUTES           "attr"
+
 /*
  * list.suspect, isolated, updated, repaired and resolved
  * versions/payload member names.
@@ -192,6 +199,7 @@ extern "C" {
 #define        FM_FMRI_SCHEME_PKG              "pkg"
 #define        FM_FMRI_SCHEME_LEGACY           "legacy-hc"
 #define        FM_FMRI_SCHEME_ZFS              "zfs"
+#define        FM_FMRI_SCHEME_SW               "sw"
 
 /* Scheme versions */
 #define        FMD_SCHEME_VERSION0             0
@@ -215,6 +223,8 @@ extern "C" {
 #define        FM_SVC_SCHEME_VERSION           SVC_SCHEME_VERSION0
 #define        ZFS_SCHEME_VERSION0             0
 #define        FM_ZFS_SCHEME_VERSION           ZFS_SCHEME_VERSION0
+#define        SW_SCHEME_VERSION0              0
+#define        FM_SW_SCHEME_VERSION            SW_SCHEME_VERSION0
 
 /* hc scheme member names */
 #define        FM_FMRI_HC_SERIAL_ID            "serial"
@@ -299,6 +309,25 @@ extern "C" {
 #define        FM_FMRI_ZFS_POOL                "pool"
 #define        FM_FMRI_ZFS_VDEV                "vdev"
 
+/* sw scheme member names - extra indentation for members of an nvlist */
+#define        FM_FMRI_SW_OBJ                  "object"
+#define        FM_FMRI_SW_OBJ_PATH                     "path"
+#define        FM_FMRI_SW_OBJ_ROOT                     "root"
+#define        FM_FMRI_SW_OBJ_PKG                      "pkg"
+#define        FM_FMRI_SW_SITE                 "site"
+#define        FM_FMRI_SW_SITE_TOKEN                   "token"
+#define        FM_FMRI_SW_SITE_MODULE                  "module"
+#define        FM_FMRI_SW_SITE_FILE                    "file"
+#define        FM_FMRI_SW_SITE_LINE                    "line"
+#define        FM_FMRI_SW_SITE_FUNC                    "func"
+#define        FM_FMRI_SW_CTXT                 "context"
+#define        FM_FMRI_SW_CTXT_ORIGIN                  "origin"
+#define        FM_FMRI_SW_CTXT_EXECNAME                "execname"
+#define        FM_FMRI_SW_CTXT_PID                     "pid"
+#define        FM_FMRI_SW_CTXT_ZONE                    "zone"
+#define        FM_FMRI_SW_CTXT_CTID                    "ctid"
+#define        FM_FMRI_SW_CTXT_STACK                   "stack"
+
 extern nv_alloc_t *fm_nva_xcreate(char *, size_t);
 extern void fm_nva_xdestroy(nv_alloc_t *);
 
index 4934814d86c3d987dfe628a08dd13f39b898de4b..37334101b3cfc69808741553d205d3a3d6049a8d 100644 (file)
  */
 
 /*
- * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 
 #ifndef        _SYS_FM_UTIL_H
 #define        _SYS_FM_UTIL_H
 
-#pragma ident  "%Z%%M% %I%     %E% SMI"
-
 #ifdef __cplusplus
 extern "C" {
 #endif
@@ -96,6 +93,7 @@ extern void fm_ereport_post(nvlist_t *, int);
 
 extern void fm_payload_stack_add(nvlist_t *, const pc_t *, int);
 
+extern int is_fm_panic();
 #endif  /* _KERNEL */
 
 #ifdef __cplusplus
index bc3ade80f183872f2cc34b507ce51bc771ec742d..1752c64e3e8be2dbc636dd7dcaa188526178780d 100644 (file)
@@ -40,7 +40,7 @@ extern "C" {
  */
 #define        FTAG ((char *)__func__)
 
-#if defined(DEBUG) || !defined(_KERNEL)
+#ifdef ZFS_DEBUG
 typedef struct reference {
        list_node_t ref_link;
        void *ref_holder;
@@ -67,11 +67,12 @@ int64_t refcount_add(refcount_t *rc, void *holder_tag);
 int64_t refcount_remove(refcount_t *rc, void *holder_tag);
 int64_t refcount_add_many(refcount_t *rc, uint64_t number, void *holder_tag);
 int64_t refcount_remove_many(refcount_t *rc, uint64_t number, void *holder_tag);
+void refcount_transfer(refcount_t *dst, refcount_t *src);
 
 void refcount_init(void);
 void refcount_fini(void);
 
-#else /* DEBUG */
+#else  /* ZFS_DEBUG */
 
 typedef struct refcount {
        uint64_t rc_count;
@@ -97,7 +98,7 @@ typedef struct refcount {
 #define        refcount_init()
 #define        refcount_fini()
 
-#endif /* DEBUG */
+#endif /* ZFS_DEBUG */
 
 #ifdef __cplusplus
 }
index e9a96a0f950e90dc0f0dda73819108d87ee12d4a..bc89fa07d222c0007c45e390a8ae81a3e1708a8f 100644 (file)
@@ -19,8 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
  */
 
 #ifndef        _SYS_SA_H
@@ -141,7 +140,7 @@ dmu_buf_t *sa_get_db(sa_handle_t *);
 uint64_t sa_handle_object(sa_handle_t *);
 boolean_t sa_attr_would_spill(sa_handle_t *, sa_attr_type_t, int size);
 void sa_register_update_callback(objset_t *, sa_update_cb_t *);
-sa_attr_type_t *sa_setup(objset_t *, uint64_t, sa_attr_reg_t *, int);
+int sa_setup(objset_t *, uint64_t, sa_attr_reg_t *, int, sa_attr_type_t **);
 void sa_tear_down(objset_t *);
 int sa_replace_all_by_template(sa_handle_t *, sa_bulk_attr_t *,
     int, dmu_tx_t *);
index 62497e70258ca78d7401eff9e8d5d390d22aeb10..6661e47cfc833099e86b63a3dd759e1f87110427 100644 (file)
@@ -19,8 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
  */
 
 #ifndef        _SYS_SA_IMPL_H
@@ -232,7 +231,7 @@ struct sa_handle {
        ((a == DMU_OT_SA) ? B_TRUE : B_FALSE)
 
 #define        SA_BONUSTYPE_FROM_DB(db) \
-       (((dmu_buf_impl_t *)db)->db_dnode->dn_bonustype)
+       (dmu_get_bonustype((dmu_buf_t *)db))
 
 #define        SA_BLKPTR_SPACE (DN_MAX_BONUSLEN - sizeof (blkptr_t))
 
index 41a40300ebfec2234b4385b3ab33065947a45f3e..456ec06dc456160bd5a7869240d078f08676fae8 100644 (file)
@@ -418,8 +418,8 @@ extern int spa_get_stats(const char *pool, nvlist_t **config,
 extern int spa_create(const char *pool, nvlist_t *config, nvlist_t *props,
     const char *history_str, nvlist_t *zplprops);
 extern int spa_import_rootpool(char *devpath, char *devid);
-extern int spa_import(const char *pool, nvlist_t *config, nvlist_t *props);
-extern int spa_import_verbatim(const char *, nvlist_t *, nvlist_t *);
+extern int spa_import(const char *pool, nvlist_t *config, nvlist_t *props,
+    uint64_t flags);
 extern nvlist_t *spa_tryimport(nvlist_t *tryconfig);
 extern int spa_destroy(char *pool);
 extern int spa_export(char *pool, nvlist_t **oldconfig, boolean_t force,
@@ -602,6 +602,7 @@ extern objset_t *spa_meta_objset(spa_t *spa);
 
 /* Miscellaneous support routines */
 extern int spa_rename(const char *oldname, const char *newname);
+extern spa_t *spa_by_guid(uint64_t pool_guid, uint64_t device_guid);
 extern boolean_t spa_guid_exists(uint64_t pool_guid, uint64_t device_guid);
 extern char *spa_strdup(const char *);
 extern void spa_strfree(char *);
@@ -620,7 +621,6 @@ extern uint64_t bp_get_dsize(spa_t *spa, const blkptr_t *bp);
 extern boolean_t spa_has_slogs(spa_t *spa);
 extern boolean_t spa_is_root(spa_t *spa);
 extern boolean_t spa_writeable(spa_t *spa);
-extern void spa_rewind_data_to_nvlist(spa_t *spa, nvlist_t *to);
 
 extern int spa_mode(spa_t *spa);
 extern uint64_t strtonum(const char *str, char **nptr);
index e2e1851ecca1096acd69b382f49b46219350d57f..c965ffbbef871c52ada6da3c4b1280834928ac79 100644 (file)
@@ -114,13 +114,14 @@ struct spa {
        nvlist_t        *spa_config;            /* last synced config */
        nvlist_t        *spa_config_syncing;    /* currently syncing config */
        nvlist_t        *spa_config_splitting;  /* config for splitting */
+       nvlist_t        *spa_load_info;         /* info and errors from load */
        uint64_t        spa_config_txg;         /* txg of last config change */
        int             spa_sync_pass;          /* iterate-to-convergence */
        pool_state_t    spa_state;              /* pool state */
        int             spa_inject_ref;         /* injection references */
        uint8_t         spa_sync_on;            /* sync threads are running */
        spa_load_state_t spa_load_state;        /* current load operation */
-       boolean_t       spa_load_verbatim;      /* load the given config? */
+       uint64_t        spa_import_flags;       /* import specific flags */
        taskq_t         *spa_zio_taskq[ZIO_TYPES][ZIO_TASKQ_TYPES];
        dsl_pool_t      *spa_dsl_pool;
        metaslab_class_t *spa_normal_class;     /* normal data class */
@@ -130,6 +131,7 @@ struct spa {
        uint64_t        spa_freeze_txg;         /* freeze pool at this txg */
        uint64_t        spa_load_max_txg;       /* best initial ub_txg */
        uint64_t        spa_claim_max_txg;      /* highest claimed birth txg */
+       timespec_t      spa_loaded_ts;          /* 1st successful open time */
        objset_t        *spa_meta_objset;       /* copy of dp->dp_meta_objset */
        txg_list_t      spa_vdev_txg_list;      /* per-txg dirty vdev list */
        vdev_t          *spa_root_vdev;         /* top-level vdev container */
@@ -146,9 +148,9 @@ struct spa {
        uberblock_t     spa_ubsync;             /* last synced uberblock */
        uberblock_t     spa_uberblock;          /* current uberblock */
        boolean_t       spa_extreme_rewind;     /* rewind past deferred frees */
+       uint64_t        spa_last_io;            /* lbolt of last non-scan I/O */
        kmutex_t        spa_scrub_lock;         /* resilver/scrub lock */
        uint64_t        spa_scrub_inflight;     /* in-flight scrub I/Os */
-       uint64_t        spa_scrub_maxinflight;  /* max in-flight scrub I/Os */
        kcondvar_t      spa_scrub_io_cv;        /* scrub I/O completion */
        uint8_t         spa_scrub_active;       /* active or suspended? */
        uint8_t         spa_scrub_type;         /* type of scrub we're doing */
index 2b886bc58831ef899543f2c6b88812da7abd9334..161bd21f05a6eea1e7fd8ff04d307ab1692fac06 100644 (file)
@@ -169,6 +169,7 @@ struct vdev {
        uint64_t        vdev_faulted;   /* persistent faulted state     */
        uint64_t        vdev_degraded;  /* persistent degraded state    */
        uint64_t        vdev_removed;   /* persistent removed state     */
+       uint64_t        vdev_resilvering; /* persistent resilvering state */
        uint64_t        vdev_nparity;   /* number of parity devices for raidz */
        char            *vdev_path;     /* vdev path (if any)           */
        char            *vdev_devid;    /* vdev devid (if any)          */
@@ -283,6 +284,7 @@ extern void vdev_remove_parent(vdev_t *cvd);
  * vdev sync load and sync
  */
 extern void vdev_load_log_state(vdev_t *nvd, vdev_t *ovd);
+extern boolean_t vdev_log_state_valid(vdev_t *vd);
 extern void vdev_load(vdev_t *vd);
 extern void vdev_sync(vdev_t *vd, uint64_t txg);
 extern void vdev_sync_done(vdev_t *vd, uint64_t txg);
index 72e868fabb06ab15eac49e8019e0a0130910acdd..c1a0aeebdce442462680c52e836d19e3da429ad5 100644 (file)
@@ -185,10 +185,6 @@ typedef struct zfs_acl_ids {
        struct zfs_fuid_info    *z_fuidp;       /* for tracking fuids for log */
 } zfs_acl_ids_t;
 
-#define        ZFS_EXTERNAL_ACL(zp) \
-       (zp->z_is_sa ? 0 : zfs_external_acl(zp))
-#define        ZNODE_ACL_VERSION(zp) \
-       (zp->z_is_sa ? ZFS_ACL_VERSION_FUID : zfs_znode_acl_version(zp))
 /*
  * Property values for acl_mode and acl_inherit.
  *
@@ -222,7 +218,7 @@ int zfs_fastaccesschk_execute(struct znode *, cred_t *);
 extern int zfs_zaccess_rwx(struct znode *, mode_t, int, cred_t *);
 extern int zfs_zaccess_unix(struct znode *, mode_t, cred_t *);
 extern int zfs_acl_access(struct znode *, int, cred_t *);
-int zfs_acl_chmod_setattr(struct znode *, zfs_acl_t **, uint64_t);
+void zfs_acl_chmod_setattr(struct znode *, zfs_acl_t **, uint64_t);
 int zfs_zaccess_delete(struct znode *, struct znode *, cred_t *);
 int zfs_zaccess_rename(struct znode *, struct znode *,
     struct znode *, struct znode *, cred_t *cr);
index b0cb4955e1999f8e602f1d45b663b8b1acf880e1..84bf794fe5f027fb2874d12ebf64029d10acb400 100644 (file)
@@ -19,8 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 
 #ifndef        _SYS_ZFS_IOCTL_H
@@ -31,6 +30,7 @@
 #include <sys/zio.h>
 #include <sys/dsl_deleg.h>
 #include <sys/spa.h>
+#include <sys/zfs_stat.h>
 
 #ifdef _KERNEL
 #include <sys/nvpair.h>
@@ -199,6 +199,22 @@ typedef struct dmu_replay_record {
        } drr_u;
 } dmu_replay_record_t;
 
+/* diff record range types */
+typedef enum diff_type {
+       DDR_NONE = 0x1,
+       DDR_INUSE = 0x2,
+       DDR_FREE = 0x4
+} diff_type_t;
+
+/*
+ * The diff reports back ranges of free or in-use objects.
+ */
+typedef struct dmu_diff_record {
+       uint64_t ddr_type;
+       uint64_t ddr_first;
+       uint64_t ddr_last;
+} dmu_diff_record_t;
+
 typedef struct zinject_record {
        uint64_t        zi_objset;
        uint64_t        zi_object;
@@ -265,6 +281,13 @@ typedef struct zfs_cmd {
        zinject_record_t zc_inject_record;
        boolean_t       zc_defer_destroy;
        boolean_t       zc_temphold;
+       uint64_t        zc_action_handle;
+       int             zc_cleanup_fd;
+       uint8_t         zc_pad[4];              /* alignment */
+       uint64_t        zc_sendobj;
+       uint64_t        zc_fromobj;
+       uint64_t        zc_createtxg;
+       zfs_stat_t      zc_stat;
 } zfs_cmd_t;
 
 typedef struct zfs_useracct {
@@ -274,8 +297,8 @@ typedef struct zfs_useracct {
        uint64_t zu_space;
 } zfs_useracct_t;
 
-#define        ZVOL_MAX_MINOR  (1 << 16)
-#define        ZFS_MIN_MINOR   (ZVOL_MAX_MINOR + 1)
+#define        ZFSDEV_MAX_MINOR        (1 << 16)
+#define        ZFS_MIN_MINOR   (ZFSDEV_MAX_MINOR + 1)
 
 #define        ZPOOL_EXPORT_AFTER_SPLIT 0x1
 
@@ -295,6 +318,28 @@ extern int zfs_secpolicy_destroy_perms(const char *name, cred_t *cr);
 extern int zfs_busy(void);
 extern int zfs_unmount_snap(const char *, void *);
 
+/*
+ * ZFS minor numbers can refer to either a control device instance or
+ * a zvol. Depending on the value of zss_type, zss_data points to either
+ * a zvol_state_t or a zfs_onexit_t.
+ */
+enum zfs_soft_state_type {
+       ZSST_ZVOL,
+       ZSST_CTLDEV
+};
+
+typedef struct zfs_soft_state {
+       enum zfs_soft_state_type zss_type;
+       void *zss_data;
+} zfs_soft_state_t;
+
+extern void *zfsdev_get_soft_state(minor_t minor,
+    enum zfs_soft_state_type which);
+extern minor_t zfsdev_minor_alloc(void);
+
+extern void *zfsdev_state;
+extern kmutex_t zfsdev_state_lock;
+
 #endif /* _KERNEL */
 
 #ifdef __cplusplus
diff --git a/module/zfs/include/sys/zfs_onexit.h b/module/zfs/include/sys/zfs_onexit.h
new file mode 100644 (file)
index 0000000..4982bd4
--- /dev/null
@@ -0,0 +1,66 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+#ifndef        _SYS_ZFS_ONEXIT_H
+#define        _SYS_ZFS_ONEXIT_H
+
+#include <sys/zfs_context.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifdef _KERNEL
+
+typedef struct zfs_onexit {
+       kmutex_t        zo_lock;
+       list_t          zo_actions;
+} zfs_onexit_t;
+
+typedef struct zfs_onexit_action_node {
+       list_node_t     za_link;
+       void            (*za_func)(void *);
+       void            *za_data;
+} zfs_onexit_action_node_t;
+
+extern void zfs_onexit_init(zfs_onexit_t **zo);
+extern void zfs_onexit_destroy(zfs_onexit_t *zo);
+
+#endif
+
+extern int zfs_onexit_fd_hold(int fd, minor_t *minorp);
+extern void zfs_onexit_fd_rele(int fd);
+extern int zfs_onexit_add_cb(minor_t minor, void (*func)(void *), void *data,
+    uint64_t *action_handle);
+extern int zfs_onexit_del_cb(minor_t minor, uint64_t action_handle,
+    boolean_t fire);
+extern int zfs_onexit_cb_data(minor_t minor, uint64_t action_handle,
+    void **data);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_ZFS_ONEXIT_H */
diff --git a/module/zfs/include/sys/zfs_stat.h b/module/zfs/include/sys/zfs_stat.h
new file mode 100644 (file)
index 0000000..465aefa
--- /dev/null
@@ -0,0 +1,56 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+#ifndef        _SYS_FS_ZFS_STAT_H
+#define        _SYS_FS_ZFS_STAT_H
+
+#ifdef _KERNEL
+#include <sys/isa_defs.h>
+#include <sys/types32.h>
+#include <sys/dmu.h>
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * A limited number of zpl level stats are retrievable
+ * with an ioctl.  zfs diff is the current consumer.
+ */
+typedef struct zfs_stat {
+       uint64_t        zs_gen;
+       uint64_t        zs_mode;
+       uint64_t        zs_links;
+       uint64_t        zs_ctime[2];
+} zfs_stat_t;
+
+extern int zfs_obj_to_stats(objset_t *osp, uint64_t obj, zfs_stat_t *sb,
+    char *buf, int len);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_FS_ZFS_STAT_H */
index 86dcdacc08fe9c5dec9ade34ed434fbfbd63f0e4..38c87df4300fad8a8718d101d07cdd5b60c633fd 100644 (file)
@@ -79,6 +79,7 @@ struct zfsvfs {
        kmutex_t        z_lock;
        uint64_t        z_userquota_obj;
        uint64_t        z_groupquota_obj;
+       uint64_t        z_replay_eof;   /* New end of file - replay only */
        sa_attr_type_t  *z_attr_table;  /* SA attr mapping->id */
 #define        ZFS_OBJ_MTX_SZ  64
        kmutex_t        z_hold_mtx[ZFS_OBJ_MTX_SZ];     /* znode hold locks */
index 4781ee6862a75f420f02d412386d79c8f97f13cd..3e9621a0ee249453b172982a47fa20d4f305b4df 100644 (file)
@@ -19,8 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 
 #ifndef        _SYS_FS_ZFS_ZNODE_H
@@ -36,6 +35,7 @@
 #include <sys/zfs_vfsops.h>
 #include <sys/rrwlock.h>
 #include <sys/zfs_sa.h>
+#include <sys/zfs_stat.h>
 #endif
 #include <sys/zfs_acl.h>
 #include <sys/zil.h>
@@ -60,6 +60,8 @@ extern "C" {
 #define        ZFS_AV_QUARANTINED      0x0000020000000000
 #define        ZFS_AV_MODIFIED         0x0000040000000000
 #define        ZFS_REPARSE             0x0000080000000000
+#define        ZFS_OFFLINE             0x0000100000000000
+#define        ZFS_SPARSE              0x0000200000000000
 
 #define        ZFS_ATTR_SET(zp, attr, value, pflags, tx) \
 { \
@@ -188,17 +190,17 @@ typedef struct znode {
        uint8_t         z_unlinked;     /* file has been unlinked */
        uint8_t         z_atime_dirty;  /* atime needs to be synced */
        uint8_t         z_zn_prefetch;  /* Prefetch znodes? */
+       uint8_t         z_moved;        /* Has this znode been moved? */
        uint_t          z_blksz;        /* block size in bytes */
        uint_t          z_seq;          /* modification sequence number */
        uint64_t        z_mapcnt;       /* number of pages mapped to file */
-       uint64_t        z_last_itx;     /* last ZIL itx on this znode */
        uint64_t        z_gen;          /* generation (cached) */
        uint64_t        z_size;         /* file size (cached) */
        uint64_t        z_atime[2];     /* atime (cached) */
        uint64_t        z_links;        /* file links (cached) */
        uint64_t        z_pflags;       /* pflags (cached) */
-       uid_t           z_uid;          /* uid mapped (cached) */
-       uid_t           z_gid;          /* gid mapped (cached) */
+       uint64_t        z_uid;          /* uid fuid (cached) */
+       uint64_t        z_gid;          /* gid fuid (cached) */
        mode_t          z_mode;         /* mode (cached) */
        uint32_t        z_sync_cnt;     /* synchronous open count */
        kmutex_t        z_acl_lock;     /* acl data lock */
@@ -321,7 +323,8 @@ extern void zfs_log_create(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype,
 extern int zfs_log_create_txtype(zil_create_t, vsecattr_t *vsecp,
     vattr_t *vap);
 extern void zfs_log_remove(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype,
-    znode_t *dzp, char *name);
+    znode_t *dzp, char *name, uint64_t foid);
+#define        ZFS_NO_OBJECT   0       /* no object id */
 extern void zfs_log_link(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype,
     znode_t *dzp, znode_t *zp, char *name);
 extern void zfs_log_symlink(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype,
index 2f01cf922eda50ac141c2559e89d634c11e55232..a4c5575b2dbacf26c466991b55107dda1a277633 100644 (file)
@@ -169,18 +169,14 @@ typedef enum zil_create {
        (txtype) == TX_ACL ||           \
        (txtype) == TX_WRITE2)
 
-
 /*
  * Format of log records.
  * The fields are carefully defined to allow them to be aligned
  * and sized the same on sparc & intel architectures.
  * Each log record has a common structure at the beginning.
  *
- * Note, lrc_seq holds two different sequence numbers. Whilst in memory
- * it contains the transaction sequence number.  The log record on
- * disk holds the sequence number of all log records which is used to
- * ensure we don't replay the same record.  The two sequence numbers are
- * different because the transactions can now be pushed out of order.
+ * On disk, lrc_seq holds the record's sequence number among all log
+ * records, which is used to ensure we don't replay the same record.
  */
 typedef struct {                       /* common log record header */
        uint64_t        lrc_txtype;     /* intent log transaction type */
@@ -371,6 +367,7 @@ typedef struct itx {
        itx_wr_state_t  itx_wr_state;   /* write state */
        uint8_t         itx_sync;       /* synchronous transaction */
        uint64_t        itx_sod;        /* record size on disk */
+       uint64_t        itx_oid;        /* object id */
        lr_t            itx_lr;         /* common part of log record */
        /* followed by type-specific part of lr_xx_t and its immediate data */
 } itx_t;
@@ -402,15 +399,15 @@ extern void       zil_rollback_destroy(zilog_t *zilog, dmu_tx_t *tx);
 
 extern itx_t   *zil_itx_create(uint64_t txtype, size_t lrsize);
 extern void    zil_itx_destroy(itx_t *itx);
-extern uint64_t zil_itx_assign(zilog_t *zilog, itx_t *itx, dmu_tx_t *tx);
+extern void    zil_itx_assign(zilog_t *zilog, itx_t *itx, dmu_tx_t *tx);
 
-extern void    zil_commit(zilog_t *zilog, uint64_t seq, uint64_t oid);
+extern void    zil_commit(zilog_t *zilog, uint64_t oid);
 
 extern int     zil_vdev_offline(const char *osname, void *txarg);
 extern int     zil_claim(const char *osname, void *txarg);
 extern int     zil_check_log_chain(const char *osname, void *txarg);
 extern void    zil_sync(zilog_t *zilog, dmu_tx_t *tx);
-extern void    zil_clean(zilog_t *zilog);
+extern void    zil_clean(zilog_t *zilog, uint64_t synced_txg);
 
 extern int     zil_suspend(zilog_t *zilog);
 extern void    zil_resume(zilog_t *zilog);
index 6560a7942b4904ebcba1b6917997eeebca532f6e..1d4c0cc6c1de8b116838e314c57bc286e077a9cd 100644 (file)
@@ -49,6 +49,28 @@ typedef struct lwb {
        list_node_t     lwb_node;       /* zilog->zl_lwb_list linkage */
 } lwb_t;
 
+/*
+ * Intent log transaction lists
+ */
+typedef struct itxs {
+       list_t          i_sync_list;    /* list of synchronous itxs */
+       avl_tree_t      i_async_tree;   /* tree of foids for async itxs */
+} itxs_t;
+
+typedef struct itxg {
+       kmutex_t        itxg_lock;      /* lock for this structure */
+       uint64_t        itxg_txg;       /* txg for this chain */
+       uint64_t        itxg_sod;       /* total size on disk for this txg */
+       itxs_t          *itxg_itxs;     /* sync and async itxs */
+} itxg_t;
+
+/* for async nodes we build up an AVL tree of lists of async itxs per file */
+typedef struct itx_async_node {
+       uint64_t        ia_foid;        /* file object id */
+       list_t          ia_list;        /* list of async itxs for this foid */
+       avl_node_t      ia_node;        /* AVL tree linkage */
+} itx_async_node_t;
+
 /*
  * Vdev flushing: during a zil_commit(), we build up an AVL tree of the vdevs
  * we've touched so we know which ones need a write cache flush at the end.
@@ -71,9 +93,7 @@ struct zilog {
        objset_t        *zl_os;         /* object set we're logging */
        zil_get_data_t  *zl_get_data;   /* callback to get object content */
        zio_t           *zl_root_zio;   /* log writer root zio */
-       uint64_t        zl_itx_seq;     /* next in-core itx sequence number */
        uint64_t        zl_lr_seq;      /* on-disk log record sequence number */
-       uint64_t        zl_commit_seq;  /* committed upto this number */
        uint64_t        zl_commit_lr_seq; /* last committed on-disk lr seq */
        uint64_t        zl_destroy_txg; /* txg of last zil_destroy() */
        uint64_t        zl_replayed_seq[TXG_SIZE]; /* last replayed rec seq */
@@ -93,10 +113,13 @@ struct zilog {
        uint64_t        zl_parse_lr_seq; /* highest lr seq on last parse */
        uint64_t        zl_parse_blk_count; /* number of blocks parsed */
        uint64_t        zl_parse_lr_count; /* number of log records parsed */
-       list_t          zl_itx_list;    /* in-memory itx list */
+       uint64_t        zl_next_batch;  /* next batch number */
+       uint64_t        zl_com_batch;   /* committed batch number */
+       kcondvar_t      zl_cv_batch[2]; /* batch condition variables */
+       itxg_t          zl_itxg[TXG_SIZE]; /* intent log txg chains */
+       list_t          zl_itx_commit_list; /* itx list to be committed */
        uint64_t        zl_itx_list_sz; /* total size of records on list */
        uint64_t        zl_cur_used;    /* current commit log size used */
-       uint64_t        zl_prev_used;   /* previous commit log size used */
        list_t          zl_lwb_list;    /* in-flight log write list */
        kmutex_t        zl_vdev_lock;   /* protects zl_vdev_tree */
        avl_tree_t      zl_vdev_tree;   /* vdevs to flush in zil_commit() */
index 0400c1702eaa148a91adf78fdcfe34e39a3a8bb2..97d8ec74d2e9d1a1ed36700ac3e66f9aa7e1a3d6 100644 (file)
@@ -147,7 +147,7 @@ enum zio_flag {
        ZIO_FLAG_SELF_HEAL      = 1 << 2,
        ZIO_FLAG_RESILVER       = 1 << 3,
        ZIO_FLAG_SCRUB          = 1 << 4,
-       ZIO_FLAG_SCRUB_THREAD   = 1 << 5,
+       ZIO_FLAG_SCAN_THREAD    = 1 << 5,
 
 #define        ZIO_FLAG_AGG_INHERIT    (ZIO_FLAG_CANFAIL - 1)
 
diff --git a/module/zfs/include/sys/zrlock.h b/module/zfs/include/sys/zrlock.h
new file mode 100644 (file)
index 0000000..dcd63f7
--- /dev/null
@@ -0,0 +1,66 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+#ifndef        _SYS_ZRLOCK_H
+#define        _SYS_ZRLOCK_H
+
+#include <sys/zfs_context.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct zrlock {
+       kmutex_t zr_mtx;
+       volatile int32_t zr_refcount;
+       kcondvar_t zr_cv;
+       uint16_t zr_pad;
+#ifdef ZFS_DEBUG
+       kthread_t *zr_owner;
+       const char *zr_caller;
+#endif
+} zrlock_t;
+
+extern void zrl_init(zrlock_t *);
+extern void zrl_destroy(zrlock_t *);
+#ifdef ZFS_DEBUG
+#define        zrl_add(_z)     zrl_add_debug((_z), __func__)
+extern void zrl_add_debug(zrlock_t *, const char *);
+#else
+extern void zrl_add(zrlock_t *);
+#endif
+extern void zrl_remove(zrlock_t *);
+extern int zrl_tryenter(zrlock_t *);
+extern void zrl_exit(zrlock_t *);
+extern int zrl_is_zero(zrlock_t *);
+extern int zrl_is_locked(zrlock_t *);
+#ifdef ZFS_DEBUG
+extern kthread_t *zrl_owner(zrlock_t *);
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_ZRLOCK_H */
index 10952f472b333c8e2d208410bb4d6230c3a82e0a..ab3de51b7259a7ff230f3b2fae2c27c4c083c0b3 100644 (file)
  */
 
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 
 /*
- * We keep our own copy of this algorithm for 2 main reasons:
+ * We keep our own copy of this algorithm for 3 main reasons:
  *     1. If we didn't, anyone modifying common/os/compress.c would
  *         directly break our on disk format
  *     2. Our version of lzjb does not have a number of checks that the
@@ -33,8 +32,8 @@
  *     3. We initialize the lempel to ensure deterministic results,
  *        so that identical blocks can always be deduplicated.
  * In particular, we are adding the "feature" that compress() can
- * take a destination buffer size and return -1 if the data will not
- * compress to d_len or less.
+ * take a destination buffer size and return the compressed length, or the
+ * source length if compression would overflow the destination buffer.
  */
 
 #include <sys/types.h>
index 8358b4ceeb0cdfd268591763346f37c3a6af200b..600132f080e70e70586e1e00d6683257d7cf53ae 100644 (file)
@@ -25,7 +25,7 @@
 #include <sys/zfs_context.h>
 #include <sys/refcount.h>
 
-#if defined(DEBUG) || !defined(_KERNEL)
+#ifdef ZFS_DEBUG
 
 #ifdef _KERNEL
 int reference_tracking_enable = FALSE; /* runs out of memory too easily */
@@ -189,4 +189,35 @@ refcount_remove(refcount_t *rc, void *holder)
        return (refcount_remove_many(rc, 1, holder));
 }
 
-#endif
+void
+refcount_transfer(refcount_t *dst, refcount_t *src)
+{
+       int64_t count, removed_count;
+       list_t list, removed;
+
+       list_create(&list, sizeof (reference_t),
+           offsetof(reference_t, ref_link));
+       list_create(&removed, sizeof (reference_t),
+           offsetof(reference_t, ref_link));
+
+       mutex_enter(&src->rc_mtx);
+       count = src->rc_count;
+       removed_count = src->rc_removed_count;
+       src->rc_count = 0;
+       src->rc_removed_count = 0;
+       list_move_tail(&list, &src->rc_list);
+       list_move_tail(&removed, &src->rc_removed);
+       mutex_exit(&src->rc_mtx);
+
+       mutex_enter(&dst->rc_mtx);
+       dst->rc_count += count;
+       dst->rc_removed_count += removed_count;
+       list_move_tail(&dst->rc_list, &list);
+       list_move_tail(&dst->rc_removed, &removed);
+       mutex_exit(&dst->rc_mtx);
+
+       list_destroy(&list);
+       list_destroy(&removed);
+}
+
+#endif /* ZFS_DEBUG */
index a91b379f998455ec4291e8928aa8dcb68933c7d6..4cb4546b251109e105820a4bcdc721372e68f22c 100644 (file)
@@ -300,8 +300,8 @@ sa_layout_info_hash(sa_attr_type_t *attrs, int attr_count)
        return (crc);
 }
 
-static boolean_t
-sa_has_blkptr(sa_handle_t *hdl)
+static int
+sa_get_spill(sa_handle_t *hdl)
 {
        int rc;
        if (hdl->sa_spill == NULL) {
@@ -312,7 +312,7 @@ sa_has_blkptr(sa_handle_t *hdl)
                rc = 0;
        }
 
-       return (rc == 0 ? B_TRUE : B_FALSE);
+       return (rc);
 }
 
 /*
@@ -349,7 +349,8 @@ sa_attr_op(sa_handle_t *hdl, sa_bulk_attr_t *bulk, int count,
                                buftypes |= SA_BONUS;
                        }
                }
-               if (bulk[i].sa_addr == NULL && sa_has_blkptr(hdl)) {
+               if (bulk[i].sa_addr == NULL &&
+                   ((error = sa_get_spill(hdl)) == 0)) {
                        if (TOC_ATTR_PRESENT(
                            hdl->sa_spill_tab->sa_idx_tab[bulk[i].sa_attr])) {
                                SA_ATTR_INFO(sa, hdl->sa_spill_tab,
@@ -362,6 +363,10 @@ sa_attr_op(sa_handle_t *hdl, sa_bulk_attr_t *bulk, int count,
                                }
                        }
                }
+               if (error && error != ENOENT) {
+                       return ((error == ECKSUM) ? EIO : error);
+               }
+
                switch (data_op) {
                case SA_LOOKUP:
                        if (bulk[i].sa_addr == NULL)
@@ -421,12 +426,10 @@ sa_add_layout_entry(objset_t *os, sa_attr_type_t *attrs, int attr_count,
                char attr_name[8];
 
                if (sa->sa_layout_attr_obj == 0) {
-                       int error;
                        sa->sa_layout_attr_obj = zap_create(os,
                            DMU_OT_SA_ATTR_LAYOUTS, DMU_OT_NONE, 0, tx);
-                       error = zap_add(os, sa->sa_master_obj, SA_LAYOUTS, 8, 1,
-                           &sa->sa_layout_attr_obj, tx);
-                       ASSERT3U(error, ==, 0);
+                       VERIFY(zap_add(os, sa->sa_master_obj, SA_LAYOUTS, 8, 1,
+                           &sa->sa_layout_attr_obj, tx) == 0);
                }
 
                (void) snprintf(attr_name, sizeof (attr_name),
@@ -667,10 +670,8 @@ sa_build_layouts(sa_handle_t *hdl, sa_bulk_attr_t *attr_desc, int attr_count,
                boolean_t dummy;
 
                if (hdl->sa_spill == NULL) {
-                       int error;
-                       error = dmu_spill_hold_by_bonus(hdl->sa_bonus, NULL,
-                           &hdl->sa_spill);
-                       ASSERT3U(error, ==, 0);
+                       VERIFY(dmu_spill_hold_by_bonus(hdl->sa_bonus, NULL,
+                           &hdl->sa_spill) == 0);
                }
                dmu_buf_will_dirty(hdl->sa_spill, tx);
 
@@ -712,7 +713,7 @@ sa_build_layouts(sa_handle_t *hdl, sa_bulk_attr_t *attr_desc, int attr_count,
                        length = attr_desc[i].sa_length;
 
                if (buf_space < length) {  /* switch to spill buffer */
-                       ASSERT(bonustype != DMU_OT_ZNODE);
+                       VERIFY(bonustype == DMU_OT_SA);
                        if (buftype == SA_BONUS && !sa->sa_force_spill) {
                                sa_find_layout(hdl->sa_os, hash, attrs_start,
                                    lot_count, tx, &lot);
@@ -746,6 +747,14 @@ sa_build_layouts(sa_handle_t *hdl, sa_bulk_attr_t *attr_desc, int attr_count,
        }
 
        sa_find_layout(hdl->sa_os, hash, attrs_start, lot_count, tx, &lot);
+
+       /*
+        * Verify that old znodes always have layout number 0.
+        * Must be DMU_OT_SA for arbitrary layouts
+        */
+       VERIFY((bonustype == DMU_OT_ZNODE && lot->lot_num == 0) ||
+           (bonustype == DMU_OT_SA && lot->lot_num > 1));
+
        if (bonustype == DMU_OT_SA) {
                SA_SET_HDR(sahdr, lot->lot_num,
                    buftype == SA_BONUS ? hdrsize : spillhdrsize);
@@ -763,11 +772,6 @@ sa_build_layouts(sa_handle_t *hdl, sa_bulk_attr_t *attr_desc, int attr_count,
                if (!spilling) {
                        /*
                         * remove spill block that is no longer needed.
-                        * set sa_spill_remove to prevent sa_attr_op
-                        * from trying to retrieve spill block before its
-                        * been removed.  The flag will be cleared if/when
-                        * the handle is destroyed recreated or
-                        * sa_build_layouts() needs to spill again.
                         */
                        dmu_buf_rele(hdl->sa_spill, NULL);
                        hdl->sa_spill = NULL;
@@ -783,10 +787,31 @@ sa_build_layouts(sa_handle_t *hdl, sa_bulk_attr_t *attr_desc, int attr_count,
 }
 
 static void
+sa_free_attr_table(sa_os_t *sa)
+{
+       int i;
+
+       if (sa->sa_attr_table == NULL)
+               return;
+
+       for (i = 0; i != sa->sa_num_attrs; i++) {
+               if (sa->sa_attr_table[i].sa_name)
+                       kmem_free(sa->sa_attr_table[i].sa_name,
+                           strlen(sa->sa_attr_table[i].sa_name) + 1);
+       }
+
+       kmem_free(sa->sa_attr_table,
+           sizeof (sa_attr_table_t) * sa->sa_num_attrs);
+
+       sa->sa_attr_table = NULL;
+}
+
+static int
 sa_attr_table_setup(objset_t *os, sa_attr_reg_t *reg_attrs, int count)
 {
        sa_os_t *sa = os->os_sa;
        uint64_t sa_attr_count = 0;
+       uint64_t sa_reg_count;
        int error = 0;
        uint64_t attr_value;
        sa_attr_table_t *tb;
@@ -800,8 +825,20 @@ sa_attr_table_setup(objset_t *os, sa_attr_reg_t *reg_attrs, int count)
            kmem_zalloc(count * sizeof (sa_attr_type_t), KM_SLEEP);
        sa->sa_user_table_sz = count * sizeof (sa_attr_type_t);
 
-       if (sa->sa_reg_attr_obj != 0)
-               VERIFY(zap_count(os, sa->sa_reg_attr_obj, &sa_attr_count) == 0);
+       if (sa->sa_reg_attr_obj != 0) {
+               error = zap_count(os, sa->sa_reg_attr_obj,
+                   &sa_attr_count);
+
+               /*
+                * Make sure we retrieved a count and that it isn't zero
+                */
+               if (error || (error == 0 && sa_attr_count == 0)) {
+                       if (error == 0)
+                               error = EINVAL;
+                       goto bail;
+               }
+               sa_reg_count = sa_attr_count;
+       }
 
        if (ostype == DMU_OST_ZFS && sa_attr_count == 0)
                sa_attr_count += sa_legacy_attr_count;
@@ -830,7 +867,6 @@ sa_attr_table_setup(objset_t *os, sa_attr_reg_t *reg_attrs, int count)
                else
                        error = ENOENT;
                switch (error) {
-               default:
                case ENOENT:
                        sa->sa_user_table[i] = (sa_attr_type_t)sa_attr_count;
                        sa_attr_count++;
@@ -838,11 +874,13 @@ sa_attr_table_setup(objset_t *os, sa_attr_reg_t *reg_attrs, int count)
                case 0:
                        sa->sa_user_table[i] = ATTR_NUM(attr_value);
                        break;
+               default:
+                       goto bail;
                }
        }
 
-       os->os_sa->sa_num_attrs = sa_attr_count;
-       tb = os->os_sa->sa_attr_table =
+       sa->sa_num_attrs = sa_attr_count;
+       tb = sa->sa_attr_table =
            kmem_zalloc(sizeof (sa_attr_table_t) * sa_attr_count, KM_SLEEP);
 
        /*
@@ -853,7 +891,7 @@ sa_attr_table_setup(objset_t *os, sa_attr_reg_t *reg_attrs, int count)
 
        if (sa->sa_reg_attr_obj) {
                for (zap_cursor_init(&zc, os, sa->sa_reg_attr_obj);
-                   zap_cursor_retrieve(&zc, &za) == 0;
+                   (error = zap_cursor_retrieve(&zc, &za)) == 0;
                    zap_cursor_advance(&zc)) {
                        uint64_t value;
                        value  = za.za_first_integer;
@@ -873,6 +911,15 @@ sa_attr_table_setup(objset_t *os, sa_attr_reg_t *reg_attrs, int count)
                            strlen(za.za_name) +1);
                }
                zap_cursor_fini(&zc);
+               /*
+                * Make sure we processed the correct number of registered
+                * attributes
+                */
+               if (registered_count != sa_reg_count) {
+                       ASSERT(error != 0);
+                       goto bail;
+               }
+
        }
 
        if (ostype == DMU_OST_ZFS) {
@@ -908,18 +955,27 @@ sa_attr_table_setup(objset_t *os, sa_attr_reg_t *reg_attrs, int count)
                    strlen(reg_attrs[i].sa_name) + 1);
        }
 
-       os->os_sa->sa_need_attr_registration =
+       sa->sa_need_attr_registration =
            (sa_attr_count != registered_count);
+
+       return (0);
+bail:
+       kmem_free(sa->sa_user_table, count * sizeof (sa_attr_type_t));
+       sa->sa_user_table = NULL;
+       sa_free_attr_table(sa);
+       return ((error != 0) ? error : EINVAL);
 }
 
-sa_attr_type_t *
-sa_setup(objset_t *os, uint64_t sa_obj, sa_attr_reg_t *reg_attrs, int count)
+int
+sa_setup(objset_t *os, uint64_t sa_obj, sa_attr_reg_t *reg_attrs, int count,
+    sa_attr_type_t **user_table)
 {
        zap_cursor_t zc;
        zap_attribute_t za;
        sa_os_t *sa;
        dmu_objset_type_t ostype = dmu_objset_type(os);
        sa_attr_type_t *tb;
+       int error;
 
        mutex_enter(&os->os_lock);
        if (os->os_sa) {
@@ -927,13 +983,15 @@ sa_setup(objset_t *os, uint64_t sa_obj, sa_attr_reg_t *reg_attrs, int count)
                mutex_exit(&os->os_lock);
                tb = os->os_sa->sa_user_table;
                mutex_exit(&os->os_sa->sa_lock);
-               return (tb);
+               *user_table = tb;
+               return (0);
        }
 
        sa = kmem_zalloc(sizeof (sa_os_t), KM_SLEEP);
        mutex_init(&sa->sa_lock, NULL, MUTEX_DEFAULT, NULL);
        sa->sa_master_obj = sa_obj;
 
+       os->os_sa = sa;
        mutex_enter(&sa->sa_lock);
        mutex_exit(&os->os_lock);
        avl_create(&sa->sa_layout_num_tree, layout_num_compare,
@@ -942,26 +1000,36 @@ sa_setup(objset_t *os, uint64_t sa_obj, sa_attr_reg_t *reg_attrs, int count)
            sizeof (sa_lot_t), offsetof(sa_lot_t, lot_hash_node));
 
        if (sa_obj) {
-               int error;
                error = zap_lookup(os, sa_obj, SA_LAYOUTS,
                    8, 1, &sa->sa_layout_attr_obj);
-               if (error != 0 && error != ENOENT) {
-                       return (NULL);
-               }
+               if (error != 0 && error != ENOENT)
+                       goto fail;
                error = zap_lookup(os, sa_obj, SA_REGISTRY,
                    8, 1, &sa->sa_reg_attr_obj);
-               if (error != 0 && error != ENOENT) {
-                       mutex_exit(&sa->sa_lock);
-                       return (NULL);
-               }
+               if (error != 0 && error != ENOENT)
+                       goto fail;
        }
 
-       os->os_sa = sa;
-       sa_attr_table_setup(os, reg_attrs, count);
+       if ((error = sa_attr_table_setup(os, reg_attrs, count)) != 0)
+               goto fail;
 
        if (sa->sa_layout_attr_obj != 0) {
+               uint64_t layout_count;
+
+               error = zap_count(os, sa->sa_layout_attr_obj,
+                   &layout_count);
+
+               /*
+                * Layout number count should be > 0
+                */
+               if (error || (error == 0 && layout_count == 0)) {
+                       if (error == 0)
+                               error = EINVAL;
+                       goto fail;
+               }
+
                for (zap_cursor_init(&zc, os, sa->sa_layout_attr_obj);
-                   zap_cursor_retrieve(&zc, &za) == 0;
+                   (error = zap_cursor_retrieve(&zc, &za)) == 0;
                    zap_cursor_advance(&zc)) {
                        sa_attr_type_t *lot_attrs;
                        uint64_t lot_num;
@@ -969,8 +1037,13 @@ sa_setup(objset_t *os, uint64_t sa_obj, sa_attr_reg_t *reg_attrs, int count)
                        lot_attrs = kmem_zalloc(sizeof (sa_attr_type_t) *
                            za.za_num_integers, KM_SLEEP);
 
-                       VERIFY(zap_lookup(os, sa->sa_layout_attr_obj,
-                           za.za_name, 2, za.za_num_integers, lot_attrs) == 0);
+                       if ((error = (zap_lookup(os, sa->sa_layout_attr_obj,
+                           za.za_name, 2, za.za_num_integers,
+                           lot_attrs))) != 0) {
+                               kmem_free(lot_attrs, sizeof (sa_attr_type_t) *
+                                   za.za_num_integers);
+                               break;
+                       }
                        VERIFY(ddi_strtoull(za.za_name, NULL, 10,
                            (unsigned long long *)&lot_num) == 0);
 
@@ -982,6 +1055,15 @@ sa_setup(objset_t *os, uint64_t sa_obj, sa_attr_reg_t *reg_attrs, int count)
                            za.za_num_integers);
                }
                zap_cursor_fini(&zc);
+
+               /*
+                * Make sure layout count matches number of entries added
+                * to AVL tree
+                */
+               if (avl_numnodes(&sa->sa_layout_num_tree) != layout_count) {
+                       ASSERT(error != 0);
+                       goto fail;
+               }
        }
 
        /* Add special layout number for old ZNODES */
@@ -994,8 +1076,17 @@ sa_setup(objset_t *os, uint64_t sa_obj, sa_attr_reg_t *reg_attrs, int count)
                (void) sa_add_layout_entry(os, sa_dummy_zpl_layout, 0, 1,
                    0, B_FALSE, NULL);
        }
+       *user_table = os->os_sa->sa_user_table;
        mutex_exit(&sa->sa_lock);
-       return (os->os_sa->sa_user_table);
+       return (0);
+fail:
+       os->os_sa = NULL;
+       sa_free_attr_table(sa);
+       if (sa->sa_user_table)
+               kmem_free(sa->sa_user_table, sa->sa_user_table_sz);
+       mutex_exit(&sa->sa_lock);
+       kmem_free(sa, sizeof (sa_os_t));
+       return ((error == ECKSUM) ? EIO : error);
 }
 
 void
@@ -1004,20 +1095,12 @@ sa_tear_down(objset_t *os)
        sa_os_t *sa = os->os_sa;
        sa_lot_t *layout;
        void *cookie;
-       int i;
 
        kmem_free(sa->sa_user_table, sa->sa_user_table_sz);
 
        /* Free up attr table */
 
-       for (i = 0; i != sa->sa_num_attrs; i++) {
-               if (sa->sa_attr_table[i].sa_name)
-                       kmem_free(sa->sa_attr_table[i].sa_name,
-                           strlen(sa->sa_attr_table[i].sa_name) + 1);
-       }
-
-       kmem_free(sa->sa_attr_table,
-           sizeof (sa_attr_table_t) * sa->sa_num_attrs);
+       sa_free_attr_table(sa);
 
        cookie = NULL;
        while (layout = avl_destroy_nodes(&sa->sa_layout_hash_tree, &cookie)) {
@@ -1361,11 +1444,9 @@ sa_lookup_uio(sa_handle_t *hdl, sa_attr_type_t attr, uio_t *uio)
        ASSERT(hdl);
 
        mutex_enter(&hdl->sa_lock);
-       if (sa_attr_op(hdl, &bulk, 1, SA_LOOKUP, NULL) == 0) {
+       if ((error = sa_attr_op(hdl, &bulk, 1, SA_LOOKUP, NULL)) == 0) {
                error = uiomove((void *)bulk.sa_addr, MIN(bulk.sa_size,
                    uio->uio_resid), UIO_READ, uio);
-       } else {
-               error = ENOENT;
        }
        mutex_exit(&hdl->sa_lock);
        return (error);
@@ -1373,11 +1454,6 @@ sa_lookup_uio(sa_handle_t *hdl, sa_attr_type_t attr, uio_t *uio)
 }
 #endif
 
-/*
- * Find an already existing TOC from given os and data
- * This is a special interface to be used by the ZPL for
- * finding the uid/gid/gen attributes.
- */
 void *
 sa_find_idx_tab(objset_t *os, dmu_object_type_t bonustype, void *data)
 {
@@ -1475,12 +1551,10 @@ sa_attr_register_sync(sa_handle_t *hdl, dmu_tx_t *tx)
        }
 
        if (sa->sa_reg_attr_obj == NULL) {
-               int error;
                sa->sa_reg_attr_obj = zap_create(hdl->sa_os,
                    DMU_OT_SA_ATTR_REGISTRATION, DMU_OT_NONE, 0, tx);
-               error = zap_add(hdl->sa_os, sa->sa_master_obj,
-                   SA_REGISTRY, 8, 1, &sa->sa_reg_attr_obj, tx);
-               ASSERT(error == 0);
+               VERIFY(zap_add(hdl->sa_os, sa->sa_master_obj,
+                   SA_REGISTRY, 8, 1, &sa->sa_reg_attr_obj, tx) == 0);
        }
        for (i = 0; i != sa->sa_num_attrs; i++) {
                if (sa->sa_attr_table[i].sa_registered)
@@ -1538,6 +1612,8 @@ sa_modify_attrs(sa_handle_t *hdl, sa_attr_type_t newattr,
     uint16_t buflen, dmu_tx_t *tx)
 {
        sa_os_t *sa = hdl->sa_os->os_sa;
+       dmu_buf_impl_t *db = (dmu_buf_impl_t *)hdl->sa_bonus;
+       dnode_t *dn;
        sa_bulk_attr_t *attr_desc;
        void *old_data[2];
        int bonus_attr_count = 0;
@@ -1555,7 +1631,9 @@ sa_modify_attrs(sa_handle_t *hdl, sa_attr_type_t newattr,
 
        /* First make a copy of the old data */
 
-       if (((dmu_buf_impl_t *)hdl->sa_bonus)->db_dnode->dn_bonuslen) {
+       DB_DNODE_ENTER(db);
+       dn = DB_DNODE(db);
+       if (dn->dn_bonuslen != 0) {
                bonus_data_size = hdl->sa_bonus->db_size;
                old_data[0] = kmem_alloc(bonus_data_size, KM_SLEEP);
                bcopy(hdl->sa_bonus->db_data, old_data[0],
@@ -1564,16 +1642,21 @@ sa_modify_attrs(sa_handle_t *hdl, sa_attr_type_t newattr,
        } else {
                old_data[0] = NULL;
        }
+       DB_DNODE_EXIT(db);
 
        /* Bring spill buffer online if it isn't currently */
 
-       if (sa_has_blkptr(hdl)) {
+       if ((error = sa_get_spill(hdl)) == 0) {
                spill_data_size = hdl->sa_spill->db_size;
                old_data[1] = kmem_alloc(spill_data_size, KM_SLEEP);
                bcopy(hdl->sa_spill->db_data, old_data[1],
                    hdl->sa_spill->db_size);
                spill_attr_count =
                    hdl->sa_spill_tab->sa_layout->lot_attr_count;
+       } else if (error && error != ENOENT) {
+               if (old_data[0])
+                       kmem_free(old_data[0], bonus_data_size);
+               return (error);
        } else {
                old_data[1] = NULL;
        }
@@ -1722,6 +1805,7 @@ int
 sa_size(sa_handle_t *hdl, sa_attr_type_t attr, int *size)
 {
        sa_bulk_attr_t bulk;
+       int error;
 
        bulk.sa_data = NULL;
        bulk.sa_attr = attr;
@@ -1729,9 +1813,9 @@ sa_size(sa_handle_t *hdl, sa_attr_type_t attr, int *size)
 
        ASSERT(hdl);
        mutex_enter(&hdl->sa_lock);
-       if (sa_attr_op(hdl, &bulk, 1, SA_LOOKUP, NULL)) {
+       if ((error = sa_attr_op(hdl, &bulk, 1, SA_LOOKUP, NULL)) != 0) {
                mutex_exit(&hdl->sa_lock);
-               return (ENOENT);
+               return (error);
        }
        *size = bulk.sa_size;
 
index d7c5de0d357a2fab7e3a281ed52196aab97519c8..b6190e4cfafecf914627c191ef794831e2aba51d 100644 (file)
@@ -116,6 +116,7 @@ static boolean_t spa_has_active_shared_spare(spa_t *spa);
 static int spa_load_impl(spa_t *spa, uint64_t, nvlist_t *config,
     spa_load_state_t state, spa_import_type_t type, boolean_t mosconfig,
     char **ereport);
+static void spa_vdev_resilver_done(spa_t *spa);
 
 uint_t         zio_taskq_batch_pct = 100;      /* 1 thread per cpu in pset */
 id_t           zio_taskq_psrset_bind = PS_NONE;
@@ -180,6 +181,8 @@ spa_prop_get_config(spa_t *spa, nvlist_t **nvp)
                spa_prop_add_list(*nvp, ZPOOL_PROP_ALLOCATED, NULL, alloc, src);
                spa_prop_add_list(*nvp, ZPOOL_PROP_FREE, NULL,
                    size - alloc, src);
+               spa_prop_add_list(*nvp, ZPOOL_PROP_READONLY, NULL,
+                   (spa_mode(spa) == FREAD), src);
 
                cap = (size == 0) ? 0 : (alloc * 100 / size);
                spa_prop_add_list(*nvp, ZPOOL_PROP_CAPACITY, NULL, cap, src);
@@ -529,7 +532,9 @@ spa_prop_set(spa_t *spa, nvlist_t *nvp)
                    nvpair_name(elem))) == ZPROP_INVAL)
                        return (EINVAL);
 
-               if (prop == ZPOOL_PROP_CACHEFILE || prop == ZPOOL_PROP_ALTROOT)
+               if (prop == ZPOOL_PROP_CACHEFILE ||
+                   prop == ZPOOL_PROP_ALTROOT ||
+                   prop == ZPOOL_PROP_READONLY)
                        continue;
 
                need_sync = B_TRUE;
@@ -1284,33 +1289,131 @@ spa_check_removed(vdev_t *vd)
 }
 
 /*
- * Load the slog device state from the config object since it's possible
- * that the label does not contain the most up-to-date information.
+ * Validate the current config against the MOS config
  */
-void
-spa_load_log_state(spa_t *spa, nvlist_t *nv)
+static boolean_t
+spa_config_valid(spa_t *spa, nvlist_t *config)
 {
-       vdev_t *ovd, *rvd = spa->spa_root_vdev;
+       vdev_t *mrvd, *rvd = spa->spa_root_vdev;
+       nvlist_t *nv;
+
+       VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nv) == 0);
+
+       spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
+       VERIFY(spa_config_parse(spa, &mrvd, nv, NULL, 0, VDEV_ALLOC_LOAD) == 0);
+
+       ASSERT3U(rvd->vdev_children, ==, mrvd->vdev_children);
 
        /*
-        * Load the original root vdev tree from the passed config.
+        * If we're doing a normal import, then build up any additional
+        * diagnostic information about missing devices in this config.
+        * We'll pass this up to the user for further processing.
         */
-       spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
-       VERIFY(spa_config_parse(spa, &ovd, nv, NULL, 0, VDEV_ALLOC_LOAD) == 0);
+       if (!(spa->spa_import_flags & ZFS_IMPORT_MISSING_LOG)) {
+               nvlist_t **child, *nv;
+               uint64_t idx = 0;
+
+               child = kmem_alloc(rvd->vdev_children * sizeof (nvlist_t **),
+                   KM_SLEEP);
+               VERIFY(nvlist_alloc(&nv, NV_UNIQUE_NAME, KM_SLEEP) == 0);
+
+               for (int c = 0; c < rvd->vdev_children; c++) {
+                       vdev_t *tvd = rvd->vdev_child[c];
+                       vdev_t *mtvd  = mrvd->vdev_child[c];
+
+                       if (tvd->vdev_ops == &vdev_missing_ops &&
+                           mtvd->vdev_ops != &vdev_missing_ops &&
+                           mtvd->vdev_islog)
+                               child[idx++] = vdev_config_generate(spa, mtvd,
+                                   B_FALSE, 0);
+               }
 
+               if (idx) {
+                       VERIFY(nvlist_add_nvlist_array(nv,
+                           ZPOOL_CONFIG_CHILDREN, child, idx) == 0);
+                       VERIFY(nvlist_add_nvlist(spa->spa_load_info,
+                           ZPOOL_CONFIG_MISSING_DEVICES, nv) == 0);
+
+                       for (int i = 0; i < idx; i++)
+                               nvlist_free(child[i]);
+               }
+               nvlist_free(nv);
+               kmem_free(child, rvd->vdev_children * sizeof (char **));
+       }
+
+       /*
+        * Compare the root vdev tree with the information we have
+        * from the MOS config (mrvd). Check each top-level vdev
+        * with the corresponding MOS config top-level (mtvd).
+        */
        for (int c = 0; c < rvd->vdev_children; c++) {
-               vdev_t *cvd = rvd->vdev_child[c];
-               if (cvd->vdev_islog)
-                       vdev_load_log_state(cvd, ovd->vdev_child[c]);
+               vdev_t *tvd = rvd->vdev_child[c];
+               vdev_t *mtvd  = mrvd->vdev_child[c];
+
+               /*
+                * Resolve any "missing" vdevs in the current configuration.
+                * If we find that the MOS config has more accurate information
+                * about the top-level vdev then use that vdev instead.
+                */
+               if (tvd->vdev_ops == &vdev_missing_ops &&
+                   mtvd->vdev_ops != &vdev_missing_ops) {
+
+                       if (!(spa->spa_import_flags & ZFS_IMPORT_MISSING_LOG))
+                               continue;
+
+                       /*
+                        * Device specific actions.
+                        */
+                       if (mtvd->vdev_islog) {
+                               spa_set_log_state(spa, SPA_LOG_CLEAR);
+                       } else {
+                               /*
+                                * XXX - once we have 'readonly' pool
+                                * support we should be able to handle
+                                * missing data devices by transitioning
+                                * the pool to readonly.
+                                */
+                               continue;
+                       }
+
+                       /*
+                        * Swap the missing vdev with the data we were
+                        * able to obtain from the MOS config.
+                        */
+                       vdev_remove_child(rvd, tvd);
+                       vdev_remove_child(mrvd, mtvd);
+
+                       vdev_add_child(rvd, mtvd);
+                       vdev_add_child(mrvd, tvd);
+
+                       spa_config_exit(spa, SCL_ALL, FTAG);
+                       vdev_load(mtvd);
+                       spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
+
+                       vdev_reopen(rvd);
+               } else if (mtvd->vdev_islog) {
+                       /*
+                        * Load the slog device's state from the MOS config
+                        * since it's possible that the label does not
+                        * contain the most up-to-date information.
+                        */
+                       vdev_load_log_state(tvd, mtvd);
+                       vdev_reopen(tvd);
+               }
        }
-       vdev_free(ovd);
+       vdev_free(mrvd);
        spa_config_exit(spa, SCL_ALL, FTAG);
+
+       /*
+        * Ensure we were able to validate the config.
+        */
+       return (rvd->vdev_guid_sum == spa->spa_uberblock.ub_guid_sum);
 }
 
 /*
  * Check for missing log devices
  */
-int
+static int
 spa_check_logs(spa_t *spa)
 {
        switch (spa->spa_log_state) {
@@ -1474,9 +1577,19 @@ spa_load_verify(spa_t *spa)
 
        if (!error && sle.sle_meta_count <= policy.zrp_maxmeta &&
            sle.sle_data_count <= policy.zrp_maxdata) {
+               int64_t loss = 0;
+
                verify_ok = B_TRUE;
                spa->spa_load_txg = spa->spa_uberblock.ub_txg;
                spa->spa_load_txg_ts = spa->spa_uberblock.ub_timestamp;
+
+               loss = spa->spa_last_ubsync_txg_ts - spa->spa_load_txg_ts;
+               VERIFY(nvlist_add_uint64(spa->spa_load_info,
+                   ZPOOL_CONFIG_LOAD_TIME, spa->spa_load_txg_ts) == 0);
+               VERIFY(nvlist_add_int64(spa->spa_load_info,
+                   ZPOOL_CONFIG_REWIND_TIME, loss) == 0);
+               VERIFY(nvlist_add_uint64(spa->spa_load_info,
+                   ZPOOL_CONFIG_LOAD_DATA_ERRORS, sle.sle_data_count) == 0);
        } else {
                spa->spa_load_max_txg = spa->spa_uberblock.ub_txg;
        }
@@ -1635,13 +1748,21 @@ spa_load(spa_t *spa, spa_load_state_t state, spa_import_type_t type,
                            KM_SLEEP) == 0);
                }
 
+               gethrestime(&spa->spa_loaded_ts);
                error = spa_load_impl(spa, pool_guid, config, state, type,
                    mosconfig, &ereport);
        }
 
        spa->spa_minref = refcount_count(&spa->spa_refcount);
-       if (error && error != EBADF)
-               zfs_ereport_post(ereport, spa, NULL, NULL, 0, 0);
+       if (error) {
+               if (error != EEXIST) {
+                       spa->spa_loaded_ts.tv_sec = 0;
+                       spa->spa_loaded_ts.tv_nsec = 0;
+               }
+               if (error != EBADF) {
+                       zfs_ereport_post(ereport, spa, NULL, NULL, 0, 0);
+               }
+       }
        spa->spa_load_state = error ? SPA_LOAD_ERROR : SPA_LOAD_NONE;
        spa->spa_ena = 0;
 
@@ -1661,7 +1782,7 @@ spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config,
        nvlist_t *nvroot = NULL;
        vdev_t *rvd;
        uberblock_t *ub = &spa->spa_uberblock;
-       uint64_t config_cache_txg = spa->spa_config_txg;
+       uint64_t children, config_cache_txg = spa->spa_config_txg;
        int orig_mode = spa->spa_mode;
        int parse;
        uint64_t obj;
@@ -1760,9 +1881,13 @@ spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config,
 
        /*
         * If the vdev guid sum doesn't match the uberblock, we have an
-        * incomplete configuration.
+        * incomplete configuration.  We first check to see if the pool
+        * is aware of the complete config (i.e. ZPOOL_CONFIG_VDEV_CHILDREN).
+        * If it is, defer the vdev_guid_sum check till later so we
+        * can handle missing vdevs.
         */
-       if (mosconfig && type != SPA_IMPORT_ASSEMBLE &&
+       if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VDEV_CHILDREN,
+           &children) != 0 && mosconfig && type != SPA_IMPORT_ASSEMBLE &&
            rvd->vdev_guid_sum != ub->ub_guid_sum)
                return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM, ENXIO));
 
@@ -1981,13 +2106,6 @@ spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config,
        vdev_dtl_reassess(rvd, 0, 0, B_FALSE);
        spa_config_exit(spa, SCL_ALL, FTAG);
 
-       /*
-        * Check the state of the root vdev.  If it can't be opened, it
-        * indicates one or more toplevel vdevs are faulted.
-        */
-       if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN)
-               return (ENXIO);
-
        /*
         * Load the DDTs (dedup tables).
         */
@@ -1997,16 +2115,12 @@ spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config,
 
        spa_update_dspace(spa);
 
-       if (state != SPA_LOAD_TRYIMPORT) {
-               error = spa_load_verify(spa);
-               if (error)
-                       return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA,
-                           error));
-       }
-
        /*
-        * Load the intent log state and check log integrity.  If we're
-        * assembling a pool from a split, the log is not transferred over.
+        * Validate the config, using the MOS config to fill in any
+        * information which might be missing.  If we fail to validate
+        * the config then declare the pool unfit for use. If we're
+        * assembling a pool from a split, the log is not transferred
+        * over.
         */
        if (type != SPA_IMPORT_ASSEMBLE) {
                nvlist_t *nvconfig;
@@ -2014,17 +2128,37 @@ spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config,
                if (load_nvlist(spa, spa->spa_config_object, &nvconfig) != 0)
                        return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
 
-               VERIFY(nvlist_lookup_nvlist(nvconfig, ZPOOL_CONFIG_VDEV_TREE,
-                   &nvroot) == 0);
-               spa_load_log_state(spa, nvroot);
+               if (!spa_config_valid(spa, nvconfig)) {
+                       nvlist_free(nvconfig);
+                       return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM,
+                           ENXIO));
+               }
                nvlist_free(nvconfig);
 
+               /*
+                * Now that we've validated the config, check the state of the
+                * root vdev.  If it can't be opened, it indicates one or
+                * more toplevel vdevs are faulted.
+                */
+               if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN)
+                       return (ENXIO);
+
                if (spa_check_logs(spa)) {
                        *ereport = FM_EREPORT_ZFS_LOG_REPLAY;
                        return (spa_vdev_err(rvd, VDEV_AUX_BAD_LOG, ENXIO));
                }
        }
 
+       /*
+        * We've successfully opened the pool, verify that we're ready
+        * to start pushing transactions.
+        */
+       if (state != SPA_LOAD_TRYIMPORT) {
+               if (error = spa_load_verify(spa))
+                       return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA,
+                           error));
+       }
+
        if (spa_writeable(spa) && (state == SPA_LOAD_RECOVER ||
            spa->spa_load_max_txg == UINT64_MAX)) {
                dmu_tx_t *tx;
@@ -2066,12 +2200,13 @@ spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config,
                 * If the config cache is stale, or we have uninitialized
                 * metaslabs (see spa_vdev_add()), then update the config.
                 *
-                * If spa_load_verbatim is true, trust the current
+                * If this is a verbatim import, trust the current
                 * in-core spa_config and update the disk labels.
                 */
                if (config_cache_txg != spa->spa_config_txg ||
-                   state == SPA_LOAD_IMPORT || spa->spa_load_verbatim ||
-                   state == SPA_LOAD_RECOVER)
+                   state == SPA_LOAD_IMPORT ||
+                   state == SPA_LOAD_RECOVER ||
+                   (spa->spa_import_flags & ZFS_IMPORT_VERBATIM))
                        need_update = B_TRUE;
 
                for (int c = 0; c < rvd->vdev_children; c++)
@@ -2110,12 +2245,14 @@ spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config,
 static int
 spa_load_retry(spa_t *spa, spa_load_state_t state, int mosconfig)
 {
+       int mode = spa->spa_mode;
+
        spa_unload(spa);
        spa_deactivate(spa);
 
        spa->spa_load_max_txg--;
 
-       spa_activate(spa, spa_mode_global);
+       spa_activate(spa, mode);
        spa_async_suspend(spa);
 
        return (spa_load(spa, state, SPA_IMPORT_EXISTING, mosconfig));
@@ -2173,9 +2310,6 @@ spa_load_best(spa_t *spa, spa_load_state_t state, int mosconfig,
                rewind_error = spa_load_retry(spa, state, mosconfig);
        }
 
-       if (config)
-               spa_rewind_data_to_nvlist(spa, config);
-
        spa->spa_extreme_rewind = B_FALSE;
        spa->spa_load_max_txg = UINT64_MAX;
 
@@ -2202,6 +2336,7 @@ spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t *nvpolicy,
     nvlist_t **config)
 {
        spa_t *spa;
+       spa_load_state_t state = SPA_LOAD_OPEN;
        int error;
        int locked = B_FALSE;
 
@@ -2225,7 +2360,6 @@ spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t *nvpolicy,
        }
 
        if (spa->spa_state == POOL_STATE_UNINITIALIZED) {
-               spa_load_state_t state = SPA_LOAD_OPEN;
                zpool_rewind_policy_t policy;
 
                zpool_get_rewind_policy(nvpolicy ? nvpolicy : spa->spa_config,
@@ -2264,9 +2398,13 @@ spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t *nvpolicy,
                         * information: the state of each vdev after the
                         * attempted vdev_open().  Return this to the user.
                         */
-                       if (config != NULL && spa->spa_config)
+                       if (config != NULL && spa->spa_config) {
                                VERIFY(nvlist_dup(spa->spa_config, config,
                                    KM_SLEEP) == 0);
+                               VERIFY(nvlist_add_nvlist(*config,
+                                   ZPOOL_CONFIG_LOAD_INFO,
+                                   spa->spa_load_info) == 0);
+                       }
                        spa_unload(spa);
                        spa_deactivate(spa);
                        spa->spa_last_open_failed = error;
@@ -2275,15 +2413,22 @@ spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t *nvpolicy,
                        *spapp = NULL;
                        return (error);
                }
-
        }
 
        spa_open_ref(spa, tag);
 
-
        if (config != NULL)
                *config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);
 
+       /*
+        * If we've recovered the pool, pass back any information we
+        * gathered while doing the load.
+        */
+       if (state == SPA_LOAD_RECOVER) {
+               VERIFY(nvlist_add_nvlist(*config, ZPOOL_CONFIG_LOAD_INFO,
+                   spa->spa_load_info) == 0);
+       }
+
        if (locked) {
                spa->spa_last_open_failed = 0;
                spa->spa_last_ubsync_txg = 0;
@@ -2459,6 +2604,13 @@ spa_get_stats(const char *name, nvlist_t **config, char *altroot, size_t buflen)
                spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
 
                if (*config != NULL) {
+                       uint64_t loadtimes[2];
+
+                       loadtimes[0] = spa->spa_loaded_ts.tv_sec;
+                       loadtimes[1] = spa->spa_loaded_ts.tv_nsec;
+                       VERIFY(nvlist_add_uint64_array(*config,
+                           ZPOOL_CONFIG_LOADED_TIME, loadtimes, 2) == 0);
+
                        VERIFY(nvlist_add_uint64(*config,
                            ZPOOL_CONFIG_ERRCOUNT,
                            spa_get_errlog_size(spa)) == 0);
@@ -3032,7 +3184,7 @@ spa_import_rootpool(char *devpath, char *devid)
 
        spa = spa_add(pname, config, NULL);
        spa->spa_is_root = B_TRUE;
-       spa->spa_load_verbatim = B_TRUE;
+       spa->spa_import_flags = ZFS_IMPORT_VERBATIM;
 
        /*
         * Build up a vdev tree based on the boot device's label config.
@@ -3081,7 +3233,8 @@ spa_import_rootpool(char *devpath, char *devid)
            !bvd->vdev_isspare) {
                cmn_err(CE_NOTE, "The boot device is currently spared. Please "
                    "try booting from '%s'",
-                   bvd->vdev_parent->vdev_child[1]->vdev_path);
+                   bvd->vdev_parent->
+                   vdev_child[bvd->vdev_parent->vdev_children - 1]->vdev_path);
                error = EINVAL;
                goto out;
        }
@@ -3100,49 +3253,18 @@ out:
 
 #endif
 
-/*
- * Take a pool and insert it into the namespace as if it had been loaded at
- * boot.
- */
-int
-spa_import_verbatim(const char *pool, nvlist_t *config, nvlist_t *props)
-{
-       spa_t *spa;
-       char *altroot = NULL;
-
-       mutex_enter(&spa_namespace_lock);
-       if (spa_lookup(pool) != NULL) {
-               mutex_exit(&spa_namespace_lock);
-               return (EEXIST);
-       }
-
-       (void) nvlist_lookup_string(props,
-           zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot);
-       spa = spa_add(pool, config, altroot);
-
-       spa->spa_load_verbatim = B_TRUE;
-
-       if (props != NULL)
-               spa_configfile_set(spa, props, B_FALSE);
-
-       spa_config_sync(spa, B_FALSE, B_TRUE);
-
-       mutex_exit(&spa_namespace_lock);
-       spa_history_log_version(spa, LOG_POOL_IMPORT);
-
-       return (0);
-}
-
 /*
  * Import a non-root pool into the system.
  */
 int
-spa_import(const char *pool, nvlist_t *config, nvlist_t *props)
+spa_import(const char *pool, nvlist_t *config, nvlist_t *props, uint64_t flags)
 {
        spa_t *spa;
        char *altroot = NULL;
        spa_load_state_t state = SPA_LOAD_IMPORT;
        zpool_rewind_policy_t policy;
+       uint64_t mode = spa_mode_global;
+       uint64_t readonly = B_FALSE;
        int error;
        nvlist_t *nvroot;
        nvlist_t **spares, **l2cache;
@@ -3157,23 +3279,45 @@ spa_import(const char *pool, nvlist_t *config, nvlist_t *props)
                return (EEXIST);
        }
 
-       zpool_get_rewind_policy(config, &policy);
-       if (policy.zrp_request & ZPOOL_DO_REWIND)
-               state = SPA_LOAD_RECOVER;
-
        /*
         * Create and initialize the spa structure.
         */
        (void) nvlist_lookup_string(props,
            zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot);
+       (void) nvlist_lookup_uint64(props,
+           zpool_prop_to_name(ZPOOL_PROP_READONLY), &readonly);
+       if (readonly)
+               mode = FREAD;
        spa = spa_add(pool, config, altroot);
-       spa_activate(spa, spa_mode_global);
+       spa->spa_import_flags = flags;
+
+       /*
+        * Verbatim import - Take a pool and insert it into the namespace
+        * as if it had been loaded at boot.
+        */
+       if (spa->spa_import_flags & ZFS_IMPORT_VERBATIM) {
+               if (props != NULL)
+                       spa_configfile_set(spa, props, B_FALSE);
+
+               spa_config_sync(spa, B_FALSE, B_TRUE);
+
+               mutex_exit(&spa_namespace_lock);
+               spa_history_log_version(spa, LOG_POOL_IMPORT);
+
+               return (0);
+       }
+
+       spa_activate(spa, mode);
 
        /*
         * Don't start async tasks until we know everything is healthy.
         */
        spa_async_suspend(spa);
 
+       zpool_get_rewind_policy(config, &policy);
+       if (policy.zrp_request & ZPOOL_DO_REWIND)
+               state = SPA_LOAD_RECOVER;
+
        /*
         * Pass off the heavy lifting to spa_load().  Pass TRUE for mosconfig
         * because the user-supplied config is actually the one to trust when
@@ -3181,14 +3325,16 @@ spa_import(const char *pool, nvlist_t *config, nvlist_t *props)
         */
        if (state != SPA_LOAD_RECOVER)
                spa->spa_last_ubsync_txg = spa->spa_load_txg = 0;
+
        error = spa_load_best(spa, state, B_TRUE, policy.zrp_txg,
            policy.zrp_request);
 
        /*
-        * Propagate anything learned about failing or best txgs
-        * back to caller
+        * Propagate anything learned while loading the pool and pass it
+        * back to the caller (e.g. rewind info, missing devices, etc.).
         */
-       spa_rewind_data_to_nvlist(spa, config);
+       VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_LOAD_INFO,
+           spa->spa_load_info) == 0);
 
        spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
        /*
@@ -3228,6 +3374,8 @@ spa_import(const char *pool, nvlist_t *config, nvlist_t *props)
                return (error);
        }
 
+       spa_async_resume(spa);
+
        /*
         * Override any spares and level 2 cache devices as specified by
         * the user, as these may have correct device names/devids, etc.
@@ -3278,8 +3426,6 @@ spa_import(const char *pool, nvlist_t *config, nvlist_t *props)
                spa_config_update(spa, SPA_CONFIG_UPDATE_POOL);
        }
 
-       spa_async_resume(spa);
-
        /*
         * It's possible that the pool was expanded while it was exported.
         * We kick off an async task to handle this for us.
@@ -3542,6 +3688,8 @@ spa_vdev_add(spa_t *spa, nvlist_t *nvroot)
        nvlist_t **spares, **l2cache;
        uint_t nspares, nl2cache;
 
+       ASSERT(spa_writeable(spa));
+
        txg = spa_vdev_enter(spa);
 
        if ((error = spa_config_parse(spa, &vd, nvroot, NULL, 0,
@@ -3653,6 +3801,8 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing)
        int newvd_isspare;
        int error;
 
+       ASSERT(spa_writeable(spa));
+
        txg = spa_vdev_enter(spa);
 
        oldvd = spa_lookup_by_guid(spa, guid, B_FALSE);
@@ -3702,7 +3852,7 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing)
                 * spares.
                 */
                if (pvd->vdev_ops == &vdev_spare_ops &&
-                   pvd->vdev_child[1] == oldvd &&
+                   oldvd->vdev_isspare &&
                    !spa_has_spare(spa, newvd->vdev_guid))
                        return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
 
@@ -3714,13 +3864,15 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing)
                 * the same (spare replaces spare, non-spare replaces
                 * non-spare).
                 */
-               if (pvd->vdev_ops == &vdev_replacing_ops)
+               if (pvd->vdev_ops == &vdev_replacing_ops &&
+                   spa_version(spa) < SPA_VERSION_MULTI_REPLACE) {
                        return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
-               else if (pvd->vdev_ops == &vdev_spare_ops &&
-                   newvd->vdev_isspare != oldvd->vdev_isspare)
+               } else if (pvd->vdev_ops == &vdev_spare_ops &&
+                   newvd->vdev_isspare != oldvd->vdev_isspare) {
                        return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
-               else if (pvd->vdev_ops != &vdev_spare_ops &&
-                   newvd->vdev_isspare)
+               }
+
+               if (newvd->vdev_isspare)
                        pvops = &vdev_spare_ops;
                else
                        pvops = &vdev_replacing_ops;
@@ -3755,6 +3907,9 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing)
                }
        }
 
+       /* mark the device being resilvered */
+       newvd->vdev_resilvering = B_TRUE;
+
        /*
         * If the parent is not a mirror, or if we're replacing, insert the new
         * mirror/replacing/spare vdev above oldvd.
@@ -3823,6 +3978,9 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing)
        spa_strfree(oldvdpath);
        spa_strfree(newvdpath);
 
+       if (spa->spa_bootfs)
+               spa_event_notify(spa, newvd, ESC_ZFS_BOOTFS_VDEV_ATTACH);
+
        return (0);
 }
 
@@ -3840,9 +3998,10 @@ spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done)
        vdev_t *vd, *pvd, *cvd, *tvd;
        boolean_t unspare = B_FALSE;
        uint64_t unspare_guid;
-       size_t len;
        char *vdpath;
 
+       ASSERT(spa_writeable(spa));
+
        txg = spa_vdev_enter(spa);
 
        vd = spa_lookup_by_guid(spa, guid, B_FALSE);
@@ -3872,18 +4031,11 @@ spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done)
                return (spa_vdev_exit(spa, NULL, txg, EBUSY));
 
        /*
-        * If replace_done is specified, only remove this device if it's
-        * the first child of a replacing vdev.  For the 'spare' vdev, either
-        * disk can be removed.
+        * Only 'replacing' or 'spare' vdevs can be replaced.
         */
-       if (replace_done) {
-               if (pvd->vdev_ops == &vdev_replacing_ops) {
-                       if (vd->vdev_id != 0)
-                               return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
-               } else if (pvd->vdev_ops != &vdev_spare_ops) {
-                       return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
-               }
-       }
+       if (replace_done && pvd->vdev_ops != &vdev_replacing_ops &&
+           pvd->vdev_ops != &vdev_spare_ops)
+               return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
 
        ASSERT(pvd->vdev_ops != &vdev_spare_ops ||
            spa_version(spa) >= SPA_VERSION_SPARES);
@@ -3910,16 +4062,22 @@ spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done)
         * check to see if we changed the original vdev's path to have "/old"
         * at the end in spa_vdev_attach().  If so, undo that change now.
         */
-       if (pvd->vdev_ops == &vdev_replacing_ops && vd->vdev_id == 1 &&
-           pvd->vdev_child[0]->vdev_path != NULL &&
-           pvd->vdev_child[1]->vdev_path != NULL) {
-               ASSERT(pvd->vdev_child[1] == vd);
-               cvd = pvd->vdev_child[0];
-               len = strlen(vd->vdev_path);
-               if (strncmp(cvd->vdev_path, vd->vdev_path, len) == 0 &&
-                   strcmp(cvd->vdev_path + len, "/old") == 0) {
-                       spa_strfree(cvd->vdev_path);
-                       cvd->vdev_path = spa_strdup(vd->vdev_path);
+       if (pvd->vdev_ops == &vdev_replacing_ops && vd->vdev_id > 0 &&
+           vd->vdev_path != NULL) {
+               size_t len = strlen(vd->vdev_path);
+
+               for (int c = 0; c < pvd->vdev_children; c++) {
+                       cvd = pvd->vdev_child[c];
+
+                       if (cvd == vd || cvd->vdev_path == NULL)
+                               continue;
+
+                       if (strncmp(cvd->vdev_path, vd->vdev_path, len) == 0 &&
+                           strcmp(cvd->vdev_path + len, "/old") == 0) {
+                               spa_strfree(cvd->vdev_path);
+                               cvd->vdev_path = spa_strdup(vd->vdev_path);
+                               break;
+                       }
                }
        }
 
@@ -3929,7 +4087,8 @@ spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done)
         * active spare list for the pool.
         */
        if (pvd->vdev_ops == &vdev_spare_ops &&
-           vd->vdev_id == 0 && pvd->vdev_child[1]->vdev_isspare)
+           vd->vdev_id == 0 &&
+           pvd->vdev_child[pvd->vdev_children - 1]->vdev_isspare)
                unspare = B_TRUE;
 
        /*
@@ -3951,7 +4110,7 @@ spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done)
        /*
         * Remember one of the remaining children so we can get tvd below.
         */
-       cvd = pvd->vdev_child[0];
+       cvd = pvd->vdev_child[pvd->vdev_children - 1];
 
        /*
         * If we need to remove the remaining child from the list of hot spares,
@@ -3967,14 +4126,20 @@ spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done)
                spa_spare_remove(cvd);
                unspare_guid = cvd->vdev_guid;
                (void) spa_vdev_remove(spa, unspare_guid, B_TRUE);
+               cvd->vdev_unspare = B_TRUE;
        }
 
        /*
         * If the parent mirror/replacing vdev only has one child,
         * the parent is no longer needed.  Remove it from the tree.
         */
-       if (pvd->vdev_children == 1)
+       if (pvd->vdev_children == 1) {
+               if (pvd->vdev_ops == &vdev_spare_ops)
+                       cvd->vdev_unspare = B_FALSE;
                vdev_remove_parent(cvd);
+               cvd->vdev_resilvering = B_FALSE;
+       }
+
 
        /*
         * We don't set tvd until now because the parent we just removed
@@ -4016,6 +4181,9 @@ spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done)
 
        spa_event_notify(spa, vd, ESC_ZFS_VDEV_REMOVE);
 
+       /* hang on to the spa before we release the lock */
+       spa_open_ref(spa, FTAG);
+
        error = spa_vdev_exit(spa, vd, txg, 0);
 
        spa_history_log_internal(LOG_POOL_VDEV_DETACH, spa, NULL,
@@ -4028,24 +4196,31 @@ spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done)
         * list of every other pool.
         */
        if (unspare) {
-               spa_t *myspa = spa;
-               spa = NULL;
+               spa_t *altspa = NULL;
+
                mutex_enter(&spa_namespace_lock);
-               while ((spa = spa_next(spa)) != NULL) {
-                       if (spa->spa_state != POOL_STATE_ACTIVE)
-                               continue;
-                       if (spa == myspa)
+               while ((altspa = spa_next(altspa)) != NULL) {
+                       if (altspa->spa_state != POOL_STATE_ACTIVE ||
+                           altspa == spa)
                                continue;
-                       spa_open_ref(spa, FTAG);
+
+                       spa_open_ref(altspa, FTAG);
                        mutex_exit(&spa_namespace_lock);
-                       (void) spa_vdev_remove(spa, unspare_guid,
-                           B_TRUE);
+                       (void) spa_vdev_remove(altspa, unspare_guid, B_TRUE);
                        mutex_enter(&spa_namespace_lock);
-                       spa_close(spa, FTAG);
+                       spa_close(altspa, FTAG);
                }
                mutex_exit(&spa_namespace_lock);
+
+               /* search the rest of the vdevs for spares to remove */
+               spa_vdev_resilver_done(spa);
        }
 
+       /* all done with the spa; OK to release */
+       mutex_enter(&spa_namespace_lock);
+       spa_close(spa, FTAG);
+       mutex_exit(&spa_namespace_lock);
+
        return (error);
 }
 
@@ -4066,8 +4241,7 @@ spa_vdev_split_mirror(spa_t *spa, char *newname, nvlist_t *config,
        vdev_t *rvd, **vml = NULL;                      /* vdev modify list */
        boolean_t activate_slog;
 
-       if (!spa_writeable(spa))
-               return (EROFS);
+       ASSERT(spa_writeable(spa));
 
        txg = spa_vdev_enter(spa);
 
@@ -4484,6 +4658,8 @@ spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare)
        int error = 0;
        boolean_t locked = MUTEX_HELD(&spa_namespace_lock);
 
+       ASSERT(spa_writeable(spa));
+
        if (!locked)
                txg = spa_vdev_enter(spa);
 
@@ -4593,11 +4769,18 @@ spa_vdev_resilver_done_hunt(vdev_t *vd)
        }
 
        /*
-        * Check for a completed replacement.
+        * Check for a completed replacement.  We always consider the first
+        * vdev in the list to be the oldest vdev, and the last one to be
+        * the newest (see spa_vdev_attach() for how that works).  In
+        * the case where the newest vdev is faulted, we will not automatically
+        * remove it after a resilver completes.  This is OK as it will require
+        * user intervention to determine which disk the admin wishes to keep.
         */
-       if (vd->vdev_ops == &vdev_replacing_ops && vd->vdev_children == 2) {
+       if (vd->vdev_ops == &vdev_replacing_ops) {
+               ASSERT(vd->vdev_children > 1);
+
+               newvd = vd->vdev_child[vd->vdev_children - 1];
                oldvd = vd->vdev_child[0];
-               newvd = vd->vdev_child[1];
 
                if (vdev_dtl_empty(newvd, DTL_MISSING) &&
                    vdev_dtl_empty(newvd, DTL_OUTAGE) &&
@@ -4608,16 +4791,41 @@ spa_vdev_resilver_done_hunt(vdev_t *vd)
        /*
         * Check for a completed resilver with the 'unspare' flag set.
         */
-       if (vd->vdev_ops == &vdev_spare_ops && vd->vdev_children == 2) {
-               newvd = vd->vdev_child[0];
-               oldvd = vd->vdev_child[1];
+       if (vd->vdev_ops == &vdev_spare_ops) {
+               vdev_t *first = vd->vdev_child[0];
+               vdev_t *last = vd->vdev_child[vd->vdev_children - 1];
+
+               if (last->vdev_unspare) {
+                       oldvd = first;
+                       newvd = last;
+               } else if (first->vdev_unspare) {
+                       oldvd = last;
+                       newvd = first;
+               } else {
+                       oldvd = NULL;
+               }
 
-               if (newvd->vdev_unspare &&
+               if (oldvd != NULL &&
                    vdev_dtl_empty(newvd, DTL_MISSING) &&
                    vdev_dtl_empty(newvd, DTL_OUTAGE) &&
-                   !vdev_dtl_required(oldvd)) {
-                       newvd->vdev_unspare = 0;
+                   !vdev_dtl_required(oldvd))
                        return (oldvd);
+
+               /*
+                * If there are more than two spares attached to a disk,
+                * and those spares are not required, then we want to
+                * attempt to free them up now so that they can be used
+                * by other pools.  Once we're back down to a single
+                * disk+spare, we stop removing them.
+                */
+               if (vd->vdev_children > 2) {
+                       newvd = vd->vdev_child[1];
+
+                       if (newvd->vdev_isspare && last->vdev_isspare &&
+                           vdev_dtl_empty(last, DTL_MISSING) &&
+                           vdev_dtl_empty(last, DTL_OUTAGE) &&
+                           !vdev_dtl_required(newvd))
+                               return (newvd);
                }
        }
 
@@ -4644,9 +4852,9 @@ spa_vdev_resilver_done(spa_t *spa)
                 * we need to detach the parent's first child (the original hot
                 * spare) as well.
                 */
-               if (ppvd->vdev_ops == &vdev_spare_ops && pvd->vdev_id == 0) {
+               if (ppvd->vdev_ops == &vdev_spare_ops && pvd->vdev_id == 0 &&
+                   ppvd->vdev_children == 2) {
                        ASSERT(pvd->vdev_ops == &vdev_replacing_ops);
-                       ASSERT(ppvd->vdev_children == 2);
                        sguid = ppvd->vdev_child[1]->vdev_guid;
                }
                spa_config_exit(spa, SCL_ALL, FTAG);
@@ -4670,6 +4878,8 @@ spa_vdev_set_common(spa_t *spa, uint64_t guid, const char *value,
        vdev_t *vd;
        boolean_t sync = B_FALSE;
 
+       ASSERT(spa_writeable(spa));
+
        spa_vdev_state_enter(spa, SCL_ALL);
 
        if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL)
@@ -5115,9 +5325,11 @@ spa_sync_props(void *arg1, void *arg2, dmu_tx_t *tx)
                        ASSERT(spa->spa_root != NULL);
                        break;
 
+               case ZPOOL_PROP_READONLY:
                case ZPOOL_PROP_CACHEFILE:
                        /*
-                        * 'cachefile' is also a non-persisitent property.
+                        * 'readonly' and 'cachefile' are also non-persistent
+                        * properties.
                         */
                        break;
                default:
@@ -5249,6 +5461,8 @@ spa_sync(spa_t *spa, uint64_t txg)
        dmu_tx_t *tx;
        int error;
 
+       VERIFY(spa_writeable(spa));
+
        /*
         * Lock out configuration changes.
         */
@@ -5467,7 +5681,8 @@ spa_sync_allpools(void)
        spa_t *spa = NULL;
        mutex_enter(&spa_namespace_lock);
        while ((spa = spa_next(spa)) != NULL) {
-               if (spa_state(spa) != POOL_STATE_ACTIVE || spa_suspended(spa))
+               if (spa_state(spa) != POOL_STATE_ACTIVE ||
+                   !spa_writeable(spa) || spa_suspended(spa))
                        continue;
                spa_open_ref(spa, FTAG);
                mutex_exit(&spa_namespace_lock);
@@ -5547,6 +5762,8 @@ spa_lookup_by_guid(spa_t *spa, uint64_t guid, boolean_t aux)
 void
 spa_upgrade(spa_t *spa, uint64_t version)
 {
+       ASSERT(spa_writeable(spa));
+
        spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
 
        /*
index cdeda3f93cbaa0efde515765f53b42261f44bbd0..69d57f66dbb6c89154c35c94833a20ccede273c5 100644 (file)
@@ -304,24 +304,6 @@ spa_config_set(spa_t *spa, nvlist_t *config)
        mutex_exit(&spa->spa_props_lock);
 }
 
-/* Add discovered rewind info, if any to the provided nvlist */
-void
-spa_rewind_data_to_nvlist(spa_t *spa, nvlist_t *tonvl)
-{
-       int64_t loss = 0;
-
-       if (tonvl == NULL || spa->spa_load_txg == 0)
-               return;
-
-       VERIFY(nvlist_add_uint64(tonvl, ZPOOL_CONFIG_LOAD_TIME,
-           spa->spa_load_txg_ts) == 0);
-       if (spa->spa_last_ubsync_txg)
-               loss = spa->spa_last_ubsync_txg_ts - spa->spa_load_txg_ts;
-       VERIFY(nvlist_add_int64(tonvl, ZPOOL_CONFIG_REWIND_TIME, loss) == 0);
-       VERIFY(nvlist_add_uint64(tonvl, ZPOOL_CONFIG_LOAD_DATA_ERRORS,
-           spa->spa_load_data_errors) == 0);
-}
-
 /*
  * Generate the pool's configuration based on the current in-core state.
  * We infer whether to generate a complete config or just one top-level config
@@ -403,8 +385,7 @@ spa_config_generate(spa_t *spa, vdev_t *vd, uint64_t txg, int getstats)
 
        /*
         * Add the top-level config.  We even add this on pools which
-        * don't support holes in the namespace as older pools will
-        * just ignore it.
+        * don't support holes in the namespace.
         */
        vdev_top_config_generate(spa, config);
 
@@ -449,8 +430,6 @@ spa_config_generate(spa_t *spa, vdev_t *vd, uint64_t txg, int getstats)
                kmem_free(dds, sizeof (ddt_stat_t));
        }
 
-       spa_rewind_data_to_nvlist(spa, config);
-
        if (locked)
                spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
 
index 52af7fcb71210c7ec4e990ce7409e02e0eb4968c..1b54afb0be5e9623d01652ad8e2fcd246438c92c 100644 (file)
@@ -478,6 +478,9 @@ spa_add(const char *name, nvlist_t *config, const char *altroot)
        dp->scd_path = altroot ? NULL : spa_strdup(spa_config_path);
        list_insert_head(&spa->spa_config_list, dp);
 
+       VERIFY(nvlist_alloc(&spa->spa_load_info, NV_UNIQUE_NAME,
+           KM_SLEEP) == 0);
+
        if (config != NULL)
                VERIFY(nvlist_dup(config, &spa->spa_config, 0) == 0);
 
@@ -516,6 +519,7 @@ spa_remove(spa_t *spa)
 
        list_destroy(&spa->spa_config_list);
 
+       nvlist_free(spa->spa_load_info);
        spa_config_set(spa, NULL);
 
        refcount_destroy(&spa->spa_refcount);
@@ -886,10 +890,6 @@ spa_vdev_config_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error, char *tag)
         */
        vdev_dtl_reassess(spa->spa_root_vdev, 0, 0, B_FALSE);
 
-       /*
-        * If the config changed, notify the scrub that it must restart.
-        * This will initiate a resilver if needed.
-        */
        if (error == 0 && !list_is_empty(&spa->spa_config_dirty_list)) {
                config_changed = B_TRUE;
                spa->spa_config_generation++;
@@ -1078,12 +1078,12 @@ spa_rename(const char *name, const char *newname)
 }
 
 /*
- * Determine whether a pool with given pool_guid exists.  If device_guid is
- * non-zero, determine whether the pool exists *and* contains a device with the
- * specified device_guid.
+ * Return the spa_t associated with given pool_guid, if it exists.  If
+ * device_guid is non-zero, determine whether the pool exists *and* contains
+ * a device with the specified device_guid.
  */
-boolean_t
-spa_guid_exists(uint64_t pool_guid, uint64_t device_guid)
+spa_t *
+spa_by_guid(uint64_t pool_guid, uint64_t device_guid)
 {
        spa_t *spa;
        avl_tree_t *t = &spa_namespace_avl;
@@ -1114,7 +1114,16 @@ spa_guid_exists(uint64_t pool_guid, uint64_t device_guid)
                }
        }
 
-       return (spa != NULL);
+       return (spa);
+}
+
+/*
+ * Determine whether a pool with the given pool_guid exists.
+ */
+boolean_t
+spa_guid_exists(uint64_t pool_guid, uint64_t device_guid)
+{
+       return (spa_by_guid(pool_guid, device_guid) != NULL);
 }
 
 char *
index f478ad0c67a8a30a04cf57ecb016663a39c51d8d..9b308ca4e71a1e789fc8193d7e690dd489e3fed5 100644 (file)
@@ -37,7 +37,7 @@
 static void txg_sync_thread(dsl_pool_t *dp);
 static void txg_quiesce_thread(dsl_pool_t *dp);
 
-int zfs_txg_timeout = 30;      /* max seconds worth of delta per txg */
+int zfs_txg_timeout = 5;       /* max seconds worth of delta per txg */
 
 /*
  * Prepare the txg subsystem.
index a61f29b8e78a6deb58690a813aeed9ccee715ad1..bac3e86054d6f8e64f0fed061725a621236c8535 100644 (file)
@@ -207,9 +207,6 @@ vdev_add_child(vdev_t *pvd, vdev_t *cvd)
         */
        for (; pvd != NULL; pvd = pvd->vdev_parent)
                pvd->vdev_guid_sum += cvd->vdev_guid_sum;
-
-       if (cvd->vdev_ops->vdev_op_leaf)
-               cvd->vdev_spa->spa_scrub_maxinflight += zfs_scrub_limit;
 }
 
 void
@@ -244,9 +241,6 @@ vdev_remove_child(vdev_t *pvd, vdev_t *cvd)
         */
        for (; pvd != NULL; pvd = pvd->vdev_parent)
                pvd->vdev_guid_sum -= cvd->vdev_guid_sum;
-
-       if (cvd->vdev_ops->vdev_op_leaf)
-               cvd->vdev_spa->spa_scrub_maxinflight -= zfs_scrub_limit;
 }
 
 /*
@@ -524,6 +518,9 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id,
                (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_OFFLINE,
                    &vd->vdev_offline);
 
+               (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_RESILVERING,
+                   &vd->vdev_resilvering);
+
                /*
                 * When importing a pool, we want to ignore the persistent fault
                 * state, as the diagnosis made on another system may not be
@@ -1375,10 +1372,10 @@ vdev_validate(vdev_t *vd)
                nvlist_free(label);
 
                /*
-                * If spa->spa_load_verbatim is true, no need to check the
+                * If this is a verbatim import, no need to check the
                 * state of the pool.
                 */
-               if (!spa->spa_load_verbatim &&
+               if (!(spa->spa_import_flags & ZFS_IMPORT_VERBATIM) &&
                    spa_load_state(spa) == SPA_LOAD_OPEN &&
                    state != POOL_STATE_ACTIVE)
                        return (EBADF);
@@ -1544,6 +1541,7 @@ vdev_dirty(vdev_t *vd, int flags, void *arg, uint64_t txg)
        ASSERT(vd == vd->vdev_top);
        ASSERT(!vd->vdev_ishole);
        ASSERT(ISP2(flags));
+       ASSERT(spa_writeable(vd->vdev_spa));
 
        if (flags & VDD_METASLAB)
                (void) txg_list_add(&vd->vdev_ms_list, arg, txg);
@@ -1599,6 +1597,7 @@ vdev_dtl_dirty(vdev_t *vd, vdev_dtl_type_t t, uint64_t txg, uint64_t size)
 
        ASSERT(t < DTL_TYPES);
        ASSERT(vd != vd->vdev_spa->spa_root_vdev);
+       ASSERT(spa_writeable(vd->vdev_spa));
 
        mutex_enter(sm->sm_lock);
        if (!space_map_contains(sm, txg, size))
@@ -1855,6 +1854,9 @@ vdev_dtl_required(vdev_t *vd)
        vd->vdev_cant_read = cant_read;
        vdev_dtl_reassess(tvd, 0, 0, B_FALSE);
 
+       if (!required && zio_injection_enabled)
+               required = !!zio_handle_device_injection(vd, NULL, ECHILD);
+
        return (required);
 }
 
@@ -2070,7 +2072,7 @@ vdev_psize_to_asize(vdev_t *vd, uint64_t psize)
 int
 vdev_fault(spa_t *spa, uint64_t guid, vdev_aux_t aux)
 {
-       vdev_t *vd;
+       vdev_t *vd, *tvd;
 
        spa_vdev_state_enter(spa, SCL_NONE);
 
@@ -2080,6 +2082,8 @@ vdev_fault(spa_t *spa, uint64_t guid, vdev_aux_t aux)
        if (!vd->vdev_ops->vdev_op_leaf)
                return (spa_vdev_state_exit(spa, NULL, ENOTSUP));
 
+       tvd = vd->vdev_top;
+
        /*
         * We don't directly use the aux state here, but if we do a
         * vdev_reopen(), we need this value to be present to remember why we
@@ -2099,7 +2103,7 @@ vdev_fault(spa_t *spa, uint64_t guid, vdev_aux_t aux)
         * If this device has the only valid copy of the data, then
         * back off and simply mark the vdev as degraded instead.
         */
-       if (!vd->vdev_islog && vd->vdev_aux == NULL && vdev_dtl_required(vd)) {
+       if (!tvd->vdev_islog && vd->vdev_aux == NULL && vdev_dtl_required(vd)) {
                vd->vdev_degraded = 1ULL;
                vd->vdev_faulted = 0ULL;
 
@@ -2107,7 +2111,7 @@ vdev_fault(spa_t *spa, uint64_t guid, vdev_aux_t aux)
                 * If we reopen the device and it's not dead, only then do we
                 * mark it degraded.
                 */
-               vdev_reopen(vd);
+               vdev_reopen(tvd);
 
                if (vdev_readable(vd))
                        vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, aux);
@@ -2349,15 +2353,15 @@ vdev_clear(spa_t *spa, vdev_t *vd)
                 */
                vd->vdev_forcefault = B_TRUE;
 
-               vd->vdev_faulted = vd->vdev_degraded = 0;
+               vd->vdev_faulted = vd->vdev_degraded = 0ULL;
                vd->vdev_cant_read = B_FALSE;
                vd->vdev_cant_write = B_FALSE;
 
-               vdev_reopen(vd);
+               vdev_reopen(vd == rvd ? rvd : vd->vdev_top);
 
                vd->vdev_forcefault = B_FALSE;
 
-               if (vd != rvd)
+               if (vd != rvd && vdev_writeable(vd->vdev_top))
                        vdev_state_dirty(vd->vdev_top);
 
                if (vd->vdev_aux == NULL && !vdev_is_dead(vd))
@@ -2541,7 +2545,7 @@ vdev_stat_update(zio_t *zio, uint64_t psize)
                mutex_enter(&vd->vdev_stat_lock);
 
                if (flags & ZIO_FLAG_IO_REPAIR) {
-                       if (flags & ZIO_FLAG_SCRUB_THREAD) {
+                       if (flags & ZIO_FLAG_SCAN_THREAD) {
                                dsl_scan_phys_t *scn_phys =
                                    &spa->spa_dsl_pool->dp_scan->scn_phys;
                                uint64_t *processed = &scn_phys->scn_processed;
@@ -2597,7 +2601,7 @@ vdev_stat_update(zio_t *zio, uint64_t psize)
 
        if (type == ZIO_TYPE_WRITE && txg != 0 &&
            (!(flags & ZIO_FLAG_IO_REPAIR) ||
-           (flags & ZIO_FLAG_SCRUB_THREAD) ||
+           (flags & ZIO_FLAG_SCAN_THREAD) ||
            spa->spa_claiming)) {
                /*
                 * This is either a normal write (not a repair), or it's
@@ -2616,7 +2620,7 @@ vdev_stat_update(zio_t *zio, uint64_t psize)
                 */
                if (vd->vdev_ops->vdev_op_leaf) {
                        uint64_t commit_txg = txg;
-                       if (flags & ZIO_FLAG_SCRUB_THREAD) {
+                       if (flags & ZIO_FLAG_SCAN_THREAD) {
                                ASSERT(flags & ZIO_FLAG_IO_REPAIR);
                                ASSERT(spa_sync_pass(spa) == 1);
                                vdev_dtl_dirty(vd, DTL_SCRUB, txg, 1);
@@ -2699,6 +2703,8 @@ vdev_config_dirty(vdev_t *vd)
        vdev_t *rvd = spa->spa_root_vdev;
        int c;
 
+       ASSERT(spa_writeable(spa));
+
        /*
         * If this is an aux vdev (as with l2cache and spare devices), then we
         * update the vdev config manually and set the sync flag.
@@ -2787,6 +2793,7 @@ vdev_state_dirty(vdev_t *vd)
 {
        spa_t *spa = vd->vdev_spa;
 
+       ASSERT(spa_writeable(spa));
        ASSERT(vd == vd->vdev_top);
 
        /*
@@ -2944,12 +2951,13 @@ vdev_set_state(vdev_t *vd, boolean_t isopen, vdev_state_t state, vdev_aux_t aux)
                vd->vdev_removed = B_TRUE;
        } else if (state == VDEV_STATE_CANT_OPEN) {
                /*
-                * If we fail to open a vdev during an import, we mark it as
-                * "not available", which signifies that it was never there to
-                * begin with.  Failure to open such a device is not considered
-                * an error.
+                * If we fail to open a vdev during an import or recovery, we
+                * mark it as "not available", which signifies that it was
+                * never there to begin with.  Failure to open such a device
+                * is not considered an error.
                 */
-               if (spa_load_state(spa) == SPA_LOAD_IMPORT &&
+               if ((spa_load_state(spa) == SPA_LOAD_IMPORT ||
+                   spa_load_state(spa) == SPA_LOAD_RECOVER) &&
                    vd->vdev_ops->vdev_op_leaf)
                        vd->vdev_not_present = 1;
 
@@ -3042,31 +3050,51 @@ vdev_is_bootable(vdev_t *vd)
 /*
  * Load the state from the original vdev tree (ovd) which
  * we've retrieved from the MOS config object. If the original
- * vdev was offline then we transfer that state to the device
- * in the current vdev tree (nvd).
+ * vdev was offline or faulted then we transfer that state to the
+ * device in the current vdev tree (nvd).
  */
 void
 vdev_load_log_state(vdev_t *nvd, vdev_t *ovd)
 {
        spa_t *spa = nvd->vdev_spa;
 
+       ASSERT(nvd->vdev_top->vdev_islog);
        ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);
        ASSERT3U(nvd->vdev_guid, ==, ovd->vdev_guid);
 
        for (int c = 0; c < nvd->vdev_children; c++)
                vdev_load_log_state(nvd->vdev_child[c], ovd->vdev_child[c]);
 
-       if (nvd->vdev_ops->vdev_op_leaf && ovd->vdev_offline) {
+       if (nvd->vdev_ops->vdev_op_leaf) {
                /*
-                * It would be nice to call vdev_offline()
-                * directly but the pool isn't fully loaded and
-                * the txg threads have not been started yet.
+                * Restore the persistent vdev state
                 */
                nvd->vdev_offline = ovd->vdev_offline;
-               vdev_reopen(nvd->vdev_top);
+               nvd->vdev_faulted = ovd->vdev_faulted;
+               nvd->vdev_degraded = ovd->vdev_degraded;
+               nvd->vdev_removed = ovd->vdev_removed;
        }
 }
 
+/*
+ * Determine if a log device has valid content.  If the vdev was
+ * removed or faulted in the MOS config then we know that
+ * the content on the log device has already been written to the pool.
+ */
+boolean_t
+vdev_log_state_valid(vdev_t *vd)
+{
+       if (vd->vdev_ops->vdev_op_leaf && !vd->vdev_faulted &&
+           !vd->vdev_removed)
+               return (B_TRUE);
+
+       for (int c = 0; c < vd->vdev_children; c++)
+               if (vdev_log_state_valid(vd->vdev_child[c]))
+                       return (B_TRUE);
+
+       return (B_FALSE);
+}
+
 /*
  * Expand a vdev if possible.
  */
index 75ec545345fb706efa70942b1dae0a946a3a9f0c..c08ed8ba0467eb74e48da5118bf488955ba69e46 100644 (file)
@@ -353,6 +353,9 @@ vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats,
                if (vd->vdev_offline && !vd->vdev_tmpoffline)
                        VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_OFFLINE,
                            B_TRUE) == 0);
+               if (vd->vdev_resilvering)
+                       VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_RESILVERING,
+                           B_TRUE) == 0);
                if (vd->vdev_faulted)
                        VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_FAULTED,
                            B_TRUE) == 0);
@@ -570,6 +573,15 @@ vdev_inuse(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason,
        if (spa_l2cache_exists(device_guid, NULL))
                return (B_TRUE);
 
+       /*
+        * We can't rely on a pool's state if it's been imported
+        * read-only.  Instead we look to see if the pool is marked
+        * read-only in the namespace and set the state to active.
+        */
+       if ((spa = spa_by_guid(pool_guid, device_guid)) != NULL &&
+           spa_mode(spa) == FREAD)
+               state = POOL_STATE_ACTIVE;
+
        /*
         * If the device is marked ACTIVE, then this device is in use by another
         * pool on the system.
index 1181bd4433ca4694bd64f38c42a4d61937bde986..843b5ff06ef47a80cd44fe4de6dc85e225cf2e9d 100644 (file)
@@ -327,19 +327,35 @@ static acl_ops_t zfs_acl_fuid_ops = {
  * an external ACL and what version of ACL previously existed on the
  * file.  Would really be nice to not need this, sigh.
  */
-
 uint64_t
 zfs_external_acl(znode_t *zp)
 {
        zfs_acl_phys_t acl_phys;
+       int error;
 
        if (zp->z_is_sa)
                return (0);
 
-       VERIFY(0 == sa_lookup(zp->z_sa_hdl, SA_ZPL_ZNODE_ACL(zp->z_zfsvfs),
-           &acl_phys, sizeof (acl_phys)));
+       /*
+        * Need to deal with a potential
+        * race where zfs_sa_upgrade could cause
+        * z_is_sa to change.
+        *
+        * If the lookup fails then the state of z_is_sa should have
+        * changed.
+        */
 
-       return (acl_phys.z_acl_extern_obj);
+       if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_ZNODE_ACL(zp->z_zfsvfs),
+           &acl_phys, sizeof (acl_phys))) == 0)
+               return (acl_phys.z_acl_extern_obj);
+       else {
+               /*
+                * after upgrade the SA_ZPL_ZNODE_ACL should have been
+                * removed
+                */
+               VERIFY(zp->z_is_sa && error == ENOENT);
+               return (0);
+       }
 }
 
 /*
@@ -357,6 +373,7 @@ zfs_acl_znode_info(znode_t *zp, int *aclsize, int *aclcount,
        int size;
        int error;
 
+       ASSERT(MUTEX_HELD(&zp->z_acl_lock));
        if (zp->z_is_sa) {
                if ((error = sa_size(zp->z_sa_hdl, SA_ZPL_DACL_ACES(zfsvfs),
                    &size)) != 0)
@@ -387,13 +404,31 @@ zfs_znode_acl_version(znode_t *zp)
 {
        zfs_acl_phys_t acl_phys;
 
-       if (zp->z_is_sa) {
+       if (zp->z_is_sa)
                return (ZFS_ACL_VERSION_FUID);
-       } else {
-               VERIFY(0 == sa_lookup(zp->z_sa_hdl,
+       else {
+               int error;
+
+               /*
+                * Need to deal with a potential
+                * race where zfs_sa_upgrade could cause
+                * z_is_sa to change.
+                *
+                * If the lookup fails then the state of z_is_sa should have
+                * changed.
+                */
+               if ((error = sa_lookup(zp->z_sa_hdl,
                    SA_ZPL_ZNODE_ACL(zp->z_zfsvfs),
-                   &acl_phys, sizeof (acl_phys)));
-               return (acl_phys.z_acl_version);
+                   &acl_phys, sizeof (acl_phys))) == 0)
+                       return (acl_phys.z_acl_version);
+               else {
+                       /*
+                        * After upgrade SA_ZPL_ZNODE_ACL should have
+                        * been removed.
+                        */
+                       VERIFY(zp->z_is_sa && error == ENOENT);
+                       return (ZFS_ACL_VERSION_FUID);
+               }
        }
 }
 
@@ -1024,7 +1059,8 @@ zfs_mode_compute(uint64_t fmode, zfs_acl_t *aclp,
  * create a new acl and leave any cached acl in place.
  */
 static int
-zfs_acl_node_read(znode_t *zp, zfs_acl_t **aclpp, boolean_t will_modify)
+zfs_acl_node_read(znode_t *zp, boolean_t have_lock, zfs_acl_t **aclpp,
+    boolean_t will_modify)
 {
        zfs_acl_t       *aclp;
        int             aclsize;
@@ -1033,6 +1069,7 @@ zfs_acl_node_read(znode_t *zp, zfs_acl_t **aclpp, boolean_t will_modify)
        zfs_acl_phys_t  znode_acl;
        int             version;
        int             error;
+       boolean_t       drop_lock = B_FALSE;
 
        ASSERT(MUTEX_HELD(&zp->z_acl_lock));
 
@@ -1041,11 +1078,23 @@ zfs_acl_node_read(znode_t *zp, zfs_acl_t **aclpp, boolean_t will_modify)
                return (0);
        }
 
-       version = ZNODE_ACL_VERSION(zp);
+       /*
+        * close race where znode could be upgraded while trying to
+        * read the znode attributes.
+        *
+        * But this could only happen if the file isn't already an SA
+        * znode
+        */
+       if (!zp->z_is_sa && !have_lock) {
+               mutex_enter(&zp->z_lock);
+               drop_lock = B_TRUE;
+       }
+       version = zfs_znode_acl_version(zp);
 
        if ((error = zfs_acl_znode_info(zp, &aclsize,
-           &acl_count, &znode_acl)) != 0)
-               return (error);
+           &acl_count, &znode_acl)) != 0) {
+               goto done;
+       }
 
        aclp = zfs_acl_alloc(version);
 
@@ -1076,7 +1125,7 @@ zfs_acl_node_read(znode_t *zp, zfs_acl_t **aclpp, boolean_t will_modify)
                /* convert checksum errors into IO errors */
                if (error == ECKSUM)
                        error = EIO;
-               return (error);
+               goto done;
        }
 
        list_insert_head(&aclp->z_acl, aclnode);
@@ -1084,7 +1133,10 @@ zfs_acl_node_read(znode_t *zp, zfs_acl_t **aclpp, boolean_t will_modify)
        *aclpp = aclp;
        if (!will_modify)
                zp->z_acl_cached = aclp;
-       return (0);
+done:
+       if (drop_lock)
+               mutex_exit(&zp->z_lock);
+       return (error);
 }
 
 /*ARGSUSED*/
@@ -1104,44 +1156,18 @@ zfs_acl_data_locator(void **dataptr, uint32_t *length, uint32_t buflen,
        *length = cb->cb_acl_node->z_size;
 }
 
-
-static int
-zfs_acl_get_owner_fuids(znode_t *zp, uint64_t *fuid, uint64_t *fgid)
-{
-       int count = 0;
-       sa_bulk_attr_t  bulk[2];
-       int error;
-
-       if (IS_EPHEMERAL(zp->z_uid) || IS_EPHEMERAL(zp->z_gid)) {
-               SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zp->z_zfsvfs), NULL,
-                   &fuid, sizeof (fuid));
-               SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zp->z_zfsvfs), NULL,
-                   &fgid, sizeof (fuid));
-               if ((error = sa_bulk_lookup(zp->z_sa_hdl, bulk, count)) != 0) {
-                       return (error);
-               }
-       } else {
-               *fuid = zp->z_uid;
-               *fgid = zp->z_gid;
-       }
-       return (0);
-}
-
 int
 zfs_acl_chown_setattr(znode_t *zp)
 {
        int error;
        zfs_acl_t *aclp;
-       uint64_t fuid, fgid;
 
-       if ((error = zfs_acl_get_owner_fuids(zp, &fuid, &fgid)) != 0)
-               return (error);
+       ASSERT(MUTEX_HELD(&zp->z_lock));
+       ASSERT(MUTEX_HELD(&zp->z_acl_lock));
 
-       mutex_enter(&zp->z_acl_lock);
-       if ((error = zfs_acl_node_read(zp, &aclp, B_FALSE)) == 0)
+       if ((error = zfs_acl_node_read(zp, B_TRUE, &aclp, B_FALSE)) == 0)
                zp->z_mode = zfs_mode_compute(zp->z_mode, aclp,
-                   &zp->z_pflags, fuid, fgid);
-       mutex_exit(&zp->z_acl_lock);
+                   &zp->z_pflags, zp->z_uid, zp->z_gid);
        return (error);
 }
 
@@ -1163,14 +1189,11 @@ zfs_aclset_common(znode_t *zp, zfs_acl_t *aclp, cred_t *cr, dmu_tx_t *tx)
        sa_bulk_attr_t          bulk[5];
        uint64_t                ctime[2];
        int                     count = 0;
-       uint64_t                fuid, fgid;
 
        mode = zp->z_mode;
 
-       if ((error = zfs_acl_get_owner_fuids(zp, &fuid, &fgid)) != 0)
-               return (error);
-
-       mode = zfs_mode_compute(mode, aclp, &zp->z_pflags, fuid, fgid);
+       mode = zfs_mode_compute(mode, aclp, &zp->z_pflags,
+           zp->z_uid, zp->z_gid);
 
        zp->z_mode = mode;
        SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL,
@@ -1482,18 +1505,17 @@ zfs_acl_chmod(zfsvfs_t *zfsvfs, uint64_t mode, zfs_acl_t *aclp)
        list_insert_tail(&aclp->z_acl, newnode);
 }
 
-int
+void
 zfs_acl_chmod_setattr(znode_t *zp, zfs_acl_t **aclp, uint64_t mode)
 {
-       mutex_enter(&zp->z_lock);
        mutex_enter(&zp->z_acl_lock);
+       mutex_enter(&zp->z_lock);
        *aclp = zfs_acl_alloc(zfs_acl_version_zp(zp));
        (*aclp)->z_hints = zp->z_pflags & V4_ACL_WIDE_FLAGS;
        zfs_acl_chmod(zp->z_zfsvfs, mode, *aclp);
-       mutex_exit(&zp->z_acl_lock);
        mutex_exit(&zp->z_lock);
+       mutex_exit(&zp->z_acl_lock);
        ASSERT(*aclp);
-       return (0);
 }
 
 /*
@@ -1660,7 +1682,6 @@ zfs_acl_ids_create(znode_t *dzp, int flag, vattr_t *vap, cred_t *cr,
        gid_t           gid;
        boolean_t       need_chmod = B_TRUE;
        boolean_t       inherited = B_FALSE;
-       uint64_t        parentgid;
 
        bzero(acl_ids, sizeof (zfs_acl_ids_t));
        acl_ids->z_mode = MAKEIMODE(vap->va_type, vap->va_mode);
@@ -1682,12 +1703,6 @@ zfs_acl_ids_create(znode_t *dzp, int flag, vattr_t *vap, cred_t *cr,
                    ZFS_GROUP, &acl_ids->z_fuidp);
                gid = vap->va_gid;
        } else {
-               if (IS_EPHEMERAL(dzp->z_gid))
-                       VERIFY(0 == sa_lookup(dzp->z_sa_hdl, SA_ZPL_GID(zfsvfs),
-                           &parentgid, sizeof (parentgid)));
-               else
-                       parentgid = (uint64_t)dzp->z_gid;
-
                acl_ids->z_fuid = zfs_fuid_create_cred(zfsvfs, ZFS_OWNER,
                    cr, &acl_ids->z_fuidp);
                acl_ids->z_fgid = 0;
@@ -1696,7 +1711,7 @@ zfs_acl_ids_create(znode_t *dzp, int flag, vattr_t *vap, cred_t *cr,
                            (uint64_t)vap->va_gid,
                            cr, ZFS_GROUP, &acl_ids->z_fuidp);
                        gid = vap->va_gid;
-                       if (acl_ids->z_fgid != parentgid &&
+                       if (acl_ids->z_fgid != dzp->z_gid &&
                            !groupmember(vap->va_gid, cr) &&
                            secpolicy_vnode_create_gid(cr) != 0)
                                acl_ids->z_fgid = 0;
@@ -1706,7 +1721,7 @@ zfs_acl_ids_create(znode_t *dzp, int flag, vattr_t *vap, cred_t *cr,
                                char            *domain;
                                uint32_t        rid;
 
-                               acl_ids->z_fgid = parentgid;
+                               acl_ids->z_fgid = dzp->z_gid;
                                gid = zfs_fuid_map_id(zfsvfs, acl_ids->z_fgid,
                                    cr, ZFS_GROUP);
 
@@ -1746,15 +1761,15 @@ zfs_acl_ids_create(znode_t *dzp, int flag, vattr_t *vap, cred_t *cr,
        }
 
        if (acl_ids->z_aclp == NULL) {
+               mutex_enter(&dzp->z_acl_lock);
                mutex_enter(&dzp->z_lock);
                if (!(flag & IS_ROOT_NODE) && (ZTOV(dzp)->v_type == VDIR &&
                    (dzp->z_pflags & ZFS_INHERIT_ACE)) &&
                    !(dzp->z_pflags & ZFS_XATTR)) {
-                       mutex_enter(&dzp->z_acl_lock);
-                       VERIFY(0 == zfs_acl_node_read(dzp, &paclp, B_FALSE));
+                       VERIFY(0 == zfs_acl_node_read(dzp, B_TRUE,
+                           &paclp, B_FALSE));
                        acl_ids->z_aclp = zfs_acl_inherit(zfsvfs,
                            vap->va_type, paclp, acl_ids->z_mode, &need_chmod);
-                       mutex_exit(&dzp->z_acl_lock);
                        inherited = B_TRUE;
                } else {
                        acl_ids->z_aclp =
@@ -1762,6 +1777,7 @@ zfs_acl_ids_create(znode_t *dzp, int flag, vattr_t *vap, cred_t *cr,
                        acl_ids->z_aclp->z_hints |= ZFS_ACL_TRIVIAL;
                }
                mutex_exit(&dzp->z_lock);
+               mutex_exit(&dzp->z_acl_lock);
                if (need_chmod) {
                        acl_ids->z_aclp->z_hints |= (vap->va_type == VDIR) ?
                            ZFS_ACL_AUTO_INHERIT : 0;
@@ -1824,7 +1840,7 @@ zfs_getacl(znode_t *zp, vsecattr_t *vsecp, boolean_t skipaclchk, cred_t *cr)
 
        mutex_enter(&zp->z_acl_lock);
 
-       error = zfs_acl_node_read(zp, &aclp, B_FALSE);
+       error = zfs_acl_node_read(zp, B_FALSE, &aclp, B_FALSE);
        if (error != 0) {
                mutex_exit(&zp->z_acl_lock);
                return (error);
@@ -1970,6 +1986,7 @@ zfs_setacl(znode_t *zp, vsecattr_t *vsecp, boolean_t skipaclchk, cred_t *cr)
        zfs_acl_t       *aclp;
        zfs_fuid_info_t *fuidp = NULL;
        boolean_t       fuid_dirtied;
+       uint64_t        acl_obj;
 
        if (mask == 0)
                return (ENOSYS);
@@ -1994,8 +2011,8 @@ zfs_setacl(znode_t *zp, vsecattr_t *vsecp, boolean_t skipaclchk, cred_t *cr)
                    (zp->z_pflags & V4_ACL_WIDE_FLAGS);
        }
 top:
-       mutex_enter(&zp->z_lock);
        mutex_enter(&zp->z_acl_lock);
+       mutex_enter(&zp->z_lock);
 
        tx = dmu_tx_create(zfsvfs->z_os);
 
@@ -2010,14 +2027,15 @@ top:
         * upgrading then take out necessary DMU holds
         */
 
-       if (ZFS_EXTERNAL_ACL(zp)) {
-               if (zfsvfs->z_version <= ZPL_VERSION_SA &&
-                   ZNODE_ACL_VERSION(zp) <= ZFS_ACL_VERSION_INITIAL) {
-                       dmu_tx_hold_free(tx, ZFS_EXTERNAL_ACL(zp), 0,
+       if ((acl_obj = zfs_external_acl(zp)) != 0) {
+               if (zfsvfs->z_version >= ZPL_VERSION_FUID &&
+                   zfs_znode_acl_version(zp) <= ZFS_ACL_VERSION_INITIAL) {
+                       dmu_tx_hold_free(tx, acl_obj, 0,
                            DMU_OBJECT_END);
+                       dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
+                           aclp->z_acl_bytes);
                } else {
-                       dmu_tx_hold_write(tx, ZFS_EXTERNAL_ACL(zp),
-                           0, aclp->z_acl_bytes);
+                       dmu_tx_hold_write(tx, acl_obj, 0, aclp->z_acl_bytes);
                }
        } else if (!zp->z_is_sa && aclp->z_acl_bytes > ZFS_ACE_SPACE) {
                dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, aclp->z_acl_bytes);
@@ -2041,6 +2059,7 @@ top:
 
        error = zfs_aclset_common(zp, aclp, cr, tx);
        ASSERT(error == 0);
+       ASSERT(zp->z_acl_cached == NULL);
        zp->z_acl_cached = aclp;
 
        if (fuid_dirtied)
@@ -2052,8 +2071,8 @@ top:
                zfs_fuid_info_free(fuidp);
        dmu_tx_commit(tx);
 done:
-       mutex_exit(&zp->z_acl_lock);
        mutex_exit(&zp->z_lock);
+       mutex_exit(&zp->z_acl_lock);
 
        return (error);
 }
@@ -2137,11 +2156,14 @@ zfs_zaccess_aces_check(znode_t *zp, uint32_t *working_mode,
        uint32_t        deny_mask = 0;
        zfs_ace_hdr_t   *acep = NULL;
        boolean_t       checkit;
-       uint64_t        gowner;
+       uid_t           gowner;
+       uid_t           fowner;
+
+       zfs_fuid_map_ids(zp, cr, &fowner, &gowner);
 
        mutex_enter(&zp->z_acl_lock);
 
-       error = zfs_acl_node_read(zp, &aclp, B_FALSE);
+       error = zfs_acl_node_read(zp, B_FALSE, &aclp, B_FALSE);
        if (error != 0) {
                mutex_exit(&zp->z_acl_lock);
                return (error);
@@ -2149,12 +2171,6 @@ zfs_zaccess_aces_check(znode_t *zp, uint32_t *working_mode,
 
        ASSERT(zp->z_acl_cached);
 
-       if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_GID(zfsvfs),
-           &gowner, sizeof (gowner))) != 0) {
-               mutex_exit(&zp->z_acl_lock);
-               return (error);
-       }
-
        while (acep = zfs_acl_next_ace(aclp, acep, &who, &access_mask,
            &iflags, &type)) {
                uint32_t mask_matched;
@@ -2176,7 +2192,7 @@ zfs_zaccess_aces_check(znode_t *zp, uint32_t *working_mode,
 
                switch (entry_type) {
                case ACE_OWNER:
-                       if (uid == zp->z_uid)
+                       if (uid == fowner)
                                checkit = B_TRUE;
                        break;
                case OWNING_GROUP:
@@ -2254,8 +2270,10 @@ zfs_has_access(znode_t *zp, cred_t *cr)
        uint32_t have = ACE_ALL_PERMS;
 
        if (zfs_zaccess_aces_check(zp, &have, B_TRUE, cr) != 0) {
-               return (secpolicy_vnode_any_access(cr, ZTOV(zp),
-                   zp->z_uid) == 0);
+               uid_t owner;
+
+               owner = zfs_fuid_map_id(zp->z_zfsvfs, zp->z_uid, cr, ZFS_OWNER);
+               return (secpolicy_vnode_any_access(cr, ZTOV(zp), owner) == 0);
        }
        return (B_TRUE);
 }
@@ -2332,7 +2350,7 @@ zfs_fastaccesschk_execute(znode_t *zdp, cred_t *cr)
                return (0);
        }
 
-       if (IS_EPHEMERAL(zdp->z_uid) != 0 || IS_EPHEMERAL(zdp->z_gid) != 0) {
+       if (FUID_INDEX(zdp->z_uid) != 0 || FUID_INDEX(zdp->z_gid) != 0) {
                mutex_exit(&zdp->z_acl_lock);
                goto slow;
        }
@@ -2389,6 +2407,7 @@ zfs_zaccess(znode_t *zp, int mode, int flags, boolean_t skipaclchk, cred_t *cr)
        znode_t         *xzp;
        znode_t         *check_zp = zp;
        mode_t          needed_bits;
+       uid_t           owner;
 
        is_attr = ((zp->z_pflags & ZFS_XATTR) && (ZTOV(zp)->v_type == VDIR));
 
@@ -2425,6 +2444,7 @@ zfs_zaccess(znode_t *zp, int mode, int flags, boolean_t skipaclchk, cred_t *cr)
                }
        }
 
+       owner = zfs_fuid_map_id(zp->z_zfsvfs, zp->z_uid, cr, ZFS_OWNER);
        /*
         * Map the bits required to the standard vnode flags VREAD|VWRITE|VEXEC
         * in needed_bits.  Map the bits mapped by working_mode (currently
@@ -2436,7 +2456,7 @@ zfs_zaccess(znode_t *zp, int mode, int flags, boolean_t skipaclchk, cred_t *cr)
 
        working_mode = mode;
        if ((working_mode & (ACE_READ_ACL|ACE_READ_ATTRIBUTES)) &&
-           zp->z_uid == crgetuid(cr))
+           owner == crgetuid(cr))
                working_mode &= ~(ACE_READ_ACL|ACE_READ_ATTRIBUTES);
 
        if (working_mode & (ACE_READ_DATA|ACE_READ_NAMED_ATTRS|
@@ -2452,7 +2472,7 @@ zfs_zaccess(znode_t *zp, int mode, int flags, boolean_t skipaclchk, cred_t *cr)
            &check_privs, skipaclchk, cr)) == 0) {
                if (is_attr)
                        VN_RELE(ZTOV(xzp));
-               return (secpolicy_vnode_access2(cr, ZTOV(zp), zp->z_uid,
+               return (secpolicy_vnode_access2(cr, ZTOV(zp), owner,
                    needed_bits, needed_bits));
        }
 
@@ -2478,7 +2498,7 @@ zfs_zaccess(znode_t *zp, int mode, int flags, boolean_t skipaclchk, cred_t *cr)
                ASSERT(working_mode != 0);
 
                if ((working_mode & (ACE_READ_ACL|ACE_READ_ATTRIBUTES) &&
-                   zp->z_uid == crgetuid(cr)))
+                   owner == crgetuid(cr)))
                        working_mode &= ~(ACE_READ_ACL|ACE_READ_ATTRIBUTES);
 
                if (working_mode & (ACE_READ_DATA|ACE_READ_NAMED_ATTRS|
@@ -2490,20 +2510,20 @@ zfs_zaccess(znode_t *zp, int mode, int flags, boolean_t skipaclchk, cred_t *cr)
                if (working_mode & ACE_EXECUTE)
                        checkmode |= VEXEC;
 
-               error = secpolicy_vnode_access2(cr, ZTOV(check_zp), zp->z_uid,
+               error = secpolicy_vnode_access2(cr, ZTOV(check_zp), owner,
                    needed_bits & ~checkmode, needed_bits);
 
                if (error == 0 && (working_mode & ACE_WRITE_OWNER))
-                       error = secpolicy_vnode_chown(cr, zp->z_uid);
+                       error = secpolicy_vnode_chown(cr, owner);
                if (error == 0 && (working_mode & ACE_WRITE_ACL))
-                       error = secpolicy_vnode_setdac(cr, zp->z_uid);
+                       error = secpolicy_vnode_setdac(cr, owner);
 
                if (error == 0 && (working_mode &
                    (ACE_DELETE|ACE_DELETE_CHILD)))
                        error = secpolicy_vnode_remove(cr);
 
                if (error == 0 && (working_mode & ACE_SYNCHRONIZE)) {
-                       error = secpolicy_vnode_chown(cr, zp->z_uid);
+                       error = secpolicy_vnode_chown(cr, owner);
                }
                if (error == 0) {
                        /*
@@ -2515,7 +2535,7 @@ zfs_zaccess(znode_t *zp, int mode, int flags, boolean_t skipaclchk, cred_t *cr)
                        }
                }
        } else if (error == 0) {
-               error = secpolicy_vnode_access2(cr, ZTOV(zp), zp->z_uid,
+               error = secpolicy_vnode_access2(cr, ZTOV(zp), owner,
                    needed_bits, needed_bits);
        }
 
@@ -2552,9 +2572,12 @@ zfs_delete_final_check(znode_t *zp, znode_t *dzp,
     mode_t available_perms, cred_t *cr)
 {
        int error;
+       uid_t downer;
+
+       downer = zfs_fuid_map_id(dzp->z_zfsvfs, dzp->z_uid, cr, ZFS_OWNER);
 
        error = secpolicy_vnode_access2(cr, ZTOV(dzp),
-           dzp->z_uid, available_perms, VWRITE|VEXEC);
+           downer, available_perms, VWRITE|VEXEC);
 
        if (error == 0)
                error = zfs_sticky_remove_access(dzp, zp, cr);
index 362de4dd157f28c11825f88c21f3023b2cf7ea75..815f8895e702563755794adaba6e1f2caf0d9dbe 100644 (file)
@@ -590,7 +590,7 @@ zfsctl_rename_snap(zfsctl_snapdir_t *sdp, zfs_snapentry_t *sep, const char *nm)
        ASSERT3U(strlen(newpath) + strlen(nm), <, sizeof (newpath));
        (void) strcat(newpath, nm);
        refstr_rele(pathref);
-       vfs_setmntpoint(vfsp, newpath);
+       vfs_setmntpoint(vfsp, newpath, 0);
 
        pathref = vfs_getresource(vfsp);
        (void) strncpy(newpath, refstr_value(pathref), sizeof (newpath));
@@ -599,7 +599,7 @@ zfsctl_rename_snap(zfsctl_snapdir_t *sdp, zfs_snapentry_t *sep, const char *nm)
        ASSERT3U(strlen(newpath) + strlen(nm), <, sizeof (newpath));
        (void) strcat(newpath, nm);
        refstr_rele(pathref);
-       vfs_setresource(vfsp, newpath);
+       vfs_setresource(vfsp, newpath, 0);
 
        vfs_unlock(vfsp);
 }
@@ -749,7 +749,8 @@ zfsctl_snapdir_mkdir(vnode_t *dvp, char *dirname, vattr_t *vap, vnode_t  **vpp,
                return (err);
 
        if (err == 0) {
-               err = dmu_objset_snapshot(name, dirname, NULL, B_FALSE);
+               err = dmu_objset_snapshot(name, dirname, NULL, NULL,
+                   B_FALSE, B_FALSE, -1);
                if (err)
                        return (err);
                err = lookupnameat(dirname, seg, follow, NULL, vpp, dvp);
index 6d6666822b3e0f4a4286e4fed4fc2c5cd767dff6..b06d29ab33e1a84c1e07272a61ccfae6ff711921 100644 (file)
@@ -630,7 +630,7 @@ zfs_rmnode(znode_t *zp)
                ASSERT(error == 0);
        }
 
-       acl_obj = ZFS_EXTERNAL_ACL(zp);
+       acl_obj = zfs_external_acl(zp);
 
        /*
         * Set up the final transaction.
@@ -1067,6 +1067,9 @@ int
 zfs_sticky_remove_access(znode_t *zdp, znode_t *zp, cred_t *cr)
 {
        uid_t           uid;
+       uid_t           downer;
+       uid_t           fowner;
+       zfsvfs_t        *zfsvfs = zdp->z_zfsvfs;
 
        if (zdp->z_zfsvfs->z_replay)
                return (0);
@@ -1074,7 +1077,10 @@ zfs_sticky_remove_access(znode_t *zdp, znode_t *zp, cred_t *cr)
        if ((zdp->z_mode & S_ISVTX) == 0)
                return (0);
 
-       if ((uid = crgetuid(cr)) == zdp->z_uid || uid == zp->z_uid ||
+       downer = zfs_fuid_map_id(zfsvfs, zdp->z_uid, cr, ZFS_OWNER);
+       fowner = zfs_fuid_map_id(zfsvfs, zp->z_uid, cr, ZFS_OWNER);
+
+       if ((uid = crgetuid(cr)) == downer || uid == fowner ||
            (ZTOV(zp)->v_type == VREG &&
            zfs_zaccess(zp, ACE_WRITE_DATA, 0, B_FALSE, cr) == 0))
                return (0);
index 8c0424e84b7bc726c07ca6439e3f65666b5e4bf1..a853f4d73561fc61ec149b94d5ede9dfbac0f16f 100644 (file)
@@ -388,26 +388,8 @@ zfs_fuid_find_by_idx(zfsvfs_t *zfsvfs, uint32_t idx)
 void
 zfs_fuid_map_ids(znode_t *zp, cred_t *cr, uid_t *uidp, uid_t *gidp)
 {
-       uint64_t fuid, fgid;
-       sa_bulk_attr_t bulk[2];
-       int count = 0;
-
-       if (IS_EPHEMERAL(zp->z_uid) || IS_EPHEMERAL(zp->z_gid)) {
-               SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zp->z_zfsvfs),
-                   NULL, &fuid, 8);
-               SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zp->z_zfsvfs),
-                   NULL, &fgid, 8);
-               VERIFY(0 == sa_bulk_lookup(zp->z_sa_hdl, bulk, count));
-       }
-       if (IS_EPHEMERAL(zp->z_uid))
-               *uidp = zfs_fuid_map_id(zp->z_zfsvfs, zp->z_uid, cr, ZFS_OWNER);
-       else
-               *uidp = zp->z_uid;
-       if (IS_EPHEMERAL(zp->z_gid))
-               *gidp = zfs_fuid_map_id(zp->z_zfsvfs,
-                   zp->z_gid, cr, ZFS_GROUP);
-       else
-               *gidp = zp->z_gid;
+       *uidp = zfs_fuid_map_id(zp->z_zfsvfs, zp->z_uid, cr, ZFS_OWNER);
+       *gidp = zfs_fuid_map_id(zp->z_zfsvfs, zp->z_gid, cr, ZFS_GROUP);
 }
 
 uid_t
index de5fb1e4ce3efb9fae1e309befe3f4d1fb22e8c3..1b63c9bf45ef54cee88cc8001732a903e47db7ad 100644 (file)
@@ -60,6 +60,7 @@
 #include <sys/fs/zfs.h>
 #include <sys/zfs_ctldir.h>
 #include <sys/zfs_dir.h>
+#include <sys/zfs_onexit.h>
 #include <sys/zvol.h>
 #include <sys/dsl_scan.h>
 #include <sharefs/share.h>
@@ -87,12 +88,18 @@ typedef enum {
        DATASET_NAME
 } zfs_ioc_namecheck_t;
 
+typedef enum {
+       POOL_CHECK_NONE         = 1 << 0,
+       POOL_CHECK_SUSPENDED    = 1 << 1,
+       POOL_CHECK_READONLY     = 1 << 2
+} zfs_ioc_poolcheck_t;
+
 typedef struct zfs_ioc_vec {
        zfs_ioc_func_t          *zvec_func;
        zfs_secpolicy_func_t    *zvec_secpolicy;
        zfs_ioc_namecheck_t     zvec_namecheck;
        boolean_t               zvec_his_log;
-       boolean_t               zvec_pool_check;
+       zfs_ioc_poolcheck_t     zvec_pool_check;
 } zfs_ioc_vec_t;
 
 /* This array is indexed by zfs_userquota_prop_t */
@@ -281,9 +288,8 @@ zfs_secpolicy_read(zfs_cmd_t *zc, cred_t *cr)
 }
 
 static int
-zfs_dozonecheck(const char *dataset, cred_t *cr)
+zfs_dozonecheck_impl(const char *dataset, uint64_t zoned, cred_t *cr)
 {
-       uint64_t zoned;
        int writable = 1;
 
        /*
@@ -294,9 +300,6 @@ zfs_dozonecheck(const char *dataset, cred_t *cr)
            !zone_dataset_visible(dataset, &writable))
                return (ENOENT);
 
-       if (dsl_prop_get_integer(dataset, "zoned", &zoned, NULL))
-               return (ENOENT);
-
        if (INGLOBALZONE(curproc)) {
                /*
                 * If the fs is zoned, only root can access it from the
@@ -318,6 +321,32 @@ zfs_dozonecheck(const char *dataset, cred_t *cr)
        return (0);
 }
 
+static int
+zfs_dozonecheck(const char *dataset, cred_t *cr)
+{
+       uint64_t zoned;
+
+       if (dsl_prop_get_integer(dataset, "zoned", &zoned, NULL))
+               return (ENOENT);
+
+       return (zfs_dozonecheck_impl(dataset, zoned, cr));
+}
+
+static int
+zfs_dozonecheck_ds(const char *dataset, dsl_dataset_t *ds, cred_t *cr)
+{
+       uint64_t zoned;
+
+       rw_enter(&ds->ds_dir->dd_pool->dp_config_rwlock, RW_READER);
+       if (dsl_prop_get_ds(ds, "zoned", 8, 1, &zoned, NULL)) {
+               rw_exit(&ds->ds_dir->dd_pool->dp_config_rwlock);
+               return (ENOENT);
+       }
+       rw_exit(&ds->ds_dir->dd_pool->dp_config_rwlock);
+
+       return (zfs_dozonecheck_impl(dataset, zoned, cr));
+}
+
 int
 zfs_secpolicy_write_perms(const char *name, const char *perm, cred_t *cr)
 {
@@ -332,6 +361,21 @@ zfs_secpolicy_write_perms(const char *name, const char *perm, cred_t *cr)
        return (error);
 }
 
+int
+zfs_secpolicy_write_perms_ds(const char *name, dsl_dataset_t *ds,
+    const char *perm, cred_t *cr)
+{
+       int error;
+
+       error = zfs_dozonecheck_ds(name, ds, cr);
+       if (error == 0) {
+               error = secpolicy_zfs(cr);
+               if (error)
+                       error = dsl_deleg_access_impl(ds, perm, cr);
+       }
+       return (error);
+}
+
 /*
  * Policy for setting the security label property.
  *
@@ -507,8 +551,38 @@ zfs_secpolicy_rollback(zfs_cmd_t *zc, cred_t *cr)
 int
 zfs_secpolicy_send(zfs_cmd_t *zc, cred_t *cr)
 {
-       return (zfs_secpolicy_write_perms(zc->zc_name,
-           ZFS_DELEG_PERM_SEND, cr));
+       spa_t *spa;
+       dsl_pool_t *dp;
+       dsl_dataset_t *ds;
+       char *cp;
+       int error;
+
+       /*
+        * Generate the current snapshot name from the given objsetid, then
+        * use that name for the secpolicy/zone checks.
+        */
+       cp = strchr(zc->zc_name, '@');
+       if (cp == NULL)
+               return (EINVAL);
+       error = spa_open(zc->zc_name, &spa, FTAG);
+       if (error)
+               return (error);
+
+       dp = spa_get_dsl(spa);
+       rw_enter(&dp->dp_config_rwlock, RW_READER);
+       error = dsl_dataset_hold_obj(dp, zc->zc_sendobj, FTAG, &ds);
+       rw_exit(&dp->dp_config_rwlock);
+       spa_close(spa, FTAG);
+       if (error)
+               return (error);
+
+       dsl_dataset_name(ds, zc->zc_name);
+
+       error = zfs_secpolicy_write_perms_ds(zc->zc_name, ds,
+           ZFS_DELEG_PERM_SEND, cr);
+       dsl_dataset_rele(ds, FTAG);
+
+       return (error);
 }
 
 static int
@@ -785,6 +859,22 @@ zfs_secpolicy_config(zfs_cmd_t *zc, cred_t *cr)
        return (0);
 }
 
+/*
+ * Policy for object to name lookups.
+ */
+/* ARGSUSED */
+static int
+zfs_secpolicy_diff(zfs_cmd_t *zc, cred_t *cr)
+{
+       int error;
+
+       if ((error = secpolicy_sys_config(cr, B_FALSE)) == 0)
+               return (0);
+
+       error = zfs_secpolicy_write_perms(zc->zc_name, ZFS_DELEG_PERM_DIFF, cr);
+       return (error);
+}
+
 /*
  * Policy for fault injection.  Requires all privileges.
  */
@@ -875,6 +965,33 @@ zfs_secpolicy_release(zfs_cmd_t *zc, cred_t *cr)
            ZFS_DELEG_PERM_RELEASE, cr));
 }
 
+/*
+ * Policy for allowing temporary snapshots to be taken or released
+ */
+static int
+zfs_secpolicy_tmp_snapshot(zfs_cmd_t *zc, cred_t *cr)
+{
+       /*
+        * A temporary snapshot is the same as a snapshot,
+        * hold, destroy and release all rolled into one.
+        * Delegated diff alone is sufficient that we allow this.
+        */
+       int error;
+
+       if ((error = zfs_secpolicy_write_perms(zc->zc_name,
+           ZFS_DELEG_PERM_DIFF, cr)) == 0)
+               return (0);
+
+       error = zfs_secpolicy_snapshot(zc, cr);
+       if (!error)
+               error = zfs_secpolicy_hold(zc, cr);
+       if (!error)
+               error = zfs_secpolicy_release(zc, cr);
+       if (!error)
+               error = zfs_secpolicy_destroy(zc, cr);
+       return (error);
+}
+
 /*
  * Returns the nvlist as specified by the user in the zfs_cmd_t.
  */
@@ -1001,14 +1118,15 @@ getzfsvfs(const char *dsname, zfsvfs_t **zfvp)
  * case its z_vfs will be NULL, and it will be opened as the owner.
  */
 static int
-zfsvfs_hold(const char *name, void *tag, zfsvfs_t **zfvp)
+zfsvfs_hold(const char *name, void *tag, zfsvfs_t **zfvp, boolean_t writer)
 {
        int error = 0;
 
        if (getzfsvfs(name, zfvp) != 0)
                error = zfsvfs_create(name, zfvp);
        if (error == 0) {
-               rrw_enter(&(*zfvp)->z_teardown_lock, RW_READER, tag);
+               rrw_enter(&(*zfvp)->z_teardown_lock, (writer) ? RW_WRITER :
+                   RW_READER, tag);
                if ((*zfvp)->z_unmounted) {
                        /*
                         * XXX we could probably try again, since the unmounting
@@ -1137,13 +1255,15 @@ zfs_ioc_pool_import(zfs_cmd_t *zc)
        if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &guid) != 0 ||
            guid != zc->zc_guid)
                error = EINVAL;
-       else if (zc->zc_cookie)
-               error = spa_import_verbatim(zc->zc_name, config, props);
        else
-               error = spa_import(zc->zc_name, config, props);
+               error = spa_import(zc->zc_name, config, props, zc->zc_cookie);
 
-       if (zc->zc_nvlist_dst != 0)
-               (void) put_nvlist(zc, config);
+       if (zc->zc_nvlist_dst != 0) {
+               int err;
+
+               if ((err = put_nvlist(zc, config)) != 0)
+                       error = err;
+       }
 
        nvlist_free(config);
 
@@ -1366,6 +1486,35 @@ zfs_ioc_obj_to_path(zfs_cmd_t *zc)
        return (error);
 }
 
+/*
+ * inputs:
+ * zc_name             name of filesystem
+ * zc_obj              object to find
+ *
+ * outputs:
+ * zc_stat             stats on object
+ * zc_value            path to object
+ */
+static int
+zfs_ioc_obj_to_stats(zfs_cmd_t *zc)
+{
+       objset_t *os;
+       int error;
+
+       /* XXX reading from objset not owned */
+       if ((error = dmu_objset_hold(zc->zc_name, FTAG, &os)) != 0)
+               return (error);
+       if (dmu_objset_type(os) != DMU_OST_ZFS) {
+               dmu_objset_rele(os, FTAG);
+               return (EINVAL);
+       }
+       error = zfs_obj_to_stats(os, zc->zc_obj, &zc->zc_stat, zc->zc_value,
+           sizeof (zc->zc_value));
+       dmu_objset_rele(os, FTAG);
+
+       return (error);
+}
+
 static int
 zfs_ioc_vdev_add(zfs_cmd_t *zc)
 {
@@ -1577,26 +1726,12 @@ zfs_ioc_vdev_setfru(zfs_cmd_t *zc)
        return (error);
 }
 
-/*
- * inputs:
- * zc_name             name of filesystem
- * zc_nvlist_dst_size  size of buffer for property nvlist
- *
- * outputs:
- * zc_objset_stats     stats
- * zc_nvlist_dst       property nvlist
- * zc_nvlist_dst_size  size of property nvlist
- */
 static int
-zfs_ioc_objset_stats(zfs_cmd_t *zc)
+zfs_ioc_objset_stats_impl(zfs_cmd_t *zc, objset_t *os)
 {
-       objset_t *os = NULL;
-       int error;
+       int error = 0;
        nvlist_t *nv;
 
-       if (error = dmu_objset_hold(zc->zc_name, FTAG, &os))
-               return (error);
-
        dmu_objset_fast_stat(os, &zc->zc_objset_stats);
 
        if (zc->zc_nvlist_dst != 0 &&
@@ -1617,7 +1752,32 @@ zfs_ioc_objset_stats(zfs_cmd_t *zc)
                nvlist_free(nv);
        }
 
+       return (error);
+}
+
+/*
+ * inputs:
+ * zc_name             name of filesystem
+ * zc_nvlist_dst_size  size of buffer for property nvlist
+ *
+ * outputs:
+ * zc_objset_stats     stats
+ * zc_nvlist_dst       property nvlist
+ * zc_nvlist_dst_size  size of property nvlist
+ */
+static int
+zfs_ioc_objset_stats(zfs_cmd_t *zc)
+{
+       objset_t *os = NULL;
+       int error;
+
+       if (error = dmu_objset_hold(zc->zc_name, FTAG, &os))
+               return (error);
+
+       error = zfs_ioc_objset_stats_impl(zc, os);
+
        dmu_objset_rele(os, FTAG);
+
        return (error);
 }
 
@@ -1850,19 +2010,43 @@ top:
 
        error = dmu_snapshot_list_next(os,
            sizeof (zc->zc_name) - strlen(zc->zc_name),
-           zc->zc_name + strlen(zc->zc_name), NULL, &zc->zc_cookie, NULL);
-       dmu_objset_rele(os, FTAG);
+           zc->zc_name + strlen(zc->zc_name), &zc->zc_obj, &zc->zc_cookie,
+           NULL);
+
        if (error == 0) {
-               error = zfs_ioc_objset_stats(zc); /* fill in the stats */
-               if (error == ENOENT)  {
-                       /* We lost a race with destroy, get the next one. */
-                       *strchr(zc->zc_name, '@') = '\0';
-                       goto top;
+               dsl_dataset_t *ds;
+               dsl_pool_t *dp = os->os_dsl_dataset->ds_dir->dd_pool;
+
+               /*
+                * Since we probably don't have a hold on this snapshot,
+                * it's possible that the objsetid could have been destroyed
+                * and reused for a new objset. It's OK if this happens during
+                * a zfs send operation, since the new createtxg will be
+                * beyond the range we're interested in.
+                */
+               rw_enter(&dp->dp_config_rwlock, RW_READER);
+               error = dsl_dataset_hold_obj(dp, zc->zc_obj, FTAG, &ds);
+               rw_exit(&dp->dp_config_rwlock);
+               if (error) {
+                       if (error == ENOENT) {
+                               /* Racing with destroy, get the next one. */
+                               *strchr(zc->zc_name, '@') = '\0';
+                               dmu_objset_rele(os, FTAG);
+                               goto top;
+                       }
+               } else {
+                       objset_t *ossnap;
+
+                       error = dmu_objset_from_ds(ds, &ossnap);
+                       if (error == 0)
+                               error = zfs_ioc_objset_stats_impl(zc, ossnap);
+                       dsl_dataset_rele(ds, FTAG);
                }
        } else if (error == ENOENT) {
                error = ESRCH;
        }
 
+       dmu_objset_rele(os, FTAG);
        /* if we failed, undo the @ that we tacked on to zc_name */
        if (error)
                *strchr(zc->zc_name, '@') = '\0';
@@ -1905,7 +2089,7 @@ zfs_prop_set_userquota(const char *dsname, nvpair_t *pair)
        rid = valary[1];
        quota = valary[2];
 
-       err = zfsvfs_hold(dsname, FTAG, &zfsvfs);
+       err = zfsvfs_hold(dsname, FTAG, &zfsvfs, B_FALSE);
        if (err == 0) {
                err = zfs_set_userquota(zfsvfs, type, domain, rid, quota);
                zfsvfs_rele(zfsvfs, FTAG);
@@ -1970,7 +2154,7 @@ zfs_prop_set_special(const char *dsname, zprop_source_t source,
        {
                zfsvfs_t *zfsvfs;
 
-               if ((err = zfsvfs_hold(dsname, FTAG, &zfsvfs)) != 0)
+               if ((err = zfsvfs_hold(dsname, FTAG, &zfsvfs, B_TRUE)) != 0)
                        break;
 
                err = zfs_set_version(zfsvfs, intval);
@@ -2872,8 +3056,8 @@ zfs_ioc_snapshot(zfs_cmd_t *zc)
                goto out;
        }
 
-       error = dmu_objset_snapshot(zc->zc_name, zc->zc_value,
-           nvprops, recursive);
+       error = dmu_objset_snapshot(zc->zc_name, zc->zc_value, NULL,
+           nvprops, recursive, B_FALSE, -1);
 
 out:
        nvlist_free(nvprops);
@@ -3342,11 +3526,14 @@ static boolean_t zfs_ioc_recv_inject_err;
  * zc_cookie           file descriptor to recv from
  * zc_begin_record     the BEGIN record of the stream (not byteswapped)
  * zc_guid             force flag
+ * zc_cleanup_fd       cleanup-on-exit file descriptor
+ * zc_action_handle    handle for this guid/ds mapping (or zero on first call)
  *
  * outputs:
  * zc_cookie           number of bytes read
  * zc_nvlist_dst{_size} error for each unapplied received property
  * zc_obj              zprop_errflags_t
+ * zc_action_handle    handle for this guid/ds mapping
  */
 static int
 zfs_ioc_recv(zfs_cmd_t *zc)
@@ -3475,7 +3662,8 @@ zfs_ioc_recv(zfs_cmd_t *zc)
        }
 
        off = fp->f_offset;
-       error = dmu_recv_stream(&drc, fp->f_vnode, &off);
+       error = dmu_recv_stream(&drc, fp->f_vnode, &off, zc->zc_cleanup_fd,
+           &zc->zc_action_handle);
 
        if (error == 0) {
                zfsvfs_t *zfsvfs = NULL;
@@ -3567,9 +3755,10 @@ out:
 /*
  * inputs:
  * zc_name     name of snapshot to send
- * zc_value    short name of incremental fromsnap (may be empty)
  * zc_cookie   file descriptor to send stream to
- * zc_obj      fromorigin flag (mutually exclusive with zc_value)
+ * zc_obj      fromorigin flag (mutually exclusive with zc_fromobj)
+ * zc_sendobj  objsetid of snapshot to send
+ * zc_fromobj  objsetid of incremental fromsnap (may be zero)
  *
  * outputs: none
  */
@@ -3581,34 +3770,55 @@ zfs_ioc_send(zfs_cmd_t *zc)
        file_t *fp;
        int error;
        offset_t off;
+       dsl_dataset_t *ds;
+       dsl_dataset_t *dsfrom = NULL;
+       spa_t *spa;
+       dsl_pool_t *dp;
 
-       error = dmu_objset_hold(zc->zc_name, FTAG, &tosnap);
+       error = spa_open(zc->zc_name, &spa, FTAG);
        if (error)
                return (error);
 
-       if (zc->zc_value[0] != '\0') {
-               char *buf;
-               char *cp;
-
-               buf = kmem_alloc(MAXPATHLEN, KM_SLEEP);
-               (void) strncpy(buf, zc->zc_name, MAXPATHLEN);
-               cp = strchr(buf, '@');
-               if (cp)
-                       *(cp+1) = 0;
-               (void) strncat(buf, zc->zc_value, MAXPATHLEN);
-               error = dmu_objset_hold(buf, FTAG, &fromsnap);
-               kmem_free(buf, MAXPATHLEN);
+       dp = spa_get_dsl(spa);
+       rw_enter(&dp->dp_config_rwlock, RW_READER);
+       error = dsl_dataset_hold_obj(dp, zc->zc_sendobj, FTAG, &ds);
+       rw_exit(&dp->dp_config_rwlock);
+       if (error) {
+               spa_close(spa, FTAG);
+               return (error);
+       }
+
+       error = dmu_objset_from_ds(ds, &tosnap);
+       if (error) {
+               dsl_dataset_rele(ds, FTAG);
+               spa_close(spa, FTAG);
+               return (error);
+       }
+
+       if (zc->zc_fromobj != 0) {
+               rw_enter(&dp->dp_config_rwlock, RW_READER);
+               error = dsl_dataset_hold_obj(dp, zc->zc_fromobj, FTAG, &dsfrom);
+               rw_exit(&dp->dp_config_rwlock);
+               spa_close(spa, FTAG);
+               if (error) {
+                       dsl_dataset_rele(ds, FTAG);
+                       return (error);
+               }
+               error = dmu_objset_from_ds(dsfrom, &fromsnap);
                if (error) {
-                       dmu_objset_rele(tosnap, FTAG);
+                       dsl_dataset_rele(dsfrom, FTAG);
+                       dsl_dataset_rele(ds, FTAG);
                        return (error);
                }
+       } else {
+               spa_close(spa, FTAG);
        }
 
        fp = getf(zc->zc_cookie);
        if (fp == NULL) {
-               dmu_objset_rele(tosnap, FTAG);
-               if (fromsnap)
-                       dmu_objset_rele(fromsnap, FTAG);
+               dsl_dataset_rele(ds, FTAG);
+               if (dsfrom)
+                       dsl_dataset_rele(dsfrom, FTAG);
                return (EBADF);
        }
 
@@ -3618,9 +3828,9 @@ zfs_ioc_send(zfs_cmd_t *zc)
        if (VOP_SEEK(fp->f_vnode, fp->f_offset, &off, NULL) == 0)
                fp->f_offset = off;
        releasef(zc->zc_cookie);
-       if (fromsnap)
-               dmu_objset_rele(fromsnap, FTAG);
-       dmu_objset_rele(tosnap, FTAG);
+       if (dsfrom)
+               dsl_dataset_rele(dsfrom, FTAG);
+       dsl_dataset_rele(ds, FTAG);
        return (error);
 }
 
@@ -3717,7 +3927,10 @@ zfs_ioc_clear(zfs_cmd_t *zc)
                        error = spa_open_rewind(zc->zc_name, &spa, FTAG,
                            policy, &config);
                        if (config != NULL) {
-                               (void) put_nvlist(zc, config);
+                               int err;
+
+                               if ((err = put_nvlist(zc, config)) != 0)
+                                       error = err;
                                nvlist_free(config);
                        }
                        nvlist_free(policy);
@@ -3801,7 +4014,7 @@ zfs_ioc_userspace_one(zfs_cmd_t *zc)
        if (zc->zc_objset_type >= ZFS_NUM_USERQUOTA_PROPS)
                return (EINVAL);
 
-       error = zfsvfs_hold(zc->zc_name, FTAG, &zfsvfs);
+       error = zfsvfs_hold(zc->zc_name, FTAG, &zfsvfs, B_FALSE);
        if (error)
                return (error);
 
@@ -3832,7 +4045,7 @@ zfs_ioc_userspace_many(zfs_cmd_t *zc)
        if (bufsize <= 0)
                return (ENOMEM);
 
-       int error = zfsvfs_hold(zc->zc_name, FTAG, &zfsvfs);
+       int error = zfsvfs_hold(zc->zc_name, FTAG, &zfsvfs, B_FALSE);
        if (error)
                return (error);
 
@@ -4031,6 +4244,113 @@ ace_t full_access[] = {
        {(uid_t)-1, ACE_ALL_PERMS, ACE_EVERYONE, 0}
 };
 
+/*
+ * Advance to the next in-use object of a dataset.
+ *
+ * inputs:
+ * zc_name             name of containing filesystem
+ * zc_obj              object # beyond which we want next in-use object #
+ *
+ * outputs:
+ * zc_obj              next in-use object #
+ *
+ * The search itself is done by dmu_object_next(), which is handed the
+ * dataset's ds_prev_snap_txg as its txg argument.  Returns the
+ * dmu_objset_hold() error, or else whatever dmu_object_next() returns.
+ */
+static int
+zfs_ioc_next_obj(zfs_cmd_t *zc)
+{
+       objset_t *objset = NULL;
+       int err;
+
+       if ((err = dmu_objset_hold(zc->zc_name, FTAG, &objset)) != 0)
+               return (err);
+
+       err = dmu_object_next(objset, &zc->zc_obj, B_FALSE,
+           objset->os_dsl_dataset->ds_phys->ds_prev_snap_txg);
+       dmu_objset_rele(objset, FTAG);
+
+       return (err);
+}
+
+/*
+ * Take a snapshot whose name is generated as "<zc_value>-<lbolt>", where
+ * <lbolt> is ddi_get_lbolt64() printed as 16 zero-padded hex digits.
+ *
+ * inputs:
+ * zc_name             name of filesystem
+ * zc_value            prefix name for snapshot
+ * zc_cleanup_fd       cleanup-on-exit file descriptor for calling process
+ *
+ * outputs:
+ * zc_value            generated snapshot name (written only on success)
+ *
+ * Returns E2BIG if the generated name is MAXNAMELEN bytes or longer,
+ * otherwise whatever dmu_objset_snapshot() returns.
+ */
+static int
+zfs_ioc_tmp_snapshot(zfs_cmd_t *zc)
+{
+       char *name;
+       int err;
+
+       name = kmem_asprintf("%s-%016llx", zc->zc_value,
+           (u_longlong_t)ddi_get_lbolt64());
+
+       if (strlen(name) >= MAXNAMELEN) {
+               err = E2BIG;
+       } else {
+               /* The generated name is passed as both 2nd and 3rd argument. */
+               err = dmu_objset_snapshot(zc->zc_name, name, name,
+                   NULL, B_FALSE, B_TRUE, zc->zc_cleanup_fd);
+               if (err == 0)
+                       (void) strcpy(zc->zc_value, name);
+       }
+
+       /* Every path, success or failure, frees the generated name here. */
+       strfree(name);
+       return (err);
+}
+
+/*
+ * Hand two held objsets and an open file to dmu_diff().
+ *
+ * inputs:
+ * zc_name             name of "to" snapshot
+ * zc_value            name of "from" snapshot
+ * zc_cookie           file descriptor to write diff data on
+ *
+ * outputs:
+ * dmu_diff_record_t's to the file descriptor
+ *
+ * Returns the dmu_objset_hold() error for either name, EBADF if zc_cookie
+ * does not resolve to an open file, and otherwise the dmu_diff() result.
+ */
+static int
+zfs_ioc_diff(zfs_cmd_t *zc)
+{
+       objset_t *to_os;
+       objset_t *from_os;
+       file_t *fp;
+       offset_t off;
+       int err;
+
+       if ((err = dmu_objset_hold(zc->zc_name, FTAG, &to_os)) != 0)
+               return (err);
+
+       if ((err = dmu_objset_hold(zc->zc_value, FTAG, &from_os)) != 0)
+               goto rele_to;
+
+       if ((fp = getf(zc->zc_cookie)) == NULL) {
+               err = EBADF;
+               goto rele_from;
+       }
+
+       off = fp->f_offset;
+       err = dmu_diff(to_os, from_os, fp->f_vnode, &off);
+
+       /* Adopt the advanced offset only if the vnode accepts the seek. */
+       if (VOP_SEEK(fp->f_vnode, fp->f_offset, &off, NULL) == 0)
+               fp->f_offset = off;
+       releasef(zc->zc_cookie);
+
+       /* Unwind in reverse order of acquisition. */
+rele_from:
+       dmu_objset_rele(from_os, FTAG);
+rele_to:
+       dmu_objset_rele(to_os, FTAG);
+       return (err);
+}
+
 /*
  * Remove all ACL files in shares dir
  */
@@ -4182,11 +4502,14 @@ zfs_ioc_smb_acl(zfs_cmd_t *zc)
 
 /*
  * inputs:
- * zc_name     name of filesystem
- * zc_value    short name of snap
- * zc_string   user-supplied tag for this reference
- * zc_cookie   recursive flag
- * zc_temphold set if hold is temporary
+ * zc_name             name of filesystem
+ * zc_value            short name of snap
+ * zc_string           user-supplied tag for this hold
+ * zc_cookie           recursive flag
+ * zc_temphold         set if hold is temporary
+ * zc_cleanup_fd       cleanup-on-exit file descriptor for calling process
+ * zc_sendobj          if non-zero, the objid for zc_name@zc_value
+ * zc_createtxg                if zc_sendobj is non-zero, snap must have zc_createtxg
  *
  * outputs:            none
  */
@@ -4194,22 +4517,76 @@ static int
 zfs_ioc_hold(zfs_cmd_t *zc)
 {
        boolean_t recursive = zc->zc_cookie;
+       spa_t *spa;
+       dsl_pool_t *dp;
+       dsl_dataset_t *ds;
+       int error;
+       minor_t minor = 0;
 
        if (snapshot_namecheck(zc->zc_value, NULL, NULL) != 0)
                return (EINVAL);
 
-       return (dsl_dataset_user_hold(zc->zc_name, zc->zc_value,
-           zc->zc_string, recursive, zc->zc_temphold));
+       if (zc->zc_sendobj == 0) {
+               return (dsl_dataset_user_hold(zc->zc_name, zc->zc_value,
+                   zc->zc_string, recursive, zc->zc_temphold,
+                   zc->zc_cleanup_fd));
+       }
+
+       if (recursive)
+               return (EINVAL);
+
+       error = spa_open(zc->zc_name, &spa, FTAG);
+       if (error)
+               return (error);
+
+       dp = spa_get_dsl(spa);
+       rw_enter(&dp->dp_config_rwlock, RW_READER);
+       error = dsl_dataset_hold_obj(dp, zc->zc_sendobj, FTAG, &ds);
+       rw_exit(&dp->dp_config_rwlock);
+       spa_close(spa, FTAG);
+       if (error)
+               return (error);
+
+       /*
+        * Until we have a hold on this snapshot, it's possible that
+        * zc_sendobj could've been destroyed and reused as part
+        * of a later txg.  Make sure we're looking at the right object.
+        */
+       if (zc->zc_createtxg != ds->ds_phys->ds_creation_txg) {
+               dsl_dataset_rele(ds, FTAG);
+               return (ENOENT);
+       }
+
+       if (zc->zc_cleanup_fd != -1 && zc->zc_temphold) {
+               error = zfs_onexit_fd_hold(zc->zc_cleanup_fd, &minor);
+               if (error) {
+                       dsl_dataset_rele(ds, FTAG);
+                       return (error);
+               }
+       }
+
+       error = dsl_dataset_user_hold_for_send(ds, zc->zc_string,
+           zc->zc_temphold);
+       if (minor != 0) {
+               if (error == 0) {
+                       dsl_register_onexit_hold_cleanup(ds, zc->zc_string,
+                           minor);
+               }
+               zfs_onexit_fd_rele(zc->zc_cleanup_fd);
+       }
+       dsl_dataset_rele(ds, FTAG);
+
+       return (error);
 }
 
 /*
  * inputs:
- * zc_name     name of dataset from which we're releasing a user reference
+ * zc_name     name of dataset from which we're releasing a user hold
  * zc_value    short name of snap
- * zc_string   user-supplied tag for this reference
+ * zc_string   user-supplied tag for this hold
  * zc_cookie   recursive flag
  *
- * outputs:            none
+ * outputs:    none
  */
 static int
 zfs_ioc_release(zfs_cmd_t *zc)
@@ -4251,132 +4628,264 @@ zfs_ioc_get_holds(zfs_cmd_t *zc)
  */
 static zfs_ioc_vec_t zfs_ioc_vec[] = {
        { zfs_ioc_pool_create, zfs_secpolicy_config, POOL_NAME, B_FALSE,
-           B_FALSE },
+           POOL_CHECK_NONE },
        { zfs_ioc_pool_destroy, zfs_secpolicy_config, POOL_NAME, B_FALSE,
-           B_FALSE },
+           POOL_CHECK_NONE },
        { zfs_ioc_pool_import, zfs_secpolicy_config, POOL_NAME, B_TRUE,
-           B_FALSE },
+           POOL_CHECK_NONE },
        { zfs_ioc_pool_export, zfs_secpolicy_config, POOL_NAME, B_FALSE,
-           B_FALSE },
+           POOL_CHECK_NONE },
        { zfs_ioc_pool_configs, zfs_secpolicy_none, NO_NAME, B_FALSE,
-           B_FALSE },
+           POOL_CHECK_NONE },
        { zfs_ioc_pool_stats, zfs_secpolicy_read, POOL_NAME, B_FALSE,
-           B_FALSE },
+           POOL_CHECK_NONE },
        { zfs_ioc_pool_tryimport, zfs_secpolicy_config, NO_NAME, B_FALSE,
-           B_FALSE },
+           POOL_CHECK_NONE },
        { zfs_ioc_pool_scan, zfs_secpolicy_config, POOL_NAME, B_TRUE,
-           B_TRUE },
+           POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY },
        { zfs_ioc_pool_freeze, zfs_secpolicy_config, NO_NAME, B_FALSE,
-           B_FALSE },
+           POOL_CHECK_READONLY },
        { zfs_ioc_pool_upgrade, zfs_secpolicy_config, POOL_NAME, B_TRUE,
-           B_TRUE },
+           POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY },
        { zfs_ioc_pool_get_history, zfs_secpolicy_config, POOL_NAME, B_FALSE,
-           B_FALSE },
+           POOL_CHECK_NONE },
        { zfs_ioc_vdev_add, zfs_secpolicy_config, POOL_NAME, B_TRUE,
-           B_TRUE },
+           POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY },
        { zfs_ioc_vdev_remove, zfs_secpolicy_config, POOL_NAME, B_TRUE,
-           B_TRUE },
+           POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY },
        { zfs_ioc_vdev_set_state, zfs_secpolicy_config, POOL_NAME, B_TRUE,
-           B_FALSE },
+           POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY },
        { zfs_ioc_vdev_attach, zfs_secpolicy_config, POOL_NAME, B_TRUE,
-           B_TRUE },
+           POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY },
        { zfs_ioc_vdev_detach, zfs_secpolicy_config, POOL_NAME, B_TRUE,
-           B_TRUE },
+           POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY },
        { zfs_ioc_vdev_setpath, zfs_secpolicy_config, POOL_NAME, B_FALSE,
-           B_TRUE },
+           POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY },
        { zfs_ioc_vdev_setfru,  zfs_secpolicy_config, POOL_NAME, B_FALSE,
-           B_TRUE },
+           POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY },
        { zfs_ioc_objset_stats, zfs_secpolicy_read, DATASET_NAME, B_FALSE,
-           B_TRUE },
+           POOL_CHECK_SUSPENDED },
        { zfs_ioc_objset_zplprops, zfs_secpolicy_read, DATASET_NAME, B_FALSE,
-           B_FALSE },
+           POOL_CHECK_NONE },
        { zfs_ioc_dataset_list_next, zfs_secpolicy_read, DATASET_NAME, B_FALSE,
-           B_TRUE },
+           POOL_CHECK_SUSPENDED },
        { zfs_ioc_snapshot_list_next, zfs_secpolicy_read, DATASET_NAME, B_FALSE,
-           B_TRUE },
-       { zfs_ioc_set_prop, zfs_secpolicy_none, DATASET_NAME, B_TRUE, B_TRUE },
-       { zfs_ioc_create, zfs_secpolicy_create, DATASET_NAME, B_TRUE, B_TRUE },
+           POOL_CHECK_SUSPENDED },
+       { zfs_ioc_set_prop, zfs_secpolicy_none, DATASET_NAME, B_TRUE,
+           POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY },
+       { zfs_ioc_create, zfs_secpolicy_create, DATASET_NAME, B_TRUE,
+           POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY },
        { zfs_ioc_destroy, zfs_secpolicy_destroy, DATASET_NAME, B_TRUE,
-           B_TRUE},
+           POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY },
        { zfs_ioc_rollback, zfs_secpolicy_rollback, DATASET_NAME, B_TRUE,
-           B_TRUE },
-       { zfs_ioc_rename, zfs_secpolicy_rename, DATASET_NAME, B_TRUE, B_TRUE },
-       { zfs_ioc_recv, zfs_secpolicy_receive, DATASET_NAME, B_TRUE, B_TRUE },
-       { zfs_ioc_send, zfs_secpolicy_send, DATASET_NAME, B_TRUE, B_FALSE },
+           POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY },
+       { zfs_ioc_rename, zfs_secpolicy_rename, DATASET_NAME, B_TRUE,
+           POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY },
+       { zfs_ioc_recv, zfs_secpolicy_receive, DATASET_NAME, B_TRUE,
+           POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY },
+       { zfs_ioc_send, zfs_secpolicy_send, DATASET_NAME, B_TRUE,
+           POOL_CHECK_NONE },
        { zfs_ioc_inject_fault, zfs_secpolicy_inject, NO_NAME, B_FALSE,
-           B_FALSE },
+           POOL_CHECK_NONE },
        { zfs_ioc_clear_fault, zfs_secpolicy_inject, NO_NAME, B_FALSE,
-           B_FALSE },
+           POOL_CHECK_NONE },
        { zfs_ioc_inject_list_next, zfs_secpolicy_inject, NO_NAME, B_FALSE,
-           B_FALSE },
+           POOL_CHECK_NONE },
        { zfs_ioc_error_log, zfs_secpolicy_inject, POOL_NAME, B_FALSE,
-           B_FALSE },
-       { zfs_ioc_clear, zfs_secpolicy_config, POOL_NAME, B_TRUE, B_FALSE },
+           POOL_CHECK_NONE },
+       { zfs_ioc_clear, zfs_secpolicy_config, POOL_NAME, B_TRUE,
+           POOL_CHECK_NONE },
        { zfs_ioc_promote, zfs_secpolicy_promote, DATASET_NAME, B_TRUE,
-           B_TRUE },
+           POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY },
        { zfs_ioc_destroy_snaps, zfs_secpolicy_destroy_snaps, DATASET_NAME,
-           B_TRUE, B_TRUE },
+           B_TRUE, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY },
        { zfs_ioc_snapshot, zfs_secpolicy_snapshot, DATASET_NAME, B_TRUE,
-           B_TRUE },
-       { zfs_ioc_dsobj_to_dsname, zfs_secpolicy_config, POOL_NAME, B_FALSE,
-           B_FALSE },
-       { zfs_ioc_obj_to_path, zfs_secpolicy_config, DATASET_NAME, B_FALSE,
-           B_TRUE },
+           POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY },
+       { zfs_ioc_dsobj_to_dsname, zfs_secpolicy_diff, POOL_NAME, B_FALSE,
+           POOL_CHECK_NONE },
+       { zfs_ioc_obj_to_path, zfs_secpolicy_diff, DATASET_NAME, B_FALSE,
+           POOL_CHECK_SUSPENDED },
        { zfs_ioc_pool_set_props, zfs_secpolicy_config, POOL_NAME, B_TRUE,
-           B_TRUE },
+           POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY },
        { zfs_ioc_pool_get_props, zfs_secpolicy_read, POOL_NAME, B_FALSE,
-           B_FALSE },
+           POOL_CHECK_NONE },
        { zfs_ioc_set_fsacl, zfs_secpolicy_fsacl, DATASET_NAME, B_TRUE,
-           B_TRUE },
+           POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY },
        { zfs_ioc_get_fsacl, zfs_secpolicy_read, DATASET_NAME, B_FALSE,
-           B_FALSE },
-       { zfs_ioc_share, zfs_secpolicy_share, DATASET_NAME, B_FALSE, B_FALSE },
+           POOL_CHECK_NONE },
+       { zfs_ioc_share, zfs_secpolicy_share, DATASET_NAME, B_FALSE,
+           POOL_CHECK_NONE },
        { zfs_ioc_inherit_prop, zfs_secpolicy_inherit, DATASET_NAME, B_TRUE,
-           B_TRUE },
+           POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY },
        { zfs_ioc_smb_acl, zfs_secpolicy_smb_acl, DATASET_NAME, B_FALSE,
-           B_FALSE },
-       { zfs_ioc_userspace_one, zfs_secpolicy_userspace_one,
-           DATASET_NAME, B_FALSE, B_FALSE },
-       { zfs_ioc_userspace_many, zfs_secpolicy_userspace_many,
-           DATASET_NAME, B_FALSE, B_FALSE },
+           POOL_CHECK_NONE },
+       { zfs_ioc_userspace_one, zfs_secpolicy_userspace_one, DATASET_NAME,
+           B_FALSE, POOL_CHECK_NONE },
+       { zfs_ioc_userspace_many, zfs_secpolicy_userspace_many, DATASET_NAME,
+           B_FALSE, POOL_CHECK_NONE },
        { zfs_ioc_userspace_upgrade, zfs_secpolicy_userspace_upgrade,
-           DATASET_NAME, B_FALSE, B_TRUE },
-       { zfs_ioc_hold, zfs_secpolicy_hold, DATASET_NAME, B_TRUE, B_TRUE },
+           DATASET_NAME, B_FALSE, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY },
+       { zfs_ioc_hold, zfs_secpolicy_hold, DATASET_NAME, B_TRUE,
+           POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY },
        { zfs_ioc_release, zfs_secpolicy_release, DATASET_NAME, B_TRUE,
-           B_TRUE },
+           POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY },
        { zfs_ioc_get_holds, zfs_secpolicy_read, DATASET_NAME, B_FALSE,
-           B_TRUE },
+           POOL_CHECK_SUSPENDED },
        { zfs_ioc_objset_recvd_props, zfs_secpolicy_read, DATASET_NAME, B_FALSE,
-           B_FALSE },
+           POOL_CHECK_NONE },
        { zfs_ioc_vdev_split, zfs_secpolicy_config, POOL_NAME, B_TRUE,
-           B_TRUE }
+           POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY },
+       { zfs_ioc_next_obj, zfs_secpolicy_read, DATASET_NAME, B_FALSE,
+           POOL_CHECK_NONE },
+       { zfs_ioc_diff, zfs_secpolicy_diff, DATASET_NAME, B_FALSE,
+           POOL_CHECK_NONE },
+       { zfs_ioc_tmp_snapshot, zfs_secpolicy_tmp_snapshot, DATASET_NAME,
+           B_FALSE, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY },
+       { zfs_ioc_obj_to_stats, zfs_secpolicy_diff, DATASET_NAME, B_FALSE,
+           POOL_CHECK_SUSPENDED }
 };
 
 int
-pool_status_check(const char *name, zfs_ioc_namecheck_t type)
+pool_status_check(const char *name, zfs_ioc_namecheck_t type,
+    zfs_ioc_poolcheck_t check)
 {
        spa_t *spa;
        int error;
 
        ASSERT(type == POOL_NAME || type == DATASET_NAME);
 
+       if (check & POOL_CHECK_NONE)
+               return (0);
+
        error = spa_open(name, &spa, FTAG);
        if (error == 0) {
-               if (spa_suspended(spa))
+               if ((check & POOL_CHECK_SUSPENDED) && spa_suspended(spa))
                        error = EAGAIN;
+               else if ((check & POOL_CHECK_READONLY) && !spa_writeable(spa))
+                       error = EROFS;
                spa_close(spa, FTAG);
        }
        return (error);
 }
 
+/*
+ * Find a free minor number.
+ *
+ * Candidates are probed round-robin over 1..ZFSDEV_MAX_MINOR, starting just
+ * after the minor returned by the previous successful call and finishing
+ * with that minor itself, so at most ZFSDEV_MAX_MINOR slots are examined.
+ * Minor 0 is never handed out; it doubles as the "nothing free" result.
+ *
+ * The caller must hold zfsdev_state_lock.
+ *
+ * Returns the first probed minor that has no soft state attached, or 0 if
+ * every minor is in use.
+ */
+minor_t
+zfsdev_minor_alloc(void)
+{
+       static minor_t last_minor;
+       minor_t m = last_minor;
+       minor_t probes;
+
+       ASSERT(MUTEX_HELD(&zfsdev_state_lock));
+
+       /*
+        * Bound the scan by a probe count instead of comparing against
+        * last_minor.  That way the previously returned minor, which may
+        * have been freed in the meantime, is reconsidered as the final
+        * candidate, and the loop terminates even while last_minor is 0.
+        */
+       for (probes = 0; probes < ZFSDEV_MAX_MINOR; probes++) {
+               if (++m > ZFSDEV_MAX_MINOR)
+                       m = 1;
+               if (ddi_get_soft_state(zfsdev_state, m) == NULL) {
+                       last_minor = m;
+                       return (m);
+               }
+       }
+
+       return (0);
+}
+
+/*
+ * Give an open of the control device (minor 0) a minor of its own.
+ *
+ * A free minor is allocated, a soft state of type ZSST_CTLDEV is attached to
+ * it with its zss_data set up by zfs_onexit_init(), and *devp is rewritten
+ * to name the new minor.  The caller must hold zfsdev_state_lock.
+ *
+ * Returns 0 on success, ENXIO if no minor is free, or EAGAIN if the soft
+ * state cannot be allocated.
+ */
+static int
+zfs_ctldev_init(dev_t *devp)
+{
+       zfs_soft_state_t *state;
+       minor_t m;
+
+       ASSERT(MUTEX_HELD(&zfsdev_state_lock));
+       ASSERT(getminor(*devp) == 0);
+
+       if ((m = zfsdev_minor_alloc()) == 0)
+               return (ENXIO);
+
+       if (ddi_soft_state_zalloc(zfsdev_state, m) != DDI_SUCCESS)
+               return (EAGAIN);
+
+       state = ddi_get_soft_state(zfsdev_state, m);
+       state->zss_type = ZSST_CTLDEV;
+       zfs_onexit_init((zfs_onexit_t **)&state->zss_data);
+
+       /* Same major, new minor. */
+       *devp = makedevice(getemajor(*devp), m);
+
+       return (0);
+}
+
+static void    /* Free one ctldev minor's onexit data and soft state. */
+zfs_ctldev_destroy(zfs_onexit_t *zo, minor_t minor)
+{
+       ASSERT(MUTEX_HELD(&zfsdev_state_lock)); /* caller holds the lock */
+
+       zfs_onexit_destroy(zo);         /* 'zo' is not used after this call */
+       ddi_soft_state_free(zfsdev_state, minor);
+}
+
+/*
+ * Return the zss_data pointer kept in the soft state for 'minor', or NULL
+ * if the minor has no soft state or its zss_type differs from 'which'.
+ */
+void *
+zfsdev_get_soft_state(minor_t minor, enum zfs_soft_state_type which)
+{
+       zfs_soft_state_t *state = ddi_get_soft_state(zfsdev_state, minor);
+
+       if (state != NULL && state->zss_type == which)
+               return (state->zss_data);
+
+       return (NULL);
+}
+
+/*
+ * Open entry point shared by zvols and the control device.
+ *
+ * A non-zero minor is passed straight to zvol_open().  For minor 0, an open
+ * with FEXCL set is given a minor of its own by zfs_ctldev_init(), which
+ * rewrites *devp; any other open of minor 0 succeeds with no further work.
+ */
+static int
+zfsdev_open(dev_t *devp, int flag, int otyp, cred_t *cr)
+{
+       int err;
+
+       if (getminor(*devp) != 0)
+               return (zvol_open(devp, flag, otyp, cr));
+
+       /* This is the control device; without FEXCL there is nothing to do. */
+       if ((flag & FEXCL) == 0)
+               return (0);
+
+       /* A new minor was requested: allocate it under the state lock. */
+       mutex_enter(&zfsdev_state_lock);
+       err = zfs_ctldev_init(devp);
+       mutex_exit(&zfsdev_state_lock);
+
+       return (err);
+}
+
+/*
+ * Close entry point shared by zvols and the control device.
+ *
+ * Minor 0 has nothing to release.  If zfsdev_get_soft_state() finds
+ * ZSST_CTLDEV data for the minor, it is torn down with zfs_ctldev_destroy()
+ * under zfsdev_state_lock; every other minor is handed to zvol_close().
+ */
+static int
+zfsdev_close(dev_t dev, int flag, int otyp, cred_t *cr)
+{
+       minor_t minor = getminor(dev);
+       zfs_onexit_t *onexit;
+
+       if (minor == 0)
+               return (0);
+
+       mutex_enter(&zfsdev_state_lock);
+       onexit = zfsdev_get_soft_state(minor, ZSST_CTLDEV);
+       if (onexit != NULL) {
+               zfs_ctldev_destroy(onexit, minor);
+               mutex_exit(&zfsdev_state_lock);
+               return (0);
+       }
+       mutex_exit(&zfsdev_state_lock);
+
+       /* No control-device data for this minor: let the zvol code close it. */
+       return (zvol_close(dev, flag, otyp, cr));
+}
+
 static int
 zfsdev_ioctl(dev_t dev, int cmd, intptr_t arg, int flag, cred_t *cr, int *rvalp)
 {
        zfs_cmd_t *zc;
        uint_t vec;
        int error, rc;
+       minor_t minor = getminor(dev);
 
-       if (getminor(dev) != 0)
+       if (minor != 0 &&
+           zfsdev_get_soft_state(minor, ZSST_CTLDEV) == NULL)
                return (zvol_ioctl(dev, cmd, arg, flag, cr, rvalp));
 
        vec = cmd - ZFS_IOC;
@@ -4405,17 +4914,19 @@ zfsdev_ioctl(dev_t dev, int cmd, intptr_t arg, int flag, cred_t *cr, int *rvalp)
                 case POOL_NAME:
                         if (pool_namecheck(zc->zc_name, NULL, NULL) != 0)
                                 error = EINVAL;
-                       if (zfs_ioc_vec[vec].zvec_pool_check)
-                               error = pool_status_check(zc->zc_name,
-                                   zfs_ioc_vec[vec].zvec_namecheck);
+                       else    /* a malformed name must keep its EINVAL */
+                               error = pool_status_check(zc->zc_name,
+                                   zfs_ioc_vec[vec].zvec_namecheck,
+                                   zfs_ioc_vec[vec].zvec_pool_check);
                         break;
 
                 case DATASET_NAME:
                         if (dataset_namecheck(zc->zc_name, NULL, NULL) != 0)
                                 error = EINVAL;
-                       if (zfs_ioc_vec[vec].zvec_pool_check)
-                               error = pool_status_check(zc->zc_name,
-                                   zfs_ioc_vec[vec].zvec_namecheck);
+                       else    /* a malformed name must keep its EINVAL */
+                               error = pool_status_check(zc->zc_name,
+                                   zfs_ioc_vec[vec].zvec_namecheck,
+                                   zfs_ioc_vec[vec].zvec_pool_check);
                         break;
 
                case NO_NAME:
@@ -4499,8 +5008,8 @@ zfs_info(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result)
  * so most of the standard driver entry points are in zvol.c.
  */
 static struct cb_ops zfs_cb_ops = {
-       zvol_open,      /* open */
-       zvol_close,     /* close */
+       zfsdev_open,    /* open */
+       zfsdev_close,   /* close */
        zvol_strategy,  /* strategy */
        nodev,          /* print */
        zvol_dump,      /* dump */
index bf9f37bcab5bbcbab6e00964c0687a4f610b45e5..26ab78279b31cc0edac6247b425230c32f45b58e 100644 (file)
@@ -19,8 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 
 #include <sys/types.h>
@@ -170,6 +169,12 @@ zfs_log_xvattr(lr_attr_t *lrattr, xvattr_t *xvap)
        if (XVA_ISSET_REQ(xvap, XAT_REPARSE))
                *attrs |= (xoap->xoa_reparse == 0) ? 0 :
                    XAT0_REPARSE;
+       if (XVA_ISSET_REQ(xvap, XAT_OFFLINE))
+               *attrs |= (xoap->xoa_offline == 0) ? 0 :
+                   XAT0_OFFLINE;
+       if (XVA_ISSET_REQ(xvap, XAT_SPARSE))
+               *attrs |= (xoap->xoa_sparse == 0) ? 0 :
+                   XAT0_SPARSE;
 }
 
 static void *
@@ -231,7 +236,6 @@ zfs_log_create(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype,
     zfs_fuid_info_t *fuidp, vattr_t *vap)
 {
        itx_t *itx;
-       uint64_t seq;
        lr_create_t *lr;
        lr_acl_create_t *lracl;
        size_t aclsize;
@@ -333,9 +337,7 @@ zfs_log_create(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype,
         */
        bcopy(name, end, namesize);
 
-       seq = zil_itx_assign(zilog, itx, tx);
-       dzp->z_last_itx = seq;
-       zp->z_last_itx = seq;
+       zil_itx_assign(zilog, itx, tx);
 }
 
 /*
@@ -343,10 +345,9 @@ zfs_log_create(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype,
  */
 void
 zfs_log_remove(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype,
-       znode_t *dzp, char *name)
+       znode_t *dzp, char *name, uint64_t foid)
 {
        itx_t *itx;
-       uint64_t seq;
        lr_remove_t *lr;
        size_t namesize = strlen(name) + 1;
 
@@ -358,8 +359,9 @@ zfs_log_remove(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype,
        lr->lr_doid = dzp->z_id;
        bcopy(name, (char *)(lr + 1), namesize);
 
-       seq = zil_itx_assign(zilog, itx, tx);
-       dzp->z_last_itx = seq;
+       itx->itx_oid = foid;
+
+       zil_itx_assign(zilog, itx, tx);
 }
 
 /*
@@ -370,7 +372,6 @@ zfs_log_link(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype,
        znode_t *dzp, znode_t *zp, char *name)
 {
        itx_t *itx;
-       uint64_t seq;
        lr_link_t *lr;
        size_t namesize = strlen(name) + 1;
 
@@ -383,9 +384,7 @@ zfs_log_link(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype,
        lr->lr_link_obj = zp->z_id;
        bcopy(name, (char *)(lr + 1), namesize);
 
-       seq = zil_itx_assign(zilog, itx, tx);
-       dzp->z_last_itx = seq;
-       zp->z_last_itx = seq;
+       zil_itx_assign(zilog, itx, tx);
 }
 
 /*
@@ -396,7 +395,6 @@ zfs_log_symlink(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype,
     znode_t *dzp, znode_t *zp, char *name, char *link)
 {
        itx_t *itx;
-       uint64_t seq;
        lr_create_t *lr;
        size_t namesize = strlen(name) + 1;
        size_t linksize = strlen(link) + 1;
@@ -418,9 +416,7 @@ zfs_log_symlink(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype,
        bcopy(name, (char *)(lr + 1), namesize);
        bcopy(link, (char *)(lr + 1) + namesize, linksize);
 
-       seq = zil_itx_assign(zilog, itx, tx);
-       dzp->z_last_itx = seq;
-       zp->z_last_itx = seq;
+       zil_itx_assign(zilog, itx, tx);
 }
 
 /*
@@ -431,7 +427,6 @@ zfs_log_rename(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype,
        znode_t *sdzp, char *sname, znode_t *tdzp, char *dname, znode_t *szp)
 {
        itx_t *itx;
-       uint64_t seq;
        lr_rename_t *lr;
        size_t snamesize = strlen(sname) + 1;
        size_t dnamesize = strlen(dname) + 1;
@@ -445,11 +440,9 @@ zfs_log_rename(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype,
        lr->lr_tdoid = tdzp->z_id;
        bcopy(sname, (char *)(lr + 1), snamesize);
        bcopy(dname, (char *)(lr + 1) + snamesize, dnamesize);
+       itx->itx_oid = szp->z_id;
 
-       seq = zil_itx_assign(zilog, itx, tx);
-       sdzp->z_last_itx = seq;
-       tdzp->z_last_itx = seq;
-       szp->z_last_itx = seq;
+       zil_itx_assign(zilog, itx, tx);
 }
 
 /*
@@ -520,13 +513,11 @@ zfs_log_write(zilog_t *zilog, dmu_tx_t *tx, int txtype,
 
                itx->itx_private = zp->z_zfsvfs;
 
-               if ((zp->z_sync_cnt != 0) || (fsync_cnt != 0) ||
-                   (ioflag & (FSYNC | FDSYNC)))
-                       itx->itx_sync = B_TRUE;
-               else
+               if (!(ioflag & (FSYNC | FDSYNC)) && (zp->z_sync_cnt == 0) &&
+                   (fsync_cnt == 0))
                        itx->itx_sync = B_FALSE;
 
-               zp->z_last_itx = zil_itx_assign(zilog, itx, tx);
+               zil_itx_assign(zilog, itx, tx);
 
                off += len;
                resid -= len;
@@ -541,7 +532,6 @@ zfs_log_truncate(zilog_t *zilog, dmu_tx_t *tx, int txtype,
        znode_t *zp, uint64_t off, uint64_t len)
 {
        itx_t *itx;
-       uint64_t seq;
        lr_truncate_t *lr;
 
        if (zil_replaying(zilog, tx) || zp->z_unlinked)
@@ -554,8 +544,7 @@ zfs_log_truncate(zilog_t *zilog, dmu_tx_t *tx, int txtype,
        lr->lr_length = len;
 
        itx->itx_sync = (zp->z_sync_cnt != 0);
-       seq = zil_itx_assign(zilog, itx, tx);
-       zp->z_last_itx = seq;
+       zil_itx_assign(zilog, itx, tx);
 }
 
 /*
@@ -566,7 +555,6 @@ zfs_log_setattr(zilog_t *zilog, dmu_tx_t *tx, int txtype,
        znode_t *zp, vattr_t *vap, uint_t mask_applied, zfs_fuid_info_t *fuidp)
 {
        itx_t           *itx;
-       uint64_t        seq;
        lr_setattr_t    *lr;
        xvattr_t        *xvap = (xvattr_t *)vap;
        size_t          recsize = sizeof (lr_setattr_t);
@@ -618,8 +606,7 @@ zfs_log_setattr(zilog_t *zilog, dmu_tx_t *tx, int txtype,
                (void) zfs_log_fuid_domains(fuidp, start);
 
        itx->itx_sync = (zp->z_sync_cnt != 0);
-       seq = zil_itx_assign(zilog, itx, tx);
-       zp->z_last_itx = seq;
+       zil_itx_assign(zilog, itx, tx);
 }
 
 /*
@@ -630,7 +617,6 @@ zfs_log_acl(zilog_t *zilog, dmu_tx_t *tx, znode_t *zp,
     vsecattr_t *vsecp, zfs_fuid_info_t *fuidp)
 {
        itx_t *itx;
-       uint64_t seq;
        lr_acl_v0_t *lrv0;
        lr_acl_t *lr;
        int txtype;
@@ -686,6 +672,5 @@ zfs_log_acl(zilog_t *zilog, dmu_tx_t *tx, znode_t *zp,
        }
 
        itx->itx_sync = (zp->z_sync_cnt != 0);
-       seq = zil_itx_assign(zilog, itx, tx);
-       zp->z_last_itx = seq;
+       zil_itx_assign(zilog, itx, tx);
 }
diff --git a/module/zfs/zfs_onexit.c b/module/zfs/zfs_onexit.c
new file mode 100644 (file)
index 0000000..9706de2
--- /dev/null
@@ -0,0 +1,246 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/errno.h>
+#include <sys/open.h>
+#include <sys/kmem.h>
+#include <sys/conf.h>
+#include <sys/ddi.h>
+#include <sys/sunddi.h>
+#include <sys/zfs_ioctl.h>
+#include <sys/mkdev.h>
+#include <sys/zfs_onexit.h>
+#include <sys/zvol.h>
+
+/*
+ * ZFS kernel routines may add/delete callback routines to be invoked
+ * upon process exit (triggered via the close operation from the /dev/zfs
+ * driver).
+ *
+ * These cleanup callbacks are intended to allow for the accumulation
+ * of kernel state across multiple ioctls.  User processes participate
+ * by opening ZFS_DEV with O_EXCL. This causes the ZFS driver to do a
+ * clone-open, generating a unique minor number. The process then passes
+ * along that file descriptor to each ioctl that might have a cleanup operation.
+ *
+ * Consumers of the onexit routines should call zfs_onexit_fd_hold() early
+ * on to validate the given fd and add a reference to its file table entry.
+ * This allows the consumer to do its work and then add a callback, knowing
+ * that zfs_onexit_add_cb() won't fail with EBADF.  When finished, consumers
+ * should call zfs_onexit_fd_rele().
+ *
+ * A simple example is zfs_ioc_recv(), where we might create an AVL tree
+ * with dataset/GUID mappings and then reuse that tree on subsequent
+ * zfs_ioc_recv() calls.
+ *
+ * On the first zfs_ioc_recv() call, dmu_recv_stream() will kmem_alloc()
+ * the AVL tree and pass it along with a callback function to
+ * zfs_onexit_add_cb(). The zfs_onexit_add_cb() routine will register the
+ * callback and return an action handle.
+ *
+ * The action handle is then passed from user space to subsequent
+ * zfs_ioc_recv() calls, so that dmu_recv_stream() can fetch its AVL tree
+ * by calling zfs_onexit_cb_data() with the device minor number and
+ * action handle.
+ *
+ * If the user process exits abnormally, the callback is invoked implicitly
+ * as part of the driver close operation.  Once the user space process is
+ * finished with the accumulated kernel state, it can also just call close(2)
+ * on the cleanup fd to trigger the cleanup callback.
+ */
+
+void
+zfs_onexit_init(zfs_onexit_t **zop)
+{
+       zfs_onexit_t *state = kmem_zalloc(sizeof (zfs_onexit_t), KM_SLEEP);
+
+       *zop = state;
+       mutex_init(&state->zo_lock, NULL, MUTEX_DEFAULT, NULL);
+       list_create(&state->zo_actions, sizeof (zfs_onexit_action_node_t),
+           offsetof(zfs_onexit_action_node_t, za_link));
+}
+
+void
+zfs_onexit_destroy(zfs_onexit_t *zo)
+{
+       list_t *actions = &zo->zo_actions;
+       zfs_onexit_action_node_t *node;
+
+       mutex_enter(&zo->zo_lock);
+       while ((node = list_head(actions)) != NULL) {
+               list_remove(actions, node);
+               mutex_exit(&zo->zo_lock);       /* run callback unlocked */
+               node->za_func(node->za_data);
+               kmem_free(node, sizeof (zfs_onexit_action_node_t));
+               mutex_enter(&zo->zo_lock);
+       }
+       mutex_exit(&zo->zo_lock);
+       list_destroy(actions);
+       mutex_destroy(&zo->zo_lock);
+       kmem_free(zo, sizeof (zfs_onexit_t));
+}
+
+static int
+zfs_onexit_minor_to_state(minor_t minor, zfs_onexit_t **zo)
+{
+       zfs_onexit_t *state = zfsdev_get_soft_state(minor, ZSST_CTLDEV);
+
+       /* The lookup result is stored even when it is NULL. */
+       *zo = state;
+       return (state == NULL ? EBADF : 0);
+}
+
+/*
+ * Consumers might need to operate by minor number instead of fd, since
+ * they might be running in another thread (e.g. txg_sync_thread). Callers
+ * of this function must call zfs_onexit_fd_rele() when they're finished
+ * using the minor number.
+ */
+int
+zfs_onexit_fd_hold(int fd, minor_t *minorp)
+{
+       file_t *fp;
+       zfs_onexit_t *zo;
+
+       fp = getf(fd);
+       if (fp == NULL)
+               return (EBADF);
+       /* NOTE(review): getf() hold is kept even when the lookup fails. */
+       *minorp = getminor(fp->f_vnode->v_rdev);
+       return (zfs_onexit_minor_to_state(*minorp, &zo));
+}
+
+void
+zfs_onexit_fd_rele(int fd)
+{
+       releasef(fd);   /* pairs with getf() in zfs_onexit_fd_hold() */
+}
+
+/*
+ * Register func(data) to run when the onexit state for 'minor' is torn
+ * down.  A non-NULL action_handle receives an opaque handle for the entry.
+ */
+int
+zfs_onexit_add_cb(minor_t minor, void (*func)(void *), void *data,
+    uint64_t *action_handle)
+{
+       zfs_onexit_t *zo;
+       zfs_onexit_action_node_t *node;
+       int error = zfs_onexit_minor_to_state(minor, &zo);
+
+       if (error != 0)
+               return (error);
+
+       node = kmem_alloc(sizeof (zfs_onexit_action_node_t), KM_SLEEP);
+       list_link_init(&node->za_link);
+       node->za_func = func;
+       node->za_data = data;
+
+       mutex_enter(&zo->zo_lock);
+       list_insert_tail(&zo->zo_actions, node);
+       mutex_exit(&zo->zo_lock);
+
+       if (action_handle != NULL)
+               *action_handle = (uint64_t)(uintptr_t)node;
+       return (0);
+}
+
+static zfs_onexit_action_node_t *
+zfs_onexit_find_cb(zfs_onexit_t *zo, uint64_t action_handle)
+{
+       list_t *actions = &zo->zo_actions;
+       zfs_onexit_action_node_t *wanted;
+       zfs_onexit_action_node_t *node;
+
+       ASSERT(MUTEX_HELD(&zo->zo_lock));
+
+       /* The handle is only compared, never dereferenced. */
+       wanted = (zfs_onexit_action_node_t *)(uintptr_t)action_handle;
+       node = list_head(actions);
+       while (node != NULL && node != wanted)
+               node = list_next(actions, node);
+
+       return (node);
+}
+
+/*
+ * Unregister the callback named by action_handle and free its node; when
+ * 'fire' is set the callback is invoked (without zo_lock held) beforehand.
+ * Returns ENOENT if the handle is not registered for this minor.
+ */
+int
+zfs_onexit_del_cb(minor_t minor, uint64_t action_handle, boolean_t fire)
+{
+       zfs_onexit_t *zo;
+       zfs_onexit_action_node_t *node;
+       int error = zfs_onexit_minor_to_state(minor, &zo);
+
+       if (error != 0)
+               return (error);
+
+       mutex_enter(&zo->zo_lock);
+       node = zfs_onexit_find_cb(zo, action_handle);
+       if (node != NULL)
+               list_remove(&zo->zo_actions, node);
+       mutex_exit(&zo->zo_lock);
+
+       if (node == NULL)
+               return (ENOENT);
+
+       if (fire)
+               node->za_func(node->za_data);
+       kmem_free(node, sizeof (zfs_onexit_action_node_t));
+       return (0);
+}
+
+/*
+ * Fetch the data pointer registered under action_handle.  This lets
+ * consumers of the cleanup-on-exit interfaces stash kernel state across
+ * system calls, knowing it will be cleaned up if the calling process exits.
+ * On failure *data is left NULL; ENOENT means the handle is not registered.
+ */
+int
+zfs_onexit_cb_data(minor_t minor, uint64_t action_handle, void **data)
+{
+       zfs_onexit_t *zo;
+       zfs_onexit_action_node_t *node;
+       int error;
+
+       *data = NULL;
+
+       if ((error = zfs_onexit_minor_to_state(minor, &zo)) != 0)
+               return (error);
+
+       mutex_enter(&zo->zo_lock);
+       node = zfs_onexit_find_cb(zo, action_handle);
+       if (node == NULL)
+               error = ENOENT;
+       else
+               *data = node->za_data;
+       mutex_exit(&zo->zo_lock);
+
+       return (error);
+}
index f26009b02c59bad32d324663d5f9705a63f889cf..9fb3368569906b7e40604249ab5b7899cd9d8f76 100644 (file)
@@ -19,8 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 
 #include <sys/types.h>
@@ -129,6 +128,10 @@ zfs_replay_xvattr(lr_attr_t *lrattr, xvattr_t *xvap)
                bcopy(scanstamp, xoap->xoa_av_scanstamp, AV_SCANSTAMP_SZ);
        if (XVA_ISSET_REQ(xvap, XAT_REPARSE))
                xoap->xoa_reparse = ((*attrs & XAT0_REPARSE) != 0);
+       if (XVA_ISSET_REQ(xvap, XAT_OFFLINE))
+               xoap->xoa_offline = ((*attrs & XAT0_OFFLINE) != 0);
+       if (XVA_ISSET_REQ(xvap, XAT_SPARSE))
+               xoap->xoa_sparse = ((*attrs & XAT0_SPARSE) != 0);
 }
 
 static int
@@ -625,7 +628,7 @@ zfs_replay_write(zfsvfs_t *zfsvfs, lr_write_t *lr, boolean_t byteswap)
        znode_t *zp;
        int error;
        ssize_t resid;
-       uint64_t orig_eof, eod, offset, length;
+       uint64_t eod, offset, length;
 
        if (byteswap)
                byteswap_uint64_array(lr, sizeof (*lr));
@@ -643,9 +646,20 @@ zfs_replay_write(zfsvfs_t *zfsvfs, lr_write_t *lr, boolean_t byteswap)
 
        offset = lr->lr_offset;
        length = lr->lr_length;
-       eod = offset + length;          /* end of data for this write */
+       eod = offset + length;  /* end of data for this write */
 
-       orig_eof = zp->z_size;
+       /*
+        * This may be a write from a dmu_sync() for a whole block,
+        * and may extend beyond the current end of the file.
+        * We can't just replay what was written for this TX_WRITE as
+        * a future TX_WRITE2 may extend the eof and the data for that
+        * write needs to be there. So we write the whole block and
+        * reduce the eof. This needs to be done within the single dmu
+        * transaction created within vn_rdwr -> zfs_write. So a possible
+        * new end of file is passed through in zfsvfs->z_replay_eof.
+        */
+
+       zfsvfs->z_replay_eof = 0; /* 0 means don't change end of file */
 
        /* If it's a dmu_sync() block, write the whole block */
        if (lr->lr_common.lrc_reclen == sizeof (lr_write_t)) {
@@ -654,23 +668,15 @@ zfs_replay_write(zfsvfs_t *zfsvfs, lr_write_t *lr, boolean_t byteswap)
                        offset -= offset % blocksize;
                        length = blocksize;
                }
+               if (zp->z_size < eod)
+                       zfsvfs->z_replay_eof = eod;
        }
 
        error = vn_rdwr(UIO_WRITE, ZTOV(zp), data, length, offset,
            UIO_SYSSPACE, 0, RLIM64_INFINITY, kcred, &resid);
 
-       /*
-        * This may be a write from a dmu_sync() for a whole block,
-        * and may extend beyond the current end of the file.
-        * We can't just replay what was written for this TX_WRITE as
-        * a future TX_WRITE2 may extend the eof and the data for that
-        * write needs to be there. So we write the whole block and
-        * reduce the eof.
-        */
-       if (orig_eof < zp->z_size) /* file length grew ? */
-               zp->z_size = eod;
-
        VN_RELE(ZTOV(zp));
+       zfsvfs->z_replay_eof = 0;       /* safety */
 
        return (error);
 }
@@ -694,10 +700,31 @@ zfs_replay_write2(zfsvfs_t *zfsvfs, lr_write_t *lr, boolean_t byteswap)
        if ((error = zfs_zget(zfsvfs, lr->lr_foid, &zp)) != 0)
                return (error);
 
+top:
        end = lr->lr_offset + lr->lr_length;
        if (end > zp->z_size) {
-               ASSERT3U(end - zp->z_size, <, zp->z_blksz);
+               dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os);
+
                zp->z_size = end;
+               dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
+               error = dmu_tx_assign(tx, TXG_WAIT);
+               if (error) {
+                       VN_RELE(ZTOV(zp));
+                       if (error == ERESTART) {
+                               dmu_tx_wait(tx);
+                               dmu_tx_abort(tx);
+                               goto top;
+                       }
+                       dmu_tx_abort(tx);
+                       return (error);
+               }
+               (void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs),
+                   (void *)&zp->z_size, sizeof (uint64_t), tx);
+
+               /* Ensure the replayed seq is updated */
+               (void) zil_replaying(zfsvfs->z_log, tx);
+
+               dmu_tx_commit(tx);
        }
 
        VN_RELE(ZTOV(zp));
index 73a40aa4fe8326171ca558836fadc03c0b1b8dcd..d141e43d722a06eff607fb8f700e8b3371c835ae 100644 (file)
@@ -125,6 +125,7 @@ zfs_sa_get_scanstamp(znode_t *zp, xvattr_t *xvap)
        zfsvfs_t *zfsvfs = zp->z_zfsvfs;
        xoptattr_t *xoap;
 
+       ASSERT(MUTEX_HELD(&zp->z_lock));
        VERIFY((xoap = xva_getxoptattr(xvap)) != NULL);
        if (zp->z_is_sa) {
                if (sa_lookup(zp->z_sa_hdl, SA_ZPL_SCANSTAMP(zfsvfs),
@@ -158,6 +159,7 @@ zfs_sa_set_scanstamp(znode_t *zp, xvattr_t *xvap, dmu_tx_t *tx)
        zfsvfs_t *zfsvfs = zp->z_zfsvfs;
        xoptattr_t *xoap;
 
+       ASSERT(MUTEX_HELD(&zp->z_lock));
        VERIFY((xoap = xva_getxoptattr(xvap)) != NULL);
        if (zp->z_is_sa)
                VERIFY(0 == sa_update(zp->z_sa_hdl, SA_ZPL_SCANSTAMP(zfsvfs),
@@ -204,6 +206,7 @@ zfs_sa_upgrade(sa_handle_t *hdl, dmu_tx_t *tx)
        uint64_t crtime[2], mtime[2], ctime[2];
        zfs_acl_phys_t znode_acl;
        char scanstamp[AV_SCANSTAMP_SZ];
+       boolean_t drop_lock = B_FALSE;
 
        /*
         * No upgrade if ACL isn't cached
@@ -214,6 +217,22 @@ zfs_sa_upgrade(sa_handle_t *hdl, dmu_tx_t *tx)
        if (zp->z_acl_cached == NULL || ZTOV(zp)->v_type == VLNK)
                return;
 
+       /*
+        * If the z_lock is held and we aren't the owner
+        * then just return since we don't want to deadlock
+        * trying to update the status of z_is_sa.  This
+        * file can then be upgraded at a later time.
+        *
+        * Otherwise, we know we are doing the
+        * sa_update() that caused us to enter this function.
+        */
+       if (mutex_owner(&zp->z_lock) != curthread) {
+               if (mutex_tryenter(&zp->z_lock) == 0)
+                       return;
+               else
+                       drop_lock = B_TRUE;
+       }
+
        /* First do a bulk query of the attributes that aren't cached */
        SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16);
        SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16);
@@ -228,7 +247,7 @@ zfs_sa_upgrade(sa_handle_t *hdl, dmu_tx_t *tx)
            &znode_acl, 88);
 
        if (sa_bulk_lookup_locked(hdl, bulk, count) != 0)
-               return;
+               goto done;
 
 
        /*
@@ -269,9 +288,10 @@ zfs_sa_upgrade(sa_handle_t *hdl, dmu_tx_t *tx)
        locate.cb_aclp = zp->z_acl_cached;
        SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_DACL_ACES(zfsvfs),
            zfs_acl_data_locator, &locate, zp->z_acl_cached->z_acl_bytes);
+
        if (xattr)
-               SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_RDEV(zfsvfs),
-                   NULL, &rdev, 8);
+               SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_XATTR(zfsvfs),
+                   NULL, &xattr, 8);
 
        /* if scanstamp then add scanstamp */
 
@@ -291,6 +311,9 @@ zfs_sa_upgrade(sa_handle_t *hdl, dmu_tx_t *tx)
                    znode_acl.z_acl_extern_obj, tx));
 
        zp->z_is_sa = B_TRUE;
+done:
+       if (drop_lock)
+               mutex_exit(&zp->z_lock);
 }
 
 void
@@ -299,12 +322,11 @@ zfs_sa_upgrade_txholds(dmu_tx_t *tx, znode_t *zp)
        if (!zp->z_zfsvfs->z_use_sa || zp->z_is_sa)
                return;
 
-       ASSERT(!zp->z_is_sa);
 
        dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
 
-       if (ZFS_EXTERNAL_ACL(zp)) {
-               dmu_tx_hold_free(tx, ZFS_EXTERNAL_ACL(zp), 0,
+       if (zfs_external_acl(zp)) {
+               dmu_tx_hold_free(tx, zfs_external_acl(zp), 0,
                    DMU_OBJECT_END);
        }
 }
index f68dde85f82eddebda6e63a5e367b2ae08740c4c..cb8c1d086e702f09cbba8640f424581730f803a5 100644 (file)
@@ -166,7 +166,7 @@ zfs_sync(vfs_t *vfsp, short flag, cred_t *cr)
                }
 
                if (zfsvfs->z_log != NULL)
-                       zil_commit(zfsvfs->z_log, UINT64_MAX, 0);
+                       zil_commit(zfsvfs->z_log, 0);
 
                ZFS_EXIT(zfsvfs);
        } else {
@@ -417,7 +417,8 @@ zfs_register_callbacks(vfs_t *vfsp)
         * of mount options, we stash away the current values and
         * restore them after we register the callbacks.
         */
-       if (vfs_optionisset(vfsp, MNTOPT_RO, NULL)) {
+       if (vfs_optionisset(vfsp, MNTOPT_RO, NULL) ||
+           !spa_writeable(dmu_objset_spa(os))) {
                readonly = B_TRUE;
                do_readonly = B_TRUE;
        } else if (vfs_optionisset(vfsp, MNTOPT_RW, NULL)) {
@@ -821,23 +822,14 @@ zfs_owner_overquota(zfsvfs_t *zfsvfs, znode_t *zp, boolean_t isgroup)
 {
        uint64_t fuid;
        uint64_t quotaobj;
-       uid_t id;
 
        quotaobj = isgroup ? zfsvfs->z_groupquota_obj : zfsvfs->z_userquota_obj;
 
-       id = isgroup ? zp->z_gid : zp->z_uid;
+       fuid = isgroup ? zp->z_gid : zp->z_uid;
 
        if (quotaobj == 0 || zfsvfs->z_replay)
                return (B_FALSE);
 
-       if (IS_EPHEMERAL(id)) {
-               VERIFY(0 == sa_lookup(zp->z_sa_hdl,
-                   isgroup ? SA_ZPL_GID(zfsvfs) : SA_ZPL_UID(zfsvfs),
-                   &fuid, sizeof (fuid)));
-       } else {
-               fuid = (uint64_t)id;
-       }
-
        return (zfs_fuid_overquota(zfsvfs, isgroup, fuid));
 }
 
@@ -922,7 +914,10 @@ zfsvfs_create(const char *osname, zfsvfs_t **zfvp)
                sa_obj = 0;
        }
 
-       zfsvfs->z_attr_table = sa_setup(os, sa_obj, zfs_attr_table, ZPL_END);
+       error = sa_setup(os, sa_obj, zfs_attr_table, ZPL_END,
+           &zfsvfs->z_attr_table);
+       if (error)
+               goto out;
 
        if (zfsvfs->z_version >= ZPL_VERSION_SA)
                sa_register_update_callback(os, zfs_sa_upgrade);
@@ -1043,12 +1038,15 @@ zfsvfs_setup(zfsvfs_t *zfsvfs, boolean_t mounting)
                 * allocated and in the unlinked set, and there is an
                 * intent log record saying to allocate it.
                 */
-               if (zil_replay_disable) {
-                       zil_destroy(zfsvfs->z_log, B_FALSE);
-               } else {
-                       zfsvfs->z_replay = B_TRUE;
-                       zil_replay(zfsvfs->z_os, zfsvfs, zfs_replay_vector);
-                       zfsvfs->z_replay = B_FALSE;
+               if (spa_writeable(dmu_objset_spa(zfsvfs->z_os))) {
+                       if (zil_replay_disable) {
+                               zil_destroy(zfsvfs->z_log, B_FALSE);
+                       } else {
+                               zfsvfs->z_replay = B_TRUE;
+                               zil_replay(zfsvfs->z_os, zfsvfs,
+                                   zfs_replay_vector);
+                               zfsvfs->z_replay = B_FALSE;
+                       }
                }
                zfsvfs->z_vfs->vfs_flag |= readonly; /* restore readonly bit */
        }
@@ -1172,6 +1170,7 @@ zfs_domount(vfs_t *vfsp, char *osname)
                        goto out;
                xattr_changed_cb(zfsvfs, pval);
                zfsvfs->z_issnap = B_TRUE;
+               zfsvfs->z_os->os_sync = ZFS_SYNC_DISABLED;
 
                mutex_enter(&zfsvfs->z_os->os_user_ptr_lock);
                dmu_objset_set_user(zfsvfs->z_os, zfsvfs);
@@ -1808,10 +1807,10 @@ zfsvfs_teardown(zfsvfs_t *zfsvfs, boolean_t unmounting)
        /*
         * Evict cached data
         */
-       if (dmu_objset_evict_dbufs(zfsvfs->z_os)) {
-               txg_wait_synced(dmu_objset_pool(zfsvfs->z_os), 0);
-               (void) dmu_objset_evict_dbufs(zfsvfs->z_os);
-       }
+       if (dmu_objset_is_dirty_anywhere(zfsvfs->z_os))
+               if (!(zfsvfs->z_vfs->vfs_flag & VFS_RDONLY))
+                       txg_wait_synced(dmu_objset_pool(zfsvfs->z_os), 0);
+       (void) dmu_objset_evict_dbufs(zfsvfs->z_os);
 
        return (0);
 }
@@ -2031,8 +2030,9 @@ zfs_resume_fs(zfsvfs_t *zfsvfs, const char *osname)
                        goto bail;
 
 
-               zfsvfs->z_attr_table = sa_setup(zfsvfs->z_os, sa_obj,
-                   zfs_attr_table,  ZPL_END);
+               if ((err = sa_setup(zfsvfs->z_os, sa_obj,
+                   zfs_attr_table,  ZPL_END, &zfsvfs->z_attr_table)) != 0)
+                       goto bail;
 
                VERIFY(zfsvfs_setup(zfsvfs, B_FALSE) == 0);
 
@@ -2272,7 +2272,7 @@ static vfsdef_t vfw = {
        MNTTYPE_ZFS,
        zfs_vfsinit,
        VSW_HASPROTO|VSW_CANRWRO|VSW_CANREMOUNT|VSW_VOLATILEDEV|VSW_STATS|
-           VSW_XID,
+           VSW_XID|VSW_ZMOUNT,
        &zfs_mntopts
 };
 
index aa43c065f083d4a8f684c7bb9afe4c4f637eff02..a0720079cf4679583c3dd088d3f2604e4e0d86ef 100644 (file)
  *  (6)        At the end of each vnode op, the DMU tx must always commit,
  *     regardless of whether there were any errors.
  *
- *  (7)        After dropping all locks, invoke zil_commit(zilog, seq, foid)
+ *  (7)        After dropping all locks, invoke zil_commit(zilog, foid)
  *     to ensure that synchronous semantics are provided when necessary.
  *
  * In general, this is how things should be ordered in each vnode op:
  *     rw_exit(...);                   // drop locks
  *     zfs_dirent_unlock(dl);          // unlock directory entry
  *     VN_RELE(...);                   // release held vnodes
- *     zil_commit(zilog, seq, foid);   // synchronous when necessary
+ *     zil_commit(zilog, foid);        // synchronous when necessary
  *     ZFS_EXIT(zfsvfs);               // finished in zfs
  *     return (error);                 // done, report error
  */
@@ -490,7 +490,7 @@ zfs_read(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct)
         * If we're in FRSYNC mode, sync out this znode before reading it.
         */
        if (ioflag & FRSYNC || zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
-               zil_commit(zfsvfs->z_log, zp->z_last_itx, zp->z_id);
+               zil_commit(zfsvfs->z_log, zp->z_id);
 
        /*
         * Lock the range against changes.
@@ -670,7 +670,7 @@ zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct)
            (((xuio_t *)uio)->xu_type == UIOTYPE_ZEROCOPY))
                xuio = (xuio_t *)uio;
        else
-               uio_prefaultpages(n, uio);
+               uio_prefaultpages(MIN(n, max_blksz), uio);
 
        /*
         * If in append mode, set the io offset pointer to eof.
@@ -866,6 +866,8 @@ again:
                 * been done, but that would still expose the ISUID/ISGID
                 * to another app after the partial write is committed.
                 *
+                * Note: we don't call zfs_fuid_map_id() here because
+                * user 0 is not an ephemeral uid.
                 */
                mutex_enter(&zp->z_acl_lock);
                if ((zp->z_mode & (S_IXUSR | (S_IXUSR >> 3) |
@@ -893,6 +895,14 @@ again:
                            uio->uio_loffset);
                        ASSERT(error == 0);
                }
+               /*
+                * If we are replaying and eof is non zero then force
+                * the file size to the specified eof. Note, there's no
+                * concurrency during replay.
+                */
+               if (zfsvfs->z_replay && zfsvfs->z_replay_eof != 0)
+                       zp->z_size = zfsvfs->z_replay_eof;
+
                error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
 
                zfs_log_write(zilog, tx, TX_WRITE, zp, woff, tx_bytes, ioflag);
@@ -902,6 +912,9 @@ again:
                        break;
                ASSERT(tx_bytes == nbytes);
                n -= nbytes;
+
+               if (!xuio && n > 0)
+                       uio_prefaultpages(MIN(n, max_blksz), uio);
        }
 
        zfs_range_unlock(rl);
@@ -917,7 +930,7 @@ again:
 
        if (ioflag & (FSYNC | FDSYNC) ||
            zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
-               zil_commit(zilog, zp->z_last_itx, zp->z_id);
+               zil_commit(zilog, zp->z_id);
 
        ZFS_EXIT(zfsvfs);
        return (0);
@@ -1356,6 +1369,8 @@ top:
                error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg,
                    NULL, NULL);
                if (error) {
+                       if (have_acl)
+                               zfs_acl_ids_free(&acl_ids);
                        if (strcmp(name, "..") == 0)
                                error = EISDIR;
                        ZFS_EXIT(zfsvfs);
@@ -1371,6 +1386,8 @@ top:
                 * to reference it.
                 */
                if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) {
+                       if (have_acl)
+                               zfs_acl_ids_free(&acl_ids);
                        goto out;
                }
 
@@ -1381,6 +1398,8 @@ top:
 
                if ((dzp->z_pflags & ZFS_XATTR) &&
                    (vap->va_type != VREG)) {
+                       if (have_acl)
+                               zfs_acl_ids_free(&acl_ids);
                        error = EINVAL;
                        goto out;
                }
@@ -1440,6 +1459,10 @@ top:
        } else {
                int aflags = (flag & FAPPEND) ? V_APPEND : 0;
 
+               if (have_acl)
+                       zfs_acl_ids_free(&acl_ids);
+               have_acl = B_FALSE;
+
                /*
                 * A directory entry already exists for this name.
                 */
@@ -1496,7 +1519,7 @@ out:
        }
 
        if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
-               zil_commit(zilog, UINT64_MAX, 0);
+               zil_commit(zilog, 0);
 
        ZFS_EXIT(zfsvfs);
        return (error);
@@ -1527,12 +1550,13 @@ zfs_remove(vnode_t *dvp, char *name, cred_t *cr, caller_context_t *ct,
     int flags)
 {
        znode_t         *zp, *dzp = VTOZ(dvp);
-       znode_t         *xzp = NULL;
+       znode_t         *xzp;
        vnode_t         *vp;
        zfsvfs_t        *zfsvfs = dzp->z_zfsvfs;
        zilog_t         *zilog;
-       uint64_t        acl_obj, xattr_obj = 0;
+       uint64_t        acl_obj, xattr_obj;
        uint64_t        xattr_obj_unlinked = 0;
+       uint64_t        obj = 0;
        zfs_dirlock_t   *dl;
        dmu_tx_t        *tx;
        boolean_t       may_delete_now, delete_now = FALSE;
@@ -1554,6 +1578,8 @@ zfs_remove(vnode_t *dvp, char *name, cred_t *cr, caller_context_t *ct,
        }
 
 top:
+       xattr_obj = 0;
+       xzp = NULL;
        /*
         * Attempt to lock directory; fail if entry doesn't exist.
         */
@@ -1596,6 +1622,7 @@ top:
         * other holds on the vnode.  So we dmu_tx_hold() the right things to
         * allow for either case.
         */
+       obj = zp->z_id;
        tx = dmu_tx_create(zfsvfs->z_os);
        dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name);
        dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
@@ -1612,16 +1639,17 @@ top:
        /* are there any extended attributes? */
        error = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
            &xattr_obj, sizeof (xattr_obj));
-       if (xattr_obj) {
+       if (error == 0 && xattr_obj) {
                error = zfs_zget(zfsvfs, xattr_obj, &xzp);
                ASSERT3U(error, ==, 0);
                dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
                dmu_tx_hold_sa(tx, xzp->z_sa_hdl, B_FALSE);
        }
 
-       /* are there any additional acls */
-       if ((acl_obj = ZFS_EXTERNAL_ACL(zp)) != 0 && may_delete_now)
+       mutex_enter(&zp->z_lock);
+       if ((acl_obj = zfs_external_acl(zp)) != 0 && may_delete_now)
                dmu_tx_hold_free(tx, acl_obj, 0, DMU_OBJECT_END);
+       mutex_exit(&zp->z_lock);
 
        /* charge as an update -- would be nice not to charge at all */
        dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
@@ -1630,6 +1658,8 @@ top:
        if (error) {
                zfs_dirent_unlock(dl);
                VN_RELE(vp);
+               if (xzp)
+                       VN_RELE(ZTOV(xzp));
                if (error == ERESTART) {
                        dmu_tx_wait(tx);
                        dmu_tx_abort(tx);
@@ -1654,13 +1684,18 @@ top:
 
        if (unlinked) {
 
+               /*
+                * Hold z_lock so that we can make sure that the ACL obj
+                * hasn't changed.  Could have been deleted due to
+                * zfs_sa_upgrade().
+                */
+               mutex_enter(&zp->z_lock);
                mutex_enter(&vp->v_lock);
-
                (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
                    &xattr_obj_unlinked, sizeof (xattr_obj_unlinked));
                delete_now = may_delete_now && !toobig &&
                    vp->v_count == 1 && !vn_has_cached_data(vp) &&
-                   xattr_obj == xattr_obj_unlinked && ZFS_EXTERNAL_ACL(zp) ==
+                   xattr_obj == xattr_obj_unlinked && zfs_external_acl(zp) ==
                    acl_obj;
                mutex_exit(&vp->v_lock);
        }
@@ -1676,6 +1711,7 @@ top:
                        ASSERT3U(error,  ==,  0);
                        mutex_exit(&xzp->z_lock);
                        zfs_unlinked_add(xzp, tx);
+
                        if (zp->z_is_sa)
                                error = sa_remove(zp->z_sa_hdl,
                                    SA_ZPL_XATTR(zfsvfs), tx);
@@ -1685,7 +1721,6 @@ top:
                                    sizeof (uint64_t), tx);
                        ASSERT3U(error, ==, 0);
                }
-               mutex_enter(&zp->z_lock);
                mutex_enter(&vp->v_lock);
                vp->v_count--;
                ASSERT3U(vp->v_count, ==, 0);
@@ -1693,13 +1728,14 @@ top:
                mutex_exit(&zp->z_lock);
                zfs_znode_delete(zp, tx);
        } else if (unlinked) {
+               mutex_exit(&zp->z_lock);
                zfs_unlinked_add(zp, tx);
        }
 
        txtype = TX_REMOVE;
        if (flags & FIGNORECASE)
                txtype |= TX_CI;
-       zfs_log_remove(zilog, tx, txtype, dzp, name);
+       zfs_log_remove(zilog, tx, txtype, dzp, name, obj);
 
        dmu_tx_commit(tx);
 out:
@@ -1714,7 +1750,7 @@ out:
                VN_RELE(ZTOV(xzp));
 
        if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
-               zil_commit(zilog, UINT64_MAX, 0);
+               zil_commit(zilog, 0);
 
        ZFS_EXIT(zfsvfs);
        return (error);
@@ -1896,7 +1932,7 @@ top:
        zfs_dirent_unlock(dl);
 
        if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
-               zil_commit(zilog, UINT64_MAX, 0);
+               zil_commit(zilog, 0);
 
        ZFS_EXIT(zfsvfs);
        return (0);
@@ -2011,7 +2047,7 @@ top:
                uint64_t txtype = TX_RMDIR;
                if (flags & FIGNORECASE)
                        txtype |= TX_CI;
-               zfs_log_remove(zilog, tx, txtype, dzp, name);
+               zfs_log_remove(zilog, tx, txtype, dzp, name, ZFS_NO_OBJECT);
        }
 
        dmu_tx_commit(tx);
@@ -2024,7 +2060,7 @@ out:
        VN_RELE(vp);
 
        if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
-               zil_commit(zilog, UINT64_MAX, 0);
+               zil_commit(zilog, 0);
 
        ZFS_EXIT(zfsvfs);
        return (error);
@@ -2164,7 +2200,7 @@ zfs_readdir(vnode_t *vp, uio_t *uio, cred_t *cr, int *eofp,
        while (outcount < bytes_wanted) {
                ino64_t objnum;
                ushort_t reclen;
-               off64_t *next;
+               off64_t *next = NULL;
 
                /*
                 * Special case `.', `..', and `.zfs'.
@@ -2290,7 +2326,8 @@ zfs_readdir(vnode_t *vp, uio_t *uio, cred_t *cr, int *eofp,
                } else {
                        offset += 1;
                }
-               *next = offset;
+               if (next)
+                       *next = offset;
        }
        zp->z_zn_prefetch = B_FALSE; /* a lookup will re-enable pre-fetching */
 
@@ -2343,7 +2380,7 @@ zfs_fsync(vnode_t *vp, int syncflag, cred_t *cr, caller_context_t *ct)
        if (zfsvfs->z_os->os_sync != ZFS_SYNC_DISABLED) {
                ZFS_ENTER(zfsvfs);
                ZFS_VERIFY_ZP(zp);
-               zil_commit(zfsvfs->z_log, zp->z_last_itx, zp->z_id);
+               zil_commit(zfsvfs->z_log, zp->z_id);
                ZFS_EXIT(zfsvfs);
        }
        return (0);
@@ -2384,6 +2421,8 @@ zfs_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
        ZFS_ENTER(zfsvfs);
        ZFS_VERIFY_ZP(zp);
 
+       zfs_fuid_map_ids(zp, cr, &vap->va_uid, &vap->va_gid);
+
        SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16);
        SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16);
 
@@ -2397,7 +2436,8 @@ zfs_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
         * Also, if we are the owner don't bother, since owner should
         * always be allowed to read basic attributes of file.
         */
-       if (!(zp->z_pflags & ZFS_ACL_TRIVIAL) && (zp->z_uid != crgetuid(cr))) {
+       if (!(zp->z_pflags & ZFS_ACL_TRIVIAL) &&
+           (vap->va_uid != crgetuid(cr))) {
                if (error = zfs_zaccess(zp, ACE_READ_ATTRIBUTES, 0,
                    skipaclchk, cr)) {
                        ZFS_EXIT(zfsvfs);
@@ -2413,8 +2453,6 @@ zfs_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
        mutex_enter(&zp->z_lock);
        vap->va_type = vp->v_type;
        vap->va_mode = zp->z_mode & MODEMASK;
-       vap->va_uid = zp->z_uid;
-       vap->va_gid = zp->z_gid;
        vap->va_fsid = zp->z_zfsvfs->z_vfs->vfs_dev;
        vap->va_nodeid = zp->z_id;
        if ((vp->v_flag & VROOT) && zfs_show_ctldir(zp))
@@ -2515,6 +2553,22 @@ zfs_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
                        xoap->xoa_reparse = ((zp->z_pflags & ZFS_REPARSE) != 0);
                        XVA_SET_RTN(xvap, XAT_REPARSE);
                }
+               if (XVA_ISSET_REQ(xvap, XAT_GEN)) {
+                       xoap->xoa_generation = zp->z_gen;
+                       XVA_SET_RTN(xvap, XAT_GEN);
+               }
+
+               if (XVA_ISSET_REQ(xvap, XAT_OFFLINE)) {
+                       xoap->xoa_offline =
+                           ((zp->z_pflags & ZFS_OFFLINE) != 0);
+                       XVA_SET_RTN(xvap, XAT_OFFLINE);
+               }
+
+               if (XVA_ISSET_REQ(xvap, XAT_SPARSE)) {
+                       xoap->xoa_sparse =
+                           ((zp->z_pflags & ZFS_SPARSE) != 0);
+                       XVA_SET_RTN(xvap, XAT_SPARSE);
+               }
        }
 
        ZFS_TIME_DECODE(&vap->va_atime, zp->z_atime);
@@ -2570,7 +2624,7 @@ zfs_setattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
        int             trim_mask = 0;
        uint64_t        new_mode;
        uint64_t        new_uid, new_gid;
-       uint64_t        xattr_obj = 0;
+       uint64_t        xattr_obj;
        uint64_t        mtime[2], ctime[2];
        znode_t         *attrzp;
        int             need_policy = FALSE;
@@ -2578,7 +2632,7 @@ zfs_setattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
        zfs_fuid_info_t *fuidp = NULL;
        xvattr_t *xvap = (xvattr_t *)vap;       /* vap may be an xvattr_t * */
        xoptattr_t      *xoap;
-       zfs_acl_t       *aclp = NULL;
+       zfs_acl_t       *aclp;
        boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
        boolean_t       fuid_dirtied = B_FALSE;
        sa_bulk_attr_t  bulk[7], xattr_bulk[7];
@@ -2657,6 +2711,7 @@ zfs_setattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
 
 top:
        attrzp = NULL;
+       aclp = NULL;
 
        /* Can this be moved to before the top label? */
        if (zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) {
@@ -2692,6 +2747,8 @@ top:
            ((mask & AT_XVATTR) && (XVA_ISSET_REQ(xvap, XAT_HIDDEN) ||
            XVA_ISSET_REQ(xvap, XAT_READONLY) ||
            XVA_ISSET_REQ(xvap, XAT_ARCHIVE) ||
+           XVA_ISSET_REQ(xvap, XAT_OFFLINE) ||
+           XVA_ISSET_REQ(xvap, XAT_SPARSE) ||
            XVA_ISSET_REQ(xvap, XAT_CREATETIME) ||
            XVA_ISSET_REQ(xvap, XAT_SYSTEM)))) {
                need_policy = zfs_zaccess(zp, ACE_WRITE_ATTRIBUTES, 0,
@@ -2748,8 +2805,7 @@ top:
 
        mutex_enter(&zp->z_lock);
        oldva.va_mode = zp->z_mode;
-       oldva.va_uid = zp->z_uid;
-       oldva.va_gid = zp->z_gid;
+       zfs_fuid_map_ids(zp, cr, &oldva.va_uid, &oldva.va_gid);
        if (mask & AT_XVATTR) {
                /*
                 * Update xvattr mask to include only those attributes
@@ -2880,10 +2936,10 @@ top:
        mask = vap->va_mask;
 
        if ((mask & (AT_UID | AT_GID))) {
-               (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), &xattr_obj,
-                   sizeof (xattr_obj));
+               err = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
+                   &xattr_obj, sizeof (xattr_obj));
 
-               if (xattr_obj) {
+               if (err == 0 && xattr_obj) {
                        err = zfs_zget(zp->z_zfsvfs, xattr_obj, &attrzp);
                        if (err)
                                goto out2;
@@ -2891,8 +2947,10 @@ top:
                if (mask & AT_UID) {
                        new_uid = zfs_fuid_create(zfsvfs,
                            (uint64_t)vap->va_uid, cr, ZFS_OWNER, &fuidp);
-                       if (vap->va_uid != zp->z_uid &&
+                       if (new_uid != zp->z_uid &&
                            zfs_fuid_overquota(zfsvfs, B_FALSE, new_uid)) {
+                               if (attrzp)
+                                       VN_RELE(ZTOV(attrzp));
                                err = EDQUOT;
                                goto out2;
                        }
@@ -2903,6 +2961,8 @@ top:
                            cr, ZFS_GROUP, &fuidp);
                        if (new_gid != zp->z_gid &&
                            zfs_fuid_overquota(zfsvfs, B_TRUE, new_gid)) {
+                               if (attrzp)
+                                       VN_RELE(ZTOV(attrzp));
                                err = EDQUOT;
                                goto out2;
                        }
@@ -2912,32 +2972,33 @@ top:
 
        if (mask & AT_MODE) {
                uint64_t pmode = zp->z_mode;
+               uint64_t acl_obj;
                new_mode = (pmode & S_IFMT) | (vap->va_mode & ~S_IFMT);
 
-               if (err = zfs_acl_chmod_setattr(zp, &aclp, new_mode))
-                       goto out;
+               zfs_acl_chmod_setattr(zp, &aclp, new_mode);
 
-               if (!zp->z_is_sa && ZFS_EXTERNAL_ACL(zp)) {
+               mutex_enter(&zp->z_lock);
+               if (!zp->z_is_sa && ((acl_obj = zfs_external_acl(zp)) != 0)) {
                        /*
                         * Are we upgrading ACL from old V0 format
                         * to V1 format?
                         */
-                       if (zfsvfs->z_version <= ZPL_VERSION_FUID &&
-                           ZNODE_ACL_VERSION(zp) ==
+                       if (zfsvfs->z_version >= ZPL_VERSION_FUID &&
+                           zfs_znode_acl_version(zp) ==
                            ZFS_ACL_VERSION_INITIAL) {
-                               dmu_tx_hold_free(tx,
-                                   ZFS_EXTERNAL_ACL(zp), 0,
+                               dmu_tx_hold_free(tx, acl_obj, 0,
                                    DMU_OBJECT_END);
                                dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
                                    0, aclp->z_acl_bytes);
                        } else {
-                               dmu_tx_hold_write(tx, ZFS_EXTERNAL_ACL(zp), 0,
+                               dmu_tx_hold_write(tx, acl_obj, 0,
                                    aclp->z_acl_bytes);
                        }
                } else if (!zp->z_is_sa && aclp->z_acl_bytes > ZFS_ACE_SPACE) {
                        dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
                            0, aclp->z_acl_bytes);
                }
+               mutex_exit(&zp->z_lock);
                dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
        } else {
                if ((mask & AT_XVATTR) &&
@@ -2973,12 +3034,17 @@ top:
         * updated as a side-effect of calling this function.
         */
 
+
+       if (mask & (AT_UID|AT_GID|AT_MODE))
+               mutex_enter(&zp->z_acl_lock);
        mutex_enter(&zp->z_lock);
 
        SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
            &zp->z_pflags, sizeof (zp->z_pflags));
 
        if (attrzp) {
+               if (mask & (AT_UID|AT_GID|AT_MODE))
+                       mutex_enter(&attrzp->z_acl_lock);
                mutex_enter(&attrzp->z_lock);
                SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
                    SA_ZPL_FLAGS(zfsvfs), NULL, &attrzp->z_pflags,
@@ -2990,26 +3056,24 @@ top:
                if (mask & AT_UID) {
                        SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL,
                            &new_uid, sizeof (new_uid));
-                       zp->z_uid = zfs_fuid_map_id(zfsvfs, new_uid,
-                           cr, ZFS_OWNER);
+                       zp->z_uid = new_uid;
                        if (attrzp) {
                                SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
                                    SA_ZPL_UID(zfsvfs), NULL, &new_uid,
                                    sizeof (new_uid));
-                               attrzp->z_uid = zp->z_uid;
+                               attrzp->z_uid = new_uid;
                        }
                }
 
                if (mask & AT_GID) {
                        SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs),
                            NULL, &new_gid, sizeof (new_gid));
-                       zp->z_gid = zfs_fuid_map_id(zfsvfs, new_gid, cr,
-                           ZFS_GROUP);
+                       zp->z_gid = new_gid;
                        if (attrzp) {
                                SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
                                    SA_ZPL_GID(zfsvfs), NULL, &new_gid,
                                    sizeof (new_gid));
-                               attrzp->z_gid = zp->z_gid;
+                               attrzp->z_gid = new_gid;
                        }
                }
                if (!(mask & AT_MODE)) {
@@ -3026,20 +3090,18 @@ top:
        }
 
        if (mask & AT_MODE) {
-               mutex_enter(&zp->z_acl_lock);
                SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL,
                    &new_mode, sizeof (new_mode));
                zp->z_mode = new_mode;
                ASSERT3U((uintptr_t)aclp, !=, NULL);
                err = zfs_aclset_common(zp, aclp, cr, tx);
                ASSERT3U(err, ==, 0);
+               if (zp->z_acl_cached)
+                       zfs_acl_free(zp->z_acl_cached);
                zp->z_acl_cached = aclp;
                aclp = NULL;
-               mutex_exit(&zp->z_acl_lock);
        }
 
-       if (attrzp)
-               mutex_exit(&attrzp->z_lock);
 
        if (mask & AT_ATIME) {
                ZFS_TIME_ENCODE(&vap->va_atime, zp->z_atime);
@@ -3118,7 +3180,14 @@ top:
                zfs_log_setattr(zilog, tx, TX_SETATTR, zp, vap, mask, fuidp);
 
        mutex_exit(&zp->z_lock);
+       if (mask & (AT_UID|AT_GID|AT_MODE))
+               mutex_exit(&zp->z_acl_lock);
 
+       if (attrzp) {
+               if (mask & (AT_UID|AT_GID|AT_MODE))
+                       mutex_exit(&attrzp->z_acl_lock);
+               mutex_exit(&attrzp->z_lock);
+       }
 out:
        if (err == 0 && attrzp) {
                err2 = sa_bulk_update(attrzp->z_sa_hdl, xattr_bulk,
@@ -3145,10 +3214,9 @@ out:
                dmu_tx_commit(tx);
        }
 
-
 out2:
        if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
-               zil_commit(zilog, UINT64_MAX, 0);
+               zil_commit(zilog, 0);
 
        ZFS_EXIT(zfsvfs);
        return (err);
@@ -3555,9 +3623,8 @@ top:
                        error = zfs_link_destroy(sdl, szp, tx, ZRENAMING, NULL);
                        if (error == 0) {
                                zfs_log_rename(zilog, tx, TX_RENAME |
-                                   (flags & FIGNORECASE ? TX_CI : 0),
-                                   sdzp, sdl->dl_name, tdzp, tdl->dl_name,
-                                   szp);
+                                   (flags & FIGNORECASE ? TX_CI : 0), sdzp,
+                                   sdl->dl_name, tdzp, tdl->dl_name, szp);
 
                                /*
                                 * Update path information for the target vnode
@@ -3600,7 +3667,7 @@ out:
                VN_RELE(ZTOV(tzp));
 
        if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
-               zil_commit(zilog, UINT64_MAX, 0);
+               zil_commit(zilog, 0);
 
        ZFS_EXIT(zfsvfs);
        return (error);
@@ -3724,11 +3791,13 @@ top:
        if (fuid_dirtied)
                zfs_fuid_sync(zfsvfs, tx);
 
+       mutex_enter(&zp->z_lock);
        if (zp->z_is_sa)
                error = sa_update(zp->z_sa_hdl, SA_ZPL_SYMLINK(zfsvfs),
                    link, len, tx);
        else
                zfs_sa_symlink(zp, link, len, tx);
+       mutex_exit(&zp->z_lock);
 
        zp->z_size = len;
        (void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs),
@@ -3751,7 +3820,7 @@ top:
        VN_RELE(ZTOV(zp));
 
        if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
-               zil_commit(zilog, UINT64_MAX, 0);
+               zil_commit(zilog, 0);
 
        ZFS_EXIT(zfsvfs);
        return (error);
@@ -3785,11 +3854,13 @@ zfs_readlink(vnode_t *vp, uio_t *uio, cred_t *cr, caller_context_t *ct)
        ZFS_ENTER(zfsvfs);
        ZFS_VERIFY_ZP(zp);
 
+       mutex_enter(&zp->z_lock);
        if (zp->z_is_sa)
                error = sa_lookup_uio(zp->z_sa_hdl,
                    SA_ZPL_SYMLINK(zfsvfs), uio);
        else
                error = zfs_sa_readlink(zp, uio);
+       mutex_exit(&zp->z_lock);
 
        ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
 
@@ -3828,6 +3899,7 @@ zfs_link(vnode_t *tdvp, vnode_t *svp, char *name, cred_t *cr,
        int             error;
        int             zf = ZNEW;
        uint64_t        parent;
+       uid_t           owner;
 
        ASSERT(tdvp->v_type == VDIR);
 
@@ -3887,8 +3959,8 @@ zfs_link(vnode_t *tdvp, vnode_t *svp, char *name, cred_t *cr,
        }
 
 
-       if (szp->z_uid != crgetuid(cr) &&
-           secpolicy_basic_link(cr) != 0) {
+       owner = zfs_fuid_map_id(zfsvfs, szp->z_uid, cr, ZFS_OWNER);
+       if (owner != crgetuid(cr) && secpolicy_basic_link(cr) != 0) {
                ZFS_EXIT(zfsvfs);
                return (EPERM);
        }
@@ -3944,7 +4016,7 @@ top:
        }
 
        if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
-               zil_commit(zilog, UINT64_MAX, 0);
+               zil_commit(zilog, 0);
 
        ZFS_EXIT(zfsvfs);
        return (error);
@@ -4181,7 +4253,7 @@ zfs_putpage(vnode_t *vp, offset_t off, size_t len, int flags, cred_t *cr,
 out:
        zfs_range_unlock(rl);
        if ((flags & B_ASYNC) == 0 || zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
-               zil_commit(zfsvfs->z_log, UINT64_MAX, zp->z_id);
+               zil_commit(zfsvfs->z_log, zp->z_id);
        ZFS_EXIT(zfsvfs);
        return (error);
 }
@@ -4836,7 +4908,7 @@ zfs_setsecattr(vnode_t *vp, vsecattr_t *vsecp, int flag, cred_t *cr,
        error = zfs_setacl(zp, vsecp, skipaclchk, cr);
 
        if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
-               zil_commit(zilog, UINT64_MAX, 0);
+               zil_commit(zilog, 0);
 
        ZFS_EXIT(zfsvfs);
        return (error);
index 24bd3ddcdd8275a3e363a7f7c578c1939dc27e59..e1e4e9e03ac26666820e3fcf467ef00693d005ed 100644 (file)
@@ -63,6 +63,7 @@
 #include <sys/zfs_znode.h>
 #include <sys/sa.h>
 #include <sys/zfs_sa.h>
+#include <sys/zfs_stat.h>
 
 #include "zfs_prop.h"
 #include "zfs_comutil.h"
@@ -81,9 +82,6 @@
 #define        ZNODE_STAT_ADD(stat)                    /* nothing */
 #endif /* ZNODE_STATS */
 
-#define        POINTER_IS_VALID(p)     (!((uintptr_t)(p) & 0x3))
-#define        POINTER_INVALIDATE(pp)  (*(pp) = (void *)((uintptr_t)(*(pp)) | 0x1))
-
 /*
  * Functions needed for userland (ie: libzpool) are not put under
  * #ifdef_KERNEL; the rest of the functions have dependencies
@@ -136,6 +134,7 @@ zfs_znode_cache_constructor(void *buf, void *arg, int kmflags)
 
        zp->z_dirlocks = NULL;
        zp->z_acl_cached = NULL;
+       zp->z_moved = 0;
        return (0);
 }
 
@@ -196,7 +195,6 @@ zfs_znode_move_impl(znode_t *ozp, znode_t *nzp)
        nzp->z_blksz = ozp->z_blksz;
        nzp->z_seq = ozp->z_seq;
        nzp->z_mapcnt = ozp->z_mapcnt;
-       nzp->z_last_itx = ozp->z_last_itx;
        nzp->z_gen = ozp->z_gen;
        nzp->z_sync_cnt = ozp->z_sync_cnt;
        nzp->z_is_sa = ozp->z_is_sa;
@@ -228,6 +226,12 @@ zfs_znode_move_impl(znode_t *ozp, znode_t *nzp)
         */
        ozp->z_sa_hdl = NULL;
        POINTER_INVALIDATE(&ozp->z_zfsvfs);
+
+       /*
+        * Mark the znode.
+        */
+       nzp->z_moved = 1;
+       ozp->z_moved = (uint8_t)-1;
 }
 
 /*ARGSUSED*/
@@ -478,6 +482,8 @@ zfs_create_share_dir(zfsvfs_t *zfsvfs, dmu_tx_t *tx)
        vattr.va_gid = crgetgid(kcred);
 
        sharezp = kmem_cache_alloc(znode_cache, KM_SLEEP);
+       ASSERT(!POINTER_IS_VALID(sharezp->z_zfsvfs));
+       sharezp->z_moved = 0;
        sharezp->z_unlinked = 0;
        sharezp->z_atime_dirty = 0;
        sharezp->z_zfsvfs = zfsvfs;
@@ -619,7 +625,6 @@ zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, int blksz,
        vnode_t *vp;
        uint64_t mode;
        uint64_t parent;
-       uint64_t uid, gid;
        sa_bulk_attr_t bulk[9];
        int count = 0;
 
@@ -627,6 +632,7 @@ zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, int blksz,
 
        ASSERT(zp->z_dirlocks == NULL);
        ASSERT(!POINTER_IS_VALID(zp->z_zfsvfs));
+       zp->z_moved = 0;
 
        /*
         * Defer setting z_zfsvfs until the znode is ready to be a candidate for
@@ -636,7 +642,6 @@ zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, int blksz,
        zp->z_unlinked = 0;
        zp->z_atime_dirty = 0;
        zp->z_mapcnt = 0;
-       zp->z_last_itx = 0;
        zp->z_id = db->db_object;
        zp->z_blksz = blksz;
        zp->z_seq = 0x7A4653;
@@ -659,9 +664,9 @@ zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, int blksz,
        SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL,
            &zp->z_atime, 16);
        SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL,
-           &uid, 8);
+           &zp->z_uid, 8);
        SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), NULL,
-           &gid, 8);
+           &zp->z_gid, 8);
 
        if (sa_bulk_lookup(zp->z_sa_hdl, bulk, count) != 0 || zp->z_gen == 0) {
                if (hdl == NULL)
@@ -670,8 +675,6 @@ zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, int blksz,
                return (NULL);
        }
 
-       zp->z_uid = zfs_fuid_map_id(zfsvfs, uid, CRED(), ZFS_OWNER);
-       zp->z_gid = zfs_fuid_map_id(zfsvfs, gid, CRED(), ZFS_GROUP);
        zp->z_mode = mode;
        vp->v_vfsp = zfsvfs->z_parent->z_vfs;
 
@@ -705,7 +708,7 @@ zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, int blksz,
        case VREG:
                vp->v_flag |= VMODSORT;
                if (parent == zfsvfs->z_shares_dir) {
-                       ASSERT(uid == 0 && gid == 0);
+                       ASSERT(zp->z_uid == 0 && zp->z_gid == 0);
                        vn_setops(vp, zfs_sharevnodeops);
                } else {
                        vn_setops(vp, zfs_fvnodeops);
@@ -759,7 +762,7 @@ zfs_mknode(znode_t *dzp, vattr_t *vap, dmu_tx_t *tx, cred_t *cr,
 {
        uint64_t        crtime[2], atime[2], mtime[2], ctime[2];
        uint64_t        mode, size, links, parent, pflags;
-       uint64_t        dzp_pflags = 0;
+       uint64_t        dzp_pflags = 0;
        uint64_t        rdev = 0;
        zfsvfs_t        *zfsvfs = dzp->z_zfsvfs;
        dmu_buf_t       *db;
@@ -794,7 +797,7 @@ zfs_mknode(znode_t *dzp, vattr_t *vap, dmu_tx_t *tx, cred_t *cr,
         */
        /*
         * There's currently no mechanism for pre-reading the blocks that will
-        * be to needed allocate a new object, so we accept the small chance
+        * be needed to allocate a new object, so we accept the small chance
         * that there will be an i/o error and we will fail one of the
         * assertions below.
         */
@@ -1085,6 +1088,16 @@ zfs_xvattr_set(znode_t *zp, xvattr_t *xvap, dmu_tx_t *tx)
                    zp->z_pflags, tx);
                XVA_SET_RTN(xvap, XAT_REPARSE);
        }
+       if (XVA_ISSET_REQ(xvap, XAT_OFFLINE)) {
+               ZFS_ATTR_SET(zp, ZFS_OFFLINE, xoap->xoa_offline,
+                   zp->z_pflags, tx);
+               XVA_SET_RTN(xvap, XAT_OFFLINE);
+       }
+       if (XVA_ISSET_REQ(xvap, XAT_SPARSE)) {
+               ZFS_ATTR_SET(zp, ZFS_SPARSE, xoap->xoa_sparse,
+                   zp->z_pflags, tx);
+               XVA_SET_RTN(xvap, XAT_SPARSE);
+       }
 }
 
 int
@@ -1174,7 +1187,6 @@ zfs_rezget(znode_t *zp)
        dmu_buf_t *db;
        uint64_t obj_num = zp->z_id;
        uint64_t mode;
-       uint64_t uid, gid;
        sa_bulk_attr_t bulk[8];
        int err;
        int count = 0;
@@ -1220,28 +1232,26 @@ zfs_rezget(znode_t *zp)
        SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL,
            &zp->z_atime, sizeof (zp->z_atime));
        SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL,
-           &uid, sizeof (uid));
+           &zp->z_uid, sizeof (zp->z_uid));
        SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), NULL,
-           &gid, sizeof (gid));
+           &zp->z_gid, sizeof (zp->z_gid));
        SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL,
            &mode, sizeof (mode));
 
-       zp->z_mode = mode;
-
        if (sa_bulk_lookup(zp->z_sa_hdl, bulk, count)) {
                zfs_znode_dmu_fini(zp);
                ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
                return (EIO);
        }
 
+       zp->z_mode = mode;
+
        if (gen != zp->z_gen) {
                zfs_znode_dmu_fini(zp);
                ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
                return (EIO);
        }
 
-       zp->z_uid = zfs_fuid_map_id(zfsvfs, uid, CRED(), ZFS_OWNER);
-       zp->z_gid = zfs_fuid_map_id(zfsvfs, gid, CRED(), ZFS_GROUP);
        zp->z_unlinked = (zp->z_links == 0);
        zp->z_blksz = doi.doi_data_block_size;
 
@@ -1256,11 +1266,13 @@ zfs_znode_delete(znode_t *zp, dmu_tx_t *tx)
        zfsvfs_t *zfsvfs = zp->z_zfsvfs;
        objset_t *os = zfsvfs->z_os;
        uint64_t obj = zp->z_id;
-       uint64_t acl_obj = ZFS_EXTERNAL_ACL(zp);
+       uint64_t acl_obj = zfs_external_acl(zp);
 
        ZFS_OBJ_HOLD_ENTER(zfsvfs, obj);
-       if (acl_obj)
+       if (acl_obj) {
+               VERIFY(!zp->z_is_sa);
                VERIFY(0 == dmu_object_free(os, acl_obj, tx));
+       }
        VERIFY(0 == dmu_object_free(os, obj, tx));
        zfs_znode_dmu_fini(zp);
        ZFS_OBJ_HOLD_EXIT(zfsvfs, obj);
@@ -1562,6 +1574,8 @@ zfs_trunc(znode_t *zp, uint64_t end)
        dmu_tx_t *tx;
        rl_t *rl;
        int error;
+       sa_bulk_attr_t bulk[2];
+       int count = 0;
 
        /*
         * We will change zp_size, lock the whole file.
@@ -1598,9 +1612,15 @@ top:
        }
 
        zp->z_size = end;
+       SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs),
+           NULL, &zp->z_size, sizeof (zp->z_size));
 
-       VERIFY(0 == sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zp->z_zfsvfs),
-           &zp->z_size, sizeof (zp->z_size), tx));
+       if (end == 0) {
+               zp->z_pflags &= ~ZFS_SPARSE;
+               SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs),
+                   NULL, &zp->z_pflags, 8);
+       }
+       VERIFY(sa_bulk_update(zp->z_sa_hdl, bulk, count, tx) == 0);
 
        dmu_tx_commit(tx);
 
@@ -1805,6 +1825,8 @@ zfs_create_fs(objset_t *os, cred_t *cr, nvlist_t *zplprops, dmu_tx_t *tx)
        vattr.va_gid = crgetgid(cr);
 
        rootzp = kmem_cache_alloc(znode_cache, KM_SLEEP);
+       ASSERT(!POINTER_IS_VALID(rootzp->z_zfsvfs));
+       rootzp->z_moved = 0;
        rootzp->z_unlinked = 0;
        rootzp->z_atime_dirty = 0;
        rootzp->z_is_sa = USE_SA(version, os);
@@ -1822,7 +1844,10 @@ zfs_create_fs(objset_t *os, cred_t *cr, nvlist_t *zplprops, dmu_tx_t *tx)
        zfsvfs.z_use_sa = USE_SA(version, os);
        zfsvfs.z_norm = norm;
 
-       zfsvfs.z_attr_table = sa_setup(os, sa_obj, zfs_attr_table, ZPL_END);
+       error = sa_setup(os, sa_obj, zfs_attr_table, ZPL_END,
+           &zfsvfs.z_attr_table);
+
+       ASSERT(error == 0);
 
        /*
         * Fold case on file systems that are always or sometimes case
@@ -1838,7 +1863,6 @@ zfs_create_fs(objset_t *os, cred_t *cr, nvlist_t *zplprops, dmu_tx_t *tx)
        for (i = 0; i != ZFS_OBJ_MTX_SZ; i++)
                mutex_init(&zfsvfs.z_hold_mtx[i], NULL, MUTEX_DEFAULT, NULL);
 
-       ASSERT(!POINTER_IS_VALID(rootzp->z_zfsvfs));
        rootzp->z_zfsvfs = &zfsvfs;
        VERIFY(0 == zfs_acl_ids_create(rootzp, IS_ROOT_NODE, &vattr,
            cr, NULL, &acl_ids));
@@ -1868,78 +1892,121 @@ zfs_create_fs(objset_t *os, cred_t *cr, nvlist_t *zplprops, dmu_tx_t *tx)
 
 #endif /* _KERNEL */
 
-/*
- * Given an object number, return its parent object number and whether
- * or not the object is an extended attribute directory.
- */
 static int
-zfs_obj_to_pobj(objset_t *osp, uint64_t obj, uint64_t *pobjp, int *is_xattrdir,
-    sa_attr_type_t *sa_table)
+zfs_sa_setup(objset_t *osp, sa_attr_type_t **sa_table)
+{
+       uint64_t sa_obj = 0;
+       int error;
+
+       error = zap_lookup(osp, MASTER_NODE_OBJ, ZFS_SA_ATTRS, 8, 1, &sa_obj);
+       if (error != 0 && error != ENOENT)
+               return (error);
+
+       error = sa_setup(osp, sa_obj, zfs_attr_table, ZPL_END, sa_table);
+       return (error);
+}
+
+/*
+ * Take an SA buffer hold on object 'obj' and obtain a private SA handle
+ * for it.  On success returns 0 with the held buffer in *db and the
+ * handle in *hdlp; callers in this file give both back together with
+ * zfs_release_sa_handle().  On failure the buffer hold is dropped again
+ * and an errno is returned (ENOTSUP when the bonus buffer is neither an
+ * SA nor a sufficiently large znode bonus buffer).
+ */
+static int
+zfs_grab_sa_handle(objset_t *osp, uint64_t obj, sa_handle_t **hdlp,
+    dmu_buf_t **db)
 {
-       dmu_buf_t *db;
        dmu_object_info_t doi;
        int error;
-       uint64_t parent;
-       uint64_t pflags;
-       uint64_t mode;
-       sa_bulk_attr_t bulk[3];
-       sa_handle_t *hdl;
-       int count = 0;
 
-       if ((error = sa_buf_hold(osp, obj, FTAG, &db)) != 0)
+       if ((error = sa_buf_hold(osp, obj, FTAG, db)) != 0)
                return (error);
 
-       dmu_object_info_from_db(db, &doi);
+       dmu_object_info_from_db(*db, &doi);
+       /*
+        * Reject anything that is not an SA bonus buffer or a znode bonus
+        * buffer of at least sizeof (znode_phys_t).  Note that && binds
+        * tighter than ||, so the last two lines form one sub-condition.
+        */
        if ((doi.doi_bonus_type != DMU_OT_SA &&
            doi.doi_bonus_type != DMU_OT_ZNODE) ||
            doi.doi_bonus_type == DMU_OT_ZNODE &&
            doi.doi_bonus_size < sizeof (znode_phys_t)) {
-               sa_buf_rele(db, FTAG);
-               return (EINVAL);
+               sa_buf_rele(*db, FTAG);
+               return (ENOTSUP);
        }
 
-       if ((error = sa_handle_get(osp, obj, NULL, SA_HDL_PRIVATE,
-           &hdl)) != 0) {
-               sa_buf_rele(db, FTAG);
+       error = sa_handle_get(osp, obj, NULL, SA_HDL_PRIVATE, hdlp);
+       if (error != 0) {
+               sa_buf_rele(*db, FTAG);
                return (error);
        }
 
-       SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_PARENT],
-           NULL, &parent, 8);
+       return (0);
+}
+
+/*
+ * Counterpart of zfs_grab_sa_handle(): destroy the SA handle 'hdl' and
+ * then drop the hold on its buffer 'db'.
+ *
+ * NOTE(review): the hold is taken with FTAG inside zfs_grab_sa_handle()
+ * but released with FTAG here; if FTAG expands to a per-function tag the
+ * two differ -- confirm that hold tracking tolerates this.
+ */
+void
+zfs_release_sa_handle(sa_handle_t *hdl, dmu_buf_t *db)
+{
+       sa_handle_destroy(hdl);
+       sa_buf_rele(db, FTAG);
+}
+
+/*
+ * Given the SA handle of an object, return its parent object number and
+ * whether or not the object is an extended attribute directory.
+ *
+ * The parent, flags and mode attributes are fetched with a single bulk
+ * lookup using the attribute ids in 'sa_table'.  Returns 0 on success;
+ * on a lookup error that error is returned and *pobjp / *is_xattrdir
+ * are left untouched.  The handle is not released here.
+ */
+static int
+zfs_obj_to_pobj(sa_handle_t *hdl, sa_attr_type_t *sa_table, uint64_t *pobjp,
+    int *is_xattrdir)
+{
+       uint64_t parent;
+       uint64_t pflags;
+       uint64_t mode;
+       sa_bulk_attr_t bulk[3];
+       int count = 0;
+       int error;
+
+       SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_PARENT], NULL,
+           &parent, sizeof (parent));
        SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_FLAGS], NULL,
-           &pflags, 8);
+           &pflags, sizeof (pflags));
        SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_MODE], NULL,
-           &mode, 8);
+           &mode, sizeof (mode));
 
-       if ((error = sa_bulk_lookup(hdl, bulk, count)) != 0) {
-               sa_buf_rele(db, FTAG);
-               sa_handle_destroy(hdl);
+       if ((error = sa_bulk_lookup(hdl, bulk, count)) != 0)
                return (error);
-       }
+
        *pobjp = parent;
        *is_xattrdir = ((pflags & ZFS_XATTR) != 0) && S_ISDIR(mode);
-       sa_handle_destroy(hdl);
-       sa_buf_rele(db, FTAG);
 
        return (0);
 }
 
-int
-zfs_obj_to_path(objset_t *osp, uint64_t obj, char *buf, int len)
+/*
+ * Given the SA handle of an object, fetch some zpl level statistics
+ * (mode, generation, link count and ctime) into 'sb' with one bulk
+ * lookup.  Returns the result of that lookup.
+ */
+static int
+zfs_obj_to_stats_impl(sa_handle_t *hdl, sa_attr_type_t *sa_table,
+    zfs_stat_t *sb)
 {
+       sa_bulk_attr_t attrs[4];
+       int nattrs = 0;
+       int error;
+
+       SA_ADD_BULK_ATTR(attrs, nattrs, sa_table[ZPL_MODE], NULL,
+           &sb->zs_mode, sizeof (sb->zs_mode));
+       SA_ADD_BULK_ATTR(attrs, nattrs, sa_table[ZPL_GEN], NULL,
+           &sb->zs_gen, sizeof (sb->zs_gen));
+       SA_ADD_BULK_ATTR(attrs, nattrs, sa_table[ZPL_LINKS], NULL,
+           &sb->zs_links, sizeof (sb->zs_links));
+       SA_ADD_BULK_ATTR(attrs, nattrs, sa_table[ZPL_CTIME], NULL,
+           &sb->zs_ctime, sizeof (sb->zs_ctime));
+
+       error = sa_bulk_lookup(hdl, attrs, nattrs);
+       return (error);
+}
+
+static int
+zfs_obj_to_path_impl(objset_t *osp, uint64_t obj, sa_handle_t *hdl,
+    sa_attr_type_t *sa_table, char *buf, int len)
+{
+       sa_handle_t *sa_hdl;
+       sa_handle_t *prevhdl = NULL;
+       dmu_buf_t *prevdb = NULL;
+       dmu_buf_t *sa_db = NULL;
        char *path = buf + len - 1;
-       sa_attr_type_t *sa_table;
        int error;
-       uint64_t sa_obj = 0;
 
        *path = '\0';
-
-       error = zap_lookup(osp, MASTER_NODE_OBJ, ZFS_SA_ATTRS, 8, 1, &sa_obj);
-
-       if (error != 0 && error != ENOENT)
-               return (error);
-
-       sa_table = sa_setup(osp, sa_obj, zfs_attr_table, ZPL_END);
+       sa_hdl = hdl;
 
        for (;;) {
                uint64_t pobj;
@@ -1947,8 +2014,11 @@ zfs_obj_to_path(objset_t *osp, uint64_t obj, char *buf, int len)
                size_t complen;
                int is_xattrdir;
 
-               if ((error = zfs_obj_to_pobj(osp, obj, &pobj,
-                   &is_xattrdir, sa_table)) != 0)
+               if (prevdb)
+                       zfs_release_sa_handle(prevhdl, prevdb);
+
+               if ((error = zfs_obj_to_pobj(sa_hdl, sa_table, &pobj,
+                   &is_xattrdir)) != 0)
                        break;
 
                if (pobj == obj) {
@@ -1972,6 +2042,22 @@ zfs_obj_to_path(objset_t *osp, uint64_t obj, char *buf, int len)
                ASSERT(path >= buf);
                bcopy(component, path, complen);
                obj = pobj;
+
+               if (sa_hdl != hdl) {
+                       prevhdl = sa_hdl;
+                       prevdb = sa_db;
+               }
+               error = zfs_grab_sa_handle(osp, obj, &sa_hdl, &sa_db);
+               if (error != 0) {
+                       sa_hdl = prevhdl;
+                       sa_db = prevdb;
+                       break;
+               }
+       }
+
+       if (sa_hdl != NULL && sa_hdl != hdl) {
+               ASSERT(sa_db != NULL);
+               zfs_release_sa_handle(sa_hdl, sa_db);
        }
 
        if (error == 0)
@@ -1979,3 +2065,57 @@ zfs_obj_to_path(objset_t *osp, uint64_t obj, char *buf, int len)
 
        return (error);
 }
+
+/*
+ * Translate object 'obj' of 'osp' into a path written to 'buf' (of
+ * 'len' bytes).  Sets up the SA attribute table, grabs an SA handle
+ * for the object, delegates to zfs_obj_to_path_impl() and releases the
+ * handle again.  Returns 0 or the first error encountered.
+ */
+int
+zfs_obj_to_path(objset_t *osp, uint64_t obj, char *buf, int len)
+{
+       sa_attr_type_t *sa_table;
+       sa_handle_t *hdl;
+       dmu_buf_t *db;
+       int error;
+
+       error = zfs_sa_setup(osp, &sa_table);
+       if (error == 0)
+               error = zfs_grab_sa_handle(osp, obj, &hdl, &db);
+
+       /* Only a successfully grabbed handle has to be released. */
+       if (error == 0) {
+               error = zfs_obj_to_path_impl(osp, obj, hdl, sa_table,
+                   buf, len);
+               zfs_release_sa_handle(hdl, db);
+       }
+
+       return (error);
+}
+
+/*
+ * Fill 'sb' with the zpl level statistics of object 'obj' and write its
+ * path to 'buf' (of 'len' bytes).  The path lookup is only attempted
+ * once the statistics have been read successfully.  Returns 0 or the
+ * first error encountered.
+ */
+int
+zfs_obj_to_stats(objset_t *osp, uint64_t obj, zfs_stat_t *sb,
+    char *buf, int len)
+{
+       sa_attr_type_t *sa_table;
+       sa_handle_t *hdl;
+       dmu_buf_t *db;
+       int error;
+
+       /* Terminate the buffer at its last byte before anything can fail. */
+       buf[len - 1] = '\0';
+
+       if ((error = zfs_sa_setup(osp, &sa_table)) != 0)
+               return (error);
+
+       if ((error = zfs_grab_sa_handle(osp, obj, &hdl, &db)) != 0)
+               return (error);
+
+       error = zfs_obj_to_stats_impl(hdl, sa_table, sb);
+       if (error == 0) {
+               error = zfs_obj_to_path_impl(osp, obj, hdl, sa_table,
+                   buf, len);
+       }
+
+       zfs_release_sa_handle(hdl, db);
+       return (error);
+}
index 4aa4d10b07e80a8ecd61024bb244a3a78b025369..c66313ff6f85c5bb4f2043530d3940c9a2a078e7 100644 (file)
@@ -34,7 +34,7 @@
 #include <sys/zil.h>
 #include <sys/zil_impl.h>
 #include <sys/dsl_dataset.h>
-#include <sys/vdev.h>
+#include <sys/vdev_impl.h>
 #include <sys/dmu_tx.h>
 #include <sys/dsl_pool.h>
 
@@ -78,12 +78,21 @@ boolean_t zfs_nocacheflush = B_FALSE;
 
 static kmem_cache_t *zil_lwb_cache;
 
-static boolean_t zil_empty(zilog_t *zilog);
+static void zil_async_to_sync(zilog_t *zilog, uint64_t foid);
 
 #define        LWB_EMPTY(lwb) ((BP_GET_LSIZE(&lwb->lwb_blk) - \
     sizeof (zil_chain_t)) == (lwb->lwb_sz - lwb->lwb_nused))
 
 
+/*
+ * ziltest is by and large an ugly hack, but very useful in
+ * checking replay without tedious work.
+ * When running ziltest we want to keep all itx's and so maintain
+ * a single list in the zl_itxg[] that uses a high txg: ZILTEST_TXG
+ * We subtract TXG_CONCURRENT_STATES to allow for common code.
+ */
+#define        ZILTEST_TXG (UINT64_MAX - TXG_CONCURRENT_STATES)
+
 static int
 zil_bp_compare(const void *x1, const void *x2)
 {
@@ -631,6 +640,7 @@ zil_check_log_chain(const char *osname, void *tx)
 {
        zilog_t *zilog;
        objset_t *os;
+       blkptr_t *bp;
        int error;
 
        ASSERT(tx == NULL);
@@ -642,6 +652,29 @@ zil_check_log_chain(const char *osname, void *tx)
        }
 
        zilog = dmu_objset_zil(os);
+       bp = (blkptr_t *)&zilog->zl_header->zh_log;
+
+       /*
+        * Check the first block and determine if it's on a log device
+        * which may have been removed or faulted prior to loading this
+        * pool.  If so, there's no point in checking the rest of the log
+        * as its content should have already been synced to the pool.
+        */
+       if (!BP_IS_HOLE(bp)) {
+               vdev_t *vd;
+               boolean_t valid = B_TRUE;
+
+               spa_config_enter(os->os_spa, SCL_STATE, FTAG, RW_READER);
+               vd = vdev_lookup_top(os->os_spa, DVA_GET_VDEV(&bp->blk_dva[0]));
+               if (vd->vdev_islog && vdev_is_dead(vd))
+                       valid = vdev_log_state_valid(vd);
+               spa_config_exit(os->os_spa, SCL_STATE, FTAG);
+
+               if (!valid) {
+                       dmu_objset_rele(os, FTAG);
+                       return (0);
+               }
+       }
 
        /*
         * Because tx == NULL, zil_claim_log_block() will not actually claim
@@ -661,8 +694,8 @@ zil_check_log_chain(const char *osname, void *tx)
 static int
 zil_vdev_compare(const void *x1, const void *x2)
 {
-       uint64_t v1 = ((zil_vdev_node_t *)x1)->zv_vdev;
-       uint64_t v2 = ((zil_vdev_node_t *)x2)->zv_vdev;
+       const uint64_t v1 = ((zil_vdev_node_t *)x1)->zv_vdev;
+       const uint64_t v2 = ((zil_vdev_node_t *)x2)->zv_vdev;
 
        if (v1 < v2)
                return (-1);
@@ -703,7 +736,7 @@ zil_add_block(zilog_t *zilog, const blkptr_t *bp)
        mutex_exit(&zilog->zl_vdev_lock);
 }
 
-void
+static void
 zil_flush_vdevs(zilog_t *zilog)
 {
        spa_t *spa = zilog->zl_spa;
@@ -1045,6 +1078,7 @@ zil_itx_create(uint64_t txtype, size_t lrsize)
        itx->itx_lr.lrc_reclen = lrsize;
        itx->itx_sod = lrsize; /* if write & WR_NEED_COPY will be increased */
        itx->itx_lr.lrc_seq = 0;        /* defensive */
+       itx->itx_sync = B_TRUE;         /* default is synchronous */
 
        return (itx);
 }
@@ -1055,190 +1089,362 @@ zil_itx_destroy(itx_t *itx)
        kmem_free(itx, offsetof(itx_t, itx_lr) + itx->itx_lr.lrc_reclen);
 }
 
-uint64_t
-zil_itx_assign(zilog_t *zilog, itx_t *itx, dmu_tx_t *tx)
+/*
+ * Free up the sync and async itxs. The itxs_t has already been detached
+ * so no locks are needed.
+ *
+ * Every itx on the sync list is freed first, then each node of the
+ * async tree is torn down (its itxs, its list and the node itself), and
+ * finally the itxs_t container itself is freed.
+ */
+static void
+zil_itxg_clean(itxs_t *itxs)
 {
-       uint64_t seq;
+       itx_t *itx;
+       list_t *list;
+       avl_tree_t *t;
+       void *cookie;
+       itx_async_node_t *ian;
+
+       list = &itxs->i_sync_list;
+       while ((itx = list_head(list)) != NULL) {
+               list_remove(list, itx);
+               kmem_free(itx, offsetof(itx_t, itx_lr) +
+                   itx->itx_lr.lrc_reclen);
+       }
 
-       ASSERT(itx->itx_lr.lrc_seq == 0);
-       ASSERT(!zilog->zl_replay);
+       cookie = NULL;
+       t = &itxs->i_async_tree;
+       while ((ian = avl_destroy_nodes(t, &cookie)) != NULL) {
+               list = &ian->ia_list;
+               while ((itx = list_head(list)) != NULL) {
+                       list_remove(list, itx);
+                       kmem_free(itx, offsetof(itx_t, itx_lr) +
+                           itx->itx_lr.lrc_reclen);
+               }
+               list_destroy(list);
+               kmem_free(ian, sizeof (itx_async_node_t));
+       }
+       avl_destroy(t);
 
-       mutex_enter(&zilog->zl_lock);
-       list_insert_tail(&zilog->zl_itx_list, itx);
-       zilog->zl_itx_list_sz += itx->itx_sod;
-       itx->itx_lr.lrc_txg = dmu_tx_get_txg(tx);
-       itx->itx_lr.lrc_seq = seq = ++zilog->zl_itx_seq;
-       mutex_exit(&zilog->zl_lock);
+       kmem_free(itxs, sizeof (itxs_t));
+}
+
+/*
+ * AVL comparator for the async itx tree: orders itx_async_node_t
+ * entries by ascending ia_foid and returns -1, 0 or 1.
+ *
+ * NOTE(review): avl_find() callers in this file pass a bare uint64_t *
+ * as the search key, which presumably relies on ia_foid being the first
+ * member of itx_async_node_t -- confirm against the struct definition.
+ */
+static int
+zil_aitx_compare(const void *x1, const void *x2)
+{
+       const uint64_t o1 = ((itx_async_node_t *)x1)->ia_foid;
+       const uint64_t o2 = ((itx_async_node_t *)x2)->ia_foid;
 
-       return (seq);
+       if (o1 < o2)
+               return (-1);
+       if (o1 > o2)
+               return (1);
+
+       return (0);
 }
 
 /*
- * Free up all in-memory intent log transactions that have now been synced.
+ * Remove all async itx with the given oid.
+ *
+ * For each of the TXG_CONCURRENT_STATES txgs following the last synced
+ * txg (or the single ZILTEST_TXG slot under ziltest) the object's async
+ * list is unlinked onto a private list while holding that itxg's lock;
+ * the itxs are freed afterwards with no lock held.  'oid' must not be 0.
  */
 static void
-zil_itx_clean(zilog_t *zilog)
+zil_remove_async(zilog_t *zilog, uint64_t oid)
 {
-       uint64_t synced_txg = spa_last_synced_txg(zilog->zl_spa);
-       uint64_t freeze_txg = spa_freeze_txg(zilog->zl_spa);
+       uint64_t otxg, txg;
+       itx_async_node_t *ian;
+       avl_tree_t *t;
+       avl_index_t where;
        list_t clean_list;
        itx_t *itx;
 
+       ASSERT(oid != 0);
        list_create(&clean_list, sizeof (itx_t), offsetof(itx_t, itx_node));
 
-       mutex_enter(&zilog->zl_lock);
-       /* wait for a log writer to finish walking list */
-       while (zilog->zl_writer) {
-               cv_wait(&zilog->zl_cv_writer, &zilog->zl_lock);
-       }
+       if (spa_freeze_txg(zilog->zl_spa) != UINT64_MAX) /* ziltest support */
+               otxg = ZILTEST_TXG;
+       else
+               otxg = spa_last_synced_txg(zilog->zl_spa) + 1;
 
-       /*
-        * Move the sync'd log transactions to a separate list so we can call
-        * kmem_free without holding the zl_lock.
-        *
-        * There is no need to set zl_writer as we don't drop zl_lock here
-        */
-       while ((itx = list_head(&zilog->zl_itx_list)) != NULL &&
-           itx->itx_lr.lrc_txg <= MIN(synced_txg, freeze_txg)) {
-               list_remove(&zilog->zl_itx_list, itx);
-               zilog->zl_itx_list_sz -= itx->itx_sod;
-               list_insert_tail(&clean_list, itx);
-       }
-       cv_broadcast(&zilog->zl_cv_writer);
-       mutex_exit(&zilog->zl_lock);
+       for (txg = otxg; txg < (otxg + TXG_CONCURRENT_STATES); txg++) {
+               itxg_t *itxg = &zilog->zl_itxg[txg & TXG_MASK];
 
-       /* destroy sync'd log transactions */
+               mutex_enter(&itxg->itxg_lock);
+               if (itxg->itxg_txg != txg) {
+                       mutex_exit(&itxg->itxg_lock);
+                       continue;
+               }
+
+               /*
+                * Locate the object node and append its list.
+                */
+               t = &itxg->itxg_itxs->i_async_tree;
+               ian = avl_find(t, &oid, &where);
+               if (ian != NULL)
+                       list_move_tail(&clean_list, &ian->ia_list);
+               mutex_exit(&itxg->itxg_lock);
+       }
        while ((itx = list_head(&clean_list)) != NULL) {
                list_remove(&clean_list, itx);
-               zil_itx_destroy(itx);
+               kmem_free(itx, offsetof(itx_t, itx_lr) +
+                   itx->itx_lr.lrc_reclen);
        }
        list_destroy(&clean_list);
 }
 
+/*
+ * Queue the in-memory intent log transaction 'itx' on the itxg slot of
+ * its txg (or of ZILTEST_TXG under ziltest).
+ *
+ * Synchronous itxs (itx_sync set) are appended to the slot's sync list
+ * and accounted in itxg_sod and zl_itx_list_sz; asynchronous itxs are
+ * appended to a per-object list in the slot's async tree, keyed by
+ * lr_foid.  If the slot still carries the itxs of a different txg,
+ * those are detached under the lock and freed after it is dropped.
+ */
+void
+zil_itx_assign(zilog_t *zilog, itx_t *itx, dmu_tx_t *tx)
+{
+       uint64_t txg;
+       itxg_t *itxg;
+       itxs_t *itxs, *clean = NULL;
+
+       /*
+        * Object ids can be re-instantiated in the next txg so
+        * remove any async transactions to avoid future leaks.
+        * This can happen if a fsync occurs on the re-instantiated
+        * object for a WR_INDIRECT or WR_NEED_COPY write, which gets
+        * the new file data and flushes a write record for the old object.
+        */
+       if ((itx->itx_lr.lrc_txtype & ~TX_CI) == TX_REMOVE)
+               zil_remove_async(zilog, itx->itx_oid);
+
+       /*
+        * Ensure the data of a renamed file is committed before the rename.
+        */
+       if ((itx->itx_lr.lrc_txtype & ~TX_CI) == TX_RENAME)
+               zil_async_to_sync(zilog, itx->itx_oid);
+
+       if (spa_freeze_txg(zilog->zl_spa) !=  UINT64_MAX)
+               txg = ZILTEST_TXG;
+       else
+               txg = dmu_tx_get_txg(tx);
+
+       itxg = &zilog->zl_itxg[txg & TXG_MASK];
+       mutex_enter(&itxg->itxg_lock);
+       itxs = itxg->itxg_itxs;
+       if (itxg->itxg_txg != txg) {
+               if (itxs != NULL) {
+                       /*
+                        * The zil_clean callback hasn't got around to cleaning
+                        * this itxg. Save the itxs for release below.
+                        * This should be rare.
+                        */
+                       atomic_add_64(&zilog->zl_itx_list_sz, -itxg->itxg_sod);
+                       itxg->itxg_sod = 0;
+                       clean = itxg->itxg_itxs;
+               }
+               ASSERT(itxg->itxg_sod == 0);
+               itxg->itxg_txg = txg;
+               itxs = itxg->itxg_itxs = kmem_zalloc(sizeof (itxs_t), KM_SLEEP);
+
+               list_create(&itxs->i_sync_list, sizeof (itx_t),
+                   offsetof(itx_t, itx_node));
+               avl_create(&itxs->i_async_tree, zil_aitx_compare,
+                   sizeof (itx_async_node_t),
+                   offsetof(itx_async_node_t, ia_node));
+       }
+       if (itx->itx_sync) {
+               list_insert_tail(&itxs->i_sync_list, itx);
+               atomic_add_64(&zilog->zl_itx_list_sz, itx->itx_sod);
+               itxg->itxg_sod += itx->itx_sod;
+       } else {
+               avl_tree_t *t = &itxs->i_async_tree;
+               uint64_t foid = ((lr_ooo_t *)&itx->itx_lr)->lr_foid;
+               itx_async_node_t *ian;
+               avl_index_t where;
+
+               /* Find, or create on first use, this object's async list. */
+               ian = avl_find(t, &foid, &where);
+               if (ian == NULL) {
+                       ian = kmem_alloc(sizeof (itx_async_node_t), KM_SLEEP);
+                       list_create(&ian->ia_list, sizeof (itx_t),
+                           offsetof(itx_t, itx_node));
+                       ian->ia_foid = foid;
+                       avl_insert(t, ian, where);
+               }
+               list_insert_tail(&ian->ia_list, itx);
+       }
+
+       /*
+        * The log record always carries the txg of 'tx', even when the itx
+        * was filed under ZILTEST_TXG above.
+        */
+       itx->itx_lr.lrc_txg = dmu_tx_get_txg(tx);
+       mutex_exit(&itxg->itxg_lock);
+
+       /* Release the old itxs now we've dropped the lock */
+       if (clean != NULL)
+               zil_itxg_clean(clean);
+}
+
 /*
  * If there are any in-memory intent log transactions which have now been
  * synced then start up a taskq to free them.
  */
 void
-zil_clean(zilog_t *zilog)
+zil_clean(zilog_t *zilog, uint64_t synced_txg)
 {
-       itx_t *itx;
+       itxg_t *itxg = &zilog->zl_itxg[synced_txg & TXG_MASK];
+       itxs_t *clean_me;
 
-       mutex_enter(&zilog->zl_lock);
-       itx = list_head(&zilog->zl_itx_list);
-       if ((itx != NULL) &&
-           (itx->itx_lr.lrc_txg <= spa_last_synced_txg(zilog->zl_spa))) {
-               (void) taskq_dispatch(zilog->zl_clean_taskq,
-                   (task_func_t *)zil_itx_clean, zilog, TQ_NOSLEEP);
+       mutex_enter(&itxg->itxg_lock);
+       if (itxg->itxg_itxs == NULL || itxg->itxg_txg == ZILTEST_TXG) {
+               mutex_exit(&itxg->itxg_lock);
+               return;
+       }
+       ASSERT3U(itxg->itxg_txg, <=, synced_txg);
+       ASSERT(itxg->itxg_txg != 0);
+       ASSERT(zilog->zl_clean_taskq != NULL);
+       atomic_add_64(&zilog->zl_itx_list_sz, -itxg->itxg_sod);
+       itxg->itxg_sod = 0;
+       clean_me = itxg->itxg_itxs;
+       itxg->itxg_itxs = NULL;
+       itxg->itxg_txg = 0;
+       mutex_exit(&itxg->itxg_lock);
+       /*
+        * Preferably start a task queue to free up the old itxs but
+        * if taskq_dispatch can't allocate resources to do that then
+        * free it in-line. This should be rare. Note, using TQ_SLEEP
+        * created a bad performance problem.
+        */
+       if (taskq_dispatch(zilog->zl_clean_taskq,
+           (void (*)(void *))zil_itxg_clean, clean_me, TQ_NOSLEEP) == NULL)
+               zil_itxg_clean(clean_me);
+}
+
+/*
+ * Collect the itxs to commit onto zl_itx_commit_list: the sync list of
+ * every itxg slot belonging to one of the TXG_CONCURRENT_STATES txgs
+ * after the last synced txg is appended, and the moved size-on-disk is
+ * subtracted from zl_itx_list_sz.
+ */
+static void
+zil_get_commit_list(zilog_t *zilog)
+{
+       uint64_t first_txg, end_txg, txg;
+       uint64_t moved_sod = 0;
+
+       /* Under ziltest (pool frozen) only the ZILTEST_TXG slot is used. */
+       if (spa_freeze_txg(zilog->zl_spa) == UINT64_MAX)
+               first_txg = spa_last_synced_txg(zilog->zl_spa) + 1;
+       else
+               first_txg = ZILTEST_TXG;
+       end_txg = first_txg + TXG_CONCURRENT_STATES;
+
+       for (txg = first_txg; txg < end_txg; txg++) {
+               itxg_t *itxg = &zilog->zl_itxg[txg & TXG_MASK];
+
+               mutex_enter(&itxg->itxg_lock);
+               /* Skip slots that currently belong to a different txg. */
+               if (itxg->itxg_txg == txg) {
+                       list_move_tail(&zilog->zl_itx_commit_list,
+                           &itxg->itxg_itxs->i_sync_list);
+                       moved_sod += itxg->itxg_sod;
+                       itxg->itxg_sod = 0;
+               }
+               mutex_exit(&itxg->itxg_lock);
+       }
+       atomic_add_64(&zilog->zl_itx_list_sz, -moved_sod);
+}
+
+/*
+ * Move the async itxs for a specified object to commit into sync lists.
+ *
+ * With foid != 0 only that object's async list is appended to the sync
+ * list of each itxg (its tree node is left in place, now empty).  With
+ * foid == 0 the async lists of all objects are appended and the tree
+ * nodes are destroyed.  Each itxg is processed under its own itxg_lock.
+ */
+static void
+zil_async_to_sync(zilog_t *zilog, uint64_t foid)
+{
+       uint64_t otxg, txg;
+       itx_async_node_t *ian;
+       avl_tree_t *t;
+       avl_index_t where;
+
+       if (spa_freeze_txg(zilog->zl_spa) != UINT64_MAX) /* ziltest support */
+               otxg = ZILTEST_TXG;
+       else
+               otxg = spa_last_synced_txg(zilog->zl_spa) + 1;
+
+       for (txg = otxg; txg < (otxg + TXG_CONCURRENT_STATES); txg++) {
+               itxg_t *itxg = &zilog->zl_itxg[txg & TXG_MASK];
+
+               mutex_enter(&itxg->itxg_lock);
+               if (itxg->itxg_txg != txg) {
+                       mutex_exit(&itxg->itxg_lock);
+                       continue;
+               }
+
+               /*
+                * If a foid is specified then find that node and append its
+                * list. Otherwise walk the tree appending all the lists
+                * to the sync list. We add to the end rather than the
+                * beginning to ensure the create has happened.
+                */
+               t = &itxg->itxg_itxs->i_async_tree;
+               if (foid != 0) {
+                       ian = avl_find(t, &foid, &where);
+                       if (ian != NULL) {
+                               list_move_tail(&itxg->itxg_itxs->i_sync_list,
+                                   &ian->ia_list);
+                       }
+               } else {
+                       void *cookie = NULL;
+
+                       while ((ian = avl_destroy_nodes(t, &cookie)) != NULL) {
+                               list_move_tail(&itxg->itxg_itxs->i_sync_list,
+                                   &ian->ia_list);
+                               list_destroy(&ian->ia_list);
+                               kmem_free(ian, sizeof (itx_async_node_t));
+                       }
+               }
+               mutex_exit(&itxg->itxg_lock);
        }
-       mutex_exit(&zilog->zl_lock);
 }
 
 static void
-zil_commit_writer(zilog_t *zilog, uint64_t seq, uint64_t foid)
+zil_commit_writer(zilog_t *zilog)
 {
        uint64_t txg;
-       uint64_t commit_seq = 0;
-       itx_t *itx, *itx_next;
+       itx_t *itx;
        lwb_t *lwb;
-       spa_t *spa;
+       spa_t *spa = zilog->zl_spa;
        int error = 0;
 
-       zilog->zl_writer = B_TRUE;
        ASSERT(zilog->zl_root_zio == NULL);
-       spa = zilog->zl_spa;
+
+       mutex_exit(&zilog->zl_lock);
+
+       zil_get_commit_list(zilog);
+
+       /*
+        * Return if there's nothing to commit before we dirty the fs by
+        * calling zil_create().
+        */
+       if (list_head(&zilog->zl_itx_commit_list) == NULL) {
+               mutex_enter(&zilog->zl_lock);
+               return;
+       }
 
        if (zilog->zl_suspend) {
                lwb = NULL;
        } else {
                lwb = list_tail(&zilog->zl_lwb_list);
-               if (lwb == NULL) {
-                       /*
-                        * Return if there's nothing to flush before we
-                        * dirty the fs by calling zil_create()
-                        */
-                       if (list_is_empty(&zilog->zl_itx_list)) {
-                               zilog->zl_writer = B_FALSE;
-                               return;
-                       }
-                       mutex_exit(&zilog->zl_lock);
+               if (lwb == NULL)
                        lwb = zil_create(zilog);
-                       mutex_enter(&zilog->zl_lock);
-               }
        }
-       ASSERT(lwb == NULL || lwb->lwb_zio == NULL);
 
-       /* Loop through in-memory log transactions filling log blocks. */
        DTRACE_PROBE1(zil__cw1, zilog_t *, zilog);
-
-       for (itx = list_head(&zilog->zl_itx_list); itx; itx = itx_next) {
-               /*
-                * Save the next pointer.  Even though we drop zl_lock below,
-                * all threads that can remove itx list entries (other writers
-                * and zil_itx_clean()) can't do so until they have zl_writer.
-                */
-               itx_next = list_next(&zilog->zl_itx_list, itx);
-
-               /*
-                * Determine whether to push this itx.
-                * Push all transactions related to specified foid and
-                * all other transactions except those that can be logged
-                * out of order (TX_WRITE, TX_TRUNCATE, TX_SETATTR, TX_ACL)
-                * for all other files.
-                *
-                * If foid == 0 (meaning "push all foids") or
-                * itx->itx_sync is set (meaning O_[D]SYNC), push regardless.
-                */
-               if (foid != 0 && !itx->itx_sync &&
-                   TX_OOO(itx->itx_lr.lrc_txtype) &&
-                   ((lr_ooo_t *)&itx->itx_lr)->lr_foid != foid)
-                       continue; /* skip this record */
-
-               if ((itx->itx_lr.lrc_seq > seq) &&
-                   ((lwb == NULL) || (LWB_EMPTY(lwb)) ||
-                   (lwb->lwb_nused + itx->itx_sod > lwb->lwb_sz)))
-                       break;
-
-               list_remove(&zilog->zl_itx_list, itx);
-               zilog->zl_itx_list_sz -= itx->itx_sod;
-
-               mutex_exit(&zilog->zl_lock);
-
+       while (itx = list_head(&zilog->zl_itx_commit_list)) {
                txg = itx->itx_lr.lrc_txg;
                ASSERT(txg);
 
-               if (txg > spa_last_synced_txg(spa) ||
-                   txg > spa_freeze_txg(spa))
+               if (txg > spa_last_synced_txg(spa) || txg > spa_freeze_txg(spa))
                        lwb = zil_lwb_commit(zilog, itx, lwb);
-
-               zil_itx_destroy(itx);
-
-               mutex_enter(&zilog->zl_lock);
+               list_remove(&zilog->zl_itx_commit_list, itx);
+               kmem_free(itx, offsetof(itx_t, itx_lr)
+                   + itx->itx_lr.lrc_reclen);
        }
        DTRACE_PROBE1(zil__cw2, zilog_t *, zilog);
-       /* determine commit sequence number */
-       itx = list_head(&zilog->zl_itx_list);
-       if (itx)
-               commit_seq = itx->itx_lr.lrc_seq - 1;
-       else
-               commit_seq = zilog->zl_itx_seq;
-       mutex_exit(&zilog->zl_lock);
 
        /* write the last block out */
        if (lwb != NULL && lwb->lwb_zio != NULL)
                lwb = zil_lwb_write_start(zilog, lwb);
 
-       zilog->zl_prev_used = zilog->zl_cur_used;
        zilog->zl_cur_used = 0;
 
        /*
         * Wait if necessary for the log blocks to be on stable storage.
         */
        if (zilog->zl_root_zio) {
-               DTRACE_PROBE1(zil__cw3, zilog_t *, zilog);
                error = zio_wait(zilog->zl_root_zio);
                zilog->zl_root_zio = NULL;
-               DTRACE_PROBE1(zil__cw4, zilog_t *, zilog);
                zil_flush_vdevs(zilog);
        }
 
@@ -1246,10 +1452,6 @@ zil_commit_writer(zilog_t *zilog, uint64_t seq, uint64_t foid)
                txg_wait_synced(zilog->zl_dmu_pool, 0);
 
        mutex_enter(&zilog->zl_lock);
-       zilog->zl_writer = B_FALSE;
-
-       ASSERT3U(commit_seq, >=, zilog->zl_commit_seq);
-       zilog->zl_commit_seq = commit_seq;
 
        /*
         * Remember the highest committed log sequence number for ztest.
@@ -1261,58 +1463,61 @@ zil_commit_writer(zilog_t *zilog, uint64_t seq, uint64_t foid)
 }
 
 /*
- * Push zfs transactions to stable storage up to the supplied sequence number.
+ * Commit zfs transactions to stable storage.
  * If foid is 0 push out all transactions, otherwise push only those
- * for that file or might have been used to create that file.
+ * for that object or that might reference that object.
+ *
+ * itxs are committed in batches. In a heavily stressed zil there will be
+ * a commit writer thread who is writing out a bunch of itxs to the log
+ * for a set of committing threads (cthreads) in the same batch as the writer.
+ * Those cthreads are all waiting on the same cv for that batch.
+ *
+ * There will also be a different and growing batch of threads that are
+ * waiting to commit (qthreads). When the committing batch completes
+ * a transition occurs such that the cthreads exit and the qthreads become
+ * cthreads. One of the new cthreads becomes the writer thread for the
+ * batch. Any new threads arriving become new qthreads.
+ *
+ * Only 2 condition variables are needed and there's no transition
+ * between the two cvs needed. They just flip-flop between qthreads
+ * and cthreads.
+ *
+ * Using this scheme we can efficiently wake up only those threads
+ * that have been committed.
  */
 void
-zil_commit(zilog_t *zilog, uint64_t seq, uint64_t foid)
+zil_commit(zilog_t *zilog, uint64_t foid)
 {
-       if (zilog->zl_sync == ZFS_SYNC_DISABLED || seq == 0)
-               return;
+       uint64_t mybatch;
 
-       mutex_enter(&zilog->zl_lock);
+       if (zilog->zl_sync == ZFS_SYNC_DISABLED)
+               return;
 
-       seq = MIN(seq, zilog->zl_itx_seq);      /* cap seq at largest itx seq */
+       /* move the async itxs for the foid to the sync queues */
+       zil_async_to_sync(zilog, foid);
 
+       mutex_enter(&zilog->zl_lock);
+       mybatch = zilog->zl_next_batch;
        while (zilog->zl_writer) {
-               cv_wait(&zilog->zl_cv_writer, &zilog->zl_lock);
-               if (seq <= zilog->zl_commit_seq) {
+               cv_wait(&zilog->zl_cv_batch[mybatch & 1], &zilog->zl_lock);
+               if (mybatch <= zilog->zl_com_batch) {
                        mutex_exit(&zilog->zl_lock);
                        return;
                }
        }
-       zil_commit_writer(zilog, seq, foid); /* drops zl_lock */
-       /* wake up others waiting on the commit */
-       cv_broadcast(&zilog->zl_cv_writer);
-       mutex_exit(&zilog->zl_lock);
-}
-
-/*
- * Report whether all transactions are committed.
- */
-static boolean_t
-zil_is_committed(zilog_t *zilog)
-{
-       lwb_t *lwb;
-       boolean_t committed;
-
-       mutex_enter(&zilog->zl_lock);
 
-       while (zilog->zl_writer)
-               cv_wait(&zilog->zl_cv_writer, &zilog->zl_lock);
+       zilog->zl_next_batch++;
+       zilog->zl_writer = B_TRUE;
+       zil_commit_writer(zilog);
+       zilog->zl_com_batch = mybatch;
+       zilog->zl_writer = B_FALSE;
+       mutex_exit(&zilog->zl_lock);
 
-       if (!list_is_empty(&zilog->zl_itx_list))
-               committed = B_FALSE;            /* unpushed transactions */
-       else if ((lwb = list_head(&zilog->zl_lwb_list)) == NULL)
-               committed = B_TRUE;             /* intent log never used */
-       else if (list_next(&zilog->zl_lwb_list, lwb) != NULL)
-               committed = B_FALSE;            /* zil_sync() not done yet */
-       else
-               committed = B_TRUE;             /* everything synced */
+       /* wake up one thread to become the next writer */
+       cv_signal(&zilog->zl_cv_batch[(mybatch+1) & 1]);
 
-       mutex_exit(&zilog->zl_lock);
-       return (committed);
+       /* wake up all threads waiting for this batch to be committed */
+       cv_broadcast(&zilog->zl_cv_batch[mybatch & 1]);
 }
 
 /*
@@ -1425,15 +1630,21 @@ zil_alloc(objset_t *os, zil_header_t *zh_phys)
        zilog->zl_destroy_txg = TXG_INITIAL - 1;
        zilog->zl_logbias = dmu_objset_logbias(os);
        zilog->zl_sync = dmu_objset_syncprop(os);
+       zilog->zl_next_batch = 1;
 
        mutex_init(&zilog->zl_lock, NULL, MUTEX_DEFAULT, NULL);
 
-       list_create(&zilog->zl_itx_list, sizeof (itx_t),
-           offsetof(itx_t, itx_node));
+       for (int i = 0; i < TXG_SIZE; i++) {
+               mutex_init(&zilog->zl_itxg[i].itxg_lock, NULL,
+                   MUTEX_DEFAULT, NULL);
+       }
 
        list_create(&zilog->zl_lwb_list, sizeof (lwb_t),
            offsetof(lwb_t, lwb_node));
 
+       list_create(&zilog->zl_itx_commit_list, sizeof (itx_t),
+           offsetof(itx_t, itx_node));
+
        mutex_init(&zilog->zl_vdev_lock, NULL, MUTEX_DEFAULT, NULL);
 
        avl_create(&zilog->zl_vdev_tree, zil_vdev_compare,
@@ -1441,6 +1652,8 @@ zil_alloc(objset_t *os, zil_header_t *zh_phys)
 
        cv_init(&zilog->zl_cv_writer, NULL, CV_DEFAULT, NULL);
        cv_init(&zilog->zl_cv_suspend, NULL, CV_DEFAULT, NULL);
+       cv_init(&zilog->zl_cv_batch[0], NULL, CV_DEFAULT, NULL);
+       cv_init(&zilog->zl_cv_batch[1], NULL, CV_DEFAULT, NULL);
 
        return (zilog);
 }
@@ -1448,27 +1661,47 @@ zil_alloc(objset_t *os, zil_header_t *zh_phys)
 void
 zil_free(zilog_t *zilog)
 {
-       lwb_t *lwb;
+       lwb_t *head_lwb;
 
        zilog->zl_stop_sync = 1;
 
-       while ((lwb = list_head(&zilog->zl_lwb_list)) != NULL) {
-               list_remove(&zilog->zl_lwb_list, lwb);
-               if (lwb->lwb_buf != NULL)
-                       zio_buf_free(lwb->lwb_buf, lwb->lwb_sz);
-               kmem_cache_free(zil_lwb_cache, lwb);
+       /*
+        * After zil_close() there should only be one lwb with a buffer.
+        */
+       head_lwb = list_head(&zilog->zl_lwb_list);
+       if (head_lwb) {
+               ASSERT(head_lwb == list_tail(&zilog->zl_lwb_list));
+               list_remove(&zilog->zl_lwb_list, head_lwb);
+               zio_buf_free(head_lwb->lwb_buf, head_lwb->lwb_sz);
+               kmem_cache_free(zil_lwb_cache, head_lwb);
        }
        list_destroy(&zilog->zl_lwb_list);
 
        avl_destroy(&zilog->zl_vdev_tree);
        mutex_destroy(&zilog->zl_vdev_lock);
 
-       ASSERT(list_head(&zilog->zl_itx_list) == NULL);
-       list_destroy(&zilog->zl_itx_list);
+       ASSERT(list_is_empty(&zilog->zl_itx_commit_list));
+       list_destroy(&zilog->zl_itx_commit_list);
+
+       for (int i = 0; i < TXG_SIZE; i++) {
+               /*
+                * It's possible for an itx to be generated that doesn't dirty
+                * a txg (e.g. ztest TX_TRUNCATE). So there's no zil_clean()
+                * callback to remove the entry. We remove those here.
+                *
+                * Also free up the ziltest itxs.
+                */
+               if (zilog->zl_itxg[i].itxg_itxs)
+                       zil_itxg_clean(zilog->zl_itxg[i].itxg_itxs);
+               mutex_destroy(&zilog->zl_itxg[i].itxg_lock);
+       }
+
        mutex_destroy(&zilog->zl_lock);
 
        cv_destroy(&zilog->zl_cv_writer);
        cv_destroy(&zilog->zl_cv_suspend);
+       cv_destroy(&zilog->zl_cv_batch[0]);
+       cv_destroy(&zilog->zl_cv_batch[1]);
 
        kmem_free(zilog, sizeof (zilog_t));
 }
@@ -1494,26 +1727,28 @@ zil_open(objset_t *os, zil_get_data_t *get_data)
 void
 zil_close(zilog_t *zilog)
 {
+       lwb_t *tail_lwb;
+       uint64_t txg = 0;
+
+       zil_commit(zilog, 0); /* commit all itx */
+
        /*
-        * If the log isn't already committed, mark the objset dirty
-        * (so zil_sync() will be called) and wait for that txg to sync.
+        * The lwb_max_txg for the stubby lwb will reflect the last activity
+        * for the zil.  After a txg_wait_synced() on the txg we know all the
+        * callbacks have occurred that may clean the zil.  Only then can we
+        * destroy the zl_clean_taskq.
         */
-       if (!zil_is_committed(zilog)) {
-               uint64_t txg;
-               dmu_tx_t *tx = dmu_tx_create(zilog->zl_os);
-               VERIFY(dmu_tx_assign(tx, TXG_WAIT) == 0);
-               dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx);
-               txg = dmu_tx_get_txg(tx);
-               dmu_tx_commit(tx);
+       mutex_enter(&zilog->zl_lock);
+       tail_lwb = list_tail(&zilog->zl_lwb_list);
+       if (tail_lwb != NULL)
+               txg = tail_lwb->lwb_max_txg;
+       mutex_exit(&zilog->zl_lock);
+       if (txg)
                txg_wait_synced(zilog->zl_dmu_pool, txg);
-       }
 
        taskq_destroy(zilog->zl_clean_taskq);
        zilog->zl_clean_taskq = NULL;
        zilog->zl_get_data = NULL;
-
-       zil_itx_clean(zilog);
-       ASSERT(list_head(&zilog->zl_itx_list) == NULL);
 }
 
 /*
@@ -1545,15 +1780,7 @@ zil_suspend(zilog_t *zilog)
        zilog->zl_suspending = B_TRUE;
        mutex_exit(&zilog->zl_lock);
 
-       zil_commit(zilog, UINT64_MAX, 0);
-
-       /*
-        * Wait for any in-flight log writes to complete.
-        */
-       mutex_enter(&zilog->zl_lock);
-       while (zilog->zl_writer)
-               cv_wait(&zilog->zl_cv_writer, &zilog->zl_lock);
-       mutex_exit(&zilog->zl_lock);
+       zil_commit(zilog, 0);
 
        zil_destroy(zilog, B_FALSE);
 
index 88d80af4e99ff2ccd8a3e812234933a8be16bd4b..1ba2330bdcdc457e4184cda424ed1626721200b0 100644 (file)
@@ -2247,6 +2247,26 @@ zio_vdev_io_start(zio_t *zio)
                return (vdev_mirror_ops.vdev_op_io_start(zio));
        }
 
+       /*
+        * We keep track of time-sensitive I/Os so that the scan thread
+        * can quickly react to certain workloads.  In particular, we care
+        * about non-scrubbing, top-level reads and writes with the following
+        * characteristics:
+        *      - synchronous writes of user data to non-slog devices
+        *      - any reads of user data
+        * When these conditions are met, adjust the timestamp of spa_last_io
+        * which allows the scan thread to adjust its workload accordingly.
+        */
+       if (!(zio->io_flags & ZIO_FLAG_SCAN_THREAD) && zio->io_bp != NULL &&
+           vd == vd->vdev_top && !vd->vdev_islog &&
+           zio->io_bookmark.zb_objset != DMU_META_OBJSET &&
+           zio->io_txg != spa_syncing_txg(spa)) {
+               uint64_t old = spa->spa_last_io;
+               uint64_t new = ddi_get_lbolt64();
+               if (old != new)
+                       (void) atomic_cas_64(&spa->spa_last_io, old, new);
+       }
+
        align = 1ULL << vd->vdev_top->vdev_ashift;
 
        if (P2PHASE(zio->io_size, align) != 0) {
@@ -2262,7 +2282,7 @@ zio_vdev_io_start(zio_t *zio)
 
        ASSERT(P2PHASE(zio->io_offset, align) == 0);
        ASSERT(P2PHASE(zio->io_size, align) == 0);
-       ASSERT(zio->io_type != ZIO_TYPE_WRITE || spa_writeable(spa));
+       VERIFY(zio->io_type != ZIO_TYPE_WRITE || spa_writeable(spa));
 
        /*
         * If this is a repair I/O, and there's no self-healing involved --
@@ -2744,6 +2764,7 @@ zio_done(zio_t *zio)
 
                if ((zio->io_type == ZIO_TYPE_READ ||
                    zio->io_type == ZIO_TYPE_FREE) &&
+                   !(zio->io_flags & ZIO_FLAG_SCAN_THREAD) &&
                    zio->io_error == ENXIO &&
                    spa_load_state(spa) == SPA_LOAD_NONE &&
                    spa_get_failmode(spa) != ZIO_FAILURE_MODE_CONTINUE)
index 16eaed6688e59270b60de98a6b8b1d6c80d804bf..9ae7d1f697fdb8d7708e11a270a602b2e995f982 100644 (file)
@@ -476,7 +476,6 @@ int
 zio_clear_fault(int id)
 {
        inject_handler_t *handler;
-       int ret;
 
        rw_enter(&inject_lock, RW_WRITER);
 
@@ -486,18 +485,18 @@ zio_clear_fault(int id)
                        break;
 
        if (handler == NULL) {
-               ret = ENOENT;
-       } else {
-               list_remove(&inject_handlers, handler);
-               spa_inject_delref(handler->zi_spa);
-               kmem_free(handler, sizeof (inject_handler_t));
-               atomic_add_32(&zio_injection_enabled, -1);
-               ret = 0;
+               rw_exit(&inject_lock);
+               return (ENOENT);
        }
 
+       list_remove(&inject_handlers, handler);
        rw_exit(&inject_lock);
 
-       return (ret);
+       spa_inject_delref(handler->zi_spa);
+       kmem_free(handler, sizeof (inject_handler_t));
+       atomic_add_32(&zio_injection_enabled, -1);
+
+       return (0);
 }
 
 void
diff --git a/module/zfs/zrlock.c b/module/zfs/zrlock.c
new file mode 100644 (file)
index 0000000..ec94b08
--- /dev/null
@@ -0,0 +1,194 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+/*
+ * A Zero Reference Lock (ZRL) is a reference count that can lock out new
+ * references only when the count is zero and only without waiting if the count
+ * is not already zero. It is similar to a read-write lock in that it allows
+ * multiple readers and only a single writer, but it does not allow a writer to
+ * block while waiting for readers to exit, and therefore the question of
+ * reader/writer priority is moot (no WRWANT bit). Since the equivalent of
+ * rw_enter(&lock, RW_WRITER) is disallowed and only tryenter() is allowed, it
+ * is perfectly safe for the same reader to acquire the same lock multiple
+ * times. The fact that a ZRL is reentrant for readers (through multiple calls
+ * to zrl_add()) makes it convenient for determining whether something is
+ * actively referenced without the fuss of flagging lock ownership across
+ * function calls.
+ */
+#include <sys/zrlock.h>
+
+/*
+ * A ZRL can be locked only while there are zero references, so ZRL_LOCKED is
+ * treated as zero references.
+ *
+ * ZRL_DESTROYED is the value zrl_destroy() leaves in zr_refcount; the
+ * accessors assert that the count is greater than it.  The expansion is
+ * parenthesized so the macro is safe to use inside any expression.
+ */
+#define        ZRL_LOCKED      ((uint32_t)-1)
+#define        ZRL_DESTROYED   (-2)
+
+/*
+ * Put a ZRL into its initial state: no references, no recorded owner or
+ * caller (debug builds only), and a freshly initialized mutex and condition
+ * variable.
+ */
+void
+zrl_init(zrlock_t *zrl)
+{
+#ifdef ZFS_DEBUG
+       zrl->zr_caller = NULL;
+       zrl->zr_owner = NULL;
+#endif
+       zrl->zr_refcount = 0;
+       cv_init(&zrl->zr_cv, NULL, CV_DEFAULT, NULL);
+       mutex_init(&zrl->zr_mtx, NULL, MUTEX_DEFAULT, NULL);
+}
+
+/*
+ * Tear down a ZRL.  The count must be exactly zero on entry (asserted).
+ * The count is then set to ZRL_DESTROYED, which makes the
+ * "> ZRL_DESTROYED" assertions in the accessor functions fail if the lock
+ * is touched afterwards.
+ */
+void
+zrl_destroy(zrlock_t *zrl)
+{
+       ASSERT(zrl->zr_refcount == 0);
+
+       mutex_destroy(&zrl->zr_mtx);
+       zrl->zr_refcount = ZRL_DESTROYED;
+       cv_destroy(&zrl->zr_cv);
+}
+
+/*
+ * Take a reference on the ZRL, waiting for as long as it is held locked
+ * (zr_refcount == ZRL_LOCKED).
+ *
+ * The count is only ever incremented with atomic_cas_32().  zrl_tryenter()
+ * and zrl_remove() modify zr_refcount without holding zr_mtx, so a plain
+ * "zr_refcount++" under the mutex would race with them: a zrl_tryenter()
+ * landing between the wake-up check and the increment would have its
+ * ZRL_LOCKED value turned into 0, and a concurrent CAS or decrement could
+ * be lost.  The mutex is therefore used only to sleep until zrl_exit()
+ * broadcasts; afterwards the CAS loop is retried from the top.
+ *
+ * In ZFS_DEBUG builds this function is named zrl_add_debug() and 'zc' is
+ * stored in zr_caller alongside the owning thread.
+ */
+void
+#ifdef ZFS_DEBUG
+zrl_add_debug(zrlock_t *zrl, const char *zc)
+#else
+zrl_add(zrlock_t *zrl)
+#endif
+{
+       for (;;) {
+               uint32_t n = (uint32_t)zrl->zr_refcount;
+
+               /* Fast path: bump the count unless the ZRL is locked. */
+               while (n != ZRL_LOCKED) {
+                       uint32_t cas = atomic_cas_32(
+                           (uint32_t *)&zrl->zr_refcount, n, n + 1);
+                       if (cas == n) {
+                               ASSERT((int32_t)n >= 0);
+#ifdef ZFS_DEBUG
+                               if (zrl->zr_owner == curthread) {
+                                       DTRACE_PROBE2(zrlock__reentry,
+                                           zrlock_t *, zrl, uint32_t, n);
+                               }
+                               zrl->zr_owner = curthread;
+                               zrl->zr_caller = zc;
+#endif
+                               return;
+                       }
+                       /* Lost the race; retry with the value we saw. */
+                       n = cas;
+               }
+
+               /*
+                * Slow path: the ZRL is locked.  Sleep until zrl_exit()
+                * releases it, then go around again and take the reference
+                * with the CAS above rather than incrementing here.
+                */
+               mutex_enter(&zrl->zr_mtx);
+               while (zrl->zr_refcount == ZRL_LOCKED) {
+                       cv_wait(&zrl->zr_cv, &zrl->zr_mtx);
+               }
+               mutex_exit(&zrl->zr_mtx);
+       }
+}
+
+/*
+ * Drop one reference previously taken with zrl_add().
+ *
+ * In ZFS_DEBUG builds the owner/caller bookkeeping is cleared BEFORE the
+ * count is decremented.  Once the count reaches zero another thread may
+ * succeed in zrl_tryenter(), which asserts that zr_owner is NULL; clearing
+ * the fields only after the decrement would leave a window in which that
+ * assertion can fire spuriously.
+ */
+void
+zrl_remove(zrlock_t *zrl)
+{
+       uint32_t n;
+
+#ifdef ZFS_DEBUG
+       if (zrl->zr_owner == curthread) {
+               zrl->zr_owner = NULL;
+               zrl->zr_caller = NULL;
+       }
+#endif
+       n = atomic_dec_32_nv((uint32_t *)&zrl->zr_refcount);
+       /* A negative result means there was no reference to drop. */
+       ASSERT((int32_t)n >= 0);
+}
+
+/*
+ * Try to lock the ZRL without waiting.
+ *
+ * Returns 1 if the count was atomically changed from 0 to ZRL_LOCKED, and 0
+ * otherwise (the sampled count was non-zero, or the compare-and-swap lost a
+ * race).  In ZFS_DEBUG builds the calling thread is recorded as the owner on
+ * success.
+ */
+int
+zrl_tryenter(zrlock_t *zrl)
+{
+       uint32_t n = (uint32_t)zrl->zr_refcount;
+
+       if (n == 0) {
+               uint32_t cas = atomic_cas_32(
+                   (uint32_t *)&zrl->zr_refcount, 0, ZRL_LOCKED);
+               if (cas == 0) {
+#ifdef ZFS_DEBUG
+                       ASSERT(zrl->zr_owner == NULL);
+                       zrl->zr_owner = curthread;
+#endif
+                       return (1);
+               }
+       }
+
+       /* Failure path: check the sampled count is not the destroyed value. */
+       ASSERT((int32_t)n > ZRL_DESTROYED);
+
+       return (0);
+}
+
+/*
+ * Release a ZRL that is currently locked (asserted on entry).
+ *
+ * Under zr_mtx the count is reset from ZRL_LOCKED to 0 and zr_cv is
+ * broadcast, waking threads sleeping on it in zrl_add().  In ZFS_DEBUG
+ * builds the caller must be the recorded owner, and the owner field is
+ * cleared before the count is reset.
+ */
+void
+zrl_exit(zrlock_t *zrl)
+{
+       ASSERT(zrl->zr_refcount == ZRL_LOCKED);
+
+       mutex_enter(&zrl->zr_mtx);
+#ifdef ZFS_DEBUG
+       ASSERT(zrl->zr_owner == curthread);
+       zrl->zr_owner = NULL;
+       membar_producer();      /* make sure the owner store happens first */
+#endif
+       zrl->zr_refcount = 0;
+       cv_broadcast(&zrl->zr_cv);
+       mutex_exit(&zrl->zr_mtx);
+}
+
+/*
+ * Report the number of references currently held.  A locked ZRL (or any
+ * other non-positive count) is reported as zero references.
+ */
+int
+zrl_refcount(zrlock_t *zrl)
+{
+       ASSERT(zrl->zr_refcount > ZRL_DESTROYED);
+
+       int count = (int)zrl->zr_refcount;
+
+       if (count <= 0)
+               return (0);
+       return (count);
+}
+
+/*
+ * Return non-zero when the ZRL has no references.  A locked ZRL counts as
+ * having none, because any count that is not positive is treated as zero.
+ */
+int
+zrl_is_zero(zrlock_t *zrl)
+{
+       ASSERT(zrl->zr_refcount > ZRL_DESTROYED);
+
+       if (zrl->zr_refcount > 0)
+               return (0);
+       return (1);
+}
+
+/*
+ * Return non-zero when the ZRL is currently locked, i.e. its count holds
+ * the ZRL_LOCKED sentinel.
+ */
+int
+zrl_is_locked(zrlock_t *zrl)
+{
+       ASSERT(zrl->zr_refcount > ZRL_DESTROYED);
+
+       if (zrl->zr_refcount != ZRL_LOCKED)
+               return (0);
+       return (1);
+}
+
+#ifdef ZFS_DEBUG
+/*
+ * Debug-only accessor: return the thread most recently recorded in
+ * zr_owner, or NULL if none is recorded.
+ */
+kthread_t *
+zrl_owner(zrlock_t *zrl)
+{
+       kthread_t *owner = zrl->zr_owner;
+
+       return (owner);
+}
+#endif